From 62dc3eccf37f70e7af10fc84178a52ca1384b232 Mon Sep 17 00:00:00 2001
From: Francesco Romani
Date: Wed, 14 May 2025 18:06:37 +0200
Subject: [PATCH 1/6] [AA][HI]: perfprof creator: autosizing implementation

Implement reserved CPU (aka infra + control plane) sizing using linear
programming optimization (gonum/optimize). The core idea is to model the
constraints and let the optimization package compute the desired target.

These changes were AI-assisted (hence the AA tag), then largely amended
by a human (hence the HI tag - Human Intervention). The initial penalty
cost structure was suggested by Google Gemini 2.5 Flash, and then
amended by human intervention.

Assisted-by: Google Gemini
Assisted-by-model: gemini-2.5-flash
Signed-off-by: Francesco Romani
---
 go.mod | 1 + go.sum | 2 + .../profilecreator/autosize/autosize.go | 246 ++ .../profilecreator/cmd/root.go | 37 +- .../x/tools/container/intsets/sparse.go | 1107 ++++++ vendor/gonum.org/v1/gonum/AUTHORS | 141 + vendor/gonum.org/v1/gonum/CONTRIBUTORS | 144 + vendor/gonum.org/v1/gonum/LICENSE | 23 + vendor/gonum.org/v1/gonum/blas/README.md | 51 + vendor/gonum.org/v1/gonum/blas/blas.go | 283 ++ .../gonum.org/v1/gonum/blas/blas64/blas64.go | 533 +++ vendor/gonum.org/v1/gonum/blas/blas64/conv.go | 263 ++ .../v1/gonum/blas/blas64/conv_symmetric.go | 153 + vendor/gonum.org/v1/gonum/blas/blas64/doc.go | 6 + .../v1/gonum/blas/cblas128/cblas128.go | 600 ++++ .../gonum.org/v1/gonum/blas/cblas128/conv.go | 265 ++ .../v1/gonum/blas/cblas128/conv_hermitian.go | 155 + .../v1/gonum/blas/cblas128/conv_symmetric.go | 155 + .../gonum.org/v1/gonum/blas/cblas128/doc.go | 6 + .../gonum.org/v1/gonum/blas/conversions.bash | 159 + vendor/gonum.org/v1/gonum/blas/doc.go | 108 + vendor/gonum.org/v1/gonum/blas/gonum/dgemm.go | 297 ++ vendor/gonum.org/v1/gonum/blas/gonum/doc.go | 99 + .../gonum.org/v1/gonum/blas/gonum/errors.go | 35 + vendor/gonum.org/v1/gonum/blas/gonum/gonum.go | 38 + .../v1/gonum/blas/gonum/level1cmplx128.go | 454 +++ .../v1/gonum/blas/gonum/level1cmplx64.go | 476 +++ .../v1/gonum/blas/gonum/level1float32.go | 653 ++++ .../gonum/blas/gonum/level1float32_dsdot.go | 54 + .../v1/gonum/blas/gonum/level1float32_sdot.go | 54 + .../gonum/blas/gonum/level1float32_sdsdot.go | 54 + .../v1/gonum/blas/gonum/level1float64.go | 629 ++++ .../v1/gonum/blas/gonum/level1float64_ddot.go | 50 + .../v1/gonum/blas/gonum/level2cmplx128.go | 2940 ++++++++++++++++ .../v1/gonum/blas/gonum/level2cmplx64.go | 2976 +++++++++++++++++ .../v1/gonum/blas/gonum/level2float32.go | 2400 +++++++++++++ .../v1/gonum/blas/gonum/level2float64.go | 2366 +++++++++++++ .../v1/gonum/blas/gonum/level3cmplx128.go | 1751 ++++++++++ .../v1/gonum/blas/gonum/level3cmplx64.go | 1771 ++++++++++ .../v1/gonum/blas/gonum/level3float32.go | 925 +++++ .../v1/gonum/blas/gonum/level3float64.go | 913 +++++ vendor/gonum.org/v1/gonum/blas/gonum/sgemm.go | 301 ++ .../v1/gonum/blas/gonum/single_precision.bash | 224 ++ vendor/gonum.org/v1/gonum/floats/README.md | 7 + vendor/gonum.org/v1/gonum/floats/doc.go | 11 + vendor/gonum.org/v1/gonum/floats/floats.go | 808 +++++ .../gonum.org/v1/gonum/floats/scalar/doc.go | 6 + .../v1/gonum/floats/scalar/scalar.go | 171 + .../gonum/internal/asm/c128/axpyinc_amd64.s | 134 + .../gonum/internal/asm/c128/axpyincto_amd64.s | 141 + .../internal/asm/c128/axpyunitary_amd64.s | 122 + .../internal/asm/c128/axpyunitaryto_amd64.s | 123 + .../v1/gonum/internal/asm/c128/doc.go | 6 + .../gonum/internal/asm/c128/dotcinc_amd64.s | 153 + .../internal/asm/c128/dotcunitary_amd64.s | 143 +
.../gonum/internal/asm/c128/dotuinc_amd64.s | 141 + .../internal/asm/c128/dotuunitary_amd64.s | 130 + .../gonum/internal/asm/c128/dscalinc_amd64.s | 69 + .../internal/asm/c128/dscalunitary_amd64.s | 66 + .../v1/gonum/internal/asm/c128/scal.go | 33 + .../internal/asm/c128/scalUnitary_amd64.s | 116 + .../gonum/internal/asm/c128/scalinc_amd64.s | 121 + .../v1/gonum/internal/asm/c128/stubs.go | 180 + .../v1/gonum/internal/asm/c128/stubs_amd64.go | 109 + .../v1/gonum/internal/asm/c128/stubs_noasm.go | 176 + .../v1/gonum/internal/asm/c64/axpyinc_amd64.s | 151 + .../gonum/internal/asm/c64/axpyincto_amd64.s | 156 + .../internal/asm/c64/axpyunitary_amd64.s | 160 + .../internal/asm/c64/axpyunitaryto_amd64.s | 157 + .../v1/gonum/internal/asm/c64/conj.go | 7 + .../v1/gonum/internal/asm/c64/doc.go | 6 + .../v1/gonum/internal/asm/c64/dotcinc_amd64.s | 160 + .../internal/asm/c64/dotcunitary_amd64.s | 208 ++ .../v1/gonum/internal/asm/c64/dotuinc_amd64.s | 148 + .../internal/asm/c64/dotuunitary_amd64.s | 197 ++ .../v1/gonum/internal/asm/c64/scal.go | 85 + .../v1/gonum/internal/asm/c64/stubs.go | 180 + .../v1/gonum/internal/asm/c64/stubs_amd64.go | 77 + .../v1/gonum/internal/asm/c64/stubs_noasm.go | 122 + .../v1/gonum/internal/asm/f32/axpyinc_amd64.s | 73 + .../gonum/internal/asm/f32/axpyincto_amd64.s | 78 + .../internal/asm/f32/axpyunitary_amd64.s | 97 + .../internal/asm/f32/axpyunitaryto_amd64.s | 98 + .../v1/gonum/internal/asm/f32/ddotinc_amd64.s | 91 + .../internal/asm/f32/ddotunitary_amd64.s | 110 + .../v1/gonum/internal/asm/f32/doc.go | 6 + .../v1/gonum/internal/asm/f32/dotinc_amd64.s | 85 + .../gonum/internal/asm/f32/dotunitary_amd64.s | 106 + .../v1/gonum/internal/asm/f32/ge_amd64.go | 18 + .../v1/gonum/internal/asm/f32/ge_amd64.s | 757 +++++ .../v1/gonum/internal/asm/f32/ge_noasm.go | 39 + .../v1/gonum/internal/asm/f32/gemv.go | 92 + .../v1/gonum/internal/asm/f32/l2norm.go | 90 + .../v1/gonum/internal/asm/f32/scal.go | 59 + .../v1/gonum/internal/asm/f32/stubs_amd64.go | 86 + .../v1/gonum/internal/asm/f32/stubs_noasm.go | 137 + .../v1/gonum/internal/asm/f32/sum_amd64.s | 100 + .../v1/gonum/internal/asm/f64/abssum_amd64.s | 82 + .../gonum/internal/asm/f64/abssuminc_amd64.s | 90 + .../v1/gonum/internal/asm/f64/add_amd64.s | 66 + .../gonum/internal/asm/f64/addconst_amd64.s | 53 + .../v1/gonum/internal/asm/f64/axpy.go | 62 + .../v1/gonum/internal/asm/f64/axpyinc_amd64.s | 142 + .../gonum/internal/asm/f64/axpyincto_amd64.s | 148 + .../internal/asm/f64/axpyunitary_amd64.s | 134 + .../internal/asm/f64/axpyunitaryto_amd64.s | 140 + .../v1/gonum/internal/asm/f64/cumprod_amd64.s | 71 + .../v1/gonum/internal/asm/f64/cumsum_amd64.s | 64 + .../v1/gonum/internal/asm/f64/div_amd64.s | 67 + .../v1/gonum/internal/asm/f64/divto_amd64.s | 73 + .../v1/gonum/internal/asm/f64/doc.go | 6 + .../v1/gonum/internal/asm/f64/dot.go | 38 + .../v1/gonum/internal/asm/f64/dot_amd64.s | 145 + .../v1/gonum/internal/asm/f64/ge_amd64.go | 29 + .../v1/gonum/internal/asm/f64/ge_noasm.go | 125 + .../v1/gonum/internal/asm/f64/gemvN_amd64.s | 685 ++++ .../v1/gonum/internal/asm/f64/gemvT_amd64.s | 745 +++++ .../v1/gonum/internal/asm/f64/ger_amd64.s | 591 ++++ .../v1/gonum/internal/asm/f64/l1norm_amd64.s | 58 + .../v1/gonum/internal/asm/f64/l2norm_amd64.s | 109 + .../v1/gonum/internal/asm/f64/l2norm_noasm.go | 93 + .../gonum/internal/asm/f64/l2normdist_amd64.s | 115 + .../gonum/internal/asm/f64/l2norminc_amd64.s | 110 + .../gonum/internal/asm/f64/linfnorm_amd64.s | 57 + .../v1/gonum/internal/asm/f64/scal.go | 62 + 
.../v1/gonum/internal/asm/f64/scalinc_amd64.s | 113 + .../gonum/internal/asm/f64/scalincto_amd64.s | 122 + .../internal/asm/f64/scalunitary_amd64.s | 112 + .../internal/asm/f64/scalunitaryto_amd64.s | 113 + .../v1/gonum/internal/asm/f64/stubs_amd64.go | 277 ++ .../v1/gonum/internal/asm/f64/stubs_noasm.go | 182 + .../v1/gonum/internal/asm/f64/sum_amd64.s | 99 + .../v1/gonum/internal/cmplx64/abs.go | 14 + .../v1/gonum/internal/cmplx64/conj.go | 12 + .../v1/gonum/internal/cmplx64/doc.go | 7 + .../v1/gonum/internal/cmplx64/isinf.go | 25 + .../v1/gonum/internal/cmplx64/isnan.go | 29 + .../v1/gonum/internal/cmplx64/sqrt.go | 108 + .../gonum.org/v1/gonum/internal/math32/doc.go | 7 + .../v1/gonum/internal/math32/math.go | 166 + .../v1/gonum/internal/math32/signbit.go | 16 + .../v1/gonum/internal/math32/sqrt.go | 26 + .../v1/gonum/internal/math32/sqrt_amd64.go | 22 + .../v1/gonum/internal/math32/sqrt_amd64.s | 17 + .../v1/gonum/internal/math32/sqrt_arm64.go | 22 + .../v1/gonum/internal/math32/sqrt_arm64.s | 18 + vendor/gonum.org/v1/gonum/lapack/.gitignore | 0 vendor/gonum.org/v1/gonum/lapack/README.md | 29 + vendor/gonum.org/v1/gonum/lapack/doc.go | 6 + .../gonum.org/v1/gonum/lapack/gonum/dbdsqr.go | 506 +++ .../gonum.org/v1/gonum/lapack/gonum/dgebak.go | 91 + .../gonum.org/v1/gonum/lapack/gonum/dgebal.go | 248 ++ .../gonum.org/v1/gonum/lapack/gonum/dgebd2.go | 88 + .../gonum.org/v1/gonum/lapack/gonum/dgebrd.go | 169 + .../gonum.org/v1/gonum/lapack/gonum/dgecon.go | 106 + .../gonum.org/v1/gonum/lapack/gonum/dgeev.go | 287 ++ .../gonum.org/v1/gonum/lapack/gonum/dgehd2.go | 105 + .../gonum.org/v1/gonum/lapack/gonum/dgehrd.go | 202 ++ .../gonum.org/v1/gonum/lapack/gonum/dgelq2.go | 65 + .../gonum.org/v1/gonum/lapack/gonum/dgelqf.go | 97 + .../gonum.org/v1/gonum/lapack/gonum/dgels.go | 220 ++ .../gonum.org/v1/gonum/lapack/gonum/dgeql2.go | 67 + .../gonum.org/v1/gonum/lapack/gonum/dgeqp3.go | 195 ++ .../gonum.org/v1/gonum/lapack/gonum/dgeqr2.go | 78 + .../gonum.org/v1/gonum/lapack/gonum/dgeqrf.go | 108 + .../gonum.org/v1/gonum/lapack/gonum/dgerq2.go | 74 + .../gonum.org/v1/gonum/lapack/gonum/dgerqf.go | 135 + .../gonum.org/v1/gonum/lapack/gonum/dgesc2.go | 93 + .../gonum.org/v1/gonum/lapack/gonum/dgesv.go | 60 + .../gonum.org/v1/gonum/lapack/gonum/dgesvd.go | 1378 ++++++++ .../gonum.org/v1/gonum/lapack/gonum/dgetc2.go | 125 + .../gonum.org/v1/gonum/lapack/gonum/dgetf2.go | 90 + .../gonum.org/v1/gonum/lapack/gonum/dgetrf.go | 89 + .../gonum.org/v1/gonum/lapack/gonum/dgetri.go | 116 + .../gonum.org/v1/gonum/lapack/gonum/dgetrs.go | 74 + .../gonum.org/v1/gonum/lapack/gonum/dgghrd.go | 125 + .../v1/gonum/lapack/gonum/dggsvd3.go | 258 ++ .../v1/gonum/lapack/gonum/dggsvp3.go | 286 ++ .../gonum.org/v1/gonum/lapack/gonum/dgtsv.go | 101 + .../gonum.org/v1/gonum/lapack/gonum/dhseqr.go | 272 ++ .../gonum.org/v1/gonum/lapack/gonum/dlabrd.go | 183 + .../gonum.org/v1/gonum/lapack/gonum/dlacn2.go | 136 + .../gonum.org/v1/gonum/lapack/gonum/dlacpy.go | 59 + .../gonum.org/v1/gonum/lapack/gonum/dlae2.go | 51 + .../gonum.org/v1/gonum/lapack/gonum/dlaev2.go | 85 + .../gonum.org/v1/gonum/lapack/gonum/dlaexc.go | 269 ++ .../gonum.org/v1/gonum/lapack/gonum/dlag2.go | 237 ++ .../gonum.org/v1/gonum/lapack/gonum/dlags2.go | 186 ++ .../gonum.org/v1/gonum/lapack/gonum/dlagtm.go | 111 + .../gonum.org/v1/gonum/lapack/gonum/dlahqr.go | 449 +++ .../gonum.org/v1/gonum/lapack/gonum/dlahr2.go | 202 ++ .../gonum.org/v1/gonum/lapack/gonum/dlaln2.go | 407 +++ .../gonum.org/v1/gonum/lapack/gonum/dlangb.go | 87 + 
.../gonum.org/v1/gonum/lapack/gonum/dlange.go | 89 + .../gonum.org/v1/gonum/lapack/gonum/dlangt.go | 115 + .../gonum.org/v1/gonum/lapack/gonum/dlanhs.go | 78 + .../gonum.org/v1/gonum/lapack/gonum/dlansb.go | 131 + .../gonum.org/v1/gonum/lapack/gonum/dlanst.go | 75 + .../gonum.org/v1/gonum/lapack/gonum/dlansy.go | 125 + .../gonum.org/v1/gonum/lapack/gonum/dlantb.go | 209 ++ .../gonum.org/v1/gonum/lapack/gonum/dlantr.go | 252 ++ .../gonum.org/v1/gonum/lapack/gonum/dlanv2.go | 151 + .../gonum.org/v1/gonum/lapack/gonum/dlapll.go | 55 + .../gonum.org/v1/gonum/lapack/gonum/dlapmr.go | 88 + .../gonum.org/v1/gonum/lapack/gonum/dlapmt.go | 89 + .../gonum.org/v1/gonum/lapack/gonum/dlapy2.go | 14 + .../gonum.org/v1/gonum/lapack/gonum/dlaqp2.go | 127 + .../gonum.org/v1/gonum/lapack/gonum/dlaqps.go | 244 ++ .../v1/gonum/lapack/gonum/dlaqr04.go | 493 +++ .../gonum.org/v1/gonum/lapack/gonum/dlaqr1.go | 61 + .../v1/gonum/lapack/gonum/dlaqr23.go | 423 +++ .../gonum.org/v1/gonum/lapack/gonum/dlaqr5.go | 560 ++++ .../gonum.org/v1/gonum/lapack/gonum/dlarf.go | 102 + .../gonum.org/v1/gonum/lapack/gonum/dlarfb.go | 461 +++ .../gonum.org/v1/gonum/lapack/gonum/dlarfg.go | 75 + .../gonum.org/v1/gonum/lapack/gonum/dlarft.go | 169 + .../gonum.org/v1/gonum/lapack/gonum/dlarfx.go | 552 +++ .../gonum.org/v1/gonum/lapack/gonum/dlartg.go | 73 + .../gonum.org/v1/gonum/lapack/gonum/dlas2.go | 45 + .../gonum.org/v1/gonum/lapack/gonum/dlascl.go | 111 + .../gonum.org/v1/gonum/lapack/gonum/dlaset.go | 58 + .../gonum.org/v1/gonum/lapack/gonum/dlasq1.go | 100 + .../gonum.org/v1/gonum/lapack/gonum/dlasq2.go | 370 ++ .../gonum.org/v1/gonum/lapack/gonum/dlasq3.go | 172 + .../gonum.org/v1/gonum/lapack/gonum/dlasq4.go | 249 ++ .../gonum.org/v1/gonum/lapack/gonum/dlasq5.go | 140 + .../gonum.org/v1/gonum/lapack/gonum/dlasq6.go | 118 + .../gonum.org/v1/gonum/lapack/gonum/dlasr.go | 287 ++ .../gonum.org/v1/gonum/lapack/gonum/dlasrt.go | 36 + .../gonum.org/v1/gonum/lapack/gonum/dlassq.go | 131 + .../gonum.org/v1/gonum/lapack/gonum/dlasv2.go | 117 + .../gonum.org/v1/gonum/lapack/gonum/dlaswp.go | 58 + .../gonum.org/v1/gonum/lapack/gonum/dlasy2.go | 292 ++ .../gonum.org/v1/gonum/lapack/gonum/dlatbs.go | 454 +++ .../gonum.org/v1/gonum/lapack/gonum/dlatdf.go | 175 + .../gonum.org/v1/gonum/lapack/gonum/dlatrd.go | 176 + .../gonum.org/v1/gonum/lapack/gonum/dlatrs.go | 410 +++ .../gonum.org/v1/gonum/lapack/gonum/dlauu2.go | 66 + .../gonum.org/v1/gonum/lapack/gonum/dlauum.go | 83 + vendor/gonum.org/v1/gonum/lapack/gonum/doc.go | 28 + .../gonum.org/v1/gonum/lapack/gonum/dorg2l.go | 78 + .../gonum.org/v1/gonum/lapack/gonum/dorg2r.go | 77 + .../gonum.org/v1/gonum/lapack/gonum/dorgbr.go | 138 + .../gonum.org/v1/gonum/lapack/gonum/dorghr.go | 103 + .../gonum.org/v1/gonum/lapack/gonum/dorgl2.go | 79 + .../gonum.org/v1/gonum/lapack/gonum/dorglq.go | 125 + .../gonum.org/v1/gonum/lapack/gonum/dorgql.go | 139 + .../gonum.org/v1/gonum/lapack/gonum/dorgqr.go | 136 + .../gonum.org/v1/gonum/lapack/gonum/dorgr2.go | 83 + .../gonum.org/v1/gonum/lapack/gonum/dorgtr.go | 106 + .../gonum.org/v1/gonum/lapack/gonum/dorm2r.go | 103 + .../gonum.org/v1/gonum/lapack/gonum/dormbr.go | 180 + .../gonum.org/v1/gonum/lapack/gonum/dormhr.go | 134 + .../gonum.org/v1/gonum/lapack/gonum/dorml2.go | 104 + .../gonum.org/v1/gonum/lapack/gonum/dormlq.go | 176 + .../gonum.org/v1/gonum/lapack/gonum/dormqr.go | 180 + .../gonum.org/v1/gonum/lapack/gonum/dormr2.go | 105 + .../gonum.org/v1/gonum/lapack/gonum/dpbcon.go | 111 + .../gonum.org/v1/gonum/lapack/gonum/dpbtf2.go | 114 + 
.../gonum.org/v1/gonum/lapack/gonum/dpbtrf.go | 216 ++ .../gonum.org/v1/gonum/lapack/gonum/dpbtrs.go | 69 + .../gonum.org/v1/gonum/lapack/gonum/dpocon.go | 90 + .../gonum.org/v1/gonum/lapack/gonum/dpotf2.go | 82 + .../gonum.org/v1/gonum/lapack/gonum/dpotrf.go | 81 + .../gonum.org/v1/gonum/lapack/gonum/dpotri.go | 44 + .../gonum.org/v1/gonum/lapack/gonum/dpotrs.go | 64 + .../gonum.org/v1/gonum/lapack/gonum/dpstf2.go | 202 ++ .../gonum.org/v1/gonum/lapack/gonum/dpstrf.go | 233 ++ .../gonum.org/v1/gonum/lapack/gonum/dptcon.go | 99 + .../gonum.org/v1/gonum/lapack/gonum/dptsv.go | 49 + .../gonum.org/v1/gonum/lapack/gonum/dpttrf.go | 80 + .../gonum.org/v1/gonum/lapack/gonum/dpttrs.go | 51 + .../gonum.org/v1/gonum/lapack/gonum/dptts2.go | 39 + .../gonum.org/v1/gonum/lapack/gonum/drscl.go | 63 + .../gonum.org/v1/gonum/lapack/gonum/dsteqr.go | 376 +++ .../gonum.org/v1/gonum/lapack/gonum/dsterf.go | 285 ++ .../gonum.org/v1/gonum/lapack/gonum/dsyev.go | 130 + .../gonum.org/v1/gonum/lapack/gonum/dsytd2.go | 147 + .../gonum.org/v1/gonum/lapack/gonum/dsytrd.go | 184 + .../gonum.org/v1/gonum/lapack/gonum/dtbtrs.go | 77 + .../gonum.org/v1/gonum/lapack/gonum/dtgsja.go | 389 +++ .../gonum.org/v1/gonum/lapack/gonum/dtrcon.go | 90 + .../v1/gonum/lapack/gonum/dtrevc3.go | 894 +++++ .../gonum.org/v1/gonum/lapack/gonum/dtrexc.go | 230 ++ .../gonum.org/v1/gonum/lapack/gonum/dtrti2.go | 69 + .../gonum.org/v1/gonum/lapack/gonum/dtrtri.go | 72 + .../gonum.org/v1/gonum/lapack/gonum/dtrtrs.go | 55 + .../gonum.org/v1/gonum/lapack/gonum/errors.go | 183 + .../gonum.org/v1/gonum/lapack/gonum/iladlc.go | 45 + .../gonum.org/v1/gonum/lapack/gonum/iladlr.go | 41 + .../gonum.org/v1/gonum/lapack/gonum/ilaenv.go | 395 +++ .../gonum.org/v1/gonum/lapack/gonum/iparmq.go | 117 + .../gonum.org/v1/gonum/lapack/gonum/lapack.go | 64 + vendor/gonum.org/v1/gonum/lapack/lapack.go | 240 ++ .../gonum.org/v1/gonum/lapack/lapack64/doc.go | 20 + .../v1/gonum/lapack/lapack64/lapack64.go | 908 +++++ vendor/gonum.org/v1/gonum/mat/README.md | 6 + vendor/gonum.org/v1/gonum/mat/band.go | 368 ++ vendor/gonum.org/v1/gonum/mat/cdense.go | 368 ++ vendor/gonum.org/v1/gonum/mat/cholesky.go | 1203 +++++++ vendor/gonum.org/v1/gonum/mat/cmatrix.go | 314 ++ vendor/gonum.org/v1/gonum/mat/consts.go | 15 + vendor/gonum.org/v1/gonum/mat/dense.go | 670 ++++ .../v1/gonum/mat/dense_arithmetic.go | 877 +++++ vendor/gonum.org/v1/gonum/mat/diagonal.go | 342 ++ vendor/gonum.org/v1/gonum/mat/doc.go | 200 ++ vendor/gonum.org/v1/gonum/mat/eigen.go | 450 +++ vendor/gonum.org/v1/gonum/mat/errors.go | 154 + vendor/gonum.org/v1/gonum/mat/format.go | 516 +++ vendor/gonum.org/v1/gonum/mat/gsvd.go | 436 +++ vendor/gonum.org/v1/gonum/mat/hogsvd.go | 239 ++ .../v1/gonum/mat/index_bound_checks.go | 398 +++ .../v1/gonum/mat/index_no_bound_checks.go | 400 +++ vendor/gonum.org/v1/gonum/mat/inner.go | 126 + vendor/gonum.org/v1/gonum/mat/io.go | 495 +++ vendor/gonum.org/v1/gonum/mat/lq.go | 305 ++ vendor/gonum.org/v1/gonum/mat/lu.go | 487 +++ vendor/gonum.org/v1/gonum/mat/matrix.go | 1000 ++++++ vendor/gonum.org/v1/gonum/mat/offset.go | 32 + .../v1/gonum/mat/offset_appengine.go | 40 + vendor/gonum.org/v1/gonum/mat/pool.go | 260 ++ vendor/gonum.org/v1/gonum/mat/product.go | 193 ++ vendor/gonum.org/v1/gonum/mat/qr.go | 349 ++ vendor/gonum.org/v1/gonum/mat/shadow.go | 243 ++ .../gonum.org/v1/gonum/mat/shadow_common.go | 54 + .../gonum.org/v1/gonum/mat/shadow_complex.go | 72 + vendor/gonum.org/v1/gonum/mat/solve.go | 124 + vendor/gonum.org/v1/gonum/mat/svd.go | 425 +++ 
vendor/gonum.org/v1/gonum/mat/symband.go | 312 ++ vendor/gonum.org/v1/gonum/mat/symmetric.go | 698 ++++ vendor/gonum.org/v1/gonum/mat/triangular.go | 832 +++++ vendor/gonum.org/v1/gonum/mat/triband.go | 694 ++++ vendor/gonum.org/v1/gonum/mat/tridiag.go | 417 +++ vendor/gonum.org/v1/gonum/mat/vector.go | 855 +++++ vendor/gonum.org/v1/gonum/mathext/README.md | 6 + vendor/gonum.org/v1/gonum/mathext/airy.go | 41 + vendor/gonum.org/v1/gonum/mathext/beta.go | 40 + vendor/gonum.org/v1/gonum/mathext/betainc.go | 33 + vendor/gonum.org/v1/gonum/mathext/digamma.go | 45 + vendor/gonum.org/v1/gonum/mathext/doc.go | 7 + .../gonum.org/v1/gonum/mathext/ell_carlson.go | 168 + .../v1/gonum/mathext/ell_complete.go | 355 ++ vendor/gonum.org/v1/gonum/mathext/erf.go | 91 + .../gonum.org/v1/gonum/mathext/gamma_inc.go | 58 + .../v1/gonum/mathext/gamma_inc_inv.go | 58 + .../v1/gonum/mathext/internal/amos/amos.go | 2136 ++++++++++++ .../v1/gonum/mathext/internal/amos/doc.go | 6 + .../mathext/internal/amos/staticcheck.conf | 1 + .../gonum/mathext/internal/cephes/cephes.go | 28 + .../v1/gonum/mathext/internal/cephes/doc.go | 6 + .../v1/gonum/mathext/internal/cephes/igam.go | 320 ++ .../v1/gonum/mathext/internal/cephes/igami.go | 155 + .../gonum/mathext/internal/cephes/incbeta.go | 312 ++ .../v1/gonum/mathext/internal/cephes/incbi.go | 247 ++ .../gonum/mathext/internal/cephes/lanczos.go | 153 + .../v1/gonum/mathext/internal/cephes/ndtri.go | 150 + .../gonum/mathext/internal/cephes/polevl.go | 84 + .../mathext/internal/cephes/staticcheck.conf | 1 + .../v1/gonum/mathext/internal/cephes/unity.go | 184 + .../v1/gonum/mathext/internal/cephes/zeta.go | 117 + .../v1/gonum/mathext/internal/gonum/beta.go | 58 + .../v1/gonum/mathext/internal/gonum/doc.go | 7 + .../v1/gonum/mathext/internal/gonum/gonum.go | 5 + vendor/gonum.org/v1/gonum/mathext/mvgamma.go | 32 + vendor/gonum.org/v1/gonum/mathext/roots.go | 181 + vendor/gonum.org/v1/gonum/mathext/zeta.go | 22 + vendor/gonum.org/v1/gonum/optimize/README.md | 6 + .../v1/gonum/optimize/backtracking.go | 84 + vendor/gonum.org/v1/gonum/optimize/bfgs.go | 192 ++ .../gonum.org/v1/gonum/optimize/bisection.go | 146 + vendor/gonum.org/v1/gonum/optimize/cg.go | 368 ++ vendor/gonum.org/v1/gonum/optimize/cmaes.go | 468 +++ vendor/gonum.org/v1/gonum/optimize/doc.go | 6 + vendor/gonum.org/v1/gonum/optimize/errors.go | 78 + .../v1/gonum/optimize/functionconvergence.go | 85 + .../v1/gonum/optimize/gradientdescent.go | 95 + .../v1/gonum/optimize/guessandcheck.go | 92 + .../gonum.org/v1/gonum/optimize/interfaces.go | 132 + vendor/gonum.org/v1/gonum/optimize/lbfgs.go | 199 ++ .../gonum.org/v1/gonum/optimize/linesearch.go | 218 ++ .../gonum.org/v1/gonum/optimize/listsearch.go | 123 + vendor/gonum.org/v1/gonum/optimize/local.go | 146 + .../gonum.org/v1/gonum/optimize/minimize.go | 595 ++++ .../v1/gonum/optimize/morethuente.go | 387 +++ .../gonum.org/v1/gonum/optimize/neldermead.go | 348 ++ vendor/gonum.org/v1/gonum/optimize/newton.go | 182 + vendor/gonum.org/v1/gonum/optimize/printer.go | 108 + .../gonum.org/v1/gonum/optimize/stepsizers.go | 194 ++ .../v1/gonum/optimize/termination.go | 123 + vendor/gonum.org/v1/gonum/optimize/types.go | 273 ++ vendor/gonum.org/v1/gonum/spatial/r1/doc.go | 6 + .../gonum.org/v1/gonum/spatial/r1/interval.go | 10 + vendor/gonum.org/v1/gonum/stat/README.md | 6 + .../gonum.org/v1/gonum/stat/combin/combin.go | 683 ++++ vendor/gonum.org/v1/gonum/stat/combin/doc.go | 7 + .../v1/gonum/stat/distmv/dirichlet.go | 149 + .../gonum.org/v1/gonum/stat/distmv/distmv.go | 28 + 
vendor/gonum.org/v1/gonum/stat/distmv/doc.go | 6 + .../v1/gonum/stat/distmv/interfaces.go | 35 + .../gonum.org/v1/gonum/stat/distmv/normal.go | 524 +++ .../v1/gonum/stat/distmv/statdist.go | 390 +++ .../v1/gonum/stat/distmv/studentst.go | 362 ++ .../gonum.org/v1/gonum/stat/distmv/uniform.go | 200 ++ .../v1/gonum/stat/distuv/alphastable.go | 112 + .../v1/gonum/stat/distuv/bernoulli.go | 140 + vendor/gonum.org/v1/gonum/stat/distuv/beta.go | 151 + .../v1/gonum/stat/distuv/binomial.go | 189 ++ .../v1/gonum/stat/distuv/categorical.go | 184 + vendor/gonum.org/v1/gonum/stat/distuv/chi.go | 124 + .../v1/gonum/stat/distuv/chisquared.go | 101 + .../v1/gonum/stat/distuv/constants.go | 28 + vendor/gonum.org/v1/gonum/stat/distuv/doc.go | 6 + .../v1/gonum/stat/distuv/exponential.go | 266 ++ vendor/gonum.org/v1/gonum/stat/distuv/f.go | 134 + .../gonum.org/v1/gonum/stat/distuv/gamma.go | 203 ++ .../gonum.org/v1/gonum/stat/distuv/general.go | 24 + .../gonum.org/v1/gonum/stat/distuv/gumbel.go | 118 + .../v1/gonum/stat/distuv/interfaces.go | 32 + .../v1/gonum/stat/distuv/inversegamma.go | 123 + .../gonum.org/v1/gonum/stat/distuv/laplace.go | 267 ++ .../v1/gonum/stat/distuv/logistic.go | 98 + .../v1/gonum/stat/distuv/lognormal.go | 113 + vendor/gonum.org/v1/gonum/stat/distuv/norm.go | 263 ++ .../gonum.org/v1/gonum/stat/distuv/pareto.go | 130 + .../gonum.org/v1/gonum/stat/distuv/poisson.go | 144 + .../v1/gonum/stat/distuv/statdist.go | 142 + .../v1/gonum/stat/distuv/studentst.go | 161 + .../v1/gonum/stat/distuv/triangle.go | 278 ++ .../gonum.org/v1/gonum/stat/distuv/uniform.go | 210 ++ .../gonum.org/v1/gonum/stat/distuv/weibull.go | 231 ++ vendor/gonum.org/v1/gonum/stat/doc.go | 6 + vendor/gonum.org/v1/gonum/stat/pca_cca.go | 317 ++ vendor/gonum.org/v1/gonum/stat/roc.go | 198 ++ vendor/gonum.org/v1/gonum/stat/stat.go | 1400 ++++++++ vendor/gonum.org/v1/gonum/stat/statmat.go | 142 + vendor/modules.txt | 29 + 436 files changed, 95292 insertions(+), 5 deletions(-) create mode 100644 pkg/performanceprofile/profilecreator/autosize/autosize.go create mode 100644 vendor/golang.org/x/tools/container/intsets/sparse.go create mode 100644 vendor/gonum.org/v1/gonum/AUTHORS create mode 100644 vendor/gonum.org/v1/gonum/CONTRIBUTORS create mode 100644 vendor/gonum.org/v1/gonum/LICENSE create mode 100644 vendor/gonum.org/v1/gonum/blas/README.md create mode 100644 vendor/gonum.org/v1/gonum/blas/blas.go create mode 100644 vendor/gonum.org/v1/gonum/blas/blas64/blas64.go create mode 100644 vendor/gonum.org/v1/gonum/blas/blas64/conv.go create mode 100644 vendor/gonum.org/v1/gonum/blas/blas64/conv_symmetric.go create mode 100644 vendor/gonum.org/v1/gonum/blas/blas64/doc.go create mode 100644 vendor/gonum.org/v1/gonum/blas/cblas128/cblas128.go create mode 100644 vendor/gonum.org/v1/gonum/blas/cblas128/conv.go create mode 100644 vendor/gonum.org/v1/gonum/blas/cblas128/conv_hermitian.go create mode 100644 vendor/gonum.org/v1/gonum/blas/cblas128/conv_symmetric.go create mode 100644 vendor/gonum.org/v1/gonum/blas/cblas128/doc.go create mode 100644 vendor/gonum.org/v1/gonum/blas/conversions.bash create mode 100644 vendor/gonum.org/v1/gonum/blas/doc.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/dgemm.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/doc.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/errors.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/gonum.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx128.go create mode 100644 
vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx64.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/level1float32.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/level1float32_dsdot.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdot.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdsdot.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/level1float64.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/level1float64_ddot.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx128.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx64.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/level2float32.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/level2float64.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx128.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx64.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/level3float32.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/level3float64.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/sgemm.go create mode 100644 vendor/gonum.org/v1/gonum/blas/gonum/single_precision.bash create mode 100644 vendor/gonum.org/v1/gonum/floats/README.md create mode 100644 vendor/gonum.org/v1/gonum/floats/doc.go create mode 100644 vendor/gonum.org/v1/gonum/floats/floats.go create mode 100644 vendor/gonum.org/v1/gonum/floats/scalar/doc.go create mode 100644 vendor/gonum.org/v1/gonum/floats/scalar/scalar.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c128/axpyinc_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c128/axpyincto_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c128/axpyunitary_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c128/axpyunitaryto_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c128/doc.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c128/dotcinc_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c128/dotcunitary_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c128/dotuinc_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c128/dotuunitary_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c128/dscalinc_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c128/dscalunitary_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c128/scal.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c128/scalUnitary_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c128/scalinc_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c128/stubs.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c128/stubs_amd64.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c128/stubs_noasm.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c64/axpyinc_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c64/axpyincto_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c64/axpyunitary_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c64/axpyunitaryto_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c64/conj.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c64/doc.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c64/dotcinc_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c64/dotcunitary_amd64.s create mode 100644 
vendor/gonum.org/v1/gonum/internal/asm/c64/dotuinc_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c64/dotuunitary_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c64/scal.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c64/stubs.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c64/stubs_amd64.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/c64/stubs_noasm.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f32/axpyinc_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f32/axpyincto_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitary_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitaryto_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f32/ddotinc_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f32/ddotunitary_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f32/doc.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f32/dotinc_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f32/dotunitary_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f32/ge_noasm.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f32/gemv.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f32/l2norm.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f32/scal.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_amd64.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_noasm.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f32/sum_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/abssum_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/abssuminc_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/add_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/addconst_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/axpy.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/axpyinc_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/axpyincto_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/axpyunitary_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/axpyunitaryto_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/cumprod_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/cumsum_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/div_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/divto_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/doc.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/dot.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/dot_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/ge_amd64.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/ge_noasm.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/gemvN_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/gemvT_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/ger_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/l1norm_amd64.s create mode 100644 
vendor/gonum.org/v1/gonum/internal/asm/f64/l2norm_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/l2norm_noasm.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/l2normdist_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/l2norminc_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/linfnorm_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/scal.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/scalinc_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/scalincto_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/scalunitary_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/scalunitaryto_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/stubs_amd64.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/stubs_noasm.go create mode 100644 vendor/gonum.org/v1/gonum/internal/asm/f64/sum_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/cmplx64/abs.go create mode 100644 vendor/gonum.org/v1/gonum/internal/cmplx64/conj.go create mode 100644 vendor/gonum.org/v1/gonum/internal/cmplx64/doc.go create mode 100644 vendor/gonum.org/v1/gonum/internal/cmplx64/isinf.go create mode 100644 vendor/gonum.org/v1/gonum/internal/cmplx64/isnan.go create mode 100644 vendor/gonum.org/v1/gonum/internal/cmplx64/sqrt.go create mode 100644 vendor/gonum.org/v1/gonum/internal/math32/doc.go create mode 100644 vendor/gonum.org/v1/gonum/internal/math32/math.go create mode 100644 vendor/gonum.org/v1/gonum/internal/math32/signbit.go create mode 100644 vendor/gonum.org/v1/gonum/internal/math32/sqrt.go create mode 100644 vendor/gonum.org/v1/gonum/internal/math32/sqrt_amd64.go create mode 100644 vendor/gonum.org/v1/gonum/internal/math32/sqrt_amd64.s create mode 100644 vendor/gonum.org/v1/gonum/internal/math32/sqrt_arm64.go create mode 100644 vendor/gonum.org/v1/gonum/internal/math32/sqrt_arm64.s create mode 100644 vendor/gonum.org/v1/gonum/lapack/.gitignore create mode 100644 vendor/gonum.org/v1/gonum/lapack/README.md create mode 100644 vendor/gonum.org/v1/gonum/lapack/doc.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dbdsqr.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgebak.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgebal.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgebd2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgebrd.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgecon.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgeev.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgehd2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgehrd.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgelq2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgelqf.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgels.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgeql2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgeqp3.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgeqr2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgeqrf.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgerq2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgerqf.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgesc2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgesv.go create mode 100644 
vendor/gonum.org/v1/gonum/lapack/gonum/dgesvd.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgetc2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgetf2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgetrf.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgetri.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgetrs.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgghrd.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dggsvd3.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dggsvp3.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dgtsv.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dhseqr.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlabrd.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlacn2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlacpy.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlae2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlaev2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlaexc.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlag2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlags2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlagtm.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlahqr.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlahr2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlaln2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlangb.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlange.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlangt.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlanhs.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlansb.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlanst.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlansy.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlantb.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlantr.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlanv2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlapll.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlapmr.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlapmt.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlapy2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlaqp2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlaqps.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr04.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr1.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr23.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr5.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlarf.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlarfb.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlarfg.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlarft.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlarfx.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlartg.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlas2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlascl.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlaset.go create mode 100644 
vendor/gonum.org/v1/gonum/lapack/gonum/dlasq1.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlasq2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlasq3.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlasq4.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlasq5.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlasq6.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlasr.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlasrt.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlassq.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlasv2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlaswp.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlasy2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlatbs.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlatdf.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlatrd.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlatrs.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlauu2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dlauum.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/doc.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dorg2l.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dorg2r.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dorgbr.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dorghr.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dorgl2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dorglq.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dorgql.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dorgqr.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dorgr2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dorgtr.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dorm2r.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dormbr.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dormhr.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dorml2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dormlq.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dormqr.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dormr2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dpbcon.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dpbtf2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dpbtrf.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dpbtrs.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dpocon.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dpotf2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dpotrf.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dpotri.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dpotrs.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dpstf2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dpstrf.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dptcon.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dptsv.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dpttrf.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dpttrs.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dptts2.go create mode 100644 
vendor/gonum.org/v1/gonum/lapack/gonum/drscl.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dsteqr.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dsterf.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dsyev.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dsytd2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dsytrd.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dtbtrs.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dtgsja.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dtrcon.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dtrevc3.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dtrexc.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dtrti2.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dtrtri.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/dtrtrs.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/errors.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/iladlc.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/iladlr.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/ilaenv.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/iparmq.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/gonum/lapack.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/lapack.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/lapack64/doc.go create mode 100644 vendor/gonum.org/v1/gonum/lapack/lapack64/lapack64.go create mode 100644 vendor/gonum.org/v1/gonum/mat/README.md create mode 100644 vendor/gonum.org/v1/gonum/mat/band.go create mode 100644 vendor/gonum.org/v1/gonum/mat/cdense.go create mode 100644 vendor/gonum.org/v1/gonum/mat/cholesky.go create mode 100644 vendor/gonum.org/v1/gonum/mat/cmatrix.go create mode 100644 vendor/gonum.org/v1/gonum/mat/consts.go create mode 100644 vendor/gonum.org/v1/gonum/mat/dense.go create mode 100644 vendor/gonum.org/v1/gonum/mat/dense_arithmetic.go create mode 100644 vendor/gonum.org/v1/gonum/mat/diagonal.go create mode 100644 vendor/gonum.org/v1/gonum/mat/doc.go create mode 100644 vendor/gonum.org/v1/gonum/mat/eigen.go create mode 100644 vendor/gonum.org/v1/gonum/mat/errors.go create mode 100644 vendor/gonum.org/v1/gonum/mat/format.go create mode 100644 vendor/gonum.org/v1/gonum/mat/gsvd.go create mode 100644 vendor/gonum.org/v1/gonum/mat/hogsvd.go create mode 100644 vendor/gonum.org/v1/gonum/mat/index_bound_checks.go create mode 100644 vendor/gonum.org/v1/gonum/mat/index_no_bound_checks.go create mode 100644 vendor/gonum.org/v1/gonum/mat/inner.go create mode 100644 vendor/gonum.org/v1/gonum/mat/io.go create mode 100644 vendor/gonum.org/v1/gonum/mat/lq.go create mode 100644 vendor/gonum.org/v1/gonum/mat/lu.go create mode 100644 vendor/gonum.org/v1/gonum/mat/matrix.go create mode 100644 vendor/gonum.org/v1/gonum/mat/offset.go create mode 100644 vendor/gonum.org/v1/gonum/mat/offset_appengine.go create mode 100644 vendor/gonum.org/v1/gonum/mat/pool.go create mode 100644 vendor/gonum.org/v1/gonum/mat/product.go create mode 100644 vendor/gonum.org/v1/gonum/mat/qr.go create mode 100644 vendor/gonum.org/v1/gonum/mat/shadow.go create mode 100644 vendor/gonum.org/v1/gonum/mat/shadow_common.go create mode 100644 vendor/gonum.org/v1/gonum/mat/shadow_complex.go create mode 100644 vendor/gonum.org/v1/gonum/mat/solve.go create mode 100644 vendor/gonum.org/v1/gonum/mat/svd.go create mode 100644 vendor/gonum.org/v1/gonum/mat/symband.go create mode 100644 
vendor/gonum.org/v1/gonum/mat/symmetric.go create mode 100644 vendor/gonum.org/v1/gonum/mat/triangular.go create mode 100644 vendor/gonum.org/v1/gonum/mat/triband.go create mode 100644 vendor/gonum.org/v1/gonum/mat/tridiag.go create mode 100644 vendor/gonum.org/v1/gonum/mat/vector.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/README.md create mode 100644 vendor/gonum.org/v1/gonum/mathext/airy.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/beta.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/betainc.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/digamma.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/doc.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/ell_carlson.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/ell_complete.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/erf.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/gamma_inc.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/gamma_inc_inv.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/internal/amos/amos.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/internal/amos/doc.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/internal/amos/staticcheck.conf create mode 100644 vendor/gonum.org/v1/gonum/mathext/internal/cephes/cephes.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/internal/cephes/doc.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/internal/cephes/igam.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/internal/cephes/igami.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/internal/cephes/incbeta.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/internal/cephes/incbi.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/internal/cephes/lanczos.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/internal/cephes/ndtri.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/internal/cephes/polevl.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/internal/cephes/staticcheck.conf create mode 100644 vendor/gonum.org/v1/gonum/mathext/internal/cephes/unity.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/internal/cephes/zeta.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/internal/gonum/beta.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/internal/gonum/doc.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/internal/gonum/gonum.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/mvgamma.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/roots.go create mode 100644 vendor/gonum.org/v1/gonum/mathext/zeta.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/README.md create mode 100644 vendor/gonum.org/v1/gonum/optimize/backtracking.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/bfgs.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/bisection.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/cg.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/cmaes.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/doc.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/errors.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/functionconvergence.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/gradientdescent.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/guessandcheck.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/interfaces.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/lbfgs.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/linesearch.go create mode 100644 
vendor/gonum.org/v1/gonum/optimize/listsearch.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/local.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/minimize.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/morethuente.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/neldermead.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/newton.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/printer.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/stepsizers.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/termination.go create mode 100644 vendor/gonum.org/v1/gonum/optimize/types.go create mode 100644 vendor/gonum.org/v1/gonum/spatial/r1/doc.go create mode 100644 vendor/gonum.org/v1/gonum/spatial/r1/interval.go create mode 100644 vendor/gonum.org/v1/gonum/stat/README.md create mode 100644 vendor/gonum.org/v1/gonum/stat/combin/combin.go create mode 100644 vendor/gonum.org/v1/gonum/stat/combin/doc.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distmv/dirichlet.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distmv/distmv.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distmv/doc.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distmv/interfaces.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distmv/normal.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distmv/statdist.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distmv/studentst.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distmv/uniform.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/alphastable.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/bernoulli.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/beta.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/binomial.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/categorical.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/chi.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/chisquared.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/constants.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/doc.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/exponential.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/f.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/gamma.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/general.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/gumbel.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/interfaces.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/inversegamma.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/laplace.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/logistic.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/lognormal.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/norm.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/pareto.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/poisson.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/statdist.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/studentst.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/triangle.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/uniform.go create mode 100644 vendor/gonum.org/v1/gonum/stat/distuv/weibull.go create mode 100644 vendor/gonum.org/v1/gonum/stat/doc.go create mode 100644 vendor/gonum.org/v1/gonum/stat/pca_cca.go create mode 100644 vendor/gonum.org/v1/gonum/stat/roc.go create mode 100644 
vendor/gonum.org/v1/gonum/stat/stat.go
 create mode 100644 vendor/gonum.org/v1/gonum/stat/statmat.go

diff --git a/go.mod b/go.mod
index 2c289bb96b..d669946d2d 100644
--- a/go.mod
+++ b/go.mod
@@ -28,6 +28,7 @@ require (
 	github.com/prometheus/client_golang v1.21.1
 	github.com/spf13/cobra v1.9.1
 	github.com/spf13/pflag v1.0.6
+	gonum.org/v1/gonum v0.16.0
 	gopkg.in/fsnotify.v1 v1.4.7
 	gopkg.in/ini.v1 v1.67.0
 	gopkg.in/yaml.v2 v2.4.0
diff --git a/go.sum b/go.sum
index ea705f8a4a..38a9b9d2be 100644
--- a/go.sum
+++ b/go.sum
@@ -824,6 +824,8 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T
 golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw=
 gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY=
+gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
+gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
 google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
 google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M=
 google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg=
diff --git a/pkg/performanceprofile/profilecreator/autosize/autosize.go b/pkg/performanceprofile/profilecreator/autosize/autosize.go
new file mode 100644
index 0000000000..5be9800ec6
--- /dev/null
+++ b/pkg/performanceprofile/profilecreator/autosize/autosize.go
@@ -0,0 +1,246 @@
+package autosize
+
+import (
+	"errors"
+	"fmt"
+	"log"
+	"math"
+
+	"gonum.org/v1/gonum/optimize"
+
+	"github.com/openshift/cluster-node-tuning-operator/pkg/performanceprofile/profilecreator"
+)
+
+// Assumptions:
+// 1. All the machines in the node pool have identical HW specs and need identical sizing.
+// 2. We cannot distinguish between infra/OS CPU requirements and control plane CPU requirements,
+//    so we conflate the two costs in the latter.
+//
+// Definitions:
+// x_c: CPUs for the control plane - includes x_i: CPUs for OS/Infra
+// x_w: CPUs for the workload
+// Tc:  Total available CPUs (includes OS/Infra)
+//
+// Hard Constraints:
+// x_c, x_w are integers because we need to dedicate full cores
+// x_c, x_w >= 0
+// x_c + x_w <= Tc
+// x_c >= req(x_w) // control plane and infra cost is a function of the expected workload
+//
+// Objective:
+// We want to maximize x_w, or, equivalently, minimize x_c.
+
+const (
+	defaultPenaltyWeight                 float64 = 100.0
+	defaultReservedRatioInitial          float64 = 0.0625 // 1/16, determined empirically. Use only as the initial value.
+	defaultReservedRatioMax              float64 = 0.25   // 1/4, determined empirically. This is the practical upper bound.
+	defaultControlPlaneWorkloadCoreRatio float64 = 0.075  // TODO: how much control plane/infra power do we need to support the workload?
+)
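Restated compactly, the model in the comment above is a small integer program. Writing c_0 for the DefaultControlPlaneCores() baseline and R for defaultControlPlaneWorkloadCoreRatio (both labels are shorthand for this note, not identifiers from the patch):

\[
\begin{aligned}
\max_{x_c,\, x_w \,\in\, \mathbb{Z}_{\ge 0}} \quad & x_w \\
\text{s.t.} \quad & x_c + x_w \le T_c \\
& x_c \ge \mathrm{req}(x_w) = c_0 + R\, x_w
\end{aligned}
\]

Since gonum/optimize supports neither integrality nor bound constraints, the code below relaxes x_c and x_w to reals, folds each constraint into a quadratic penalty (see objective), and restores integrality afterwards by rounding and snapping to the SMT level.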
+) + +var ( + ErrUnderallocatedControlPlane = errors.New("not enough CPUs for control plane") + ErrOverallocatedControlPlane = errors.New("too many CPUs for control plane") + ErrInconsistentAllocation = errors.New("inconsistent CPus allocation") +) + +type Env struct { + Log *log.Logger +} + +func DefaultEnv() Env { + return Env{ + Log: profilecreator.GetAlertSink(), + } +} + +type Params struct { + OfflinedCPUCount int + UserLevelNetworking bool + MachineData *profilecreator.GHWHandler + // cached vars + totalCPUs int + smtLevel int +} + +func (p Params) String() string { + return fmt.Sprintf("cpus=%d offline=%v SMTLevel=%v", p.totalCPUs, p.OfflinedCPUCount, p.smtLevel) +} + +func setupMachineData(p *Params) error { + var err error + + cpus, err := p.MachineData.CPU() + if err != nil { + return err + } + + p.totalCPUs = int(cpus.TotalHardwareThreads) + // NOTE: this assumes all cores are equal, but it's a limitation also shared by GHW. CPUs with P/E cores will be misrepresented. + p.smtLevel = int(cpus.TotalHardwareThreads / cpus.TotalCores) + return nil +} + +func (p Params) TotalCPUs() int { + return p.totalCPUs +} + +func (p Params) SMTLevel() int { + return p.smtLevel +} + +func (p Params) DefaultControlPlaneCores() int { + // intentionally overallocate to have a safe baseline + Tc := p.totalCPUs + return int(math.Round(float64(Tc) * defaultReservedRatioInitial)) // TODO handle SMT +} + +// Get x_c, x_w as initial hardcoded value. Subject to optimization +func (p Params) DefaultAllocation() Values { + Tc := p.totalCPUs + x_c := p.DefaultControlPlaneCores() + return Values{ + ReservedCPUCount: x_c, + IsolatedCPUCount: Tc - x_c, + } +} + +func (p Params) initialValue() []float64 { + vals := p.DefaultAllocation() + return []float64{ + float64(vals.ReservedCPUCount), // x_c + float64(vals.IsolatedCPUCount), // x_w + } +} + +func (p Params) controlPlaneRequirement(x_w float64) float64 { + R := defaultControlPlaneWorkloadCoreRatio + if p.UserLevelNetworking { + R = 0.0 + } + // TODO: the most obvious relationship is for kernel level networking. + // We start with a linear relationship because its simplicity. + return float64(p.DefaultControlPlaneCores()) + R*x_w +} + +type Score struct { + Cost float64 // the lower the better +} + +func (sc Score) String() string { + val := -sc.Cost // positive values are easier to grasp + return fmt.Sprintf("optimization result: %.3f (higher is better)", val) +} + +type Values struct { + // we intentionally compute the recommended cpu count, not precise allocation, because + // this is better done by other packages. We may expose the precise allocation as hint + // or for reference purposes in the future + ReservedCPUCount int + IsolatedCPUCount int +} + +func (vals Values) String() string { + return fmt.Sprintf("reserved=%v/isolated=%v", vals.ReservedCPUCount, vals.IsolatedCPUCount) +} + +// gonum doesn't support bounds yet so we have to make this an explicit step +// https://github.com/gonum/gonum/issues/1725 +func Validate(params Params, vals Values) error { + Tc := params.TotalCPUs() + if vals.ReservedCPUCount < 1 { // TODO handle SMT + return ErrUnderallocatedControlPlane + } + if vals.ReservedCPUCount > int(math.Round((float64(Tc) * defaultReservedRatioMax))) { // works, but likely unacceptable + return ErrOverallocatedControlPlane + } + if Tc != vals.ReservedCPUCount+vals.IsolatedCPUCount { + return ErrInconsistentAllocation + } + return nil +} + +// Objective function to minimize. 
+
+// Objective function to minimize.
+// x[0] is x_c
+// x[1] is x_w
+func objective(p Params, x []float64) float64 {
+	x_c := x[0]
+	x_w := x[1]
+
+	// Our original objective is to maximize x_w, so we minimize -x_w.
+	target := -x_w
+
+	// gonum doesn't support bounds yet, so we have to use penalties:
+	// https://github.com/gonum/gonum/issues/1725
+
+	// Hard constraints
+	var hardPenalty float64
+	// Don't exceed the total CPUs.
+	hardPenalty += defaultPenaltyWeight * math.Pow(math.Max(0, x_c+x_w-float64(p.TotalCPUs())), 2)
+
+	// Meet the control plane/infra requirement so the workload does not starve the control plane.
+	hardPenalty += defaultPenaltyWeight * math.Pow(math.Max(0, p.controlPlaneRequirement(x_w)-x_c), 2)
+
+	// Must use positive CPU values (since gonum/optimize doesn't have simple bounds for all solvers).
+	hardPenalty += defaultPenaltyWeight * (math.Pow(math.Max(0, -x_c), 2) + math.Pow(math.Max(0, -x_w), 2))
+
+	// Allocate in multiples of the SMT level (usually 2) -- TODO: should this be soft?
+	hardPenalty += defaultPenaltyWeight * math.Pow(float64(int(math.Round(x_c))%p.SMTLevel()), 2)
+
+	return target + hardPenalty
+}
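+
+// Illustrative walkthrough (hypothetical numbers, assuming Tc=64, SMT level 2
+// and kernel-level networking, so R=0.075 and DefaultControlPlaneCores() = 4):
+// the binding constraint is x_c >= 4 + 0.075*x_w; substituting x_w = 64 - x_c
+// gives x_c >= 8.8/1.075 ~= 8.19, so the minimizer should settle near
+// x = [8.19, 55.81]. Sample evaluations:
+//
+//	objective(p, []float64{8, 56})  // -56 + 100*(8.2-8)^2 = -52
+//	objective(p, []float64{9, 55})  // -55 + 100*(9%2)^2   =  45 (odd x_c is penalized)
+//	objective(p, []float64{10, 54}) // -54 (all constraints satisfied)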
Score: %v %s totalCPUs=%d", score.String(), vals.String(), totCPUs) + return vals, score, nil +} + +func asMultipleOf(v, x int) int { + r := v % x + if r == 0 { + return v + } + return v + r +} diff --git a/pkg/performanceprofile/profilecreator/cmd/root.go b/pkg/performanceprofile/profilecreator/cmd/root.go index 037bfad7b8..7b70f291ae 100644 --- a/pkg/performanceprofile/profilecreator/cmd/root.go +++ b/pkg/performanceprofile/profilecreator/cmd/root.go @@ -37,6 +37,7 @@ import ( machineconfigv1 "github.com/openshift/api/machineconfiguration/v1" performancev2 "github.com/openshift/cluster-node-tuning-operator/pkg/apis/performanceprofile/v2" "github.com/openshift/cluster-node-tuning-operator/pkg/performanceprofile/profilecreator" + "github.com/openshift/cluster-node-tuning-operator/pkg/performanceprofile/profilecreator/autosize" "github.com/openshift/cluster-node-tuning-operator/pkg/performanceprofile/profilecreator/cmd/hypershift" "github.com/openshift/cluster-node-tuning-operator/pkg/performanceprofile/profilecreator/serialize" "github.com/openshift/cluster-node-tuning-operator/pkg/performanceprofile/profilecreator/toleration" @@ -116,10 +117,10 @@ func NewRootCommand() *cobra.Command { pcArgs := &ProfileCreatorArgs{ UserLevelNetworking: ptr.To(false), PerPodPowerManagement: ptr.To(false), + Autosize: ptr.To(false), } var requiredFlags = []string{ - "reserved-cpu-count", "rt-kernel", "must-gather-dir-path", } @@ -164,10 +165,26 @@ func NewRootCommand() *cobra.Command { if err != nil { return fmt.Errorf("targeted nodes differ: %w", err) } + + sizing := autosize.Values{ + ReservedCPUCount: pcArgs.ReservedCPUCount, + } + if isAutosizeEnabled(pcArgs) { + params := autosize.Params{ + OfflinedCPUCount: pcArgs.OfflinedCPUCount, + UserLevelNetworking: (pcArgs.UserLevelNetworking != nil && *pcArgs.UserLevelNetworking), + MachineData: nodesHandlers[0], // assume all nodes equal, pick the easiest + } + sizing, _, err = autosize.Compute(autosize.DefaultEnv(), params) + if err != nil { + return fmt.Errorf("failed to autosize the cluster values: %v", err) + } + } + // We make sure that the matched Nodes are the same // Assumption here is moving forward matchedNodes[0] is representative of how all the nodes are // same from hardware topology point of view - profileData, err := makeProfileDataFrom(nodesHandlers[0], pcArgs) + profileData, err := makeProfileDataFrom(nodesHandlers[0], pcArgs, sizing) if err != nil { return fmt.Errorf("failed to make profile data from node handler: %w", err) } @@ -222,6 +239,9 @@ func validateProfileCreatorFlags(pcArgs *ProfileCreatorArgs) error { if pcArgs.MCPName != "" && pcArgs.NodePoolName != "" { return fmt.Errorf("--mcp-name and --node-pool-name options cannot be used together") } + if !isAutosizeEnabled(pcArgs) && pcArgs.ReservedCPUCount == 0 { + return fmt.Errorf("--reserved-cpu-count need to be set and greater than zero if autosizing (--autosize) is disabled") + } if pcArgs.NodePoolName == "" { // NodePoolName is an alias of MCPName pcArgs.NodePoolName = pcArgs.MCPName @@ -303,12 +323,13 @@ func makeClusterData(mustGatherDirPath string, createForHypershift bool) (Cluste return clusterData, nil } -func makeProfileDataFrom(nodeHandler *profilecreator.GHWHandler, args *ProfileCreatorArgs) (*ProfileData, error) { +func makeProfileDataFrom(nodeHandler *profilecreator.GHWHandler, args *ProfileCreatorArgs, sizing autosize.Values) (*ProfileData, error) { systemInfo, err := nodeHandler.GatherSystemInfo() if err != nil { return nil, fmt.Errorf("failed to compute get system 
information: %v", err) } - reservedCPUs, isolatedCPUs, offlinedCPUs, err := profilecreator.CalculateCPUSets(systemInfo, args.ReservedCPUCount, args.OfflinedCPUCount, args.SplitReservedCPUsAcrossNUMA, args.DisableHT, args.PowerConsumptionMode == ultraLowLatency) + + reservedCPUs, isolatedCPUs, offlinedCPUs, err := profilecreator.CalculateCPUSets(systemInfo, sizing.ReservedCPUCount, args.OfflinedCPUCount, args.SplitReservedCPUsAcrossNUMA, args.DisableHT, args.PowerConsumptionMode == ultraLowLatency) if err != nil { return nil, fmt.Errorf("failed to compute the reserved and isolated CPUs: %v", err) } @@ -411,13 +432,14 @@ type ProfileCreatorArgs struct { TMPolicy string `json:"topology-manager-policy"` PerPodPowerManagement *bool `json:"per-pod-power-management,omitempty"` EnableHardwareTuning bool `json:"enable-hardware-tuning,omitempty"` + Autosize *bool `json:"autosize,omitempty"` // internal only this argument not passed by the user // but detected automatically createForHypershift bool } func (pca *ProfileCreatorArgs) AddFlags(flags *pflag.FlagSet) { - flags.IntVar(&pca.ReservedCPUCount, "reserved-cpu-count", 0, "Number of reserved CPUs (required)") + flags.IntVar(&pca.ReservedCPUCount, "reserved-cpu-count", 0, "Number of reserved CPUs") flags.IntVar(&pca.OfflinedCPUCount, "offlined-cpu-count", 0, "Number of offlined CPUs") flags.BoolVar(&pca.SplitReservedCPUsAcrossNUMA, "split-reserved-cpus-across-numa", false, "Split the Reserved CPUs across NUMA nodes") flags.StringVar(&pca.MCPName, "mcp-name", "", "MCP name corresponding to the target machines (required)") @@ -431,6 +453,7 @@ func (pca *ProfileCreatorArgs) AddFlags(flags *pflag.FlagSet) { flags.BoolVar(pca.PerPodPowerManagement, "per-pod-power-management", false, "Enable Per Pod Power Management") flags.BoolVar(&pca.EnableHardwareTuning, "enable-hardware-tuning", false, "Enable setting maximum cpu frequencies") flags.StringVar(&pca.NodePoolName, "node-pool-name", "", "Node pool name corresponding to the target machines (HyperShift only)") + flags.BoolVar(pca.Autosize, "autosize", false, "autosize the control plane") } func makePerformanceProfileFrom(profileData ProfileData) (runtime.Object, error) { @@ -582,3 +605,7 @@ func setSelectorsFor(profileData *ProfileData, args *ProfileCreatorArgs) error { profileData.mcpSelector = mcpSelector return nil } + +func isAutosizeEnabled(pcArgs *ProfileCreatorArgs) bool { + return pcArgs.Autosize != nil && *pcArgs.Autosize +} diff --git a/vendor/golang.org/x/tools/container/intsets/sparse.go b/vendor/golang.org/x/tools/container/intsets/sparse.go new file mode 100644 index 0000000000..c56aacc28b --- /dev/null +++ b/vendor/golang.org/x/tools/container/intsets/sparse.go @@ -0,0 +1,1107 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package intsets provides Sparse, a compact and fast representation +// for sparse sets of int values. +// +// The time complexity of the operations Len, Insert, Remove and Has +// is in O(n) but in practice those methods are faster and more +// space-efficient than equivalent operations on sets based on the Go +// map type. The IsEmpty, Min, Max, Clear and TakeMin operations +// require constant time. +package intsets // import "golang.org/x/tools/container/intsets" + +// TODO(adonovan): +// - Add InsertAll(...int), RemoveAll(...int) +// - Add 'bool changed' results for {Intersection,Difference}With too. 
+// +// TODO(adonovan): implement Dense, a dense bit vector with a similar API. +// The space usage would be proportional to Max(), not Len(), and the +// implementation would be based upon big.Int. +// +// TODO(adonovan): opt: make UnionWith and Difference faster. +// These are the hot-spots for go/pointer. + +import ( + "bytes" + "fmt" + "math/bits" +) + +// A Sparse is a set of int values. +// Sparse operations (even queries) are not concurrency-safe. +// +// The zero value for Sparse is a valid empty set. +// +// Sparse sets must be copied using the Copy method, not by assigning +// a Sparse value. +type Sparse struct { + // An uninitialized Sparse represents an empty set. + // An empty set may also be represented by + // root.next == root.prev == &root. + // + // The root is always the block with the smallest offset. + // It can be empty, but only if it is the only block; in that case, offset is + // MaxInt (which is not a valid offset). + root block +} + +type word uintptr + +const ( + _m = ^word(0) + bitsPerWord = 8 << (_m>>8&1 + _m>>16&1 + _m>>32&1) + bitsPerBlock = 256 // optimal value for go/pointer solver performance + wordsPerBlock = bitsPerBlock / bitsPerWord +) + +// Limit values of implementation-specific int type. +const ( + MaxInt = int(^uint(0) >> 1) + MinInt = -MaxInt - 1 +) + +// popcount returns the number of set bits in w. +func popcount(x word) int { + // Avoid OnesCount(uint): don't assume uint = uintptr. + if bitsPerWord == 32 { + return bits.OnesCount32(uint32(x)) + } else { + return bits.OnesCount64(uint64(x)) + } +} + +// nlz returns the number of leading zeros of x. +func nlz(x word) int { + // Avoid LeadingZeros(uint): don't assume uint = uintptr. + if bitsPerWord == 32 { + return bits.LeadingZeros32(uint32(x)) + } else { + return bits.LeadingZeros64(uint64(x)) + } +} + +// ntz returns the number of trailing zeros of x. +func ntz(x word) int { + // Avoid TrailingZeros(uint): don't assume uint = uintptr. + if bitsPerWord == 32 { + return bits.TrailingZeros32(uint32(x)) + } else { + return bits.TrailingZeros64(uint64(x)) + } +} + +// -- block ------------------------------------------------------------ + +// A set is represented as a circular doubly-linked list of blocks, +// each containing an offset and a bit array of fixed size +// bitsPerBlock; the blocks are ordered by increasing offset. +// +// The set contains an element x iff the block whose offset is x - (x +// mod bitsPerBlock) has the bit (x mod bitsPerBlock) set, where mod +// is the Euclidean remainder. +// +// A block may only be empty transiently. +type block struct { + offset int // offset mod bitsPerBlock == 0 + bits [wordsPerBlock]word // contains at least one set bit + next, prev *block // doubly-linked list of blocks +} + +// wordMask returns the word index (in block.bits) +// and single-bit mask for the block's ith bit. +func wordMask(i uint) (w uint, mask word) { + w = i / bitsPerWord + mask = 1 << (i % bitsPerWord) + return +} + +// insert sets the block b's ith bit and +// returns true if it was not already set. +func (b *block) insert(i uint) bool { + w, mask := wordMask(i) + if b.bits[w]&mask == 0 { + b.bits[w] |= mask + return true + } + return false +} + +// remove clears the block's ith bit and +// returns true if the bit was previously set. +// NB: may leave the block empty. +func (b *block) remove(i uint) bool { + w, mask := wordMask(i) + if b.bits[w]&mask != 0 { + b.bits[w] &^= mask + return true + } + return false +} + +// has reports whether the block's ith bit is set. 
+func (b *block) has(i uint) bool { + w, mask := wordMask(i) + return b.bits[w]&mask != 0 +} + +// empty reports whether b.len()==0, but more efficiently. +func (b *block) empty() bool { + for _, w := range b.bits { + if w != 0 { + return false + } + } + return true +} + +// len returns the number of set bits in block b. +func (b *block) len() int { + var l int + for _, w := range b.bits { + l += popcount(w) + } + return l +} + +// max returns the maximum element of the block. +// The block must not be empty. +func (b *block) max() int { + bi := b.offset + bitsPerBlock + // Decrement bi by number of high zeros in last.bits. + for i := len(b.bits) - 1; i >= 0; i-- { + if w := b.bits[i]; w != 0 { + return bi - nlz(w) - 1 + } + bi -= bitsPerWord + } + panic("BUG: empty block") +} + +// min returns the minimum element of the block, +// and also removes it if take is set. +// The block must not be initially empty. +// NB: may leave the block empty. +func (b *block) min(take bool) int { + for i, w := range b.bits { + if w != 0 { + tz := ntz(w) + if take { + b.bits[i] = w &^ (1 << uint(tz)) + } + return b.offset + i*bitsPerWord + tz + } + } + panic("BUG: empty block") +} + +// lowerBound returns the smallest element of the block that is greater than or +// equal to the element corresponding to the ith bit. If there is no such +// element, the second return value is false. +func (b *block) lowerBound(i uint) (int, bool) { + w := i / bitsPerWord + bit := i % bitsPerWord + + if val := b.bits[w] >> bit; val != 0 { + return b.offset + int(i) + ntz(val), true + } + + for w++; w < wordsPerBlock; w++ { + if val := b.bits[w]; val != 0 { + return b.offset + int(w*bitsPerWord) + ntz(val), true + } + } + + return 0, false +} + +// forEach calls f for each element of block b. +// f must not mutate b's enclosing Sparse. +func (b *block) forEach(f func(int)) { + for i, w := range b.bits { + offset := b.offset + i*bitsPerWord + for bi := 0; w != 0 && bi < bitsPerWord; bi++ { + if w&1 != 0 { + f(offset) + } + offset++ + w >>= 1 + } + } +} + +// offsetAndBitIndex returns the offset of the block that would +// contain x and the bit index of x within that block. +func offsetAndBitIndex(x int) (int, uint) { + mod := x % bitsPerBlock + if mod < 0 { + // Euclidean (non-negative) remainder + mod += bitsPerBlock + } + return x - mod, uint(mod) +} + +// -- Sparse -------------------------------------------------------------- + +// none is a shared, empty, sentinel block that indicates the end of a block +// list. +var none block + +// Dummy type used to generate an implicit panic. This must be defined at the +// package level; if it is defined inside a function, it prevents the inlining +// of that function. +type to_copy_a_sparse_you_must_call_its_Copy_method struct{} + +// init ensures s is properly initialized. +func (s *Sparse) init() { + root := &s.root + if root.next == nil { + root.offset = MaxInt + root.next = root + root.prev = root + } else if root.next.prev != root { + // Copying a Sparse x leads to pernicious corruption: the + // new Sparse y shares the old linked list, but iteration + // on y will never encounter &y.root so it goes into a + // loop. Fail fast before this occurs. + // We don't want to call panic here because it prevents the + // inlining of this function. 
+ _ = (interface{}(nil)).(to_copy_a_sparse_you_must_call_its_Copy_method) + } +} + +func (s *Sparse) first() *block { + s.init() + if s.root.offset == MaxInt { + return &none + } + return &s.root +} + +// next returns the next block in the list, or end if b is the last block. +func (s *Sparse) next(b *block) *block { + if b.next == &s.root { + return &none + } + return b.next +} + +// IsEmpty reports whether the set s is empty. +func (s *Sparse) IsEmpty() bool { + return s.root.next == nil || s.root.offset == MaxInt +} + +// Len returns the number of elements in the set s. +func (s *Sparse) Len() int { + var l int + for b := s.first(); b != &none; b = s.next(b) { + l += b.len() + } + return l +} + +// Max returns the maximum element of the set s, or MinInt if s is empty. +func (s *Sparse) Max() int { + if s.IsEmpty() { + return MinInt + } + return s.root.prev.max() +} + +// Min returns the minimum element of the set s, or MaxInt if s is empty. +func (s *Sparse) Min() int { + if s.IsEmpty() { + return MaxInt + } + return s.root.min(false) +} + +// LowerBound returns the smallest element >= x, or MaxInt if there is no such +// element. +func (s *Sparse) LowerBound(x int) int { + offset, i := offsetAndBitIndex(x) + for b := s.first(); b != &none; b = s.next(b) { + if b.offset > offset { + return b.min(false) + } + if b.offset == offset { + if y, ok := b.lowerBound(i); ok { + return y + } + } + } + return MaxInt +} + +// block returns the block that would contain offset, +// or nil if s contains no such block. +// Precondition: offset is a multiple of bitsPerBlock. +func (s *Sparse) block(offset int) *block { + for b := s.first(); b != &none && b.offset <= offset; b = s.next(b) { + if b.offset == offset { + return b + } + } + return nil +} + +// Insert adds x to the set s, and reports whether the set grew. +func (s *Sparse) Insert(x int) bool { + offset, i := offsetAndBitIndex(x) + + b := s.first() + for ; b != &none && b.offset <= offset; b = s.next(b) { + if b.offset == offset { + return b.insert(i) + } + } + + // Insert new block before b. + new := s.insertBlockBefore(b) + new.offset = offset + return new.insert(i) +} + +// removeBlock removes a block and returns the block that followed it (or end if +// it was the last block). +func (s *Sparse) removeBlock(b *block) *block { + if b != &s.root { + b.prev.next = b.next + b.next.prev = b.prev + if b.next == &s.root { + return &none + } + return b.next + } + + first := s.root.next + if first == &s.root { + // This was the only block. + s.Clear() + return &none + } + s.root.offset = first.offset + s.root.bits = first.bits + if first.next == &s.root { + // Single block remaining. + s.root.next = &s.root + s.root.prev = &s.root + } else { + s.root.next = first.next + first.next.prev = &s.root + } + return &s.root +} + +// Remove removes x from the set s, and reports whether the set shrank. +func (s *Sparse) Remove(x int) bool { + offset, i := offsetAndBitIndex(x) + if b := s.block(offset); b != nil { + if !b.remove(i) { + return false + } + if b.empty() { + s.removeBlock(b) + } + return true + } + return false +} + +// Clear removes all elements from the set s. +func (s *Sparse) Clear() { + s.root = block{ + offset: MaxInt, + next: &s.root, + prev: &s.root, + } +} + +// If set s is non-empty, TakeMin sets *p to the minimum element of +// the set s, removes that element from the set and returns true. +// Otherwise, it returns false and *p is undefined. 
+// +// This method may be used for iteration over a worklist like so: +// +// var x int +// for worklist.TakeMin(&x) { use(x) } +func (s *Sparse) TakeMin(p *int) bool { + if s.IsEmpty() { + return false + } + *p = s.root.min(true) + if s.root.empty() { + s.removeBlock(&s.root) + } + return true +} + +// Has reports whether x is an element of the set s. +func (s *Sparse) Has(x int) bool { + offset, i := offsetAndBitIndex(x) + if b := s.block(offset); b != nil { + return b.has(i) + } + return false +} + +// forEach applies function f to each element of the set s in order. +// +// f must not mutate s. Consequently, forEach is not safe to expose +// to clients. In any case, using "range s.AppendTo()" allows more +// natural control flow with continue/break/return. +func (s *Sparse) forEach(f func(int)) { + for b := s.first(); b != &none; b = s.next(b) { + b.forEach(f) + } +} + +// Copy sets s to the value of x. +func (s *Sparse) Copy(x *Sparse) { + if s == x { + return + } + + xb := x.first() + sb := s.first() + for xb != &none { + if sb == &none { + sb = s.insertBlockBefore(sb) + } + sb.offset = xb.offset + sb.bits = xb.bits + xb = x.next(xb) + sb = s.next(sb) + } + s.discardTail(sb) +} + +// insertBlockBefore returns a new block, inserting it before next. +// If next is the root, the root is replaced. If next is end, the block is +// inserted at the end. +func (s *Sparse) insertBlockBefore(next *block) *block { + if s.IsEmpty() { + if next != &none { + panic("BUG: passed block with empty set") + } + return &s.root + } + + if next == &s.root { + // Special case: we need to create a new block that will become the root + // block.The old root block becomes the second block. + second := s.root + s.root = block{ + next: &second, + } + if second.next == &s.root { + s.root.prev = &second + } else { + s.root.prev = second.prev + second.next.prev = &second + second.prev = &s.root + } + return &s.root + } + if next == &none { + // Insert before root. + next = &s.root + } + b := new(block) + b.next = next + b.prev = next.prev + b.prev.next = b + next.prev = b + return b +} + +// discardTail removes block b and all its successors from s. +func (s *Sparse) discardTail(b *block) { + if b != &none { + if b == &s.root { + s.Clear() + } else { + b.prev.next = &s.root + s.root.prev = b.prev + } + } +} + +// IntersectionWith sets s to the intersection s ∩ x. +func (s *Sparse) IntersectionWith(x *Sparse) { + if s == x { + return + } + + xb := x.first() + sb := s.first() + for xb != &none && sb != &none { + switch { + case xb.offset < sb.offset: + xb = x.next(xb) + + case xb.offset > sb.offset: + sb = s.removeBlock(sb) + + default: + var sum word + for i := range sb.bits { + r := xb.bits[i] & sb.bits[i] + sb.bits[i] = r + sum |= r + } + if sum != 0 { + sb = s.next(sb) + } else { + // sb will be overwritten or removed + } + + xb = x.next(xb) + } + } + + s.discardTail(sb) +} + +// Intersection sets s to the intersection x ∩ y. 
+func (s *Sparse) Intersection(x, y *Sparse) { + switch { + case s == x: + s.IntersectionWith(y) + return + case s == y: + s.IntersectionWith(x) + return + case x == y: + s.Copy(x) + return + } + + xb := x.first() + yb := y.first() + sb := s.first() + for xb != &none && yb != &none { + switch { + case xb.offset < yb.offset: + xb = x.next(xb) + continue + case xb.offset > yb.offset: + yb = y.next(yb) + continue + } + + if sb == &none { + sb = s.insertBlockBefore(sb) + } + sb.offset = xb.offset + + var sum word + for i := range sb.bits { + r := xb.bits[i] & yb.bits[i] + sb.bits[i] = r + sum |= r + } + if sum != 0 { + sb = s.next(sb) + } else { + // sb will be overwritten or removed + } + + xb = x.next(xb) + yb = y.next(yb) + } + + s.discardTail(sb) +} + +// Intersects reports whether s ∩ x ≠ ∅. +func (s *Sparse) Intersects(x *Sparse) bool { + sb := s.first() + xb := x.first() + for sb != &none && xb != &none { + switch { + case xb.offset < sb.offset: + xb = x.next(xb) + case xb.offset > sb.offset: + sb = s.next(sb) + default: + for i := range sb.bits { + if sb.bits[i]&xb.bits[i] != 0 { + return true + } + } + sb = s.next(sb) + xb = x.next(xb) + } + } + return false +} + +// UnionWith sets s to the union s ∪ x, and reports whether s grew. +func (s *Sparse) UnionWith(x *Sparse) bool { + if s == x { + return false + } + + var changed bool + xb := x.first() + sb := s.first() + for xb != &none { + if sb != &none && sb.offset == xb.offset { + for i := range xb.bits { + union := sb.bits[i] | xb.bits[i] + if sb.bits[i] != union { + sb.bits[i] = union + changed = true + } + } + xb = x.next(xb) + } else if sb == &none || sb.offset > xb.offset { + sb = s.insertBlockBefore(sb) + sb.offset = xb.offset + sb.bits = xb.bits + changed = true + + xb = x.next(xb) + } + sb = s.next(sb) + } + return changed +} + +// Union sets s to the union x ∪ y. +func (s *Sparse) Union(x, y *Sparse) { + switch { + case x == y: + s.Copy(x) + return + case s == x: + s.UnionWith(y) + return + case s == y: + s.UnionWith(x) + return + } + + xb := x.first() + yb := y.first() + sb := s.first() + for xb != &none || yb != &none { + if sb == &none { + sb = s.insertBlockBefore(sb) + } + switch { + case yb == &none || (xb != &none && xb.offset < yb.offset): + sb.offset = xb.offset + sb.bits = xb.bits + xb = x.next(xb) + + case xb == &none || (yb != &none && yb.offset < xb.offset): + sb.offset = yb.offset + sb.bits = yb.bits + yb = y.next(yb) + + default: + sb.offset = xb.offset + for i := range xb.bits { + sb.bits[i] = xb.bits[i] | yb.bits[i] + } + xb = x.next(xb) + yb = y.next(yb) + } + sb = s.next(sb) + } + + s.discardTail(sb) +} + +// DifferenceWith sets s to the difference s ∖ x. +func (s *Sparse) DifferenceWith(x *Sparse) { + if s == x { + s.Clear() + return + } + + xb := x.first() + sb := s.first() + for xb != &none && sb != &none { + switch { + case xb.offset > sb.offset: + sb = s.next(sb) + + case xb.offset < sb.offset: + xb = x.next(xb) + + default: + var sum word + for i := range sb.bits { + r := sb.bits[i] & ^xb.bits[i] + sb.bits[i] = r + sum |= r + } + if sum == 0 { + sb = s.removeBlock(sb) + } else { + sb = s.next(sb) + } + xb = x.next(xb) + } + } +} + +// Difference sets s to the difference x ∖ y. 
+func (s *Sparse) Difference(x, y *Sparse) { + switch { + case x == y: + s.Clear() + return + case s == x: + s.DifferenceWith(y) + return + case s == y: + var y2 Sparse + y2.Copy(y) + s.Difference(x, &y2) + return + } + + xb := x.first() + yb := y.first() + sb := s.first() + for xb != &none && yb != &none { + if xb.offset > yb.offset { + // y has block, x has &none + yb = y.next(yb) + continue + } + + if sb == &none { + sb = s.insertBlockBefore(sb) + } + sb.offset = xb.offset + + switch { + case xb.offset < yb.offset: + // x has block, y has &none + sb.bits = xb.bits + + sb = s.next(sb) + + default: + // x and y have corresponding blocks + var sum word + for i := range sb.bits { + r := xb.bits[i] & ^yb.bits[i] + sb.bits[i] = r + sum |= r + } + if sum != 0 { + sb = s.next(sb) + } else { + // sb will be overwritten or removed + } + + yb = y.next(yb) + } + xb = x.next(xb) + } + + for xb != &none { + if sb == &none { + sb = s.insertBlockBefore(sb) + } + sb.offset = xb.offset + sb.bits = xb.bits + sb = s.next(sb) + + xb = x.next(xb) + } + + s.discardTail(sb) +} + +// SymmetricDifferenceWith sets s to the symmetric difference s ∆ x. +func (s *Sparse) SymmetricDifferenceWith(x *Sparse) { + if s == x { + s.Clear() + return + } + + sb := s.first() + xb := x.first() + for xb != &none && sb != &none { + switch { + case sb.offset < xb.offset: + sb = s.next(sb) + case xb.offset < sb.offset: + nb := s.insertBlockBefore(sb) + nb.offset = xb.offset + nb.bits = xb.bits + xb = x.next(xb) + default: + var sum word + for i := range sb.bits { + r := sb.bits[i] ^ xb.bits[i] + sb.bits[i] = r + sum |= r + } + if sum == 0 { + sb = s.removeBlock(sb) + } else { + sb = s.next(sb) + } + xb = x.next(xb) + } + } + + for xb != &none { // append the tail of x to s + sb = s.insertBlockBefore(sb) + sb.offset = xb.offset + sb.bits = xb.bits + sb = s.next(sb) + xb = x.next(xb) + } +} + +// SymmetricDifference sets s to the symmetric difference x ∆ y. +func (s *Sparse) SymmetricDifference(x, y *Sparse) { + switch { + case x == y: + s.Clear() + return + case s == x: + s.SymmetricDifferenceWith(y) + return + case s == y: + s.SymmetricDifferenceWith(x) + return + } + + sb := s.first() + xb := x.first() + yb := y.first() + for xb != &none && yb != &none { + if sb == &none { + sb = s.insertBlockBefore(sb) + } + switch { + case yb.offset < xb.offset: + sb.offset = yb.offset + sb.bits = yb.bits + sb = s.next(sb) + yb = y.next(yb) + case xb.offset < yb.offset: + sb.offset = xb.offset + sb.bits = xb.bits + sb = s.next(sb) + xb = x.next(xb) + default: + var sum word + for i := range sb.bits { + r := xb.bits[i] ^ yb.bits[i] + sb.bits[i] = r + sum |= r + } + if sum != 0 { + sb.offset = xb.offset + sb = s.next(sb) + } + xb = x.next(xb) + yb = y.next(yb) + } + } + + for xb != &none { // append the tail of x to s + if sb == &none { + sb = s.insertBlockBefore(sb) + } + sb.offset = xb.offset + sb.bits = xb.bits + sb = s.next(sb) + xb = x.next(xb) + } + + for yb != &none { // append the tail of y to s + if sb == &none { + sb = s.insertBlockBefore(sb) + } + sb.offset = yb.offset + sb.bits = yb.bits + sb = s.next(sb) + yb = y.next(yb) + } + + s.discardTail(sb) +} + +// SubsetOf reports whether s ∖ x = ∅. 
+func (s *Sparse) SubsetOf(x *Sparse) bool { + if s == x { + return true + } + + sb := s.first() + xb := x.first() + for sb != &none { + switch { + case xb == &none || xb.offset > sb.offset: + return false + case xb.offset < sb.offset: + xb = x.next(xb) + default: + for i := range sb.bits { + if sb.bits[i]&^xb.bits[i] != 0 { + return false + } + } + sb = s.next(sb) + xb = x.next(xb) + } + } + return true +} + +// Equals reports whether the sets s and t have the same elements. +func (s *Sparse) Equals(t *Sparse) bool { + if s == t { + return true + } + sb := s.first() + tb := t.first() + for { + switch { + case sb == &none && tb == &none: + return true + case sb == &none || tb == &none: + return false + case sb.offset != tb.offset: + return false + case sb.bits != tb.bits: + return false + } + + sb = s.next(sb) + tb = t.next(tb) + } +} + +// String returns a human-readable description of the set s. +func (s *Sparse) String() string { + var buf bytes.Buffer + buf.WriteByte('{') + s.forEach(func(x int) { + if buf.Len() > 1 { + buf.WriteByte(' ') + } + fmt.Fprintf(&buf, "%d", x) + }) + buf.WriteByte('}') + return buf.String() +} + +// BitString returns the set as a string of 1s and 0s denoting the sum +// of the i'th powers of 2, for each i in s. A radix point, always +// preceded by a digit, appears if the sum is non-integral. +// +// Examples: +// +// {}.BitString() = "0" +// {4,5}.BitString() = "110000" +// {-3}.BitString() = "0.001" +// {-3,0,4,5}.BitString() = "110001.001" +func (s *Sparse) BitString() string { + if s.IsEmpty() { + return "0" + } + + min, max := s.Min(), s.Max() + var nbytes int + if max > 0 { + nbytes = max + } + nbytes++ // zero bit + radix := nbytes + if min < 0 { + nbytes += len(".") - min + } + + b := make([]byte, nbytes) + for i := range b { + b[i] = '0' + } + if radix < nbytes { + b[radix] = '.' + } + s.forEach(func(x int) { + if x >= 0 { + x += len(".") + } + b[radix-x] = '1' + }) + return string(b) +} + +// GoString returns a string showing the internal representation of +// the set s. +func (s *Sparse) GoString() string { + var buf bytes.Buffer + for b := s.first(); b != &none; b = s.next(b) { + fmt.Fprintf(&buf, "block %p {offset=%d next=%p prev=%p", + b, b.offset, b.next, b.prev) + for _, w := range b.bits { + fmt.Fprintf(&buf, " 0%016x", w) + } + fmt.Fprintf(&buf, "}\n") + } + return buf.String() +} + +// AppendTo returns the result of appending the elements of s to slice +// in order. +func (s *Sparse) AppendTo(slice []int) []int { + s.forEach(func(x int) { + slice = append(slice, x) + }) + return slice +} + +// -- Testing/debugging ------------------------------------------------ + +// check returns an error if the representation invariants of s are violated. +// (unused; retained for debugging) +func (s *Sparse) check() error { + s.init() + if s.root.empty() { + // An empty set must have only the root block with offset MaxInt. 
+ if s.root.next != &s.root { + return fmt.Errorf("multiple blocks with empty root block") + } + if s.root.offset != MaxInt { + return fmt.Errorf("empty set has offset %d, should be MaxInt", s.root.offset) + } + return nil + } + for b := s.first(); ; b = s.next(b) { + if b.offset%bitsPerBlock != 0 { + return fmt.Errorf("bad offset modulo: %d", b.offset) + } + if b.empty() { + return fmt.Errorf("empty block") + } + if b.prev.next != b { + return fmt.Errorf("bad prev.next link") + } + if b.next.prev != b { + return fmt.Errorf("bad next.prev link") + } + if b.next == &s.root { + break + } + if b.offset >= b.next.offset { + return fmt.Errorf("bad offset order: b.offset=%d, b.next.offset=%d", + b.offset, b.next.offset) + } + } + return nil +} diff --git a/vendor/gonum.org/v1/gonum/AUTHORS b/vendor/gonum.org/v1/gonum/AUTHORS new file mode 100644 index 0000000000..1f0e79b4c0 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/AUTHORS @@ -0,0 +1,141 @@ +# This is the official list of Gonum authors for copyright purposes. +# This file is distinct from the CONTRIBUTORS files. +# See the latter for an explanation. + +# Names should be added to this file as +# Name or Organization +# The email address is not required for organizations. + +# Please keep the list sorted. + +Alexander Egurnov +Andrei Blinnikov +antichris +Bailey Lissington +Bill Gray +Bill Noon +Brendan Tracey +Brent Pedersen +Bulat Khasanov +Chad Kunde +Chan Kwan Yin +Chih-Wei Chang +Chong-Yeol Nah +Chris Tessum +Christophe Meessen +Christopher Waldon +Clayton Northey +Coana ApS +Dan Kortschak +Daniel Fireman +Dario Heinisch +David Kleiven +David Samborski +Davor Kapsa +DeepMind Technologies +Delaney Gillilan +Dezmond Goff +Dirk Müller +Dong-hee Na +Dustin Spicuzza +Egon Elbre +Ekaterina Efimova +Eng Zer Jun +Ethan Burns +Ethan Reesor +Evert Lammerts +Evgeny Savinov +Fabian Wickborn +Facundo Gaich +Fazlul Shahriar +Francesc Campoy +Google Inc +Gustaf Johansson +Hossein Zolfi +Huang Peng Fei +Iakov Davydov +Igor Mikushkin +Iskander Sharipov +Jack Tudbury +Jalem Raj Rohit +James Bell +James Bowman +James Holmes <32bitkid@gmail.com> +Janne Snabb +Jeremy Atkinson +Jes Cok +Jinesi Yelizati +Jonas Kahler +Jonas Schulze +Jonathan Bluett-Duncan +Jonathan J Lawlor +Jonathan Reiter +Jonathan Schroeder +Joost van Amersfoort +Jordan Stoker +Joseph Watson +Josh Wilson +Julien Roland +Kai Trukenmüller +Kendall Marcus +Kent English +Kevin C. 
Zimmerman +Kirill Motkov +Konstantin Shaposhnikov +Leonid Kneller +Lyron Winderbaum +Marco Leogrande +Mark Canning +Mark Skilbeck +Martin Diz +Matthew Connelly +Matthieu Di Mercurio +Max Halford +Maxim Sergeev +Microsoft Corporation +MinJae Kwon +Nathan Edwards +Nick Potts +Nils Wogatzky +Olivier Wulveryck +Or Rikon +Patricio Whittingslow +Patrick DeVivo +Pontus Melke +Renee French +Rishi Desai +Robert Kleffner +Robin Eklind +Roger Welin +Rondall Jones +Sam Zaydel +Samuel Kelemen +Saran Ahluwalia +Scott Holden +Scott Kiesel +Sebastien Binet +Shawn Smith +Sintela Ltd +source{d} +Spencer Lyon +Steve McCoy +Taesu Pyo +Takeshi Yoneda +Tamir Hyman +The University of Adelaide +The University of Minnesota +The University of Washington +Thomas Berg +Tobin Harding +Tom Payne +Tristan Nicholls +Valentin Deleplace +Vincent Thiery +Vladimír Chalupecký +Will Tekulve +Yasuhiro Matsumoto +Yevgeniy Vahlis +Yucheng Zhu +Yunomi +Zhan Shan Mao +Zoe Juozapaitis diff --git a/vendor/gonum.org/v1/gonum/CONTRIBUTORS b/vendor/gonum.org/v1/gonum/CONTRIBUTORS new file mode 100644 index 0000000000..1fbe736c5f --- /dev/null +++ b/vendor/gonum.org/v1/gonum/CONTRIBUTORS @@ -0,0 +1,144 @@ +# This is the official list of people who can contribute +# (and typically have contributed) code to the Gonum +# project. +# +# The AUTHORS file lists the copyright holders; this file +# lists people. For example, Google employees would be listed here +# but not in AUTHORS, because Google would hold the copyright. +# +# When adding J Random Contributor's name to this file, +# either J's name or J's organization's name should be +# added to the AUTHORS file. +# +# Names should be added to this file like so: +# Name +# +# Please keep the list sorted. + +Alexander Egurnov +Andrei Blinnikov +Andrew Brampton +antichris +Bailey Lissington +Bill Gray +Bill Noon +Brendan Tracey +Brent Pedersen +Bulat Khasanov +Chad Kunde +Chan Kwan Yin +Chih-Wei Chang +Chong-Yeol Nah +Chris Tessum +Christophe Meessen +Christopher Waldon +Clayton Northey +Dan Kortschak +Dan Lorenc +Daniel Fireman +Dario Heinisch +David Kleiven +David Samborski +Davor Kapsa +Delaney Gillilan +Dezmond Goff +Dirk Müller +Dong-hee Na +Dustin Spicuzza +Egon Elbre +Ekaterina Efimova +Eng Zer Jun +Ethan Burns +Ethan Reesor +Evert Lammerts +Evgeny Savinov +Fabian Wickborn +Facundo Gaich +Fazlul Shahriar +Francesc Campoy +Gustaf Johansson +Hossein Zolfi +Huang Peng Fei +Iakov Davydov +Igor Mikushkin +Iskander Sharipov +Jack Tudbury +Jalem Raj Rohit +James Bell +James Bowman +James Holmes <32bitkid@gmail.com> +Janne Snabb +Jeremy Atkinson +Jes Cok +Jinesi Yelizati +Jon Richards +Jonas Kahler +Jonas Schulze +Jonathan Bluett-Duncan +Jonathan J Lawlor +Jonathan Reiter +Jonathan Schroeder +Joost van Amersfoort +Jordan Stoker +Joseph Watson +Josh Wilson +Julien Roland +Kai Trukenmüller +Kendall Marcus +Kent English +Kevin C. 
Zimmerman +Kirill Motkov +Konstantin Shaposhnikov +Leonid Kneller +Lyron Winderbaum +Marco Leogrande +Mark Canning +Mark Skilbeck +Martin Diz +Matthew Connelly +Matthieu Di Mercurio +Max Halford +Maxim Sergeev +MinJae Kwon +Nathan Edwards +Nick Potts +Nils Wogatzky +Olivier Wulveryck +Or Rikon +Oskar Haarklou Veileborg +Patricio Whittingslow +Patrick DeVivo +Pontus Melke +Renee French +Rishi Desai +Robert Kleffner +Robin Eklind +Roger Welin +Roman Werpachowski +Rondall Jones +Sam Zaydel +Samuel Kelemen +Saran Ahluwalia +Scott Holden +Scott Kiesel +Sebastien Binet +Shawn Smith +Spencer Lyon +Steve McCoy +Taesu Pyo +Takeshi Yoneda +Tamir Hyman +Thomas Berg +Tobin Harding +Tom Payne +Tristan Nicholls +Valentin Deleplace +Vincent Thiery +Vladimír Chalupecký +Will Tekulve +Yasuhiro Matsumoto +Yevgeniy Vahlis +Yucheng Zhu +Yunomi +Zhan Shan Mao +Zoe Juozapaitis diff --git a/vendor/gonum.org/v1/gonum/LICENSE b/vendor/gonum.org/v1/gonum/LICENSE new file mode 100644 index 0000000000..ed477e59b5 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/LICENSE @@ -0,0 +1,23 @@ +Copyright ©2013 The Gonum Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the Gonum project nor the names of its authors and + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/vendor/gonum.org/v1/gonum/blas/README.md b/vendor/gonum.org/v1/gonum/blas/README.md new file mode 100644 index 0000000000..16d62bd355 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/README.md @@ -0,0 +1,51 @@ +# Gonum BLAS + +[![go.dev reference](https://pkg.go.dev/badge/gonum.org/v1/gonum/blas)](https://pkg.go.dev/gonum.org/v1/gonum/blas) +[![GoDoc](https://godocs.io/gonum.org/v1/gonum/blas?status.svg)](https://godocs.io/gonum.org/v1/gonum/blas) + +A collection of packages to provide BLAS functionality for the [Go programming +language](http://golang.org) + +## Installation +```sh + go get gonum.org/v1/gonum/blas/... +``` + +## Packages + +### blas + +Defines [BLAS API](http://www.netlib.org/blas/blast-forum/cinterface.pdf) split in several +interfaces. + +### blas/gonum + +Go implementation of the BLAS API (incomplete, implements the `float32` and `float64` API). 
+ +### blas/blas64 and blas/blas32 + +Wrappers for an implementation of the double (i.e., `float64`) and single (`float32`) +precision real parts of the BLAS API. + +```Go +package main + +import ( + "fmt" + + "gonum.org/v1/gonum/blas/blas64" +) + +func main() { + v := blas64.Vector{Inc: 1, Data: []float64{1, 1, 1}} + v.N = len(v.Data) + fmt.Println("v has length:", blas64.Nrm2(v)) +} +``` + +### blas/cblas128 and blas/cblas64 + +Wrappers for an implementation of the double (i.e., `complex128`) and single (`complex64`) +precision complex parts of the blas API. + +Currently blas/cblas64 and blas/cblas128 require gonum.org/v1/netlib/blas. diff --git a/vendor/gonum.org/v1/gonum/blas/blas.go b/vendor/gonum.org/v1/gonum/blas/blas.go new file mode 100644 index 0000000000..9b933e3fc5 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/blas.go @@ -0,0 +1,283 @@ +// Copyright ©2013 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:generate ./conversions.bash + +package blas + +// Flag constants indicate Givens transformation H matrix state. +type Flag int + +const ( + Identity Flag = -2 // H is the identity matrix; no rotation is needed. + Rescaling Flag = -1 // H specifies rescaling. + OffDiagonal Flag = 0 // Off-diagonal elements of H are non-unit. + Diagonal Flag = 1 // Diagonal elements of H are non-unit. +) + +// SrotmParams contains Givens transformation parameters returned +// by the Float32 Srotm method. +type SrotmParams struct { + Flag + H [4]float32 // Column-major 2 by 2 matrix. +} + +// DrotmParams contains Givens transformation parameters returned +// by the Float64 Drotm method. +type DrotmParams struct { + Flag + H [4]float64 // Column-major 2 by 2 matrix. +} + +// Transpose specifies the transposition operation of a matrix. +type Transpose byte + +const ( + NoTrans Transpose = 'N' + Trans Transpose = 'T' + ConjTrans Transpose = 'C' +) + +// Uplo specifies whether a matrix is upper or lower triangular. +type Uplo byte + +const ( + Upper Uplo = 'U' + Lower Uplo = 'L' + All Uplo = 'A' +) + +// Diag specifies whether a matrix is unit triangular. +type Diag byte + +const ( + NonUnit Diag = 'N' + Unit Diag = 'U' +) + +// Side specifies from which side a multiplication operation is performed. +type Side byte + +const ( + Left Side = 'L' + Right Side = 'R' +) + +// Float32 implements the single precision real BLAS routines. +type Float32 interface { + Float32Level1 + Float32Level2 + Float32Level3 +} + +// Float32Level1 implements the single precision real BLAS Level 1 routines. 
+type Float32Level1 interface { + Sdsdot(n int, alpha float32, x []float32, incX int, y []float32, incY int) float32 + Dsdot(n int, x []float32, incX int, y []float32, incY int) float64 + Sdot(n int, x []float32, incX int, y []float32, incY int) float32 + Snrm2(n int, x []float32, incX int) float32 + Sasum(n int, x []float32, incX int) float32 + Isamax(n int, x []float32, incX int) int + Sswap(n int, x []float32, incX int, y []float32, incY int) + Scopy(n int, x []float32, incX int, y []float32, incY int) + Saxpy(n int, alpha float32, x []float32, incX int, y []float32, incY int) + Srotg(a, b float32) (c, s, r, z float32) + Srotmg(d1, d2, b1, b2 float32) (p SrotmParams, rd1, rd2, rb1 float32) + Srot(n int, x []float32, incX int, y []float32, incY int, c, s float32) + Srotm(n int, x []float32, incX int, y []float32, incY int, p SrotmParams) + Sscal(n int, alpha float32, x []float32, incX int) +} + +// Float32Level2 implements the single precision real BLAS Level 2 routines. +type Float32Level2 interface { + Sgemv(tA Transpose, m, n int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int) + Sgbmv(tA Transpose, m, n, kL, kU int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int) + Strmv(ul Uplo, tA Transpose, d Diag, n int, a []float32, lda int, x []float32, incX int) + Stbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []float32, lda int, x []float32, incX int) + Stpmv(ul Uplo, tA Transpose, d Diag, n int, ap []float32, x []float32, incX int) + Strsv(ul Uplo, tA Transpose, d Diag, n int, a []float32, lda int, x []float32, incX int) + Stbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []float32, lda int, x []float32, incX int) + Stpsv(ul Uplo, tA Transpose, d Diag, n int, ap []float32, x []float32, incX int) + Ssymv(ul Uplo, n int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int) + Ssbmv(ul Uplo, n, k int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int) + Sspmv(ul Uplo, n int, alpha float32, ap []float32, x []float32, incX int, beta float32, y []float32, incY int) + Sger(m, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32, lda int) + Ssyr(ul Uplo, n int, alpha float32, x []float32, incX int, a []float32, lda int) + Sspr(ul Uplo, n int, alpha float32, x []float32, incX int, ap []float32) + Ssyr2(ul Uplo, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32, lda int) + Sspr2(ul Uplo, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32) +} + +// Float32Level3 implements the single precision real BLAS Level 3 routines. 
+type Float32Level3 interface { + Sgemm(tA, tB Transpose, m, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) + Ssymm(s Side, ul Uplo, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) + Ssyrk(ul Uplo, t Transpose, n, k int, alpha float32, a []float32, lda int, beta float32, c []float32, ldc int) + Ssyr2k(ul Uplo, t Transpose, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) + Strmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int) + Strsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int) +} + +// Float64 implements the single precision real BLAS routines. +type Float64 interface { + Float64Level1 + Float64Level2 + Float64Level3 +} + +// Float64Level1 implements the double precision real BLAS Level 1 routines. +type Float64Level1 interface { + Ddot(n int, x []float64, incX int, y []float64, incY int) float64 + Dnrm2(n int, x []float64, incX int) float64 + Dasum(n int, x []float64, incX int) float64 + Idamax(n int, x []float64, incX int) int + Dswap(n int, x []float64, incX int, y []float64, incY int) + Dcopy(n int, x []float64, incX int, y []float64, incY int) + Daxpy(n int, alpha float64, x []float64, incX int, y []float64, incY int) + Drotg(a, b float64) (c, s, r, z float64) + Drotmg(d1, d2, b1, b2 float64) (p DrotmParams, rd1, rd2, rb1 float64) + Drot(n int, x []float64, incX int, y []float64, incY int, c float64, s float64) + Drotm(n int, x []float64, incX int, y []float64, incY int, p DrotmParams) + Dscal(n int, alpha float64, x []float64, incX int) +} + +// Float64Level2 implements the double precision real BLAS Level 2 routines. 
+type Float64Level2 interface { + Dgemv(tA Transpose, m, n int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int) + Dgbmv(tA Transpose, m, n, kL, kU int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int) + Dtrmv(ul Uplo, tA Transpose, d Diag, n int, a []float64, lda int, x []float64, incX int) + Dtbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []float64, lda int, x []float64, incX int) + Dtpmv(ul Uplo, tA Transpose, d Diag, n int, ap []float64, x []float64, incX int) + Dtrsv(ul Uplo, tA Transpose, d Diag, n int, a []float64, lda int, x []float64, incX int) + Dtbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []float64, lda int, x []float64, incX int) + Dtpsv(ul Uplo, tA Transpose, d Diag, n int, ap []float64, x []float64, incX int) + Dsymv(ul Uplo, n int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int) + Dsbmv(ul Uplo, n, k int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int) + Dspmv(ul Uplo, n int, alpha float64, ap []float64, x []float64, incX int, beta float64, y []float64, incY int) + Dger(m, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64, lda int) + Dsyr(ul Uplo, n int, alpha float64, x []float64, incX int, a []float64, lda int) + Dspr(ul Uplo, n int, alpha float64, x []float64, incX int, ap []float64) + Dsyr2(ul Uplo, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64, lda int) + Dspr2(ul Uplo, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64) +} + +// Float64Level3 implements the double precision real BLAS Level 3 routines. +type Float64Level3 interface { + Dgemm(tA, tB Transpose, m, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) + Dsymm(s Side, ul Uplo, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) + Dsyrk(ul Uplo, t Transpose, n, k int, alpha float64, a []float64, lda int, beta float64, c []float64, ldc int) + Dsyr2k(ul Uplo, t Transpose, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) + Dtrmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int) + Dtrsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int) +} + +// Complex64 implements the single precision complex BLAS routines. +type Complex64 interface { + Complex64Level1 + Complex64Level2 + Complex64Level3 +} + +// Complex64Level1 implements the single precision complex BLAS Level 1 routines. +type Complex64Level1 interface { + Cdotu(n int, x []complex64, incX int, y []complex64, incY int) (dotu complex64) + Cdotc(n int, x []complex64, incX int, y []complex64, incY int) (dotc complex64) + Scnrm2(n int, x []complex64, incX int) float32 + Scasum(n int, x []complex64, incX int) float32 + Icamax(n int, x []complex64, incX int) int + Cswap(n int, x []complex64, incX int, y []complex64, incY int) + Ccopy(n int, x []complex64, incX int, y []complex64, incY int) + Caxpy(n int, alpha complex64, x []complex64, incX int, y []complex64, incY int) + Cscal(n int, alpha complex64, x []complex64, incX int) + Csscal(n int, alpha float32, x []complex64, incX int) +} + +// Complex64Level2 implements the single precision complex BLAS routines Level 2 routines. 
+type Complex64Level2 interface { + Cgemv(tA Transpose, m, n int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int) + Cgbmv(tA Transpose, m, n, kL, kU int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int) + Ctrmv(ul Uplo, tA Transpose, d Diag, n int, a []complex64, lda int, x []complex64, incX int) + Ctbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex64, lda int, x []complex64, incX int) + Ctpmv(ul Uplo, tA Transpose, d Diag, n int, ap []complex64, x []complex64, incX int) + Ctrsv(ul Uplo, tA Transpose, d Diag, n int, a []complex64, lda int, x []complex64, incX int) + Ctbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex64, lda int, x []complex64, incX int) + Ctpsv(ul Uplo, tA Transpose, d Diag, n int, ap []complex64, x []complex64, incX int) + Chemv(ul Uplo, n int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int) + Chbmv(ul Uplo, n, k int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int) + Chpmv(ul Uplo, n int, alpha complex64, ap []complex64, x []complex64, incX int, beta complex64, y []complex64, incY int) + Cgeru(m, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int) + Cgerc(m, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int) + Cher(ul Uplo, n int, alpha float32, x []complex64, incX int, a []complex64, lda int) + Chpr(ul Uplo, n int, alpha float32, x []complex64, incX int, a []complex64) + Cher2(ul Uplo, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int) + Chpr2(ul Uplo, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, ap []complex64) +} + +// Complex64Level3 implements the single precision complex BLAS Level 3 routines. +type Complex64Level3 interface { + Cgemm(tA, tB Transpose, m, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int) + Csymm(s Side, ul Uplo, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int) + Csyrk(ul Uplo, t Transpose, n, k int, alpha complex64, a []complex64, lda int, beta complex64, c []complex64, ldc int) + Csyr2k(ul Uplo, t Transpose, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int) + Ctrmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int) + Ctrsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int) + Chemm(s Side, ul Uplo, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int) + Cherk(ul Uplo, t Transpose, n, k int, alpha float32, a []complex64, lda int, beta float32, c []complex64, ldc int) + Cher2k(ul Uplo, t Transpose, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta float32, c []complex64, ldc int) +} + +// Complex128 implements the double precision complex BLAS routines. +type Complex128 interface { + Complex128Level1 + Complex128Level2 + Complex128Level3 +} + +// Complex128Level1 implements the double precision complex BLAS Level 1 routines. 
+type Complex128Level1 interface { + Zdotu(n int, x []complex128, incX int, y []complex128, incY int) (dotu complex128) + Zdotc(n int, x []complex128, incX int, y []complex128, incY int) (dotc complex128) + Dznrm2(n int, x []complex128, incX int) float64 + Dzasum(n int, x []complex128, incX int) float64 + Izamax(n int, x []complex128, incX int) int + Zswap(n int, x []complex128, incX int, y []complex128, incY int) + Zcopy(n int, x []complex128, incX int, y []complex128, incY int) + Zaxpy(n int, alpha complex128, x []complex128, incX int, y []complex128, incY int) + Zscal(n int, alpha complex128, x []complex128, incX int) + Zdscal(n int, alpha float64, x []complex128, incX int) +} + +// Complex128Level2 implements the double precision complex BLAS Level 2 routines. +type Complex128Level2 interface { + Zgemv(tA Transpose, m, n int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int) + Zgbmv(tA Transpose, m, n int, kL int, kU int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int) + Ztrmv(ul Uplo, tA Transpose, d Diag, n int, a []complex128, lda int, x []complex128, incX int) + Ztbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex128, lda int, x []complex128, incX int) + Ztpmv(ul Uplo, tA Transpose, d Diag, n int, ap []complex128, x []complex128, incX int) + Ztrsv(ul Uplo, tA Transpose, d Diag, n int, a []complex128, lda int, x []complex128, incX int) + Ztbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex128, lda int, x []complex128, incX int) + Ztpsv(ul Uplo, tA Transpose, d Diag, n int, ap []complex128, x []complex128, incX int) + Zhemv(ul Uplo, n int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int) + Zhbmv(ul Uplo, n, k int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int) + Zhpmv(ul Uplo, n int, alpha complex128, ap []complex128, x []complex128, incX int, beta complex128, y []complex128, incY int) + Zgeru(m, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int) + Zgerc(m, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int) + Zher(ul Uplo, n int, alpha float64, x []complex128, incX int, a []complex128, lda int) + Zhpr(ul Uplo, n int, alpha float64, x []complex128, incX int, a []complex128) + Zher2(ul Uplo, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int) + Zhpr2(ul Uplo, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, ap []complex128) +} + +// Complex128Level3 implements the double precision complex BLAS Level 3 routines. 
+type Complex128Level3 interface { + Zgemm(tA, tB Transpose, m, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int) + Zsymm(s Side, ul Uplo, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int) + Zsyrk(ul Uplo, t Transpose, n, k int, alpha complex128, a []complex128, lda int, beta complex128, c []complex128, ldc int) + Zsyr2k(ul Uplo, t Transpose, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int) + Ztrmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int) + Ztrsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int) + Zhemm(s Side, ul Uplo, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int) + Zherk(ul Uplo, t Transpose, n, k int, alpha float64, a []complex128, lda int, beta float64, c []complex128, ldc int) + Zher2k(ul Uplo, t Transpose, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta float64, c []complex128, ldc int) +} diff --git a/vendor/gonum.org/v1/gonum/blas/blas64/blas64.go b/vendor/gonum.org/v1/gonum/blas/blas64/blas64.go new file mode 100644 index 0000000000..64ac985c1c --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/blas64/blas64.go @@ -0,0 +1,533 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package blas64 + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/gonum" +) + +var blas64 blas.Float64 = gonum.Implementation{} + +// Use sets the BLAS float64 implementation to be used by subsequent BLAS calls. +// The default implementation is +// gonum.org/v1/gonum/blas/gonum.Implementation. +func Use(b blas.Float64) { + blas64 = b +} + +// Implementation returns the current BLAS float64 implementation. +// +// Implementation allows direct calls to the current BLAS float64 implementation +// giving finer control of parameters. +func Implementation() blas.Float64 { + return blas64 +} + +// Vector represents a vector with an associated element increment. +type Vector struct { + N int + Data []float64 + Inc int +} + +// General represents a matrix using the conventional storage scheme. +type General struct { + Rows, Cols int + Data []float64 + Stride int +} + +// Band represents a band matrix using the band storage scheme. +type Band struct { + Rows, Cols int + KL, KU int + Data []float64 + Stride int +} + +// Triangular represents a triangular matrix using the conventional storage scheme. +type Triangular struct { + Uplo blas.Uplo + Diag blas.Diag + N int + Data []float64 + Stride int +} + +// TriangularBand represents a triangular matrix using the band storage scheme. +type TriangularBand struct { + Uplo blas.Uplo + Diag blas.Diag + N, K int + Data []float64 + Stride int +} + +// TriangularPacked represents a triangular matrix using the packed storage scheme. +type TriangularPacked struct { + Uplo blas.Uplo + Diag blas.Diag + N int + Data []float64 +} + +// Symmetric represents a symmetric matrix using the conventional storage scheme. +type Symmetric struct { + Uplo blas.Uplo + N int + Data []float64 + Stride int +} + +// SymmetricBand represents a symmetric matrix using the band storage scheme. 
+type SymmetricBand struct {
+ Uplo blas.Uplo
+ N, K int
+ Data []float64
+ Stride int
+}
+
+// SymmetricPacked represents a symmetric matrix using the packed storage scheme.
+type SymmetricPacked struct {
+ Uplo blas.Uplo
+ N int
+ Data []float64
+}
+
+// Level 1
+
+const (
+ negInc = "blas64: negative vector increment"
+ badLength = "blas64: vector length mismatch"
+)
+
+// Dot computes the dot product of the two vectors:
+//
+// \sum_i x[i]*y[i].
+//
+// Dot will panic if the lengths of x and y do not match.
+func Dot(x, y Vector) float64 {
+ if x.N != y.N {
+ panic(badLength)
+ }
+ return blas64.Ddot(x.N, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Nrm2 computes the Euclidean norm of the vector x:
+//
+// sqrt(\sum_i x[i]*x[i]).
+//
+// Nrm2 will panic if the vector increment is negative.
+func Nrm2(x Vector) float64 {
+ if x.Inc < 0 {
+ panic(negInc)
+ }
+ return blas64.Dnrm2(x.N, x.Data, x.Inc)
+}
+
+// Asum computes the sum of the absolute values of the elements of x:
+//
+// \sum_i |x[i]|.
+//
+// Asum will panic if the vector increment is negative.
+func Asum(x Vector) float64 {
+ if x.Inc < 0 {
+ panic(negInc)
+ }
+ return blas64.Dasum(x.N, x.Data, x.Inc)
+}
+
+// Iamax returns the index of an element of x with the largest absolute value.
+// If there are multiple such indices the earliest is returned.
+// Iamax returns -1 if n == 0.
+//
+// Iamax will panic if the vector increment is negative.
+func Iamax(x Vector) int {
+ if x.Inc < 0 {
+ panic(negInc)
+ }
+ return blas64.Idamax(x.N, x.Data, x.Inc)
+}
+
+// Swap exchanges the elements of the two vectors:
+//
+// x[i], y[i] = y[i], x[i] for all i.
+//
+// Swap will panic if the lengths of x and y do not match.
+func Swap(x, y Vector) {
+ if x.N != y.N {
+ panic(badLength)
+ }
+ blas64.Dswap(x.N, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Copy copies the elements of x into the elements of y:
+//
+// y[i] = x[i] for all i.
+//
+// Copy will panic if the lengths of x and y do not match.
+func Copy(x, y Vector) {
+ if x.N != y.N {
+ panic(badLength)
+ }
+ blas64.Dcopy(x.N, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Axpy adds x scaled by alpha to y:
+//
+// y[i] += alpha*x[i] for all i.
+//
+// Axpy will panic if the lengths of x and y do not match.
+func Axpy(alpha float64, x, y Vector) {
+ if x.N != y.N {
+ panic(badLength)
+ }
+ blas64.Daxpy(x.N, alpha, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Rotg computes the parameters of a Givens plane rotation so that
+//
+// ⎡ c s⎤ ⎡a⎤ ⎡r⎤
+// ⎣-s c⎦ * ⎣b⎦ = ⎣0⎦
+//
+// where a and b are the Cartesian coordinates of a given point.
+// c, s, and r are defined as
+//
+// r = ±Sqrt(a^2 + b^2),
+// c = a/r, the cosine of the rotation angle,
+// s = b/r, the sine of the rotation angle,
+//
+// and z is defined such that
+//
+// if |a| > |b|, z = s,
+// otherwise if c != 0, z = 1/c,
+// otherwise z = 1.
+func Rotg(a, b float64) (c, s, r, z float64) {
+ return blas64.Drotg(a, b)
+}
+
+// Rotmg computes the modified Givens rotation. See
+// http://www.netlib.org/lapack/explore-html/df/deb/drotmg_8f.html
+// for more details.
+func Rotmg(d1, d2, b1, b2 float64) (p blas.DrotmParams, rd1, rd2, rb1 float64) {
+ return blas64.Drotmg(d1, d2, b1, b2)
+}
+
+// Rot applies a plane transformation to n points represented by the vectors x
+// and y:
+//
+// x[i] = c*x[i] + s*y[i],
+// y[i] = -s*x[i] + c*y[i], for all i.
+func Rot(x, y Vector, c, s float64) { + if x.N != y.N { + panic(badLength) + } + blas64.Drot(x.N, x.Data, x.Inc, y.Data, y.Inc, c, s) +} + +// Rotm applies the modified Givens rotation to n points represented by the +// vectors x and y. +func Rotm(x, y Vector, p blas.DrotmParams) { + if x.N != y.N { + panic(badLength) + } + blas64.Drotm(x.N, x.Data, x.Inc, y.Data, y.Inc, p) +} + +// Scal scales the vector x by alpha: +// +// x[i] *= alpha for all i. +// +// Scal will panic if the vector increment is negative. +func Scal(alpha float64, x Vector) { + if x.Inc < 0 { + panic(negInc) + } + blas64.Dscal(x.N, alpha, x.Data, x.Inc) +} + +// Level 2 + +// Gemv computes +// +// y = alpha * A * x + beta * y if t == blas.NoTrans, +// y = alpha * Aᵀ * x + beta * y if t == blas.Trans or blas.ConjTrans, +// +// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars. +func Gemv(t blas.Transpose, alpha float64, a General, x Vector, beta float64, y Vector) { + blas64.Dgemv(t, a.Rows, a.Cols, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc) +} + +// Gbmv computes +// +// y = alpha * A * x + beta * y if t == blas.NoTrans, +// y = alpha * Aᵀ * x + beta * y if t == blas.Trans or blas.ConjTrans, +// +// where A is an m×n band matrix, x and y are vectors, and alpha and beta are scalars. +func Gbmv(t blas.Transpose, alpha float64, a Band, x Vector, beta float64, y Vector) { + blas64.Dgbmv(t, a.Rows, a.Cols, a.KL, a.KU, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc) +} + +// Trmv computes +// +// x = A * x if t == blas.NoTrans, +// x = Aᵀ * x if t == blas.Trans or blas.ConjTrans, +// +// where A is an n×n triangular matrix, and x is a vector. +func Trmv(t blas.Transpose, a Triangular, x Vector) { + blas64.Dtrmv(a.Uplo, t, a.Diag, a.N, a.Data, a.Stride, x.Data, x.Inc) +} + +// Tbmv computes +// +// x = A * x if t == blas.NoTrans, +// x = Aᵀ * x if t == blas.Trans or blas.ConjTrans, +// +// where A is an n×n triangular band matrix, and x is a vector. +func Tbmv(t blas.Transpose, a TriangularBand, x Vector) { + blas64.Dtbmv(a.Uplo, t, a.Diag, a.N, a.K, a.Data, a.Stride, x.Data, x.Inc) +} + +// Tpmv computes +// +// x = A * x if t == blas.NoTrans, +// x = Aᵀ * x if t == blas.Trans or blas.ConjTrans, +// +// where A is an n×n triangular matrix in packed format, and x is a vector. +func Tpmv(t blas.Transpose, a TriangularPacked, x Vector) { + blas64.Dtpmv(a.Uplo, t, a.Diag, a.N, a.Data, x.Data, x.Inc) +} + +// Trsv solves +// +// A * x = b if t == blas.NoTrans, +// Aᵀ * x = b if t == blas.Trans or blas.ConjTrans, +// +// where A is an n×n triangular matrix, and x and b are vectors. +// +// At entry to the function, x contains the values of b, and the result is +// stored in-place into x. +// +// No test for singularity or near-singularity is included in this +// routine. Such tests must be performed before calling this routine. +func Trsv(t blas.Transpose, a Triangular, x Vector) { + blas64.Dtrsv(a.Uplo, t, a.Diag, a.N, a.Data, a.Stride, x.Data, x.Inc) +} + +// Tbsv solves +// +// A * x = b if t == blas.NoTrans, +// Aᵀ * x = b if t == blas.Trans or blas.ConjTrans, +// +// where A is an n×n triangular band matrix, and x and b are vectors. +// +// At entry to the function, x contains the values of b, and the result is +// stored in place into x. +// +// No test for singularity or near-singularity is included in this +// routine. Such tests must be performed before calling this routine. 
+func Tbsv(t blas.Transpose, a TriangularBand, x Vector) { + blas64.Dtbsv(a.Uplo, t, a.Diag, a.N, a.K, a.Data, a.Stride, x.Data, x.Inc) +} + +// Tpsv solves +// +// A * x = b if t == blas.NoTrans, +// Aᵀ * x = b if t == blas.Trans or blas.ConjTrans, +// +// where A is an n×n triangular matrix in packed format, and x and b are +// vectors. +// +// At entry to the function, x contains the values of b, and the result is +// stored in place into x. +// +// No test for singularity or near-singularity is included in this +// routine. Such tests must be performed before calling this routine. +func Tpsv(t blas.Transpose, a TriangularPacked, x Vector) { + blas64.Dtpsv(a.Uplo, t, a.Diag, a.N, a.Data, x.Data, x.Inc) +} + +// Symv computes +// +// y = alpha * A * x + beta * y, +// +// where A is an n×n symmetric matrix, x and y are vectors, and alpha and +// beta are scalars. +func Symv(alpha float64, a Symmetric, x Vector, beta float64, y Vector) { + blas64.Dsymv(a.Uplo, a.N, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc) +} + +// Sbmv performs +// +// y = alpha * A * x + beta * y, +// +// where A is an n×n symmetric band matrix, x and y are vectors, and alpha +// and beta are scalars. +func Sbmv(alpha float64, a SymmetricBand, x Vector, beta float64, y Vector) { + blas64.Dsbmv(a.Uplo, a.N, a.K, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc) +} + +// Spmv performs +// +// y = alpha * A * x + beta * y, +// +// where A is an n×n symmetric matrix in packed format, x and y are vectors, +// and alpha and beta are scalars. +func Spmv(alpha float64, a SymmetricPacked, x Vector, beta float64, y Vector) { + blas64.Dspmv(a.Uplo, a.N, alpha, a.Data, x.Data, x.Inc, beta, y.Data, y.Inc) +} + +// Ger performs a rank-1 update +// +// A += alpha * x * yᵀ, +// +// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar. +func Ger(alpha float64, x, y Vector, a General) { + blas64.Dger(a.Rows, a.Cols, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data, a.Stride) +} + +// Syr performs a rank-1 update +// +// A += alpha * x * xᵀ, +// +// where A is an n×n symmetric matrix, x is a vector, and alpha is a scalar. +func Syr(alpha float64, x Vector, a Symmetric) { + blas64.Dsyr(a.Uplo, a.N, alpha, x.Data, x.Inc, a.Data, a.Stride) +} + +// Spr performs the rank-1 update +// +// A += alpha * x * xᵀ, +// +// where A is an n×n symmetric matrix in packed format, x is a vector, and +// alpha is a scalar. +func Spr(alpha float64, x Vector, a SymmetricPacked) { + blas64.Dspr(a.Uplo, a.N, alpha, x.Data, x.Inc, a.Data) +} + +// Syr2 performs a rank-2 update +// +// A += alpha * x * yᵀ + alpha * y * xᵀ, +// +// where A is a symmetric n×n matrix, x and y are vectors, and alpha is a scalar. +func Syr2(alpha float64, x, y Vector, a Symmetric) { + blas64.Dsyr2(a.Uplo, a.N, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data, a.Stride) +} + +// Spr2 performs a rank-2 update +// +// A += alpha * x * yᵀ + alpha * y * xᵀ, +// +// where A is an n×n symmetric matrix in packed format, x and y are vectors, +// and alpha is a scalar. +func Spr2(alpha float64, x, y Vector, a SymmetricPacked) { + blas64.Dspr2(a.Uplo, a.N, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data) +} + +// Level 3 + +// Gemm computes +// +// C = alpha * A * B + beta * C, +// +// where A, B, and C are dense matrices, and alpha and beta are scalars. +// tA and tB specify whether A or B are transposed. 
+func Gemm(tA, tB blas.Transpose, alpha float64, a, b General, beta float64, c General) { + var m, n, k int + if tA == blas.NoTrans { + m, k = a.Rows, a.Cols + } else { + m, k = a.Cols, a.Rows + } + if tB == blas.NoTrans { + n = b.Cols + } else { + n = b.Rows + } + blas64.Dgemm(tA, tB, m, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride) +} + +// Symm performs +// +// C = alpha * A * B + beta * C if s == blas.Left, +// C = alpha * B * A + beta * C if s == blas.Right, +// +// where A is an n×n or m×m symmetric matrix, B and C are m×n matrices, and +// alpha is a scalar. +func Symm(s blas.Side, alpha float64, a Symmetric, b General, beta float64, c General) { + var m, n int + if s == blas.Left { + m, n = a.N, b.Cols + } else { + m, n = b.Rows, a.N + } + blas64.Dsymm(s, a.Uplo, m, n, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride) +} + +// Syrk performs a symmetric rank-k update +// +// C = alpha * A * Aᵀ + beta * C if t == blas.NoTrans, +// C = alpha * Aᵀ * A + beta * C if t == blas.Trans or blas.ConjTrans, +// +// where C is an n×n symmetric matrix, A is an n×k matrix if t == blas.NoTrans and +// a k×n matrix otherwise, and alpha and beta are scalars. +func Syrk(t blas.Transpose, alpha float64, a General, beta float64, c Symmetric) { + var n, k int + if t == blas.NoTrans { + n, k = a.Rows, a.Cols + } else { + n, k = a.Cols, a.Rows + } + blas64.Dsyrk(c.Uplo, t, n, k, alpha, a.Data, a.Stride, beta, c.Data, c.Stride) +} + +// Syr2k performs a symmetric rank-2k update +// +// C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C if t == blas.NoTrans, +// C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C if t == blas.Trans or blas.ConjTrans, +// +// where C is an n×n symmetric matrix, A and B are n×k matrices if t == NoTrans +// and k×n matrices otherwise, and alpha and beta are scalars. +func Syr2k(t blas.Transpose, alpha float64, a, b General, beta float64, c Symmetric) { + var n, k int + if t == blas.NoTrans { + n, k = a.Rows, a.Cols + } else { + n, k = a.Cols, a.Rows + } + blas64.Dsyr2k(c.Uplo, t, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride) +} + +// Trmm performs +// +// B = alpha * A * B if tA == blas.NoTrans and s == blas.Left, +// B = alpha * Aᵀ * B if tA == blas.Trans or blas.ConjTrans, and s == blas.Left, +// B = alpha * B * A if tA == blas.NoTrans and s == blas.Right, +// B = alpha * B * Aᵀ if tA == blas.Trans or blas.ConjTrans, and s == blas.Right, +// +// where A is an n×n or m×m triangular matrix, B is an m×n matrix, and alpha is +// a scalar. +func Trmm(s blas.Side, tA blas.Transpose, alpha float64, a Triangular, b General) { + blas64.Dtrmm(s, a.Uplo, tA, a.Diag, b.Rows, b.Cols, alpha, a.Data, a.Stride, b.Data, b.Stride) +} + +// Trsm solves +// +// A * X = alpha * B if tA == blas.NoTrans and s == blas.Left, +// Aᵀ * X = alpha * B if tA == blas.Trans or blas.ConjTrans, and s == blas.Left, +// X * A = alpha * B if tA == blas.NoTrans and s == blas.Right, +// X * Aᵀ = alpha * B if tA == blas.Trans or blas.ConjTrans, and s == blas.Right, +// +// where A is an n×n or m×m triangular matrix, X and B are m×n matrices, and +// alpha is a scalar. +// +// At entry to the function, X contains the values of B, and the result is +// stored in-place into X. +// +// No check is made that A is invertible. 
+func Trsm(s blas.Side, tA blas.Transpose, alpha float64, a Triangular, b General) { + blas64.Dtrsm(s, a.Uplo, tA, a.Diag, b.Rows, b.Cols, alpha, a.Data, a.Stride, b.Data, b.Stride) +} diff --git a/vendor/gonum.org/v1/gonum/blas/blas64/conv.go b/vendor/gonum.org/v1/gonum/blas/blas64/conv.go new file mode 100644 index 0000000000..695557d13a --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/blas64/conv.go @@ -0,0 +1,263 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package blas64 + +import "gonum.org/v1/gonum/blas" + +// GeneralCols represents a matrix using the conventional column-major storage scheme. +type GeneralCols General + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions as a and have adequate backing +// data storage. +func (t GeneralCols) From(a General) { + if t.Rows != a.Rows || t.Cols != a.Cols { + panic("blas64: mismatched dimension") + } + if len(t.Data) < (t.Cols-1)*t.Stride+t.Rows { + panic("blas64: short data slice") + } + for i := 0; i < a.Rows; i++ { + for j, v := range a.Data[i*a.Stride : i*a.Stride+a.Cols] { + t.Data[i+j*t.Stride] = v + } + } +} + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions as a and have adequate backing +// data storage. +func (t General) From(a GeneralCols) { + if t.Rows != a.Rows || t.Cols != a.Cols { + panic("blas64: mismatched dimension") + } + if len(t.Data) < (t.Rows-1)*t.Stride+t.Cols { + panic("blas64: short data slice") + } + for j := 0; j < a.Cols; j++ { + for i, v := range a.Data[j*a.Stride : j*a.Stride+a.Rows] { + t.Data[i*t.Stride+j] = v + } + } +} + +// TriangularCols represents a matrix using the conventional column-major storage scheme. +type TriangularCols Triangular + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions, uplo and diag as a and have +// adequate backing data storage. +func (t TriangularCols) From(a Triangular) { + if t.N != a.N { + panic("blas64: mismatched dimension") + } + if t.Uplo != a.Uplo { + panic("blas64: mismatched BLAS uplo") + } + if t.Diag != a.Diag { + panic("blas64: mismatched BLAS diag") + } + switch a.Uplo { + default: + panic("blas64: bad BLAS uplo") + case blas.Upper: + for i := 0; i < a.N; i++ { + for j := i; j < a.N; j++ { + t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j] + } + } + case blas.Lower: + for i := 0; i < a.N; i++ { + for j := 0; j <= i; j++ { + t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j] + } + } + case blas.All: + for i := 0; i < a.N; i++ { + for j := 0; j < a.N; j++ { + t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j] + } + } + } +} + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions, uplo and diag as a and have +// adequate backing data storage. 
+func (t Triangular) From(a TriangularCols) {
+ if t.N != a.N {
+ panic("blas64: mismatched dimension")
+ }
+ if t.Uplo != a.Uplo {
+ panic("blas64: mismatched BLAS uplo")
+ }
+ if t.Diag != a.Diag {
+ panic("blas64: mismatched BLAS diag")
+ }
+ switch a.Uplo {
+ default:
+ panic("blas64: bad BLAS uplo")
+ case blas.Upper:
+ for i := 0; i < a.N; i++ {
+ for j := i; j < a.N; j++ {
+ t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+ }
+ }
+ case blas.Lower:
+ for i := 0; i < a.N; i++ {
+ for j := 0; j <= i; j++ {
+ t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+ }
+ }
+ case blas.All:
+ for i := 0; i < a.N; i++ {
+ for j := 0; j < a.N; j++ {
+ t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+ }
+ }
+ }
+}
+
+// BandCols represents a matrix using the band column-major storage scheme.
+type BandCols Band
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions and bandwidth as a and have
+// adequate backing data storage.
+func (t BandCols) From(a Band) {
+ if t.Rows != a.Rows || t.Cols != a.Cols {
+ panic("blas64: mismatched dimension")
+ }
+ if t.KL != a.KL || t.KU != a.KU {
+ panic("blas64: mismatched bandwidth")
+ }
+ if a.Stride < a.KL+a.KU+1 {
+ panic("blas64: short stride for source")
+ }
+ if t.Stride < t.KL+t.KU+1 {
+ panic("blas64: short stride for destination")
+ }
+ for i := 0; i < a.Rows; i++ {
+ for j := max(0, i-a.KL); j < min(i+a.KU+1, a.Cols); j++ {
+ t.Data[i+t.KU-j+j*t.Stride] = a.Data[j+a.KL-i+i*a.Stride]
+ }
+ }
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions and bandwidth as a and have
+// adequate backing data storage.
+func (t Band) From(a BandCols) {
+ if t.Rows != a.Rows || t.Cols != a.Cols {
+ panic("blas64: mismatched dimension")
+ }
+ if t.KL != a.KL || t.KU != a.KU {
+ panic("blas64: mismatched bandwidth")
+ }
+ if a.Stride < a.KL+a.KU+1 {
+ panic("blas64: short stride for source")
+ }
+ if t.Stride < t.KL+t.KU+1 {
+ panic("blas64: short stride for destination")
+ }
+ for j := 0; j < a.Cols; j++ {
+ for i := max(0, j-a.KU); i < min(j+a.KL+1, a.Rows); i++ {
+ t.Data[j+t.KL-i+i*t.Stride] = a.Data[i+a.KU-j+j*a.Stride]
+ }
+ }
+}
+
+// TriangularBandCols represents a triangular matrix using the band column-major storage scheme.
+type TriangularBandCols TriangularBand
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t TriangularBandCols) From(a TriangularBand) {
+ if t.N != a.N {
+ panic("blas64: mismatched dimension")
+ }
+ if t.K != a.K {
+ panic("blas64: mismatched bandwidth")
+ }
+ if a.Stride < a.K+1 {
+ panic("blas64: short stride for source")
+ }
+ if t.Stride < t.K+1 {
+ panic("blas64: short stride for destination")
+ }
+ if t.Uplo != a.Uplo {
+ panic("blas64: mismatched BLAS uplo")
+ }
+ if t.Diag != a.Diag {
+ panic("blas64: mismatched BLAS diag")
+ }
+ dst := BandCols{
+ Rows: t.N, Cols: t.N,
+ Stride: t.Stride,
+ Data: t.Data,
+ }
+ src := Band{
+ Rows: a.N, Cols: a.N,
+ Stride: a.Stride,
+ Data: a.Data,
+ }
+ switch a.Uplo {
+ default:
+ panic("blas64: bad BLAS uplo")
+ case blas.Upper:
+ dst.KU = t.K
+ src.KU = a.K
+ case blas.Lower:
+ dst.KL = t.K
+ src.KL = a.K
+ }
+ dst.From(src)
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t TriangularBand) From(a TriangularBandCols) { + if t.N != a.N { + panic("blas64: mismatched dimension") + } + if t.K != a.K { + panic("blas64: mismatched bandwidth") + } + if a.Stride < a.K+1 { + panic("blas64: short stride for source") + } + if t.Stride < t.K+1 { + panic("blas64: short stride for destination") + } + if t.Uplo != a.Uplo { + panic("blas64: mismatched BLAS uplo") + } + if t.Diag != a.Diag { + panic("blas64: mismatched BLAS diag") + } + dst := Band{ + Rows: t.N, Cols: t.N, + Stride: t.Stride, + Data: t.Data, + } + src := BandCols{ + Rows: a.N, Cols: a.N, + Stride: a.Stride, + Data: a.Data, + } + switch a.Uplo { + default: + panic("blas64: bad BLAS uplo") + case blas.Upper: + dst.KU = t.K + src.KU = a.K + case blas.Lower: + dst.KL = t.K + src.KL = a.K + } + dst.From(src) +} diff --git a/vendor/gonum.org/v1/gonum/blas/blas64/conv_symmetric.go b/vendor/gonum.org/v1/gonum/blas/blas64/conv_symmetric.go new file mode 100644 index 0000000000..5146f1a1c3 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/blas64/conv_symmetric.go @@ -0,0 +1,153 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package blas64 + +import "gonum.org/v1/gonum/blas" + +// SymmetricCols represents a matrix using the conventional column-major storage scheme. +type SymmetricCols Symmetric + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions and uplo as a and have adequate +// backing data storage. +func (t SymmetricCols) From(a Symmetric) { + if t.N != a.N { + panic("blas64: mismatched dimension") + } + if t.Uplo != a.Uplo { + panic("blas64: mismatched BLAS uplo") + } + switch a.Uplo { + default: + panic("blas64: bad BLAS uplo") + case blas.Upper: + for i := 0; i < a.N; i++ { + for j := i; j < a.N; j++ { + t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j] + } + } + case blas.Lower: + for i := 0; i < a.N; i++ { + for j := 0; j <= i; j++ { + t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j] + } + } + } +} + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions and uplo as a and have adequate +// backing data storage. +func (t Symmetric) From(a SymmetricCols) { + if t.N != a.N { + panic("blas64: mismatched dimension") + } + if t.Uplo != a.Uplo { + panic("blas64: mismatched BLAS uplo") + } + switch a.Uplo { + default: + panic("blas64: bad BLAS uplo") + case blas.Upper: + for i := 0; i < a.N; i++ { + for j := i; j < a.N; j++ { + t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride] + } + } + case blas.Lower: + for i := 0; i < a.N; i++ { + for j := 0; j <= i; j++ { + t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride] + } + } + } +} + +// SymmetricBandCols represents a symmetric matrix using the band column-major storage scheme. +type SymmetricBandCols SymmetricBand + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions, bandwidth and uplo as a and +// have adequate backing data storage. 
+func (t SymmetricBandCols) From(a SymmetricBand) {
+ if t.N != a.N {
+ panic("blas64: mismatched dimension")
+ }
+ if t.K != a.K {
+ panic("blas64: mismatched bandwidth")
+ }
+ if a.Stride < a.K+1 {
+ panic("blas64: short stride for source")
+ }
+ if t.Stride < t.K+1 {
+ panic("blas64: short stride for destination")
+ }
+ if t.Uplo != a.Uplo {
+ panic("blas64: mismatched BLAS uplo")
+ }
+ dst := BandCols{
+ Rows: t.N, Cols: t.N,
+ Stride: t.Stride,
+ Data: t.Data,
+ }
+ src := Band{
+ Rows: a.N, Cols: a.N,
+ Stride: a.Stride,
+ Data: a.Data,
+ }
+ switch a.Uplo {
+ default:
+ panic("blas64: bad BLAS uplo")
+ case blas.Upper:
+ dst.KU = t.K
+ src.KU = a.K
+ case blas.Lower:
+ dst.KL = t.K
+ src.KL = a.K
+ }
+ dst.From(src)
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t SymmetricBand) From(a SymmetricBandCols) {
+ if t.N != a.N {
+ panic("blas64: mismatched dimension")
+ }
+ if t.K != a.K {
+ panic("blas64: mismatched bandwidth")
+ }
+ if a.Stride < a.K+1 {
+ panic("blas64: short stride for source")
+ }
+ if t.Stride < t.K+1 {
+ panic("blas64: short stride for destination")
+ }
+ if t.Uplo != a.Uplo {
+ panic("blas64: mismatched BLAS uplo")
+ }
+ dst := Band{
+ Rows: t.N, Cols: t.N,
+ Stride: t.Stride,
+ Data: t.Data,
+ }
+ src := BandCols{
+ Rows: a.N, Cols: a.N,
+ Stride: a.Stride,
+ Data: a.Data,
+ }
+ switch a.Uplo {
+ default:
+ panic("blas64: bad BLAS uplo")
+ case blas.Upper:
+ dst.KU = t.K
+ src.KU = a.K
+ case blas.Lower:
+ dst.KL = t.K
+ src.KL = a.K
+ }
+ dst.From(src)
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/blas64/doc.go b/vendor/gonum.org/v1/gonum/blas/blas64/doc.go
new file mode 100644
index 0000000000..7410cee486
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/blas64/doc.go
@@ -0,0 +1,6 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package blas64 provides a simple interface to the float64 BLAS API.
+package blas64 // import "gonum.org/v1/gonum/blas/blas64"
diff --git a/vendor/gonum.org/v1/gonum/blas/cblas128/cblas128.go b/vendor/gonum.org/v1/gonum/blas/cblas128/cblas128.go
new file mode 100644
index 0000000000..82a6f22e2b
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/cblas128/cblas128.go
@@ -0,0 +1,600 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cblas128
+
+import (
+ "gonum.org/v1/gonum/blas"
+ "gonum.org/v1/gonum/blas/gonum"
+)
+
+var cblas128 blas.Complex128 = gonum.Implementation{}
+
+// Use sets the BLAS complex128 implementation to be used by subsequent BLAS calls.
+// The default implementation is
+// gonum.org/v1/gonum/blas/gonum.Implementation.
+func Use(b blas.Complex128) {
+ cblas128 = b
+}
+
+// Implementation returns the current BLAS complex128 implementation.
+//
+// Implementation allows direct calls to the current BLAS complex128 implementation
+// giving finer control of parameters.
+func Implementation() blas.Complex128 {
+ return cblas128
+}
+
+// Vector represents a vector with an associated element increment.
+type Vector struct {
+ N int
+ Inc int
+ Data []complex128
+}
+
+// General represents a matrix using the conventional storage scheme.
+type General struct { + Rows, Cols int + Stride int + Data []complex128 +} + +// Band represents a band matrix using the band storage scheme. +type Band struct { + Rows, Cols int + KL, KU int + Stride int + Data []complex128 +} + +// Triangular represents a triangular matrix using the conventional storage scheme. +type Triangular struct { + N int + Stride int + Data []complex128 + Uplo blas.Uplo + Diag blas.Diag +} + +// TriangularBand represents a triangular matrix using the band storage scheme. +type TriangularBand struct { + N, K int + Stride int + Data []complex128 + Uplo blas.Uplo + Diag blas.Diag +} + +// TriangularPacked represents a triangular matrix using the packed storage scheme. +type TriangularPacked struct { + N int + Data []complex128 + Uplo blas.Uplo + Diag blas.Diag +} + +// Symmetric represents a symmetric matrix using the conventional storage scheme. +type Symmetric struct { + N int + Stride int + Data []complex128 + Uplo blas.Uplo +} + +// SymmetricBand represents a symmetric matrix using the band storage scheme. +type SymmetricBand struct { + N, K int + Stride int + Data []complex128 + Uplo blas.Uplo +} + +// SymmetricPacked represents a symmetric matrix using the packed storage scheme. +type SymmetricPacked struct { + N int + Data []complex128 + Uplo blas.Uplo +} + +// Hermitian represents an Hermitian matrix using the conventional storage scheme. +type Hermitian Symmetric + +// HermitianBand represents an Hermitian matrix using the band storage scheme. +type HermitianBand SymmetricBand + +// HermitianPacked represents an Hermitian matrix using the packed storage scheme. +type HermitianPacked SymmetricPacked + +// Level 1 + +const ( + negInc = "cblas128: negative vector increment" + badLength = "cblas128: vector length mismatch" +) + +// Dotu computes the dot product of the two vectors without +// complex conjugation: +// +// xᵀ * y. +// +// Dotu will panic if the lengths of x and y do not match. +func Dotu(x, y Vector) complex128 { + if x.N != y.N { + panic(badLength) + } + return cblas128.Zdotu(x.N, x.Data, x.Inc, y.Data, y.Inc) +} + +// Dotc computes the dot product of the two vectors with +// complex conjugation: +// +// xᴴ * y. +// +// Dotc will panic if the lengths of x and y do not match. +func Dotc(x, y Vector) complex128 { + if x.N != y.N { + panic(badLength) + } + return cblas128.Zdotc(x.N, x.Data, x.Inc, y.Data, y.Inc) +} + +// Nrm2 computes the Euclidean norm of the vector x: +// +// sqrt(\sum_i x[i] * x[i]). +// +// Nrm2 will panic if the vector increment is negative. +func Nrm2(x Vector) float64 { + if x.Inc < 0 { + panic(negInc) + } + return cblas128.Dznrm2(x.N, x.Data, x.Inc) +} + +// Asum computes the sum of magnitudes of the real and imaginary parts of +// elements of the vector x: +// +// \sum_i (|Re x[i]| + |Im x[i]|). +// +// Asum will panic if the vector increment is negative. +func Asum(x Vector) float64 { + if x.Inc < 0 { + panic(negInc) + } + return cblas128.Dzasum(x.N, x.Data, x.Inc) +} + +// Iamax returns the index of an element of x with the largest sum of +// magnitudes of the real and imaginary parts (|Re x[i]|+|Im x[i]|). +// If there are multiple such indices, the earliest is returned. +// +// Iamax returns -1 if n == 0. +// +// Iamax will panic if the vector increment is negative. +func Iamax(x Vector) int { + if x.Inc < 0 { + panic(negInc) + } + return cblas128.Izamax(x.N, x.Data, x.Inc) +} + +// Swap exchanges the elements of two vectors: +// +// x[i], y[i] = y[i], x[i] for all i. 
+// +// Swap will panic if the lengths of x and y do not match. +func Swap(x, y Vector) { + if x.N != y.N { + panic(badLength) + } + cblas128.Zswap(x.N, x.Data, x.Inc, y.Data, y.Inc) +} + +// Copy copies the elements of x into the elements of y: +// +// y[i] = x[i] for all i. +// +// Copy will panic if the lengths of x and y do not match. +func Copy(x, y Vector) { + if x.N != y.N { + panic(badLength) + } + cblas128.Zcopy(x.N, x.Data, x.Inc, y.Data, y.Inc) +} + +// Axpy computes +// +// y = alpha * x + y, +// +// where x and y are vectors, and alpha is a scalar. +// Axpy will panic if the lengths of x and y do not match. +func Axpy(alpha complex128, x, y Vector) { + if x.N != y.N { + panic(badLength) + } + cblas128.Zaxpy(x.N, alpha, x.Data, x.Inc, y.Data, y.Inc) +} + +// Scal computes +// +// x = alpha * x, +// +// where x is a vector, and alpha is a scalar. +// +// Scal will panic if the vector increment is negative. +func Scal(alpha complex128, x Vector) { + if x.Inc < 0 { + panic(negInc) + } + cblas128.Zscal(x.N, alpha, x.Data, x.Inc) +} + +// Dscal computes +// +// x = alpha * x, +// +// where x is a vector, and alpha is a real scalar. +// +// Dscal will panic if the vector increment is negative. +func Dscal(alpha float64, x Vector) { + if x.Inc < 0 { + panic(negInc) + } + cblas128.Zdscal(x.N, alpha, x.Data, x.Inc) +} + +// Level 2 + +// Gemv computes +// +// y = alpha * A * x + beta * y if t == blas.NoTrans, +// y = alpha * Aᵀ * x + beta * y if t == blas.Trans, +// y = alpha * Aᴴ * x + beta * y if t == blas.ConjTrans, +// +// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are +// scalars. +func Gemv(t blas.Transpose, alpha complex128, a General, x Vector, beta complex128, y Vector) { + cblas128.Zgemv(t, a.Rows, a.Cols, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc) +} + +// Gbmv computes +// +// y = alpha * A * x + beta * y if t == blas.NoTrans, +// y = alpha * Aᵀ * x + beta * y if t == blas.Trans, +// y = alpha * Aᴴ * x + beta * y if t == blas.ConjTrans, +// +// where A is an m×n band matrix, x and y are vectors, and alpha and beta are +// scalars. +func Gbmv(t blas.Transpose, alpha complex128, a Band, x Vector, beta complex128, y Vector) { + cblas128.Zgbmv(t, a.Rows, a.Cols, a.KL, a.KU, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc) +} + +// Trmv computes +// +// x = A * x if t == blas.NoTrans, +// x = Aᵀ * x if t == blas.Trans, +// x = Aᴴ * x if t == blas.ConjTrans, +// +// where A is an n×n triangular matrix, and x is a vector. +func Trmv(t blas.Transpose, a Triangular, x Vector) { + cblas128.Ztrmv(a.Uplo, t, a.Diag, a.N, a.Data, a.Stride, x.Data, x.Inc) +} + +// Tbmv computes +// +// x = A * x if t == blas.NoTrans, +// x = Aᵀ * x if t == blas.Trans, +// x = Aᴴ * x if t == blas.ConjTrans, +// +// where A is an n×n triangular band matrix, and x is a vector. +func Tbmv(t blas.Transpose, a TriangularBand, x Vector) { + cblas128.Ztbmv(a.Uplo, t, a.Diag, a.N, a.K, a.Data, a.Stride, x.Data, x.Inc) +} + +// Tpmv computes +// +// x = A * x if t == blas.NoTrans, +// x = Aᵀ * x if t == blas.Trans, +// x = Aᴴ * x if t == blas.ConjTrans, +// +// where A is an n×n triangular matrix in packed format, and x is a vector. +func Tpmv(t blas.Transpose, a TriangularPacked, x Vector) { + cblas128.Ztpmv(a.Uplo, t, a.Diag, a.N, a.Data, x.Data, x.Inc) +} + +// Trsv solves +// +// A * x = b if t == blas.NoTrans, +// Aᵀ * x = b if t == blas.Trans, +// Aᴴ * x = b if t == blas.ConjTrans, +// +// where A is an n×n triangular matrix and x is a vector. 
+//
+// At entry to the function, x contains the values of b, and the result is
+// stored in-place into x.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func Trsv(t blas.Transpose, a Triangular, x Vector) {
+ cblas128.Ztrsv(a.Uplo, t, a.Diag, a.N, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tbsv solves
+//
+// A * x = b if t == blas.NoTrans,
+// Aᵀ * x = b if t == blas.Trans,
+// Aᴴ * x = b if t == blas.ConjTrans,
+//
+// where A is an n×n triangular band matrix, and x is a vector.
+//
+// At entry to the function, x contains the values of b, and the result is
+// stored in-place into x.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func Tbsv(t blas.Transpose, a TriangularBand, x Vector) {
+ cblas128.Ztbsv(a.Uplo, t, a.Diag, a.N, a.K, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tpsv solves
+//
+// A * x = b if t == blas.NoTrans,
+// Aᵀ * x = b if t == blas.Trans,
+// Aᴴ * x = b if t == blas.ConjTrans,
+//
+// where A is an n×n triangular matrix in packed format and x is a vector.
+//
+// At entry to the function, x contains the values of b, and the result is
+// stored in-place into x.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func Tpsv(t blas.Transpose, a TriangularPacked, x Vector) {
+ cblas128.Ztpsv(a.Uplo, t, a.Diag, a.N, a.Data, x.Data, x.Inc)
+}
+
+// Hemv computes
+//
+// y = alpha * A * x + beta * y,
+//
+// where A is an n×n Hermitian matrix, x and y are vectors, and alpha and
+// beta are scalars.
+func Hemv(alpha complex128, a Hermitian, x Vector, beta complex128, y Vector) {
+ cblas128.Zhemv(a.Uplo, a.N, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Hbmv performs
+//
+// y = alpha * A * x + beta * y,
+//
+// where A is an n×n Hermitian band matrix, x and y are vectors, and alpha
+// and beta are scalars.
+func Hbmv(alpha complex128, a HermitianBand, x Vector, beta complex128, y Vector) {
+ cblas128.Zhbmv(a.Uplo, a.N, a.K, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Hpmv performs
+//
+// y = alpha * A * x + beta * y,
+//
+// where A is an n×n Hermitian matrix in packed format, x and y are vectors,
+// and alpha and beta are scalars.
+func Hpmv(alpha complex128, a HermitianPacked, x Vector, beta complex128, y Vector) {
+ cblas128.Zhpmv(a.Uplo, a.N, alpha, a.Data, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Geru performs a rank-1 update
+//
+// A += alpha * x * yᵀ,
+//
+// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar.
+func Geru(alpha complex128, x, y Vector, a General) {
+ cblas128.Zgeru(a.Rows, a.Cols, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data, a.Stride)
+}
+
+// Gerc performs a rank-1 update
+//
+// A += alpha * x * yᴴ,
+//
+// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar.
+func Gerc(alpha complex128, x, y Vector, a General) {
+ cblas128.Zgerc(a.Rows, a.Cols, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data, a.Stride)
+}
+
+// Her performs a rank-1 update
+//
+// A += alpha * x * xᴴ,
+//
+// where A is an n×n Hermitian matrix, x is a vector, and alpha is a scalar.
+func Her(alpha float64, x Vector, a Hermitian) { + cblas128.Zher(a.Uplo, a.N, alpha, x.Data, x.Inc, a.Data, a.Stride) +} + +// Hpr performs a rank-1 update +// +// A += alpha * x * xᴴ, +// +// where A is an n×n Hermitian matrix in packed format, x is a vector, and +// alpha is a scalar. +func Hpr(alpha float64, x Vector, a HermitianPacked) { + cblas128.Zhpr(a.Uplo, a.N, alpha, x.Data, x.Inc, a.Data) +} + +// Her2 performs a rank-2 update +// +// A += alpha * x * yᴴ + conj(alpha) * y * xᴴ, +// +// where A is an n×n Hermitian matrix, x and y are vectors, and alpha is a scalar. +func Her2(alpha complex128, x, y Vector, a Hermitian) { + cblas128.Zher2(a.Uplo, a.N, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data, a.Stride) +} + +// Hpr2 performs a rank-2 update +// +// A += alpha * x * yᴴ + conj(alpha) * y * xᴴ, +// +// where A is an n×n Hermitian matrix in packed format, x and y are vectors, +// and alpha is a scalar. +func Hpr2(alpha complex128, x, y Vector, a HermitianPacked) { + cblas128.Zhpr2(a.Uplo, a.N, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data) +} + +// Level 3 + +// Gemm computes +// +// C = alpha * A * B + beta * C, +// +// where A, B, and C are dense matrices, and alpha and beta are scalars. +// tA and tB specify whether A or B are transposed or conjugated. +func Gemm(tA, tB blas.Transpose, alpha complex128, a, b General, beta complex128, c General) { + var m, n, k int + if tA == blas.NoTrans { + m, k = a.Rows, a.Cols + } else { + m, k = a.Cols, a.Rows + } + if tB == blas.NoTrans { + n = b.Cols + } else { + n = b.Rows + } + cblas128.Zgemm(tA, tB, m, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride) +} + +// Symm performs +// +// C = alpha * A * B + beta * C if s == blas.Left, +// C = alpha * B * A + beta * C if s == blas.Right, +// +// where A is an n×n or m×m symmetric matrix, B and C are m×n matrices, and +// alpha and beta are scalars. +func Symm(s blas.Side, alpha complex128, a Symmetric, b General, beta complex128, c General) { + var m, n int + if s == blas.Left { + m, n = a.N, b.Cols + } else { + m, n = b.Rows, a.N + } + cblas128.Zsymm(s, a.Uplo, m, n, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride) +} + +// Syrk performs a symmetric rank-k update +// +// C = alpha * A * Aᵀ + beta * C if t == blas.NoTrans, +// C = alpha * Aᵀ * A + beta * C if t == blas.Trans, +// +// where C is an n×n symmetric matrix, A is an n×k matrix if t == blas.NoTrans +// and a k×n matrix otherwise, and alpha and beta are scalars. +func Syrk(t blas.Transpose, alpha complex128, a General, beta complex128, c Symmetric) { + var n, k int + if t == blas.NoTrans { + n, k = a.Rows, a.Cols + } else { + n, k = a.Cols, a.Rows + } + cblas128.Zsyrk(c.Uplo, t, n, k, alpha, a.Data, a.Stride, beta, c.Data, c.Stride) +} + +// Syr2k performs a symmetric rank-2k update +// +// C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C if t == blas.NoTrans, +// C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C if t == blas.Trans, +// +// where C is an n×n symmetric matrix, A and B are n×k matrices if +// t == blas.NoTrans and k×n otherwise, and alpha and beta are scalars. 
+func Syr2k(t blas.Transpose, alpha complex128, a, b General, beta complex128, c Symmetric) { + var n, k int + if t == blas.NoTrans { + n, k = a.Rows, a.Cols + } else { + n, k = a.Cols, a.Rows + } + cblas128.Zsyr2k(c.Uplo, t, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride) +} + +// Trmm performs +// +// B = alpha * A * B if tA == blas.NoTrans and s == blas.Left, +// B = alpha * Aᵀ * B if tA == blas.Trans and s == blas.Left, +// B = alpha * Aᴴ * B if tA == blas.ConjTrans and s == blas.Left, +// B = alpha * B * A if tA == blas.NoTrans and s == blas.Right, +// B = alpha * B * Aᵀ if tA == blas.Trans and s == blas.Right, +// B = alpha * B * Aᴴ if tA == blas.ConjTrans and s == blas.Right, +// +// where A is an n×n or m×m triangular matrix, B is an m×n matrix, and alpha is +// a scalar. +func Trmm(s blas.Side, tA blas.Transpose, alpha complex128, a Triangular, b General) { + cblas128.Ztrmm(s, a.Uplo, tA, a.Diag, b.Rows, b.Cols, alpha, a.Data, a.Stride, b.Data, b.Stride) +} + +// Trsm solves +// +// A * X = alpha * B if tA == blas.NoTrans and s == blas.Left, +// Aᵀ * X = alpha * B if tA == blas.Trans and s == blas.Left, +// Aᴴ * X = alpha * B if tA == blas.ConjTrans and s == blas.Left, +// X * A = alpha * B if tA == blas.NoTrans and s == blas.Right, +// X * Aᵀ = alpha * B if tA == blas.Trans and s == blas.Right, +// X * Aᴴ = alpha * B if tA == blas.ConjTrans and s == blas.Right, +// +// where A is an n×n or m×m triangular matrix, X and B are m×n matrices, and +// alpha is a scalar. +// +// At entry to the function, b contains the values of B, and the result is +// stored in-place into b. +// +// No check is made that A is invertible. +func Trsm(s blas.Side, tA blas.Transpose, alpha complex128, a Triangular, b General) { + cblas128.Ztrsm(s, a.Uplo, tA, a.Diag, b.Rows, b.Cols, alpha, a.Data, a.Stride, b.Data, b.Stride) +} + +// Hemm performs +// +// C = alpha * A * B + beta * C if s == blas.Left, +// C = alpha * B * A + beta * C if s == blas.Right, +// +// where A is an n×n or m×m Hermitian matrix, B and C are m×n matrices, and +// alpha and beta are scalars. +func Hemm(s blas.Side, alpha complex128, a Hermitian, b General, beta complex128, c General) { + var m, n int + if s == blas.Left { + m, n = a.N, b.Cols + } else { + m, n = b.Rows, a.N + } + cblas128.Zhemm(s, a.Uplo, m, n, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride) +} + +// Herk performs the Hermitian rank-k update +// +// C = alpha * A * Aᴴ + beta*C if t == blas.NoTrans, +// C = alpha * Aᴴ * A + beta*C if t == blas.ConjTrans, +// +// where C is an n×n Hermitian matrix, A is an n×k matrix if t == blas.NoTrans +// and a k×n matrix otherwise, and alpha and beta are scalars. +func Herk(t blas.Transpose, alpha float64, a General, beta float64, c Hermitian) { + var n, k int + if t == blas.NoTrans { + n, k = a.Rows, a.Cols + } else { + n, k = a.Cols, a.Rows + } + cblas128.Zherk(c.Uplo, t, n, k, alpha, a.Data, a.Stride, beta, c.Data, c.Stride) +} + +// Her2k performs the Hermitian rank-2k update +// +// C = alpha * A * Bᴴ + conj(alpha) * B * Aᴴ + beta * C if t == blas.NoTrans, +// C = alpha * Aᴴ * B + conj(alpha) * Bᴴ * A + beta * C if t == blas.ConjTrans, +// +// where C is an n×n Hermitian matrix, A and B are n×k matrices if t == NoTrans +// and k×n matrices otherwise, and alpha and beta are scalars. 
+func Her2k(t blas.Transpose, alpha complex128, a, b General, beta float64, c Hermitian) { + var n, k int + if t == blas.NoTrans { + n, k = a.Rows, a.Cols + } else { + n, k = a.Cols, a.Rows + } + cblas128.Zher2k(c.Uplo, t, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride) +} diff --git a/vendor/gonum.org/v1/gonum/blas/cblas128/conv.go b/vendor/gonum.org/v1/gonum/blas/cblas128/conv.go new file mode 100644 index 0000000000..bfafb96efc --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/cblas128/conv.go @@ -0,0 +1,265 @@ +// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT. + +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cblas128 + +import "gonum.org/v1/gonum/blas" + +// GeneralCols represents a matrix using the conventional column-major storage scheme. +type GeneralCols General + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions as a and have adequate backing +// data storage. +func (t GeneralCols) From(a General) { + if t.Rows != a.Rows || t.Cols != a.Cols { + panic("cblas128: mismatched dimension") + } + if len(t.Data) < (t.Cols-1)*t.Stride+t.Rows { + panic("cblas128: short data slice") + } + for i := 0; i < a.Rows; i++ { + for j, v := range a.Data[i*a.Stride : i*a.Stride+a.Cols] { + t.Data[i+j*t.Stride] = v + } + } +} + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions as a and have adequate backing +// data storage. +func (t General) From(a GeneralCols) { + if t.Rows != a.Rows || t.Cols != a.Cols { + panic("cblas128: mismatched dimension") + } + if len(t.Data) < (t.Rows-1)*t.Stride+t.Cols { + panic("cblas128: short data slice") + } + for j := 0; j < a.Cols; j++ { + for i, v := range a.Data[j*a.Stride : j*a.Stride+a.Rows] { + t.Data[i*t.Stride+j] = v + } + } +} + +// TriangularCols represents a matrix using the conventional column-major storage scheme. +type TriangularCols Triangular + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions, uplo and diag as a and have +// adequate backing data storage. +func (t TriangularCols) From(a Triangular) { + if t.N != a.N { + panic("cblas128: mismatched dimension") + } + if t.Uplo != a.Uplo { + panic("cblas128: mismatched BLAS uplo") + } + if t.Diag != a.Diag { + panic("cblas128: mismatched BLAS diag") + } + switch a.Uplo { + default: + panic("cblas128: bad BLAS uplo") + case blas.Upper: + for i := 0; i < a.N; i++ { + for j := i; j < a.N; j++ { + t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j] + } + } + case blas.Lower: + for i := 0; i < a.N; i++ { + for j := 0; j <= i; j++ { + t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j] + } + } + case blas.All: + for i := 0; i < a.N; i++ { + for j := 0; j < a.N; j++ { + t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j] + } + } + } +} + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions, uplo and diag as a and have +// adequate backing data storage. 
+func (t Triangular) From(a TriangularCols) {
+ if t.N != a.N {
+ panic("cblas128: mismatched dimension")
+ }
+ if t.Uplo != a.Uplo {
+ panic("cblas128: mismatched BLAS uplo")
+ }
+ if t.Diag != a.Diag {
+ panic("cblas128: mismatched BLAS diag")
+ }
+ switch a.Uplo {
+ default:
+ panic("cblas128: bad BLAS uplo")
+ case blas.Upper:
+ for i := 0; i < a.N; i++ {
+ for j := i; j < a.N; j++ {
+ t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+ }
+ }
+ case blas.Lower:
+ for i := 0; i < a.N; i++ {
+ for j := 0; j <= i; j++ {
+ t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+ }
+ }
+ case blas.All:
+ for i := 0; i < a.N; i++ {
+ for j := 0; j < a.N; j++ {
+ t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+ }
+ }
+ }
+}
+
+// BandCols represents a matrix using the band column-major storage scheme.
+type BandCols Band
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions and bandwidth as a and have
+// adequate backing data storage.
+func (t BandCols) From(a Band) {
+ if t.Rows != a.Rows || t.Cols != a.Cols {
+ panic("cblas128: mismatched dimension")
+ }
+ if t.KL != a.KL || t.KU != a.KU {
+ panic("cblas128: mismatched bandwidth")
+ }
+ if a.Stride < a.KL+a.KU+1 {
+ panic("cblas128: short stride for source")
+ }
+ if t.Stride < t.KL+t.KU+1 {
+ panic("cblas128: short stride for destination")
+ }
+ for i := 0; i < a.Rows; i++ {
+ for j := max(0, i-a.KL); j < min(i+a.KU+1, a.Cols); j++ {
+ t.Data[i+t.KU-j+j*t.Stride] = a.Data[j+a.KL-i+i*a.Stride]
+ }
+ }
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions and bandwidth as a and have
+// adequate backing data storage.
+func (t Band) From(a BandCols) {
+ if t.Rows != a.Rows || t.Cols != a.Cols {
+ panic("cblas128: mismatched dimension")
+ }
+ if t.KL != a.KL || t.KU != a.KU {
+ panic("cblas128: mismatched bandwidth")
+ }
+ if a.Stride < a.KL+a.KU+1 {
+ panic("cblas128: short stride for source")
+ }
+ if t.Stride < t.KL+t.KU+1 {
+ panic("cblas128: short stride for destination")
+ }
+ for j := 0; j < a.Cols; j++ {
+ for i := max(0, j-a.KU); i < min(j+a.KL+1, a.Rows); i++ {
+ t.Data[j+t.KL-i+i*t.Stride] = a.Data[i+a.KU-j+j*a.Stride]
+ }
+ }
+}
+
+// TriangularBandCols represents a triangular matrix using the band column-major storage scheme.
+type TriangularBandCols TriangularBand
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t TriangularBandCols) From(a TriangularBand) {
+ if t.N != a.N {
+ panic("cblas128: mismatched dimension")
+ }
+ if t.K != a.K {
+ panic("cblas128: mismatched bandwidth")
+ }
+ if a.Stride < a.K+1 {
+ panic("cblas128: short stride for source")
+ }
+ if t.Stride < t.K+1 {
+ panic("cblas128: short stride for destination")
+ }
+ if t.Uplo != a.Uplo {
+ panic("cblas128: mismatched BLAS uplo")
+ }
+ if t.Diag != a.Diag {
+ panic("cblas128: mismatched BLAS diag")
+ }
+ dst := BandCols{
+ Rows: t.N, Cols: t.N,
+ Stride: t.Stride,
+ Data: t.Data,
+ }
+ src := Band{
+ Rows: a.N, Cols: a.N,
+ Stride: a.Stride,
+ Data: a.Data,
+ }
+ switch a.Uplo {
+ default:
+ panic("cblas128: bad BLAS uplo")
+ case blas.Upper:
+ dst.KU = t.K
+ src.KU = a.K
+ case blas.Lower:
+ dst.KL = t.K
+ src.KL = a.K
+ }
+ dst.From(src)
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t TriangularBand) From(a TriangularBandCols) { + if t.N != a.N { + panic("cblas128: mismatched dimension") + } + if t.K != a.K { + panic("cblas128: mismatched bandwidth") + } + if a.Stride < a.K+1 { + panic("cblas128: short stride for source") + } + if t.Stride < t.K+1 { + panic("cblas128: short stride for destination") + } + if t.Uplo != a.Uplo { + panic("cblas128: mismatched BLAS uplo") + } + if t.Diag != a.Diag { + panic("cblas128: mismatched BLAS diag") + } + dst := Band{ + Rows: t.N, Cols: t.N, + Stride: t.Stride, + Data: t.Data, + } + src := BandCols{ + Rows: a.N, Cols: a.N, + Stride: a.Stride, + Data: a.Data, + } + switch a.Uplo { + default: + panic("cblas128: bad BLAS uplo") + case blas.Upper: + dst.KU = t.K + src.KU = a.K + case blas.Lower: + dst.KL = t.K + src.KL = a.K + } + dst.From(src) +} diff --git a/vendor/gonum.org/v1/gonum/blas/cblas128/conv_hermitian.go b/vendor/gonum.org/v1/gonum/blas/cblas128/conv_hermitian.go new file mode 100644 index 0000000000..51c3a5777b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/cblas128/conv_hermitian.go @@ -0,0 +1,155 @@ +// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT. + +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cblas128 + +import "gonum.org/v1/gonum/blas" + +// HermitianCols represents a matrix using the conventional column-major storage scheme. +type HermitianCols Hermitian + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions and uplo as a and have adequate +// backing data storage. +func (t HermitianCols) From(a Hermitian) { + if t.N != a.N { + panic("cblas128: mismatched dimension") + } + if t.Uplo != a.Uplo { + panic("cblas128: mismatched BLAS uplo") + } + switch a.Uplo { + default: + panic("cblas128: bad BLAS uplo") + case blas.Upper: + for i := 0; i < a.N; i++ { + for j := i; j < a.N; j++ { + t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j] + } + } + case blas.Lower: + for i := 0; i < a.N; i++ { + for j := 0; j <= i; j++ { + t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j] + } + } + } +} + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions and uplo as a and have adequate +// backing data storage. +func (t Hermitian) From(a HermitianCols) { + if t.N != a.N { + panic("cblas128: mismatched dimension") + } + if t.Uplo != a.Uplo { + panic("cblas128: mismatched BLAS uplo") + } + switch a.Uplo { + default: + panic("cblas128: bad BLAS uplo") + case blas.Upper: + for i := 0; i < a.N; i++ { + for j := i; j < a.N; j++ { + t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride] + } + } + case blas.Lower: + for i := 0; i < a.N; i++ { + for j := 0; j <= i; j++ { + t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride] + } + } + } +} + +// HermitianBandCols represents an Hermitian matrix using the band column-major storage scheme. +type HermitianBandCols HermitianBand + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions, bandwidth and uplo as a and +// have adequate backing data storage. 
+func (t HermitianBandCols) From(a HermitianBand) { + if t.N != a.N { + panic("cblas128: mismatched dimension") + } + if t.K != a.K { + panic("cblas128: mismatched bandwidth") + } + if a.Stride < a.K+1 { + panic("cblas128: short stride for source") + } + if t.Stride < t.K+1 { + panic("cblas128: short stride for destination") + } + if t.Uplo != a.Uplo { + panic("cblas128: mismatched BLAS uplo") + } + dst := BandCols{ + Rows: t.N, Cols: t.N, + Stride: t.Stride, + Data: t.Data, + } + src := Band{ + Rows: a.N, Cols: a.N, + Stride: a.Stride, + Data: a.Data, + } + switch a.Uplo { + default: + panic("cblas128: bad BLAS uplo") + case blas.Upper: + dst.KU = t.K + src.KU = a.K + case blas.Lower: + dst.KL = t.K + src.KL = a.K + } + dst.From(src) +} + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions, bandwidth and uplo as a and +// have adequate backing data storage. +func (t HermitianBand) From(a HermitianBandCols) { + if t.N != a.N { + panic("cblas128: mismatched dimension") + } + if t.K != a.K { + panic("cblas128: mismatched bandwidth") + } + if a.Stride < a.K+1 { + panic("cblas128: short stride for source") + } + if t.Stride < t.K+1 { + panic("cblas128: short stride for destination") + } + if t.Uplo != a.Uplo { + panic("cblas128: mismatched BLAS uplo") + } + dst := Band{ + Rows: t.N, Cols: t.N, + Stride: t.Stride, + Data: t.Data, + } + src := BandCols{ + Rows: a.N, Cols: a.N, + Stride: a.Stride, + Data: a.Data, + } + switch a.Uplo { + default: + panic("cblas128: bad BLAS uplo") + case blas.Upper: + dst.KU = t.K + src.KU = a.K + case blas.Lower: + dst.KL = t.K + src.KL = a.K + } + dst.From(src) +} diff --git a/vendor/gonum.org/v1/gonum/blas/cblas128/conv_symmetric.go b/vendor/gonum.org/v1/gonum/blas/cblas128/conv_symmetric.go new file mode 100644 index 0000000000..f1bf40c208 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/cblas128/conv_symmetric.go @@ -0,0 +1,155 @@ +// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT. + +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cblas128 + +import "gonum.org/v1/gonum/blas" + +// SymmetricCols represents a matrix using the conventional column-major storage scheme. +type SymmetricCols Symmetric + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions and uplo as a and have adequate +// backing data storage. +func (t SymmetricCols) From(a Symmetric) { + if t.N != a.N { + panic("cblas128: mismatched dimension") + } + if t.Uplo != a.Uplo { + panic("cblas128: mismatched BLAS uplo") + } + switch a.Uplo { + default: + panic("cblas128: bad BLAS uplo") + case blas.Upper: + for i := 0; i < a.N; i++ { + for j := i; j < a.N; j++ { + t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j] + } + } + case blas.Lower: + for i := 0; i < a.N; i++ { + for j := 0; j <= i; j++ { + t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j] + } + } + } +} + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions and uplo as a and have adequate +// backing data storage. 
+func (t Symmetric) From(a SymmetricCols) { + if t.N != a.N { + panic("cblas128: mismatched dimension") + } + if t.Uplo != a.Uplo { + panic("cblas128: mismatched BLAS uplo") + } + switch a.Uplo { + default: + panic("cblas128: bad BLAS uplo") + case blas.Upper: + for i := 0; i < a.N; i++ { + for j := i; j < a.N; j++ { + t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride] + } + } + case blas.Lower: + for i := 0; i < a.N; i++ { + for j := 0; j <= i; j++ { + t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride] + } + } + } +} + +// SymmetricBandCols represents a symmetric matrix using the band column-major storage scheme. +type SymmetricBandCols SymmetricBand + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions, bandwidth and uplo as a and +// have adequate backing data storage. +func (t SymmetricBandCols) From(a SymmetricBand) { + if t.N != a.N { + panic("cblas128: mismatched dimension") + } + if t.K != a.K { + panic("cblas128: mismatched bandwidth") + } + if a.Stride < a.K+1 { + panic("cblas128: short stride for source") + } + if t.Stride < t.K+1 { + panic("cblas128: short stride for destination") + } + if t.Uplo != a.Uplo { + panic("cblas128: mismatched BLAS uplo") + } + dst := BandCols{ + Rows: t.N, Cols: t.N, + Stride: t.Stride, + Data: t.Data, + } + src := Band{ + Rows: a.N, Cols: a.N, + Stride: a.Stride, + Data: a.Data, + } + switch a.Uplo { + default: + panic("cblas128: bad BLAS uplo") + case blas.Upper: + dst.KU = t.K + src.KU = a.K + case blas.Lower: + dst.KL = t.K + src.KL = a.K + } + dst.From(src) +} + +// From fills the receiver with elements from a. The receiver +// must have the same dimensions, bandwidth and uplo as a and +// have adequate backing data storage. +func (t SymmetricBand) From(a SymmetricBandCols) { + if t.N != a.N { + panic("cblas128: mismatched dimension") + } + if t.K != a.K { + panic("cblas128: mismatched bandwidth") + } + if a.Stride < a.K+1 { + panic("cblas128: short stride for source") + } + if t.Stride < t.K+1 { + panic("cblas128: short stride for destination") + } + if t.Uplo != a.Uplo { + panic("cblas128: mismatched BLAS uplo") + } + dst := Band{ + Rows: t.N, Cols: t.N, + Stride: t.Stride, + Data: t.Data, + } + src := BandCols{ + Rows: a.N, Cols: a.N, + Stride: a.Stride, + Data: a.Data, + } + switch a.Uplo { + default: + panic("cblas128: bad BLAS uplo") + case blas.Upper: + dst.KU = t.K + src.KU = a.K + case blas.Lower: + dst.KL = t.K + src.KL = a.K + } + dst.From(src) +} diff --git a/vendor/gonum.org/v1/gonum/blas/cblas128/doc.go b/vendor/gonum.org/v1/gonum/blas/cblas128/doc.go new file mode 100644 index 0000000000..09719b19e6 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/cblas128/doc.go @@ -0,0 +1,6 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package cblas128 provides a simple interface to the complex128 BLAS API. +package cblas128 // import "gonum.org/v1/gonum/blas/cblas128" diff --git a/vendor/gonum.org/v1/gonum/blas/conversions.bash b/vendor/gonum.org/v1/gonum/blas/conversions.bash new file mode 100644 index 0000000000..d1c0ef0d99 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/conversions.bash @@ -0,0 +1,159 @@ +#!/usr/bin/env bash + +# Copyright ©2017 The Gonum Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# Generate code for blas32. 
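+# Each sibling package is derived mechanically from the float64 sources:
+# gofmt -r applies a typed rewrite rule such as 'float64 -> float32' to
+# every expression, and sed renames the package and, where the tests
+# need it, swaps the math import for a type-appropriate replacement.
+# The Hermitian files are further derived from the symmetric sources by
+# renaming Symmetric/Sym to Hermitian/Herm.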
+echo Generating blas32/conv.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv.go +cat blas64/conv.go \ +| gofmt -r 'float64 -> float32' \ +\ +| sed -e 's/blas64/blas32/' \ +\ +>> blas32/conv.go + +echo Generating blas32/conv_test.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv_test.go +cat blas64/conv_test.go \ +| gofmt -r 'float64 -> float32' \ +\ +| sed -e 's/blas64/blas32/' \ + -e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \ +\ +>> blas32/conv_test.go + +echo Generating blas32/conv_symmetric.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv_symmetric.go +cat blas64/conv_symmetric.go \ +| gofmt -r 'float64 -> float32' \ +\ +| sed -e 's/blas64/blas32/' \ +\ +>> blas32/conv_symmetric.go + +echo Generating blas32/conv_symmetric_test.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv_symmetric_test.go +cat blas64/conv_symmetric_test.go \ +| gofmt -r 'float64 -> float32' \ +\ +| sed -e 's/blas64/blas32/' \ + -e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \ +\ +>> blas32/conv_symmetric_test.go + + +# Generate code for cblas128. +echo Generating cblas128/conv.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv.go +cat blas64/conv.go \ +| gofmt -r 'float64 -> complex128' \ +\ +| sed -e 's/blas64/cblas128/' \ +\ +>> cblas128/conv.go + +echo Generating cblas128/conv_test.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_test.go +cat blas64/conv_test.go \ +| gofmt -r 'float64 -> complex128' \ +\ +| sed -e 's/blas64/cblas128/' \ + -e 's_"math"_math "math/cmplx"_' \ +\ +>> cblas128/conv_test.go + +echo Generating cblas128/conv_symmetric.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_symmetric.go +cat blas64/conv_symmetric.go \ +| gofmt -r 'float64 -> complex128' \ +\ +| sed -e 's/blas64/cblas128/' \ +\ +>> cblas128/conv_symmetric.go + +echo Generating cblas128/conv_symmetric_test.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_symmetric_test.go +cat blas64/conv_symmetric_test.go \ +| gofmt -r 'float64 -> complex128' \ +\ +| sed -e 's/blas64/cblas128/' \ + -e 's_"math"_math "math/cmplx"_' \ +\ +>> cblas128/conv_symmetric_test.go + +echo Generating cblas128/conv_hermitian.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_hermitian.go +cat blas64/conv_symmetric.go \ +| gofmt -r 'float64 -> complex128' \ +\ +| sed -e 's/blas64/cblas128/' \ + -e 's/Symmetric/Hermitian/g' \ + -e 's/a symmetric/an Hermitian/g' \ + -e 's/symmetric/hermitian/g' \ + -e 's/Sym/Herm/g' \ +\ +>> cblas128/conv_hermitian.go + +echo Generating cblas128/conv_hermitian_test.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_hermitian_test.go +cat blas64/conv_symmetric_test.go \ +| gofmt -r 'float64 -> complex128' \ +\ +| sed -e 's/blas64/cblas128/' \ + -e 's/Symmetric/Hermitian/g' \ + -e 's/a symmetric/an Hermitian/g' \ + -e 's/symmetric/hermitian/g' \ + -e 's/Sym/Herm/g' \ + -e 's_"math"_math "math/cmplx"_' \ +\ +>> cblas128/conv_hermitian_test.go + + +# Generate code for cblas64. 
+echo Generating cblas64/conv.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv.go +cat blas64/conv.go \ +| gofmt -r 'float64 -> complex64' \ +\ +| sed -e 's/blas64/cblas64/' \ +\ +>> cblas64/conv.go + +echo Generating cblas64/conv_test.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv_test.go +cat blas64/conv_test.go \ +| gofmt -r 'float64 -> complex64' \ +\ +| sed -e 's/blas64/cblas64/' \ + -e 's_"math"_math "gonum.org/v1/gonum/internal/cmplx64"_' \ +\ +>> cblas64/conv_test.go + +echo Generating cblas64/conv_hermitian.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv_hermitian.go +cat blas64/conv_symmetric.go \ +| gofmt -r 'float64 -> complex64' \ +\ +| sed -e 's/blas64/cblas64/' \ + -e 's/Symmetric/Hermitian/g' \ + -e 's/a symmetric/an Hermitian/g' \ + -e 's/symmetric/hermitian/g' \ + -e 's/Sym/Herm/g' \ +\ +>> cblas64/conv_hermitian.go + +echo Generating cblas64/conv_hermitian_test.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv_hermitian_test.go +cat blas64/conv_symmetric_test.go \ +| gofmt -r 'float64 -> complex64' \ +\ +| sed -e 's/blas64/cblas64/' \ + -e 's/Symmetric/Hermitian/g' \ + -e 's/a symmetric/an Hermitian/g' \ + -e 's/symmetric/hermitian/g' \ + -e 's/Sym/Herm/g' \ + -e 's_"math"_math "gonum.org/v1/gonum/internal/cmplx64"_' \ +\ +>> cblas64/conv_hermitian_test.go diff --git a/vendor/gonum.org/v1/gonum/blas/doc.go b/vendor/gonum.org/v1/gonum/blas/doc.go new file mode 100644 index 0000000000..ea4b16c904 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/doc.go @@ -0,0 +1,108 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* +Package blas provides interfaces for the BLAS linear algebra standard. + +All methods must perform appropriate parameter checking and panic if +provided parameters that do not conform to the requirements specified +by the BLAS standard. + +Quick Reference Guide to the BLAS from http://www.netlib.org/lapack/lug/node145.html + +This version is modified to remove the "order" option. All matrix operations are +on row-order matrices. 
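+
+Each row of the tables below names one routine family: the leading
+underscore stands for the type prefix, so the prefixes "S, D, C, Z" on
+the _axpy row mean that the standard defines Saxpy, Daxpy, Caxpy and
+Zaxpy, all with the argument list shown. Reading that row, Daxpy(n,
+alpha, x, incX, y, incY) scales the strided vector x by alpha and adds
+it element-wise to y.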
+ +Level 1 BLAS + + dim scalar vector vector scalars 5-element prefixes + struct + + _rotg ( a, b ) S, D + _rotmg( d1, d2, a, b ) S, D + _rot ( n, x, incX, y, incY, c, s ) S, D + _rotm ( n, x, incX, y, incY, param ) S, D + _swap ( n, x, incX, y, incY ) S, D, C, Z + _scal ( n, alpha, x, incX ) S, D, C, Z, Cs, Zd + _copy ( n, x, incX, y, incY ) S, D, C, Z + _axpy ( n, alpha, x, incX, y, incY ) S, D, C, Z + _dot ( n, x, incX, y, incY ) S, D, Ds + _dotu ( n, x, incX, y, incY ) C, Z + _dotc ( n, x, incX, y, incY ) C, Z + __dot ( n, alpha, x, incX, y, incY ) Sds + _nrm2 ( n, x, incX ) S, D, Sc, Dz + _asum ( n, x, incX ) S, D, Sc, Dz + I_amax( n, x, incX ) s, d, c, z + +Level 2 BLAS + + options dim b-width scalar matrix vector scalar vector prefixes + + _gemv ( trans, m, n, alpha, a, lda, x, incX, beta, y, incY ) S, D, C, Z + _gbmv ( trans, m, n, kL, kU, alpha, a, lda, x, incX, beta, y, incY ) S, D, C, Z + _hemv ( uplo, n, alpha, a, lda, x, incX, beta, y, incY ) C, Z + _hbmv ( uplo, n, k, alpha, a, lda, x, incX, beta, y, incY ) C, Z + _hpmv ( uplo, n, alpha, ap, x, incX, beta, y, incY ) C, Z + _symv ( uplo, n, alpha, a, lda, x, incX, beta, y, incY ) S, D + _sbmv ( uplo, n, k, alpha, a, lda, x, incX, beta, y, incY ) S, D + _spmv ( uplo, n, alpha, ap, x, incX, beta, y, incY ) S, D + _trmv ( uplo, trans, diag, n, a, lda, x, incX ) S, D, C, Z + _tbmv ( uplo, trans, diag, n, k, a, lda, x, incX ) S, D, C, Z + _tpmv ( uplo, trans, diag, n, ap, x, incX ) S, D, C, Z + _trsv ( uplo, trans, diag, n, a, lda, x, incX ) S, D, C, Z + _tbsv ( uplo, trans, diag, n, k, a, lda, x, incX ) S, D, C, Z + _tpsv ( uplo, trans, diag, n, ap, x, incX ) S, D, C, Z + + options dim scalar vector vector matrix prefixes + + _ger ( m, n, alpha, x, incX, y, incY, a, lda ) S, D + _geru ( m, n, alpha, x, incX, y, incY, a, lda ) C, Z + _gerc ( m, n, alpha, x, incX, y, incY, a, lda ) C, Z + _her ( uplo, n, alpha, x, incX, a, lda ) C, Z + _hpr ( uplo, n, alpha, x, incX, ap ) C, Z + _her2 ( uplo, n, alpha, x, incX, y, incY, a, lda ) C, Z + _hpr2 ( uplo, n, alpha, x, incX, y, incY, ap ) C, Z + _syr ( uplo, n, alpha, x, incX, a, lda ) S, D + _spr ( uplo, n, alpha, x, incX, ap ) S, D + _syr2 ( uplo, n, alpha, x, incX, y, incY, a, lda ) S, D + _spr2 ( uplo, n, alpha, x, incX, y, incY, ap ) S, D + +Level 3 BLAS + + options dim scalar matrix matrix scalar matrix prefixes + + _gemm ( transA, transB, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ) S, D, C, Z + _symm ( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc ) S, D, C, Z + _hemm ( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc ) C, Z + _syrk ( uplo, trans, n, k, alpha, a, lda, beta, c, ldc ) S, D, C, Z + _herk ( uplo, trans, n, k, alpha, a, lda, beta, c, ldc ) C, Z + _syr2k( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc ) S, D, C, Z + _her2k( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc ) C, Z + _trmm ( side, uplo, transA, diag, m, n, alpha, a, lda, b, ldb ) S, D, C, Z + _trsm ( side, uplo, transA, diag, m, n, alpha, a, lda, b, ldb ) S, D, C, Z + +Meaning of prefixes + + S - float32 C - complex64 + D - float64 Z - complex128 + +Matrix types + + GE - GEneral GB - General Band + SY - SYmmetric SB - Symmetric Band SP - Symmetric Packed + HE - HErmitian HB - Hermitian Band HP - Hermitian Packed + TR - TRiangular TB - Triangular Band TP - Triangular Packed + +Options + + trans = NoTrans, Trans, ConjTrans + uplo = Upper, Lower + diag = Nonunit, Unit + side = Left, Right (A or op(A) on the left, or A or op(A) on the right) + +For real matrices, Trans and ConjTrans 
have the same meaning. +For Hermitian matrices, trans = Trans is not allowed. +For complex symmetric matrices, trans = ConjTrans is not allowed. +*/ +package blas // import "gonum.org/v1/gonum/blas" diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/dgemm.go b/vendor/gonum.org/v1/gonum/blas/gonum/dgemm.go new file mode 100644 index 0000000000..9e74cc1dbf --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/dgemm.go @@ -0,0 +1,297 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "runtime" + "sync" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/internal/asm/f64" +) + +// Dgemm performs one of the matrix-matrix operations +// +// C = alpha * A * B + beta * C +// C = alpha * Aᵀ * B + beta * C +// C = alpha * A * Bᵀ + beta * C +// C = alpha * Aᵀ * Bᵀ + beta * C +// +// where A is an m×k or k×m dense matrix, B is an n×k or k×n dense matrix, C is +// an m×n matrix, and alpha and beta are scalars. tA and tB specify whether A or +// B are transposed. +func (Implementation) Dgemm(tA, tB blas.Transpose, m, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) { + switch tA { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + switch tB { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + if k < 0 { + panic(kLT0) + } + aTrans := tA == blas.Trans || tA == blas.ConjTrans + if aTrans { + if lda < max(1, m) { + panic(badLdA) + } + } else { + if lda < max(1, k) { + panic(badLdA) + } + } + bTrans := tB == blas.Trans || tB == blas.ConjTrans + if bTrans { + if ldb < max(1, k) { + panic(badLdB) + } + } else { + if ldb < max(1, n) { + panic(badLdB) + } + } + if ldc < max(1, n) { + panic(badLdC) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if aTrans { + if len(a) < (k-1)*lda+m { + panic(shortA) + } + } else { + if len(a) < (m-1)*lda+k { + panic(shortA) + } + } + if bTrans { + if len(b) < (n-1)*ldb+k { + panic(shortB) + } + } else { + if len(b) < (k-1)*ldb+n { + panic(shortB) + } + } + if len(c) < (m-1)*ldc+n { + panic(shortC) + } + + // Quick return if possible. + if (alpha == 0 || k == 0) && beta == 1 { + return + } + + // scale c + if beta != 1 { + if beta == 0 { + for i := 0; i < m; i++ { + ctmp := c[i*ldc : i*ldc+n] + for j := range ctmp { + ctmp[j] = 0 + } + } + } else { + for i := 0; i < m; i++ { + ctmp := c[i*ldc : i*ldc+n] + for j := range ctmp { + ctmp[j] *= beta + } + } + } + } + + dgemmParallel(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha) +} + +func dgemmParallel(aTrans, bTrans bool, m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) { + // dgemmParallel computes a parallel matrix multiplication by partitioning + // a and b into sub-blocks, and updating c with the multiplication of the sub-block + // In all cases, + // A = [ A_11 A_12 ... A_1j + // A_21 A_22 ... A_2j + // ... + // A_i1 A_i2 ... A_ij] + // + // and same for B. All of the submatrix sizes are blockSize×blockSize except + // at the edges. + // + // In all cases, there is one dimension for each matrix along which + // C must be updated sequentially. 
+ // Cij = \sum_k Aik Bki, (A * B) + // Cij = \sum_k Aki Bkj, (Aᵀ * B) + // Cij = \sum_k Aik Bjk, (A * Bᵀ) + // Cij = \sum_k Aki Bjk, (Aᵀ * Bᵀ) + // + // This code computes one {i, j} block sequentially along the k dimension, + // and computes all of the {i, j} blocks concurrently. This + // partitioning allows Cij to be updated in-place without race-conditions. + // Instead of launching a goroutine for each possible concurrent computation, + // a number of worker goroutines are created and channels are used to pass + // available and completed cases. + // + // http://alexkr.com/docs/matrixmult.pdf is a good reference on matrix-matrix + // multiplies, though this code does not copy matrices to attempt to eliminate + // cache misses. + + maxKLen := k + parBlocks := blocks(m, blockSize) * blocks(n, blockSize) + if parBlocks < minParBlock { + // The matrix multiplication is small in the dimensions where it can be + // computed concurrently. Just do it in serial. + dgemmSerial(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha) + return + } + + // workerLimit acts a number of maximum concurrent workers, + // with the limit set to the number of procs available. + workerLimit := make(chan struct{}, runtime.GOMAXPROCS(0)) + + // wg is used to wait for all + var wg sync.WaitGroup + wg.Add(parBlocks) + defer wg.Wait() + + for i := 0; i < m; i += blockSize { + for j := 0; j < n; j += blockSize { + workerLimit <- struct{}{} + go func(i, j int) { + defer func() { + wg.Done() + <-workerLimit + }() + + leni := blockSize + if i+leni > m { + leni = m - i + } + lenj := blockSize + if j+lenj > n { + lenj = n - j + } + + cSub := sliceView64(c, ldc, i, j, leni, lenj) + + // Compute A_ik B_kj for all k + for k := 0; k < maxKLen; k += blockSize { + lenk := blockSize + if k+lenk > maxKLen { + lenk = maxKLen - k + } + var aSub, bSub []float64 + if aTrans { + aSub = sliceView64(a, lda, k, i, lenk, leni) + } else { + aSub = sliceView64(a, lda, i, k, leni, lenk) + } + if bTrans { + bSub = sliceView64(b, ldb, j, k, lenj, lenk) + } else { + bSub = sliceView64(b, ldb, k, j, lenk, lenj) + } + dgemmSerial(aTrans, bTrans, leni, lenj, lenk, aSub, lda, bSub, ldb, cSub, ldc, alpha) + } + }(i, j) + } + } +} + +// dgemmSerial is serial matrix multiply +func dgemmSerial(aTrans, bTrans bool, m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) { + switch { + case !aTrans && !bTrans: + dgemmSerialNotNot(m, n, k, a, lda, b, ldb, c, ldc, alpha) + return + case aTrans && !bTrans: + dgemmSerialTransNot(m, n, k, a, lda, b, ldb, c, ldc, alpha) + return + case !aTrans && bTrans: + dgemmSerialNotTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha) + return + case aTrans && bTrans: + dgemmSerialTransTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha) + return + default: + panic("unreachable") + } +} + +// dgemmSerial where neither a nor b are transposed +func dgemmSerialNotNot(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) { + // This style is used instead of the literal [i*stride +j]) is used because + // approximately 5 times faster as of go 1.3. 
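+ //
+ // Row i of C accumulates alpha*A(i,l) times row l of B for each l, so
+ // the innermost update is one fused scale-and-add (axpy) over a
+ // contiguous row slice, which the assembly kernels vectorize.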
+ for i := 0; i < m; i++ { + ctmp := c[i*ldc : i*ldc+n] + for l, v := range a[i*lda : i*lda+k] { + tmp := alpha * v + if tmp != 0 { + f64.AxpyUnitary(tmp, b[l*ldb:l*ldb+n], ctmp) + } + } + } +} + +// dgemmSerial where neither a is transposed and b is not +func dgemmSerialTransNot(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) { + // This style is used instead of the literal [i*stride +j]) is used because + // approximately 5 times faster as of go 1.3. + for l := 0; l < k; l++ { + btmp := b[l*ldb : l*ldb+n] + for i, v := range a[l*lda : l*lda+m] { + tmp := alpha * v + if tmp != 0 { + ctmp := c[i*ldc : i*ldc+n] + f64.AxpyUnitary(tmp, btmp, ctmp) + } + } + } +} + +// dgemmSerial where neither a is not transposed and b is +func dgemmSerialNotTrans(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) { + // This style is used instead of the literal [i*stride +j]) is used because + // approximately 5 times faster as of go 1.3. + for i := 0; i < m; i++ { + atmp := a[i*lda : i*lda+k] + ctmp := c[i*ldc : i*ldc+n] + for j := 0; j < n; j++ { + ctmp[j] += alpha * f64.DotUnitary(atmp, b[j*ldb:j*ldb+k]) + } + } +} + +// dgemmSerial where both are transposed +func dgemmSerialTransTrans(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) { + // This style is used instead of the literal [i*stride +j]) is used because + // approximately 5 times faster as of go 1.3. + for l := 0; l < k; l++ { + for i, v := range a[l*lda : l*lda+m] { + tmp := alpha * v + if tmp != 0 { + ctmp := c[i*ldc : i*ldc+n] + f64.AxpyInc(tmp, b[l:], ctmp, uintptr(n), uintptr(ldb), 1, 0, 0) + } + } + } +} + +func sliceView64(a []float64, lda, i, j, r, c int) []float64 { + return a[i*lda+j : (i+r-1)*lda+j+c] +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/doc.go b/vendor/gonum.org/v1/gonum/blas/gonum/doc.go new file mode 100644 index 0000000000..cbca601d90 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/doc.go @@ -0,0 +1,99 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Ensure changes made to blas/native are reflected in blas/cgo where relevant. + +/* +Package gonum is a Go implementation of the BLAS API. This implementation +panics when the input arguments are invalid as per the standard, for example +if a vector increment is zero. Note that the treatment of NaN values +is not specified, and differs among the BLAS implementations. +gonum.org/v1/gonum/blas/blas64 provides helpful wrapper functions to the BLAS +interface. The rest of this text describes the layout of the data for the input types. + +Note that in the function documentation, x[i] refers to the i^th element +of the vector, which will be different from the i^th element of the slice if +incX != 1. + +See http://www.netlib.org/lapack/explore-html/d4/de1/_l_i_c_e_n_s_e_source.html +for more license information. + +Vector arguments are effectively strided slices. They have two input arguments, +a number of elements, n, and an increment, incX. The increment specifies the +distance between elements of the vector. The actual Go slice may be longer +than necessary. +The increment may be positive or negative, except in functions with only +a single vector argument where the increment may only be positive. If the increment +is negative, s[0] is the last element in the slice. 
Note that this is not the same +as counting backward from the end of the slice, as len(s) may be longer than +necessary. So, for example, if n = 5 and incX = 3, the elements of s are + + [0 * * 1 * * 2 * * 3 * * 4 * * * ...] + +where ∗ elements are never accessed. If incX = -3, the same elements are +accessed, just in reverse order (4, 3, 2, 1, 0). + +Dense matrices are specified by a number of rows, a number of columns, and a stride. +The stride specifies the number of entries in the slice between the first element +of successive rows. The stride must be at least as large as the number of columns +but may be longer. + + [a00 ... a0n a0* ... a1stride-1 a21 ... amn am* ... amstride-1] + +Thus, dense[i*ld + j] refers to the {i, j}th element of the matrix. + +Symmetric and triangular matrices (non-packed) are stored identically to Dense, +except that only elements in one triangle of the matrix are accessed. + +Packed symmetric and packed triangular matrices are laid out with the entries +condensed such that all of the unreferenced elements are removed. So, the upper triangular +matrix + + [ + 1 2 3 + 0 4 5 + 0 0 6 + ] + +and the lower-triangular matrix + + [ + 1 0 0 + 2 3 0 + 4 5 6 + ] + +will both be compacted as [1 2 3 4 5 6]. The (i, j) element of the original +dense matrix can be found at element i*n - (i-1)*i/2 + j for upper triangular, +and at element i * (i+1) /2 + j for lower triangular. + +Banded matrices are laid out in a compact format, constructed by removing the +zeros in the rows and aligning the diagonals. For example, the matrix + + [ + 1 2 3 0 0 0 + 4 5 6 7 0 0 + 0 8 9 10 11 0 + 0 0 12 13 14 15 + 0 0 0 16 17 18 + 0 0 0 0 19 20 + ] + +implicitly becomes (∗ entries are never accessed) + + [ + * 1 2 3 + 4 5 6 7 + 8 9 10 11 + 12 13 14 15 + 16 17 18 * + 19 20 * * + ] + +which is given to the BLAS routine as [∗ 1 2 3 4 ...]. + +See http://www.crest.iu.edu/research/mtl/reference/html/banded.html +for more information +*/ +package gonum // import "gonum.org/v1/gonum/blas/gonum" diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/errors.go b/vendor/gonum.org/v1/gonum/blas/gonum/errors.go new file mode 100644 index 0000000000..e98575d0fa --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/errors.go @@ -0,0 +1,35 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +// Panic strings used during parameter checks. +// This list is duplicated in netlib/blas/netlib. Keep in sync. 
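+//
+// Parameter checks throughout the package panic with these strings, so
+// a caller that must not crash can translate the panic into an error
+// with recover; a minimal sketch (helper name hypothetical, assumes fmt
+// is imported):
+//
+//	func safeDnrm2(n int, x []float64, incX int) (v float64, err error) {
+//		defer func() {
+//			if r := recover(); r != nil {
+//				err = fmt.Errorf("blas: %v", r)
+//			}
+//		}()
+//		return Implementation{}.Dnrm2(n, x, incX), nil
+//	}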
+const ( + zeroIncX = "blas: zero x index increment" + zeroIncY = "blas: zero y index increment" + + mLT0 = "blas: m < 0" + nLT0 = "blas: n < 0" + kLT0 = "blas: k < 0" + kLLT0 = "blas: kL < 0" + kULT0 = "blas: kU < 0" + + badUplo = "blas: illegal triangle" + badTranspose = "blas: illegal transpose" + badDiag = "blas: illegal diagonal" + badSide = "blas: illegal side" + badFlag = "blas: illegal rotm flag" + + badLdA = "blas: bad leading dimension of A" + badLdB = "blas: bad leading dimension of B" + badLdC = "blas: bad leading dimension of C" + + shortX = "blas: insufficient length of x" + shortY = "blas: insufficient length of y" + shortAP = "blas: insufficient length of ap" + shortA = "blas: insufficient length of a" + shortB = "blas: insufficient length of b" + shortC = "blas: insufficient length of c" +) diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/gonum.go b/vendor/gonum.org/v1/gonum/blas/gonum/gonum.go new file mode 100644 index 0000000000..5a5c111012 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/gonum.go @@ -0,0 +1,38 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:generate ./single_precision.bash + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/internal/math32" +) + +type Implementation struct{} + +// [SD]gemm behavior constants. These are kept here to keep them out of the +// way during single precision code generation. +const ( + blockSize = 64 // b x b matrix + minParBlock = 4 // minimum number of blocks needed to go parallel +) + +// blocks returns the number of divisions of the dimension length with the given +// block size. +func blocks(dim, bsize int) int { + return (dim + bsize - 1) / bsize +} + +// dcabs1 returns |real(z)|+|imag(z)|. +func dcabs1(z complex128) float64 { + return math.Abs(real(z)) + math.Abs(imag(z)) +} + +// scabs1 returns |real(z)|+|imag(z)|. +func scabs1(z complex64) float32 { + return math32.Abs(real(z)) + math32.Abs(imag(z)) +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx128.go b/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx128.go new file mode 100644 index 0000000000..3e3af0db13 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx128.go @@ -0,0 +1,454 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/internal/asm/c128" +) + +var _ blas.Complex128Level1 = Implementation{} + +// Dzasum returns the sum of the absolute values of the elements of x +// +// \sum_i |Re(x[i])| + |Im(x[i])| +// +// Dzasum returns 0 if incX is negative. +func (Implementation) Dzasum(n int, x []complex128, incX int) float64 { + if n < 0 { + panic(nLT0) + } + if incX < 1 { + if incX == 0 { + panic(zeroIncX) + } + return 0 + } + var sum float64 + if incX == 1 { + if len(x) < n { + panic(shortX) + } + for _, v := range x[:n] { + sum += dcabs1(v) + } + return sum + } + if (n-1)*incX >= len(x) { + panic(shortX) + } + for i := 0; i < n; i++ { + v := x[i*incX] + sum += dcabs1(v) + } + return sum +} + +// Dznrm2 computes the Euclidean norm of the complex vector x, +// +// ‖x‖_2 = sqrt(\sum_i x[i] * conj(x[i])). +// +// This function returns 0 if incX is negative. 
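+//
+// To guard against overflow the sum of squares is kept in scaled form:
+// scale tracks the largest component magnitude seen so far, and ssq
+// maintains the invariant
+//
+//	scale^2 * ssq = \sum_i Re(x[i])^2 + Im(x[i])^2
+//
+// so no intermediate square can overflow or underflow while the final
+// norm is representable.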
+func (Implementation) Dznrm2(n int, x []complex128, incX int) float64 { + if incX < 1 { + if incX == 0 { + panic(zeroIncX) + } + return 0 + } + if n < 1 { + if n == 0 { + return 0 + } + panic(nLT0) + } + if (n-1)*incX >= len(x) { + panic(shortX) + } + var ( + scale float64 + ssq float64 = 1 + ) + if incX == 1 { + for _, v := range x[:n] { + re, im := math.Abs(real(v)), math.Abs(imag(v)) + if re != 0 { + if re > scale { + ssq = 1 + ssq*(scale/re)*(scale/re) + scale = re + } else { + ssq += (re / scale) * (re / scale) + } + } + if im != 0 { + if im > scale { + ssq = 1 + ssq*(scale/im)*(scale/im) + scale = im + } else { + ssq += (im / scale) * (im / scale) + } + } + } + if math.IsInf(scale, 1) { + return math.Inf(1) + } + return scale * math.Sqrt(ssq) + } + for ix := 0; ix < n*incX; ix += incX { + re, im := math.Abs(real(x[ix])), math.Abs(imag(x[ix])) + if re != 0 { + if re > scale { + ssq = 1 + ssq*(scale/re)*(scale/re) + scale = re + } else { + ssq += (re / scale) * (re / scale) + } + } + if im != 0 { + if im > scale { + ssq = 1 + ssq*(scale/im)*(scale/im) + scale = im + } else { + ssq += (im / scale) * (im / scale) + } + } + } + if math.IsInf(scale, 1) { + return math.Inf(1) + } + return scale * math.Sqrt(ssq) +} + +// Izamax returns the index of the first element of x having largest |Re(·)|+|Im(·)|. +// Izamax returns -1 if n is 0 or incX is negative. +func (Implementation) Izamax(n int, x []complex128, incX int) int { + if incX < 1 { + if incX == 0 { + panic(zeroIncX) + } + // Return invalid index. + return -1 + } + if n < 1 { + if n == 0 { + // Return invalid index. + return -1 + } + panic(nLT0) + } + if len(x) <= (n-1)*incX { + panic(shortX) + } + idx := 0 + max := dcabs1(x[0]) + if incX == 1 { + for i, v := range x[1:n] { + absV := dcabs1(v) + if absV > max { + max = absV + idx = i + 1 + } + } + return idx + } + ix := incX + for i := 1; i < n; i++ { + absV := dcabs1(x[ix]) + if absV > max { + max = absV + idx = i + } + ix += incX + } + return idx +} + +// Zaxpy adds alpha times x to y: +// +// y[i] += alpha * x[i] for all i +func (Implementation) Zaxpy(n int, alpha complex128, x []complex128, incX int, y []complex128, incY int) { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) { + panic(shortX) + } + if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) { + panic(shortY) + } + if alpha == 0 { + return + } + if incX == 1 && incY == 1 { + c128.AxpyUnitary(alpha, x[:n], y[:n]) + return + } + var ix, iy int + if incX < 0 { + ix = (1 - n) * incX + } + if incY < 0 { + iy = (1 - n) * incY + } + c128.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy)) +} + +// Zcopy copies the vector x to vector y. 
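+//
+// A negative increment walks the corresponding vector backwards, so,
+// for example (values arbitrary),
+//
+//	x := []complex128{1, 2, 3}
+//	y := make([]complex128, 3)
+//	Implementation{}.Zcopy(3, x, 1, y, -1)
+//	// y is now [3 2 1]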
+func (Implementation) Zcopy(n int, x []complex128, incX int, y []complex128, incY int) { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) { + panic(shortX) + } + if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) { + panic(shortY) + } + if incX == 1 && incY == 1 { + copy(y[:n], x[:n]) + return + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + for i := 0; i < n; i++ { + y[iy] = x[ix] + ix += incX + iy += incY + } +} + +// Zdotc computes the dot product +// +// xᴴ · y +// +// of two complex vectors x and y. +func (Implementation) Zdotc(n int, x []complex128, incX int, y []complex128, incY int) complex128 { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n <= 0 { + if n == 0 { + return 0 + } + panic(nLT0) + } + if incX == 1 && incY == 1 { + if len(x) < n { + panic(shortX) + } + if len(y) < n { + panic(shortY) + } + return c128.DotcUnitary(x[:n], y[:n]) + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + if ix >= len(x) || (n-1)*incX >= len(x) { + panic(shortX) + } + if iy >= len(y) || (n-1)*incY >= len(y) { + panic(shortY) + } + return c128.DotcInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy)) +} + +// Zdotu computes the dot product +// +// xᵀ · y +// +// of two complex vectors x and y. +func (Implementation) Zdotu(n int, x []complex128, incX int, y []complex128, incY int) complex128 { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n <= 0 { + if n == 0 { + return 0 + } + panic(nLT0) + } + if incX == 1 && incY == 1 { + if len(x) < n { + panic(shortX) + } + if len(y) < n { + panic(shortY) + } + return c128.DotuUnitary(x[:n], y[:n]) + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + if ix >= len(x) || (n-1)*incX >= len(x) { + panic(shortX) + } + if iy >= len(y) || (n-1)*incY >= len(y) { + panic(shortY) + } + return c128.DotuInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy)) +} + +// Zdscal scales the vector x by a real scalar alpha. +// Zdscal has no effect if incX < 0. +func (Implementation) Zdscal(n int, alpha float64, x []complex128, incX int) { + if incX < 1 { + if incX == 0 { + panic(zeroIncX) + } + return + } + if (n-1)*incX >= len(x) { + panic(shortX) + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if alpha == 0 { + if incX == 1 { + x = x[:n] + for i := range x { + x[i] = 0 + } + return + } + for ix := 0; ix < n*incX; ix += incX { + x[ix] = 0 + } + return + } + if incX == 1 { + x = x[:n] + for i, v := range x { + x[i] = complex(alpha*real(v), alpha*imag(v)) + } + return + } + for ix := 0; ix < n*incX; ix += incX { + v := x[ix] + x[ix] = complex(alpha*real(v), alpha*imag(v)) + } +} + +// Zscal scales the vector x by a complex scalar alpha. +// Zscal has no effect if incX < 0. 
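+//
+// For example (values arbitrary):
+//
+//	x := []complex128{1, 2, 3}
+//	Implementation{}.Zscal(len(x), 2i, x, 1)
+//	// x is now [2i 4i 6i]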
+func (Implementation) Zscal(n int, alpha complex128, x []complex128, incX int) { + if incX < 1 { + if incX == 0 { + panic(zeroIncX) + } + return + } + if (n-1)*incX >= len(x) { + panic(shortX) + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if alpha == 0 { + if incX == 1 { + x = x[:n] + for i := range x { + x[i] = 0 + } + return + } + for ix := 0; ix < n*incX; ix += incX { + x[ix] = 0 + } + return + } + if incX == 1 { + c128.ScalUnitary(alpha, x[:n]) + return + } + c128.ScalInc(alpha, x, uintptr(n), uintptr(incX)) +} + +// Zswap exchanges the elements of two complex vectors x and y. +func (Implementation) Zswap(n int, x []complex128, incX int, y []complex128, incY int) { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) { + panic(shortX) + } + if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) { + panic(shortY) + } + if incX == 1 && incY == 1 { + x = x[:n] + for i, v := range x { + x[i], y[i] = y[i], v + } + return + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + for i := 0; i < n; i++ { + x[ix], y[iy] = y[iy], x[ix] + ix += incX + iy += incY + } +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx64.go b/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx64.go new file mode 100644 index 0000000000..249335cada --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx64.go @@ -0,0 +1,476 @@ +// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT. + +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + math "gonum.org/v1/gonum/internal/math32" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/internal/asm/c64" +) + +var _ blas.Complex64Level1 = Implementation{} + +// Scasum returns the sum of the absolute values of the elements of x +// +// \sum_i |Re(x[i])| + |Im(x[i])| +// +// Scasum returns 0 if incX is negative. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Scasum(n int, x []complex64, incX int) float32 { + if n < 0 { + panic(nLT0) + } + if incX < 1 { + if incX == 0 { + panic(zeroIncX) + } + return 0 + } + var sum float32 + if incX == 1 { + if len(x) < n { + panic(shortX) + } + for _, v := range x[:n] { + sum += scabs1(v) + } + return sum + } + if (n-1)*incX >= len(x) { + panic(shortX) + } + for i := 0; i < n; i++ { + v := x[i*incX] + sum += scabs1(v) + } + return sum +} + +// Scnrm2 computes the Euclidean norm of the complex vector x, +// +// ‖x‖_2 = sqrt(\sum_i x[i] * conj(x[i])). +// +// This function returns 0 if incX is negative. +// +// Complex64 implementations are autogenerated and not directly tested. 
+func (Implementation) Scnrm2(n int, x []complex64, incX int) float32 { + if incX < 1 { + if incX == 0 { + panic(zeroIncX) + } + return 0 + } + if n < 1 { + if n == 0 { + return 0 + } + panic(nLT0) + } + if (n-1)*incX >= len(x) { + panic(shortX) + } + var ( + scale float32 + ssq float32 = 1 + ) + if incX == 1 { + for _, v := range x[:n] { + re, im := math.Abs(real(v)), math.Abs(imag(v)) + if re != 0 { + if re > scale { + ssq = 1 + ssq*(scale/re)*(scale/re) + scale = re + } else { + ssq += (re / scale) * (re / scale) + } + } + if im != 0 { + if im > scale { + ssq = 1 + ssq*(scale/im)*(scale/im) + scale = im + } else { + ssq += (im / scale) * (im / scale) + } + } + } + if math.IsInf(scale, 1) { + return math.Inf(1) + } + return scale * math.Sqrt(ssq) + } + for ix := 0; ix < n*incX; ix += incX { + re, im := math.Abs(real(x[ix])), math.Abs(imag(x[ix])) + if re != 0 { + if re > scale { + ssq = 1 + ssq*(scale/re)*(scale/re) + scale = re + } else { + ssq += (re / scale) * (re / scale) + } + } + if im != 0 { + if im > scale { + ssq = 1 + ssq*(scale/im)*(scale/im) + scale = im + } else { + ssq += (im / scale) * (im / scale) + } + } + } + if math.IsInf(scale, 1) { + return math.Inf(1) + } + return scale * math.Sqrt(ssq) +} + +// Icamax returns the index of the first element of x having largest |Re(·)|+|Im(·)|. +// Icamax returns -1 if n is 0 or incX is negative. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Icamax(n int, x []complex64, incX int) int { + if incX < 1 { + if incX == 0 { + panic(zeroIncX) + } + // Return invalid index. + return -1 + } + if n < 1 { + if n == 0 { + // Return invalid index. + return -1 + } + panic(nLT0) + } + if len(x) <= (n-1)*incX { + panic(shortX) + } + idx := 0 + max := scabs1(x[0]) + if incX == 1 { + for i, v := range x[1:n] { + absV := scabs1(v) + if absV > max { + max = absV + idx = i + 1 + } + } + return idx + } + ix := incX + for i := 1; i < n; i++ { + absV := scabs1(x[ix]) + if absV > max { + max = absV + idx = i + } + ix += incX + } + return idx +} + +// Caxpy adds alpha times x to y: +// +// y[i] += alpha * x[i] for all i +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Caxpy(n int, alpha complex64, x []complex64, incX int, y []complex64, incY int) { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) { + panic(shortX) + } + if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) { + panic(shortY) + } + if alpha == 0 { + return + } + if incX == 1 && incY == 1 { + c64.AxpyUnitary(alpha, x[:n], y[:n]) + return + } + var ix, iy int + if incX < 0 { + ix = (1 - n) * incX + } + if incY < 0 { + iy = (1 - n) * incY + } + c64.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy)) +} + +// Ccopy copies the vector x to vector y. +// +// Complex64 implementations are autogenerated and not directly tested. 
+func (Implementation) Ccopy(n int, x []complex64, incX int, y []complex64, incY int) { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) { + panic(shortX) + } + if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) { + panic(shortY) + } + if incX == 1 && incY == 1 { + copy(y[:n], x[:n]) + return + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + for i := 0; i < n; i++ { + y[iy] = x[ix] + ix += incX + iy += incY + } +} + +// Cdotc computes the dot product +// +// xᴴ · y +// +// of two complex vectors x and y. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Cdotc(n int, x []complex64, incX int, y []complex64, incY int) complex64 { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n <= 0 { + if n == 0 { + return 0 + } + panic(nLT0) + } + if incX == 1 && incY == 1 { + if len(x) < n { + panic(shortX) + } + if len(y) < n { + panic(shortY) + } + return c64.DotcUnitary(x[:n], y[:n]) + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + if ix >= len(x) || (n-1)*incX >= len(x) { + panic(shortX) + } + if iy >= len(y) || (n-1)*incY >= len(y) { + panic(shortY) + } + return c64.DotcInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy)) +} + +// Cdotu computes the dot product +// +// xᵀ · y +// +// of two complex vectors x and y. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Cdotu(n int, x []complex64, incX int, y []complex64, incY int) complex64 { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n <= 0 { + if n == 0 { + return 0 + } + panic(nLT0) + } + if incX == 1 && incY == 1 { + if len(x) < n { + panic(shortX) + } + if len(y) < n { + panic(shortY) + } + return c64.DotuUnitary(x[:n], y[:n]) + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + if ix >= len(x) || (n-1)*incX >= len(x) { + panic(shortX) + } + if iy >= len(y) || (n-1)*incY >= len(y) { + panic(shortY) + } + return c64.DotuInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy)) +} + +// Csscal scales the vector x by a real scalar alpha. +// Csscal has no effect if incX < 0. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Csscal(n int, alpha float32, x []complex64, incX int) { + if incX < 1 { + if incX == 0 { + panic(zeroIncX) + } + return + } + if (n-1)*incX >= len(x) { + panic(shortX) + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if alpha == 0 { + if incX == 1 { + x = x[:n] + for i := range x { + x[i] = 0 + } + return + } + for ix := 0; ix < n*incX; ix += incX { + x[ix] = 0 + } + return + } + if incX == 1 { + x = x[:n] + for i, v := range x { + x[i] = complex(alpha*real(v), alpha*imag(v)) + } + return + } + for ix := 0; ix < n*incX; ix += incX { + v := x[ix] + x[ix] = complex(alpha*real(v), alpha*imag(v)) + } +} + +// Cscal scales the vector x by a complex scalar alpha. +// Cscal has no effect if incX < 0. +// +// Complex64 implementations are autogenerated and not directly tested. 
+func (Implementation) Cscal(n int, alpha complex64, x []complex64, incX int) { + if incX < 1 { + if incX == 0 { + panic(zeroIncX) + } + return + } + if (n-1)*incX >= len(x) { + panic(shortX) + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if alpha == 0 { + if incX == 1 { + x = x[:n] + for i := range x { + x[i] = 0 + } + return + } + for ix := 0; ix < n*incX; ix += incX { + x[ix] = 0 + } + return + } + if incX == 1 { + c64.ScalUnitary(alpha, x[:n]) + return + } + c64.ScalInc(alpha, x, uintptr(n), uintptr(incX)) +} + +// Cswap exchanges the elements of two complex vectors x and y. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Cswap(n int, x []complex64, incX int, y []complex64, incY int) { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) { + panic(shortX) + } + if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) { + panic(shortY) + } + if incX == 1 && incY == 1 { + x = x[:n] + for i, v := range x { + x[i], y[i] = y[i], v + } + return + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + for i := 0; i < n; i++ { + x[ix], y[iy] = y[iy], x[ix] + ix += incX + iy += incY + } +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level1float32.go b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32.go new file mode 100644 index 0000000000..a90b88aceb --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32.go @@ -0,0 +1,653 @@ +// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT. + +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + math "gonum.org/v1/gonum/internal/math32" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/internal/asm/f32" +) + +var _ blas.Float32Level1 = Implementation{} + +// Snrm2 computes the Euclidean norm of a vector, +// +// sqrt(\sum_i x[i] * x[i]). +// +// This function returns 0 if incX is negative. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Snrm2(n int, x []float32, incX int) float32 { + if incX < 1 { + if incX == 0 { + panic(zeroIncX) + } + return 0 + } + if len(x) <= (n-1)*incX { + panic(shortX) + } + if n < 2 { + if n == 1 { + return math.Abs(x[0]) + } + if n == 0 { + return 0 + } + panic(nLT0) + } + if incX == 1 { + return f32.L2NormUnitary(x[:n]) + } + return f32.L2NormInc(x, uintptr(n), uintptr(incX)) +} + +// Sasum computes the sum of the absolute values of the elements of x. +// +// \sum_i |x[i]| +// +// Sasum returns 0 if incX is negative. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Sasum(n int, x []float32, incX int) float32 { + var sum float32 + if n < 0 { + panic(nLT0) + } + if incX < 1 { + if incX == 0 { + panic(zeroIncX) + } + return 0 + } + if len(x) <= (n-1)*incX { + panic(shortX) + } + if incX == 1 { + x = x[:n] + for _, v := range x { + sum += math.Abs(v) + } + return sum + } + for i := 0; i < n; i++ { + sum += math.Abs(x[i*incX]) + } + return sum +} + +// Isamax returns the index of an element of x with the largest absolute value. +// If there are multiple such indices the earliest is returned. +// Isamax returns -1 if n == 0. 
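+// For example, Isamax(4, []float32{1, -3, 3, 2}, 1) returns 1: |-3| and
+// |3| tie on absolute value, and the earlier index wins.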
+// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Isamax(n int, x []float32, incX int) int { + if incX < 1 { + if incX == 0 { + panic(zeroIncX) + } + return -1 + } + if len(x) <= (n-1)*incX { + panic(shortX) + } + if n < 2 { + if n == 1 { + return 0 + } + if n == 0 { + return -1 // Netlib returns invalid index when n == 0. + } + panic(nLT0) + } + idx := 0 + max := math.Abs(x[0]) + if incX == 1 { + for i, v := range x[:n] { + absV := math.Abs(v) + if absV > max { + max = absV + idx = i + } + } + return idx + } + ix := incX + for i := 1; i < n; i++ { + v := x[ix] + absV := math.Abs(v) + if absV > max { + max = absV + idx = i + } + ix += incX + } + return idx +} + +// Sswap exchanges the elements of two vectors. +// +// x[i], y[i] = y[i], x[i] for all i +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Sswap(n int, x []float32, incX int, y []float32, incY int) { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if incX == 1 && incY == 1 { + x = x[:n] + for i, v := range x { + x[i], y[i] = y[i], v + } + return + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + for i := 0; i < n; i++ { + x[ix], y[iy] = y[iy], x[ix] + ix += incX + iy += incY + } +} + +// Scopy copies the elements of x into the elements of y. +// +// y[i] = x[i] for all i +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Scopy(n int, x []float32, incX int, y []float32, incY int) { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if incX == 1 && incY == 1 { + copy(y[:n], x[:n]) + return + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + for i := 0; i < n; i++ { + y[iy] = x[ix] + ix += incX + iy += incY + } +} + +// Saxpy adds alpha times x to y +// +// y[i] += alpha * x[i] for all i +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Saxpy(n int, alpha float32, x []float32, incX int, y []float32, incY int) { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if alpha == 0 { + return + } + if incX == 1 && incY == 1 { + f32.AxpyUnitary(alpha, x[:n], y[:n]) + return + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + f32.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy)) +} + +// Srotg computes a plane rotation +// +// ⎡ c s ⎤ ⎡ a ⎤ = ⎡ r ⎤ +// ⎣ -s c ⎦ ⎣ b ⎦ ⎣ 0 ⎦ +// +// satisfying c^2 + s^2 = 1. 
+// +// The computation uses the formulas +// +// sigma = sgn(a) if |a| > |b| +// = sgn(b) if |b| >= |a| +// r = sigma*sqrt(a^2 + b^2) +// c = 1; s = 0 if r = 0 +// c = a/r; s = b/r if r != 0 +// c >= 0 if |a| > |b| +// +// The subroutine also computes +// +// z = s if |a| > |b|, +// = 1/c if |b| >= |a| and c != 0 +// = 1 if c = 0 +// +// This allows c and s to be reconstructed from z as follows: +// +// If z = 1, set c = 0, s = 1. +// If |z| < 1, set c = sqrt(1 - z^2) and s = z. +// If |z| > 1, set c = 1/z and s = sqrt(1 - c^2). +// +// NOTE: There is a discrepancy between the reference implementation and the +// BLAS technical manual regarding the sign for r when a or b are zero. Drotg +// agrees with the definition in the manual and other common BLAS +// implementations. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Srotg(a, b float32) (c, s, r, z float32) { + // Implementation based on Supplemental Material to: + // Edward Anderson. 2017. Algorithm 978: Safe Scaling in the Level 1 BLAS. + // ACM Trans. Math. Softw. 44, 1, Article 12 (July 2017), 28 pages. + // DOI: https://doi.org/10.1145/3061665 + const ( + safmin = 0x1p-126 + safmax = 1 / safmin + ) + anorm := math.Abs(a) + bnorm := math.Abs(b) + switch { + case bnorm == 0: + c = 1 + s = 0 + r = a + z = 0 + case anorm == 0: + c = 0 + s = 1 + r = b + z = 1 + default: + maxab := math.Max(anorm, bnorm) + scl := math.Min(math.Max(safmin, maxab), safmax) + var sigma float32 + if anorm > bnorm { + sigma = math.Copysign(1, a) + } else { + sigma = math.Copysign(1, b) + } + ascl := a / scl + bscl := b / scl + r = sigma * (scl * math.Sqrt(ascl*ascl+bscl*bscl)) + c = a / r + s = b / r + switch { + case anorm > bnorm: + z = s + case c != 0: + z = 1 / c + default: + z = 1 + } + } + return c, s, r, z +} + +// Srotmg computes the modified Givens rotation. See +// http://www.netlib.org/lapack/explore-html/df/deb/drotmg_8f.html +// for more details. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Srotmg(d1, d2, x1, y1 float32) (p blas.SrotmParams, rd1, rd2, rx1 float32) { + // The implementation of Drotmg used here is taken from Hopkins 1997 + // Appendix A: https://doi.org/10.1145/289251.289253 + // with the exception of the gam constants below. + + const ( + gam = 4096.0 + gamsq = gam * gam + rgamsq = 1.0 / gamsq + ) + + if d1 < 0 { + p.Flag = blas.Rescaling // Error state. + return p, 0, 0, 0 + } + + if d2 == 0 || y1 == 0 { + p.Flag = blas.Identity + return p, d1, d2, x1 + } + + var h11, h12, h21, h22 float32 + if (d1 == 0 || x1 == 0) && d2 > 0 { + p.Flag = blas.Diagonal + h12 = 1 + h21 = -1 + x1 = y1 + d1, d2 = d2, d1 + } else { + p2 := d2 * y1 + p1 := d1 * x1 + q2 := p2 * y1 + q1 := p1 * x1 + if math.Abs(q1) > math.Abs(q2) { + p.Flag = blas.OffDiagonal + h11 = 1 + h22 = 1 + h21 = -y1 / x1 + h12 = p2 / p1 + u := 1 - float32(h12*h21) + if u <= 0 { + p.Flag = blas.Rescaling // Error state. + return p, 0, 0, 0 + } + + d1 /= u + d2 /= u + x1 *= u + } else { + if q2 < 0 { + p.Flag = blas.Rescaling // Error state. 
+ return p, 0, 0, 0 + } + + p.Flag = blas.Diagonal + h21 = -1 + h12 = 1 + h11 = p1 / p2 + h22 = x1 / y1 + u := 1 + float32(h11*h22) + d1, d2 = d2/u, d1/u + x1 = y1 * u + } + } + + for d1 <= rgamsq && d1 != 0 { + p.Flag = blas.Rescaling + d1 = (d1 * gam) * gam + x1 /= gam + h11 /= gam + h12 /= gam + } + for d1 > gamsq { + p.Flag = blas.Rescaling + d1 = (d1 / gam) / gam + x1 *= gam + h11 *= gam + h12 *= gam + } + + for math.Abs(d2) <= rgamsq && d2 != 0 { + p.Flag = blas.Rescaling + d2 = (d2 * gam) * gam + h21 /= gam + h22 /= gam + } + for math.Abs(d2) > gamsq { + p.Flag = blas.Rescaling + d2 = (d2 / gam) / gam + h21 *= gam + h22 *= gam + } + + switch p.Flag { + case blas.Diagonal: + p.H = [4]float32{0: h11, 3: h22} + case blas.OffDiagonal: + p.H = [4]float32{1: h21, 2: h12} + case blas.Rescaling: + p.H = [4]float32{h11, h21, h12, h22} + default: + panic(badFlag) + } + + return p, d1, d2, x1 +} + +// Srot applies a plane transformation. +// +// x[i] = c * x[i] + s * y[i] +// y[i] = c * y[i] - s * x[i] +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Srot(n int, x []float32, incX int, y []float32, incY int, c float32, s float32) { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if incX == 1 && incY == 1 { + x = x[:n] + for i, vx := range x { + vy := y[i] + x[i], y[i] = c*vx+s*vy, c*vy-s*vx + } + return + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + for i := 0; i < n; i++ { + vx := x[ix] + vy := y[iy] + x[ix], y[iy] = c*vx+s*vy, c*vy-s*vx + ix += incX + iy += incY + } +} + +// Srotm applies the modified Givens rotation to the 2×n matrix. +// +// Float32 implementations are autogenerated and not directly tested. 
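+//
+// The rows of the 2×n matrix are x and y; each column (x[i], y[i]) is
+// multiplied by the 2×2 matrix H encoded in p, with the unit and ±1
+// entries implied by p.Flag filled in implicitly:
+//
+//	x[i] = h11*x[i] + h12*y[i]
+//	y[i] = h21*x[i] + h22*y[i]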
+func (Implementation) Srotm(n int, x []float32, incX int, y []float32, incY int, p blas.SrotmParams) { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n <= 0 { + if n == 0 { + return + } + panic(nLT0) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + + if p.Flag == blas.Identity { + return + } + + switch p.Flag { + case blas.Rescaling: + h11 := p.H[0] + h12 := p.H[2] + h21 := p.H[1] + h22 := p.H[3] + if incX == 1 && incY == 1 { + x = x[:n] + for i, vx := range x { + vy := y[i] + x[i], y[i] = float32(vx*h11)+float32(vy*h12), float32(vx*h21)+float32(vy*h22) + } + return + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + for i := 0; i < n; i++ { + vx := x[ix] + vy := y[iy] + x[ix], y[iy] = float32(vx*h11)+float32(vy*h12), float32(vx*h21)+float32(vy*h22) + ix += incX + iy += incY + } + case blas.OffDiagonal: + h12 := p.H[2] + h21 := p.H[1] + if incX == 1 && incY == 1 { + x = x[:n] + for i, vx := range x { + vy := y[i] + x[i], y[i] = vx+float32(vy*h12), float32(vx*h21)+vy + } + return + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + for i := 0; i < n; i++ { + vx := x[ix] + vy := y[iy] + x[ix], y[iy] = vx+float32(vy*h12), float32(vx*h21)+vy + ix += incX + iy += incY + } + case blas.Diagonal: + h11 := p.H[0] + h22 := p.H[3] + if incX == 1 && incY == 1 { + x = x[:n] + for i, vx := range x { + vy := y[i] + x[i], y[i] = float32(vx*h11)+vy, -vx+float32(vy*h22) + } + return + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + for i := 0; i < n; i++ { + vx := x[ix] + vy := y[iy] + x[ix], y[iy] = float32(vx*h11)+vy, -vx+float32(vy*h22) + ix += incX + iy += incY + } + } +} + +// Sscal scales x by alpha. +// +// x[i] *= alpha +// +// Sscal has no effect if incX < 0. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Sscal(n int, alpha float32, x []float32, incX int) { + if incX < 1 { + if incX == 0 { + panic(zeroIncX) + } + return + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if (n-1)*incX >= len(x) { + panic(shortX) + } + if alpha == 0 { + if incX == 1 { + x = x[:n] + for i := range x { + x[i] = 0 + } + return + } + for ix := 0; ix < n*incX; ix += incX { + x[ix] = 0 + } + return + } + if incX == 1 { + f32.ScalUnitary(alpha, x[:n]) + return + } + f32.ScalInc(alpha, x, uintptr(n), uintptr(incX)) +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_dsdot.go b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_dsdot.go new file mode 100644 index 0000000000..cd7df4110a --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_dsdot.go @@ -0,0 +1,54 @@ +// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT. + +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/internal/asm/f32" +) + +// Dsdot computes the dot product of the two vectors +// +// \sum_i x[i]*y[i] +// +// Float32 implementations are autogenerated and not directly tested. 
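+//
+// Although the inputs are float32 vectors, each product and the running
+// sum are accumulated in float64, so Dsdot loses less precision than
+// Sdot on long or ill-conditioned sums.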
+func (Implementation) Dsdot(n int, x []float32, incX int, y []float32, incY int) float64 { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n <= 0 { + if n == 0 { + return 0 + } + panic(nLT0) + } + if incX == 1 && incY == 1 { + if len(x) < n { + panic(shortX) + } + if len(y) < n { + panic(shortY) + } + return f32.DdotUnitary(x[:n], y[:n]) + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + if ix >= len(x) || ix+(n-1)*incX >= len(x) { + panic(shortX) + } + if iy >= len(y) || iy+(n-1)*incY >= len(y) { + panic(shortY) + } + return f32.DdotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy)) +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdot.go b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdot.go new file mode 100644 index 0000000000..c4cc166322 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdot.go @@ -0,0 +1,54 @@ +// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT. + +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/internal/asm/f32" +) + +// Sdot computes the dot product of the two vectors +// +// \sum_i x[i]*y[i] +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Sdot(n int, x []float32, incX int, y []float32, incY int) float32 { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n <= 0 { + if n == 0 { + return 0 + } + panic(nLT0) + } + if incX == 1 && incY == 1 { + if len(x) < n { + panic(shortX) + } + if len(y) < n { + panic(shortY) + } + return f32.DotUnitary(x[:n], y[:n]) + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + if ix >= len(x) || ix+(n-1)*incX >= len(x) { + panic(shortX) + } + if iy >= len(y) || iy+(n-1)*incY >= len(y) { + panic(shortY) + } + return f32.DotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy)) +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdsdot.go b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdsdot.go new file mode 100644 index 0000000000..eb6b73bd41 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdsdot.go @@ -0,0 +1,54 @@ +// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT. + +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/internal/asm/f32" +) + +// Sdsdot computes the dot product of the two vectors plus a constant +// +// alpha + \sum_i x[i]*y[i] +// +// Float32 implementations are autogenerated and not directly tested. 
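+// NOTE (editorial sketch; not part of the upstream gonum sources): Sdsdot
+// is the float32-returning sibling of Dsdot above: the dot product is
+// accumulated in float64, rounded once to float32, and alpha is then added
+// in float32. With x = {1, 2, 3}, y = {4, 5, 6} and alpha = 1,
+// Sdsdot(3, 1, x, 1, y, 1) returns 33.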
+func (Implementation) Sdsdot(n int, alpha float32, x []float32, incX int, y []float32, incY int) float32 { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n <= 0 { + if n == 0 { + return 0 + } + panic(nLT0) + } + if incX == 1 && incY == 1 { + if len(x) < n { + panic(shortX) + } + if len(y) < n { + panic(shortY) + } + return alpha + float32(f32.DdotUnitary(x[:n], y[:n])) + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + if ix >= len(x) || ix+(n-1)*incX >= len(x) { + panic(shortX) + } + if iy >= len(y) || iy+(n-1)*incY >= len(y) { + panic(shortY) + } + return alpha + float32(f32.DdotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))) +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level1float64.go b/vendor/gonum.org/v1/gonum/blas/gonum/level1float64.go new file mode 100644 index 0000000000..795769d966 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float64.go @@ -0,0 +1,629 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/internal/asm/f64" +) + +var _ blas.Float64Level1 = Implementation{} + +// Dnrm2 computes the Euclidean norm of a vector, +// +// sqrt(\sum_i x[i] * x[i]). +// +// This function returns 0 if incX is negative. +func (Implementation) Dnrm2(n int, x []float64, incX int) float64 { + if incX < 1 { + if incX == 0 { + panic(zeroIncX) + } + return 0 + } + if len(x) <= (n-1)*incX { + panic(shortX) + } + if n < 2 { + if n == 1 { + return math.Abs(x[0]) + } + if n == 0 { + return 0 + } + panic(nLT0) + } + if incX == 1 { + return f64.L2NormUnitary(x[:n]) + } + return f64.L2NormInc(x, uintptr(n), uintptr(incX)) +} + +// Dasum computes the sum of the absolute values of the elements of x. +// +// \sum_i |x[i]| +// +// Dasum returns 0 if incX is negative. +func (Implementation) Dasum(n int, x []float64, incX int) float64 { + var sum float64 + if n < 0 { + panic(nLT0) + } + if incX < 1 { + if incX == 0 { + panic(zeroIncX) + } + return 0 + } + if len(x) <= (n-1)*incX { + panic(shortX) + } + if incX == 1 { + x = x[:n] + for _, v := range x { + sum += math.Abs(v) + } + return sum + } + for i := 0; i < n; i++ { + sum += math.Abs(x[i*incX]) + } + return sum +} + +// Idamax returns the index of an element of x with the largest absolute value. +// If there are multiple such indices the earliest is returned. +// Idamax returns -1 if n == 0. +func (Implementation) Idamax(n int, x []float64, incX int) int { + if incX < 1 { + if incX == 0 { + panic(zeroIncX) + } + return -1 + } + if len(x) <= (n-1)*incX { + panic(shortX) + } + if n < 2 { + if n == 1 { + return 0 + } + if n == 0 { + return -1 // Netlib returns invalid index when n == 0. + } + panic(nLT0) + } + idx := 0 + max := math.Abs(x[0]) + if incX == 1 { + for i, v := range x[:n] { + absV := math.Abs(v) + if absV > max { + max = absV + idx = i + } + } + return idx + } + ix := incX + for i := 1; i < n; i++ { + v := x[ix] + absV := math.Abs(v) + if absV > max { + max = absV + idx = i + } + ix += incX + } + return idx +} + +// Dswap exchanges the elements of two vectors. 
+// +// x[i], y[i] = y[i], x[i] for all i +func (Implementation) Dswap(n int, x []float64, incX int, y []float64, incY int) { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if incX == 1 && incY == 1 { + x = x[:n] + for i, v := range x { + x[i], y[i] = y[i], v + } + return + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + for i := 0; i < n; i++ { + x[ix], y[iy] = y[iy], x[ix] + ix += incX + iy += incY + } +} + +// Dcopy copies the elements of x into the elements of y. +// +// y[i] = x[i] for all i +func (Implementation) Dcopy(n int, x []float64, incX int, y []float64, incY int) { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if incX == 1 && incY == 1 { + copy(y[:n], x[:n]) + return + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + for i := 0; i < n; i++ { + y[iy] = x[ix] + ix += incX + iy += incY + } +} + +// Daxpy adds alpha times x to y +// +// y[i] += alpha * x[i] for all i +func (Implementation) Daxpy(n int, alpha float64, x []float64, incX int, y []float64, incY int) { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if alpha == 0 { + return + } + if incX == 1 && incY == 1 { + f64.AxpyUnitary(alpha, x[:n], y[:n]) + return + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + f64.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy)) +} + +// Drotg computes a plane rotation +// +// ⎡ c s ⎤ ⎡ a ⎤ = ⎡ r ⎤ +// ⎣ -s c ⎦ ⎣ b ⎦ ⎣ 0 ⎦ +// +// satisfying c^2 + s^2 = 1. +// +// The computation uses the formulas +// +// sigma = sgn(a) if |a| > |b| +// = sgn(b) if |b| >= |a| +// r = sigma*sqrt(a^2 + b^2) +// c = 1; s = 0 if r = 0 +// c = a/r; s = b/r if r != 0 +// c >= 0 if |a| > |b| +// +// The subroutine also computes +// +// z = s if |a| > |b|, +// = 1/c if |b| >= |a| and c != 0 +// = 1 if c = 0 +// +// This allows c and s to be reconstructed from z as follows: +// +// If z = 1, set c = 0, s = 1. +// If |z| < 1, set c = sqrt(1 - z^2) and s = z. +// If |z| > 1, set c = 1/z and s = sqrt(1 - c^2). +// +// NOTE: There is a discrepancy between the reference implementation and the +// BLAS technical manual regarding the sign for r when a or b are zero. Drotg +// agrees with the definition in the manual and other common BLAS +// implementations. +func (Implementation) Drotg(a, b float64) (c, s, r, z float64) { + // Implementation based on Supplemental Material to: + // Edward Anderson. 2017. Algorithm 978: Safe Scaling in the Level 1 BLAS. + // ACM Trans. Math. Softw. 44, 1, Article 12 (July 2017), 28 pages. 
+ // DOI: https://doi.org/10.1145/3061665 + const ( + safmin = 0x1p-1022 + safmax = 1 / safmin + ) + anorm := math.Abs(a) + bnorm := math.Abs(b) + switch { + case bnorm == 0: + c = 1 + s = 0 + r = a + z = 0 + case anorm == 0: + c = 0 + s = 1 + r = b + z = 1 + default: + maxab := math.Max(anorm, bnorm) + scl := math.Min(math.Max(safmin, maxab), safmax) + var sigma float64 + if anorm > bnorm { + sigma = math.Copysign(1, a) + } else { + sigma = math.Copysign(1, b) + } + ascl := a / scl + bscl := b / scl + r = sigma * (scl * math.Sqrt(ascl*ascl+bscl*bscl)) + c = a / r + s = b / r + switch { + case anorm > bnorm: + z = s + case c != 0: + z = 1 / c + default: + z = 1 + } + } + return c, s, r, z +} + +// Drotmg computes the modified Givens rotation. See +// http://www.netlib.org/lapack/explore-html/df/deb/drotmg_8f.html +// for more details. +func (Implementation) Drotmg(d1, d2, x1, y1 float64) (p blas.DrotmParams, rd1, rd2, rx1 float64) { + // The implementation of Drotmg used here is taken from Hopkins 1997 + // Appendix A: https://doi.org/10.1145/289251.289253 + // with the exception of the gam constants below. + + const ( + gam = 4096.0 + gamsq = gam * gam + rgamsq = 1.0 / gamsq + ) + + if d1 < 0 { + p.Flag = blas.Rescaling // Error state. + return p, 0, 0, 0 + } + + if d2 == 0 || y1 == 0 { + p.Flag = blas.Identity + return p, d1, d2, x1 + } + + var h11, h12, h21, h22 float64 + if (d1 == 0 || x1 == 0) && d2 > 0 { + p.Flag = blas.Diagonal + h12 = 1 + h21 = -1 + x1 = y1 + d1, d2 = d2, d1 + } else { + p2 := d2 * y1 + p1 := d1 * x1 + q2 := p2 * y1 + q1 := p1 * x1 + if math.Abs(q1) > math.Abs(q2) { + p.Flag = blas.OffDiagonal + h11 = 1 + h22 = 1 + h21 = -y1 / x1 + h12 = p2 / p1 + u := 1 - float64(h12*h21) + if u <= 0 { + p.Flag = blas.Rescaling // Error state. + return p, 0, 0, 0 + } + + d1 /= u + d2 /= u + x1 *= u + } else { + if q2 < 0 { + p.Flag = blas.Rescaling // Error state. + return p, 0, 0, 0 + } + + p.Flag = blas.Diagonal + h21 = -1 + h12 = 1 + h11 = p1 / p2 + h22 = x1 / y1 + u := 1 + float64(h11*h22) + d1, d2 = d2/u, d1/u + x1 = y1 * u + } + } + + for d1 <= rgamsq && d1 != 0 { + p.Flag = blas.Rescaling + d1 = (d1 * gam) * gam + x1 /= gam + h11 /= gam + h12 /= gam + } + for d1 > gamsq { + p.Flag = blas.Rescaling + d1 = (d1 / gam) / gam + x1 *= gam + h11 *= gam + h12 *= gam + } + + for math.Abs(d2) <= rgamsq && d2 != 0 { + p.Flag = blas.Rescaling + d2 = (d2 * gam) * gam + h21 /= gam + h22 /= gam + } + for math.Abs(d2) > gamsq { + p.Flag = blas.Rescaling + d2 = (d2 / gam) / gam + h21 *= gam + h22 *= gam + } + + switch p.Flag { + case blas.Diagonal: + p.H = [4]float64{0: h11, 3: h22} + case blas.OffDiagonal: + p.H = [4]float64{1: h21, 2: h12} + case blas.Rescaling: + p.H = [4]float64{h11, h21, h12, h22} + default: + panic(badFlag) + } + + return p, d1, d2, x1 +} + +// Drot applies a plane transformation. 
+// +// x[i] = c * x[i] + s * y[i] +// y[i] = c * y[i] - s * x[i] +func (Implementation) Drot(n int, x []float64, incX int, y []float64, incY int, c float64, s float64) { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if incX == 1 && incY == 1 { + x = x[:n] + for i, vx := range x { + vy := y[i] + x[i], y[i] = c*vx+s*vy, c*vy-s*vx + } + return + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + for i := 0; i < n; i++ { + vx := x[ix] + vy := y[iy] + x[ix], y[iy] = c*vx+s*vy, c*vy-s*vx + ix += incX + iy += incY + } +} + +// Drotm applies the modified Givens rotation to the 2×n matrix. +func (Implementation) Drotm(n int, x []float64, incX int, y []float64, incY int, p blas.DrotmParams) { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n <= 0 { + if n == 0 { + return + } + panic(nLT0) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + + if p.Flag == blas.Identity { + return + } + + switch p.Flag { + case blas.Rescaling: + h11 := p.H[0] + h12 := p.H[2] + h21 := p.H[1] + h22 := p.H[3] + if incX == 1 && incY == 1 { + x = x[:n] + for i, vx := range x { + vy := y[i] + x[i], y[i] = float64(vx*h11)+float64(vy*h12), float64(vx*h21)+float64(vy*h22) + } + return + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + for i := 0; i < n; i++ { + vx := x[ix] + vy := y[iy] + x[ix], y[iy] = float64(vx*h11)+float64(vy*h12), float64(vx*h21)+float64(vy*h22) + ix += incX + iy += incY + } + case blas.OffDiagonal: + h12 := p.H[2] + h21 := p.H[1] + if incX == 1 && incY == 1 { + x = x[:n] + for i, vx := range x { + vy := y[i] + x[i], y[i] = vx+float64(vy*h12), float64(vx*h21)+vy + } + return + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + for i := 0; i < n; i++ { + vx := x[ix] + vy := y[iy] + x[ix], y[iy] = vx+float64(vy*h12), float64(vx*h21)+vy + ix += incX + iy += incY + } + case blas.Diagonal: + h11 := p.H[0] + h22 := p.H[3] + if incX == 1 && incY == 1 { + x = x[:n] + for i, vx := range x { + vy := y[i] + x[i], y[i] = float64(vx*h11)+vy, -vx+float64(vy*h22) + } + return + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + for i := 0; i < n; i++ { + vx := x[ix] + vy := y[iy] + x[ix], y[iy] = float64(vx*h11)+vy, -vx+float64(vy*h22) + ix += incX + iy += incY + } + } +} + +// Dscal scales x by alpha. +// +// x[i] *= alpha +// +// Dscal has no effect if incX < 0. 
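+// NOTE (editorial sketch; not part of the upstream gonum sources): only
+// every incX-th element is touched, and a negative incX is a documented
+// no-op rather than a reverse traversal. For example:
+//
+//	var impl gonum.Implementation
+//	x := []float64{1, 2, 3, 4}
+//	impl.Dscal(2, 10, x, 2) // x is now {10, 2, 30, 4}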
+func (Implementation) Dscal(n int, alpha float64, x []float64, incX int) { + if incX < 1 { + if incX == 0 { + panic(zeroIncX) + } + return + } + if n < 1 { + if n == 0 { + return + } + panic(nLT0) + } + if (n-1)*incX >= len(x) { + panic(shortX) + } + if alpha == 0 { + if incX == 1 { + x = x[:n] + for i := range x { + x[i] = 0 + } + return + } + for ix := 0; ix < n*incX; ix += incX { + x[ix] = 0 + } + return + } + if incX == 1 { + f64.ScalUnitary(alpha, x[:n]) + return + } + f64.ScalInc(alpha, x, uintptr(n), uintptr(incX)) +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level1float64_ddot.go b/vendor/gonum.org/v1/gonum/blas/gonum/level1float64_ddot.go new file mode 100644 index 0000000000..1569656ef2 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float64_ddot.go @@ -0,0 +1,50 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/internal/asm/f64" +) + +// Ddot computes the dot product of the two vectors +// +// \sum_i x[i]*y[i] +func (Implementation) Ddot(n int, x []float64, incX int, y []float64, incY int) float64 { + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + if n <= 0 { + if n == 0 { + return 0 + } + panic(nLT0) + } + if incX == 1 && incY == 1 { + if len(x) < n { + panic(shortX) + } + if len(y) < n { + panic(shortY) + } + return f64.DotUnitary(x[:n], y[:n]) + } + var ix, iy int + if incX < 0 { + ix = (-n + 1) * incX + } + if incY < 0 { + iy = (-n + 1) * incY + } + if ix >= len(x) || ix+(n-1)*incX >= len(x) { + panic(shortX) + } + if iy >= len(y) || iy+(n-1)*incY >= len(y) { + panic(shortY) + } + return f64.DotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy)) +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx128.go b/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx128.go new file mode 100644 index 0000000000..fa076d5fb1 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx128.go @@ -0,0 +1,2940 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math/cmplx" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/internal/asm/c128" +) + +var _ blas.Complex128Level2 = Implementation{} + +// Zgbmv performs one of the matrix-vector operations +// +// y = alpha * A * x + beta * y if trans = blas.NoTrans +// y = alpha * Aᵀ * x + beta * y if trans = blas.Trans +// y = alpha * Aᴴ * x + beta * y if trans = blas.ConjTrans +// +// where alpha and beta are scalars, x and y are vectors, and A is an m×n band matrix +// with kL sub-diagonals and kU super-diagonals. +func (Implementation) Zgbmv(trans blas.Transpose, m, n, kL, kU int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int) { + switch trans { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + if kL < 0 { + panic(kLLT0) + } + if kU < 0 { + panic(kULT0) + } + if lda < kL+kU+1 { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. 
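+	// NOTE (editorial sketch; not part of the upstream gonum sources): in
+	// this row-major band layout the in-band element A(i,j) lives at
+	// a[i*lda+j-i+kL], so row i occupies the kL+kU+1 entries starting at
+	// a[i*lda], and only the first min(m, n+kL) rows can hold band
+	// elements; the length check below requires exactly that much storage.
+	// For a 4×4 tridiagonal matrix (kL = kU = 1, lda = 3) the rows of a are
+	//
+	//	[ *  a00 a01]
+	//	[a10 a11 a12]
+	//	[a21 a22 a23]
+	//	[a32 a33  * ]
+	//
+	// where * marks padding that is never read.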
+ if len(a) < lda*(min(m, n+kL)-1)+kL+kU+1 { + panic(shortA) + } + var lenX, lenY int + if trans == blas.NoTrans { + lenX, lenY = n, m + } else { + lenX, lenY = m, n + } + if (incX > 0 && len(x) <= (lenX-1)*incX) || (incX < 0 && len(x) <= (1-lenX)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (lenY-1)*incY) || (incY < 0 && len(y) <= (1-lenY)*incY) { + panic(shortY) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + var kx int + if incX < 0 { + kx = (1 - lenX) * incX + } + var ky int + if incY < 0 { + ky = (1 - lenY) * incY + } + + // Form y = beta*y. + if beta != 1 { + if incY == 1 { + if beta == 0 { + for i := range y[:lenY] { + y[i] = 0 + } + } else { + c128.ScalUnitary(beta, y[:lenY]) + } + } else { + iy := ky + if beta == 0 { + for i := 0; i < lenY; i++ { + y[iy] = 0 + iy += incY + } + } else { + if incY > 0 { + c128.ScalInc(beta, y, uintptr(lenY), uintptr(incY)) + } else { + c128.ScalInc(beta, y, uintptr(lenY), uintptr(-incY)) + } + } + } + } + + nRow := min(m, n+kL) + nCol := kL + 1 + kU + switch trans { + case blas.NoTrans: + iy := ky + if incX == 1 { + for i := 0; i < nRow; i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + aRow := a[i*lda+l : i*lda+u] + off := max(0, i-kL) + xtmp := x[off : off+u-l] + var sum complex128 + for j, v := range aRow { + sum += xtmp[j] * v + } + y[iy] += alpha * sum + iy += incY + } + } else { + for i := 0; i < nRow; i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + aRow := a[i*lda+l : i*lda+u] + off := max(0, i-kL) * incX + jx := kx + var sum complex128 + for _, v := range aRow { + sum += x[off+jx] * v + jx += incX + } + y[iy] += alpha * sum + iy += incY + } + } + case blas.Trans: + if incX == 1 { + for i := 0; i < nRow; i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + aRow := a[i*lda+l : i*lda+u] + off := max(0, i-kL) * incY + alphaxi := alpha * x[i] + jy := ky + for _, v := range aRow { + y[off+jy] += alphaxi * v + jy += incY + } + } + } else { + ix := kx + for i := 0; i < nRow; i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + aRow := a[i*lda+l : i*lda+u] + off := max(0, i-kL) * incY + alphaxi := alpha * x[ix] + jy := ky + for _, v := range aRow { + y[off+jy] += alphaxi * v + jy += incY + } + ix += incX + } + } + case blas.ConjTrans: + if incX == 1 { + for i := 0; i < nRow; i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + aRow := a[i*lda+l : i*lda+u] + off := max(0, i-kL) * incY + alphaxi := alpha * x[i] + jy := ky + for _, v := range aRow { + y[off+jy] += alphaxi * cmplx.Conj(v) + jy += incY + } + } + } else { + ix := kx + for i := 0; i < nRow; i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + aRow := a[i*lda+l : i*lda+u] + off := max(0, i-kL) * incY + alphaxi := alpha * x[ix] + jy := ky + for _, v := range aRow { + y[off+jy] += alphaxi * cmplx.Conj(v) + jy += incY + } + ix += incX + } + } + } +} + +// Zgemv performs one of the matrix-vector operations +// +// y = alpha * A * x + beta * y if trans = blas.NoTrans +// y = alpha * Aᵀ * x + beta * y if trans = blas.Trans +// y = alpha * Aᴴ * x + beta * y if trans = blas.ConjTrans +// +// where alpha and beta are scalars, x and y are vectors, and A is an m×n dense matrix. 
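+// NOTE (editorial sketch; not part of the upstream gonum sources): a
+// minimal use, assuming this package is imported under its package name
+// gonum and gonum.org/v1/gonum/blas as blas:
+//
+//	var impl gonum.Implementation
+//	a := []complex128{1, 2, 3, 4} // 2×2 row-major: [1 2; 3 4]
+//	x := []complex128{1, 1}
+//	y := make([]complex128, 2)
+//	impl.Zgemv(blas.NoTrans, 2, 2, 1, a, 2, x, 1, 0, y, 1)
+//	// y is now {3, 7}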
+func (Implementation) Zgemv(trans blas.Transpose, m, n int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int) { + switch trans { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + var lenX, lenY int + if trans == blas.NoTrans { + lenX = n + lenY = m + } else { + lenX = m + lenY = n + } + if len(a) < lda*(m-1)+n { + panic(shortA) + } + if (incX > 0 && len(x) <= (lenX-1)*incX) || (incX < 0 && len(x) <= (1-lenX)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (lenY-1)*incY) || (incY < 0 && len(y) <= (1-lenY)*incY) { + panic(shortY) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + var kx int + if incX < 0 { + kx = (1 - lenX) * incX + } + var ky int + if incY < 0 { + ky = (1 - lenY) * incY + } + + // Form y = beta*y. + if beta != 1 { + if incY == 1 { + if beta == 0 { + for i := range y[:lenY] { + y[i] = 0 + } + } else { + c128.ScalUnitary(beta, y[:lenY]) + } + } else { + iy := ky + if beta == 0 { + for i := 0; i < lenY; i++ { + y[iy] = 0 + iy += incY + } + } else { + if incY > 0 { + c128.ScalInc(beta, y, uintptr(lenY), uintptr(incY)) + } else { + c128.ScalInc(beta, y, uintptr(lenY), uintptr(-incY)) + } + } + } + } + + if alpha == 0 { + return + } + + switch trans { + default: + // Form y = alpha*A*x + y. + iy := ky + if incX == 1 { + for i := 0; i < m; i++ { + y[iy] += alpha * c128.DotuUnitary(a[i*lda:i*lda+n], x[:n]) + iy += incY + } + return + } + for i := 0; i < m; i++ { + y[iy] += alpha * c128.DotuInc(a[i*lda:i*lda+n], x, uintptr(n), 1, uintptr(incX), 0, uintptr(kx)) + iy += incY + } + return + + case blas.Trans: + // Form y = alpha*Aᵀ*x + y. + ix := kx + if incY == 1 { + for i := 0; i < m; i++ { + c128.AxpyUnitary(alpha*x[ix], a[i*lda:i*lda+n], y[:n]) + ix += incX + } + return + } + for i := 0; i < m; i++ { + c128.AxpyInc(alpha*x[ix], a[i*lda:i*lda+n], y, uintptr(n), 1, uintptr(incY), 0, uintptr(ky)) + ix += incX + } + return + + case blas.ConjTrans: + // Form y = alpha*Aᴴ*x + y. + ix := kx + if incY == 1 { + for i := 0; i < m; i++ { + tmp := alpha * x[ix] + for j := 0; j < n; j++ { + y[j] += tmp * cmplx.Conj(a[i*lda+j]) + } + ix += incX + } + return + } + for i := 0; i < m; i++ { + tmp := alpha * x[ix] + jy := ky + for j := 0; j < n; j++ { + y[jy] += tmp * cmplx.Conj(a[i*lda+j]) + jy += incY + } + ix += incX + } + return + } +} + +// Zgerc performs the rank-one operation +// +// A += alpha * x * yᴴ +// +// where A is an m×n dense matrix, alpha is a scalar, x is an m element vector, +// and y is an n element vector. +func (Implementation) Zgerc(m, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int) { + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. 
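+	// NOTE (editorial sketch; not part of the upstream gonum sources): the
+	// stride checks below encode the BLAS length convention that a vector
+	// of n logical elements with increment inc needs at least
+	// 1+(n-1)*|inc| entries. A negative increment walks the vector
+	// backwards starting from offset (1-n)*inc, which is why the second
+	// disjunct of each check compares against (1-n)*inc.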
+ if (incX > 0 && len(x) <= (m-1)*incX) || (incX < 0 && len(x) <= (1-m)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if len(a) < lda*(m-1)+n { + panic(shortA) + } + + // Quick return if possible. + if alpha == 0 { + return + } + + var kx, jy int + if incX < 0 { + kx = (1 - m) * incX + } + if incY < 0 { + jy = (1 - n) * incY + } + for j := 0; j < n; j++ { + if y[jy] != 0 { + tmp := alpha * cmplx.Conj(y[jy]) + c128.AxpyInc(tmp, x, a[j:], uintptr(m), uintptr(incX), uintptr(lda), uintptr(kx), 0) + } + jy += incY + } +} + +// Zgeru performs the rank-one operation +// +// A += alpha * x * yᵀ +// +// where A is an m×n dense matrix, alpha is a scalar, x is an m element vector, +// and y is an n element vector. +func (Implementation) Zgeru(m, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int) { + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (m-1)*incX) || (incX < 0 && len(x) <= (1-m)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if len(a) < lda*(m-1)+n { + panic(shortA) + } + + // Quick return if possible. + if alpha == 0 { + return + } + + var kx int + if incX < 0 { + kx = (1 - m) * incX + } + if incY == 1 { + for i := 0; i < m; i++ { + if x[kx] != 0 { + tmp := alpha * x[kx] + c128.AxpyUnitary(tmp, y[:n], a[i*lda:i*lda+n]) + } + kx += incX + } + return + } + var jy int + if incY < 0 { + jy = (1 - n) * incY + } + for i := 0; i < m; i++ { + if x[kx] != 0 { + tmp := alpha * x[kx] + c128.AxpyInc(tmp, y, a[i*lda:i*lda+n], uintptr(n), uintptr(incY), 1, uintptr(jy), 0) + } + kx += incX + } +} + +// Zhbmv performs the matrix-vector operation +// +// y = alpha * A * x + beta * y +// +// where alpha and beta are scalars, x and y are vectors, and A is an n×n +// Hermitian band matrix with k super-diagonals. The imaginary parts of +// the diagonal elements of A are ignored and assumed to be zero. +func (Implementation) Zhbmv(uplo blas.Uplo, n, k int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int) { + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + if n < 0 { + panic(nLT0) + } + if k < 0 { + panic(kLT0) + } + if lda < k+1 { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+k+1 { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + // Set up the start indices in X and Y. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + var ky int + if incY < 0 { + ky = (1 - n) * incY + } + + // Form y = beta*y. 
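+	// NOTE (editorial sketch; not part of the upstream gonum sources): the
+	// beta pass is kept separate from the alpha accumulation so that
+	// beta == 0 overwrites y without ever reading it (y need not hold
+	// finite values on entry, per BLAS convention), while beta == 1 skips
+	// the pass entirely.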
+ if beta != 1 { + if incY == 1 { + if beta == 0 { + for i := range y[:n] { + y[i] = 0 + } + } else { + for i, v := range y[:n] { + y[i] = beta * v + } + } + } else { + iy := ky + if beta == 0 { + for i := 0; i < n; i++ { + y[iy] = 0 + iy += incY + } + } else { + for i := 0; i < n; i++ { + y[iy] = beta * y[iy] + iy += incY + } + } + } + } + + if alpha == 0 { + return + } + + // The elements of A are accessed sequentially with one pass through a. + switch uplo { + case blas.Upper: + iy := ky + if incX == 1 { + for i := 0; i < n; i++ { + aRow := a[i*lda:] + alphaxi := alpha * x[i] + sum := alphaxi * complex(real(aRow[0]), 0) + u := min(k+1, n-i) + jy := incY + for j := 1; j < u; j++ { + v := aRow[j] + sum += alpha * x[i+j] * v + y[iy+jy] += alphaxi * cmplx.Conj(v) + jy += incY + } + y[iy] += sum + iy += incY + } + } else { + ix := kx + for i := 0; i < n; i++ { + aRow := a[i*lda:] + alphaxi := alpha * x[ix] + sum := alphaxi * complex(real(aRow[0]), 0) + u := min(k+1, n-i) + jx := incX + jy := incY + for j := 1; j < u; j++ { + v := aRow[j] + sum += alpha * x[ix+jx] * v + y[iy+jy] += alphaxi * cmplx.Conj(v) + jx += incX + jy += incY + } + y[iy] += sum + ix += incX + iy += incY + } + } + case blas.Lower: + iy := ky + if incX == 1 { + for i := 0; i < n; i++ { + l := max(0, k-i) + alphaxi := alpha * x[i] + jy := l * incY + aRow := a[i*lda:] + for j := l; j < k; j++ { + v := aRow[j] + y[iy] += alpha * v * x[i-k+j] + y[iy-k*incY+jy] += alphaxi * cmplx.Conj(v) + jy += incY + } + y[iy] += alphaxi * complex(real(aRow[k]), 0) + iy += incY + } + } else { + ix := kx + for i := 0; i < n; i++ { + l := max(0, k-i) + alphaxi := alpha * x[ix] + jx := l * incX + jy := l * incY + aRow := a[i*lda:] + for j := l; j < k; j++ { + v := aRow[j] + y[iy] += alpha * v * x[ix-k*incX+jx] + y[iy-k*incY+jy] += alphaxi * cmplx.Conj(v) + jx += incX + jy += incY + } + y[iy] += alphaxi * complex(real(aRow[k]), 0) + ix += incX + iy += incY + } + } + } +} + +// Zhemv performs the matrix-vector operation +// +// y = alpha * A * x + beta * y +// +// where alpha and beta are scalars, x and y are vectors, and A is an n×n +// Hermitian matrix. The imaginary parts of the diagonal elements of A are +// ignored and assumed to be zero. +func (Implementation) Zhemv(uplo blas.Uplo, n int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int) { + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+n { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + // Set up the start indices in X and Y. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + var ky int + if incY < 0 { + ky = (1 - n) * incY + } + + // Form y = beta*y. 
+ if beta != 1 { + if incY == 1 { + if beta == 0 { + for i := range y[:n] { + y[i] = 0 + } + } else { + for i, v := range y[:n] { + y[i] = beta * v + } + } + } else { + iy := ky + if beta == 0 { + for i := 0; i < n; i++ { + y[iy] = 0 + iy += incY + } + } else { + for i := 0; i < n; i++ { + y[iy] = beta * y[iy] + iy += incY + } + } + } + } + + if alpha == 0 { + return + } + + // The elements of A are accessed sequentially with one pass through + // the triangular part of A. + + if uplo == blas.Upper { + // Form y when A is stored in upper triangle. + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + tmp1 := alpha * x[i] + var tmp2 complex128 + for j := i + 1; j < n; j++ { + y[j] += tmp1 * cmplx.Conj(a[i*lda+j]) + tmp2 += a[i*lda+j] * x[j] + } + aii := complex(real(a[i*lda+i]), 0) + y[i] += tmp1*aii + alpha*tmp2 + } + } else { + ix := kx + iy := ky + for i := 0; i < n; i++ { + tmp1 := alpha * x[ix] + var tmp2 complex128 + jx := ix + jy := iy + for j := i + 1; j < n; j++ { + jx += incX + jy += incY + y[jy] += tmp1 * cmplx.Conj(a[i*lda+j]) + tmp2 += a[i*lda+j] * x[jx] + } + aii := complex(real(a[i*lda+i]), 0) + y[iy] += tmp1*aii + alpha*tmp2 + ix += incX + iy += incY + } + } + return + } + + // Form y when A is stored in lower triangle. + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + tmp1 := alpha * x[i] + var tmp2 complex128 + for j := 0; j < i; j++ { + y[j] += tmp1 * cmplx.Conj(a[i*lda+j]) + tmp2 += a[i*lda+j] * x[j] + } + aii := complex(real(a[i*lda+i]), 0) + y[i] += tmp1*aii + alpha*tmp2 + } + } else { + ix := kx + iy := ky + for i := 0; i < n; i++ { + tmp1 := alpha * x[ix] + var tmp2 complex128 + jx := kx + jy := ky + for j := 0; j < i; j++ { + y[jy] += tmp1 * cmplx.Conj(a[i*lda+j]) + tmp2 += a[i*lda+j] * x[jx] + jx += incX + jy += incY + } + aii := complex(real(a[i*lda+i]), 0) + y[iy] += tmp1*aii + alpha*tmp2 + ix += incX + iy += incY + } + } +} + +// Zher performs the Hermitian rank-one operation +// +// A += alpha * x * xᴴ +// +// where A is an n×n Hermitian matrix, alpha is a real scalar, and x is an n +// element vector. On entry, the imaginary parts of the diagonal elements of A +// are ignored and assumed to be zero, on return they will be set to zero. +func (Implementation) Zher(uplo blas.Uplo, n int, alpha float64, x []complex128, incX int, a []complex128, lda int) { + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if len(a) < lda*(n-1)+n { + panic(shortA) + } + + // Quick return if possible. 
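+	// NOTE (editorial sketch; not part of the upstream gonum sources):
+	// returning here for alpha == 0 leaves A completely untouched, so the
+	// imaginary parts of its diagonal are only forced to zero on the
+	// update paths further down.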
+ if alpha == 0 { + return + } + + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + if uplo == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + if x[i] != 0 { + tmp := complex(alpha*real(x[i]), alpha*imag(x[i])) + aii := real(a[i*lda+i]) + xtmp := real(tmp * cmplx.Conj(x[i])) + a[i*lda+i] = complex(aii+xtmp, 0) + for j := i + 1; j < n; j++ { + a[i*lda+j] += tmp * cmplx.Conj(x[j]) + } + } else { + aii := real(a[i*lda+i]) + a[i*lda+i] = complex(aii, 0) + } + } + return + } + + ix := kx + for i := 0; i < n; i++ { + if x[ix] != 0 { + tmp := complex(alpha*real(x[ix]), alpha*imag(x[ix])) + aii := real(a[i*lda+i]) + xtmp := real(tmp * cmplx.Conj(x[ix])) + a[i*lda+i] = complex(aii+xtmp, 0) + jx := ix + incX + for j := i + 1; j < n; j++ { + a[i*lda+j] += tmp * cmplx.Conj(x[jx]) + jx += incX + } + } else { + aii := real(a[i*lda+i]) + a[i*lda+i] = complex(aii, 0) + } + ix += incX + } + return + } + + if incX == 1 { + for i := 0; i < n; i++ { + if x[i] != 0 { + tmp := complex(alpha*real(x[i]), alpha*imag(x[i])) + for j := 0; j < i; j++ { + a[i*lda+j] += tmp * cmplx.Conj(x[j]) + } + aii := real(a[i*lda+i]) + xtmp := real(tmp * cmplx.Conj(x[i])) + a[i*lda+i] = complex(aii+xtmp, 0) + } else { + aii := real(a[i*lda+i]) + a[i*lda+i] = complex(aii, 0) + } + } + return + } + + ix := kx + for i := 0; i < n; i++ { + if x[ix] != 0 { + tmp := complex(alpha*real(x[ix]), alpha*imag(x[ix])) + jx := kx + for j := 0; j < i; j++ { + a[i*lda+j] += tmp * cmplx.Conj(x[jx]) + jx += incX + } + aii := real(a[i*lda+i]) + xtmp := real(tmp * cmplx.Conj(x[ix])) + a[i*lda+i] = complex(aii+xtmp, 0) + + } else { + aii := real(a[i*lda+i]) + a[i*lda+i] = complex(aii, 0) + } + ix += incX + } +} + +// Zher2 performs the Hermitian rank-two operation +// +// A += alpha * x * yᴴ + conj(alpha) * y * xᴴ +// +// where alpha is a scalar, x and y are n element vectors and A is an n×n +// Hermitian matrix. On entry, the imaginary parts of the diagonal elements are +// ignored and assumed to be zero. On return they will be set to zero. +func (Implementation) Zher2(uplo blas.Uplo, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int) { + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if len(a) < lda*(n-1)+n { + panic(shortA) + } + + // Quick return if possible. 
+ if alpha == 0 { + return + } + + var kx, ky int + var ix, iy int + if incX != 1 || incY != 1 { + if incX < 0 { + kx = (1 - n) * incX + } + if incY < 0 { + ky = (1 - n) * incY + } + ix = kx + iy = ky + } + if uplo == blas.Upper { + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + if x[i] != 0 || y[i] != 0 { + tmp1 := alpha * x[i] + tmp2 := cmplx.Conj(alpha) * y[i] + aii := real(a[i*lda+i]) + real(tmp1*cmplx.Conj(y[i])) + real(tmp2*cmplx.Conj(x[i])) + a[i*lda+i] = complex(aii, 0) + for j := i + 1; j < n; j++ { + a[i*lda+j] += tmp1*cmplx.Conj(y[j]) + tmp2*cmplx.Conj(x[j]) + } + } else { + aii := real(a[i*lda+i]) + a[i*lda+i] = complex(aii, 0) + } + } + return + } + for i := 0; i < n; i++ { + if x[ix] != 0 || y[iy] != 0 { + tmp1 := alpha * x[ix] + tmp2 := cmplx.Conj(alpha) * y[iy] + aii := real(a[i*lda+i]) + real(tmp1*cmplx.Conj(y[iy])) + real(tmp2*cmplx.Conj(x[ix])) + a[i*lda+i] = complex(aii, 0) + jx := ix + incX + jy := iy + incY + for j := i + 1; j < n; j++ { + a[i*lda+j] += tmp1*cmplx.Conj(y[jy]) + tmp2*cmplx.Conj(x[jx]) + jx += incX + jy += incY + } + } else { + aii := real(a[i*lda+i]) + a[i*lda+i] = complex(aii, 0) + } + ix += incX + iy += incY + } + return + } + + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + if x[i] != 0 || y[i] != 0 { + tmp1 := alpha * x[i] + tmp2 := cmplx.Conj(alpha) * y[i] + for j := 0; j < i; j++ { + a[i*lda+j] += tmp1*cmplx.Conj(y[j]) + tmp2*cmplx.Conj(x[j]) + } + aii := real(a[i*lda+i]) + real(tmp1*cmplx.Conj(y[i])) + real(tmp2*cmplx.Conj(x[i])) + a[i*lda+i] = complex(aii, 0) + } else { + aii := real(a[i*lda+i]) + a[i*lda+i] = complex(aii, 0) + } + } + return + } + for i := 0; i < n; i++ { + if x[ix] != 0 || y[iy] != 0 { + tmp1 := alpha * x[ix] + tmp2 := cmplx.Conj(alpha) * y[iy] + jx := kx + jy := ky + for j := 0; j < i; j++ { + a[i*lda+j] += tmp1*cmplx.Conj(y[jy]) + tmp2*cmplx.Conj(x[jx]) + jx += incX + jy += incY + } + aii := real(a[i*lda+i]) + real(tmp1*cmplx.Conj(y[iy])) + real(tmp2*cmplx.Conj(x[ix])) + a[i*lda+i] = complex(aii, 0) + } else { + aii := real(a[i*lda+i]) + a[i*lda+i] = complex(aii, 0) + } + ix += incX + iy += incY + } +} + +// Zhpmv performs the matrix-vector operation +// +// y = alpha * A * x + beta * y +// +// where alpha and beta are scalars, x and y are vectors, and A is an n×n +// Hermitian matrix in packed form. The imaginary parts of the diagonal +// elements of A are ignored and assumed to be zero. +func (Implementation) Zhpmv(uplo blas.Uplo, n int, alpha complex128, ap []complex128, x []complex128, incX int, beta complex128, y []complex128, incY int) { + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + // Set up the start indices in X and Y. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + var ky int + if incY < 0 { + ky = (1 - n) * incY + } + + // Form y = beta*y. 
+ if beta != 1 { + if incY == 1 { + if beta == 0 { + for i := range y[:n] { + y[i] = 0 + } + } else { + for i, v := range y[:n] { + y[i] = beta * v + } + } + } else { + iy := ky + if beta == 0 { + for i := 0; i < n; i++ { + y[iy] = 0 + iy += incY + } + } else { + for i := 0; i < n; i++ { + y[iy] *= beta + iy += incY + } + } + } + } + + if alpha == 0 { + return + } + + // The elements of A are accessed sequentially with one pass through ap. + + var kk int + if uplo == blas.Upper { + // Form y when ap contains the upper triangle. + // Here, kk points to the current diagonal element in ap. + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + tmp1 := alpha * x[i] + y[i] += tmp1 * complex(real(ap[kk]), 0) + var tmp2 complex128 + k := kk + 1 + for j := i + 1; j < n; j++ { + y[j] += tmp1 * cmplx.Conj(ap[k]) + tmp2 += ap[k] * x[j] + k++ + } + y[i] += alpha * tmp2 + kk += n - i + } + } else { + ix := kx + iy := ky + for i := 0; i < n; i++ { + tmp1 := alpha * x[ix] + y[iy] += tmp1 * complex(real(ap[kk]), 0) + var tmp2 complex128 + jx := ix + jy := iy + for k := kk + 1; k < kk+n-i; k++ { + jx += incX + jy += incY + y[jy] += tmp1 * cmplx.Conj(ap[k]) + tmp2 += ap[k] * x[jx] + } + y[iy] += alpha * tmp2 + ix += incX + iy += incY + kk += n - i + } + } + return + } + + // Form y when ap contains the lower triangle. + // Here, kk points to the beginning of current row in ap. + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + tmp1 := alpha * x[i] + var tmp2 complex128 + k := kk + for j := 0; j < i; j++ { + y[j] += tmp1 * cmplx.Conj(ap[k]) + tmp2 += ap[k] * x[j] + k++ + } + aii := complex(real(ap[kk+i]), 0) + y[i] += tmp1*aii + alpha*tmp2 + kk += i + 1 + } + } else { + ix := kx + iy := ky + for i := 0; i < n; i++ { + tmp1 := alpha * x[ix] + var tmp2 complex128 + jx := kx + jy := ky + for k := kk; k < kk+i; k++ { + y[jy] += tmp1 * cmplx.Conj(ap[k]) + tmp2 += ap[k] * x[jx] + jx += incX + jy += incY + } + aii := complex(real(ap[kk+i]), 0) + y[iy] += tmp1*aii + alpha*tmp2 + ix += incX + iy += incY + kk += i + 1 + } + } +} + +// Zhpr performs the Hermitian rank-1 operation +// +// A += alpha * x * xᴴ +// +// where alpha is a real scalar, x is a vector, and A is an n×n hermitian matrix +// in packed form. On entry, the imaginary parts of the diagonal elements are +// assumed to be zero, and on return they are set to zero. +func (Implementation) Zhpr(uplo blas.Uplo, n int, alpha float64, x []complex128, incX int, ap []complex128) { + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + + // Quick return if possible. + if alpha == 0 { + return + } + + // Set up start index in X. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + + // The elements of A are accessed sequentially with one pass through ap. + + var kk int + if uplo == blas.Upper { + // Form A when upper triangle is stored in AP. + // Here, kk points to the current diagonal element in ap. 
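+	// NOTE (editorial sketch; not part of the upstream gonum sources):
+	// with the upper triangle packed row-major, a 3×3 matrix is stored as
+	//
+	//	ap = [a00 a01 a02 a11 a12 a22]
+	//
+	// so row i holds n-i entries and the kk += n - i steps below move the
+	// pointer from one diagonal element to the next.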
+ if incX == 1 { + for i := 0; i < n; i++ { + xi := x[i] + if xi != 0 { + aii := real(ap[kk]) + alpha*real(cmplx.Conj(xi)*xi) + ap[kk] = complex(aii, 0) + + tmp := complex(alpha, 0) * xi + a := ap[kk+1 : kk+n-i] + x := x[i+1 : n] + for j, v := range x { + a[j] += tmp * cmplx.Conj(v) + } + } else { + ap[kk] = complex(real(ap[kk]), 0) + } + kk += n - i + } + } else { + ix := kx + for i := 0; i < n; i++ { + xi := x[ix] + if xi != 0 { + aii := real(ap[kk]) + alpha*real(cmplx.Conj(xi)*xi) + ap[kk] = complex(aii, 0) + + tmp := complex(alpha, 0) * xi + jx := ix + incX + a := ap[kk+1 : kk+n-i] + for k := range a { + a[k] += tmp * cmplx.Conj(x[jx]) + jx += incX + } + } else { + ap[kk] = complex(real(ap[kk]), 0) + } + ix += incX + kk += n - i + } + } + return + } + + // Form A when lower triangle is stored in AP. + // Here, kk points to the beginning of current row in ap. + if incX == 1 { + for i := 0; i < n; i++ { + xi := x[i] + if xi != 0 { + tmp := complex(alpha, 0) * xi + a := ap[kk : kk+i] + for j, v := range x[:i] { + a[j] += tmp * cmplx.Conj(v) + } + + aii := real(ap[kk+i]) + alpha*real(cmplx.Conj(xi)*xi) + ap[kk+i] = complex(aii, 0) + } else { + ap[kk+i] = complex(real(ap[kk+i]), 0) + } + kk += i + 1 + } + } else { + ix := kx + for i := 0; i < n; i++ { + xi := x[ix] + if xi != 0 { + tmp := complex(alpha, 0) * xi + a := ap[kk : kk+i] + jx := kx + for k := range a { + a[k] += tmp * cmplx.Conj(x[jx]) + jx += incX + } + + aii := real(ap[kk+i]) + alpha*real(cmplx.Conj(xi)*xi) + ap[kk+i] = complex(aii, 0) + } else { + ap[kk+i] = complex(real(ap[kk+i]), 0) + } + ix += incX + kk += i + 1 + } + } +} + +// Zhpr2 performs the Hermitian rank-2 operation +// +// A += alpha * x * yᴴ + conj(alpha) * y * xᴴ +// +// where alpha is a complex scalar, x and y are n element vectors, and A is an +// n×n Hermitian matrix, supplied in packed form. On entry, the imaginary parts +// of the diagonal elements are assumed to be zero, and on return they are set to zero. +func (Implementation) Zhpr2(uplo blas.Uplo, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, ap []complex128) { + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + + // Quick return if possible. + if alpha == 0 { + return + } + + // Set up start indices in X and Y. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + var ky int + if incY < 0 { + ky = (1 - n) * incY + } + + // The elements of A are accessed sequentially with one pass through ap. + + var kk int + if uplo == blas.Upper { + // Form A when upper triangle is stored in AP. + // Here, kk points to the current diagonal element in ap. 
+ if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + if x[i] != 0 || y[i] != 0 { + tmp1 := alpha * x[i] + tmp2 := cmplx.Conj(alpha) * y[i] + aii := real(ap[kk]) + real(tmp1*cmplx.Conj(y[i])) + real(tmp2*cmplx.Conj(x[i])) + ap[kk] = complex(aii, 0) + k := kk + 1 + for j := i + 1; j < n; j++ { + ap[k] += tmp1*cmplx.Conj(y[j]) + tmp2*cmplx.Conj(x[j]) + k++ + } + } else { + ap[kk] = complex(real(ap[kk]), 0) + } + kk += n - i + } + } else { + ix := kx + iy := ky + for i := 0; i < n; i++ { + if x[ix] != 0 || y[iy] != 0 { + tmp1 := alpha * x[ix] + tmp2 := cmplx.Conj(alpha) * y[iy] + aii := real(ap[kk]) + real(tmp1*cmplx.Conj(y[iy])) + real(tmp2*cmplx.Conj(x[ix])) + ap[kk] = complex(aii, 0) + jx := ix + incX + jy := iy + incY + for k := kk + 1; k < kk+n-i; k++ { + ap[k] += tmp1*cmplx.Conj(y[jy]) + tmp2*cmplx.Conj(x[jx]) + jx += incX + jy += incY + } + } else { + ap[kk] = complex(real(ap[kk]), 0) + } + ix += incX + iy += incY + kk += n - i + } + } + return + } + + // Form A when lower triangle is stored in AP. + // Here, kk points to the beginning of current row in ap. + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + if x[i] != 0 || y[i] != 0 { + tmp1 := alpha * x[i] + tmp2 := cmplx.Conj(alpha) * y[i] + k := kk + for j := 0; j < i; j++ { + ap[k] += tmp1*cmplx.Conj(y[j]) + tmp2*cmplx.Conj(x[j]) + k++ + } + aii := real(ap[kk+i]) + real(tmp1*cmplx.Conj(y[i])) + real(tmp2*cmplx.Conj(x[i])) + ap[kk+i] = complex(aii, 0) + } else { + ap[kk+i] = complex(real(ap[kk+i]), 0) + } + kk += i + 1 + } + } else { + ix := kx + iy := ky + for i := 0; i < n; i++ { + if x[ix] != 0 || y[iy] != 0 { + tmp1 := alpha * x[ix] + tmp2 := cmplx.Conj(alpha) * y[iy] + jx := kx + jy := ky + for k := kk; k < kk+i; k++ { + ap[k] += tmp1*cmplx.Conj(y[jy]) + tmp2*cmplx.Conj(x[jx]) + jx += incX + jy += incY + } + aii := real(ap[kk+i]) + real(tmp1*cmplx.Conj(y[iy])) + real(tmp2*cmplx.Conj(x[ix])) + ap[kk+i] = complex(aii, 0) + } else { + ap[kk+i] = complex(real(ap[kk+i]), 0) + } + ix += incX + iy += incY + kk += i + 1 + } + } +} + +// Ztbmv performs one of the matrix-vector operations +// +// x = A * x if trans = blas.NoTrans +// x = Aᵀ * x if trans = blas.Trans +// x = Aᴴ * x if trans = blas.ConjTrans +// +// where x is an n element vector and A is an n×n triangular band matrix, with +// (k+1) diagonals. +func (Implementation) Ztbmv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n, k int, a []complex128, lda int, x []complex128, incX int) { + switch trans { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + switch diag { + default: + panic(badDiag) + case blas.NonUnit, blas.Unit: + } + if n < 0 { + panic(nLT0) + } + if k < 0 { + panic(kLT0) + } + if lda < k+1 { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+k+1 { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + // Set up start index in X. 
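+	// NOTE (editorial sketch; not part of the upstream gonum sources): for
+	// a negative increment the BLAS convention is to traverse x backwards
+	// from its far end, so the first logical element sits at offset
+	// (1-n)*incX, a non-negative index because incX < 0; that is what kx
+	// computes below.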
+ var kx int + if incX < 0 { + kx = (1 - n) * incX + } + + switch trans { + case blas.NoTrans: + if uplo == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + xi := x[i] + if diag == blas.NonUnit { + xi *= a[i*lda] + } + kk := min(k, n-i-1) + for j, aij := range a[i*lda+1 : i*lda+kk+1] { + xi += x[i+j+1] * aij + } + x[i] = xi + } + } else { + ix := kx + for i := 0; i < n; i++ { + xi := x[ix] + if diag == blas.NonUnit { + xi *= a[i*lda] + } + kk := min(k, n-i-1) + jx := ix + incX + for _, aij := range a[i*lda+1 : i*lda+kk+1] { + xi += x[jx] * aij + jx += incX + } + x[ix] = xi + ix += incX + } + } + } else { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + xi := x[i] + if diag == blas.NonUnit { + xi *= a[i*lda+k] + } + kk := min(k, i) + for j, aij := range a[i*lda+k-kk : i*lda+k] { + xi += x[i-kk+j] * aij + } + x[i] = xi + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + xi := x[ix] + if diag == blas.NonUnit { + xi *= a[i*lda+k] + } + kk := min(k, i) + jx := ix - kk*incX + for _, aij := range a[i*lda+k-kk : i*lda+k] { + xi += x[jx] * aij + jx += incX + } + x[ix] = xi + ix -= incX + } + } + } + case blas.Trans: + if uplo == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + kk := min(k, n-i-1) + xi := x[i] + for j, aij := range a[i*lda+1 : i*lda+kk+1] { + x[i+j+1] += xi * aij + } + if diag == blas.NonUnit { + x[i] *= a[i*lda] + } + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + kk := min(k, n-i-1) + jx := ix + incX + xi := x[ix] + for _, aij := range a[i*lda+1 : i*lda+kk+1] { + x[jx] += xi * aij + jx += incX + } + if diag == blas.NonUnit { + x[ix] *= a[i*lda] + } + ix -= incX + } + } + } else { + if incX == 1 { + for i := 0; i < n; i++ { + kk := min(k, i) + xi := x[i] + for j, aij := range a[i*lda+k-kk : i*lda+k] { + x[i-kk+j] += xi * aij + } + if diag == blas.NonUnit { + x[i] *= a[i*lda+k] + } + } + } else { + ix := kx + for i := 0; i < n; i++ { + kk := min(k, i) + jx := ix - kk*incX + xi := x[ix] + for _, aij := range a[i*lda+k-kk : i*lda+k] { + x[jx] += xi * aij + jx += incX + } + if diag == blas.NonUnit { + x[ix] *= a[i*lda+k] + } + ix += incX + } + } + } + case blas.ConjTrans: + if uplo == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + kk := min(k, n-i-1) + xi := x[i] + for j, aij := range a[i*lda+1 : i*lda+kk+1] { + x[i+j+1] += xi * cmplx.Conj(aij) + } + if diag == blas.NonUnit { + x[i] *= cmplx.Conj(a[i*lda]) + } + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + kk := min(k, n-i-1) + jx := ix + incX + xi := x[ix] + for _, aij := range a[i*lda+1 : i*lda+kk+1] { + x[jx] += xi * cmplx.Conj(aij) + jx += incX + } + if diag == blas.NonUnit { + x[ix] *= cmplx.Conj(a[i*lda]) + } + ix -= incX + } + } + } else { + if incX == 1 { + for i := 0; i < n; i++ { + kk := min(k, i) + xi := x[i] + for j, aij := range a[i*lda+k-kk : i*lda+k] { + x[i-kk+j] += xi * cmplx.Conj(aij) + } + if diag == blas.NonUnit { + x[i] *= cmplx.Conj(a[i*lda+k]) + } + } + } else { + ix := kx + for i := 0; i < n; i++ { + kk := min(k, i) + jx := ix - kk*incX + xi := x[ix] + for _, aij := range a[i*lda+k-kk : i*lda+k] { + x[jx] += xi * cmplx.Conj(aij) + jx += incX + } + if diag == blas.NonUnit { + x[ix] *= cmplx.Conj(a[i*lda+k]) + } + ix += incX + } + } + } + } +} + +// Ztbsv solves one of the systems of equations +// +// A * x = b if trans == blas.NoTrans +// Aᵀ * x = b if trans == blas.Trans +// Aᴴ * x = b if trans == blas.ConjTrans +// +// where b and x are n element vectors and A is an n×n triangular band matrix +// 
with (k+1) diagonals. +// +// On entry, x contains the values of b, and the solution is +// stored in-place into x. +// +// No test for singularity or near-singularity is included in this +// routine. Such tests must be performed before calling this routine. +func (Implementation) Ztbsv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n, k int, a []complex128, lda int, x []complex128, incX int) { + switch trans { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + switch diag { + default: + panic(badDiag) + case blas.NonUnit, blas.Unit: + } + if n < 0 { + panic(nLT0) + } + if k < 0 { + panic(kLT0) + } + if lda < k+1 { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+k+1 { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + // Set up start index in X. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + + switch trans { + case blas.NoTrans: + if uplo == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + kk := min(k, n-i-1) + var sum complex128 + for j, aij := range a[i*lda+1 : i*lda+kk+1] { + sum += x[i+1+j] * aij + } + x[i] -= sum + if diag == blas.NonUnit { + x[i] /= a[i*lda] + } + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + kk := min(k, n-i-1) + var sum complex128 + jx := ix + incX + for _, aij := range a[i*lda+1 : i*lda+kk+1] { + sum += x[jx] * aij + jx += incX + } + x[ix] -= sum + if diag == blas.NonUnit { + x[ix] /= a[i*lda] + } + ix -= incX + } + } + } else { + if incX == 1 { + for i := 0; i < n; i++ { + kk := min(k, i) + var sum complex128 + for j, aij := range a[i*lda+k-kk : i*lda+k] { + sum += x[i-kk+j] * aij + } + x[i] -= sum + if diag == blas.NonUnit { + x[i] /= a[i*lda+k] + } + } + } else { + ix := kx + for i := 0; i < n; i++ { + kk := min(k, i) + var sum complex128 + jx := ix - kk*incX + for _, aij := range a[i*lda+k-kk : i*lda+k] { + sum += x[jx] * aij + jx += incX + } + x[ix] -= sum + if diag == blas.NonUnit { + x[ix] /= a[i*lda+k] + } + ix += incX + } + } + } + case blas.Trans: + if uplo == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + if diag == blas.NonUnit { + x[i] /= a[i*lda] + } + kk := min(k, n-i-1) + xi := x[i] + for j, aij := range a[i*lda+1 : i*lda+kk+1] { + x[i+1+j] -= xi * aij + } + } + } else { + ix := kx + for i := 0; i < n; i++ { + if diag == blas.NonUnit { + x[ix] /= a[i*lda] + } + kk := min(k, n-i-1) + xi := x[ix] + jx := ix + incX + for _, aij := range a[i*lda+1 : i*lda+kk+1] { + x[jx] -= xi * aij + jx += incX + } + ix += incX + } + } + } else { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + if diag == blas.NonUnit { + x[i] /= a[i*lda+k] + } + kk := min(k, i) + xi := x[i] + for j, aij := range a[i*lda+k-kk : i*lda+k] { + x[i-kk+j] -= xi * aij + } + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + if diag == blas.NonUnit { + x[ix] /= a[i*lda+k] + } + kk := min(k, i) + xi := x[ix] + jx := ix - kk*incX + for _, aij := range a[i*lda+k-kk : i*lda+k] { + x[jx] -= xi * aij + jx += incX + } + ix -= incX + } + } + } + case blas.ConjTrans: + if uplo == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + if diag == blas.NonUnit { + x[i] /= cmplx.Conj(a[i*lda]) + } + kk := min(k, n-i-1) + xi := x[i] + for j, aij := 
range a[i*lda+1 : i*lda+kk+1] { + x[i+1+j] -= xi * cmplx.Conj(aij) + } + } + } else { + ix := kx + for i := 0; i < n; i++ { + if diag == blas.NonUnit { + x[ix] /= cmplx.Conj(a[i*lda]) + } + kk := min(k, n-i-1) + xi := x[ix] + jx := ix + incX + for _, aij := range a[i*lda+1 : i*lda+kk+1] { + x[jx] -= xi * cmplx.Conj(aij) + jx += incX + } + ix += incX + } + } + } else { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + if diag == blas.NonUnit { + x[i] /= cmplx.Conj(a[i*lda+k]) + } + kk := min(k, i) + xi := x[i] + for j, aij := range a[i*lda+k-kk : i*lda+k] { + x[i-kk+j] -= xi * cmplx.Conj(aij) + } + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + if diag == blas.NonUnit { + x[ix] /= cmplx.Conj(a[i*lda+k]) + } + kk := min(k, i) + xi := x[ix] + jx := ix - kk*incX + for _, aij := range a[i*lda+k-kk : i*lda+k] { + x[jx] -= xi * cmplx.Conj(aij) + jx += incX + } + ix -= incX + } + } + } + } +} + +// Ztpmv performs one of the matrix-vector operations +// +// x = A * x if trans = blas.NoTrans +// x = Aᵀ * x if trans = blas.Trans +// x = Aᴴ * x if trans = blas.ConjTrans +// +// where x is an n element vector and A is an n×n triangular matrix, supplied in +// packed form. +func (Implementation) Ztpmv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n int, ap []complex128, x []complex128, incX int) { + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + switch trans { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + switch diag { + default: + panic(badDiag) + case blas.NonUnit, blas.Unit: + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + // Set up start index in X. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + + // The elements of A are accessed sequentially with one pass through A. + + if trans == blas.NoTrans { + // Form x = A*x. + if uplo == blas.Upper { + // kk points to the current diagonal element in ap. + kk := 0 + if incX == 1 { + x = x[:n] + for i := range x { + if diag == blas.NonUnit { + x[i] *= ap[kk] + } + if n-i-1 > 0 { + x[i] += c128.DotuUnitary(ap[kk+1:kk+n-i], x[i+1:]) + } + kk += n - i + } + } else { + ix := kx + for i := 0; i < n; i++ { + if diag == blas.NonUnit { + x[ix] *= ap[kk] + } + if n-i-1 > 0 { + x[ix] += c128.DotuInc(ap[kk+1:kk+n-i], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(ix+incX)) + } + ix += incX + kk += n - i + } + } + } else { + // kk points to the beginning of current row in ap. + kk := n*(n+1)/2 - n + if incX == 1 { + for i := n - 1; i >= 0; i-- { + if diag == blas.NonUnit { + x[i] *= ap[kk+i] + } + if i > 0 { + x[i] += c128.DotuUnitary(ap[kk:kk+i], x[:i]) + } + kk -= i + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + if diag == blas.NonUnit { + x[ix] *= ap[kk+i] + } + if i > 0 { + x[ix] += c128.DotuInc(ap[kk:kk+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx)) + } + ix -= incX + kk -= i + } + } + } + return + } + + if trans == blas.Trans { + // Form x = Aᵀ*x. + if uplo == blas.Upper { + // kk points to the current diagonal element in ap. 
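+ // The transposed upper-triangle sweep visits rows bottom-up, so kk starts + // at the last diagonal element of ap, at index n*(n+1)/2 - 1.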
+ kk := n*(n+1)/2 - 1 + if incX == 1 { + for i := n - 1; i >= 0; i-- { + xi := x[i] + if diag == blas.NonUnit { + x[i] *= ap[kk] + } + if n-i-1 > 0 { + c128.AxpyUnitary(xi, ap[kk+1:kk+n-i], x[i+1:n]) + } + kk -= n - i + 1 + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + xi := x[ix] + if diag == blas.NonUnit { + x[ix] *= ap[kk] + } + if n-i-1 > 0 { + c128.AxpyInc(xi, ap[kk+1:kk+n-i], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(ix+incX)) + } + ix -= incX + kk -= n - i + 1 + } + } + } else { + // kk points to the beginning of current row in ap. + kk := 0 + if incX == 1 { + x = x[:n] + for i := range x { + if i > 0 { + c128.AxpyUnitary(x[i], ap[kk:kk+i], x[:i]) + } + if diag == blas.NonUnit { + x[i] *= ap[kk+i] + } + kk += i + 1 + } + } else { + ix := kx + for i := 0; i < n; i++ { + if i > 0 { + c128.AxpyInc(x[ix], ap[kk:kk+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx)) + } + if diag == blas.NonUnit { + x[ix] *= ap[kk+i] + } + ix += incX + kk += i + 1 + } + } + } + return + } + + // Form x = Aᴴ*x. + if uplo == blas.Upper { + // kk points to the current diagonal element in ap. + kk := n*(n+1)/2 - 1 + if incX == 1 { + for i := n - 1; i >= 0; i-- { + xi := x[i] + if diag == blas.NonUnit { + x[i] *= cmplx.Conj(ap[kk]) + } + k := kk + 1 + for j := i + 1; j < n; j++ { + x[j] += xi * cmplx.Conj(ap[k]) + k++ + } + kk -= n - i + 1 + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + xi := x[ix] + if diag == blas.NonUnit { + x[ix] *= cmplx.Conj(ap[kk]) + } + jx := ix + incX + k := kk + 1 + for j := i + 1; j < n; j++ { + x[jx] += xi * cmplx.Conj(ap[k]) + jx += incX + k++ + } + ix -= incX + kk -= n - i + 1 + } + } + } else { + // kk points to the beginning of current row in ap. + kk := 0 + if incX == 1 { + x = x[:n] + for i, xi := range x { + for j := 0; j < i; j++ { + x[j] += xi * cmplx.Conj(ap[kk+j]) + } + if diag == blas.NonUnit { + x[i] *= cmplx.Conj(ap[kk+i]) + } + kk += i + 1 + } + } else { + ix := kx + for i := 0; i < n; i++ { + xi := x[ix] + jx := kx + for j := 0; j < i; j++ { + x[jx] += xi * cmplx.Conj(ap[kk+j]) + jx += incX + } + if diag == blas.NonUnit { + x[ix] *= cmplx.Conj(ap[kk+i]) + } + ix += incX + kk += i + 1 + } + } + } +} + +// Ztpsv solves one of the systems of equations +// +// A * x = b if trans == blas.NoTrans +// Aᵀ * x = b if trans == blas.Trans +// Aᴴ * x = b if trans == blas.ConjTrans +// +// where b and x are n element vectors and A is an n×n triangular matrix in +// packed form. +// +// On entry, x contains the values of b, and the solution is +// stored in-place into x. +// +// No test for singularity or near-singularity is included in this +// routine. Such tests must be performed before calling this routine. +func (Implementation) Ztpsv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n int, ap []complex128, x []complex128, incX int) { + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + switch trans { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + switch diag { + default: + panic(badDiag) + case blas.NonUnit, blas.Unit: + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + // Set up start index in X. 
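+ // With a negative increment the vector is traversed from its far end, + // so the first logical element x_0 sits at offset (1-n)*incX.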
+ var kx int + if incX < 0 { + kx = (1 - n) * incX + } + + // The elements of A are accessed sequentially with one pass through ap. + + if trans == blas.NoTrans { + // Form x = inv(A)*x. + if uplo == blas.Upper { + kk := n*(n+1)/2 - 1 + if incX == 1 { + for i := n - 1; i >= 0; i-- { + aii := ap[kk] + if n-i-1 > 0 { + x[i] -= c128.DotuUnitary(x[i+1:n], ap[kk+1:kk+n-i]) + } + if diag == blas.NonUnit { + x[i] /= aii + } + kk -= n - i + 1 + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + aii := ap[kk] + if n-i-1 > 0 { + x[ix] -= c128.DotuInc(x, ap[kk+1:kk+n-i], uintptr(n-i-1), uintptr(incX), 1, uintptr(ix+incX), 0) + } + if diag == blas.NonUnit { + x[ix] /= aii + } + ix -= incX + kk -= n - i + 1 + } + } + } else { + kk := 0 + if incX == 1 { + for i := 0; i < n; i++ { + if i > 0 { + x[i] -= c128.DotuUnitary(x[:i], ap[kk:kk+i]) + } + if diag == blas.NonUnit { + x[i] /= ap[kk+i] + } + kk += i + 1 + } + } else { + ix := kx + for i := 0; i < n; i++ { + if i > 0 { + x[ix] -= c128.DotuInc(x, ap[kk:kk+i], uintptr(i), uintptr(incX), 1, uintptr(kx), 0) + } + if diag == blas.NonUnit { + x[ix] /= ap[kk+i] + } + ix += incX + kk += i + 1 + } + } + } + return + } + + if trans == blas.Trans { + // Form x = inv(Aᵀ)*x. + if uplo == blas.Upper { + kk := 0 + if incX == 1 { + for j := 0; j < n; j++ { + if diag == blas.NonUnit { + x[j] /= ap[kk] + } + if n-j-1 > 0 { + c128.AxpyUnitary(-x[j], ap[kk+1:kk+n-j], x[j+1:n]) + } + kk += n - j + } + } else { + jx := kx + for j := 0; j < n; j++ { + if diag == blas.NonUnit { + x[jx] /= ap[kk] + } + if n-j-1 > 0 { + c128.AxpyInc(-x[jx], ap[kk+1:kk+n-j], x, uintptr(n-j-1), 1, uintptr(incX), 0, uintptr(jx+incX)) + } + jx += incX + kk += n - j + } + } + } else { + kk := n*(n+1)/2 - n + if incX == 1 { + for j := n - 1; j >= 0; j-- { + if diag == blas.NonUnit { + x[j] /= ap[kk+j] + } + if j > 0 { + c128.AxpyUnitary(-x[j], ap[kk:kk+j], x[:j]) + } + kk -= j + } + } else { + jx := kx + (n-1)*incX + for j := n - 1; j >= 0; j-- { + if diag == blas.NonUnit { + x[jx] /= ap[kk+j] + } + if j > 0 { + c128.AxpyInc(-x[jx], ap[kk:kk+j], x, uintptr(j), 1, uintptr(incX), 0, uintptr(kx)) + } + jx -= incX + kk -= j + } + } + } + return + } + + // Form x = inv(Aᴴ)*x. + if uplo == blas.Upper { + kk := 0 + if incX == 1 { + for j := 0; j < n; j++ { + if diag == blas.NonUnit { + x[j] /= cmplx.Conj(ap[kk]) + } + xj := x[j] + k := kk + 1 + for i := j + 1; i < n; i++ { + x[i] -= xj * cmplx.Conj(ap[k]) + k++ + } + kk += n - j + } + } else { + jx := kx + for j := 0; j < n; j++ { + if diag == blas.NonUnit { + x[jx] /= cmplx.Conj(ap[kk]) + } + xj := x[jx] + ix := jx + incX + k := kk + 1 + for i := j + 1; i < n; i++ { + x[ix] -= xj * cmplx.Conj(ap[k]) + ix += incX + k++ + } + jx += incX + kk += n - j + } + } + } else { + kk := n*(n+1)/2 - n + if incX == 1 { + for j := n - 1; j >= 0; j-- { + if diag == blas.NonUnit { + x[j] /= cmplx.Conj(ap[kk+j]) + } + xj := x[j] + for i := 0; i < j; i++ { + x[i] -= xj * cmplx.Conj(ap[kk+i]) + } + kk -= j + } + } else { + jx := kx + (n-1)*incX + for j := n - 1; j >= 0; j-- { + if diag == blas.NonUnit { + x[jx] /= cmplx.Conj(ap[kk+j]) + } + xj := x[jx] + ix := kx + for i := 0; i < j; i++ { + x[ix] -= xj * cmplx.Conj(ap[kk+i]) + ix += incX + } + jx -= incX + kk -= j + } + } + } +} + +// Ztrmv performs one of the matrix-vector operations +// +// x = A * x if trans = blas.NoTrans +// x = Aᵀ * x if trans = blas.Trans +// x = Aᴴ * x if trans = blas.ConjTrans +// +// where x is a vector, and A is an n×n triangular matrix. 
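+// +// A minimal usage sketch (editorial addition, not part of the generated +// source), multiplying x in place by the 2×2 upper triangular matrix +// A = [[1, 2], [0, 3]]: +// +// x := []complex128{1, 1} +// Implementation{}.Ztrmv(blas.Upper, blas.NoTrans, blas.NonUnit, 2, []complex128{1, 2, 0, 3}, 2, x, 1) +// // x is now [1*1+2*1, 3*1] = [3, 3]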
+func (Implementation) Ztrmv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n int, a []complex128, lda int, x []complex128, incX int) { + switch trans { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + switch diag { + default: + panic(badDiag) + case blas.NonUnit, blas.Unit: + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+n { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + // Set up start index in X. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + + // The elements of A are accessed sequentially with one pass through A. + + if trans == blas.NoTrans { + // Form x = A*x. + if uplo == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + if diag == blas.NonUnit { + x[i] *= a[i*lda+i] + } + if n-i-1 > 0 { + x[i] += c128.DotuUnitary(a[i*lda+i+1:i*lda+n], x[i+1:n]) + } + } + } else { + ix := kx + for i := 0; i < n; i++ { + if diag == blas.NonUnit { + x[ix] *= a[i*lda+i] + } + if n-i-1 > 0 { + x[ix] += c128.DotuInc(a[i*lda+i+1:i*lda+n], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(ix+incX)) + } + ix += incX + } + } + } else { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + if diag == blas.NonUnit { + x[i] *= a[i*lda+i] + } + if i > 0 { + x[i] += c128.DotuUnitary(a[i*lda:i*lda+i], x[:i]) + } + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + if diag == blas.NonUnit { + x[ix] *= a[i*lda+i] + } + if i > 0 { + x[ix] += c128.DotuInc(a[i*lda:i*lda+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx)) + } + ix -= incX + } + } + } + return + } + + if trans == blas.Trans { + // Form x = Aᵀ*x. + if uplo == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + xi := x[i] + if diag == blas.NonUnit { + x[i] *= a[i*lda+i] + } + if n-i-1 > 0 { + c128.AxpyUnitary(xi, a[i*lda+i+1:i*lda+n], x[i+1:n]) + } + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + xi := x[ix] + if diag == blas.NonUnit { + x[ix] *= a[i*lda+i] + } + if n-i-1 > 0 { + c128.AxpyInc(xi, a[i*lda+i+1:i*lda+n], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(ix+incX)) + } + ix -= incX + } + } + } else { + if incX == 1 { + for i := 0; i < n; i++ { + if i > 0 { + c128.AxpyUnitary(x[i], a[i*lda:i*lda+i], x[:i]) + } + if diag == blas.NonUnit { + x[i] *= a[i*lda+i] + } + } + } else { + ix := kx + for i := 0; i < n; i++ { + if i > 0 { + c128.AxpyInc(x[ix], a[i*lda:i*lda+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx)) + } + if diag == blas.NonUnit { + x[ix] *= a[i*lda+i] + } + ix += incX + } + } + } + return + } + + // Form x = Aᴴ*x. 
+ if uplo == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + xi := x[i] + if diag == blas.NonUnit { + x[i] *= cmplx.Conj(a[i*lda+i]) + } + for j := i + 1; j < n; j++ { + x[j] += xi * cmplx.Conj(a[i*lda+j]) + } + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + xi := x[ix] + if diag == blas.NonUnit { + x[ix] *= cmplx.Conj(a[i*lda+i]) + } + jx := ix + incX + for j := i + 1; j < n; j++ { + x[jx] += xi * cmplx.Conj(a[i*lda+j]) + jx += incX + } + ix -= incX + } + } + } else { + if incX == 1 { + for i := 0; i < n; i++ { + for j := 0; j < i; j++ { + x[j] += x[i] * cmplx.Conj(a[i*lda+j]) + } + if diag == blas.NonUnit { + x[i] *= cmplx.Conj(a[i*lda+i]) + } + } + } else { + ix := kx + for i := 0; i < n; i++ { + jx := kx + for j := 0; j < i; j++ { + x[jx] += x[ix] * cmplx.Conj(a[i*lda+j]) + jx += incX + } + if diag == blas.NonUnit { + x[ix] *= cmplx.Conj(a[i*lda+i]) + } + ix += incX + } + } + } +} + +// Ztrsv solves one of the systems of equations +// +// A * x = b if trans == blas.NoTrans +// Aᵀ * x = b if trans == blas.Trans +// Aᴴ * x = b if trans == blas.ConjTrans +// +// where b and x are n element vectors and A is an n×n triangular matrix. +// +// On entry, x contains the values of b, and the solution is +// stored in-place into x. +// +// No test for singularity or near-singularity is included in this +// routine. Such tests must be performed before calling this routine. +func (Implementation) Ztrsv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n int, a []complex128, lda int, x []complex128, incX int) { + switch trans { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + switch diag { + default: + panic(badDiag) + case blas.NonUnit, blas.Unit: + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+n { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + // Set up start index in X. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + + // The elements of A are accessed sequentially with one pass through A. + + if trans == blas.NoTrans { + // Form x = inv(A)*x. + if uplo == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + aii := a[i*lda+i] + if n-i-1 > 0 { + x[i] -= c128.DotuUnitary(x[i+1:n], a[i*lda+i+1:i*lda+n]) + } + if diag == blas.NonUnit { + x[i] /= aii + } + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + aii := a[i*lda+i] + if n-i-1 > 0 { + x[ix] -= c128.DotuInc(x, a[i*lda+i+1:i*lda+n], uintptr(n-i-1), uintptr(incX), 1, uintptr(ix+incX), 0) + } + if diag == blas.NonUnit { + x[ix] /= aii + } + ix -= incX + } + } + } else { + if incX == 1 { + for i := 0; i < n; i++ { + if i > 0 { + x[i] -= c128.DotuUnitary(x[:i], a[i*lda:i*lda+i]) + } + if diag == blas.NonUnit { + x[i] /= a[i*lda+i] + } + } + } else { + ix := kx + for i := 0; i < n; i++ { + if i > 0 { + x[ix] -= c128.DotuInc(x, a[i*lda:i*lda+i], uintptr(i), uintptr(incX), 1, uintptr(kx), 0) + } + if diag == blas.NonUnit { + x[ix] /= a[i*lda+i] + } + ix += incX + } + } + } + return + } + + if trans == blas.Trans { + // Form x = inv(Aᵀ)*x. 
+ if uplo == blas.Upper { + if incX == 1 { + for j := 0; j < n; j++ { + if diag == blas.NonUnit { + x[j] /= a[j*lda+j] + } + if n-j-1 > 0 { + c128.AxpyUnitary(-x[j], a[j*lda+j+1:j*lda+n], x[j+1:n]) + } + } + } else { + jx := kx + for j := 0; j < n; j++ { + if diag == blas.NonUnit { + x[jx] /= a[j*lda+j] + } + if n-j-1 > 0 { + c128.AxpyInc(-x[jx], a[j*lda+j+1:j*lda+n], x, uintptr(n-j-1), 1, uintptr(incX), 0, uintptr(jx+incX)) + } + jx += incX + } + } + } else { + if incX == 1 { + for j := n - 1; j >= 0; j-- { + if diag == blas.NonUnit { + x[j] /= a[j*lda+j] + } + xj := x[j] + if j > 0 { + c128.AxpyUnitary(-xj, a[j*lda:j*lda+j], x[:j]) + } + } + } else { + jx := kx + (n-1)*incX + for j := n - 1; j >= 0; j-- { + if diag == blas.NonUnit { + x[jx] /= a[j*lda+j] + } + if j > 0 { + c128.AxpyInc(-x[jx], a[j*lda:j*lda+j], x, uintptr(j), 1, uintptr(incX), 0, uintptr(kx)) + } + jx -= incX + } + } + } + return + } + + // Form x = inv(Aᴴ)*x. + if uplo == blas.Upper { + if incX == 1 { + for j := 0; j < n; j++ { + if diag == blas.NonUnit { + x[j] /= cmplx.Conj(a[j*lda+j]) + } + xj := x[j] + for i := j + 1; i < n; i++ { + x[i] -= xj * cmplx.Conj(a[j*lda+i]) + } + } + } else { + jx := kx + for j := 0; j < n; j++ { + if diag == blas.NonUnit { + x[jx] /= cmplx.Conj(a[j*lda+j]) + } + xj := x[jx] + ix := jx + incX + for i := j + 1; i < n; i++ { + x[ix] -= xj * cmplx.Conj(a[j*lda+i]) + ix += incX + } + jx += incX + } + } + } else { + if incX == 1 { + for j := n - 1; j >= 0; j-- { + if diag == blas.NonUnit { + x[j] /= cmplx.Conj(a[j*lda+j]) + } + xj := x[j] + for i := 0; i < j; i++ { + x[i] -= xj * cmplx.Conj(a[j*lda+i]) + } + } + } else { + jx := kx + (n-1)*incX + for j := n - 1; j >= 0; j-- { + if diag == blas.NonUnit { + x[jx] /= cmplx.Conj(a[j*lda+j]) + } + xj := x[jx] + ix := kx + for i := 0; i < j; i++ { + x[ix] -= xj * cmplx.Conj(a[j*lda+i]) + ix += incX + } + jx -= incX + } + } + } +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx64.go b/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx64.go new file mode 100644 index 0000000000..3ce67868cd --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx64.go @@ -0,0 +1,2976 @@ +// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT. + +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + cmplx "gonum.org/v1/gonum/internal/cmplx64" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/internal/asm/c64" +) + +var _ blas.Complex64Level2 = Implementation{} + +// Cgbmv performs one of the matrix-vector operations +// +// y = alpha * A * x + beta * y if trans = blas.NoTrans +// y = alpha * Aᵀ * x + beta * y if trans = blas.Trans +// y = alpha * Aᴴ * x + beta * y if trans = blas.ConjTrans +// +// where alpha and beta are scalars, x and y are vectors, and A is an m×n band matrix +// with kL sub-diagonals and kU super-diagonals. +// +// Complex64 implementations are autogenerated and not directly tested. 
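+// +// Editorial note on the compact band layout this routine assumes: element +// A(i,j) of the band is stored at a[i*lda+j-i+kL], so each row of a holds +// the band entries of the corresponding matrix row, with the first kL rows +// left-padded by unused slots.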
+func (Implementation) Cgbmv(trans blas.Transpose, m, n, kL, kU int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int) { + switch trans { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + if kL < 0 { + panic(kLLT0) + } + if kU < 0 { + panic(kULT0) + } + if lda < kL+kU+1 { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(min(m, n+kL)-1)+kL+kU+1 { + panic(shortA) + } + var lenX, lenY int + if trans == blas.NoTrans { + lenX, lenY = n, m + } else { + lenX, lenY = m, n + } + if (incX > 0 && len(x) <= (lenX-1)*incX) || (incX < 0 && len(x) <= (1-lenX)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (lenY-1)*incY) || (incY < 0 && len(y) <= (1-lenY)*incY) { + panic(shortY) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + var kx int + if incX < 0 { + kx = (1 - lenX) * incX + } + var ky int + if incY < 0 { + ky = (1 - lenY) * incY + } + + // Form y = beta*y. + if beta != 1 { + if incY == 1 { + if beta == 0 { + for i := range y[:lenY] { + y[i] = 0 + } + } else { + c64.ScalUnitary(beta, y[:lenY]) + } + } else { + iy := ky + if beta == 0 { + for i := 0; i < lenY; i++ { + y[iy] = 0 + iy += incY + } + } else { + if incY > 0 { + c64.ScalInc(beta, y, uintptr(lenY), uintptr(incY)) + } else { + c64.ScalInc(beta, y, uintptr(lenY), uintptr(-incY)) + } + } + } + } + + nRow := min(m, n+kL) + nCol := kL + 1 + kU + switch trans { + case blas.NoTrans: + iy := ky + if incX == 1 { + for i := 0; i < nRow; i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + aRow := a[i*lda+l : i*lda+u] + off := max(0, i-kL) + xtmp := x[off : off+u-l] + var sum complex64 + for j, v := range aRow { + sum += xtmp[j] * v + } + y[iy] += alpha * sum + iy += incY + } + } else { + for i := 0; i < nRow; i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + aRow := a[i*lda+l : i*lda+u] + off := max(0, i-kL) * incX + jx := kx + var sum complex64 + for _, v := range aRow { + sum += x[off+jx] * v + jx += incX + } + y[iy] += alpha * sum + iy += incY + } + } + case blas.Trans: + if incX == 1 { + for i := 0; i < nRow; i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + aRow := a[i*lda+l : i*lda+u] + off := max(0, i-kL) * incY + alphaxi := alpha * x[i] + jy := ky + for _, v := range aRow { + y[off+jy] += alphaxi * v + jy += incY + } + } + } else { + ix := kx + for i := 0; i < nRow; i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + aRow := a[i*lda+l : i*lda+u] + off := max(0, i-kL) * incY + alphaxi := alpha * x[ix] + jy := ky + for _, v := range aRow { + y[off+jy] += alphaxi * v + jy += incY + } + ix += incX + } + } + case blas.ConjTrans: + if incX == 1 { + for i := 0; i < nRow; i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + aRow := a[i*lda+l : i*lda+u] + off := max(0, i-kL) * incY + alphaxi := alpha * x[i] + jy := ky + for _, v := range aRow { + y[off+jy] += alphaxi * cmplx.Conj(v) + jy += incY + } + } + } else { + ix := kx + for i := 0; i < nRow; i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + aRow := a[i*lda+l : i*lda+u] + off := max(0, i-kL) * incY + alphaxi := alpha * x[ix] + jy := ky + for _, v := range aRow { + y[off+jy] += alphaxi * cmplx.Conj(v) + jy += incY + } + ix += incX + } + } + } +} + +// Cgemv performs 
one of the matrix-vector operations +// +// y = alpha * A * x + beta * y if trans = blas.NoTrans +// y = alpha * Aᵀ * x + beta * y if trans = blas.Trans +// y = alpha * Aᴴ * x + beta * y if trans = blas.ConjTrans +// +// where alpha and beta are scalars, x and y are vectors, and A is an m×n dense matrix. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Cgemv(trans blas.Transpose, m, n int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int) { + switch trans { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + var lenX, lenY int + if trans == blas.NoTrans { + lenX = n + lenY = m + } else { + lenX = m + lenY = n + } + if len(a) < lda*(m-1)+n { + panic(shortA) + } + if (incX > 0 && len(x) <= (lenX-1)*incX) || (incX < 0 && len(x) <= (1-lenX)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (lenY-1)*incY) || (incY < 0 && len(y) <= (1-lenY)*incY) { + panic(shortY) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + var kx int + if incX < 0 { + kx = (1 - lenX) * incX + } + var ky int + if incY < 0 { + ky = (1 - lenY) * incY + } + + // Form y = beta*y. + if beta != 1 { + if incY == 1 { + if beta == 0 { + for i := range y[:lenY] { + y[i] = 0 + } + } else { + c64.ScalUnitary(beta, y[:lenY]) + } + } else { + iy := ky + if beta == 0 { + for i := 0; i < lenY; i++ { + y[iy] = 0 + iy += incY + } + } else { + if incY > 0 { + c64.ScalInc(beta, y, uintptr(lenY), uintptr(incY)) + } else { + c64.ScalInc(beta, y, uintptr(lenY), uintptr(-incY)) + } + } + } + } + + if alpha == 0 { + return + } + + switch trans { + default: + // Form y = alpha*A*x + y. + iy := ky + if incX == 1 { + for i := 0; i < m; i++ { + y[iy] += alpha * c64.DotuUnitary(a[i*lda:i*lda+n], x[:n]) + iy += incY + } + return + } + for i := 0; i < m; i++ { + y[iy] += alpha * c64.DotuInc(a[i*lda:i*lda+n], x, uintptr(n), 1, uintptr(incX), 0, uintptr(kx)) + iy += incY + } + return + + case blas.Trans: + // Form y = alpha*Aᵀ*x + y. + ix := kx + if incY == 1 { + for i := 0; i < m; i++ { + c64.AxpyUnitary(alpha*x[ix], a[i*lda:i*lda+n], y[:n]) + ix += incX + } + return + } + for i := 0; i < m; i++ { + c64.AxpyInc(alpha*x[ix], a[i*lda:i*lda+n], y, uintptr(n), 1, uintptr(incY), 0, uintptr(ky)) + ix += incX + } + return + + case blas.ConjTrans: + // Form y = alpha*Aᴴ*x + y. + ix := kx + if incY == 1 { + for i := 0; i < m; i++ { + tmp := alpha * x[ix] + for j := 0; j < n; j++ { + y[j] += tmp * cmplx.Conj(a[i*lda+j]) + } + ix += incX + } + return + } + for i := 0; i < m; i++ { + tmp := alpha * x[ix] + jy := ky + for j := 0; j < n; j++ { + y[jy] += tmp * cmplx.Conj(a[i*lda+j]) + jy += incY + } + ix += incX + } + return + } +} + +// Cgerc performs the rank-one operation +// +// A += alpha * x * yᴴ +// +// where A is an m×n dense matrix, alpha is a scalar, x is an m element vector, +// and y is an n element vector. +// +// Complex64 implementations are autogenerated and not directly tested. 
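+// +// A minimal sketch (editorial addition): with m = n = 1 and alpha = 1, Cgerc +// adds x[0]*conj(y[0]) to a[0], e.g. 1i*conj(1i) = 1: +// +// a := []complex64{0} +// Implementation{}.Cgerc(1, 1, 1, []complex64{1i}, 1, []complex64{1i}, 1, a, 1) +// // a[0] == 1+0i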
+func (Implementation) Cgerc(m, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int) { + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (m-1)*incX) || (incX < 0 && len(x) <= (1-m)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if len(a) < lda*(m-1)+n { + panic(shortA) + } + + // Quick return if possible. + if alpha == 0 { + return + } + + var kx, jy int + if incX < 0 { + kx = (1 - m) * incX + } + if incY < 0 { + jy = (1 - n) * incY + } + for j := 0; j < n; j++ { + if y[jy] != 0 { + tmp := alpha * cmplx.Conj(y[jy]) + c64.AxpyInc(tmp, x, a[j:], uintptr(m), uintptr(incX), uintptr(lda), uintptr(kx), 0) + } + jy += incY + } +} + +// Cgeru performs the rank-one operation +// +// A += alpha * x * yᵀ +// +// where A is an m×n dense matrix, alpha is a scalar, x is an m element vector, +// and y is an n element vector. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Cgeru(m, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int) { + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (m-1)*incX) || (incX < 0 && len(x) <= (1-m)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if len(a) < lda*(m-1)+n { + panic(shortA) + } + + // Quick return if possible. + if alpha == 0 { + return + } + + var kx int + if incX < 0 { + kx = (1 - m) * incX + } + if incY == 1 { + for i := 0; i < m; i++ { + if x[kx] != 0 { + tmp := alpha * x[kx] + c64.AxpyUnitary(tmp, y[:n], a[i*lda:i*lda+n]) + } + kx += incX + } + return + } + var jy int + if incY < 0 { + jy = (1 - n) * incY + } + for i := 0; i < m; i++ { + if x[kx] != 0 { + tmp := alpha * x[kx] + c64.AxpyInc(tmp, y, a[i*lda:i*lda+n], uintptr(n), uintptr(incY), 1, uintptr(jy), 0) + } + kx += incX + } +} + +// Chbmv performs the matrix-vector operation +// +// y = alpha * A * x + beta * y +// +// where alpha and beta are scalars, x and y are vectors, and A is an n×n +// Hermitian band matrix with k super-diagonals. The imaginary parts of +// the diagonal elements of A are ignored and assumed to be zero. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Chbmv(uplo blas.Uplo, n, k int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int) { + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + if n < 0 { + panic(nLT0) + } + if k < 0 { + panic(kLT0) + } + if lda < k+1 { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. 
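+ // The last band row begins at (n-1)*lda and holds k+1 entries, giving + // the minimum length lda*(n-1)+k+1 demanded of a below.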
+ if len(a) < lda*(n-1)+k+1 { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + // Set up the start indices in X and Y. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + var ky int + if incY < 0 { + ky = (1 - n) * incY + } + + // Form y = beta*y. + if beta != 1 { + if incY == 1 { + if beta == 0 { + for i := range y[:n] { + y[i] = 0 + } + } else { + for i, v := range y[:n] { + y[i] = beta * v + } + } + } else { + iy := ky + if beta == 0 { + for i := 0; i < n; i++ { + y[iy] = 0 + iy += incY + } + } else { + for i := 0; i < n; i++ { + y[iy] = beta * y[iy] + iy += incY + } + } + } + } + + if alpha == 0 { + return + } + + // The elements of A are accessed sequentially with one pass through a. + switch uplo { + case blas.Upper: + iy := ky + if incX == 1 { + for i := 0; i < n; i++ { + aRow := a[i*lda:] + alphaxi := alpha * x[i] + sum := alphaxi * complex(real(aRow[0]), 0) + u := min(k+1, n-i) + jy := incY + for j := 1; j < u; j++ { + v := aRow[j] + sum += alpha * x[i+j] * v + y[iy+jy] += alphaxi * cmplx.Conj(v) + jy += incY + } + y[iy] += sum + iy += incY + } + } else { + ix := kx + for i := 0; i < n; i++ { + aRow := a[i*lda:] + alphaxi := alpha * x[ix] + sum := alphaxi * complex(real(aRow[0]), 0) + u := min(k+1, n-i) + jx := incX + jy := incY + for j := 1; j < u; j++ { + v := aRow[j] + sum += alpha * x[ix+jx] * v + y[iy+jy] += alphaxi * cmplx.Conj(v) + jx += incX + jy += incY + } + y[iy] += sum + ix += incX + iy += incY + } + } + case blas.Lower: + iy := ky + if incX == 1 { + for i := 0; i < n; i++ { + l := max(0, k-i) + alphaxi := alpha * x[i] + jy := l * incY + aRow := a[i*lda:] + for j := l; j < k; j++ { + v := aRow[j] + y[iy] += alpha * v * x[i-k+j] + y[iy-k*incY+jy] += alphaxi * cmplx.Conj(v) + jy += incY + } + y[iy] += alphaxi * complex(real(aRow[k]), 0) + iy += incY + } + } else { + ix := kx + for i := 0; i < n; i++ { + l := max(0, k-i) + alphaxi := alpha * x[ix] + jx := l * incX + jy := l * incY + aRow := a[i*lda:] + for j := l; j < k; j++ { + v := aRow[j] + y[iy] += alpha * v * x[ix-k*incX+jx] + y[iy-k*incY+jy] += alphaxi * cmplx.Conj(v) + jx += incX + jy += incY + } + y[iy] += alphaxi * complex(real(aRow[k]), 0) + ix += incX + iy += incY + } + } + } +} + +// Chemv performs the matrix-vector operation +// +// y = alpha * A * x + beta * y +// +// where alpha and beta are scalars, x and y are vectors, and A is an n×n +// Hermitian matrix. The imaginary parts of the diagonal elements of A are +// ignored and assumed to be zero. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Chemv(uplo blas.Uplo, n int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int) { + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. 
+ if len(a) < lda*(n-1)+n { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + // Set up the start indices in X and Y. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + var ky int + if incY < 0 { + ky = (1 - n) * incY + } + + // Form y = beta*y. + if beta != 1 { + if incY == 1 { + if beta == 0 { + for i := range y[:n] { + y[i] = 0 + } + } else { + for i, v := range y[:n] { + y[i] = beta * v + } + } + } else { + iy := ky + if beta == 0 { + for i := 0; i < n; i++ { + y[iy] = 0 + iy += incY + } + } else { + for i := 0; i < n; i++ { + y[iy] = beta * y[iy] + iy += incY + } + } + } + } + + if alpha == 0 { + return + } + + // The elements of A are accessed sequentially with one pass through + // the triangular part of A. + + if uplo == blas.Upper { + // Form y when A is stored in upper triangle. + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + tmp1 := alpha * x[i] + var tmp2 complex64 + for j := i + 1; j < n; j++ { + y[j] += tmp1 * cmplx.Conj(a[i*lda+j]) + tmp2 += a[i*lda+j] * x[j] + } + aii := complex(real(a[i*lda+i]), 0) + y[i] += tmp1*aii + alpha*tmp2 + } + } else { + ix := kx + iy := ky + for i := 0; i < n; i++ { + tmp1 := alpha * x[ix] + var tmp2 complex64 + jx := ix + jy := iy + for j := i + 1; j < n; j++ { + jx += incX + jy += incY + y[jy] += tmp1 * cmplx.Conj(a[i*lda+j]) + tmp2 += a[i*lda+j] * x[jx] + } + aii := complex(real(a[i*lda+i]), 0) + y[iy] += tmp1*aii + alpha*tmp2 + ix += incX + iy += incY + } + } + return + } + + // Form y when A is stored in lower triangle. + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + tmp1 := alpha * x[i] + var tmp2 complex64 + for j := 0; j < i; j++ { + y[j] += tmp1 * cmplx.Conj(a[i*lda+j]) + tmp2 += a[i*lda+j] * x[j] + } + aii := complex(real(a[i*lda+i]), 0) + y[i] += tmp1*aii + alpha*tmp2 + } + } else { + ix := kx + iy := ky + for i := 0; i < n; i++ { + tmp1 := alpha * x[ix] + var tmp2 complex64 + jx := kx + jy := ky + for j := 0; j < i; j++ { + y[jy] += tmp1 * cmplx.Conj(a[i*lda+j]) + tmp2 += a[i*lda+j] * x[jx] + jx += incX + jy += incY + } + aii := complex(real(a[i*lda+i]), 0) + y[iy] += tmp1*aii + alpha*tmp2 + ix += incX + iy += incY + } + } +} + +// Cher performs the Hermitian rank-one operation +// +// A += alpha * x * xᴴ +// +// where A is an n×n Hermitian matrix, alpha is a real scalar, and x is an n +// element vector. On entry, the imaginary parts of the diagonal elements of A +// are ignored and assumed to be zero, on return they will be set to zero. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Cher(uplo blas.Uplo, n int, alpha float32, x []complex64, incX int, a []complex64, lda int) { + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if len(a) < lda*(n-1)+n { + panic(shortA) + } + + // Quick return if possible. 
+ if alpha == 0 { + return + } + + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + if uplo == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + if x[i] != 0 { + tmp := complex(alpha*real(x[i]), alpha*imag(x[i])) + aii := real(a[i*lda+i]) + xtmp := real(tmp * cmplx.Conj(x[i])) + a[i*lda+i] = complex(aii+xtmp, 0) + for j := i + 1; j < n; j++ { + a[i*lda+j] += tmp * cmplx.Conj(x[j]) + } + } else { + aii := real(a[i*lda+i]) + a[i*lda+i] = complex(aii, 0) + } + } + return + } + + ix := kx + for i := 0; i < n; i++ { + if x[ix] != 0 { + tmp := complex(alpha*real(x[ix]), alpha*imag(x[ix])) + aii := real(a[i*lda+i]) + xtmp := real(tmp * cmplx.Conj(x[ix])) + a[i*lda+i] = complex(aii+xtmp, 0) + jx := ix + incX + for j := i + 1; j < n; j++ { + a[i*lda+j] += tmp * cmplx.Conj(x[jx]) + jx += incX + } + } else { + aii := real(a[i*lda+i]) + a[i*lda+i] = complex(aii, 0) + } + ix += incX + } + return + } + + if incX == 1 { + for i := 0; i < n; i++ { + if x[i] != 0 { + tmp := complex(alpha*real(x[i]), alpha*imag(x[i])) + for j := 0; j < i; j++ { + a[i*lda+j] += tmp * cmplx.Conj(x[j]) + } + aii := real(a[i*lda+i]) + xtmp := real(tmp * cmplx.Conj(x[i])) + a[i*lda+i] = complex(aii+xtmp, 0) + } else { + aii := real(a[i*lda+i]) + a[i*lda+i] = complex(aii, 0) + } + } + return + } + + ix := kx + for i := 0; i < n; i++ { + if x[ix] != 0 { + tmp := complex(alpha*real(x[ix]), alpha*imag(x[ix])) + jx := kx + for j := 0; j < i; j++ { + a[i*lda+j] += tmp * cmplx.Conj(x[jx]) + jx += incX + } + aii := real(a[i*lda+i]) + xtmp := real(tmp * cmplx.Conj(x[ix])) + a[i*lda+i] = complex(aii+xtmp, 0) + + } else { + aii := real(a[i*lda+i]) + a[i*lda+i] = complex(aii, 0) + } + ix += incX + } +} + +// Cher2 performs the Hermitian rank-two operation +// +// A += alpha * x * yᴴ + conj(alpha) * y * xᴴ +// +// where alpha is a scalar, x and y are n element vectors and A is an n×n +// Hermitian matrix. On entry, the imaginary parts of the diagonal elements are +// ignored and assumed to be zero. On return they will be set to zero. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Cher2(uplo blas.Uplo, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int) { + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if len(a) < lda*(n-1)+n { + panic(shortA) + } + + // Quick return if possible. 
+ if alpha == 0 { + return + } + + var kx, ky int + var ix, iy int + if incX != 1 || incY != 1 { + if incX < 0 { + kx = (1 - n) * incX + } + if incY < 0 { + ky = (1 - n) * incY + } + ix = kx + iy = ky + } + if uplo == blas.Upper { + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + if x[i] != 0 || y[i] != 0 { + tmp1 := alpha * x[i] + tmp2 := cmplx.Conj(alpha) * y[i] + aii := real(a[i*lda+i]) + real(tmp1*cmplx.Conj(y[i])) + real(tmp2*cmplx.Conj(x[i])) + a[i*lda+i] = complex(aii, 0) + for j := i + 1; j < n; j++ { + a[i*lda+j] += tmp1*cmplx.Conj(y[j]) + tmp2*cmplx.Conj(x[j]) + } + } else { + aii := real(a[i*lda+i]) + a[i*lda+i] = complex(aii, 0) + } + } + return + } + for i := 0; i < n; i++ { + if x[ix] != 0 || y[iy] != 0 { + tmp1 := alpha * x[ix] + tmp2 := cmplx.Conj(alpha) * y[iy] + aii := real(a[i*lda+i]) + real(tmp1*cmplx.Conj(y[iy])) + real(tmp2*cmplx.Conj(x[ix])) + a[i*lda+i] = complex(aii, 0) + jx := ix + incX + jy := iy + incY + for j := i + 1; j < n; j++ { + a[i*lda+j] += tmp1*cmplx.Conj(y[jy]) + tmp2*cmplx.Conj(x[jx]) + jx += incX + jy += incY + } + } else { + aii := real(a[i*lda+i]) + a[i*lda+i] = complex(aii, 0) + } + ix += incX + iy += incY + } + return + } + + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + if x[i] != 0 || y[i] != 0 { + tmp1 := alpha * x[i] + tmp2 := cmplx.Conj(alpha) * y[i] + for j := 0; j < i; j++ { + a[i*lda+j] += tmp1*cmplx.Conj(y[j]) + tmp2*cmplx.Conj(x[j]) + } + aii := real(a[i*lda+i]) + real(tmp1*cmplx.Conj(y[i])) + real(tmp2*cmplx.Conj(x[i])) + a[i*lda+i] = complex(aii, 0) + } else { + aii := real(a[i*lda+i]) + a[i*lda+i] = complex(aii, 0) + } + } + return + } + for i := 0; i < n; i++ { + if x[ix] != 0 || y[iy] != 0 { + tmp1 := alpha * x[ix] + tmp2 := cmplx.Conj(alpha) * y[iy] + jx := kx + jy := ky + for j := 0; j < i; j++ { + a[i*lda+j] += tmp1*cmplx.Conj(y[jy]) + tmp2*cmplx.Conj(x[jx]) + jx += incX + jy += incY + } + aii := real(a[i*lda+i]) + real(tmp1*cmplx.Conj(y[iy])) + real(tmp2*cmplx.Conj(x[ix])) + a[i*lda+i] = complex(aii, 0) + } else { + aii := real(a[i*lda+i]) + a[i*lda+i] = complex(aii, 0) + } + ix += incX + iy += incY + } +} + +// Chpmv performs the matrix-vector operation +// +// y = alpha * A * x + beta * y +// +// where alpha and beta are scalars, x and y are vectors, and A is an n×n +// Hermitian matrix in packed form. The imaginary parts of the diagonal +// elements of A are ignored and assumed to be zero. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Chpmv(uplo blas.Uplo, n int, alpha complex64, ap []complex64, x []complex64, incX int, beta complex64, y []complex64, incY int) { + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + // Set up the start indices in X and Y. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + var ky int + if incY < 0 { + ky = (1 - n) * incY + } + + // Form y = beta*y. 
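+ // Per BLAS convention, beta == 0 writes y without reading it, so NaN or + // Inf values already in y are cleared rather than propagated.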
+ if beta != 1 { + if incY == 1 { + if beta == 0 { + for i := range y[:n] { + y[i] = 0 + } + } else { + for i, v := range y[:n] { + y[i] = beta * v + } + } + } else { + iy := ky + if beta == 0 { + for i := 0; i < n; i++ { + y[iy] = 0 + iy += incY + } + } else { + for i := 0; i < n; i++ { + y[iy] *= beta + iy += incY + } + } + } + } + + if alpha == 0 { + return + } + + // The elements of A are accessed sequentially with one pass through ap. + + var kk int + if uplo == blas.Upper { + // Form y when ap contains the upper triangle. + // Here, kk points to the current diagonal element in ap. + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + tmp1 := alpha * x[i] + y[i] += tmp1 * complex(real(ap[kk]), 0) + var tmp2 complex64 + k := kk + 1 + for j := i + 1; j < n; j++ { + y[j] += tmp1 * cmplx.Conj(ap[k]) + tmp2 += ap[k] * x[j] + k++ + } + y[i] += alpha * tmp2 + kk += n - i + } + } else { + ix := kx + iy := ky + for i := 0; i < n; i++ { + tmp1 := alpha * x[ix] + y[iy] += tmp1 * complex(real(ap[kk]), 0) + var tmp2 complex64 + jx := ix + jy := iy + for k := kk + 1; k < kk+n-i; k++ { + jx += incX + jy += incY + y[jy] += tmp1 * cmplx.Conj(ap[k]) + tmp2 += ap[k] * x[jx] + } + y[iy] += alpha * tmp2 + ix += incX + iy += incY + kk += n - i + } + } + return + } + + // Form y when ap contains the lower triangle. + // Here, kk points to the beginning of current row in ap. + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + tmp1 := alpha * x[i] + var tmp2 complex64 + k := kk + for j := 0; j < i; j++ { + y[j] += tmp1 * cmplx.Conj(ap[k]) + tmp2 += ap[k] * x[j] + k++ + } + aii := complex(real(ap[kk+i]), 0) + y[i] += tmp1*aii + alpha*tmp2 + kk += i + 1 + } + } else { + ix := kx + iy := ky + for i := 0; i < n; i++ { + tmp1 := alpha * x[ix] + var tmp2 complex64 + jx := kx + jy := ky + for k := kk; k < kk+i; k++ { + y[jy] += tmp1 * cmplx.Conj(ap[k]) + tmp2 += ap[k] * x[jx] + jx += incX + jy += incY + } + aii := complex(real(ap[kk+i]), 0) + y[iy] += tmp1*aii + alpha*tmp2 + ix += incX + iy += incY + kk += i + 1 + } + } +} + +// Chpr performs the Hermitian rank-1 operation +// +// A += alpha * x * xᴴ +// +// where alpha is a real scalar, x is a vector, and A is an n×n hermitian matrix +// in packed form. On entry, the imaginary parts of the diagonal elements are +// assumed to be zero, and on return they are set to zero. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Chpr(uplo blas.Uplo, n int, alpha float32, x []complex64, incX int, ap []complex64) { + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + + // Quick return if possible. + if alpha == 0 { + return + } + + // Set up start index in X. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + + // The elements of A are accessed sequentially with one pass through ap. + + var kk int + if uplo == blas.Upper { + // Form A when upper triangle is stored in AP. + // Here, kk points to the current diagonal element in ap. 
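+ // Row i of the packed upper triangle occupies ap[kk : kk+n-i], diagonal + // first, so kk advances by n-i after each row.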
+ if incX == 1 { + for i := 0; i < n; i++ { + xi := x[i] + if xi != 0 { + aii := real(ap[kk]) + alpha*real(cmplx.Conj(xi)*xi) + ap[kk] = complex(aii, 0) + + tmp := complex(alpha, 0) * xi + a := ap[kk+1 : kk+n-i] + x := x[i+1 : n] + for j, v := range x { + a[j] += tmp * cmplx.Conj(v) + } + } else { + ap[kk] = complex(real(ap[kk]), 0) + } + kk += n - i + } + } else { + ix := kx + for i := 0; i < n; i++ { + xi := x[ix] + if xi != 0 { + aii := real(ap[kk]) + alpha*real(cmplx.Conj(xi)*xi) + ap[kk] = complex(aii, 0) + + tmp := complex(alpha, 0) * xi + jx := ix + incX + a := ap[kk+1 : kk+n-i] + for k := range a { + a[k] += tmp * cmplx.Conj(x[jx]) + jx += incX + } + } else { + ap[kk] = complex(real(ap[kk]), 0) + } + ix += incX + kk += n - i + } + } + return + } + + // Form A when lower triangle is stored in AP. + // Here, kk points to the beginning of current row in ap. + if incX == 1 { + for i := 0; i < n; i++ { + xi := x[i] + if xi != 0 { + tmp := complex(alpha, 0) * xi + a := ap[kk : kk+i] + for j, v := range x[:i] { + a[j] += tmp * cmplx.Conj(v) + } + + aii := real(ap[kk+i]) + alpha*real(cmplx.Conj(xi)*xi) + ap[kk+i] = complex(aii, 0) + } else { + ap[kk+i] = complex(real(ap[kk+i]), 0) + } + kk += i + 1 + } + } else { + ix := kx + for i := 0; i < n; i++ { + xi := x[ix] + if xi != 0 { + tmp := complex(alpha, 0) * xi + a := ap[kk : kk+i] + jx := kx + for k := range a { + a[k] += tmp * cmplx.Conj(x[jx]) + jx += incX + } + + aii := real(ap[kk+i]) + alpha*real(cmplx.Conj(xi)*xi) + ap[kk+i] = complex(aii, 0) + } else { + ap[kk+i] = complex(real(ap[kk+i]), 0) + } + ix += incX + kk += i + 1 + } + } +} + +// Chpr2 performs the Hermitian rank-2 operation +// +// A += alpha * x * yᴴ + conj(alpha) * y * xᴴ +// +// where alpha is a complex scalar, x and y are n element vectors, and A is an +// n×n Hermitian matrix, supplied in packed form. On entry, the imaginary parts +// of the diagonal elements are assumed to be zero, and on return they are set to zero. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Chpr2(uplo blas.Uplo, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, ap []complex64) { + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + + // Quick return if possible. + if alpha == 0 { + return + } + + // Set up start indices in X and Y. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + var ky int + if incY < 0 { + ky = (1 - n) * incY + } + + // The elements of A are accessed sequentially with one pass through ap. + + var kk int + if uplo == blas.Upper { + // Form A when upper triangle is stored in AP. + // Here, kk points to the current diagonal element in ap. 
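+ // Only the real part of each updated diagonal element is stored, + // enforcing the Hermitian requirement that the diagonal be real.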
+ if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + if x[i] != 0 || y[i] != 0 { + tmp1 := alpha * x[i] + tmp2 := cmplx.Conj(alpha) * y[i] + aii := real(ap[kk]) + real(tmp1*cmplx.Conj(y[i])) + real(tmp2*cmplx.Conj(x[i])) + ap[kk] = complex(aii, 0) + k := kk + 1 + for j := i + 1; j < n; j++ { + ap[k] += tmp1*cmplx.Conj(y[j]) + tmp2*cmplx.Conj(x[j]) + k++ + } + } else { + ap[kk] = complex(real(ap[kk]), 0) + } + kk += n - i + } + } else { + ix := kx + iy := ky + for i := 0; i < n; i++ { + if x[ix] != 0 || y[iy] != 0 { + tmp1 := alpha * x[ix] + tmp2 := cmplx.Conj(alpha) * y[iy] + aii := real(ap[kk]) + real(tmp1*cmplx.Conj(y[iy])) + real(tmp2*cmplx.Conj(x[ix])) + ap[kk] = complex(aii, 0) + jx := ix + incX + jy := iy + incY + for k := kk + 1; k < kk+n-i; k++ { + ap[k] += tmp1*cmplx.Conj(y[jy]) + tmp2*cmplx.Conj(x[jx]) + jx += incX + jy += incY + } + } else { + ap[kk] = complex(real(ap[kk]), 0) + } + ix += incX + iy += incY + kk += n - i + } + } + return + } + + // Form A when lower triangle is stored in AP. + // Here, kk points to the beginning of current row in ap. + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + if x[i] != 0 || y[i] != 0 { + tmp1 := alpha * x[i] + tmp2 := cmplx.Conj(alpha) * y[i] + k := kk + for j := 0; j < i; j++ { + ap[k] += tmp1*cmplx.Conj(y[j]) + tmp2*cmplx.Conj(x[j]) + k++ + } + aii := real(ap[kk+i]) + real(tmp1*cmplx.Conj(y[i])) + real(tmp2*cmplx.Conj(x[i])) + ap[kk+i] = complex(aii, 0) + } else { + ap[kk+i] = complex(real(ap[kk+i]), 0) + } + kk += i + 1 + } + } else { + ix := kx + iy := ky + for i := 0; i < n; i++ { + if x[ix] != 0 || y[iy] != 0 { + tmp1 := alpha * x[ix] + tmp2 := cmplx.Conj(alpha) * y[iy] + jx := kx + jy := ky + for k := kk; k < kk+i; k++ { + ap[k] += tmp1*cmplx.Conj(y[jy]) + tmp2*cmplx.Conj(x[jx]) + jx += incX + jy += incY + } + aii := real(ap[kk+i]) + real(tmp1*cmplx.Conj(y[iy])) + real(tmp2*cmplx.Conj(x[ix])) + ap[kk+i] = complex(aii, 0) + } else { + ap[kk+i] = complex(real(ap[kk+i]), 0) + } + ix += incX + iy += incY + kk += i + 1 + } + } +} + +// Ctbmv performs one of the matrix-vector operations +// +// x = A * x if trans = blas.NoTrans +// x = Aᵀ * x if trans = blas.Trans +// x = Aᴴ * x if trans = blas.ConjTrans +// +// where x is an n element vector and A is an n×n triangular band matrix, with +// (k+1) diagonals. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Ctbmv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n, k int, a []complex64, lda int, x []complex64, incX int) { + switch trans { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + switch diag { + default: + panic(badDiag) + case blas.NonUnit, blas.Unit: + } + if n < 0 { + panic(nLT0) + } + if k < 0 { + panic(kLT0) + } + if lda < k+1 { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+k+1 { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + // Set up start index in X. 
+ var kx int + if incX < 0 { + kx = (1 - n) * incX + } + + switch trans { + case blas.NoTrans: + if uplo == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + xi := x[i] + if diag == blas.NonUnit { + xi *= a[i*lda] + } + kk := min(k, n-i-1) + for j, aij := range a[i*lda+1 : i*lda+kk+1] { + xi += x[i+j+1] * aij + } + x[i] = xi + } + } else { + ix := kx + for i := 0; i < n; i++ { + xi := x[ix] + if diag == blas.NonUnit { + xi *= a[i*lda] + } + kk := min(k, n-i-1) + jx := ix + incX + for _, aij := range a[i*lda+1 : i*lda+kk+1] { + xi += x[jx] * aij + jx += incX + } + x[ix] = xi + ix += incX + } + } + } else { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + xi := x[i] + if diag == blas.NonUnit { + xi *= a[i*lda+k] + } + kk := min(k, i) + for j, aij := range a[i*lda+k-kk : i*lda+k] { + xi += x[i-kk+j] * aij + } + x[i] = xi + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + xi := x[ix] + if diag == blas.NonUnit { + xi *= a[i*lda+k] + } + kk := min(k, i) + jx := ix - kk*incX + for _, aij := range a[i*lda+k-kk : i*lda+k] { + xi += x[jx] * aij + jx += incX + } + x[ix] = xi + ix -= incX + } + } + } + case blas.Trans: + if uplo == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + kk := min(k, n-i-1) + xi := x[i] + for j, aij := range a[i*lda+1 : i*lda+kk+1] { + x[i+j+1] += xi * aij + } + if diag == blas.NonUnit { + x[i] *= a[i*lda] + } + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + kk := min(k, n-i-1) + jx := ix + incX + xi := x[ix] + for _, aij := range a[i*lda+1 : i*lda+kk+1] { + x[jx] += xi * aij + jx += incX + } + if diag == blas.NonUnit { + x[ix] *= a[i*lda] + } + ix -= incX + } + } + } else { + if incX == 1 { + for i := 0; i < n; i++ { + kk := min(k, i) + xi := x[i] + for j, aij := range a[i*lda+k-kk : i*lda+k] { + x[i-kk+j] += xi * aij + } + if diag == blas.NonUnit { + x[i] *= a[i*lda+k] + } + } + } else { + ix := kx + for i := 0; i < n; i++ { + kk := min(k, i) + jx := ix - kk*incX + xi := x[ix] + for _, aij := range a[i*lda+k-kk : i*lda+k] { + x[jx] += xi * aij + jx += incX + } + if diag == blas.NonUnit { + x[ix] *= a[i*lda+k] + } + ix += incX + } + } + } + case blas.ConjTrans: + if uplo == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + kk := min(k, n-i-1) + xi := x[i] + for j, aij := range a[i*lda+1 : i*lda+kk+1] { + x[i+j+1] += xi * cmplx.Conj(aij) + } + if diag == blas.NonUnit { + x[i] *= cmplx.Conj(a[i*lda]) + } + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + kk := min(k, n-i-1) + jx := ix + incX + xi := x[ix] + for _, aij := range a[i*lda+1 : i*lda+kk+1] { + x[jx] += xi * cmplx.Conj(aij) + jx += incX + } + if diag == blas.NonUnit { + x[ix] *= cmplx.Conj(a[i*lda]) + } + ix -= incX + } + } + } else { + if incX == 1 { + for i := 0; i < n; i++ { + kk := min(k, i) + xi := x[i] + for j, aij := range a[i*lda+k-kk : i*lda+k] { + x[i-kk+j] += xi * cmplx.Conj(aij) + } + if diag == blas.NonUnit { + x[i] *= cmplx.Conj(a[i*lda+k]) + } + } + } else { + ix := kx + for i := 0; i < n; i++ { + kk := min(k, i) + jx := ix - kk*incX + xi := x[ix] + for _, aij := range a[i*lda+k-kk : i*lda+k] { + x[jx] += xi * cmplx.Conj(aij) + jx += incX + } + if diag == blas.NonUnit { + x[ix] *= cmplx.Conj(a[i*lda+k]) + } + ix += incX + } + } + } + } +} + +// Ctbsv solves one of the systems of equations +// +// A * x = b if trans == blas.NoTrans +// Aᵀ * x = b if trans == blas.Trans +// Aᴴ * x = b if trans == blas.ConjTrans +// +// where b and x are n element vectors and A is an n×n triangular band matrix +// 
with (k+1) diagonals. +// +// On entry, x contains the values of b, and the solution is +// stored in-place into x. +// +// No test for singularity or near-singularity is included in this +// routine. Such tests must be performed before calling this routine. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Ctbsv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n, k int, a []complex64, lda int, x []complex64, incX int) { + switch trans { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + switch diag { + default: + panic(badDiag) + case blas.NonUnit, blas.Unit: + } + if n < 0 { + panic(nLT0) + } + if k < 0 { + panic(kLT0) + } + if lda < k+1 { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+k+1 { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + // Set up start index in X. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + + switch trans { + case blas.NoTrans: + if uplo == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + kk := min(k, n-i-1) + var sum complex64 + for j, aij := range a[i*lda+1 : i*lda+kk+1] { + sum += x[i+1+j] * aij + } + x[i] -= sum + if diag == blas.NonUnit { + x[i] /= a[i*lda] + } + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + kk := min(k, n-i-1) + var sum complex64 + jx := ix + incX + for _, aij := range a[i*lda+1 : i*lda+kk+1] { + sum += x[jx] * aij + jx += incX + } + x[ix] -= sum + if diag == blas.NonUnit { + x[ix] /= a[i*lda] + } + ix -= incX + } + } + } else { + if incX == 1 { + for i := 0; i < n; i++ { + kk := min(k, i) + var sum complex64 + for j, aij := range a[i*lda+k-kk : i*lda+k] { + sum += x[i-kk+j] * aij + } + x[i] -= sum + if diag == blas.NonUnit { + x[i] /= a[i*lda+k] + } + } + } else { + ix := kx + for i := 0; i < n; i++ { + kk := min(k, i) + var sum complex64 + jx := ix - kk*incX + for _, aij := range a[i*lda+k-kk : i*lda+k] { + sum += x[jx] * aij + jx += incX + } + x[ix] -= sum + if diag == blas.NonUnit { + x[ix] /= a[i*lda+k] + } + ix += incX + } + } + } + case blas.Trans: + if uplo == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + if diag == blas.NonUnit { + x[i] /= a[i*lda] + } + kk := min(k, n-i-1) + xi := x[i] + for j, aij := range a[i*lda+1 : i*lda+kk+1] { + x[i+1+j] -= xi * aij + } + } + } else { + ix := kx + for i := 0; i < n; i++ { + if diag == blas.NonUnit { + x[ix] /= a[i*lda] + } + kk := min(k, n-i-1) + xi := x[ix] + jx := ix + incX + for _, aij := range a[i*lda+1 : i*lda+kk+1] { + x[jx] -= xi * aij + jx += incX + } + ix += incX + } + } + } else { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + if diag == blas.NonUnit { + x[i] /= a[i*lda+k] + } + kk := min(k, i) + xi := x[i] + for j, aij := range a[i*lda+k-kk : i*lda+k] { + x[i-kk+j] -= xi * aij + } + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + if diag == blas.NonUnit { + x[ix] /= a[i*lda+k] + } + kk := min(k, i) + xi := x[ix] + jx := ix - kk*incX + for _, aij := range a[i*lda+k-kk : i*lda+k] { + x[jx] -= xi * aij + jx += incX + } + ix -= incX + } + } + } + case blas.ConjTrans: + if uplo == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + if diag == blas.NonUnit { + x[i] /= 
cmplx.Conj(a[i*lda]) + } + kk := min(k, n-i-1) + xi := x[i] + for j, aij := range a[i*lda+1 : i*lda+kk+1] { + x[i+1+j] -= xi * cmplx.Conj(aij) + } + } + } else { + ix := kx + for i := 0; i < n; i++ { + if diag == blas.NonUnit { + x[ix] /= cmplx.Conj(a[i*lda]) + } + kk := min(k, n-i-1) + xi := x[ix] + jx := ix + incX + for _, aij := range a[i*lda+1 : i*lda+kk+1] { + x[jx] -= xi * cmplx.Conj(aij) + jx += incX + } + ix += incX + } + } + } else { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + if diag == blas.NonUnit { + x[i] /= cmplx.Conj(a[i*lda+k]) + } + kk := min(k, i) + xi := x[i] + for j, aij := range a[i*lda+k-kk : i*lda+k] { + x[i-kk+j] -= xi * cmplx.Conj(aij) + } + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + if diag == blas.NonUnit { + x[ix] /= cmplx.Conj(a[i*lda+k]) + } + kk := min(k, i) + xi := x[ix] + jx := ix - kk*incX + for _, aij := range a[i*lda+k-kk : i*lda+k] { + x[jx] -= xi * cmplx.Conj(aij) + jx += incX + } + ix -= incX + } + } + } + } +} + +// Ctpmv performs one of the matrix-vector operations +// +// x = A * x if trans = blas.NoTrans +// x = Aᵀ * x if trans = blas.Trans +// x = Aᴴ * x if trans = blas.ConjTrans +// +// where x is an n element vector and A is an n×n triangular matrix, supplied in +// packed form. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Ctpmv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n int, ap []complex64, x []complex64, incX int) { + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + switch trans { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + switch diag { + default: + panic(badDiag) + case blas.NonUnit, blas.Unit: + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + // Set up start index in X. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + + // The elements of A are accessed sequentially with one pass through A. + + if trans == blas.NoTrans { + // Form x = A*x. + if uplo == blas.Upper { + // kk points to the current diagonal element in ap. + kk := 0 + if incX == 1 { + x = x[:n] + for i := range x { + if diag == blas.NonUnit { + x[i] *= ap[kk] + } + if n-i-1 > 0 { + x[i] += c64.DotuUnitary(ap[kk+1:kk+n-i], x[i+1:]) + } + kk += n - i + } + } else { + ix := kx + for i := 0; i < n; i++ { + if diag == blas.NonUnit { + x[ix] *= ap[kk] + } + if n-i-1 > 0 { + x[ix] += c64.DotuInc(ap[kk+1:kk+n-i], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(ix+incX)) + } + ix += incX + kk += n - i + } + } + } else { + // kk points to the beginning of current row in ap. + kk := n*(n+1)/2 - n + if incX == 1 { + for i := n - 1; i >= 0; i-- { + if diag == blas.NonUnit { + x[i] *= ap[kk+i] + } + if i > 0 { + x[i] += c64.DotuUnitary(ap[kk:kk+i], x[:i]) + } + kk -= i + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + if diag == blas.NonUnit { + x[ix] *= ap[kk+i] + } + if i > 0 { + x[ix] += c64.DotuInc(ap[kk:kk+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx)) + } + ix -= incX + kk -= i + } + } + } + return + } + + if trans == blas.Trans { + // Form x = Aᵀ*x. + if uplo == blas.Upper { + // kk points to the current diagonal element in ap. 
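+	// In row-major upper packed storage the n = 3 triangle is laid out as
+	// ap = [a00, a01, a02, a11, a12, a22]: the final element is the last
+	// diagonal, and stepping up one row moves kk back by n-i+1.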
+ kk := n*(n+1)/2 - 1 + if incX == 1 { + for i := n - 1; i >= 0; i-- { + xi := x[i] + if diag == blas.NonUnit { + x[i] *= ap[kk] + } + if n-i-1 > 0 { + c64.AxpyUnitary(xi, ap[kk+1:kk+n-i], x[i+1:n]) + } + kk -= n - i + 1 + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + xi := x[ix] + if diag == blas.NonUnit { + x[ix] *= ap[kk] + } + if n-i-1 > 0 { + c64.AxpyInc(xi, ap[kk+1:kk+n-i], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(ix+incX)) + } + ix -= incX + kk -= n - i + 1 + } + } + } else { + // kk points to the beginning of current row in ap. + kk := 0 + if incX == 1 { + x = x[:n] + for i := range x { + if i > 0 { + c64.AxpyUnitary(x[i], ap[kk:kk+i], x[:i]) + } + if diag == blas.NonUnit { + x[i] *= ap[kk+i] + } + kk += i + 1 + } + } else { + ix := kx + for i := 0; i < n; i++ { + if i > 0 { + c64.AxpyInc(x[ix], ap[kk:kk+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx)) + } + if diag == blas.NonUnit { + x[ix] *= ap[kk+i] + } + ix += incX + kk += i + 1 + } + } + } + return + } + + // Form x = Aᴴ*x. + if uplo == blas.Upper { + // kk points to the current diagonal element in ap. + kk := n*(n+1)/2 - 1 + if incX == 1 { + for i := n - 1; i >= 0; i-- { + xi := x[i] + if diag == blas.NonUnit { + x[i] *= cmplx.Conj(ap[kk]) + } + k := kk + 1 + for j := i + 1; j < n; j++ { + x[j] += xi * cmplx.Conj(ap[k]) + k++ + } + kk -= n - i + 1 + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + xi := x[ix] + if diag == blas.NonUnit { + x[ix] *= cmplx.Conj(ap[kk]) + } + jx := ix + incX + k := kk + 1 + for j := i + 1; j < n; j++ { + x[jx] += xi * cmplx.Conj(ap[k]) + jx += incX + k++ + } + ix -= incX + kk -= n - i + 1 + } + } + } else { + // kk points to the beginning of current row in ap. + kk := 0 + if incX == 1 { + x = x[:n] + for i, xi := range x { + for j := 0; j < i; j++ { + x[j] += xi * cmplx.Conj(ap[kk+j]) + } + if diag == blas.NonUnit { + x[i] *= cmplx.Conj(ap[kk+i]) + } + kk += i + 1 + } + } else { + ix := kx + for i := 0; i < n; i++ { + xi := x[ix] + jx := kx + for j := 0; j < i; j++ { + x[jx] += xi * cmplx.Conj(ap[kk+j]) + jx += incX + } + if diag == blas.NonUnit { + x[ix] *= cmplx.Conj(ap[kk+i]) + } + ix += incX + kk += i + 1 + } + } + } +} + +// Ctpsv solves one of the systems of equations +// +// A * x = b if trans == blas.NoTrans +// Aᵀ * x = b if trans == blas.Trans +// Aᴴ * x = b if trans == blas.ConjTrans +// +// where b and x are n element vectors and A is an n×n triangular matrix in +// packed form. +// +// On entry, x contains the values of b, and the solution is +// stored in-place into x. +// +// No test for singularity or near-singularity is included in this +// routine. Such tests must be performed before calling this routine. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Ctpsv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n int, ap []complex64, x []complex64, incX int) { + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + switch trans { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + switch diag { + default: + panic(badDiag) + case blas.NonUnit, blas.Unit: + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. 
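+	// A packed triangular matrix stores exactly n*(n+1)/2 elements, one
+	// per entry of the stored triangle.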
+ if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + // Set up start index in X. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + + // The elements of A are accessed sequentially with one pass through ap. + + if trans == blas.NoTrans { + // Form x = inv(A)*x. + if uplo == blas.Upper { + kk := n*(n+1)/2 - 1 + if incX == 1 { + for i := n - 1; i >= 0; i-- { + aii := ap[kk] + if n-i-1 > 0 { + x[i] -= c64.DotuUnitary(x[i+1:n], ap[kk+1:kk+n-i]) + } + if diag == blas.NonUnit { + x[i] /= aii + } + kk -= n - i + 1 + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + aii := ap[kk] + if n-i-1 > 0 { + x[ix] -= c64.DotuInc(x, ap[kk+1:kk+n-i], uintptr(n-i-1), uintptr(incX), 1, uintptr(ix+incX), 0) + } + if diag == blas.NonUnit { + x[ix] /= aii + } + ix -= incX + kk -= n - i + 1 + } + } + } else { + kk := 0 + if incX == 1 { + for i := 0; i < n; i++ { + if i > 0 { + x[i] -= c64.DotuUnitary(x[:i], ap[kk:kk+i]) + } + if diag == blas.NonUnit { + x[i] /= ap[kk+i] + } + kk += i + 1 + } + } else { + ix := kx + for i := 0; i < n; i++ { + if i > 0 { + x[ix] -= c64.DotuInc(x, ap[kk:kk+i], uintptr(i), uintptr(incX), 1, uintptr(kx), 0) + } + if diag == blas.NonUnit { + x[ix] /= ap[kk+i] + } + ix += incX + kk += i + 1 + } + } + } + return + } + + if trans == blas.Trans { + // Form x = inv(Aᵀ)*x. + if uplo == blas.Upper { + kk := 0 + if incX == 1 { + for j := 0; j < n; j++ { + if diag == blas.NonUnit { + x[j] /= ap[kk] + } + if n-j-1 > 0 { + c64.AxpyUnitary(-x[j], ap[kk+1:kk+n-j], x[j+1:n]) + } + kk += n - j + } + } else { + jx := kx + for j := 0; j < n; j++ { + if diag == blas.NonUnit { + x[jx] /= ap[kk] + } + if n-j-1 > 0 { + c64.AxpyInc(-x[jx], ap[kk+1:kk+n-j], x, uintptr(n-j-1), 1, uintptr(incX), 0, uintptr(jx+incX)) + } + jx += incX + kk += n - j + } + } + } else { + kk := n*(n+1)/2 - n + if incX == 1 { + for j := n - 1; j >= 0; j-- { + if diag == blas.NonUnit { + x[j] /= ap[kk+j] + } + if j > 0 { + c64.AxpyUnitary(-x[j], ap[kk:kk+j], x[:j]) + } + kk -= j + } + } else { + jx := kx + (n-1)*incX + for j := n - 1; j >= 0; j-- { + if diag == blas.NonUnit { + x[jx] /= ap[kk+j] + } + if j > 0 { + c64.AxpyInc(-x[jx], ap[kk:kk+j], x, uintptr(j), 1, uintptr(incX), 0, uintptr(kx)) + } + jx -= incX + kk -= j + } + } + } + return + } + + // Form x = inv(Aᴴ)*x. 
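+	// Aᴴ of an upper-triangular matrix is lower triangular, so for
+	// blas.Upper the solve runs forward: once x[j] is final it is
+	// eliminated from every later equation using conjugated row j.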
+ if uplo == blas.Upper { + kk := 0 + if incX == 1 { + for j := 0; j < n; j++ { + if diag == blas.NonUnit { + x[j] /= cmplx.Conj(ap[kk]) + } + xj := x[j] + k := kk + 1 + for i := j + 1; i < n; i++ { + x[i] -= xj * cmplx.Conj(ap[k]) + k++ + } + kk += n - j + } + } else { + jx := kx + for j := 0; j < n; j++ { + if diag == blas.NonUnit { + x[jx] /= cmplx.Conj(ap[kk]) + } + xj := x[jx] + ix := jx + incX + k := kk + 1 + for i := j + 1; i < n; i++ { + x[ix] -= xj * cmplx.Conj(ap[k]) + ix += incX + k++ + } + jx += incX + kk += n - j + } + } + } else { + kk := n*(n+1)/2 - n + if incX == 1 { + for j := n - 1; j >= 0; j-- { + if diag == blas.NonUnit { + x[j] /= cmplx.Conj(ap[kk+j]) + } + xj := x[j] + for i := 0; i < j; i++ { + x[i] -= xj * cmplx.Conj(ap[kk+i]) + } + kk -= j + } + } else { + jx := kx + (n-1)*incX + for j := n - 1; j >= 0; j-- { + if diag == blas.NonUnit { + x[jx] /= cmplx.Conj(ap[kk+j]) + } + xj := x[jx] + ix := kx + for i := 0; i < j; i++ { + x[ix] -= xj * cmplx.Conj(ap[kk+i]) + ix += incX + } + jx -= incX + kk -= j + } + } + } +} + +// Ctrmv performs one of the matrix-vector operations +// +// x = A * x if trans = blas.NoTrans +// x = Aᵀ * x if trans = blas.Trans +// x = Aᴴ * x if trans = blas.ConjTrans +// +// where x is a vector, and A is an n×n triangular matrix. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Ctrmv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n int, a []complex64, lda int, x []complex64, incX int) { + switch trans { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + switch diag { + default: + panic(badDiag) + case blas.NonUnit, blas.Unit: + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+n { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + // Set up start index in X. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + + // The elements of A are accessed sequentially with one pass through A. + + if trans == blas.NoTrans { + // Form x = A*x. + if uplo == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + if diag == blas.NonUnit { + x[i] *= a[i*lda+i] + } + if n-i-1 > 0 { + x[i] += c64.DotuUnitary(a[i*lda+i+1:i*lda+n], x[i+1:n]) + } + } + } else { + ix := kx + for i := 0; i < n; i++ { + if diag == blas.NonUnit { + x[ix] *= a[i*lda+i] + } + if n-i-1 > 0 { + x[ix] += c64.DotuInc(a[i*lda+i+1:i*lda+n], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(ix+incX)) + } + ix += incX + } + } + } else { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + if diag == blas.NonUnit { + x[i] *= a[i*lda+i] + } + if i > 0 { + x[i] += c64.DotuUnitary(a[i*lda:i*lda+i], x[:i]) + } + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + if diag == blas.NonUnit { + x[ix] *= a[i*lda+i] + } + if i > 0 { + x[ix] += c64.DotuInc(a[i*lda:i*lda+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx)) + } + ix -= incX + } + } + } + return + } + + if trans == blas.Trans { + // Form x = Aᵀ*x. 
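+	// The transposed product is built from axpy updates: each x[i] is
+	// scattered into the entries of x it influences, so A is still read
+	// row by row in a single pass.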
+ if uplo == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + xi := x[i] + if diag == blas.NonUnit { + x[i] *= a[i*lda+i] + } + if n-i-1 > 0 { + c64.AxpyUnitary(xi, a[i*lda+i+1:i*lda+n], x[i+1:n]) + } + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + xi := x[ix] + if diag == blas.NonUnit { + x[ix] *= a[i*lda+i] + } + if n-i-1 > 0 { + c64.AxpyInc(xi, a[i*lda+i+1:i*lda+n], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(ix+incX)) + } + ix -= incX + } + } + } else { + if incX == 1 { + for i := 0; i < n; i++ { + if i > 0 { + c64.AxpyUnitary(x[i], a[i*lda:i*lda+i], x[:i]) + } + if diag == blas.NonUnit { + x[i] *= a[i*lda+i] + } + } + } else { + ix := kx + for i := 0; i < n; i++ { + if i > 0 { + c64.AxpyInc(x[ix], a[i*lda:i*lda+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx)) + } + if diag == blas.NonUnit { + x[ix] *= a[i*lda+i] + } + ix += incX + } + } + } + return + } + + // Form x = Aᴴ*x. + if uplo == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + xi := x[i] + if diag == blas.NonUnit { + x[i] *= cmplx.Conj(a[i*lda+i]) + } + for j := i + 1; j < n; j++ { + x[j] += xi * cmplx.Conj(a[i*lda+j]) + } + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + xi := x[ix] + if diag == blas.NonUnit { + x[ix] *= cmplx.Conj(a[i*lda+i]) + } + jx := ix + incX + for j := i + 1; j < n; j++ { + x[jx] += xi * cmplx.Conj(a[i*lda+j]) + jx += incX + } + ix -= incX + } + } + } else { + if incX == 1 { + for i := 0; i < n; i++ { + for j := 0; j < i; j++ { + x[j] += x[i] * cmplx.Conj(a[i*lda+j]) + } + if diag == blas.NonUnit { + x[i] *= cmplx.Conj(a[i*lda+i]) + } + } + } else { + ix := kx + for i := 0; i < n; i++ { + jx := kx + for j := 0; j < i; j++ { + x[jx] += x[ix] * cmplx.Conj(a[i*lda+j]) + jx += incX + } + if diag == blas.NonUnit { + x[ix] *= cmplx.Conj(a[i*lda+i]) + } + ix += incX + } + } + } +} + +// Ctrsv solves one of the systems of equations +// +// A * x = b if trans == blas.NoTrans +// Aᵀ * x = b if trans == blas.Trans +// Aᴴ * x = b if trans == blas.ConjTrans +// +// where b and x are n element vectors and A is an n×n triangular matrix. +// +// On entry, x contains the values of b, and the solution is +// stored in-place into x. +// +// No test for singularity or near-singularity is included in this +// routine. Such tests must be performed before calling this routine. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Ctrsv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n int, a []complex64, lda int, x []complex64, incX int) { + switch trans { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + switch uplo { + default: + panic(badUplo) + case blas.Upper, blas.Lower: + } + switch diag { + default: + panic(badDiag) + case blas.NonUnit, blas.Unit: + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+n { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + // Set up start index in X. + var kx int + if incX < 0 { + kx = (1 - n) * incX + } + + // The elements of A are accessed sequentially with one pass through A. + + if trans == blas.NoTrans { + // Form x = inv(A)*x. 
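+	// For blas.Upper this is back substitution: rows are solved from the
+	// last one upward, each subtracting its dot product with the
+	// already-solved tail of x before dividing by the diagonal.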
+ if uplo == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + aii := a[i*lda+i] + if n-i-1 > 0 { + x[i] -= c64.DotuUnitary(x[i+1:n], a[i*lda+i+1:i*lda+n]) + } + if diag == blas.NonUnit { + x[i] /= aii + } + } + } else { + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + aii := a[i*lda+i] + if n-i-1 > 0 { + x[ix] -= c64.DotuInc(x, a[i*lda+i+1:i*lda+n], uintptr(n-i-1), uintptr(incX), 1, uintptr(ix+incX), 0) + } + if diag == blas.NonUnit { + x[ix] /= aii + } + ix -= incX + } + } + } else { + if incX == 1 { + for i := 0; i < n; i++ { + if i > 0 { + x[i] -= c64.DotuUnitary(x[:i], a[i*lda:i*lda+i]) + } + if diag == blas.NonUnit { + x[i] /= a[i*lda+i] + } + } + } else { + ix := kx + for i := 0; i < n; i++ { + if i > 0 { + x[ix] -= c64.DotuInc(x, a[i*lda:i*lda+i], uintptr(i), uintptr(incX), 1, uintptr(kx), 0) + } + if diag == blas.NonUnit { + x[ix] /= a[i*lda+i] + } + ix += incX + } + } + } + return + } + + if trans == blas.Trans { + // Form x = inv(Aᵀ)*x. + if uplo == blas.Upper { + if incX == 1 { + for j := 0; j < n; j++ { + if diag == blas.NonUnit { + x[j] /= a[j*lda+j] + } + if n-j-1 > 0 { + c64.AxpyUnitary(-x[j], a[j*lda+j+1:j*lda+n], x[j+1:n]) + } + } + } else { + jx := kx + for j := 0; j < n; j++ { + if diag == blas.NonUnit { + x[jx] /= a[j*lda+j] + } + if n-j-1 > 0 { + c64.AxpyInc(-x[jx], a[j*lda+j+1:j*lda+n], x, uintptr(n-j-1), 1, uintptr(incX), 0, uintptr(jx+incX)) + } + jx += incX + } + } + } else { + if incX == 1 { + for j := n - 1; j >= 0; j-- { + if diag == blas.NonUnit { + x[j] /= a[j*lda+j] + } + xj := x[j] + if j > 0 { + c64.AxpyUnitary(-xj, a[j*lda:j*lda+j], x[:j]) + } + } + } else { + jx := kx + (n-1)*incX + for j := n - 1; j >= 0; j-- { + if diag == blas.NonUnit { + x[jx] /= a[j*lda+j] + } + if j > 0 { + c64.AxpyInc(-x[jx], a[j*lda:j*lda+j], x, uintptr(j), 1, uintptr(incX), 0, uintptr(kx)) + } + jx -= incX + } + } + } + return + } + + // Form x = inv(Aᴴ)*x. + if uplo == blas.Upper { + if incX == 1 { + for j := 0; j < n; j++ { + if diag == blas.NonUnit { + x[j] /= cmplx.Conj(a[j*lda+j]) + } + xj := x[j] + for i := j + 1; i < n; i++ { + x[i] -= xj * cmplx.Conj(a[j*lda+i]) + } + } + } else { + jx := kx + for j := 0; j < n; j++ { + if diag == blas.NonUnit { + x[jx] /= cmplx.Conj(a[j*lda+j]) + } + xj := x[jx] + ix := jx + incX + for i := j + 1; i < n; i++ { + x[ix] -= xj * cmplx.Conj(a[j*lda+i]) + ix += incX + } + jx += incX + } + } + } else { + if incX == 1 { + for j := n - 1; j >= 0; j-- { + if diag == blas.NonUnit { + x[j] /= cmplx.Conj(a[j*lda+j]) + } + xj := x[j] + for i := 0; i < j; i++ { + x[i] -= xj * cmplx.Conj(a[j*lda+i]) + } + } + } else { + jx := kx + (n-1)*incX + for j := n - 1; j >= 0; j-- { + if diag == blas.NonUnit { + x[jx] /= cmplx.Conj(a[j*lda+j]) + } + xj := x[jx] + ix := kx + for i := 0; i < j; i++ { + x[ix] -= xj * cmplx.Conj(a[j*lda+i]) + ix += incX + } + jx -= incX + } + } + } +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level2float32.go b/vendor/gonum.org/v1/gonum/blas/gonum/level2float32.go new file mode 100644 index 0000000000..26e4959d7f --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/level2float32.go @@ -0,0 +1,2400 @@ +// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT. + +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/internal/asm/f32" +) + +var _ blas.Float32Level2 = Implementation{} + +// Sger performs the rank-one operation +// +// A += alpha * x * yᵀ +// +// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Sger(m, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32, lda int) { + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (m-1)*incX) || (incX < 0 && len(x) <= (1-m)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if len(a) < lda*(m-1)+n { + panic(shortA) + } + + // Quick return if possible. + if alpha == 0 { + return + } + f32.Ger(uintptr(m), uintptr(n), + alpha, + x, uintptr(incX), + y, uintptr(incY), + a, uintptr(lda)) +} + +// Sgbmv performs one of the matrix-vector operations +// +// y = alpha * A * x + beta * y if tA == blas.NoTrans +// y = alpha * Aᵀ * x + beta * y if tA == blas.Trans or blas.ConjTrans +// +// where A is an m×n band matrix with kL sub-diagonals and kU super-diagonals, +// x and y are vectors, and alpha and beta are scalars. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Sgbmv(tA blas.Transpose, m, n, kL, kU int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int) { + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + if kL < 0 { + panic(kLLT0) + } + if kU < 0 { + panic(kULT0) + } + if lda < kL+kU+1 { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(min(m, n+kL)-1)+kL+kU+1 { + panic(shortA) + } + lenX := m + lenY := n + if tA == blas.NoTrans { + lenX = n + lenY = m + } + if (incX > 0 && len(x) <= (lenX-1)*incX) || (incX < 0 && len(x) <= (1-lenX)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (lenY-1)*incY) || (incY < 0 && len(y) <= (1-lenY)*incY) { + panic(shortY) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + var kx, ky int + if incX < 0 { + kx = -(lenX - 1) * incX + } + if incY < 0 { + ky = -(lenY - 1) * incY + } + + // Form y = beta * y. + if beta != 1 { + if incY == 1 { + if beta == 0 { + for i := range y[:lenY] { + y[i] = 0 + } + } else { + f32.ScalUnitary(beta, y[:lenY]) + } + } else { + iy := ky + if beta == 0 { + for i := 0; i < lenY; i++ { + y[iy] = 0 + iy += incY + } + } else { + if incY > 0 { + f32.ScalInc(beta, y, uintptr(lenY), uintptr(incY)) + } else { + f32.ScalInc(beta, y, uintptr(lenY), uintptr(-incY)) + } + } + } + } + + if alpha == 0 { + return + } + + // i and j are indices of the compacted banded matrix. 
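+	// Row i of the compact matrix holds the entries of dense row i from
+	// column i-kL through i+kU, clipped to the matrix bounds.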
+ // off is the offset into the dense matrix (off + j = densej) + nCol := kU + 1 + kL + if tA == blas.NoTrans { + iy := ky + if incX == 1 { + for i := 0; i < min(m, n+kL); i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + off := max(0, i-kL) + atmp := a[i*lda+l : i*lda+u] + xtmp := x[off : off+u-l] + var sum float32 + for j, v := range atmp { + sum += xtmp[j] * v + } + y[iy] += sum * alpha + iy += incY + } + return + } + for i := 0; i < min(m, n+kL); i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + off := max(0, i-kL) + atmp := a[i*lda+l : i*lda+u] + jx := kx + var sum float32 + for _, v := range atmp { + sum += x[off*incX+jx] * v + jx += incX + } + y[iy] += sum * alpha + iy += incY + } + return + } + if incX == 1 { + for i := 0; i < min(m, n+kL); i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + off := max(0, i-kL) + atmp := a[i*lda+l : i*lda+u] + tmp := alpha * x[i] + jy := ky + for _, v := range atmp { + y[jy+off*incY] += tmp * v + jy += incY + } + } + return + } + ix := kx + for i := 0; i < min(m, n+kL); i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + off := max(0, i-kL) + atmp := a[i*lda+l : i*lda+u] + tmp := alpha * x[ix] + jy := ky + for _, v := range atmp { + y[jy+off*incY] += tmp * v + jy += incY + } + ix += incX + } +} + +// Sgemv computes +// +// y = alpha * A * x + beta * y if tA = blas.NoTrans +// y = alpha * Aᵀ * x + beta * y if tA = blas.Trans or blas.ConjTrans +// +// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Sgemv(tA blas.Transpose, m, n int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int) { + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + // Set up indexes + lenX := m + lenY := n + if tA == blas.NoTrans { + lenX = n + lenY = m + } + + // Quick return if possible + if m == 0 || n == 0 { + return + } + + if (incX > 0 && (lenX-1)*incX >= len(x)) || (incX < 0 && (1-lenX)*incX >= len(x)) { + panic(shortX) + } + if (incY > 0 && (lenY-1)*incY >= len(y)) || (incY < 0 && (1-lenY)*incY >= len(y)) { + panic(shortY) + } + if len(a) < lda*(m-1)+n { + panic(shortA) + } + + // Quick return if possible + if alpha == 0 && beta == 1 { + return + } + + if alpha == 0 { + // First form y = beta * y + if incY > 0 { + Implementation{}.Sscal(lenY, beta, y, incY) + } else { + Implementation{}.Sscal(lenY, beta, y, -incY) + } + return + } + + // Form y = alpha * A * x + y + if tA == blas.NoTrans { + f32.GemvN(uintptr(m), uintptr(n), alpha, a, uintptr(lda), x, uintptr(incX), beta, y, uintptr(incY)) + return + } + // Cases where a is transposed. + f32.GemvT(uintptr(m), uintptr(n), alpha, a, uintptr(lda), x, uintptr(incX), beta, y, uintptr(incY)) +} + +// Strmv performs one of the matrix-vector operations +// +// x = A * x if tA == blas.NoTrans +// x = Aᵀ * x if tA == blas.Trans or blas.ConjTrans +// +// where A is an n×n triangular matrix, and x is a vector. +// +// Float32 implementations are autogenerated and not directly tested. 
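+//
+// A minimal call sketch (hypothetical 2×2 data, row-major, lda = n):
+//
+//	var impl Implementation
+//	a := []float32{2, 1, 0, 3} // upper triangle of A; the 0 is never read
+//	x := []float32{4, 5}
+//	impl.Strmv(blas.Upper, blas.NoTrans, blas.NonUnit, 2, a, 2, x, 1)
+//	// x is now A*x = {2*4 + 1*5, 3*5} = {13, 15}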
+func (Implementation) Strmv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n int, a []float32, lda int, x []float32, incX int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if d != blas.NonUnit && d != blas.Unit { + panic(badDiag) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+n { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + nonUnit := d != blas.Unit + if n == 1 { + if nonUnit { + x[0] *= a[0] + } + return + } + var kx int + if incX <= 0 { + kx = -(n - 1) * incX + } + if tA == blas.NoTrans { + if ul == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + ilda := i * lda + var tmp float32 + if nonUnit { + tmp = a[ilda+i] * x[i] + } else { + tmp = x[i] + } + x[i] = tmp + f32.DotUnitary(a[ilda+i+1:ilda+n], x[i+1:n]) + } + return + } + ix := kx + for i := 0; i < n; i++ { + ilda := i * lda + var tmp float32 + if nonUnit { + tmp = a[ilda+i] * x[ix] + } else { + tmp = x[ix] + } + x[ix] = tmp + f32.DotInc(x, a[ilda+i+1:ilda+n], uintptr(n-i-1), uintptr(incX), 1, uintptr(ix+incX), 0) + ix += incX + } + return + } + if incX == 1 { + for i := n - 1; i >= 0; i-- { + ilda := i * lda + var tmp float32 + if nonUnit { + tmp += a[ilda+i] * x[i] + } else { + tmp = x[i] + } + x[i] = tmp + f32.DotUnitary(a[ilda:ilda+i], x[:i]) + } + return + } + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + ilda := i * lda + var tmp float32 + if nonUnit { + tmp = a[ilda+i] * x[ix] + } else { + tmp = x[ix] + } + x[ix] = tmp + f32.DotInc(x, a[ilda:ilda+i], uintptr(i), uintptr(incX), 1, uintptr(kx), 0) + ix -= incX + } + return + } + // Cases where a is transposed. + if ul == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + ilda := i * lda + xi := x[i] + f32.AxpyUnitary(xi, a[ilda+i+1:ilda+n], x[i+1:n]) + if nonUnit { + x[i] *= a[ilda+i] + } + } + return + } + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + ilda := i * lda + xi := x[ix] + f32.AxpyInc(xi, a[ilda+i+1:ilda+n], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(kx+(i+1)*incX)) + if nonUnit { + x[ix] *= a[ilda+i] + } + ix -= incX + } + return + } + if incX == 1 { + for i := 0; i < n; i++ { + ilda := i * lda + xi := x[i] + f32.AxpyUnitary(xi, a[ilda:ilda+i], x[:i]) + if nonUnit { + x[i] *= a[i*lda+i] + } + } + return + } + ix := kx + for i := 0; i < n; i++ { + ilda := i * lda + xi := x[ix] + f32.AxpyInc(xi, a[ilda:ilda+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx)) + if nonUnit { + x[ix] *= a[ilda+i] + } + ix += incX + } +} + +// Strsv solves one of the systems of equations +// +// A * x = b if tA == blas.NoTrans +// Aᵀ * x = b if tA == blas.Trans or blas.ConjTrans +// +// where A is an n×n triangular matrix, and x and b are vectors. +// +// At entry to the function, x contains the values of b, and the result is +// stored in-place into x. +// +// No test for singularity or near-singularity is included in this +// routine. Such tests must be performed before calling this routine. +// +// Float32 implementations are autogenerated and not directly tested. 
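+//
+// A minimal call sketch (hypothetical data; this undoes the Strmv
+// example above):
+//
+//	var impl Implementation
+//	a := []float32{2, 1, 0, 3}
+//	x := []float32{13, 15} // holds b on entry
+//	impl.Strsv(blas.Upper, blas.NoTrans, blas.NonUnit, 2, a, 2, x, 1)
+//	// x now holds the solution {4, 5}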
+func (Implementation) Strsv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n int, a []float32, lda int, x []float32, incX int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if d != blas.NonUnit && d != blas.Unit { + panic(badDiag) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+n { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + if n == 1 { + if d == blas.NonUnit { + x[0] /= a[0] + } + return + } + + var kx int + if incX < 0 { + kx = -(n - 1) * incX + } + nonUnit := d == blas.NonUnit + if tA == blas.NoTrans { + if ul == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + var sum float32 + atmp := a[i*lda+i+1 : i*lda+n] + for j, v := range atmp { + jv := i + j + 1 + sum += x[jv] * v + } + x[i] -= sum + if nonUnit { + x[i] /= a[i*lda+i] + } + } + return + } + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + var sum float32 + jx := ix + incX + atmp := a[i*lda+i+1 : i*lda+n] + for _, v := range atmp { + sum += x[jx] * v + jx += incX + } + x[ix] -= sum + if nonUnit { + x[ix] /= a[i*lda+i] + } + ix -= incX + } + return + } + if incX == 1 { + for i := 0; i < n; i++ { + var sum float32 + atmp := a[i*lda : i*lda+i] + for j, v := range atmp { + sum += x[j] * v + } + x[i] -= sum + if nonUnit { + x[i] /= a[i*lda+i] + } + } + return + } + ix := kx + for i := 0; i < n; i++ { + jx := kx + var sum float32 + atmp := a[i*lda : i*lda+i] + for _, v := range atmp { + sum += x[jx] * v + jx += incX + } + x[ix] -= sum + if nonUnit { + x[ix] /= a[i*lda+i] + } + ix += incX + } + return + } + // Cases where a is transposed. + if ul == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + if nonUnit { + x[i] /= a[i*lda+i] + } + xi := x[i] + atmp := a[i*lda+i+1 : i*lda+n] + for j, v := range atmp { + jv := j + i + 1 + x[jv] -= v * xi + } + } + return + } + ix := kx + for i := 0; i < n; i++ { + if nonUnit { + x[ix] /= a[i*lda+i] + } + xi := x[ix] + jx := kx + (i+1)*incX + atmp := a[i*lda+i+1 : i*lda+n] + for _, v := range atmp { + x[jx] -= v * xi + jx += incX + } + ix += incX + } + return + } + if incX == 1 { + for i := n - 1; i >= 0; i-- { + if nonUnit { + x[i] /= a[i*lda+i] + } + xi := x[i] + atmp := a[i*lda : i*lda+i] + for j, v := range atmp { + x[j] -= v * xi + } + } + return + } + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + if nonUnit { + x[ix] /= a[i*lda+i] + } + xi := x[ix] + jx := kx + atmp := a[i*lda : i*lda+i] + for _, v := range atmp { + x[jx] -= v * xi + jx += incX + } + ix -= incX + } +} + +// Ssymv performs the matrix-vector operation +// +// y = alpha * A * x + beta * y +// +// where A is an n×n symmetric matrix, x and y are vectors, and alpha and +// beta are scalars. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Ssymv(ul blas.Uplo, n int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. 
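+	// (A zero-sized problem does no work and returns before the slice
+	// length checks below ever examine a, x, or y.)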
+ if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+n { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + // Set up start points + var kx, ky int + if incX < 0 { + kx = -(n - 1) * incX + } + if incY < 0 { + ky = -(n - 1) * incY + } + + // Form y = beta * y + if beta != 1 { + if incY == 1 { + if beta == 0 { + for i := range y[:n] { + y[i] = 0 + } + } else { + f32.ScalUnitary(beta, y[:n]) + } + } else { + iy := ky + if beta == 0 { + for i := 0; i < n; i++ { + y[iy] = 0 + iy += incY + } + } else { + if incY > 0 { + f32.ScalInc(beta, y, uintptr(n), uintptr(incY)) + } else { + f32.ScalInc(beta, y, uintptr(n), uintptr(-incY)) + } + } + } + } + + if alpha == 0 { + return + } + + if n == 1 { + y[0] += alpha * a[0] * x[0] + return + } + + if ul == blas.Upper { + if incX == 1 { + iy := ky + for i := 0; i < n; i++ { + xv := x[i] * alpha + sum := x[i] * a[i*lda+i] + jy := ky + (i+1)*incY + atmp := a[i*lda+i+1 : i*lda+n] + for j, v := range atmp { + jp := j + i + 1 + sum += x[jp] * v + y[jy] += xv * v + jy += incY + } + y[iy] += alpha * sum + iy += incY + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + xv := x[ix] * alpha + sum := x[ix] * a[i*lda+i] + jx := kx + (i+1)*incX + jy := ky + (i+1)*incY + atmp := a[i*lda+i+1 : i*lda+n] + for _, v := range atmp { + sum += x[jx] * v + y[jy] += xv * v + jx += incX + jy += incY + } + y[iy] += alpha * sum + ix += incX + iy += incY + } + return + } + // Cases where a is lower triangular. + if incX == 1 { + iy := ky + for i := 0; i < n; i++ { + jy := ky + xv := alpha * x[i] + atmp := a[i*lda : i*lda+i] + var sum float32 + for j, v := range atmp { + sum += x[j] * v + y[jy] += xv * v + jy += incY + } + sum += x[i] * a[i*lda+i] + sum *= alpha + y[iy] += sum + iy += incY + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + jx := kx + jy := ky + xv := alpha * x[ix] + atmp := a[i*lda : i*lda+i] + var sum float32 + for _, v := range atmp { + sum += x[jx] * v + y[jy] += xv * v + jx += incX + jy += incY + } + sum += x[ix] * a[i*lda+i] + sum *= alpha + y[iy] += sum + ix += incX + iy += incY + } +} + +// Stbmv performs one of the matrix-vector operations +// +// x = A * x if tA == blas.NoTrans +// x = Aᵀ * x if tA == blas.Trans or blas.ConjTrans +// +// where A is an n×n triangular band matrix with k+1 diagonals, and x is a vector. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Stbmv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n, k int, a []float32, lda int, x []float32, incX int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if d != blas.NonUnit && d != blas.Unit { + panic(badDiag) + } + if n < 0 { + panic(nLT0) + } + if k < 0 { + panic(kLT0) + } + if lda < k+1 { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. 
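+	// The last stored band element is a[(n-1)*lda+k], so a must hold at
+	// least lda*(n-1)+k+1 elements.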
+ if len(a) < lda*(n-1)+k+1 { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + var kx int + if incX < 0 { + kx = -(n - 1) * incX + } + + nonunit := d != blas.Unit + + if tA == blas.NoTrans { + if ul == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + u := min(1+k, n-i) + var sum float32 + atmp := a[i*lda:] + xtmp := x[i:] + for j := 1; j < u; j++ { + sum += xtmp[j] * atmp[j] + } + if nonunit { + sum += xtmp[0] * atmp[0] + } else { + sum += xtmp[0] + } + x[i] = sum + } + return + } + ix := kx + for i := 0; i < n; i++ { + u := min(1+k, n-i) + var sum float32 + atmp := a[i*lda:] + jx := incX + for j := 1; j < u; j++ { + sum += x[ix+jx] * atmp[j] + jx += incX + } + if nonunit { + sum += x[ix] * atmp[0] + } else { + sum += x[ix] + } + x[ix] = sum + ix += incX + } + return + } + if incX == 1 { + for i := n - 1; i >= 0; i-- { + l := max(0, k-i) + atmp := a[i*lda:] + var sum float32 + for j := l; j < k; j++ { + sum += x[i-k+j] * atmp[j] + } + if nonunit { + sum += x[i] * atmp[k] + } else { + sum += x[i] + } + x[i] = sum + } + return + } + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + l := max(0, k-i) + atmp := a[i*lda:] + var sum float32 + jx := l * incX + for j := l; j < k; j++ { + sum += x[ix-k*incX+jx] * atmp[j] + jx += incX + } + if nonunit { + sum += x[ix] * atmp[k] + } else { + sum += x[ix] + } + x[ix] = sum + ix -= incX + } + return + } + if ul == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + u := k + 1 + if i < u { + u = i + 1 + } + var sum float32 + for j := 1; j < u; j++ { + sum += x[i-j] * a[(i-j)*lda+j] + } + if nonunit { + sum += x[i] * a[i*lda] + } else { + sum += x[i] + } + x[i] = sum + } + return + } + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + u := k + 1 + if i < u { + u = i + 1 + } + var sum float32 + jx := incX + for j := 1; j < u; j++ { + sum += x[ix-jx] * a[(i-j)*lda+j] + jx += incX + } + if nonunit { + sum += x[ix] * a[i*lda] + } else { + sum += x[ix] + } + x[ix] = sum + ix -= incX + } + return + } + if incX == 1 { + for i := 0; i < n; i++ { + u := k + if i+k >= n { + u = n - i - 1 + } + var sum float32 + for j := 0; j < u; j++ { + sum += x[i+j+1] * a[(i+j+1)*lda+k-j-1] + } + if nonunit { + sum += x[i] * a[i*lda+k] + } else { + sum += x[i] + } + x[i] = sum + } + return + } + ix := kx + for i := 0; i < n; i++ { + u := k + if i+k >= n { + u = n - i - 1 + } + var ( + sum float32 + jx int + ) + for j := 0; j < u; j++ { + sum += x[ix+jx+incX] * a[(i+j+1)*lda+k-j-1] + jx += incX + } + if nonunit { + sum += x[ix] * a[i*lda+k] + } else { + sum += x[ix] + } + x[ix] = sum + ix += incX + } +} + +// Stpmv performs one of the matrix-vector operations +// +// x = A * x if tA == blas.NoTrans +// x = Aᵀ * x if tA == blas.Trans or blas.ConjTrans +// +// where A is an n×n triangular matrix in packed format, and x is a vector. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Stpmv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n int, ap []float32, x []float32, incX int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if d != blas.NonUnit && d != blas.Unit { + panic(badDiag) + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. 
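+	// x must reach index (n-1)*|incX|, which the two incX signs below
+	// express as (n-1)*incX and (1-n)*incX respectively.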
+ if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + var kx int + if incX < 0 { + kx = -(n - 1) * incX + } + + nonUnit := d == blas.NonUnit + var offset int // Offset is the index of (i,i) + if tA == blas.NoTrans { + if ul == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + xi := x[i] + if nonUnit { + xi *= ap[offset] + } + atmp := ap[offset+1 : offset+n-i] + xtmp := x[i+1:] + for j, v := range atmp { + xi += v * xtmp[j] + } + x[i] = xi + offset += n - i + } + return + } + ix := kx + for i := 0; i < n; i++ { + xix := x[ix] + if nonUnit { + xix *= ap[offset] + } + atmp := ap[offset+1 : offset+n-i] + jx := kx + (i+1)*incX + for _, v := range atmp { + xix += v * x[jx] + jx += incX + } + x[ix] = xix + offset += n - i + ix += incX + } + return + } + if incX == 1 { + offset = n*(n+1)/2 - 1 + for i := n - 1; i >= 0; i-- { + xi := x[i] + if nonUnit { + xi *= ap[offset] + } + atmp := ap[offset-i : offset] + for j, v := range atmp { + xi += v * x[j] + } + x[i] = xi + offset -= i + 1 + } + return + } + ix := kx + (n-1)*incX + offset = n*(n+1)/2 - 1 + for i := n - 1; i >= 0; i-- { + xix := x[ix] + if nonUnit { + xix *= ap[offset] + } + atmp := ap[offset-i : offset] + jx := kx + for _, v := range atmp { + xix += v * x[jx] + jx += incX + } + x[ix] = xix + offset -= i + 1 + ix -= incX + } + return + } + // Cases where ap is transposed. + if ul == blas.Upper { + if incX == 1 { + offset = n*(n+1)/2 - 1 + for i := n - 1; i >= 0; i-- { + xi := x[i] + atmp := ap[offset+1 : offset+n-i] + xtmp := x[i+1:] + for j, v := range atmp { + xtmp[j] += v * xi + } + if nonUnit { + x[i] *= ap[offset] + } + offset -= n - i + 1 + } + return + } + ix := kx + (n-1)*incX + offset = n*(n+1)/2 - 1 + for i := n - 1; i >= 0; i-- { + xix := x[ix] + jx := kx + (i+1)*incX + atmp := ap[offset+1 : offset+n-i] + for _, v := range atmp { + x[jx] += v * xix + jx += incX + } + if nonUnit { + x[ix] *= ap[offset] + } + offset -= n - i + 1 + ix -= incX + } + return + } + if incX == 1 { + for i := 0; i < n; i++ { + xi := x[i] + atmp := ap[offset-i : offset] + for j, v := range atmp { + x[j] += v * xi + } + if nonUnit { + x[i] *= ap[offset] + } + offset += i + 2 + } + return + } + ix := kx + for i := 0; i < n; i++ { + xix := x[ix] + jx := kx + atmp := ap[offset-i : offset] + for _, v := range atmp { + x[jx] += v * xix + jx += incX + } + if nonUnit { + x[ix] *= ap[offset] + } + ix += incX + offset += i + 2 + } +} + +// Stbsv solves one of the systems of equations +// +// A * x = b if tA == blas.NoTrans +// Aᵀ * x = b if tA == blas.Trans or tA == blas.ConjTrans +// +// where A is an n×n triangular band matrix with k+1 diagonals, +// and x and b are vectors. +// +// At entry to the function, x contains the values of b, and the result is +// stored in-place into x. +// +// No test for singularity or near-singularity is included in this +// routine. Such tests must be performed before calling this routine. +// +// Float32 implementations are autogenerated and not directly tested. 
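+//
+// As the indexing below shows, for blas.Upper a[i*lda+j] holds the dense
+// element A[i][i+j], while for blas.Lower a[i*lda+j] holds A[i][i-k+j]
+// with the diagonal at a[i*lda+k].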
+func (Implementation) Stbsv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n, k int, a []float32, lda int, x []float32, incX int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if d != blas.NonUnit && d != blas.Unit { + panic(badDiag) + } + if n < 0 { + panic(nLT0) + } + if k < 0 { + panic(kLT0) + } + if lda < k+1 { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+k+1 { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + var kx int + if incX < 0 { + kx = -(n - 1) * incX + } + nonUnit := d == blas.NonUnit + // Form x = A^-1 x. + // Several cases below use subslices for speed improvement. + // The incX != 1 cases usually do not because incX may be negative. + if tA == blas.NoTrans { + if ul == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + bands := k + if i+bands >= n { + bands = n - i - 1 + } + atmp := a[i*lda+1:] + xtmp := x[i+1 : i+bands+1] + var sum float32 + for j, v := range xtmp { + sum += v * atmp[j] + } + x[i] -= sum + if nonUnit { + x[i] /= a[i*lda] + } + } + return + } + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + max := k + 1 + if i+max > n { + max = n - i + } + atmp := a[i*lda:] + var ( + jx int + sum float32 + ) + for j := 1; j < max; j++ { + jx += incX + sum += x[ix+jx] * atmp[j] + } + x[ix] -= sum + if nonUnit { + x[ix] /= atmp[0] + } + ix -= incX + } + return + } + if incX == 1 { + for i := 0; i < n; i++ { + bands := k + if i-k < 0 { + bands = i + } + atmp := a[i*lda+k-bands:] + xtmp := x[i-bands : i] + var sum float32 + for j, v := range xtmp { + sum += v * atmp[j] + } + x[i] -= sum + if nonUnit { + x[i] /= atmp[bands] + } + } + return + } + ix := kx + for i := 0; i < n; i++ { + bands := k + if i-k < 0 { + bands = i + } + atmp := a[i*lda+k-bands:] + var ( + sum float32 + jx int + ) + for j := 0; j < bands; j++ { + sum += x[ix-bands*incX+jx] * atmp[j] + jx += incX + } + x[ix] -= sum + if nonUnit { + x[ix] /= atmp[bands] + } + ix += incX + } + return + } + // Cases where a is transposed. 
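+	// Transposing swaps the triangles: Aᵀ of an upper band matrix is
+	// lower banded, so the solve direction reverses relative to the
+	// NoTrans cases above.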
+ if ul == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + bands := k + if i-k < 0 { + bands = i + } + var sum float32 + for j := 0; j < bands; j++ { + sum += x[i-bands+j] * a[(i-bands+j)*lda+bands-j] + } + x[i] -= sum + if nonUnit { + x[i] /= a[i*lda] + } + } + return + } + ix := kx + for i := 0; i < n; i++ { + bands := k + if i-k < 0 { + bands = i + } + var ( + sum float32 + jx int + ) + for j := 0; j < bands; j++ { + sum += x[ix-bands*incX+jx] * a[(i-bands+j)*lda+bands-j] + jx += incX + } + x[ix] -= sum + if nonUnit { + x[ix] /= a[i*lda] + } + ix += incX + } + return + } + if incX == 1 { + for i := n - 1; i >= 0; i-- { + bands := k + if i+bands >= n { + bands = n - i - 1 + } + var sum float32 + xtmp := x[i+1 : i+1+bands] + for j, v := range xtmp { + sum += v * a[(i+j+1)*lda+k-j-1] + } + x[i] -= sum + if nonUnit { + x[i] /= a[i*lda+k] + } + } + return + } + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + bands := k + if i+bands >= n { + bands = n - i - 1 + } + var ( + sum float32 + jx int + ) + for j := 0; j < bands; j++ { + sum += x[ix+jx+incX] * a[(i+j+1)*lda+k-j-1] + jx += incX + } + x[ix] -= sum + if nonUnit { + x[ix] /= a[i*lda+k] + } + ix -= incX + } +} + +// Ssbmv performs the matrix-vector operation +// +// y = alpha * A * x + beta * y +// +// where A is an n×n symmetric band matrix with k super-diagonals, x and y are +// vectors, and alpha and beta are scalars. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Ssbmv(ul blas.Uplo, n, k int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if n < 0 { + panic(nLT0) + } + if k < 0 { + panic(kLT0) + } + if lda < k+1 { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+k+1 { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + // Set up indexes + lenX := n + lenY := n + var kx, ky int + if incX < 0 { + kx = -(lenX - 1) * incX + } + if incY < 0 { + ky = -(lenY - 1) * incY + } + + // Form y = beta * y. 
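+	// (Per BLAS convention, beta == 0 zeroes y outright instead of
+	// multiplying, so stale NaN or Inf values in y are not propagated.)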
+ if beta != 1 { + if incY == 1 { + if beta == 0 { + for i := range y[:n] { + y[i] = 0 + } + } else { + f32.ScalUnitary(beta, y[:n]) + } + } else { + iy := ky + if beta == 0 { + for i := 0; i < n; i++ { + y[iy] = 0 + iy += incY + } + } else { + if incY > 0 { + f32.ScalInc(beta, y, uintptr(n), uintptr(incY)) + } else { + f32.ScalInc(beta, y, uintptr(n), uintptr(-incY)) + } + } + } + } + + if alpha == 0 { + return + } + + if ul == blas.Upper { + if incX == 1 { + iy := ky + for i := 0; i < n; i++ { + atmp := a[i*lda:] + tmp := alpha * x[i] + sum := tmp * atmp[0] + u := min(k, n-i-1) + jy := incY + for j := 1; j <= u; j++ { + v := atmp[j] + sum += alpha * x[i+j] * v + y[iy+jy] += tmp * v + jy += incY + } + y[iy] += sum + iy += incY + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + atmp := a[i*lda:] + tmp := alpha * x[ix] + sum := tmp * atmp[0] + u := min(k, n-i-1) + jx := incX + jy := incY + for j := 1; j <= u; j++ { + v := atmp[j] + sum += alpha * x[ix+jx] * v + y[iy+jy] += tmp * v + jx += incX + jy += incY + } + y[iy] += sum + ix += incX + iy += incY + } + return + } + + // Cases where a has bands below the diagonal. + if incX == 1 { + iy := ky + for i := 0; i < n; i++ { + l := max(0, k-i) + tmp := alpha * x[i] + jy := l * incY + atmp := a[i*lda:] + for j := l; j < k; j++ { + v := atmp[j] + y[iy] += alpha * v * x[i-k+j] + y[iy-k*incY+jy] += tmp * v + jy += incY + } + y[iy] += tmp * atmp[k] + iy += incY + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + l := max(0, k-i) + tmp := alpha * x[ix] + jx := l * incX + jy := l * incY + atmp := a[i*lda:] + for j := l; j < k; j++ { + v := atmp[j] + y[iy] += alpha * v * x[ix-k*incX+jx] + y[iy-k*incY+jy] += tmp * v + jx += incX + jy += incY + } + y[iy] += tmp * atmp[k] + ix += incX + iy += incY + } +} + +// Ssyr performs the symmetric rank-one update +// +// A += alpha * x * xᵀ +// +// where A is an n×n symmetric matrix, and x is a vector. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Ssyr(ul blas.Uplo, n int, alpha float32, x []float32, incX int, a []float32, lda int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if len(a) < lda*(n-1)+n { + panic(shortA) + } + + // Quick return if possible. + if alpha == 0 { + return + } + + lenX := n + var kx int + if incX < 0 { + kx = -(lenX - 1) * incX + } + if ul == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + tmp := x[i] * alpha + if tmp != 0 { + atmp := a[i*lda+i : i*lda+n] + xtmp := x[i:n] + for j, v := range xtmp { + atmp[j] += v * tmp + } + } + } + return + } + ix := kx + for i := 0; i < n; i++ { + tmp := x[ix] * alpha + if tmp != 0 { + jx := ix + atmp := a[i*lda:] + for j := i; j < n; j++ { + atmp[j] += x[jx] * tmp + jx += incX + } + } + ix += incX + } + return + } + // Cases where a is lower triangular. 
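+	// Only the leading i+1 elements of row i, up to and including the
+	// diagonal, are updated.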
+ if incX == 1 { + for i := 0; i < n; i++ { + tmp := x[i] * alpha + if tmp != 0 { + atmp := a[i*lda:] + xtmp := x[:i+1] + for j, v := range xtmp { + atmp[j] += tmp * v + } + } + } + return + } + ix := kx + for i := 0; i < n; i++ { + tmp := x[ix] * alpha + if tmp != 0 { + atmp := a[i*lda:] + jx := kx + for j := 0; j < i+1; j++ { + atmp[j] += tmp * x[jx] + jx += incX + } + } + ix += incX + } +} + +// Ssyr2 performs the symmetric rank-two update +// +// A += alpha * x * yᵀ + alpha * y * xᵀ +// +// where A is an n×n symmetric matrix, x and y are vectors, and alpha is a scalar. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Ssyr2(ul blas.Uplo, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32, lda int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if len(a) < lda*(n-1)+n { + panic(shortA) + } + + // Quick return if possible. + if alpha == 0 { + return + } + + var ky, kx int + if incY < 0 { + ky = -(n - 1) * incY + } + if incX < 0 { + kx = -(n - 1) * incX + } + if ul == blas.Upper { + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + xi := x[i] + yi := y[i] + atmp := a[i*lda:] + for j := i; j < n; j++ { + atmp[j] += alpha * (xi*y[j] + x[j]*yi) + } + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + jx := kx + i*incX + jy := ky + i*incY + xi := x[ix] + yi := y[iy] + atmp := a[i*lda:] + for j := i; j < n; j++ { + atmp[j] += alpha * (xi*y[jy] + x[jx]*yi) + jx += incX + jy += incY + } + ix += incX + iy += incY + } + return + } + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + xi := x[i] + yi := y[i] + atmp := a[i*lda:] + for j := 0; j <= i; j++ { + atmp[j] += alpha * (xi*y[j] + x[j]*yi) + } + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + jx := kx + jy := ky + xi := x[ix] + yi := y[iy] + atmp := a[i*lda:] + for j := 0; j <= i; j++ { + atmp[j] += alpha * (xi*y[jy] + x[jx]*yi) + jx += incX + jy += incY + } + ix += incX + iy += incY + } +} + +// Stpsv solves one of the systems of equations +// +// A * x = b if tA == blas.NoTrans +// Aᵀ * x = b if tA == blas.Trans or blas.ConjTrans +// +// where A is an n×n triangular matrix in packed format, and x and b are vectors. +// +// At entry to the function, x contains the values of b, and the result is +// stored in-place into x. +// +// No test for singularity or near-singularity is included in this +// routine. Such tests must be performed before calling this routine. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Stpsv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n int, ap []float32, x []float32, incX int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if d != blas.NonUnit && d != blas.Unit { + panic(badDiag) + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. 
+ if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + var kx int + if incX < 0 { + kx = -(n - 1) * incX + } + + nonUnit := d == blas.NonUnit + var offset int // Offset is the index of (i,i) + if tA == blas.NoTrans { + if ul == blas.Upper { + offset = n*(n+1)/2 - 1 + if incX == 1 { + for i := n - 1; i >= 0; i-- { + atmp := ap[offset+1 : offset+n-i] + xtmp := x[i+1:] + var sum float32 + for j, v := range atmp { + sum += v * xtmp[j] + } + x[i] -= sum + if nonUnit { + x[i] /= ap[offset] + } + offset -= n - i + 1 + } + return + } + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + atmp := ap[offset+1 : offset+n-i] + jx := kx + (i+1)*incX + var sum float32 + for _, v := range atmp { + sum += v * x[jx] + jx += incX + } + x[ix] -= sum + if nonUnit { + x[ix] /= ap[offset] + } + ix -= incX + offset -= n - i + 1 + } + return + } + if incX == 1 { + for i := 0; i < n; i++ { + atmp := ap[offset-i : offset] + var sum float32 + for j, v := range atmp { + sum += v * x[j] + } + x[i] -= sum + if nonUnit { + x[i] /= ap[offset] + } + offset += i + 2 + } + return + } + ix := kx + for i := 0; i < n; i++ { + jx := kx + atmp := ap[offset-i : offset] + var sum float32 + for _, v := range atmp { + sum += v * x[jx] + jx += incX + } + x[ix] -= sum + if nonUnit { + x[ix] /= ap[offset] + } + ix += incX + offset += i + 2 + } + return + } + // Cases where ap is transposed. + if ul == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + if nonUnit { + x[i] /= ap[offset] + } + xi := x[i] + atmp := ap[offset+1 : offset+n-i] + xtmp := x[i+1:] + for j, v := range atmp { + xtmp[j] -= v * xi + } + offset += n - i + } + return + } + ix := kx + for i := 0; i < n; i++ { + if nonUnit { + x[ix] /= ap[offset] + } + xix := x[ix] + atmp := ap[offset+1 : offset+n-i] + jx := kx + (i+1)*incX + for _, v := range atmp { + x[jx] -= v * xix + jx += incX + } + ix += incX + offset += n - i + } + return + } + if incX == 1 { + offset = n*(n+1)/2 - 1 + for i := n - 1; i >= 0; i-- { + if nonUnit { + x[i] /= ap[offset] + } + xi := x[i] + atmp := ap[offset-i : offset] + for j, v := range atmp { + x[j] -= v * xi + } + offset -= i + 1 + } + return + } + ix := kx + (n-1)*incX + offset = n*(n+1)/2 - 1 + for i := n - 1; i >= 0; i-- { + if nonUnit { + x[ix] /= ap[offset] + } + xix := x[ix] + atmp := ap[offset-i : offset] + jx := kx + for _, v := range atmp { + x[jx] -= v * xix + jx += incX + } + ix -= incX + offset -= i + 1 + } +} + +// Sspmv performs the matrix-vector operation +// +// y = alpha * A * x + beta * y +// +// where A is an n×n symmetric matrix in packed format, x and y are vectors, +// and alpha and beta are scalars. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Sspmv(ul blas.Uplo, n int, alpha float32, ap []float32, x []float32, incX int, beta float32, y []float32, incY int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. 
+ if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + // Set up start points + var kx, ky int + if incX < 0 { + kx = -(n - 1) * incX + } + if incY < 0 { + ky = -(n - 1) * incY + } + + // Form y = beta * y. + if beta != 1 { + if incY == 1 { + if beta == 0 { + for i := range y[:n] { + y[i] = 0 + } + } else { + f32.ScalUnitary(beta, y[:n]) + } + } else { + iy := ky + if beta == 0 { + for i := 0; i < n; i++ { + y[iy] = 0 + iy += incY + } + } else { + if incY > 0 { + f32.ScalInc(beta, y, uintptr(n), uintptr(incY)) + } else { + f32.ScalInc(beta, y, uintptr(n), uintptr(-incY)) + } + } + } + } + + if alpha == 0 { + return + } + + if n == 1 { + y[0] += alpha * ap[0] * x[0] + return + } + var offset int // Offset is the index of (i,i). + if ul == blas.Upper { + if incX == 1 { + iy := ky + for i := 0; i < n; i++ { + xv := x[i] * alpha + sum := ap[offset] * x[i] + atmp := ap[offset+1 : offset+n-i] + xtmp := x[i+1:] + jy := ky + (i+1)*incY + for j, v := range atmp { + sum += v * xtmp[j] + y[jy] += v * xv + jy += incY + } + y[iy] += alpha * sum + iy += incY + offset += n - i + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + xv := x[ix] * alpha + sum := ap[offset] * x[ix] + atmp := ap[offset+1 : offset+n-i] + jx := kx + (i+1)*incX + jy := ky + (i+1)*incY + for _, v := range atmp { + sum += v * x[jx] + y[jy] += v * xv + jx += incX + jy += incY + } + y[iy] += alpha * sum + ix += incX + iy += incY + offset += n - i + } + return + } + if incX == 1 { + iy := ky + for i := 0; i < n; i++ { + xv := x[i] * alpha + atmp := ap[offset-i : offset] + jy := ky + var sum float32 + for j, v := range atmp { + sum += v * x[j] + y[jy] += v * xv + jy += incY + } + sum += ap[offset] * x[i] + y[iy] += alpha * sum + iy += incY + offset += i + 2 + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + xv := x[ix] * alpha + atmp := ap[offset-i : offset] + jx := kx + jy := ky + var sum float32 + for _, v := range atmp { + sum += v * x[jx] + y[jy] += v * xv + jx += incX + jy += incY + } + + sum += ap[offset] * x[ix] + y[iy] += alpha * sum + ix += incX + iy += incY + offset += i + 2 + } +} + +// Sspr performs the symmetric rank-one operation +// +// A += alpha * x * xᵀ +// +// where A is an n×n symmetric matrix in packed format, x is a vector, and +// alpha is a scalar. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Sspr(ul blas.Uplo, n int, alpha float32, x []float32, incX int, ap []float32) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + + // Quick return if possible. + if alpha == 0 { + return + } + + lenX := n + var kx int + if incX < 0 { + kx = -(lenX - 1) * incX + } + var offset int // Offset is the index of (i,i). 
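+ // Editor's note: ap stores the referenced triangle packed row by row, so
+ // the diagonal index advances by n-i per row in the upper case (rows
+ // shrink) and by i+2 per row in the lower case (rows grow), matching the
+ // offset updates below.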
+ if ul == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + atmp := ap[offset:] + xv := alpha * x[i] + xtmp := x[i:n] + for j, v := range xtmp { + atmp[j] += xv * v + } + offset += n - i + } + return + } + ix := kx + for i := 0; i < n; i++ { + jx := kx + i*incX + atmp := ap[offset:] + xv := alpha * x[ix] + for j := 0; j < n-i; j++ { + atmp[j] += xv * x[jx] + jx += incX + } + ix += incX + offset += n - i + } + return + } + if incX == 1 { + for i := 0; i < n; i++ { + atmp := ap[offset-i:] + xv := alpha * x[i] + xtmp := x[:i+1] + for j, v := range xtmp { + atmp[j] += xv * v + } + offset += i + 2 + } + return + } + ix := kx + for i := 0; i < n; i++ { + jx := kx + atmp := ap[offset-i:] + xv := alpha * x[ix] + for j := 0; j <= i; j++ { + atmp[j] += xv * x[jx] + jx += incX + } + ix += incX + offset += i + 2 + } +} + +// Sspr2 performs the symmetric rank-2 update +// +// A += alpha * x * yᵀ + alpha * y * xᵀ +// +// where A is an n×n symmetric matrix in packed format, x and y are vectors, +// and alpha is a scalar. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Sspr2(ul blas.Uplo, n int, alpha float32, x []float32, incX int, y []float32, incY int, ap []float32) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + + // Quick return if possible. + if alpha == 0 { + return + } + + var ky, kx int + if incY < 0 { + ky = -(n - 1) * incY + } + if incX < 0 { + kx = -(n - 1) * incX + } + var offset int // Offset is the index of (i,i). + if ul == blas.Upper { + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + atmp := ap[offset:] + xi := x[i] + yi := y[i] + xtmp := x[i:n] + ytmp := y[i:n] + for j, v := range xtmp { + atmp[j] += alpha * (xi*ytmp[j] + v*yi) + } + offset += n - i + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + jx := kx + i*incX + jy := ky + i*incY + atmp := ap[offset:] + xi := x[ix] + yi := y[iy] + for j := 0; j < n-i; j++ { + atmp[j] += alpha * (xi*y[jy] + x[jx]*yi) + jx += incX + jy += incY + } + ix += incX + iy += incY + offset += n - i + } + return + } + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + atmp := ap[offset-i:] + xi := x[i] + yi := y[i] + xtmp := x[:i+1] + for j, v := range xtmp { + atmp[j] += alpha * (xi*y[j] + v*yi) + } + offset += i + 2 + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + jx := kx + jy := ky + atmp := ap[offset-i:] + for j := 0; j <= i; j++ { + atmp[j] += alpha * (x[ix]*y[jy] + x[jx]*y[iy]) + jx += incX + jy += incY + } + ix += incX + iy += incY + offset += i + 2 + } +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level2float64.go b/vendor/gonum.org/v1/gonum/blas/gonum/level2float64.go new file mode 100644 index 0000000000..19b9c7e1c3 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/level2float64.go @@ -0,0 +1,2366 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/internal/asm/f64" +) + +var _ blas.Float64Level2 = Implementation{} + +// Dger performs the rank-one operation +// +// A += alpha * x * yᵀ +// +// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar. +func (Implementation) Dger(m, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64, lda int) { + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (m-1)*incX) || (incX < 0 && len(x) <= (1-m)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if len(a) < lda*(m-1)+n { + panic(shortA) + } + + // Quick return if possible. + if alpha == 0 { + return + } + f64.Ger(uintptr(m), uintptr(n), + alpha, + x, uintptr(incX), + y, uintptr(incY), + a, uintptr(lda)) +} + +// Dgbmv performs one of the matrix-vector operations +// +// y = alpha * A * x + beta * y if tA == blas.NoTrans +// y = alpha * Aᵀ * x + beta * y if tA == blas.Trans or blas.ConjTrans +// +// where A is an m×n band matrix with kL sub-diagonals and kU super-diagonals, +// x and y are vectors, and alpha and beta are scalars. +func (Implementation) Dgbmv(tA blas.Transpose, m, n, kL, kU int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int) { + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + if kL < 0 { + panic(kLLT0) + } + if kU < 0 { + panic(kULT0) + } + if lda < kL+kU+1 { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(min(m, n+kL)-1)+kL+kU+1 { + panic(shortA) + } + lenX := m + lenY := n + if tA == blas.NoTrans { + lenX = n + lenY = m + } + if (incX > 0 && len(x) <= (lenX-1)*incX) || (incX < 0 && len(x) <= (1-lenX)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (lenY-1)*incY) || (incY < 0 && len(y) <= (1-lenY)*incY) { + panic(shortY) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + var kx, ky int + if incX < 0 { + kx = -(lenX - 1) * incX + } + if incY < 0 { + ky = -(lenY - 1) * incY + } + + // Form y = beta * y. + if beta != 1 { + if incY == 1 { + if beta == 0 { + for i := range y[:lenY] { + y[i] = 0 + } + } else { + f64.ScalUnitary(beta, y[:lenY]) + } + } else { + iy := ky + if beta == 0 { + for i := 0; i < lenY; i++ { + y[iy] = 0 + iy += incY + } + } else { + if incY > 0 { + f64.ScalInc(beta, y, uintptr(lenY), uintptr(incY)) + } else { + f64.ScalInc(beta, y, uintptr(lenY), uintptr(-incY)) + } + } + } + } + + if alpha == 0 { + return + } + + // i and j are indices of the compacted banded matrix. 
+ // off is the offset into the dense matrix (off + j = densej) + nCol := kU + 1 + kL + if tA == blas.NoTrans { + iy := ky + if incX == 1 { + for i := 0; i < min(m, n+kL); i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + off := max(0, i-kL) + atmp := a[i*lda+l : i*lda+u] + xtmp := x[off : off+u-l] + var sum float64 + for j, v := range atmp { + sum += xtmp[j] * v + } + y[iy] += sum * alpha + iy += incY + } + return + } + for i := 0; i < min(m, n+kL); i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + off := max(0, i-kL) + atmp := a[i*lda+l : i*lda+u] + jx := kx + var sum float64 + for _, v := range atmp { + sum += x[off*incX+jx] * v + jx += incX + } + y[iy] += sum * alpha + iy += incY + } + return + } + if incX == 1 { + for i := 0; i < min(m, n+kL); i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + off := max(0, i-kL) + atmp := a[i*lda+l : i*lda+u] + tmp := alpha * x[i] + jy := ky + for _, v := range atmp { + y[jy+off*incY] += tmp * v + jy += incY + } + } + return + } + ix := kx + for i := 0; i < min(m, n+kL); i++ { + l := max(0, kL-i) + u := min(nCol, n+kL-i) + off := max(0, i-kL) + atmp := a[i*lda+l : i*lda+u] + tmp := alpha * x[ix] + jy := ky + for _, v := range atmp { + y[jy+off*incY] += tmp * v + jy += incY + } + ix += incX + } +} + +// Dgemv computes +// +// y = alpha * A * x + beta * y if tA = blas.NoTrans +// y = alpha * Aᵀ * x + beta * y if tA = blas.Trans or blas.ConjTrans +// +// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars. +func (Implementation) Dgemv(tA blas.Transpose, m, n int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int) { + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + // Set up indexes + lenX := m + lenY := n + if tA == blas.NoTrans { + lenX = n + lenY = m + } + + // Quick return if possible + if m == 0 || n == 0 { + return + } + + if (incX > 0 && (lenX-1)*incX >= len(x)) || (incX < 0 && (1-lenX)*incX >= len(x)) { + panic(shortX) + } + if (incY > 0 && (lenY-1)*incY >= len(y)) || (incY < 0 && (1-lenY)*incY >= len(y)) { + panic(shortY) + } + if len(a) < lda*(m-1)+n { + panic(shortA) + } + + // Quick return if possible + if alpha == 0 && beta == 1 { + return + } + + if alpha == 0 { + // First form y = beta * y + if incY > 0 { + Implementation{}.Dscal(lenY, beta, y, incY) + } else { + Implementation{}.Dscal(lenY, beta, y, -incY) + } + return + } + + // Form y = alpha * A * x + y + if tA == blas.NoTrans { + f64.GemvN(uintptr(m), uintptr(n), alpha, a, uintptr(lda), x, uintptr(incX), beta, y, uintptr(incY)) + return + } + // Cases where a is transposed. + f64.GemvT(uintptr(m), uintptr(n), alpha, a, uintptr(lda), x, uintptr(incX), beta, y, uintptr(incY)) +} + +// Dtrmv performs one of the matrix-vector operations +// +// x = A * x if tA == blas.NoTrans +// x = Aᵀ * x if tA == blas.Trans or blas.ConjTrans +// +// where A is an n×n triangular matrix, and x is a vector. 
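+//
+// A minimal usage sketch follows (an editor's illustration with hypothetical
+// values; it is not part of the upstream gonum sources). A is supplied in
+// row-major order and only its upper triangle is referenced here:
+//
+//	impl := Implementation{}
+//	a := []float64{
+//		2, 1,
+//		0, 3, // 2×2 upper-triangular matrix, lda = 2
+//	}
+//	x := []float64{1, 1}
+//	impl.Dtrmv(blas.Upper, blas.NoTrans, blas.NonUnit, 2, a, 2, x, 1)
+//	// x is now {2*1 + 1*1, 3*1} = {3, 3}.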
+func (Implementation) Dtrmv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n int, a []float64, lda int, x []float64, incX int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if d != blas.NonUnit && d != blas.Unit { + panic(badDiag) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+n { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + nonUnit := d != blas.Unit + if n == 1 { + if nonUnit { + x[0] *= a[0] + } + return + } + var kx int + if incX <= 0 { + kx = -(n - 1) * incX + } + if tA == blas.NoTrans { + if ul == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + ilda := i * lda + var tmp float64 + if nonUnit { + tmp = a[ilda+i] * x[i] + } else { + tmp = x[i] + } + x[i] = tmp + f64.DotUnitary(a[ilda+i+1:ilda+n], x[i+1:n]) + } + return + } + ix := kx + for i := 0; i < n; i++ { + ilda := i * lda + var tmp float64 + if nonUnit { + tmp = a[ilda+i] * x[ix] + } else { + tmp = x[ix] + } + x[ix] = tmp + f64.DotInc(x, a[ilda+i+1:ilda+n], uintptr(n-i-1), uintptr(incX), 1, uintptr(ix+incX), 0) + ix += incX + } + return + } + if incX == 1 { + for i := n - 1; i >= 0; i-- { + ilda := i * lda + var tmp float64 + if nonUnit { + tmp += a[ilda+i] * x[i] + } else { + tmp = x[i] + } + x[i] = tmp + f64.DotUnitary(a[ilda:ilda+i], x[:i]) + } + return + } + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + ilda := i * lda + var tmp float64 + if nonUnit { + tmp = a[ilda+i] * x[ix] + } else { + tmp = x[ix] + } + x[ix] = tmp + f64.DotInc(x, a[ilda:ilda+i], uintptr(i), uintptr(incX), 1, uintptr(kx), 0) + ix -= incX + } + return + } + // Cases where a is transposed. + if ul == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + ilda := i * lda + xi := x[i] + f64.AxpyUnitary(xi, a[ilda+i+1:ilda+n], x[i+1:n]) + if nonUnit { + x[i] *= a[ilda+i] + } + } + return + } + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + ilda := i * lda + xi := x[ix] + f64.AxpyInc(xi, a[ilda+i+1:ilda+n], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(kx+(i+1)*incX)) + if nonUnit { + x[ix] *= a[ilda+i] + } + ix -= incX + } + return + } + if incX == 1 { + for i := 0; i < n; i++ { + ilda := i * lda + xi := x[i] + f64.AxpyUnitary(xi, a[ilda:ilda+i], x[:i]) + if nonUnit { + x[i] *= a[i*lda+i] + } + } + return + } + ix := kx + for i := 0; i < n; i++ { + ilda := i * lda + xi := x[ix] + f64.AxpyInc(xi, a[ilda:ilda+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx)) + if nonUnit { + x[ix] *= a[ilda+i] + } + ix += incX + } +} + +// Dtrsv solves one of the systems of equations +// +// A * x = b if tA == blas.NoTrans +// Aᵀ * x = b if tA == blas.Trans or blas.ConjTrans +// +// where A is an n×n triangular matrix, and x and b are vectors. +// +// At entry to the function, x contains the values of b, and the result is +// stored in-place into x. +// +// No test for singularity or near-singularity is included in this +// routine. Such tests must be performed before calling this routine. 
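+//
+// A minimal usage sketch (an editor's illustration with hypothetical values,
+// not part of the upstream gonum sources), solving the system produced by
+// the Dtrmv example above:
+//
+//	impl := Implementation{}
+//	a := []float64{
+//		2, 1,
+//		0, 3, // 2×2 upper-triangular matrix, lda = 2
+//	}
+//	x := []float64{3, 3} // x holds b on entry
+//	impl.Dtrsv(blas.Upper, blas.NoTrans, blas.NonUnit, 2, a, 2, x, 1)
+//	// x is now {1, 1}: back substitution gives x[1] = 3/3, then
+//	// x[0] = (3 - 1*x[1])/2.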
+func (Implementation) Dtrsv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n int, a []float64, lda int, x []float64, incX int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if d != blas.NonUnit && d != blas.Unit { + panic(badDiag) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+n { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + if n == 1 { + if d == blas.NonUnit { + x[0] /= a[0] + } + return + } + + var kx int + if incX < 0 { + kx = -(n - 1) * incX + } + nonUnit := d == blas.NonUnit + if tA == blas.NoTrans { + if ul == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + var sum float64 + atmp := a[i*lda+i+1 : i*lda+n] + for j, v := range atmp { + jv := i + j + 1 + sum += x[jv] * v + } + x[i] -= sum + if nonUnit { + x[i] /= a[i*lda+i] + } + } + return + } + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + var sum float64 + jx := ix + incX + atmp := a[i*lda+i+1 : i*lda+n] + for _, v := range atmp { + sum += x[jx] * v + jx += incX + } + x[ix] -= sum + if nonUnit { + x[ix] /= a[i*lda+i] + } + ix -= incX + } + return + } + if incX == 1 { + for i := 0; i < n; i++ { + var sum float64 + atmp := a[i*lda : i*lda+i] + for j, v := range atmp { + sum += x[j] * v + } + x[i] -= sum + if nonUnit { + x[i] /= a[i*lda+i] + } + } + return + } + ix := kx + for i := 0; i < n; i++ { + jx := kx + var sum float64 + atmp := a[i*lda : i*lda+i] + for _, v := range atmp { + sum += x[jx] * v + jx += incX + } + x[ix] -= sum + if nonUnit { + x[ix] /= a[i*lda+i] + } + ix += incX + } + return + } + // Cases where a is transposed. + if ul == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + if nonUnit { + x[i] /= a[i*lda+i] + } + xi := x[i] + atmp := a[i*lda+i+1 : i*lda+n] + for j, v := range atmp { + jv := j + i + 1 + x[jv] -= v * xi + } + } + return + } + ix := kx + for i := 0; i < n; i++ { + if nonUnit { + x[ix] /= a[i*lda+i] + } + xi := x[ix] + jx := kx + (i+1)*incX + atmp := a[i*lda+i+1 : i*lda+n] + for _, v := range atmp { + x[jx] -= v * xi + jx += incX + } + ix += incX + } + return + } + if incX == 1 { + for i := n - 1; i >= 0; i-- { + if nonUnit { + x[i] /= a[i*lda+i] + } + xi := x[i] + atmp := a[i*lda : i*lda+i] + for j, v := range atmp { + x[j] -= v * xi + } + } + return + } + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + if nonUnit { + x[ix] /= a[i*lda+i] + } + xi := x[ix] + jx := kx + atmp := a[i*lda : i*lda+i] + for _, v := range atmp { + x[jx] -= v * xi + jx += incX + } + ix -= incX + } +} + +// Dsymv performs the matrix-vector operation +// +// y = alpha * A * x + beta * y +// +// where A is an n×n symmetric matrix, x and y are vectors, and alpha and +// beta are scalars. +func (Implementation) Dsymv(ul blas.Uplo, n int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. 
+ if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+n { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + // Set up start points + var kx, ky int + if incX < 0 { + kx = -(n - 1) * incX + } + if incY < 0 { + ky = -(n - 1) * incY + } + + // Form y = beta * y + if beta != 1 { + if incY == 1 { + if beta == 0 { + for i := range y[:n] { + y[i] = 0 + } + } else { + f64.ScalUnitary(beta, y[:n]) + } + } else { + iy := ky + if beta == 0 { + for i := 0; i < n; i++ { + y[iy] = 0 + iy += incY + } + } else { + if incY > 0 { + f64.ScalInc(beta, y, uintptr(n), uintptr(incY)) + } else { + f64.ScalInc(beta, y, uintptr(n), uintptr(-incY)) + } + } + } + } + + if alpha == 0 { + return + } + + if n == 1 { + y[0] += alpha * a[0] * x[0] + return + } + + if ul == blas.Upper { + if incX == 1 { + iy := ky + for i := 0; i < n; i++ { + xv := x[i] * alpha + sum := x[i] * a[i*lda+i] + jy := ky + (i+1)*incY + atmp := a[i*lda+i+1 : i*lda+n] + for j, v := range atmp { + jp := j + i + 1 + sum += x[jp] * v + y[jy] += xv * v + jy += incY + } + y[iy] += alpha * sum + iy += incY + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + xv := x[ix] * alpha + sum := x[ix] * a[i*lda+i] + jx := kx + (i+1)*incX + jy := ky + (i+1)*incY + atmp := a[i*lda+i+1 : i*lda+n] + for _, v := range atmp { + sum += x[jx] * v + y[jy] += xv * v + jx += incX + jy += incY + } + y[iy] += alpha * sum + ix += incX + iy += incY + } + return + } + // Cases where a is lower triangular. + if incX == 1 { + iy := ky + for i := 0; i < n; i++ { + jy := ky + xv := alpha * x[i] + atmp := a[i*lda : i*lda+i] + var sum float64 + for j, v := range atmp { + sum += x[j] * v + y[jy] += xv * v + jy += incY + } + sum += x[i] * a[i*lda+i] + sum *= alpha + y[iy] += sum + iy += incY + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + jx := kx + jy := ky + xv := alpha * x[ix] + atmp := a[i*lda : i*lda+i] + var sum float64 + for _, v := range atmp { + sum += x[jx] * v + y[jy] += xv * v + jx += incX + jy += incY + } + sum += x[ix] * a[i*lda+i] + sum *= alpha + y[iy] += sum + ix += incX + iy += incY + } +} + +// Dtbmv performs one of the matrix-vector operations +// +// x = A * x if tA == blas.NoTrans +// x = Aᵀ * x if tA == blas.Trans or blas.ConjTrans +// +// where A is an n×n triangular band matrix with k+1 diagonals, and x is a vector. +func (Implementation) Dtbmv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n, k int, a []float64, lda int, x []float64, incX int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if d != blas.NonUnit && d != blas.Unit { + panic(badDiag) + } + if n < 0 { + panic(nLT0) + } + if k < 0 { + panic(kLT0) + } + if lda < k+1 { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. 
+ if len(a) < lda*(n-1)+k+1 { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + var kx int + if incX < 0 { + kx = -(n - 1) * incX + } + + nonunit := d != blas.Unit + + if tA == blas.NoTrans { + if ul == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + u := min(1+k, n-i) + var sum float64 + atmp := a[i*lda:] + xtmp := x[i:] + for j := 1; j < u; j++ { + sum += xtmp[j] * atmp[j] + } + if nonunit { + sum += xtmp[0] * atmp[0] + } else { + sum += xtmp[0] + } + x[i] = sum + } + return + } + ix := kx + for i := 0; i < n; i++ { + u := min(1+k, n-i) + var sum float64 + atmp := a[i*lda:] + jx := incX + for j := 1; j < u; j++ { + sum += x[ix+jx] * atmp[j] + jx += incX + } + if nonunit { + sum += x[ix] * atmp[0] + } else { + sum += x[ix] + } + x[ix] = sum + ix += incX + } + return + } + if incX == 1 { + for i := n - 1; i >= 0; i-- { + l := max(0, k-i) + atmp := a[i*lda:] + var sum float64 + for j := l; j < k; j++ { + sum += x[i-k+j] * atmp[j] + } + if nonunit { + sum += x[i] * atmp[k] + } else { + sum += x[i] + } + x[i] = sum + } + return + } + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + l := max(0, k-i) + atmp := a[i*lda:] + var sum float64 + jx := l * incX + for j := l; j < k; j++ { + sum += x[ix-k*incX+jx] * atmp[j] + jx += incX + } + if nonunit { + sum += x[ix] * atmp[k] + } else { + sum += x[ix] + } + x[ix] = sum + ix -= incX + } + return + } + if ul == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + u := k + 1 + if i < u { + u = i + 1 + } + var sum float64 + for j := 1; j < u; j++ { + sum += x[i-j] * a[(i-j)*lda+j] + } + if nonunit { + sum += x[i] * a[i*lda] + } else { + sum += x[i] + } + x[i] = sum + } + return + } + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + u := k + 1 + if i < u { + u = i + 1 + } + var sum float64 + jx := incX + for j := 1; j < u; j++ { + sum += x[ix-jx] * a[(i-j)*lda+j] + jx += incX + } + if nonunit { + sum += x[ix] * a[i*lda] + } else { + sum += x[ix] + } + x[ix] = sum + ix -= incX + } + return + } + if incX == 1 { + for i := 0; i < n; i++ { + u := k + if i+k >= n { + u = n - i - 1 + } + var sum float64 + for j := 0; j < u; j++ { + sum += x[i+j+1] * a[(i+j+1)*lda+k-j-1] + } + if nonunit { + sum += x[i] * a[i*lda+k] + } else { + sum += x[i] + } + x[i] = sum + } + return + } + ix := kx + for i := 0; i < n; i++ { + u := k + if i+k >= n { + u = n - i - 1 + } + var ( + sum float64 + jx int + ) + for j := 0; j < u; j++ { + sum += x[ix+jx+incX] * a[(i+j+1)*lda+k-j-1] + jx += incX + } + if nonunit { + sum += x[ix] * a[i*lda+k] + } else { + sum += x[ix] + } + x[ix] = sum + ix += incX + } +} + +// Dtpmv performs one of the matrix-vector operations +// +// x = A * x if tA == blas.NoTrans +// x = Aᵀ * x if tA == blas.Trans or blas.ConjTrans +// +// where A is an n×n triangular matrix in packed format, and x is a vector. +func (Implementation) Dtpmv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n int, ap []float64, x []float64, incX int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if d != blas.NonUnit && d != blas.Unit { + panic(badDiag) + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. 
+ if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + var kx int + if incX < 0 { + kx = -(n - 1) * incX + } + + nonUnit := d == blas.NonUnit + var offset int // Offset is the index of (i,i) + if tA == blas.NoTrans { + if ul == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + xi := x[i] + if nonUnit { + xi *= ap[offset] + } + atmp := ap[offset+1 : offset+n-i] + xtmp := x[i+1:] + for j, v := range atmp { + xi += v * xtmp[j] + } + x[i] = xi + offset += n - i + } + return + } + ix := kx + for i := 0; i < n; i++ { + xix := x[ix] + if nonUnit { + xix *= ap[offset] + } + atmp := ap[offset+1 : offset+n-i] + jx := kx + (i+1)*incX + for _, v := range atmp { + xix += v * x[jx] + jx += incX + } + x[ix] = xix + offset += n - i + ix += incX + } + return + } + if incX == 1 { + offset = n*(n+1)/2 - 1 + for i := n - 1; i >= 0; i-- { + xi := x[i] + if nonUnit { + xi *= ap[offset] + } + atmp := ap[offset-i : offset] + for j, v := range atmp { + xi += v * x[j] + } + x[i] = xi + offset -= i + 1 + } + return + } + ix := kx + (n-1)*incX + offset = n*(n+1)/2 - 1 + for i := n - 1; i >= 0; i-- { + xix := x[ix] + if nonUnit { + xix *= ap[offset] + } + atmp := ap[offset-i : offset] + jx := kx + for _, v := range atmp { + xix += v * x[jx] + jx += incX + } + x[ix] = xix + offset -= i + 1 + ix -= incX + } + return + } + // Cases where ap is transposed. + if ul == blas.Upper { + if incX == 1 { + offset = n*(n+1)/2 - 1 + for i := n - 1; i >= 0; i-- { + xi := x[i] + atmp := ap[offset+1 : offset+n-i] + xtmp := x[i+1:] + for j, v := range atmp { + xtmp[j] += v * xi + } + if nonUnit { + x[i] *= ap[offset] + } + offset -= n - i + 1 + } + return + } + ix := kx + (n-1)*incX + offset = n*(n+1)/2 - 1 + for i := n - 1; i >= 0; i-- { + xix := x[ix] + jx := kx + (i+1)*incX + atmp := ap[offset+1 : offset+n-i] + for _, v := range atmp { + x[jx] += v * xix + jx += incX + } + if nonUnit { + x[ix] *= ap[offset] + } + offset -= n - i + 1 + ix -= incX + } + return + } + if incX == 1 { + for i := 0; i < n; i++ { + xi := x[i] + atmp := ap[offset-i : offset] + for j, v := range atmp { + x[j] += v * xi + } + if nonUnit { + x[i] *= ap[offset] + } + offset += i + 2 + } + return + } + ix := kx + for i := 0; i < n; i++ { + xix := x[ix] + jx := kx + atmp := ap[offset-i : offset] + for _, v := range atmp { + x[jx] += v * xix + jx += incX + } + if nonUnit { + x[ix] *= ap[offset] + } + ix += incX + offset += i + 2 + } +} + +// Dtbsv solves one of the systems of equations +// +// A * x = b if tA == blas.NoTrans +// Aᵀ * x = b if tA == blas.Trans or tA == blas.ConjTrans +// +// where A is an n×n triangular band matrix with k+1 diagonals, +// and x and b are vectors. +// +// At entry to the function, x contains the values of b, and the result is +// stored in-place into x. +// +// No test for singularity or near-singularity is included in this +// routine. Such tests must be performed before calling this routine. +func (Implementation) Dtbsv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n, k int, a []float64, lda int, x []float64, incX int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if d != blas.NonUnit && d != blas.Unit { + panic(badDiag) + } + if n < 0 { + panic(nLT0) + } + if k < 0 { + panic(kLT0) + } + if lda < k+1 { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. 
+ if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+k+1 { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + var kx int + if incX < 0 { + kx = -(n - 1) * incX + } + nonUnit := d == blas.NonUnit + // Form x = A^-1 x. + // Several cases below use subslices for speed improvement. + // The incX != 1 cases usually do not because incX may be negative. + if tA == blas.NoTrans { + if ul == blas.Upper { + if incX == 1 { + for i := n - 1; i >= 0; i-- { + bands := k + if i+bands >= n { + bands = n - i - 1 + } + atmp := a[i*lda+1:] + xtmp := x[i+1 : i+bands+1] + var sum float64 + for j, v := range xtmp { + sum += v * atmp[j] + } + x[i] -= sum + if nonUnit { + x[i] /= a[i*lda] + } + } + return + } + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + max := k + 1 + if i+max > n { + max = n - i + } + atmp := a[i*lda:] + var ( + jx int + sum float64 + ) + for j := 1; j < max; j++ { + jx += incX + sum += x[ix+jx] * atmp[j] + } + x[ix] -= sum + if nonUnit { + x[ix] /= atmp[0] + } + ix -= incX + } + return + } + if incX == 1 { + for i := 0; i < n; i++ { + bands := k + if i-k < 0 { + bands = i + } + atmp := a[i*lda+k-bands:] + xtmp := x[i-bands : i] + var sum float64 + for j, v := range xtmp { + sum += v * atmp[j] + } + x[i] -= sum + if nonUnit { + x[i] /= atmp[bands] + } + } + return + } + ix := kx + for i := 0; i < n; i++ { + bands := k + if i-k < 0 { + bands = i + } + atmp := a[i*lda+k-bands:] + var ( + sum float64 + jx int + ) + for j := 0; j < bands; j++ { + sum += x[ix-bands*incX+jx] * atmp[j] + jx += incX + } + x[ix] -= sum + if nonUnit { + x[ix] /= atmp[bands] + } + ix += incX + } + return + } + // Cases where a is transposed. + if ul == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + bands := k + if i-k < 0 { + bands = i + } + var sum float64 + for j := 0; j < bands; j++ { + sum += x[i-bands+j] * a[(i-bands+j)*lda+bands-j] + } + x[i] -= sum + if nonUnit { + x[i] /= a[i*lda] + } + } + return + } + ix := kx + for i := 0; i < n; i++ { + bands := k + if i-k < 0 { + bands = i + } + var ( + sum float64 + jx int + ) + for j := 0; j < bands; j++ { + sum += x[ix-bands*incX+jx] * a[(i-bands+j)*lda+bands-j] + jx += incX + } + x[ix] -= sum + if nonUnit { + x[ix] /= a[i*lda] + } + ix += incX + } + return + } + if incX == 1 { + for i := n - 1; i >= 0; i-- { + bands := k + if i+bands >= n { + bands = n - i - 1 + } + var sum float64 + xtmp := x[i+1 : i+1+bands] + for j, v := range xtmp { + sum += v * a[(i+j+1)*lda+k-j-1] + } + x[i] -= sum + if nonUnit { + x[i] /= a[i*lda+k] + } + } + return + } + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + bands := k + if i+bands >= n { + bands = n - i - 1 + } + var ( + sum float64 + jx int + ) + for j := 0; j < bands; j++ { + sum += x[ix+jx+incX] * a[(i+j+1)*lda+k-j-1] + jx += incX + } + x[ix] -= sum + if nonUnit { + x[ix] /= a[i*lda+k] + } + ix -= incX + } +} + +// Dsbmv performs the matrix-vector operation +// +// y = alpha * A * x + beta * y +// +// where A is an n×n symmetric band matrix with k super-diagonals, x and y are +// vectors, and alpha and beta are scalars. 
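+//
+// A minimal usage sketch (an editor's illustration with hypothetical values,
+// not part of the upstream gonum sources). A symmetric tridiagonal matrix is
+// the k = 1 case of the band storage scheme:
+//
+//	impl := Implementation{}
+//	// A has diagonal {2, 2, 2} and off-diagonals {1, 1}; each row of the
+//	// upper band storage holds {A[i,i], A[i,i+1]}, and the final 0 is padding.
+//	a := []float64{2, 1, 2, 1, 2, 0}
+//	x := []float64{1, 1, 1}
+//	y := make([]float64, 3)
+//	impl.Dsbmv(blas.Upper, 3, 1, 1, a, 2, x, 1, 0, y, 1)
+//	// y is now {3, 4, 3} = A * x.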
+func (Implementation) Dsbmv(ul blas.Uplo, n, k int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if n < 0 { + panic(nLT0) + } + if k < 0 { + panic(kLT0) + } + if lda < k+1 { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(n-1)+k+1 { + panic(shortA) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + // Set up indexes + lenX := n + lenY := n + var kx, ky int + if incX < 0 { + kx = -(lenX - 1) * incX + } + if incY < 0 { + ky = -(lenY - 1) * incY + } + + // Form y = beta * y. + if beta != 1 { + if incY == 1 { + if beta == 0 { + for i := range y[:n] { + y[i] = 0 + } + } else { + f64.ScalUnitary(beta, y[:n]) + } + } else { + iy := ky + if beta == 0 { + for i := 0; i < n; i++ { + y[iy] = 0 + iy += incY + } + } else { + if incY > 0 { + f64.ScalInc(beta, y, uintptr(n), uintptr(incY)) + } else { + f64.ScalInc(beta, y, uintptr(n), uintptr(-incY)) + } + } + } + } + + if alpha == 0 { + return + } + + if ul == blas.Upper { + if incX == 1 { + iy := ky + for i := 0; i < n; i++ { + atmp := a[i*lda:] + tmp := alpha * x[i] + sum := tmp * atmp[0] + u := min(k, n-i-1) + jy := incY + for j := 1; j <= u; j++ { + v := atmp[j] + sum += alpha * x[i+j] * v + y[iy+jy] += tmp * v + jy += incY + } + y[iy] += sum + iy += incY + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + atmp := a[i*lda:] + tmp := alpha * x[ix] + sum := tmp * atmp[0] + u := min(k, n-i-1) + jx := incX + jy := incY + for j := 1; j <= u; j++ { + v := atmp[j] + sum += alpha * x[ix+jx] * v + y[iy+jy] += tmp * v + jx += incX + jy += incY + } + y[iy] += sum + ix += incX + iy += incY + } + return + } + + // Cases where a has bands below the diagonal. + if incX == 1 { + iy := ky + for i := 0; i < n; i++ { + l := max(0, k-i) + tmp := alpha * x[i] + jy := l * incY + atmp := a[i*lda:] + for j := l; j < k; j++ { + v := atmp[j] + y[iy] += alpha * v * x[i-k+j] + y[iy-k*incY+jy] += tmp * v + jy += incY + } + y[iy] += tmp * atmp[k] + iy += incY + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + l := max(0, k-i) + tmp := alpha * x[ix] + jx := l * incX + jy := l * incY + atmp := a[i*lda:] + for j := l; j < k; j++ { + v := atmp[j] + y[iy] += alpha * v * x[ix-k*incX+jx] + y[iy-k*incY+jy] += tmp * v + jx += incX + jy += incY + } + y[iy] += tmp * atmp[k] + ix += incX + iy += incY + } +} + +// Dsyr performs the symmetric rank-one update +// +// A += alpha * x * xᵀ +// +// where A is an n×n symmetric matrix, and x is a vector. +func (Implementation) Dsyr(ul blas.Uplo, n int, alpha float64, x []float64, incX int, a []float64, lda int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. 
+ if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if len(a) < lda*(n-1)+n { + panic(shortA) + } + + // Quick return if possible. + if alpha == 0 { + return + } + + lenX := n + var kx int + if incX < 0 { + kx = -(lenX - 1) * incX + } + if ul == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + tmp := x[i] * alpha + if tmp != 0 { + atmp := a[i*lda+i : i*lda+n] + xtmp := x[i:n] + for j, v := range xtmp { + atmp[j] += v * tmp + } + } + } + return + } + ix := kx + for i := 0; i < n; i++ { + tmp := x[ix] * alpha + if tmp != 0 { + jx := ix + atmp := a[i*lda:] + for j := i; j < n; j++ { + atmp[j] += x[jx] * tmp + jx += incX + } + } + ix += incX + } + return + } + // Cases where a is lower triangular. + if incX == 1 { + for i := 0; i < n; i++ { + tmp := x[i] * alpha + if tmp != 0 { + atmp := a[i*lda:] + xtmp := x[:i+1] + for j, v := range xtmp { + atmp[j] += tmp * v + } + } + } + return + } + ix := kx + for i := 0; i < n; i++ { + tmp := x[ix] * alpha + if tmp != 0 { + atmp := a[i*lda:] + jx := kx + for j := 0; j < i+1; j++ { + atmp[j] += tmp * x[jx] + jx += incX + } + } + ix += incX + } +} + +// Dsyr2 performs the symmetric rank-two update +// +// A += alpha * x * yᵀ + alpha * y * xᵀ +// +// where A is an n×n symmetric matrix, x and y are vectors, and alpha is a scalar. +func (Implementation) Dsyr2(ul blas.Uplo, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64, lda int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if n < 0 { + panic(nLT0) + } + if lda < max(1, n) { + panic(badLdA) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if len(a) < lda*(n-1)+n { + panic(shortA) + } + + // Quick return if possible. + if alpha == 0 { + return + } + + var ky, kx int + if incY < 0 { + ky = -(n - 1) * incY + } + if incX < 0 { + kx = -(n - 1) * incX + } + if ul == blas.Upper { + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + xi := x[i] + yi := y[i] + atmp := a[i*lda:] + for j := i; j < n; j++ { + atmp[j] += alpha * (xi*y[j] + x[j]*yi) + } + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + jx := kx + i*incX + jy := ky + i*incY + xi := x[ix] + yi := y[iy] + atmp := a[i*lda:] + for j := i; j < n; j++ { + atmp[j] += alpha * (xi*y[jy] + x[jx]*yi) + jx += incX + jy += incY + } + ix += incX + iy += incY + } + return + } + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + xi := x[i] + yi := y[i] + atmp := a[i*lda:] + for j := 0; j <= i; j++ { + atmp[j] += alpha * (xi*y[j] + x[j]*yi) + } + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + jx := kx + jy := ky + xi := x[ix] + yi := y[iy] + atmp := a[i*lda:] + for j := 0; j <= i; j++ { + atmp[j] += alpha * (xi*y[jy] + x[jx]*yi) + jx += incX + jy += incY + } + ix += incX + iy += incY + } +} + +// Dtpsv solves one of the systems of equations +// +// A * x = b if tA == blas.NoTrans +// Aᵀ * x = b if tA == blas.Trans or blas.ConjTrans +// +// where A is an n×n triangular matrix in packed format, and x and b are vectors. +// +// At entry to the function, x contains the values of b, and the result is +// stored in-place into x. 
+// +// No test for singularity or near-singularity is included in this +// routine. Such tests must be performed before calling this routine. +func (Implementation) Dtpsv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n int, ap []float64, x []float64, incX int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if d != blas.NonUnit && d != blas.Unit { + panic(badDiag) + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + + var kx int + if incX < 0 { + kx = -(n - 1) * incX + } + + nonUnit := d == blas.NonUnit + var offset int // Offset is the index of (i,i) + if tA == blas.NoTrans { + if ul == blas.Upper { + offset = n*(n+1)/2 - 1 + if incX == 1 { + for i := n - 1; i >= 0; i-- { + atmp := ap[offset+1 : offset+n-i] + xtmp := x[i+1:] + var sum float64 + for j, v := range atmp { + sum += v * xtmp[j] + } + x[i] -= sum + if nonUnit { + x[i] /= ap[offset] + } + offset -= n - i + 1 + } + return + } + ix := kx + (n-1)*incX + for i := n - 1; i >= 0; i-- { + atmp := ap[offset+1 : offset+n-i] + jx := kx + (i+1)*incX + var sum float64 + for _, v := range atmp { + sum += v * x[jx] + jx += incX + } + x[ix] -= sum + if nonUnit { + x[ix] /= ap[offset] + } + ix -= incX + offset -= n - i + 1 + } + return + } + if incX == 1 { + for i := 0; i < n; i++ { + atmp := ap[offset-i : offset] + var sum float64 + for j, v := range atmp { + sum += v * x[j] + } + x[i] -= sum + if nonUnit { + x[i] /= ap[offset] + } + offset += i + 2 + } + return + } + ix := kx + for i := 0; i < n; i++ { + jx := kx + atmp := ap[offset-i : offset] + var sum float64 + for _, v := range atmp { + sum += v * x[jx] + jx += incX + } + x[ix] -= sum + if nonUnit { + x[ix] /= ap[offset] + } + ix += incX + offset += i + 2 + } + return + } + // Cases where ap is transposed. + if ul == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + if nonUnit { + x[i] /= ap[offset] + } + xi := x[i] + atmp := ap[offset+1 : offset+n-i] + xtmp := x[i+1:] + for j, v := range atmp { + xtmp[j] -= v * xi + } + offset += n - i + } + return + } + ix := kx + for i := 0; i < n; i++ { + if nonUnit { + x[ix] /= ap[offset] + } + xix := x[ix] + atmp := ap[offset+1 : offset+n-i] + jx := kx + (i+1)*incX + for _, v := range atmp { + x[jx] -= v * xix + jx += incX + } + ix += incX + offset += n - i + } + return + } + if incX == 1 { + offset = n*(n+1)/2 - 1 + for i := n - 1; i >= 0; i-- { + if nonUnit { + x[i] /= ap[offset] + } + xi := x[i] + atmp := ap[offset-i : offset] + for j, v := range atmp { + x[j] -= v * xi + } + offset -= i + 1 + } + return + } + ix := kx + (n-1)*incX + offset = n*(n+1)/2 - 1 + for i := n - 1; i >= 0; i-- { + if nonUnit { + x[ix] /= ap[offset] + } + xix := x[ix] + atmp := ap[offset-i : offset] + jx := kx + for _, v := range atmp { + x[jx] -= v * xix + jx += incX + } + ix -= incX + offset -= i + 1 + } +} + +// Dspmv performs the matrix-vector operation +// +// y = alpha * A * x + beta * y +// +// where A is an n×n symmetric matrix in packed format, x and y are vectors, +// and alpha and beta are scalars. 
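+//
+// A minimal usage sketch (an editor's illustration with hypothetical values,
+// not part of the upstream gonum sources). With blas.Upper, ap lists the
+// rows of the upper triangle one after another:
+//
+//	impl := Implementation{}
+//	// Symmetric A = [{2, 1}, {1, 3}] packed as {A[0,0], A[0,1], A[1,1]}.
+//	ap := []float64{2, 1, 3}
+//	x := []float64{1, 1}
+//	y := make([]float64, 2)
+//	impl.Dspmv(blas.Upper, 2, 1, ap, x, 1, 0, y, 1)
+//	// y is now {2 + 1, 1 + 3} = {3, 4} = A * x.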
+func (Implementation) Dspmv(ul blas.Uplo, n int, alpha float64, ap []float64, x []float64, incX int, beta float64, y []float64, incY int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + // Set up start points + var kx, ky int + if incX < 0 { + kx = -(n - 1) * incX + } + if incY < 0 { + ky = -(n - 1) * incY + } + + // Form y = beta * y. + if beta != 1 { + if incY == 1 { + if beta == 0 { + for i := range y[:n] { + y[i] = 0 + } + } else { + f64.ScalUnitary(beta, y[:n]) + } + } else { + iy := ky + if beta == 0 { + for i := 0; i < n; i++ { + y[iy] = 0 + iy += incY + } + } else { + if incY > 0 { + f64.ScalInc(beta, y, uintptr(n), uintptr(incY)) + } else { + f64.ScalInc(beta, y, uintptr(n), uintptr(-incY)) + } + } + } + } + + if alpha == 0 { + return + } + + if n == 1 { + y[0] += alpha * ap[0] * x[0] + return + } + var offset int // Offset is the index of (i,i). + if ul == blas.Upper { + if incX == 1 { + iy := ky + for i := 0; i < n; i++ { + xv := x[i] * alpha + sum := ap[offset] * x[i] + atmp := ap[offset+1 : offset+n-i] + xtmp := x[i+1:] + jy := ky + (i+1)*incY + for j, v := range atmp { + sum += v * xtmp[j] + y[jy] += v * xv + jy += incY + } + y[iy] += alpha * sum + iy += incY + offset += n - i + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + xv := x[ix] * alpha + sum := ap[offset] * x[ix] + atmp := ap[offset+1 : offset+n-i] + jx := kx + (i+1)*incX + jy := ky + (i+1)*incY + for _, v := range atmp { + sum += v * x[jx] + y[jy] += v * xv + jx += incX + jy += incY + } + y[iy] += alpha * sum + ix += incX + iy += incY + offset += n - i + } + return + } + if incX == 1 { + iy := ky + for i := 0; i < n; i++ { + xv := x[i] * alpha + atmp := ap[offset-i : offset] + jy := ky + var sum float64 + for j, v := range atmp { + sum += v * x[j] + y[jy] += v * xv + jy += incY + } + sum += ap[offset] * x[i] + y[iy] += alpha * sum + iy += incY + offset += i + 2 + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + xv := x[ix] * alpha + atmp := ap[offset-i : offset] + jx := kx + jy := ky + var sum float64 + for _, v := range atmp { + sum += v * x[jx] + y[jy] += v * xv + jx += incX + jy += incY + } + + sum += ap[offset] * x[ix] + y[iy] += alpha * sum + ix += incX + iy += incY + offset += i + 2 + } +} + +// Dspr performs the symmetric rank-one operation +// +// A += alpha * x * xᵀ +// +// where A is an n×n symmetric matrix in packed format, x is a vector, and +// alpha is a scalar. +func (Implementation) Dspr(ul blas.Uplo, n int, alpha float64, x []float64, incX int, ap []float64) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. 
+ if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + + // Quick return if possible. + if alpha == 0 { + return + } + + lenX := n + var kx int + if incX < 0 { + kx = -(lenX - 1) * incX + } + var offset int // Offset is the index of (i,i). + if ul == blas.Upper { + if incX == 1 { + for i := 0; i < n; i++ { + atmp := ap[offset:] + xv := alpha * x[i] + xtmp := x[i:n] + for j, v := range xtmp { + atmp[j] += xv * v + } + offset += n - i + } + return + } + ix := kx + for i := 0; i < n; i++ { + jx := kx + i*incX + atmp := ap[offset:] + xv := alpha * x[ix] + for j := 0; j < n-i; j++ { + atmp[j] += xv * x[jx] + jx += incX + } + ix += incX + offset += n - i + } + return + } + if incX == 1 { + for i := 0; i < n; i++ { + atmp := ap[offset-i:] + xv := alpha * x[i] + xtmp := x[:i+1] + for j, v := range xtmp { + atmp[j] += xv * v + } + offset += i + 2 + } + return + } + ix := kx + for i := 0; i < n; i++ { + jx := kx + atmp := ap[offset-i:] + xv := alpha * x[ix] + for j := 0; j <= i; j++ { + atmp[j] += xv * x[jx] + jx += incX + } + ix += incX + offset += i + 2 + } +} + +// Dspr2 performs the symmetric rank-2 update +// +// A += alpha * x * yᵀ + alpha * y * xᵀ +// +// where A is an n×n symmetric matrix in packed format, x and y are vectors, +// and alpha is a scalar. +func (Implementation) Dspr2(ul blas.Uplo, n int, alpha float64, x []float64, incX int, y []float64, incY int, ap []float64) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if n < 0 { + panic(nLT0) + } + if incX == 0 { + panic(zeroIncX) + } + if incY == 0 { + panic(zeroIncY) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { + panic(shortX) + } + if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) { + panic(shortY) + } + if len(ap) < n*(n+1)/2 { + panic(shortAP) + } + + // Quick return if possible. + if alpha == 0 { + return + } + + var ky, kx int + if incY < 0 { + ky = -(n - 1) * incY + } + if incX < 0 { + kx = -(n - 1) * incX + } + var offset int // Offset is the index of (i,i). 
+ if ul == blas.Upper { + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + atmp := ap[offset:] + xi := x[i] + yi := y[i] + xtmp := x[i:n] + ytmp := y[i:n] + for j, v := range xtmp { + atmp[j] += alpha * (xi*ytmp[j] + v*yi) + } + offset += n - i + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + jx := kx + i*incX + jy := ky + i*incY + atmp := ap[offset:] + xi := x[ix] + yi := y[iy] + for j := 0; j < n-i; j++ { + atmp[j] += alpha * (xi*y[jy] + x[jx]*yi) + jx += incX + jy += incY + } + ix += incX + iy += incY + offset += n - i + } + return + } + if incX == 1 && incY == 1 { + for i := 0; i < n; i++ { + atmp := ap[offset-i:] + xi := x[i] + yi := y[i] + xtmp := x[:i+1] + for j, v := range xtmp { + atmp[j] += alpha * (xi*y[j] + v*yi) + } + offset += i + 2 + } + return + } + ix := kx + iy := ky + for i := 0; i < n; i++ { + jx := kx + jy := ky + atmp := ap[offset-i:] + for j := 0; j <= i; j++ { + atmp[j] += alpha * (x[ix]*y[jy] + x[jx]*y[iy]) + jx += incX + jy += incY + } + ix += incX + iy += incY + offset += i + 2 + } +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx128.go b/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx128.go new file mode 100644 index 0000000000..bfff8c5579 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx128.go @@ -0,0 +1,1751 @@ +// Copyright ©2019 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math/cmplx" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/internal/asm/c128" +) + +var _ blas.Complex128Level3 = Implementation{} + +// Zgemm performs one of the matrix-matrix operations +// +// C = alpha * op(A) * op(B) + beta * C +// +// where op(X) is one of +// +// op(X) = X or op(X) = Xᵀ or op(X) = Xᴴ, +// +// alpha and beta are scalars, and A, B and C are matrices, with op(A) an m×k matrix, +// op(B) a k×n matrix and C an m×n matrix. +func (Implementation) Zgemm(tA, tB blas.Transpose, m, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int) { + switch tA { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + switch tB { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case k < 0: + panic(kLT0) + } + rowA, colA := m, k + if tA != blas.NoTrans { + rowA, colA = k, m + } + if lda < max(1, colA) { + panic(badLdA) + } + rowB, colB := k, n + if tB != blas.NoTrans { + rowB, colB = n, k + } + if ldb < max(1, colB) { + panic(badLdB) + } + if ldc < max(1, n) { + panic(badLdC) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < (rowA-1)*lda+colA { + panic(shortA) + } + if len(b) < (rowB-1)*ldb+colB { + panic(shortB) + } + if len(c) < (m-1)*ldc+n { + panic(shortC) + } + + // Quick return if possible. + if (alpha == 0 || k == 0) && beta == 1 { + return + } + + if alpha == 0 { + if beta == 0 { + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + c[i*ldc+j] = 0 + } + } + } else { + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + c[i*ldc+j] *= beta + } + } + } + return + } + + switch tA { + case blas.NoTrans: + switch tB { + case blas.NoTrans: + // Form C = alpha * A * B + beta * C. 
+ for i := 0; i < m; i++ { + switch { + case beta == 0: + for j := 0; j < n; j++ { + c[i*ldc+j] = 0 + } + case beta != 1: + for j := 0; j < n; j++ { + c[i*ldc+j] *= beta + } + } + for l := 0; l < k; l++ { + tmp := alpha * a[i*lda+l] + for j := 0; j < n; j++ { + c[i*ldc+j] += tmp * b[l*ldb+j] + } + } + } + case blas.Trans: + // Form C = alpha * A * Bᵀ + beta * C. + for i := 0; i < m; i++ { + switch { + case beta == 0: + for j := 0; j < n; j++ { + c[i*ldc+j] = 0 + } + case beta != 1: + for j := 0; j < n; j++ { + c[i*ldc+j] *= beta + } + } + for l := 0; l < k; l++ { + tmp := alpha * a[i*lda+l] + for j := 0; j < n; j++ { + c[i*ldc+j] += tmp * b[j*ldb+l] + } + } + } + case blas.ConjTrans: + // Form C = alpha * A * Bᴴ + beta * C. + for i := 0; i < m; i++ { + switch { + case beta == 0: + for j := 0; j < n; j++ { + c[i*ldc+j] = 0 + } + case beta != 1: + for j := 0; j < n; j++ { + c[i*ldc+j] *= beta + } + } + for l := 0; l < k; l++ { + tmp := alpha * a[i*lda+l] + for j := 0; j < n; j++ { + c[i*ldc+j] += tmp * cmplx.Conj(b[j*ldb+l]) + } + } + } + } + case blas.Trans: + switch tB { + case blas.NoTrans: + // Form C = alpha * Aᵀ * B + beta * C. + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + var tmp complex128 + for l := 0; l < k; l++ { + tmp += a[l*lda+i] * b[l*ldb+j] + } + if beta == 0 { + c[i*ldc+j] = alpha * tmp + } else { + c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j] + } + } + } + case blas.Trans: + // Form C = alpha * Aᵀ * Bᵀ + beta * C. + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + var tmp complex128 + for l := 0; l < k; l++ { + tmp += a[l*lda+i] * b[j*ldb+l] + } + if beta == 0 { + c[i*ldc+j] = alpha * tmp + } else { + c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j] + } + } + } + case blas.ConjTrans: + // Form C = alpha * Aᵀ * Bᴴ + beta * C. + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + var tmp complex128 + for l := 0; l < k; l++ { + tmp += a[l*lda+i] * cmplx.Conj(b[j*ldb+l]) + } + if beta == 0 { + c[i*ldc+j] = alpha * tmp + } else { + c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j] + } + } + } + } + case blas.ConjTrans: + switch tB { + case blas.NoTrans: + // Form C = alpha * Aᴴ * B + beta * C. + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + var tmp complex128 + for l := 0; l < k; l++ { + tmp += cmplx.Conj(a[l*lda+i]) * b[l*ldb+j] + } + if beta == 0 { + c[i*ldc+j] = alpha * tmp + } else { + c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j] + } + } + } + case blas.Trans: + // Form C = alpha * Aᴴ * Bᵀ + beta * C. + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + var tmp complex128 + for l := 0; l < k; l++ { + tmp += cmplx.Conj(a[l*lda+i]) * b[j*ldb+l] + } + if beta == 0 { + c[i*ldc+j] = alpha * tmp + } else { + c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j] + } + } + } + case blas.ConjTrans: + // Form C = alpha * Aᴴ * Bᴴ + beta * C. + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + var tmp complex128 + for l := 0; l < k; l++ { + tmp += cmplx.Conj(a[l*lda+i]) * cmplx.Conj(b[j*ldb+l]) + } + if beta == 0 { + c[i*ldc+j] = alpha * tmp + } else { + c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j] + } + } + } + } + } +} + +// Zhemm performs one of the matrix-matrix operations +// +// C = alpha*A*B + beta*C if side == blas.Left +// C = alpha*B*A + beta*C if side == blas.Right +// +// where alpha and beta are scalars, A is an m×m or n×n hermitian matrix and B +// and C are m×n matrices. The imaginary parts of the diagonal elements of A are +// assumed to be zero. 
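+//
+// A minimal usage sketch (an editor's illustration with hypothetical values,
+// not part of the upstream gonum sources):
+//
+//	impl := Implementation{}
+//	// Hermitian A = [{2, i}, {-i, 3}]; only the upper triangle is
+//	// referenced, so the element below the diagonal is left zero.
+//	a := []complex128{2, 1i, 0, 3}
+//	b := []complex128{1, 1} // 2×1 matrix, ldb = 1
+//	c := make([]complex128, 2)
+//	impl.Zhemm(blas.Left, blas.Upper, 2, 1, 1, a, 2, b, 1, 0, c, 1)
+//	// c is now {2 + 1i, 3 - 1i} = A * B.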
+func (Implementation) Zhemm(side blas.Side, uplo blas.Uplo, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int) { + na := m + if side == blas.Right { + na = n + } + switch { + case side != blas.Left && side != blas.Right: + panic(badSide) + case uplo != blas.Lower && uplo != blas.Upper: + panic(badUplo) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, na): + panic(badLdA) + case ldb < max(1, n): + panic(badLdB) + case ldc < max(1, n): + panic(badLdC) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(na-1)+na { + panic(shortA) + } + if len(b) < ldb*(m-1)+n { + panic(shortB) + } + if len(c) < ldc*(m-1)+n { + panic(shortC) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + if alpha == 0 { + if beta == 0 { + for i := 0; i < m; i++ { + ci := c[i*ldc : i*ldc+n] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < m; i++ { + ci := c[i*ldc : i*ldc+n] + c128.ScalUnitary(beta, ci) + } + } + return + } + + if side == blas.Left { + // Form C = alpha*A*B + beta*C. + for i := 0; i < m; i++ { + atmp := alpha * complex(real(a[i*lda+i]), 0) + bi := b[i*ldb : i*ldb+n] + ci := c[i*ldc : i*ldc+n] + if beta == 0 { + for j, bij := range bi { + ci[j] = atmp * bij + } + } else { + for j, bij := range bi { + ci[j] = atmp*bij + beta*ci[j] + } + } + if uplo == blas.Upper { + for k := 0; k < i; k++ { + atmp = alpha * cmplx.Conj(a[k*lda+i]) + c128.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci) + } + for k := i + 1; k < m; k++ { + atmp = alpha * a[i*lda+k] + c128.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci) + } + } else { + for k := 0; k < i; k++ { + atmp = alpha * a[i*lda+k] + c128.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci) + } + for k := i + 1; k < m; k++ { + atmp = alpha * cmplx.Conj(a[k*lda+i]) + c128.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci) + } + } + } + } else { + // Form C = alpha*B*A + beta*C. + if uplo == blas.Upper { + for i := 0; i < m; i++ { + for j := n - 1; j >= 0; j-- { + abij := alpha * b[i*ldb+j] + aj := a[j*lda+j+1 : j*lda+n] + bi := b[i*ldb+j+1 : i*ldb+n] + ci := c[i*ldc+j+1 : i*ldc+n] + var tmp complex128 + for k, ajk := range aj { + ci[k] += abij * ajk + tmp += bi[k] * cmplx.Conj(ajk) + } + ajj := complex(real(a[j*lda+j]), 0) + if beta == 0 { + c[i*ldc+j] = abij*ajj + alpha*tmp + } else { + c[i*ldc+j] = abij*ajj + alpha*tmp + beta*c[i*ldc+j] + } + } + } + } else { + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + abij := alpha * b[i*ldb+j] + aj := a[j*lda : j*lda+j] + bi := b[i*ldb : i*ldb+j] + ci := c[i*ldc : i*ldc+j] + var tmp complex128 + for k, ajk := range aj { + ci[k] += abij * ajk + tmp += bi[k] * cmplx.Conj(ajk) + } + ajj := complex(real(a[j*lda+j]), 0) + if beta == 0 { + c[i*ldc+j] = abij*ajj + alpha*tmp + } else { + c[i*ldc+j] = abij*ajj + alpha*tmp + beta*c[i*ldc+j] + } + } + } + } + } +} + +// Zherk performs one of the hermitian rank-k operations +// +// C = alpha*A*Aᴴ + beta*C if trans == blas.NoTrans +// C = alpha*Aᴴ*A + beta*C if trans == blas.ConjTrans +// +// where alpha and beta are real scalars, C is an n×n hermitian matrix and A is +// an n×k matrix in the first case and a k×n matrix in the second case. +// +// The imaginary parts of the diagonal elements of C are assumed to be zero, and +// on return they will be set to zero. 
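+//
+// A minimal usage sketch (arbitrary values, n = 2, k = 1, A a 2×1 matrix):
+//
+//	var impl Implementation
+//	a := []complex128{1 + 1i, 2 - 1i}
+//	c := make([]complex128, 4)
+//	impl.Zherk(blas.Upper, blas.NoTrans, 2, 1, 1, a, 1, 0, c, 2)
+//	// The upper triangle of c now holds A*Aᴴ:
+//	// c[0] = |1+1i|² = 2, c[1] = (1+1i)*conj(2-1i) = 1+3i, c[3] = |2-1i|² = 5.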
+func (Implementation) Zherk(uplo blas.Uplo, trans blas.Transpose, n, k int, alpha float64, a []complex128, lda int, beta float64, c []complex128, ldc int) { + var rowA, colA int + switch trans { + default: + panic(badTranspose) + case blas.NoTrans: + rowA, colA = n, k + case blas.ConjTrans: + rowA, colA = k, n + } + switch { + case uplo != blas.Lower && uplo != blas.Upper: + panic(badUplo) + case n < 0: + panic(nLT0) + case k < 0: + panic(kLT0) + case lda < max(1, colA): + panic(badLdA) + case ldc < max(1, n): + panic(badLdC) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < (rowA-1)*lda+colA { + panic(shortA) + } + if len(c) < (n-1)*ldc+n { + panic(shortC) + } + + // Quick return if possible. + if (alpha == 0 || k == 0) && beta == 1 { + return + } + + if alpha == 0 { + if uplo == blas.Upper { + if beta == 0 { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + ci[0] = complex(beta*real(ci[0]), 0) + if i != n-1 { + c128.DscalUnitary(beta, ci[1:]) + } + } + } + } else { + if beta == 0 { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + if i != 0 { + c128.DscalUnitary(beta, ci[:i]) + } + ci[i] = complex(beta*real(ci[i]), 0) + } + } + } + return + } + + calpha := complex(alpha, 0) + if trans == blas.NoTrans { + // Form C = alpha*A*Aᴴ + beta*C. + cbeta := complex(beta, 0) + if uplo == blas.Upper { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + ai := a[i*lda : i*lda+k] + switch { + case beta == 0: + // Handle the i-th diagonal element of C. + ci[0] = complex(alpha*real(c128.DotcUnitary(ai, ai)), 0) + // Handle the remaining elements on the i-th row of C. + for jc := range ci[1:] { + j := i + 1 + jc + ci[jc+1] = calpha * c128.DotcUnitary(a[j*lda:j*lda+k], ai) + } + case beta != 1: + cii := calpha*c128.DotcUnitary(ai, ai) + cbeta*ci[0] + ci[0] = complex(real(cii), 0) + for jc, cij := range ci[1:] { + j := i + 1 + jc + ci[jc+1] = calpha*c128.DotcUnitary(a[j*lda:j*lda+k], ai) + cbeta*cij + } + default: + cii := calpha*c128.DotcUnitary(ai, ai) + ci[0] + ci[0] = complex(real(cii), 0) + for jc, cij := range ci[1:] { + j := i + 1 + jc + ci[jc+1] = calpha*c128.DotcUnitary(a[j*lda:j*lda+k], ai) + cij + } + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + ai := a[i*lda : i*lda+k] + switch { + case beta == 0: + // Handle the first i-1 elements on the i-th row of C. + for j := range ci[:i] { + ci[j] = calpha * c128.DotcUnitary(a[j*lda:j*lda+k], ai) + } + // Handle the i-th diagonal element of C. + ci[i] = complex(alpha*real(c128.DotcUnitary(ai, ai)), 0) + case beta != 1: + for j, cij := range ci[:i] { + ci[j] = calpha*c128.DotcUnitary(a[j*lda:j*lda+k], ai) + cbeta*cij + } + cii := calpha*c128.DotcUnitary(ai, ai) + cbeta*ci[i] + ci[i] = complex(real(cii), 0) + default: + for j, cij := range ci[:i] { + ci[j] = calpha*c128.DotcUnitary(a[j*lda:j*lda+k], ai) + cij + } + cii := calpha*c128.DotcUnitary(ai, ai) + ci[i] + ci[i] = complex(real(cii), 0) + } + } + } + } else { + // Form C = alpha*Aᴴ*A + beta*C. 
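+ // The update is accumulated row-by-row of A: after C is scaled by
+ // beta, each conjugated entry a[j][i] folds the rest of row j into
+ // the stored triangle of C via axpy.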
+ if uplo == blas.Upper { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + switch { + case beta == 0: + for jc := range ci { + ci[jc] = 0 + } + case beta != 1: + c128.DscalUnitary(beta, ci) + ci[0] = complex(real(ci[0]), 0) + default: + ci[0] = complex(real(ci[0]), 0) + } + for j := 0; j < k; j++ { + aji := cmplx.Conj(a[j*lda+i]) + if aji != 0 { + c128.AxpyUnitary(calpha*aji, a[j*lda+i:j*lda+n], ci) + } + } + c[i*ldc+i] = complex(real(c[i*ldc+i]), 0) + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + switch { + case beta == 0: + for j := range ci { + ci[j] = 0 + } + case beta != 1: + c128.DscalUnitary(beta, ci) + ci[i] = complex(real(ci[i]), 0) + default: + ci[i] = complex(real(ci[i]), 0) + } + for j := 0; j < k; j++ { + aji := cmplx.Conj(a[j*lda+i]) + if aji != 0 { + c128.AxpyUnitary(calpha*aji, a[j*lda:j*lda+i+1], ci) + } + } + c[i*ldc+i] = complex(real(c[i*ldc+i]), 0) + } + } + } +} + +// Zher2k performs one of the hermitian rank-2k operations +// +// C = alpha*A*Bᴴ + conj(alpha)*B*Aᴴ + beta*C if trans == blas.NoTrans +// C = alpha*Aᴴ*B + conj(alpha)*Bᴴ*A + beta*C if trans == blas.ConjTrans +// +// where alpha and beta are scalars with beta real, C is an n×n hermitian matrix +// and A and B are n×k matrices in the first case and k×n matrices in the second case. +// +// The imaginary parts of the diagonal elements of C are assumed to be zero, and +// on return they will be set to zero. +func (Implementation) Zher2k(uplo blas.Uplo, trans blas.Transpose, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta float64, c []complex128, ldc int) { + var row, col int + switch trans { + default: + panic(badTranspose) + case blas.NoTrans: + row, col = n, k + case blas.ConjTrans: + row, col = k, n + } + switch { + case uplo != blas.Lower && uplo != blas.Upper: + panic(badUplo) + case n < 0: + panic(nLT0) + case k < 0: + panic(kLT0) + case lda < max(1, col): + panic(badLdA) + case ldb < max(1, col): + panic(badLdB) + case ldc < max(1, n): + panic(badLdC) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < (row-1)*lda+col { + panic(shortA) + } + if len(b) < (row-1)*ldb+col { + panic(shortB) + } + if len(c) < (n-1)*ldc+n { + panic(shortC) + } + + // Quick return if possible. + if (alpha == 0 || k == 0) && beta == 1 { + return + } + + if alpha == 0 { + if uplo == blas.Upper { + if beta == 0 { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + ci[0] = complex(beta*real(ci[0]), 0) + if i != n-1 { + c128.DscalUnitary(beta, ci[1:]) + } + } + } + } else { + if beta == 0 { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + if i != 0 { + c128.DscalUnitary(beta, ci[:i]) + } + ci[i] = complex(beta*real(ci[i]), 0) + } + } + } + return + } + + conjalpha := cmplx.Conj(alpha) + cbeta := complex(beta, 0) + if trans == blas.NoTrans { + // Form C = alpha*A*Bᴴ + conj(alpha)*B*Aᴴ + beta*C. 
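+ // Each stored c[i][j] is built from two conjugated dot products
+ // (c128.DotcUnitary conjugates its first argument):
+ // alpha*dotc(b_j, a_i) + conj(alpha)*dotc(a_j, b_i),
+ // and the diagonal is forced back to a purely real value.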
+ if uplo == blas.Upper { + for i := 0; i < n; i++ { + ci := c[i*ldc+i+1 : i*ldc+n] + ai := a[i*lda : i*lda+k] + bi := b[i*ldb : i*ldb+k] + if beta == 0 { + cii := alpha*c128.DotcUnitary(bi, ai) + conjalpha*c128.DotcUnitary(ai, bi) + c[i*ldc+i] = complex(real(cii), 0) + for jc := range ci { + j := i + 1 + jc + ci[jc] = alpha*c128.DotcUnitary(b[j*ldb:j*ldb+k], ai) + conjalpha*c128.DotcUnitary(a[j*lda:j*lda+k], bi) + } + } else { + cii := alpha*c128.DotcUnitary(bi, ai) + conjalpha*c128.DotcUnitary(ai, bi) + cbeta*c[i*ldc+i] + c[i*ldc+i] = complex(real(cii), 0) + for jc, cij := range ci { + j := i + 1 + jc + ci[jc] = alpha*c128.DotcUnitary(b[j*ldb:j*ldb+k], ai) + conjalpha*c128.DotcUnitary(a[j*lda:j*lda+k], bi) + cbeta*cij + } + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i] + ai := a[i*lda : i*lda+k] + bi := b[i*ldb : i*ldb+k] + if beta == 0 { + for j := range ci { + ci[j] = alpha*c128.DotcUnitary(b[j*ldb:j*ldb+k], ai) + conjalpha*c128.DotcUnitary(a[j*lda:j*lda+k], bi) + } + cii := alpha*c128.DotcUnitary(bi, ai) + conjalpha*c128.DotcUnitary(ai, bi) + c[i*ldc+i] = complex(real(cii), 0) + } else { + for j, cij := range ci { + ci[j] = alpha*c128.DotcUnitary(b[j*ldb:j*ldb+k], ai) + conjalpha*c128.DotcUnitary(a[j*lda:j*lda+k], bi) + cbeta*cij + } + cii := alpha*c128.DotcUnitary(bi, ai) + conjalpha*c128.DotcUnitary(ai, bi) + cbeta*c[i*ldc+i] + c[i*ldc+i] = complex(real(cii), 0) + } + } + } + } else { + // Form C = alpha*Aᴴ*B + conj(alpha)*Bᴴ*A + beta*C. + if uplo == blas.Upper { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + switch { + case beta == 0: + for jc := range ci { + ci[jc] = 0 + } + case beta != 1: + c128.DscalUnitary(beta, ci) + ci[0] = complex(real(ci[0]), 0) + default: + ci[0] = complex(real(ci[0]), 0) + } + for j := 0; j < k; j++ { + aji := a[j*lda+i] + bji := b[j*ldb+i] + if aji != 0 { + c128.AxpyUnitary(alpha*cmplx.Conj(aji), b[j*ldb+i:j*ldb+n], ci) + } + if bji != 0 { + c128.AxpyUnitary(conjalpha*cmplx.Conj(bji), a[j*lda+i:j*lda+n], ci) + } + } + ci[0] = complex(real(ci[0]), 0) + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + switch { + case beta == 0: + for j := range ci { + ci[j] = 0 + } + case beta != 1: + c128.DscalUnitary(beta, ci) + ci[i] = complex(real(ci[i]), 0) + default: + ci[i] = complex(real(ci[i]), 0) + } + for j := 0; j < k; j++ { + aji := a[j*lda+i] + bji := b[j*ldb+i] + if aji != 0 { + c128.AxpyUnitary(alpha*cmplx.Conj(aji), b[j*ldb:j*ldb+i+1], ci) + } + if bji != 0 { + c128.AxpyUnitary(conjalpha*cmplx.Conj(bji), a[j*lda:j*lda+i+1], ci) + } + } + ci[i] = complex(real(ci[i]), 0) + } + } + } +} + +// Zsymm performs one of the matrix-matrix operations +// +// C = alpha*A*B + beta*C if side == blas.Left +// C = alpha*B*A + beta*C if side == blas.Right +// +// where alpha and beta are scalars, A is an m×m or n×n symmetric matrix and B +// and C are m×n matrices. +func (Implementation) Zsymm(side blas.Side, uplo blas.Uplo, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int) { + na := m + if side == blas.Right { + na = n + } + switch { + case side != blas.Left && side != blas.Right: + panic(badSide) + case uplo != blas.Lower && uplo != blas.Upper: + panic(badUplo) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, na): + panic(badLdA) + case ldb < max(1, n): + panic(badLdB) + case ldc < max(1, n): + panic(badLdC) + } + + // Quick return if possible. 
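+ // With m == 0 or n == 0 the matrix C is empty and no slice is touched.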
+ if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(na-1)+na { + panic(shortA) + } + if len(b) < ldb*(m-1)+n { + panic(shortB) + } + if len(c) < ldc*(m-1)+n { + panic(shortC) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + if alpha == 0 { + if beta == 0 { + for i := 0; i < m; i++ { + ci := c[i*ldc : i*ldc+n] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < m; i++ { + ci := c[i*ldc : i*ldc+n] + c128.ScalUnitary(beta, ci) + } + } + return + } + + if side == blas.Left { + // Form C = alpha*A*B + beta*C. + for i := 0; i < m; i++ { + atmp := alpha * a[i*lda+i] + bi := b[i*ldb : i*ldb+n] + ci := c[i*ldc : i*ldc+n] + if beta == 0 { + for j, bij := range bi { + ci[j] = atmp * bij + } + } else { + for j, bij := range bi { + ci[j] = atmp*bij + beta*ci[j] + } + } + if uplo == blas.Upper { + for k := 0; k < i; k++ { + atmp = alpha * a[k*lda+i] + c128.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci) + } + for k := i + 1; k < m; k++ { + atmp = alpha * a[i*lda+k] + c128.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci) + } + } else { + for k := 0; k < i; k++ { + atmp = alpha * a[i*lda+k] + c128.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci) + } + for k := i + 1; k < m; k++ { + atmp = alpha * a[k*lda+i] + c128.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci) + } + } + } + } else { + // Form C = alpha*B*A + beta*C. + if uplo == blas.Upper { + for i := 0; i < m; i++ { + for j := n - 1; j >= 0; j-- { + abij := alpha * b[i*ldb+j] + aj := a[j*lda+j+1 : j*lda+n] + bi := b[i*ldb+j+1 : i*ldb+n] + ci := c[i*ldc+j+1 : i*ldc+n] + var tmp complex128 + for k, ajk := range aj { + ci[k] += abij * ajk + tmp += bi[k] * ajk + } + if beta == 0 { + c[i*ldc+j] = abij*a[j*lda+j] + alpha*tmp + } else { + c[i*ldc+j] = abij*a[j*lda+j] + alpha*tmp + beta*c[i*ldc+j] + } + } + } + } else { + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + abij := alpha * b[i*ldb+j] + aj := a[j*lda : j*lda+j] + bi := b[i*ldb : i*ldb+j] + ci := c[i*ldc : i*ldc+j] + var tmp complex128 + for k, ajk := range aj { + ci[k] += abij * ajk + tmp += bi[k] * ajk + } + if beta == 0 { + c[i*ldc+j] = abij*a[j*lda+j] + alpha*tmp + } else { + c[i*ldc+j] = abij*a[j*lda+j] + alpha*tmp + beta*c[i*ldc+j] + } + } + } + } + } +} + +// Zsyrk performs one of the symmetric rank-k operations +// +// C = alpha*A*Aᵀ + beta*C if trans == blas.NoTrans +// C = alpha*Aᵀ*A + beta*C if trans == blas.Trans +// +// where alpha and beta are scalars, C is an n×n symmetric matrix and A is +// an n×k matrix in the first case and a k×n matrix in the second case. +func (Implementation) Zsyrk(uplo blas.Uplo, trans blas.Transpose, n, k int, alpha complex128, a []complex128, lda int, beta complex128, c []complex128, ldc int) { + var rowA, colA int + switch trans { + default: + panic(badTranspose) + case blas.NoTrans: + rowA, colA = n, k + case blas.Trans: + rowA, colA = k, n + } + switch { + case uplo != blas.Lower && uplo != blas.Upper: + panic(badUplo) + case n < 0: + panic(nLT0) + case k < 0: + panic(kLT0) + case lda < max(1, colA): + panic(badLdA) + case ldc < max(1, n): + panic(badLdC) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < (rowA-1)*lda+colA { + panic(shortA) + } + if len(c) < (n-1)*ldc+n { + panic(shortC) + } + + // Quick return if possible. 
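+ // When the product term vanishes (alpha == 0 or k == 0) and beta == 1,
+ // C already holds the final result.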
+ if (alpha == 0 || k == 0) && beta == 1 { + return + } + + if alpha == 0 { + if uplo == blas.Upper { + if beta == 0 { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + c128.ScalUnitary(beta, ci) + } + } + } else { + if beta == 0 { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + c128.ScalUnitary(beta, ci) + } + } + } + return + } + + if trans == blas.NoTrans { + // Form C = alpha*A*Aᵀ + beta*C. + if uplo == blas.Upper { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + ai := a[i*lda : i*lda+k] + if beta == 0 { + for jc := range ci { + j := i + jc + ci[jc] = alpha * c128.DotuUnitary(ai, a[j*lda:j*lda+k]) + } + } else { + for jc, cij := range ci { + j := i + jc + ci[jc] = beta*cij + alpha*c128.DotuUnitary(ai, a[j*lda:j*lda+k]) + } + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + ai := a[i*lda : i*lda+k] + if beta == 0 { + for j := range ci { + ci[j] = alpha * c128.DotuUnitary(ai, a[j*lda:j*lda+k]) + } + } else { + for j, cij := range ci { + ci[j] = beta*cij + alpha*c128.DotuUnitary(ai, a[j*lda:j*lda+k]) + } + } + } + } + } else { + // Form C = alpha*Aᵀ*A + beta*C. + if uplo == blas.Upper { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + switch { + case beta == 0: + for jc := range ci { + ci[jc] = 0 + } + case beta != 1: + for jc := range ci { + ci[jc] *= beta + } + } + for j := 0; j < k; j++ { + aji := a[j*lda+i] + if aji != 0 { + c128.AxpyUnitary(alpha*aji, a[j*lda+i:j*lda+n], ci) + } + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + switch { + case beta == 0: + for j := range ci { + ci[j] = 0 + } + case beta != 1: + for j := range ci { + ci[j] *= beta + } + } + for j := 0; j < k; j++ { + aji := a[j*lda+i] + if aji != 0 { + c128.AxpyUnitary(alpha*aji, a[j*lda:j*lda+i+1], ci) + } + } + } + } + } +} + +// Zsyr2k performs one of the symmetric rank-2k operations +// +// C = alpha*A*Bᵀ + alpha*B*Aᵀ + beta*C if trans == blas.NoTrans +// C = alpha*Aᵀ*B + alpha*Bᵀ*A + beta*C if trans == blas.Trans +// +// where alpha and beta are scalars, C is an n×n symmetric matrix and A and B +// are n×k matrices in the first case and k×n matrices in the second case. +func (Implementation) Zsyr2k(uplo blas.Uplo, trans blas.Transpose, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int) { + var row, col int + switch trans { + default: + panic(badTranspose) + case blas.NoTrans: + row, col = n, k + case blas.Trans: + row, col = k, n + } + switch { + case uplo != blas.Lower && uplo != blas.Upper: + panic(badUplo) + case n < 0: + panic(nLT0) + case k < 0: + panic(kLT0) + case lda < max(1, col): + panic(badLdA) + case ldb < max(1, col): + panic(badLdB) + case ldc < max(1, n): + panic(badLdC) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < (row-1)*lda+col { + panic(shortA) + } + if len(b) < (row-1)*ldb+col { + panic(shortB) + } + if len(c) < (n-1)*ldc+n { + panic(shortC) + } + + // Quick return if possible. 
+ if (alpha == 0 || k == 0) && beta == 1 { + return + } + + if alpha == 0 { + if uplo == blas.Upper { + if beta == 0 { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + c128.ScalUnitary(beta, ci) + } + } + } else { + if beta == 0 { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + c128.ScalUnitary(beta, ci) + } + } + } + return + } + + if trans == blas.NoTrans { + // Form C = alpha*A*Bᵀ + alpha*B*Aᵀ + beta*C. + if uplo == blas.Upper { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + ai := a[i*lda : i*lda+k] + bi := b[i*ldb : i*ldb+k] + if beta == 0 { + for jc := range ci { + j := i + jc + ci[jc] = alpha*c128.DotuUnitary(ai, b[j*ldb:j*ldb+k]) + alpha*c128.DotuUnitary(bi, a[j*lda:j*lda+k]) + } + } else { + for jc, cij := range ci { + j := i + jc + ci[jc] = alpha*c128.DotuUnitary(ai, b[j*ldb:j*ldb+k]) + alpha*c128.DotuUnitary(bi, a[j*lda:j*lda+k]) + beta*cij + } + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + ai := a[i*lda : i*lda+k] + bi := b[i*ldb : i*ldb+k] + if beta == 0 { + for j := range ci { + ci[j] = alpha*c128.DotuUnitary(ai, b[j*ldb:j*ldb+k]) + alpha*c128.DotuUnitary(bi, a[j*lda:j*lda+k]) + } + } else { + for j, cij := range ci { + ci[j] = alpha*c128.DotuUnitary(ai, b[j*ldb:j*ldb+k]) + alpha*c128.DotuUnitary(bi, a[j*lda:j*lda+k]) + beta*cij + } + } + } + } + } else { + // Form C = alpha*Aᵀ*B + alpha*Bᵀ*A + beta*C. + if uplo == blas.Upper { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + switch { + case beta == 0: + for jc := range ci { + ci[jc] = 0 + } + case beta != 1: + for jc := range ci { + ci[jc] *= beta + } + } + for j := 0; j < k; j++ { + aji := a[j*lda+i] + bji := b[j*ldb+i] + if aji != 0 { + c128.AxpyUnitary(alpha*aji, b[j*ldb+i:j*ldb+n], ci) + } + if bji != 0 { + c128.AxpyUnitary(alpha*bji, a[j*lda+i:j*lda+n], ci) + } + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + switch { + case beta == 0: + for j := range ci { + ci[j] = 0 + } + case beta != 1: + for j := range ci { + ci[j] *= beta + } + } + for j := 0; j < k; j++ { + aji := a[j*lda+i] + bji := b[j*ldb+i] + if aji != 0 { + c128.AxpyUnitary(alpha*aji, b[j*ldb:j*ldb+i+1], ci) + } + if bji != 0 { + c128.AxpyUnitary(alpha*bji, a[j*lda:j*lda+i+1], ci) + } + } + } + } + } +} + +// Ztrmm performs one of the matrix-matrix operations +// +// B = alpha * op(A) * B if side == blas.Left, +// B = alpha * B * op(A) if side == blas.Right, +// +// where alpha is a scalar, B is an m×n matrix, A is a unit, or non-unit, +// upper or lower triangular matrix and op(A) is one of +// +// op(A) = A if trans == blas.NoTrans, +// op(A) = Aᵀ if trans == blas.Trans, +// op(A) = Aᴴ if trans == blas.ConjTrans. 
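+//
+// A minimal usage sketch (arbitrary values; A is 2×2 upper triangular,
+// its strictly lower entry is never read):
+//
+//	var impl Implementation
+//	a := []complex128{1, 2, 0, 3}
+//	b := []complex128{1, 1, 1, 1}
+//	impl.Ztrmm(blas.Left, blas.Upper, blas.NoTrans, blas.NonUnit, 2, 2, 1, a, 2, b, 2)
+//	// b now holds A*B = {3, 3, 3, 3}.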
+func (Implementation) Ztrmm(side blas.Side, uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int) { + na := m + if side == blas.Right { + na = n + } + switch { + case side != blas.Left && side != blas.Right: + panic(badSide) + case uplo != blas.Lower && uplo != blas.Upper: + panic(badUplo) + case trans != blas.NoTrans && trans != blas.Trans && trans != blas.ConjTrans: + panic(badTranspose) + case diag != blas.Unit && diag != blas.NonUnit: + panic(badDiag) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, na): + panic(badLdA) + case ldb < max(1, n): + panic(badLdB) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < (na-1)*lda+na { + panic(shortA) + } + if len(b) < (m-1)*ldb+n { + panic(shortB) + } + + // Quick return if possible. + if alpha == 0 { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + for j := range bi { + bi[j] = 0 + } + } + return + } + + noConj := trans != blas.ConjTrans + noUnit := diag == blas.NonUnit + if side == blas.Left { + if trans == blas.NoTrans { + // Form B = alpha*A*B. + if uplo == blas.Upper { + for i := 0; i < m; i++ { + aii := alpha + if noUnit { + aii *= a[i*lda+i] + } + bi := b[i*ldb : i*ldb+n] + for j := range bi { + bi[j] *= aii + } + for ja, aij := range a[i*lda+i+1 : i*lda+m] { + j := ja + i + 1 + if aij != 0 { + c128.AxpyUnitary(alpha*aij, b[j*ldb:j*ldb+n], bi) + } + } + } + } else { + for i := m - 1; i >= 0; i-- { + aii := alpha + if noUnit { + aii *= a[i*lda+i] + } + bi := b[i*ldb : i*ldb+n] + for j := range bi { + bi[j] *= aii + } + for j, aij := range a[i*lda : i*lda+i] { + if aij != 0 { + c128.AxpyUnitary(alpha*aij, b[j*ldb:j*ldb+n], bi) + } + } + } + } + } else { + // Form B = alpha*Aᵀ*B or B = alpha*Aᴴ*B. + if uplo == blas.Upper { + for k := m - 1; k >= 0; k-- { + bk := b[k*ldb : k*ldb+n] + for ja, ajk := range a[k*lda+k+1 : k*lda+m] { + if ajk == 0 { + continue + } + j := k + 1 + ja + if noConj { + c128.AxpyUnitary(alpha*ajk, bk, b[j*ldb:j*ldb+n]) + } else { + c128.AxpyUnitary(alpha*cmplx.Conj(ajk), bk, b[j*ldb:j*ldb+n]) + } + } + akk := alpha + if noUnit { + if noConj { + akk *= a[k*lda+k] + } else { + akk *= cmplx.Conj(a[k*lda+k]) + } + } + if akk != 1 { + c128.ScalUnitary(akk, bk) + } + } + } else { + for k := 0; k < m; k++ { + bk := b[k*ldb : k*ldb+n] + for j, ajk := range a[k*lda : k*lda+k] { + if ajk == 0 { + continue + } + if noConj { + c128.AxpyUnitary(alpha*ajk, bk, b[j*ldb:j*ldb+n]) + } else { + c128.AxpyUnitary(alpha*cmplx.Conj(ajk), bk, b[j*ldb:j*ldb+n]) + } + } + akk := alpha + if noUnit { + if noConj { + akk *= a[k*lda+k] + } else { + akk *= cmplx.Conj(a[k*lda+k]) + } + } + if akk != 1 { + c128.ScalUnitary(akk, bk) + } + } + } + } + } else { + if trans == blas.NoTrans { + // Form B = alpha*B*A. + if uplo == blas.Upper { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + for k := n - 1; k >= 0; k-- { + abik := alpha * bi[k] + if abik == 0 { + continue + } + bi[k] = abik + if noUnit { + bi[k] *= a[k*lda+k] + } + c128.AxpyUnitary(abik, a[k*lda+k+1:k*lda+n], bi[k+1:]) + } + } + } else { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + for k := 0; k < n; k++ { + abik := alpha * bi[k] + if abik == 0 { + continue + } + bi[k] = abik + if noUnit { + bi[k] *= a[k*lda+k] + } + c128.AxpyUnitary(abik, a[k*lda:k*lda+k], bi[:k]) + } + } + } + } else { + // Form B = alpha*B*Aᵀ or B = alpha*B*Aᴴ. 
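+ // Each row of B is updated in place: b[i][j] becomes a dot product of
+ // row j of A with the not-yet-overwritten part of the same row of B,
+ // so the upper-triangular case walks j forward and the lower case backward.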
+ if uplo == blas.Upper { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + for j, bij := range bi { + if noConj { + if noUnit { + bij *= a[j*lda+j] + } + bij += c128.DotuUnitary(a[j*lda+j+1:j*lda+n], bi[j+1:n]) + } else { + if noUnit { + bij *= cmplx.Conj(a[j*lda+j]) + } + bij += c128.DotcUnitary(a[j*lda+j+1:j*lda+n], bi[j+1:n]) + } + bi[j] = alpha * bij + } + } + } else { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + for j := n - 1; j >= 0; j-- { + bij := bi[j] + if noConj { + if noUnit { + bij *= a[j*lda+j] + } + bij += c128.DotuUnitary(a[j*lda:j*lda+j], bi[:j]) + } else { + if noUnit { + bij *= cmplx.Conj(a[j*lda+j]) + } + bij += c128.DotcUnitary(a[j*lda:j*lda+j], bi[:j]) + } + bi[j] = alpha * bij + } + } + } + } + } +} + +// Ztrsm solves one of the matrix equations +// +// op(A) * X = alpha * B if side == blas.Left, +// X * op(A) = alpha * B if side == blas.Right, +// +// where alpha is a scalar, X and B are m×n matrices, A is a unit or +// non-unit, upper or lower triangular matrix and op(A) is one of +// +// op(A) = A if transA == blas.NoTrans, +// op(A) = Aᵀ if transA == blas.Trans, +// op(A) = Aᴴ if transA == blas.ConjTrans. +// +// On return the matrix X is overwritten on B. +func (Implementation) Ztrsm(side blas.Side, uplo blas.Uplo, transA blas.Transpose, diag blas.Diag, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int) { + na := m + if side == blas.Right { + na = n + } + switch { + case side != blas.Left && side != blas.Right: + panic(badSide) + case uplo != blas.Lower && uplo != blas.Upper: + panic(badUplo) + case transA != blas.NoTrans && transA != blas.Trans && transA != blas.ConjTrans: + panic(badTranspose) + case diag != blas.Unit && diag != blas.NonUnit: + panic(badDiag) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, na): + panic(badLdA) + case ldb < max(1, n): + panic(badLdB) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < (na-1)*lda+na { + panic(shortA) + } + if len(b) < (m-1)*ldb+n { + panic(shortB) + } + + if alpha == 0 { + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + b[i*ldb+j] = 0 + } + } + return + } + + noConj := transA != blas.ConjTrans + noUnit := diag == blas.NonUnit + if side == blas.Left { + if transA == blas.NoTrans { + // Form B = alpha*inv(A)*B. + if uplo == blas.Upper { + for i := m - 1; i >= 0; i-- { + bi := b[i*ldb : i*ldb+n] + if alpha != 1 { + c128.ScalUnitary(alpha, bi) + } + for ka, aik := range a[i*lda+i+1 : i*lda+m] { + k := i + 1 + ka + if aik != 0 { + c128.AxpyUnitary(-aik, b[k*ldb:k*ldb+n], bi) + } + } + if noUnit { + c128.ScalUnitary(1/a[i*lda+i], bi) + } + } + } else { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + if alpha != 1 { + c128.ScalUnitary(alpha, bi) + } + for j, aij := range a[i*lda : i*lda+i] { + if aij != 0 { + c128.AxpyUnitary(-aij, b[j*ldb:j*ldb+n], bi) + } + } + if noUnit { + c128.ScalUnitary(1/a[i*lda+i], bi) + } + } + } + } else { + // Form B = alpha*inv(Aᵀ)*B or B = alpha*inv(Aᴴ)*B. 
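+ // This is substitution on the transposed system: each finished row bi
+ // is divided by its (possibly conjugated) diagonal entry and then
+ // eliminated from the rows that still reference it, before the final
+ // scaling by alpha.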
+ if uplo == blas.Upper { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + if noUnit { + if noConj { + c128.ScalUnitary(1/a[i*lda+i], bi) + } else { + c128.ScalUnitary(1/cmplx.Conj(a[i*lda+i]), bi) + } + } + for ja, aij := range a[i*lda+i+1 : i*lda+m] { + if aij == 0 { + continue + } + j := i + 1 + ja + if noConj { + c128.AxpyUnitary(-aij, bi, b[j*ldb:j*ldb+n]) + } else { + c128.AxpyUnitary(-cmplx.Conj(aij), bi, b[j*ldb:j*ldb+n]) + } + } + if alpha != 1 { + c128.ScalUnitary(alpha, bi) + } + } + } else { + for i := m - 1; i >= 0; i-- { + bi := b[i*ldb : i*ldb+n] + if noUnit { + if noConj { + c128.ScalUnitary(1/a[i*lda+i], bi) + } else { + c128.ScalUnitary(1/cmplx.Conj(a[i*lda+i]), bi) + } + } + for j, aij := range a[i*lda : i*lda+i] { + if aij == 0 { + continue + } + if noConj { + c128.AxpyUnitary(-aij, bi, b[j*ldb:j*ldb+n]) + } else { + c128.AxpyUnitary(-cmplx.Conj(aij), bi, b[j*ldb:j*ldb+n]) + } + } + if alpha != 1 { + c128.ScalUnitary(alpha, bi) + } + } + } + } + } else { + if transA == blas.NoTrans { + // Form B = alpha*B*inv(A). + if uplo == blas.Upper { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + if alpha != 1 { + c128.ScalUnitary(alpha, bi) + } + for j, bij := range bi { + if bij == 0 { + continue + } + if noUnit { + bi[j] /= a[j*lda+j] + } + c128.AxpyUnitary(-bi[j], a[j*lda+j+1:j*lda+n], bi[j+1:n]) + } + } + } else { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + if alpha != 1 { + c128.ScalUnitary(alpha, bi) + } + for j := n - 1; j >= 0; j-- { + if bi[j] == 0 { + continue + } + if noUnit { + bi[j] /= a[j*lda+j] + } + c128.AxpyUnitary(-bi[j], a[j*lda:j*lda+j], bi[:j]) + } + } + } + } else { + // Form B = alpha*B*inv(Aᵀ) or B = alpha*B*inv(Aᴴ). + if uplo == blas.Upper { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + for j := n - 1; j >= 0; j-- { + bij := alpha * bi[j] + if noConj { + bij -= c128.DotuUnitary(a[j*lda+j+1:j*lda+n], bi[j+1:n]) + if noUnit { + bij /= a[j*lda+j] + } + } else { + bij -= c128.DotcUnitary(a[j*lda+j+1:j*lda+n], bi[j+1:n]) + if noUnit { + bij /= cmplx.Conj(a[j*lda+j]) + } + } + bi[j] = bij + } + } + } else { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + for j, bij := range bi { + bij *= alpha + if noConj { + bij -= c128.DotuUnitary(a[j*lda:j*lda+j], bi[:j]) + if noUnit { + bij /= a[j*lda+j] + } + } else { + bij -= c128.DotcUnitary(a[j*lda:j*lda+j], bi[:j]) + if noUnit { + bij /= cmplx.Conj(a[j*lda+j]) + } + } + bi[j] = bij + } + } + } + } + } +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx64.go b/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx64.go new file mode 100644 index 0000000000..b7fb5a2c4e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx64.go @@ -0,0 +1,1771 @@ +// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT. + +// Copyright ©2019 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + cmplx "gonum.org/v1/gonum/internal/cmplx64" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/internal/asm/c64" +) + +var _ blas.Complex64Level3 = Implementation{} + +// Cgemm performs one of the matrix-matrix operations +// +// C = alpha * op(A) * op(B) + beta * C +// +// where op(X) is one of +// +// op(X) = X or op(X) = Xᵀ or op(X) = Xᴴ, +// +// alpha and beta are scalars, and A, B and C are matrices, with op(A) an m×k matrix, +// op(B) a k×n matrix and C an m×n matrix. 
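+//
+// All matrices are stored in row-major order; lda, ldb and ldc are the
+// strides between consecutive rows.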
+// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Cgemm(tA, tB blas.Transpose, m, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int) { + switch tA { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + switch tB { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case k < 0: + panic(kLT0) + } + rowA, colA := m, k + if tA != blas.NoTrans { + rowA, colA = k, m + } + if lda < max(1, colA) { + panic(badLdA) + } + rowB, colB := k, n + if tB != blas.NoTrans { + rowB, colB = n, k + } + if ldb < max(1, colB) { + panic(badLdB) + } + if ldc < max(1, n) { + panic(badLdC) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < (rowA-1)*lda+colA { + panic(shortA) + } + if len(b) < (rowB-1)*ldb+colB { + panic(shortB) + } + if len(c) < (m-1)*ldc+n { + panic(shortC) + } + + // Quick return if possible. + if (alpha == 0 || k == 0) && beta == 1 { + return + } + + if alpha == 0 { + if beta == 0 { + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + c[i*ldc+j] = 0 + } + } + } else { + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + c[i*ldc+j] *= beta + } + } + } + return + } + + switch tA { + case blas.NoTrans: + switch tB { + case blas.NoTrans: + // Form C = alpha * A * B + beta * C. + for i := 0; i < m; i++ { + switch { + case beta == 0: + for j := 0; j < n; j++ { + c[i*ldc+j] = 0 + } + case beta != 1: + for j := 0; j < n; j++ { + c[i*ldc+j] *= beta + } + } + for l := 0; l < k; l++ { + tmp := alpha * a[i*lda+l] + for j := 0; j < n; j++ { + c[i*ldc+j] += tmp * b[l*ldb+j] + } + } + } + case blas.Trans: + // Form C = alpha * A * Bᵀ + beta * C. + for i := 0; i < m; i++ { + switch { + case beta == 0: + for j := 0; j < n; j++ { + c[i*ldc+j] = 0 + } + case beta != 1: + for j := 0; j < n; j++ { + c[i*ldc+j] *= beta + } + } + for l := 0; l < k; l++ { + tmp := alpha * a[i*lda+l] + for j := 0; j < n; j++ { + c[i*ldc+j] += tmp * b[j*ldb+l] + } + } + } + case blas.ConjTrans: + // Form C = alpha * A * Bᴴ + beta * C. + for i := 0; i < m; i++ { + switch { + case beta == 0: + for j := 0; j < n; j++ { + c[i*ldc+j] = 0 + } + case beta != 1: + for j := 0; j < n; j++ { + c[i*ldc+j] *= beta + } + } + for l := 0; l < k; l++ { + tmp := alpha * a[i*lda+l] + for j := 0; j < n; j++ { + c[i*ldc+j] += tmp * cmplx.Conj(b[j*ldb+l]) + } + } + } + } + case blas.Trans: + switch tB { + case blas.NoTrans: + // Form C = alpha * Aᵀ * B + beta * C. + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + var tmp complex64 + for l := 0; l < k; l++ { + tmp += a[l*lda+i] * b[l*ldb+j] + } + if beta == 0 { + c[i*ldc+j] = alpha * tmp + } else { + c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j] + } + } + } + case blas.Trans: + // Form C = alpha * Aᵀ * Bᵀ + beta * C. + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + var tmp complex64 + for l := 0; l < k; l++ { + tmp += a[l*lda+i] * b[j*ldb+l] + } + if beta == 0 { + c[i*ldc+j] = alpha * tmp + } else { + c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j] + } + } + } + case blas.ConjTrans: + // Form C = alpha * Aᵀ * Bᴴ + beta * C. 
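+ // tmp accumulates the k-term sum with B's entries conjugated; the
+ // beta == 0 branch writes C outright so stale values are never read.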
+ for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + var tmp complex64 + for l := 0; l < k; l++ { + tmp += a[l*lda+i] * cmplx.Conj(b[j*ldb+l]) + } + if beta == 0 { + c[i*ldc+j] = alpha * tmp + } else { + c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j] + } + } + } + } + case blas.ConjTrans: + switch tB { + case blas.NoTrans: + // Form C = alpha * Aᴴ * B + beta * C. + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + var tmp complex64 + for l := 0; l < k; l++ { + tmp += cmplx.Conj(a[l*lda+i]) * b[l*ldb+j] + } + if beta == 0 { + c[i*ldc+j] = alpha * tmp + } else { + c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j] + } + } + } + case blas.Trans: + // Form C = alpha * Aᴴ * Bᵀ + beta * C. + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + var tmp complex64 + for l := 0; l < k; l++ { + tmp += cmplx.Conj(a[l*lda+i]) * b[j*ldb+l] + } + if beta == 0 { + c[i*ldc+j] = alpha * tmp + } else { + c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j] + } + } + } + case blas.ConjTrans: + // Form C = alpha * Aᴴ * Bᴴ + beta * C. + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + var tmp complex64 + for l := 0; l < k; l++ { + tmp += cmplx.Conj(a[l*lda+i]) * cmplx.Conj(b[j*ldb+l]) + } + if beta == 0 { + c[i*ldc+j] = alpha * tmp + } else { + c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j] + } + } + } + } + } +} + +// Chemm performs one of the matrix-matrix operations +// +// C = alpha*A*B + beta*C if side == blas.Left +// C = alpha*B*A + beta*C if side == blas.Right +// +// where alpha and beta are scalars, A is an m×m or n×n hermitian matrix and B +// and C are m×n matrices. The imaginary parts of the diagonal elements of A are +// assumed to be zero. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Chemm(side blas.Side, uplo blas.Uplo, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int) { + na := m + if side == blas.Right { + na = n + } + switch { + case side != blas.Left && side != blas.Right: + panic(badSide) + case uplo != blas.Lower && uplo != blas.Upper: + panic(badUplo) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, na): + panic(badLdA) + case ldb < max(1, n): + panic(badLdB) + case ldc < max(1, n): + panic(badLdC) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(na-1)+na { + panic(shortA) + } + if len(b) < ldb*(m-1)+n { + panic(shortB) + } + if len(c) < ldc*(m-1)+n { + panic(shortC) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + if alpha == 0 { + if beta == 0 { + for i := 0; i < m; i++ { + ci := c[i*ldc : i*ldc+n] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < m; i++ { + ci := c[i*ldc : i*ldc+n] + c64.ScalUnitary(beta, ci) + } + } + return + } + + if side == blas.Left { + // Form C = alpha*A*B + beta*C. 
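+ // Only the real part of A's diagonal scales row i of B; the
+ // off-diagonal terms come from the stored triangle and from its
+ // conjugate transpose, folded in with axpy.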
+ for i := 0; i < m; i++ { + atmp := alpha * complex(real(a[i*lda+i]), 0) + bi := b[i*ldb : i*ldb+n] + ci := c[i*ldc : i*ldc+n] + if beta == 0 { + for j, bij := range bi { + ci[j] = atmp * bij + } + } else { + for j, bij := range bi { + ci[j] = atmp*bij + beta*ci[j] + } + } + if uplo == blas.Upper { + for k := 0; k < i; k++ { + atmp = alpha * cmplx.Conj(a[k*lda+i]) + c64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci) + } + for k := i + 1; k < m; k++ { + atmp = alpha * a[i*lda+k] + c64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci) + } + } else { + for k := 0; k < i; k++ { + atmp = alpha * a[i*lda+k] + c64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci) + } + for k := i + 1; k < m; k++ { + atmp = alpha * cmplx.Conj(a[k*lda+i]) + c64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci) + } + } + } + } else { + // Form C = alpha*B*A + beta*C. + if uplo == blas.Upper { + for i := 0; i < m; i++ { + for j := n - 1; j >= 0; j-- { + abij := alpha * b[i*ldb+j] + aj := a[j*lda+j+1 : j*lda+n] + bi := b[i*ldb+j+1 : i*ldb+n] + ci := c[i*ldc+j+1 : i*ldc+n] + var tmp complex64 + for k, ajk := range aj { + ci[k] += abij * ajk + tmp += bi[k] * cmplx.Conj(ajk) + } + ajj := complex(real(a[j*lda+j]), 0) + if beta == 0 { + c[i*ldc+j] = abij*ajj + alpha*tmp + } else { + c[i*ldc+j] = abij*ajj + alpha*tmp + beta*c[i*ldc+j] + } + } + } + } else { + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + abij := alpha * b[i*ldb+j] + aj := a[j*lda : j*lda+j] + bi := b[i*ldb : i*ldb+j] + ci := c[i*ldc : i*ldc+j] + var tmp complex64 + for k, ajk := range aj { + ci[k] += abij * ajk + tmp += bi[k] * cmplx.Conj(ajk) + } + ajj := complex(real(a[j*lda+j]), 0) + if beta == 0 { + c[i*ldc+j] = abij*ajj + alpha*tmp + } else { + c[i*ldc+j] = abij*ajj + alpha*tmp + beta*c[i*ldc+j] + } + } + } + } + } +} + +// Cherk performs one of the hermitian rank-k operations +// +// C = alpha*A*Aᴴ + beta*C if trans == blas.NoTrans +// C = alpha*Aᴴ*A + beta*C if trans == blas.ConjTrans +// +// where alpha and beta are real scalars, C is an n×n hermitian matrix and A is +// an n×k matrix in the first case and a k×n matrix in the second case. +// +// The imaginary parts of the diagonal elements of C are assumed to be zero, and +// on return they will be set to zero. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Cherk(uplo blas.Uplo, trans blas.Transpose, n, k int, alpha float32, a []complex64, lda int, beta float32, c []complex64, ldc int) { + var rowA, colA int + switch trans { + default: + panic(badTranspose) + case blas.NoTrans: + rowA, colA = n, k + case blas.ConjTrans: + rowA, colA = k, n + } + switch { + case uplo != blas.Lower && uplo != blas.Upper: + panic(badUplo) + case n < 0: + panic(nLT0) + case k < 0: + panic(kLT0) + case lda < max(1, colA): + panic(badLdA) + case ldc < max(1, n): + panic(badLdC) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < (rowA-1)*lda+colA { + panic(shortA) + } + if len(c) < (n-1)*ldc+n { + panic(shortC) + } + + // Quick return if possible. 
+ if (alpha == 0 || k == 0) && beta == 1 { + return + } + + if alpha == 0 { + if uplo == blas.Upper { + if beta == 0 { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + ci[0] = complex(beta*real(ci[0]), 0) + if i != n-1 { + c64.SscalUnitary(beta, ci[1:]) + } + } + } + } else { + if beta == 0 { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + if i != 0 { + c64.SscalUnitary(beta, ci[:i]) + } + ci[i] = complex(beta*real(ci[i]), 0) + } + } + } + return + } + + calpha := complex(alpha, 0) + if trans == blas.NoTrans { + // Form C = alpha*A*Aᴴ + beta*C. + cbeta := complex(beta, 0) + if uplo == blas.Upper { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + ai := a[i*lda : i*lda+k] + switch { + case beta == 0: + // Handle the i-th diagonal element of C. + ci[0] = complex(alpha*real(c64.DotcUnitary(ai, ai)), 0) + // Handle the remaining elements on the i-th row of C. + for jc := range ci[1:] { + j := i + 1 + jc + ci[jc+1] = calpha * c64.DotcUnitary(a[j*lda:j*lda+k], ai) + } + case beta != 1: + cii := calpha*c64.DotcUnitary(ai, ai) + cbeta*ci[0] + ci[0] = complex(real(cii), 0) + for jc, cij := range ci[1:] { + j := i + 1 + jc + ci[jc+1] = calpha*c64.DotcUnitary(a[j*lda:j*lda+k], ai) + cbeta*cij + } + default: + cii := calpha*c64.DotcUnitary(ai, ai) + ci[0] + ci[0] = complex(real(cii), 0) + for jc, cij := range ci[1:] { + j := i + 1 + jc + ci[jc+1] = calpha*c64.DotcUnitary(a[j*lda:j*lda+k], ai) + cij + } + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + ai := a[i*lda : i*lda+k] + switch { + case beta == 0: + // Handle the first i-1 elements on the i-th row of C. + for j := range ci[:i] { + ci[j] = calpha * c64.DotcUnitary(a[j*lda:j*lda+k], ai) + } + // Handle the i-th diagonal element of C. + ci[i] = complex(alpha*real(c64.DotcUnitary(ai, ai)), 0) + case beta != 1: + for j, cij := range ci[:i] { + ci[j] = calpha*c64.DotcUnitary(a[j*lda:j*lda+k], ai) + cbeta*cij + } + cii := calpha*c64.DotcUnitary(ai, ai) + cbeta*ci[i] + ci[i] = complex(real(cii), 0) + default: + for j, cij := range ci[:i] { + ci[j] = calpha*c64.DotcUnitary(a[j*lda:j*lda+k], ai) + cij + } + cii := calpha*c64.DotcUnitary(ai, ai) + ci[i] + ci[i] = complex(real(cii), 0) + } + } + } + } else { + // Form C = alpha*Aᴴ*A + beta*C. 
+ if uplo == blas.Upper { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + switch { + case beta == 0: + for jc := range ci { + ci[jc] = 0 + } + case beta != 1: + c64.SscalUnitary(beta, ci) + ci[0] = complex(real(ci[0]), 0) + default: + ci[0] = complex(real(ci[0]), 0) + } + for j := 0; j < k; j++ { + aji := cmplx.Conj(a[j*lda+i]) + if aji != 0 { + c64.AxpyUnitary(calpha*aji, a[j*lda+i:j*lda+n], ci) + } + } + c[i*ldc+i] = complex(real(c[i*ldc+i]), 0) + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + switch { + case beta == 0: + for j := range ci { + ci[j] = 0 + } + case beta != 1: + c64.SscalUnitary(beta, ci) + ci[i] = complex(real(ci[i]), 0) + default: + ci[i] = complex(real(ci[i]), 0) + } + for j := 0; j < k; j++ { + aji := cmplx.Conj(a[j*lda+i]) + if aji != 0 { + c64.AxpyUnitary(calpha*aji, a[j*lda:j*lda+i+1], ci) + } + } + c[i*ldc+i] = complex(real(c[i*ldc+i]), 0) + } + } + } +} + +// Cher2k performs one of the hermitian rank-2k operations +// +// C = alpha*A*Bᴴ + conj(alpha)*B*Aᴴ + beta*C if trans == blas.NoTrans +// C = alpha*Aᴴ*B + conj(alpha)*Bᴴ*A + beta*C if trans == blas.ConjTrans +// +// where alpha and beta are scalars with beta real, C is an n×n hermitian matrix +// and A and B are n×k matrices in the first case and k×n matrices in the second case. +// +// The imaginary parts of the diagonal elements of C are assumed to be zero, and +// on return they will be set to zero. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Cher2k(uplo blas.Uplo, trans blas.Transpose, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta float32, c []complex64, ldc int) { + var row, col int + switch trans { + default: + panic(badTranspose) + case blas.NoTrans: + row, col = n, k + case blas.ConjTrans: + row, col = k, n + } + switch { + case uplo != blas.Lower && uplo != blas.Upper: + panic(badUplo) + case n < 0: + panic(nLT0) + case k < 0: + panic(kLT0) + case lda < max(1, col): + panic(badLdA) + case ldb < max(1, col): + panic(badLdB) + case ldc < max(1, n): + panic(badLdC) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < (row-1)*lda+col { + panic(shortA) + } + if len(b) < (row-1)*ldb+col { + panic(shortB) + } + if len(c) < (n-1)*ldc+n { + panic(shortC) + } + + // Quick return if possible. + if (alpha == 0 || k == 0) && beta == 1 { + return + } + + if alpha == 0 { + if uplo == blas.Upper { + if beta == 0 { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + ci[0] = complex(beta*real(ci[0]), 0) + if i != n-1 { + c64.SscalUnitary(beta, ci[1:]) + } + } + } + } else { + if beta == 0 { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + if i != 0 { + c64.SscalUnitary(beta, ci[:i]) + } + ci[i] = complex(beta*real(ci[i]), 0) + } + } + } + return + } + + conjalpha := cmplx.Conj(alpha) + cbeta := complex(beta, 0) + if trans == blas.NoTrans { + // Form C = alpha*A*Bᴴ + conj(alpha)*B*Aᴴ + beta*C. 
+ if uplo == blas.Upper { + for i := 0; i < n; i++ { + ci := c[i*ldc+i+1 : i*ldc+n] + ai := a[i*lda : i*lda+k] + bi := b[i*ldb : i*ldb+k] + if beta == 0 { + cii := alpha*c64.DotcUnitary(bi, ai) + conjalpha*c64.DotcUnitary(ai, bi) + c[i*ldc+i] = complex(real(cii), 0) + for jc := range ci { + j := i + 1 + jc + ci[jc] = alpha*c64.DotcUnitary(b[j*ldb:j*ldb+k], ai) + conjalpha*c64.DotcUnitary(a[j*lda:j*lda+k], bi) + } + } else { + cii := alpha*c64.DotcUnitary(bi, ai) + conjalpha*c64.DotcUnitary(ai, bi) + cbeta*c[i*ldc+i] + c[i*ldc+i] = complex(real(cii), 0) + for jc, cij := range ci { + j := i + 1 + jc + ci[jc] = alpha*c64.DotcUnitary(b[j*ldb:j*ldb+k], ai) + conjalpha*c64.DotcUnitary(a[j*lda:j*lda+k], bi) + cbeta*cij + } + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i] + ai := a[i*lda : i*lda+k] + bi := b[i*ldb : i*ldb+k] + if beta == 0 { + for j := range ci { + ci[j] = alpha*c64.DotcUnitary(b[j*ldb:j*ldb+k], ai) + conjalpha*c64.DotcUnitary(a[j*lda:j*lda+k], bi) + } + cii := alpha*c64.DotcUnitary(bi, ai) + conjalpha*c64.DotcUnitary(ai, bi) + c[i*ldc+i] = complex(real(cii), 0) + } else { + for j, cij := range ci { + ci[j] = alpha*c64.DotcUnitary(b[j*ldb:j*ldb+k], ai) + conjalpha*c64.DotcUnitary(a[j*lda:j*lda+k], bi) + cbeta*cij + } + cii := alpha*c64.DotcUnitary(bi, ai) + conjalpha*c64.DotcUnitary(ai, bi) + cbeta*c[i*ldc+i] + c[i*ldc+i] = complex(real(cii), 0) + } + } + } + } else { + // Form C = alpha*Aᴴ*B + conj(alpha)*Bᴴ*A + beta*C. + if uplo == blas.Upper { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + switch { + case beta == 0: + for jc := range ci { + ci[jc] = 0 + } + case beta != 1: + c64.SscalUnitary(beta, ci) + ci[0] = complex(real(ci[0]), 0) + default: + ci[0] = complex(real(ci[0]), 0) + } + for j := 0; j < k; j++ { + aji := a[j*lda+i] + bji := b[j*ldb+i] + if aji != 0 { + c64.AxpyUnitary(alpha*cmplx.Conj(aji), b[j*ldb+i:j*ldb+n], ci) + } + if bji != 0 { + c64.AxpyUnitary(conjalpha*cmplx.Conj(bji), a[j*lda+i:j*lda+n], ci) + } + } + ci[0] = complex(real(ci[0]), 0) + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + switch { + case beta == 0: + for j := range ci { + ci[j] = 0 + } + case beta != 1: + c64.SscalUnitary(beta, ci) + ci[i] = complex(real(ci[i]), 0) + default: + ci[i] = complex(real(ci[i]), 0) + } + for j := 0; j < k; j++ { + aji := a[j*lda+i] + bji := b[j*ldb+i] + if aji != 0 { + c64.AxpyUnitary(alpha*cmplx.Conj(aji), b[j*ldb:j*ldb+i+1], ci) + } + if bji != 0 { + c64.AxpyUnitary(conjalpha*cmplx.Conj(bji), a[j*lda:j*lda+i+1], ci) + } + } + ci[i] = complex(real(ci[i]), 0) + } + } + } +} + +// Csymm performs one of the matrix-matrix operations +// +// C = alpha*A*B + beta*C if side == blas.Left +// C = alpha*B*A + beta*C if side == blas.Right +// +// where alpha and beta are scalars, A is an m×m or n×n symmetric matrix and B +// and C are m×n matrices. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Csymm(side blas.Side, uplo blas.Uplo, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int) { + na := m + if side == blas.Right { + na = n + } + switch { + case side != blas.Left && side != blas.Right: + panic(badSide) + case uplo != blas.Lower && uplo != blas.Upper: + panic(badUplo) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, na): + panic(badLdA) + case ldb < max(1, n): + panic(badLdB) + case ldc < max(1, n): + panic(badLdC) + } + + // Quick return if possible. 
+ if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(na-1)+na { + panic(shortA) + } + if len(b) < ldb*(m-1)+n { + panic(shortB) + } + if len(c) < ldc*(m-1)+n { + panic(shortC) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + if alpha == 0 { + if beta == 0 { + for i := 0; i < m; i++ { + ci := c[i*ldc : i*ldc+n] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < m; i++ { + ci := c[i*ldc : i*ldc+n] + c64.ScalUnitary(beta, ci) + } + } + return + } + + if side == blas.Left { + // Form C = alpha*A*B + beta*C. + for i := 0; i < m; i++ { + atmp := alpha * a[i*lda+i] + bi := b[i*ldb : i*ldb+n] + ci := c[i*ldc : i*ldc+n] + if beta == 0 { + for j, bij := range bi { + ci[j] = atmp * bij + } + } else { + for j, bij := range bi { + ci[j] = atmp*bij + beta*ci[j] + } + } + if uplo == blas.Upper { + for k := 0; k < i; k++ { + atmp = alpha * a[k*lda+i] + c64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci) + } + for k := i + 1; k < m; k++ { + atmp = alpha * a[i*lda+k] + c64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci) + } + } else { + for k := 0; k < i; k++ { + atmp = alpha * a[i*lda+k] + c64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci) + } + for k := i + 1; k < m; k++ { + atmp = alpha * a[k*lda+i] + c64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci) + } + } + } + } else { + // Form C = alpha*B*A + beta*C. + if uplo == blas.Upper { + for i := 0; i < m; i++ { + for j := n - 1; j >= 0; j-- { + abij := alpha * b[i*ldb+j] + aj := a[j*lda+j+1 : j*lda+n] + bi := b[i*ldb+j+1 : i*ldb+n] + ci := c[i*ldc+j+1 : i*ldc+n] + var tmp complex64 + for k, ajk := range aj { + ci[k] += abij * ajk + tmp += bi[k] * ajk + } + if beta == 0 { + c[i*ldc+j] = abij*a[j*lda+j] + alpha*tmp + } else { + c[i*ldc+j] = abij*a[j*lda+j] + alpha*tmp + beta*c[i*ldc+j] + } + } + } + } else { + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + abij := alpha * b[i*ldb+j] + aj := a[j*lda : j*lda+j] + bi := b[i*ldb : i*ldb+j] + ci := c[i*ldc : i*ldc+j] + var tmp complex64 + for k, ajk := range aj { + ci[k] += abij * ajk + tmp += bi[k] * ajk + } + if beta == 0 { + c[i*ldc+j] = abij*a[j*lda+j] + alpha*tmp + } else { + c[i*ldc+j] = abij*a[j*lda+j] + alpha*tmp + beta*c[i*ldc+j] + } + } + } + } + } +} + +// Csyrk performs one of the symmetric rank-k operations +// +// C = alpha*A*Aᵀ + beta*C if trans == blas.NoTrans +// C = alpha*Aᵀ*A + beta*C if trans == blas.Trans +// +// where alpha and beta are scalars, C is an n×n symmetric matrix and A is +// an n×k matrix in the first case and a k×n matrix in the second case. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Csyrk(uplo blas.Uplo, trans blas.Transpose, n, k int, alpha complex64, a []complex64, lda int, beta complex64, c []complex64, ldc int) { + var rowA, colA int + switch trans { + default: + panic(badTranspose) + case blas.NoTrans: + rowA, colA = n, k + case blas.Trans: + rowA, colA = k, n + } + switch { + case uplo != blas.Lower && uplo != blas.Upper: + panic(badUplo) + case n < 0: + panic(nLT0) + case k < 0: + panic(kLT0) + case lda < max(1, colA): + panic(badLdA) + case ldc < max(1, n): + panic(badLdC) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < (rowA-1)*lda+colA { + panic(shortA) + } + if len(c) < (n-1)*ldc+n { + panic(shortC) + } + + // Quick return if possible. 
+ if (alpha == 0 || k == 0) && beta == 1 { + return + } + + if alpha == 0 { + if uplo == blas.Upper { + if beta == 0 { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + c64.ScalUnitary(beta, ci) + } + } + } else { + if beta == 0 { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + c64.ScalUnitary(beta, ci) + } + } + } + return + } + + if trans == blas.NoTrans { + // Form C = alpha*A*Aᵀ + beta*C. + if uplo == blas.Upper { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + ai := a[i*lda : i*lda+k] + if beta == 0 { + for jc := range ci { + j := i + jc + ci[jc] = alpha * c64.DotuUnitary(ai, a[j*lda:j*lda+k]) + } + } else { + for jc, cij := range ci { + j := i + jc + ci[jc] = beta*cij + alpha*c64.DotuUnitary(ai, a[j*lda:j*lda+k]) + } + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + ai := a[i*lda : i*lda+k] + if beta == 0 { + for j := range ci { + ci[j] = alpha * c64.DotuUnitary(ai, a[j*lda:j*lda+k]) + } + } else { + for j, cij := range ci { + ci[j] = beta*cij + alpha*c64.DotuUnitary(ai, a[j*lda:j*lda+k]) + } + } + } + } + } else { + // Form C = alpha*Aᵀ*A + beta*C. + if uplo == blas.Upper { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + switch { + case beta == 0: + for jc := range ci { + ci[jc] = 0 + } + case beta != 1: + for jc := range ci { + ci[jc] *= beta + } + } + for j := 0; j < k; j++ { + aji := a[j*lda+i] + if aji != 0 { + c64.AxpyUnitary(alpha*aji, a[j*lda+i:j*lda+n], ci) + } + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + switch { + case beta == 0: + for j := range ci { + ci[j] = 0 + } + case beta != 1: + for j := range ci { + ci[j] *= beta + } + } + for j := 0; j < k; j++ { + aji := a[j*lda+i] + if aji != 0 { + c64.AxpyUnitary(alpha*aji, a[j*lda:j*lda+i+1], ci) + } + } + } + } + } +} + +// Csyr2k performs one of the symmetric rank-2k operations +// +// C = alpha*A*Bᵀ + alpha*B*Aᵀ + beta*C if trans == blas.NoTrans +// C = alpha*Aᵀ*B + alpha*Bᵀ*A + beta*C if trans == blas.Trans +// +// where alpha and beta are scalars, C is an n×n symmetric matrix and A and B +// are n×k matrices in the first case and k×n matrices in the second case. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Csyr2k(uplo blas.Uplo, trans blas.Transpose, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int) { + var row, col int + switch trans { + default: + panic(badTranspose) + case blas.NoTrans: + row, col = n, k + case blas.Trans: + row, col = k, n + } + switch { + case uplo != blas.Lower && uplo != blas.Upper: + panic(badUplo) + case n < 0: + panic(nLT0) + case k < 0: + panic(kLT0) + case lda < max(1, col): + panic(badLdA) + case ldb < max(1, col): + panic(badLdB) + case ldc < max(1, n): + panic(badLdC) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < (row-1)*lda+col { + panic(shortA) + } + if len(b) < (row-1)*ldb+col { + panic(shortB) + } + if len(c) < (n-1)*ldc+n { + panic(shortC) + } + + // Quick return if possible. 
+ if (alpha == 0 || k == 0) && beta == 1 { + return + } + + if alpha == 0 { + if uplo == blas.Upper { + if beta == 0 { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + c64.ScalUnitary(beta, ci) + } + } + } else { + if beta == 0 { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + c64.ScalUnitary(beta, ci) + } + } + } + return + } + + if trans == blas.NoTrans { + // Form C = alpha*A*Bᵀ + alpha*B*Aᵀ + beta*C. + if uplo == blas.Upper { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + ai := a[i*lda : i*lda+k] + bi := b[i*ldb : i*ldb+k] + if beta == 0 { + for jc := range ci { + j := i + jc + ci[jc] = alpha*c64.DotuUnitary(ai, b[j*ldb:j*ldb+k]) + alpha*c64.DotuUnitary(bi, a[j*lda:j*lda+k]) + } + } else { + for jc, cij := range ci { + j := i + jc + ci[jc] = alpha*c64.DotuUnitary(ai, b[j*ldb:j*ldb+k]) + alpha*c64.DotuUnitary(bi, a[j*lda:j*lda+k]) + beta*cij + } + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + ai := a[i*lda : i*lda+k] + bi := b[i*ldb : i*ldb+k] + if beta == 0 { + for j := range ci { + ci[j] = alpha*c64.DotuUnitary(ai, b[j*ldb:j*ldb+k]) + alpha*c64.DotuUnitary(bi, a[j*lda:j*lda+k]) + } + } else { + for j, cij := range ci { + ci[j] = alpha*c64.DotuUnitary(ai, b[j*ldb:j*ldb+k]) + alpha*c64.DotuUnitary(bi, a[j*lda:j*lda+k]) + beta*cij + } + } + } + } + } else { + // Form C = alpha*Aᵀ*B + alpha*Bᵀ*A + beta*C. + if uplo == blas.Upper { + for i := 0; i < n; i++ { + ci := c[i*ldc+i : i*ldc+n] + switch { + case beta == 0: + for jc := range ci { + ci[jc] = 0 + } + case beta != 1: + for jc := range ci { + ci[jc] *= beta + } + } + for j := 0; j < k; j++ { + aji := a[j*lda+i] + bji := b[j*ldb+i] + if aji != 0 { + c64.AxpyUnitary(alpha*aji, b[j*ldb+i:j*ldb+n], ci) + } + if bji != 0 { + c64.AxpyUnitary(alpha*bji, a[j*lda+i:j*lda+n], ci) + } + } + } + } else { + for i := 0; i < n; i++ { + ci := c[i*ldc : i*ldc+i+1] + switch { + case beta == 0: + for j := range ci { + ci[j] = 0 + } + case beta != 1: + for j := range ci { + ci[j] *= beta + } + } + for j := 0; j < k; j++ { + aji := a[j*lda+i] + bji := b[j*ldb+i] + if aji != 0 { + c64.AxpyUnitary(alpha*aji, b[j*ldb:j*ldb+i+1], ci) + } + if bji != 0 { + c64.AxpyUnitary(alpha*bji, a[j*lda:j*lda+i+1], ci) + } + } + } + } + } +} + +// Ctrmm performs one of the matrix-matrix operations +// +// B = alpha * op(A) * B if side == blas.Left, +// B = alpha * B * op(A) if side == blas.Right, +// +// where alpha is a scalar, B is an m×n matrix, A is a unit, or non-unit, +// upper or lower triangular matrix and op(A) is one of +// +// op(A) = A if trans == blas.NoTrans, +// op(A) = Aᵀ if trans == blas.Trans, +// op(A) = Aᴴ if trans == blas.ConjTrans. +// +// Complex64 implementations are autogenerated and not directly tested. 
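+//
+// With side == blas.Left and trans == blas.NoTrans, for example, this computes
+// B = alpha*A*B in place, one row of B at a time.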
+func (Implementation) Ctrmm(side blas.Side, uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int) { + na := m + if side == blas.Right { + na = n + } + switch { + case side != blas.Left && side != blas.Right: + panic(badSide) + case uplo != blas.Lower && uplo != blas.Upper: + panic(badUplo) + case trans != blas.NoTrans && trans != blas.Trans && trans != blas.ConjTrans: + panic(badTranspose) + case diag != blas.Unit && diag != blas.NonUnit: + panic(badDiag) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, na): + panic(badLdA) + case ldb < max(1, n): + panic(badLdB) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < (na-1)*lda+na { + panic(shortA) + } + if len(b) < (m-1)*ldb+n { + panic(shortB) + } + + // Quick return if possible. + if alpha == 0 { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + for j := range bi { + bi[j] = 0 + } + } + return + } + + noConj := trans != blas.ConjTrans + noUnit := diag == blas.NonUnit + if side == blas.Left { + if trans == blas.NoTrans { + // Form B = alpha*A*B. + if uplo == blas.Upper { + for i := 0; i < m; i++ { + aii := alpha + if noUnit { + aii *= a[i*lda+i] + } + bi := b[i*ldb : i*ldb+n] + for j := range bi { + bi[j] *= aii + } + for ja, aij := range a[i*lda+i+1 : i*lda+m] { + j := ja + i + 1 + if aij != 0 { + c64.AxpyUnitary(alpha*aij, b[j*ldb:j*ldb+n], bi) + } + } + } + } else { + for i := m - 1; i >= 0; i-- { + aii := alpha + if noUnit { + aii *= a[i*lda+i] + } + bi := b[i*ldb : i*ldb+n] + for j := range bi { + bi[j] *= aii + } + for j, aij := range a[i*lda : i*lda+i] { + if aij != 0 { + c64.AxpyUnitary(alpha*aij, b[j*ldb:j*ldb+n], bi) + } + } + } + } + } else { + // Form B = alpha*Aᵀ*B or B = alpha*Aᴴ*B. + if uplo == blas.Upper { + for k := m - 1; k >= 0; k-- { + bk := b[k*ldb : k*ldb+n] + for ja, ajk := range a[k*lda+k+1 : k*lda+m] { + if ajk == 0 { + continue + } + j := k + 1 + ja + if noConj { + c64.AxpyUnitary(alpha*ajk, bk, b[j*ldb:j*ldb+n]) + } else { + c64.AxpyUnitary(alpha*cmplx.Conj(ajk), bk, b[j*ldb:j*ldb+n]) + } + } + akk := alpha + if noUnit { + if noConj { + akk *= a[k*lda+k] + } else { + akk *= cmplx.Conj(a[k*lda+k]) + } + } + if akk != 1 { + c64.ScalUnitary(akk, bk) + } + } + } else { + for k := 0; k < m; k++ { + bk := b[k*ldb : k*ldb+n] + for j, ajk := range a[k*lda : k*lda+k] { + if ajk == 0 { + continue + } + if noConj { + c64.AxpyUnitary(alpha*ajk, bk, b[j*ldb:j*ldb+n]) + } else { + c64.AxpyUnitary(alpha*cmplx.Conj(ajk), bk, b[j*ldb:j*ldb+n]) + } + } + akk := alpha + if noUnit { + if noConj { + akk *= a[k*lda+k] + } else { + akk *= cmplx.Conj(a[k*lda+k]) + } + } + if akk != 1 { + c64.ScalUnitary(akk, bk) + } + } + } + } + } else { + if trans == blas.NoTrans { + // Form B = alpha*B*A. + if uplo == blas.Upper { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + for k := n - 1; k >= 0; k-- { + abik := alpha * bi[k] + if abik == 0 { + continue + } + bi[k] = abik + if noUnit { + bi[k] *= a[k*lda+k] + } + c64.AxpyUnitary(abik, a[k*lda+k+1:k*lda+n], bi[k+1:]) + } + } + } else { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + for k := 0; k < n; k++ { + abik := alpha * bi[k] + if abik == 0 { + continue + } + bi[k] = abik + if noUnit { + bi[k] *= a[k*lda+k] + } + c64.AxpyUnitary(abik, a[k*lda:k*lda+k], bi[:k]) + } + } + } + } else { + // Form B = alpha*B*Aᵀ or B = alpha*B*Aᴴ. 
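+			// Each b[i][j] becomes alpha times the dot product of the stored part of
+			// row j of A with row i of B, conjugating A when trans == blas.ConjTrans.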
+ if uplo == blas.Upper { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + for j, bij := range bi { + if noConj { + if noUnit { + bij *= a[j*lda+j] + } + bij += c64.DotuUnitary(a[j*lda+j+1:j*lda+n], bi[j+1:n]) + } else { + if noUnit { + bij *= cmplx.Conj(a[j*lda+j]) + } + bij += c64.DotcUnitary(a[j*lda+j+1:j*lda+n], bi[j+1:n]) + } + bi[j] = alpha * bij + } + } + } else { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + for j := n - 1; j >= 0; j-- { + bij := bi[j] + if noConj { + if noUnit { + bij *= a[j*lda+j] + } + bij += c64.DotuUnitary(a[j*lda:j*lda+j], bi[:j]) + } else { + if noUnit { + bij *= cmplx.Conj(a[j*lda+j]) + } + bij += c64.DotcUnitary(a[j*lda:j*lda+j], bi[:j]) + } + bi[j] = alpha * bij + } + } + } + } + } +} + +// Ctrsm solves one of the matrix equations +// +// op(A) * X = alpha * B if side == blas.Left, +// X * op(A) = alpha * B if side == blas.Right, +// +// where alpha is a scalar, X and B are m×n matrices, A is a unit or +// non-unit, upper or lower triangular matrix and op(A) is one of +// +// op(A) = A if transA == blas.NoTrans, +// op(A) = Aᵀ if transA == blas.Trans, +// op(A) = Aᴴ if transA == blas.ConjTrans. +// +// On return the matrix X is overwritten on B. +// +// Complex64 implementations are autogenerated and not directly tested. +func (Implementation) Ctrsm(side blas.Side, uplo blas.Uplo, transA blas.Transpose, diag blas.Diag, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int) { + na := m + if side == blas.Right { + na = n + } + switch { + case side != blas.Left && side != blas.Right: + panic(badSide) + case uplo != blas.Lower && uplo != blas.Upper: + panic(badUplo) + case transA != blas.NoTrans && transA != blas.Trans && transA != blas.ConjTrans: + panic(badTranspose) + case diag != blas.Unit && diag != blas.NonUnit: + panic(badDiag) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, na): + panic(badLdA) + case ldb < max(1, n): + panic(badLdB) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < (na-1)*lda+na { + panic(shortA) + } + if len(b) < (m-1)*ldb+n { + panic(shortB) + } + + if alpha == 0 { + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + b[i*ldb+j] = 0 + } + } + return + } + + noConj := transA != blas.ConjTrans + noUnit := diag == blas.NonUnit + if side == blas.Left { + if transA == blas.NoTrans { + // Form B = alpha*inv(A)*B. + if uplo == blas.Upper { + for i := m - 1; i >= 0; i-- { + bi := b[i*ldb : i*ldb+n] + if alpha != 1 { + c64.ScalUnitary(alpha, bi) + } + for ka, aik := range a[i*lda+i+1 : i*lda+m] { + k := i + 1 + ka + if aik != 0 { + c64.AxpyUnitary(-aik, b[k*ldb:k*ldb+n], bi) + } + } + if noUnit { + c64.ScalUnitary(1/a[i*lda+i], bi) + } + } + } else { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + if alpha != 1 { + c64.ScalUnitary(alpha, bi) + } + for j, aij := range a[i*lda : i*lda+i] { + if aij != 0 { + c64.AxpyUnitary(-aij, b[j*ldb:j*ldb+n], bi) + } + } + if noUnit { + c64.ScalUnitary(1/a[i*lda+i], bi) + } + } + } + } else { + // Form B = alpha*inv(Aᵀ)*B or B = alpha*inv(Aᴴ)*B. 
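+			// The substitution runs top to bottom when uplo == blas.Upper (op(A) is
+			// then lower triangular) and bottom to top when uplo == blas.Lower.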
+ if uplo == blas.Upper { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + if noUnit { + if noConj { + c64.ScalUnitary(1/a[i*lda+i], bi) + } else { + c64.ScalUnitary(1/cmplx.Conj(a[i*lda+i]), bi) + } + } + for ja, aij := range a[i*lda+i+1 : i*lda+m] { + if aij == 0 { + continue + } + j := i + 1 + ja + if noConj { + c64.AxpyUnitary(-aij, bi, b[j*ldb:j*ldb+n]) + } else { + c64.AxpyUnitary(-cmplx.Conj(aij), bi, b[j*ldb:j*ldb+n]) + } + } + if alpha != 1 { + c64.ScalUnitary(alpha, bi) + } + } + } else { + for i := m - 1; i >= 0; i-- { + bi := b[i*ldb : i*ldb+n] + if noUnit { + if noConj { + c64.ScalUnitary(1/a[i*lda+i], bi) + } else { + c64.ScalUnitary(1/cmplx.Conj(a[i*lda+i]), bi) + } + } + for j, aij := range a[i*lda : i*lda+i] { + if aij == 0 { + continue + } + if noConj { + c64.AxpyUnitary(-aij, bi, b[j*ldb:j*ldb+n]) + } else { + c64.AxpyUnitary(-cmplx.Conj(aij), bi, b[j*ldb:j*ldb+n]) + } + } + if alpha != 1 { + c64.ScalUnitary(alpha, bi) + } + } + } + } + } else { + if transA == blas.NoTrans { + // Form B = alpha*B*inv(A). + if uplo == blas.Upper { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + if alpha != 1 { + c64.ScalUnitary(alpha, bi) + } + for j, bij := range bi { + if bij == 0 { + continue + } + if noUnit { + bi[j] /= a[j*lda+j] + } + c64.AxpyUnitary(-bi[j], a[j*lda+j+1:j*lda+n], bi[j+1:n]) + } + } + } else { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + if alpha != 1 { + c64.ScalUnitary(alpha, bi) + } + for j := n - 1; j >= 0; j-- { + if bi[j] == 0 { + continue + } + if noUnit { + bi[j] /= a[j*lda+j] + } + c64.AxpyUnitary(-bi[j], a[j*lda:j*lda+j], bi[:j]) + } + } + } + } else { + // Form B = alpha*B*inv(Aᵀ) or B = alpha*B*inv(Aᴴ). + if uplo == blas.Upper { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + for j := n - 1; j >= 0; j-- { + bij := alpha * bi[j] + if noConj { + bij -= c64.DotuUnitary(a[j*lda+j+1:j*lda+n], bi[j+1:n]) + if noUnit { + bij /= a[j*lda+j] + } + } else { + bij -= c64.DotcUnitary(a[j*lda+j+1:j*lda+n], bi[j+1:n]) + if noUnit { + bij /= cmplx.Conj(a[j*lda+j]) + } + } + bi[j] = bij + } + } + } else { + for i := 0; i < m; i++ { + bi := b[i*ldb : i*ldb+n] + for j, bij := range bi { + bij *= alpha + if noConj { + bij -= c64.DotuUnitary(a[j*lda:j*lda+j], bi[:j]) + if noUnit { + bij /= a[j*lda+j] + } + } else { + bij -= c64.DotcUnitary(a[j*lda:j*lda+j], bi[:j]) + if noUnit { + bij /= cmplx.Conj(a[j*lda+j]) + } + } + bi[j] = bij + } + } + } + } + } +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level3float32.go b/vendor/gonum.org/v1/gonum/blas/gonum/level3float32.go new file mode 100644 index 0000000000..4b813fbc05 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/level3float32.go @@ -0,0 +1,925 @@ +// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT. + +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/internal/asm/f32" +) + +var _ blas.Float32Level3 = Implementation{} + +// Strsm solves one of the matrix equations +// +// A * X = alpha * B if tA == blas.NoTrans and side == blas.Left +// Aᵀ * X = alpha * B if tA == blas.Trans or blas.ConjTrans, and side == blas.Left +// X * A = alpha * B if tA == blas.NoTrans and side == blas.Right +// X * Aᵀ = alpha * B if tA == blas.Trans or blas.ConjTrans, and side == blas.Right +// +// where A is an n×n or m×m triangular matrix, X and B are m×n matrices, and alpha is a +// scalar. +// +// At entry to the function, X contains the values of B, and the result is +// stored in-place into X. +// +// No check is made that A is invertible. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Strsm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int) { + if s != blas.Left && s != blas.Right { + panic(badSide) + } + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if d != blas.NonUnit && d != blas.Unit { + panic(badDiag) + } + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + k := n + if s == blas.Left { + k = m + } + if lda < max(1, k) { + panic(badLdA) + } + if ldb < max(1, n) { + panic(badLdB) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(k-1)+k { + panic(shortA) + } + if len(b) < ldb*(m-1)+n { + panic(shortB) + } + + if alpha == 0 { + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + for j := range btmp { + btmp[j] = 0 + } + } + return + } + nonUnit := d == blas.NonUnit + if s == blas.Left { + if tA == blas.NoTrans { + if ul == blas.Upper { + for i := m - 1; i >= 0; i-- { + btmp := b[i*ldb : i*ldb+n] + if alpha != 1 { + f32.ScalUnitary(alpha, btmp) + } + for ka, va := range a[i*lda+i+1 : i*lda+m] { + if va != 0 { + k := ka + i + 1 + f32.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp) + } + } + if nonUnit { + tmp := 1 / a[i*lda+i] + f32.ScalUnitary(tmp, btmp) + } + } + return + } + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + if alpha != 1 { + f32.ScalUnitary(alpha, btmp) + } + for k, va := range a[i*lda : i*lda+i] { + if va != 0 { + f32.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp) + } + } + if nonUnit { + tmp := 1 / a[i*lda+i] + f32.ScalUnitary(tmp, btmp) + } + } + return + } + // Cases where a is transposed + if ul == blas.Upper { + for k := 0; k < m; k++ { + btmpk := b[k*ldb : k*ldb+n] + if nonUnit { + tmp := 1 / a[k*lda+k] + f32.ScalUnitary(tmp, btmpk) + } + for ia, va := range a[k*lda+k+1 : k*lda+m] { + if va != 0 { + i := ia + k + 1 + f32.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n]) + } + } + if alpha != 1 { + f32.ScalUnitary(alpha, btmpk) + } + } + return + } + for k := m - 1; k >= 0; k-- { + btmpk := b[k*ldb : k*ldb+n] + if nonUnit { + tmp := 1 / a[k*lda+k] + f32.ScalUnitary(tmp, btmpk) + } + for i, va := range a[k*lda : k*lda+k] { + if va != 0 { + f32.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n]) + } + } + if alpha != 1 { + f32.ScalUnitary(alpha, btmpk) + } + } + return + } + // Cases where a is to the right of X. 
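+	// With A on the right, each row of B is an independent triangular solve and
+	// is processed in place.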
+ if tA == blas.NoTrans { + if ul == blas.Upper { + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + if alpha != 1 { + f32.ScalUnitary(alpha, btmp) + } + for k, vb := range btmp { + if vb == 0 { + continue + } + if nonUnit { + btmp[k] /= a[k*lda+k] + } + f32.AxpyUnitary(-btmp[k], a[k*lda+k+1:k*lda+n], btmp[k+1:n]) + } + } + return + } + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + if alpha != 1 { + f32.ScalUnitary(alpha, btmp) + } + for k := n - 1; k >= 0; k-- { + if btmp[k] == 0 { + continue + } + if nonUnit { + btmp[k] /= a[k*lda+k] + } + f32.AxpyUnitary(-btmp[k], a[k*lda:k*lda+k], btmp[:k]) + } + } + return + } + // Cases where a is transposed. + if ul == blas.Upper { + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + for j := n - 1; j >= 0; j-- { + tmp := alpha*btmp[j] - f32.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:]) + if nonUnit { + tmp /= a[j*lda+j] + } + btmp[j] = tmp + } + } + return + } + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + for j := 0; j < n; j++ { + tmp := alpha*btmp[j] - f32.DotUnitary(a[j*lda:j*lda+j], btmp[:j]) + if nonUnit { + tmp /= a[j*lda+j] + } + btmp[j] = tmp + } + } +} + +// Ssymm performs one of the matrix-matrix operations +// +// C = alpha * A * B + beta * C if side == blas.Left +// C = alpha * B * A + beta * C if side == blas.Right +// +// where A is an n×n or m×m symmetric matrix, B and C are m×n matrices, and alpha +// is a scalar. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Ssymm(s blas.Side, ul blas.Uplo, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) { + if s != blas.Right && s != blas.Left { + panic(badSide) + } + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + k := n + if s == blas.Left { + k = m + } + if lda < max(1, k) { + panic(badLdA) + } + if ldb < max(1, n) { + panic(badLdB) + } + if ldc < max(1, n) { + panic(badLdC) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(k-1)+k { + panic(shortA) + } + if len(b) < ldb*(m-1)+n { + panic(shortB) + } + if len(c) < ldc*(m-1)+n { + panic(shortC) + } + + // Quick return if possible. 
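+	// Nothing to do when alpha == 0 and beta == 1.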
+ if alpha == 0 && beta == 1 { + return + } + + if beta == 0 { + for i := 0; i < m; i++ { + ctmp := c[i*ldc : i*ldc+n] + for j := range ctmp { + ctmp[j] = 0 + } + } + } + + if alpha == 0 { + if beta != 0 { + for i := 0; i < m; i++ { + ctmp := c[i*ldc : i*ldc+n] + for j := 0; j < n; j++ { + ctmp[j] *= beta + } + } + } + return + } + + isUpper := ul == blas.Upper + if s == blas.Left { + for i := 0; i < m; i++ { + atmp := alpha * a[i*lda+i] + btmp := b[i*ldb : i*ldb+n] + ctmp := c[i*ldc : i*ldc+n] + for j, v := range btmp { + ctmp[j] *= beta + ctmp[j] += atmp * v + } + + for k := 0; k < i; k++ { + var atmp float32 + if isUpper { + atmp = a[k*lda+i] + } else { + atmp = a[i*lda+k] + } + atmp *= alpha + f32.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp) + } + for k := i + 1; k < m; k++ { + var atmp float32 + if isUpper { + atmp = a[i*lda+k] + } else { + atmp = a[k*lda+i] + } + atmp *= alpha + f32.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp) + } + } + return + } + if isUpper { + for i := 0; i < m; i++ { + for j := n - 1; j >= 0; j-- { + tmp := alpha * b[i*ldb+j] + var tmp2 float32 + atmp := a[j*lda+j+1 : j*lda+n] + btmp := b[i*ldb+j+1 : i*ldb+n] + ctmp := c[i*ldc+j+1 : i*ldc+n] + for k, v := range atmp { + ctmp[k] += tmp * v + tmp2 += btmp[k] * v + } + c[i*ldc+j] *= beta + c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2 + } + } + return + } + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + tmp := alpha * b[i*ldb+j] + var tmp2 float32 + atmp := a[j*lda : j*lda+j] + btmp := b[i*ldb : i*ldb+j] + ctmp := c[i*ldc : i*ldc+j] + for k, v := range atmp { + ctmp[k] += tmp * v + tmp2 += btmp[k] * v + } + c[i*ldc+j] *= beta + c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2 + } + } +} + +// Ssyrk performs one of the symmetric rank-k operations +// +// C = alpha * A * Aᵀ + beta * C if tA == blas.NoTrans +// C = alpha * Aᵀ * A + beta * C if tA == blas.Trans or tA == blas.ConjTrans +// +// where A is an n×k or k×n matrix, C is an n×n symmetric matrix, and alpha and +// beta are scalars. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Ssyrk(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float32, a []float32, lda int, beta float32, c []float32, ldc int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans { + panic(badTranspose) + } + if n < 0 { + panic(nLT0) + } + if k < 0 { + panic(kLT0) + } + row, col := k, n + if tA == blas.NoTrans { + row, col = n, k + } + if lda < max(1, col) { + panic(badLdA) + } + if ldc < max(1, n) { + panic(badLdC) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. 
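+	// a is row×col with stride lda, and c is n×n with stride ldc.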
+ if len(a) < lda*(row-1)+col { + panic(shortA) + } + if len(c) < ldc*(n-1)+n { + panic(shortC) + } + + if alpha == 0 { + if beta == 0 { + if ul == blas.Upper { + for i := 0; i < n; i++ { + ctmp := c[i*ldc+i : i*ldc+n] + for j := range ctmp { + ctmp[j] = 0 + } + } + return + } + for i := 0; i < n; i++ { + ctmp := c[i*ldc : i*ldc+i+1] + for j := range ctmp { + ctmp[j] = 0 + } + } + return + } + if ul == blas.Upper { + for i := 0; i < n; i++ { + ctmp := c[i*ldc+i : i*ldc+n] + for j := range ctmp { + ctmp[j] *= beta + } + } + return + } + for i := 0; i < n; i++ { + ctmp := c[i*ldc : i*ldc+i+1] + for j := range ctmp { + ctmp[j] *= beta + } + } + return + } + if tA == blas.NoTrans { + if ul == blas.Upper { + for i := 0; i < n; i++ { + ctmp := c[i*ldc+i : i*ldc+n] + atmp := a[i*lda : i*lda+k] + if beta == 0 { + for jc := range ctmp { + j := jc + i + ctmp[jc] = alpha * f32.DotUnitary(atmp, a[j*lda:j*lda+k]) + } + } else { + for jc, vc := range ctmp { + j := jc + i + ctmp[jc] = vc*beta + alpha*f32.DotUnitary(atmp, a[j*lda:j*lda+k]) + } + } + } + return + } + for i := 0; i < n; i++ { + ctmp := c[i*ldc : i*ldc+i+1] + atmp := a[i*lda : i*lda+k] + if beta == 0 { + for j := range ctmp { + ctmp[j] = alpha * f32.DotUnitary(a[j*lda:j*lda+k], atmp) + } + } else { + for j, vc := range ctmp { + ctmp[j] = vc*beta + alpha*f32.DotUnitary(a[j*lda:j*lda+k], atmp) + } + } + } + return + } + // Cases where a is transposed. + if ul == blas.Upper { + for i := 0; i < n; i++ { + ctmp := c[i*ldc+i : i*ldc+n] + if beta == 0 { + for j := range ctmp { + ctmp[j] = 0 + } + } else if beta != 1 { + for j := range ctmp { + ctmp[j] *= beta + } + } + for l := 0; l < k; l++ { + tmp := alpha * a[l*lda+i] + if tmp != 0 { + f32.AxpyUnitary(tmp, a[l*lda+i:l*lda+n], ctmp) + } + } + } + return + } + for i := 0; i < n; i++ { + ctmp := c[i*ldc : i*ldc+i+1] + if beta != 1 { + for j := range ctmp { + ctmp[j] *= beta + } + } + for l := 0; l < k; l++ { + tmp := alpha * a[l*lda+i] + if tmp != 0 { + f32.AxpyUnitary(tmp, a[l*lda:l*lda+i+1], ctmp) + } + } + } +} + +// Ssyr2k performs one of the symmetric rank 2k operations +// +// C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C if tA == blas.NoTrans +// C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C if tA == blas.Trans or tA == blas.ConjTrans +// +// where A and B are n×k or k×n matrices, C is an n×n symmetric matrix, and +// alpha and beta are scalars. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Ssyr2k(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans { + panic(badTranspose) + } + if n < 0 { + panic(nLT0) + } + if k < 0 { + panic(kLT0) + } + row, col := k, n + if tA == blas.NoTrans { + row, col = n, k + } + if lda < max(1, col) { + panic(badLdA) + } + if ldb < max(1, col) { + panic(badLdB) + } + if ldc < max(1, n) { + panic(badLdC) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. 
+ if len(a) < lda*(row-1)+col { + panic(shortA) + } + if len(b) < ldb*(row-1)+col { + panic(shortB) + } + if len(c) < ldc*(n-1)+n { + panic(shortC) + } + + if alpha == 0 { + if beta == 0 { + if ul == blas.Upper { + for i := 0; i < n; i++ { + ctmp := c[i*ldc+i : i*ldc+n] + for j := range ctmp { + ctmp[j] = 0 + } + } + return + } + for i := 0; i < n; i++ { + ctmp := c[i*ldc : i*ldc+i+1] + for j := range ctmp { + ctmp[j] = 0 + } + } + return + } + if ul == blas.Upper { + for i := 0; i < n; i++ { + ctmp := c[i*ldc+i : i*ldc+n] + for j := range ctmp { + ctmp[j] *= beta + } + } + return + } + for i := 0; i < n; i++ { + ctmp := c[i*ldc : i*ldc+i+1] + for j := range ctmp { + ctmp[j] *= beta + } + } + return + } + if tA == blas.NoTrans { + if ul == blas.Upper { + for i := 0; i < n; i++ { + atmp := a[i*lda : i*lda+k] + btmp := b[i*ldb : i*ldb+k] + ctmp := c[i*ldc+i : i*ldc+n] + if beta == 0 { + for jc := range ctmp { + j := i + jc + var tmp1, tmp2 float32 + binner := b[j*ldb : j*ldb+k] + for l, v := range a[j*lda : j*lda+k] { + tmp1 += v * btmp[l] + tmp2 += atmp[l] * binner[l] + } + ctmp[jc] = alpha * (tmp1 + tmp2) + } + } else { + for jc := range ctmp { + j := i + jc + var tmp1, tmp2 float32 + binner := b[j*ldb : j*ldb+k] + for l, v := range a[j*lda : j*lda+k] { + tmp1 += v * btmp[l] + tmp2 += atmp[l] * binner[l] + } + ctmp[jc] *= beta + ctmp[jc] += alpha * (tmp1 + tmp2) + } + } + } + return + } + for i := 0; i < n; i++ { + atmp := a[i*lda : i*lda+k] + btmp := b[i*ldb : i*ldb+k] + ctmp := c[i*ldc : i*ldc+i+1] + if beta == 0 { + for j := 0; j <= i; j++ { + var tmp1, tmp2 float32 + binner := b[j*ldb : j*ldb+k] + for l, v := range a[j*lda : j*lda+k] { + tmp1 += v * btmp[l] + tmp2 += atmp[l] * binner[l] + } + ctmp[j] = alpha * (tmp1 + tmp2) + } + } else { + for j := 0; j <= i; j++ { + var tmp1, tmp2 float32 + binner := b[j*ldb : j*ldb+k] + for l, v := range a[j*lda : j*lda+k] { + tmp1 += v * btmp[l] + tmp2 += atmp[l] * binner[l] + } + ctmp[j] *= beta + ctmp[j] += alpha * (tmp1 + tmp2) + } + } + } + return + } + if ul == blas.Upper { + for i := 0; i < n; i++ { + ctmp := c[i*ldc+i : i*ldc+n] + switch beta { + case 0: + for j := range ctmp { + ctmp[j] = 0 + } + case 1: + default: + for j := range ctmp { + ctmp[j] *= beta + } + } + for l := 0; l < k; l++ { + tmp1 := alpha * b[l*ldb+i] + tmp2 := alpha * a[l*lda+i] + btmp := b[l*ldb+i : l*ldb+n] + if tmp1 != 0 || tmp2 != 0 { + for j, v := range a[l*lda+i : l*lda+n] { + ctmp[j] += v*tmp1 + btmp[j]*tmp2 + } + } + } + } + return + } + for i := 0; i < n; i++ { + ctmp := c[i*ldc : i*ldc+i+1] + switch beta { + case 0: + for j := range ctmp { + ctmp[j] = 0 + } + case 1: + default: + for j := range ctmp { + ctmp[j] *= beta + } + } + for l := 0; l < k; l++ { + tmp1 := alpha * b[l*ldb+i] + tmp2 := alpha * a[l*lda+i] + btmp := b[l*ldb : l*ldb+i+1] + if tmp1 != 0 || tmp2 != 0 { + for j, v := range a[l*lda : l*lda+i+1] { + ctmp[j] += v*tmp1 + btmp[j]*tmp2 + } + } + } + } +} + +// Strmm performs one of the matrix-matrix operations +// +// B = alpha * A * B if tA == blas.NoTrans and side == blas.Left +// B = alpha * Aᵀ * B if tA == blas.Trans or blas.ConjTrans, and side == blas.Left +// B = alpha * B * A if tA == blas.NoTrans and side == blas.Right +// B = alpha * B * Aᵀ if tA == blas.Trans or blas.ConjTrans, and side == blas.Right +// +// where A is an n×n or m×m triangular matrix, B is an m×n matrix, and alpha is a scalar. +// +// Float32 implementations are autogenerated and not directly tested. 
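+//
+// B is overwritten by the result; only the triangle of A selected by ul is read.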
+func (Implementation) Strmm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int) { + if s != blas.Left && s != blas.Right { + panic(badSide) + } + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if d != blas.NonUnit && d != blas.Unit { + panic(badDiag) + } + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + k := n + if s == blas.Left { + k = m + } + if lda < max(1, k) { + panic(badLdA) + } + if ldb < max(1, n) { + panic(badLdB) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(k-1)+k { + panic(shortA) + } + if len(b) < ldb*(m-1)+n { + panic(shortB) + } + + if alpha == 0 { + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + for j := range btmp { + btmp[j] = 0 + } + } + return + } + + nonUnit := d == blas.NonUnit + if s == blas.Left { + if tA == blas.NoTrans { + if ul == blas.Upper { + for i := 0; i < m; i++ { + tmp := alpha + if nonUnit { + tmp *= a[i*lda+i] + } + btmp := b[i*ldb : i*ldb+n] + f32.ScalUnitary(tmp, btmp) + for ka, va := range a[i*lda+i+1 : i*lda+m] { + k := ka + i + 1 + if va != 0 { + f32.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp) + } + } + } + return + } + for i := m - 1; i >= 0; i-- { + tmp := alpha + if nonUnit { + tmp *= a[i*lda+i] + } + btmp := b[i*ldb : i*ldb+n] + f32.ScalUnitary(tmp, btmp) + for k, va := range a[i*lda : i*lda+i] { + if va != 0 { + f32.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp) + } + } + } + return + } + // Cases where a is transposed. + if ul == blas.Upper { + for k := m - 1; k >= 0; k-- { + btmpk := b[k*ldb : k*ldb+n] + for ia, va := range a[k*lda+k+1 : k*lda+m] { + i := ia + k + 1 + btmp := b[i*ldb : i*ldb+n] + if va != 0 { + f32.AxpyUnitary(alpha*va, btmpk, btmp) + } + } + tmp := alpha + if nonUnit { + tmp *= a[k*lda+k] + } + if tmp != 1 { + f32.ScalUnitary(tmp, btmpk) + } + } + return + } + for k := 0; k < m; k++ { + btmpk := b[k*ldb : k*ldb+n] + for i, va := range a[k*lda : k*lda+k] { + btmp := b[i*ldb : i*ldb+n] + if va != 0 { + f32.AxpyUnitary(alpha*va, btmpk, btmp) + } + } + tmp := alpha + if nonUnit { + tmp *= a[k*lda+k] + } + if tmp != 1 { + f32.ScalUnitary(tmp, btmpk) + } + } + return + } + // Cases where a is on the right + if tA == blas.NoTrans { + if ul == blas.Upper { + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + for k := n - 1; k >= 0; k-- { + tmp := alpha * btmp[k] + if tmp == 0 { + continue + } + btmp[k] = tmp + if nonUnit { + btmp[k] *= a[k*lda+k] + } + f32.AxpyUnitary(tmp, a[k*lda+k+1:k*lda+n], btmp[k+1:n]) + } + } + return + } + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + for k := 0; k < n; k++ { + tmp := alpha * btmp[k] + if tmp == 0 { + continue + } + btmp[k] = tmp + if nonUnit { + btmp[k] *= a[k*lda+k] + } + f32.AxpyUnitary(tmp, a[k*lda:k*lda+k], btmp[:k]) + } + } + return + } + // Cases where a is transposed. 
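+	// Each btmp[j] gathers the dot product of row i of B with the stored part of
+	// row j of A, then is scaled by alpha.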
+ if ul == blas.Upper { + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + for j, vb := range btmp { + tmp := vb + if nonUnit { + tmp *= a[j*lda+j] + } + tmp += f32.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:n]) + btmp[j] = alpha * tmp + } + } + return + } + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + for j := n - 1; j >= 0; j-- { + tmp := btmp[j] + if nonUnit { + tmp *= a[j*lda+j] + } + tmp += f32.DotUnitary(a[j*lda:j*lda+j], btmp[:j]) + btmp[j] = alpha * tmp + } + } +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level3float64.go b/vendor/gonum.org/v1/gonum/blas/gonum/level3float64.go new file mode 100644 index 0000000000..0d203513c1 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/level3float64.go @@ -0,0 +1,913 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/internal/asm/f64" +) + +var _ blas.Float64Level3 = Implementation{} + +// Dtrsm solves one of the matrix equations +// +// A * X = alpha * B if tA == blas.NoTrans and side == blas.Left +// Aᵀ * X = alpha * B if tA == blas.Trans or blas.ConjTrans, and side == blas.Left +// X * A = alpha * B if tA == blas.NoTrans and side == blas.Right +// X * Aᵀ = alpha * B if tA == blas.Trans or blas.ConjTrans, and side == blas.Right +// +// where A is an n×n or m×m triangular matrix, X and B are m×n matrices, and alpha is a +// scalar. +// +// At entry to the function, X contains the values of B, and the result is +// stored in-place into X. +// +// No check is made that A is invertible. +func (Implementation) Dtrsm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int) { + if s != blas.Left && s != blas.Right { + panic(badSide) + } + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if d != blas.NonUnit && d != blas.Unit { + panic(badDiag) + } + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + k := n + if s == blas.Left { + k = m + } + if lda < max(1, k) { + panic(badLdA) + } + if ldb < max(1, n) { + panic(badLdB) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. 
+ if len(a) < lda*(k-1)+k { + panic(shortA) + } + if len(b) < ldb*(m-1)+n { + panic(shortB) + } + + if alpha == 0 { + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + for j := range btmp { + btmp[j] = 0 + } + } + return + } + nonUnit := d == blas.NonUnit + if s == blas.Left { + if tA == blas.NoTrans { + if ul == blas.Upper { + for i := m - 1; i >= 0; i-- { + btmp := b[i*ldb : i*ldb+n] + if alpha != 1 { + f64.ScalUnitary(alpha, btmp) + } + for ka, va := range a[i*lda+i+1 : i*lda+m] { + if va != 0 { + k := ka + i + 1 + f64.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp) + } + } + if nonUnit { + tmp := 1 / a[i*lda+i] + f64.ScalUnitary(tmp, btmp) + } + } + return + } + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + if alpha != 1 { + f64.ScalUnitary(alpha, btmp) + } + for k, va := range a[i*lda : i*lda+i] { + if va != 0 { + f64.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp) + } + } + if nonUnit { + tmp := 1 / a[i*lda+i] + f64.ScalUnitary(tmp, btmp) + } + } + return + } + // Cases where a is transposed + if ul == blas.Upper { + for k := 0; k < m; k++ { + btmpk := b[k*ldb : k*ldb+n] + if nonUnit { + tmp := 1 / a[k*lda+k] + f64.ScalUnitary(tmp, btmpk) + } + for ia, va := range a[k*lda+k+1 : k*lda+m] { + if va != 0 { + i := ia + k + 1 + f64.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n]) + } + } + if alpha != 1 { + f64.ScalUnitary(alpha, btmpk) + } + } + return + } + for k := m - 1; k >= 0; k-- { + btmpk := b[k*ldb : k*ldb+n] + if nonUnit { + tmp := 1 / a[k*lda+k] + f64.ScalUnitary(tmp, btmpk) + } + for i, va := range a[k*lda : k*lda+k] { + if va != 0 { + f64.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n]) + } + } + if alpha != 1 { + f64.ScalUnitary(alpha, btmpk) + } + } + return + } + // Cases where a is to the right of X. + if tA == blas.NoTrans { + if ul == blas.Upper { + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + if alpha != 1 { + f64.ScalUnitary(alpha, btmp) + } + for k, vb := range btmp { + if vb == 0 { + continue + } + if nonUnit { + btmp[k] /= a[k*lda+k] + } + f64.AxpyUnitary(-btmp[k], a[k*lda+k+1:k*lda+n], btmp[k+1:n]) + } + } + return + } + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + if alpha != 1 { + f64.ScalUnitary(alpha, btmp) + } + for k := n - 1; k >= 0; k-- { + if btmp[k] == 0 { + continue + } + if nonUnit { + btmp[k] /= a[k*lda+k] + } + f64.AxpyUnitary(-btmp[k], a[k*lda:k*lda+k], btmp[:k]) + } + } + return + } + // Cases where a is transposed. + if ul == blas.Upper { + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + for j := n - 1; j >= 0; j-- { + tmp := alpha*btmp[j] - f64.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:]) + if nonUnit { + tmp /= a[j*lda+j] + } + btmp[j] = tmp + } + } + return + } + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + for j := 0; j < n; j++ { + tmp := alpha*btmp[j] - f64.DotUnitary(a[j*lda:j*lda+j], btmp[:j]) + if nonUnit { + tmp /= a[j*lda+j] + } + btmp[j] = tmp + } + } +} + +// Dsymm performs one of the matrix-matrix operations +// +// C = alpha * A * B + beta * C if side == blas.Left +// C = alpha * B * A + beta * C if side == blas.Right +// +// where A is an n×n or m×m symmetric matrix, B and C are m×n matrices, and alpha +// is a scalar. 
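+//
+// Only the triangle of A selected by ul is referenced.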
+func (Implementation) Dsymm(s blas.Side, ul blas.Uplo, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) { + if s != blas.Right && s != blas.Left { + panic(badSide) + } + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + k := n + if s == blas.Left { + k = m + } + if lda < max(1, k) { + panic(badLdA) + } + if ldb < max(1, n) { + panic(badLdB) + } + if ldc < max(1, n) { + panic(badLdC) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(k-1)+k { + panic(shortA) + } + if len(b) < ldb*(m-1)+n { + panic(shortB) + } + if len(c) < ldc*(m-1)+n { + panic(shortC) + } + + // Quick return if possible. + if alpha == 0 && beta == 1 { + return + } + + if beta == 0 { + for i := 0; i < m; i++ { + ctmp := c[i*ldc : i*ldc+n] + for j := range ctmp { + ctmp[j] = 0 + } + } + } + + if alpha == 0 { + if beta != 0 { + for i := 0; i < m; i++ { + ctmp := c[i*ldc : i*ldc+n] + for j := 0; j < n; j++ { + ctmp[j] *= beta + } + } + } + return + } + + isUpper := ul == blas.Upper + if s == blas.Left { + for i := 0; i < m; i++ { + atmp := alpha * a[i*lda+i] + btmp := b[i*ldb : i*ldb+n] + ctmp := c[i*ldc : i*ldc+n] + for j, v := range btmp { + ctmp[j] *= beta + ctmp[j] += atmp * v + } + + for k := 0; k < i; k++ { + var atmp float64 + if isUpper { + atmp = a[k*lda+i] + } else { + atmp = a[i*lda+k] + } + atmp *= alpha + f64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp) + } + for k := i + 1; k < m; k++ { + var atmp float64 + if isUpper { + atmp = a[i*lda+k] + } else { + atmp = a[k*lda+i] + } + atmp *= alpha + f64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp) + } + } + return + } + if isUpper { + for i := 0; i < m; i++ { + for j := n - 1; j >= 0; j-- { + tmp := alpha * b[i*ldb+j] + var tmp2 float64 + atmp := a[j*lda+j+1 : j*lda+n] + btmp := b[i*ldb+j+1 : i*ldb+n] + ctmp := c[i*ldc+j+1 : i*ldc+n] + for k, v := range atmp { + ctmp[k] += tmp * v + tmp2 += btmp[k] * v + } + c[i*ldc+j] *= beta + c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2 + } + } + return + } + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + tmp := alpha * b[i*ldb+j] + var tmp2 float64 + atmp := a[j*lda : j*lda+j] + btmp := b[i*ldb : i*ldb+j] + ctmp := c[i*ldc : i*ldc+j] + for k, v := range atmp { + ctmp[k] += tmp * v + tmp2 += btmp[k] * v + } + c[i*ldc+j] *= beta + c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2 + } + } +} + +// Dsyrk performs one of the symmetric rank-k operations +// +// C = alpha * A * Aᵀ + beta * C if tA == blas.NoTrans +// C = alpha * Aᵀ * A + beta * C if tA == blas.Trans or tA == blas.ConjTrans +// +// where A is an n×k or k×n matrix, C is an n×n symmetric matrix, and alpha and +// beta are scalars. +func (Implementation) Dsyrk(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float64, a []float64, lda int, beta float64, c []float64, ldc int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans { + panic(badTranspose) + } + if n < 0 { + panic(nLT0) + } + if k < 0 { + panic(kLT0) + } + row, col := k, n + if tA == blas.NoTrans { + row, col = n, k + } + if lda < max(1, col) { + panic(badLdA) + } + if ldc < max(1, n) { + panic(badLdC) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. 
+ if len(a) < lda*(row-1)+col { + panic(shortA) + } + if len(c) < ldc*(n-1)+n { + panic(shortC) + } + + if alpha == 0 { + if beta == 0 { + if ul == blas.Upper { + for i := 0; i < n; i++ { + ctmp := c[i*ldc+i : i*ldc+n] + for j := range ctmp { + ctmp[j] = 0 + } + } + return + } + for i := 0; i < n; i++ { + ctmp := c[i*ldc : i*ldc+i+1] + for j := range ctmp { + ctmp[j] = 0 + } + } + return + } + if ul == blas.Upper { + for i := 0; i < n; i++ { + ctmp := c[i*ldc+i : i*ldc+n] + for j := range ctmp { + ctmp[j] *= beta + } + } + return + } + for i := 0; i < n; i++ { + ctmp := c[i*ldc : i*ldc+i+1] + for j := range ctmp { + ctmp[j] *= beta + } + } + return + } + if tA == blas.NoTrans { + if ul == blas.Upper { + for i := 0; i < n; i++ { + ctmp := c[i*ldc+i : i*ldc+n] + atmp := a[i*lda : i*lda+k] + if beta == 0 { + for jc := range ctmp { + j := jc + i + ctmp[jc] = alpha * f64.DotUnitary(atmp, a[j*lda:j*lda+k]) + } + } else { + for jc, vc := range ctmp { + j := jc + i + ctmp[jc] = vc*beta + alpha*f64.DotUnitary(atmp, a[j*lda:j*lda+k]) + } + } + } + return + } + for i := 0; i < n; i++ { + ctmp := c[i*ldc : i*ldc+i+1] + atmp := a[i*lda : i*lda+k] + if beta == 0 { + for j := range ctmp { + ctmp[j] = alpha * f64.DotUnitary(a[j*lda:j*lda+k], atmp) + } + } else { + for j, vc := range ctmp { + ctmp[j] = vc*beta + alpha*f64.DotUnitary(a[j*lda:j*lda+k], atmp) + } + } + } + return + } + // Cases where a is transposed. + if ul == blas.Upper { + for i := 0; i < n; i++ { + ctmp := c[i*ldc+i : i*ldc+n] + if beta == 0 { + for j := range ctmp { + ctmp[j] = 0 + } + } else if beta != 1 { + for j := range ctmp { + ctmp[j] *= beta + } + } + for l := 0; l < k; l++ { + tmp := alpha * a[l*lda+i] + if tmp != 0 { + f64.AxpyUnitary(tmp, a[l*lda+i:l*lda+n], ctmp) + } + } + } + return + } + for i := 0; i < n; i++ { + ctmp := c[i*ldc : i*ldc+i+1] + if beta != 1 { + for j := range ctmp { + ctmp[j] *= beta + } + } + for l := 0; l < k; l++ { + tmp := alpha * a[l*lda+i] + if tmp != 0 { + f64.AxpyUnitary(tmp, a[l*lda:l*lda+i+1], ctmp) + } + } + } +} + +// Dsyr2k performs one of the symmetric rank 2k operations +// +// C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C if tA == blas.NoTrans +// C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C if tA == blas.Trans or tA == blas.ConjTrans +// +// where A and B are n×k or k×n matrices, C is an n×n symmetric matrix, and +// alpha and beta are scalars. +func (Implementation) Dsyr2k(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) { + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans { + panic(badTranspose) + } + if n < 0 { + panic(nLT0) + } + if k < 0 { + panic(kLT0) + } + row, col := k, n + if tA == blas.NoTrans { + row, col = n, k + } + if lda < max(1, col) { + panic(badLdA) + } + if ldb < max(1, col) { + panic(badLdB) + } + if ldc < max(1, n) { + panic(badLdC) + } + + // Quick return if possible. + if n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. 
+ if len(a) < lda*(row-1)+col { + panic(shortA) + } + if len(b) < ldb*(row-1)+col { + panic(shortB) + } + if len(c) < ldc*(n-1)+n { + panic(shortC) + } + + if alpha == 0 { + if beta == 0 { + if ul == blas.Upper { + for i := 0; i < n; i++ { + ctmp := c[i*ldc+i : i*ldc+n] + for j := range ctmp { + ctmp[j] = 0 + } + } + return + } + for i := 0; i < n; i++ { + ctmp := c[i*ldc : i*ldc+i+1] + for j := range ctmp { + ctmp[j] = 0 + } + } + return + } + if ul == blas.Upper { + for i := 0; i < n; i++ { + ctmp := c[i*ldc+i : i*ldc+n] + for j := range ctmp { + ctmp[j] *= beta + } + } + return + } + for i := 0; i < n; i++ { + ctmp := c[i*ldc : i*ldc+i+1] + for j := range ctmp { + ctmp[j] *= beta + } + } + return + } + if tA == blas.NoTrans { + if ul == blas.Upper { + for i := 0; i < n; i++ { + atmp := a[i*lda : i*lda+k] + btmp := b[i*ldb : i*ldb+k] + ctmp := c[i*ldc+i : i*ldc+n] + if beta == 0 { + for jc := range ctmp { + j := i + jc + var tmp1, tmp2 float64 + binner := b[j*ldb : j*ldb+k] + for l, v := range a[j*lda : j*lda+k] { + tmp1 += v * btmp[l] + tmp2 += atmp[l] * binner[l] + } + ctmp[jc] = alpha * (tmp1 + tmp2) + } + } else { + for jc := range ctmp { + j := i + jc + var tmp1, tmp2 float64 + binner := b[j*ldb : j*ldb+k] + for l, v := range a[j*lda : j*lda+k] { + tmp1 += v * btmp[l] + tmp2 += atmp[l] * binner[l] + } + ctmp[jc] *= beta + ctmp[jc] += alpha * (tmp1 + tmp2) + } + } + } + return + } + for i := 0; i < n; i++ { + atmp := a[i*lda : i*lda+k] + btmp := b[i*ldb : i*ldb+k] + ctmp := c[i*ldc : i*ldc+i+1] + if beta == 0 { + for j := 0; j <= i; j++ { + var tmp1, tmp2 float64 + binner := b[j*ldb : j*ldb+k] + for l, v := range a[j*lda : j*lda+k] { + tmp1 += v * btmp[l] + tmp2 += atmp[l] * binner[l] + } + ctmp[j] = alpha * (tmp1 + tmp2) + } + } else { + for j := 0; j <= i; j++ { + var tmp1, tmp2 float64 + binner := b[j*ldb : j*ldb+k] + for l, v := range a[j*lda : j*lda+k] { + tmp1 += v * btmp[l] + tmp2 += atmp[l] * binner[l] + } + ctmp[j] *= beta + ctmp[j] += alpha * (tmp1 + tmp2) + } + } + } + return + } + if ul == blas.Upper { + for i := 0; i < n; i++ { + ctmp := c[i*ldc+i : i*ldc+n] + switch beta { + case 0: + for j := range ctmp { + ctmp[j] = 0 + } + case 1: + default: + for j := range ctmp { + ctmp[j] *= beta + } + } + for l := 0; l < k; l++ { + tmp1 := alpha * b[l*ldb+i] + tmp2 := alpha * a[l*lda+i] + btmp := b[l*ldb+i : l*ldb+n] + if tmp1 != 0 || tmp2 != 0 { + for j, v := range a[l*lda+i : l*lda+n] { + ctmp[j] += v*tmp1 + btmp[j]*tmp2 + } + } + } + } + return + } + for i := 0; i < n; i++ { + ctmp := c[i*ldc : i*ldc+i+1] + switch beta { + case 0: + for j := range ctmp { + ctmp[j] = 0 + } + case 1: + default: + for j := range ctmp { + ctmp[j] *= beta + } + } + for l := 0; l < k; l++ { + tmp1 := alpha * b[l*ldb+i] + tmp2 := alpha * a[l*lda+i] + btmp := b[l*ldb : l*ldb+i+1] + if tmp1 != 0 || tmp2 != 0 { + for j, v := range a[l*lda : l*lda+i+1] { + ctmp[j] += v*tmp1 + btmp[j]*tmp2 + } + } + } + } +} + +// Dtrmm performs one of the matrix-matrix operations +// +// B = alpha * A * B if tA == blas.NoTrans and side == blas.Left +// B = alpha * Aᵀ * B if tA == blas.Trans or blas.ConjTrans, and side == blas.Left +// B = alpha * B * A if tA == blas.NoTrans and side == blas.Right +// B = alpha * B * Aᵀ if tA == blas.Trans or blas.ConjTrans, and side == blas.Right +// +// where A is an n×n or m×m triangular matrix, B is an m×n matrix, and alpha is a scalar. 
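+//
+// The result overwrites B; elements of A outside the triangle selected by ul
+// are never referenced.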
+func (Implementation) Dtrmm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int) { + if s != blas.Left && s != blas.Right { + panic(badSide) + } + if ul != blas.Lower && ul != blas.Upper { + panic(badUplo) + } + if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans { + panic(badTranspose) + } + if d != blas.NonUnit && d != blas.Unit { + panic(badDiag) + } + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + k := n + if s == blas.Left { + k = m + } + if lda < max(1, k) { + panic(badLdA) + } + if ldb < max(1, n) { + panic(badLdB) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if len(a) < lda*(k-1)+k { + panic(shortA) + } + if len(b) < ldb*(m-1)+n { + panic(shortB) + } + + if alpha == 0 { + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + for j := range btmp { + btmp[j] = 0 + } + } + return + } + + nonUnit := d == blas.NonUnit + if s == blas.Left { + if tA == blas.NoTrans { + if ul == blas.Upper { + for i := 0; i < m; i++ { + tmp := alpha + if nonUnit { + tmp *= a[i*lda+i] + } + btmp := b[i*ldb : i*ldb+n] + f64.ScalUnitary(tmp, btmp) + for ka, va := range a[i*lda+i+1 : i*lda+m] { + k := ka + i + 1 + if va != 0 { + f64.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp) + } + } + } + return + } + for i := m - 1; i >= 0; i-- { + tmp := alpha + if nonUnit { + tmp *= a[i*lda+i] + } + btmp := b[i*ldb : i*ldb+n] + f64.ScalUnitary(tmp, btmp) + for k, va := range a[i*lda : i*lda+i] { + if va != 0 { + f64.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp) + } + } + } + return + } + // Cases where a is transposed. + if ul == blas.Upper { + for k := m - 1; k >= 0; k-- { + btmpk := b[k*ldb : k*ldb+n] + for ia, va := range a[k*lda+k+1 : k*lda+m] { + i := ia + k + 1 + btmp := b[i*ldb : i*ldb+n] + if va != 0 { + f64.AxpyUnitary(alpha*va, btmpk, btmp) + } + } + tmp := alpha + if nonUnit { + tmp *= a[k*lda+k] + } + if tmp != 1 { + f64.ScalUnitary(tmp, btmpk) + } + } + return + } + for k := 0; k < m; k++ { + btmpk := b[k*ldb : k*ldb+n] + for i, va := range a[k*lda : k*lda+k] { + btmp := b[i*ldb : i*ldb+n] + if va != 0 { + f64.AxpyUnitary(alpha*va, btmpk, btmp) + } + } + tmp := alpha + if nonUnit { + tmp *= a[k*lda+k] + } + if tmp != 1 { + f64.ScalUnitary(tmp, btmpk) + } + } + return + } + // Cases where a is on the right + if tA == blas.NoTrans { + if ul == blas.Upper { + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + for k := n - 1; k >= 0; k-- { + tmp := alpha * btmp[k] + if tmp == 0 { + continue + } + btmp[k] = tmp + if nonUnit { + btmp[k] *= a[k*lda+k] + } + f64.AxpyUnitary(tmp, a[k*lda+k+1:k*lda+n], btmp[k+1:n]) + } + } + return + } + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + for k := 0; k < n; k++ { + tmp := alpha * btmp[k] + if tmp == 0 { + continue + } + btmp[k] = tmp + if nonUnit { + btmp[k] *= a[k*lda+k] + } + f64.AxpyUnitary(tmp, a[k*lda:k*lda+k], btmp[:k]) + } + } + return + } + // Cases where a is transposed. 
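+	// b[i][j] becomes alpha times the dot product of row i of B with the stored
+	// part of row j of A.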
+ if ul == blas.Upper { + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + for j, vb := range btmp { + tmp := vb + if nonUnit { + tmp *= a[j*lda+j] + } + tmp += f64.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:n]) + btmp[j] = alpha * tmp + } + } + return + } + for i := 0; i < m; i++ { + btmp := b[i*ldb : i*ldb+n] + for j := n - 1; j >= 0; j-- { + tmp := btmp[j] + if nonUnit { + tmp *= a[j*lda+j] + } + tmp += f64.DotUnitary(a[j*lda:j*lda+j], btmp[:j]) + btmp[j] = alpha * tmp + } + } +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/sgemm.go b/vendor/gonum.org/v1/gonum/blas/gonum/sgemm.go new file mode 100644 index 0000000000..7b03ce46a8 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/sgemm.go @@ -0,0 +1,301 @@ +// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT. + +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "runtime" + "sync" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/internal/asm/f32" +) + +// Sgemm performs one of the matrix-matrix operations +// +// C = alpha * A * B + beta * C +// C = alpha * Aᵀ * B + beta * C +// C = alpha * A * Bᵀ + beta * C +// C = alpha * Aᵀ * Bᵀ + beta * C +// +// where A is an m×k or k×m dense matrix, B is an n×k or k×n dense matrix, C is +// an m×n matrix, and alpha and beta are scalars. tA and tB specify whether A or +// B are transposed. +// +// Float32 implementations are autogenerated and not directly tested. +func (Implementation) Sgemm(tA, tB blas.Transpose, m, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) { + switch tA { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + switch tB { + default: + panic(badTranspose) + case blas.NoTrans, blas.Trans, blas.ConjTrans: + } + if m < 0 { + panic(mLT0) + } + if n < 0 { + panic(nLT0) + } + if k < 0 { + panic(kLT0) + } + aTrans := tA == blas.Trans || tA == blas.ConjTrans + if aTrans { + if lda < max(1, m) { + panic(badLdA) + } + } else { + if lda < max(1, k) { + panic(badLdA) + } + } + bTrans := tB == blas.Trans || tB == blas.ConjTrans + if bTrans { + if ldb < max(1, k) { + panic(badLdB) + } + } else { + if ldb < max(1, n) { + panic(badLdB) + } + } + if ldc < max(1, n) { + panic(badLdC) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + // For zero matrix size the following slice length checks are trivially satisfied. + if aTrans { + if len(a) < (k-1)*lda+m { + panic(shortA) + } + } else { + if len(a) < (m-1)*lda+k { + panic(shortA) + } + } + if bTrans { + if len(b) < (n-1)*ldb+k { + panic(shortB) + } + } else { + if len(b) < (k-1)*ldb+n { + panic(shortB) + } + } + if len(c) < (m-1)*ldc+n { + panic(shortC) + } + + // Quick return if possible. 
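+	// alpha == 0 or an empty inner dimension k leaves only beta*C, a no-op for beta == 1.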
+	if (alpha == 0 || k == 0) && beta == 1 {
+		return
+	}
+
+	// Scale c.
+	if beta != 1 {
+		if beta == 0 {
+			for i := 0; i < m; i++ {
+				ctmp := c[i*ldc : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			}
+		} else {
+			for i := 0; i < m; i++ {
+				ctmp := c[i*ldc : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+		}
+	}
+
+	sgemmParallel(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha)
+}
+
+func sgemmParallel(aTrans, bTrans bool, m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// sgemmParallel computes a parallel matrix multiplication by partitioning
+	// a and b into sub-blocks and updating c with the multiplication of the
+	// sub-blocks. In all cases,
+	//
+	//	A = [ A_11 A_12 ... A_1j
+	//	      A_21 A_22 ... A_2j
+	//	      ...
+	//	      A_i1 A_i2 ... A_ij ]
+	//
+	// and the same for B. All of the submatrix sizes are blockSize×blockSize except
+	// at the edges.
+	//
+	// In all cases, there is one dimension for each matrix along which
+	// C must be updated sequentially.
+	//
+	//	Cij = \sum_k Aik Bkj,	(A * B)
+	//	Cij = \sum_k Aki Bkj,	(Aᵀ * B)
+	//	Cij = \sum_k Aik Bjk,	(A * Bᵀ)
+	//	Cij = \sum_k Aki Bjk,	(Aᵀ * Bᵀ)
+	//
+	// This code computes one {i, j} block sequentially along the k dimension,
+	// and computes all of the {i, j} blocks concurrently. This
+	// partitioning allows Cij to be updated in-place without race conditions.
+	// Instead of launching a goroutine for each possible concurrent computation,
+	// a number of worker goroutines are created and channels are used to pass
+	// available and completed cases.
+	//
+	// http://alexkr.com/docs/matrixmult.pdf is a good reference on matrix-matrix
+	// multiplies, though this code does not copy matrices to attempt to eliminate
+	// cache misses.
+
+	maxKLen := k
+	parBlocks := blocks(m, blockSize) * blocks(n, blockSize)
+	if parBlocks < minParBlock {
+		// The matrix multiplication is small in the dimensions where it can be
+		// computed concurrently. Just do it in serial.
+		sgemmSerial(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	}
+
+	// workerLimit acts as a limit on the number of concurrent workers,
+	// with the limit set to the number of procs available.
+	workerLimit := make(chan struct{}, runtime.GOMAXPROCS(0))
+
+	// wg is used to wait for all workers to finish.
+	var wg sync.WaitGroup
+	wg.Add(parBlocks)
+	defer wg.Wait()
+
+	for i := 0; i < m; i += blockSize {
+		for j := 0; j < n; j += blockSize {
+			workerLimit <- struct{}{}
+			go func(i, j int) {
+				defer func() {
+					wg.Done()
+					<-workerLimit
+				}()
+
+				leni := blockSize
+				if i+leni > m {
+					leni = m - i
+				}
+				lenj := blockSize
+				if j+lenj > n {
+					lenj = n - j
+				}
+
+				cSub := sliceView32(c, ldc, i, j, leni, lenj)
+
+				// Compute A_ik B_kj for all k.
+				for k := 0; k < maxKLen; k += blockSize {
+					lenk := blockSize
+					if k+lenk > maxKLen {
+						lenk = maxKLen - k
+					}
+					var aSub, bSub []float32
+					if aTrans {
+						aSub = sliceView32(a, lda, k, i, lenk, leni)
+					} else {
+						aSub = sliceView32(a, lda, i, k, leni, lenk)
+					}
+					if bTrans {
+						bSub = sliceView32(b, ldb, j, k, lenj, lenk)
+					} else {
+						bSub = sliceView32(b, ldb, k, j, lenk, lenj)
+					}
+					sgemmSerial(aTrans, bTrans, leni, lenj, lenk, aSub, lda, bSub, ldb, cSub, ldc, alpha)
+				}
+			}(i, j)
+		}
+	}
+}
+
+// sgemmSerial is a serial matrix multiply dispatcher.
+func sgemmSerial(aTrans, bTrans bool, m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	switch {
+	case !aTrans && !bTrans:
+		sgemmSerialNotNot(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	case aTrans && !bTrans:
+		sgemmSerialTransNot(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	case !aTrans && bTrans:
+		sgemmSerialNotTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	case aTrans && bTrans:
+		sgemmSerialTransTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	default:
+		panic("unreachable")
+	}
+}
+
+// sgemmSerialNotNot handles the case where neither a nor b is transposed.
+func sgemmSerialNotNot(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// This subslicing style is used instead of the literal c[i*ldc+j] because it
+	// was approximately 5 times faster as of go 1.3.
+	for i := 0; i < m; i++ {
+		ctmp := c[i*ldc : i*ldc+n]
+		for l, v := range a[i*lda : i*lda+k] {
+			tmp := alpha * v
+			if tmp != 0 {
+				f32.AxpyUnitary(tmp, b[l*ldb:l*ldb+n], ctmp)
+			}
+		}
+	}
+}
+
+// sgemmSerialTransNot handles the case where a is transposed and b is not.
+func sgemmSerialTransNot(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// This subslicing style is used instead of the literal c[i*ldc+j] because it
+	// was approximately 5 times faster as of go 1.3.
+	for l := 0; l < k; l++ {
+		btmp := b[l*ldb : l*ldb+n]
+		for i, v := range a[l*lda : l*lda+m] {
+			tmp := alpha * v
+			if tmp != 0 {
+				ctmp := c[i*ldc : i*ldc+n]
+				f32.AxpyUnitary(tmp, btmp, ctmp)
+			}
+		}
+	}
+}
+
+// sgemmSerialNotTrans handles the case where a is not transposed and b is.
+func sgemmSerialNotTrans(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// This subslicing style is used instead of the literal c[i*ldc+j] because it
+	// was approximately 5 times faster as of go 1.3.
+	for i := 0; i < m; i++ {
+		atmp := a[i*lda : i*lda+k]
+		ctmp := c[i*ldc : i*ldc+n]
+		for j := 0; j < n; j++ {
+			ctmp[j] += alpha * f32.DotUnitary(atmp, b[j*ldb:j*ldb+k])
+		}
+	}
+}
+
+// sgemmSerialTransTrans handles the case where both a and b are transposed.
+func sgemmSerialTransTrans(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// This subslicing style is used instead of the literal c[i*ldc+j] because it
+	// was approximately 5 times faster as of go 1.3.
+ for l := 0; l < k; l++ { + for i, v := range a[l*lda : l*lda+m] { + tmp := alpha * v + if tmp != 0 { + ctmp := c[i*ldc : i*ldc+n] + f32.AxpyInc(tmp, b[l:], ctmp, uintptr(n), uintptr(ldb), 1, 0, 0) + } + } + } +} + +func sliceView32(a []float32, lda, i, j, r, c int) []float32 { + return a[i*lda+j : (i+r-1)*lda+j+c] +} diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/single_precision.bash b/vendor/gonum.org/v1/gonum/blas/gonum/single_precision.bash new file mode 100644 index 0000000000..a107fce492 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/blas/gonum/single_precision.bash @@ -0,0 +1,224 @@ +#!/usr/bin/env bash + +# Copyright ©2015 The Gonum Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +WARNINGF32='//\ +// Float32 implementations are autogenerated and not directly tested.\ +' +WARNINGC64='//\ +// Complex64 implementations are autogenerated and not directly tested.\ +' + +# Level1 routines. + +echo Generating level1float32.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32.go +cat level1float64.go \ +| gofmt -r 'blas.Float64Level1 -> blas.Float32Level1' \ +\ +| gofmt -r 'float64 -> float32' \ +| gofmt -r 'blas.DrotmParams -> blas.SrotmParams' \ +\ +| gofmt -r 'f64.AxpyInc -> f32.AxpyInc' \ +| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \ +| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \ +| gofmt -r 'f64.L2NormInc -> f32.L2NormInc' \ +| gofmt -r 'f64.L2NormUnitary -> f32.L2NormUnitary' \ +| gofmt -r 'f64.ScalInc -> f32.ScalInc' \ +| gofmt -r 'f64.ScalUnitary -> f32.ScalUnitary' \ +\ +| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \ + -e 's_^// D_// S_' \ + -e "s_^\(func (Implementation) \)Id\(.*\)\$_$WARNINGF32\1Is\2_" \ + -e 's_^// Id_// Is_' \ + -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \ + -e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \ + -e 's_safmin = 0x1p-1022_safmin = 0x1p-126_' \ +>> level1float32.go + +echo Generating level1cmplx64.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1cmplx64.go +cat level1cmplx128.go \ +| gofmt -r 'blas.Complex128Level1 -> blas.Complex64Level1' \ +\ +| gofmt -r 'float64 -> float32' \ +| gofmt -r 'complex128 -> complex64' \ +\ +| gofmt -r 'c128.AxpyInc -> c64.AxpyInc' \ +| gofmt -r 'c128.AxpyUnitary -> c64.AxpyUnitary' \ +| gofmt -r 'c128.DotcInc -> c64.DotcInc' \ +| gofmt -r 'c128.DotcUnitary -> c64.DotcUnitary' \ +| gofmt -r 'c128.DotuInc -> c64.DotuInc' \ +| gofmt -r 'c128.DotuUnitary -> c64.DotuUnitary' \ +| gofmt -r 'c128.ScalInc -> c64.ScalInc' \ +| gofmt -r 'c128.ScalUnitary -> c64.ScalUnitary' \ +| gofmt -r 'dcabs1 -> scabs1' \ +\ +| sed -e "s_^\(func (Implementation) \)Zdot\(.*\)\$_$WARNINGC64\1Cdot\2_" \ + -e 's_^// Zdot_// Cdot_' \ + -e "s_^\(func (Implementation) \)Zdscal\(.*\)\$_$WARNINGC64\1Csscal\2_" \ + -e 's_^// Zdscal_// Csscal_' \ + -e "s_^\(func (Implementation) \)Z\(.*\)\$_$WARNINGC64\1C\2_" \ + -e 's_^// Z_// C_' \ + -e "s_^\(func (Implementation) \)Iz\(.*\)\$_$WARNINGC64\1Ic\2_" \ + -e 's_^// Iz_// Ic_' \ + -e "s_^\(func (Implementation) \)Dz\(.*\)\$_$WARNINGC64\1Sc\2_" \ + -e 's_^// Dz_// Sc_' \ + -e 's_"gonum.org/v1/gonum/internal/asm/c128"_"gonum.org/v1/gonum/internal/asm/c64"_' \ + -e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \ +>> level1cmplx64.go + +echo Generating level1float32_sdot.go +echo -e '// Code generated by "go generate 
gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32_sdot.go +cat level1float64_ddot.go \ +| gofmt -r 'float64 -> float32' \ +\ +| gofmt -r 'f64.DotInc -> f32.DotInc' \ +| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \ +\ +| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \ + -e 's_^// D_// S_' \ + -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \ +>> level1float32_sdot.go + +echo Generating level1float32_dsdot.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32_dsdot.go +cat level1float64_ddot.go \ +| gofmt -r '[]float64 -> []float32' \ +\ +| gofmt -r 'f64.DotInc -> f32.DdotInc' \ +| gofmt -r 'f64.DotUnitary -> f32.DdotUnitary' \ +\ +| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1Ds\2_" \ + -e 's_^// D_// Ds_' \ + -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \ +>> level1float32_dsdot.go + +echo Generating level1float32_sdsdot.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32_sdsdot.go +cat level1float64_ddot.go \ +| gofmt -r 'float64 -> float32' \ +\ +| gofmt -r 'f64.DotInc(x, y, f(n), f(incX), f(incY), f(ix), f(iy)) -> alpha + float32(f32.DdotInc(x, y, f(n), f(incX), f(incY), f(ix), f(iy)))' \ +| gofmt -r 'f64.DotUnitary(a, b) -> alpha + float32(f32.DdotUnitary(a, b))' \ +\ +| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1Sds\2_" \ + -e 's_^// D\(.*\)$_// Sds\1 plus a constant_' \ + -e 's_\\sum_alpha + \\sum_' \ + -e 's/n int/n int, alpha float32/' \ + -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \ +>> level1float32_sdsdot.go + + +# Level2 routines. + +echo Generating level2float32.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level2float32.go +cat level2float64.go \ +| gofmt -r 'blas.Float64Level2 -> blas.Float32Level2' \ +\ +| gofmt -r 'float64 -> float32' \ +\ +| gofmt -r 'f64.AxpyInc -> f32.AxpyInc' \ +| gofmt -r 'f64.AxpyIncTo -> f32.AxpyIncTo' \ +| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \ +| gofmt -r 'f64.AxpyUnitaryTo -> f32.AxpyUnitaryTo' \ +| gofmt -r 'f64.DotInc -> f32.DotInc' \ +| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \ +| gofmt -r 'f64.ScalInc -> f32.ScalInc' \ +| gofmt -r 'f64.ScalUnitary -> f32.ScalUnitary' \ +| gofmt -r 'f64.Ger -> f32.Ger' \ +| gofmt -r 'f64.GemvN -> f32.GemvN' \ +| gofmt -r 'f64.GemvT -> f32.GemvT' \ +| gofmt -r 'Implementation{}.Dscal -> Implementation{}.Sscal' \ +\ +| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \ + -e 's_^// D_// S_' \ + -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \ +>> level2float32.go + +echo Generating level2cmplx64.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level2cmplx64.go +cat level2cmplx128.go \ +| gofmt -r 'blas.Complex128Level2 -> blas.Complex64Level2' \ +\ +| gofmt -r 'complex128 -> complex64' \ +| gofmt -r 'float64 -> float32' \ +\ +| gofmt -r 'c128.AxpyInc -> c64.AxpyInc' \ +| gofmt -r 'c128.AxpyUnitary -> c64.AxpyUnitary' \ +| gofmt -r 'c128.DotuInc -> c64.DotuInc' \ +| gofmt -r 'c128.DotuUnitary -> c64.DotuUnitary' \ +| gofmt -r 'c128.ScalInc -> c64.ScalInc' \ +| gofmt -r 'c128.ScalUnitary -> c64.ScalUnitary' \ +\ +| sed -e "s_^\(func (Implementation) \)Z\(.*\)\$_$WARNINGC64\1C\2_" \ + -e 's_^// Z_// C_' \ + -e 
's_"gonum.org/v1/gonum/internal/asm/c128"_"gonum.org/v1/gonum/internal/asm/c64"_' \ + -e 's_"math/cmplx"_cmplx "gonum.org/v1/gonum/internal/cmplx64"_' \ +>> level2cmplx64.go + +# Level3 routines. + +echo Generating level3float32.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level3float32.go +cat level3float64.go \ +| gofmt -r 'blas.Float64Level3 -> blas.Float32Level3' \ +\ +| gofmt -r 'float64 -> float32' \ +\ +| gofmt -r 'f64.AxpyUnitaryTo -> f32.AxpyUnitaryTo' \ +| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \ +| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \ +| gofmt -r 'f64.ScalUnitary -> f32.ScalUnitary' \ +\ +| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \ + -e 's_^// D_// S_' \ + -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \ +>> level3float32.go + +echo Generating sgemm.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > sgemm.go +cat dgemm.go \ +| gofmt -r 'float64 -> float32' \ +| gofmt -r 'sliceView64 -> sliceView32' \ +\ +| gofmt -r 'dgemmParallel -> sgemmParallel' \ +| gofmt -r 'computeNumBlocks64 -> computeNumBlocks32' \ +| gofmt -r 'dgemmSerial -> sgemmSerial' \ +| gofmt -r 'dgemmSerialNotNot -> sgemmSerialNotNot' \ +| gofmt -r 'dgemmSerialTransNot -> sgemmSerialTransNot' \ +| gofmt -r 'dgemmSerialNotTrans -> sgemmSerialNotTrans' \ +| gofmt -r 'dgemmSerialTransTrans -> sgemmSerialTransTrans' \ +\ +| gofmt -r 'f64.AxpyInc -> f32.AxpyInc' \ +| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \ +| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \ +\ +| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \ + -e 's_^// D_// S_' \ + -e 's_^// d_// s_' \ + -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \ +>> sgemm.go + +echo Generating level3cmplx64.go +echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level3cmplx64.go +cat level3cmplx128.go \ +| gofmt -r 'blas.Complex128Level3 -> blas.Complex64Level3' \ +\ +| gofmt -r 'float64 -> float32' \ +| gofmt -r 'complex128 -> complex64' \ +\ +| gofmt -r 'c128.ScalUnitary -> c64.ScalUnitary' \ +| gofmt -r 'c128.DscalUnitary -> c64.SscalUnitary' \ +| gofmt -r 'c128.DotcUnitary -> c64.DotcUnitary' \ +| gofmt -r 'c128.AxpyUnitary -> c64.AxpyUnitary' \ +| gofmt -r 'c128.DotuUnitary -> c64.DotuUnitary' \ +\ +| sed -e "s_^\(func (Implementation) \)Z\(.*\)\$_$WARNINGC64\1C\2_" \ + -e 's_^// Z_// C_' \ + -e 's_"gonum.org/v1/gonum/internal/asm/c128"_"gonum.org/v1/gonum/internal/asm/c64"_' \ + -e 's_"math/cmplx"_cmplx "gonum.org/v1/gonum/internal/cmplx64"_' \ +>> level3cmplx64.go diff --git a/vendor/gonum.org/v1/gonum/floats/README.md b/vendor/gonum.org/v1/gonum/floats/README.md new file mode 100644 index 0000000000..e8ef46d567 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/floats/README.md @@ -0,0 +1,7 @@ +# Gonum floats + +[![go.dev reference](https://pkg.go.dev/badge/gonum.org/v1/gonum/floats)](https://pkg.go.dev/gonum.org/v1/gonum/floats) +[![GoDoc](https://godocs.io/gonum.org/v1/gonum/floats?status.svg)](https://godocs.io/gonum.org/v1/gonum/floats) + +Package floats provides a set of helper routines for dealing with slices of float64. +The functions avoid allocations to allow for use within tight loops without garbage collection overhead. 
diff --git a/vendor/gonum.org/v1/gonum/floats/doc.go b/vendor/gonum.org/v1/gonum/floats/doc.go new file mode 100644 index 0000000000..bfe05c1918 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/floats/doc.go @@ -0,0 +1,11 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package floats provides a set of helper routines for dealing with slices +// of float64. The functions avoid allocations to allow for use within tight +// loops without garbage collection overhead. +// +// The convention used is that when a slice is being modified in place, it has +// the name dst. +package floats // import "gonum.org/v1/gonum/floats" diff --git a/vendor/gonum.org/v1/gonum/floats/floats.go b/vendor/gonum.org/v1/gonum/floats/floats.go new file mode 100644 index 0000000000..68c4e65c7e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/floats/floats.go @@ -0,0 +1,808 @@ +// Copyright ©2013 The Gonum Authors. All rights reserved. +// Use of this code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package floats + +import ( + "errors" + "math" + "slices" + "sort" + + "gonum.org/v1/gonum/floats/scalar" + "gonum.org/v1/gonum/internal/asm/f64" +) + +const ( + zeroLength = "floats: zero length slice" + shortSpan = "floats: slice length less than 2" + badLength = "floats: slice lengths do not match" + badDstLength = "floats: destination slice length does not match input" +) + +// Add adds, element-wise, the elements of s and dst, and stores the result in dst. +// It panics if the argument lengths do not match. +func Add(dst, s []float64) { + if len(dst) != len(s) { + panic(badDstLength) + } + f64.AxpyUnitaryTo(dst, 1, s, dst) +} + +// AddTo adds, element-wise, the elements of s and t and +// stores the result in dst. +// It panics if the argument lengths do not match. +func AddTo(dst, s, t []float64) []float64 { + if len(s) != len(t) { + panic(badLength) + } + if len(dst) != len(s) { + panic(badDstLength) + } + f64.AxpyUnitaryTo(dst, 1, s, t) + return dst +} + +// AddConst adds the scalar c to all of the values in dst. +func AddConst(c float64, dst []float64) { + f64.AddConst(c, dst) +} + +// AddScaled performs dst = dst + alpha * s. +// It panics if the slice argument lengths do not match. +func AddScaled(dst []float64, alpha float64, s []float64) { + if len(dst) != len(s) { + panic(badLength) + } + f64.AxpyUnitaryTo(dst, alpha, s, dst) +} + +// AddScaledTo performs dst = y + alpha * s, where alpha is a scalar, +// and dst, y and s are all slices. +// It panics if the slice argument lengths do not match. +// +// At the return of the function, dst[i] = y[i] + alpha * s[i] +func AddScaledTo(dst, y []float64, alpha float64, s []float64) []float64 { + if len(s) != len(y) { + panic(badLength) + } + if len(dst) != len(y) { + panic(badDstLength) + } + f64.AxpyUnitaryTo(dst, alpha, s, y) + return dst +} + +// argsort is a helper that implements sort.Interface, as used by +// Argsort and ArgsortStable. +type argsort struct { + s []float64 + inds []int +} + +func (a argsort) Len() int { + return len(a.s) +} + +func (a argsort) Less(i, j int) bool { + return a.s[i] < a.s[j] +} + +func (a argsort) Swap(i, j int) { + a.s[i], a.s[j] = a.s[j], a.s[i] + a.inds[i], a.inds[j] = a.inds[j], a.inds[i] +} + +// Argsort sorts the elements of dst while tracking their original order. 
+// At the conclusion of Argsort, dst will contain the original elements of dst +// but sorted in increasing order, and inds will contain the original position +// of the elements in the slice such that dst[i] = origDst[inds[i]]. +// It panics if the argument lengths do not match. +func Argsort(dst []float64, inds []int) { + if len(dst) != len(inds) { + panic(badDstLength) + } + for i := range dst { + inds[i] = i + } + + a := argsort{s: dst, inds: inds} + sort.Sort(a) +} + +// ArgsortStable sorts the elements of dst while tracking their original order and +// keeping the original order of equal elements. At the conclusion of ArgsortStable, +// dst will contain the original elements of dst but sorted in increasing order, +// and inds will contain the original position of the elements in the slice such +// that dst[i] = origDst[inds[i]]. +// It panics if the argument lengths do not match. +func ArgsortStable(dst []float64, inds []int) { + if len(dst) != len(inds) { + panic(badDstLength) + } + for i := range dst { + inds[i] = i + } + + a := argsort{s: dst, inds: inds} + sort.Stable(a) +} + +// Count applies the function f to every element of s and returns the number +// of times the function returned true. +func Count(f func(float64) bool, s []float64) int { + var n int + for _, val := range s { + if f(val) { + n++ + } + } + return n +} + +// CumProd finds the cumulative product of the first i elements in +// s and puts them in place into the ith element of the +// destination dst. +// It panics if the argument lengths do not match. +// +// At the return of the function, dst[i] = s[i] * s[i-1] * s[i-2] * ... +func CumProd(dst, s []float64) []float64 { + if len(dst) != len(s) { + panic(badDstLength) + } + if len(dst) == 0 { + return dst + } + return f64.CumProd(dst, s) +} + +// CumSum finds the cumulative sum of the first i elements in +// s and puts them in place into the ith element of the +// destination dst. +// It panics if the argument lengths do not match. +// +// At the return of the function, dst[i] = s[i] + s[i-1] + s[i-2] + ... +func CumSum(dst, s []float64) []float64 { + if len(dst) != len(s) { + panic(badDstLength) + } + if len(dst) == 0 { + return dst + } + return f64.CumSum(dst, s) +} + +// Distance computes the L-norm of s - t. See Norm for special cases. +// It panics if the slice argument lengths do not match. +func Distance(s, t []float64, L float64) float64 { + if len(s) != len(t) { + panic(badLength) + } + if len(s) == 0 { + return 0 + } + if L == 2 { + return f64.L2DistanceUnitary(s, t) + } + var norm float64 + if L == 1 { + for i, v := range s { + norm += math.Abs(t[i] - v) + } + return norm + } + if math.IsInf(L, 1) { + for i, v := range s { + absDiff := math.Abs(t[i] - v) + if absDiff > norm { + norm = absDiff + } + } + return norm + } + for i, v := range s { + norm += math.Pow(math.Abs(t[i]-v), L) + } + return math.Pow(norm, 1/L) +} + +// Div performs element-wise division dst / s +// and stores the value in dst. +// It panics if the argument lengths do not match. +func Div(dst, s []float64) { + if len(dst) != len(s) { + panic(badLength) + } + f64.Div(dst, s) +} + +// DivTo performs element-wise division s / t +// and stores the value in dst. +// It panics if the argument lengths do not match. +func DivTo(dst, s, t []float64) []float64 { + if len(s) != len(t) { + panic(badLength) + } + if len(dst) != len(s) { + panic(badDstLength) + } + return f64.DivTo(dst, s, t) +} + +// Dot computes the dot product of s1 and s2, i.e. +// sum_{i = 1}^N s1[i]*s2[i]. 
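+//
+// For example, Dot([]float64{1, 2}, []float64{3, 4}) returns 1*3 + 2*4 = 11.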
+// It panics if the argument lengths do not match. +func Dot(s1, s2 []float64) float64 { + if len(s1) != len(s2) { + panic(badLength) + } + return f64.DotUnitary(s1, s2) +} + +// Equal returns true when the slices have equal lengths and +// all elements are numerically identical. +func Equal(s1, s2 []float64) bool { + if len(s1) != len(s2) { + return false + } + for i, val := range s1 { + if s2[i] != val { + return false + } + } + return true +} + +// EqualApprox returns true when the slices have equal lengths and +// all element pairs have an absolute tolerance less than tol or a +// relative tolerance less than tol. +func EqualApprox(s1, s2 []float64, tol float64) bool { + if len(s1) != len(s2) { + return false + } + for i, a := range s1 { + if !scalar.EqualWithinAbsOrRel(a, s2[i], tol, tol) { + return false + } + } + return true +} + +// EqualFunc returns true when the slices have the same lengths +// and the function returns true for all element pairs. +func EqualFunc(s1, s2 []float64, f func(float64, float64) bool) bool { + if len(s1) != len(s2) { + return false + } + for i, val := range s1 { + if !f(val, s2[i]) { + return false + } + } + return true +} + +// EqualLengths returns true when all of the slices have equal length, +// and false otherwise. It also returns true when there are no input slices. +func EqualLengths(slices ...[]float64) bool { + // This length check is needed: http://play.golang.org/p/sdty6YiLhM + if len(slices) == 0 { + return true + } + l := len(slices[0]) + for i := 1; i < len(slices); i++ { + if len(slices[i]) != l { + return false + } + } + return true +} + +// Find applies f to every element of s and returns the indices of the first +// k elements for which the f returns true, or all such elements +// if k < 0. +// Find will reslice inds to have 0 length, and will append +// found indices to inds. +// If k > 0 and there are fewer than k elements in s satisfying f, +// all of the found elements will be returned along with an error. +// At the return of the function, the input inds will be in an undetermined state. +func Find(inds []int, f func(float64) bool, s []float64, k int) ([]int, error) { + // inds is also returned to allow for calling with nil. + + // Reslice inds to have zero length. + inds = inds[:0] + + // If zero elements requested, can just return. + if k == 0 { + return inds, nil + } + + // If k < 0, return all of the found indices. + if k < 0 { + for i, val := range s { + if f(val) { + inds = append(inds, i) + } + } + return inds, nil + } + + // Otherwise, find the first k elements. + nFound := 0 + for i, val := range s { + if f(val) { + inds = append(inds, i) + nFound++ + if nFound == k { + return inds, nil + } + } + } + // Finished iterating over the loop, which means k elements were not found. + return inds, errors.New("floats: insufficient elements found") +} + +// HasNaN returns true when the slice s has any values that are NaN and false +// otherwise. +func HasNaN(s []float64) bool { + for _, v := range s { + if math.IsNaN(v) { + return true + } + } + return false +} + +// LogSpan returns a set of n equally spaced points in log space between, +// l and u where N is equal to len(dst). The first element of the +// resulting dst will be l and the final element of dst will be u. +// It panics if the length of dst is less than 2. +// Note that this call will return NaNs if either l or u are negative, and +// will return all zeros if l or u is zero. 
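+// For example, with len(dst) == 3, LogSpan(dst, 1, 100) fills dst with
+// approximately {1, 10, 100}.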
+// Also returns the mutated slice dst, so that it can be used in range, like: +// +// for i, x := range LogSpan(dst, l, u) { ... } +func LogSpan(dst []float64, l, u float64) []float64 { + Span(dst, math.Log(l), math.Log(u)) + for i := range dst { + dst[i] = math.Exp(dst[i]) + } + return dst +} + +// LogSumExp returns the log of the sum of the exponentials of the values in s. +// Panics if s is an empty slice. +func LogSumExp(s []float64) float64 { + // Want to do this in a numerically stable way which avoids + // overflow and underflow + // First, find the maximum value in the slice. + maxval := Max(s) + if math.IsInf(maxval, 0) { + // If it's infinity either way, the logsumexp will be infinity as well + // returning now avoids NaNs + return maxval + } + var lse float64 + // Compute the sumexp part + for _, val := range s { + lse += math.Exp(val - maxval) + } + // Take the log and add back on the constant taken out + return math.Log(lse) + maxval +} + +// Max returns the maximum value in the input slice. If the slice is empty, Max will panic. +func Max(s []float64) float64 { + return s[MaxIdx(s)] +} + +// MaxIdx returns the index of the maximum value in the input slice. If several +// entries have the maximum value, the first such index is returned. +// It panics if s is zero length. +func MaxIdx(s []float64) int { + if len(s) == 0 { + panic(zeroLength) + } + max := math.NaN() + var ind int + for i, v := range s { + if math.IsNaN(v) { + continue + } + if v > max || math.IsNaN(max) { + max = v + ind = i + } + } + return ind +} + +// Min returns the minimum value in the input slice. +// It panics if s is zero length. +func Min(s []float64) float64 { + return s[MinIdx(s)] +} + +// MinIdx returns the index of the minimum value in the input slice. If several +// entries have the minimum value, the first such index is returned. +// It panics if s is zero length. +func MinIdx(s []float64) int { + if len(s) == 0 { + panic(zeroLength) + } + min := math.NaN() + var ind int + for i, v := range s { + if math.IsNaN(v) { + continue + } + if v < min || math.IsNaN(min) { + min = v + ind = i + } + } + return ind +} + +// Mul performs element-wise multiplication between dst +// and s and stores the value in dst. +// It panics if the argument lengths do not match. +func Mul(dst, s []float64) { + if len(dst) != len(s) { + panic(badLength) + } + for i, val := range s { + dst[i] *= val + } +} + +// MulTo performs element-wise multiplication between s +// and t and stores the value in dst. +// It panics if the argument lengths do not match. +func MulTo(dst, s, t []float64) []float64 { + if len(s) != len(t) { + panic(badLength) + } + if len(dst) != len(s) { + panic(badDstLength) + } + for i, val := range t { + dst[i] = val * s[i] + } + return dst +} + +// NearestIdx returns the index of the element in s +// whose value is nearest to v. If several such +// elements exist, the lowest index is returned. +// It panics if s is zero length. +func NearestIdx(s []float64, v float64) int { + if len(s) == 0 { + panic(zeroLength) + } + switch { + case math.IsNaN(v): + return 0 + case math.IsInf(v, 1): + return MaxIdx(s) + case math.IsInf(v, -1): + return MinIdx(s) + } + var ind int + dist := math.NaN() + for i, val := range s { + newDist := math.Abs(v - val) + // A NaN distance will not be closer. 
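+		// (v is known to be finite at this point, so newDist is NaN
+		// exactly when s[i] is NaN.)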
+ if math.IsNaN(newDist) { + continue + } + if newDist < dist || math.IsNaN(dist) { + dist = newDist + ind = i + } + } + return ind +} + +// NearestIdxForSpan return the index of a hypothetical vector created +// by Span with length n and bounds l and u whose value is closest +// to v. That is, NearestIdxForSpan(n, l, u, v) is equivalent to +// Nearest(Span(make([]float64, n),l,u),v) without an allocation. +// It panics if n is less than two. +func NearestIdxForSpan(n int, l, u float64, v float64) int { + if n < 2 { + panic(shortSpan) + } + if math.IsNaN(v) { + return 0 + } + + // Special cases for Inf and NaN. + switch { + case math.IsNaN(l) && !math.IsNaN(u): + return n - 1 + case math.IsNaN(u): + return 0 + case math.IsInf(l, 0) && math.IsInf(u, 0): + if l == u { + return 0 + } + if n%2 == 1 { + if !math.IsInf(v, 0) { + return n / 2 + } + if math.Copysign(1, v) == math.Copysign(1, l) { + return 0 + } + return n/2 + 1 + } + if math.Copysign(1, v) == math.Copysign(1, l) { + return 0 + } + return n / 2 + case math.IsInf(l, 0): + if v == l { + return 0 + } + return n - 1 + case math.IsInf(u, 0): + if v == u { + return n - 1 + } + return 0 + case math.IsInf(v, -1): + if l <= u { + return 0 + } + return n - 1 + case math.IsInf(v, 1): + if u <= l { + return 0 + } + return n - 1 + } + + // Special cases for v outside (l, u) and (u, l). + switch { + case l < u: + if v <= l { + return 0 + } + if v >= u { + return n - 1 + } + case l > u: + if v >= l { + return 0 + } + if v <= u { + return n - 1 + } + default: + return 0 + } + + // Can't guarantee anything about exactly halfway between + // because of floating point weirdness. + return int((float64(n)-1)/(u-l)*(v-l) + 0.5) +} + +// Norm returns the L norm of the slice S, defined as +// (sum_{i=1}^N s[i]^L)^{1/L} +// Special cases: +// L = math.Inf(1) gives the maximum absolute value. +// Does not correctly compute the zero norm (use Count). +func Norm(s []float64, L float64) float64 { + // Should this complain if L is not positive? + // Should this be done in log space for better numerical stability? + // would be more cost + // maybe only if L is high? + if len(s) == 0 { + return 0 + } + if L == 2 { + return f64.L2NormUnitary(s) + } + var norm float64 + if L == 1 { + for _, val := range s { + norm += math.Abs(val) + } + return norm + } + if math.IsInf(L, 1) { + for _, val := range s { + norm = math.Max(norm, math.Abs(val)) + } + return norm + } + for _, val := range s { + norm += math.Pow(math.Abs(val), L) + } + return math.Pow(norm, 1/L) +} + +// Prod returns the product of the elements of the slice. +// Returns 1 if len(s) = 0. +func Prod(s []float64) float64 { + prod := 1.0 + for _, val := range s { + prod *= val + } + return prod +} + +// Reverse reverses the order of elements in the slice. +// +// Deprecated: This function simply calls [slices.Reverse]. +func Reverse(s []float64) { + slices.Reverse(s) +} + +// Same returns true when the input slices have the same length and all +// elements have the same value with NaN treated as the same. +func Same(s, t []float64) bool { + if len(s) != len(t) { + return false + } + for i, v := range s { + w := t[i] + if v != w && !(math.IsNaN(v) && math.IsNaN(w)) { + return false + } + } + return true +} + +// Scale multiplies every element in dst by the scalar c. +func Scale(c float64, dst []float64) { + if len(dst) > 0 { + f64.ScalUnitary(c, dst) + } +} + +// ScaleTo multiplies the elements in s by c and stores the result in dst. +// It panics if the slice argument lengths do not match. 
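+//
+// For example, assuming len(dst) == len(s):
+//
+//	ScaleTo(dst, 2, s) // dst[i] == 2 * s[i]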
+func ScaleTo(dst []float64, c float64, s []float64) []float64 { + if len(dst) != len(s) { + panic(badDstLength) + } + if len(dst) > 0 { + f64.ScalUnitaryTo(dst, c, s) + } + return dst +} + +// Span returns a set of N equally spaced points between l and u, where N +// is equal to the length of the destination. The first element of the destination +// is l, the final element of the destination is u. +// It panics if the length of dst is less than 2. +// +// Span also returns the mutated slice dst, so that it can be used in range expressions, +// like: +// +// for i, x := range Span(dst, l, u) { ... } +func Span(dst []float64, l, u float64) []float64 { + n := len(dst) + if n < 2 { + panic(shortSpan) + } + + // Special cases for Inf and NaN. + switch { + case math.IsNaN(l): + for i := range dst[:len(dst)-1] { + dst[i] = math.NaN() + } + dst[len(dst)-1] = u + return dst + case math.IsNaN(u): + for i := range dst[1:] { + dst[i+1] = math.NaN() + } + dst[0] = l + return dst + case math.IsInf(l, 0) && math.IsInf(u, 0): + for i := range dst[:len(dst)/2] { + dst[i] = l + dst[len(dst)-i-1] = u + } + if len(dst)%2 == 1 { + if l != u { + dst[len(dst)/2] = 0 + } else { + dst[len(dst)/2] = l + } + } + return dst + case math.IsInf(l, 0): + for i := range dst[:len(dst)-1] { + dst[i] = l + } + dst[len(dst)-1] = u + return dst + case math.IsInf(u, 0): + for i := range dst[1:] { + dst[i+1] = u + } + dst[0] = l + return dst + } + + step := (u - l) / float64(n-1) + for i := range dst { + dst[i] = l + step*float64(i) + } + return dst +} + +// Sub subtracts, element-wise, the elements of s from dst. +// It panics if the argument lengths do not match. +func Sub(dst, s []float64) { + if len(dst) != len(s) { + panic(badLength) + } + f64.AxpyUnitaryTo(dst, -1, s, dst) +} + +// SubTo subtracts, element-wise, the elements of t from s and +// stores the result in dst. +// It panics if the argument lengths do not match. +func SubTo(dst, s, t []float64) []float64 { + if len(s) != len(t) { + panic(badLength) + } + if len(dst) != len(s) { + panic(badDstLength) + } + f64.AxpyUnitaryTo(dst, -1, t, s) + return dst +} + +// Sum returns the sum of the elements of the slice. +func Sum(s []float64) float64 { + return f64.Sum(s) +} + +// Within returns the first index i where s[i] <= v < s[i+1]. Within panics if: +// - len(s) < 2 +// - s is not sorted +func Within(s []float64, v float64) int { + if len(s) < 2 { + panic(shortSpan) + } + if !sort.Float64sAreSorted(s) { + panic("floats: input slice not sorted") + } + if v < s[0] || v >= s[len(s)-1] || math.IsNaN(v) { + return -1 + } + for i, f := range s[1:] { + if v < f { + return i + } + } + return -1 +} + +// SumCompensated returns the sum of the elements of the slice calculated with greater +// accuracy than Sum at the expense of additional computation. +func SumCompensated(s []float64) float64 { + // SumCompensated uses an improved version of Kahan's compensated + // summation algorithm proposed by Neumaier. + // See https://en.wikipedia.org/wiki/Kahan_summation_algorithm for details. + var sum, c float64 + for _, x := range s { + // This type conversion is here to prevent a sufficiently smart compiler + // from optimising away these operations. 
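+		// t is the naive running sum; the branch below recovers the low-order
+		// bits that the smaller-magnitude operand loses in this addition and
+		// accumulates them into the compensation term c.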
+ t := float64(sum + x) + if math.Abs(sum) >= math.Abs(x) { + c += (sum - t) + x + } else { + c += (x - t) + sum + } + sum = t + } + return sum + c +} diff --git a/vendor/gonum.org/v1/gonum/floats/scalar/doc.go b/vendor/gonum.org/v1/gonum/floats/scalar/doc.go new file mode 100644 index 0000000000..9e69c193e2 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/floats/scalar/doc.go @@ -0,0 +1,6 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package scalar provides a set of helper routines for dealing with float64 values. +package scalar // import "gonum.org/v1/gonum/floats/scalar" diff --git a/vendor/gonum.org/v1/gonum/floats/scalar/scalar.go b/vendor/gonum.org/v1/gonum/floats/scalar/scalar.go new file mode 100644 index 0000000000..46bf06b353 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/floats/scalar/scalar.go @@ -0,0 +1,171 @@ +// Copyright ©2013 The Gonum Authors. All rights reserved. +// Use of this code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package scalar + +import ( + "math" + "strconv" +) + +// EqualWithinAbs returns true when a and b have an absolute difference +// not greater than tol. +func EqualWithinAbs(a, b, tol float64) bool { + return a == b || math.Abs(a-b) <= tol +} + +// minNormalFloat64 is the smallest normal number. For 64 bit IEEE-754 +// floats this is 2^{-1022}. +const minNormalFloat64 = 0x1p-1022 + +// EqualWithinRel returns true when the difference between a and b +// is not greater than tol times the greater absolute value of a and b, +// +// abs(a-b) <= tol * max(abs(a), abs(b)). +func EqualWithinRel(a, b, tol float64) bool { + if a == b { + return true + } + delta := math.Abs(a - b) + if delta <= minNormalFloat64 { + return delta <= tol*minNormalFloat64 + } + // We depend on the division in this relationship to identify + // infinities (we rely on the NaN to fail the test) otherwise + // we compare Infs of the same sign and evaluate Infs as equal + // independent of sign. + return delta/math.Max(math.Abs(a), math.Abs(b)) <= tol +} + +// EqualWithinAbsOrRel returns true when a and b are equal to within +// the absolute or relative tolerances. See EqualWithinAbs and +// EqualWithinRel for details. +func EqualWithinAbsOrRel(a, b, absTol, relTol float64) bool { + return EqualWithinAbs(a, b, absTol) || EqualWithinRel(a, b, relTol) +} + +// EqualWithinULP returns true when a and b are equal to within +// the specified number of floating point units in the last place. +func EqualWithinULP(a, b float64, ulp uint) bool { + if a == b { + return true + } + if math.IsNaN(a) || math.IsNaN(b) { + return false + } + if math.Signbit(a) != math.Signbit(b) { + return math.Float64bits(math.Abs(a))+math.Float64bits(math.Abs(b)) <= uint64(ulp) + } + return ulpDiff(math.Float64bits(a), math.Float64bits(b)) <= uint64(ulp) +} + +func ulpDiff(a, b uint64) uint64 { + if a > b { + return a - b + } + return b - a +} + +const ( + nanBits = 0x7ff8000000000000 + nanMask = 0xfff8000000000000 +) + +// NaNWith returns an IEEE 754 "quiet not-a-number" value with the +// payload specified in the low 51 bits of payload. +// The NaN returned by math.NaN has a bit pattern equal to NaNWith(1). +func NaNWith(payload uint64) float64 { + return math.Float64frombits(nanBits | (payload &^ nanMask)) +} + +// NaNPayload returns the lowest 51 bits payload of an IEEE 754 "quiet +// not-a-number". 
For values of f other than quiet-NaN, NaNPayload +// returns zero and false. +func NaNPayload(f float64) (payload uint64, ok bool) { + b := math.Float64bits(f) + if b&nanBits != nanBits { + return 0, false + } + return b &^ nanMask, true +} + +// ParseWithNA converts the string s to a float64 in value. +// If s equals missing, weight is returned as 0, otherwise 1. +func ParseWithNA(s, missing string) (value, weight float64, err error) { + if s == missing { + return 0, 0, nil + } + value, err = strconv.ParseFloat(s, 64) + if err == nil { + weight = 1 + } + return value, weight, err +} + +// Round returns the half away from zero rounded value of x with prec precision. +// +// Special cases are: +// +// Round(±0) = +0 +// Round(±Inf) = ±Inf +// Round(NaN) = NaN +func Round(x float64, prec int) float64 { + if x == 0 { + // Make sure zero is returned + // without the negative bit set. + return 0 + } + // Fast path for positive precision on integers. + if prec >= 0 && x == math.Trunc(x) { + return x + } + pow := math.Pow10(prec) + intermed := x * pow + if math.IsInf(intermed, 0) { + return x + } + x = math.Round(intermed) + + if x == 0 { + return 0 + } + + return x / pow +} + +// RoundEven returns the half even rounded value of x with prec precision. +// +// Special cases are: +// +// RoundEven(±0) = +0 +// RoundEven(±Inf) = ±Inf +// RoundEven(NaN) = NaN +func RoundEven(x float64, prec int) float64 { + if x == 0 { + // Make sure zero is returned + // without the negative bit set. + return 0 + } + // Fast path for positive precision on integers. + if prec >= 0 && x == math.Trunc(x) { + return x + } + pow := math.Pow10(prec) + intermed := x * pow + if math.IsInf(intermed, 0) { + return x + } + x = math.RoundToEven(intermed) + + if x == 0 { + return 0 + } + + return x / pow +} + +// Same returns true when the inputs have the same value, allowing NaN equality. +func Same(a, b float64) bool { + return a == b || (math.IsNaN(a) && math.IsNaN(b)) +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyinc_amd64.s new file mode 100644 index 0000000000..d9b71a0d6b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyinc_amd64.s @@ -0,0 +1,134 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
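+//
+// This file implements AxpyInc, y[i] = alpha*x[i] + y[i] over strided
+// complex128 vectors. The SSE3 instructions MOVDDUP and ADDSUBPD are
+// emitted as raw BYTE sequences via the macros below.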
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// MOVDDUP X2, X3 +#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA +// MOVDDUP X4, X5 +#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC +// MOVDDUP X6, X7 +#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE +// MOVDDUP X8, X9 +#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8 + +// ADDSUBPD X2, X3 +#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA +// ADDSUBPD X4, X5 +#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC +// ADDSUBPD X6, X7 +#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE +// ADDSUBPD X8, X9 +#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 + +// func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) +TEXT ·AxpyInc(SB), NOSPLIT, $0 + MOVQ x_base+16(FP), SI // SI = &x + MOVQ y_base+40(FP), DI // DI = &y + MOVQ n+64(FP), CX // CX = n + CMPQ CX, $0 // if n==0 { return } + JE axpyi_end + MOVQ ix+88(FP), R8 // R8 = ix // Load the first index + SHLQ $4, R8 // R8 *= sizeof(complex128) + MOVQ iy+96(FP), R9 // R9 = iy + SHLQ $4, R9 // R9 *= sizeof(complex128) + LEAQ (SI)(R8*1), SI // SI = &(x[ix]) + LEAQ (DI)(R9*1), DI // DI = &(y[iy]) + MOVQ DI, DX // DX = DI // Separate Read/Write pointers + MOVQ incX+72(FP), R8 // R8 = incX + SHLQ $4, R8 // R8 *= sizeof(complex128) + MOVQ incY+80(FP), R9 // R9 = iy + SHLQ $4, R9 // R9 *= sizeof(complex128) + MOVUPS alpha+0(FP), X0 // X0 = { imag(a), real(a) } + MOVAPS X0, X1 + SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) } + MOVAPS X0, X10 // Copy X0 and X1 for pipelining + MOVAPS X1, X11 + MOVQ CX, BX + ANDQ $3, CX // CX = n % 4 + SHRQ $2, BX // BX = floor( n / 4 ) + JZ axpyi_tail // if BX == 0 { goto axpyi_tail } + +axpyi_loop: // do { + MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) } + MOVUPS (SI)(R8*1), X4 + LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) + MOVUPS (SI), X6 + MOVUPS (SI)(R8*1), X8 + + // X_(i+1) = { real(x[i], real(x[i]) } + MOVDDUP_X2_X3 + MOVDDUP_X4_X5 + MOVDDUP_X6_X7 + MOVDDUP_X8_X9 + + // X_i = { imag(x[i]), imag(x[i]) } + SHUFPD $0x3, X2, X2 + SHUFPD $0x3, X4, X4 + SHUFPD $0x3, X6, X6 + SHUFPD $0x3, X8, X8 + + // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + MULPD X1, X2 + MULPD X0, X3 + MULPD X11, X4 + MULPD X10, X5 + MULPD X1, X6 + MULPD X0, X7 + MULPD X11, X8 + MULPD X10, X9 + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDSUBPD_X4_X5 + ADDSUBPD_X6_X7 + ADDSUBPD_X8_X9 + + // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } + ADDPD (DX), X3 + ADDPD (DX)(R9*1), X5 + LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) + ADDPD (DX), X7 + ADDPD (DX)(R9*1), X9 + MOVUPS X3, (DI) // dst[i] = X_(i+1) + MOVUPS X5, (DI)(R9*1) + LEAQ (DI)(R9*2), DI + MOVUPS X7, (DI) + MOVUPS X9, (DI)(R9*1) + LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) + LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) + LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2]) + DECQ BX + JNZ axpyi_loop // } while --BX > 0 + CMPQ CX, $0 // if CX == 0 { return } + JE axpyi_end + +axpyi_tail: // do { + MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) } + MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) } + SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) } + MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * 
imag(x[i]) } + MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + + // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } + ADDPD (DI), X3 + MOVUPS X3, (DI) // y[i] = X_i + ADDQ R8, SI // SI = &(SI[incX]) + ADDQ R9, DI // DI = &(DI[incY]) + LOOP axpyi_tail // } while --CX > 0 + +axpyi_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyincto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyincto_amd64.s new file mode 100644 index 0000000000..d35e95d982 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyincto_amd64.s @@ -0,0 +1,141 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// MOVDDUP X2, X3 +#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA +// MOVDDUP X4, X5 +#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC +// MOVDDUP X6, X7 +#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE +// MOVDDUP X8, X9 +#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8 + +// ADDSUBPD X2, X3 +#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA +// ADDSUBPD X4, X5 +#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC +// ADDSUBPD X6, X7 +#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE +// ADDSUBPD X8, X9 +#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 + +// func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) +TEXT ·AxpyIncTo(SB), NOSPLIT, $0 + MOVQ dst_base+0(FP), DI // DI = &dst + MOVQ x_base+56(FP), SI // SI = &x + MOVQ y_base+80(FP), DX // DX = &y + MOVQ n+104(FP), CX // CX = n + CMPQ CX, $0 // if n==0 { return } + JE axpyi_end + MOVQ ix+128(FP), R8 // R8 = ix // Load the first index + SHLQ $4, R8 // R8 *= sizeof(complex128) + MOVQ iy+136(FP), R9 // R9 = iy + SHLQ $4, R9 // R9 *= sizeof(complex128) + MOVQ idst+32(FP), R10 // R10 = idst + SHLQ $4, R10 // R10 *= sizeof(complex128) + LEAQ (SI)(R8*1), SI // SI = &(x[ix]) + LEAQ (DX)(R9*1), DX // DX = &(y[iy]) + LEAQ (DI)(R10*1), DI // DI = &(dst[idst]) + MOVQ incX+112(FP), R8 // R8 = incX + SHLQ $4, R8 // R8 *= sizeof(complex128) + MOVQ incY+120(FP), R9 // R9 = incY + SHLQ $4, R9 // R9 *= sizeof(complex128) + MOVQ incDst+24(FP), R10 // R10 = incDst + SHLQ $4, R10 // R10 *= sizeof(complex128) + MOVUPS alpha+40(FP), X0 // X0 = { imag(a), real(a) } + MOVAPS X0, X1 + SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) } + MOVAPS X0, X10 // Copy X0 and X1 for pipelining + MOVAPS X1, X11 + MOVQ CX, BX + ANDQ $3, CX // CX = n % 4 + SHRQ $2, BX // BX = floor( n / 4 ) + JZ axpyi_tail // if BX == 0 { goto axpyi_tail } + +axpyi_loop: // do { + MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) } + MOVUPS (SI)(R8*1), X4 + LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) + + MOVUPS (SI), X6 + MOVUPS (SI)(R8*1), X8 + + // X_(i+1) = { real(x[i], real(x[i]) } + MOVDDUP_X2_X3 + MOVDDUP_X4_X5 + MOVDDUP_X6_X7 + MOVDDUP_X8_X9 + + // X_i = { imag(x[i]), imag(x[i]) } + SHUFPD $0x3, X2, X2 + SHUFPD $0x3, X4, X4 + SHUFPD $0x3, X6, X6 + SHUFPD $0x3, X8, X8 + + // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + 
// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + MULPD X1, X2 + MULPD X0, X3 + MULPD X11, X4 + MULPD X10, X5 + MULPD X1, X6 + MULPD X0, X7 + MULPD X11, X8 + MULPD X10, X9 + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDSUBPD_X4_X5 + ADDSUBPD_X6_X7 + ADDSUBPD_X8_X9 + + // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } + ADDPD (DX), X3 + ADDPD (DX)(R9*1), X5 + LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) + ADDPD (DX), X7 + ADDPD (DX)(R9*1), X9 + MOVUPS X3, (DI) // dst[i] = X_(i+1) + MOVUPS X5, (DI)(R10*1) + LEAQ (DI)(R10*2), DI + MOVUPS X7, (DI) + MOVUPS X9, (DI)(R10*1) + LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) + LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) + LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2]) + DECQ BX + JNZ axpyi_loop // } while --BX > 0 + CMPQ CX, $0 // if CX == 0 { return } + JE axpyi_end + +axpyi_tail: // do { + MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) } + MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) } + SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) } + MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + + // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } + ADDPD (DX), X3 + MOVUPS X3, (DI) // y[i] X_(i+1) + ADDQ R8, SI // SI += incX + ADDQ R9, DX // DX += incY + ADDQ R10, DI // DI += incDst + LOOP axpyi_tail // } while --CX > 0 + +axpyi_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyunitary_amd64.s new file mode 100644 index 0000000000..a6783255fd --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyunitary_amd64.s @@ -0,0 +1,122 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// MOVDDUP X2, X3 +#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA +// MOVDDUP X4, X5 +#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC +// MOVDDUP X6, X7 +#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE +// MOVDDUP X8, X9 +#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8 + +// ADDSUBPD X2, X3 +#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA +// ADDSUBPD X4, X5 +#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC +// ADDSUBPD X6, X7 +#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE +// ADDSUBPD X8, X9 +#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 + +// func AxpyUnitary(alpha complex128, x, y []complex128) +TEXT ·AxpyUnitary(SB), NOSPLIT, $0 + MOVQ x_base+16(FP), SI // SI = &x + MOVQ y_base+40(FP), DI // DI = &y + MOVQ x_len+24(FP), CX // CX = min( len(x), len(y) ) + CMPQ y_len+48(FP), CX + CMOVQLE y_len+48(FP), CX + CMPQ CX, $0 // if CX == 0 { return } + JE caxy_end + PXOR X0, X0 // Clear work registers and cache-align loop + PXOR X1, X1 + MOVUPS alpha+0(FP), X0 // X0 = { imag(a), real(a) } + MOVAPS X0, X1 + SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) } + XORQ AX, AX // i = 0 + MOVAPS X0, X10 // Copy X0 and X1 for pipelining + MOVAPS X1, X11 + MOVQ CX, BX + ANDQ $3, CX // CX = n % 4 + SHRQ $2, BX // BX = floor( n / 4 ) + JZ caxy_tail // if BX == 0 { goto caxy_tail } + +caxy_loop: // do { + MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) } + MOVUPS 16(SI)(AX*8), X4 + MOVUPS 32(SI)(AX*8), X6 + MOVUPS 48(SI)(AX*8), X8 + + // X_(i+1) = { real(x[i], real(x[i]) } + MOVDDUP_X2_X3 + MOVDDUP_X4_X5 + MOVDDUP_X6_X7 + MOVDDUP_X8_X9 + + // X_i = { imag(x[i]), imag(x[i]) } + SHUFPD $0x3, X2, X2 + SHUFPD $0x3, X4, X4 + SHUFPD $0x3, X6, X6 + SHUFPD $0x3, X8, X8 + + // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + MULPD X1, X2 + MULPD X0, X3 + MULPD X11, X4 + MULPD X10, X5 + MULPD X1, X6 + MULPD X0, X7 + MULPD X11, X8 + MULPD X10, X9 + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDSUBPD_X4_X5 + ADDSUBPD_X6_X7 + ADDSUBPD_X8_X9 + + // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } + ADDPD (DI)(AX*8), X3 + ADDPD 16(DI)(AX*8), X5 + ADDPD 32(DI)(AX*8), X7 + ADDPD 48(DI)(AX*8), X9 + MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1) + MOVUPS X5, 16(DI)(AX*8) + MOVUPS X7, 32(DI)(AX*8) + MOVUPS X9, 48(DI)(AX*8) + ADDQ $8, AX // i += 8 + DECQ BX + JNZ caxy_loop // } while --BX > 0 + CMPQ CX, $0 // if CX == 0 { return } + JE caxy_end + +caxy_tail: // do { + MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) } + MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) } + SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) } + MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + + // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } + ADDPD (DI)(AX*8), X3 + MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1) + ADDQ $2, AX // i += 2 + LOOP caxy_tail // } while --CX > 0 + +caxy_end: + 
RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyunitaryto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyunitaryto_amd64.s new file mode 100644 index 0000000000..64add6886c --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyunitaryto_amd64.s @@ -0,0 +1,123 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// MOVDDUP X2, X3 +#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA +// MOVDDUP X4, X5 +#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC +// MOVDDUP X6, X7 +#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE +// MOVDDUP X8, X9 +#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8 + +// ADDSUBPD X2, X3 +#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA +// ADDSUBPD X4, X5 +#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC +// ADDSUBPD X6, X7 +#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE +// ADDSUBPD X8, X9 +#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 + +// func AxpyUnitaryTo(dst []complex128, alpha complex64, x, y []complex128) +TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0 + MOVQ dst_base+0(FP), DI // DI = &dst + MOVQ x_base+40(FP), SI // SI = &x + MOVQ y_base+64(FP), DX // DX = &y + MOVQ x_len+48(FP), CX // CX = min( len(x), len(y), len(dst) ) + CMPQ y_len+72(FP), CX + CMOVQLE y_len+72(FP), CX + CMPQ dst_len+8(FP), CX + CMOVQLE dst_len+8(FP), CX + CMPQ CX, $0 // if CX == 0 { return } + JE caxy_end + MOVUPS alpha+24(FP), X0 // X0 = { imag(a), real(a) } + MOVAPS X0, X1 + SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) } + XORQ AX, AX // i = 0 + MOVAPS X0, X10 // Copy X0 and X1 for pipelining + MOVAPS X1, X11 + MOVQ CX, BX + ANDQ $3, CX // CX = n % 4 + SHRQ $2, BX // BX = floor( n / 4 ) + JZ caxy_tail // if BX == 0 { goto caxy_tail } + +caxy_loop: // do { + MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) } + MOVUPS 16(SI)(AX*8), X4 + MOVUPS 32(SI)(AX*8), X6 + MOVUPS 48(SI)(AX*8), X8 + + // X_(i+1) = { real(x[i], real(x[i]) } + MOVDDUP_X2_X3 // Load and duplicate imag elements (xi, xi) + MOVDDUP_X4_X5 + MOVDDUP_X6_X7 + MOVDDUP_X8_X9 + + // X_i = { imag(x[i]), imag(x[i]) } + SHUFPD $0x3, X2, X2 // duplicate real elements (xr, xr) + SHUFPD $0x3, X4, X4 + SHUFPD $0x3, X6, X6 + SHUFPD $0x3, X8, X8 + + // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + MULPD X1, X2 + MULPD X0, X3 + MULPD X11, X4 + MULPD X10, X5 + MULPD X1, X6 + MULPD X0, X7 + MULPD X11, X8 + MULPD X10, X9 + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDSUBPD_X4_X5 + ADDSUBPD_X6_X7 + ADDSUBPD_X8_X9 + + // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } + ADDPD (DX)(AX*8), X3 + ADDPD 16(DX)(AX*8), X5 + ADDPD 32(DX)(AX*8), X7 + ADDPD 48(DX)(AX*8), X9 + MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1) + MOVUPS X5, 16(DI)(AX*8) + MOVUPS X7, 32(DI)(AX*8) + MOVUPS X9, 48(DI)(AX*8) + ADDQ $8, AX // i += 8 + DECQ BX + JNZ caxy_loop // } while --BX > 0 + CMPQ CX, $0 // if CX == 0 { return } + JE caxy_end + +caxy_tail: // Same calculation, but read in values to avoid trampling memory + MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), 
real(x[i]) } + MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) } + SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) } + MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + + // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } + ADDPD (DX)(AX*8), X3 + MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1) + ADDQ $2, AX // i += 2 + LOOP caxy_tail // } while --CX > 0 + +caxy_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/doc.go b/vendor/gonum.org/v1/gonum/internal/asm/c128/doc.go new file mode 100644 index 0000000000..8802ff138a --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/doc.go @@ -0,0 +1,6 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package c128 provides complex128 vector primitives. +package c128 // import "gonum.org/v1/gonum/internal/asm/c128" diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/dotcinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotcinc_amd64.s new file mode 100644 index 0000000000..235f67e7a2 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotcinc_amd64.s @@ -0,0 +1,153 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define MOVDDUP_XPTR__X3 LONG $0x1E120FF2 // MOVDDUP (SI), X3 +#define MOVDDUP_XPTR_INCX__X5 LONG $0x120F42F2; WORD $0x062C // MOVDDUP (SI)(R8*1), X5 +#define MOVDDUP_XPTR_INCX_2__X7 LONG $0x120F42F2; WORD $0x463C // MOVDDUP (SI)(R8*2), X7 +#define MOVDDUP_XPTR_INCx3X__X9 LONG $0x120F46F2; WORD $0x0E0C // MOVDDUP (SI)(R9*1), X9 + +#define MOVDDUP_8_XPTR__X2 LONG $0x56120FF2; BYTE $0x08 // MOVDDUP 8(SI), X2 +#define MOVDDUP_8_XPTR_INCX__X4 LONG $0x120F42F2; WORD $0x0664; BYTE $0x08 // MOVDDUP 8(SI)(R8*1), X4 +#define MOVDDUP_8_XPTR_INCX_2__X6 LONG $0x120F42F2; WORD $0x4674; BYTE $0x08 // MOVDDUP 8(SI)(R8*2), X6 +#define MOVDDUP_8_XPTR_INCx3X__X8 LONG $0x120F46F2; WORD $0x0E44; BYTE $0x08 // MOVDDUP 8(SI)(R9*1), X8 + +#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3 +#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5 +#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7 +#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9 + +#define X_PTR SI +#define Y_PTR DI +#define LEN CX +#define TAIL BX +#define SUM X0 +#define P_SUM X1 +#define INC_X R8 +#define INCx3_X R9 +#define INC_Y R10 +#define INCx3_Y R11 +#define NEG1 X15 +#define P_NEG1 X14 + +// func DotcInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) +TEXT ·DotcInc(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), X_PTR // X_PTR = &x + MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y + MOVQ n+48(FP), LEN // LEN = n + PXOR SUM, SUM // SUM = 0 + CMPQ LEN, $0 // if LEN == 0 { return } + JE dot_end + PXOR P_SUM, P_SUM // P_SUM = 0 + MOVQ ix+72(FP), INC_X // INC_X = ix * sizeof(complex128) + SHLQ $4, INC_X + MOVQ iy+80(FP), INC_Y // INC_Y = iy * sizeof(complex128) + SHLQ $4, INC_Y + LEAQ (X_PTR)(INC_X*1), X_PTR // X_PTR = &(X_PTR[ix]) + LEAQ (Y_PTR)(INC_Y*1), Y_PTR // Y_PTR = &(Y_PTR[iy]) + MOVQ incX+56(FP), INC_X // 
INC_X = incX + SHLQ $4, INC_X // INC_X *= sizeof(complex128) + MOVQ incY+64(FP), INC_Y // INC_Y = incY + SHLQ $4, INC_Y // INC_Y *= sizeof(complex128) + MOVSD $(-1.0), NEG1 + SHUFPD $0, NEG1, NEG1 // { -1, -1 } + MOVQ LEN, TAIL + ANDQ $3, TAIL // TAIL = n % 4 + SHRQ $2, LEN // LEN = floor( n / 4 ) + JZ dot_tail // if n <= 4 { goto dot_tail } + MOVAPS NEG1, P_NEG1 // Copy NEG1 to P_NEG1 for pipelining + LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = 3 * incX * sizeof(complex128) + LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = 3 * incY * sizeof(complex128) + +dot_loop: // do { + MOVDDUP_XPTR__X3 // X_(i+1) = { real(x[i], real(x[i]) } + MOVDDUP_XPTR_INCX__X5 + MOVDDUP_XPTR_INCX_2__X7 + MOVDDUP_XPTR_INCx3X__X9 + + MOVDDUP_8_XPTR__X2 // X_i = { imag(x[i]), imag(x[i]) } + MOVDDUP_8_XPTR_INCX__X4 + MOVDDUP_8_XPTR_INCX_2__X6 + MOVDDUP_8_XPTR_INCx3X__X8 + + // X_i = { -imag(x[i]), -imag(x[i]) } + MULPD NEG1, X2 + MULPD P_NEG1, X4 + MULPD NEG1, X6 + MULPD P_NEG1, X8 + + // X_j = { imag(y[i]), real(y[i]) } + MOVUPS (Y_PTR), X10 + MOVUPS (Y_PTR)(INC_Y*1), X11 + MOVUPS (Y_PTR)(INC_Y*2), X12 + MOVUPS (Y_PTR)(INCx3_Y*1), X13 + + // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + MULPD X10, X3 + MULPD X11, X5 + MULPD X12, X7 + MULPD X13, X9 + + // X_j = { real(y[i]), imag(y[i]) } + SHUFPD $0x1, X10, X10 + SHUFPD $0x1, X11, X11 + SHUFPD $0x1, X12, X12 + SHUFPD $0x1, X13, X13 + + // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + MULPD X10, X2 + MULPD X11, X4 + MULPD X12, X6 + MULPD X13, X8 + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDSUBPD_X4_X5 + ADDSUBPD_X6_X7 + ADDSUBPD_X8_X9 + + // psum += result[i] + ADDPD X3, SUM + ADDPD X5, P_SUM + ADDPD X7, SUM + ADDPD X9, P_SUM + + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4]) + + DECQ LEN + JNZ dot_loop // } while --LEN > 0 + ADDPD P_SUM, SUM // sum += psum + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE dot_end + +dot_tail: // do { + MOVDDUP_XPTR__X3 // X_(i+1) = { real(x[i], real(x[i]) } + MOVDDUP_8_XPTR__X2 // X_i = { imag(x[i]), imag(x[i]) } + MULPD NEG1, X2 // X_i = { -imag(x[i]) , -imag(x[i]) } + MOVUPS (Y_PTR), X10 // X_j = { imag(y[i]) , real(y[i]) } + MULPD X10, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + SHUFPD $0x1, X10, X10 // X_j = { real(y[i]) , imag(y[i]) } + MULPD X10, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDPD X3, SUM // sum += result[i] + ADDQ INC_X, X_PTR // X_PTR += incX + ADDQ INC_Y, Y_PTR // Y_PTR += incY + DECQ TAIL + JNZ dot_tail // } while --TAIL > 0 + +dot_end: + MOVUPS SUM, sum+88(FP) + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/dotcunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotcunitary_amd64.s new file mode 100644 index 0000000000..0ffd0f1289 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotcunitary_amd64.s @@ -0,0 +1,143 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
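+//
+// This file implements DotcUnitary, the conjugating dot product
+// sum = Σ conj(x[i]) * y[i] over contiguous complex128 slices; the
+// conjugation appears below as the multiplication of imag(x[i]) by -1.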
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define MOVDDUP_XPTR_IDX_8__X3    LONG $0x1C120FF2; BYTE $0xC6 // MOVDDUP (SI)(AX*8), X3
+#define MOVDDUP_16_XPTR_IDX_8__X5 LONG $0x6C120FF2; WORD $0x10C6 // MOVDDUP 16(SI)(AX*8), X5
+#define MOVDDUP_32_XPTR_IDX_8__X7 LONG $0x7C120FF2; WORD $0x20C6 // MOVDDUP 32(SI)(AX*8), X7
+#define MOVDDUP_48_XPTR_IDX_8__X9 LONG $0x120F44F2; WORD $0xC64C; BYTE $0x30 // MOVDDUP 48(SI)(AX*8), X9
+
+#define MOVDDUP_XPTR_IIDX_8__X2    LONG $0x14120FF2; BYTE $0xD6 // MOVDDUP (SI)(DX*8), X2
+#define MOVDDUP_16_XPTR_IIDX_8__X4 LONG $0x64120FF2; WORD $0x10D6 // MOVDDUP 16(SI)(DX*8), X4
+#define MOVDDUP_32_XPTR_IIDX_8__X6 LONG $0x74120FF2; WORD $0x20D6 // MOVDDUP 32(SI)(DX*8), X6
+#define MOVDDUP_48_XPTR_IIDX_8__X8 LONG $0x120F44F2; WORD $0xD644; BYTE $0x30 // MOVDDUP 48(SI)(DX*8), X8
+
+#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3
+#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5
+#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7
+#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define SUM X0
+#define P_SUM X1
+#define IDX AX
+#define I_IDX DX
+#define NEG1 X15
+#define P_NEG1 X14
+
+// func DotcUnitary(x, y []complex128) (sum complex128)
+TEXT ·DotcUnitary(SB), NOSPLIT, $0
+	MOVQ x_base+0(FP), X_PTR  // X_PTR = &x
+	MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
+	MOVQ x_len+8(FP), LEN     // LEN = min( len(x), len(y) )
+	CMPQ y_len+32(FP), LEN
+	CMOVQLE y_len+32(FP), LEN
+	PXOR SUM, SUM // sum = 0
+	CMPQ LEN, $0  // if LEN == 0 { return }
+	JE dot_end
+	XORPS P_SUM, P_SUM // psum = 0
+	MOVSD $(-1.0), NEG1
+	SHUFPD $0, NEG1, NEG1 // { -1, -1 }
+	XORQ IDX, IDX  // i := 0
+	MOVQ $1, I_IDX // j := 1
+	MOVQ LEN, TAIL
+	ANDQ $3, TAIL // TAIL = LEN % 4
+	SHRQ $2, LEN  // LEN = floor( LEN / 4 )
+	JZ dot_tail   // if LEN == 0 { goto dot_tail }
+
+	MOVAPS NEG1, P_NEG1 // Copy NEG1 to P_NEG1 for pipelining
+
+dot_loop: // do {
+	MOVDDUP_XPTR_IDX_8__X3 // X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_16_XPTR_IDX_8__X5
+	MOVDDUP_32_XPTR_IDX_8__X7
+	MOVDDUP_48_XPTR_IDX_8__X9
+
+	MOVDDUP_XPTR_IIDX_8__X2 // X_i = { imag(x[i]), imag(x[i]) }
+	MOVDDUP_16_XPTR_IIDX_8__X4
+	MOVDDUP_32_XPTR_IIDX_8__X6
+	MOVDDUP_48_XPTR_IIDX_8__X8
+
+	// X_i = { -imag(x[i]), -imag(x[i]) }
+	MULPD NEG1, X2
+	MULPD P_NEG1, X4
+	MULPD NEG1, X6
+	MULPD P_NEG1, X8
+
+	// X_j = { imag(y[i]), real(y[i]) }
+	MOVUPS (Y_PTR)(IDX*8), X10
+	MOVUPS 16(Y_PTR)(IDX*8), X11
+	MOVUPS 32(Y_PTR)(IDX*8), X12
+	MOVUPS 48(Y_PTR)(IDX*8), X13
+
+	// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
+	MULPD X10, X3
+	MULPD X11, X5
+	MULPD X12, X7
+	MULPD X13, X9
+
+	// X_j = { real(y[i]), imag(y[i]) }
+	SHUFPD $0x1, X10, X10
+	SHUFPD $0x1, X11, X11
+	SHUFPD $0x1, X12, X12
+	SHUFPD $0x1, X13, X13
+
+	// X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
+	MULPD X10, X2
+	MULPD X11, X4
+	MULPD X12, X6
+	MULPD X13, X8
+
+	// X_(i+1) = {
+	//	imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
+	// }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	// psum += result[i]
+	ADDPD X3, SUM
+	ADDPD X5, P_SUM
+	ADDPD X7, SUM
+	ADDPD X9, P_SUM
+
+	ADDQ $8, IDX   // IDX += 8
+	ADDQ $8, I_IDX // I_IDX += 8
+	DECQ LEN
+	JNZ dot_loop     // } while --LEN > 0
+	ADDPD P_SUM, SUM // sum += psum
+	CMPQ TAIL, $0    // if TAIL == 0 { return }
+	JE dot_end
+
+dot_tail: // do {
+	MOVDDUP_XPTR_IDX_8__X3 // X_(i+1) = { real(x[i]) , real(x[i]) }
+ MOVDDUP_XPTR_IIDX_8__X2 // X_i = { imag(x[i]) , imag(x[i]) } + MULPD NEG1, X2 // X_i = { -imag(x[i]) , -imag(x[i]) } + MOVUPS (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]) , real(y[i]) } + MULPD X10, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + SHUFPD $0x1, X10, X10 // X_j = { real(y[i]) , imag(y[i]) } + MULPD X10, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDPD X3, SUM // SUM += result[i] + ADDQ $2, IDX // IDX += 2 + ADDQ $2, I_IDX // I_IDX += 2 + DECQ TAIL + JNZ dot_tail // } while --TAIL > 0 + +dot_end: + MOVUPS SUM, sum+48(FP) + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/dotuinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotuinc_amd64.s new file mode 100644 index 0000000000..74fe5c3ba5 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotuinc_amd64.s @@ -0,0 +1,141 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define MOVDDUP_XPTR__X3 LONG $0x1E120FF2 // MOVDDUP (SI), X3 +#define MOVDDUP_XPTR_INCX__X5 LONG $0x120F42F2; WORD $0x062C // MOVDDUP (SI)(R8*1), X5 +#define MOVDDUP_XPTR_INCX_2__X7 LONG $0x120F42F2; WORD $0x463C // MOVDDUP (SI)(R8*2), X7 +#define MOVDDUP_XPTR_INCx3X__X9 LONG $0x120F46F2; WORD $0x0E0C // MOVDDUP (SI)(R9*1), X9 + +#define MOVDDUP_8_XPTR__X2 LONG $0x56120FF2; BYTE $0x08 // MOVDDUP 8(SI), X2 +#define MOVDDUP_8_XPTR_INCX__X4 LONG $0x120F42F2; WORD $0x0664; BYTE $0x08 // MOVDDUP 8(SI)(R8*1), X4 +#define MOVDDUP_8_XPTR_INCX_2__X6 LONG $0x120F42F2; WORD $0x4674; BYTE $0x08 // MOVDDUP 8(SI)(R8*2), X6 +#define MOVDDUP_8_XPTR_INCx3X__X8 LONG $0x120F46F2; WORD $0x0E44; BYTE $0x08 // MOVDDUP 8(SI)(R9*1), X8 + +#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3 +#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5 +#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7 +#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9 + +#define X_PTR SI +#define Y_PTR DI +#define LEN CX +#define TAIL BX +#define SUM X0 +#define P_SUM X1 +#define INC_X R8 +#define INCx3_X R9 +#define INC_Y R10 +#define INCx3_Y R11 + +// func DotuInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) +TEXT ·DotuInc(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), X_PTR // X_PTR = &x + MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y + MOVQ n+48(FP), LEN // LEN = n + PXOR SUM, SUM // sum = 0 + CMPQ LEN, $0 // if LEN == 0 { return } + JE dot_end + MOVQ ix+72(FP), INC_X // INC_X = ix * sizeof(complex128) + SHLQ $4, INC_X + MOVQ iy+80(FP), INC_Y // INC_Y = iy * sizeof(complex128) + SHLQ $4, INC_Y + LEAQ (X_PTR)(INC_X*1), X_PTR // X_PTR = &(X_PTR[ix]) + LEAQ (Y_PTR)(INC_Y*1), Y_PTR // Y_PTR = &(Y_PTR[iy]) + MOVQ incX+56(FP), INC_X // INC_X = incX + SHLQ $4, INC_X // INC_X *= sizeof(complex128) + MOVQ incY+64(FP), INC_Y // INC_Y = incY + SHLQ $4, INC_Y // INC_Y *= sizeof(complex128) + MOVQ LEN, TAIL + ANDQ $3, TAIL // LEN = LEN % 4 + SHRQ $2, LEN // LEN = floor( LEN / 4 ) + JZ dot_tail // if LEN <= 4 { goto dot_tail } + PXOR P_SUM, P_SUM // psum = 0 + LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = 3 * incX * sizeof(complex128) + LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = 3 * incY * sizeof(complex128) + +dot_loop: // do { + MOVDDUP_XPTR__X3 // X_(i+1) = { 
real(x[i], real(x[i]) } + MOVDDUP_XPTR_INCX__X5 + MOVDDUP_XPTR_INCX_2__X7 + MOVDDUP_XPTR_INCx3X__X9 + + MOVDDUP_8_XPTR__X2 // X_i = { imag(x[i]), imag(x[i]) } + MOVDDUP_8_XPTR_INCX__X4 + MOVDDUP_8_XPTR_INCX_2__X6 + MOVDDUP_8_XPTR_INCx3X__X8 + + // X_j = { imag(y[i]), real(y[i]) } + MOVUPS (Y_PTR), X10 + MOVUPS (Y_PTR)(INC_Y*1), X11 + MOVUPS (Y_PTR)(INC_Y*2), X12 + MOVUPS (Y_PTR)(INCx3_Y*1), X13 + + // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + MULPD X10, X3 + MULPD X11, X5 + MULPD X12, X7 + MULPD X13, X9 + + // X_j = { real(y[i]), imag(y[i]) } + SHUFPD $0x1, X10, X10 + SHUFPD $0x1, X11, X11 + SHUFPD $0x1, X12, X12 + SHUFPD $0x1, X13, X13 + + // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + MULPD X10, X2 + MULPD X11, X4 + MULPD X12, X6 + MULPD X13, X8 + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDSUBPD_X4_X5 + ADDSUBPD_X6_X7 + ADDSUBPD_X8_X9 + + // psum += result[i] + ADDPD X3, SUM + ADDPD X5, P_SUM + ADDPD X7, SUM + ADDPD X9, P_SUM + + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4]) + + DECQ LEN + JNZ dot_loop // } while --BX > 0 + ADDPD P_SUM, SUM // sum += psum + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE dot_end + +dot_tail: // do { + MOVDDUP_XPTR__X3 // X_(i+1) = { real(x[i], real(x[i]) } + MOVDDUP_8_XPTR__X2 // X_i = { imag(x[i]), imag(x[i]) } + MOVUPS (Y_PTR), X10 // X_j = { imag(y[i]) , real(y[i]) } + MULPD X10, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } + SHUFPD $0x1, X10, X10 // X_j = { real(y[i]) , imag(y[i]) } + MULPD X10, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDPD X3, SUM // sum += result[i] + ADDQ INC_X, X_PTR // X_PTR += incX + ADDQ INC_Y, Y_PTR // Y_PTR += incY + DECQ TAIL // --TAIL + JNZ dot_tail // } while TAIL > 0 + +dot_end: + MOVUPS SUM, sum+88(FP) + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/dotuunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotuunitary_amd64.s new file mode 100644 index 0000000000..8df019881b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotuunitary_amd64.s @@ -0,0 +1,130 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
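
DotuUnitary below is the unconjugated variant of the kernel above: the NEG1 multiplies are dropped, so it reduces sum += x[i] * y[i] directly. A plain-Go statement of that reduction (dotuRef is an illustrative name, not part of this patch):

package sketch

// dotuRef states the unconjugated reduction: sum += x[i]*y[i], e.g.
// (1+2i)*(3-1i) = 5+5i.
func dotuRef(x, y []complex128) complex128 {
	var sum complex128
	for i, v := range x {
		sum += v * y[i]
	}
	return sum
}
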
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define MOVDDUP_XPTR_IDX_8__X3    LONG $0x1C120FF2; BYTE $0xC6 // MOVDDUP (SI)(AX*8), X3
+#define MOVDDUP_16_XPTR_IDX_8__X5 LONG $0x6C120FF2; WORD $0x10C6 // MOVDDUP 16(SI)(AX*8), X5
+#define MOVDDUP_32_XPTR_IDX_8__X7 LONG $0x7C120FF2; WORD $0x20C6 // MOVDDUP 32(SI)(AX*8), X7
+#define MOVDDUP_48_XPTR_IDX_8__X9 LONG $0x120F44F2; WORD $0xC64C; BYTE $0x30 // MOVDDUP 48(SI)(AX*8), X9
+
+#define MOVDDUP_XPTR_IIDX_8__X2    LONG $0x14120FF2; BYTE $0xD6 // MOVDDUP (SI)(DX*8), X2
+#define MOVDDUP_16_XPTR_IIDX_8__X4 LONG $0x64120FF2; WORD $0x10D6 // MOVDDUP 16(SI)(DX*8), X4
+#define MOVDDUP_32_XPTR_IIDX_8__X6 LONG $0x74120FF2; WORD $0x20D6 // MOVDDUP 32(SI)(DX*8), X6
+#define MOVDDUP_48_XPTR_IIDX_8__X8 LONG $0x120F44F2; WORD $0xD644; BYTE $0x30 // MOVDDUP 48(SI)(DX*8), X8
+
+#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3
+#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5
+#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7
+#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define SUM X0
+#define P_SUM X1
+#define IDX AX
+#define I_IDX DX
+
+// func DotuUnitary(x, y []complex128) (sum complex128)
+TEXT ·DotuUnitary(SB), NOSPLIT, $0
+	MOVQ x_base+0(FP), X_PTR  // X_PTR = &x
+	MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
+	MOVQ x_len+8(FP), LEN     // LEN = min( len(x), len(y) )
+	CMPQ y_len+32(FP), LEN
+	CMOVQLE y_len+32(FP), LEN
+	PXOR SUM, SUM // SUM = 0
+	CMPQ LEN, $0  // if LEN == 0 { return }
+	JE dot_end
+	PXOR P_SUM, P_SUM // P_SUM = 0
+	XORQ IDX, IDX     // IDX = 0
+	MOVQ $1, DX       // j = 1
+	MOVQ LEN, TAIL
+	ANDQ $3, TAIL // TAIL = LEN % 4
+	SHRQ $2, LEN  // LEN = floor( LEN / 4 )
+	JZ dot_tail   // if LEN == 0 { goto dot_tail }
+
+dot_loop: // do {
+	MOVDDUP_XPTR_IDX_8__X3 // X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_16_XPTR_IDX_8__X5
+	MOVDDUP_32_XPTR_IDX_8__X7
+	MOVDDUP_48_XPTR_IDX_8__X9
+
+	MOVDDUP_XPTR_IIDX_8__X2 // X_i = { imag(x[i]), imag(x[i]) }
+	MOVDDUP_16_XPTR_IIDX_8__X4
+	MOVDDUP_32_XPTR_IIDX_8__X6
+	MOVDDUP_48_XPTR_IIDX_8__X8
+
+	// X_j = { imag(y[i]), real(y[i]) }
+	MOVUPS (Y_PTR)(IDX*8), X10
+	MOVUPS 16(Y_PTR)(IDX*8), X11
+	MOVUPS 32(Y_PTR)(IDX*8), X12
+	MOVUPS 48(Y_PTR)(IDX*8), X13
+
+	// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
+	MULPD X10, X3
+	MULPD X11, X5
+	MULPD X12, X7
+	MULPD X13, X9
+
+	// X_j = { real(y[i]), imag(y[i]) }
+	SHUFPD $0x1, X10, X10
+	SHUFPD $0x1, X11, X11
+	SHUFPD $0x1, X12, X12
+	SHUFPD $0x1, X13, X13
+
+	// X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
+	MULPD X10, X2
+	MULPD X11, X4
+	MULPD X12, X6
+	MULPD X13, X8
+
+	// X_(i+1) = {
+	//	imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
+	// }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	// psum += result[i]
+	ADDPD X3, SUM
+	ADDPD X5, P_SUM
+	ADDPD X7, SUM
+	ADDPD X9, P_SUM
+
+	ADDQ $8, IDX   // IDX += 8
+	ADDQ $8, I_IDX // I_IDX += 8
+	DECQ LEN
+	JNZ dot_loop     // } while --LEN > 0
+	ADDPD P_SUM, SUM // SUM += P_SUM
+	CMPQ TAIL, $0    // if TAIL == 0 { return }
+	JE dot_end
+
+dot_tail: // do {
+	MOVDDUP_XPTR_IDX_8__X3  // X_(i+1) = { real(x[i]) , real(x[i]) }
+	MOVDDUP_XPTR_IIDX_8__X2 // X_i = { imag(x[i]) , imag(x[i]) }
+	MOVUPS (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]) , real(y[i]) }
+	MULPD X10, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
+	SHUFPD $0x1, X10, X10 // X_j = { real(y[i]) , imag(y[i]) }
+	MULPD X10,
X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + + // X_(i+1) = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDPD X3, SUM // psum += result[i] + ADDQ $2, IDX // IDX += 2 + ADDQ $2, I_IDX // I_IDX += 2 + DECQ TAIL // --TAIL + JNZ dot_tail // } while TAIL > 0 + +dot_end: + MOVUPS SUM, sum+48(FP) + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/dscalinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/dscalinc_amd64.s new file mode 100644 index 0000000000..77a28ccead --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/dscalinc_amd64.s @@ -0,0 +1,69 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define SRC SI +#define DST SI +#define LEN CX +#define TAIL BX +#define INC R9 +#define INC3 R10 +#define ALPHA X0 +#define ALPHA_2 X1 + +#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // MOVDDUP 8(SP), X0 + +// func DscalInc(alpha float64, x []complex128, n, inc uintptr) +TEXT ·DscalInc(SB), NOSPLIT, $0 + MOVQ x_base+8(FP), SRC // SRC = &x + MOVQ n+32(FP), LEN // LEN = n + CMPQ LEN, $0 // if LEN == 0 { return } + JE dscal_end + + MOVDDUP_ALPHA // ALPHA = alpha + MOVQ inc+40(FP), INC // INC = inc + SHLQ $4, INC // INC = INC * sizeof(complex128) + LEAQ (INC)(INC*2), INC3 // INC3 = 3 * INC + MOVUPS ALPHA, ALPHA_2 // Copy ALPHA and ALPHA_2 for pipelining + MOVQ LEN, TAIL // TAIL = LEN + SHRQ $2, LEN // LEN = floor( n / 4 ) + JZ dscal_tail // if LEN == 0 { goto dscal_tail } + +dscal_loop: // do { + MOVUPS (SRC), X2 // X_i = x[i] + MOVUPS (SRC)(INC*1), X3 + MOVUPS (SRC)(INC*2), X4 + MOVUPS (SRC)(INC3*1), X5 + + MULPD ALPHA, X2 // X_i *= ALPHA + MULPD ALPHA_2, X3 + MULPD ALPHA, X4 + MULPD ALPHA_2, X5 + + MOVUPS X2, (DST) // x[i] = X_i + MOVUPS X3, (DST)(INC*1) + MOVUPS X4, (DST)(INC*2) + MOVUPS X5, (DST)(INC3*1) + + LEAQ (SRC)(INC*4), SRC // SRC += INC*4 + DECQ LEN + JNZ dscal_loop // } while --LEN > 0 + +dscal_tail: + ANDQ $3, TAIL // TAIL = TAIL % 4 + JE dscal_end // if TAIL == 0 { return } + +dscal_tail_loop: // do { + MOVUPS (SRC), X2 // X_i = x[i] + MULPD ALPHA, X2 // X_i *= ALPHA + MOVUPS X2, (DST) // x[i] = X_i + ADDQ INC, SRC // SRC += INC + DECQ TAIL + JNZ dscal_tail_loop // } while --TAIL > 0 + +dscal_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/dscalunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/dscalunitary_amd64.s new file mode 100644 index 0000000000..9fa91e4624 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/dscalunitary_amd64.s @@ -0,0 +1,66 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
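
DscalUnitary below scales a complex128 vector by a real alpha: MOVDDUP broadcasts alpha into both 64-bit lanes so a single MULPD scales the real and imaginary parts together. A scalar Go sketch of the same operation (dscalRef is an illustrative name, not part of this patch):

package sketch

// dscalRef mirrors DscalUnitary: a real alpha scales both components
// of each element, which the kernel does with one broadcast MULPD.
func dscalRef(alpha float64, x []complex128) {
	for i, v := range x {
		x[i] = complex(real(v)*alpha, imag(v)*alpha)
	}
}
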
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define SRC SI +#define DST SI +#define LEN CX +#define IDX AX +#define TAIL BX +#define ALPHA X0 +#define ALPHA_2 X1 + +#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // MOVDDUP 8(SP), X0 + +// func DscalUnitary(alpha float64, x []complex128) +TEXT ·DscalUnitary(SB), NOSPLIT, $0 + MOVQ x_base+8(FP), SRC // SRC = &x + MOVQ x_len+16(FP), LEN // LEN = len(x) + CMPQ LEN, $0 // if LEN == 0 { return } + JE dscal_end + + MOVDDUP_ALPHA // ALPHA = alpha + XORQ IDX, IDX // IDX = 0 + MOVUPS ALPHA, ALPHA_2 // Copy ALPHA to ALPHA_2 for pipelining + MOVQ LEN, TAIL // TAIL = LEN + SHRQ $2, LEN // LEN = floor( n / 4 ) + JZ dscal_tail // if LEN == 0 { goto dscal_tail } + +dscal_loop: // do { + MOVUPS (SRC)(IDX*8), X2 // X_i = x[i] + MOVUPS 16(SRC)(IDX*8), X3 + MOVUPS 32(SRC)(IDX*8), X4 + MOVUPS 48(SRC)(IDX*8), X5 + + MULPD ALPHA, X2 // X_i *= ALPHA + MULPD ALPHA_2, X3 + MULPD ALPHA, X4 + MULPD ALPHA_2, X5 + + MOVUPS X2, (DST)(IDX*8) // x[i] = X_i + MOVUPS X3, 16(DST)(IDX*8) + MOVUPS X4, 32(DST)(IDX*8) + MOVUPS X5, 48(DST)(IDX*8) + + ADDQ $8, IDX // IDX += 8 + DECQ LEN + JNZ dscal_loop // } while --LEN > 0 + +dscal_tail: + ANDQ $3, TAIL // TAIL = TAIL % 4 + JZ dscal_end // if TAIL == 0 { return } + +dscal_tail_loop: // do { + MOVUPS (SRC)(IDX*8), X2 // X_i = x[i] + MULPD ALPHA, X2 // X_i *= ALPHA + MOVUPS X2, (DST)(IDX*8) // x[i] = X_i + ADDQ $2, IDX // IDX += 2 + DECQ TAIL + JNZ dscal_tail_loop // } while --TAIL > 0 + +dscal_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/scal.go b/vendor/gonum.org/v1/gonum/internal/asm/c128/scal.go new file mode 100644 index 0000000000..27c3581752 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/scal.go @@ -0,0 +1,33 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package c128 + +// ScalUnitaryTo is +// +// for i, v := range x { +// dst[i] = alpha * v +// } +func ScalUnitaryTo(dst []complex128, alpha complex128, x []complex128) { + for i, v := range x { + dst[i] = alpha * v + } +} + +// ScalIncTo is +// +// var idst, ix uintptr +// for i := 0; i < int(n); i++ { +// dst[idst] = alpha * x[ix] +// ix += incX +// idst += incDst +// } +func ScalIncTo(dst []complex128, incDst uintptr, alpha complex128, x []complex128, n, incX uintptr) { + var idst, ix uintptr + for i := 0; i < int(n); i++ { + dst[idst] = alpha * x[ix] + ix += incX + idst += incDst + } +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/scalUnitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/scalUnitary_amd64.s new file mode 100644 index 0000000000..b76037fdd0 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/scalUnitary_amd64.s @@ -0,0 +1,116 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
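
ScalUnitary below multiplies each element by a complex alpha. The kernel keeps alpha in two lane orders (ALPHA and the swapped ALPHA_C), splits each x[i] into duplicated-real and duplicated-imaginary halves, and lets ADDSUBPD recombine the four partial products. The equivalent scalar arithmetic, as a hedged Go sketch (scalLanes is an illustrative name):

package sketch

// scalLanes spells out the complex multiply behind ScalUnitary: with
// alpha = ar+ai*i, each element becomes (ar*xr-ai*xi) + (ai*xr+ar*xi)*i,
// exactly the two lanes the ADDSUBPD step produces.
func scalLanes(alpha complex128, x []complex128) {
	ar, ai := real(alpha), imag(alpha)
	for i, v := range x {
		xr, xi := real(v), imag(v)
		x[i] = complex(ar*xr-ai*xi, ai*xr+ar*xi)
	}
}
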
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define SRC SI +#define DST SI +#define LEN CX +#define IDX AX +#define TAIL BX +#define ALPHA X0 +#define ALPHA_C X1 +#define ALPHA2 X10 +#define ALPHA_C2 X11 + +#define MOVDDUP_X2_X3 LONG $0xDA120FF2 // MOVDDUP X2, X3 +#define MOVDDUP_X4_X5 LONG $0xEC120FF2 // MOVDDUP X4, X5 +#define MOVDDUP_X6_X7 LONG $0xFE120FF2 // MOVDDUP X6, X7 +#define MOVDDUP_X8_X9 LONG $0x120F45F2; BYTE $0xC8 // MOVDDUP X8, X9 + +#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3 +#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5 +#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7 +#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9 + +// func ScalUnitary(alpha complex128, x []complex128) +TEXT ·ScalUnitary(SB), NOSPLIT, $0 + MOVQ x_base+16(FP), SRC // SRC = &x + MOVQ x_len+24(FP), LEN // LEN = len(x) + CMPQ LEN, $0 // if LEN == 0 { return } + JE scal_end + + MOVUPS alpha+0(FP), ALPHA // ALPHA = { imag(alpha), real(alpha) } + MOVAPS ALPHA, ALPHA_C + SHUFPD $0x1, ALPHA_C, ALPHA_C // ALPHA_C = { real(alpha), imag(alpha) } + + XORQ IDX, IDX // IDX = 0 + MOVAPS ALPHA, ALPHA2 // Copy ALPHA and ALPHA_C for pipelining + MOVAPS ALPHA_C, ALPHA_C2 + MOVQ LEN, TAIL + SHRQ $2, LEN // LEN = floor( n / 4 ) + JZ scal_tail // if BX == 0 { goto scal_tail } + +scal_loop: // do { + MOVUPS (SRC)(IDX*8), X2 // X_i = { imag(x[i]), real(x[i]) } + MOVUPS 16(SRC)(IDX*8), X4 + MOVUPS 32(SRC)(IDX*8), X6 + MOVUPS 48(SRC)(IDX*8), X8 + + // X_(i+1) = { real(x[i], real(x[i]) } + MOVDDUP_X2_X3 + MOVDDUP_X4_X5 + MOVDDUP_X6_X7 + MOVDDUP_X8_X9 + + // X_i = { imag(x[i]), imag(x[i]) } + SHUFPD $0x3, X2, X2 + SHUFPD $0x3, X4, X4 + SHUFPD $0x3, X6, X6 + SHUFPD $0x3, X8, X8 + + // X_i = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) } + // X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) } + MULPD ALPHA_C, X2 + MULPD ALPHA, X3 + MULPD ALPHA_C2, X4 + MULPD ALPHA2, X5 + MULPD ALPHA_C, X6 + MULPD ALPHA, X7 + MULPD ALPHA_C2, X8 + MULPD ALPHA2, X9 + + // X_(i+1) = { + // imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]), + // real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDSUBPD_X4_X5 + ADDSUBPD_X6_X7 + ADDSUBPD_X8_X9 + + MOVUPS X3, (DST)(IDX*8) // x[i] = X_(i+1) + MOVUPS X5, 16(DST)(IDX*8) + MOVUPS X7, 32(DST)(IDX*8) + MOVUPS X9, 48(DST)(IDX*8) + ADDQ $8, IDX // IDX += 8 + DECQ LEN + JNZ scal_loop // } while --LEN > 0 + +scal_tail: + ANDQ $3, TAIL // TAIL = TAIL % 4 + JZ scal_end // if TAIL == 0 { return } + +scal_tail_loop: // do { + MOVUPS (SRC)(IDX*8), X2 // X_i = { imag(x[i]), real(x[i]) } + MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) } + SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) } + MULPD ALPHA_C, X2 // X_i = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) } + MULPD ALPHA, X3 // X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) } + + // X_(i+1) = { + // imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]), + // real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + + MOVUPS X3, (DST)(IDX*8) // x[i] = X_(i+1) + ADDQ $2, IDX // IDX += 2 + DECQ TAIL + JNZ scal_tail_loop // } while --LEN > 0 + +scal_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/scalinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/scalinc_amd64.s new file mode 100644 index 0000000000..6e0e51b658 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/scalinc_amd64.s @@ -0,0 +1,121 @@ 
+// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define SRC SI +#define DST SI +#define LEN CX +#define TAIL BX +#define INC R9 +#define INC3 R10 +#define ALPHA X0 +#define ALPHA_C X1 +#define ALPHA2 X10 +#define ALPHA_C2 X11 + +#define MOVDDUP_X2_X3 LONG $0xDA120FF2 // MOVDDUP X2, X3 +#define MOVDDUP_X4_X5 LONG $0xEC120FF2 // MOVDDUP X4, X5 +#define MOVDDUP_X6_X7 LONG $0xFE120FF2 // MOVDDUP X6, X7 +#define MOVDDUP_X8_X9 LONG $0x120F45F2; BYTE $0xC8 // MOVDDUP X8, X9 + +#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3 +#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5 +#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7 +#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9 + +// func ScalInc(alpha complex128, x []complex128, n, inc uintptr) +TEXT ·ScalInc(SB), NOSPLIT, $0 + MOVQ x_base+16(FP), SRC // SRC = &x + MOVQ n+40(FP), LEN // LEN = len(x) + CMPQ LEN, $0 + JE scal_end // if LEN == 0 { return } + + MOVQ inc+48(FP), INC // INC = inc + SHLQ $4, INC // INC = INC * sizeof(complex128) + LEAQ (INC)(INC*2), INC3 // INC3 = 3 * INC + + MOVUPS alpha+0(FP), ALPHA // ALPHA = { imag(alpha), real(alpha) } + MOVAPS ALPHA, ALPHA_C + SHUFPD $0x1, ALPHA_C, ALPHA_C // ALPHA_C = { real(alpha), imag(alpha) } + + MOVAPS ALPHA, ALPHA2 // Copy ALPHA and ALPHA_C for pipelining + MOVAPS ALPHA_C, ALPHA_C2 + MOVQ LEN, TAIL + SHRQ $2, LEN // LEN = floor( n / 4 ) + JZ scal_tail // if BX == 0 { goto scal_tail } + +scal_loop: // do { + MOVUPS (SRC), X2 // X_i = { imag(x[i]), real(x[i]) } + MOVUPS (SRC)(INC*1), X4 + MOVUPS (SRC)(INC*2), X6 + MOVUPS (SRC)(INC3*1), X8 + + // X_(i+1) = { real(x[i], real(x[i]) } + MOVDDUP_X2_X3 + MOVDDUP_X4_X5 + MOVDDUP_X6_X7 + MOVDDUP_X8_X9 + + // X_i = { imag(x[i]), imag(x[i]) } + SHUFPD $0x3, X2, X2 + SHUFPD $0x3, X4, X4 + SHUFPD $0x3, X6, X6 + SHUFPD $0x3, X8, X8 + + // X_i = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) } + // X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) } + MULPD ALPHA_C, X2 + MULPD ALPHA, X3 + MULPD ALPHA_C2, X4 + MULPD ALPHA2, X5 + MULPD ALPHA_C, X6 + MULPD ALPHA, X7 + MULPD ALPHA_C2, X8 + MULPD ALPHA2, X9 + + // X_(i+1) = { + // imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]), + // real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + ADDSUBPD_X4_X5 + ADDSUBPD_X6_X7 + ADDSUBPD_X8_X9 + + MOVUPS X3, (DST) // x[i] = X_(i+1) + MOVUPS X5, (DST)(INC*1) + MOVUPS X7, (DST)(INC*2) + MOVUPS X9, (DST)(INC3*1) + + LEAQ (SRC)(INC*4), SRC // SRC = &(SRC[inc*4]) + DECQ LEN + JNZ scal_loop // } while --BX > 0 + +scal_tail: + ANDQ $3, TAIL // TAIL = TAIL % 4 + JE scal_end // if TAIL == 0 { return } + +scal_tail_loop: // do { + MOVUPS (SRC), X2 // X_i = { imag(x[i]), real(x[i]) } + MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) } + SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) } + MULPD ALPHA_C, X2 // X_i = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) } + MULPD ALPHA, X3 // X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) } + + // X_(i+1) = { + // imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]), + // real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i]) + // } + ADDSUBPD_X2_X3 + + MOVUPS X3, (DST) // x[i] = X_i + ADDQ INC, SRC // SRC = &(SRC[incX]) + DECQ TAIL + JNZ scal_tail_loop // } while --TAIL > 0 + +scal_end: + RET 
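
All of these kernels share one control-flow shape: an unrolled main loop over floor(n/4) groups that alternates between two accumulators (SUM and P_SUM) to hide instruction latency, followed by a one-element tail loop for the n%4 remainder. Roughly, in Go (unrolledSum is an illustrative stand-in using float64, not part of this patch):

package sketch

// unrolledSum shows, in scalar Go, the loop shape the assembly uses: a
// four-wide main loop feeding two accumulators, then a scalar tail.
func unrolledSum(x []float64) float64 {
	var s0, s1 float64 // like SUM and P_SUM in the assembly
	n := len(x) / 4 * 4
	for i := 0; i < n; i += 4 {
		s0 += x[i] + x[i+2] // the asm alternates target registers the same way
		s1 += x[i+1] + x[i+3]
	}
	for _, v := range x[n:] { // the *_tail loop: at most three elements
		s0 += v
	}
	return s0 + s1 // the kernels fold P_SUM into SUM when the main loop exits
}

Splitting the reduction across two registers lets independent adds overlap in the pipeline; the single fold at the end restores the exact scalar result for addition-commutative inputs.
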
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs.go b/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs.go new file mode 100644 index 0000000000..9c3a8fb83d --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs.go @@ -0,0 +1,180 @@ +// Copyright ©2020 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package c128 + +import ( + "math" + "math/cmplx" +) + +// Add is +// +// for i, v := range s { +// dst[i] += v +// } +func Add(dst, s []complex128) { + for i, v := range s { + dst[i] += v + } +} + +// AddConst is +// +// for i := range x { +// x[i] += alpha +// } +func AddConst(alpha complex128, x []complex128) { + for i := range x { + x[i] += alpha + } +} + +// CumSum is +// +// if len(s) == 0 { +// return dst +// } +// dst[0] = s[0] +// for i, v := range s[1:] { +// dst[i+1] = dst[i] + v +// } +// return dst +func CumSum(dst, s []complex128) []complex128 { + if len(s) == 0 { + return dst + } + dst[0] = s[0] + for i, v := range s[1:] { + dst[i+1] = dst[i] + v + } + return dst +} + +// CumProd is +// +// if len(s) == 0 { +// return dst +// } +// dst[0] = s[0] +// for i, v := range s[1:] { +// dst[i+1] = dst[i] * v +// } +// return dst +func CumProd(dst, s []complex128) []complex128 { + if len(s) == 0 { + return dst + } + dst[0] = s[0] + for i, v := range s[1:] { + dst[i+1] = dst[i] * v + } + return dst +} + +// Div is +// +// for i, v := range s { +// dst[i] /= v +// } +func Div(dst, s []complex128) { + for i, v := range s { + dst[i] /= v + } +} + +// DivTo is +// +// for i, v := range s { +// dst[i] = v / t[i] +// } +// return dst +func DivTo(dst, s, t []complex128) []complex128 { + for i, v := range s { + dst[i] = v / t[i] + } + return dst +} + +// DotUnitary is +// +// for i, v := range x { +// sum += cmplx.Conj(v) * y[i] +// } +// return sum +func DotUnitary(x, y []complex128) (sum complex128) { + for i, v := range x { + sum += cmplx.Conj(v) * y[i] + } + return sum +} + +// L2DistanceUnitary returns the L2-norm of x-y. +func L2DistanceUnitary(x, y []complex128) (norm float64) { + var scale float64 + sumSquares := 1.0 + for i, v := range x { + v -= y[i] + if v == 0 { + continue + } + absxi := cmplx.Abs(v) + if math.IsNaN(absxi) { + return math.NaN() + } + if scale < absxi { + s := scale / absxi + sumSquares = 1 + sumSquares*s*s + scale = absxi + } else { + s := absxi / scale + sumSquares += s * s + } + } + if math.IsInf(scale, 1) { + return math.Inf(1) + } + return scale * math.Sqrt(sumSquares) +} + +// L2NormUnitary returns the L2-norm of x. 
+func L2NormUnitary(x []complex128) (norm float64) { + var scale float64 + sumSquares := 1.0 + for _, v := range x { + if v == 0 { + continue + } + absxi := cmplx.Abs(v) + if math.IsNaN(absxi) { + return math.NaN() + } + if scale < absxi { + s := scale / absxi + sumSquares = 1 + sumSquares*s*s + scale = absxi + } else { + s := absxi / scale + sumSquares += s * s + } + } + if math.IsInf(scale, 1) { + return math.Inf(1) + } + return scale * math.Sqrt(sumSquares) +} + +// Sum is +// +// var sum complex128 +// for i := range x { +// sum += x[i] +// } +func Sum(x []complex128) complex128 { + var sum complex128 + for _, v := range x { + sum += v + } + return sum +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs_amd64.go b/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs_amd64.go new file mode 100644 index 0000000000..c0e26a2f1e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs_amd64.go @@ -0,0 +1,109 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !noasm && !gccgo && !safe +// +build !noasm,!gccgo,!safe + +package c128 + +// AxpyUnitary is +// +// for i, v := range x { +// y[i] += alpha * v +// } +func AxpyUnitary(alpha complex128, x, y []complex128) + +// AxpyUnitaryTo is +// +// for i, v := range x { +// dst[i] = alpha*v + y[i] +// } +func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128) + +// AxpyInc is +// +// for i := 0; i < int(n); i++ { +// y[iy] += alpha * x[ix] +// ix += incX +// iy += incY +// } +func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) + +// AxpyIncTo is +// +// for i := 0; i < int(n); i++ { +// dst[idst] = alpha*x[ix] + y[iy] +// ix += incX +// iy += incY +// idst += incDst +// } +func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) + +// DscalUnitary is +// +// for i, v := range x { +// x[i] = complex(real(v)*alpha, imag(v)*alpha) +// } +func DscalUnitary(alpha float64, x []complex128) + +// DscalInc is +// +// var ix uintptr +// for i := 0; i < int(n); i++ { +// x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha) +// ix += inc +// } +func DscalInc(alpha float64, x []complex128, n, inc uintptr) + +// ScalInc is +// +// var ix uintptr +// for i := 0; i < int(n); i++ { +// x[ix] *= alpha +// ix += incX +// } +func ScalInc(alpha complex128, x []complex128, n, inc uintptr) + +// ScalUnitary is +// +// for i := range x { +// x[i] *= alpha +// } +func ScalUnitary(alpha complex128, x []complex128) + +// DotcUnitary is +// +// for i, v := range x { +// sum += y[i] * cmplx.Conj(v) +// } +// return sum +func DotcUnitary(x, y []complex128) (sum complex128) + +// DotcInc is +// +// for i := 0; i < int(n); i++ { +// sum += y[iy] * cmplx.Conj(x[ix]) +// ix += incX +// iy += incY +// } +// return sum +func DotcInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) + +// DotuUnitary is +// +// for i, v := range x { +// sum += y[i] * v +// } +// return sum +func DotuUnitary(x, y []complex128) (sum complex128) + +// DotuInc is +// +// for i := 0; i < int(n); i++ { +// sum += y[iy] * x[ix] +// ix += incX +// iy += incY +// } +// return sum +func DotuInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs_noasm.go b/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs_noasm.go new file mode 100644 index 
0000000000..21dfc4a8e1 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs_noasm.go @@ -0,0 +1,176 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 || noasm || gccgo || safe +// +build !amd64 noasm gccgo safe + +package c128 + +import "math/cmplx" + +// AxpyUnitary is +// +// for i, v := range x { +// y[i] += alpha * v +// } +func AxpyUnitary(alpha complex128, x, y []complex128) { + for i, v := range x { + y[i] += alpha * v + } +} + +// AxpyUnitaryTo is +// +// for i, v := range x { +// dst[i] = alpha*v + y[i] +// } +func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128) { + for i, v := range x { + dst[i] = alpha*v + y[i] + } +} + +// AxpyInc is +// +// for i := 0; i < int(n); i++ { +// y[iy] += alpha * x[ix] +// ix += incX +// iy += incY +// } +func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) { + for i := 0; i < int(n); i++ { + y[iy] += alpha * x[ix] + ix += incX + iy += incY + } +} + +// AxpyIncTo is +// +// for i := 0; i < int(n); i++ { +// dst[idst] = alpha*x[ix] + y[iy] +// ix += incX +// iy += incY +// idst += incDst +// } +func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) { + for i := 0; i < int(n); i++ { + dst[idst] = alpha*x[ix] + y[iy] + ix += incX + iy += incY + idst += incDst + } +} + +// DscalUnitary is +// +// for i, v := range x { +// x[i] = complex(real(v)*alpha, imag(v)*alpha) +// } +func DscalUnitary(alpha float64, x []complex128) { + for i, v := range x { + x[i] = complex(real(v)*alpha, imag(v)*alpha) + } +} + +// DscalInc is +// +// var ix uintptr +// for i := 0; i < int(n); i++ { +// x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha) +// ix += inc +// } +func DscalInc(alpha float64, x []complex128, n, inc uintptr) { + var ix uintptr + for i := 0; i < int(n); i++ { + x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha) + ix += inc + } +} + +// ScalInc is +// +// var ix uintptr +// for i := 0; i < int(n); i++ { +// x[ix] *= alpha +// ix += incX +// } +func ScalInc(alpha complex128, x []complex128, n, inc uintptr) { + var ix uintptr + for i := 0; i < int(n); i++ { + x[ix] *= alpha + ix += inc + } +} + +// ScalUnitary is +// +// for i := range x { +// x[i] *= alpha +// } +func ScalUnitary(alpha complex128, x []complex128) { + for i := range x { + x[i] *= alpha + } +} + +// DotcUnitary is +// +// for i, v := range x { +// sum += y[i] * cmplx.Conj(v) +// } +// return sum +func DotcUnitary(x, y []complex128) (sum complex128) { + for i, v := range x { + sum += y[i] * cmplx.Conj(v) + } + return sum +} + +// DotcInc is +// +// for i := 0; i < int(n); i++ { +// sum += y[iy] * cmplx.Conj(x[ix]) +// ix += incX +// iy += incY +// } +// return sum +func DotcInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) { + for i := 0; i < int(n); i++ { + sum += y[iy] * cmplx.Conj(x[ix]) + ix += incX + iy += incY + } + return sum +} + +// DotuUnitary is +// +// for i, v := range x { +// sum += y[i] * v +// } +// return sum +func DotuUnitary(x, y []complex128) (sum complex128) { + for i, v := range x { + sum += y[i] * v + } + return sum +} + +// DotuInc is +// +// for i := 0; i < int(n); i++ { +// sum += y[iy] * x[ix] +// ix += incX +// iy += incY +// } +// return sum +func DotuInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) { + for i := 0; i < int(n); i++ { + sum += y[iy] 
* x[ix] + ix += incX + iy += incY + } + return sum +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyinc_amd64.s new file mode 100644 index 0000000000..4d2c5e9ad5 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyinc_amd64.s @@ -0,0 +1,151 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// MOVSHDUP X3, X2 +#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3 +// MOVSLDUP X3, X3 +#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB +// ADDSUBPS X2, X3 +#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA + +// MOVSHDUP X5, X4 +#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5 +// MOVSLDUP X5, X5 +#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED +// ADDSUBPS X4, X5 +#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC + +// MOVSHDUP X7, X6 +#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7 +// MOVSLDUP X7, X7 +#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF +// ADDSUBPS X6, X7 +#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE + +// MOVSHDUP X9, X8 +#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1 +// MOVSLDUP X9, X9 +#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9 +// ADDSUBPS X8, X9 +#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 + +// func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) +TEXT ·AxpyInc(SB), NOSPLIT, $0 + MOVQ x_base+8(FP), SI // SI = &x + MOVQ y_base+32(FP), DI // DI = &y + MOVQ n+56(FP), CX // CX = n + CMPQ CX, $0 // if n==0 { return } + JE axpyi_end + MOVQ ix+80(FP), R8 // R8 = ix + MOVQ iy+88(FP), R9 // R9 = iy + LEAQ (SI)(R8*8), SI // SI = &(x[ix]) + LEAQ (DI)(R9*8), DI // DI = &(y[iy]) + MOVQ DI, DX // DX = DI // Read/Write pointers + MOVQ incX+64(FP), R8 // R8 = incX + SHLQ $3, R8 // R8 *= sizeof(complex64) + MOVQ incY+72(FP), R9 // R9 = incY + SHLQ $3, R9 // R9 *= sizeof(complex64) + MOVSD alpha+0(FP), X0 // X0 = { 0, 0, imag(a), real(a) } + MOVAPS X0, X1 + SHUFPS $0x11, X1, X1 // X1 = { 0, 0, real(a), imag(a) } + MOVAPS X0, X10 // Copy X0 and X1 for pipelining + MOVAPS X1, X11 + MOVQ CX, BX + ANDQ $3, CX // CX = n % 4 + SHRQ $2, BX // BX = floor( n / 4 ) + JZ axpyi_tail // if BX == 0 { goto axpyi_tail } + +axpyi_loop: // do { + MOVSD (SI), X3 // X_i = { imag(x[i+1]), real(x[i+1]) } + MOVSD (SI)(R8*1), X5 + LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) + MOVSD (SI), X7 + MOVSD (SI)(R8*1), X9 + + // X_(i-1) = { imag(x[i]), imag(x[i]) } + MOVSHDUP_X3_X2 + MOVSHDUP_X5_X4 + MOVSHDUP_X7_X6 + MOVSHDUP_X9_X8 + + // X_i = { real(x[i]), real(x[i]) } + MOVSLDUP_X3_X3 + MOVSLDUP_X5_X5 + MOVSLDUP_X7_X7 + MOVSLDUP_X9_X9 + + // X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + // X_i = { imag(a) * real(x[i]), real(a) * real(x[i]) } + MULPS X1, X2 + MULPS X0, X3 + MULPS X11, X4 + MULPS X10, X5 + MULPS X1, X6 + MULPS X0, X7 + MULPS X11, X8 + MULPS X10, X9 + + // X_i = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]), + // } + ADDSUBPS_X2_X3 + ADDSUBPS_X4_X5 + ADDSUBPS_X6_X7 + ADDSUBPS_X8_X9 + + // X_i = { imag(result[i]) + imag(y[i]), 
real(result[i]) + real(y[i]) }
+	MOVSD (DX), X2
+	MOVSD (DX)(R9*1), X4
+	LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
+	MOVSD (DX), X6
+	MOVSD (DX)(R9*1), X8
+	ADDPS X2, X3
+	ADDPS X4, X5
+	ADDPS X6, X7
+	ADDPS X8, X9
+
+	MOVSD X3, (DI) // y[i] = X_i
+	MOVSD X5, (DI)(R9*1)
+	LEAQ (DI)(R9*2), DI // DI = &(DI[incDst])
+	MOVSD X7, (DI)
+	MOVSD X9, (DI)(R9*1)
+	LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
+	LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
+	LEAQ (DI)(R9*2), DI // DI = &(DI[incDst])
+	DECQ BX
+	JNZ axpyi_loop // } while --BX > 0
+	CMPQ CX, $0    // if CX == 0 { return }
+	JE axpyi_end
+
+axpyi_tail: // do {
+	MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) }
+
+	// X_i = { imag(a) * real(x[i]), real(a) * real(x[i]) }
+	// X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
+	MULPS X1, X2
+	MULPS X0, X3
+
+	// X_i = {
+	//	imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]),
+	// }
+	ADDSUBPS_X2_X3 // (ai*x1r+ar*x1i, ar*x1r-ai*x1i)
+
+	// X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	MOVSD (DI), X4
+	ADDPS X4, X3
+	MOVSD X3, (DI) // y[i] = X_i
+	ADDQ R8, SI    // SI += incX
+	ADDQ R9, DI    // DI += incY
+	LOOP axpyi_tail // } while --CX > 0
+
+axpyi_end:
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyincto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyincto_amd64.s
new file mode 100644
index 0000000000..1519f2d9b3
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyincto_amd64.s
@@ -0,0 +1,156 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
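
AxpyIncTo below is the strided, three-slice form: dst[idst] = alpha*x[ix] + y[iy], with each slice advancing by its own increment. Its plain-Go meaning, mirroring the documented stub signatures (axpyIncToRef is an illustrative name, not part of this patch):

package sketch

// axpyIncToRef states, in plain Go, the strided operation the
// AxpyIncTo kernel implements.
func axpyIncToRef(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) {
	for i := 0; i < int(n); i++ {
		dst[idst] = alpha*x[ix] + y[iy] // one complex multiply-add per step
		ix += incX
		iy += incY
		idst += incDst
	}
}
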
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// MOVSHDUP X3, X2 +#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3 +// MOVSLDUP X3, X3 +#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB +// ADDSUBPS X2, X3 +#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA + +// MOVSHDUP X5, X4 +#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5 +// MOVSLDUP X5, X5 +#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED +// ADDSUBPS X4, X5 +#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC + +// MOVSHDUP X7, X6 +#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7 +// MOVSLDUP X7, X7 +#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF +// ADDSUBPS X6, X7 +#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE + +// MOVSHDUP X9, X8 +#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1 +// MOVSLDUP X9, X9 +#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9 +// ADDSUBPS X8, X9 +#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 + +// func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) +TEXT ·AxpyIncTo(SB), NOSPLIT, $0 + MOVQ dst_base+0(FP), DI // DI = &dst + MOVQ x_base+48(FP), SI // SI = &x + MOVQ y_base+72(FP), DX // DX = &y + MOVQ n+96(FP), CX // CX = n + CMPQ CX, $0 // if n==0 { return } + JE axpyi_end + MOVQ ix+120(FP), R8 // Load the first index + MOVQ iy+128(FP), R9 + MOVQ idst+32(FP), R10 + LEAQ (SI)(R8*8), SI // SI = &(x[ix]) + LEAQ (DX)(R9*8), DX // DX = &(y[iy]) + LEAQ (DI)(R10*8), DI // DI = &(dst[idst]) + MOVQ incX+104(FP), R8 // Incrementors*8 for easy iteration (ADDQ) + SHLQ $3, R8 + MOVQ incY+112(FP), R9 + SHLQ $3, R9 + MOVQ incDst+24(FP), R10 + SHLQ $3, R10 + MOVSD alpha+40(FP), X0 // X0 = { 0, 0, imag(a), real(a) } + MOVAPS X0, X1 + SHUFPS $0x11, X1, X1 // X1 = { 0, 0, real(a), imag(a) } + MOVAPS X0, X10 // Copy X0 and X1 for pipelining + MOVAPS X1, X11 + MOVQ CX, BX + ANDQ $3, CX // CX = n % 4 + SHRQ $2, BX // BX = floor( n / 4 ) + JZ axpyi_tail // if BX == 0 { goto axpyi_tail } + +axpyi_loop: // do { + MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) } + MOVSD (SI)(R8*1), X5 + LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) + MOVSD (SI), X7 + MOVSD (SI)(R8*1), X9 + + // X_(i-1) = { imag(x[i]), imag(x[i]) } + MOVSHDUP_X3_X2 + MOVSHDUP_X5_X4 + MOVSHDUP_X7_X6 + MOVSHDUP_X9_X8 + + // X_i = { real(x[i]), real(x[i]) } + MOVSLDUP_X3_X3 + MOVSLDUP_X5_X5 + MOVSLDUP_X7_X7 + MOVSLDUP_X9_X9 + + // X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + // X_i = { imag(a) * real(x[i]), real(a) * real(x[i]) } + MULPS X1, X2 + MULPS X0, X3 + MULPS X11, X4 + MULPS X10, X5 + MULPS X1, X6 + MULPS X0, X7 + MULPS X11, X8 + MULPS X10, X9 + + // X_i = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]), + // } + ADDSUBPS_X2_X3 + ADDSUBPS_X4_X5 + ADDSUBPS_X6_X7 + ADDSUBPS_X8_X9 + + // X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } + MOVSD (DX), X2 + MOVSD (DX)(R9*1), X4 + LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) + MOVSD (DX), X6 + MOVSD (DX)(R9*1), X8 + ADDPS X2, X3 + ADDPS X4, X5 + ADDPS X6, X7 + ADDPS X8, X9 + + MOVSD X3, (DI) // y[i] = X_i + MOVSD X5, (DI)(R10*1) + LEAQ (DI)(R10*2), DI // DI = &(DI[incDst]) + MOVSD X7, (DI) + MOVSD X9, (DI)(R10*1) + LEAQ (SI)(R8*2), SI // SI = 
&(SI[incX*2]) + LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) + LEAQ (DI)(R10*2), DI // DI = &(DI[incDst]) + DECQ BX + JNZ axpyi_loop // } while --BX > 0 + CMPQ CX, $0 // if CX == 0 { return } + JE axpyi_end + +axpyi_tail: + MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) } + MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) } + MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) } + + // X_i = { imag(a) * real(x[i]), real(a) * real(x[i]) } + // X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + MULPS X1, X2 + MULPS X0, X3 + + // X_i = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]), + // } + ADDSUBPS_X2_X3 + + // X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } + MOVSD (DX), X4 + ADDPS X4, X3 + MOVSD X3, (DI) // y[i] = X_i + ADDQ R8, SI // SI += incX + ADDQ R9, DX // DX += incY + ADDQ R10, DI // DI += incDst + LOOP axpyi_tail // } while --CX > 0 + +axpyi_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyunitary_amd64.s new file mode 100644 index 0000000000..71274c92cc --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyunitary_amd64.s @@ -0,0 +1,160 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// MOVSHDUP X3, X2 +#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3 +// MOVSLDUP X3, X3 +#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB +// ADDSUBPS X2, X3 +#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA + +// MOVSHDUP X5, X4 +#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5 +// MOVSLDUP X5, X5 +#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED +// ADDSUBPS X4, X5 +#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC + +// MOVSHDUP X7, X6 +#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7 +// MOVSLDUP X7, X7 +#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF +// ADDSUBPS X6, X7 +#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE + +// MOVSHDUP X9, X8 +#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1 +// MOVSLDUP X9, X9 +#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9 +// ADDSUBPS X8, X9 +#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 + +// func AxpyUnitary(alpha complex64, x, y []complex64) +TEXT ·AxpyUnitary(SB), NOSPLIT, $0 + MOVQ x_base+8(FP), SI // SI = &x + MOVQ y_base+32(FP), DI // DI = &y + MOVQ x_len+16(FP), CX // CX = min( len(x), len(y) ) + CMPQ y_len+40(FP), CX + CMOVQLE y_len+40(FP), CX + CMPQ CX, $0 // if CX == 0 { return } + JE caxy_end + PXOR X0, X0 // Clear work registers and cache-align loop + PXOR X1, X1 + MOVSD alpha+0(FP), X0 // X0 = { 0, 0, imag(a), real(a) } + SHUFPD $0, X0, X0 // X0 = { imag(a), real(a), imag(a), real(a) } + MOVAPS X0, X1 + SHUFPS $0x11, X1, X1 // X1 = { real(a), imag(a), real(a), imag(a) } + XORQ AX, AX // i = 0 + MOVQ DI, BX // Align on 16-byte boundary for ADDPS + ANDQ $15, BX // BX = &y & 15 + JZ caxy_no_trim // if BX == 0 { goto caxy_no_trim } + + // Trim first value in unaligned buffer + XORPS X2, X2 // Clear work registers and cache-align loop + XORPS X3, X3 + XORPS X4, X4 + MOVSD 
(SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) } + MOVSHDUP_X3_X2 // X2 = { imag(x[i]), imag(x[i]) } + MOVSLDUP_X3_X3 // X3 = { real(x[i]), real(x[i]) } + MULPS X1, X2 // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + MULPS X0, X3 // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) } + + // X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), real(a)*real(x[i]) - imag(a)*imag(x[i]) } + ADDSUBPS_X2_X3 + MOVSD (DI)(AX*8), X4 // X3 += y[i] + ADDPS X4, X3 + MOVSD X3, (DI)(AX*8) // y[i] = X3 + INCQ AX // i++ + DECQ CX // --CX + JZ caxy_end // if CX == 0 { return } + +caxy_no_trim: + MOVAPS X0, X10 // Copy X0 and X1 for pipelineing + MOVAPS X1, X11 + MOVQ CX, BX + ANDQ $7, CX // CX = n % 8 + SHRQ $3, BX // BX = floor( n / 8 ) + JZ caxy_tail // if BX == 0 { goto caxy_tail } + +caxy_loop: // do { + // X_i = { imag(x[i]), real(x[i]), imag(x[i+1]), real(x[i+1]) } + MOVUPS (SI)(AX*8), X3 + MOVUPS 16(SI)(AX*8), X5 + MOVUPS 32(SI)(AX*8), X7 + MOVUPS 48(SI)(AX*8), X9 + + // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i]+1), imag(x[i]+1) } + MOVSHDUP_X3_X2 + MOVSHDUP_X5_X4 + MOVSHDUP_X7_X6 + MOVSHDUP_X9_X8 + + // X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) } + MOVSLDUP_X3_X3 + MOVSLDUP_X5_X5 + MOVSLDUP_X7_X7 + MOVSLDUP_X9_X9 + + // X_i = { imag(a) * real(x[i]), real(a) * real(x[i]), + // imag(a) * real(x[i+1]), real(a) * real(x[i+1]) } + // X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]), + // real(a) * imag(x[i+1]), imag(a) * imag(x[i+1]) } + MULPS X1, X2 + MULPS X0, X3 + MULPS X11, X4 + MULPS X10, X5 + MULPS X1, X6 + MULPS X0, X7 + MULPS X11, X8 + MULPS X10, X9 + + // X_i = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]), + // imag(result[i+1]): imag(a)*real(x[i+1]) + real(a)*imag(x[i+1]), + // real(result[i+1]): real(a)*real(x[i+1]) - imag(a)*imag(x[i+1]), + // } + ADDSUBPS_X2_X3 + ADDSUBPS_X4_X5 + ADDSUBPS_X6_X7 + ADDSUBPS_X8_X9 + + // X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]), + // imag(result[i+1]) + imag(y[i+1]), real(result[i+1]) + real(y[i+1]) } + ADDPS (DI)(AX*8), X3 + ADDPS 16(DI)(AX*8), X5 + ADDPS 32(DI)(AX*8), X7 + ADDPS 48(DI)(AX*8), X9 + MOVUPS X3, (DI)(AX*8) // y[i:i+1] = X_i + MOVUPS X5, 16(DI)(AX*8) + MOVUPS X7, 32(DI)(AX*8) + MOVUPS X9, 48(DI)(AX*8) + ADDQ $8, AX // i += 8 + DECQ BX // --BX + JNZ caxy_loop // } while BX > 0 + CMPQ CX, $0 // if CX == 0 { return } + JE caxy_end + +caxy_tail: // do { + MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) } + MOVSHDUP_X3_X2 // X2 = { imag(x[i]), imag(x[i]) } + MOVSLDUP_X3_X3 // X3 = { real(x[i]), real(x[i]) } + MULPS X1, X2 // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + MULPS X0, X3 // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) } + + // X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(a)*real(x[i]) - imag(a)*imag(x[i]) } + ADDSUBPS_X2_X3 + MOVSD (DI)(AX*8), X4 // X3 += y[i] + ADDPS X4, X3 + MOVSD X3, (DI)(AX*8) // y[i] = X3 + INCQ AX // ++i + LOOP caxy_tail // } while --CX > 0 + +caxy_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyunitaryto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyunitaryto_amd64.s new file mode 100644 index 0000000000..2e80d8ca94 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyunitaryto_amd64.s @@ -0,0 +1,157 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
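
AxpyUnitaryTo below repeats the alignment trim used by AxpyUnitary above: if the y pointer is not 16-byte aligned, one complex64 (8 bytes) is processed scalar-style first, after which the unrolled loop's ADDPS loads from y are aligned. A small Go sketch of that peeling decision (peel is an illustrative name; the kernel performs the test with ANDQ $15):

package sketch

// peel sketches the single-element alignment trim: complex64 values
// are 8 bytes, so when the pointer sits at an odd multiple of 8, one
// scalar step brings it to a 16-byte boundary.
func peel(yAddr uintptr) int {
	if yAddr&15 != 0 { // the ANDQ $15 test in the prologue
		return 1
	}
	return 0
}
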
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// MOVSHDUP X3, X2 +#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3 +// MOVSLDUP X3, X3 +#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB +// ADDSUBPS X2, X3 +#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA + +// MOVSHDUP X5, X4 +#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5 +// MOVSLDUP X5, X5 +#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED +// ADDSUBPS X4, X5 +#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC + +// MOVSHDUP X7, X6 +#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7 +// MOVSLDUP X7, X7 +#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF +// ADDSUBPS X6, X7 +#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE + +// MOVSHDUP X9, X8 +#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1 +// MOVSLDUP X9, X9 +#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9 +// ADDSUBPS X8, X9 +#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 + +// func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64) +TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0 + MOVQ dst_base+0(FP), DI // DI = &dst + MOVQ x_base+32(FP), SI // SI = &x + MOVQ y_base+56(FP), DX // DX = &y + MOVQ x_len+40(FP), CX + CMPQ y_len+64(FP), CX // CX = min( len(x), len(y), len(dst) ) + CMOVQLE y_len+64(FP), CX + CMPQ dst_len+8(FP), CX + CMOVQLE dst_len+8(FP), CX + CMPQ CX, $0 // if CX == 0 { return } + JE caxy_end + MOVSD alpha+24(FP), X0 // X0 = { 0, 0, imag(a), real(a) } + SHUFPD $0, X0, X0 // X0 = { imag(a), real(a), imag(a), real(a) } + MOVAPS X0, X1 + SHUFPS $0x11, X1, X1 // X1 = { real(a), imag(a), real(a), imag(a) } + XORQ AX, AX // i = 0 + MOVQ DX, BX // Align on 16-byte boundary for ADDPS + ANDQ $15, BX // BX = &y & 15 + JZ caxy_no_trim // if BX == 0 { goto caxy_no_trim } + + MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) } + MOVSHDUP_X3_X2 // X2 = { imag(x[i]), imag(x[i]) } + MOVSLDUP_X3_X3 // X3 = { real(x[i]), real(x[i]) } + MULPS X1, X2 // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + MULPS X0, X3 // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) } + + // X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), real(a)*real(x[i]) - imag(a)*imag(x[i]) } + ADDSUBPS_X2_X3 + MOVSD (DX)(AX*8), X4 // X3 += y[i] + ADDPS X4, X3 + MOVSD X3, (DI)(AX*8) // dst[i] = X3 + INCQ AX // i++ + DECQ CX // --CX + JZ caxy_tail // if BX == 0 { goto caxy_tail } + +caxy_no_trim: + MOVAPS X0, X10 // Copy X0 and X1 for pipelineing + MOVAPS X1, X11 + MOVQ CX, BX + ANDQ $7, CX // CX = n % 8 + SHRQ $3, BX // BX = floor( n / 8 ) + JZ caxy_tail // if BX == 0 { goto caxy_tail } + +caxy_loop: + // X_i = { imag(x[i]), real(x[i]), imag(x[i+1]), real(x[i+1]) } + MOVUPS (SI)(AX*8), X3 + MOVUPS 16(SI)(AX*8), X5 + MOVUPS 32(SI)(AX*8), X7 + MOVUPS 48(SI)(AX*8), X9 + + // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i]+1), imag(x[i]+1) } + MOVSHDUP_X3_X2 + MOVSHDUP_X5_X4 + MOVSHDUP_X7_X6 + MOVSHDUP_X9_X8 + + // X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) } + MOVSLDUP_X3_X3 + MOVSLDUP_X5_X5 + MOVSLDUP_X7_X7 + MOVSLDUP_X9_X9 + + // X_i = { imag(a) * real(x[i]), real(a) * real(x[i]), + // imag(a) * real(x[i+1]), real(a) * real(x[i+1]) } + // X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]), + // real(a) * imag(x[i+1]), imag(a) * imag(x[i+1]) } + MULPS X1, X2 + MULPS X0, X3 + MULPS X11, X4 + 
MULPS X10, X5 + MULPS X1, X6 + MULPS X0, X7 + MULPS X11, X8 + MULPS X10, X9 + + // X_i = { + // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]), + // imag(result[i+1]): imag(a)*real(x[i+1]) + real(a)*imag(x[i+1]), + // real(result[i+1]): real(a)*real(x[i+1]) - imag(a)*imag(x[i+1]), + // } + ADDSUBPS_X2_X3 + ADDSUBPS_X4_X5 + ADDSUBPS_X6_X7 + ADDSUBPS_X8_X9 + + // X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]), + // imag(result[i+1]) + imag(y[i+1]), real(result[i+1]) + real(y[i+1]) } + ADDPS (DX)(AX*8), X3 + ADDPS 16(DX)(AX*8), X5 + ADDPS 32(DX)(AX*8), X7 + ADDPS 48(DX)(AX*8), X9 + MOVUPS X3, (DI)(AX*8) // y[i:i+1] = X_i + MOVUPS X5, 16(DI)(AX*8) + MOVUPS X7, 32(DI)(AX*8) + MOVUPS X9, 48(DI)(AX*8) + ADDQ $8, AX // i += 8 + DECQ BX // --BX + JNZ caxy_loop // } while BX > 0 + CMPQ CX, $0 // if CX == 0 { return } + JE caxy_end + +caxy_tail: // do { + MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) } + MOVSHDUP_X3_X2 // X2 = { imag(x[i]), imag(x[i]) } + MOVSLDUP_X3_X3 // X3 = { real(x[i]), real(x[i]) } + MULPS X1, X2 // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } + MULPS X0, X3 // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) } + + // X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), + // real(a)*real(x[i]) - imag(a)*imag(x[i]) } + ADDSUBPS_X2_X3 + MOVSD (DX)(AX*8), X4 // X3 += y[i] + ADDPS X4, X3 + MOVSD X3, (DI)(AX*8) // y[i] = X3 + INCQ AX // ++i + LOOP caxy_tail // } while --CX > 0 + +caxy_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/conj.go b/vendor/gonum.org/v1/gonum/internal/asm/c64/conj.go new file mode 100644 index 0000000000..910e1e5c73 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/conj.go @@ -0,0 +1,7 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package c64 + +func conj(c complex64) complex64 { return complex(real(c), -imag(c)) } diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/doc.go b/vendor/gonum.org/v1/gonum/internal/asm/c64/doc.go new file mode 100644 index 0000000000..35f1b2a26b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/doc.go @@ -0,0 +1,6 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package c64 provides complex64 vector primitives. +package c64 // import "gonum.org/v1/gonum/internal/asm/c64" diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/dotcinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotcinc_amd64.s new file mode 100644 index 0000000000..8efda0bb77 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotcinc_amd64.s @@ -0,0 +1,160 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
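
The c64 DotcInc kernel below is the complex64, strided version of the conjugated dot product: imag(x) is negated with a broadcast -1 (NEG1) and the partial products are fused with ADDSUBPS. Stated in Go (dotcRef32 is an illustrative name, not part of this patch):

package sketch

// dotcRef32 states the strided, conjugated reduction DotcInc computes;
// the kernel negates imag(x) with a broadcast -1 rather than a conj
// helper. For example, conj(1+1i)*(1+2i) = 3+1i.
func dotcRef32(x, y []complex64, n, incX, incY, ix, iy uintptr) complex64 {
	var sum complex64
	for i := 0; i < int(n); i++ {
		v := x[ix]
		sum += complex(real(v), -imag(v)) * y[iy]
		ix += incX
		iy += incY
	}
	return sum
}
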
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define MOVSHDUP_X3_X2 LONG $0xD3160FF3 // MOVSHDUP X3, X2 +#define MOVSHDUP_X5_X4 LONG $0xE5160FF3 // MOVSHDUP X5, X4 +#define MOVSHDUP_X7_X6 LONG $0xF7160FF3 // MOVSHDUP X7, X6 +#define MOVSHDUP_X9_X8 LONG $0x160F45F3; BYTE $0xC1 // MOVSHDUP X9, X8 + +#define MOVSLDUP_X3_X3 LONG $0xDB120FF3 // MOVSLDUP X3, X3 +#define MOVSLDUP_X5_X5 LONG $0xED120FF3 // MOVSLDUP X5, X5 +#define MOVSLDUP_X7_X7 LONG $0xFF120FF3 // MOVSLDUP X7, X7 +#define MOVSLDUP_X9_X9 LONG $0x120F45F3; BYTE $0xC9 // MOVSLDUP X9, X9 + +#define ADDSUBPS_X2_X3 LONG $0xDAD00FF2 // ADDSUBPS X2, X3 +#define ADDSUBPS_X4_X5 LONG $0xECD00FF2 // ADDSUBPS X4, X5 +#define ADDSUBPS_X6_X7 LONG $0xFED00FF2 // ADDSUBPS X6, X7 +#define ADDSUBPS_X8_X9 LONG $0xD00F45F2; BYTE $0xC8 // ADDSUBPS X8, X9 + +#define X_PTR SI +#define Y_PTR DI +#define LEN CX +#define TAIL BX +#define SUM X0 +#define P_SUM X1 +#define INC_X R8 +#define INCx3_X R9 +#define INC_Y R10 +#define INCx3_Y R11 +#define NEG1 X15 +#define P_NEG1 X14 + +// func DotcInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64) +TEXT ·DotcInc(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), X_PTR // X_PTR = &x + MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y + PXOR SUM, SUM // SUM = 0 + PXOR P_SUM, P_SUM // P_SUM = 0 + MOVQ n+48(FP), LEN // LEN = n + CMPQ LEN, $0 // if LEN == 0 { return } + JE dotc_end + MOVQ ix+72(FP), INC_X + MOVQ iy+80(FP), INC_Y + LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(X_PTR[ix]) + LEAQ (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(Y_PTR[iy]) + MOVQ incX+56(FP), INC_X // INC_X = incX * sizeof(complex64) + SHLQ $3, INC_X + MOVQ incY+64(FP), INC_Y // INC_Y = incY * sizeof(complex64) + SHLQ $3, INC_Y + MOVSS $(-1.0), NEG1 + SHUFPS $0, NEG1, NEG1 // { -1, -1, -1, -1 } + + MOVQ LEN, TAIL + ANDQ $3, TAIL // TAIL = LEN % 4 + SHRQ $2, LEN // LEN = floor( LEN / 4 ) + JZ dotc_tail // if LEN == 0 { goto dotc_tail } + + MOVUPS NEG1, P_NEG1 // Copy NEG1 for pipelining + LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 + LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3 + +dotc_loop: // do { + MOVSD (X_PTR), X3 // X_i = { imag(x[i]), real(x[i]) } + MOVSD (X_PTR)(INC_X*1), X5 + MOVSD (X_PTR)(INC_X*2), X7 + MOVSD (X_PTR)(INCx3_X*1), X9 + + // X_(i-1) = { imag(x[i]), imag(x[i]) } + MOVSHDUP_X3_X2 + MOVSHDUP_X5_X4 + MOVSHDUP_X7_X6 + MOVSHDUP_X9_X8 + + // X_i = { real(x[i]), real(x[i]) } + MOVSLDUP_X3_X3 + MOVSLDUP_X5_X5 + MOVSLDUP_X7_X7 + MOVSLDUP_X9_X9 + + // X_(i-1) = { -imag(x[i]), -imag(x[i]) } + MULPS NEG1, X2 + MULPS P_NEG1, X4 + MULPS NEG1, X6 + MULPS P_NEG1, X8 + + // X_j = { imag(y[i]), real(y[i]) } + MOVSD (Y_PTR), X10 + MOVSD (Y_PTR)(INC_Y*1), X11 + MOVSD (Y_PTR)(INC_Y*2), X12 + MOVSD (Y_PTR)(INCx3_Y*1), X13 + + // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) } + MULPS X10, X3 + MULPS X11, X5 + MULPS X12, X7 + MULPS X13, X9 + + // X_j = { real(y[i]), imag(y[i]) } + SHUFPS $0xB1, X10, X10 + SHUFPS $0xB1, X11, X11 + SHUFPS $0xB1, X12, X12 + SHUFPS $0xB1, X13, X13 + + // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) } + MULPS X10, X2 + MULPS X11, X4 + MULPS X12, X6 + MULPS X13, X8 + + // X_i = { + // imag(result[i]): imag(y[i]) * real(x[i]) + real(y[i]) * imag(x[i]), + // real(result[i]): real(y[i]) * real(x[i]) - imag(y[i]) * imag(x[i]) } + ADDSUBPS_X2_X3 + ADDSUBPS_X4_X5 + ADDSUBPS_X6_X7 + ADDSUBPS_X8_X9 + + // SUM += X_i + ADDPS X3, SUM + ADDPS X5, P_SUM + ADDPS X7, SUM + ADDPS X9, P_SUM + + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X*4]) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR // 
Y_PTR = &(Y_PTR[INC_Y*4]) + + DECQ LEN + JNZ dotc_loop // } while --LEN > 0 + + ADDPS P_SUM, SUM // SUM = { P_SUM + SUM } + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE dotc_end + +dotc_tail: // do { + MOVSD (X_PTR), X3 // X_i = { imag(x[i]), real(x[i]) } + MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) } + MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) } + MULPS NEG1, X2 // X_(i-1) = { -imag(x[i]), imag(x[i]) } + MOVUPS (Y_PTR), X10 // X_j = { imag(y[i]), real(y[i]) } + MULPS X10, X3 // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) } + SHUFPS $0x1, X10, X10 // X_j = { real(y[i]), imag(y[i]) } + MULPS X10, X2 // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) } + + // X_i = { + // imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]), + // real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i]) } + ADDSUBPS_X2_X3 + ADDPS X3, SUM // SUM += X_i + ADDQ INC_X, X_PTR // X_PTR += INC_X + ADDQ INC_Y, Y_PTR // Y_PTR += INC_Y + DECQ TAIL + JNZ dotc_tail // } while --TAIL > 0 + +dotc_end: + MOVSD SUM, sum+88(FP) // return SUM + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/dotcunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotcunitary_amd64.s new file mode 100644 index 0000000000..78f43eee06 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotcunitary_amd64.s @@ -0,0 +1,208 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define MOVSLDUP_XPTR_IDX_8__X3 LONG $0x1C120FF3; BYTE $0xC6 // MOVSLDUP (SI)(AX*8), X3 +#define MOVSLDUP_16_XPTR_IDX_8__X5 LONG $0x6C120FF3; WORD $0x10C6 // MOVSLDUP 16(SI)(AX*8), X5 +#define MOVSLDUP_32_XPTR_IDX_8__X7 LONG $0x7C120FF3; WORD $0x20C6 // MOVSLDUP 32(SI)(AX*8), X7 +#define MOVSLDUP_48_XPTR_IDX_8__X9 LONG $0x120F44F3; WORD $0xC64C; BYTE $0x30 // MOVSLDUP 48(SI)(AX*8), X9 + +#define MOVSHDUP_XPTR_IDX_8__X2 LONG $0x14160FF3; BYTE $0xC6 // MOVSHDUP (SI)(AX*8), X2 +#define MOVSHDUP_16_XPTR_IDX_8__X4 LONG $0x64160FF3; WORD $0x10C6 // MOVSHDUP 16(SI)(AX*8), X4 +#define MOVSHDUP_32_XPTR_IDX_8__X6 LONG $0x74160FF3; WORD $0x20C6 // MOVSHDUP 32(SI)(AX*8), X6 +#define MOVSHDUP_48_XPTR_IDX_8__X8 LONG $0x160F44F3; WORD $0xC644; BYTE $0x30 // MOVSHDUP 48(SI)(AX*8), X8 + +#define MOVSHDUP_X3_X2 LONG $0xD3160FF3 // MOVSHDUP X3, X2 +#define MOVSLDUP_X3_X3 LONG $0xDB120FF3 // MOVSLDUP X3, X3 + +#define ADDSUBPS_X2_X3 LONG $0xDAD00FF2 // ADDSUBPS X2, X3 +#define ADDSUBPS_X4_X5 LONG $0xECD00FF2 // ADDSUBPS X4, X5 +#define ADDSUBPS_X6_X7 LONG $0xFED00FF2 // ADDSUBPS X6, X7 +#define ADDSUBPS_X8_X9 LONG $0xD00F45F2; BYTE $0xC8 // ADDSUBPS X8, X9 + +#define X_PTR SI +#define Y_PTR DI +#define LEN CX +#define TAIL BX +#define SUM X0 +#define P_SUM X1 +#define IDX AX +#define I_IDX DX +#define NEG1 X15 +#define P_NEG1 X14 + +// func DotcUnitary(x, y []complex64) (sum complex64) +TEXT ·DotcUnitary(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), X_PTR // X_PTR = &x + MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y + PXOR SUM, SUM // SUM = 0 + PXOR P_SUM, P_SUM // P_SUM = 0 + MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) ) + CMPQ y_len+32(FP), LEN + CMOVQLE y_len+32(FP), LEN + CMPQ LEN, $0 // if LEN == 0 { return } + JE dotc_end + XORQ IDX, IDX // i = 0 + MOVSS $(-1.0), NEG1 + SHUFPS $0, NEG1, NEG1 // { -1, -1, -1, -1 } + + MOVQ X_PTR, DX + ANDQ $15, DX // DX = &x & 15 + JZ dotc_aligned // if DX == 0 { goto dotc_aligned } + + MOVSD 
(X_PTR)(IDX*8), X3 // X_i = { imag(x[i]), real(x[i]) } + MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) } + MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) } + MOVSD (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]) } + MULPS NEG1, X2 // X_(i-1) = { -imag(x[i]), imag(x[i]) } + MULPS X10, X3 // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) } + SHUFPS $0x1, X10, X10 // X_j = { real(y[i]), imag(y[i]) } + MULPS X10, X2 // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) } + + // X_i = { + // imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]), + // real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i]) } + ADDSUBPS_X2_X3 + + MOVAPS X3, SUM // SUM = X_i + INCQ IDX // IDX++ + DECQ LEN // LEN-- + JZ dotc_ret // if LEN == 0 { goto dotc_ret } + +dotc_aligned: + MOVQ LEN, TAIL + ANDQ $7, TAIL // TAIL = LEN % 8 + SHRQ $3, LEN // LEN = floor( LEN / 8 ) + JZ dotc_tail // if LEN == 0 { return } + MOVUPS NEG1, P_NEG1 // Copy NEG1 for pipelining + +dotc_loop: // do { + MOVSLDUP_XPTR_IDX_8__X3 // X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) } + MOVSLDUP_16_XPTR_IDX_8__X5 + MOVSLDUP_32_XPTR_IDX_8__X7 + MOVSLDUP_48_XPTR_IDX_8__X9 + + MOVSHDUP_XPTR_IDX_8__X2 // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i+1]), imag(x[i+1]) } + MOVSHDUP_16_XPTR_IDX_8__X4 + MOVSHDUP_32_XPTR_IDX_8__X6 + MOVSHDUP_48_XPTR_IDX_8__X8 + + // X_j = { imag(y[i]), real(y[i]), imag(y[i+1]), real(y[i+1]) } + MOVUPS (Y_PTR)(IDX*8), X10 + MOVUPS 16(Y_PTR)(IDX*8), X11 + MOVUPS 32(Y_PTR)(IDX*8), X12 + MOVUPS 48(Y_PTR)(IDX*8), X13 + + // X_(i-1) = { -imag(x[i]), -imag(x[i]), -imag(x[i]+1), -imag(x[i]+1) } + MULPS NEG1, X2 + MULPS P_NEG1, X4 + MULPS NEG1, X6 + MULPS P_NEG1, X8 + + // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]), + // imag(y[i+1]) * real(x[i+1]), real(y[i+1]) * real(x[i+1]) } + MULPS X10, X3 + MULPS X11, X5 + MULPS X12, X7 + MULPS X13, X9 + + // X_j = { real(y[i]), imag(y[i]), real(y[i+1]), imag(y[i+1]) } + SHUFPS $0xB1, X10, X10 + SHUFPS $0xB1, X11, X11 + SHUFPS $0xB1, X12, X12 + SHUFPS $0xB1, X13, X13 + + // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]), + // real(y[i+1]) * imag(x[i+1]), imag(y[i+1]) * imag(x[i+1]) } + MULPS X10, X2 + MULPS X11, X4 + MULPS X12, X6 + MULPS X13, X8 + + // X_i = { + // imag(result[i]): imag(y[i]) * real(x[i]) + real(y[i]) * imag(x[i]), + // real(result[i]): real(y[i]) * real(x[i]) - imag(y[i]) * imag(x[i]), + // imag(result[i+1]): imag(y[i+1]) * real(x[i+1]) + real(y[i+1]) * imag(x[i+1]), + // real(result[i+1]): real(y[i+1]) * real(x[i+1]) - imag(y[i+1]) * imag(x[i+1]), + // } + ADDSUBPS_X2_X3 + ADDSUBPS_X4_X5 + ADDSUBPS_X6_X7 + ADDSUBPS_X8_X9 + + // SUM += X_i + ADDPS X3, SUM + ADDPS X5, P_SUM + ADDPS X7, SUM + ADDPS X9, P_SUM + + ADDQ $8, IDX // IDX += 8 + DECQ LEN + JNZ dotc_loop // } while --LEN > 0 + + ADDPS SUM, P_SUM // P_SUM = { P_SUM[1] + SUM[1], P_SUM[0] + SUM[0] } + XORPS SUM, SUM // SUM = 0 + + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE dotc_end + +dotc_tail: + MOVQ TAIL, LEN + SHRQ $1, LEN // LEN = floor( LEN / 2 ) + JZ dotc_tail_one // if LEN == 0 { goto dotc_tail_one } + +dotc_tail_two: // do { + MOVSLDUP_XPTR_IDX_8__X3 // X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) } + MOVSHDUP_XPTR_IDX_8__X2 // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i]+1), imag(x[i]+1) } + MOVUPS (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]) } + MULPS NEG1, X2 // X_(i-1) = { -imag(x[i]), imag(x[i]) } + MULPS X10, X3 // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) } + SHUFPS 
$0xB1, X10, X10 // X_j = { real(y[i]), imag(y[i]) } + MULPS X10, X2 // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) } + + // X_i = { + // imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]), + // real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i]) } + ADDSUBPS_X2_X3 + + ADDPS X3, SUM // SUM += X_i + + ADDQ $2, IDX // IDX += 2 + DECQ LEN + JNZ dotc_tail_two // } while --LEN > 0 + + ADDPS SUM, P_SUM // P_SUM = { P_SUM[1] + SUM[1], P_SUM[0] + SUM[0] } + XORPS SUM, SUM // SUM = 0 + + ANDQ $1, TAIL + JZ dotc_end + +dotc_tail_one: + MOVSD (X_PTR)(IDX*8), X3 // X_i = { imag(x[i]), real(x[i]) } + MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) } + MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) } + MOVSD (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]) } + MULPS NEG1, X2 // X_(i-1) = { -imag(x[i]), imag(x[i]) } + MULPS X10, X3 // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) } + SHUFPS $0x1, X10, X10 // X_j = { real(y[i]), imag(y[i]) } + MULPS X10, X2 // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) } + + // X_i = { + // imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]), + // real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i]) } + ADDSUBPS_X2_X3 + + ADDPS X3, SUM // SUM += X_i + +dotc_end: + ADDPS P_SUM, SUM // SUM = { P_SUM[0] + SUM[0] } + MOVHLPS P_SUM, P_SUM // P_SUM = { P_SUM[1], P_SUM[1] } + ADDPS P_SUM, SUM // SUM = { P_SUM[1] + SUM[0] } + +dotc_ret: + MOVSD SUM, sum+48(FP) // return SUM + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/dotuinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotuinc_amd64.s new file mode 100644 index 0000000000..3dc2e144a8 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotuinc_amd64.s @@ -0,0 +1,148 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
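+
+// DotuInc below is the strided, unconjugated complex64 dot product. The
+// ix and iy arguments are element offsets and incX and incY element
+// strides; the kernel scales both by sizeof(complex64) = 8 with SHLQ $3.
+// Its contract, mirroring the stub documentation later in this patch, is:
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * x[ix]
+//		ix += incX
+//		iy += incY
+//	}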
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define MOVSHDUP_X3_X2 LONG $0xD3160FF3 // MOVSHDUP X3, X2 +#define MOVSHDUP_X5_X4 LONG $0xE5160FF3 // MOVSHDUP X5, X4 +#define MOVSHDUP_X7_X6 LONG $0xF7160FF3 // MOVSHDUP X7, X6 +#define MOVSHDUP_X9_X8 LONG $0x160F45F3; BYTE $0xC1 // MOVSHDUP X9, X8 + +#define MOVSLDUP_X3_X3 LONG $0xDB120FF3 // MOVSLDUP X3, X3 +#define MOVSLDUP_X5_X5 LONG $0xED120FF3 // MOVSLDUP X5, X5 +#define MOVSLDUP_X7_X7 LONG $0xFF120FF3 // MOVSLDUP X7, X7 +#define MOVSLDUP_X9_X9 LONG $0x120F45F3; BYTE $0xC9 // MOVSLDUP X9, X9 + +#define ADDSUBPS_X2_X3 LONG $0xDAD00FF2 // ADDSUBPS X2, X3 +#define ADDSUBPS_X4_X5 LONG $0xECD00FF2 // ADDSUBPS X4, X5 +#define ADDSUBPS_X6_X7 LONG $0xFED00FF2 // ADDSUBPS X6, X7 +#define ADDSUBPS_X8_X9 LONG $0xD00F45F2; BYTE $0xC8 // ADDSUBPS X8, X9 + +#define X_PTR SI +#define Y_PTR DI +#define LEN CX +#define TAIL BX +#define SUM X0 +#define P_SUM X1 +#define INC_X R8 +#define INCx3_X R9 +#define INC_Y R10 +#define INCx3_Y R11 + +// func DotuInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64) +TEXT ·DotuInc(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), X_PTR // X_PTR = &x + MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y + PXOR SUM, SUM // SUM = 0 + PXOR P_SUM, P_SUM // P_SUM = 0 + MOVQ n+48(FP), LEN // LEN = n + CMPQ LEN, $0 // if LEN == 0 { return } + JE dotu_end + MOVQ ix+72(FP), INC_X + MOVQ iy+80(FP), INC_Y + LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(X_PTR[ix]) + LEAQ (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(Y_PTR[iy]) + MOVQ incX+56(FP), INC_X // INC_X = incX * sizeof(complex64) + SHLQ $3, INC_X + MOVQ incY+64(FP), INC_Y // INC_Y = incY * sizeof(complex64) + SHLQ $3, INC_Y + + MOVQ LEN, TAIL + ANDQ $3, TAIL // TAIL = LEN % 4 + SHRQ $2, LEN // LEN = floor( LEN / 4 ) + JZ dotu_tail // if TAIL == 0 { goto dotu_tail } + + LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 + LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3 + +dotu_loop: // do { + MOVSD (X_PTR), X3 // X_i = { imag(x[i]), real(x[i]) } + MOVSD (X_PTR)(INC_X*1), X5 + MOVSD (X_PTR)(INC_X*2), X7 + MOVSD (X_PTR)(INCx3_X*1), X9 + + // X_(i-1) = { imag(x[i]), imag(x[i]) } + MOVSHDUP_X3_X2 + MOVSHDUP_X5_X4 + MOVSHDUP_X7_X6 + MOVSHDUP_X9_X8 + + // X_i = { real(x[i]), real(x[i]) } + MOVSLDUP_X3_X3 + MOVSLDUP_X5_X5 + MOVSLDUP_X7_X7 + MOVSLDUP_X9_X9 + + // X_j = { imag(y[i]), real(y[i]) } + MOVSD (Y_PTR), X10 + MOVSD (Y_PTR)(INC_Y*1), X11 + MOVSD (Y_PTR)(INC_Y*2), X12 + MOVSD (Y_PTR)(INCx3_Y*1), X13 + + // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) } + MULPS X10, X3 + MULPS X11, X5 + MULPS X12, X7 + MULPS X13, X9 + + // X_j = { real(y[i]), imag(y[i]) } + SHUFPS $0xB1, X10, X10 + SHUFPS $0xB1, X11, X11 + SHUFPS $0xB1, X12, X12 + SHUFPS $0xB1, X13, X13 + + // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) } + MULPS X10, X2 + MULPS X11, X4 + MULPS X12, X6 + MULPS X13, X8 + + // X_i = { + // imag(result[i]): imag(y[i]) * real(x[i]) + real(y[i]) * imag(x[i]), + // real(result[i]): real(y[i]) * real(x[i]) - imag(y[i]) * imag(x[i]) } + ADDSUBPS_X2_X3 + ADDSUBPS_X4_X5 + ADDSUBPS_X6_X7 + ADDSUBPS_X8_X9 + + // SUM += X_i + ADDPS X3, SUM + ADDPS X5, P_SUM + ADDPS X7, SUM + ADDPS X9, P_SUM + + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X*4]) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y*4]) + + DECQ LEN + JNZ dotu_loop // } while --LEN > 0 + + ADDPS P_SUM, SUM // SUM = { P_SUM + SUM } + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE dotu_end + +dotu_tail: // do { + MOVSD (X_PTR), X3 // X_i = { imag(x[i]), real(x[i]) } + MOVSHDUP_X3_X2 
// X_(i-1) = { imag(x[i]), imag(x[i]) } + MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) } + MOVUPS (Y_PTR), X10 // X_j = { imag(y[i]), real(y[i]) } + MULPS X10, X3 // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) } + SHUFPS $0x1, X10, X10 // X_j = { real(y[i]), imag(y[i]) } + MULPS X10, X2 // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) } + + // X_i = { + // imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]), + // real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i]) } + ADDSUBPS_X2_X3 + ADDPS X3, SUM // SUM += X_i + ADDQ INC_X, X_PTR // X_PTR += INC_X + ADDQ INC_Y, Y_PTR // Y_PTR += INC_Y + DECQ TAIL + JNZ dotu_tail // } while --TAIL > 0 + +dotu_end: + MOVSD SUM, sum+88(FP) // return SUM + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/dotuunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotuunitary_amd64.s new file mode 100644 index 0000000000..f11c6de78f --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotuunitary_amd64.s @@ -0,0 +1,197 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define MOVSLDUP_XPTR_IDX_8__X3 LONG $0x1C120FF3; BYTE $0xC6 // MOVSLDUP (SI)(AX*8), X3 +#define MOVSLDUP_16_XPTR_IDX_8__X5 LONG $0x6C120FF3; WORD $0x10C6 // MOVSLDUP 16(SI)(AX*8), X5 +#define MOVSLDUP_32_XPTR_IDX_8__X7 LONG $0x7C120FF3; WORD $0x20C6 // MOVSLDUP 32(SI)(AX*8), X7 +#define MOVSLDUP_48_XPTR_IDX_8__X9 LONG $0x120F44F3; WORD $0xC64C; BYTE $0x30 // MOVSLDUP 48(SI)(AX*8), X9 + +#define MOVSHDUP_XPTR_IDX_8__X2 LONG $0x14160FF3; BYTE $0xC6 // MOVSHDUP (SI)(AX*8), X2 +#define MOVSHDUP_16_XPTR_IDX_8__X4 LONG $0x64160FF3; WORD $0x10C6 // MOVSHDUP 16(SI)(AX*8), X4 +#define MOVSHDUP_32_XPTR_IDX_8__X6 LONG $0x74160FF3; WORD $0x20C6 // MOVSHDUP 32(SI)(AX*8), X6 +#define MOVSHDUP_48_XPTR_IDX_8__X8 LONG $0x160F44F3; WORD $0xC644; BYTE $0x30 // MOVSHDUP 48(SI)(AX*8), X8 + +#define MOVSHDUP_X3_X2 LONG $0xD3160FF3 // MOVSHDUP X3, X2 +#define MOVSLDUP_X3_X3 LONG $0xDB120FF3 // MOVSLDUP X3, X3 + +#define ADDSUBPS_X2_X3 LONG $0xDAD00FF2 // ADDSUBPS X2, X3 +#define ADDSUBPS_X4_X5 LONG $0xECD00FF2 // ADDSUBPS X4, X5 +#define ADDSUBPS_X6_X7 LONG $0xFED00FF2 // ADDSUBPS X6, X7 +#define ADDSUBPS_X8_X9 LONG $0xD00F45F2; BYTE $0xC8 // ADDSUBPS X8, X9 + +#define X_PTR SI +#define Y_PTR DI +#define LEN CX +#define TAIL BX +#define SUM X0 +#define P_SUM X1 +#define IDX AX +#define I_IDX DX +#define NEG1 X15 +#define P_NEG1 X14 + +// func DotuUnitary(x, y []complex64) (sum complex64) +TEXT ·DotuUnitary(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), X_PTR // X_PTR = &x + MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y + PXOR SUM, SUM // SUM = 0 + PXOR P_SUM, P_SUM // P_SUM = 0 + MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) ) + CMPQ y_len+32(FP), LEN + CMOVQLE y_len+32(FP), LEN + CMPQ LEN, $0 // if LEN == 0 { return } + JE dotu_end + XORQ IDX, IDX // IDX = 0 + + MOVQ X_PTR, DX + ANDQ $15, DX // DX = &x & 15 + JZ dotu_aligned // if DX == 0 { goto dotu_aligned } + + MOVSD (X_PTR)(IDX*8), X3 // X_i = { imag(x[i]), real(x[i]) } + MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) } + MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) } + MOVSD (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]) } + MULPS X10, X3 // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) } + SHUFPS $0x1, X10, X10 // X_j = { real(y[i]), imag(y[i]) } + MULPS X10, X2 // X_(i-1) = { real(y[i]) 
* imag(x[i]), imag(y[i]) * imag(x[i]) }
+
+	// X_i = {
+	//	imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
+	//	real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i]) }
+	ADDSUBPS_X2_X3
+
+	MOVAPS X3, SUM // SUM = X_i
+	INCQ IDX       // IDX++
+	DECQ LEN       // LEN--
+	JZ   dotu_end  // if LEN == 0 { goto dotu_end }
+
+dotu_aligned:
+	MOVQ LEN, TAIL
+	ANDQ $7, TAIL  // TAIL = LEN % 8
+	SHRQ $3, LEN   // LEN = floor( LEN / 8 )
+	JZ   dotu_tail // if LEN == 0 { goto dotu_tail }
+	PXOR P_SUM, P_SUM
+
+dotu_loop: // do {
+	MOVSLDUP_XPTR_IDX_8__X3 // X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
+	MOVSLDUP_16_XPTR_IDX_8__X5
+	MOVSLDUP_32_XPTR_IDX_8__X7
+	MOVSLDUP_48_XPTR_IDX_8__X9
+
+	MOVSHDUP_XPTR_IDX_8__X2 // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i+1]), imag(x[i+1]) }
+	MOVSHDUP_16_XPTR_IDX_8__X4
+	MOVSHDUP_32_XPTR_IDX_8__X6
+	MOVSHDUP_48_XPTR_IDX_8__X8
+
+	// X_j = { imag(y[i]), real(y[i]), imag(y[i+1]), real(y[i+1]) }
+	MOVUPS (Y_PTR)(IDX*8), X10
+	MOVUPS 16(Y_PTR)(IDX*8), X11
+	MOVUPS 32(Y_PTR)(IDX*8), X12
+	MOVUPS 48(Y_PTR)(IDX*8), X13
+
+	// X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]),
+	//         imag(y[i+1]) * real(x[i+1]), real(y[i+1]) * real(x[i+1]) }
+	MULPS X10, X3
+	MULPS X11, X5
+	MULPS X12, X7
+	MULPS X13, X9
+
+	// X_j = { real(y[i]), imag(y[i]), real(y[i+1]), imag(y[i+1]) }
+	SHUFPS $0xB1, X10, X10
+	SHUFPS $0xB1, X11, X11
+	SHUFPS $0xB1, X12, X12
+	SHUFPS $0xB1, X13, X13
+
+	// X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]),
+	//            real(y[i+1]) * imag(x[i+1]), imag(y[i+1]) * imag(x[i+1]) }
+	MULPS X10, X2
+	MULPS X11, X4
+	MULPS X12, X6
+	MULPS X13, X8
+
+	// X_i = {
+	//	imag(result[i]):   imag(y[i]) * real(x[i]) + real(y[i]) * imag(x[i]),
+	//	real(result[i]):   real(y[i]) * real(x[i]) - imag(y[i]) * imag(x[i]),
+	//	imag(result[i+1]): imag(y[i+1]) * real(x[i+1]) + real(y[i+1]) * imag(x[i+1]),
+	//	real(result[i+1]): real(y[i+1]) * real(x[i+1]) - imag(y[i+1]) * imag(x[i+1]),
+	// }
+	ADDSUBPS_X2_X3
+	ADDSUBPS_X4_X5
+	ADDSUBPS_X6_X7
+	ADDSUBPS_X8_X9
+
+	// SUM += X_i
+	ADDPS X3, SUM
+	ADDPS X5, P_SUM
+	ADDPS X7, SUM
+	ADDPS X9, P_SUM
+
+	ADDQ $8, IDX // IDX += 8
+	DECQ LEN
+	JNZ  dotu_loop // } while --LEN > 0
+
+	ADDPS SUM, P_SUM // P_SUM = { P_SUM[1] + SUM[1], P_SUM[0] + SUM[0] }
+	XORPS SUM, SUM   // SUM = 0
+
+	CMPQ TAIL, $0 // if TAIL == 0 { return }
+	JE   dotu_end
+
+dotu_tail:
+	MOVQ TAIL, LEN
+	SHRQ $1, LEN       // LEN = floor( LEN / 2 )
+	JZ   dotu_tail_one // if LEN == 0 { goto dotu_tail_one }
+
+dotu_tail_two: // do {
+	MOVSLDUP_XPTR_IDX_8__X3 // X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
+	MOVSHDUP_XPTR_IDX_8__X2 // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i+1]), imag(x[i+1]) }
+	MOVUPS (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]) }
+	MULPS X10, X3              // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	SHUFPS $0xB1, X10, X10     // X_j = { real(y[i]), imag(y[i]) }
+	MULPS X10, X2              // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
+
+	// X_i = {
+	//	imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
+	//	real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i]) }
+	ADDSUBPS_X2_X3
+
+	ADDPS X3, SUM // SUM += X_i
+
+	ADDQ $2, IDX // IDX += 2
+	DECQ LEN
+	JNZ  dotu_tail_two // } while --LEN > 0
+
+	ADDPS SUM, P_SUM // P_SUM = { P_SUM[1] + SUM[1], P_SUM[0] + SUM[0] }
+	XORPS SUM, SUM   // SUM = 0
+
+	ANDQ $1, TAIL
+	JZ   dotu_end
+
+dotu_tail_one:
+	MOVSD (X_PTR)(IDX*8), X3 // X_i = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2           // X_(i-1) = { imag(x[i]), imag(x[i]) }
+
MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) } + MOVSD (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]) } + MULPS X10, X3 // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) } + SHUFPS $0x1, X10, X10 // X_j = { real(y[i]), imag(y[i]) } + MULPS X10, X2 // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) } + + // X_i = { + // imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]), + // real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i]) } + ADDSUBPS_X2_X3 + + ADDPS X3, SUM // SUM += X_i + +dotu_end: + ADDPS P_SUM, SUM // SUM = { P_SUM[0] + SUM[0] } + MOVHLPS P_SUM, P_SUM // P_SUM = { P_SUM[1], P_SUM[1] } + ADDPS P_SUM, SUM // SUM = { P_SUM[1] + SUM[0] } + +dotu_ret: + MOVSD SUM, sum+48(FP) // return SUM + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/scal.go b/vendor/gonum.org/v1/gonum/internal/asm/c64/scal.go new file mode 100644 index 0000000000..6db0aa36f3 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/scal.go @@ -0,0 +1,85 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package c64 + +// ScalUnitary is +// +// for i := range x { +// x[i] *= alpha +// } +func ScalUnitary(alpha complex64, x []complex64) { + for i := range x { + x[i] *= alpha + } +} + +// ScalUnitaryTo is +// +// for i, v := range x { +// dst[i] = alpha * v +// } +func ScalUnitaryTo(dst []complex64, alpha complex64, x []complex64) { + for i, v := range x { + dst[i] = alpha * v + } +} + +// ScalInc is +// +// var ix uintptr +// for i := 0; i < int(n); i++ { +// x[ix] *= alpha +// ix += incX +// } +func ScalInc(alpha complex64, x []complex64, n, incX uintptr) { + var ix uintptr + for i := 0; i < int(n); i++ { + x[ix] *= alpha + ix += incX + } +} + +// ScalIncTo is +// +// var idst, ix uintptr +// for i := 0; i < int(n); i++ { +// dst[idst] = alpha * x[ix] +// ix += incX +// idst += incDst +// } +func ScalIncTo(dst []complex64, incDst uintptr, alpha complex64, x []complex64, n, incX uintptr) { + var idst, ix uintptr + for i := 0; i < int(n); i++ { + dst[idst] = alpha * x[ix] + ix += incX + idst += incDst + } +} + +// SscalUnitary is +// +// for i, v := range x { +// x[i] = complex(real(v)*alpha, imag(v)*alpha) +// } +func SscalUnitary(alpha float32, x []complex64) { + for i, v := range x { + x[i] = complex(real(v)*alpha, imag(v)*alpha) + } +} + +// SscalInc is +// +// var ix uintptr +// for i := 0; i < int(n); i++ { +// x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha) +// ix += inc +// } +func SscalInc(alpha float32, x []complex64, n, inc uintptr) { + var ix uintptr + for i := 0; i < int(n); i++ { + x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha) + ix += inc + } +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs.go b/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs.go new file mode 100644 index 0000000000..0aa626e141 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs.go @@ -0,0 +1,180 @@ +// Copyright ©2020 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
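+
+// This file holds the kernels that appear to have no assembly version on
+// any platform, so they build everywhere. The L2 norm routines below use
+// scaled accumulation in the style of the reference BLAS SNRM2, so the
+// squaring never overflows or underflows prematurely: whenever |v|
+// exceeds the running scale they renormalize with
+//
+//	s := scale / absxi
+//	sumSquares = 1 + sumSquares*s*s
+//	scale = absxi
+//
+// and return scale * sqrt(sumSquares) at the end.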
+ +package c64 + +import ( + "gonum.org/v1/gonum/internal/cmplx64" + "gonum.org/v1/gonum/internal/math32" +) + +// Add is +// +// for i, v := range s { +// dst[i] += v +// } +func Add(dst, s []complex64) { + for i, v := range s { + dst[i] += v + } +} + +// AddConst is +// +// for i := range x { +// x[i] += alpha +// } +func AddConst(alpha complex64, x []complex64) { + for i := range x { + x[i] += alpha + } +} + +// CumSum is +// +// if len(s) == 0 { +// return dst +// } +// dst[0] = s[0] +// for i, v := range s[1:] { +// dst[i+1] = dst[i] + v +// } +// return dst +func CumSum(dst, s []complex64) []complex64 { + if len(s) == 0 { + return dst + } + dst[0] = s[0] + for i, v := range s[1:] { + dst[i+1] = dst[i] + v + } + return dst +} + +// CumProd is +// +// if len(s) == 0 { +// return dst +// } +// dst[0] = s[0] +// for i, v := range s[1:] { +// dst[i+1] = dst[i] * v +// } +// return dst +func CumProd(dst, s []complex64) []complex64 { + if len(s) == 0 { + return dst + } + dst[0] = s[0] + for i, v := range s[1:] { + dst[i+1] = dst[i] * v + } + return dst +} + +// Div is +// +// for i, v := range s { +// dst[i] /= v +// } +func Div(dst, s []complex64) { + for i, v := range s { + dst[i] /= v + } +} + +// DivTo is +// +// for i, v := range s { +// dst[i] = v / t[i] +// } +// return dst +func DivTo(dst, s, t []complex64) []complex64 { + for i, v := range s { + dst[i] = v / t[i] + } + return dst +} + +// DotUnitary is +// +// for i, v := range x { +// sum += conj(v) * y[i] +// } +// return sum +func DotUnitary(x, y []complex64) (sum complex64) { + for i, v := range x { + sum += cmplx64.Conj(v) * y[i] + } + return sum +} + +// L2DistanceUnitary returns the L2-norm of x-y. +func L2DistanceUnitary(x, y []complex64) (norm float32) { + var scale float32 + sumSquares := float32(1.0) + for i, v := range x { + v -= y[i] + if v == 0 { + continue + } + absxi := cmplx64.Abs(v) + if math32.IsNaN(absxi) { + return math32.NaN() + } + if scale < absxi { + s := scale / absxi + sumSquares = 1 + sumSquares*s*s + scale = absxi + } else { + s := absxi / scale + sumSquares += s * s + } + } + if math32.IsInf(scale, 1) { + return math32.Inf(1) + } + return scale * math32.Sqrt(sumSquares) +} + +// L2NormUnitary returns the L2-norm of x. +func L2NormUnitary(x []complex64) (norm float32) { + var scale float32 + sumSquares := float32(1.0) + for _, v := range x { + if v == 0 { + continue + } + absxi := cmplx64.Abs(v) + if math32.IsNaN(absxi) { + return math32.NaN() + } + if scale < absxi { + s := scale / absxi + sumSquares = 1 + sumSquares*s*s + scale = absxi + } else { + s := absxi / scale + sumSquares += s * s + } + } + if math32.IsInf(scale, 1) { + return math32.Inf(1) + } + return scale * math32.Sqrt(sumSquares) +} + +// Sum is +// +// var sum complex64 +// for i := range x { +// sum += x[i] +// } +func Sum(x []complex64) complex64 { + var sum complex64 + for _, v := range x { + sum += v + } + return sum +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs_amd64.go b/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs_amd64.go new file mode 100644 index 0000000000..71367b016f --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs_amd64.go @@ -0,0 +1,77 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
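+
+// The declarations in this file have no Go bodies: under the build
+// constraints just below, the linker resolves each name against the
+// matching TEXT ·Name(SB) symbol in this package's *_amd64.s files. The
+// commented loop above each declaration is the contract the assembly
+// implements; for example, AxpyUnitary promises exactly
+//
+//	for i, v := range x {
+//		y[i] += alpha * v
+//	}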
+ +//go:build !noasm && !gccgo && !safe +// +build !noasm,!gccgo,!safe + +package c64 + +// AxpyUnitary is +// +// for i, v := range x { +// y[i] += alpha * v +// } +func AxpyUnitary(alpha complex64, x, y []complex64) + +// AxpyUnitaryTo is +// +// for i, v := range x { +// dst[i] = alpha*v + y[i] +// } +func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64) + +// AxpyInc is +// +// for i := 0; i < int(n); i++ { +// y[iy] += alpha * x[ix] +// ix += incX +// iy += incY +// } +func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) + +// AxpyIncTo is +// +// for i := 0; i < int(n); i++ { +// dst[idst] = alpha*x[ix] + y[iy] +// ix += incX +// iy += incY +// idst += incDst +// } +func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) + +// DotcUnitary is +// +// for i, v := range x { +// sum += y[i] * conj(v) +// } +// return sum +func DotcUnitary(x, y []complex64) (sum complex64) + +// DotcInc is +// +// for i := 0; i < int(n); i++ { +// sum += y[iy] * conj(x[ix]) +// ix += incX +// iy += incY +// } +// return sum +func DotcInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64) + +// DotuUnitary is +// +// for i, v := range x { +// sum += y[i] * v +// } +// return sum +func DotuUnitary(x, y []complex64) (sum complex64) + +// DotuInc is +// +// for i := 0; i < int(n); i++ { +// sum += y[iy] * x[ix] +// ix += incX +// iy += incY +// } +// return sum +func DotuInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64) diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs_noasm.go b/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs_noasm.go new file mode 100644 index 0000000000..0d79b24fc8 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs_noasm.go @@ -0,0 +1,122 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
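+
+// This file is the portable fallback: the constraint just below selects
+// it on non-amd64 targets or when any of the noasm, gccgo, or safe build
+// tags is set (in the old // +build syntax, spaces mean OR and commas
+// mean AND). For example, the pure-Go kernels can be forced with:
+//
+//	go build -tags noasm ./...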
+ +//go:build !amd64 || noasm || gccgo || safe +// +build !amd64 noasm gccgo safe + +package c64 + +// AxpyUnitary is +// +// for i, v := range x { +// y[i] += alpha * v +// } +func AxpyUnitary(alpha complex64, x, y []complex64) { + for i, v := range x { + y[i] += alpha * v + } +} + +// AxpyUnitaryTo is +// +// for i, v := range x { +// dst[i] = alpha*v + y[i] +// } +func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64) { + for i, v := range x { + dst[i] = alpha*v + y[i] + } +} + +// AxpyInc is +// +// for i := 0; i < int(n); i++ { +// y[iy] += alpha * x[ix] +// ix += incX +// iy += incY +// } +func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) { + for i := 0; i < int(n); i++ { + y[iy] += alpha * x[ix] + ix += incX + iy += incY + } +} + +// AxpyIncTo is +// +// for i := 0; i < int(n); i++ { +// dst[idst] = alpha*x[ix] + y[iy] +// ix += incX +// iy += incY +// idst += incDst +// } +func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) { + for i := 0; i < int(n); i++ { + dst[idst] = alpha*x[ix] + y[iy] + ix += incX + iy += incY + idst += incDst + } +} + +// DotcUnitary is +// +// for i, v := range x { +// sum += y[i] * conj(v) +// } +// return sum +func DotcUnitary(x, y []complex64) (sum complex64) { + for i, v := range x { + sum += y[i] * conj(v) + } + return sum +} + +// DotcInc is +// +// for i := 0; i < int(n); i++ { +// sum += y[iy] * conj(x[ix]) +// ix += incX +// iy += incY +// } +// return sum +func DotcInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64) { + for i := 0; i < int(n); i++ { + sum += y[iy] * conj(x[ix]) + ix += incX + iy += incY + } + return sum +} + +// DotuUnitary is +// +// for i, v := range x { +// sum += y[i] * v +// } +// return sum +func DotuUnitary(x, y []complex64) (sum complex64) { + for i, v := range x { + sum += y[i] * v + } + return sum +} + +// DotuInc is +// +// for i := 0; i < int(n); i++ { +// sum += y[iy] * x[ix] +// ix += incX +// iy += incY +// } +// return sum +func DotuInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64) { + for i := 0; i < int(n); i++ { + sum += y[iy] * x[ix] + ix += incX + iy += incY + } + return sum +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyinc_amd64.s new file mode 100644 index 0000000000..c0b84cd81e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyinc_amd64.s @@ -0,0 +1,73 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
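+
+// AxpyInc below is the strided float32 axpy kernel, unrolled four ways
+// with a scalar tail. Mirroring the documentation of its complex64
+// counterpart earlier in this patch, its contract is:
+//
+//	for i := 0; i < int(n); i++ {
+//		y[iy] += alpha * x[ix]
+//		ix += incX
+//		iy += incY
+//	}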
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) +TEXT ·AxpyInc(SB), NOSPLIT, $0 + MOVQ n+56(FP), CX // CX = n + CMPQ CX, $0 // if n==0 { return } + JLE axpyi_end + MOVQ x_base+8(FP), SI // SI = &x + MOVQ y_base+32(FP), DI // DI = &y + MOVQ ix+80(FP), R8 // R8 = ix + MOVQ iy+88(FP), R9 // R9 = iy + LEAQ (SI)(R8*4), SI // SI = &(x[ix]) + LEAQ (DI)(R9*4), DI // DI = &(y[iy]) + MOVQ DI, DX // DX = DI Read Pointer for y + MOVQ incX+64(FP), R8 // R8 = incX + SHLQ $2, R8 // R8 *= sizeof(float32) + MOVQ incY+72(FP), R9 // R9 = incY + SHLQ $2, R9 // R9 *= sizeof(float32) + MOVSS alpha+0(FP), X0 // X0 = alpha + MOVSS X0, X1 // X1 = X0 // for pipelining + MOVQ CX, BX + ANDQ $3, BX // BX = n % 4 + SHRQ $2, CX // CX = floor( n / 4 ) + JZ axpyi_tail_start // if CX == 0 { goto axpyi_tail_start } + +axpyi_loop: // Loop unrolled 4x do { + MOVSS (SI), X2 // X_i = x[i] + MOVSS (SI)(R8*1), X3 + LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) + MOVSS (SI), X4 + MOVSS (SI)(R8*1), X5 + MULSS X1, X2 // X_i *= a + MULSS X0, X3 + MULSS X1, X4 + MULSS X0, X5 + ADDSS (DX), X2 // X_i += y[i] + ADDSS (DX)(R9*1), X3 + LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) + ADDSS (DX), X4 + ADDSS (DX)(R9*1), X5 + MOVSS X2, (DI) // y[i] = X_i + MOVSS X3, (DI)(R9*1) + LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2]) + MOVSS X4, (DI) + MOVSS X5, (DI)(R9*1) + LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) // Increment addresses + LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) + LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2]) + LOOP axpyi_loop // } while --CX > 0 + CMPQ BX, $0 // if BX == 0 { return } + JE axpyi_end + +axpyi_tail_start: // Reset loop registers + MOVQ BX, CX // Loop counter: CX = BX + +axpyi_tail: // do { + MOVSS (SI), X2 // X2 = x[i] + MULSS X1, X2 // X2 *= a + ADDSS (DI), X2 // X2 += y[i] + MOVSS X2, (DI) // y[i] = X2 + ADDQ R8, SI // SI = &(SI[incX]) + ADDQ R9, DI // DI = &(DI[incY]) + LOOP axpyi_tail // } while --CX > 0 + +axpyi_end: + RET + diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyincto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyincto_amd64.s new file mode 100644 index 0000000000..3f1d2b9330 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyincto_amd64.s @@ -0,0 +1,78 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
+TEXT ·AxpyIncTo(SB), NOSPLIT, $0
+	MOVQ n+96(FP), CX       // CX = n
+	CMPQ CX, $0             // if n==0 { return }
+	JLE  axpyi_end
+	MOVQ dst_base+0(FP), DI // DI = &dst
+	MOVQ x_base+48(FP), SI  // SI = &x
+	MOVQ y_base+72(FP), DX  // DX = &y
+	MOVQ ix+120(FP), R8     // R8 = ix // Load the first index
+	MOVQ iy+128(FP), R9     // R9 = iy
+	MOVQ idst+32(FP), R10   // R10 = idst
+	LEAQ (SI)(R8*4), SI     // SI = &(x[ix])
+	LEAQ (DX)(R9*4), DX     // DX = &(y[iy])
+	LEAQ (DI)(R10*4), DI    // DI = &(dst[idst])
+	MOVQ incX+104(FP), R8   // R8 = incX
+	SHLQ $2, R8             // R8 *= sizeof(float32)
+	MOVQ incY+112(FP), R9   // R9 = incY
+	SHLQ $2, R9             // R9 *= sizeof(float32)
+	MOVQ incDst+24(FP), R10 // R10 = incDst
+	SHLQ $2, R10            // R10 *= sizeof(float32)
+	MOVSS alpha+40(FP), X0  // X0 = alpha
+	MOVSS X0, X1            // X1 = X0 // for pipelining
+	MOVQ CX, BX
+	ANDQ $3, BX             // BX = n % 4
+	SHRQ $2, CX             // CX = floor( n / 4 )
+	JZ   axpyi_tail_start   // if CX == 0 { goto axpyi_tail_start }
+
+axpyi_loop: // Loop unrolled 4x do {
+	MOVSS (SI), X2 // X_i = x[i]
+	MOVSS (SI)(R8*1), X3
+	LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
+	MOVSS (SI), X4
+	MOVSS (SI)(R8*1), X5
+	MULSS X1, X2 // X_i *= a
+	MULSS X0, X3
+	MULSS X1, X4
+	MULSS X0, X5
+	ADDSS (DX), X2 // X_i += y[i]
+	ADDSS (DX)(R9*1), X3
+	LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
+	ADDSS (DX), X4
+	ADDSS (DX)(R9*1), X5
+	MOVSS X2, (DI) // dst[i] = X_i
+	MOVSS X3, (DI)(R10*1)
+	LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2])
+	MOVSS X4, (DI)
+	MOVSS X5, (DI)(R10*1)
+	LEAQ (SI)(R8*2), SI  // SI = &(SI[incX*2]) // Increment addresses
+	LEAQ (DX)(R9*2), DX  // DX = &(DX[incY*2])
+	LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2])
+	LOOP axpyi_loop // } while --CX > 0
+	CMPQ BX, $0     // if BX == 0 { return }
+	JE   axpyi_end
+
+axpyi_tail_start: // Reset loop registers
+	MOVQ BX, CX // Loop counter: CX = BX
+
+axpyi_tail: // do {
+	MOVSS (SI), X2 // X2 = x[i]
+	MULSS X1, X2   // X2 *= a
+	ADDSS (DX), X2 // X2 += y[i]
+	MOVSS X2, (DI) // dst[i] = X2
+	ADDQ R8, SI    // SI = &(SI[incX])
+	ADDQ R9, DX    // DX = &(DX[incY])
+	ADDQ R10, DI   // DI = &(DI[incDst])
+	LOOP axpyi_tail // } while --CX > 0
+
+axpyi_end:
+	RET
+
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitary_amd64.s
new file mode 100644
index 0000000000..8e24be8100
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitary_amd64.s
@@ -0,0 +1,97 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
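+
+// AxpyUnitary below peels scalar iterations until y is 16-byte aligned,
+// so the unrolled loop can feed ADDPS from aligned addresses. The
+// XORQ/INCQ/SHRQ sequence computes the peel count; in Go terms (a
+// sketch, using a hypothetical rem for the misalignment in bytes):
+//
+//	peel := ((rem ^ 15) + 1) >> 2 // == (16 - rem) / 4 float32 elements
+//
+// e.g. rem = 4 gives peel = 3 scalar iterations before the 16x-unrolled
+// main loop takes over.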
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func AxpyUnitary(alpha float32, x, y []float32) +TEXT ·AxpyUnitary(SB), NOSPLIT, $0 + MOVQ x_base+8(FP), SI // SI = &x + MOVQ y_base+32(FP), DI // DI = &y + MOVQ x_len+16(FP), BX // BX = min( len(x), len(y) ) + CMPQ y_len+40(FP), BX + CMOVQLE y_len+40(FP), BX + CMPQ BX, $0 // if BX == 0 { return } + JE axpy_end + MOVSS alpha+0(FP), X0 + SHUFPS $0, X0, X0 // X0 = { a, a, a, a } + XORQ AX, AX // i = 0 + PXOR X2, X2 // 2 NOP instructions (PXOR) to align + PXOR X3, X3 // loop to cache line + MOVQ DI, CX + ANDQ $0xF, CX // Align on 16-byte boundary for ADDPS + JZ axpy_no_trim // if CX == 0 { goto axpy_no_trim } + + XORQ $0xF, CX // CX = 4 - floor( BX % 16 / 4 ) + INCQ CX + SHRQ $2, CX + +axpy_align: // Trim first value(s) in unaligned buffer do { + MOVSS (SI)(AX*4), X2 // X2 = x[i] + MULSS X0, X2 // X2 *= a + ADDSS (DI)(AX*4), X2 // X2 += y[i] + MOVSS X2, (DI)(AX*4) // y[i] = X2 + INCQ AX // i++ + DECQ BX + JZ axpy_end // if --BX == 0 { return } + LOOP axpy_align // } while --CX > 0 + +axpy_no_trim: + MOVUPS X0, X1 // Copy X0 to X1 for pipelining + MOVQ BX, CX + ANDQ $0xF, BX // BX = len % 16 + SHRQ $4, CX // CX = int( len / 16 ) + JZ axpy_tail4_start // if CX == 0 { return } + +axpy_loop: // Loop unrolled 16x do { + MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4] + MOVUPS 16(SI)(AX*4), X3 + MOVUPS 32(SI)(AX*4), X4 + MOVUPS 48(SI)(AX*4), X5 + MULPS X0, X2 // X2 *= a + MULPS X1, X3 + MULPS X0, X4 + MULPS X1, X5 + ADDPS (DI)(AX*4), X2 // X2 += y[i:i+4] + ADDPS 16(DI)(AX*4), X3 + ADDPS 32(DI)(AX*4), X4 + ADDPS 48(DI)(AX*4), X5 + MOVUPS X2, (DI)(AX*4) // dst[i:i+4] = X2 + MOVUPS X3, 16(DI)(AX*4) + MOVUPS X4, 32(DI)(AX*4) + MOVUPS X5, 48(DI)(AX*4) + ADDQ $16, AX // i += 16 + LOOP axpy_loop // while (--CX) > 0 + CMPQ BX, $0 // if BX == 0 { return } + JE axpy_end + +axpy_tail4_start: // Reset loop counter for 4-wide tail loop + MOVQ BX, CX // CX = floor( BX / 4 ) + SHRQ $2, CX + JZ axpy_tail_start // if CX == 0 { goto axpy_tail_start } + +axpy_tail4: // Loop unrolled 4x do { + MOVUPS (SI)(AX*4), X2 // X2 = x[i] + MULPS X0, X2 // X2 *= a + ADDPS (DI)(AX*4), X2 // X2 += y[i] + MOVUPS X2, (DI)(AX*4) // y[i] = X2 + ADDQ $4, AX // i += 4 + LOOP axpy_tail4 // } while --CX > 0 + +axpy_tail_start: // Reset loop counter for 1-wide tail loop + MOVQ BX, CX // CX = BX % 4 + ANDQ $3, CX + JZ axpy_end // if CX == 0 { return } + +axpy_tail: + MOVSS (SI)(AX*4), X1 // X1 = x[i] + MULSS X0, X1 // X1 *= a + ADDSS (DI)(AX*4), X1 // X1 += y[i] + MOVSS X1, (DI)(AX*4) // y[i] = X1 + INCQ AX // i++ + LOOP axpy_tail // } while --CX > 0 + +axpy_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitaryto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitaryto_amd64.s new file mode 100644 index 0000000000..9a68f0f491 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitaryto_amd64.s @@ -0,0 +1,98 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32) +TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0 + MOVQ dst_base+0(FP), DI // DI = &dst + MOVQ x_base+32(FP), SI // SI = &x + MOVQ y_base+56(FP), DX // DX = &y + MOVQ x_len+40(FP), BX // BX = min( len(x), len(y), len(dst) ) + CMPQ y_len+64(FP), BX + CMOVQLE y_len+64(FP), BX + CMPQ dst_len+8(FP), BX + CMOVQLE dst_len+8(FP), BX + CMPQ BX, $0 // if BX == 0 { return } + JE axpy_end + MOVSS alpha+24(FP), X0 + SHUFPS $0, X0, X0 // X0 = { a, a, a, a, } + XORQ AX, AX // i = 0 + MOVQ DX, CX + ANDQ $0xF, CX // Align on 16-byte boundary for ADDPS + JZ axpy_no_trim // if CX == 0 { goto axpy_no_trim } + + XORQ $0xF, CX // CX = 4 - floor ( B % 16 / 4 ) + INCQ CX + SHRQ $2, CX + +axpy_align: // Trim first value(s) in unaligned buffer do { + MOVSS (SI)(AX*4), X2 // X2 = x[i] + MULSS X0, X2 // X2 *= a + ADDSS (DX)(AX*4), X2 // X2 += y[i] + MOVSS X2, (DI)(AX*4) // y[i] = X2 + INCQ AX // i++ + DECQ BX + JZ axpy_end // if --BX == 0 { return } + LOOP axpy_align // } while --CX > 0 + +axpy_no_trim: + MOVUPS X0, X1 // Copy X0 to X1 for pipelining + MOVQ BX, CX + ANDQ $0xF, BX // BX = len % 16 + SHRQ $4, CX // CX = floor( len / 16 ) + JZ axpy_tail4_start // if CX == 0 { return } + +axpy_loop: // Loop unrolled 16x do { + MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4] + MOVUPS 16(SI)(AX*4), X3 + MOVUPS 32(SI)(AX*4), X4 + MOVUPS 48(SI)(AX*4), X5 + MULPS X0, X2 // X2 *= a + MULPS X1, X3 + MULPS X0, X4 + MULPS X1, X5 + ADDPS (DX)(AX*4), X2 // X2 += y[i:i+4] + ADDPS 16(DX)(AX*4), X3 + ADDPS 32(DX)(AX*4), X4 + ADDPS 48(DX)(AX*4), X5 + MOVUPS X2, (DI)(AX*4) // dst[i:i+4] = X2 + MOVUPS X3, 16(DI)(AX*4) + MOVUPS X4, 32(DI)(AX*4) + MOVUPS X5, 48(DI)(AX*4) + ADDQ $16, AX // i += 16 + LOOP axpy_loop // while (--CX) > 0 + CMPQ BX, $0 // if BX == 0 { return } + JE axpy_end + +axpy_tail4_start: // Reset loop counter for 4-wide tail loop + MOVQ BX, CX // CX = floor( BX / 4 ) + SHRQ $2, CX + JZ axpy_tail_start // if CX == 0 { goto axpy_tail_start } + +axpy_tail4: // Loop unrolled 4x do { + MOVUPS (SI)(AX*4), X2 // X2 = x[i] + MULPS X0, X2 // X2 *= a + ADDPS (DX)(AX*4), X2 // X2 += y[i] + MOVUPS X2, (DI)(AX*4) // y[i] = X2 + ADDQ $4, AX // i += 4 + LOOP axpy_tail4 // } while --CX > 0 + +axpy_tail_start: // Reset loop counter for 1-wide tail loop + MOVQ BX, CX // CX = BX % 4 + ANDQ $3, CX + JZ axpy_end // if CX == 0 { return } + +axpy_tail: + MOVSS (SI)(AX*4), X1 // X1 = x[i] + MULSS X0, X1 // X1 *= a + ADDSS (DX)(AX*4), X1 // X1 += y[i] + MOVSS X1, (DI)(AX*4) // y[i] = X1 + INCQ AX // i++ + LOOP axpy_tail // } while --CX > 0 + +axpy_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/ddotinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/ddotinc_amd64.s new file mode 100644 index 0000000000..85fcd89eed --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/ddotinc_amd64.s @@ -0,0 +1,91 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
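+
+// DdotInc below computes a float32 dot product with float64 accumulation
+// (note the CVTSS2SD conversions and the float64 result slot), the
+// dsdot-style accumulation used by BLAS. A pure-Go sketch of the
+// contract:
+//
+//	var sum float64
+//	for i := 0; i < int(n); i++ {
+//		sum += float64(x[ix]) * float64(y[iy])
+//		ix += incX
+//		iy += incY
+//	}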
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define X_PTR SI +#define Y_PTR DI +#define LEN CX +#define TAIL BX +#define INC_X R8 +#define INCx3_X R10 +#define INC_Y R9 +#define INCx3_Y R11 +#define SUM X0 +#define P_SUM X1 + +// func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64) +TEXT ·DdotInc(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), X_PTR // X_PTR = &x + MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y + MOVQ n+48(FP), LEN // LEN = n + PXOR SUM, SUM // SUM = 0 + CMPQ LEN, $0 + JE dot_end + + MOVQ ix+72(FP), INC_X // INC_X = ix + MOVQ iy+80(FP), INC_Y // INC_Y = iy + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(x[ix]) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(y[iy]) + + MOVQ incX+56(FP), INC_X // INC_X = incX * sizeof(float32) + SHLQ $2, INC_X + MOVQ incY+64(FP), INC_Y // INC_Y = incY * sizeof(float32) + SHLQ $2, INC_Y + + MOVQ LEN, TAIL + ANDQ $3, TAIL // TAIL = LEN % 4 + SHRQ $2, LEN // LEN = floor( LEN / 4 ) + JZ dot_tail // if LEN == 0 { goto dot_tail } + + PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining + LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 + LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3 + +dot_loop: // Loop unrolled 4x do { + CVTSS2SD (X_PTR), X2 // X_i = x[i:i+1] + CVTSS2SD (X_PTR)(INC_X*1), X3 + CVTSS2SD (X_PTR)(INC_X*2), X4 + CVTSS2SD (X_PTR)(INCx3_X*1), X5 + + CVTSS2SD (Y_PTR), X6 // X_j = y[i:i+1] + CVTSS2SD (Y_PTR)(INC_Y*1), X7 + CVTSS2SD (Y_PTR)(INC_Y*2), X8 + CVTSS2SD (Y_PTR)(INCx3_Y*1), X9 + + MULSD X6, X2 // X_i *= X_j + MULSD X7, X3 + MULSD X8, X4 + MULSD X9, X5 + + ADDSD X2, SUM // SUM += X_i + ADDSD X3, P_SUM + ADDSD X4, SUM + ADDSD X5, P_SUM + + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X * 4]) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y * 4]) + + DECQ LEN + JNZ dot_loop // } while --LEN > 0 + + ADDSD P_SUM, SUM // SUM += P_SUM + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE dot_end + +dot_tail: // do { + CVTSS2SD (X_PTR), X2 // X2 = x[i] + CVTSS2SD (Y_PTR), X3 // X2 *= y[i] + MULSD X3, X2 + ADDSD X2, SUM // SUM += X2 + ADDQ INC_X, X_PTR // X_PTR += INC_X + ADDQ INC_Y, Y_PTR // Y_PTR += INC_Y + DECQ TAIL + JNZ dot_tail // } while --TAIL > 0 + +dot_end: + MOVSD SUM, sum+88(FP) // return SUM + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/ddotunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/ddotunitary_amd64.s new file mode 100644 index 0000000000..87ef09fa39 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/ddotunitary_amd64.s @@ -0,0 +1,110 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
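+
+// DdotUnitary below keeps two packed accumulators (SUM and P_SUM) so the
+// ADDPDs of consecutive iterations do not serialize on one register. The
+// final HADDPD (byte-encoded, presumably for old assembler support)
+// reduces the two float64 lanes, so the returned value is
+//
+//	sum = (SUM[0] + P_SUM[0]) + (SUM[1] + P_SUM[1])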
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define HADDPD_SUM_SUM LONG $0xC07C0F66 // @ HADDPD X0, X0 + +#define X_PTR SI +#define Y_PTR DI +#define LEN CX +#define TAIL BX +#define IDX AX +#define SUM X0 +#define P_SUM X1 + +// func DdotUnitary(x, y []float32) (sum float32) +TEXT ·DdotUnitary(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), X_PTR // X_PTR = &x + MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y + MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) ) + CMPQ y_len+32(FP), LEN + CMOVQLE y_len+32(FP), LEN + PXOR SUM, SUM // psum = 0 + CMPQ LEN, $0 + JE dot_end + + XORQ IDX, IDX + MOVQ Y_PTR, DX + ANDQ $0xF, DX // Align on 16-byte boundary for ADDPS + JZ dot_no_trim // if DX == 0 { goto dot_no_trim } + + SUBQ $16, DX + +dot_align: // Trim first value(s) in unaligned buffer do { + CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i]) + CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i]) + MULSD X3, X2 + ADDSD X2, SUM // SUM += X2 + INCQ IDX // IDX++ + DECQ LEN + JZ dot_end // if --TAIL == 0 { return } + ADDQ $4, DX + JNZ dot_align // } while --LEN > 0 + +dot_no_trim: + PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining + MOVQ LEN, TAIL + ANDQ $0x7, TAIL // TAIL = LEN % 8 + SHRQ $3, LEN // LEN = floor( LEN / 8 ) + JZ dot_tail_start // if LEN == 0 { goto dot_tail_start } + +dot_loop: // Loop unrolled 8x do { + CVTPS2PD (X_PTR)(IDX*4), X2 // X_i = x[i:i+1] + CVTPS2PD 8(X_PTR)(IDX*4), X3 + CVTPS2PD 16(X_PTR)(IDX*4), X4 + CVTPS2PD 24(X_PTR)(IDX*4), X5 + + CVTPS2PD (Y_PTR)(IDX*4), X6 // X_j = y[i:i+1] + CVTPS2PD 8(Y_PTR)(IDX*4), X7 + CVTPS2PD 16(Y_PTR)(IDX*4), X8 + CVTPS2PD 24(Y_PTR)(IDX*4), X9 + + MULPD X6, X2 // X_i *= X_j + MULPD X7, X3 + MULPD X8, X4 + MULPD X9, X5 + + ADDPD X2, SUM // SUM += X_i + ADDPD X3, P_SUM + ADDPD X4, SUM + ADDPD X5, P_SUM + + ADDQ $8, IDX // IDX += 8 + DECQ LEN + JNZ dot_loop // } while --LEN > 0 + + ADDPD P_SUM, SUM // SUM += P_SUM + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE dot_end + +dot_tail_start: + MOVQ TAIL, LEN + SHRQ $1, LEN + JZ dot_tail_one + +dot_tail_two: + CVTPS2PD (X_PTR)(IDX*4), X2 // X_i = x[i:i+1] + CVTPS2PD (Y_PTR)(IDX*4), X6 // X_j = y[i:i+1] + MULPD X6, X2 // X_i *= X_j + ADDPD X2, SUM // SUM += X_i + ADDQ $2, IDX // IDX += 2 + DECQ LEN + JNZ dot_tail_two // } while --LEN > 0 + + ANDQ $1, TAIL + JZ dot_end + +dot_tail_one: + CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i]) + CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i]) + MULSD X3, X2 // X2 *= X3 + ADDSD X2, SUM // SUM += X2 + +dot_end: + HADDPD_SUM_SUM // SUM = \sum{ SUM[i] } + MOVSD SUM, sum+48(FP) // return SUM + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/doc.go b/vendor/gonum.org/v1/gonum/internal/asm/f32/doc.go new file mode 100644 index 0000000000..408847a698 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/doc.go @@ -0,0 +1,6 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package f32 provides float32 vector primitives. +package f32 // import "gonum.org/v1/gonum/internal/asm/f32" diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/dotinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/dotinc_amd64.s new file mode 100644 index 0000000000..9ac8063691 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/dotinc_amd64.s @@ -0,0 +1,85 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
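+
+// DotInc below unrolls the strided dot product four ways. Since x86
+// scaled addressing only supports *1, *2, *4 and *8, the kernel
+// precomputes a 3x stride with LEAQ (INC)(INC*2) and touches the four
+// elements of each iteration as
+//
+//	(PTR), (PTR)(INC*1), (PTR)(INC*2), (PTR)(INCx3*1)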
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define X_PTR SI +#define Y_PTR DI +#define LEN CX +#define TAIL BX +#define INC_X R8 +#define INCx3_X R10 +#define INC_Y R9 +#define INCx3_Y R11 +#define SUM X0 +#define P_SUM X1 + +// func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32) +TEXT ·DotInc(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), X_PTR // X_PTR = &x + MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y + PXOR SUM, SUM // SUM = 0 + MOVQ n+48(FP), LEN // LEN = n + CMPQ LEN, $0 + JE dot_end + + MOVQ ix+72(FP), INC_X // INC_X = ix + MOVQ iy+80(FP), INC_Y // INC_Y = iy + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(x[ix]) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(y[iy]) + + MOVQ incX+56(FP), INC_X // INC_X := incX * sizeof(float32) + SHLQ $2, INC_X + MOVQ incY+64(FP), INC_Y // INC_Y := incY * sizeof(float32) + SHLQ $2, INC_Y + + MOVQ LEN, TAIL + ANDQ $0x3, TAIL // TAIL = LEN % 4 + SHRQ $2, LEN // LEN = floor( LEN / 4 ) + JZ dot_tail // if LEN == 0 { goto dot_tail } + + PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining + LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 + LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3 + +dot_loop: // Loop unrolled 4x do { + MOVSS (X_PTR), X2 // X_i = x[i:i+1] + MOVSS (X_PTR)(INC_X*1), X3 + MOVSS (X_PTR)(INC_X*2), X4 + MOVSS (X_PTR)(INCx3_X*1), X5 + + MULSS (Y_PTR), X2 // X_i *= y[i:i+1] + MULSS (Y_PTR)(INC_Y*1), X3 + MULSS (Y_PTR)(INC_Y*2), X4 + MULSS (Y_PTR)(INCx3_Y*1), X5 + + ADDSS X2, SUM // SUM += X_i + ADDSS X3, P_SUM + ADDSS X4, SUM + ADDSS X5, P_SUM + + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X * 4]) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y * 4]) + + DECQ LEN + JNZ dot_loop // } while --LEN > 0 + + ADDSS P_SUM, SUM // P_SUM += SUM + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE dot_end + +dot_tail: // do { + MOVSS (X_PTR), X2 // X2 = x[i] + MULSS (Y_PTR), X2 // X2 *= y[i] + ADDSS X2, SUM // SUM += X2 + ADDQ INC_X, X_PTR // X_PTR += INC_X + ADDQ INC_Y, Y_PTR // Y_PTR += INC_Y + DECQ TAIL + JNZ dot_tail // } while --TAIL > 0 + +dot_end: + MOVSS SUM, sum+88(FP) // return SUM + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/dotunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/dotunitary_amd64.s new file mode 100644 index 0000000000..0023a6e244 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/dotunitary_amd64.s @@ -0,0 +1,106 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
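+
+// DotUnitary below accumulates four float32 lanes and reduces them with
+// two HADDPS passes at the end: the first pass leaves
+// { s0+s1, s2+s3, s0+s1, s2+s3 }, the second leaves the full sum in
+// every lane, so a single MOVSS returns it.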
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define HADDPS_SUM_SUM LONG $0xC07C0FF2 // @ HADDPS X0, X0 + +#define X_PTR SI +#define Y_PTR DI +#define LEN CX +#define TAIL BX +#define IDX AX +#define SUM X0 +#define P_SUM X1 + +// func DotUnitary(x, y []float32) (sum float32) +TEXT ·DotUnitary(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), X_PTR // X_PTR = &x + MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y + PXOR SUM, SUM // SUM = 0 + MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) ) + CMPQ y_len+32(FP), LEN + CMOVQLE y_len+32(FP), LEN + CMPQ LEN, $0 + JE dot_end + + XORQ IDX, IDX + MOVQ Y_PTR, DX + ANDQ $0xF, DX // Align on 16-byte boundary for MULPS + JZ dot_no_trim // if DX == 0 { goto dot_no_trim } + SUBQ $16, DX + +dot_align: // Trim first value(s) in unaligned buffer do { + MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i] + MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i] + ADDSS X2, SUM // SUM += X2 + INCQ IDX // IDX++ + DECQ LEN + JZ dot_end // if --TAIL == 0 { return } + ADDQ $4, DX + JNZ dot_align // } while --DX > 0 + +dot_no_trim: + PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining + MOVQ LEN, TAIL + ANDQ $0xF, TAIL // TAIL = LEN % 16 + SHRQ $4, LEN // LEN = floor( LEN / 16 ) + JZ dot_tail4_start // if LEN == 0 { goto dot_tail4_start } + +dot_loop: // Loop unrolled 16x do { + MOVUPS (X_PTR)(IDX*4), X2 // X_i = x[i:i+1] + MOVUPS 16(X_PTR)(IDX*4), X3 + MOVUPS 32(X_PTR)(IDX*4), X4 + MOVUPS 48(X_PTR)(IDX*4), X5 + + MULPS (Y_PTR)(IDX*4), X2 // X_i *= y[i:i+1] + MULPS 16(Y_PTR)(IDX*4), X3 + MULPS 32(Y_PTR)(IDX*4), X4 + MULPS 48(Y_PTR)(IDX*4), X5 + + ADDPS X2, SUM // SUM += X_i + ADDPS X3, P_SUM + ADDPS X4, SUM + ADDPS X5, P_SUM + + ADDQ $16, IDX // IDX += 16 + DECQ LEN + JNZ dot_loop // } while --LEN > 0 + + ADDPS P_SUM, SUM // SUM += P_SUM + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE dot_end + +dot_tail4_start: // Reset loop counter for 4-wide tail loop + MOVQ TAIL, LEN // LEN = floor( TAIL / 4 ) + SHRQ $2, LEN + JZ dot_tail_start // if LEN == 0 { goto dot_tail_start } + +dot_tail4_loop: // Loop unrolled 4x do { + MOVUPS (X_PTR)(IDX*4), X2 // X_i = x[i:i+1] + MULPS (Y_PTR)(IDX*4), X2 // X_i *= y[i:i+1] + ADDPS X2, SUM // SUM += X_i + ADDQ $4, IDX // i += 4 + DECQ LEN + JNZ dot_tail4_loop // } while --LEN > 0 + +dot_tail_start: // Reset loop counter for 1-wide tail loop + ANDQ $3, TAIL // TAIL = TAIL % 4 + JZ dot_end // if TAIL == 0 { return } + +dot_tail: // do { + MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i] + MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i] + ADDSS X2, SUM // psum += X2 + INCQ IDX // IDX++ + DECQ TAIL + JNZ dot_tail // } while --TAIL > 0 + +dot_end: + HADDPS_SUM_SUM // SUM = \sum{ SUM[i] } + HADDPS_SUM_SUM + MOVSS SUM, sum+48(FP) // return SUM + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.go b/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.go new file mode 100644 index 0000000000..72acba2077 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.go @@ -0,0 +1,18 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !noasm && !gccgo && !safe +// +build !noasm,!gccgo,!safe + +package f32 + +// Ger performs the rank-one operation +// +// A += alpha * x * yᵀ +// +// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar. 
+func Ger(m, n uintptr, alpha float32, + x []float32, incX uintptr, + y []float32, incY uintptr, + a []float32, lda uintptr) diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.s new file mode 100644 index 0000000000..f8fd3df862 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.s @@ -0,0 +1,757 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define SIZE 4 +#define BITSIZE 2 +#define KERNELSIZE 3 + +#define M_DIM m+0(FP) +#define M CX +#define N_DIM n+8(FP) +#define N BX + +#define TMP1 R14 +#define TMP2 R15 + +#define X_PTR SI +#define Y y_base+56(FP) +#define Y_PTR DX +#define A_ROW AX +#define A_PTR DI + +#define INC_X R8 +#define INC3_X R9 + +#define INC_Y R10 +#define INC3_Y R11 + +#define LDA R12 +#define LDA3 R13 + +#define ALPHA X0 +#define ALPHA_SPILL al-16(SP) + +#define LOAD_ALPHA \ + MOVSS alpha+16(FP), ALPHA \ + SHUFPS $0, ALPHA, ALPHA + +#define LOAD_SCALED4 \ + PREFETCHNTA 16*SIZE(X_PTR) \ + MOVDDUP (X_PTR), X1 \ + MOVDDUP 2*SIZE(X_PTR), X3 \ + MOVSHDUP X1, X2 \ + MOVSHDUP X3, X4 \ + MOVSLDUP X1, X1 \ + MOVSLDUP X3, X3 \ + MULPS ALPHA, X1 \ + MULPS ALPHA, X2 \ + MULPS ALPHA, X3 \ + MULPS ALPHA, X4 + +#define LOAD_SCALED2 \ + MOVDDUP (X_PTR), X1 \ + MOVSHDUP X1, X2 \ + MOVSLDUP X1, X1 \ + MULPS ALPHA, X1 \ + MULPS ALPHA, X2 + +#define LOAD_SCALED1 \ + MOVSS (X_PTR), X1 \ + SHUFPS $0, X1, X1 \ + MULPS ALPHA, X1 + +#define LOAD_SCALED4_INC \ + PREFETCHNTA (X_PTR)(INC_X*8) \ + MOVSS (X_PTR), X1 \ + MOVSS (X_PTR)(INC_X*1), X2 \ + MOVSS (X_PTR)(INC_X*2), X3 \ + MOVSS (X_PTR)(INC3_X*1), X4 \ + SHUFPS $0, X1, X1 \ + SHUFPS $0, X2, X2 \ + SHUFPS $0, X3, X3 \ + SHUFPS $0, X4, X4 \ + MULPS ALPHA, X1 \ + MULPS ALPHA, X2 \ + MULPS ALPHA, X3 \ + MULPS ALPHA, X4 + +#define LOAD_SCALED2_INC \ + MOVSS (X_PTR), X1 \ + MOVSS (X_PTR)(INC_X*1), X2 \ + SHUFPS $0, X1, X1 \ + SHUFPS $0, X2, X2 \ + MULPS ALPHA, X1 \ + MULPS ALPHA, X2 + +#define KERNEL_LOAD8 \ + MOVUPS (Y_PTR), X5 \ + MOVUPS 4*SIZE(Y_PTR), X6 + +#define KERNEL_LOAD8_INC \ + MOVSS (Y_PTR), X5 \ + MOVSS (Y_PTR)(INC_Y*1), X6 \ + MOVSS (Y_PTR)(INC_Y*2), X7 \ + MOVSS (Y_PTR)(INC3_Y*1), X8 \ + UNPCKLPS X6, X5 \ + UNPCKLPS X8, X7 \ + MOVLHPS X7, X5 \ + LEAQ (Y_PTR)(INC_Y*4), Y_PTR \ + MOVSS (Y_PTR), X6 \ + MOVSS (Y_PTR)(INC_Y*1), X7 \ + MOVSS (Y_PTR)(INC_Y*2), X8 \ + MOVSS (Y_PTR)(INC3_Y*1), X9 \ + UNPCKLPS X7, X6 \ + UNPCKLPS X9, X8 \ + MOVLHPS X8, X6 + +#define KERNEL_LOAD4 \ + MOVUPS (Y_PTR), X5 + +#define KERNEL_LOAD4_INC \ + MOVSS (Y_PTR), X5 \ + MOVSS (Y_PTR)(INC_Y*1), X6 \ + MOVSS (Y_PTR)(INC_Y*2), X7 \ + MOVSS (Y_PTR)(INC3_Y*1), X8 \ + UNPCKLPS X6, X5 \ + UNPCKLPS X8, X7 \ + MOVLHPS X7, X5 + +#define KERNEL_LOAD2 \ + MOVSD (Y_PTR), X5 + +#define KERNEL_LOAD2_INC \ + MOVSS (Y_PTR), X5 \ + MOVSS (Y_PTR)(INC_Y*1), X6 \ + UNPCKLPS X6, X5 + +#define KERNEL_4x8 \ + MOVUPS X5, X7 \ + MOVUPS X6, X8 \ + MOVUPS X5, X9 \ + MOVUPS X6, X10 \ + MOVUPS X5, X11 \ + MOVUPS X6, X12 \ + MULPS X1, X5 \ + MULPS X1, X6 \ + MULPS X2, X7 \ + MULPS X2, X8 \ + MULPS X3, X9 \ + MULPS X3, X10 \ + MULPS X4, X11 \ + MULPS X4, X12 + +#define STORE_4x8 \ + MOVUPS ALPHA, ALPHA_SPILL \ + MOVUPS (A_PTR), X13 \ + ADDPS X13, X5 \ + MOVUPS 4*SIZE(A_PTR), X14 \ + ADDPS X14, X6 \ + MOVUPS (A_PTR)(LDA*1), X15 \ + ADDPS X15, X7 \ + MOVUPS 4*SIZE(A_PTR)(LDA*1), X0 \ + ADDPS X0, X8 \ + MOVUPS (A_PTR)(LDA*2), 
X13 \ + ADDPS X13, X9 \ + MOVUPS 4*SIZE(A_PTR)(LDA*2), X14 \ + ADDPS X14, X10 \ + MOVUPS (A_PTR)(LDA3*1), X15 \ + ADDPS X15, X11 \ + MOVUPS 4*SIZE(A_PTR)(LDA3*1), X0 \ + ADDPS X0, X12 \ + MOVUPS X5, (A_PTR) \ + MOVUPS X6, 4*SIZE(A_PTR) \ + MOVUPS X7, (A_PTR)(LDA*1) \ + MOVUPS X8, 4*SIZE(A_PTR)(LDA*1) \ + MOVUPS X9, (A_PTR)(LDA*2) \ + MOVUPS X10, 4*SIZE(A_PTR)(LDA*2) \ + MOVUPS X11, (A_PTR)(LDA3*1) \ + MOVUPS X12, 4*SIZE(A_PTR)(LDA3*1) \ + MOVUPS ALPHA_SPILL, ALPHA \ + ADDQ $8*SIZE, A_PTR + +#define KERNEL_4x4 \ + MOVUPS X5, X6 \ + MOVUPS X5, X7 \ + MOVUPS X5, X8 \ + MULPS X1, X5 \ + MULPS X2, X6 \ + MULPS X3, X7 \ + MULPS X4, X8 + +#define STORE_4x4 \ + MOVUPS (A_PTR), X13 \ + ADDPS X13, X5 \ + MOVUPS (A_PTR)(LDA*1), X14 \ + ADDPS X14, X6 \ + MOVUPS (A_PTR)(LDA*2), X15 \ + ADDPS X15, X7 \ + MOVUPS (A_PTR)(LDA3*1), X13 \ + ADDPS X13, X8 \ + MOVUPS X5, (A_PTR) \ + MOVUPS X6, (A_PTR)(LDA*1) \ + MOVUPS X7, (A_PTR)(LDA*2) \ + MOVUPS X8, (A_PTR)(LDA3*1) \ + ADDQ $4*SIZE, A_PTR + +#define KERNEL_4x2 \ + MOVUPS X5, X6 \ + MOVUPS X5, X7 \ + MOVUPS X5, X8 \ + MULPS X1, X5 \ + MULPS X2, X6 \ + MULPS X3, X7 \ + MULPS X4, X8 + +#define STORE_4x2 \ + MOVSD (A_PTR), X9 \ + ADDPS X9, X5 \ + MOVSD (A_PTR)(LDA*1), X10 \ + ADDPS X10, X6 \ + MOVSD (A_PTR)(LDA*2), X11 \ + ADDPS X11, X7 \ + MOVSD (A_PTR)(LDA3*1), X12 \ + ADDPS X12, X8 \ + MOVSD X5, (A_PTR) \ + MOVSD X6, (A_PTR)(LDA*1) \ + MOVSD X7, (A_PTR)(LDA*2) \ + MOVSD X8, (A_PTR)(LDA3*1) \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_4x1 \ + MOVSS (Y_PTR), X5 \ + MOVSS X5, X6 \ + MOVSS X5, X7 \ + MOVSS X5, X8 \ + MULSS X1, X5 \ + MULSS X2, X6 \ + MULSS X3, X7 \ + MULSS X4, X8 + +#define STORE_4x1 \ + ADDSS (A_PTR), X5 \ + ADDSS (A_PTR)(LDA*1), X6 \ + ADDSS (A_PTR)(LDA*2), X7 \ + ADDSS (A_PTR)(LDA3*1), X8 \ + MOVSS X5, (A_PTR) \ + MOVSS X6, (A_PTR)(LDA*1) \ + MOVSS X7, (A_PTR)(LDA*2) \ + MOVSS X8, (A_PTR)(LDA3*1) \ + ADDQ $SIZE, A_PTR + +#define KERNEL_2x8 \ + MOVUPS X5, X7 \ + MOVUPS X6, X8 \ + MULPS X1, X5 \ + MULPS X1, X6 \ + MULPS X2, X7 \ + MULPS X2, X8 + +#define STORE_2x8 \ + MOVUPS (A_PTR), X9 \ + ADDPS X9, X5 \ + MOVUPS 4*SIZE(A_PTR), X10 \ + ADDPS X10, X6 \ + MOVUPS (A_PTR)(LDA*1), X11 \ + ADDPS X11, X7 \ + MOVUPS 4*SIZE(A_PTR)(LDA*1), X12 \ + ADDPS X12, X8 \ + MOVUPS X5, (A_PTR) \ + MOVUPS X6, 4*SIZE(A_PTR) \ + MOVUPS X7, (A_PTR)(LDA*1) \ + MOVUPS X8, 4*SIZE(A_PTR)(LDA*1) \ + ADDQ $8*SIZE, A_PTR + +#define KERNEL_2x4 \ + MOVUPS X5, X6 \ + MULPS X1, X5 \ + MULPS X2, X6 + +#define STORE_2x4 \ + MOVUPS (A_PTR), X9 \ + ADDPS X9, X5 \ + MOVUPS (A_PTR)(LDA*1), X11 \ + ADDPS X11, X6 \ + MOVUPS X5, (A_PTR) \ + MOVUPS X6, (A_PTR)(LDA*1) \ + ADDQ $4*SIZE, A_PTR + +#define KERNEL_2x2 \ + MOVSD X5, X6 \ + MULPS X1, X5 \ + MULPS X2, X6 + +#define STORE_2x2 \ + MOVSD (A_PTR), X7 \ + ADDPS X7, X5 \ + MOVSD (A_PTR)(LDA*1), X8 \ + ADDPS X8, X6 \ + MOVSD X5, (A_PTR) \ + MOVSD X6, (A_PTR)(LDA*1) \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_2x1 \ + MOVSS (Y_PTR), X5 \ + MOVSS X5, X6 \ + MULSS X1, X5 \ + MULSS X2, X6 + +#define STORE_2x1 \ + ADDSS (A_PTR), X5 \ + ADDSS (A_PTR)(LDA*1), X6 \ + MOVSS X5, (A_PTR) \ + MOVSS X6, (A_PTR)(LDA*1) \ + ADDQ $SIZE, A_PTR + +#define KERNEL_1x8 \ + MULPS X1, X5 \ + MULPS X1, X6 + +#define STORE_1x8 \ + MOVUPS (A_PTR), X7 \ + ADDPS X7, X5 \ + MOVUPS 4*SIZE(A_PTR), X8 \ + ADDPS X8, X6 \ + MOVUPS X5, (A_PTR) \ + MOVUPS X6, 4*SIZE(A_PTR) \ + ADDQ $8*SIZE, A_PTR + +#define KERNEL_1x4 \ + MULPS X1, X5 \ + MULPS X1, X6 + +#define STORE_1x4 \ + MOVUPS (A_PTR), X7 \ + ADDPS X7, X5 \ + MOVUPS X5, (A_PTR) \ + ADDQ $4*SIZE, A_PTR + +#define 
KERNEL_1x2 \ + MULPS X1, X5 + +#define STORE_1x2 \ + MOVSD (A_PTR), X6 \ + ADDPS X6, X5 \ + MOVSD X5, (A_PTR) \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_1x1 \ + MOVSS (Y_PTR), X5 \ + MULSS X1, X5 + +#define STORE_1x1 \ + ADDSS (A_PTR), X5 \ + MOVSS X5, (A_PTR) \ + ADDQ $SIZE, A_PTR + +// func Ger(m, n uintptr, alpha float32, +// x []float32, incX uintptr, +// y []float32, incY uintptr, +// a []float32, lda uintptr) +TEXT ·Ger(SB), 0, $16-120 + MOVQ M_DIM, M + MOVQ N_DIM, N + CMPQ M, $0 + JE end + CMPQ N, $0 + JE end + + LOAD_ALPHA + + MOVQ x_base+24(FP), X_PTR + MOVQ y_base+56(FP), Y_PTR + MOVQ a_base+88(FP), A_ROW + MOVQ A_ROW, A_PTR + MOVQ lda+112(FP), LDA // LDA = LDA * sizeof(float32) + SHLQ $BITSIZE, LDA + LEAQ (LDA)(LDA*2), LDA3 // LDA3 = LDA * 3 + + CMPQ incY+80(FP), $1 // Check for dense vector Y (fast-path) + JNE inc + CMPQ incX+48(FP), $1 // Check for dense vector X (fast-path) + JNE inc + + SHRQ $2, M + JZ r2 + +r4: + + // LOAD 4 + LOAD_SCALED4 + + MOVQ N_DIM, N + SHRQ $KERNELSIZE, N + JZ r4c4 + +r4c8: + // 4x8 KERNEL + KERNEL_LOAD8 + KERNEL_4x8 + STORE_4x8 + + ADDQ $8*SIZE, Y_PTR + + DECQ N + JNZ r4c8 + +r4c4: + TESTQ $4, N_DIM + JZ r4c2 + + // 4x4 KERNEL + KERNEL_LOAD4 + KERNEL_4x4 + STORE_4x4 + + ADDQ $4*SIZE, Y_PTR + +r4c2: + TESTQ $2, N_DIM + JZ r4c1 + + // 4x2 KERNEL + KERNEL_LOAD2 + KERNEL_4x2 + STORE_4x2 + + ADDQ $2*SIZE, Y_PTR + +r4c1: + TESTQ $1, N_DIM + JZ r4end + + // 4x1 KERNEL + KERNEL_4x1 + STORE_4x1 + + ADDQ $SIZE, Y_PTR + +r4end: + ADDQ $4*SIZE, X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*4), A_ROW + MOVQ A_ROW, A_PTR + + DECQ M + JNZ r4 + +r2: + TESTQ $2, M_DIM + JZ r1 + + // LOAD 2 + LOAD_SCALED2 + + MOVQ N_DIM, N + SHRQ $KERNELSIZE, N + JZ r2c4 + +r2c8: + // 2x8 KERNEL + KERNEL_LOAD8 + KERNEL_2x8 + STORE_2x8 + + ADDQ $8*SIZE, Y_PTR + + DECQ N + JNZ r2c8 + +r2c4: + TESTQ $4, N_DIM + JZ r2c2 + + // 2x4 KERNEL + KERNEL_LOAD4 + KERNEL_2x4 + STORE_2x4 + + ADDQ $4*SIZE, Y_PTR + +r2c2: + TESTQ $2, N_DIM + JZ r2c1 + + // 2x2 KERNEL + KERNEL_LOAD2 + KERNEL_2x2 + STORE_2x2 + + ADDQ $2*SIZE, Y_PTR + +r2c1: + TESTQ $1, N_DIM + JZ r2end + + // 2x1 KERNEL + KERNEL_2x1 + STORE_2x1 + + ADDQ $SIZE, Y_PTR + +r2end: + ADDQ $2*SIZE, X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*2), A_ROW + MOVQ A_ROW, A_PTR + +r1: + TESTQ $1, M_DIM + JZ end + + // LOAD 1 + LOAD_SCALED1 + + MOVQ N_DIM, N + SHRQ $KERNELSIZE, N + JZ r1c4 + +r1c8: + // 1x8 KERNEL + KERNEL_LOAD8 + KERNEL_1x8 + STORE_1x8 + + ADDQ $8*SIZE, Y_PTR + + DECQ N + JNZ r1c8 + +r1c4: + TESTQ $4, N_DIM + JZ r1c2 + + // 1x4 KERNEL + KERNEL_LOAD4 + KERNEL_1x4 + STORE_1x4 + + ADDQ $4*SIZE, Y_PTR + +r1c2: + TESTQ $2, N_DIM + JZ r1c1 + + // 1x2 KERNEL + KERNEL_LOAD2 + KERNEL_1x2 + STORE_1x2 + + ADDQ $2*SIZE, Y_PTR + +r1c1: + TESTQ $1, N_DIM + JZ end + + // 1x1 KERNEL + KERNEL_1x1 + STORE_1x1 + +end: + RET + +inc: // Algorithm for incY != 0 ( split loads in kernel ) + + MOVQ incX+48(FP), INC_X // INC_X = incX * sizeof(float32) + SHLQ $BITSIZE, INC_X + MOVQ incY+80(FP), INC_Y // INC_Y = incY * sizeof(float32) + SHLQ $BITSIZE, INC_Y + LEAQ (INC_X)(INC_X*2), INC3_X // INC3_X = INC_X * 3 + LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3 + + XORQ TMP2, TMP2 + MOVQ M, TMP1 + SUBQ $1, TMP1 + IMULQ INC_X, TMP1 + NEGQ TMP1 + CMPQ INC_X, $0 + CMOVQLT TMP1, TMP2 + LEAQ (X_PTR)(TMP2*SIZE), X_PTR + + XORQ TMP2, TMP2 + MOVQ N, TMP1 + SUBQ $1, TMP1 + IMULQ INC_Y, TMP1 + NEGQ TMP1 + CMPQ INC_Y, $0 + CMOVQLT TMP1, TMP2 + LEAQ (Y_PTR)(TMP2*SIZE), Y_PTR + + SHRQ $2, M + JZ inc_r2 + +inc_r4: + // LOAD 4 + LOAD_SCALED4_INC + + MOVQ N_DIM, N + SHRQ 
$KERNELSIZE, N + JZ inc_r4c4 + +inc_r4c8: + // 4x4 KERNEL + KERNEL_LOAD8_INC + KERNEL_4x8 + STORE_4x8 + + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ N + JNZ inc_r4c8 + +inc_r4c4: + TESTQ $4, N_DIM + JZ inc_r4c2 + + // 4x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_4x4 + STORE_4x4 + + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + +inc_r4c2: + TESTQ $2, N_DIM + JZ inc_r4c1 + + // 4x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_4x2 + STORE_4x2 + + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_r4c1: + TESTQ $1, N_DIM + JZ inc_r4end + + // 4x1 KERNEL + KERNEL_4x1 + STORE_4x1 + + ADDQ INC_Y, Y_PTR + +inc_r4end: + LEAQ (X_PTR)(INC_X*4), X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*4), A_ROW + MOVQ A_ROW, A_PTR + + DECQ M + JNZ inc_r4 + +inc_r2: + TESTQ $2, M_DIM + JZ inc_r1 + + // LOAD 2 + LOAD_SCALED2_INC + + MOVQ N_DIM, N + SHRQ $KERNELSIZE, N + JZ inc_r2c4 + +inc_r2c8: + // 2x8 KERNEL + KERNEL_LOAD8_INC + KERNEL_2x8 + STORE_2x8 + + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ N + JNZ inc_r2c8 + +inc_r2c4: + TESTQ $4, N_DIM + JZ inc_r2c2 + + // 2x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_2x4 + STORE_2x4 + + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + +inc_r2c2: + TESTQ $2, N_DIM + JZ inc_r2c1 + + // 2x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_2x2 + STORE_2x2 + + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_r2c1: + TESTQ $1, N_DIM + JZ inc_r2end + + // 2x1 KERNEL + KERNEL_2x1 + STORE_2x1 + + ADDQ INC_Y, Y_PTR + +inc_r2end: + LEAQ (X_PTR)(INC_X*2), X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*2), A_ROW + MOVQ A_ROW, A_PTR + +inc_r1: + TESTQ $1, M_DIM + JZ end + + // LOAD 1 + LOAD_SCALED1 + + MOVQ N_DIM, N + SHRQ $KERNELSIZE, N + JZ inc_r1c4 + +inc_r1c8: + // 1x8 KERNEL + KERNEL_LOAD8_INC + KERNEL_1x8 + STORE_1x8 + + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ N + JNZ inc_r1c8 + +inc_r1c4: + TESTQ $4, N_DIM + JZ inc_r1c2 + + // 1x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_1x4 + STORE_1x4 + + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + +inc_r1c2: + TESTQ $2, N_DIM + JZ inc_r1c1 + + // 1x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_1x2 + STORE_1x2 + + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_r1c1: + TESTQ $1, N_DIM + JZ inc_end + + // 1x1 KERNEL + KERNEL_1x1 + STORE_1x1 + +inc_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_noasm.go b/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_noasm.go new file mode 100644 index 0000000000..61ee6f1802 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_noasm.go @@ -0,0 +1,39 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 || noasm || gccgo || safe +// +build !amd64 noasm gccgo safe + +package f32 + +// Ger performs the rank-one operation +// +// A += alpha * x * yᵀ +// +// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar. 
+func Ger(m, n uintptr, alpha float32, x []float32, incX uintptr, y []float32, incY uintptr, a []float32, lda uintptr) { + + if incX == 1 && incY == 1 { + x = x[:m] + y = y[:n] + for i, xv := range x { + AxpyUnitary(alpha*xv, y, a[uintptr(i)*lda:uintptr(i)*lda+n]) + } + return + } + + var ky, kx uintptr + if int(incY) < 0 { + ky = uintptr(-int(n-1) * int(incY)) + } + if int(incX) < 0 { + kx = uintptr(-int(m-1) * int(incX)) + } + + ix := kx + for i := 0; i < int(m); i++ { + AxpyInc(alpha*x[ix], y, a[uintptr(i)*lda:uintptr(i)*lda+n], uintptr(n), uintptr(incY), 1, uintptr(ky), 0) + ix += incX + } +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/gemv.go b/vendor/gonum.org/v1/gonum/internal/asm/f32/gemv.go new file mode 100644 index 0000000000..a6000504a7 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/gemv.go @@ -0,0 +1,92 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package f32 + +// GemvN computes +// +// y = alpha * A * x + beta * y +// +// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars. +func GemvN(m, n uintptr, alpha float32, a []float32, lda uintptr, x []float32, incX uintptr, beta float32, y []float32, incY uintptr) { + var kx, ky, i uintptr + if int(incX) < 0 { + kx = uintptr(-int(n-1) * int(incX)) + } + if int(incY) < 0 { + ky = uintptr(-int(m-1) * int(incY)) + } + + if incX == 1 && incY == 1 { + if beta == 0 { + for i = 0; i < m; i++ { + y[i] = alpha * DotUnitary(a[lda*i:lda*i+n], x) + } + return + } + for i = 0; i < m; i++ { + y[i] = y[i]*beta + alpha*DotUnitary(a[lda*i:lda*i+n], x) + } + return + } + iy := ky + if beta == 0 { + for i = 0; i < m; i++ { + y[iy] = alpha * DotInc(x, a[lda*i:lda*i+n], n, incX, 1, kx, 0) + iy += incY + } + return + } + for i = 0; i < m; i++ { + y[iy] = y[iy]*beta + alpha*DotInc(x, a[lda*i:lda*i+n], n, incX, 1, kx, 0) + iy += incY + } +} + +// GemvT computes +// +// y = alpha * Aᵀ * x + beta * y +// +// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars. +func GemvT(m, n uintptr, alpha float32, a []float32, lda uintptr, x []float32, incX uintptr, beta float32, y []float32, incY uintptr) { + var kx, ky, i uintptr + if int(incX) < 0 { + kx = uintptr(-int(m-1) * int(incX)) + } + if int(incY) < 0 { + ky = uintptr(-int(n-1) * int(incY)) + } + switch { + case beta == 0: // beta == 0 is special-cased to memclear + if incY == 1 { + for i := range y { + y[i] = 0 + } + } else { + iy := ky + for i := 0; i < int(n); i++ { + y[iy] = 0 + iy += incY + } + } + case int(incY) < 0: + ScalInc(beta, y, n, uintptr(int(-incY))) + case incY == 1: + ScalUnitary(beta, y[:n]) + default: + ScalInc(beta, y, n, incY) + } + + if incX == 1 && incY == 1 { + for i = 0; i < m; i++ { + AxpyUnitaryTo(y, alpha*x[i], a[lda*i:lda*i+n], y) + } + return + } + ix := kx + for i = 0; i < m; i++ { + AxpyInc(alpha*x[ix], a[lda*i:lda*i+n], y, n, 1, incY, 0, ky) + ix += incX + } +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/l2norm.go b/vendor/gonum.org/v1/gonum/internal/asm/f32/l2norm.go new file mode 100644 index 0000000000..0f2a77405c --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/l2norm.go @@ -0,0 +1,90 @@ +// Copyright ©2019 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
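[Editor's note] gemv.go above composes GemvN and GemvT from the dot and axpy kernels. Because asm/f32 is an internal package and cannot be imported from outside the gonum module, the following standalone sketch re-derives the dense (incX == incY == 1) path of GemvN to pin down the row-major a/lda convention; gemvN here is our illustrative helper, not the vendored function.

package main

import "fmt"

// gemvN mirrors the dense fast path of f32.GemvN:
// y = alpha*A*x + beta*y with A stored row-major, row i at a[i*lda:].
func gemvN(m, n int, alpha float32, a []float32, lda int, x []float32, beta float32, y []float32) {
	for i := 0; i < m; i++ {
		row := a[i*lda : i*lda+n]
		var dot float32
		for j, v := range row {
			dot += v * x[j]
		}
		y[i] = beta*y[i] + alpha*dot
	}
}

func main() {
	// 2x3 matrix, lda = 3 (row-major, no padding).
	a := []float32{
		1, 2, 3,
		4, 5, 6,
	}
	x := []float32{1, 1, 1}
	y := []float32{10, 20}
	gemvN(2, 3, 2, a, 3, x, 1, y) // y = 2*A*x + y
	fmt.Println(y)                // [22 50]
}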
+ +package f32 + +import "gonum.org/v1/gonum/internal/math32" + +// L2NormUnitary is the level 2 norm of x. +func L2NormUnitary(x []float32) (sum float32) { + var scale float32 + var sumSquares float32 = 1 + for _, v := range x { + if v == 0 { + continue + } + absxi := math32.Abs(v) + if math32.IsNaN(absxi) { + return math32.NaN() + } + if scale < absxi { + s := scale / absxi + sumSquares = 1 + sumSquares*s*s + scale = absxi + } else { + s := absxi / scale + sumSquares += s * s + } + } + if math32.IsInf(scale, 1) { + return math32.Inf(1) + } + return scale * math32.Sqrt(sumSquares) +} + +// L2NormInc is the level 2 norm of x. +func L2NormInc(x []float32, n, incX uintptr) (sum float32) { + var scale float32 + var sumSquares float32 = 1 + for ix := uintptr(0); ix < n*incX; ix += incX { + val := x[ix] + if val == 0 { + continue + } + absxi := math32.Abs(val) + if math32.IsNaN(absxi) { + return math32.NaN() + } + if scale < absxi { + s := scale / absxi + sumSquares = 1 + sumSquares*s*s + scale = absxi + } else { + s := absxi / scale + sumSquares += s * s + } + } + if math32.IsInf(scale, 1) { + return math32.Inf(1) + } + return scale * math32.Sqrt(sumSquares) +} + +// L2DistanceUnitary is the L2 norm of x-y. +func L2DistanceUnitary(x, y []float32) (sum float32) { + var scale float32 + var sumSquares float32 = 1 + for i, v := range x { + v -= y[i] + if v == 0 { + continue + } + absxi := math32.Abs(v) + if math32.IsNaN(absxi) { + return math32.NaN() + } + if scale < absxi { + s := scale / absxi + sumSquares = 1 + sumSquares*s*s + scale = absxi + } else { + s := absxi / scale + sumSquares += s * s + } + } + if math32.IsInf(scale, 1) { + return math32.Inf(1) + } + return scale * math32.Sqrt(sumSquares) +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/scal.go b/vendor/gonum.org/v1/gonum/internal/asm/f32/scal.go new file mode 100644 index 0000000000..ad2adee652 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/scal.go @@ -0,0 +1,59 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package f32 + +// ScalUnitary is +// +// for i := range x { +// x[i] *= alpha +// } +func ScalUnitary(alpha float32, x []float32) { + for i := range x { + x[i] *= alpha + } +} + +// ScalUnitaryTo is +// +// for i, v := range x { +// dst[i] = alpha * v +// } +func ScalUnitaryTo(dst []float32, alpha float32, x []float32) { + for i, v := range x { + dst[i] = alpha * v + } +} + +// ScalInc is +// +// var ix uintptr +// for i := 0; i < int(n); i++ { +// x[ix] *= alpha +// ix += incX +// } +func ScalInc(alpha float32, x []float32, n, incX uintptr) { + var ix uintptr + for i := 0; i < int(n); i++ { + x[ix] *= alpha + ix += incX + } +} + +// ScalIncTo is +// +// var idst, ix uintptr +// for i := 0; i < int(n); i++ { +// dst[idst] = alpha * x[ix] +// ix += incX +// idst += incDst +// } +func ScalIncTo(dst []float32, incDst uintptr, alpha float32, x []float32, n, incX uintptr) { + var idst, ix uintptr + for i := 0; i < int(n); i++ { + dst[idst] = alpha * x[ix] + ix += incX + idst += incDst + } +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_amd64.go b/vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_amd64.go new file mode 100644 index 0000000000..2ea0519743 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_amd64.go @@ -0,0 +1,86 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !noasm && !gccgo && !safe +// +build !noasm,!gccgo,!safe + +package f32 + +// AxpyUnitary is +// +// for i, v := range x { +// y[i] += alpha * v +// } +func AxpyUnitary(alpha float32, x, y []float32) + +// AxpyUnitaryTo is +// +// for i, v := range x { +// dst[i] = alpha*v + y[i] +// } +func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32) + +// AxpyInc is +// +// for i := 0; i < int(n); i++ { +// y[iy] += alpha * x[ix] +// ix += incX +// iy += incY +// } +func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) + +// AxpyIncTo is +// +// for i := 0; i < int(n); i++ { +// dst[idst] = alpha*x[ix] + y[iy] +// ix += incX +// iy += incY +// idst += incDst +// } +func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) + +// DdotUnitary is +// +// for i, v := range x { +// sum += float64(y[i]) * float64(v) +// } +// return +func DdotUnitary(x, y []float32) (sum float64) + +// DdotInc is +// +// for i := 0; i < int(n); i++ { +// sum += float64(y[iy]) * float64(x[ix]) +// ix += incX +// iy += incY +// } +// return +func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64) + +// DotUnitary is +// +// for i, v := range x { +// sum += y[i] * v +// } +// return sum +func DotUnitary(x, y []float32) (sum float32) + +// DotInc is +// +// for i := 0; i < int(n); i++ { +// sum += y[iy] * x[ix] +// ix += incX +// iy += incY +// } +// return sum +func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32) + +// Sum is +// +// var sum float32 +// for _, v := range x { +// sum += v +// } +// return sum +func Sum(x []float32) float32 diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_noasm.go b/vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_noasm.go new file mode 100644 index 0000000000..07b36ff34b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_noasm.go @@ -0,0 +1,137 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
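[Editor's note] stubs_amd64.go above declares Go signatures whose bodies live in the .s files, guarded by the !noasm && !gccgo && !safe build tags; stubs_noasm.go, next, carries the portable fallbacks. The Ddot variants exist because accumulating float32 products in float64 suppresses rounding drift on long vectors. A self-contained illustration with local re-implementations; dot32 and ddot are our names, not the vendored functions.

package main

import "fmt"

// dot32 accumulates in float32, as f32.DotUnitary does.
func dot32(x, y []float32) (sum float32) {
	for i, v := range x {
		sum += y[i] * v
	}
	return sum
}

// ddot accumulates in float64, as f32.DdotUnitary does.
func ddot(x, y []float32) (sum float64) {
	for i, v := range x {
		sum += float64(y[i]) * float64(v)
	}
	return sum
}

func main() {
	n := 1 << 20
	x := make([]float32, n)
	y := make([]float32, n)
	for i := range x {
		x[i], y[i] = 1e-3, 1e-3
	}
	fmt.Println(dot32(x, y))         // float32 accumulation: typically drifts
	fmt.Println(float32(ddot(x, y))) // float64 accumulation: ~1.048576
}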
+ +//go:build !amd64 || noasm || gccgo || safe +// +build !amd64 noasm gccgo safe + +package f32 + +// AxpyUnitary is +// +// for i, v := range x { +// y[i] += alpha * v +// } +func AxpyUnitary(alpha float32, x, y []float32) { + for i, v := range x { + y[i] += alpha * v + } +} + +// AxpyUnitaryTo is +// +// for i, v := range x { +// dst[i] = alpha*v + y[i] +// } +func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32) { + for i, v := range x { + dst[i] = alpha*v + y[i] + } +} + +// AxpyInc is +// +// for i := 0; i < int(n); i++ { +// y[iy] += alpha * x[ix] +// ix += incX +// iy += incY +// } +func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) { + for i := 0; i < int(n); i++ { + y[iy] += alpha * x[ix] + ix += incX + iy += incY + } +} + +// AxpyIncTo is +// +// for i := 0; i < int(n); i++ { +// dst[idst] = alpha*x[ix] + y[iy] +// ix += incX +// iy += incY +// idst += incDst +// } +func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) { + for i := 0; i < int(n); i++ { + dst[idst] = alpha*x[ix] + y[iy] + ix += incX + iy += incY + idst += incDst + } +} + +// DotUnitary is +// +// for i, v := range x { +// sum += y[i] * v +// } +// return sum +func DotUnitary(x, y []float32) (sum float32) { + for i, v := range x { + sum += y[i] * v + } + return sum +} + +// DotInc is +// +// for i := 0; i < int(n); i++ { +// sum += y[iy] * x[ix] +// ix += incX +// iy += incY +// } +// return sum +func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32) { + for i := 0; i < int(n); i++ { + sum += y[iy] * x[ix] + ix += incX + iy += incY + } + return sum +} + +// DdotUnitary is +// +// for i, v := range x { +// sum += float64(y[i]) * float64(v) +// } +// return +func DdotUnitary(x, y []float32) (sum float64) { + for i, v := range x { + sum += float64(y[i]) * float64(v) + } + return +} + +// DdotInc is +// +// for i := 0; i < int(n); i++ { +// sum += float64(y[iy]) * float64(x[ix]) +// ix += incX +// iy += incY +// } +// return +func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64) { + for i := 0; i < int(n); i++ { + sum += float64(y[iy]) * float64(x[ix]) + ix += incX + iy += incY + } + return +} + +// Sum is +// +// var sum float32 +// for _, v := range x { +// sum += v +// } +// return sum +func Sum(x []float32) float32 { + var sum float32 + for _, v := range x { + sum += v + } + return sum +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/sum_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/sum_amd64.s new file mode 100644 index 0000000000..42e96361e4 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/sum_amd64.s @@ -0,0 +1,100 @@ +// Copyright ©2021 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
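[Editor's note] The ix/iy parameters on the Inc variants above implement the BLAS negative-increment convention: with a negative increment a routine walks the vector backwards from its last logical element, which is why ge_noasm.go earlier computes kx = -(m-1)*incX. Since the increments are uintptr, a negative step is passed as its two's-complement value and index arithmetic simply wraps. A sketch; axpyInc is a local copy for illustration.

package main

import "fmt"

// axpyInc mirrors f32.AxpyInc: y[iy] += alpha * x[ix] with strides.
func axpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) {
	for i := 0; i < int(n); i++ {
		y[iy] += alpha * x[ix]
		ix += incX
		iy += incY
	}
}

func main() {
	x := []float32{1, 2, 3}
	y := []float32{0, 0, 0}
	// BLAS incX = -1: start at x's last element and step backwards.
	// uintptr arithmetic wraps, so adding ^uintptr(0) subtracts one.
	incX := ^uintptr(0) // two's-complement -1
	axpyInc(1, x, y, 3, incX, 1, 2, 0)
	fmt.Println(y) // [3 2 1]
}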
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define X_PTR SI +#define IDX AX +#define LEN CX +#define TAIL BX +#define SUM X0 +#define SUM_1 X1 +#define SUM_2 X2 +#define SUM_3 X3 + +// func Sum(x []float32) float32 +TEXT ·Sum(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), X_PTR // X_PTR = &x + MOVQ x_len+8(FP), LEN // LEN = len(x) + XORQ IDX, IDX // i = 0 + PXOR SUM, SUM // p_sum_i = 0 + CMPQ LEN, $0 // if LEN == 0 { return 0 } + JE sum_end + + PXOR SUM_1, SUM_1 + PXOR SUM_2, SUM_2 + PXOR SUM_3, SUM_3 + + MOVQ X_PTR, TAIL // Check memory alignment + ANDQ $15, TAIL // TAIL = &x % 16 + JZ no_trim // if TAIL == 0 { goto no_trim } + SUBQ $16, TAIL // TAIL -= 16 + +sum_align: // Align on 16-byte boundary do { + ADDSS (X_PTR)(IDX*4), SUM // SUM += x[0] + INCQ IDX // i++ + DECQ LEN // LEN-- + JZ sum_end // if LEN == 0 { return } + ADDQ $4, TAIL // TAIL += 4 + JNZ sum_align // } while TAIL < 0 + +no_trim: + MOVQ LEN, TAIL + SHRQ $4, LEN // LEN = floor( n / 16 ) + JZ sum_tail8 // if LEN == 0 { goto sum_tail8 } + + +sum_loop: // sum 16x wide do { + ADDPS (X_PTR)(IDX*4), SUM // sum_i += x[i:i+4] + ADDPS 16(X_PTR)(IDX*4), SUM_1 + ADDPS 32(X_PTR)(IDX*4), SUM_2 + ADDPS 48(X_PTR)(IDX*4), SUM_3 + + ADDQ $16, IDX // i += 16 + DECQ LEN + JNZ sum_loop // } while --LEN > 0 + +sum_tail8: + ADDPS SUM_3, SUM + ADDPS SUM_2, SUM_1 + + TESTQ $8, TAIL + JZ sum_tail4 + + ADDPS (X_PTR)(IDX*4), SUM // sum_i += x[i:i+4] + ADDPS 16(X_PTR)(IDX*4), SUM_1 + ADDQ $8, IDX + +sum_tail4: + ADDPS SUM_1, SUM + + TESTQ $4, TAIL + JZ sum_tail2 + + ADDPS (X_PTR)(IDX*4), SUM // sum_i += x[i:i+4] + ADDQ $4, IDX + +sum_tail2: + HADDPS SUM, SUM // sum_i[:2] += sum_i[2:4] + + TESTQ $2, TAIL + JZ sum_tail1 + + MOVSD (X_PTR)(IDX*4), SUM_1 // reuse SUM_1 + ADDPS SUM_1, SUM // sum_i += x[i:i+2] + ADDQ $2, IDX + +sum_tail1: + HADDPS SUM, SUM // sum_i[0] += sum_i[1] + + TESTQ $1, TAIL + JZ sum_end + + ADDSS (X_PTR)(IDX*4), SUM + +sum_end: // return sum + MOVSS SUM, ret+24(FP) + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/abssum_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/abssum_amd64.s new file mode 100644 index 0000000000..df63dc0905 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/abssum_amd64.s @@ -0,0 +1,82 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
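[Editor's note] The Sum kernel above keeps four independent partial sums (SUM through SUM_3) so the ADDPS dependency chains do not serialize on a single register; the partials are folded together only in the tail. The same structure in portable Go; sum4 is an illustrative helper, not the vendored fallback.

package main

import "fmt"

// sum4 accumulates into four independent partial sums, mirroring the
// SUM..SUM_3 registers in sum_amd64.s, then folds them at the end.
func sum4(x []float32) float32 {
	var s0, s1, s2, s3 float32
	i := 0
	for ; i+4 <= len(x); i += 4 {
		s0 += x[i]
		s1 += x[i+1]
		s2 += x[i+2]
		s3 += x[i+3]
	}
	for ; i < len(x); i++ { // scalar tail, as in sum_tail1
		s0 += x[i]
	}
	return (s0 + s2) + (s1 + s3)
}

func main() {
	fmt.Println(sum4([]float32{1, 2, 3, 4, 5})) // 15
}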
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func L1Norm(x []float64) float64 +TEXT ·L1Norm(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), SI // SI = &x + MOVQ x_len+8(FP), CX // CX = len(x) + XORQ AX, AX // i = 0 + PXOR X0, X0 // p_sum_i = 0 + PXOR X1, X1 + PXOR X2, X2 + PXOR X3, X3 + PXOR X4, X4 + PXOR X5, X5 + PXOR X6, X6 + PXOR X7, X7 + CMPQ CX, $0 // if CX == 0 { return 0 } + JE absum_end + MOVQ CX, BX + ANDQ $7, BX // BX = len(x) % 8 + SHRQ $3, CX // CX = floor( len(x) / 8 ) + JZ absum_tail_start // if CX == 0 { goto absum_tail_start } + +absum_loop: // do { + // p_sum += max( p_sum + x[i], p_sum - x[i] ) + MOVUPS (SI)(AX*8), X8 // X_i = x[i:i+1] + MOVUPS 16(SI)(AX*8), X9 + MOVUPS 32(SI)(AX*8), X10 + MOVUPS 48(SI)(AX*8), X11 + ADDPD X8, X0 // p_sum_i += X_i ( positive values ) + ADDPD X9, X2 + ADDPD X10, X4 + ADDPD X11, X6 + SUBPD X8, X1 // p_sum_(i+1) -= X_i ( negative values ) + SUBPD X9, X3 + SUBPD X10, X5 + SUBPD X11, X7 + MAXPD X1, X0 // p_sum_i = max( p_sum_i, p_sum_(i+1) ) + MAXPD X3, X2 + MAXPD X5, X4 + MAXPD X7, X6 + MOVAPS X0, X1 // p_sum_(i+1) = p_sum_i + MOVAPS X2, X3 + MOVAPS X4, X5 + MOVAPS X6, X7 + ADDQ $8, AX // i += 8 + LOOP absum_loop // } while --CX > 0 + + // p_sum_0 = \sum_{i=1}^{3}( p_sum_(i*2) ) + ADDPD X3, X0 + ADDPD X5, X7 + ADDPD X7, X0 + + // p_sum_0[0] = p_sum_0[0] + p_sum_0[1] + MOVAPS X0, X1 + SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 ) + ADDSD X1, X0 + CMPQ BX, $0 + JE absum_end // if BX == 0 { goto absum_end } + +absum_tail_start: // Reset loop registers + MOVQ BX, CX // Loop counter: CX = BX + XORPS X8, X8 // X_8 = 0 + +absum_tail: // do { + // p_sum += max( p_sum + x[i], p_sum - x[i] ) + MOVSD (SI)(AX*8), X8 // X_8 = x[i] + MOVSD X0, X1 // p_sum_1 = p_sum_0 + ADDSD X8, X0 // p_sum_0 += X_8 + SUBSD X8, X1 // p_sum_1 -= X_8 + MAXSD X1, X0 // p_sum_0 = max( p_sum_0, p_sum_1 ) + INCQ AX // i++ + LOOP absum_tail // } while --CX > 0 + +absum_end: // return p_sum_0 + MOVSD X0, sum+24(FP) + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/abssuminc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/abssuminc_amd64.s new file mode 100644 index 0000000000..647517333c --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/abssuminc_amd64.s @@ -0,0 +1,90 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
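[Editor's note] L1Norm above avoids an explicit absolute value: for each block it forms both p_sum + x and p_sum - x and keeps the larger with MAXPD, which equals p_sum + |x| because max(p+v, p-v) = p + max(v, -v) = p + |v|. The trick in scalar Go; l1Norm is a local sketch.

package main

import "fmt"

// l1Norm mirrors the branch-free trick in abssum_amd64.s:
// max(p+v, p-v) == p + |v|, so no explicit Abs call is needed.
func l1Norm(x []float64) float64 {
	var p float64
	for _, v := range x {
		plus, minus := p+v, p-v
		if minus > plus {
			plus = minus
		}
		p = plus
	}
	return p
}

func main() {
	fmt.Println(l1Norm([]float64{1, -2, 3, -4})) // 10
}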
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func L1NormInc(x []float64, n, incX int) (sum float64) +TEXT ·L1NormInc(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), SI // SI = &x + MOVQ n+24(FP), CX // CX = n + MOVQ incX+32(FP), AX // AX = increment * sizeof( float64 ) + SHLQ $3, AX + MOVQ AX, DX // DX = AX * 3 + IMULQ $3, DX + PXOR X0, X0 // p_sum_i = 0 + PXOR X1, X1 + PXOR X2, X2 + PXOR X3, X3 + PXOR X4, X4 + PXOR X5, X5 + PXOR X6, X6 + PXOR X7, X7 + CMPQ CX, $0 // if CX == 0 { return 0 } + JE absum_end + MOVQ CX, BX + ANDQ $7, BX // BX = n % 8 + SHRQ $3, CX // CX = floor( n / 8 ) + JZ absum_tail_start // if CX == 0 { goto absum_tail_start } + +absum_loop: // do { + // p_sum = max( p_sum + x[i], p_sum - x[i] ) + MOVSD (SI), X8 // X_i[0] = x[i] + MOVSD (SI)(AX*1), X9 + MOVSD (SI)(AX*2), X10 + MOVSD (SI)(DX*1), X11 + LEAQ (SI)(AX*4), SI // SI = SI + 4 + MOVHPD (SI), X8 // X_i[1] = x[i+4] + MOVHPD (SI)(AX*1), X9 + MOVHPD (SI)(AX*2), X10 + MOVHPD (SI)(DX*1), X11 + ADDPD X8, X0 // p_sum_i += X_i ( positive values ) + ADDPD X9, X2 + ADDPD X10, X4 + ADDPD X11, X6 + SUBPD X8, X1 // p_sum_(i+1) -= X_i ( negative values ) + SUBPD X9, X3 + SUBPD X10, X5 + SUBPD X11, X7 + MAXPD X1, X0 // p_sum_i = max( p_sum_i, p_sum_(i+1) ) + MAXPD X3, X2 + MAXPD X5, X4 + MAXPD X7, X6 + MOVAPS X0, X1 // p_sum_(i+1) = p_sum_i + MOVAPS X2, X3 + MOVAPS X4, X5 + MOVAPS X6, X7 + LEAQ (SI)(AX*4), SI // SI = SI + 4 + LOOP absum_loop // } while --CX > 0 + + // p_sum_0 = \sum_{i=1}^{3}( p_sum_(i*2) ) + ADDPD X3, X0 + ADDPD X5, X7 + ADDPD X7, X0 + + // p_sum_0[0] = p_sum_0[0] + p_sum_0[1] + MOVAPS X0, X1 + SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 ) + ADDSD X1, X0 + CMPQ BX, $0 + JE absum_end // if BX == 0 { goto absum_end } + +absum_tail_start: // Reset loop registers + MOVQ BX, CX // Loop counter: CX = BX + XORPS X8, X8 // X_8 = 0 + +absum_tail: // do { + // p_sum += max( p_sum + x[i], p_sum - x[i] ) + MOVSD (SI), X8 // X_8 = x[i] + MOVSD X0, X1 // p_sum_1 = p_sum_0 + ADDSD X8, X0 // p_sum_0 += X_8 + SUBSD X8, X1 // p_sum_1 -= X_8 + MAXSD X1, X0 // p_sum_0 = max( p_sum_0, p_sum_1 ) + ADDQ AX, SI // i++ + LOOP absum_tail // } while --CX > 0 + +absum_end: // return p_sum_0 + MOVSD X0, sum+40(FP) + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/add_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/add_amd64.s new file mode 100644 index 0000000000..e377f51256 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/add_amd64.s @@ -0,0 +1,66 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func Add(dst, s []float64) +TEXT ·Add(SB), NOSPLIT, $0 + MOVQ dst_base+0(FP), DI // DI = &dst + MOVQ dst_len+8(FP), CX // CX = len(dst) + MOVQ s_base+24(FP), SI // SI = &s + CMPQ s_len+32(FP), CX // CX = max( CX, len(s) ) + CMOVQLE s_len+32(FP), CX + CMPQ CX, $0 // if CX == 0 { return } + JE add_end + XORQ AX, AX + MOVQ DI, BX + ANDQ $0x0F, BX // BX = &dst & 15 + JZ add_no_trim // if BX == 0 { goto add_no_trim } + + // Align on 16-bit boundary + MOVSD (SI)(AX*8), X0 // X0 = s[i] + ADDSD (DI)(AX*8), X0 // X0 += dst[i] + MOVSD X0, (DI)(AX*8) // dst[i] = X0 + INCQ AX // i++ + DECQ CX // --CX + JE add_end // if CX == 0 { return } + +add_no_trim: + MOVQ CX, BX + ANDQ $7, BX // BX = len(dst) % 8 + SHRQ $3, CX // CX = floor( len(dst) / 8 ) + JZ add_tail_start // if CX == 0 { goto add_tail_start } + +add_loop: // Loop unrolled 8x do { + MOVUPS (SI)(AX*8), X0 // X_i = s[i:i+1] + MOVUPS 16(SI)(AX*8), X1 + MOVUPS 32(SI)(AX*8), X2 + MOVUPS 48(SI)(AX*8), X3 + ADDPD (DI)(AX*8), X0 // X_i += dst[i:i+1] + ADDPD 16(DI)(AX*8), X1 + ADDPD 32(DI)(AX*8), X2 + ADDPD 48(DI)(AX*8), X3 + MOVUPS X0, (DI)(AX*8) // dst[i:i+1] = X_i + MOVUPS X1, 16(DI)(AX*8) + MOVUPS X2, 32(DI)(AX*8) + MOVUPS X3, 48(DI)(AX*8) + ADDQ $8, AX // i += 8 + LOOP add_loop // } while --CX > 0 + CMPQ BX, $0 // if BX == 0 { return } + JE add_end + +add_tail_start: // Reset loop registers + MOVQ BX, CX // Loop counter: CX = BX + +add_tail: // do { + MOVSD (SI)(AX*8), X0 // X0 = s[i] + ADDSD (DI)(AX*8), X0 // X0 += dst[i] + MOVSD X0, (DI)(AX*8) // dst[i] = X0 + INCQ AX // ++i + LOOP add_tail // } while --CX > 0 + +add_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/addconst_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/addconst_amd64.s new file mode 100644 index 0000000000..6f52a8f64f --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/addconst_amd64.s @@ -0,0 +1,53 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
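[Editor's note] Add above clamps the iteration count to the shorter slice (the CMPQ/CMOVQLE pair), peels at most one scalar iteration to reach 16-byte alignment of dst, then runs an unrolled body with a scalar tail. The same control shape in Go, minus the alignment peel, which portable code cannot exploit; add is a local sketch.

package main

import "fmt"

// add mirrors f64.Add's semantics: dst[i] += s[i] over the common
// prefix, with an unrolled body and a scalar tail as in add_amd64.s.
func add(dst, s []float64) {
	n := len(dst)
	if len(s) < n {
		n = len(s) // CMOVQLE: iterate over min(len(dst), len(s))
	}
	i := 0
	for ; i+4 <= n; i += 4 { // unrolled main loop
		dst[i] += s[i]
		dst[i+1] += s[i+1]
		dst[i+2] += s[i+2]
		dst[i+3] += s[i+3]
	}
	for ; i < n; i++ { // tail
		dst[i] += s[i]
	}
}

func main() {
	d := []float64{1, 2, 3, 4, 5}
	add(d, []float64{10, 10, 10})
	fmt.Println(d) // [11 12 13 4 5]
}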
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func Addconst(alpha float64, x []float64) +TEXT ·AddConst(SB), NOSPLIT, $0 + MOVQ x_base+8(FP), SI // SI = &x + MOVQ x_len+16(FP), CX // CX = len(x) + CMPQ CX, $0 // if len(x) == 0 { return } + JE ac_end + MOVSD alpha+0(FP), X4 // X4 = { a, a } + SHUFPD $0, X4, X4 + MOVUPS X4, X5 // X5 = X4 + XORQ AX, AX // i = 0 + MOVQ CX, BX + ANDQ $7, BX // BX = len(x) % 8 + SHRQ $3, CX // CX = floor( len(x) / 8 ) + JZ ac_tail_start // if CX == 0 { goto ac_tail_start } + +ac_loop: // Loop unrolled 8x do { + MOVUPS (SI)(AX*8), X0 // X_i = s[i:i+1] + MOVUPS 16(SI)(AX*8), X1 + MOVUPS 32(SI)(AX*8), X2 + MOVUPS 48(SI)(AX*8), X3 + ADDPD X4, X0 // X_i += a + ADDPD X5, X1 + ADDPD X4, X2 + ADDPD X5, X3 + MOVUPS X0, (SI)(AX*8) // s[i:i+1] = X_i + MOVUPS X1, 16(SI)(AX*8) + MOVUPS X2, 32(SI)(AX*8) + MOVUPS X3, 48(SI)(AX*8) + ADDQ $8, AX // i += 8 + LOOP ac_loop // } while --CX > 0 + CMPQ BX, $0 // if BX == 0 { return } + JE ac_end + +ac_tail_start: // Reset loop counters + MOVQ BX, CX // Loop counter: CX = BX + +ac_tail: // do { + MOVSD (SI)(AX*8), X0 // X0 = s[i] + ADDSD X4, X0 // X0 += a + MOVSD X0, (SI)(AX*8) // s[i] = X0 + INCQ AX // ++i + LOOP ac_tail // } while --CX > 0 + +ac_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/axpy.go b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpy.go new file mode 100644 index 0000000000..2ab8129a54 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpy.go @@ -0,0 +1,62 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 || noasm || gccgo || safe +// +build !amd64 noasm gccgo safe + +package f64 + +// AxpyUnitary is +// +// for i, v := range x { +// y[i] += alpha * v +// } +func AxpyUnitary(alpha float64, x, y []float64) { + for i, v := range x { + y[i] += alpha * v + } +} + +// AxpyUnitaryTo is +// +// for i, v := range x { +// dst[i] = alpha*v + y[i] +// } +func AxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) { + for i, v := range x { + dst[i] = alpha*v + y[i] + } +} + +// AxpyInc is +// +// for i := 0; i < int(n); i++ { +// y[iy] += alpha * x[ix] +// ix += incX +// iy += incY +// } +func AxpyInc(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) { + for i := 0; i < int(n); i++ { + y[iy] += alpha * x[ix] + ix += incX + iy += incY + } +} + +// AxpyIncTo is +// +// for i := 0; i < int(n); i++ { +// dst[idst] = alpha*x[ix] + y[iy] +// ix += incX +// iy += incY +// idst += incDst +// } +func AxpyIncTo(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) { + for i := 0; i < int(n); i++ { + dst[idst] = alpha*x[ix] + y[iy] + ix += incX + iy += incY + idst += incDst + } +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyinc_amd64.s new file mode 100644 index 0000000000..a4e180fbfa --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyinc_amd64.s @@ -0,0 +1,142 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define X_PTR SI +#define Y_PTR DI +#define DST_PTR DI +#define IDX AX +#define LEN CX +#define TAIL BX +#define INC_X R8 +#define INCx3_X R11 +#define INC_Y R9 +#define INCx3_Y R12 +#define INC_DST R9 +#define INCx3_DST R12 +#define ALPHA X0 +#define ALPHA_2 X1 + +// func AxpyInc(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) +TEXT ·AxpyInc(SB), NOSPLIT, $0 + MOVQ x_base+8(FP), X_PTR // X_PTR = &x + MOVQ y_base+32(FP), Y_PTR // Y_PTR = &y + MOVQ n+56(FP), LEN // LEN = n + CMPQ LEN, $0 // if LEN == 0 { return } + JE end + + MOVQ ix+80(FP), INC_X + MOVQ iy+88(FP), INC_Y + LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(x[ix]) + LEAQ (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(y[iy]) + MOVQ Y_PTR, DST_PTR // DST_PTR = Y_PTR // Write pointer + + MOVQ incX+64(FP), INC_X // INC_X = incX * sizeof(float64) + SHLQ $3, INC_X + MOVQ incY+72(FP), INC_Y // INC_Y = incY * sizeof(float64) + SHLQ $3, INC_Y + + MOVSD alpha+0(FP), ALPHA // ALPHA = alpha + MOVQ LEN, TAIL + ANDQ $3, TAIL // TAIL = n % 4 + SHRQ $2, LEN // LEN = floor( n / 4 ) + JZ tail_start // if LEN == 0 { goto tail_start } + + MOVAPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining + LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 + LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3 + +loop: // do { // y[i] += alpha * x[i] unrolled 4x. 
+ MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MOVSD (X_PTR)(INC_X*2), X4 + MOVSD (X_PTR)(INCx3_X*1), X5 + + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA_2, X3 + MULSD ALPHA, X4 + MULSD ALPHA_2, X5 + + ADDSD (Y_PTR), X2 // X_i += y[i] + ADDSD (Y_PTR)(INC_Y*1), X3 + ADDSD (Y_PTR)(INC_Y*2), X4 + ADDSD (Y_PTR)(INCx3_Y*1), X5 + + MOVSD X2, (DST_PTR) // y[i] = X_i + MOVSD X3, (DST_PTR)(INC_DST*1) + MOVSD X4, (DST_PTR)(INC_DST*2) + MOVSD X5, (DST_PTR)(INCx3_DST*1) + + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4]) + DECQ LEN + JNZ loop // } while --LEN > 0 + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE end + +tail_start: // Reset Loop registers + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( LEN / 2 ) + JZ tail_one + +tail_two: + MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA, X3 + ADDSD (Y_PTR), X2 // X_i += y[i] + ADDSD (Y_PTR)(INC_Y*1), X3 + MOVSD X2, (DST_PTR) // y[i] = X_i + MOVSD X3, (DST_PTR)(INC_DST*1) + + LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2]) + LEAQ (Y_PTR)(INC_Y*2), Y_PTR // Y_PTR = &(Y_PTR[incY*2]) + + ANDQ $1, TAIL + JZ end // if TAIL == 0 { goto end } + +tail_one: + // y[i] += alpha * x[i] for the last n % 4 iterations. + MOVSD (X_PTR), X2 // X2 = x[i] + MULSD ALPHA, X2 // X2 *= a + ADDSD (Y_PTR), X2 // X2 += y[i] + MOVSD X2, (DST_PTR) // y[i] = X2 + +end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyincto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyincto_amd64.s new file mode 100644 index 0000000000..0f54a39400 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyincto_amd64.s @@ -0,0 +1,148 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define X_PTR SI +#define Y_PTR DI +#define DST_PTR DX +#define IDX AX +#define LEN CX +#define TAIL BX +#define INC_X R8 +#define INCx3_X R11 +#define INC_Y R9 +#define INCx3_Y R12 +#define INC_DST R10 +#define INCx3_DST R13 +#define ALPHA X0 +#define ALPHA_2 X1 + +// func AxpyIncTo(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) +TEXT ·AxpyIncTo(SB), NOSPLIT, $0 + MOVQ dst_base+0(FP), DST_PTR // DST_PTR := &dst + MOVQ x_base+48(FP), X_PTR // X_PTR := &x + MOVQ y_base+72(FP), Y_PTR // Y_PTR := &y + MOVQ n+96(FP), LEN // LEN := n + CMPQ LEN, $0 // if LEN == 0 { return } + JE end + + MOVQ ix+120(FP), INC_X + LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(x[ix]) + MOVQ iy+128(FP), INC_Y + LEAQ (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(dst[idst]) + MOVQ idst+32(FP), INC_DST + LEAQ (DST_PTR)(INC_DST*8), DST_PTR // DST_PTR = &(y[iy]) + + MOVQ incX+104(FP), INC_X // INC_X = incX * sizeof(float64) + SHLQ $3, INC_X + MOVQ incY+112(FP), INC_Y // INC_Y = incY * sizeof(float64) + SHLQ $3, INC_Y + MOVQ incDst+24(FP), INC_DST // INC_DST = incDst * sizeof(float64) + SHLQ $3, INC_DST + MOVSD alpha+40(FP), ALPHA + + MOVQ LEN, TAIL + ANDQ $3, TAIL // TAIL = n % 4 + SHRQ $2, LEN // LEN = floor( n / 4 ) + JZ tail_start // if LEN == 0 { goto tail_start } + + MOVSD ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining + LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 + LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3 + LEAQ (INC_DST)(INC_DST*2), INCx3_DST // INCx3_DST = INC_DST * 3 + +loop: // do { // y[i] += alpha * x[i] unrolled 2x. 
+ MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MOVSD (X_PTR)(INC_X*2), X4 + MOVSD (X_PTR)(INCx3_X*1), X5 + + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA_2, X3 + MULSD ALPHA, X4 + MULSD ALPHA_2, X5 + + ADDSD (Y_PTR), X2 // X_i += y[i] + ADDSD (Y_PTR)(INC_Y*1), X3 + ADDSD (Y_PTR)(INC_Y*2), X4 + ADDSD (Y_PTR)(INCx3_Y*1), X5 + + MOVSD X2, (DST_PTR) // y[i] = X_i + MOVSD X3, (DST_PTR)(INC_DST*1) + MOVSD X4, (DST_PTR)(INC_DST*2) + MOVSD X5, (DST_PTR)(INCx3_DST*1) + + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4]) + LEAQ (DST_PTR)(INC_DST*4), DST_PTR // DST_PTR = &(DST_PTR[incDst*4] + DECQ LEN + JNZ loop // } while --LEN > 0 + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE end + +tail_start: // Reset Loop registers + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( LEN / 2 ) + JZ tail_one + +tail_two: + MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA, X3 + ADDSD (Y_PTR), X2 // X_i += y[i] + ADDSD (Y_PTR)(INC_Y*1), X3 + MOVSD X2, (DST_PTR) // y[i] = X_i + MOVSD X3, (DST_PTR)(INC_DST*1) + + LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2]) + LEAQ (Y_PTR)(INC_Y*2), Y_PTR // Y_PTR = &(Y_PTR[incY*2]) + LEAQ (DST_PTR)(INC_DST*2), DST_PTR // DST_PTR = &(DST_PTR[incY*2] + + ANDQ $1, TAIL + JZ end // if TAIL == 0 { goto end } + +tail_one: + MOVSD (X_PTR), X2 // X2 = x[i] + MULSD ALPHA, X2 // X2 *= a + ADDSD (Y_PTR), X2 // X2 += y[i] + MOVSD X2, (DST_PTR) // y[i] = X2 + +end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyunitary_amd64.s new file mode 100644 index 0000000000..f0b78596b6 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyunitary_amd64.s @@ -0,0 +1,134 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define X_PTR SI +#define Y_PTR DI +#define DST_PTR DI +#define IDX AX +#define LEN CX +#define TAIL BX +#define ALPHA X0 +#define ALPHA_2 X1 + +// func AxpyUnitary(alpha float64, x, y []float64) +TEXT ·AxpyUnitary(SB), NOSPLIT, $0 + MOVQ x_base+8(FP), X_PTR // X_PTR := &x + MOVQ y_base+32(FP), Y_PTR // Y_PTR := &y + MOVQ x_len+16(FP), LEN // LEN = min( len(x), len(y) ) + CMPQ y_len+40(FP), LEN + CMOVQLE y_len+40(FP), LEN + CMPQ LEN, $0 // if LEN == 0 { return } + JE end + XORQ IDX, IDX + MOVSD alpha+0(FP), ALPHA // ALPHA := { alpha, alpha } + SHUFPD $0, ALPHA, ALPHA + MOVUPS ALPHA, ALPHA_2 // ALPHA_2 := ALPHA for pipelining + MOVQ Y_PTR, TAIL // Check memory alignment + ANDQ $15, TAIL // TAIL = &y % 16 + JZ no_trim // if TAIL == 0 { goto no_trim } + + // Align on 16-byte boundary + MOVSD (X_PTR), X2 // X2 := x[0] + MULSD ALPHA, X2 // X2 *= a + ADDSD (Y_PTR), X2 // X2 += y[0] + MOVSD X2, (DST_PTR) // y[0] = X2 + INCQ IDX // i++ + DECQ LEN // LEN-- + JZ end // if LEN == 0 { return } + +no_trim: + MOVQ LEN, TAIL + ANDQ $7, TAIL // TAIL := n % 8 + SHRQ $3, LEN // LEN = floor( n / 8 ) + JZ tail_start // if LEN == 0 { goto tail2_start } + +loop: // do { + // y[i] += alpha * x[i] unrolled 8x. + MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] + MOVUPS 16(X_PTR)(IDX*8), X3 + MOVUPS 32(X_PTR)(IDX*8), X4 + MOVUPS 48(X_PTR)(IDX*8), X5 + + MULPD ALPHA, X2 // X_i *= a + MULPD ALPHA_2, X3 + MULPD ALPHA, X4 + MULPD ALPHA_2, X5 + + ADDPD (Y_PTR)(IDX*8), X2 // X_i += y[i] + ADDPD 16(Y_PTR)(IDX*8), X3 + ADDPD 32(Y_PTR)(IDX*8), X4 + ADDPD 48(Y_PTR)(IDX*8), X5 + + MOVUPS X2, (DST_PTR)(IDX*8) // y[i] = X_i + MOVUPS X3, 16(DST_PTR)(IDX*8) + MOVUPS X4, 32(DST_PTR)(IDX*8) + MOVUPS X5, 48(DST_PTR)(IDX*8) + + ADDQ $8, IDX // i += 8 + DECQ LEN + JNZ loop // } while --LEN > 0 + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE end + +tail_start: // Reset loop registers + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( TAIL / 2 ) + JZ tail_one // if TAIL == 0 { goto tail } + +tail_two: // do { + MOVUPS (X_PTR)(IDX*8), X2 // X2 = x[i] + MULPD ALPHA, X2 // X2 *= a + ADDPD (Y_PTR)(IDX*8), X2 // X2 += y[i] + MOVUPS X2, (DST_PTR)(IDX*8) // y[i] = X2 + ADDQ $2, IDX // i += 2 + DECQ LEN + JNZ tail_two // } while --LEN > 0 + + ANDQ $1, TAIL + JZ end // if TAIL == 0 { goto end } + +tail_one: + MOVSD (X_PTR)(IDX*8), X2 // X2 = x[i] + MULSD ALPHA, X2 // X2 *= a + ADDSD (Y_PTR)(IDX*8), X2 // X2 += y[i] + MOVSD X2, (DST_PTR)(IDX*8) // y[i] = X2 + +end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyunitaryto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyunitaryto_amd64.s new file mode 100644 index 0000000000..dbb0a7eaba --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyunitaryto_amd64.s @@ -0,0 +1,140 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
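[Editor's note] AxpyUnitary above is the classic BLAS axpy, y <- alpha*x + y, applied over the common prefix of x and y (note the CMOVQLE length clamp at entry). Its scalar meaning, as a local sketch; axpyUnitary is our name.

package main

import "fmt"

// axpyUnitary mirrors f64.AxpyUnitary: y[i] += alpha * x[i] over
// min(len(x), len(y)) elements, as the length clamp enforces.
func axpyUnitary(alpha float64, x, y []float64) {
	n := len(x)
	if len(y) < n {
		n = len(y)
	}
	for i := 0; i < n; i++ {
		y[i] += alpha * x[i]
	}
}

func main() {
	x := []float64{1, 2, 3}
	y := []float64{10, 20, 30}
	axpyUnitary(0.5, x, y)
	fmt.Println(y) // [10.5 21 31.5]
}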
+// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define X_PTR SI +#define Y_PTR DX +#define DST_PTR DI +#define IDX AX +#define LEN CX +#define TAIL BX +#define ALPHA X0 +#define ALPHA_2 X1 + +// func AxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) +TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0 + MOVQ dst_base+0(FP), DST_PTR // DST_PTR := &dst + MOVQ x_base+32(FP), X_PTR // X_PTR := &x + MOVQ y_base+56(FP), Y_PTR // Y_PTR := &y + MOVQ x_len+40(FP), LEN // LEN = min( len(x), len(y), len(dst) ) + CMPQ y_len+64(FP), LEN + CMOVQLE y_len+64(FP), LEN + CMPQ dst_len+8(FP), LEN + CMOVQLE dst_len+8(FP), LEN + + CMPQ LEN, $0 + JE end // if LEN == 0 { return } + + XORQ IDX, IDX // IDX = 0 + MOVSD alpha+24(FP), ALPHA + SHUFPD $0, ALPHA, ALPHA // ALPHA := { alpha, alpha } + MOVQ Y_PTR, TAIL // Check memory alignment + ANDQ $15, TAIL // TAIL = &y % 16 + JZ no_trim // if TAIL == 0 { goto no_trim } + + // Align on 16-byte boundary + MOVSD (X_PTR), X2 // X2 := x[0] + MULSD ALPHA, X2 // X2 *= a + ADDSD (Y_PTR), X2 // X2 += y[0] + MOVSD X2, (DST_PTR) // y[0] = X2 + INCQ IDX // i++ + DECQ LEN // LEN-- + JZ end // if LEN == 0 { return } + +no_trim: + MOVQ LEN, TAIL + ANDQ $7, TAIL // TAIL := n % 8 + SHRQ $3, LEN // LEN = floor( n / 8 ) + JZ tail_start // if LEN == 0 { goto tail_start } + + MOVUPS ALPHA, ALPHA_2 // ALPHA_2 := ALPHA for pipelining + +loop: // do { + // y[i] += alpha * x[i] unrolled 8x. 
+ MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] + MOVUPS 16(X_PTR)(IDX*8), X3 + MOVUPS 32(X_PTR)(IDX*8), X4 + MOVUPS 48(X_PTR)(IDX*8), X5 + + MULPD ALPHA, X2 // X_i *= alpha + MULPD ALPHA_2, X3 + MULPD ALPHA, X4 + MULPD ALPHA_2, X5 + + ADDPD (Y_PTR)(IDX*8), X2 // X_i += y[i] + ADDPD 16(Y_PTR)(IDX*8), X3 + ADDPD 32(Y_PTR)(IDX*8), X4 + ADDPD 48(Y_PTR)(IDX*8), X5 + + MOVUPS X2, (DST_PTR)(IDX*8) // y[i] = X_i + MOVUPS X3, 16(DST_PTR)(IDX*8) + MOVUPS X4, 32(DST_PTR)(IDX*8) + MOVUPS X5, 48(DST_PTR)(IDX*8) + + ADDQ $8, IDX // i += 8 + DECQ LEN + JNZ loop // } while --LEN > 0 + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE end + +tail_start: // Reset loop registers + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( TAIL / 2 ) + JZ tail_one // if LEN == 0 { goto tail } + +tail_two: // do { + MOVUPS (X_PTR)(IDX*8), X2 // X2 = x[i] + MULPD ALPHA, X2 // X2 *= alpha + ADDPD (Y_PTR)(IDX*8), X2 // X2 += y[i] + MOVUPS X2, (DST_PTR)(IDX*8) // y[i] = X2 + ADDQ $2, IDX // i += 2 + DECQ LEN + JNZ tail_two // } while --LEN > 0 + + ANDQ $1, TAIL + JZ end // if TAIL == 0 { goto end } + +tail_one: + MOVSD (X_PTR)(IDX*8), X2 // X2 = x[i] + MULSD ALPHA, X2 // X2 *= a + ADDSD (Y_PTR)(IDX*8), X2 // X2 += y[i] + MOVSD X2, (DST_PTR)(IDX*8) // y[i] = X2 + +end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/cumprod_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/cumprod_amd64.s new file mode 100644 index 0000000000..58168482d8 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/cumprod_amd64.s @@ -0,0 +1,71 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +TEXT ·CumProd(SB), NOSPLIT, $0 + MOVQ dst_base+0(FP), DI // DI = &dst + MOVQ dst_len+8(FP), CX // CX = len(dst) + MOVQ s_base+24(FP), SI // SI = &s + CMPQ s_len+32(FP), CX // CX = max( CX, len(s) ) + CMOVQLE s_len+32(FP), CX + MOVQ CX, ret_len+56(FP) // len(ret) = CX + CMPQ CX, $0 // if CX == 0 { return } + JE cp_end + XORQ AX, AX // i = 0 + + MOVSD (SI), X5 // p_prod = { s[0], s[0] } + SHUFPD $0, X5, X5 + MOVSD X5, (DI) // dst[0] = s[0] + INCQ AX // ++i + DECQ CX // -- CX + JZ cp_end // if CX == 0 { return } + + MOVQ CX, BX + ANDQ $3, BX // BX = CX % 4 + SHRQ $2, CX // CX = floor( CX / 4 ) + JZ cp_tail_start // if CX == 0 { goto cp_tail_start } + +cp_loop: // Loop unrolled 4x do { + MOVUPS (SI)(AX*8), X0 // X0 = s[i:i+1] + MOVUPS 16(SI)(AX*8), X2 + MOVAPS X0, X1 // X1 = X0 + MOVAPS X2, X3 + SHUFPD $1, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[0] } + SHUFPD $1, X3, X3 + MULPD X0, X1 // X1 *= X0 + MULPD X2, X3 + SHUFPD $2, X1, X0 // { X0[0], X0[1] } = { X0[0], X1[1] } + SHUFPD $3, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[1] } + SHUFPD $2, X3, X2 + SHUFPD $3, X3, X3 + MULPD X5, X0 // X0 *= p_prod + MULPD X1, X5 // p_prod *= X1 + MULPD X5, X2 + MOVUPS X0, (DI)(AX*8) // dst[i] = X0 + MOVUPS X2, 16(DI)(AX*8) + MULPD X3, X5 + ADDQ $4, AX // i += 4 + LOOP cp_loop // } while --CX > 0 + + // if BX == 0 { return } + CMPQ BX, $0 + JE cp_end + +cp_tail_start: // Reset loop registers + MOVQ BX, CX // Loop counter: CX = BX + +cp_tail: // do { + MULSD (SI)(AX*8), X5 // p_prod *= s[i] + MOVSD X5, (DI)(AX*8) // dst[i] = p_prod + INCQ AX // ++i + LOOP cp_tail // } while --CX > 0 + +cp_end: + MOVQ DI, ret_base+48(FP) // &ret = &dst + MOVQ dst_cap+16(FP), SI // cap(ret) = cap(dst) + MOVQ SI, ret_cap+64(FP) + RET diff --git 
a/vendor/gonum.org/v1/gonum/internal/asm/f64/cumsum_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/cumsum_amd64.s
new file mode 100644
index 0000000000..85613202a4
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/cumsum_amd64.s
@@ -0,0 +1,64 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+TEXT ·CumSum(SB), NOSPLIT, $0
+ MOVQ dst_base+0(FP), DI // DI = &dst
+ MOVQ dst_len+8(FP), CX // CX = len(dst)
+ MOVQ s_base+24(FP), SI // SI = &s
+ CMPQ s_len+32(FP), CX // CX = max( CX, len(s) )
+ CMOVQLE s_len+32(FP), CX
+ MOVQ CX, ret_len+56(FP) // len(ret) = CX
+ CMPQ CX, $0 // if CX == 0 { return }
+ JE cs_end
+ XORQ AX, AX // i = 0
+ PXOR X5, X5 // p_sum = 0
+ MOVQ CX, BX
+ ANDQ $3, BX // BX = CX % 4
+ SHRQ $2, CX // CX = floor( CX / 4 )
+ JZ cs_tail_start // if CX == 0 { goto cs_tail_start }
+
+cs_loop: // Loop unrolled 4x do {
+ MOVUPS (SI)(AX*8), X0 // X0 = s[i:i+1]
+ MOVUPS 16(SI)(AX*8), X2
+ MOVAPS X0, X1 // X1 = X0
+ MOVAPS X2, X3
+ SHUFPD $1, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[0] }
+ SHUFPD $1, X3, X3
+ ADDPD X0, X1 // X1 += X0
+ ADDPD X2, X3
+ SHUFPD $2, X1, X0 // { X0[0], X0[1] } = { X0[0], X1[1] }
+ SHUFPD $3, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[1] }
+ SHUFPD $2, X3, X2
+ SHUFPD $3, X3, X3
+ ADDPD X5, X0 // X0 += p_sum
+ ADDPD X1, X5 // p_sum += X1
+ ADDPD X5, X2
+ MOVUPS X0, (DI)(AX*8) // dst[i] = X0
+ MOVUPS X2, 16(DI)(AX*8)
+ ADDPD X3, X5
+ ADDQ $4, AX // i += 4
+ LOOP cs_loop // } while --CX > 0
+
+ // if BX == 0 { return }
+ CMPQ BX, $0
+ JE cs_end
+
+cs_tail_start: // Reset loop registers
+ MOVQ BX, CX // Loop counter: CX = BX
+
+cs_tail: // do {
+ ADDSD (SI)(AX*8), X5 // p_sum += s[i]
+ MOVSD X5, (DI)(AX*8) // dst[i] = p_sum
+ INCQ AX // ++i
+ LOOP cs_tail // } while --CX > 0
+
+cs_end:
+ MOVQ DI, ret_base+48(FP) // &ret = &dst
+ MOVQ dst_cap+16(FP), SI // cap(ret) = cap(dst)
+ MOVQ SI, ret_cap+64(FP)
+ RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/div_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/div_amd64.s
new file mode 100644
index 0000000000..9583976748
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/div_amd64.s
@@ -0,0 +1,67 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
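+//
+// For review: the kernel below is the hand-unrolled SSE form of the
+// reference loop
+//
+//	for i, v := range s {
+//		dst[i] /= v
+//	}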
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// func Div(dst, s []float64)
+TEXT ·Div(SB), NOSPLIT, $0
+ MOVQ dst_base+0(FP), DI // DI = &dst
+ MOVQ dst_len+8(FP), CX // CX = len(dst)
+ MOVQ s_base+24(FP), SI // SI = &s
+ CMPQ s_len+32(FP), CX // CX = max( CX, len(s) )
+ CMOVQLE s_len+32(FP), CX
+ CMPQ CX, $0 // if CX == 0 { return }
+ JE div_end
+ XORQ AX, AX // i = 0
+ MOVQ SI, BX
+ ANDQ $15, BX // BX = &s & 15
+ JZ div_no_trim // if BX == 0 { goto div_no_trim }
+
+ // Align on 16-byte boundary
+ MOVSD (DI)(AX*8), X0 // X0 = dst[i]
+ DIVSD (SI)(AX*8), X0 // X0 /= s[i]
+ MOVSD X0, (DI)(AX*8) // dst[i] = X0
+ INCQ AX // ++i
+ DECQ CX // --CX
+ JZ div_end // if CX == 0 { return }
+
+div_no_trim:
+ MOVQ CX, BX
+ ANDQ $7, BX // BX = len(dst) % 8
+ SHRQ $3, CX // CX = floor( len(dst) / 8 )
+ JZ div_tail_start // if CX == 0 { goto div_tail_start }
+
+div_loop: // Loop unrolled 8x do {
+ MOVUPS (DI)(AX*8), X0 // X0 = dst[i:i+1]
+ MOVUPS 16(DI)(AX*8), X1
+ MOVUPS 32(DI)(AX*8), X2
+ MOVUPS 48(DI)(AX*8), X3
+ DIVPD (SI)(AX*8), X0 // X0 /= s[i:i+1]
+ DIVPD 16(SI)(AX*8), X1
+ DIVPD 32(SI)(AX*8), X2
+ DIVPD 48(SI)(AX*8), X3
+ MOVUPS X0, (DI)(AX*8) // dst[i] = X0
+ MOVUPS X1, 16(DI)(AX*8)
+ MOVUPS X2, 32(DI)(AX*8)
+ MOVUPS X3, 48(DI)(AX*8)
+ ADDQ $8, AX // i += 8
+ LOOP div_loop // } while --CX > 0
+ CMPQ BX, $0 // if BX == 0 { return }
+ JE div_end
+
+div_tail_start: // Reset loop registers
+ MOVQ BX, CX // Loop counter: CX = BX
+
+div_tail: // do {
+ MOVSD (DI)(AX*8), X0 // X0 = dst[i]
+ DIVSD (SI)(AX*8), X0 // X0 /= s[i]
+ MOVSD X0, (DI)(AX*8) // dst[i] = X0
+ INCQ AX // ++i
+ LOOP div_tail // } while --CX > 0
+
+div_end:
+ RET
+
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/divto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/divto_amd64.s
new file mode 100644
index 0000000000..e7094cb95b
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/divto_amd64.s
@@ -0,0 +1,73 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
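+//
+// For review: the kernel below is the hand-unrolled SSE form of the
+// reference loop
+//
+//	for i, v := range x {
+//		dst[i] = v / y[i]
+//	}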
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// func DivTo(dst, x, y []float64)
+TEXT ·DivTo(SB), NOSPLIT, $0
+ MOVQ dst_base+0(FP), DI // DI = &dst
+ MOVQ dst_len+8(FP), CX // CX = len(dst)
+ MOVQ x_base+24(FP), SI // SI = &x
+ MOVQ y_base+48(FP), DX // DX = &y
+ CMPQ x_len+32(FP), CX // CX = max( len(dst), len(x), len(y) )
+ CMOVQLE x_len+32(FP), CX
+ CMPQ y_len+56(FP), CX
+ CMOVQLE y_len+56(FP), CX
+ MOVQ CX, ret_len+80(FP) // len(ret) = CX
+ CMPQ CX, $0 // if CX == 0 { return }
+ JE div_end
+ XORQ AX, AX // i = 0
+ MOVQ DX, BX
+ ANDQ $15, BX // BX = &y & 0xF
+ JZ div_no_trim // if BX == 0 { goto div_no_trim }
+
+ // Align on 16-byte boundary
+ MOVSD (SI)(AX*8), X0 // X0 = x[i]
+ DIVSD (DX)(AX*8), X0 // X0 /= y[i]
+ MOVSD X0, (DI)(AX*8) // dst[i] = X0
+ INCQ AX // ++i
+ DECQ CX // --CX
+ JZ div_end // if CX == 0 { return }
+
+div_no_trim:
+ MOVQ CX, BX
+ ANDQ $7, BX // BX = len(dst) % 8
+ SHRQ $3, CX // CX = floor( len(dst) / 8 )
+ JZ div_tail_start // if CX == 0 { goto div_tail_start }
+
+div_loop: // Loop unrolled 8x do {
+ MOVUPS (SI)(AX*8), X0 // X0 = x[i:i+1]
+ MOVUPS 16(SI)(AX*8), X1
+ MOVUPS 32(SI)(AX*8), X2
+ MOVUPS 48(SI)(AX*8), X3
+ DIVPD (DX)(AX*8), X0 // X0 /= y[i:i+1]
+ DIVPD 16(DX)(AX*8), X1
+ DIVPD 32(DX)(AX*8), X2
+ DIVPD 48(DX)(AX*8), X3
+ MOVUPS X0, (DI)(AX*8) // dst[i:i+1] = X0
+ MOVUPS X1, 16(DI)(AX*8)
+ MOVUPS X2, 32(DI)(AX*8)
+ MOVUPS X3, 48(DI)(AX*8)
+ ADDQ $8, AX // i += 8
+ LOOP div_loop // } while --CX > 0
+ CMPQ BX, $0 // if BX == 0 { return }
+ JE div_end
+
+div_tail_start: // Reset loop registers
+ MOVQ BX, CX // Loop counter: CX = BX
+
+div_tail: // do {
+ MOVSD (SI)(AX*8), X0 // X0 = x[i]
+ DIVSD (DX)(AX*8), X0 // X0 /= y[i]
+ MOVSD X0, (DI)(AX*8)
+ INCQ AX // ++i
+ LOOP div_tail // } while --CX > 0
+
+div_end:
+ MOVQ DI, ret_base+72(FP) // &ret = &dst
+ MOVQ dst_cap+16(FP), DI // cap(ret) = cap(dst)
+ MOVQ DI, ret_cap+88(FP)
+ RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/doc.go b/vendor/gonum.org/v1/gonum/internal/asm/f64/doc.go
new file mode 100644
index 0000000000..33c76c1e03
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/doc.go
@@ -0,0 +1,6 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package f64 provides float64 vector primitives.
+package f64 // import "gonum.org/v1/gonum/internal/asm/f64"
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/dot.go b/vendor/gonum.org/v1/gonum/internal/asm/f64/dot.go
new file mode 100644
index 0000000000..099316440e
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/dot.go
@@ -0,0 +1,38 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
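+//
+// This file is the pure-Go fallback selected when the amd64 assembly
+// in dot_amd64.s is excluded by the build tags below.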
+ +//go:build !amd64 || noasm || gccgo || safe +// +build !amd64 noasm gccgo safe + +package f64 + +// DotUnitary is +// +// for i, v := range x { +// sum += y[i] * v +// } +// return sum +func DotUnitary(x, y []float64) (sum float64) { + for i, v := range x { + sum += y[i] * v + } + return sum +} + +// DotInc is +// +// for i := 0; i < int(n); i++ { +// sum += y[iy] * x[ix] +// ix += incX +// iy += incY +// } +// return sum +func DotInc(x, y []float64, n, incX, incY, ix, iy uintptr) (sum float64) { + for i := 0; i < int(n); i++ { + sum += y[iy] * x[ix] + ix += incX + iy += incY + } + return sum +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/dot_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/dot_amd64.s new file mode 100644 index 0000000000..c8cd412962 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/dot_amd64.s @@ -0,0 +1,145 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func DdotUnitary(x, y []float64) (sum float64) +// This function assumes len(y) >= len(x). +TEXT ·DotUnitary(SB), NOSPLIT, $0 + MOVQ x+0(FP), R8 + MOVQ x_len+8(FP), DI // n = len(x) + MOVQ y+24(FP), R9 + + MOVSD $(0.0), X7 // sum = 0 + MOVSD $(0.0), X8 // sum = 0 + + MOVQ $0, SI // i = 0 + SUBQ $4, DI // n -= 4 + JL tail_uni // if n < 0 goto tail_uni + +loop_uni: + // sum += x[i] * y[i] unrolled 4x. 
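+ // Two independent accumulators (X7 and X8) keep the ADDPD
+ // dependency chains apart; end_uni folds them into a single
+ // scalar sum.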
+ MOVUPD 0(R8)(SI*8), X0 + MOVUPD 0(R9)(SI*8), X1 + MOVUPD 16(R8)(SI*8), X2 + MOVUPD 16(R9)(SI*8), X3 + MULPD X1, X0 + MULPD X3, X2 + ADDPD X0, X7 + ADDPD X2, X8 + + ADDQ $4, SI // i += 4 + SUBQ $4, DI // n -= 4 + JGE loop_uni // if n >= 0 goto loop_uni + +tail_uni: + ADDQ $4, DI // n += 4 + JLE end_uni // if n <= 0 goto end_uni + +onemore_uni: + // sum += x[i] * y[i] for the remaining 1-3 elements. + MOVSD 0(R8)(SI*8), X0 + MOVSD 0(R9)(SI*8), X1 + MULSD X1, X0 + ADDSD X0, X7 + + ADDQ $1, SI // i++ + SUBQ $1, DI // n-- + JNZ onemore_uni // if n != 0 goto onemore_uni + +end_uni: + // Add the four sums together. + ADDPD X8, X7 + MOVSD X7, X0 + UNPCKHPD X7, X7 + ADDSD X0, X7 + MOVSD X7, sum+48(FP) // Return final sum. + RET + +// func DdotInc(x, y []float64, n, incX, incY, ix, iy uintptr) (sum float64) +TEXT ·DotInc(SB), NOSPLIT, $0 + MOVQ x+0(FP), R8 + MOVQ y+24(FP), R9 + MOVQ n+48(FP), CX + MOVQ incX+56(FP), R11 + MOVQ incY+64(FP), R12 + MOVQ ix+72(FP), R13 + MOVQ iy+80(FP), R14 + + MOVSD $(0.0), X7 // sum = 0 + LEAQ (R8)(R13*8), SI // p = &x[ix] + LEAQ (R9)(R14*8), DI // q = &y[ix] + SHLQ $3, R11 // incX *= sizeof(float64) + SHLQ $3, R12 // indY *= sizeof(float64) + + SUBQ $2, CX // n -= 2 + JL tail_inc // if n < 0 goto tail_inc + +loop_inc: + // sum += *p * *q unrolled 2x. + MOVHPD (SI), X0 + MOVHPD (DI), X1 + ADDQ R11, SI // p += incX + ADDQ R12, DI // q += incY + MOVLPD (SI), X0 + MOVLPD (DI), X1 + ADDQ R11, SI // p += incX + ADDQ R12, DI // q += incY + + MULPD X1, X0 + ADDPD X0, X7 + + SUBQ $2, CX // n -= 2 + JGE loop_inc // if n >= 0 goto loop_inc + +tail_inc: + ADDQ $2, CX // n += 2 + JLE end_inc // if n <= 0 goto end_inc + + // sum += *p * *q for the last iteration if n is odd. + MOVSD (SI), X0 + MULSD (DI), X0 + ADDSD X0, X7 + +end_inc: + // Add the two sums together. + MOVSD X7, X0 + UNPCKHPD X7, X7 + ADDSD X0, X7 + MOVSD X7, sum+88(FP) // Return final sum. + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/ge_amd64.go b/vendor/gonum.org/v1/gonum/internal/asm/f64/ge_amd64.go new file mode 100644 index 0000000000..5b04233845 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/ge_amd64.go @@ -0,0 +1,29 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !noasm && !gccgo && !safe +// +build !noasm,!gccgo,!safe + +package f64 + +// Ger performs the rank-one operation +// +// A += alpha * x * yᵀ +// +// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar. +func Ger(m, n uintptr, alpha float64, x []float64, incX uintptr, y []float64, incY uintptr, a []float64, lda uintptr) + +// GemvN computes +// +// y = alpha * A * x + beta * y +// +// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars. +func GemvN(m, n uintptr, alpha float64, a []float64, lda uintptr, x []float64, incX uintptr, beta float64, y []float64, incY uintptr) + +// GemvT computes +// +// y = alpha * Aᵀ * x + beta * y +// +// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars. 
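+//
+// GemvT treats beta == 0 as a hard zero: y is overwritten without being
+// read, so NaN or Inf values already in y do not propagate. GemvN does
+// the same.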
+func GemvT(m, n uintptr, alpha float64, a []float64, lda uintptr, x []float64, incX uintptr, beta float64, y []float64, incY uintptr) diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/ge_noasm.go b/vendor/gonum.org/v1/gonum/internal/asm/f64/ge_noasm.go new file mode 100644 index 0000000000..e8dee0511b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/ge_noasm.go @@ -0,0 +1,125 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 || noasm || gccgo || safe +// +build !amd64 noasm gccgo safe + +package f64 + +// Ger performs the rank-one operation +// +// A += alpha * x * yᵀ +// +// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar. +func Ger(m, n uintptr, alpha float64, x []float64, incX uintptr, y []float64, incY uintptr, a []float64, lda uintptr) { + if incX == 1 && incY == 1 { + x = x[:m] + y = y[:n] + for i, xv := range x { + AxpyUnitary(alpha*xv, y, a[uintptr(i)*lda:uintptr(i)*lda+n]) + } + return + } + + var ky, kx uintptr + if int(incY) < 0 { + ky = uintptr(-int(n-1) * int(incY)) + } + if int(incX) < 0 { + kx = uintptr(-int(m-1) * int(incX)) + } + + ix := kx + for i := 0; i < int(m); i++ { + AxpyInc(alpha*x[ix], y, a[uintptr(i)*lda:uintptr(i)*lda+n], n, incY, 1, ky, 0) + ix += incX + } +} + +// GemvN computes +// +// y = alpha * A * x + beta * y +// +// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars. +func GemvN(m, n uintptr, alpha float64, a []float64, lda uintptr, x []float64, incX uintptr, beta float64, y []float64, incY uintptr) { + var kx, ky, i uintptr + if int(incX) < 0 { + kx = uintptr(-int(n-1) * int(incX)) + } + if int(incY) < 0 { + ky = uintptr(-int(m-1) * int(incY)) + } + + if incX == 1 && incY == 1 { + if beta == 0 { + for i = 0; i < m; i++ { + y[i] = alpha * DotUnitary(a[lda*i:lda*i+n], x) + } + return + } + for i = 0; i < m; i++ { + y[i] = y[i]*beta + alpha*DotUnitary(a[lda*i:lda*i+n], x) + } + return + } + iy := ky + if beta == 0 { + for i = 0; i < m; i++ { + y[iy] = alpha * DotInc(x, a[lda*i:lda*i+n], n, incX, 1, kx, 0) + iy += incY + } + return + } + for i = 0; i < m; i++ { + y[iy] = y[iy]*beta + alpha*DotInc(x, a[lda*i:lda*i+n], n, incX, 1, kx, 0) + iy += incY + } +} + +// GemvT computes +// +// y = alpha * Aᵀ * x + beta * y +// +// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars. 
+func GemvT(m, n uintptr, alpha float64, a []float64, lda uintptr, x []float64, incX uintptr, beta float64, y []float64, incY uintptr) { + var kx, ky, i uintptr + if int(incX) < 0 { + kx = uintptr(-int(m-1) * int(incX)) + } + if int(incY) < 0 { + ky = uintptr(-int(n-1) * int(incY)) + } + switch { + case beta == 0: // beta == 0 is special-cased to memclear + if incY == 1 { + for i := range y { + y[i] = 0 + } + } else { + iy := ky + for i := 0; i < int(n); i++ { + y[iy] = 0 + iy += incY + } + } + case int(incY) < 0: + ScalInc(beta, y, n, uintptr(int(-incY))) + case incY == 1: + ScalUnitary(beta, y[:n]) + default: + ScalInc(beta, y, n, incY) + } + + if incX == 1 && incY == 1 { + for i = 0; i < m; i++ { + AxpyUnitaryTo(y, alpha*x[i], a[lda*i:lda*i+n], y) + } + return + } + ix := kx + for i = 0; i < m; i++ { + AxpyInc(alpha*x[ix], a[lda*i:lda*i+n], y, n, 1, incY, 0, ky) + ix += incX + } +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/gemvN_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/gemvN_amd64.s new file mode 100644 index 0000000000..917e0e30e1 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/gemvN_amd64.s @@ -0,0 +1,685 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define SIZE 8 + +#define M_DIM m+0(FP) +#define M CX +#define N_DIM n+8(FP) +#define N BX + +#define TMP1 R14 +#define TMP2 R15 + +#define X_PTR SI +#define X x_base+56(FP) +#define INC_X R8 +#define INC3_X R9 + +#define Y_PTR DX +#define Y y_base+96(FP) +#define INC_Y R10 +#define INC3_Y R11 + +#define A_ROW AX +#define A_PTR DI +#define LDA R12 +#define LDA3 R13 + +#define ALPHA X15 +#define BETA X14 + +#define INIT4 \ + XORPS X0, X0 \ + XORPS X1, X1 \ + XORPS X2, X2 \ + XORPS X3, X3 + +#define INIT2 \ + XORPS X0, X0 \ + XORPS X1, X1 + +#define INIT1 \ + XORPS X0, X0 + +#define KERNEL_LOAD4 \ + MOVUPS (X_PTR), X12 \ + MOVUPS 2*SIZE(X_PTR), X13 + +#define KERNEL_LOAD2 \ + MOVUPS (X_PTR), X12 + +#define KERNEL_LOAD4_INC \ + MOVSD (X_PTR), X12 \ + MOVHPD (X_PTR)(INC_X*1), X12 \ + MOVSD (X_PTR)(INC_X*2), X13 \ + MOVHPD (X_PTR)(INC3_X*1), X13 + +#define KERNEL_LOAD2_INC \ + MOVSD (X_PTR), X12 \ + MOVHPD (X_PTR)(INC_X*1), X12 + +#define KERNEL_4x4 \ + MOVUPS (A_PTR), X4 \ + MOVUPS 2*SIZE(A_PTR), X5 \ + MOVUPS (A_PTR)(LDA*1), X6 \ + MOVUPS 2*SIZE(A_PTR)(LDA*1), X7 \ + MOVUPS (A_PTR)(LDA*2), X8 \ + MOVUPS 2*SIZE(A_PTR)(LDA*2), X9 \ + MOVUPS (A_PTR)(LDA3*1), X10 \ + MOVUPS 2*SIZE(A_PTR)(LDA3*1), X11 \ + MULPD X12, X4 \ + MULPD X13, X5 \ + MULPD X12, X6 \ + MULPD X13, X7 \ + MULPD X12, X8 \ + MULPD X13, X9 \ + MULPD X12, X10 \ + MULPD X13, X11 \ + ADDPD X4, X0 \ + ADDPD X5, X0 \ + ADDPD X6, X1 \ + ADDPD X7, X1 \ + ADDPD X8, X2 \ + ADDPD X9, X2 \ + ADDPD X10, X3 \ + ADDPD X11, X3 \ + ADDQ $4*SIZE, A_PTR + +#define KERNEL_4x2 \ + MOVUPS (A_PTR), X4 \ + MOVUPS (A_PTR)(LDA*1), X5 \ + MOVUPS (A_PTR)(LDA*2), X6 \ + MOVUPS (A_PTR)(LDA3*1), X7 \ + MULPD X12, X4 \ + MULPD X12, X5 \ + MULPD X12, X6 \ + MULPD X12, X7 \ + ADDPD X4, X0 \ + ADDPD X5, X1 \ + ADDPD X6, X2 \ + ADDPD X7, X3 \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_4x1 \ + MOVDDUP (X_PTR), X12 \ + MOVSD (A_PTR), X4 \ + MOVHPD (A_PTR)(LDA*1), X4 \ + MOVSD (A_PTR)(LDA*2), X5 \ + MOVHPD (A_PTR)(LDA3*1), X5 \ + MULPD X12, X4 \ + MULPD X12, X5 \ + ADDPD X4, X0 \ + ADDPD X5, X2 \ + ADDQ $SIZE, A_PTR + +#define STORE4 \ + MOVUPS (Y_PTR), X4 \ + MOVUPS 2*SIZE(Y_PTR), X5 \ + MULPD 
ALPHA, X0 \ + MULPD ALPHA, X2 \ + MULPD BETA, X4 \ + MULPD BETA, X5 \ + ADDPD X0, X4 \ + ADDPD X2, X5 \ + MOVUPS X4, (Y_PTR) \ + MOVUPS X5, 2*SIZE(Y_PTR) + +#define STORE4_INC \ + MOVSD (Y_PTR), X4 \ + MOVHPD (Y_PTR)(INC_Y*1), X4 \ + MOVSD (Y_PTR)(INC_Y*2), X5 \ + MOVHPD (Y_PTR)(INC3_Y*1), X5 \ + MULPD ALPHA, X0 \ + MULPD ALPHA, X2 \ + MULPD BETA, X4 \ + MULPD BETA, X5 \ + ADDPD X0, X4 \ + ADDPD X2, X5 \ + MOVLPD X4, (Y_PTR) \ + MOVHPD X4, (Y_PTR)(INC_Y*1) \ + MOVLPD X5, (Y_PTR)(INC_Y*2) \ + MOVHPD X5, (Y_PTR)(INC3_Y*1) + +#define KERNEL_2x4 \ + MOVUPS (A_PTR), X8 \ + MOVUPS 2*SIZE(A_PTR), X9 \ + MOVUPS (A_PTR)(LDA*1), X10 \ + MOVUPS 2*SIZE(A_PTR)(LDA*1), X11 \ + MULPD X12, X8 \ + MULPD X13, X9 \ + MULPD X12, X10 \ + MULPD X13, X11 \ + ADDPD X8, X0 \ + ADDPD X10, X1 \ + ADDPD X9, X0 \ + ADDPD X11, X1 \ + ADDQ $4*SIZE, A_PTR + +#define KERNEL_2x2 \ + MOVUPS (A_PTR), X8 \ + MOVUPS (A_PTR)(LDA*1), X9 \ + MULPD X12, X8 \ + MULPD X12, X9 \ + ADDPD X8, X0 \ + ADDPD X9, X1 \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_2x1 \ + MOVDDUP (X_PTR), X12 \ + MOVSD (A_PTR), X8 \ + MOVHPD (A_PTR)(LDA*1), X8 \ + MULPD X12, X8 \ + ADDPD X8, X0 \ + ADDQ $SIZE, A_PTR + +#define STORE2 \ + MOVUPS (Y_PTR), X4 \ + MULPD ALPHA, X0 \ + MULPD BETA, X4 \ + ADDPD X0, X4 \ + MOVUPS X4, (Y_PTR) + +#define STORE2_INC \ + MOVSD (Y_PTR), X4 \ + MOVHPD (Y_PTR)(INC_Y*1), X4 \ + MULPD ALPHA, X0 \ + MULPD BETA, X4 \ + ADDPD X0, X4 \ + MOVSD X4, (Y_PTR) \ + MOVHPD X4, (Y_PTR)(INC_Y*1) + +#define KERNEL_1x4 \ + MOVUPS (A_PTR), X8 \ + MOVUPS 2*SIZE(A_PTR), X9 \ + MULPD X12, X8 \ + MULPD X13, X9 \ + ADDPD X8, X0 \ + ADDPD X9, X0 \ + ADDQ $4*SIZE, A_PTR + +#define KERNEL_1x2 \ + MOVUPS (A_PTR), X8 \ + MULPD X12, X8 \ + ADDPD X8, X0 \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_1x1 \ + MOVSD (X_PTR), X12 \ + MOVSD (A_PTR), X8 \ + MULSD X12, X8 \ + ADDSD X8, X0 \ + ADDQ $SIZE, A_PTR + +#define STORE1 \ + HADDPD X0, X0 \ + MOVSD (Y_PTR), X4 \ + MULSD ALPHA, X0 \ + MULSD BETA, X4 \ + ADDSD X0, X4 \ + MOVSD X4, (Y_PTR) + +// func GemvN(m, n int, +// alpha float64, +// a []float64, lda int, +// x []float64, incX int, +// beta float64, +// y []float64, incY int) +TEXT ·GemvN(SB), NOSPLIT, $32-128 + MOVQ M_DIM, M + MOVQ N_DIM, N + CMPQ M, $0 + JE end + CMPQ N, $0 + JE end + + MOVDDUP alpha+16(FP), ALPHA + MOVDDUP beta+88(FP), BETA + + MOVQ x_base+56(FP), X_PTR + MOVQ y_base+96(FP), Y_PTR + MOVQ a_base+24(FP), A_ROW + MOVQ incY+120(FP), INC_Y + MOVQ lda+48(FP), LDA // LDA = LDA * sizeof(float64) + SHLQ $3, LDA + LEAQ (LDA)(LDA*2), LDA3 // LDA3 = LDA * 3 + MOVQ A_ROW, A_PTR + + XORQ TMP2, TMP2 + MOVQ M, TMP1 + SUBQ $1, TMP1 + IMULQ INC_Y, TMP1 + NEGQ TMP1 + CMPQ INC_Y, $0 + CMOVQLT TMP1, TMP2 + LEAQ (Y_PTR)(TMP2*SIZE), Y_PTR + MOVQ Y_PTR, Y + + SHLQ $3, INC_Y // INC_Y = incY * sizeof(float64) + LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3 + + MOVSD $0.0, X0 + COMISD BETA, X0 + JNE gemv_start // if beta != 0 { goto gemv_start } + +gemv_clear: // beta == 0 is special cased to clear memory (no nan handling) + XORPS X0, X0 + XORPS X1, X1 + XORPS X2, X2 + XORPS X3, X3 + + CMPQ incY+120(FP), $1 // Check for dense vector X (fast-path) + JNE inc_clear + + SHRQ $3, M + JZ clear4 + +clear8: + MOVUPS X0, (Y_PTR) + MOVUPS X1, 16(Y_PTR) + MOVUPS X2, 32(Y_PTR) + MOVUPS X3, 48(Y_PTR) + ADDQ $8*SIZE, Y_PTR + DECQ M + JNZ clear8 + +clear4: + TESTQ $4, M_DIM + JZ clear2 + MOVUPS X0, (Y_PTR) + MOVUPS X1, 16(Y_PTR) + ADDQ $4*SIZE, Y_PTR + +clear2: + TESTQ $2, M_DIM + JZ clear1 + MOVUPS X0, (Y_PTR) + ADDQ $2*SIZE, Y_PTR + +clear1: + TESTQ $1, M_DIM + JZ prep_end 
+ MOVSD X0, (Y_PTR) + + JMP prep_end + +inc_clear: + SHRQ $2, M + JZ inc_clear2 + +inc_clear4: + MOVSD X0, (Y_PTR) + MOVSD X1, (Y_PTR)(INC_Y*1) + MOVSD X2, (Y_PTR)(INC_Y*2) + MOVSD X3, (Y_PTR)(INC3_Y*1) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ M + JNZ inc_clear4 + +inc_clear2: + TESTQ $2, M_DIM + JZ inc_clear1 + MOVSD X0, (Y_PTR) + MOVSD X1, (Y_PTR)(INC_Y*1) + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_clear1: + TESTQ $1, M_DIM + JZ prep_end + MOVSD X0, (Y_PTR) + +prep_end: + MOVQ Y, Y_PTR + MOVQ M_DIM, M + +gemv_start: + CMPQ incX+80(FP), $1 // Check for dense vector X (fast-path) + JNE inc + + SHRQ $2, M + JZ r2 + +r4: + // LOAD 4 + INIT4 + + MOVQ N_DIM, N + SHRQ $2, N + JZ r4c2 + +r4c4: + // 4x4 KERNEL + KERNEL_LOAD4 + KERNEL_4x4 + + ADDQ $4*SIZE, X_PTR + + DECQ N + JNZ r4c4 + +r4c2: + TESTQ $2, N_DIM + JZ r4c1 + + // 4x2 KERNEL + KERNEL_LOAD2 + KERNEL_4x2 + + ADDQ $2*SIZE, X_PTR + +r4c1: + HADDPD X1, X0 + HADDPD X3, X2 + TESTQ $1, N_DIM + JZ r4end + + // 4x1 KERNEL + KERNEL_4x1 + + ADDQ $SIZE, X_PTR + +r4end: + CMPQ INC_Y, $SIZE + JNZ r4st_inc + + STORE4 + ADDQ $4*SIZE, Y_PTR + JMP r4inc + +r4st_inc: + STORE4_INC + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + +r4inc: + MOVQ X, X_PTR + LEAQ (A_ROW)(LDA*4), A_ROW + MOVQ A_ROW, A_PTR + + DECQ M + JNZ r4 + +r2: + TESTQ $2, M_DIM + JZ r1 + + // LOAD 2 + INIT2 + + MOVQ N_DIM, N + SHRQ $2, N + JZ r2c2 + +r2c4: + // 2x4 KERNEL + KERNEL_LOAD4 + KERNEL_2x4 + + ADDQ $4*SIZE, X_PTR + + DECQ N + JNZ r2c4 + +r2c2: + TESTQ $2, N_DIM + JZ r2c1 + + // 2x2 KERNEL + KERNEL_LOAD2 + KERNEL_2x2 + + ADDQ $2*SIZE, X_PTR + +r2c1: + HADDPD X1, X0 + TESTQ $1, N_DIM + JZ r2end + + // 2x1 KERNEL + KERNEL_2x1 + + ADDQ $SIZE, X_PTR + +r2end: + CMPQ INC_Y, $SIZE + JNE r2st_inc + + STORE2 + ADDQ $2*SIZE, Y_PTR + JMP r2inc + +r2st_inc: + STORE2_INC + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +r2inc: + MOVQ X, X_PTR + LEAQ (A_ROW)(LDA*2), A_ROW + MOVQ A_ROW, A_PTR + +r1: + TESTQ $1, M_DIM + JZ end + + // LOAD 1 + INIT1 + + MOVQ N_DIM, N + SHRQ $2, N + JZ r1c2 + +r1c4: + // 1x4 KERNEL + KERNEL_LOAD4 + KERNEL_1x4 + + ADDQ $4*SIZE, X_PTR + + DECQ N + JNZ r1c4 + +r1c2: + TESTQ $2, N_DIM + JZ r1c1 + + // 1x2 KERNEL + KERNEL_LOAD2 + KERNEL_1x2 + + ADDQ $2*SIZE, X_PTR + +r1c1: + + TESTQ $1, N_DIM + JZ r1end + + // 1x1 KERNEL + KERNEL_1x1 + +r1end: + STORE1 + +end: + RET + +inc: // Algorithm for incX != 1 ( split loads in kernel ) + MOVQ incX+80(FP), INC_X // INC_X = incX + + XORQ TMP2, TMP2 // TMP2 = 0 + MOVQ N, TMP1 // TMP1 = N + SUBQ $1, TMP1 // TMP1 -= 1 + NEGQ TMP1 // TMP1 = -TMP1 + IMULQ INC_X, TMP1 // TMP1 *= INC_X + CMPQ INC_X, $0 // if INC_X < 0 { TMP2 = TMP1 } + CMOVQLT TMP1, TMP2 + LEAQ (X_PTR)(TMP2*SIZE), X_PTR // X_PTR = X_PTR[TMP2] + MOVQ X_PTR, X // X = X_PTR + + SHLQ $3, INC_X + LEAQ (INC_X)(INC_X*2), INC3_X // INC3_X = INC_X * 3 + + SHRQ $2, M + JZ inc_r2 + +inc_r4: + // LOAD 4 + INIT4 + + MOVQ N_DIM, N + SHRQ $2, N + JZ inc_r4c2 + +inc_r4c4: + // 4x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_4x4 + + LEAQ (X_PTR)(INC_X*4), X_PTR + + DECQ N + JNZ inc_r4c4 + +inc_r4c2: + TESTQ $2, N_DIM + JZ inc_r4c1 + + // 4x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_4x2 + + LEAQ (X_PTR)(INC_X*2), X_PTR + +inc_r4c1: + HADDPD X1, X0 + HADDPD X3, X2 + TESTQ $1, N_DIM + JZ inc_r4end + + // 4x1 KERNEL + KERNEL_4x1 + + ADDQ INC_X, X_PTR + +inc_r4end: + CMPQ INC_Y, $SIZE + JNE inc_r4st_inc + + STORE4 + ADDQ $4*SIZE, Y_PTR + JMP inc_r4inc + +inc_r4st_inc: + STORE4_INC + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + +inc_r4inc: + MOVQ X, X_PTR + LEAQ (A_ROW)(LDA*4), A_ROW + MOVQ A_ROW, A_PTR + + DECQ M + JNZ inc_r4 + +inc_r2: + TESTQ $2, M_DIM + JZ 
inc_r1 + + // LOAD 2 + INIT2 + + MOVQ N_DIM, N + SHRQ $2, N + JZ inc_r2c2 + +inc_r2c4: + // 2x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_2x4 + + LEAQ (X_PTR)(INC_X*4), X_PTR + DECQ N + JNZ inc_r2c4 + +inc_r2c2: + TESTQ $2, N_DIM + JZ inc_r2c1 + + // 2x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_2x2 + + LEAQ (X_PTR)(INC_X*2), X_PTR + +inc_r2c1: + HADDPD X1, X0 + TESTQ $1, N_DIM + JZ inc_r2end + + // 2x1 KERNEL + KERNEL_2x1 + + ADDQ INC_X, X_PTR + +inc_r2end: + CMPQ INC_Y, $SIZE + JNE inc_r2st_inc + + STORE2 + ADDQ $2*SIZE, Y_PTR + JMP inc_r2inc + +inc_r2st_inc: + STORE2_INC + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_r2inc: + MOVQ X, X_PTR + LEAQ (A_ROW)(LDA*2), A_ROW + MOVQ A_ROW, A_PTR + +inc_r1: + TESTQ $1, M_DIM + JZ inc_end + + // LOAD 1 + INIT1 + + MOVQ N_DIM, N + SHRQ $2, N + JZ inc_r1c2 + +inc_r1c4: + // 1x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_1x4 + + LEAQ (X_PTR)(INC_X*4), X_PTR + DECQ N + JNZ inc_r1c4 + +inc_r1c2: + TESTQ $2, N_DIM + JZ inc_r1c1 + + // 1x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_1x2 + + LEAQ (X_PTR)(INC_X*2), X_PTR + +inc_r1c1: + TESTQ $1, N_DIM + JZ inc_r1end + + // 1x1 KERNEL + KERNEL_1x1 + +inc_r1end: + STORE1 + +inc_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/gemvT_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/gemvT_amd64.s new file mode 100644 index 0000000000..040710009e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/gemvT_amd64.s @@ -0,0 +1,745 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define SIZE 8 + +#define M_DIM n+8(FP) +#define M CX +#define N_DIM m+0(FP) +#define N BX + +#define TMP1 R14 +#define TMP2 R15 + +#define X_PTR SI +#define X x_base+56(FP) +#define Y_PTR DX +#define Y y_base+96(FP) +#define A_ROW AX +#define A_PTR DI + +#define INC_X R8 +#define INC3_X R9 + +#define INC_Y R10 +#define INC3_Y R11 + +#define LDA R12 +#define LDA3 R13 + +#define ALPHA X15 +#define BETA X14 + +#define INIT4 \ + MOVDDUP (X_PTR), X8 \ + MOVDDUP (X_PTR)(INC_X*1), X9 \ + MOVDDUP (X_PTR)(INC_X*2), X10 \ + MOVDDUP (X_PTR)(INC3_X*1), X11 \ + MULPD ALPHA, X8 \ + MULPD ALPHA, X9 \ + MULPD ALPHA, X10 \ + MULPD ALPHA, X11 + +#define INIT2 \ + MOVDDUP (X_PTR), X8 \ + MOVDDUP (X_PTR)(INC_X*1), X9 \ + MULPD ALPHA, X8 \ + MULPD ALPHA, X9 + +#define INIT1 \ + MOVDDUP (X_PTR), X8 \ + MULPD ALPHA, X8 + +#define KERNEL_LOAD4 \ + MOVUPS (Y_PTR), X0 \ + MOVUPS 2*SIZE(Y_PTR), X1 + +#define KERNEL_LOAD2 \ + MOVUPS (Y_PTR), X0 + +#define KERNEL_LOAD4_INC \ + MOVSD (Y_PTR), X0 \ + MOVHPD (Y_PTR)(INC_Y*1), X0 \ + MOVSD (Y_PTR)(INC_Y*2), X1 \ + MOVHPD (Y_PTR)(INC3_Y*1), X1 + +#define KERNEL_LOAD2_INC \ + MOVSD (Y_PTR), X0 \ + MOVHPD (Y_PTR)(INC_Y*1), X0 + +#define KERNEL_4x4 \ + MOVUPS (A_PTR), X4 \ + MOVUPS 2*SIZE(A_PTR), X5 \ + MOVUPS (A_PTR)(LDA*1), X6 \ + MOVUPS 2*SIZE(A_PTR)(LDA*1), X7 \ + MULPD X8, X4 \ + MULPD X8, X5 \ + MULPD X9, X6 \ + MULPD X9, X7 \ + ADDPD X4, X0 \ + ADDPD X5, X1 \ + ADDPD X6, X0 \ + ADDPD X7, X1 \ + MOVUPS (A_PTR)(LDA*2), X4 \ + MOVUPS 2*SIZE(A_PTR)(LDA*2), X5 \ + MOVUPS (A_PTR)(LDA3*1), X6 \ + MOVUPS 2*SIZE(A_PTR)(LDA3*1), X7 \ + MULPD X10, X4 \ + MULPD X10, X5 \ + MULPD X11, X6 \ + MULPD X11, X7 \ + ADDPD X4, X0 \ + ADDPD X5, X1 \ + ADDPD X6, X0 \ + ADDPD X7, X1 \ + ADDQ $4*SIZE, A_PTR + +#define KERNEL_4x2 \ + MOVUPS (A_PTR), X4 \ + MOVUPS 2*SIZE(A_PTR), X5 \ + MOVUPS (A_PTR)(LDA*1), X6 \ + MOVUPS 2*SIZE(A_PTR)(LDA*1), X7 \ + MULPD X8, X4 \ + MULPD 
X8, X5 \ + MULPD X9, X6 \ + MULPD X9, X7 \ + ADDPD X4, X0 \ + ADDPD X5, X1 \ + ADDPD X6, X0 \ + ADDPD X7, X1 \ + ADDQ $4*SIZE, A_PTR + +#define KERNEL_4x1 \ + MOVUPS (A_PTR), X4 \ + MOVUPS 2*SIZE(A_PTR), X5 \ + MULPD X8, X4 \ + MULPD X8, X5 \ + ADDPD X4, X0 \ + ADDPD X5, X1 \ + ADDQ $4*SIZE, A_PTR + +#define STORE4 \ + MOVUPS X0, (Y_PTR) \ + MOVUPS X1, 2*SIZE(Y_PTR) + +#define STORE4_INC \ + MOVLPD X0, (Y_PTR) \ + MOVHPD X0, (Y_PTR)(INC_Y*1) \ + MOVLPD X1, (Y_PTR)(INC_Y*2) \ + MOVHPD X1, (Y_PTR)(INC3_Y*1) + +#define KERNEL_2x4 \ + MOVUPS (A_PTR), X4 \ + MOVUPS (A_PTR)(LDA*1), X5 \ + MOVUPS (A_PTR)(LDA*2), X6 \ + MOVUPS (A_PTR)(LDA3*1), X7 \ + MULPD X8, X4 \ + MULPD X9, X5 \ + MULPD X10, X6 \ + MULPD X11, X7 \ + ADDPD X4, X0 \ + ADDPD X5, X0 \ + ADDPD X6, X0 \ + ADDPD X7, X0 \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_2x2 \ + MOVUPS (A_PTR), X4 \ + MOVUPS (A_PTR)(LDA*1), X5 \ + MULPD X8, X4 \ + MULPD X9, X5 \ + ADDPD X4, X0 \ + ADDPD X5, X0 \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_2x1 \ + MOVUPS (A_PTR), X4 \ + MULPD X8, X4 \ + ADDPD X4, X0 \ + ADDQ $2*SIZE, A_PTR + +#define STORE2 \ + MOVUPS X0, (Y_PTR) + +#define STORE2_INC \ + MOVLPD X0, (Y_PTR) \ + MOVHPD X0, (Y_PTR)(INC_Y*1) + +#define KERNEL_1x4 \ + MOVSD (Y_PTR), X0 \ + MOVSD (A_PTR), X4 \ + MOVSD (A_PTR)(LDA*1), X5 \ + MOVSD (A_PTR)(LDA*2), X6 \ + MOVSD (A_PTR)(LDA3*1), X7 \ + MULSD X8, X4 \ + MULSD X9, X5 \ + MULSD X10, X6 \ + MULSD X11, X7 \ + ADDSD X4, X0 \ + ADDSD X5, X0 \ + ADDSD X6, X0 \ + ADDSD X7, X0 \ + MOVSD X0, (Y_PTR) \ + ADDQ $SIZE, A_PTR + +#define KERNEL_1x2 \ + MOVSD (Y_PTR), X0 \ + MOVSD (A_PTR), X4 \ + MOVSD (A_PTR)(LDA*1), X5 \ + MULSD X8, X4 \ + MULSD X9, X5 \ + ADDSD X4, X0 \ + ADDSD X5, X0 \ + MOVSD X0, (Y_PTR) \ + ADDQ $SIZE, A_PTR + +#define KERNEL_1x1 \ + MOVSD (Y_PTR), X0 \ + MOVSD (A_PTR), X4 \ + MULSD X8, X4 \ + ADDSD X4, X0 \ + MOVSD X0, (Y_PTR) \ + ADDQ $SIZE, A_PTR + +#define SCALE_8(PTR, SCAL) \ + MOVUPS (PTR), X0 \ + MOVUPS 16(PTR), X1 \ + MOVUPS 32(PTR), X2 \ + MOVUPS 48(PTR), X3 \ + MULPD SCAL, X0 \ + MULPD SCAL, X1 \ + MULPD SCAL, X2 \ + MULPD SCAL, X3 \ + MOVUPS X0, (PTR) \ + MOVUPS X1, 16(PTR) \ + MOVUPS X2, 32(PTR) \ + MOVUPS X3, 48(PTR) + +#define SCALE_4(PTR, SCAL) \ + MOVUPS (PTR), X0 \ + MOVUPS 16(PTR), X1 \ + MULPD SCAL, X0 \ + MULPD SCAL, X1 \ + MOVUPS X0, (PTR) \ + MOVUPS X1, 16(PTR) \ + +#define SCALE_2(PTR, SCAL) \ + MOVUPS (PTR), X0 \ + MULPD SCAL, X0 \ + MOVUPS X0, (PTR) \ + +#define SCALE_1(PTR, SCAL) \ + MOVSD (PTR), X0 \ + MULSD SCAL, X0 \ + MOVSD X0, (PTR) \ + +#define SCALEINC_4(PTR, INC, INC3, SCAL) \ + MOVSD (PTR), X0 \ + MOVSD (PTR)(INC*1), X1 \ + MOVSD (PTR)(INC*2), X2 \ + MOVSD (PTR)(INC3*1), X3 \ + MULSD SCAL, X0 \ + MULSD SCAL, X1 \ + MULSD SCAL, X2 \ + MULSD SCAL, X3 \ + MOVSD X0, (PTR) \ + MOVSD X1, (PTR)(INC*1) \ + MOVSD X2, (PTR)(INC*2) \ + MOVSD X3, (PTR)(INC3*1) + +#define SCALEINC_2(PTR, INC, SCAL) \ + MOVSD (PTR), X0 \ + MOVSD (PTR)(INC*1), X1 \ + MULSD SCAL, X0 \ + MULSD SCAL, X1 \ + MOVSD X0, (PTR) \ + MOVSD X1, (PTR)(INC*1) + +// func GemvT(m, n int, +// alpha float64, +// a []float64, lda int, +// x []float64, incX int, +// beta float64, +// y []float64, incY int) +TEXT ·GemvT(SB), NOSPLIT, $32-128 + MOVQ M_DIM, M + MOVQ N_DIM, N + CMPQ M, $0 + JE end + CMPQ N, $0 + JE end + + MOVDDUP alpha+16(FP), ALPHA + + MOVQ x_base+56(FP), X_PTR + MOVQ y_base+96(FP), Y_PTR + MOVQ a_base+24(FP), A_ROW + MOVQ incY+120(FP), INC_Y // INC_Y = incY * sizeof(float64) + MOVQ lda+48(FP), LDA // LDA = LDA * sizeof(float64) + SHLQ $3, LDA + LEAQ (LDA)(LDA*2), LDA3 // LDA3 = LDA * 3 + 
MOVQ A_ROW, A_PTR + + MOVQ incX+80(FP), INC_X // INC_X = incX * sizeof(float64) + + XORQ TMP2, TMP2 + MOVQ N, TMP1 + SUBQ $1, TMP1 + NEGQ TMP1 + IMULQ INC_X, TMP1 + CMPQ INC_X, $0 + CMOVQLT TMP1, TMP2 + LEAQ (X_PTR)(TMP2*SIZE), X_PTR + MOVQ X_PTR, X + + SHLQ $3, INC_X + LEAQ (INC_X)(INC_X*2), INC3_X // INC3_X = INC_X * 3 + + CMPQ incY+120(FP), $1 // Check for dense vector Y (fast-path) + JNE inc + + MOVSD $1.0, X0 + COMISD beta+88(FP), X0 + JE gemv_start + + MOVSD $0.0, X0 + COMISD beta+88(FP), X0 + JE gemv_clear + + MOVDDUP beta+88(FP), BETA + SHRQ $3, M + JZ scal4 + +scal8: + SCALE_8(Y_PTR, BETA) + ADDQ $8*SIZE, Y_PTR + DECQ M + JNZ scal8 + +scal4: + TESTQ $4, M_DIM + JZ scal2 + SCALE_4(Y_PTR, BETA) + ADDQ $4*SIZE, Y_PTR + +scal2: + TESTQ $2, M_DIM + JZ scal1 + SCALE_2(Y_PTR, BETA) + ADDQ $2*SIZE, Y_PTR + +scal1: + TESTQ $1, M_DIM + JZ prep_end + SCALE_1(Y_PTR, BETA) + + JMP prep_end + +gemv_clear: // beta == 0 is special cased to clear memory (no nan handling) + XORPS X0, X0 + XORPS X1, X1 + XORPS X2, X2 + XORPS X3, X3 + + SHRQ $3, M + JZ clear4 + +clear8: + MOVUPS X0, (Y_PTR) + MOVUPS X1, 16(Y_PTR) + MOVUPS X2, 32(Y_PTR) + MOVUPS X3, 48(Y_PTR) + ADDQ $8*SIZE, Y_PTR + DECQ M + JNZ clear8 + +clear4: + TESTQ $4, M_DIM + JZ clear2 + MOVUPS X0, (Y_PTR) + MOVUPS X1, 16(Y_PTR) + ADDQ $4*SIZE, Y_PTR + +clear2: + TESTQ $2, M_DIM + JZ clear1 + MOVUPS X0, (Y_PTR) + ADDQ $2*SIZE, Y_PTR + +clear1: + TESTQ $1, M_DIM + JZ prep_end + MOVSD X0, (Y_PTR) + +prep_end: + MOVQ Y, Y_PTR + MOVQ M_DIM, M + +gemv_start: + SHRQ $2, N + JZ c2 + +c4: + // LOAD 4 + INIT4 + + MOVQ M_DIM, M + SHRQ $2, M + JZ c4r2 + +c4r4: + // 4x4 KERNEL + KERNEL_LOAD4 + KERNEL_4x4 + STORE4 + + ADDQ $4*SIZE, Y_PTR + + DECQ M + JNZ c4r4 + +c4r2: + TESTQ $2, M_DIM + JZ c4r1 + + // 4x2 KERNEL + KERNEL_LOAD2 + KERNEL_2x4 + STORE2 + + ADDQ $2*SIZE, Y_PTR + +c4r1: + TESTQ $1, M_DIM + JZ c4end + + // 4x1 KERNEL + KERNEL_1x4 + + ADDQ $SIZE, Y_PTR + +c4end: + LEAQ (X_PTR)(INC_X*4), X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*4), A_ROW + MOVQ A_ROW, A_PTR + + DECQ N + JNZ c4 + +c2: + TESTQ $2, N_DIM + JZ c1 + + // LOAD 2 + INIT2 + + MOVQ M_DIM, M + SHRQ $2, M + JZ c2r2 + +c2r4: + // 2x4 KERNEL + KERNEL_LOAD4 + KERNEL_4x2 + STORE4 + + ADDQ $4*SIZE, Y_PTR + + DECQ M + JNZ c2r4 + +c2r2: + TESTQ $2, M_DIM + JZ c2r1 + + // 2x2 KERNEL + KERNEL_LOAD2 + KERNEL_2x2 + STORE2 + + ADDQ $2*SIZE, Y_PTR + +c2r1: + TESTQ $1, M_DIM + JZ c2end + + // 2x1 KERNEL + KERNEL_1x2 + + ADDQ $SIZE, Y_PTR + +c2end: + LEAQ (X_PTR)(INC_X*2), X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*2), A_ROW + MOVQ A_ROW, A_PTR + +c1: + TESTQ $1, N_DIM + JZ end + + // LOAD 1 + INIT1 + + MOVQ M_DIM, M + SHRQ $2, M + JZ c1r2 + +c1r4: + // 1x4 KERNEL + KERNEL_LOAD4 + KERNEL_4x1 + STORE4 + + ADDQ $4*SIZE, Y_PTR + + DECQ M + JNZ c1r4 + +c1r2: + TESTQ $2, M_DIM + JZ c1r1 + + // 1x2 KERNEL + KERNEL_LOAD2 + KERNEL_2x1 + STORE2 + + ADDQ $2*SIZE, Y_PTR + +c1r1: + TESTQ $1, M_DIM + JZ end + + // 1x1 KERNEL + KERNEL_1x1 + +end: + RET + +inc: // Algorithm for incX != 0 ( split loads in kernel ) + XORQ TMP2, TMP2 + MOVQ M, TMP1 + SUBQ $1, TMP1 + IMULQ INC_Y, TMP1 + NEGQ TMP1 + CMPQ INC_Y, $0 + CMOVQLT TMP1, TMP2 + LEAQ (Y_PTR)(TMP2*SIZE), Y_PTR + MOVQ Y_PTR, Y + + SHLQ $3, INC_Y + LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3 + + MOVSD $1.0, X0 + COMISD beta+88(FP), X0 + JE inc_gemv_start + + MOVSD $0.0, X0 + COMISD beta+88(FP), X0 + JE inc_gemv_clear + + MOVDDUP beta+88(FP), BETA + SHRQ $2, M + JZ inc_scal2 + +inc_scal4: + SCALEINC_4(Y_PTR, INC_Y, INC3_Y, BETA) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ 
M + JNZ inc_scal4 + +inc_scal2: + TESTQ $2, M_DIM + JZ inc_scal1 + + SCALEINC_2(Y_PTR, INC_Y, BETA) + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_scal1: + TESTQ $1, M_DIM + JZ inc_prep_end + SCALE_1(Y_PTR, BETA) + + JMP inc_prep_end + +inc_gemv_clear: // beta == 0 is special-cased to clear memory (no nan handling) + XORPS X0, X0 + XORPS X1, X1 + XORPS X2, X2 + XORPS X3, X3 + + SHRQ $2, M + JZ inc_clear2 + +inc_clear4: + MOVSD X0, (Y_PTR) + MOVSD X1, (Y_PTR)(INC_Y*1) + MOVSD X2, (Y_PTR)(INC_Y*2) + MOVSD X3, (Y_PTR)(INC3_Y*1) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ M + JNZ inc_clear4 + +inc_clear2: + TESTQ $2, M_DIM + JZ inc_clear1 + MOVSD X0, (Y_PTR) + MOVSD X1, (Y_PTR)(INC_Y*1) + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_clear1: + TESTQ $1, M_DIM + JZ inc_prep_end + MOVSD X0, (Y_PTR) + +inc_prep_end: + MOVQ Y, Y_PTR + MOVQ M_DIM, M + +inc_gemv_start: + SHRQ $2, N + JZ inc_c2 + +inc_c4: + // LOAD 4 + INIT4 + + MOVQ M_DIM, M + SHRQ $2, M + JZ inc_c4r2 + +inc_c4r4: + // 4x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_4x4 + STORE4_INC + + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + + DECQ M + JNZ inc_c4r4 + +inc_c4r2: + TESTQ $2, M_DIM + JZ inc_c4r1 + + // 4x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_2x4 + STORE2_INC + + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_c4r1: + TESTQ $1, M_DIM + JZ inc_c4end + + // 4x1 KERNEL + KERNEL_1x4 + + ADDQ INC_Y, Y_PTR + +inc_c4end: + LEAQ (X_PTR)(INC_X*4), X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*4), A_ROW + MOVQ A_ROW, A_PTR + + DECQ N + JNZ inc_c4 + +inc_c2: + TESTQ $2, N_DIM + JZ inc_c1 + + // LOAD 2 + INIT2 + + MOVQ M_DIM, M + SHRQ $2, M + JZ inc_c2r2 + +inc_c2r4: + // 2x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_4x2 + STORE4_INC + + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ M + JNZ inc_c2r4 + +inc_c2r2: + TESTQ $2, M_DIM + JZ inc_c2r1 + + // 2x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_2x2 + STORE2_INC + + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_c2r1: + TESTQ $1, M_DIM + JZ inc_c2end + + // 2x1 KERNEL + KERNEL_1x2 + + ADDQ INC_Y, Y_PTR + +inc_c2end: + LEAQ (X_PTR)(INC_X*2), X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*2), A_ROW + MOVQ A_ROW, A_PTR + +inc_c1: + TESTQ $1, N_DIM + JZ inc_end + + // LOAD 1 + INIT1 + + MOVQ M_DIM, M + SHRQ $2, M + JZ inc_c1r2 + +inc_c1r4: + // 1x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_4x1 + STORE4_INC + + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ M + JNZ inc_c1r4 + +inc_c1r2: + TESTQ $2, M_DIM + JZ inc_c1r1 + + // 1x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_2x1 + STORE2_INC + + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_c1r1: + TESTQ $1, M_DIM + JZ inc_end + + // 1x1 KERNEL + KERNEL_1x1 + +inc_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/ger_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/ger_amd64.s new file mode 100644 index 0000000000..8cae569138 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/ger_amd64.s @@ -0,0 +1,591 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
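+//
+// For review: for unit strides the kernel below is equivalent to the
+// pure-Go fallback in ge_noasm.go,
+//
+//	for i, xv := range x {
+//		AxpyUnitary(alpha*xv, y, a[uintptr(i)*lda:uintptr(i)*lda+n])
+//	}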
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define SIZE 8 + +#define M_DIM m+0(FP) +#define M CX +#define N_DIM n+8(FP) +#define N BX + +#define TMP1 R14 +#define TMP2 R15 + +#define X_PTR SI +#define Y y_base+56(FP) +#define Y_PTR DX +#define A_ROW AX +#define A_PTR DI + +#define INC_X R8 +#define INC3_X R9 + +#define INC_Y R10 +#define INC3_Y R11 + +#define LDA R12 +#define LDA3 R13 + +#define ALPHA X0 + +#define LOAD4 \ + PREFETCHNTA (X_PTR )(INC_X*8) \ + MOVDDUP (X_PTR), X1 \ + MOVDDUP (X_PTR)(INC_X*1), X2 \ + MOVDDUP (X_PTR)(INC_X*2), X3 \ + MOVDDUP (X_PTR)(INC3_X*1), X4 \ + MULPD ALPHA, X1 \ + MULPD ALPHA, X2 \ + MULPD ALPHA, X3 \ + MULPD ALPHA, X4 + +#define LOAD2 \ + MOVDDUP (X_PTR), X1 \ + MOVDDUP (X_PTR)(INC_X*1), X2 \ + MULPD ALPHA, X1 \ + MULPD ALPHA, X2 + +#define LOAD1 \ + MOVDDUP (X_PTR), X1 \ + MULPD ALPHA, X1 + +#define KERNEL_LOAD4 \ + MOVUPS (Y_PTR), X5 \ + MOVUPS 2*SIZE(Y_PTR), X6 + +#define KERNEL_LOAD4_INC \ + MOVLPD (Y_PTR), X5 \ + MOVHPD (Y_PTR)(INC_Y*1), X5 \ + MOVLPD (Y_PTR)(INC_Y*2), X6 \ + MOVHPD (Y_PTR)(INC3_Y*1), X6 + +#define KERNEL_LOAD2 \ + MOVUPS (Y_PTR), X5 + +#define KERNEL_LOAD2_INC \ + MOVLPD (Y_PTR), X5 \ + MOVHPD (Y_PTR)(INC_Y*1), X5 + +#define KERNEL_4x4 \ + MOVUPS X5, X7 \ + MOVUPS X6, X8 \ + MOVUPS X5, X9 \ + MOVUPS X6, X10 \ + MOVUPS X5, X11 \ + MOVUPS X6, X12 \ + MULPD X1, X5 \ + MULPD X1, X6 \ + MULPD X2, X7 \ + MULPD X2, X8 \ + MULPD X3, X9 \ + MULPD X3, X10 \ + MULPD X4, X11 \ + MULPD X4, X12 + +#define STORE_4x4 \ + MOVUPS (A_PTR), X13 \ + ADDPD X13, X5 \ + MOVUPS 2*SIZE(A_PTR), X14 \ + ADDPD X14, X6 \ + MOVUPS (A_PTR)(LDA*1), X15 \ + ADDPD X15, X7 \ + MOVUPS 2*SIZE(A_PTR)(LDA*1), X0 \ + ADDPD X0, X8 \ + MOVUPS (A_PTR)(LDA*2), X13 \ + ADDPD X13, X9 \ + MOVUPS 2*SIZE(A_PTR)(LDA*2), X14 \ + ADDPD X14, X10 \ + MOVUPS (A_PTR)(LDA3*1), X15 \ + ADDPD X15, X11 \ + MOVUPS 2*SIZE(A_PTR)(LDA3*1), X0 \ + ADDPD X0, X12 \ + MOVUPS X5, (A_PTR) \ + MOVUPS X6, 2*SIZE(A_PTR) \ + MOVUPS X7, (A_PTR)(LDA*1) \ + MOVUPS X8, 2*SIZE(A_PTR)(LDA*1) \ + MOVUPS X9, (A_PTR)(LDA*2) \ + MOVUPS X10, 2*SIZE(A_PTR)(LDA*2) \ + MOVUPS X11, (A_PTR)(LDA3*1) \ + MOVUPS X12, 2*SIZE(A_PTR)(LDA3*1) \ + ADDQ $4*SIZE, A_PTR + +#define KERNEL_4x2 \ + MOVUPS X5, X6 \ + MOVUPS X5, X7 \ + MOVUPS X5, X8 \ + MULPD X1, X5 \ + MULPD X2, X6 \ + MULPD X3, X7 \ + MULPD X4, X8 + +#define STORE_4x2 \ + MOVUPS (A_PTR), X9 \ + ADDPD X9, X5 \ + MOVUPS (A_PTR)(LDA*1), X10 \ + ADDPD X10, X6 \ + MOVUPS (A_PTR)(LDA*2), X11 \ + ADDPD X11, X7 \ + MOVUPS (A_PTR)(LDA3*1), X12 \ + ADDPD X12, X8 \ + MOVUPS X5, (A_PTR) \ + MOVUPS X6, (A_PTR)(LDA*1) \ + MOVUPS X7, (A_PTR)(LDA*2) \ + MOVUPS X8, (A_PTR)(LDA3*1) \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_4x1 \ + MOVSD (Y_PTR), X5 \ + MOVSD X5, X6 \ + MOVSD X5, X7 \ + MOVSD X5, X8 \ + MULSD X1, X5 \ + MULSD X2, X6 \ + MULSD X3, X7 \ + MULSD X4, X8 + +#define STORE_4x1 \ + ADDSD (A_PTR), X5 \ + ADDSD (A_PTR)(LDA*1), X6 \ + ADDSD (A_PTR)(LDA*2), X7 \ + ADDSD (A_PTR)(LDA3*1), X8 \ + MOVSD X5, (A_PTR) \ + MOVSD X6, (A_PTR)(LDA*1) \ + MOVSD X7, (A_PTR)(LDA*2) \ + MOVSD X8, (A_PTR)(LDA3*1) \ + ADDQ $SIZE, A_PTR + +#define KERNEL_2x4 \ + MOVUPS X5, X7 \ + MOVUPS X6, X8 \ + MULPD X1, X5 \ + MULPD X1, X6 \ + MULPD X2, X7 \ + MULPD X2, X8 + +#define STORE_2x4 \ + MOVUPS (A_PTR), X9 \ + ADDPD X9, X5 \ + MOVUPS 2*SIZE(A_PTR), X10 \ + ADDPD X10, X6 \ + MOVUPS (A_PTR)(LDA*1), X11 \ + ADDPD X11, X7 \ + MOVUPS 2*SIZE(A_PTR)(LDA*1), X12 \ + ADDPD X12, X8 \ + MOVUPS X5, (A_PTR) \ + MOVUPS X6, 2*SIZE(A_PTR) \ + MOVUPS X7, (A_PTR)(LDA*1) \ + MOVUPS X8, 
2*SIZE(A_PTR)(LDA*1) \ + ADDQ $4*SIZE, A_PTR + +#define KERNEL_2x2 \ + MOVUPS X5, X6 \ + MULPD X1, X5 \ + MULPD X2, X6 + +#define STORE_2x2 \ + MOVUPS (A_PTR), X7 \ + ADDPD X7, X5 \ + MOVUPS (A_PTR)(LDA*1), X8 \ + ADDPD X8, X6 \ + MOVUPS X5, (A_PTR) \ + MOVUPS X6, (A_PTR)(LDA*1) \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_2x1 \ + MOVSD (Y_PTR), X5 \ + MOVSD X5, X6 \ + MULSD X1, X5 \ + MULSD X2, X6 + +#define STORE_2x1 \ + ADDSD (A_PTR), X5 \ + ADDSD (A_PTR)(LDA*1), X6 \ + MOVSD X5, (A_PTR) \ + MOVSD X6, (A_PTR)(LDA*1) \ + ADDQ $SIZE, A_PTR + +#define KERNEL_1x4 \ + MULPD X1, X5 \ + MULPD X1, X6 + +#define STORE_1x4 \ + MOVUPS (A_PTR), X7 \ + ADDPD X7, X5 \ + MOVUPS 2*SIZE(A_PTR), X8 \ + ADDPD X8, X6 \ + MOVUPS X5, (A_PTR) \ + MOVUPS X6, 2*SIZE(A_PTR) \ + ADDQ $4*SIZE, A_PTR + +#define KERNEL_1x2 \ + MULPD X1, X5 + +#define STORE_1x2 \ + MOVUPS (A_PTR), X6 \ + ADDPD X6, X5 \ + MOVUPS X5, (A_PTR) \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_1x1 \ + MOVSD (Y_PTR), X5 \ + MULSD X1, X5 + +#define STORE_1x1 \ + ADDSD (A_PTR), X5 \ + MOVSD X5, (A_PTR) \ + ADDQ $SIZE, A_PTR + +// func Ger(m, n uintptr, alpha float64, +// x []float64, incX uintptr, +// y []float64, incY uintptr, +// a []float64, lda uintptr) +TEXT ·Ger(SB), NOSPLIT, $0 + MOVQ M_DIM, M + MOVQ N_DIM, N + CMPQ M, $0 + JE end + CMPQ N, $0 + JE end + + MOVDDUP alpha+16(FP), ALPHA + + MOVQ x_base+24(FP), X_PTR + MOVQ y_base+56(FP), Y_PTR + MOVQ a_base+88(FP), A_ROW + MOVQ incX+48(FP), INC_X // INC_X = incX * sizeof(float64) + SHLQ $3, INC_X + MOVQ lda+112(FP), LDA // LDA = LDA * sizeof(float64) + SHLQ $3, LDA + LEAQ (LDA)(LDA*2), LDA3 // LDA3 = LDA * 3 + LEAQ (INC_X)(INC_X*2), INC3_X // INC3_X = INC_X * 3 + MOVQ A_ROW, A_PTR + + XORQ TMP2, TMP2 + MOVQ M, TMP1 + SUBQ $1, TMP1 + IMULQ INC_X, TMP1 + NEGQ TMP1 + CMPQ INC_X, $0 + CMOVQLT TMP1, TMP2 + LEAQ (X_PTR)(TMP2*SIZE), X_PTR + + CMPQ incY+80(FP), $1 // Check for dense vector Y (fast-path) + JG inc + JL end + + SHRQ $2, M + JZ r2 + +r4: + // LOAD 4 + LOAD4 + + MOVQ N_DIM, N + SHRQ $2, N + JZ r4c2 + +r4c4: + // 4x4 KERNEL + KERNEL_LOAD4 + KERNEL_4x4 + STORE_4x4 + + ADDQ $4*SIZE, Y_PTR + + DECQ N + JNZ r4c4 + + // Reload ALPHA after it's clobbered by STORE_4x4 + MOVDDUP alpha+16(FP), ALPHA + +r4c2: + TESTQ $2, N_DIM + JZ r4c1 + + // 4x2 KERNEL + KERNEL_LOAD2 + KERNEL_4x2 + STORE_4x2 + + ADDQ $2*SIZE, Y_PTR + +r4c1: + TESTQ $1, N_DIM + JZ r4end + + // 4x1 KERNEL + KERNEL_4x1 + STORE_4x1 + + ADDQ $SIZE, Y_PTR + +r4end: + LEAQ (X_PTR)(INC_X*4), X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*4), A_ROW + MOVQ A_ROW, A_PTR + + DECQ M + JNZ r4 + +r2: + TESTQ $2, M_DIM + JZ r1 + + // LOAD 2 + LOAD2 + + MOVQ N_DIM, N + SHRQ $2, N + JZ r2c2 + +r2c4: + // 2x4 KERNEL + KERNEL_LOAD4 + KERNEL_2x4 + STORE_2x4 + + ADDQ $4*SIZE, Y_PTR + + DECQ N + JNZ r2c4 + +r2c2: + TESTQ $2, N_DIM + JZ r2c1 + + // 2x2 KERNEL + KERNEL_LOAD2 + KERNEL_2x2 + STORE_2x2 + + ADDQ $2*SIZE, Y_PTR + +r2c1: + TESTQ $1, N_DIM + JZ r2end + + // 2x1 KERNEL + KERNEL_2x1 + STORE_2x1 + + ADDQ $SIZE, Y_PTR + +r2end: + LEAQ (X_PTR)(INC_X*2), X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*2), A_ROW + MOVQ A_ROW, A_PTR + +r1: + TESTQ $1, M_DIM + JZ end + + // LOAD 1 + LOAD1 + + MOVQ N_DIM, N + SHRQ $2, N + JZ r1c2 + +r1c4: + // 1x4 KERNEL + KERNEL_LOAD4 + KERNEL_1x4 + STORE_1x4 + + ADDQ $4*SIZE, Y_PTR + + DECQ N + JNZ r1c4 + +r1c2: + TESTQ $2, N_DIM + JZ r1c1 + + // 1x2 KERNEL + KERNEL_LOAD2 + KERNEL_1x2 + STORE_1x2 + + ADDQ $2*SIZE, Y_PTR + +r1c1: + TESTQ $1, N_DIM + JZ end + + // 1x1 KERNEL + KERNEL_1x1 + STORE_1x1 + + ADDQ $SIZE, Y_PTR + +end: + RET + 
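+
+// When incY is negative, the prologue below first offsets Y_PTR to the
+// element of y with the highest address, so the negative-stride walks
+// in the kernels stay inside the vector.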
+inc: // Algorithm for incY != 1 ( split loads in kernel ) + + MOVQ incY+80(FP), INC_Y // INC_Y = incY * sizeof(float64) + SHLQ $3, INC_Y + LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3 + + XORQ TMP2, TMP2 + MOVQ N, TMP1 + SUBQ $1, TMP1 + IMULQ INC_Y, TMP1 + NEGQ TMP1 + CMPQ INC_Y, $0 + CMOVQLT TMP1, TMP2 + LEAQ (Y_PTR)(TMP2*SIZE), Y_PTR + + SHRQ $2, M + JZ inc_r2 + +inc_r4: + // LOAD 4 + LOAD4 + + MOVQ N_DIM, N + SHRQ $2, N + JZ inc_r4c2 + +inc_r4c4: + // 4x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_4x4 + STORE_4x4 + + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ N + JNZ inc_r4c4 + + // Reload ALPHA after it's clobbered by STORE_4x4 + MOVDDUP alpha+16(FP), ALPHA + +inc_r4c2: + TESTQ $2, N_DIM + JZ inc_r4c1 + + // 4x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_4x2 + STORE_4x2 + + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_r4c1: + TESTQ $1, N_DIM + JZ inc_r4end + + // 4x1 KERNEL + KERNEL_4x1 + STORE_4x1 + + ADDQ INC_Y, Y_PTR + +inc_r4end: + LEAQ (X_PTR)(INC_X*4), X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*4), A_ROW + MOVQ A_ROW, A_PTR + + DECQ M + JNZ inc_r4 + +inc_r2: + TESTQ $2, M_DIM + JZ inc_r1 + + // LOAD 2 + LOAD2 + + MOVQ N_DIM, N + SHRQ $2, N + JZ inc_r2c2 + +inc_r2c4: + // 2x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_2x4 + STORE_2x4 + + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ N + JNZ inc_r2c4 + +inc_r2c2: + TESTQ $2, N_DIM + JZ inc_r2c1 + + // 2x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_2x2 + STORE_2x2 + + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_r2c1: + TESTQ $1, N_DIM + JZ inc_r2end + + // 2x1 KERNEL + KERNEL_2x1 + STORE_2x1 + + ADDQ INC_Y, Y_PTR + +inc_r2end: + LEAQ (X_PTR)(INC_X*2), X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*2), A_ROW + MOVQ A_ROW, A_PTR + +inc_r1: + TESTQ $1, M_DIM + JZ end + + // LOAD 1 + LOAD1 + + MOVQ N_DIM, N + SHRQ $2, N + JZ inc_r1c2 + +inc_r1c4: + // 1x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_1x4 + STORE_1x4 + + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ N + JNZ inc_r1c4 + +inc_r1c2: + TESTQ $2, N_DIM + JZ inc_r1c1 + + // 1x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_1x2 + STORE_1x2 + + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_r1c1: + TESTQ $1, N_DIM + JZ end + + // 1x1 KERNEL + KERNEL_1x1 + STORE_1x1 + + ADDQ INC_Y, Y_PTR + +inc_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/l1norm_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/l1norm_amd64.s new file mode 100644 index 0000000000..b4b1fd02fb --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/l1norm_amd64.s @@ -0,0 +1,58 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
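+//
+// For review: the kernel below is the unrolled form of the reference
+// loop
+//
+//	for i, v := range s {
+//		norm += math.Abs(v - t[i])
+//	}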
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func L1Dist(s, t []float64) float64 +TEXT ·L1Dist(SB), NOSPLIT, $0 + MOVQ s_base+0(FP), DI // DI = &s + MOVQ t_base+24(FP), SI // SI = &t + MOVQ s_len+8(FP), CX // CX = len(s) + CMPQ t_len+32(FP), CX // CX = max( CX, len(t) ) + CMOVQLE t_len+32(FP), CX + PXOR X3, X3 // norm = 0 + CMPQ CX, $0 // if CX == 0 { return 0 } + JE l1_end + XORQ AX, AX // i = 0 + MOVQ CX, BX + ANDQ $1, BX // BX = CX % 2 + SHRQ $1, CX // CX = floor( CX / 2 ) + JZ l1_tail_start // if CX == 0 { return 0 } + +l1_loop: // Loop unrolled 2x do { + MOVUPS (SI)(AX*8), X0 // X0 = t[i:i+1] + MOVUPS (DI)(AX*8), X1 // X1 = s[i:i+1] + MOVAPS X0, X2 + SUBPD X1, X0 + SUBPD X2, X1 + MAXPD X1, X0 // X0 = max( X0 - X1, X1 - X0 ) + ADDPD X0, X3 // norm += X0 + ADDQ $2, AX // i += 2 + LOOP l1_loop // } while --CX > 0 + CMPQ BX, $0 // if BX == 0 { return } + JE l1_end + +l1_tail_start: // Reset loop registers + MOVQ BX, CX // Loop counter: CX = BX + PXOR X0, X0 // reset X0, X1 to break dependencies + PXOR X1, X1 + +l1_tail: + MOVSD (SI)(AX*8), X0 // X0 = t[i] + MOVSD (DI)(AX*8), X1 // x1 = s[i] + MOVAPD X0, X2 + SUBSD X1, X0 + SUBSD X2, X1 + MAXSD X1, X0 // X0 = max( X0 - X1, X1 - X0 ) + ADDSD X0, X3 // norm += X0 + +l1_end: + MOVAPS X3, X2 + SHUFPD $1, X2, X2 + ADDSD X3, X2 // X2 = X3[1] + X3[0] + MOVSD X2, ret+48(FP) // return X2 + RET + diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/l2norm_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/l2norm_amd64.s new file mode 100644 index 0000000000..86e01c8701 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/l2norm_amd64.s @@ -0,0 +1,109 @@ +// Copyright ©2019 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define SUMSQ X0 +#define ABSX X1 +#define SCALE X2 +#define ZERO X3 +#define TMP X4 +#define ABSMASK X5 +#define INF X7 +#define INFMASK X11 +#define NANMASK X12 +#define IDX AX +#define LEN SI +#define X_ DI + +#define ABSMASK_DATA l2nrodata<>+0(SB) +#define INF_DATA l2nrodata<>+8(SB) +#define NAN_DATA l2nrodata<>+16(SB) +// AbsMask +DATA l2nrodata<>+0(SB)/8, $0x7FFFFFFFFFFFFFFF +// Inf +DATA l2nrodata<>+8(SB)/8, $0x7FF0000000000000 +// NaN +DATA l2nrodata<>+16(SB)/8, $0xFFF8000000000000 +GLOBL l2nrodata<>+0(SB), RODATA, $24 + +// L2NormUnitary returns the L2-norm of x. 
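+// It uses the same scale/sumSquares formulation as the pure-Go
+// L2NormUnitary in l2norm_noasm.go, so intermediate squares can
+// neither overflow nor underflow prematurely:
+// norm = scale * sqrt(sumSquares).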
+// func L2NormUnitary(x []float64) (norm float64) +TEXT ·L2NormUnitary(SB), NOSPLIT, $0 + MOVQ x_len+8(FP), LEN // LEN = len(x) + MOVQ x_base+0(FP), X_ + PXOR ZERO, ZERO + CMPQ LEN, $0 // if LEN == 0 { return 0 } + JZ retZero + + PXOR INFMASK, INFMASK + PXOR NANMASK, NANMASK + MOVSD $1.0, SUMSQ // ssq = 1 + XORPS SCALE, SCALE + MOVSD ABSMASK_DATA, ABSMASK + MOVSD INF_DATA, INF + XORQ IDX, IDX // idx == 0 + +initZero: // for ;x[i]==0; i++ {} + // Skip all leading zeros, to avoid divide by zero NaN + MOVSD (X_)(IDX*8), ABSX // absxi = x[i] + UCOMISD ABSX, ZERO + JP retNaN // if isNaN(x[i]) { return NaN } + JNE loop // if x[i] != 0 { goto loop } + INCQ IDX // i++ + CMPQ IDX, LEN + JE retZero // if i == LEN { return 0 } + JMP initZero + +loop: + MOVSD (X_)(IDX*8), ABSX // absxi = x[i] + MOVUPS ABSX, TMP + CMPSD ABSX, TMP, $3 + ORPD TMP, NANMASK // NANMASK = NANMASK | IsNaN(absxi) + MOVSD INF, TMP + ANDPD ABSMASK, ABSX // absxi == Abs(absxi) + CMPSD ABSX, TMP, $0 + ORPD TMP, INFMASK // INFMASK = INFMASK | IsInf(absxi) + UCOMISD SCALE, ABSX + JA adjScale // IF SCALE > ABSXI { goto adjScale } + + DIVSD SCALE, ABSX // absxi = scale / absxi + MULSD ABSX, ABSX // absxi *= absxi + ADDSD ABSX, SUMSQ // sumsq += absxi + INCQ IDX // i++ + CMPQ IDX, LEN + JNE loop // if i < LEN { continue } + JMP retSum // if i == LEN { goto retSum } + +adjScale: // Scale > Absxi + DIVSD ABSX, SCALE // tmp = absxi / scale + MULSD SCALE, SUMSQ // sumsq *= tmp + MULSD SCALE, SUMSQ // sumsq *= tmp + ADDSD $1.0, SUMSQ // sumsq += 1 + MOVUPS ABSX, SCALE // scale = absxi + INCQ IDX // i++ + CMPQ IDX, LEN + JNE loop // if i < LEN { continue } + +retSum: // Calculate return value + SQRTSD SUMSQ, SUMSQ // sumsq = sqrt(sumsq) + MULSD SCALE, SUMSQ // sumsq += scale + MOVQ SUMSQ, R10 // tmp = sumsq + UCOMISD ZERO, INFMASK + CMOVQPS INF_DATA, R10 // if INFMASK { tmp = INF } + UCOMISD ZERO, NANMASK + CMOVQPS NAN_DATA, R10 // if NANMASK { tmp = NaN } + MOVQ R10, norm+24(FP) // return tmp + RET + +retZero: + MOVSD ZERO, norm+24(FP) // return 0 + RET + +retNaN: + MOVSD NAN_DATA, TMP // return NaN + MOVSD TMP, norm+24(FP) + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/l2norm_noasm.go b/vendor/gonum.org/v1/gonum/internal/asm/f64/l2norm_noasm.go new file mode 100644 index 0000000000..bfb8fba981 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/l2norm_noasm.go @@ -0,0 +1,93 @@ +// Copyright ©2019 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 || noasm || gccgo || safe +// +build !amd64 noasm gccgo safe + +package f64 + +import "math" + +// L2NormUnitary returns the L2-norm of x. +func L2NormUnitary(x []float64) (norm float64) { + var scale float64 + sumSquares := 1.0 + for _, v := range x { + if v == 0 { + continue + } + absxi := math.Abs(v) + if math.IsNaN(absxi) { + return math.NaN() + } + if scale < absxi { + s := scale / absxi + sumSquares = 1 + sumSquares*s*s + scale = absxi + } else { + s := absxi / scale + sumSquares += s * s + } + } + if math.IsInf(scale, 1) { + return math.Inf(1) + } + return scale * math.Sqrt(sumSquares) +} + +// L2NormInc returns the L2-norm of x. 
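+// It visits x[0], x[incX], ..., x[(n-1)*incX], ignores zero elements,
+// and accumulates with the same overflow-safe scaling as
+// L2NormUnitary.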
+func L2NormInc(x []float64, n, incX uintptr) (norm float64) { + var scale float64 + sumSquares := 1.0 + for ix := uintptr(0); ix < n*incX; ix += incX { + val := x[ix] + if val == 0 { + continue + } + absxi := math.Abs(val) + if math.IsNaN(absxi) { + return math.NaN() + } + if scale < absxi { + s := scale / absxi + sumSquares = 1 + sumSquares*s*s + scale = absxi + } else { + s := absxi / scale + sumSquares += s * s + } + } + if math.IsInf(scale, 1) { + return math.Inf(1) + } + return scale * math.Sqrt(sumSquares) +} + +// L2DistanceUnitary returns the L2-norm of x-y. +func L2DistanceUnitary(x, y []float64) (norm float64) { + var scale float64 + sumSquares := 1.0 + for i, v := range x { + v -= y[i] + if v == 0 { + continue + } + absxi := math.Abs(v) + if math.IsNaN(absxi) { + return math.NaN() + } + if scale < absxi { + s := scale / absxi + sumSquares = 1 + sumSquares*s*s + scale = absxi + } else { + s := absxi / scale + sumSquares += s * s + } + } + if math.IsInf(scale, 1) { + return math.Inf(1) + } + return scale * math.Sqrt(sumSquares) +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/l2normdist_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/l2normdist_amd64.s new file mode 100644 index 0000000000..10dcae400e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/l2normdist_amd64.s @@ -0,0 +1,115 @@ +// Copyright ©2019 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define SUMSQ X0 +#define ABSX X1 +#define SCALE X2 +#define ZERO X3 +#define TMP X4 +#define ABSMASK X5 +#define INF X7 +#define INFMASK X11 +#define NANMASK X12 +#define IDX AX +#define X_ DI +#define Y_ BX +#define LEN SI + +#define ABSMASK_DATA l2nrodata<>+0(SB) +#define INF_DATA l2nrodata<>+8(SB) +#define NAN_DATA l2nrodata<>+16(SB) +// AbsMask +DATA l2nrodata<>+0(SB)/8, $0x7FFFFFFFFFFFFFFF +// Inf +DATA l2nrodata<>+8(SB)/8, $0x7FF0000000000000 +// NaN +DATA l2nrodata<>+16(SB)/8, $0xFFF8000000000000 +GLOBL l2nrodata<>+0(SB), RODATA, $24 + +// L2DistanceUnitary returns the L2-norm of x-y. 
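+// It applies the same scale/sumSquares scheme as L2NormUnitary to the
+// differences x[i]-y[i]; the Go reference is L2DistanceUnitary in
+// l2norm_noasm.go.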
+// func L2DistanceUnitary(x,y []float64) (norm float64)
+TEXT ·L2DistanceUnitary(SB), NOSPLIT, $0
+	MOVQ    x_base+0(FP), X_
+	MOVQ    y_base+24(FP), Y_
+	PXOR    ZERO, ZERO
+	MOVQ    x_len+8(FP), LEN // LEN = min( len(x), len(y) )
+	CMPQ    y_len+32(FP), LEN
+	CMOVQLE y_len+32(FP), LEN
+	CMPQ    LEN, $0 // if LEN == 0 { return 0 }
+	JZ      retZero
+
+	PXOR  INFMASK, INFMASK
+	PXOR  NANMASK, NANMASK
+	MOVSD $1.0, SUMSQ // ssq = 1
+	XORPS SCALE, SCALE
+	MOVSD ABSMASK_DATA, ABSMASK
+	MOVSD INF_DATA, INF
+	XORQ  IDX, IDX // idx == 0
+
+initZero: // for ;x[i]==0; i++ {}
+	// Skip all leading zeros, to avoid divide by zero NaN
+	MOVSD   (X_)(IDX*8), ABSX // absxi = x[i]
+	SUBSD   (Y_)(IDX*8), ABSX // absxi = x[i]-y[i]
+	UCOMISD ABSX, ZERO
+	JP      retNaN // if isNaN(absxi) { return NaN }
+	JNE     loop   // if absxi != 0 { goto loop }
+	INCQ    IDX    // i++
+	CMPQ    IDX, LEN
+	JE      retZero // if i == LEN { return 0 }
+	JMP     initZero
+
+loop:
+	MOVSD   (X_)(IDX*8), ABSX // absxi = x[i]
+	SUBSD   (Y_)(IDX*8), ABSX // absxi = x[i]-y[i]
+	MOVUPS  ABSX, TMP
+	CMPSD   ABSX, TMP, $3
+	ORPD    TMP, NANMASK // NANMASK = NANMASK | IsNaN(absxi)
+	MOVSD   INF, TMP
+	ANDPD   ABSMASK, ABSX // absxi = Abs(absxi)
+	CMPSD   ABSX, TMP, $0
+	ORPD    TMP, INFMASK // INFMASK = INFMASK | IsInf(absxi)
+	UCOMISD SCALE, ABSX
+	JA      adjScale // if ABSXI > SCALE { goto adjScale }
+
+	DIVSD SCALE, ABSX // absxi = absxi / scale
+	MULSD ABSX, ABSX  // absxi *= absxi
+	ADDSD ABSX, SUMSQ // sumsq += absxi
+	INCQ  IDX         // i++
+	CMPQ  IDX, LEN
+	JNE   loop   // if i < LEN { continue }
+	JMP   retSum // if i == LEN { goto retSum }
+
+adjScale: // Absxi > Scale
+	DIVSD  ABSX, SCALE  // tmp = scale / absxi
+	MULSD  SCALE, SUMSQ // sumsq *= tmp
+	MULSD  SCALE, SUMSQ // sumsq *= tmp
+	ADDSD  $1.0, SUMSQ  // sumsq += 1
+	MOVUPS ABSX, SCALE  // scale = absxi
+	INCQ   IDX          // i++
+	CMPQ   IDX, LEN
+	JNE    loop // if i < LEN { continue }
+
+retSum: // Calculate return value
+	SQRTSD  SUMSQ, SUMSQ // sumsq = sqrt(sumsq)
+	MULSD   SCALE, SUMSQ // sumsq *= scale
+	MOVQ    SUMSQ, R10   // tmp = sumsq
+	UCOMISD ZERO, INFMASK
+	CMOVQPS INF_DATA, R10 // if INFMASK { tmp = INF }
+	UCOMISD ZERO, NANMASK
+	CMOVQPS NAN_DATA, R10 // if NANMASK { tmp = NaN }
+	MOVQ    R10, norm+48(FP) // return tmp
+	RET
+
+retZero:
+	MOVSD ZERO, norm+48(FP) // return 0
+	RET
+
+retNaN:
+	MOVSD NAN_DATA, TMP // return NaN
+	MOVSD TMP, norm+48(FP)
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/l2norminc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/l2norminc_amd64.s
new file mode 100644
index 0000000000..8341db93ac
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/l2norminc_amd64.s
@@ -0,0 +1,110 @@
+// Copyright ©2019 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define SUMSQ X0
+#define ABSX X1
+#define SCALE X2
+#define ZERO X3
+#define TMP X4
+#define ABSMASK X5
+#define INF X7
+#define INFMASK X11
+#define NANMASK X12
+#define IDX AX
+#define LEN SI
+#define INC BX
+#define X_ DI
+
+#define ABSMASK_DATA l2nrodata<>+0(SB)
+#define INF_DATA l2nrodata<>+8(SB)
+#define NAN_DATA l2nrodata<>+16(SB)
+// AbsMask
+DATA l2nrodata<>+0(SB)/8, $0x7FFFFFFFFFFFFFFF
+// Inf
+DATA l2nrodata<>+8(SB)/8, $0x7FF0000000000000
+// NaN
+DATA l2nrodata<>+16(SB)/8, $0xFFF8000000000000
+GLOBL l2nrodata<>+0(SB), RODATA, $24
+
+// func L2NormInc(x []float64, n, incX uintptr) (norm float64)
+TEXT ·L2NormInc(SB), NOSPLIT, $0
+	MOVQ  n+24(FP), LEN // LEN = n
+	MOVQ  incX+32(FP), INC
+	MOVQ  x_base+0(FP), X_
+	XORPS ZERO, ZERO
+	CMPQ  LEN, $0 // if LEN == 0 { return 0 }
+	JZ    retZero
+
+	XORPS INFMASK, INFMASK
+	XORPS NANMASK, NANMASK
+	MOVSD $1.0, SUMSQ // ssq = 1
+	XORPS SCALE, SCALE
+	MOVSD ABSMASK_DATA, ABSMASK
+	MOVSD INF_DATA, INF
+	SHLQ  $3, INC // INC *= sizeof(float64)
+
+initZero: // for ;x[i]==0; i++ {}
+	// Skip all leading zeros, to avoid divide by zero NaN
+	MOVSD   (X_), ABSX // absxi = x[i]
+	UCOMISD ABSX, ZERO
+	JP      retNaN  // if isNaN(x[i]) { return NaN }
+	JNZ     loop    // if x[i] != 0 { goto loop }
+	ADDQ    INC, X_ // i += INC
+	DECQ    LEN     // LEN--
+	JZ      retZero // if LEN == 0 { return 0 }
+	JMP     initZero
+
+loop:
+	MOVSD   (X_), ABSX // absxi = x[i]
+	MOVUPS  ABSX, TMP
+	CMPSD   ABSX, TMP, $3
+	ORPD    TMP, NANMASK // NANMASK = NANMASK | IsNaN(absxi)
+	MOVSD   INF, TMP
+	ANDPD   ABSMASK, ABSX // absxi = Abs(absxi)
+	CMPSD   ABSX, TMP, $0
+	ORPD    TMP, INFMASK // INFMASK = INFMASK | IsInf(absxi)
+	UCOMISD SCALE, ABSX
+	JA      adjScale // if ABSXI > SCALE { goto adjScale }
+
+	DIVSD SCALE, ABSX // absxi = absxi / scale
+	MULSD ABSX, ABSX  // absxi *= absxi
+	ADDSD ABSX, SUMSQ // sumsq += absxi
+	ADDQ  INC, X_     // i += INC
+	DECQ  LEN         // LEN--
+	JNZ   loop   // if LEN > 0 { continue }
+	JMP   retSum // if LEN == 0 { goto retSum }
+
+adjScale: // Absxi > Scale
+	DIVSD  ABSX, SCALE  // tmp = scale / absxi
+	MULSD  SCALE, SUMSQ // sumsq *= tmp
+	MULSD  SCALE, SUMSQ // sumsq *= tmp
+	ADDSD  $1.0, SUMSQ  // sumsq += 1
+	MOVUPS ABSX, SCALE  // scale = absxi
+	ADDQ   INC, X_      // i += INC
+	DECQ   LEN          // LEN--
+	JNZ    loop // if LEN > 0 { continue }
+
+retSum: // Calculate return value
+	SQRTSD  SUMSQ, SUMSQ // sumsq = sqrt(sumsq)
+	MULSD   SCALE, SUMSQ // sumsq *= scale
+	MOVQ    SUMSQ, R10   // tmp = sumsq
+	UCOMISD ZERO, INFMASK
+	CMOVQPS INF_DATA, R10 // if INFMASK { tmp = INF }
+	UCOMISD ZERO, NANMASK
+	CMOVQPS NAN_DATA, R10 // if NANMASK { tmp = NaN }
+	MOVQ    R10, norm+40(FP) // return tmp
+	RET
+
+retZero:
+	MOVSD ZERO, norm+40(FP) // return 0
+	RET
+
+retNaN:
+	MOVSD NAN_DATA, TMP // return NaN
+	MOVSD TMP, norm+40(FP)
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/linfnorm_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/linfnorm_amd64.s
new file mode 100644
index 0000000000..ac18b481de
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/linfnorm_amd64.s
@@ -0,0 +1,57 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// func LinfDist(s, t []float64) float64
+TEXT ·LinfDist(SB), NOSPLIT, $0
+	MOVQ    s_base+0(FP), DI  // DI = &s
+	MOVQ    t_base+24(FP), SI // SI = &t
+	MOVQ    s_len+8(FP), CX   // CX = len(s)
+	CMPQ    t_len+32(FP), CX  // CX = min( CX, len(t) )
+	CMOVQLE t_len+32(FP), CX
+	PXOR    X3, X3 // norm = 0
+	CMPQ    CX, $0 // if CX == 0 { return 0 }
+	JE      l1_end
+	XORQ    AX, AX // i = 0
+	MOVQ    CX, BX
+	ANDQ    $1, BX        // BX = CX % 2
+	SHRQ    $1, CX        // CX = floor( CX / 2 )
+	JZ      l1_tail_start // if CX == 0 { goto l1_tail_start }
+
+l1_loop: // Loop unrolled 2x do {
+	MOVUPS (SI)(AX*8), X0 // X0 = t[i:i+1]
+	MOVUPS (DI)(AX*8), X1 // X1 = s[i:i+1]
+	MOVAPS X0, X2
+	SUBPD  X1, X0
+	SUBPD  X2, X1
+	MAXPD  X1, X0  // X0 = max( X0 - X1, X1 - X0 )
+	MAXPD  X0, X3  // norm = max( norm, X0 )
+	ADDQ   $2, AX  // i += 2
+	LOOP   l1_loop // } while --CX > 0
+	CMPQ   BX, $0  // if BX == 0 { return }
+	JE     l1_end
+
+l1_tail_start: // Reset loop registers
+	MOVQ BX, CX // Loop counter: CX = BX
+	PXOR X0, X0 // reset X0, X1 to break dependencies
+	PXOR X1, X1
+
+l1_tail:
+	MOVSD  (SI)(AX*8), X0 // X0 = t[i]
+	MOVSD  (DI)(AX*8), X1 // X1 = s[i]
+	MOVAPD X0, X2
+	SUBSD  X1, X0
+	SUBSD  X2, X1
+	MAXSD  X1, X0 // X0 = max( X0 - X1, X1 - X0 )
+	MAXSD  X0, X3 // norm = max( norm, X0 )
+
+l1_end:
+	MOVAPS X3, X2
+	SHUFPD $1, X2, X2
+	MAXSD  X3, X2         // X2 = max( X3[1], X3[0] )
+	MOVSD  X2, ret+48(FP) // return X2
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/scal.go b/vendor/gonum.org/v1/gonum/internal/asm/f64/scal.go
new file mode 100644
index 0000000000..c95219e18a
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/scal.go
@@ -0,0 +1,62 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 || noasm || gccgo || safe
+// +build !amd64 noasm gccgo safe
+
+package f64
+
+// ScalUnitary is
+//
+//	for i := range x {
+//		x[i] *= alpha
+//	}
+func ScalUnitary(alpha float64, x []float64) {
+	for i := range x {
+		x[i] *= alpha
+	}
+}
+
+// ScalUnitaryTo is
+//
+//	for i, v := range x {
+//		dst[i] = alpha * v
+//	}
+func ScalUnitaryTo(dst []float64, alpha float64, x []float64) {
+	for i, v := range x {
+		dst[i] = alpha * v
+	}
+}
+
+// ScalInc is
+//
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] *= alpha
+//		ix += incX
+//	}
func ScalInc(alpha float64, x []float64, n, incX uintptr) {
+	var ix uintptr
+	for i := 0; i < int(n); i++ {
+		x[ix] *= alpha
+		ix += incX
+	}
+}
+
+// ScalIncTo is
+//
+//	var idst, ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha * x[ix]
+//		ix += incX
+//		idst += incDst
+//	}
+func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr) {
+	var idst, ix uintptr
+	for i := 0; i < int(n); i++ {
+		dst[idst] = alpha * x[ix]
+		ix += incX
+		idst += incDst
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/scalinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/scalinc_amd64.s
new file mode 100644
index 0000000000..d623a284f9
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/scalinc_amd64.s
@@ -0,0 +1,113 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define X_PTR SI +#define LEN CX +#define TAIL BX +#define INC_X R8 +#define INCx3_X R9 +#define ALPHA X0 +#define ALPHA_2 X1 + +// func ScalInc(alpha float64, x []float64, n, incX uintptr) +TEXT ·ScalInc(SB), NOSPLIT, $0 + MOVSD alpha+0(FP), ALPHA // ALPHA = alpha + MOVQ x_base+8(FP), X_PTR // X_PTR = &x + MOVQ incX+40(FP), INC_X // INC_X = incX + SHLQ $3, INC_X // INC_X *= sizeof(float64) + MOVQ n+32(FP), LEN // LEN = n + CMPQ LEN, $0 + JE end // if LEN == 0 { return } + + MOVQ LEN, TAIL + ANDQ $3, TAIL // TAIL = LEN % 4 + SHRQ $2, LEN // LEN = floor( LEN / 4 ) + JZ tail_start // if LEN == 0 { goto tail_start } + + MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining + LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 + +loop: // do { // x[i] *= alpha unrolled 4x. 
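+	// The four MULSD chains below are independent, and ALPHA_2 holds a
+	// copy of ALPHA (loaded above "for pipelining") so consecutive
+	// multiplies do not all serialize on one register.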
+ MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MOVSD (X_PTR)(INC_X*2), X4 + MOVSD (X_PTR)(INCx3_X*1), X5 + + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA_2, X3 + MULSD ALPHA, X4 + MULSD ALPHA_2, X5 + + MOVSD X2, (X_PTR) // x[i] = X_i + MOVSD X3, (X_PTR)(INC_X*1) + MOVSD X4, (X_PTR)(INC_X*2) + MOVSD X5, (X_PTR)(INCx3_X*1) + + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) + DECQ LEN + JNZ loop // } while --LEN > 0 + CMPQ TAIL, $0 + JE end // if TAIL == 0 { return } + +tail_start: // Reset loop registers + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( LEN / 2 ) + JZ tail_one + +tail_two: // do { + MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA, X3 + MOVSD X2, (X_PTR) // x[i] = X_i + MOVSD X3, (X_PTR)(INC_X*1) + + LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2]) + + ANDQ $1, TAIL + JZ end + +tail_one: + MOVSD (X_PTR), X2 // X_i = x[i] + MULSD ALPHA, X2 // X_i *= ALPHA + MOVSD X2, (X_PTR) // x[i] = X_i + +end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/scalincto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/scalincto_amd64.s new file mode 100644 index 0000000000..1c2722098d --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/scalincto_amd64.s @@ -0,0 +1,122 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define X_PTR SI +#define DST_PTR DI +#define LEN CX +#define TAIL BX +#define INC_X R8 +#define INCx3_X R9 +#define INC_DST R10 +#define INCx3_DST R11 +#define ALPHA X0 +#define ALPHA_2 X1 + +// func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr) +TEXT ·ScalIncTo(SB), NOSPLIT, $0 + MOVQ dst_base+0(FP), DST_PTR // DST_PTR = &dst + MOVQ incDst+24(FP), INC_DST // INC_DST = incDst + SHLQ $3, INC_DST // INC_DST *= sizeof(float64) + MOVSD alpha+32(FP), ALPHA // ALPHA = alpha + MOVQ x_base+40(FP), X_PTR // X_PTR = &x + MOVQ n+64(FP), LEN // LEN = n + MOVQ incX+72(FP), INC_X // INC_X = incX + SHLQ $3, INC_X // INC_X *= sizeof(float64) + CMPQ LEN, $0 + JE end // if LEN == 0 { return } + + MOVQ LEN, TAIL + ANDQ $3, TAIL // TAIL = LEN % 4 + SHRQ $2, LEN // LEN = floor( LEN / 4 ) + JZ tail_start // if LEN == 0 { goto tail_start } + + MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining + LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 + LEAQ (INC_DST)(INC_DST*2), INCx3_DST // INCx3_DST = INC_DST * 3 + +loop: // do { // x[i] *= alpha unrolled 4x. + MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MOVSD (X_PTR)(INC_X*2), X4 + MOVSD (X_PTR)(INCx3_X*1), X5 + + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA_2, X3 + MULSD ALPHA, X4 + MULSD ALPHA_2, X5 + + MOVSD X2, (DST_PTR) // dst[i] = X_i + MOVSD X3, (DST_PTR)(INC_DST*1) + MOVSD X4, (DST_PTR)(INC_DST*2) + MOVSD X5, (DST_PTR)(INCx3_DST*1) + + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) + LEAQ (DST_PTR)(INC_DST*4), DST_PTR // DST_PTR = &(DST_PTR[incDst*4]) + DECQ LEN + JNZ loop // } while --LEN > 0 + CMPQ TAIL, $0 + JE end // if TAIL == 0 { return } + +tail_start: // Reset loop registers + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( LEN / 2 ) + JZ tail_one + +tail_two: + MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA, X3 + MOVSD X2, (DST_PTR) // dst[i] = X_i + MOVSD X3, (DST_PTR)(INC_DST*1) + + LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2]) + LEAQ (DST_PTR)(INC_DST*2), DST_PTR // DST_PTR = &(DST_PTR[incDst*2]) + + ANDQ $1, TAIL + JZ end + +tail_one: + MOVSD (X_PTR), X2 // X_i = x[i] + MULSD ALPHA, X2 // X_i *= ALPHA + MOVSD X2, (DST_PTR) // x[i] = X_i + +end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/scalunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/scalunitary_amd64.s new file mode 100644 index 0000000000..6e8f5ca6e1 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/scalunitary_amd64.s @@ -0,0 +1,112 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. 
+// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // @ MOVDDUP XMM0, 8[RSP] + +#define X_PTR SI +#define DST_PTR DI +#define IDX AX +#define LEN CX +#define TAIL BX +#define ALPHA X0 +#define ALPHA_2 X1 + +// func ScalUnitary(alpha float64, x []float64) +TEXT ·ScalUnitary(SB), NOSPLIT, $0 + MOVDDUP_ALPHA // ALPHA = { alpha, alpha } + MOVQ x_base+8(FP), X_PTR // X_PTR = &x + MOVQ x_len+16(FP), LEN // LEN = len(x) + CMPQ LEN, $0 + JE end // if LEN == 0 { return } + XORQ IDX, IDX // IDX = 0 + + MOVQ LEN, TAIL + ANDQ $7, TAIL // TAIL = LEN % 8 + SHRQ $3, LEN // LEN = floor( LEN / 8 ) + JZ tail_start // if LEN == 0 { goto tail_start } + + MOVUPS ALPHA, ALPHA_2 + +loop: // do { // x[i] *= alpha unrolled 8x. + MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] + MOVUPS 16(X_PTR)(IDX*8), X3 + MOVUPS 32(X_PTR)(IDX*8), X4 + MOVUPS 48(X_PTR)(IDX*8), X5 + + MULPD ALPHA, X2 // X_i *= ALPHA + MULPD ALPHA_2, X3 + MULPD ALPHA, X4 + MULPD ALPHA_2, X5 + + MOVUPS X2, (X_PTR)(IDX*8) // x[i] = X_i + MOVUPS X3, 16(X_PTR)(IDX*8) + MOVUPS X4, 32(X_PTR)(IDX*8) + MOVUPS X5, 48(X_PTR)(IDX*8) + + ADDQ $8, IDX // i += 8 + DECQ LEN + JNZ loop // while --LEN > 0 + CMPQ TAIL, $0 + JE end // if TAIL == 0 { return } + +tail_start: // Reset loop registers + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( TAIL / 2 ) + JZ tail_one // if n == 0 goto end + +tail_two: // do { + MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] + MULPD ALPHA, X2 // X_i *= ALPHA + MOVUPS X2, (X_PTR)(IDX*8) // x[i] = X_i + ADDQ $2, IDX // i += 2 + DECQ LEN + JNZ tail_two // while --LEN > 0 + + ANDQ $1, TAIL + JZ end // if TAIL == 0 { return } + +tail_one: + // x[i] *= alpha for the remaining element. + MOVSD (X_PTR)(IDX*8), X2 + MULSD ALPHA, X2 + MOVSD X2, (X_PTR)(IDX*8) + +end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/scalunitaryto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/scalunitaryto_amd64.s new file mode 100644 index 0000000000..986480a5be --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/scalunitaryto_amd64.s @@ -0,0 +1,113 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x2024 // @ MOVDDUP 32(SP), X0 /*XMM0, 32[RSP]*/ + +#define X_PTR SI +#define DST_PTR DI +#define IDX AX +#define LEN CX +#define TAIL BX +#define ALPHA X0 +#define ALPHA_2 X1 + +// func ScalUnitaryTo(dst []float64, alpha float64, x []float64) +// This function assumes len(dst) >= len(x). +TEXT ·ScalUnitaryTo(SB), NOSPLIT, $0 + MOVQ x_base+32(FP), X_PTR // X_PTR = &x + MOVQ dst_base+0(FP), DST_PTR // DST_PTR = &dst + MOVDDUP_ALPHA // ALPHA = { alpha, alpha } + MOVQ x_len+40(FP), LEN // LEN = len(x) + CMPQ LEN, $0 + JE end // if LEN == 0 { return } + + XORQ IDX, IDX // IDX = 0 + MOVQ LEN, TAIL + ANDQ $7, TAIL // TAIL = LEN % 8 + SHRQ $3, LEN // LEN = floor( LEN / 8 ) + JZ tail_start // if LEN == 0 { goto tail_start } + + MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining + +loop: // do { // dst[i] = alpha * x[i] unrolled 8x. 
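+	// Each MOVUPS below moves two packed float64s, so four
+	// load/multiply/store triples cover eight elements per pass.
+	// Unaligned moves are used because float64 slice data is only
+	// guaranteed 8-byte, not 16-byte, alignment.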
+ MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] + MOVUPS 16(X_PTR)(IDX*8), X3 + MOVUPS 32(X_PTR)(IDX*8), X4 + MOVUPS 48(X_PTR)(IDX*8), X5 + + MULPD ALPHA, X2 // X_i *= ALPHA + MULPD ALPHA_2, X3 + MULPD ALPHA, X4 + MULPD ALPHA_2, X5 + + MOVUPS X2, (DST_PTR)(IDX*8) // dst[i] = X_i + MOVUPS X3, 16(DST_PTR)(IDX*8) + MOVUPS X4, 32(DST_PTR)(IDX*8) + MOVUPS X5, 48(DST_PTR)(IDX*8) + + ADDQ $8, IDX // i += 8 + DECQ LEN + JNZ loop // while --LEN > 0 + CMPQ TAIL, $0 + JE end // if TAIL == 0 { return } + +tail_start: // Reset loop counters + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( TAIL / 2 ) + JZ tail_one // if LEN == 0 { goto tail_one } + +tail_two: // do { + MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] + MULPD ALPHA, X2 // X_i *= ALPHA + MOVUPS X2, (DST_PTR)(IDX*8) // dst[i] = X_i + ADDQ $2, IDX // i += 2 + DECQ LEN + JNZ tail_two // while --LEN > 0 + + ANDQ $1, TAIL + JZ end // if TAIL == 0 { return } + +tail_one: + MOVSD (X_PTR)(IDX*8), X2 // X_i = x[i] + MULSD ALPHA, X2 // X_i *= ALPHA + MOVSD X2, (DST_PTR)(IDX*8) // dst[i] = X_i + +end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/stubs_amd64.go b/vendor/gonum.org/v1/gonum/internal/asm/f64/stubs_amd64.go new file mode 100644 index 0000000000..7139bedd74 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/stubs_amd64.go @@ -0,0 +1,277 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !noasm && !gccgo && !safe +// +build !noasm,!gccgo,!safe + +package f64 + +// L1Norm is +// +// for _, v := range x { +// sum += math.Abs(v) +// } +// return sum +func L1Norm(x []float64) (sum float64) + +// L1NormInc is +// +// for i := 0; i < n*incX; i += incX { +// sum += math.Abs(x[i]) +// } +// return sum +func L1NormInc(x []float64, n, incX int) (sum float64) + +// AddConst is +// +// for i := range x { +// x[i] += alpha +// } +func AddConst(alpha float64, x []float64) + +// Add is +// +// for i, v := range s { +// dst[i] += v +// } +func Add(dst, s []float64) + +// AxpyUnitary is +// +// for i, v := range x { +// y[i] += alpha * v +// } +func AxpyUnitary(alpha float64, x, y []float64) + +// AxpyUnitaryTo is +// +// for i, v := range x { +// dst[i] = alpha*v + y[i] +// } +func AxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) + +// AxpyInc is +// +// for i := 0; i < int(n); i++ { +// y[iy] += alpha * x[ix] +// ix += incX +// iy += incY +// } +func AxpyInc(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) + +// AxpyIncTo is +// +// for i := 0; i < int(n); i++ { +// dst[idst] = alpha*x[ix] + y[iy] +// ix += incX +// iy += incY +// idst += incDst +// } +func AxpyIncTo(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) + +// CumSum is +// +// if len(s) == 0 { +// return dst +// } +// dst[0] = s[0] +// for i, v := range s[1:] { +// dst[i+1] = dst[i] + v +// } +// return dst +func CumSum(dst, s []float64) []float64 + +// CumProd is +// +// if len(s) == 0 { +// return dst +// } +// dst[0] = s[0] +// for i, v := range s[1:] { +// dst[i+1] = dst[i] * v +// } +// return dst +func CumProd(dst, s []float64) []float64 + +// Div is +// +// for i, v := range s { +// dst[i] /= v +// } +func Div(dst, s []float64) + +// DivTo is +// +// for i, v := range s { +// dst[i] = v / t[i] +// } +// return dst +func DivTo(dst, x, y []float64) []float64 + +// DotUnitary is +// +// for i, v := range x { +// sum += y[i] * v +// } +// return 
sum +func DotUnitary(x, y []float64) (sum float64) + +// DotInc is +// +// for i := 0; i < int(n); i++ { +// sum += y[iy] * x[ix] +// ix += incX +// iy += incY +// } +// return sum +func DotInc(x, y []float64, n, incX, incY, ix, iy uintptr) (sum float64) + +// L1Dist is +// +// var norm float64 +// for i, v := range s { +// norm += math.Abs(t[i] - v) +// } +// return norm +func L1Dist(s, t []float64) float64 + +// LinfDist is +// +// var norm float64 +// if len(s) == 0 { +// return 0 +// } +// norm = math.Abs(t[0] - s[0]) +// for i, v := range s[1:] { +// absDiff := math.Abs(t[i+1] - v) +// if absDiff > norm || math.IsNaN(norm) { +// norm = absDiff +// } +// } +// return norm +func LinfDist(s, t []float64) float64 + +// ScalUnitary is +// +// for i := range x { +// x[i] *= alpha +// } +func ScalUnitary(alpha float64, x []float64) + +// ScalUnitaryTo is +// +// for i, v := range x { +// dst[i] = alpha * v +// } +func ScalUnitaryTo(dst []float64, alpha float64, x []float64) + +// ScalInc is +// +// var ix uintptr +// for i := 0; i < int(n); i++ { +// x[ix] *= alpha +// ix += incX +// } +func ScalInc(alpha float64, x []float64, n, incX uintptr) + +// ScalIncTo is +// +// var idst, ix uintptr +// for i := 0; i < int(n); i++ { +// dst[idst] = alpha * x[ix] +// ix += incX +// idst += incDst +// } +func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr) + +// Sum is +// +// var sum float64 +// for i := range x { +// sum += x[i] +// } +func Sum(x []float64) float64 + +// L2NormUnitary returns the L2-norm of x. +// +// var scale float64 +// sumSquares := 1.0 +// for _, v := range x { +// if v == 0 { +// continue +// } +// absxi := math.Abs(v) +// if math.IsNaN(absxi) { +// return math.NaN() +// } +// if scale < absxi { +// s := scale / absxi +// sumSquares = 1 + sumSquares*s*s +// scale = absxi +// } else { +// s := absxi / scale +// sumSquares += s * s +// } +// if math.IsInf(scale, 1) { +// return math.Inf(1) +// } +// } +// return scale * math.Sqrt(sumSquares) +func L2NormUnitary(x []float64) (norm float64) + +// L2NormInc returns the L2-norm of x. +// +// var scale float64 +// sumSquares := 1.0 +// for ix := uintptr(0); ix < n*incX; ix += incX { +// val := x[ix] +// if val == 0 { +// continue +// } +// absxi := math.Abs(val) +// if math.IsNaN(absxi) { +// return math.NaN() +// } +// if scale < absxi { +// s := scale / absxi +// sumSquares = 1 + sumSquares*s*s +// scale = absxi +// } else { +// s := absxi / scale +// sumSquares += s * s +// } +// } +// if math.IsInf(scale, 1) { +// return math.Inf(1) +// } +// return scale * math.Sqrt(sumSquares) +func L2NormInc(x []float64, n, incX uintptr) (norm float64) + +// L2DistanceUnitary returns the L2-norm of x-y. 
+// +// var scale float64 +// sumSquares := 1.0 +// for i, v := range x { +// v -= y[i] +// if v == 0 { +// continue +// } +// absxi := math.Abs(v) +// if math.IsNaN(absxi) { +// return math.NaN() +// } +// if scale < absxi { +// s := scale / absxi +// sumSquares = 1 + sumSquares*s*s +// scale = absxi +// } else { +// s := absxi / scale +// sumSquares += s * s +// } +// } +// if math.IsInf(scale, 1) { +// return math.Inf(1) +// } +// return scale * math.Sqrt(sumSquares) +func L2DistanceUnitary(x, y []float64) (norm float64) diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/stubs_noasm.go b/vendor/gonum.org/v1/gonum/internal/asm/f64/stubs_noasm.go new file mode 100644 index 0000000000..f066379191 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/stubs_noasm.go @@ -0,0 +1,182 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 || noasm || gccgo || safe +// +build !amd64 noasm gccgo safe + +package f64 + +import "math" + +// L1Norm is +// +// for _, v := range x { +// sum += math.Abs(v) +// } +// return sum +func L1Norm(x []float64) (sum float64) { + for _, v := range x { + sum += math.Abs(v) + } + return sum +} + +// L1NormInc is +// +// for i := 0; i < n*incX; i += incX { +// sum += math.Abs(x[i]) +// } +// return sum +func L1NormInc(x []float64, n, incX int) (sum float64) { + for i := 0; i < n*incX; i += incX { + sum += math.Abs(x[i]) + } + return sum +} + +// Add is +// +// for i, v := range s { +// dst[i] += v +// } +func Add(dst, s []float64) { + for i, v := range s { + dst[i] += v + } +} + +// AddConst is +// +// for i := range x { +// x[i] += alpha +// } +func AddConst(alpha float64, x []float64) { + for i := range x { + x[i] += alpha + } +} + +// CumSum is +// +// if len(s) == 0 { +// return dst +// } +// dst[0] = s[0] +// for i, v := range s[1:] { +// dst[i+1] = dst[i] + v +// } +// return dst +func CumSum(dst, s []float64) []float64 { + if len(s) == 0 { + return dst + } + dst[0] = s[0] + for i, v := range s[1:] { + dst[i+1] = dst[i] + v + } + return dst +} + +// CumProd is +// +// if len(s) == 0 { +// return dst +// } +// dst[0] = s[0] +// for i, v := range s[1:] { +// dst[i+1] = dst[i] * v +// } +// return dst +func CumProd(dst, s []float64) []float64 { + if len(s) == 0 { + return dst + } + dst[0] = s[0] + for i, v := range s[1:] { + dst[i+1] = dst[i] * v + } + return dst +} + +// Div is +// +// for i, v := range s { +// dst[i] /= v +// } +func Div(dst, s []float64) { + for i, v := range s { + dst[i] /= v + } +} + +// DivTo is +// +// for i, v := range s { +// dst[i] = v / t[i] +// } +// return dst +func DivTo(dst, s, t []float64) []float64 { + for i, v := range s { + dst[i] = v / t[i] + } + return dst +} + +// L1Dist is +// +// var norm float64 +// for i, v := range s { +// norm += math.Abs(t[i] - v) +// } +// return norm +func L1Dist(s, t []float64) float64 { + var norm float64 + for i, v := range s { + norm += math.Abs(t[i] - v) + } + return norm +} + +// LinfDist is +// +// var norm float64 +// if len(s) == 0 { +// return 0 +// } +// norm = math.Abs(t[0] - s[0]) +// for i, v := range s[1:] { +// absDiff := math.Abs(t[i+1] - v) +// if absDiff > norm || math.IsNaN(norm) { +// norm = absDiff +// } +// } +// return norm +func LinfDist(s, t []float64) float64 { + var norm float64 + if len(s) == 0 { + return 0 + } + norm = math.Abs(t[0] - s[0]) + for i, v := range s[1:] { + absDiff := math.Abs(t[i+1] - v) + if absDiff > norm || 
math.IsNaN(norm) { + norm = absDiff + } + } + return norm +} + +// Sum is +// +// var sum float64 +// for i := range x { +// sum += x[i] +// } +func Sum(x []float64) float64 { + var sum float64 + for _, v := range x { + sum += v + } + return sum +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/sum_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/sum_amd64.s new file mode 100644 index 0000000000..dd77cbd053 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/sum_amd64.s @@ -0,0 +1,99 @@ +// Copyright ©2018 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define X_PTR SI +#define IDX AX +#define LEN CX +#define TAIL BX +#define SUM X0 +#define SUM_1 X1 +#define SUM_2 X2 +#define SUM_3 X3 + +// func Sum(x []float64) float64 +TEXT ·Sum(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), X_PTR // X_PTR = &x + MOVQ x_len+8(FP), LEN // LEN = len(x) + XORQ IDX, IDX // i = 0 + PXOR SUM, SUM // p_sum_i = 0 + CMPQ LEN, $0 // if LEN == 0 { return 0 } + JE sum_end + + PXOR SUM_1, SUM_1 + PXOR SUM_2, SUM_2 + PXOR SUM_3, SUM_3 + + MOVQ X_PTR, TAIL // Check memory alignment + ANDQ $15, TAIL // TAIL = &y % 16 + JZ no_trim // if TAIL == 0 { goto no_trim } + + // Align on 16-byte boundary + ADDSD (X_PTR), X0 // X0 += x[0] + INCQ IDX // i++ + DECQ LEN // LEN-- + JZ sum_end // if LEN == 0 { return } + +no_trim: + MOVQ LEN, TAIL + SHRQ $4, LEN // LEN = floor( n / 16 ) + JZ sum_tail8 // if LEN == 0 { goto sum_tail8 } + +sum_loop: // sum 16x wide do { + ADDPD (X_PTR)(IDX*8), SUM // sum_i += x[i:i+2] + ADDPD 16(X_PTR)(IDX*8), SUM_1 + ADDPD 32(X_PTR)(IDX*8), SUM_2 + ADDPD 48(X_PTR)(IDX*8), SUM_3 + ADDPD 64(X_PTR)(IDX*8), SUM + ADDPD 80(X_PTR)(IDX*8), SUM_1 + ADDPD 96(X_PTR)(IDX*8), SUM_2 + ADDPD 112(X_PTR)(IDX*8), SUM_3 + ADDQ $16, IDX // i += 16 + DECQ LEN + JNZ sum_loop // } while --LEN > 0 + +sum_tail8: + TESTQ $8, TAIL + JZ sum_tail4 + + ADDPD (X_PTR)(IDX*8), SUM // sum_i += x[i:i+2] + ADDPD 16(X_PTR)(IDX*8), SUM_1 + ADDPD 32(X_PTR)(IDX*8), SUM_2 + ADDPD 48(X_PTR)(IDX*8), SUM_3 + ADDQ $8, IDX + +sum_tail4: + ADDPD SUM_3, SUM + ADDPD SUM_2, SUM_1 + + TESTQ $4, TAIL + JZ sum_tail2 + + ADDPD (X_PTR)(IDX*8), SUM // sum_i += x[i:i+2] + ADDPD 16(X_PTR)(IDX*8), SUM_1 + ADDQ $4, IDX + +sum_tail2: + ADDPD SUM_1, SUM + + TESTQ $2, TAIL + JZ sum_tail1 + + ADDPD (X_PTR)(IDX*8), SUM // sum_i += x[i:i+2] + ADDQ $2, IDX + +sum_tail1: + HADDPD SUM, SUM // sum_i[0] += sum_i[1] + + TESTQ $1, TAIL + JZ sum_end + + ADDSD (X_PTR)(IDX*8), SUM + +sum_end: // return sum + MOVSD SUM, ret+24(FP) + RET diff --git a/vendor/gonum.org/v1/gonum/internal/cmplx64/abs.go b/vendor/gonum.org/v1/gonum/internal/cmplx64/abs.go new file mode 100644 index 0000000000..ac6eb81c0e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/cmplx64/abs.go @@ -0,0 +1,14 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cmplx64 + +import math "gonum.org/v1/gonum/internal/math32" + +// Abs returns the absolute value (also called the modulus) of x. 
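+// For example, Abs(3+4i) is 5: the modulus is computed as
+// Hypot(real(x), imag(x)), which avoids overflow in the intermediate
+// squares.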
+func Abs(x complex64) float32 { return math.Hypot(real(x), imag(x)) } diff --git a/vendor/gonum.org/v1/gonum/internal/cmplx64/conj.go b/vendor/gonum.org/v1/gonum/internal/cmplx64/conj.go new file mode 100644 index 0000000000..705262f2f9 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/cmplx64/conj.go @@ -0,0 +1,12 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cmplx64 + +// Conj returns the complex conjugate of x. +func Conj(x complex64) complex64 { return complex(real(x), -imag(x)) } diff --git a/vendor/gonum.org/v1/gonum/internal/cmplx64/doc.go b/vendor/gonum.org/v1/gonum/internal/cmplx64/doc.go new file mode 100644 index 0000000000..5424ea099c --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/cmplx64/doc.go @@ -0,0 +1,7 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package cmplx64 provides complex64 versions of standard library math/cmplx +// package routines used by gonum/blas. +package cmplx64 // import "gonum.org/v1/gonum/internal/cmplx64" diff --git a/vendor/gonum.org/v1/gonum/internal/cmplx64/isinf.go b/vendor/gonum.org/v1/gonum/internal/cmplx64/isinf.go new file mode 100644 index 0000000000..21d3d180e1 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/cmplx64/isinf.go @@ -0,0 +1,25 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cmplx64 + +import math "gonum.org/v1/gonum/internal/math32" + +// IsInf returns true if either real(x) or imag(x) is an infinity. +func IsInf(x complex64) bool { + if math.IsInf(real(x), 0) || math.IsInf(imag(x), 0) { + return true + } + return false +} + +// Inf returns a complex infinity, complex(+Inf, +Inf). +func Inf() complex64 { + inf := math.Inf(1) + return complex(inf, inf) +} diff --git a/vendor/gonum.org/v1/gonum/internal/cmplx64/isnan.go b/vendor/gonum.org/v1/gonum/internal/cmplx64/isnan.go new file mode 100644 index 0000000000..d6d43dbd1f --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/cmplx64/isnan.go @@ -0,0 +1,29 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cmplx64 + +import math "gonum.org/v1/gonum/internal/math32" + +// IsNaN returns true if either real(x) or imag(x) is NaN +// and neither is an infinity. +func IsNaN(x complex64) bool { + switch { + case math.IsInf(real(x), 0) || math.IsInf(imag(x), 0): + return false + case math.IsNaN(real(x)) || math.IsNaN(imag(x)): + return true + } + return false +} + +// NaN returns a complex “not-a-number” value. 
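+// The result reports true from IsNaN and, like any NaN, does not compare
+// equal to itself.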
+func NaN() complex64 { + nan := math.NaN() + return complex(nan, nan) +} diff --git a/vendor/gonum.org/v1/gonum/internal/cmplx64/sqrt.go b/vendor/gonum.org/v1/gonum/internal/cmplx64/sqrt.go new file mode 100644 index 0000000000..439987b4ba --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/cmplx64/sqrt.go @@ -0,0 +1,108 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cmplx64 + +import math "gonum.org/v1/gonum/internal/math32" + +// The original C code, the long comment, and the constants +// below are from http://netlib.sandia.gov/cephes/c9x-complex/clog.c. +// The go code is a simplified version of the original C. +// +// Cephes Math Library Release 2.8: June, 2000 +// Copyright 1984, 1987, 1989, 1992, 2000 by Stephen L. Moshier +// +// The readme file at http://netlib.sandia.gov/cephes/ says: +// Some software in this archive may be from the book _Methods and +// Programs for Mathematical Functions_ (Prentice-Hall or Simon & Schuster +// International, 1989) or from the Cephes Mathematical Library, a +// commercial product. In either event, it is copyrighted by the author. +// What you see here may be used freely but it comes with no support or +// guarantee. +// +// The two known misprints in the book are repaired here in the +// source listings for the gamma function and the incomplete beta +// integral. +// +// Stephen L. Moshier +// moshier@na-net.ornl.gov + +// Complex square root +// +// DESCRIPTION: +// +// If z = x + iy, r = |z|, then +// +// 1/2 +// Re w = [ (r + x)/2 ] , +// +// 1/2 +// Im w = [ (r - x)/2 ] . +// +// Cancelation error in r-x or r+x is avoided by using the +// identity 2 Re w Im w = y. +// +// Note that -w is also a square root of z. The root chosen +// is always in the right half plane and Im w has the same sign as y. +// +// ACCURACY: +// +// Relative error: +// arithmetic domain # trials peak rms +// DEC -10,+10 25000 3.2e-17 9.6e-18 +// IEEE -10,+10 1,000,000 2.9e-16 6.1e-17 + +// Sqrt returns the square root of x. +// The result r is chosen so that real(r) ≥ 0 and imag(r) has the same sign as imag(x). +func Sqrt(x complex64) complex64 { + if imag(x) == 0 { + if real(x) == 0 { + return complex(0, 0) + } + if real(x) < 0 { + return complex(0, math.Sqrt(-real(x))) + } + return complex(math.Sqrt(real(x)), 0) + } + if real(x) == 0 { + if imag(x) < 0 { + r := math.Sqrt(-0.5 * imag(x)) + return complex(r, -r) + } + r := math.Sqrt(0.5 * imag(x)) + return complex(r, r) + } + a := real(x) + b := imag(x) + var scale float32 + // Rescale to avoid internal overflow or underflow. 
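+	// Both branches scale by exact powers of two: large inputs are
+	// multiplied by 1/4 and the result restored with scale = 2 (the
+	// square root scales by the square root of the factor), while small
+	// inputs are multiplied by 2**54 and restored with scale = 2**-27.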
+ if math.Abs(a) > 4 || math.Abs(b) > 4 { + a *= 0.25 + b *= 0.25 + scale = 2 + } else { + a *= 1.8014398509481984e16 // 2**54 + b *= 1.8014398509481984e16 + scale = 7.450580596923828125e-9 // 2**-27 + } + r := math.Hypot(a, b) + var t float32 + if a > 0 { + t = math.Sqrt(0.5*r + 0.5*a) + r = scale * math.Abs((0.5*b)/t) + t *= scale + } else { + r = math.Sqrt(0.5*r - 0.5*a) + t = scale * math.Abs((0.5*b)/r) + r *= scale + } + if b < 0 { + return complex(t, -r) + } + return complex(t, r) +} diff --git a/vendor/gonum.org/v1/gonum/internal/math32/doc.go b/vendor/gonum.org/v1/gonum/internal/math32/doc.go new file mode 100644 index 0000000000..68917c64e6 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/math32/doc.go @@ -0,0 +1,7 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package math32 provides float32 versions of standard library math package +// routines used by gonum/blas/native. +package math32 // import "gonum.org/v1/gonum/internal/math32" diff --git a/vendor/gonum.org/v1/gonum/internal/math32/math.go b/vendor/gonum.org/v1/gonum/internal/math32/math.go new file mode 100644 index 0000000000..5e92f3d02e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/math32/math.go @@ -0,0 +1,166 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package math32 + +import ( + "math" +) + +const ( + unan = 0x7fc00000 + uinf = 0x7f800000 + uneginf = 0xff800000 + mask = 0x7f8 >> 3 + shift = 32 - 8 - 1 + bias = 127 +) + +// Abs returns the absolute value of x. +// +// Special cases are: +// +// Abs(±Inf) = +Inf +// Abs(NaN) = NaN +func Abs(x float32) float32 { + switch { + case x < 0: + return -x + case x == 0: + return 0 // return correctly abs(-0) + } + return x +} + +// Copysign returns a value with the magnitude +// of x and the sign of y. +func Copysign(x, y float32) float32 { + const sign = 1 << 31 + return math.Float32frombits(math.Float32bits(x)&^sign | math.Float32bits(y)&sign) +} + +// Hypot returns Sqrt(p*p + q*q), taking care to avoid +// unnecessary overflow and underflow. +// +// Special cases are: +// +// Hypot(±Inf, q) = +Inf +// Hypot(p, ±Inf) = +Inf +// Hypot(NaN, q) = NaN +// Hypot(p, NaN) = NaN +func Hypot(p, q float32) float32 { + // special cases + switch { + case IsInf(p, 0) || IsInf(q, 0): + return Inf(1) + case IsNaN(p) || IsNaN(q): + return NaN() + } + if p < 0 { + p = -p + } + if q < 0 { + q = -q + } + if p < q { + p, q = q, p + } + if p == 0 { + return 0 + } + q = q / p + return p * Sqrt(1+q*q) +} + +// Inf returns positive infinity if sign >= 0, negative infinity if sign < 0. +func Inf(sign int) float32 { + var v uint32 + if sign >= 0 { + v = uinf + } else { + v = uneginf + } + return math.Float32frombits(v) +} + +// IsInf reports whether f is an infinity, according to sign. +// If sign > 0, IsInf reports whether f is positive infinity. +// If sign < 0, IsInf reports whether f is negative infinity. +// If sign == 0, IsInf reports whether f is either infinity. +func IsInf(f float32, sign int) bool { + // Test for infinity by comparing against maximum float. 
+ // To avoid the floating-point hardware, could use: + // x := math.Float32bits(f); + // return sign >= 0 && x == uinf || sign <= 0 && x == uneginf; + return sign >= 0 && f > math.MaxFloat32 || sign <= 0 && f < -math.MaxFloat32 +} + +// IsNaN reports whether f is an IEEE 754 “not-a-number” value. +func IsNaN(f float32) (is bool) { + // IEEE 754 says that only NaNs satisfy f != f. + // To avoid the floating-point hardware, could use: + // x := math.Float32bits(f); + // return uint32(x>>shift)&mask == mask && x != uinf && x != uneginf + return f != f +} + +// Max returns the larger of x or y. +// +// Special cases are: +// +// Max(x, +Inf) = Max(+Inf, x) = +Inf +// Max(x, NaN) = Max(NaN, x) = NaN +// Max(+0, ±0) = Max(±0, +0) = +0 +// Max(-0, -0) = -0 +func Max(x, y float32) float32 { + // special cases + switch { + case IsInf(x, 1) || IsInf(y, 1): + return Inf(1) + case IsNaN(x) || IsNaN(y): + return NaN() + case x == 0 && x == y: + if Signbit(x) { + return y + } + return x + } + if x > y { + return x + } + return y +} + +// Min returns the smaller of x or y. +// +// Special cases are: +// +// Min(x, -Inf) = Min(-Inf, x) = -Inf +// Min(x, NaN) = Min(NaN, x) = NaN +// Min(-0, ±0) = Min(±0, -0) = -0 +func Min(x, y float32) float32 { + // special cases + switch { + case IsInf(x, -1) || IsInf(y, -1): + return Inf(-1) + case IsNaN(x) || IsNaN(y): + return NaN() + case x == 0 && x == y: + if Signbit(x) { + return x + } + return y + } + if x < y { + return x + } + return y +} + +// NaN returns an IEEE 754 “not-a-number” value. +func NaN() float32 { return math.Float32frombits(unan) } diff --git a/vendor/gonum.org/v1/gonum/internal/math32/signbit.go b/vendor/gonum.org/v1/gonum/internal/math32/signbit.go new file mode 100644 index 0000000000..3e9f0bb41d --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/math32/signbit.go @@ -0,0 +1,16 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package math32 + +import "math" + +// Signbit returns true if x is negative or negative zero. +func Signbit(x float32) bool { + return math.Float32bits(x)&(1<<31) != 0 +} diff --git a/vendor/gonum.org/v1/gonum/internal/math32/sqrt.go b/vendor/gonum.org/v1/gonum/internal/math32/sqrt.go new file mode 100644 index 0000000000..41f4a134df --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/math32/sqrt.go @@ -0,0 +1,26 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (!amd64 && !arm64) || noasm || gccgo || safe +// +build !amd64,!arm64 noasm gccgo safe + +package math32 + +import ( + "math" +) + +// Sqrt returns the square root of x. +// +// Special cases are: +// +// Sqrt(+Inf) = +Inf +// Sqrt(±0) = ±0 +// Sqrt(x < 0) = NaN +// Sqrt(NaN) = NaN +func Sqrt(x float32) float32 { + // FIXME(kortschak): Direct translation of the math package + // asm code for 386 fails to build. 
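+	// Note: going through float64 is believed safe here, since float64
+	// carries more than twice float32's precision, so rounding the
+	// double-precision square root back to float32 should not
+	// double-round incorrectly.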
+ return float32(math.Sqrt(float64(x))) +} diff --git a/vendor/gonum.org/v1/gonum/internal/math32/sqrt_amd64.go b/vendor/gonum.org/v1/gonum/internal/math32/sqrt_amd64.go new file mode 100644 index 0000000000..eca83f8700 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/math32/sqrt_amd64.go @@ -0,0 +1,22 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !noasm && !gccgo && !safe +// +build !noasm,!gccgo,!safe + +package math32 + +// Sqrt returns the square root of x. +// +// Special cases are: +// +// Sqrt(+Inf) = +Inf +// Sqrt(±0) = ±0 +// Sqrt(x < 0) = NaN +// Sqrt(NaN) = NaN +func Sqrt(x float32) float32 diff --git a/vendor/gonum.org/v1/gonum/internal/math32/sqrt_amd64.s b/vendor/gonum.org/v1/gonum/internal/math32/sqrt_amd64.s new file mode 100644 index 0000000000..1c1432a3ca --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/math32/sqrt_amd64.s @@ -0,0 +1,17 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func Sqrt(x float32) float32 +TEXT ·Sqrt(SB),NOSPLIT,$0 + SQRTSS x+0(FP), X0 + MOVSS X0, ret+8(FP) + RET diff --git a/vendor/gonum.org/v1/gonum/internal/math32/sqrt_arm64.go b/vendor/gonum.org/v1/gonum/internal/math32/sqrt_arm64.go new file mode 100644 index 0000000000..eca83f8700 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/math32/sqrt_arm64.go @@ -0,0 +1,22 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !noasm && !gccgo && !safe +// +build !noasm,!gccgo,!safe + +package math32 + +// Sqrt returns the square root of x. +// +// Special cases are: +// +// Sqrt(+Inf) = +Inf +// Sqrt(±0) = ±0 +// Sqrt(x < 0) = NaN +// Sqrt(NaN) = NaN +func Sqrt(x float32) float32 diff --git a/vendor/gonum.org/v1/gonum/internal/math32/sqrt_arm64.s b/vendor/gonum.org/v1/gonum/internal/math32/sqrt_arm64.s new file mode 100644 index 0000000000..f18b5521d4 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/math32/sqrt_arm64.s @@ -0,0 +1,18 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Copyright ©2020 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func Sqrt(x float32) float32 +TEXT ·Sqrt(SB),NOSPLIT,$0 + FMOVS x+0(FP), F0 + FSQRTS F0, F0 + FMOVS F0, ret+8(FP) + RET diff --git a/vendor/gonum.org/v1/gonum/lapack/.gitignore b/vendor/gonum.org/v1/gonum/lapack/.gitignore new file mode 100644 index 0000000000..e69de29bb2 diff --git a/vendor/gonum.org/v1/gonum/lapack/README.md b/vendor/gonum.org/v1/gonum/lapack/README.md new file mode 100644 index 0000000000..ee23148c97 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/README.md @@ -0,0 +1,29 @@ +Gonum LAPACK +====== +[![go.dev reference](https://pkg.go.dev/badge/gonum.org/v1/gonum/lapack)](https://pkg.go.dev/gonum.org/v1/gonum/lapack) +[![GoDoc](https://godocs.io/gonum.org/v1/gonum/lapack?status.svg)](https://godocs.io/gonum.org/v1/gonum/lapack) + +A collection of packages to provide LAPACK functionality for the Go programming +language (http://golang.org). This provides a partial implementation in native go +and a wrapper using cgo to a c-based implementation. + +## Installation + +``` + go get gonum.org/v1/gonum/lapack/... +``` + +## Packages + +### lapack + +Defines the LAPACK API based on http://www.netlib.org/lapack/lapacke.html + +### lapack/gonum + +Go implementation of the LAPACK API (incomplete, implements the `float64` API). + +### lapack/lapack64 + +Wrappers for an implementation of the double (i.e., `float64`) precision real parts of +the LAPACK API. diff --git a/vendor/gonum.org/v1/gonum/lapack/doc.go b/vendor/gonum.org/v1/gonum/lapack/doc.go new file mode 100644 index 0000000000..2475cb4aa0 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/doc.go @@ -0,0 +1,6 @@ +// Copyright ©2018 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package lapack provides interfaces for the LAPACK linear algebra standard. +package lapack // import "gonum.org/v1/gonum/lapack" diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dbdsqr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dbdsqr.go new file mode 100644 index 0000000000..fd421d7ef5 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dbdsqr.go @@ -0,0 +1,506 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" +) + +// Dbdsqr performs a singular value decomposition of a real n×n bidiagonal matrix. +// +// The SVD of the bidiagonal matrix B is +// +// B = Q * S * Pᵀ +// +// where S is a diagonal matrix of singular values, Q is an orthogonal matrix of +// left singular vectors, and P is an orthogonal matrix of right singular vectors. +// +// Q and P are only computed if requested. If left singular vectors are requested, +// this routine returns U * Q instead of Q, and if right singular vectors are +// requested Pᵀ * VT is returned instead of Pᵀ. +// +// Frequently Dbdsqr is used in conjunction with Dgebrd which reduces a general +// matrix A into bidiagonal form. In this case, the SVD of A is +// +// A = (U * Q) * S * (Pᵀ * VT) +// +// This routine may also compute Qᵀ * C. +// +// d and e contain the elements of the bidiagonal matrix b. d must have length at +// least n, and e must have length at least n-1. Dbdsqr will panic if there is +// insufficient length. 
On exit, D contains the singular values of B in decreasing +// order. +// +// VT is a matrix of size n×ncvt whose elements are stored in vt. The elements +// of vt are modified to contain Pᵀ * VT on exit. VT is not used if ncvt == 0. +// +// U is a matrix of size nru×n whose elements are stored in u. The elements +// of u are modified to contain U * Q on exit. U is not used if nru == 0. +// +// C is a matrix of size n×ncc whose elements are stored in c. The elements +// of c are modified to contain Qᵀ * C on exit. C is not used if ncc == 0. +// +// work contains temporary storage and must have length at least 4*(n-1). Dbdsqr +// will panic if there is insufficient working memory. +// +// Dbdsqr returns whether the decomposition was successful. +// +// Dbdsqr is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dbdsqr(uplo blas.Uplo, n, ncvt, nru, ncc int, d, e, vt []float64, ldvt int, u []float64, ldu int, c []float64, ldc int, work []float64) (ok bool) { + switch { + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case n < 0: + panic(nLT0) + case ncvt < 0: + panic(ncvtLT0) + case nru < 0: + panic(nruLT0) + case ncc < 0: + panic(nccLT0) + case ldvt < max(1, ncvt): + panic(badLdVT) + case (ldu < max(1, n) && nru > 0) || (ldu < 1 && nru == 0): + panic(badLdU) + case ldc < max(1, ncc): + panic(badLdC) + } + + // Quick return if possible. + if n == 0 { + return true + } + + if len(vt) < (n-1)*ldvt+ncvt && ncvt != 0 { + panic(shortVT) + } + if len(u) < (nru-1)*ldu+n && nru != 0 { + panic(shortU) + } + if len(c) < (n-1)*ldc+ncc && ncc != 0 { + panic(shortC) + } + if len(d) < n { + panic(shortD) + } + if len(e) < n-1 { + panic(shortE) + } + if len(work) < 4*(n-1) { + panic(shortWork) + } + + var info int + bi := blas64.Implementation() + const maxIter = 6 + + if n != 1 { + // If the singular vectors do not need to be computed, use qd algorithm. + if !(ncvt > 0 || nru > 0 || ncc > 0) { + info = impl.Dlasq1(n, d, e, work) + // If info is 2 dqds didn't finish, and so try to. + if info != 2 { + return info == 0 + } + } + nm1 := n - 1 + nm12 := nm1 + nm1 + nm13 := nm12 + nm1 + idir := 0 + + eps := dlamchE + unfl := dlamchS + lower := uplo == blas.Lower + var cs, sn, r float64 + if lower { + for i := 0; i < n-1; i++ { + cs, sn, r = impl.Dlartg(d[i], e[i]) + d[i] = r + e[i] = sn * d[i+1] + d[i+1] *= cs + work[i] = cs + work[nm1+i] = sn + } + if nru > 0 { + impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward, nru, n, work, work[n-1:], u, ldu) + } + if ncc > 0 { + impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, n, ncc, work, work[n-1:], c, ldc) + } + } + // Compute singular values to a relative accuracy of tol. 
If tol is negative + // the values will be computed to an absolute accuracy of math.Abs(tol) * norm(b) + tolmul := math.Max(10, math.Min(100, math.Pow(eps, -1.0/8))) + tol := tolmul * eps + var smax float64 + for i := 0; i < n; i++ { + smax = math.Max(smax, math.Abs(d[i])) + } + for i := 0; i < n-1; i++ { + smax = math.Max(smax, math.Abs(e[i])) + } + + var smin float64 + var thresh float64 + if tol >= 0 { + sminoa := math.Abs(d[0]) + if sminoa != 0 { + mu := sminoa + for i := 1; i < n; i++ { + mu = math.Abs(d[i]) * (mu / (mu + math.Abs(e[i-1]))) + sminoa = math.Min(sminoa, mu) + if sminoa == 0 { + break + } + } + } + sminoa = sminoa / math.Sqrt(float64(n)) + thresh = math.Max(tol*sminoa, float64(maxIter*n*n)*unfl) + } else { + thresh = math.Max(math.Abs(tol)*smax, float64(maxIter*n*n)*unfl) + } + // Prepare for the main iteration loop for the singular values. + maxIt := maxIter * n * n + iter := 0 + oldl2 := -1 + oldm := -1 + // m points to the last element of unconverged part of matrix. + m := n + + Outer: + for m > 1 { + if iter > maxIt { + info = 0 + for i := 0; i < n-1; i++ { + if e[i] != 0 { + info++ + } + } + return info == 0 + } + // Find diagonal block of matrix to work on. + if tol < 0 && math.Abs(d[m-1]) <= thresh { + d[m-1] = 0 + } + smax = math.Abs(d[m-1]) + var l2 int + var broke bool + for l3 := 0; l3 < m-1; l3++ { + l2 = m - l3 - 2 + abss := math.Abs(d[l2]) + abse := math.Abs(e[l2]) + if tol < 0 && abss <= thresh { + d[l2] = 0 + } + if abse <= thresh { + broke = true + break + } + smax = math.Max(math.Max(smax, abss), abse) + } + if broke { + e[l2] = 0 + if l2 == m-2 { + // Convergence of bottom singular value, return to top. + m-- + continue + } + l2++ + } else { + l2 = 0 + } + // e[ll] through e[m-2] are nonzero, e[ll-1] is zero + if l2 == m-2 { + // Handle 2×2 block separately. + var sinr, cosr, sinl, cosl float64 + d[m-1], d[m-2], sinr, cosr, sinl, cosl = impl.Dlasv2(d[m-2], e[m-2], d[m-1]) + e[m-2] = 0 + if ncvt > 0 { + bi.Drot(ncvt, vt[(m-2)*ldvt:], 1, vt[(m-1)*ldvt:], 1, cosr, sinr) + } + if nru > 0 { + bi.Drot(nru, u[m-2:], ldu, u[m-1:], ldu, cosl, sinl) + } + if ncc > 0 { + bi.Drot(ncc, c[(m-2)*ldc:], 1, c[(m-1)*ldc:], 1, cosl, sinl) + } + m -= 2 + continue + } + // If working on a new submatrix, choose shift direction from larger end + // diagonal element toward smaller. + if l2 > oldm-1 || m-1 < oldl2 { + if math.Abs(d[l2]) >= math.Abs(d[m-1]) { + idir = 1 + } else { + idir = 2 + } + } + // Apply convergence tests. + // TODO(btracey): There is a lot of similar looking code here. See + // if there is a better way to de-duplicate. + if idir == 1 { + // Run convergence test in forward direction. + // First apply standard test to bottom of matrix. + if math.Abs(e[m-2]) <= math.Abs(tol)*math.Abs(d[m-1]) || (tol < 0 && math.Abs(e[m-2]) <= thresh) { + e[m-2] = 0 + continue + } + if tol >= 0 { + // If relative accuracy desired, apply convergence criterion forward. + mu := math.Abs(d[l2]) + smin = mu + for l3 := l2; l3 < m-1; l3++ { + if math.Abs(e[l3]) <= tol*mu { + e[l3] = 0 + continue Outer + } + mu = math.Abs(d[l3+1]) * (mu / (mu + math.Abs(e[l3]))) + smin = math.Min(smin, mu) + } + } + } else { + // Run convergence test in backward direction. + // First apply standard test to top of matrix. + if math.Abs(e[l2]) <= math.Abs(tol)*math.Abs(d[l2]) || (tol < 0 && math.Abs(e[l2]) <= thresh) { + e[l2] = 0 + continue + } + if tol >= 0 { + // If relative accuracy desired, apply convergence criterion backward. 
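+ // mu tracks a running estimate of the smallest singular value of
+ // the trailing block; e[l3] is set to zero as soon as it is
+ // negligible relative to tol*mu, keeping the test scale-free.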
+ mu := math.Abs(d[m-1]) + smin = mu + for l3 := m - 2; l3 >= l2; l3-- { + if math.Abs(e[l3]) <= tol*mu { + e[l3] = 0 + continue Outer + } + mu = math.Abs(d[l3]) * (mu / (mu + math.Abs(e[l3]))) + smin = math.Min(smin, mu) + } + } + } + oldl2 = l2 + oldm = m + // Compute shift. First, test if shifting would ruin relative accuracy, + // and if so set the shift to zero. + var shift float64 + if tol >= 0 && float64(n)*tol*(smin/smax) <= math.Max(eps, (1.0/100)*tol) { + shift = 0 + } else { + var sl2 float64 + if idir == 1 { + sl2 = math.Abs(d[l2]) + shift, _ = impl.Dlas2(d[m-2], e[m-2], d[m-1]) + } else { + sl2 = math.Abs(d[m-1]) + shift, _ = impl.Dlas2(d[l2], e[l2], d[l2+1]) + } + // Test if shift is negligible + if sl2 > 0 { + if (shift/sl2)*(shift/sl2) < eps { + shift = 0 + } + } + } + iter += m - l2 + 1 + // If no shift, do simplified QR iteration. + if shift == 0 { + if idir == 1 { + cs := 1.0 + oldcs := 1.0 + var sn, r, oldsn float64 + for i := l2; i < m-1; i++ { + cs, sn, r = impl.Dlartg(d[i]*cs, e[i]) + if i > l2 { + e[i-1] = oldsn * r + } + oldcs, oldsn, d[i] = impl.Dlartg(oldcs*r, d[i+1]*sn) + work[i-l2] = cs + work[i-l2+nm1] = sn + work[i-l2+nm12] = oldcs + work[i-l2+nm13] = oldsn + } + h := d[m-1] * cs + d[m-1] = h * oldcs + e[m-2] = h * oldsn + if ncvt > 0 { + impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, m-l2, ncvt, work, work[n-1:], vt[l2*ldvt:], ldvt) + } + if nru > 0 { + impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward, nru, m-l2, work[nm12:], work[nm13:], u[l2:], ldu) + } + if ncc > 0 { + impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, m-l2, ncc, work[nm12:], work[nm13:], c[l2*ldc:], ldc) + } + if math.Abs(e[m-2]) < thresh { + e[m-2] = 0 + } + } else { + cs := 1.0 + oldcs := 1.0 + var sn, r, oldsn float64 + for i := m - 1; i >= l2+1; i-- { + cs, sn, r = impl.Dlartg(d[i]*cs, e[i-1]) + if i < m-1 { + e[i] = oldsn * r + } + oldcs, oldsn, d[i] = impl.Dlartg(oldcs*r, d[i-1]*sn) + work[i-l2-1] = cs + work[i-l2+nm1-1] = -sn + work[i-l2+nm12-1] = oldcs + work[i-l2+nm13-1] = -oldsn + } + h := d[l2] * cs + d[l2] = h * oldcs + e[l2] = h * oldsn + if ncvt > 0 { + impl.Dlasr(blas.Left, lapack.Variable, lapack.Backward, m-l2, ncvt, work[nm12:], work[nm13:], vt[l2*ldvt:], ldvt) + } + if nru > 0 { + impl.Dlasr(blas.Right, lapack.Variable, lapack.Backward, nru, m-l2, work, work[n-1:], u[l2:], ldu) + } + if ncc > 0 { + impl.Dlasr(blas.Left, lapack.Variable, lapack.Backward, m-l2, ncc, work, work[n-1:], c[l2*ldc:], ldc) + } + if math.Abs(e[l2]) <= thresh { + e[l2] = 0 + } + } + } else { + // Use nonzero shift. + if idir == 1 { + // Chase bulge from top to bottom. Save cosines and sines for + // later singular vector updates. 
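+ // f and g seed the first Givens rotation of the implicit-shift
+ // QR step; rotating g into f creates the bulge that the loop
+ // below chases down the band.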
+ f := (math.Abs(d[l2]) - shift) * (math.Copysign(1, d[l2]) + shift/d[l2]) + g := e[l2] + var cosl, sinl float64 + for i := l2; i < m-1; i++ { + cosr, sinr, r := impl.Dlartg(f, g) + if i > l2 { + e[i-1] = r + } + f = cosr*d[i] + sinr*e[i] + e[i] = cosr*e[i] - sinr*d[i] + g = sinr * d[i+1] + d[i+1] *= cosr + cosl, sinl, r = impl.Dlartg(f, g) + d[i] = r + f = cosl*e[i] + sinl*d[i+1] + d[i+1] = cosl*d[i+1] - sinl*e[i] + if i < m-2 { + g = sinl * e[i+1] + e[i+1] = cosl * e[i+1] + } + work[i-l2] = cosr + work[i-l2+nm1] = sinr + work[i-l2+nm12] = cosl + work[i-l2+nm13] = sinl + } + e[m-2] = f + if ncvt > 0 { + impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, m-l2, ncvt, work, work[n-1:], vt[l2*ldvt:], ldvt) + } + if nru > 0 { + impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward, nru, m-l2, work[nm12:], work[nm13:], u[l2:], ldu) + } + if ncc > 0 { + impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, m-l2, ncc, work[nm12:], work[nm13:], c[l2*ldc:], ldc) + } + if math.Abs(e[m-2]) <= thresh { + e[m-2] = 0 + } + } else { + // Chase bulge from top to bottom. Save cosines and sines for + // later singular vector updates. + f := (math.Abs(d[m-1]) - shift) * (math.Copysign(1, d[m-1]) + shift/d[m-1]) + g := e[m-2] + for i := m - 1; i > l2; i-- { + cosr, sinr, r := impl.Dlartg(f, g) + if i < m-1 { + e[i] = r + } + f = cosr*d[i] + sinr*e[i-1] + e[i-1] = cosr*e[i-1] - sinr*d[i] + g = sinr * d[i-1] + d[i-1] *= cosr + cosl, sinl, r := impl.Dlartg(f, g) + d[i] = r + f = cosl*e[i-1] + sinl*d[i-1] + d[i-1] = cosl*d[i-1] - sinl*e[i-1] + if i > l2+1 { + g = sinl * e[i-2] + e[i-2] *= cosl + } + work[i-l2-1] = cosr + work[i-l2+nm1-1] = -sinr + work[i-l2+nm12-1] = cosl + work[i-l2+nm13-1] = -sinl + } + e[l2] = f + if math.Abs(e[l2]) <= thresh { + e[l2] = 0 + } + if ncvt > 0 { + impl.Dlasr(blas.Left, lapack.Variable, lapack.Backward, m-l2, ncvt, work[nm12:], work[nm13:], vt[l2*ldvt:], ldvt) + } + if nru > 0 { + impl.Dlasr(blas.Right, lapack.Variable, lapack.Backward, nru, m-l2, work, work[n-1:], u[l2:], ldu) + } + if ncc > 0 { + impl.Dlasr(blas.Left, lapack.Variable, lapack.Backward, m-l2, ncc, work, work[n-1:], c[l2*ldc:], ldc) + } + } + } + } + } + + // All singular values converged, make them positive. + for i := 0; i < n; i++ { + if d[i] < 0 { + d[i] *= -1 + if ncvt > 0 { + bi.Dscal(ncvt, -1, vt[i*ldvt:], 1) + } + } + } + + // Sort the singular values in decreasing order. + for i := 0; i < n-1; i++ { + isub := 0 + smin := d[0] + for j := 1; j < n-i; j++ { + if d[j] <= smin { + isub = j + smin = d[j] + } + } + if isub != n-i { + // Swap singular values and vectors. + d[isub] = d[n-i-1] + d[n-i-1] = smin + if ncvt > 0 { + bi.Dswap(ncvt, vt[isub*ldvt:], 1, vt[(n-i-1)*ldvt:], 1) + } + if nru > 0 { + bi.Dswap(nru, u[isub:], ldu, u[n-i-1:], ldu) + } + if ncc > 0 { + bi.Dswap(ncc, c[isub*ldc:], 1, c[(n-i-1)*ldc:], 1) + } + } + } + info = 0 + for i := 0; i < n-1; i++ { + if e[i] != 0 { + info++ + } + } + return info == 0 +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgebak.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgebak.go new file mode 100644 index 0000000000..b6af972e6a --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgebak.go @@ -0,0 +1,91 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
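Dbdsqr's argument list is long, but the singular-values-only case collapses most of it. A minimal sketch, assuming the exported Implementation of lapack/gonum and illustrative data; with ncvt = nru = ncc = 0 the vt, u and c slices may be nil and their leading dimensions only need to be at least 1:

	package main

	import (
		"fmt"

		"gonum.org/v1/gonum/blas"
		lapackgonum "gonum.org/v1/gonum/lapack/gonum"
	)

	func main() {
		impl := lapackgonum.Implementation{}
		n := 3
		d := []float64{4, 3, 2} // diagonal of B
		e := []float64{1, 1}    // superdiagonal, uplo == blas.Upper
		// Documented minimum is 4*(n-1); a little extra does no harm.
		work := make([]float64, 4*n)
		ok := impl.Dbdsqr(blas.Upper, n, 0, 0, 0, d, e, nil, 1, nil, 1, nil, 1, work)
		fmt.Println(ok, d) // on success d holds the singular values, decreasing
	}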
+ +package gonum + +import ( + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" +) + +// Dgebak updates an n×m matrix V as +// +// V = P D V if side == lapack.EVRight, +// V = P D^{-1} V if side == lapack.EVLeft, +// +// where P and D are n×n permutation and scaling matrices, respectively, +// implicitly represented by job, scale, ilo and ihi as returned by Dgebal. +// +// Typically, columns of the matrix V contain the right or left (determined by +// side) eigenvectors of the balanced matrix output by Dgebal, and Dgebak forms +// the eigenvectors of the original matrix. +// +// Dgebak is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dgebak(job lapack.BalanceJob, side lapack.EVSide, n, ilo, ihi int, scale []float64, m int, v []float64, ldv int) { + switch { + case job != lapack.BalanceNone && job != lapack.Permute && job != lapack.Scale && job != lapack.PermuteScale: + panic(badBalanceJob) + case side != lapack.EVLeft && side != lapack.EVRight: + panic(badEVSide) + case n < 0: + panic(nLT0) + case ilo < 0 || max(0, n-1) < ilo: + panic(badIlo) + case ihi < min(ilo, n-1) || n <= ihi: + panic(badIhi) + case m < 0: + panic(mLT0) + case ldv < max(1, m): + panic(badLdV) + } + + // Quick return if possible. + if n == 0 || m == 0 { + return + } + + if len(scale) < n { + panic(shortScale) + } + if len(v) < (n-1)*ldv+m { + panic(shortV) + } + + // Quick return if possible. + if job == lapack.BalanceNone { + return + } + + bi := blas64.Implementation() + if ilo != ihi && job != lapack.Permute { + // Backward balance. + if side == lapack.EVRight { + for i := ilo; i <= ihi; i++ { + bi.Dscal(m, scale[i], v[i*ldv:], 1) + } + } else { + for i := ilo; i <= ihi; i++ { + bi.Dscal(m, 1/scale[i], v[i*ldv:], 1) + } + } + } + if job == lapack.Scale { + return + } + // Backward permutation. + for i := ilo - 1; i >= 0; i-- { + k := int(scale[i]) + if k == i { + continue + } + bi.Dswap(m, v[i*ldv:], 1, v[k*ldv:], 1) + } + for i := ihi + 1; i < n; i++ { + k := int(scale[i]) + if k == i { + continue + } + bi.Dswap(m, v[i*ldv:], 1, v[k*ldv:], 1) + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgebal.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgebal.go new file mode 100644 index 0000000000..7623e2faee --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgebal.go @@ -0,0 +1,248 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" +) + +// Dgebal balances an n×n matrix A. Balancing consists of two stages, permuting +// and scaling. Both steps are optional and depend on the value of job. +// +// Permuting consists of applying a permutation matrix P such that the matrix +// that results from Pᵀ*A*P takes the upper block triangular form +// +// [ T1 X Y ] +// Pᵀ A P = [ 0 B Z ], +// [ 0 0 T2 ] +// +// where T1 and T2 are upper triangular matrices and B contains at least one +// nonzero off-diagonal element in each row and column. The indices ilo and ihi +// mark the starting and ending columns of the submatrix B. The eigenvalues of A +// isolated in the first 0 to ilo-1 and last ihi+1 to n-1 elements on the +// diagonal can be read off without any roundoff error. +// +// Scaling consists of applying a diagonal similarity transformation D such that +// D^{-1}*B*D has the 1-norm of each row and its corresponding column nearly +// equal. 
The output matrix is +// +// [ T1 X*D Y ] +// [ 0 inv(D)*B*D inv(D)*Z ]. +// [ 0 0 T2 ] +// +// Scaling may reduce the 1-norm of the matrix, and improve the accuracy of +// the computed eigenvalues and/or eigenvectors. +// +// job specifies the operations that will be performed on A. +// If job is lapack.BalanceNone, Dgebal sets scale[i] = 1 for all i and returns ilo=0, ihi=n-1. +// If job is lapack.Permute, only permuting will be done. +// If job is lapack.Scale, only scaling will be done. +// If job is lapack.PermuteScale, both permuting and scaling will be done. +// +// On return, if job is lapack.Permute or lapack.PermuteScale, it will hold that +// +// A[i,j] == 0, for i > j and j ∈ {0, ..., ilo-1, ihi+1, ..., n-1}. +// +// If job is lapack.BalanceNone or lapack.Scale, or if n == 0, it will hold that +// +// ilo == 0 and ihi == n-1. +// +// On return, scale will contain information about the permutations and scaling +// factors applied to A. If π(j) denotes the index of the column interchanged +// with column j, and D[j,j] denotes the scaling factor applied to column j, +// then +// +// scale[j] == π(j), for j ∈ {0, ..., ilo-1, ihi+1, ..., n-1}, +// == D[j,j], for j ∈ {ilo, ..., ihi}. +// +// scale must have length equal to n, otherwise Dgebal will panic. +// +// Dgebal is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dgebal(job lapack.BalanceJob, n int, a []float64, lda int, scale []float64) (ilo, ihi int) { + switch { + case job != lapack.BalanceNone && job != lapack.Permute && job != lapack.Scale && job != lapack.PermuteScale: + panic(badBalanceJob) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + ilo = 0 + ihi = n - 1 + + if n == 0 { + return ilo, ihi + } + + if len(scale) != n { + panic(shortScale) + } + + if job == lapack.BalanceNone { + for i := range scale { + scale[i] = 1 + } + return ilo, ihi + } + + if len(a) < (n-1)*lda+n { + panic(shortA) + } + + bi := blas64.Implementation() + swapped := true + + if job == lapack.Scale { + goto scaling + } + + // Permutation to isolate eigenvalues if possible. + // + // Search for rows isolating an eigenvalue and push them down. + for swapped { + swapped = false + rows: + for i := ihi; i >= 0; i-- { + for j := 0; j <= ihi; j++ { + if i == j { + continue + } + if a[i*lda+j] != 0 { + continue rows + } + } + // Row i has only zero off-diagonal elements in the + // block A[ilo:ihi+1,ilo:ihi+1]. + scale[ihi] = float64(i) + if i != ihi { + bi.Dswap(ihi+1, a[i:], lda, a[ihi:], lda) + bi.Dswap(n, a[i*lda:], 1, a[ihi*lda:], 1) + } + if ihi == 0 { + scale[0] = 1 + return ilo, ihi + } + ihi-- + swapped = true + break + } + } + // Search for columns isolating an eigenvalue and push them left. + swapped = true + for swapped { + swapped = false + columns: + for j := ilo; j <= ihi; j++ { + for i := ilo; i <= ihi; i++ { + if i == j { + continue + } + if a[i*lda+j] != 0 { + continue columns + } + } + // Column j has only zero off-diagonal elements in the + // block A[ilo:ihi+1,ilo:ihi+1]. + scale[ilo] = float64(j) + if j != ilo { + bi.Dswap(ihi+1, a[j:], lda, a[ilo:], lda) + bi.Dswap(n-ilo, a[j*lda+ilo:], 1, a[ilo*lda+ilo:], 1) + } + swapped = true + ilo++ + break + } + } + +scaling: + for i := ilo; i <= ihi; i++ { + scale[i] = 1 + } + + if job == lapack.Permute { + return ilo, ihi + } + + // Balance the submatrix in rows ilo to ihi. + + const ( + // sclfac should be a power of 2 to avoid roundoff errors. 
+ // Elements of scale are restricted to powers of sclfac, + // therefore the matrix will be only nearly balanced. + sclfac = 2 + // factor determines the minimum reduction of the row and column + // norms that is considered non-negligible. It must be less than 1. + factor = 0.95 + ) + sfmin1 := dlamchS / dlamchP + sfmax1 := 1 / sfmin1 + sfmin2 := sfmin1 * sclfac + sfmax2 := 1 / sfmin2 + + // Iterative loop for norm reduction. + var conv bool + for !conv { + conv = true + for i := ilo; i <= ihi; i++ { + c := bi.Dnrm2(ihi-ilo+1, a[ilo*lda+i:], lda) + r := bi.Dnrm2(ihi-ilo+1, a[i*lda+ilo:], 1) + ica := bi.Idamax(ihi+1, a[i:], lda) + ca := math.Abs(a[ica*lda+i]) + ira := bi.Idamax(n-ilo, a[i*lda+ilo:], 1) + ra := math.Abs(a[i*lda+ilo+ira]) + + // Guard against zero c or r due to underflow. + if c == 0 || r == 0 { + continue + } + g := r / sclfac + f := 1.0 + s := c + r + for c < g && math.Max(f, math.Max(c, ca)) < sfmax2 && math.Min(r, math.Min(g, ra)) > sfmin2 { + if math.IsNaN(c + f + ca + r + g + ra) { + // Panic if NaN to avoid infinite loop. + panic("lapack: NaN") + } + f *= sclfac + c *= sclfac + ca *= sclfac + g /= sclfac + r /= sclfac + ra /= sclfac + } + g = c / sclfac + for r <= g && math.Max(r, ra) < sfmax2 && math.Min(math.Min(f, c), math.Min(g, ca)) > sfmin2 { + f /= sclfac + c /= sclfac + ca /= sclfac + g /= sclfac + r *= sclfac + ra *= sclfac + } + + if c+r >= factor*s { + // Reduction would be negligible. + continue + } + if f < 1 && scale[i] < 1 && f*scale[i] <= sfmin1 { + continue + } + if f > 1 && scale[i] > 1 && scale[i] >= sfmax1/f { + continue + } + + // Now balance. + scale[i] *= f + bi.Dscal(n-ilo, 1/f, a[i*lda+ilo:], 1) + bi.Dscal(ihi+1, f, a[i:], lda) + conv = false + } + } + return ilo, ihi +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgebd2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgebd2.go new file mode 100644 index 0000000000..4f323ec500 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgebd2.go @@ -0,0 +1,88 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "gonum.org/v1/gonum/blas" + +// Dgebd2 reduces an m×n matrix A to upper or lower bidiagonal form by an orthogonal +// transformation. +// +// Qᵀ * A * P = B +// +// if m >= n, B is upper diagonal, otherwise B is lower bidiagonal. +// d is the diagonal, len = min(m,n) +// e is the off-diagonal len = min(m,n)-1 +// +// Dgebd2 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dgebd2(m, n int, a []float64, lda int, d, e, tauQ, tauP, work []float64) { + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + // Quick return if possible. + minmn := min(m, n) + if minmn == 0 { + return + } + + switch { + case len(d) < minmn: + panic(shortD) + case len(e) < minmn-1: + panic(shortE) + case len(tauQ) < minmn: + panic(shortTauQ) + case len(tauP) < minmn: + panic(shortTauP) + case len(work) < max(m, n): + panic(shortWork) + } + + if m >= n { + for i := 0; i < n; i++ { + a[i*lda+i], tauQ[i] = impl.Dlarfg(m-i, a[i*lda+i], a[min(i+1, m-1)*lda+i:], lda) + d[i] = a[i*lda+i] + a[i*lda+i] = 1 + // Apply H_i to A[i:m, i+1:n] from the left. 
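+ // At this point the diagonal entry holds 1 so that Dlarf sees the
+ // reflector's implicit unit first element; the true value was saved
+ // in d[i] and is restored immediately after the application.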
+ if i < n-1 { + impl.Dlarf(blas.Left, m-i, n-i-1, a[i*lda+i:], lda, tauQ[i], a[i*lda+i+1:], lda, work) + } + a[i*lda+i] = d[i] + if i < n-1 { + a[i*lda+i+1], tauP[i] = impl.Dlarfg(n-i-1, a[i*lda+i+1], a[i*lda+min(i+2, n-1):], 1) + e[i] = a[i*lda+i+1] + a[i*lda+i+1] = 1 + impl.Dlarf(blas.Right, m-i-1, n-i-1, a[i*lda+i+1:], 1, tauP[i], a[(i+1)*lda+i+1:], lda, work) + a[i*lda+i+1] = e[i] + } else { + tauP[i] = 0 + } + } + return + } + for i := 0; i < m; i++ { + a[i*lda+i], tauP[i] = impl.Dlarfg(n-i, a[i*lda+i], a[i*lda+min(i+1, n-1):], 1) + d[i] = a[i*lda+i] + a[i*lda+i] = 1 + if i < m-1 { + impl.Dlarf(blas.Right, m-i-1, n-i, a[i*lda+i:], 1, tauP[i], a[(i+1)*lda+i:], lda, work) + } + a[i*lda+i] = d[i] + if i < m-1 { + a[(i+1)*lda+i], tauQ[i] = impl.Dlarfg(m-i-1, a[(i+1)*lda+i], a[min(i+2, m-1)*lda+i:], lda) + e[i] = a[(i+1)*lda+i] + a[(i+1)*lda+i] = 1 + impl.Dlarf(blas.Left, m-i-1, n-i-1, a[(i+1)*lda+i:], lda, tauQ[i], a[(i+1)*lda+i+1:], lda, work) + a[(i+1)*lda+i] = e[i] + } else { + tauQ[i] = 0 + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgebrd.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgebrd.go new file mode 100644 index 0000000000..6b6654ba6b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgebrd.go @@ -0,0 +1,169 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dgebrd reduces a general m×n matrix A to upper or lower bidiagonal form B by +// an orthogonal transformation: +// +// Qᵀ * A * P = B. +// +// The diagonal elements of B are stored in d and the off-diagonal elements are stored +// in e. These are additionally stored along the diagonal of A and the off-diagonal +// of A. If m >= n B is an upper-bidiagonal matrix, and if m < n B is a +// lower-bidiagonal matrix. +// +// The remaining elements of A store the data needed to construct Q and P. +// The matrices Q and P are products of elementary reflectors +// +// if m >= n, Q = H_0 * H_1 * ... * H_{n-1}, +// P = G_0 * G_1 * ... * G_{n-2}, +// if m < n, Q = H_0 * H_1 * ... * H_{m-2}, +// P = G_0 * G_1 * ... * G_{m-1}, +// +// where +// +// H_i = I - tauQ[i] * v_i * v_iᵀ, +// G_i = I - tauP[i] * u_i * u_iᵀ. +// +// As an example, on exit the entries of A when m = 6, and n = 5 +// +// [ d e u1 u1 u1] +// [v1 d e u2 u2] +// [v1 v2 d e u3] +// [v1 v2 v3 d e] +// [v1 v2 v3 v4 d] +// [v1 v2 v3 v4 v5] +// +// and when m = 5, n = 6 +// +// [ d u1 u1 u1 u1 u1] +// [ e d u2 u2 u2 u2] +// [v1 e d u3 u3 u3] +// [v1 v2 e d u4 u4] +// [v1 v2 v3 e d u5] +// +// d, tauQ, and tauP must all have length at least min(m,n), and e must have +// length min(m,n) - 1, unless lwork is -1 when there is no check except for +// work which must have a length of at least one. +// +// work is temporary storage, and lwork specifies the usable memory length. +// At minimum, lwork >= max(1,m,n) or be -1 and this function will panic otherwise. +// Dgebrd is blocked decomposition, but the block size is limited +// by the temporary space available. If lwork == -1, instead of performing Dgebrd, +// the optimal work length will be stored into work[0]. +// +// Dgebrd is an internal routine. It is exported for testing purposes. 
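The two-call workspace idiom described above (query with lwork == -1, then the real call) looks like this in practice. A sketch with illustrative sizes, assuming the exported Implementation of lapack/gonum:

	package main

	import (
		"fmt"

		lapackgonum "gonum.org/v1/gonum/lapack/gonum"
	)

	func main() {
		impl := lapackgonum.Implementation{}
		m, n := 6, 5
		a := make([]float64, m*n) // row-major with lda = n; fill with data
		d := make([]float64, 5)   // min(m,n)
		e := make([]float64, 4)   // min(m,n)-1
		tauQ := make([]float64, 5)
		tauP := make([]float64, 5)

		// Query: no reduction is performed, only the optimal
		// workspace length is stored in work[0].
		query := make([]float64, 1)
		impl.Dgebrd(m, n, a, n, d, e, tauQ, tauP, query, -1)

		work := make([]float64, int(query[0]))
		impl.Dgebrd(m, n, a, n, d, e, tauQ, tauP, work, len(work))
		fmt.Println(d, e) // diagonal and off-diagonal of B
	}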
+func (impl Implementation) Dgebrd(m, n int, a []float64, lda int, d, e, tauQ, tauP, work []float64, lwork int) { + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + case lwork < max(1, max(m, n)) && lwork != -1: + panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + } + + // Quick return if possible. + minmn := min(m, n) + if minmn == 0 { + work[0] = 1 + return + } + + nb := impl.Ilaenv(1, "DGEBRD", " ", m, n, -1, -1) + lwkopt := (m + n) * nb + if lwork == -1 { + work[0] = float64(lwkopt) + return + } + + switch { + case len(a) < (m-1)*lda+n: + panic(shortA) + case len(d) < minmn: + panic(shortD) + case len(e) < minmn-1: + panic(shortE) + case len(tauQ) < minmn: + panic(shortTauQ) + case len(tauP) < minmn: + panic(shortTauP) + } + + nx := minmn + ws := max(m, n) + if 1 < nb && nb < minmn { + // At least one blocked operation can be done. + // Get the crossover point nx. + nx = max(nb, impl.Ilaenv(3, "DGEBRD", " ", m, n, -1, -1)) + // Determine when to switch from blocked to unblocked code. + if nx < minmn { + // At least one blocked operation will be done. + ws = (m + n) * nb + if lwork < ws { + // Not enough work space for the optimal nb, + // consider using a smaller block size. + nbmin := impl.Ilaenv(2, "DGEBRD", " ", m, n, -1, -1) + if lwork >= (m+n)*nbmin { + // Enough work space for minimum block size. + nb = lwork / (m + n) + } else { + nb = minmn + nx = minmn + } + } + } + } + bi := blas64.Implementation() + ldworkx := nb + ldworky := nb + var i int + for i = 0; i < minmn-nx; i += nb { + // Reduce rows and columns i:i+nb to bidiagonal form and return + // the matrices X and Y which are needed to update the unreduced + // part of the matrix. + // X is stored in the first m rows of work, y in the next rows. + x := work[:m*ldworkx] + y := work[m*ldworkx:] + impl.Dlabrd(m-i, n-i, nb, a[i*lda+i:], lda, + d[i:], e[i:], tauQ[i:], tauP[i:], + x, ldworkx, y, ldworky) + + // Update the trailing submatrix A[i+nb:m,i+nb:n], using an update + // of the form A := A - V*Y**T - X*U**T + bi.Dgemm(blas.NoTrans, blas.Trans, m-i-nb, n-i-nb, nb, + -1, a[(i+nb)*lda+i:], lda, y[nb*ldworky:], ldworky, + 1, a[(i+nb)*lda+i+nb:], lda) + + bi.Dgemm(blas.NoTrans, blas.NoTrans, m-i-nb, n-i-nb, nb, + -1, x[nb*ldworkx:], ldworkx, a[i*lda+i+nb:], lda, + 1, a[(i+nb)*lda+i+nb:], lda) + + // Copy diagonal and off-diagonal elements of B back into A. + if m >= n { + for j := i; j < i+nb; j++ { + a[j*lda+j] = d[j] + a[j*lda+j+1] = e[j] + } + } else { + for j := i; j < i+nb; j++ { + a[j*lda+j] = d[j] + a[(j+1)*lda+j] = e[j] + } + } + } + // Use unblocked code to reduce the remainder of the matrix. + impl.Dgebd2(m-i, n-i, a[i*lda+i:], lda, d[i:], e[i:], tauQ[i:], tauP[i:], work) + work[0] = float64(ws) +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgecon.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgecon.go new file mode 100644 index 0000000000..1d04644142 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgecon.go @@ -0,0 +1,106 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" +) + +// Dgecon estimates and returns the reciprocal of the condition number of the +// n×n matrix A, in either the 1-norm or the ∞-norm, using the LU factorization +// computed by Dgetrf. 
+// +// An estimate is obtained for norm(A⁻¹), and the reciprocal of the condition +// number rcond is computed as +// +// rcond 1 / ( norm(A) * norm(A⁻¹) ). +// +// If n is zero, rcond is always 1. +// +// anorm is the 1-norm or the ∞-norm of the original matrix A. anorm must be +// non-negative, otherwise Dgecon will panic. If anorm is 0 or infinity, Dgecon +// returns 0. If anorm is NaN, Dgecon returns NaN. +// +// work must have length at least 4*n and iwork must have length at least n, +// otherwise Dgecon will panic. +func (impl Implementation) Dgecon(norm lapack.MatrixNorm, n int, a []float64, lda int, anorm float64, work []float64, iwork []int) float64 { + switch { + case norm != lapack.MaxColumnSum && norm != lapack.MaxRowSum: + panic(badNorm) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + case anorm < 0: + panic(negANorm) + } + + // Quick return if possible. + if n == 0 { + return 1 + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case len(work) < 4*n: + panic(shortWork) + case len(iwork) < n: + panic(shortIWork) + } + + // Quick return if possible. + switch { + case anorm == 0: + return 0 + case math.IsNaN(anorm): + // Propagate NaN. + return anorm + case math.IsInf(anorm, 1): + return 0 + } + + bi := blas64.Implementation() + var rcond, ainvnm float64 + var kase int + var normin bool + isave := new([3]int) + onenrm := norm == lapack.MaxColumnSum + smlnum := dlamchS + kase1 := 2 + if onenrm { + kase1 = 1 + } + for { + ainvnm, kase = impl.Dlacn2(n, work[n:], work, iwork, ainvnm, kase, isave) + if kase == 0 { + if ainvnm != 0 { + rcond = (1 / ainvnm) / anorm + } + return rcond + } + var sl, su float64 + if kase == kase1 { + sl = impl.Dlatrs(blas.Lower, blas.NoTrans, blas.Unit, normin, n, a, lda, work, work[2*n:]) + su = impl.Dlatrs(blas.Upper, blas.NoTrans, blas.NonUnit, normin, n, a, lda, work, work[3*n:]) + } else { + su = impl.Dlatrs(blas.Upper, blas.Trans, blas.NonUnit, normin, n, a, lda, work, work[3*n:]) + sl = impl.Dlatrs(blas.Lower, blas.Trans, blas.Unit, normin, n, a, lda, work, work[2*n:]) + } + scale := sl * su + normin = true + if scale != 1 { + ix := bi.Idamax(n, work, 1) + if scale == 0 || scale < math.Abs(work[ix])*smlnum { + return rcond + } + impl.Drscl(n, scale, work, 1) + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgeev.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeev.go new file mode 100644 index 0000000000..b49b66fc65 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeev.go @@ -0,0 +1,287 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" +) + +// Dgeev computes the eigenvalues and, optionally, the left and/or right +// eigenvectors for an n×n real nonsymmetric matrix A. +// +// The right eigenvector v_j of A corresponding to an eigenvalue λ_j +// is defined by +// +// A v_j = λ_j v_j, +// +// and the left eigenvector u_j corresponding to an eigenvalue λ_j is defined by +// +// u_jᴴ A = λ_j u_jᴴ, +// +// where u_jᴴ is the conjugate transpose of u_j. +// +// On return, A will be overwritten and the left and right eigenvectors will be +// stored, respectively, in the columns of the n×n matrices VL and VR in the +// same order as their eigenvalues. 
If the j-th eigenvalue is real, then +// +// u_j = VL[:,j], +// v_j = VR[:,j], +// +// and if it is not real, then j and j+1 form a complex conjugate pair and the +// eigenvectors can be recovered as +// +// u_j = VL[:,j] + i*VL[:,j+1], +// u_{j+1} = VL[:,j] - i*VL[:,j+1], +// v_j = VR[:,j] + i*VR[:,j+1], +// v_{j+1} = VR[:,j] - i*VR[:,j+1], +// +// where i is the imaginary unit. The computed eigenvectors are normalized to +// have Euclidean norm equal to 1 and largest component real. +// +// Left eigenvectors will be computed only if jobvl == lapack.LeftEVCompute, +// otherwise jobvl must be lapack.LeftEVNone. +// Right eigenvectors will be computed only if jobvr == lapack.RightEVCompute, +// otherwise jobvr must be lapack.RightEVNone. +// For other values of jobvl and jobvr Dgeev will panic. +// +// wr and wi contain the real and imaginary parts, respectively, of the computed +// eigenvalues. Complex conjugate pairs of eigenvalues appear consecutively with +// the eigenvalue having the positive imaginary part first. +// wr and wi must have length n, and Dgeev will panic otherwise. +// +// work must have length at least lwork and lwork must be at least max(1,4*n) if +// the left or right eigenvectors are computed, and at least max(1,3*n) if no +// eigenvectors are computed. For good performance, lwork must generally be +// larger. On return, optimal value of lwork will be stored in work[0]. +// +// If lwork == -1, instead of performing Dgeev, the function only calculates the +// optimal value of lwork and stores it into work[0]. +// +// On return, first is the index of the first valid eigenvalue. If first == 0, +// all eigenvalues and eigenvectors have been computed. If first is positive, +// Dgeev failed to compute all the eigenvalues, no eigenvectors have been +// computed and wr[first:] and wi[first:] contain those eigenvalues which have +// converged. +func (impl Implementation) Dgeev(jobvl lapack.LeftEVJob, jobvr lapack.RightEVJob, n int, a []float64, lda int, wr, wi []float64, vl []float64, ldvl int, vr []float64, ldvr int, work []float64, lwork int) (first int) { + wantvl := jobvl == lapack.LeftEVCompute + wantvr := jobvr == lapack.RightEVCompute + var minwrk int + if wantvl || wantvr { + minwrk = max(1, 4*n) + } else { + minwrk = max(1, 3*n) + } + switch { + case jobvl != lapack.LeftEVCompute && jobvl != lapack.LeftEVNone: + panic(badLeftEVJob) + case jobvr != lapack.RightEVCompute && jobvr != lapack.RightEVNone: + panic(badRightEVJob) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + case ldvl < 1 || (ldvl < n && wantvl): + panic(badLdVL) + case ldvr < 1 || (ldvr < n && wantvr): + panic(badLdVR) + case lwork < minwrk && lwork != -1: + panic(badLWork) + case len(work) < lwork: + panic(shortWork) + } + + // Quick return if possible. 
+ if n == 0 { + work[0] = 1 + return 0 + } + + maxwrk := 2*n + n*impl.Ilaenv(1, "DGEHRD", " ", n, 1, n, 0) + if wantvl || wantvr { + maxwrk = max(maxwrk, 2*n+(n-1)*impl.Ilaenv(1, "DORGHR", " ", n, 1, n, -1)) + impl.Dhseqr(lapack.EigenvaluesAndSchur, lapack.SchurOrig, n, 0, n-1, + a, lda, wr, wi, nil, n, work, -1) + maxwrk = max(maxwrk, max(n+1, n+int(work[0]))) + side := lapack.EVLeft + if wantvr { + side = lapack.EVRight + } + impl.Dtrevc3(side, lapack.EVAllMulQ, nil, n, a, lda, vl, ldvl, vr, ldvr, + n, work, -1) + maxwrk = max(maxwrk, n+int(work[0])) + maxwrk = max(maxwrk, 4*n) + } else { + impl.Dhseqr(lapack.EigenvaluesOnly, lapack.SchurNone, n, 0, n-1, + a, lda, wr, wi, vr, ldvr, work, -1) + maxwrk = max(maxwrk, max(n+1, n+int(work[0]))) + } + maxwrk = max(maxwrk, minwrk) + + if lwork == -1 { + work[0] = float64(maxwrk) + return 0 + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case len(wr) != n: + panic(badLenWr) + case len(wi) != n: + panic(badLenWi) + case len(vl) < (n-1)*ldvl+n && wantvl: + panic(shortVL) + case len(vr) < (n-1)*ldvr+n && wantvr: + panic(shortVR) + } + + // Get machine constants. + smlnum := math.Sqrt(dlamchS) / dlamchP + bignum := 1 / smlnum + + // Scale A if max element outside range [smlnum,bignum]. + anrm := impl.Dlange(lapack.MaxAbs, n, n, a, lda, nil) + var scalea bool + var cscale float64 + if 0 < anrm && anrm < smlnum { + scalea = true + cscale = smlnum + } else if anrm > bignum { + scalea = true + cscale = bignum + } + if scalea { + impl.Dlascl(lapack.General, 0, 0, anrm, cscale, n, n, a, lda) + } + + // Balance the matrix. + workbal := work[:n] + ilo, ihi := impl.Dgebal(lapack.PermuteScale, n, a, lda, workbal) + + // Reduce to upper Hessenberg form. + iwrk := 2 * n + tau := work[n : iwrk-1] + impl.Dgehrd(n, ilo, ihi, a, lda, tau, work[iwrk:], lwork-iwrk) + + var side lapack.EVSide + if wantvl { + side = lapack.EVLeft + // Copy Householder vectors to VL. + impl.Dlacpy(blas.Lower, n, n, a, lda, vl, ldvl) + // Generate orthogonal matrix in VL. + impl.Dorghr(n, ilo, ihi, vl, ldvl, tau, work[iwrk:], lwork-iwrk) + // Perform QR iteration, accumulating Schur vectors in VL. + iwrk = n + first = impl.Dhseqr(lapack.EigenvaluesAndSchur, lapack.SchurOrig, n, ilo, ihi, + a, lda, wr, wi, vl, ldvl, work[iwrk:], lwork-iwrk) + if wantvr { + // Want left and right eigenvectors. + // Copy Schur vectors to VR. + side = lapack.EVBoth + impl.Dlacpy(blas.All, n, n, vl, ldvl, vr, ldvr) + } + } else if wantvr { + side = lapack.EVRight + // Copy Householder vectors to VR. + impl.Dlacpy(blas.Lower, n, n, a, lda, vr, ldvr) + // Generate orthogonal matrix in VR. + impl.Dorghr(n, ilo, ihi, vr, ldvr, tau, work[iwrk:], lwork-iwrk) + // Perform QR iteration, accumulating Schur vectors in VR. + iwrk = n + first = impl.Dhseqr(lapack.EigenvaluesAndSchur, lapack.SchurOrig, n, ilo, ihi, + a, lda, wr, wi, vr, ldvr, work[iwrk:], lwork-iwrk) + } else { + // Compute eigenvalues only. + iwrk = n + first = impl.Dhseqr(lapack.EigenvaluesOnly, lapack.SchurNone, n, ilo, ihi, + a, lda, wr, wi, nil, 1, work[iwrk:], lwork-iwrk) + } + + if first > 0 { + if scalea { + // Undo scaling. + impl.Dlascl(lapack.General, 0, 0, cscale, anrm, n-first, 1, wr[first:], 1) + impl.Dlascl(lapack.General, 0, 0, cscale, anrm, n-first, 1, wi[first:], 1) + impl.Dlascl(lapack.General, 0, 0, cscale, anrm, ilo, 1, wr, 1) + impl.Dlascl(lapack.General, 0, 0, cscale, anrm, ilo, 1, wi, 1) + } + work[0] = float64(maxwrk) + return first + } + + if wantvl || wantvr { + // Compute left and/or right eigenvectors. 
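+ // Dtrevc3 overwrites the Schur vectors stored in VL/VR with the
+ // eigenvectors of the balanced matrix; the Dgebak calls below then
+ // undo the balancing so the vectors correspond to the original A.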
+ impl.Dtrevc3(side, lapack.EVAllMulQ, nil, n, + a, lda, vl, ldvl, vr, ldvr, n, work[iwrk:], lwork-iwrk) + } + bi := blas64.Implementation() + if wantvl { + // Undo balancing of left eigenvectors. + impl.Dgebak(lapack.PermuteScale, lapack.EVLeft, n, ilo, ihi, workbal, n, vl, ldvl) + // Normalize left eigenvectors and make largest component real. + for i, wii := range wi { + if wii < 0 { + continue + } + if wii == 0 { + scl := 1 / bi.Dnrm2(n, vl[i:], ldvl) + bi.Dscal(n, scl, vl[i:], ldvl) + continue + } + scl := 1 / impl.Dlapy2(bi.Dnrm2(n, vl[i:], ldvl), bi.Dnrm2(n, vl[i+1:], ldvl)) + bi.Dscal(n, scl, vl[i:], ldvl) + bi.Dscal(n, scl, vl[i+1:], ldvl) + for k := 0; k < n; k++ { + vi := vl[k*ldvl+i] + vi1 := vl[k*ldvl+i+1] + work[iwrk+k] = vi*vi + vi1*vi1 + } + k := bi.Idamax(n, work[iwrk:iwrk+n], 1) + cs, sn, _ := impl.Dlartg(vl[k*ldvl+i], vl[k*ldvl+i+1]) + bi.Drot(n, vl[i:], ldvl, vl[i+1:], ldvl, cs, sn) + vl[k*ldvl+i+1] = 0 + } + } + if wantvr { + // Undo balancing of right eigenvectors. + impl.Dgebak(lapack.PermuteScale, lapack.EVRight, n, ilo, ihi, workbal, n, vr, ldvr) + // Normalize right eigenvectors and make largest component real. + for i, wii := range wi { + if wii < 0 { + continue + } + if wii == 0 { + scl := 1 / bi.Dnrm2(n, vr[i:], ldvr) + bi.Dscal(n, scl, vr[i:], ldvr) + continue + } + scl := 1 / impl.Dlapy2(bi.Dnrm2(n, vr[i:], ldvr), bi.Dnrm2(n, vr[i+1:], ldvr)) + bi.Dscal(n, scl, vr[i:], ldvr) + bi.Dscal(n, scl, vr[i+1:], ldvr) + for k := 0; k < n; k++ { + vi := vr[k*ldvr+i] + vi1 := vr[k*ldvr+i+1] + work[iwrk+k] = vi*vi + vi1*vi1 + } + k := bi.Idamax(n, work[iwrk:iwrk+n], 1) + cs, sn, _ := impl.Dlartg(vr[k*ldvr+i], vr[k*ldvr+i+1]) + bi.Drot(n, vr[i:], ldvr, vr[i+1:], ldvr, cs, sn) + vr[k*ldvr+i+1] = 0 + } + } + + if scalea { + // Undo scaling. + impl.Dlascl(lapack.General, 0, 0, cscale, anrm, n-first, 1, wr[first:], 1) + impl.Dlascl(lapack.General, 0, 0, cscale, anrm, n-first, 1, wi[first:], 1) + } + + work[0] = float64(maxwrk) + return first +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgehd2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgehd2.go new file mode 100644 index 0000000000..64b0cb4028 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgehd2.go @@ -0,0 +1,105 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "gonum.org/v1/gonum/blas" + +// Dgehd2 reduces a block of a general n×n matrix A to upper Hessenberg form H +// by an orthogonal similarity transformation Qᵀ * A * Q = H. +// +// The matrix Q is represented as a product of (ihi-ilo) elementary +// reflectors +// +// Q = H_{ilo} H_{ilo+1} ... H_{ihi-1}. +// +// Each H_i has the form +// +// H_i = I - tau[i] * v * vᵀ +// +// where v is a real vector with v[0:i+1] = 0, v[i+1] = 1 and v[ihi+1:n] = 0. +// v[i+2:ihi+1] is stored on exit in A[i+2:ihi+1,i]. +// +// On entry, a contains the n×n general matrix to be reduced. On return, the +// upper triangle and the first subdiagonal of A are overwritten with the upper +// Hessenberg matrix H, and the elements below the first subdiagonal, with the +// slice tau, represent the orthogonal matrix Q as a product of elementary +// reflectors. +// +// The contents of A are illustrated by the following example, with n = 7, ilo = +// 1 and ihi = 5. 
+// On entry, +// +// [ a a a a a a a ] +// [ a a a a a a ] +// [ a a a a a a ] +// [ a a a a a a ] +// [ a a a a a a ] +// [ a a a a a a ] +// [ a ] +// +// on return, +// +// [ a a h h h h a ] +// [ a h h h h a ] +// [ h h h h h h ] +// [ v1 h h h h h ] +// [ v1 v2 h h h h ] +// [ v1 v2 v3 h h h ] +// [ a ] +// +// where a denotes an element of the original matrix A, h denotes a +// modified element of the upper Hessenberg matrix H, and vi denotes an +// element of the vector defining H_i. +// +// ilo and ihi determine the block of A that will be reduced to upper Hessenberg +// form. It must hold that 0 <= ilo <= ihi <= max(0, n-1), otherwise Dgehd2 will +// panic. +// +// On return, tau will contain the scalar factors of the elementary reflectors. +// It must have length equal to n-1, otherwise Dgehd2 will panic. +// +// work must have length at least n, otherwise Dgehd2 will panic. +// +// Dgehd2 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dgehd2(n, ilo, ihi int, a []float64, lda int, tau, work []float64) { + switch { + case n < 0: + panic(nLT0) + case ilo < 0 || max(0, n-1) < ilo: + panic(badIlo) + case ihi < min(ilo, n-1) || n <= ihi: + panic(badIhi) + case lda < max(1, n): + panic(badLdA) + } + + // Quick return if possible. + if n == 0 { + return + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case len(tau) != n-1: + panic(badLenTau) + case len(work) < n: + panic(shortWork) + } + + for i := ilo; i < ihi; i++ { + // Compute elementary reflector H_i to annihilate A[i+2:ihi+1,i]. + var aii float64 + aii, tau[i] = impl.Dlarfg(ihi-i, a[(i+1)*lda+i], a[min(i+2, n-1)*lda+i:], lda) + a[(i+1)*lda+i] = 1 + + // Apply H_i to A[0:ihi+1,i+1:ihi+1] from the right. + impl.Dlarf(blas.Right, ihi+1, ihi-i, a[(i+1)*lda+i:], lda, tau[i], a[i+1:], lda, work) + + // Apply H_i to A[i+1:ihi+1,i+1:n] from the left. + impl.Dlarf(blas.Left, ihi-i, n-i-1, a[(i+1)*lda+i:], lda, tau[i], a[(i+1)*lda+i+1:], lda, work) + a[(i+1)*lda+i] = aii + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgehrd.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgehrd.go new file mode 100644 index 0000000000..ae1533029d --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgehrd.go @@ -0,0 +1,202 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" +) + +// Dgehrd reduces a block of a real n×n general matrix A to upper Hessenberg +// form H by an orthogonal similarity transformation Qᵀ * A * Q = H. +// +// The matrix Q is represented as a product of (ihi-ilo) elementary +// reflectors +// +// Q = H_{ilo} H_{ilo+1} ... H_{ihi-1}. +// +// Each H_i has the form +// +// H_i = I - tau[i] * v * vᵀ +// +// where v is a real vector with v[0:i+1] = 0, v[i+1] = 1 and v[ihi+1:n] = 0. +// v[i+2:ihi+1] is stored on exit in A[i+2:ihi+1,i]. +// +// On entry, a contains the n×n general matrix to be reduced. On return, the +// upper triangle and the first subdiagonal of A will be overwritten with the +// upper Hessenberg matrix H, and the elements below the first subdiagonal, with +// the slice tau, represent the orthogonal matrix Q as a product of elementary +// reflectors. +// +// The contents of a are illustrated by the following example, with n = 7, ilo = +// 1 and ihi = 5. 
+// On entry, +// +// [ a a a a a a a ] +// [ a a a a a a ] +// [ a a a a a a ] +// [ a a a a a a ] +// [ a a a a a a ] +// [ a a a a a a ] +// [ a ] +// +// on return, +// +// [ a a h h h h a ] +// [ a h h h h a ] +// [ h h h h h h ] +// [ v1 h h h h h ] +// [ v1 v2 h h h h ] +// [ v1 v2 v3 h h h ] +// [ a ] +// +// where a denotes an element of the original matrix A, h denotes a +// modified element of the upper Hessenberg matrix H, and vi denotes an +// element of the vector defining H_i. +// +// ilo and ihi determine the block of A that will be reduced to upper Hessenberg +// form. It must hold that 0 <= ilo <= ihi < n if n > 0, and ilo == 0 and ihi == +// -1 if n == 0, otherwise Dgehrd will panic. +// +// On return, tau will contain the scalar factors of the elementary reflectors. +// Elements tau[:ilo] and tau[ihi:] will be set to zero. tau must have length +// equal to n-1 if n > 0, otherwise Dgehrd will panic. +// +// work must have length at least lwork and lwork must be at least max(1,n), +// otherwise Dgehrd will panic. On return, work[0] contains the optimal value of +// lwork. +// +// If lwork == -1, instead of performing Dgehrd, only the optimal value of lwork +// will be stored in work[0]. +// +// Dgehrd is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dgehrd(n, ilo, ihi int, a []float64, lda int, tau, work []float64, lwork int) { + switch { + case n < 0: + panic(nLT0) + case ilo < 0 || max(0, n-1) < ilo: + panic(badIlo) + case ihi < min(ilo, n-1) || n <= ihi: + panic(badIhi) + case lda < max(1, n): + panic(badLdA) + case lwork < max(1, n) && lwork != -1: + panic(badLWork) + case len(work) < lwork: + panic(shortWork) + } + + // Quick return if possible. + if n == 0 { + work[0] = 1 + return + } + + const ( + nbmax = 64 + ldt = nbmax + 1 + tsize = ldt * nbmax + ) + // Compute the workspace requirements. + nb := min(nbmax, impl.Ilaenv(1, "DGEHRD", " ", n, ilo, ihi, -1)) + lwkopt := n*nb + tsize + if lwork == -1 { + work[0] = float64(lwkopt) + return + } + + if len(a) < (n-1)*lda+n { + panic(shortA) + } + if len(tau) != n-1 { + panic(badLenTau) + } + + // Set tau[:ilo] and tau[ihi:] to zero. + for i := 0; i < ilo; i++ { + tau[i] = 0 + } + for i := ihi; i < n-1; i++ { + tau[i] = 0 + } + + // Quick return if possible. + nh := ihi - ilo + 1 + if nh <= 1 { + work[0] = 1 + return + } + + // Determine the block size. + nbmin := 2 + var nx int + if 1 < nb && nb < nh { + // Determine when to cross over from blocked to unblocked code + // (last block is always handled by unblocked code). + nx = max(nb, impl.Ilaenv(3, "DGEHRD", " ", n, ilo, ihi, -1)) + if nx < nh { + // Determine if workspace is large enough for blocked code. + if lwork < n*nb+tsize { + // Not enough workspace to use optimal nb: + // determine the minimum value of nb, and reduce + // nb or force use of unblocked code. + nbmin = max(2, impl.Ilaenv(2, "DGEHRD", " ", n, ilo, ihi, -1)) + if lwork >= n*nbmin+tsize { + nb = (lwork - tsize) / n + } else { + nb = 1 + } + } + } + } + ldwork := nb // work is used as an n×nb matrix. + + var i int + if nb < nbmin || nh <= nb { + // Use unblocked code below. + i = ilo + } else { + // Use blocked code. + bi := blas64.Implementation() + iwt := n * nb // Size of the matrix Y and index where the matrix T starts in work. 
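+ // work is partitioned as [ Y | T ]: the first n*nb elements hold
+ // the n×nb matrix Y = A*V*T computed by Dlahr2, and the tsize
+ // elements starting at iwt hold the triangular factor T.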
+ for i = ilo; i < ihi-nx; i += nb { + ib := min(nb, ihi-i) + + // Reduce columns [i:i+ib] to Hessenberg form, returning the + // matrices V and T of the block reflector H = I - V*T*Vᵀ + // which performs the reduction, and also the matrix Y = A*V*T. + impl.Dlahr2(ihi+1, i+1, ib, a[i:], lda, tau[i:], work[iwt:], ldt, work, ldwork) + + // Apply the block reflector H to A[:ihi+1,i+ib:ihi+1] from the + // right, computing A := A - Y * Vᵀ. V[i+ib,i+ib-1] must be set + // to 1. + ei := a[(i+ib)*lda+i+ib-1] + a[(i+ib)*lda+i+ib-1] = 1 + bi.Dgemm(blas.NoTrans, blas.Trans, ihi+1, ihi-i-ib+1, ib, + -1, work, ldwork, + a[(i+ib)*lda+i:], lda, + 1, a[i+ib:], lda) + a[(i+ib)*lda+i+ib-1] = ei + + // Apply the block reflector H to A[0:i+1,i+1:i+ib-1] from the + // right. + bi.Dtrmm(blas.Right, blas.Lower, blas.Trans, blas.Unit, i+1, ib-1, + 1, a[(i+1)*lda+i:], lda, work, ldwork) + for j := 0; j <= ib-2; j++ { + bi.Daxpy(i+1, -1, work[j:], ldwork, a[i+j+1:], lda) + } + + // Apply the block reflector H to A[i+1:ihi+1,i+ib:n] from the + // left. + impl.Dlarfb(blas.Left, blas.Trans, lapack.Forward, lapack.ColumnWise, + ihi-i, n-i-ib, ib, + a[(i+1)*lda+i:], lda, work[iwt:], ldt, a[(i+1)*lda+i+ib:], lda, work, ldwork) + } + } + // Use unblocked code to reduce the rest of the matrix. + impl.Dgehd2(n, i, ihi, a, lda, tau, work) + work[0] = float64(lwkopt) +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgelq2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgelq2.go new file mode 100644 index 0000000000..abc96f7d2a --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgelq2.go @@ -0,0 +1,65 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "gonum.org/v1/gonum/blas" + +// Dgelq2 computes the LQ factorization of the m×n matrix A. +// +// In an LQ factorization, L is a lower triangular m×n matrix, and Q is an n×n +// orthonormal matrix. +// +// a is modified to contain the information to construct L and Q. +// The lower triangle of a contains the matrix L. The upper triangular elements +// (not including the diagonal) contain the elementary reflectors. tau is modified +// to contain the reflector scales. tau must have length of at least k = min(m,n) +// and this function will panic otherwise. +// +// See Dgeqr2 for a description of the elementary reflectors and orthonormal +// matrix Q. Q is constructed as a product of these elementary reflectors, +// Q = H_{k-1} * ... * H_1 * H_0. +// +// work is temporary storage of length at least m and this function will panic otherwise. +// +// Dgelq2 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dgelq2(m, n int, a []float64, lda int, tau, work []float64) { + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + // Quick return if possible. 
+ k := min(m, n) + if k == 0 { + return + } + + switch { + case len(a) < (m-1)*lda+n: + panic(shortA) + case len(tau) < k: + panic(shortTau) + case len(work) < m: + panic(shortWork) + } + + for i := 0; i < k; i++ { + a[i*lda+i], tau[i] = impl.Dlarfg(n-i, a[i*lda+i], a[i*lda+min(i+1, n-1):], 1) + if i < m-1 { + aii := a[i*lda+i] + a[i*lda+i] = 1 + impl.Dlarf(blas.Right, m-i-1, n-i, + a[i*lda+i:], 1, + tau[i], + a[(i+1)*lda+i:], lda, + work) + a[i*lda+i] = aii + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgelqf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgelqf.go new file mode 100644 index 0000000000..f1fd13a019 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgelqf.go @@ -0,0 +1,97 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/lapack" +) + +// Dgelqf computes the LQ factorization of the m×n matrix A using a blocked +// algorithm. See the documentation for Dgelq2 for a description of the +// parameters at entry and exit. +// +// work is temporary storage, and lwork specifies the usable memory length. +// At minimum, lwork >= m, and this function will panic otherwise. +// Dgelqf is a blocked LQ factorization, but the block size is limited +// by the temporary space available. If lwork == -1, instead of performing Dgelqf, +// the optimal work length will be stored into work[0]. +// +// tau must have length at least min(m,n), and this function will panic otherwise. +func (impl Implementation) Dgelqf(m, n int, a []float64, lda int, tau, work []float64, lwork int) { + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + case lwork < max(1, m) && lwork != -1: + panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + } + + k := min(m, n) + if k == 0 { + work[0] = 1 + return + } + + nb := impl.Ilaenv(1, "DGELQF", " ", m, n, -1, -1) + if lwork == -1 { + work[0] = float64(m * nb) + return + } + + if len(a) < (m-1)*lda+n { + panic(shortA) + } + if len(tau) < k { + panic(shortTau) + } + + // Find the optimal blocking size based on the size of available memory + // and optimal machine parameters. + nbmin := 2 + var nx int + iws := m + if 1 < nb && nb < k { + nx = max(0, impl.Ilaenv(3, "DGELQF", " ", m, n, -1, -1)) + if nx < k { + iws = m * nb + if lwork < iws { + nb = lwork / m + nbmin = max(2, impl.Ilaenv(2, "DGELQF", " ", m, n, -1, -1)) + } + } + } + ldwork := nb + // Computed blocked LQ factorization. + var i int + if nbmin <= nb && nb < k && nx < k { + for i = 0; i < k-nx; i += nb { + ib := min(k-i, nb) + impl.Dgelq2(ib, n-i, a[i*lda+i:], lda, tau[i:], work) + if i+ib < m { + impl.Dlarft(lapack.Forward, lapack.RowWise, n-i, ib, + a[i*lda+i:], lda, + tau[i:], + work, ldwork) + impl.Dlarfb(blas.Right, blas.NoTrans, lapack.Forward, lapack.RowWise, + m-i-ib, n-i, ib, + a[i*lda+i:], lda, + work, ldwork, + a[(i+ib)*lda+i:], lda, + work[ib*ldwork:], ldwork) + } + } + } + // Perform unblocked LQ factorization on the remainder. + if i < k { + impl.Dgelq2(m-i, n-i, a[i*lda+i:], lda, tau[i:], work) + } + work[0] = float64(iws) +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgels.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgels.go new file mode 100644 index 0000000000..3018973a9e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgels.go @@ -0,0 +1,220 @@ +// Copyright ©2015 The Gonum Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/lapack" +) + +// Dgels finds a minimum-norm solution based on the matrices A and B using the +// QR or LQ factorization. Dgels returns false if the matrix +// A is singular, and true if this solution was successfully found. +// +// The minimization problem solved depends on the input parameters. +// +// 1. If m >= n and trans == blas.NoTrans, Dgels finds X such that || A*X - B||_2 +// is minimized. +// 2. If m < n and trans == blas.NoTrans, Dgels finds the minimum norm solution of +// A * X = B. +// 3. If m >= n and trans == blas.Trans, Dgels finds the minimum norm solution of +// Aᵀ * X = B. +// 4. If m < n and trans == blas.Trans, Dgels finds X such that || A*X - B||_2 +// is minimized. +// +// Note that the least-squares solutions (cases 1 and 3) perform the minimization +// per column of B. This is not the same as finding the minimum-norm matrix. +// +// The matrix A is a general matrix of size m×n and is modified during this call. +// The input matrix B is of size max(m,n)×nrhs, and serves two purposes. On entry, +// the elements of b specify the input matrix B. B has size m×nrhs if +// trans == blas.NoTrans, and n×nrhs if trans == blas.Trans. On exit, the +// leading submatrix of b contains the solution vectors X. If trans == blas.NoTrans, +// this submatrix is of size n×nrhs, and of size m×nrhs otherwise. +// +// work is temporary storage, and lwork specifies the usable memory length. +// At minimum, lwork >= max(m,n) + max(m,n,nrhs), and this function will panic +// otherwise. A longer work will enable blocked algorithms to be called. +// In the special case that lwork == -1, work[0] will be set to the optimal working +// length. +func (impl Implementation) Dgels(trans blas.Transpose, m, n, nrhs int, a []float64, lda int, b []float64, ldb int, work []float64, lwork int) bool { + mn := min(m, n) + minwrk := mn + max(mn, nrhs) + switch { + case trans != blas.NoTrans && trans != blas.Trans && trans != blas.ConjTrans: + panic(badTrans) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case nrhs < 0: + panic(nrhsLT0) + case lda < max(1, n): + panic(badLdA) + case ldb < max(1, nrhs): + panic(badLdB) + case lwork < max(1, minwrk) && lwork != -1: + panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + } + + // Quick return if possible. + if mn == 0 || nrhs == 0 { + impl.Dlaset(blas.All, max(m, n), nrhs, 0, 0, b, ldb) + work[0] = 1 + return true + } + + // Find optimal block size. + var nb int + if m >= n { + nb = impl.Ilaenv(1, "DGEQRF", " ", m, n, -1, -1) + if trans != blas.NoTrans { + nb = max(nb, impl.Ilaenv(1, "DORMQR", "LN", m, nrhs, n, -1)) + } else { + nb = max(nb, impl.Ilaenv(1, "DORMQR", "LT", m, nrhs, n, -1)) + } + } else { + nb = impl.Ilaenv(1, "DGELQF", " ", m, n, -1, -1) + if trans != blas.NoTrans { + nb = max(nb, impl.Ilaenv(1, "DORMLQ", "LT", n, nrhs, m, -1)) + } else { + nb = max(nb, impl.Ilaenv(1, "DORMLQ", "LN", n, nrhs, m, -1)) + } + } + wsize := max(1, mn+max(mn, nrhs)*nb) + work[0] = float64(wsize) + + if lwork == -1 { + return true + } + + switch { + case len(a) < (m-1)*lda+n: + panic(shortA) + case len(b) < (max(m, n)-1)*ldb+nrhs: + panic(shortB) + } + + // Scale the input matrices if they contain extreme values. 
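+ // Entries far outside [smlnum, bignum] are scaled into range to
+ // avoid overflow and damaging underflow during the factorization;
+ // iascl and ibscl record what was done so that the solution can be
+ // scaled back at the end.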
+	smlnum := dlamchS / dlamchP
+	bignum := 1 / smlnum
+	anrm := impl.Dlange(lapack.MaxAbs, m, n, a, lda, nil)
+	var iascl int
+	if anrm > 0 && anrm < smlnum {
+		impl.Dlascl(lapack.General, 0, 0, anrm, smlnum, m, n, a, lda)
+		iascl = 1
+	} else if anrm > bignum {
+		impl.Dlascl(lapack.General, 0, 0, anrm, bignum, m, n, a, lda)
+		iascl = 2
+	} else if anrm == 0 {
+		// Matrix is all zeros.
+		impl.Dlaset(blas.All, max(m, n), nrhs, 0, 0, b, ldb)
+		return true
+	}
+	brow := m
+	if trans != blas.NoTrans {
+		brow = n
+	}
+	bnrm := impl.Dlange(lapack.MaxAbs, brow, nrhs, b, ldb, nil)
+	ibscl := 0
+	if bnrm > 0 && bnrm < smlnum {
+		impl.Dlascl(lapack.General, 0, 0, bnrm, smlnum, brow, nrhs, b, ldb)
+		ibscl = 1
+	} else if bnrm > bignum {
+		impl.Dlascl(lapack.General, 0, 0, bnrm, bignum, brow, nrhs, b, ldb)
+		ibscl = 2
+	}
+
+	// Solve the minimization problem using a QR or an LQ decomposition.
+	var scllen int
+	if m >= n {
+		impl.Dgeqrf(m, n, a, lda, work[:n], work[mn:], lwork-mn)
+		if trans == blas.NoTrans {
+			impl.Dormqr(blas.Left, blas.Trans, m, nrhs, n,
+				a, lda,
+				work[:n],
+				b, ldb,
+				work[mn:], lwork-mn)
+			ok := impl.Dtrtrs(blas.Upper, blas.NoTrans, blas.NonUnit, n, nrhs,
+				a, lda,
+				b, ldb)
+			if !ok {
+				return false
+			}
+			scllen = n
+		} else {
+			ok := impl.Dtrtrs(blas.Upper, blas.Trans, blas.NonUnit, n, nrhs,
+				a, lda,
+				b, ldb)
+			if !ok {
+				return false
+			}
+			for i := n; i < m; i++ {
+				for j := 0; j < nrhs; j++ {
+					b[i*ldb+j] = 0
+				}
+			}
+			impl.Dormqr(blas.Left, blas.NoTrans, m, nrhs, n,
+				a, lda,
+				work[:n],
+				b, ldb,
+				work[mn:], lwork-mn)
+			scllen = m
+		}
+	} else {
+		impl.Dgelqf(m, n, a, lda, work, work[mn:], lwork-mn)
+		if trans == blas.NoTrans {
+			ok := impl.Dtrtrs(blas.Lower, blas.NoTrans, blas.NonUnit,
+				m, nrhs,
+				a, lda,
+				b, ldb)
+			if !ok {
+				return false
+			}
+			for i := m; i < n; i++ {
+				for j := 0; j < nrhs; j++ {
+					b[i*ldb+j] = 0
+				}
+			}
+			impl.Dormlq(blas.Left, blas.Trans, n, nrhs, m,
+				a, lda,
+				work,
+				b, ldb,
+				work[mn:], lwork-mn)
+			scllen = n
+		} else {
+			impl.Dormlq(blas.Left, blas.NoTrans, n, nrhs, m,
+				a, lda,
+				work,
+				b, ldb,
+				work[mn:], lwork-mn)
+			ok := impl.Dtrtrs(blas.Lower, blas.Trans, blas.NonUnit,
+				m, nrhs,
+				a, lda,
+				b, ldb)
+			if !ok {
+				return false
+			}
+			scllen = m
+		}
+	}
+
+	// Adjust answer vector based on scaling.
+	if iascl == 1 {
+		impl.Dlascl(lapack.General, 0, 0, anrm, smlnum, scllen, nrhs, b, ldb)
+	}
+	if iascl == 2 {
+		impl.Dlascl(lapack.General, 0, 0, anrm, bignum, scllen, nrhs, b, ldb)
+	}
+	if ibscl == 1 {
+		impl.Dlascl(lapack.General, 0, 0, smlnum, bnrm, scllen, nrhs, b, ldb)
+	}
+	if ibscl == 2 {
+		impl.Dlascl(lapack.General, 0, 0, bignum, bnrm, scllen, nrhs, b, ldb)
+	}
+
+	work[0] = float64(wsize)
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgeql2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeql2.go
new file mode 100644
index 0000000000..d18989d274
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeql2.go
@@ -0,0 +1,67 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dgeql2 computes the QL factorization of the m×n matrix A. That is, Dgeql2
+// computes Q and L such that
+//
+//	A = Q * L
+//
+// where Q is an m×m orthonormal matrix and L is a lower trapezoidal matrix.
+//
+// Q is represented as a product of elementary reflectors,
+//
+//	Q = H_{k-1} * ...
* H_1 * H_0 +// +// where k = min(m,n) and each H_i has the form +// +// H_i = I - tau[i] * v_i * v_iᵀ +// +// Vector v_i has v[m-k+i+1:m] = 0, v[m-k+i] = 1, and v[:m-k+i+1] is stored on +// exit in A[0:m-k+i-1, n-k+i]. +// +// tau must have length at least min(m,n), and Dgeql2 will panic otherwise. +// +// work is temporary memory storage and must have length at least n. +// +// Dgeql2 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dgeql2(m, n int, a []float64, lda int, tau, work []float64) { + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + // Quick return if possible. + k := min(m, n) + if k == 0 { + return + } + + switch { + case len(a) < (m-1)*lda+n: + panic(shortA) + case len(tau) < k: + panic(shortTau) + case len(work) < n: + panic(shortWork) + } + + var aii float64 + for i := k - 1; i >= 0; i-- { + // Generate elementary reflector H_i to annihilate A[0:m-k+i-1, n-k+i]. + aii, tau[i] = impl.Dlarfg(m-k+i+1, a[(m-k+i)*lda+n-k+i], a[n-k+i:], lda) + + // Apply H_i to A[0:m-k+i, 0:n-k+i-1] from the left. + a[(m-k+i)*lda+n-k+i] = 1 + impl.Dlarf(blas.Left, m-k+i+1, n-k+i, a[n-k+i:], lda, tau[i], a, lda, work) + a[(m-k+i)*lda+n-k+i] = aii + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgeqp3.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeqp3.go new file mode 100644 index 0000000000..da8cd4fa76 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeqp3.go @@ -0,0 +1,195 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dgeqp3 computes a QR factorization with column pivoting of the m×n matrix A: +// +// A*P = Q*R +// +// where P is a permutation matrix, Q is an orthogonal matrix and R is a +// min(m,n)×n upper trapezoidal matrix. +// +// On return, the upper triangle of A contains the matrix R. The elements below +// the diagonal together with tau represent the matrix Q as a product of +// elementary reflectors +// +// Q = H_0 * H_1 * ... * H_{k-1}, where k = min(m,n). +// +// Each H_i has the form +// +// H_i = I - tau * v * vᵀ +// +// where tau is a scalar and v is a vector with v[0:i] = 0 and v[i] = 1; +// v[i+1:m] is stored on exit in A[i+1:m,i], and tau in tau[i]. +// +// jpvt specifies a column pivot to be applied to A. On entry, if jpvt[j] is at +// least zero, the jth column of A is permuted to the front of A*P (a leading +// column), if jpvt[j] is -1 the jth column of A is a free column. If jpvt[j] < +// -1, Dgeqp3 will panic. On return, jpvt holds the permutation that was +// applied; the jth column of A*P was the jpvt[j] column of A. jpvt must have +// length n or Dgeqp3 will panic. +// +// tau holds the scalar factors of the elementary reflectors. It must have +// length min(m,n), otherwise Dgeqp3 will panic. +// +// work must have length at least max(1,lwork), and lwork must be at least +// 3*n+1, otherwise Dgeqp3 will panic. For optimal performance lwork must be at +// least 2*n+(n+1)*nb, where nb is the optimal blocksize. On return, work[0] +// will contain the optimal value of lwork. +// +// If lwork == -1, instead of performing Dgeqp3, only the optimal value of lwork +// will be stored in work[0]. +// +// Dgeqp3 is an internal routine. It is exported for testing purposes. 
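A usage sketch may help before the implementation; it is illustrative only and not part of the vendored file. It assumes a row-major m×n matrix a with leading dimension lda in scope and marks every column as free, as described above:

	impl := gonum.Implementation{}
	jpvt := make([]int, n)
	for j := range jpvt {
		jpvt[j] = -1 // -1 marks column j as a free column
	}
	tau := make([]float64, min(m, n)) // min is the Go 1.21+ builtin
	work := make([]float64, 1)
	impl.Dgeqp3(m, n, a, lda, jpvt, tau, work, -1) // workspace query only
	work = make([]float64, int(work[0]))
	impl.Dgeqp3(m, n, a, lda, jpvt, tau, work, len(work))
	// On return, column j of A*P was column jpvt[j] of the original A.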
+func (impl Implementation) Dgeqp3(m, n int, a []float64, lda int, jpvt []int, tau, work []float64, lwork int) { + const ( + inb = 1 + inbmin = 2 + ixover = 3 + ) + + minmn := min(m, n) + iws := 3*n + 1 + if minmn == 0 { + iws = 1 + } + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + case lwork < iws && lwork != -1: + panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + } + + // Quick return if possible. + if minmn == 0 { + work[0] = 1 + return + } + + nb := impl.Ilaenv(inb, "DGEQRF", " ", m, n, -1, -1) + if lwork == -1 { + work[0] = float64(2*n + (n+1)*nb) + return + } + + switch { + case len(a) < (m-1)*lda+n: + panic(shortA) + case len(jpvt) != n: + panic(badLenJpvt) + case len(tau) < minmn: + panic(shortTau) + } + + for _, v := range jpvt { + if v < -1 || n <= v { + panic(badJpvt) + } + } + + bi := blas64.Implementation() + + // Move initial columns up front. + var nfxd int + for j := 0; j < n; j++ { + if jpvt[j] == -1 { + jpvt[j] = j + continue + } + if j != nfxd { + bi.Dswap(m, a[j:], lda, a[nfxd:], lda) + jpvt[j], jpvt[nfxd] = jpvt[nfxd], j + } else { + jpvt[j] = j + } + nfxd++ + } + + // Factorize nfxd columns. + // + // Compute the QR factorization of nfxd columns and update remaining columns. + if nfxd > 0 { + na := min(m, nfxd) + impl.Dgeqrf(m, na, a, lda, tau[:na], work, lwork) + iws = max(iws, int(work[0])) + if na < n { + impl.Dormqr(blas.Left, blas.Trans, m, n-na, na, a, lda, tau[:na], a[na:], lda, + work, lwork) + iws = max(iws, int(work[0])) + } + } + + if nfxd >= minmn { + work[0] = float64(iws) + return + } + + // Factorize free columns. + sm := m - nfxd + sn := n - nfxd + sminmn := minmn - nfxd + + // Determine the block size. + nb = impl.Ilaenv(inb, "DGEQRF", " ", sm, sn, -1, -1) + nbmin := 2 + nx := 0 + + if 1 < nb && nb < sminmn { + // Determine when to cross over from blocked to unblocked code. + nx = max(0, impl.Ilaenv(ixover, "DGEQRF", " ", sm, sn, -1, -1)) + + if nx < sminmn { + // Determine if workspace is large enough for blocked code. + minws := 2*sn + (sn+1)*nb + iws = max(iws, minws) + if lwork < minws { + // Not enough workspace to use optimal nb. Reduce + // nb and determine the minimum value of nb. + nb = (lwork - 2*sn) / (sn + 1) + nbmin = max(2, impl.Ilaenv(inbmin, "DGEQRF", " ", sm, sn, -1, -1)) + } + } + } + + // Initialize partial column norms. + // The first n elements of work store the exact column norms. + for j := nfxd; j < n; j++ { + work[j] = bi.Dnrm2(sm, a[nfxd*lda+j:], lda) + work[n+j] = work[j] + } + j := nfxd + if nbmin <= nb && nb < sminmn && nx < sminmn { + // Use blocked code initially. + + // Compute factorization. + var fjb int + for topbmn := minmn - nx; j < topbmn; j += fjb { + jb := min(nb, topbmn-j) + + // Factorize jb columns among columns j:n. + fjb = impl.Dlaqps(m, n-j, j, jb, a[j:], lda, jpvt[j:], tau[j:], + work[j:n], work[j+n:2*n], work[2*n:2*n+jb], work[2*n+jb:], jb) + } + } + + // Use unblocked code to factor the last or only block. + if j < minmn { + impl.Dlaqp2(m, n-j, j, a[j:], lda, jpvt[j:], tau[j:], + work[j:n], work[j+n:2*n], work[2*n:]) + } + + work[0] = float64(iws) +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgeqr2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeqr2.go new file mode 100644 index 0000000000..4d1a4b3b0c --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeqr2.go @@ -0,0 +1,78 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "gonum.org/v1/gonum/blas" + +// Dgeqr2 computes a QR factorization of the m×n matrix A. +// +// In a QR factorization, Q is an m×m orthonormal matrix, and R is an +// upper triangular m×n matrix. +// +// A is modified to contain the information to construct Q and R. +// The upper triangle of a contains the matrix R. The lower triangular elements +// (not including the diagonal) contain the elementary reflectors. tau is modified +// to contain the reflector scales. tau must have length min(m,n), and +// this function will panic otherwise. +// +// The ith elementary reflector can be explicitly constructed by first extracting +// the +// +// v[j] = 0 j < i +// v[j] = 1 j == i +// v[j] = a[j*lda+i] j > i +// +// and computing H_i = I - tau[i] * v * vᵀ. +// +// The orthonormal matrix Q can be constructed from a product of these elementary +// reflectors, Q = H_0 * H_1 * ... * H_{k-1}, where k = min(m,n). +// +// work is temporary storage of length at least n and this function will panic otherwise. +// +// Dgeqr2 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dgeqr2(m, n int, a []float64, lda int, tau, work []float64) { + // TODO(btracey): This is oriented such that columns of a are eliminated. + // This likely could be re-arranged to take better advantage of row-major + // storage. + + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + case len(work) < n: + panic(shortWork) + } + + // Quick return if possible. + k := min(m, n) + if k == 0 { + return + } + + switch { + case len(a) < (m-1)*lda+n: + panic(shortA) + case len(tau) != k: + panic(badLenTau) + } + + for i := 0; i < k; i++ { + // Generate elementary reflector H_i. + a[i*lda+i], tau[i] = impl.Dlarfg(m-i, a[i*lda+i], a[min((i+1), m-1)*lda+i:], lda) + if i < n-1 { + aii := a[i*lda+i] + a[i*lda+i] = 1 + impl.Dlarf(blas.Left, m-i, n-i-1, + a[i*lda+i:], lda, + tau[i], + a[i*lda+i+1:], lda, + work) + a[i*lda+i] = aii + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgeqrf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeqrf.go new file mode 100644 index 0000000000..2bcbde586c --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeqrf.go @@ -0,0 +1,108 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/lapack" +) + +// Dgeqrf computes the QR factorization of the m×n matrix A using a blocked +// algorithm. See the documentation for Dgeqr2 for a description of the +// parameters at entry and exit. +// +// work is temporary storage, and lwork specifies the usable memory length. +// The length of work must be at least max(1, lwork) and lwork must be -1 +// or at least n, otherwise this function will panic. +// Dgeqrf is a blocked QR factorization, but the block size is limited +// by the temporary space available. If lwork == -1, instead of performing Dgeqrf, +// the optimal work length will be stored into work[0]. +// +// tau must have length min(m,n), and this function will panic otherwise. 
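To make the packed-reflector convention of Dgeqr2 above concrete, here is a small illustrative helper, not part of the vendored code (which applies reflectors through Dlarf and Dlarfb): it expands H_i = I - tau[i] * v * vᵀ from column i of the factored matrix and applies it to a vector x of length m.

	// applyReflector overwrites x with H_i * x, reading v from column i of
	// the factored matrix a (row-major, leading dimension lda).
	func applyReflector(m, i int, a []float64, lda int, taui float64, x []float64) {
		v := make([]float64, m)
		v[i] = 1 // v[j] = 0 for j < i, v[i] = 1, v[j] = a[j*lda+i] for j > i
		for j := i + 1; j < m; j++ {
			v[j] = a[j*lda+i]
		}
		var dot float64 // dot = vᵀ * x over the nonzero tail of v
		for j := i; j < m; j++ {
			dot += v[j] * x[j]
		}
		for j := i; j < m; j++ {
			x[j] -= taui * dot * v[j] // x <- x - tau[i] * (vᵀ x) * v
		}
	}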
+func (impl Implementation) Dgeqrf(m, n int, a []float64, lda int, tau, work []float64, lwork int) { + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + case lwork < max(1, n) && lwork != -1: + panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + } + + // Quick return if possible. + k := min(m, n) + if k == 0 { + work[0] = 1 + return + } + + // nb is the optimal blocksize, i.e. the number of columns transformed at a time. + nb := impl.Ilaenv(1, "DGEQRF", " ", m, n, -1, -1) + if lwork == -1 { + work[0] = float64(n * nb) + return + } + + if len(a) < (m-1)*lda+n { + panic(shortA) + } + if len(tau) != k { + panic(badLenTau) + } + + nbmin := 2 // Minimal block size. + var nx int // Use unblocked (unless changed in the next for loop) + iws := n + // Only consider blocked if the suggested block size is > 1 and the + // number of rows or columns is sufficiently large. + if 1 < nb && nb < k { + // nx is the block size at which the code switches from blocked + // to unblocked. + nx = max(0, impl.Ilaenv(3, "DGEQRF", " ", m, n, -1, -1)) + if k > nx { + iws = n * nb + if lwork < iws { + // Not enough workspace to use the optimal block + // size. Get the minimum block size instead. + nb = lwork / n + nbmin = max(2, impl.Ilaenv(2, "DGEQRF", " ", m, n, -1, -1)) + } + } + } + + // Compute QR using a blocked algorithm. + var i int + if nbmin <= nb && nb < k && nx < k { + ldwork := nb + for i = 0; i < k-nx; i += nb { + ib := min(k-i, nb) + // Compute the QR factorization of the current block. + impl.Dgeqr2(m-i, ib, a[i*lda+i:], lda, tau[i:i+ib], work) + if i+ib < n { + // Form the triangular factor of the block reflector and apply Hᵀ + // In Dlarft, work becomes the T matrix. + impl.Dlarft(lapack.Forward, lapack.ColumnWise, m-i, ib, + a[i*lda+i:], lda, + tau[i:], + work, ldwork) + impl.Dlarfb(blas.Left, blas.Trans, lapack.Forward, lapack.ColumnWise, + m-i, n-i-ib, ib, + a[i*lda+i:], lda, + work, ldwork, + a[i*lda+i+ib:], lda, + work[ib*ldwork:], ldwork) + } + } + } + // Call unblocked code on the remaining columns. + if i < k { + impl.Dgeqr2(m-i, n-i, a[i*lda+i:], lda, tau[i:], work) + } + work[0] = float64(iws) +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgerq2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgerq2.go new file mode 100644 index 0000000000..44ca1bc1a0 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgerq2.go @@ -0,0 +1,74 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "gonum.org/v1/gonum/blas" + +// Dgerq2 computes an RQ factorization of the m×n matrix A, +// +// A = R * Q. +// +// On exit, if m <= n, the upper triangle of the subarray +// A[0:m, n-m:n] contains the m×m upper triangular matrix R. +// If m >= n, the elements on and above the (m-n)-th subdiagonal +// contain the m×n upper trapezoidal matrix R. +// The remaining elements, with tau, represent the +// orthogonal matrix Q as a product of min(m,n) elementary +// reflectors. +// +// The matrix Q is represented as a product of elementary reflectors +// +// Q = H_0 H_1 . . . H_{min(m,n)-1}. +// +// Each H(i) has the form +// +// H_i = I - tau_i * v * vᵀ +// +// where v is a vector with v[0:n-k+i-1] stored in A[m-k+i, 0:n-k+i-1], +// v[n-k+i:n] = 0 and v[n-k+i] = 1. +// +// tau must have length min(m,n) and work must have length m, otherwise +// Dgerq2 will panic. 
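As an illustration of the layout described above (again, not part of the vendored file): for m <= n, the upper triangular factor R can be copied out of the trailing m columns of the factored matrix.

	// Extract the m×m upper triangular R from A[0:m, n-m:n] after an RQ
	// factorization of a row-major matrix a with leading dimension lda.
	r := make([]float64, m*m)
	for i := 0; i < m; i++ {
		for j := i; j < m; j++ {
			r[i*m+j] = a[i*lda+(n-m)+j]
		}
	}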
+// +// Dgerq2 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dgerq2(m, n int, a []float64, lda int, tau, work []float64) { + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + case len(work) < m: + panic(shortWork) + } + + // Quick return if possible. + k := min(m, n) + if k == 0 { + return + } + + switch { + case len(a) < (m-1)*lda+n: + panic(shortA) + case len(tau) < k: + panic(shortTau) + } + + for i := k - 1; i >= 0; i-- { + // Generate elementary reflector H[i] to annihilate + // A[m-k+i, 0:n-k+i-1]. + mki := m - k + i + nki := n - k + i + var aii float64 + aii, tau[i] = impl.Dlarfg(nki+1, a[mki*lda+nki], a[mki*lda:], 1) + + // Apply H[i] to A[0:m-k+i-1, 0:n-k+i] from the right. + a[mki*lda+nki] = 1 + impl.Dlarf(blas.Right, mki, nki+1, a[mki*lda:], 1, tau[i], a, lda, work) + a[mki*lda+nki] = aii + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgerqf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgerqf.go new file mode 100644 index 0000000000..fe010b4792 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgerqf.go @@ -0,0 +1,135 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/lapack" +) + +// Dgerqf computes an RQ factorization of the m×n matrix A, +// +// A = R * Q. +// +// On exit, if m <= n, the upper triangle of the subarray +// A[0:m, n-m:n] contains the m×m upper triangular matrix R. +// If m >= n, the elements on and above the (m-n)-th subdiagonal +// contain the m×n upper trapezoidal matrix R. +// The remaining elements, with tau, represent the +// orthogonal matrix Q as a product of min(m,n) elementary +// reflectors. +// +// The matrix Q is represented as a product of elementary reflectors +// +// Q = H_0 H_1 . . . H_{min(m,n)-1}. +// +// Each H(i) has the form +// +// H_i = I - tau_i * v * vᵀ +// +// where v is a vector with v[0:n-k+i-1] stored in A[m-k+i, 0:n-k+i-1], +// v[n-k+i:n] = 0 and v[n-k+i] = 1. +// +// tau must have length min(m,n), work must have length max(1, lwork), +// and lwork must be -1 or at least max(1, m), otherwise Dgerqf will panic. +// On exit, work[0] will contain the optimal length for work. +// +// Dgerqf is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dgerqf(m, n int, a []float64, lda int, tau, work []float64, lwork int) { + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + case lwork < max(1, m) && lwork != -1: + panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + } + + // Quick return if possible. + k := min(m, n) + if k == 0 { + work[0] = 1 + return + } + + nb := impl.Ilaenv(1, "DGERQF", " ", m, n, -1, -1) + if lwork == -1 { + work[0] = float64(m * nb) + return + } + + if len(a) < (m-1)*lda+n { + panic(shortA) + } + if len(tau) != k { + panic(badLenTau) + } + + nbmin := 2 + nx := 1 + iws := m + var ldwork int + if 1 < nb && nb < k { + // Determine when to cross over from blocked to unblocked code. + nx = max(0, impl.Ilaenv(3, "DGERQF", " ", m, n, -1, -1)) + if nx < k { + // Determine whether workspace is large enough for blocked code. + iws = m * nb + if lwork < iws { + // Not enough workspace to use optimal nb. Reduce + // nb and determine the minimum value of nb. 
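Concretely, with hypothetical sizes: for m = 1000 and an optimal nb = 32 the blocked path wants iws = m*nb = 32000 scratch values; if the caller supplied only lwork = 8000, nb is reduced to lwork/m = 8, and if that falls below the nbmin reported by Ilaenv the routine drops through to the unblocked Dgerq2 path instead.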
+ nb = lwork / m + nbmin = max(2, impl.Ilaenv(2, "DGERQF", " ", m, n, -1, -1)) + } + ldwork = nb + } + } + + var mu, nu int + if nbmin <= nb && nb < k && nx < k { + // Use blocked code initially. + // The last kk rows are handled by the block method. + ki := ((k - nx - 1) / nb) * nb + kk := min(k, ki+nb) + + var i int + for i = k - kk + ki; i >= k-kk; i -= nb { + ib := min(k-i, nb) + + // Compute the RQ factorization of the current block + // A[m-k+i:m-k+i+ib-1, 0:n-k+i+ib-1]. + impl.Dgerq2(ib, n-k+i+ib, a[(m-k+i)*lda:], lda, tau[i:], work) + if m-k+i > 0 { + // Form the triangular factor of the block reflector + // H = H_{i+ib-1} . . . H_{i+1} H_i. + impl.Dlarft(lapack.Backward, lapack.RowWise, + n-k+i+ib, ib, a[(m-k+i)*lda:], lda, tau[i:], + work, ldwork) + + // Apply H to A[0:m-k+i-1, 0:n-k+i+ib-1] from the right. + impl.Dlarfb(blas.Right, blas.NoTrans, lapack.Backward, lapack.RowWise, + m-k+i, n-k+i+ib, ib, a[(m-k+i)*lda:], lda, + work, ldwork, + a, lda, + work[ib*ldwork:], ldwork) + } + } + mu = m - k + i + nb + nu = n - k + i + nb + } else { + mu = m + nu = n + } + + // Use unblocked code to factor the last or only block. + if mu > 0 && nu > 0 { + impl.Dgerq2(mu, nu, a, lda, tau, work) + } + work[0] = float64(iws) +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgesc2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgesc2.go new file mode 100644 index 0000000000..b2201085c5 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgesc2.go @@ -0,0 +1,93 @@ +// Copyright ©2021 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas/blas64" +) + +// Dgesc2 solves a system of linear equations +// +// A * x = scale * b +// +// with a general n×n matrix A represented by the LU factorization with complete +// pivoting +// +// A = P * L * U * Q +// +// as computed by Dgetc2. +// +// On entry, rhs contains the right hand side vector b. On return, it is +// overwritten with the solution vector x. +// +// Dgesc2 returns a scale factor +// +// 0 <= scale <= 1 +// +// chosen to prevent overflow in the solution. +// +// Dgesc2 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dgesc2(n int, a []float64, lda int, rhs []float64, ipiv, jpiv []int) (scale float64) { + switch { + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + // Quick return if possible. + if n == 0 { + return 0 + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case len(rhs) < n: + panic(shortRHS) + case len(ipiv) != n: + panic(badLenIpiv) + case len(jpiv) != n: + panic(badLenJpiv) + } + + const smlnum = dlamchS / dlamchP + + // Apply permutations ipiv to rhs. + impl.Dlaswp(1, rhs, 1, 0, n-1, ipiv[:n], 1) + + // Solve for L part. + for i := 0; i < n-1; i++ { + for j := i + 1; j < n; j++ { + rhs[j] -= float64(a[j*lda+i] * rhs[i]) + } + } + + // Check for scaling. + scale = 1.0 + bi := blas64.Implementation() + i := bi.Idamax(n, rhs, 1) + if 2*smlnum*math.Abs(rhs[i]) > math.Abs(a[(n-1)*lda+(n-1)]) { + temp := 0.5 / math.Abs(rhs[i]) + bi.Dscal(n, temp, rhs, 1) + scale *= temp + } + + // Solve for U part. + for i := n - 1; i >= 0; i-- { + temp := 1.0 / a[i*lda+i] + rhs[i] *= temp + for j := i + 1; j < n; j++ { + rhs[i] -= float64(rhs[j] * (a[i*lda+j] * temp)) + } + } + + // Apply permutations jpiv to the solution (rhs). 
+	impl.Dlaswp(1, rhs, 1, 0, n-1, jpiv[:n], -1)
+
+	return scale
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgesv.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgesv.go
new file mode 100644
index 0000000000..0be4414ca1
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgesv.go
@@ -0,0 +1,60 @@
+// Copyright ©2021 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dgesv computes the solution to a real system of linear equations
+//
+//	A * X = B
+//
+// where A is an n×n matrix and X and B are n×nrhs matrices.
+//
+// The LU decomposition with partial pivoting and row interchanges is used to
+// factor A as
+//
+//	A = P * L * U
+//
+// where P is a permutation matrix, L is unit lower triangular, and U is upper
+// triangular. On return, the factors L and U are stored in a; the unit diagonal
+// elements of L are not stored. The row pivot indices that define the
+// permutation matrix P are stored in ipiv.
+//
+// The factored form of A is then used to solve the system of equations A * X =
+// B. On entry, b contains the right hand side matrix B. On return, if ok is
+// true, b contains the solution matrix X.
+func (impl Implementation) Dgesv(n, nrhs int, a []float64, lda int, ipiv []int, b []float64, ldb int) (ok bool) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case nrhs < 0:
+		panic(nrhsLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case ldb < max(1, nrhs):
+		panic(badLdB)
+	}
+
+	// Quick return if possible.
+	if n == 0 || nrhs == 0 {
+		return true
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(ipiv) != n:
+		panic(badLenIpiv)
+	case len(b) < (n-1)*ldb+nrhs:
+		panic(shortB)
+	}
+
+	ok = impl.Dgetrf(n, n, a, lda, ipiv)
+	if ok {
+		impl.Dgetrs(blas.NoTrans, n, nrhs, a, lda, ipiv, b, ldb)
+	}
+
+	return ok
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgesvd.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgesvd.go
new file mode 100644
index 0000000000..97da749bfb
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgesvd.go
@@ -0,0 +1,1378 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+const noSVDO = "dgesvd: not coded for overwrite"
+
+// Dgesvd computes the singular value decomposition of the input matrix A.
+//
+// The singular value decomposition is
+//
+//	A = U * Sigma * Vᵀ
+//
+// where Sigma is an m×n diagonal matrix containing the singular values of A,
+// U is an m×m orthogonal matrix and V is an n×n orthogonal matrix. The first
+// min(m,n) columns of U and V are the left and right singular vectors of A
+// respectively.
+//
+// jobU and jobVT are options for computing the singular vectors. The behavior
+// is as follows
+//
+//	jobU == lapack.SVDAll       All m columns of U are returned in u
+//	jobU == lapack.SVDStore     The first min(m,n) columns are returned in u
+//	jobU == lapack.SVDOverwrite The first min(m,n) columns of U are written into a
+//	jobU == lapack.SVDNone      The columns of U are not computed.
+//
+// The behavior is the same for jobVT and the rows of Vᵀ. At most one of jobU
+// and jobVT can equal lapack.SVDOverwrite, and Dgesvd will panic otherwise.
+//
+// On entry, a contains the data for the m×n matrix A.
During the call to Dgesvd the data is overwritten. On exit, A contains the appropriate singular vectors if either job is lapack.SVDOverwrite.
+//
+// s is a slice of length at least min(m,n) and on exit contains the singular
+// values in decreasing order.
+//
+// u contains the left singular vectors on exit, stored column-wise. If
+// jobU == lapack.SVDAll, u is of size m×m. If jobU == lapack.SVDStore u is
+// of size m×min(m,n). If jobU == lapack.SVDOverwrite or lapack.SVDNone, u is
+// not used.
+//
+// vt contains the right singular vectors on exit, stored row-wise. If
+// jobVT == lapack.SVDAll, vt is of size n×n. If jobVT == lapack.SVDStore vt is
+// of size min(m,n)×n. If jobVT == lapack.SVDOverwrite or lapack.SVDNone, vt is
+// not used.
+//
+// work is a slice for storing temporary memory, and lwork is the usable size of
+// the slice. lwork must be at least max(5*min(m,n), 3*min(m,n)+max(m,n)).
+// If lwork == -1, instead of performing Dgesvd, the optimal work length will be
+// stored into work[0]. Dgesvd will panic if the working memory has insufficient
+// storage.
+//
+// Dgesvd returns whether the decomposition successfully completed.
+func (impl Implementation) Dgesvd(jobU, jobVT lapack.SVDJob, m, n int, a []float64, lda int, s, u []float64, ldu int, vt []float64, ldvt int, work []float64, lwork int) (ok bool) {
+	if jobU == lapack.SVDOverwrite || jobVT == lapack.SVDOverwrite {
+		panic(noSVDO)
+	}
+
+	wantua := jobU == lapack.SVDAll
+	wantus := jobU == lapack.SVDStore
+	wantuas := wantua || wantus
+	wantuo := jobU == lapack.SVDOverwrite
+	wantun := jobU == lapack.SVDNone
+	if !(wantua || wantus || wantuo || wantun) {
+		panic(badSVDJob)
+	}
+
+	wantva := jobVT == lapack.SVDAll
+	wantvs := jobVT == lapack.SVDStore
+	wantvas := wantva || wantvs
+	wantvo := jobVT == lapack.SVDOverwrite
+	wantvn := jobVT == lapack.SVDNone
+	if !(wantva || wantvs || wantvo || wantvn) {
+		panic(badSVDJob)
+	}
+
+	if wantuo && wantvo {
+		panic(bothSVDOver)
+	}
+
+	minmn := min(m, n)
+	minwork := 1
+	if minmn > 0 {
+		minwork = max(3*minmn+max(m, n), 5*minmn)
+	}
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case ldu < 1, wantua && ldu < m, wantus && ldu < minmn:
+		panic(badLdU)
+	case ldvt < 1 || (wantvas && ldvt < n):
+		panic(badLdVT)
+	case lwork < minwork && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	if minmn == 0 {
+		work[0] = 1
+		return true
+	}
+
+	// Compute optimal workspace size for subroutines.
+ opts := string(jobU) + string(jobVT) + mnthr := impl.Ilaenv(6, "DGESVD", opts, m, n, 0, 0) + maxwrk := 1 + var wrkbl, bdspac int + if m >= n { + bdspac = 5 * n + impl.Dgeqrf(m, n, a, lda, nil, work, -1) + lwork_dgeqrf := int(work[0]) + + impl.Dorgqr(m, n, n, a, lda, nil, work, -1) + lwork_dorgqr_n := int(work[0]) + impl.Dorgqr(m, m, n, a, lda, nil, work, -1) + lwork_dorgqr_m := int(work[0]) + + impl.Dgebrd(n, n, a, lda, s, nil, nil, nil, work, -1) + lwork_dgebrd := int(work[0]) + + impl.Dorgbr(lapack.GeneratePT, n, n, n, a, lda, nil, work, -1) + lwork_dorgbr_p := int(work[0]) + + impl.Dorgbr(lapack.GenerateQ, n, n, n, a, lda, nil, work, -1) + lwork_dorgbr_q := int(work[0]) + + if m >= mnthr { + if wantun { + // Path 1 (m much larger than n, jobU == None) + maxwrk = n + lwork_dgeqrf + maxwrk = max(maxwrk, 3*n+lwork_dgebrd) + if wantvo || wantvas { + maxwrk = max(maxwrk, 3*n+lwork_dorgbr_p) + } + maxwrk = max(maxwrk, bdspac) + } else if wantuo && wantvn { + // Path 2 (m much larger than n, jobU == Overwrite, jobVT == None) + wrkbl = n + lwork_dgeqrf + wrkbl = max(wrkbl, n+lwork_dorgqr_n) + wrkbl = max(wrkbl, 3*n+lwork_dgebrd) + wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q) + wrkbl = max(wrkbl, bdspac) + maxwrk = max(n*n+wrkbl, n*n+m*n+n) + } else if wantuo && wantvas { + // Path 3 (m much larger than n, jobU == Overwrite, jobVT == Store or All) + wrkbl = n + lwork_dgeqrf + wrkbl = max(wrkbl, n+lwork_dorgqr_n) + wrkbl = max(wrkbl, 3*n+lwork_dgebrd) + wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q) + wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p) + wrkbl = max(wrkbl, bdspac) + maxwrk = max(n*n+wrkbl, n*n+m*n+n) + } else if wantus && wantvn { + // Path 4 (m much larger than n, jobU == Store, jobVT == None) + wrkbl = n + lwork_dgeqrf + wrkbl = max(wrkbl, n+lwork_dorgqr_n) + wrkbl = max(wrkbl, 3*n+lwork_dgebrd) + wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q) + wrkbl = max(wrkbl, bdspac) + maxwrk = n*n + wrkbl + } else if wantus && wantvo { + // Path 5 (m much larger than n, jobU == Store, jobVT == Overwrite) + wrkbl = n + lwork_dgeqrf + wrkbl = max(wrkbl, n+lwork_dorgqr_n) + wrkbl = max(wrkbl, 3*n+lwork_dgebrd) + wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q) + wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p) + wrkbl = max(wrkbl, bdspac) + maxwrk = 2*n*n + wrkbl + } else if wantus && wantvas { + // Path 6 (m much larger than n, jobU == Store, jobVT == Store or All) + wrkbl = n + lwork_dgeqrf + wrkbl = max(wrkbl, n+lwork_dorgqr_n) + wrkbl = max(wrkbl, 3*n+lwork_dgebrd) + wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q) + wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p) + wrkbl = max(wrkbl, bdspac) + maxwrk = n*n + wrkbl + } else if wantua && wantvn { + // Path 7 (m much larger than n, jobU == All, jobVT == None) + wrkbl = n + lwork_dgeqrf + wrkbl = max(wrkbl, n+lwork_dorgqr_m) + wrkbl = max(wrkbl, 3*n+lwork_dgebrd) + wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q) + wrkbl = max(wrkbl, bdspac) + maxwrk = n*n + wrkbl + } else if wantua && wantvo { + // Path 8 (m much larger than n, jobU == All, jobVT == Overwrite) + wrkbl = n + lwork_dgeqrf + wrkbl = max(wrkbl, n+lwork_dorgqr_m) + wrkbl = max(wrkbl, 3*n+lwork_dgebrd) + wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q) + wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p) + wrkbl = max(wrkbl, bdspac) + maxwrk = 2*n*n + wrkbl + } else if wantua && wantvas { + // Path 9 (m much larger than n, jobU == All, jobVT == Store or All) + wrkbl = n + lwork_dgeqrf + wrkbl = max(wrkbl, n+lwork_dorgqr_m) + wrkbl = max(wrkbl, 3*n+lwork_dgebrd) + wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q) + wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p) + wrkbl = max(wrkbl, bdspac) + maxwrk = 
n*n + wrkbl + } + } else { + // Path 10 (m at least n, but not much larger) + impl.Dgebrd(m, n, a, lda, s, nil, nil, nil, work, -1) + lwork_dgebrd := int(work[0]) + maxwrk = 3*n + lwork_dgebrd + if wantus || wantuo { + impl.Dorgbr(lapack.GenerateQ, m, n, n, a, lda, nil, work, -1) + lwork_dorgbr_q = int(work[0]) + maxwrk = max(maxwrk, 3*n+lwork_dorgbr_q) + } + if wantua { + impl.Dorgbr(lapack.GenerateQ, m, m, n, a, lda, nil, work, -1) + lwork_dorgbr_q := int(work[0]) + maxwrk = max(maxwrk, 3*n+lwork_dorgbr_q) + } + if !wantvn { + maxwrk = max(maxwrk, 3*n+lwork_dorgbr_p) + } + maxwrk = max(maxwrk, bdspac) + } + } else { + bdspac = 5 * m + + impl.Dgelqf(m, n, a, lda, nil, work, -1) + lwork_dgelqf := int(work[0]) + + impl.Dorglq(n, n, m, nil, n, nil, work, -1) + lwork_dorglq_n := int(work[0]) + impl.Dorglq(m, n, m, a, lda, nil, work, -1) + lwork_dorglq_m := int(work[0]) + + impl.Dgebrd(m, m, a, lda, s, nil, nil, nil, work, -1) + lwork_dgebrd := int(work[0]) + + impl.Dorgbr(lapack.GeneratePT, m, m, m, a, n, nil, work, -1) + lwork_dorgbr_p := int(work[0]) + + impl.Dorgbr(lapack.GenerateQ, m, m, m, a, n, nil, work, -1) + lwork_dorgbr_q := int(work[0]) + + if n >= mnthr { + if wantvn { + // Path 1t (n much larger than m, jobVT == None) + maxwrk = m + lwork_dgelqf + maxwrk = max(maxwrk, 3*m+lwork_dgebrd) + if wantuo || wantuas { + maxwrk = max(maxwrk, 3*m+lwork_dorgbr_q) + } + maxwrk = max(maxwrk, bdspac) + } else if wantvo && wantun { + // Path 2t (n much larger than m, jobU == None, jobVT == Overwrite) + wrkbl = m + lwork_dgelqf + wrkbl = max(wrkbl, m+lwork_dorglq_m) + wrkbl = max(wrkbl, 3*m+lwork_dgebrd) + wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p) + wrkbl = max(wrkbl, bdspac) + maxwrk = max(m*m+wrkbl, m*m+m*n+m) + } else if wantvo && wantuas { + // Path 3t (n much larger than m, jobU == Store or All, jobVT == Overwrite) + wrkbl = m + lwork_dgelqf + wrkbl = max(wrkbl, m+lwork_dorglq_m) + wrkbl = max(wrkbl, 3*m+lwork_dgebrd) + wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p) + wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q) + wrkbl = max(wrkbl, bdspac) + maxwrk = max(m*m+wrkbl, m*m+m*n+m) + } else if wantvs && wantun { + // Path 4t (n much larger than m, jobU == None, jobVT == Store) + wrkbl = m + lwork_dgelqf + wrkbl = max(wrkbl, m+lwork_dorglq_m) + wrkbl = max(wrkbl, 3*m+lwork_dgebrd) + wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p) + wrkbl = max(wrkbl, bdspac) + maxwrk = m*m + wrkbl + } else if wantvs && wantuo { + // Path 5t (n much larger than m, jobU == Overwrite, jobVT == Store) + wrkbl = m + lwork_dgelqf + wrkbl = max(wrkbl, m+lwork_dorglq_m) + wrkbl = max(wrkbl, 3*m+lwork_dgebrd) + wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p) + wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q) + wrkbl = max(wrkbl, bdspac) + maxwrk = 2*m*m + wrkbl + } else if wantvs && wantuas { + // Path 6t (n much larger than m, jobU == Store or All, jobVT == Store) + wrkbl = m + lwork_dgelqf + wrkbl = max(wrkbl, m+lwork_dorglq_m) + wrkbl = max(wrkbl, 3*m+lwork_dgebrd) + wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p) + wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q) + wrkbl = max(wrkbl, bdspac) + maxwrk = m*m + wrkbl + } else if wantva && wantun { + // Path 7t (n much larger than m, jobU== None, jobVT == All) + wrkbl = m + lwork_dgelqf + wrkbl = max(wrkbl, m+lwork_dorglq_n) + wrkbl = max(wrkbl, 3*m+lwork_dgebrd) + wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p) + wrkbl = max(wrkbl, bdspac) + maxwrk = m*m + wrkbl + } else if wantva && wantuo { + // Path 8t (n much larger than m, jobU == Overwrite, jobVT == All) + wrkbl = m + lwork_dgelqf + wrkbl = max(wrkbl, m+lwork_dorglq_n) + wrkbl = 
max(wrkbl, 3*m+lwork_dgebrd) + wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p) + wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q) + wrkbl = max(wrkbl, bdspac) + maxwrk = 2*m*m + wrkbl + } else if wantva && wantuas { + // Path 9t (n much larger than m, jobU == Store or All, jobVT == All) + wrkbl = m + lwork_dgelqf + wrkbl = max(wrkbl, m+lwork_dorglq_n) + wrkbl = max(wrkbl, 3*m+lwork_dgebrd) + wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p) + wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q) + wrkbl = max(wrkbl, bdspac) + maxwrk = m*m + wrkbl + } + } else { + // Path 10t (n greater than m, but not much larger) + impl.Dgebrd(m, n, a, lda, s, nil, nil, nil, work, -1) + lwork_dgebrd = int(work[0]) + maxwrk = 3*m + lwork_dgebrd + if wantvs || wantvo { + impl.Dorgbr(lapack.GeneratePT, m, n, m, a, n, nil, work, -1) + lwork_dorgbr_p = int(work[0]) + maxwrk = max(maxwrk, 3*m+lwork_dorgbr_p) + } + if wantva { + impl.Dorgbr(lapack.GeneratePT, n, n, m, a, n, nil, work, -1) + lwork_dorgbr_p = int(work[0]) + maxwrk = max(maxwrk, 3*m+lwork_dorgbr_p) + } + if !wantun { + maxwrk = max(maxwrk, 3*m+lwork_dorgbr_q) + } + maxwrk = max(maxwrk, bdspac) + } + } + + maxwrk = max(maxwrk, minwork) + if lwork == -1 { + work[0] = float64(maxwrk) + return true + } + + if len(a) < (m-1)*lda+n { + panic(shortA) + } + if len(s) < minmn { + panic(shortS) + } + if (len(u) < (m-1)*ldu+m && wantua) || (len(u) < (m-1)*ldu+minmn && wantus) { + panic(shortU) + } + if (len(vt) < (n-1)*ldvt+n && wantva) || (len(vt) < (minmn-1)*ldvt+n && wantvs) { + panic(shortVT) + } + + // Perform decomposition. + eps := dlamchE + smlnum := math.Sqrt(dlamchS) / eps + bignum := 1 / smlnum + + // Scale A if max element outside range [smlnum, bignum]. + anrm := impl.Dlange(lapack.MaxAbs, m, n, a, lda, nil) + var iscl bool + if anrm > 0 && anrm < smlnum { + iscl = true + impl.Dlascl(lapack.General, 0, 0, anrm, smlnum, m, n, a, lda) + } else if anrm > bignum { + iscl = true + impl.Dlascl(lapack.General, 0, 0, anrm, bignum, m, n, a, lda) + } + + bi := blas64.Implementation() + var ie int + if m >= n { + // If A has sufficiently more rows than columns, use the QR decomposition. + if m >= mnthr { + // m >> n + if wantun { + // Path 1. + itau := 0 + iwork := itau + n + + // Compute A = Q * R. + impl.Dgeqrf(m, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork) + + // Zero out below R. + impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, a[lda:], lda) + ie = 0 + itauq := ie + n + itaup := itauq + n + iwork = itaup + n + // Bidiagonalize R in A. + impl.Dgebrd(n, n, a, lda, s, work[ie:], work[itauq:], + work[itaup:], work[iwork:], lwork-iwork) + ncvt := 0 + if wantvo || wantvas { + impl.Dorgbr(lapack.GeneratePT, n, n, n, a, lda, work[itaup:], + work[iwork:], lwork-iwork) + ncvt = n + } + iwork = ie + n + + // Perform bidiagonal QR iteration computing right singular vectors + // of A in A if desired. + ok = impl.Dbdsqr(blas.Upper, n, ncvt, 0, 0, s, work[ie:], + a, lda, work, 1, work, 1, work[iwork:]) + + // If right singular vectors desired in VT, copy them there. + if wantvas { + impl.Dlacpy(blas.All, n, n, a, lda, vt, ldvt) + } + } else if wantuo && wantvn { + // Path 2 + panic(noSVDO) + } else if wantuo && wantvas { + // Path 3 + panic(noSVDO) + } else if wantus { + if wantvn { + // Path 4 + if lwork >= n*n+max(4*n, bdspac) { + // Sufficient workspace for a fast algorithm. + ir := 0 + var ldworkr int + if lwork >= wrkbl+lda*n { + ldworkr = lda + } else { + ldworkr = n + } + itau := ir + ldworkr*n + iwork := itau + n + // Compute A = Q * R. 
+ impl.Dgeqrf(m, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork) + + // Copy R to work[ir:], zeroing out below it. + impl.Dlacpy(blas.Upper, n, n, a, lda, work[ir:], ldworkr) + impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, work[ir+ldworkr:], ldworkr) + + // Generate Q in A. + impl.Dorgqr(m, n, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork) + ie := itau + itauq := ie + n + itaup := itauq + n + iwork = itaup + n + + // Bidiagonalize R in work[ir:]. + impl.Dgebrd(n, n, work[ir:], ldworkr, s, work[ie:], + work[itauq:], work[itaup:], work[iwork:], lwork-iwork) + + // Generate left vectors bidiagonalizing R in work[ir:]. + impl.Dorgbr(lapack.GenerateQ, n, n, n, work[ir:], ldworkr, + work[itauq:], work[iwork:], lwork-iwork) + iwork = ie + n + + // Perform bidiagonal QR iteration, computing left singular + // vectors of R in work[ir:]. + ok = impl.Dbdsqr(blas.Upper, n, 0, n, 0, s, work[ie:], work, 1, + work[ir:], ldworkr, work, 1, work[iwork:]) + + // Multiply Q in A by left singular vectors of R in + // work[ir:], storing result in U. + bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, n, 1, a, lda, + work[ir:], ldworkr, 0, u, ldu) + } else { + // Insufficient workspace for a fast algorithm. + itau := 0 + iwork := itau + n + + // Compute A = Q*R, copying result to U. + impl.Dgeqrf(m, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork) + impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu) + + // Generate Q in U. + impl.Dorgqr(m, n, n, u, ldu, work[itau:itau+n], work[iwork:], lwork-iwork) + ie := itau + itauq := ie + n + itaup := itauq + n + iwork = itaup + n + + // Zero out below R in A. + impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, a[lda:], lda) + + // Bidiagonalize R in A. + impl.Dgebrd(n, n, a, lda, s, work[ie:], + work[itauq:], work[itaup:], work[iwork:], lwork-iwork) + + // Multiply Q in U by left vectors bidiagonalizing R. + impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans, m, n, n, + a, lda, work[itauq:], u, ldu, work[iwork:], lwork-iwork) + iwork = ie + n + + // Perform bidiagonal QR iteration, computing left + // singular vectors of A in U. + ok = impl.Dbdsqr(blas.Upper, n, 0, m, 0, s, work[ie:], work, 1, + u, ldu, work, 1, work[iwork:]) + } + } else if wantvo { + // Path 5 + panic(noSVDO) + } else if wantvas { + // Path 6 + if lwork >= n*n+max(4*n, bdspac) { + // Sufficient workspace for a fast algorithm. + iu := 0 + var ldworku int + if lwork >= wrkbl+lda*n { + ldworku = lda + } else { + ldworku = n + } + itau := iu + ldworku*n + iwork := itau + n + + // Compute A = Q * R. + impl.Dgeqrf(m, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork) + // Copy R to work[iu:], zeroing out below it. + impl.Dlacpy(blas.Upper, n, n, a, lda, work[iu:], ldworku) + impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, work[iu+ldworku:], ldworku) + + // Generate Q in A. + impl.Dorgqr(m, n, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork) + + ie := itau + itauq := ie + n + itaup := itauq + n + iwork = itaup + n + + // Bidiagonalize R in work[iu:], copying result to VT. + impl.Dgebrd(n, n, work[iu:], ldworku, s, work[ie:], + work[itauq:], work[itaup:], work[iwork:], lwork-iwork) + impl.Dlacpy(blas.Upper, n, n, work[iu:], ldworku, vt, ldvt) + + // Generate left bidiagonalizing vectors in work[iu:]. + impl.Dorgbr(lapack.GenerateQ, n, n, n, work[iu:], ldworku, + work[itauq:], work[iwork:], lwork-iwork) + + // Generate right bidiagonalizing vectors in VT. 
+ impl.Dorgbr(lapack.GeneratePT, n, n, n, vt, ldvt, + work[itaup:], work[iwork:], lwork-iwork) + iwork = ie + n + + // Perform bidiagonal QR iteration, computing left singular + // vectors of R in work[iu:], and computing right singular + // vectors of R in VT. + ok = impl.Dbdsqr(blas.Upper, n, n, n, 0, s, work[ie:], + vt, ldvt, work[iu:], ldworku, work, 1, work[iwork:]) + + // Multiply Q in A by left singular vectors of R in + // work[iu:], storing result in U. + bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, n, 1, a, lda, + work[iu:], ldworku, 0, u, ldu) + } else { + // Insufficient workspace for a fast algorithm. + itau := 0 + iwork := itau + n + + // Compute A = Q * R, copying result to U. + impl.Dgeqrf(m, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork) + impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu) + + // Generate Q in U. + impl.Dorgqr(m, n, n, u, ldu, work[itau:itau+n], work[iwork:], lwork-iwork) + + // Copy R to VT, zeroing out below it. + impl.Dlacpy(blas.Upper, n, n, a, lda, vt, ldvt) + impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, vt[ldvt:], ldvt) + + ie := itau + itauq := ie + n + itaup := itauq + n + iwork = itaup + n + + // Bidiagonalize R in VT. + impl.Dgebrd(n, n, vt, ldvt, s, work[ie:], + work[itauq:], work[itaup:], work[iwork:], lwork-iwork) + + // Multiply Q in U by left bidiagonalizing vectors in VT. + impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans, m, n, n, + vt, ldvt, work[itauq:], u, ldu, work[iwork:], lwork-iwork) + + // Generate right bidiagonalizing vectors in VT. + impl.Dorgbr(lapack.GeneratePT, n, n, n, vt, ldvt, + work[itaup:], work[iwork:], lwork-iwork) + iwork = ie + n + + // Perform bidiagonal QR iteration, computing left singular + // vectors of A in U and computing right singular vectors + // of A in VT. + ok = impl.Dbdsqr(blas.Upper, n, n, m, 0, s, work[ie:], + vt, ldvt, u, ldu, work, 1, work[iwork:]) + } + } + } else if wantua { + if wantvn { + // Path 7 + if lwork >= n*n+max(max(n+m, 4*n), bdspac) { + // Sufficient workspace for a fast algorithm. + ir := 0 + var ldworkr int + if lwork >= wrkbl+lda*n { + ldworkr = lda + } else { + ldworkr = n + } + itau := ir + ldworkr*n + iwork := itau + n + + // Compute A = Q*R, copying result to U. + impl.Dgeqrf(m, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork) + impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu) + + // Copy R to work[ir:], zeroing out below it. + impl.Dlacpy(blas.Upper, n, n, a, lda, work[ir:], ldworkr) + impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, work[ir+ldworkr:], ldworkr) + + // Generate Q in U. + impl.Dorgqr(m, m, n, u, ldu, work[itau:itau+n], work[iwork:], lwork-iwork) + ie := itau + itauq := ie + n + itaup := itauq + n + iwork = itaup + n + + // Bidiagonalize R in work[ir:]. + impl.Dgebrd(n, n, work[ir:], ldworkr, s, work[ie:], + work[itauq:], work[itaup:], work[iwork:], lwork-iwork) + + // Generate left bidiagonalizing vectors in work[ir:]. + impl.Dorgbr(lapack.GenerateQ, n, n, n, work[ir:], ldworkr, + work[itauq:], work[iwork:], lwork-iwork) + iwork = ie + n + + // Perform bidiagonal QR iteration, computing left singular + // vectors of R in work[ir:]. + ok = impl.Dbdsqr(blas.Upper, n, 0, n, 0, s, work[ie:], work, 1, + work[ir:], ldworkr, work, 1, work[iwork:]) + + // Multiply Q in U by left singular vectors of R in + // work[ir:], storing result in A. + bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, n, 1, u, ldu, + work[ir:], ldworkr, 0, a, lda) + + // Copy left singular vectors of A from A to U. 
+ impl.Dlacpy(blas.All, m, n, a, lda, u, ldu) + } else { + // Insufficient workspace for a fast algorithm. + itau := 0 + iwork := itau + n + + // Compute A = Q*R, copying result to U. + impl.Dgeqrf(m, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork) + impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu) + + // Generate Q in U. + impl.Dorgqr(m, m, n, u, ldu, work[itau:itau+n], work[iwork:], lwork-iwork) + ie := itau + itauq := ie + n + itaup := itauq + n + iwork = itaup + n + + // Zero out below R in A. + impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, a[lda:], lda) + + // Bidiagonalize R in A. + impl.Dgebrd(n, n, a, lda, s, work[ie:], + work[itauq:], work[itaup:], work[iwork:], lwork-iwork) + + // Multiply Q in U by left bidiagonalizing vectors in A. + impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans, m, n, n, + a, lda, work[itauq:], u, ldu, work[iwork:], lwork-iwork) + iwork = ie + n + + // Perform bidiagonal QR iteration, computing left + // singular vectors of A in U. + ok = impl.Dbdsqr(blas.Upper, n, 0, m, 0, s, work[ie:], + work, 1, u, ldu, work, 1, work[iwork:]) + } + } else if wantvo { + // Path 8. + panic(noSVDO) + } else if wantvas { + // Path 9. + if lwork >= n*n+max(max(n+m, 4*n), bdspac) { + // Sufficient workspace for a fast algorithm. + iu := 0 + var ldworku int + if lwork >= wrkbl+lda*n { + ldworku = lda + } else { + ldworku = n + } + itau := iu + ldworku*n + iwork := itau + n + + // Compute A = Q * R, copying result to U. + impl.Dgeqrf(m, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork) + impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu) + + // Generate Q in U. + impl.Dorgqr(m, m, n, u, ldu, work[itau:itau+n], work[iwork:], lwork-iwork) + + // Copy R to work[iu:], zeroing out below it. + impl.Dlacpy(blas.Upper, n, n, a, lda, work[iu:], ldworku) + impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, work[iu+ldworku:], ldworku) + + ie = itau + itauq := ie + n + itaup := itauq + n + iwork = itaup + n + + // Bidiagonalize R in work[iu:], copying result to VT. + impl.Dgebrd(n, n, work[iu:], ldworku, s, work[ie:], + work[itauq:], work[itaup:], work[iwork:], lwork-iwork) + impl.Dlacpy(blas.Upper, n, n, work[iu:], ldworku, vt, ldvt) + + // Generate left bidiagonalizing vectors in work[iu:]. + impl.Dorgbr(lapack.GenerateQ, n, n, n, work[iu:], ldworku, + work[itauq:], work[iwork:], lwork-iwork) + + // Generate right bidiagonalizing vectors in VT. + impl.Dorgbr(lapack.GeneratePT, n, n, n, vt, ldvt, + work[itaup:], work[iwork:], lwork-iwork) + iwork = ie + n + + // Perform bidiagonal QR iteration, computing left singular + // vectors of R in work[iu:] and computing right + // singular vectors of R in VT. + ok = impl.Dbdsqr(blas.Upper, n, n, n, 0, s, work[ie:], + vt, ldvt, work[iu:], ldworku, work, 1, work[iwork:]) + + // Multiply Q in U by left singular vectors of R in + // work[iu:], storing result in A. + bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, n, 1, + u, ldu, work[iu:], ldworku, 0, a, lda) + + // Copy left singular vectors of A from A to U. + impl.Dlacpy(blas.All, m, n, a, lda, u, ldu) + + /* + // Bidiagonalize R in VT. + impl.Dgebrd(n, n, vt, ldvt, s, work[ie:], + work[itauq:], work[itaup:], work[iwork:], lwork-iwork) + + // Multiply Q in U by left bidiagonalizing vectors in VT. + impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans, + m, n, n, vt, ldvt, work[itauq:], u, ldu, work[iwork:], lwork-iwork) + + // Generate right bidiagonalizing vectors in VT. 
+ impl.Dorgbr(lapack.GeneratePT, n, n, n, vt, ldvt, + work[itaup:], work[iwork:], lwork-iwork) + iwork = ie + n + + // Perform bidiagonal QR iteration, computing left singular + // vectors of A in U and computing right singular vectors + // of A in VT. + ok = impl.Dbdsqr(blas.Upper, n, n, m, 0, s, work[ie:], + vt, ldvt, u, ldu, work, 1, work[iwork:]) + */ + } else { + // Insufficient workspace for a fast algorithm. + itau := 0 + iwork := itau + n + + // Compute A = Q*R, copying result to U. + impl.Dgeqrf(m, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork) + impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu) + + // Generate Q in U. + impl.Dorgqr(m, m, n, u, ldu, work[itau:itau+n], work[iwork:], lwork-iwork) + + // Copy R from A to VT, zeroing out below it. + impl.Dlacpy(blas.Upper, n, n, a, lda, vt, ldvt) + if n > 1 { + impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, vt[ldvt:], ldvt) + } + + ie := itau + itauq := ie + n + itaup := itauq + n + iwork = itaup + n + + // Bidiagonalize R in VT. + impl.Dgebrd(n, n, vt, ldvt, s, work[ie:], + work[itauq:], work[itaup:], work[iwork:], lwork-iwork) + + // Multiply Q in U by left bidiagonalizing vectors in VT. + impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans, + m, n, n, vt, ldvt, work[itauq:], u, ldu, work[iwork:], lwork-iwork) + + // Generate right bidiagonizing vectors in VT. + impl.Dorgbr(lapack.GeneratePT, n, n, n, vt, ldvt, + work[itaup:], work[iwork:], lwork-iwork) + iwork = ie + n + + // Perform bidiagonal QR iteration, computing left singular + // vectors of A in U and computing right singular vectors + // of A in VT. + ok = impl.Dbdsqr(blas.Upper, n, n, m, 0, s, work[ie:], + vt, ldvt, u, ldu, work, 1, work[iwork:]) + } + } + } + } else { + // Path 10. + // M at least N, but not much larger. + ie = 0 + itauq := ie + n + itaup := itauq + n + iwork := itaup + n + + // Bidiagonalize A. + impl.Dgebrd(m, n, a, lda, s, work[ie:], work[itauq:], + work[itaup:], work[iwork:], lwork-iwork) + if wantuas { + // Left singular vectors are desired in U. Copy result to U and + // generate left biadiagonalizing vectors in U. + impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu) + var ncu int + if wantus { + ncu = n + } + if wantua { + ncu = m + } + impl.Dorgbr(lapack.GenerateQ, m, ncu, n, u, ldu, work[itauq:], work[iwork:], lwork-iwork) + } + if wantvas { + // Right singular vectors are desired in VT. Copy result to VT and + // generate left biadiagonalizing vectors in VT. + impl.Dlacpy(blas.Upper, n, n, a, lda, vt, ldvt) + impl.Dorgbr(lapack.GeneratePT, n, n, n, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork) + } + if wantuo { + panic(noSVDO) + } + if wantvo { + panic(noSVDO) + } + iwork = ie + n + var nru, ncvt int + if wantuas || wantuo { + nru = m + } + if wantun { + nru = 0 + } + if wantvas || wantvo { + ncvt = n + } + if wantvn { + ncvt = 0 + } + if !wantuo && !wantvo { + // Perform bidiagonal QR iteration, if desired, computing left + // singular vectors in U and right singular vectors in VT. + ok = impl.Dbdsqr(blas.Upper, n, ncvt, nru, 0, s, work[ie:], + vt, ldvt, u, ldu, work, 1, work[iwork:]) + } else { + // There will be two branches when the implementation is complete. + panic(noSVDO) + } + } + } else { + // A has more columns than rows. If A has sufficiently more columns than + // rows, first reduce using the LQ decomposition. + if n >= mnthr { + // n >> m. + if wantvn { + // Path 1t. + itau := 0 + iwork := itau + m + + // Compute A = L*Q. + impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) + + // Zero out above L. 
+ impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, a[1:], lda) + ie := 0 + itauq := ie + m + itaup := itauq + m + iwork = itaup + m + + // Bidiagonalize L in A. + impl.Dgebrd(m, m, a, lda, s, work[ie:itauq], + work[itauq:itaup], work[itaup:iwork], work[iwork:], lwork-iwork) + if wantuo || wantuas { + impl.Dorgbr(lapack.GenerateQ, m, m, m, a, lda, + work[itauq:], work[iwork:], lwork-iwork) + } + iwork = ie + m + nru := 0 + if wantuo || wantuas { + nru = m + } + + // Perform bidiagonal QR iteration, computing left singular vectors + // of A in A if desired. + ok = impl.Dbdsqr(blas.Upper, m, 0, nru, 0, s, work[ie:], + work, 1, a, lda, work, 1, work[iwork:]) + + // If left singular vectors desired in U, copy them there. + if wantuas { + impl.Dlacpy(blas.All, m, m, a, lda, u, ldu) + } + } else if wantvo && wantun { + // Path 2t. + panic(noSVDO) + } else if wantvo && wantuas { + // Path 3t. + panic(noSVDO) + } else if wantvs { + if wantun { + // Path 4t. + if lwork >= m*m+max(4*m, bdspac) { + // Sufficient workspace for a fast algorithm. + ir := 0 + var ldworkr int + if lwork >= wrkbl+lda*m { + ldworkr = lda + } else { + ldworkr = m + } + itau := ir + ldworkr*m + iwork := itau + m + + // Compute A = L*Q. + impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) + + // Copy L to work[ir:], zeroing out above it. + impl.Dlacpy(blas.Lower, m, m, a, lda, work[ir:], ldworkr) + impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, work[ir+1:], ldworkr) + + // Generate Q in A. + impl.Dorglq(m, n, m, a, lda, work[itau:], work[iwork:], lwork-iwork) + ie := itau + itauq := ie + m + itaup := itauq + m + iwork = itaup + m + + // Bidiagonalize L in work[ir:]. + impl.Dgebrd(m, m, work[ir:], ldworkr, s, work[ie:], + work[itauq:], work[itaup:], work[iwork:], lwork-iwork) + + // Generate right vectors bidiagonalizing L in work[ir:]. + impl.Dorgbr(lapack.GeneratePT, m, m, m, work[ir:], ldworkr, + work[itaup:], work[iwork:], lwork-iwork) + iwork = ie + m + + // Perform bidiagonal QR iteration, computing right singular + // vectors of L in work[ir:]. + ok = impl.Dbdsqr(blas.Upper, m, m, 0, 0, s, work[ie:], + work[ir:], ldworkr, work, 1, work, 1, work[iwork:]) + + // Multiply right singular vectors of L in work[ir:] by + // Q in A, storing result in VT. + bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, m, 1, + work[ir:], ldworkr, a, lda, 0, vt, ldvt) + } else { + // Insufficient workspace for a fast algorithm. + itau := 0 + iwork := itau + m + + // Compute A = L*Q. + impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) + + // Copy result to VT. + impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt) + + // Generate Q in VT. + impl.Dorglq(m, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork) + ie := itau + itauq := ie + m + itaup := itauq + m + iwork = itaup + m + + // Zero out above L in A. + impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, a[1:], lda) + + // Bidiagonalize L in A. + impl.Dgebrd(m, m, a, lda, s, work[ie:], + work[itauq:], work[itaup:], work[iwork:], lwork-iwork) + + // Multiply right vectors bidiagonalizing L by Q in VT. + impl.Dormbr(lapack.ApplyP, blas.Left, blas.Trans, m, n, m, + a, lda, work[itaup:], vt, ldvt, work[iwork:], lwork-iwork) + iwork = ie + m + + // Perform bidiagonal QR iteration, computing right + // singular vectors of A in VT. + ok = impl.Dbdsqr(blas.Upper, m, n, 0, 0, s, work[ie:], + vt, ldvt, work, 1, work, 1, work[iwork:]) + } + } else if wantuo { + // Path 5t. + panic(noSVDO) + } else if wantuas { + // Path 6t. 
+				if lwork >= m*m+max(4*m, bdspac) {
+					// Sufficient workspace for a fast algorithm.
+					iu := 0
+					var ldworku int
+					if lwork >= wrkbl+lda*m {
+						ldworku = lda
+					} else {
+						ldworku = m
+					}
+					itau := iu + ldworku*m
+					iwork := itau + m
+
+					// Compute A = L*Q.
+					impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
+
+					// Copy L to work[iu:], zeroing out above it.
+					impl.Dlacpy(blas.Lower, m, m, a, lda, work[iu:], ldworku)
+					impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, work[iu+1:], ldworku)
+
+					// Generate Q in A.
+					impl.Dorglq(m, n, m, a, lda, work[itau:], work[iwork:], lwork-iwork)
+					ie := itau
+					itauq := ie + m
+					itaup := itauq + m
+					iwork = itaup + m
+
+					// Bidiagonalize L in work[iu:], copying result to U.
+					impl.Dgebrd(m, m, work[iu:], ldworku, s, work[ie:],
+						work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+					impl.Dlacpy(blas.Lower, m, m, work[iu:], ldworku, u, ldu)
+
+					// Generate right bidiagonalizing vectors in work[iu:].
+					impl.Dorgbr(lapack.GeneratePT, m, m, m, work[iu:], ldworku,
+						work[itaup:], work[iwork:], lwork-iwork)
+
+					// Generate left bidiagonalizing vectors in U.
+					impl.Dorgbr(lapack.GenerateQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
+					iwork = ie + m
+
+					// Perform bidiagonal QR iteration, computing left singular
+					// vectors of L in U and computing right singular vectors of
+					// L in work[iu:].
+					ok = impl.Dbdsqr(blas.Upper, m, m, m, 0, s, work[ie:],
+						work[iu:], ldworku, u, ldu, work, 1, work[iwork:])
+
+					// Multiply right singular vectors of L in work[iu:] by
+					// Q in A, storing result in VT.
+					bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, m, 1,
+						work[iu:], ldworku, a, lda, 0, vt, ldvt)
+				} else {
+					// Insufficient workspace for a fast algorithm.
+					itau := 0
+					iwork := itau + m
+
+					// Compute A = L*Q, copying result to VT.
+					impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
+					impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)
+
+					// Generate Q in VT.
+					impl.Dorglq(m, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork)
+
+					// Copy L to U, zeroing out above it.
+					impl.Dlacpy(blas.Lower, m, m, a, lda, u, ldu)
+					impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, u[1:], ldu)
+
+					ie := itau
+					itauq := ie + m
+					itaup := itauq + m
+					iwork = itaup + m
+
+					// Bidiagonalize L in U.
+					impl.Dgebrd(m, m, u, ldu, s, work[ie:],
+						work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+
+					// Multiply right bidiagonalizing vectors in U by Q in VT.
+					impl.Dormbr(lapack.ApplyP, blas.Left, blas.Trans, m, n, m,
+						u, ldu, work[itaup:], vt, ldvt, work[iwork:], lwork-iwork)
+
+					// Generate left bidiagonalizing vectors in U.
+					impl.Dorgbr(lapack.GenerateQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
+					iwork = ie + m
+
+					// Perform bidiagonal QR iteration, computing left singular
+					// vectors of A in U and computing right singular vectors
+					// of A in VT.
+					ok = impl.Dbdsqr(blas.Upper, m, n, m, 0, s, work[ie:], vt, ldvt,
+						u, ldu, work, 1, work[iwork:])
+				}
+			}
+		} else if wantva {
+			if wantun {
+				// Path 7t.
+				if lwork >= m*m+max(max(n+m, 4*m), bdspac) {
+					// Sufficient workspace for a fast algorithm.
+					ir := 0
+					var ldworkr int
+					if lwork >= wrkbl+lda*m {
+						ldworkr = lda
+					} else {
+						ldworkr = m
+					}
+					itau := ir + ldworkr*m
+					iwork := itau + m
+
+					// Compute A = L*Q, copying result to VT.
+					impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
+					impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)
+
+					// Copy L to work[ir:], zeroing out above it.
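+					// (Dlacpy below takes the lower triangle of A; the following Dlaset clears the strictly upper part of the workspace copy.)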
+ impl.Dlacpy(blas.Lower, m, m, a, lda, work[ir:], ldworkr) + impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, work[ir+1:], ldworkr) + + // Generate Q in VT. + impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork) + + ie := itau + itauq := ie + m + itaup := itauq + m + iwork = itaup + m + + // Bidiagonalize L in work[ir:]. + impl.Dgebrd(m, m, work[ir:], ldworkr, s, work[ie:], + work[itauq:], work[itaup:], work[iwork:], lwork-iwork) + + // Generate right bidiagonalizing vectors in work[ir:]. + impl.Dorgbr(lapack.GeneratePT, m, m, m, work[ir:], ldworkr, + work[itaup:], work[iwork:], lwork-iwork) + iwork = ie + m + + // Perform bidiagonal QR iteration, computing right + // singular vectors of L in work[ir:]. + ok = impl.Dbdsqr(blas.Upper, m, m, 0, 0, s, work[ie:], + work[ir:], ldworkr, work, 1, work, 1, work[iwork:]) + + // Multiply right singular vectors of L in work[ir:] by + // Q in VT, storing result in A. + bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, m, 1, + work[ir:], ldworkr, vt, ldvt, 0, a, lda) + + // Copy right singular vectors of A from A to VT. + impl.Dlacpy(blas.All, m, n, a, lda, vt, ldvt) + } else { + // Insufficient workspace for a fast algorithm. + itau := 0 + iwork := itau + m + // Compute A = L * Q, copying result to VT. + impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) + impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt) + + // Generate Q in VT. + impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork) + + ie := itau + itauq := ie + m + itaup := itauq + m + iwork = itaup + m + + // Zero out above L in A. + impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, a[1:], lda) + + // Bidiagonalize L in A. + impl.Dgebrd(m, m, a, lda, s, work[ie:], work[itauq:], + work[itaup:], work[iwork:], lwork-iwork) + + // Multiply right bidiagonalizing vectors in A by Q in VT. + impl.Dormbr(lapack.ApplyP, blas.Left, blas.Trans, m, n, m, + a, lda, work[itaup:], vt, ldvt, work[iwork:], lwork-iwork) + iwork = ie + m + + // Perform bidiagonal QR iteration, computing right singular + // vectors of A in VT. + ok = impl.Dbdsqr(blas.Upper, m, n, 0, 0, s, work[ie:], + vt, ldvt, work, 1, work, 1, work[iwork:]) + } + } else if wantuo { + panic(noSVDO) + } else if wantuas { + // Path 9t. + if lwork >= m*m+max(max(m+n, 4*m), bdspac) { + // Sufficient workspace for a fast algorithm. + iu := 0 + + var ldworku int + if lwork >= wrkbl+lda*m { + ldworku = lda + } else { + ldworku = m + } + itau := iu + ldworku*m + iwork := itau + m + + // Generate A = L * Q copying result to VT. + impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork) + impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt) + + // Generate Q in VT. + impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork) + + // Copy L to work[iu:], zeroing out above it. + impl.Dlacpy(blas.Lower, m, m, a, lda, work[iu:], ldworku) + impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, work[iu+1:], ldworku) + ie = itau + itauq := ie + m + itaup := itauq + m + iwork = itaup + m + + // Bidiagonalize L in work[iu:], copying result to U. + impl.Dgebrd(m, m, work[iu:], ldworku, s, work[ie:], + work[itauq:], work[itaup:], work[iwork:], lwork-iwork) + impl.Dlacpy(blas.Lower, m, m, work[iu:], ldworku, u, ldu) + + // Generate right bidiagonalizing vectors in work[iu:]. + impl.Dorgbr(lapack.GeneratePT, m, m, m, work[iu:], ldworku, + work[itaup:], work[iwork:], lwork-iwork) + + // Generate left bidiagonalizing vectors in U. 
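+					// (lapack.GenerateQ forms Q from the left reflectors produced by Dgebrd; the GeneratePT call above formed Pᵀ from the right ones.)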
+					impl.Dorgbr(lapack.GenerateQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
+					iwork = ie + m
+
+					// Perform bidiagonal QR iteration, computing left singular
+					// vectors of L in U and computing right singular vectors
+					// of L in work[iu:].
+					ok = impl.Dbdsqr(blas.Upper, m, m, m, 0, s, work[ie:],
+						work[iu:], ldworku, u, ldu, work, 1, work[iwork:])
+
+					// Multiply right singular vectors of L in work[iu:] by
+					// Q in VT, storing result in A.
+					bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, m, 1,
+						work[iu:], ldworku, vt, ldvt, 0, a, lda)
+
+					// Copy right singular vectors of A from A to VT.
+					impl.Dlacpy(blas.All, m, n, a, lda, vt, ldvt)
+				} else {
+					// Insufficient workspace for a fast algorithm.
+					itau := 0
+					iwork := itau + m
+
+					// Compute A = L * Q, copying result to VT.
+					impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
+					impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)
+
+					// Generate Q in VT.
+					impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork)
+
+					// Copy L to U, zeroing out above it.
+					impl.Dlacpy(blas.Lower, m, m, a, lda, u, ldu)
+					impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, u[1:], ldu)
+
+					ie = itau
+					itauq := ie + m
+					itaup := itauq + m
+					iwork = itaup + m
+
+					// Bidiagonalize L in U.
+					impl.Dgebrd(m, m, u, ldu, s, work[ie:], work[itauq:],
+						work[itaup:], work[iwork:], lwork-iwork)
+
+					// Multiply right bidiagonalizing vectors in U by Q in VT.
+					impl.Dormbr(lapack.ApplyP, blas.Left, blas.Trans, m, n, m,
+						u, ldu, work[itaup:], vt, ldvt, work[iwork:], lwork-iwork)
+
+					// Generate left bidiagonalizing vectors in U.
+					impl.Dorgbr(lapack.GenerateQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
+					iwork = ie + m
+
+					// Perform bidiagonal QR iteration, computing left singular
+					// vectors of A in U and computing right singular vectors
+					// of A in VT.
+					ok = impl.Dbdsqr(blas.Upper, m, n, m, 0, s, work[ie:],
+						vt, ldvt, u, ldu, work, 1, work[iwork:])
+				}
+			}
+		}
+	} else {
+		// Path 10t.
+		// N at least M, but not much larger.
+		ie = 0
+		itauq := ie + m
+		itaup := itauq + m
+		iwork := itaup + m
+
+		// Bidiagonalize A.
+		impl.Dgebrd(m, n, a, lda, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+		if wantuas {
+			// If left singular vectors desired in U, copy result to U and
+			// generate left bidiagonalizing vectors in U.
+			impl.Dlacpy(blas.Lower, m, m, a, lda, u, ldu)
+			impl.Dorgbr(lapack.GenerateQ, m, m, n, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
+		}
+		if wantvas {
+			// If right singular vectors desired in VT, copy result to VT
+			// and generate right bidiagonalizing vectors in VT.
+			impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)
+			var nrvt int
+			if wantva {
+				nrvt = n
+			} else {
+				nrvt = m
+			}
+			impl.Dorgbr(lapack.GeneratePT, nrvt, n, m, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork)
+		}
+		if wantuo {
+			panic(noSVDO)
+		}
+		if wantvo {
+			panic(noSVDO)
+		}
+		iwork = ie + m
+		var nru, ncvt int
+		if wantuas || wantuo {
+			nru = m
+		}
+		if wantvas || wantvo {
+			ncvt = n
+		}
+		if !wantuo && !wantvo {
+			// Perform bidiagonal QR iteration, if desired, computing left
+			// singular vectors in U and computing right singular vectors in
+			// VT.
+			ok = impl.Dbdsqr(blas.Lower, m, ncvt, nru, 0, s, work[ie:],
+				vt, ldvt, u, ldu, work, 1, work[iwork:])
+		} else {
+			// There will be two branches when the implementation is complete.
+ panic(noSVDO) + } + } + } + if !ok { + if ie > 1 { + for i := 0; i < minmn-1; i++ { + work[i+1] = work[i+ie] + } + } + if ie < 1 { + for i := minmn - 2; i >= 0; i-- { + work[i+1] = work[i+ie] + } + } + } + // Undo scaling if necessary. + if iscl { + if anrm > bignum { + impl.Dlascl(lapack.General, 0, 0, bignum, anrm, 1, minmn, s, minmn) + } + if !ok && anrm > bignum { + impl.Dlascl(lapack.General, 0, 0, bignum, anrm, 1, minmn-1, work[1:], minmn) + } + if anrm < smlnum { + impl.Dlascl(lapack.General, 0, 0, smlnum, anrm, 1, minmn, s, minmn) + } + if !ok && anrm < smlnum { + impl.Dlascl(lapack.General, 0, 0, smlnum, anrm, 1, minmn-1, work[1:], minmn) + } + } + work[0] = float64(maxwrk) + return ok +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgetc2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetc2.go new file mode 100644 index 0000000000..41203e9fa2 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetc2.go @@ -0,0 +1,125 @@ +// Copyright ©2021 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas/blas64" +) + +// Dgetc2 computes an LU factorization with complete pivoting of the n×n matrix +// A. The factorization has the form +// +// A = P * L * U * Q, +// +// where P and Q are permutation matrices, L is lower triangular with unit +// diagonal elements and U is upper triangular. +// +// On entry, a contains the matrix A to be factored. On return, a is overwritten +// with the factors L and U. The unit diagonal elements of L are not stored. +// +// On return, ipiv and jpiv contain the pivot indices: row i has been +// interchanged with row ipiv[i] and column j has been interchanged with column +// jpiv[j]. ipiv and jpiv must have length n, otherwise Dgetc2 will panic. +// +// If k is non-negative, then U[k,k] is likely to produce overflow when solving +// for x in A*x=b and U has been perturbed to avoid the overflow. +// +// Dgetc2 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dgetc2(n int, a []float64, lda int, ipiv, jpiv []int) (k int) { + switch { + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + // Negative k indicates U was not perturbed. + k = -1 + + // Quick return if possible. + if n == 0 { + return k + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case len(ipiv) != n: + panic(badLenIpiv) + case len(jpiv) != n: + panic(badLenJpvt) + } + + const ( + eps = dlamchP + smlnum = dlamchS / eps + ) + + if n == 1 { + ipiv[0], jpiv[0] = 0, 0 + if math.Abs(a[0]) < smlnum { + k = 0 + a[0] = smlnum + } + return k + } + + // Factorize A using complete pivoting. + // Set pivots less than smin to smin. + var smin float64 + var ipv, jpv int + bi := blas64.Implementation() + for i := 0; i < n-1; i++ { + var xmax float64 + for ip := i; ip < n; ip++ { + for jp := i; jp < n; jp++ { + if math.Abs(a[ip*lda+jp]) >= xmax { + xmax = math.Abs(a[ip*lda+jp]) + ipv = ip + jpv = jp + } + } + } + if i == 0 { + smin = math.Max(eps*xmax, smlnum) + } + + // Swap rows. + if ipv != i { + bi.Dswap(n, a[ipv*lda:], 1, a[i*lda:], 1) + } + ipiv[i] = ipv + + // Swap columns. + if jpv != i { + bi.Dswap(n, a[jpv:], lda, a[i:], lda) + } + jpiv[i] = jpv + + // Check for singularity. 
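+		// (A pivot smaller than smin is replaced by smin, so the column scaling below and the later solves with U cannot overflow; k records the perturbed index.)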
+		if math.Abs(a[i*lda+i]) < smin {
+			k = i
+			a[i*lda+i] = smin
+		}
+
+		for j := i + 1; j < n; j++ {
+			a[j*lda+i] /= a[i*lda+i]
+		}
+		bi.Dger(n-i-1, n-i-1, -1, a[(i+1)*lda+i:], lda, a[i*lda+i+1:], 1, a[(i+1)*lda+i+1:], lda)
+	}
+
+	if math.Abs(a[(n-1)*lda+n-1]) < smin {
+		k = n - 1
+		a[(n-1)*lda+(n-1)] = smin
+	}
+
+	// Set last pivots to last index.
+	ipiv[n-1] = n - 1
+	jpiv[n-1] = n - 1
+
+	return k
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgetf2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetf2.go
new file mode 100644
index 0000000000..6a7003cf31
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetf2.go
@@ -0,0 +1,90 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dgetf2 computes the LU decomposition of an m×n matrix A using partial
+// pivoting with row interchanges.
+//
+// The LU decomposition is a factorization of A into
+//
+//	A = P * L * U
+//
+// where P is a permutation matrix, L is a lower triangular with unit diagonal
+// elements (lower trapezoidal if m > n), and U is upper triangular (upper
+// trapezoidal if m < n).
+//
+// On entry, a contains the matrix A. On return, L and U are stored in place
+// into a, and P is represented by ipiv.
+//
+// ipiv contains a sequence of row interchanges. It indicates that row i of the
+// matrix was interchanged with ipiv[i]. ipiv must have length min(m,n), and
+// Dgetf2 will panic otherwise. ipiv is zero-indexed.
+//
+// Dgetf2 returns whether the matrix A is nonsingular. The LU decomposition will
+// be computed regardless of the singularity of A, but the result should not be
+// used to solve a system of equations.
+//
+// Dgetf2 is an internal routine. It is exported for testing purposes.
+func (Implementation) Dgetf2(m, n int, a []float64, lda int, ipiv []int) (ok bool) {
+	mn := min(m, n)
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if mn == 0 {
+		return true
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(ipiv) != mn:
+		panic(badLenIpiv)
+	}
+
+	bi := blas64.Implementation()
+
+	sfmin := dlamchS
+	ok = true
+	for j := 0; j < mn; j++ {
+		// Find a pivot and test for singularity.
+		jp := j + bi.Idamax(m-j, a[j*lda+j:], lda)
+		ipiv[j] = jp
+		if a[jp*lda+j] == 0 {
+			ok = false
+		} else {
+			// Swap the rows if necessary.
+			if jp != j {
+				bi.Dswap(n, a[j*lda:], 1, a[jp*lda:], 1)
+			}
+			if j < m-1 {
+				aj := a[j*lda+j]
+				if math.Abs(aj) >= sfmin {
+					bi.Dscal(m-j-1, 1/aj, a[(j+1)*lda+j:], lda)
+				} else {
+					for i := 0; i < m-j-1; i++ {
+						a[(j+i+1)*lda+j] = a[(j+i+1)*lda+j] / a[lda*j+j]
+					}
+				}
+			}
+		}
+		if j < mn-1 {
+			bi.Dger(m-j-1, n-j-1, -1, a[(j+1)*lda+j:], lda, a[j*lda+j+1:], 1, a[(j+1)*lda+j+1:], lda)
+		}
+	}
+	return ok
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgetrf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetrf.go
new file mode 100644
index 0000000000..38ae8efa14
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetrf.go
@@ -0,0 +1,89 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dgetrf computes the LU decomposition of an m×n matrix A using partial
+// pivoting with row interchanges.
+//
+// The LU decomposition is a factorization of A into
+//
+//	A = P * L * U
+//
+// where P is a permutation matrix, L is a lower triangular with unit diagonal
+// elements (lower trapezoidal if m > n), and U is upper triangular (upper
+// trapezoidal if m < n).
+//
+// On entry, a contains the matrix A. On return, L and U are stored in place
+// into a, and P is represented by ipiv.
+//
+// ipiv contains a sequence of row interchanges. It indicates that row i of the
+// matrix was interchanged with ipiv[i]. ipiv must have length min(m,n), and
+// Dgetrf will panic otherwise. ipiv is zero-indexed.
+//
+// Dgetrf returns whether the matrix A is nonsingular. The LU decomposition will
+// be computed regardless of the singularity of A, but the result should not be
+// used to solve a system of equations.
+func (impl Implementation) Dgetrf(m, n int, a []float64, lda int, ipiv []int) (ok bool) {
+	mn := min(m, n)
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if mn == 0 {
+		return true
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(ipiv) != mn:
+		panic(badLenIpiv)
+	}
+
+	bi := blas64.Implementation()
+
+	nb := impl.Ilaenv(1, "DGETRF", " ", m, n, -1, -1)
+	if nb <= 1 || mn <= nb {
+		// Use the unblocked algorithm.
+		return impl.Dgetf2(m, n, a, lda, ipiv)
+	}
+	ok = true
+	for j := 0; j < mn; j += nb {
+		jb := min(mn-j, nb)
+		blockOk := impl.Dgetf2(m-j, jb, a[j*lda+j:], lda, ipiv[j:j+jb])
+		if !blockOk {
+			ok = false
+		}
+		for i := j; i <= min(m-1, j+jb-1); i++ {
+			ipiv[i] = j + ipiv[i]
+		}
+		impl.Dlaswp(j, a, lda, j, j+jb-1, ipiv[:j+jb], 1)
+		if j+jb < n {
+			impl.Dlaswp(n-j-jb, a[j+jb:], lda, j, j+jb-1, ipiv[:j+jb], 1)
+			bi.Dtrsm(blas.Left, blas.Lower, blas.NoTrans, blas.Unit,
+				jb, n-j-jb, 1,
+				a[j*lda+j:], lda,
+				a[j*lda+j+jb:], lda)
+			if j+jb < m {
+				bi.Dgemm(blas.NoTrans, blas.NoTrans, m-j-jb, n-j-jb, jb, -1,
+					a[(j+jb)*lda+j:], lda,
+					a[j*lda+j+jb:], lda,
+					1, a[(j+jb)*lda+j+jb:], lda)
+			}
+		}
+	}
+	return ok
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgetri.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetri.go
new file mode 100644
index 0000000000..b2f2ae46b9
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetri.go
@@ -0,0 +1,116 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dgetri computes the inverse of the matrix A using the LU factorization computed
+// by Dgetrf. On entry, a contains the PLU decomposition of A as computed by
+// Dgetrf and on exit contains the inverse of the original matrix.
+//
+// Dgetri will not perform the inversion if the matrix is singular, and returns
+// a boolean indicating whether the inversion was successful.
+//
+// work is temporary storage, and lwork specifies the usable memory length.
+// At minimum, lwork >= n and this function will panic otherwise.
+// Dgetri is a blocked inversion, but the block size is limited
+// by the temporary space available. If lwork == -1, instead of performing Dgetri,
+// the optimal work length will be stored into work[0].
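+//
+// A minimal call sequence (an illustrative sketch) queries the optimal
+// workspace with lwork == -1 and then inverts in place:
+//
+//	work := make([]float64, 1)
+//	impl.Dgetri(n, a, lda, ipiv, work, -1) // workspace query only
+//	lwork := int(work[0])
+//	work = make([]float64, lwork)
+//	ok := impl.Dgetri(n, a, lda, ipiv, work, lwork) // a holds inv(A) if ok is true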
+func (impl Implementation) Dgetri(n int, a []float64, lda int, ipiv []int, work []float64, lwork int) (ok bool) {
+	iws := max(1, n)
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case lwork < iws && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	if n == 0 {
+		work[0] = 1
+		return true
+	}
+
+	nb := impl.Ilaenv(1, "DGETRI", " ", n, -1, -1, -1)
+	if lwork == -1 {
+		work[0] = float64(n * nb)
+		return true
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(ipiv) != n:
+		panic(badLenIpiv)
+	}
+
+	// Form inv(U).
+	ok = impl.Dtrtri(blas.Upper, blas.NonUnit, n, a, lda)
+	if !ok {
+		return false
+	}
+
+	nbmin := 2
+	if 1 < nb && nb < n {
+		iws = max(n*nb, 1)
+		if lwork < iws {
+			nb = lwork / n
+			nbmin = max(2, impl.Ilaenv(2, "DGETRI", " ", n, -1, -1, -1))
+		}
+	}
+	ldwork := nb
+
+	bi := blas64.Implementation()
+	// Solve the equation inv(A)*L = inv(U) for inv(A).
+	// TODO(btracey): Replace this with a more row-major oriented algorithm.
+	if nb < nbmin || n <= nb {
+		// Unblocked code.
+		for j := n - 1; j >= 0; j-- {
+			for i := j + 1; i < n; i++ {
+				// Copy current column of L to work and replace with zeros.
+				work[i] = a[i*lda+j]
+				a[i*lda+j] = 0
+			}
+			// Compute current column of inv(A).
+			if j < n-1 {
+				bi.Dgemv(blas.NoTrans, n, n-j-1, -1, a[(j+1):], lda, work[(j+1):], 1, 1, a[j:], lda)
+			}
+		}
+	} else {
+		// Blocked code.
+		nn := ((n - 1) / nb) * nb
+		for j := nn; j >= 0; j -= nb {
+			jb := min(nb, n-j)
+			// Copy current block column of L to work and replace
+			// with zeros.
+			for jj := j; jj < j+jb; jj++ {
+				for i := jj + 1; i < n; i++ {
+					work[i*ldwork+(jj-j)] = a[i*lda+jj]
+					a[i*lda+jj] = 0
+				}
+			}
+			// Compute current block column of inv(A).
+			if j+jb < n {
+				bi.Dgemm(blas.NoTrans, blas.NoTrans, n, jb, n-j-jb, -1, a[(j+jb):], lda, work[(j+jb)*ldwork:], ldwork, 1, a[j:], lda)
+			}
+			bi.Dtrsm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, n, jb, 1, work[j*ldwork:], ldwork, a[j:], lda)
+		}
+	}
+	// Apply column interchanges.
+	for j := n - 2; j >= 0; j-- {
+		jp := ipiv[j]
+		if jp != j {
+			bi.Dswap(n, a[j:], lda, a[jp:], lda)
+		}
+	}
+	work[0] = float64(iws)
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgetrs.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetrs.go
new file mode 100644
index 0000000000..35b33aa7d7
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetrs.go
@@ -0,0 +1,74 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dgetrs solves a system of equations using an LU factorization.
+// The system of equations solved is
+//
+//	A * X = B   if trans == blas.NoTrans,
+//	Aᵀ * X = B  if trans == blas.Trans or blas.ConjTrans.
+//
+// A is a general n×n matrix with stride lda. B is a general matrix of size n×nrhs.
+//
+// On entry b contains the elements of the matrix B. On exit, b contains the
+// elements of X, the solution to the system of equations.
+//
+// a and ipiv contain the LU factorization of A and the permutation indices as
+// computed by Dgetrf. ipiv is zero-indexed.
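+//
+// A typical pairing (an illustrative sketch) factors once with Dgetrf and
+// then solves; the solution is only meaningful when ok is true:
+//
+//	ipiv := make([]int, n)
+//	ok := impl.Dgetrf(n, n, a, lda, ipiv)
+//	if ok {
+//		impl.Dgetrs(blas.NoTrans, n, nrhs, a, lda, ipiv, b, ldb) // b now holds X
+//	}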
+func (impl Implementation) Dgetrs(trans blas.Transpose, n, nrhs int, a []float64, lda int, ipiv []int, b []float64, ldb int) { + switch { + case trans != blas.NoTrans && trans != blas.Trans && trans != blas.ConjTrans: + panic(badTrans) + case n < 0: + panic(nLT0) + case nrhs < 0: + panic(nrhsLT0) + case lda < max(1, n): + panic(badLdA) + case ldb < max(1, nrhs): + panic(badLdB) + } + + // Quick return if possible. + if n == 0 || nrhs == 0 { + return + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case len(b) < (n-1)*ldb+nrhs: + panic(shortB) + case len(ipiv) != n: + panic(badLenIpiv) + } + + bi := blas64.Implementation() + + if trans == blas.NoTrans { + // Solve A * X = B. + impl.Dlaswp(nrhs, b, ldb, 0, n-1, ipiv, 1) + // Solve L * X = B, updating b. + bi.Dtrsm(blas.Left, blas.Lower, blas.NoTrans, blas.Unit, + n, nrhs, 1, a, lda, b, ldb) + // Solve U * X = B, updating b. + bi.Dtrsm(blas.Left, blas.Upper, blas.NoTrans, blas.NonUnit, + n, nrhs, 1, a, lda, b, ldb) + return + } + // Solve Aᵀ * X = B. + // Solve Uᵀ * X = B, updating b. + bi.Dtrsm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, + n, nrhs, 1, a, lda, b, ldb) + // Solve Lᵀ * X = B, updating b. + bi.Dtrsm(blas.Left, blas.Lower, blas.Trans, blas.Unit, + n, nrhs, 1, a, lda, b, ldb) + impl.Dlaswp(nrhs, b, ldb, 0, n-1, ipiv, -1) +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgghrd.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgghrd.go new file mode 100644 index 0000000000..c9d6b4d1b3 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgghrd.go @@ -0,0 +1,125 @@ +// Copyright ©2023 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" +) + +// Dgghrd reduces a pair of real matrices (A,B) to generalized upper Hessenberg +// form using orthogonal transformations, where A is a general matrix and B is +// upper triangular. +// +// This subroutine simultaneously reduces A to a Hessenberg matrix H +// +// Qᵀ*A*Z = H, +// +// and transforms B to another upper triangular matrix T +// +// Qᵀ*B*Z = T. +// +// The orthogonal matrices Q and Z are determined as products of Givens +// rotations. They may either be formed explicitly (lapack.OrthoExplicit), or +// they may be postmultiplied into input matrices Q1 and Z1 +// (lapack.OrthoPostmul), so that +// +// Q1 * A * Z1ᵀ = (Q1*Q) * H * (Z1*Z)ᵀ, +// Q1 * B * Z1ᵀ = (Q1*Q) * T * (Z1*Z)ᵀ. +// +// ilo and ihi determine the block of A that will be reduced. It must hold that +// +// - 0 <= ilo <= ihi < n if n > 0, +// - ilo == 0 and ihi == -1 if n == 0, +// +// otherwise Dgghrd will panic. +// +// Dgghrd is an internal routine. It is exported for testing purposes. 
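+//
+// For example (an illustrative sketch), reducing the whole pencil with
+// explicitly formed Q and Z:
+//
+//	impl.Dgghrd(lapack.OrthoExplicit, lapack.OrthoExplicit, n, 0, n-1,
+//		a, lda, b, ldb, q, ldq, z, ldz)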
+func (impl Implementation) Dgghrd(compq, compz lapack.OrthoComp, n, ilo, ihi int, a []float64, lda int, b []float64, ldb int, q []float64, ldq int, z []float64, ldz int) { + switch { + case compq != lapack.OrthoNone && compq != lapack.OrthoExplicit && compq != lapack.OrthoPostmul: + panic(badOrthoComp) + case compz != lapack.OrthoNone && compz != lapack.OrthoExplicit && compz != lapack.OrthoPostmul: + panic(badOrthoComp) + case n < 0: + panic(nLT0) + case ilo < 0 || max(0, n-1) < ilo: + panic(badIlo) + case ihi < min(ilo, n-1) || n <= ihi: + panic(badIhi) + case lda < max(1, n): + panic(badLdA) + case ldb < max(1, n): + panic(badLdB) + case (compq != lapack.OrthoNone && ldq < n) || ldq < 1: + panic(badLdQ) + case (compz != lapack.OrthoNone && ldz < n) || ldz < 1: + panic(badLdZ) + } + + // Quick return if possible. + if n == 0 { + return + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case len(b) < (n-1)*ldb+n: + panic(shortB) + case compq != lapack.OrthoNone && len(q) < (n-1)*ldq+n: + panic(shortQ) + case compz != lapack.OrthoNone && len(z) < (n-1)*ldz+n: + panic(shortZ) + } + + if compq == lapack.OrthoExplicit { + impl.Dlaset(blas.All, n, n, 0, 1, q, ldq) + } + if compz == lapack.OrthoExplicit { + impl.Dlaset(blas.All, n, n, 0, 1, z, ldz) + } + + // Quick return if possible. + if n == 1 { + return + } + + // Zero out lower triangle of B. + for i := 1; i < n; i++ { + for j := 0; j < i; j++ { + b[i*ldb+j] = 0 + } + } + bi := blas64.Implementation() + // Reduce A and B. + for jcol := ilo; jcol <= ihi-2; jcol++ { + for jrow := ihi; jrow >= jcol+2; jrow-- { + // Step 1: rotate rows jrow-1, jrow to kill A[jrow,jcol]. + var c, s float64 + c, s, a[(jrow-1)*lda+jcol] = impl.Dlartg(a[(jrow-1)*lda+jcol], a[jrow*lda+jcol]) + a[jrow*lda+jcol] = 0 + + bi.Drot(n-jcol-1, a[(jrow-1)*lda+jcol+1:], 1, a[jrow*lda+jcol+1:], 1, c, s) + bi.Drot(n+2-jrow-1, b[(jrow-1)*ldb+jrow-1:], 1, b[jrow*ldb+jrow-1:], 1, c, s) + + if compq != lapack.OrthoNone { + bi.Drot(n, q[jrow-1:], ldq, q[jrow:], ldq, c, s) + } + + // Step 2: rotate columns jrow, jrow-1 to kill B[jrow,jrow-1]. + c, s, b[jrow*ldb+jrow] = impl.Dlartg(b[jrow*ldb+jrow], b[jrow*ldb+jrow-1]) + b[jrow*ldb+jrow-1] = 0 + + bi.Drot(ihi+1, a[jrow:], lda, a[jrow-1:], lda, c, s) + bi.Drot(jrow, b[jrow:], ldb, b[jrow-1:], ldb, c, s) + + if compz != lapack.OrthoNone { + bi.Drot(n, z[jrow:], ldz, z[jrow-1:], ldz, c, s) + } + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dggsvd3.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dggsvd3.go new file mode 100644 index 0000000000..cfe10efa9d --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dggsvd3.go @@ -0,0 +1,258 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" +) + +// Dggsvd3 computes the generalized singular value decomposition (GSVD) +// of an m×n matrix A and p×n matrix B: +// +// Uᵀ*A*Q = D1*[ 0 R ] +// +// Vᵀ*B*Q = D2*[ 0 R ] +// +// where U, V and Q are orthogonal matrices. +// +// Dggsvd3 returns k and l, the dimensions of the sub-blocks. k+l +// is the effective numerical rank of the (m+p)×n matrix [ Aᵀ Bᵀ ]ᵀ. 
+// R is a (k+l)×(k+l) nonsingular upper triangular matrix, D1 and +// D2 are m×(k+l) and p×(k+l) diagonal matrices and of the following +// structures, respectively: +// +// If m-k-l >= 0, +// +// k l +// D1 = k [ I 0 ] +// l [ 0 C ] +// m-k-l [ 0 0 ] +// +// k l +// D2 = l [ 0 S ] +// p-l [ 0 0 ] +// +// n-k-l k l +// [ 0 R ] = k [ 0 R11 R12 ] k +// l [ 0 0 R22 ] l +// +// where +// +// C = diag( alpha_k, ... , alpha_{k+l} ), +// S = diag( beta_k, ... , beta_{k+l} ), +// C^2 + S^2 = I. +// +// R is stored in +// +// A[0:k+l, n-k-l:n] +// +// on exit. +// +// If m-k-l < 0, +// +// k m-k k+l-m +// D1 = k [ I 0 0 ] +// m-k [ 0 C 0 ] +// +// k m-k k+l-m +// D2 = m-k [ 0 S 0 ] +// k+l-m [ 0 0 I ] +// p-l [ 0 0 0 ] +// +// n-k-l k m-k k+l-m +// [ 0 R ] = k [ 0 R11 R12 R13 ] +// m-k [ 0 0 R22 R23 ] +// k+l-m [ 0 0 0 R33 ] +// +// where +// +// C = diag( alpha_k, ... , alpha_m ), +// S = diag( beta_k, ... , beta_m ), +// C^2 + S^2 = I. +// +// R = [ R11 R12 R13 ] is stored in A[1:m, n-k-l+1:n] +// [ 0 R22 R23 ] +// +// and R33 is stored in +// +// B[m-k:l, n+m-k-l:n] on exit. +// +// Dggsvd3 computes C, S, R, and optionally the orthogonal transformation +// matrices U, V and Q. +// +// jobU, jobV and jobQ are options for computing the orthogonal matrices. The behavior +// is as follows +// +// jobU == lapack.GSVDU Compute orthogonal matrix U +// jobU == lapack.GSVDNone Do not compute orthogonal matrix. +// +// The behavior is the same for jobV and jobQ with the exception that instead of +// lapack.GSVDU these accept lapack.GSVDV and lapack.GSVDQ respectively. +// The matrices U, V and Q must be m×m, p×p and n×n respectively unless the +// relevant job parameter is lapack.GSVDNone. +// +// alpha and beta must have length n or Dggsvd3 will panic. On exit, alpha and +// beta contain the generalized singular value pairs of A and B +// +// alpha[0:k] = 1, +// beta[0:k] = 0, +// +// if m-k-l >= 0, +// +// alpha[k:k+l] = diag(C), +// beta[k:k+l] = diag(S), +// +// if m-k-l < 0, +// +// alpha[k:m]= C, alpha[m:k+l]= 0 +// beta[k:m] = S, beta[m:k+l] = 1. +// +// if k+l < n, +// +// alpha[k+l:n] = 0 and +// beta[k+l:n] = 0. +// +// On exit, iwork contains the permutation required to sort alpha descending. +// +// iwork must have length n, work must have length at least max(1, lwork), and +// lwork must be -1 or greater than n, otherwise Dggsvd3 will panic. If +// lwork is -1, work[0] holds the optimal lwork on return, but Dggsvd3 does +// not perform the GSVD. 
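+//
+// The workspace query follows the usual convention (an illustrative sketch):
+//
+//	work := make([]float64, 1)
+//	impl.Dggsvd3(jobU, jobV, jobQ, m, n, p, a, lda, b, ldb, alpha, beta,
+//		u, ldu, v, ldv, q, ldq, work, -1, iwork)
+//	work = make([]float64, int(work[0]))
+//	k, l, ok := impl.Dggsvd3(jobU, jobV, jobQ, m, n, p, a, lda, b, ldb,
+//		alpha, beta, u, ldu, v, ldv, q, ldq, work, len(work), iwork)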
+func (impl Implementation) Dggsvd3(jobU, jobV, jobQ lapack.GSVDJob, m, n, p int, a []float64, lda int, b []float64, ldb int, alpha, beta, u []float64, ldu int, v []float64, ldv int, q []float64, ldq int, work []float64, lwork int, iwork []int) (k, l int, ok bool) { + wantu := jobU == lapack.GSVDU + wantv := jobV == lapack.GSVDV + wantq := jobQ == lapack.GSVDQ + switch { + case !wantu && jobU != lapack.GSVDNone: + panic(badGSVDJob + "U") + case !wantv && jobV != lapack.GSVDNone: + panic(badGSVDJob + "V") + case !wantq && jobQ != lapack.GSVDNone: + panic(badGSVDJob + "Q") + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case p < 0: + panic(pLT0) + case lda < max(1, n): + panic(badLdA) + case ldb < max(1, n): + panic(badLdB) + case ldu < 1, wantu && ldu < m: + panic(badLdU) + case ldv < 1, wantv && ldv < p: + panic(badLdV) + case ldq < 1, wantq && ldq < n: + panic(badLdQ) + case len(iwork) < n: + panic(shortWork) + case lwork < 1 && lwork != -1: + panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + } + + // Determine optimal work length. + impl.Dggsvp3(jobU, jobV, jobQ, + m, p, n, + a, lda, + b, ldb, + 0, 0, + u, ldu, + v, ldv, + q, ldq, + iwork, + work, work, -1) + lwkopt := n + int(work[0]) + lwkopt = max(lwkopt, 2*n) + lwkopt = max(lwkopt, 1) + work[0] = float64(lwkopt) + if lwork == -1 { + return 0, 0, true + } + + switch { + case len(a) < (m-1)*lda+n: + panic(shortA) + case len(b) < (p-1)*ldb+n: + panic(shortB) + case wantu && len(u) < (m-1)*ldu+m: + panic(shortU) + case wantv && len(v) < (p-1)*ldv+p: + panic(shortV) + case wantq && len(q) < (n-1)*ldq+n: + panic(shortQ) + case len(alpha) != n: + panic(badLenAlpha) + case len(beta) != n: + panic(badLenBeta) + } + + // Compute the Frobenius norm of matrices A and B. + anorm := impl.Dlange(lapack.Frobenius, m, n, a, lda, nil) + bnorm := impl.Dlange(lapack.Frobenius, p, n, b, ldb, nil) + + // Get machine precision and set up threshold for determining + // the effective numerical rank of the matrices A and B. + tola := float64(max(m, n)) * math.Max(anorm, dlamchS) * dlamchP + tolb := float64(max(p, n)) * math.Max(bnorm, dlamchS) * dlamchP + + // Preprocessing. + k, l = impl.Dggsvp3(jobU, jobV, jobQ, + m, p, n, + a, lda, + b, ldb, + tola, tolb, + u, ldu, + v, ldv, + q, ldq, + iwork, + work[:n], work[n:], lwork-n) + + // Compute the GSVD of two upper "triangular" matrices. + _, ok = impl.Dtgsja(jobU, jobV, jobQ, + m, p, n, + k, l, + a, lda, + b, ldb, + tola, tolb, + alpha, beta, + u, ldu, + v, ldv, + q, ldq, + work) + + // Sort the singular values and store the pivot indices in iwork + // Copy alpha to work, then sort alpha in work. + bi := blas64.Implementation() + bi.Dcopy(n, alpha, 1, work[:n], 1) + ibnd := min(l, m-k) + for i := 0; i < ibnd; i++ { + // Scan for largest alpha_{k+i}. + isub := i + smax := work[k+i] + for j := i + 1; j < ibnd; j++ { + if v := work[k+j]; v > smax { + isub = j + smax = v + } + } + if isub != i { + work[k+isub] = work[k+i] + work[k+i] = smax + iwork[k+i] = k + isub + } else { + iwork[k+i] = k + i + } + } + + work[0] = float64(lwkopt) + + return k, l, ok +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dggsvp3.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dggsvp3.go new file mode 100644 index 0000000000..f7f04c764f --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dggsvp3.go @@ -0,0 +1,286 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/lapack" +) + +// Dggsvp3 computes orthogonal matrices U, V and Q such that +// +// n-k-l k l +// Uᵀ*A*Q = k [ 0 A12 A13 ] if m-k-l >= 0; +// l [ 0 0 A23 ] +// m-k-l [ 0 0 0 ] +// +// n-k-l k l +// Uᵀ*A*Q = k [ 0 A12 A13 ] if m-k-l < 0; +// m-k [ 0 0 A23 ] +// +// n-k-l k l +// Vᵀ*B*Q = l [ 0 0 B13 ] +// p-l [ 0 0 0 ] +// +// where the k×k matrix A12 and l×l matrix B13 are non-singular +// upper triangular. A23 is l×l upper triangular if m-k-l >= 0, +// otherwise A23 is (m-k)×l upper trapezoidal. +// +// Dggsvp3 returns k and l, the dimensions of the sub-blocks. k+l +// is the effective numerical rank of the (m+p)×n matrix [ Aᵀ Bᵀ ]ᵀ. +// +// jobU, jobV and jobQ are options for computing the orthogonal matrices. The behavior +// is as follows +// +// jobU == lapack.GSVDU Compute orthogonal matrix U +// jobU == lapack.GSVDNone Do not compute orthogonal matrix. +// +// The behavior is the same for jobV and jobQ with the exception that instead of +// lapack.GSVDU these accept lapack.GSVDV and lapack.GSVDQ respectively. +// The matrices U, V and Q must be m×m, p×p and n×n respectively unless the +// relevant job parameter is lapack.GSVDNone. +// +// tola and tolb are the convergence criteria for the Jacobi-Kogbetliantz +// iteration procedure. Generally, they are the same as used in the preprocessing +// step, for example, +// +// tola = max(m, n)*norm(A)*eps, +// tolb = max(p, n)*norm(B)*eps. +// +// Where eps is the machine epsilon. +// +// iwork must have length n, work must have length at least max(1, lwork), and +// lwork must be -1 or greater than zero, otherwise Dggsvp3 will panic. +// +// Dggsvp3 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dggsvp3(jobU, jobV, jobQ lapack.GSVDJob, m, p, n int, a []float64, lda int, b []float64, ldb int, tola, tolb float64, u []float64, ldu int, v []float64, ldv int, q []float64, ldq int, iwork []int, tau, work []float64, lwork int) (k, l int) { + wantu := jobU == lapack.GSVDU + wantv := jobV == lapack.GSVDV + wantq := jobQ == lapack.GSVDQ + switch { + case !wantu && jobU != lapack.GSVDNone: + panic(badGSVDJob + "U") + case !wantv && jobV != lapack.GSVDNone: + panic(badGSVDJob + "V") + case !wantq && jobQ != lapack.GSVDNone: + panic(badGSVDJob + "Q") + case m < 0: + panic(mLT0) + case p < 0: + panic(pLT0) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + case ldb < max(1, n): + panic(badLdB) + case ldu < 1, wantu && ldu < m: + panic(badLdU) + case ldv < 1, wantv && ldv < p: + panic(badLdV) + case ldq < 1, wantq && ldq < n: + panic(badLdQ) + case len(iwork) != n: + panic(shortWork) + case lwork < 1 && lwork != -1: + panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + } + + var lwkopt int + impl.Dgeqp3(p, n, b, ldb, iwork, tau, work, -1) + lwkopt = int(work[0]) + if wantv { + lwkopt = max(lwkopt, p) + } + lwkopt = max(lwkopt, min(n, p)) + lwkopt = max(lwkopt, m) + if wantq { + lwkopt = max(lwkopt, n) + } + impl.Dgeqp3(m, n, a, lda, iwork, tau, work, -1) + lwkopt = max(lwkopt, int(work[0])) + lwkopt = max(1, lwkopt) + if lwork == -1 { + work[0] = float64(lwkopt) + return 0, 0 + } + + switch { + case len(a) < (m-1)*lda+n: + panic(shortA) + case len(b) < (p-1)*ldb+n: + panic(shortB) + case wantu && len(u) < (m-1)*ldu+m: + panic(shortU) + case wantv && len(v) < (p-1)*ldv+p: + panic(shortV) + case wantq && len(q) < (n-1)*ldq+n: + panic(shortQ) + case len(tau) < n: + // tau check must come 
after lwkopt query since
+		// the Dggsvd3 call for lwkopt query may have
+		// lwork == -1, and tau is provided by work.
+		panic(shortTau)
+	}
+
+	const forward = true
+
+	// QR with column pivoting of B: B*P = V*[ S11 S12 ].
+	//                                       [  0   0  ]
+	for i := range iwork[:n] {
+		iwork[i] = 0
+	}
+	impl.Dgeqp3(p, n, b, ldb, iwork, tau, work, lwork)
+
+	// Update A := A*P.
+	impl.Dlapmt(forward, m, n, a, lda, iwork)
+
+	// Determine the effective rank of matrix B.
+	for i := 0; i < min(p, n); i++ {
+		if math.Abs(b[i*ldb+i]) > tolb {
+			l++
+		}
+	}
+
+	if wantv {
+		// Copy the details of V, and form V.
+		impl.Dlaset(blas.All, p, p, 0, 0, v, ldv)
+		if p > 1 {
+			impl.Dlacpy(blas.Lower, p-1, min(p, n), b[ldb:], ldb, v[ldv:], ldv)
+		}
+		impl.Dorg2r(p, p, min(p, n), v, ldv, tau[:min(p, n)], work)
+	}
+
+	// Clean up B.
+	for i := 1; i < l; i++ {
+		r := b[i*ldb : i*ldb+i]
+		for j := range r {
+			r[j] = 0
+		}
+	}
+	if p > l {
+		impl.Dlaset(blas.All, p-l, n, 0, 0, b[l*ldb:], ldb)
+	}
+
+	if wantq {
+		// Set Q = I and update Q := Q*P.
+		impl.Dlaset(blas.All, n, n, 0, 1, q, ldq)
+		impl.Dlapmt(forward, n, n, q, ldq, iwork)
+	}
+
+	if p >= l && n != l {
+		// RQ factorization of [ S11 S12 ]: [ S11 S12 ] = [ 0 S12 ]*Z.
+		impl.Dgerq2(l, n, b, ldb, tau, work)
+
+		// Update A := A*Zᵀ.
+		impl.Dormr2(blas.Right, blas.Trans, m, n, l, b, ldb, tau, a, lda, work)
+
+		if wantq {
+			// Update Q := Q*Zᵀ.
+			impl.Dormr2(blas.Right, blas.Trans, n, n, l, b, ldb, tau, q, ldq, work)
+		}
+
+		// Clean up B.
+		impl.Dlaset(blas.All, l, n-l, 0, 0, b, ldb)
+		for i := 1; i < l; i++ {
+			r := b[i*ldb+n-l : i*ldb+i+n-l]
+			for j := range r {
+				r[j] = 0
+			}
+		}
+	}
+
+	// Let              N-L     L
+	//            A = [ A11    A12 ] M,
+	//
+	// then the following does the complete QR decomposition of A11:
+	//
+	//	A11 = U*[ 0 T12 ]*P1ᵀ.
+	//	        [ 0  0  ]
+	for i := range iwork[:n-l] {
+		iwork[i] = 0
+	}
+	impl.Dgeqp3(m, n-l, a, lda, iwork[:n-l], tau, work, lwork)
+
+	// Determine the effective rank of A11.
+	for i := 0; i < min(m, n-l); i++ {
+		if math.Abs(a[i*lda+i]) > tola {
+			k++
+		}
+	}
+
+	// Update A12 := Uᵀ*A12, where A12 = A[0:m, n-l:n].
+	impl.Dorm2r(blas.Left, blas.Trans, m, l, min(m, n-l), a, lda, tau[:min(m, n-l)], a[n-l:], lda, work)
+
+	if wantu {
+		// Copy the details of U, and form U.
+		impl.Dlaset(blas.All, m, m, 0, 0, u, ldu)
+		if m > 1 {
+			impl.Dlacpy(blas.Lower, m-1, min(m, n-l), a[lda:], lda, u[ldu:], ldu)
+		}
+		k := min(m, n-l)
+		impl.Dorg2r(m, m, k, u, ldu, tau[:k], work)
+	}
+
+	if wantq {
+		// Update Q[0:n, 0:n-l] := Q[0:n, 0:n-l]*P1.
+		impl.Dlapmt(forward, n, n-l, q, ldq, iwork[:n-l])
+	}
+
+	// Clean up A: set the strictly lower triangular part of
+	// A[0:k, 0:k] = 0, and A[k:m, 0:n-l] = 0.
+	for i := 1; i < k; i++ {
+		r := a[i*lda : i*lda+i]
+		for j := range r {
+			r[j] = 0
+		}
+	}
+	if m > k {
+		impl.Dlaset(blas.All, m-k, n-l, 0, 0, a[k*lda:], lda)
+	}
+
+	if n-l > k {
+		// RQ factorization of [ T11 T12 ] = [ 0 T12 ]*Z1.
+		impl.Dgerq2(k, n-l, a, lda, tau, work)
+
+		if wantq {
+			// Update Q[0:n, 0:n-l] := Q[0:n, 0:n-l]*Z1ᵀ.
+			impl.Dorm2r(blas.Right, blas.Trans, n, n-l, k, a, lda, tau[:k], q, ldq, work)
+		}
+
+		// Clean up A.
+		impl.Dlaset(blas.All, k, n-l-k, 0, 0, a, lda)
+		for i := 1; i < k; i++ {
+			r := a[i*lda+n-k-l : i*lda+i+n-k-l]
+			for j := range r {
+				r[j] = 0
+			}
+		}
+	}
+
+	if m > k {
+		// QR factorization of A[k:m, n-l:n].
+		impl.Dgeqr2(m-k, l, a[k*lda+n-l:], lda, tau[:min(m-k, l)], work)
+		if wantu {
+			// Update U[:, k:m] := U[:, k:m]*U1.
+ impl.Dorm2r(blas.Right, blas.NoTrans, m, m-k, min(m-k, l), a[k*lda+n-l:], lda, tau[:min(m-k, l)], u[k:], ldu, work) + } + + // Clean up A. + for i := k + 1; i < m; i++ { + r := a[i*lda+n-l : i*lda+min(n-l+i-k, n)] + for j := range r { + r[j] = 0 + } + } + } + + work[0] = float64(lwkopt) + return k, l +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgtsv.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgtsv.go new file mode 100644 index 0000000000..944af1a607 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgtsv.go @@ -0,0 +1,101 @@ +// Copyright ©2020 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "math" + +// Dgtsv solves the equation +// +// A * X = B +// +// where A is an n×n tridiagonal matrix. It uses Gaussian elimination with +// partial pivoting. The equation Aᵀ * X = B may be solved by swapping the +// arguments for du and dl. +// +// On entry, dl, d and du contain the sub-diagonal, the diagonal and the +// super-diagonal, respectively, of A. On return, the first n-2 elements of dl, +// the first n-1 elements of du and the first n elements of d may be +// overwritten. +// +// On entry, b contains the n×nrhs right-hand side matrix B. On return, b will +// be overwritten. If ok is true, it will be overwritten by the solution matrix X. +// +// Dgtsv returns whether the solution X has been successfully computed. +func (impl Implementation) Dgtsv(n, nrhs int, dl, d, du []float64, b []float64, ldb int) (ok bool) { + switch { + case n < 0: + panic(nLT0) + case nrhs < 0: + panic(nrhsLT0) + case ldb < max(1, nrhs): + panic(badLdB) + } + + if n == 0 || nrhs == 0 { + return true + } + + switch { + case len(dl) < n-1: + panic(shortDL) + case len(d) < n: + panic(shortD) + case len(du) < n-1: + panic(shortDU) + case len(b) < (n-1)*ldb+nrhs: + panic(shortB) + } + + dl = dl[:n-1] + d = d[:n] + du = du[:n-1] + + for i := 0; i < n-1; i++ { + if math.Abs(d[i]) >= math.Abs(dl[i]) { + // No row interchange required. + if d[i] == 0 { + return false + } + fact := dl[i] / d[i] + d[i+1] -= fact * du[i] + for j := 0; j < nrhs; j++ { + b[(i+1)*ldb+j] -= fact * b[i*ldb+j] + } + dl[i] = 0 + } else { + // Interchange rows i and i+1. + fact := d[i] / dl[i] + d[i] = dl[i] + tmp := d[i+1] + d[i+1] = du[i] - fact*tmp + du[i] = tmp + if i+1 < n-1 { + dl[i] = du[i+1] + du[i+1] = -fact * dl[i] + } + for j := 0; j < nrhs; j++ { + tmp = b[i*ldb+j] + b[i*ldb+j] = b[(i+1)*ldb+j] + b[(i+1)*ldb+j] = tmp - fact*b[(i+1)*ldb+j] + } + } + } + if d[n-1] == 0 { + return false + } + + // Back solve with the matrix U from the factorization. + for j := 0; j < nrhs; j++ { + b[(n-1)*ldb+j] /= d[n-1] + if n > 1 { + b[(n-2)*ldb+j] = (b[(n-2)*ldb+j] - du[n-2]*b[(n-1)*ldb+j]) / d[n-2] + } + for i := n - 3; i >= 0; i-- { + b[i*ldb+j] = (b[i*ldb+j] - du[i]*b[(i+1)*ldb+j] - dl[i]*b[(i+2)*ldb+j]) / d[i] + } + } + + return true +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dhseqr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dhseqr.go new file mode 100644 index 0000000000..beccf132b7 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dhseqr.go @@ -0,0 +1,272 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/lapack" +) + +// Dhseqr computes the eigenvalues of an n×n Hessenberg matrix H and, +// optionally, the matrices T and Z from the Schur decomposition +// +// H = Z T Zᵀ, +// +// where T is an n×n upper quasi-triangular matrix (the Schur form), and Z is +// the n×n orthogonal matrix of Schur vectors. +// +// Optionally Z may be postmultiplied into an input orthogonal matrix Q so that +// this routine can give the Schur factorization of a matrix A which has been +// reduced to the Hessenberg form H by the orthogonal matrix Q: +// +// A = Q H Qᵀ = (QZ) T (QZ)ᵀ. +// +// If job == lapack.EigenvaluesOnly, only the eigenvalues will be computed. +// If job == lapack.EigenvaluesAndSchur, the eigenvalues and the Schur form T will +// be computed. +// For other values of job Dhseqr will panic. +// +// If compz == lapack.SchurNone, no Schur vectors will be computed and Z will not be +// referenced. +// If compz == lapack.SchurHess, on return Z will contain the matrix of Schur +// vectors of H. +// If compz == lapack.SchurOrig, on entry z is assumed to contain the orthogonal +// matrix Q that is the identity except for the submatrix +// Q[ilo:ihi+1,ilo:ihi+1]. On return z will be updated to the product Q*Z. +// +// ilo and ihi determine the block of H on which Dhseqr operates. It is assumed +// that H is already upper triangular in rows and columns [0:ilo] and [ihi+1:n], +// although it will be only checked that the block is isolated, that is, +// +// ilo == 0 or H[ilo,ilo-1] == 0, +// ihi == n-1 or H[ihi+1,ihi] == 0, +// +// and Dhseqr will panic otherwise. ilo and ihi are typically set by a previous +// call to Dgebal, otherwise they should be set to 0 and n-1, respectively. It +// must hold that +// +// 0 <= ilo <= ihi < n if n > 0, +// ilo == 0 and ihi == -1 if n == 0. +// +// wr and wi must have length n. +// +// work must have length at least lwork and lwork must be at least max(1,n) +// otherwise Dhseqr will panic. The minimum lwork delivers very good and +// sometimes optimal performance, although lwork as large as 11*n may be +// required. On return, work[0] will contain the optimal value of lwork. +// +// If lwork is -1, instead of performing Dhseqr, the function only estimates the +// optimal workspace size and stores it into work[0]. Neither h nor z are +// accessed. +// +// unconverged indicates whether Dhseqr computed all the eigenvalues. +// +// If unconverged == 0, all the eigenvalues have been computed and their real +// and imaginary parts will be stored on return in wr and wi, respectively. If +// two eigenvalues are computed as a complex conjugate pair, they are stored in +// consecutive elements of wr and wi, say the i-th and (i+1)th, with wi[i] > 0 +// and wi[i+1] < 0. +// +// If unconverged == 0 and job == lapack.EigenvaluesAndSchur, on return H will +// contain the upper quasi-triangular matrix T from the Schur decomposition (the +// Schur form). 2×2 diagonal blocks (corresponding to complex conjugate pairs of +// eigenvalues) will be returned in standard form, with +// +// H[i,i] == H[i+1,i+1], +// +// and +// +// H[i+1,i]*H[i,i+1] < 0. +// +// The eigenvalues will be stored in wr and wi in the same order as on the +// diagonal of the Schur form returned in H, with +// +// wr[i] = H[i,i], +// +// and, if H[i:i+2,i:i+2] is a 2×2 diagonal block, +// +// wi[i] = sqrt(-H[i+1,i]*H[i,i+1]), +// wi[i+1] = -wi[i]. 
+// +// If unconverged == 0 and job == lapack.EigenvaluesOnly, the contents of h +// on return is unspecified. +// +// If unconverged > 0, some eigenvalues have not converged, and the blocks +// [0:ilo] and [unconverged:n] of wr and wi will contain those eigenvalues which +// have been successfully computed. Failures are rare. +// +// If unconverged > 0 and job == lapack.EigenvaluesOnly, on return the +// remaining unconverged eigenvalues are the eigenvalues of the upper Hessenberg +// matrix H[ilo:unconverged,ilo:unconverged]. +// +// If unconverged > 0 and job == lapack.EigenvaluesAndSchur, then on +// return +// +// (initial H) U = U (final H), (*) +// +// where U is an orthogonal matrix. The final H is upper Hessenberg and +// H[unconverged:ihi+1,unconverged:ihi+1] is upper quasi-triangular. +// +// If unconverged > 0 and compz == lapack.SchurOrig, then on return +// +// (final Z) = (initial Z) U, +// +// where U is the orthogonal matrix in (*) regardless of the value of job. +// +// If unconverged > 0 and compz == lapack.SchurHess, then on return +// +// (final Z) = U, +// +// where U is the orthogonal matrix in (*) regardless of the value of job. +// +// References: +// +// [1] R. Byers. LAPACK 3.1 xHSEQR: Tuning and Implementation Notes on the +// Small Bulge Multi-Shift QR Algorithm with Aggressive Early Deflation. +// LAPACK Working Note 187 (2007) +// URL: http://www.netlib.org/lapack/lawnspdf/lawn187.pdf +// [2] K. Braman, R. Byers, R. Mathias. The Multishift QR Algorithm. Part I: +// Maintaining Well-Focused Shifts and Level 3 Performance. SIAM J. Matrix +// Anal. Appl. 23(4) (2002), pp. 929—947 +// URL: http://dx.doi.org/10.1137/S0895479801384573 +// [3] K. Braman, R. Byers, R. Mathias. The Multishift QR Algorithm. Part II: +// Aggressive Early Deflation. SIAM J. Matrix Anal. Appl. 23(4) (2002), pp. 948—973 +// URL: http://dx.doi.org/10.1137/S0895479801384585 +// +// Dhseqr is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dhseqr(job lapack.SchurJob, compz lapack.SchurComp, n, ilo, ihi int, h []float64, ldh int, wr, wi []float64, z []float64, ldz int, work []float64, lwork int) (unconverged int) { + wantt := job == lapack.EigenvaluesAndSchur + wantz := compz == lapack.SchurHess || compz == lapack.SchurOrig + + switch { + case job != lapack.EigenvaluesOnly && job != lapack.EigenvaluesAndSchur: + panic(badSchurJob) + case compz != lapack.SchurNone && compz != lapack.SchurHess && compz != lapack.SchurOrig: + panic(badSchurComp) + case n < 0: + panic(nLT0) + case ilo < 0 || max(0, n-1) < ilo: + panic(badIlo) + case ihi < min(ilo, n-1) || n <= ihi: + panic(badIhi) + case ldh < max(1, n): + panic(badLdH) + case ldz < 1, wantz && ldz < n: + panic(badLdZ) + case lwork < max(1, n) && lwork != -1: + panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + } + + // Quick return if possible. + if n == 0 { + work[0] = 1 + return 0 + } + + // Quick return in case of a workspace query. + if lwork == -1 { + impl.Dlaqr04(wantt, wantz, n, ilo, ihi, h, ldh, wr, wi, ilo, ihi, z, ldz, work, -1, 1) + work[0] = math.Max(float64(n), work[0]) + return 0 + } + + switch { + case len(h) < (n-1)*ldh+n: + panic(shortH) + case wantz && len(z) < (n-1)*ldz+n: + panic(shortZ) + case len(wr) < n: + panic(shortWr) + case len(wi) < n: + panic(shortWi) + } + + const ( + // Matrices of order ntiny or smaller must be processed by + // Dlahqr because of insufficient subdiagonal scratch space. + // This is a hard limit. 
+		ntiny = 15
+
+		// nl is the size of a local workspace to help small matrices
+		// through a rare Dlahqr failure. nl > ntiny is required and
+		// nl <= nmin = Ilaenv(ispec=12,...) is recommended (the default
+		// value of nmin is 75). Using nl = 49 allows up to six
+		// simultaneous shifts and a 16×16 deflation window.
+		nl = 49
+	)
+
+	// Copy eigenvalues isolated by Dgebal.
+	for i := 0; i < ilo; i++ {
+		wr[i] = h[i*ldh+i]
+		wi[i] = 0
+	}
+	for i := ihi + 1; i < n; i++ {
+		wr[i] = h[i*ldh+i]
+		wi[i] = 0
+	}
+
+	// Initialize Z to identity matrix if requested.
+	if compz == lapack.SchurHess {
+		impl.Dlaset(blas.All, n, n, 0, 1, z, ldz)
+	}
+
+	// Quick return if possible.
+	if ilo == ihi {
+		wr[ilo] = h[ilo*ldh+ilo]
+		wi[ilo] = 0
+		return 0
+	}
+
+	// Dlahqr/Dlaqr04 crossover point.
+	nmin := impl.Ilaenv(12, "DHSEQR", string(job)+string(compz), n, ilo, ihi, lwork)
+	nmin = max(ntiny, nmin)
+
+	if n > nmin {
+		// Dlaqr0 for big matrices.
+		unconverged = impl.Dlaqr04(wantt, wantz, n, ilo, ihi, h, ldh, wr[:ihi+1], wi[:ihi+1],
+			ilo, ihi, z, ldz, work, lwork, 1)
+	} else {
+		// Dlahqr for small matrices.
+		unconverged = impl.Dlahqr(wantt, wantz, n, ilo, ihi, h, ldh, wr[:ihi+1], wi[:ihi+1],
+			ilo, ihi, z, ldz)
+		if unconverged > 0 {
+			// A rare Dlahqr failure! Dlaqr04 sometimes succeeds
+			// when Dlahqr fails.
+			kbot := unconverged
+			if n >= nl {
+				// Larger matrices have enough subdiagonal
+				// scratch space to call Dlaqr04 directly.
+				unconverged = impl.Dlaqr04(wantt, wantz, n, ilo, kbot, h, ldh,
+					wr[:ihi+1], wi[:ihi+1], ilo, ihi, z, ldz, work, lwork, 1)
+			} else {
+				// Tiny matrices don't have enough subdiagonal
+				// scratch space to benefit from Dlaqr04. Hence,
+				// tiny matrices must be copied into a larger
+				// array before calling Dlaqr04.
+				var hl [nl * nl]float64
+				impl.Dlacpy(blas.All, n, n, h, ldh, hl[:], nl)
+				impl.Dlaset(blas.All, nl, nl-n, 0, 0, hl[n:], nl)
+				var workl [nl]float64
+				unconverged = impl.Dlaqr04(wantt, wantz, nl, ilo, kbot, hl[:], nl,
+					wr[:ihi+1], wi[:ihi+1], ilo, ihi, z, ldz, workl[:], nl, 1)
+				work[0] = workl[0]
+				if wantt || unconverged > 0 {
+					impl.Dlacpy(blas.All, n, n, hl[:], nl, h, ldh)
+				}
+			}
+		}
+	}
+	// Zero out under the first subdiagonal, if necessary.
+	if (wantt || unconverged > 0) && n > 2 {
+		impl.Dlaset(blas.Lower, n-2, n-2, 0, 0, h[2*ldh:], ldh)
+	}
+
+	work[0] = math.Max(float64(n), work[0])
+	return unconverged
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlabrd.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlabrd.go
new file mode 100644
index 0000000000..396242cc2d
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlabrd.go
@@ -0,0 +1,183 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dlabrd reduces the first nb rows and columns of a real general m×n matrix
+// A to upper or lower bidiagonal form by an orthogonal transformation
+//
+//	Qᵀ * A * P
+//
+// If m >= n, A is reduced to upper bidiagonal form and upon exit the elements
+// on and below the diagonal in the first nb columns represent the elementary
+// reflectors, and the elements above the diagonal in the first nb rows represent
+// the matrix P. If m < n, A is reduced to lower bidiagonal form and the elements
+// of P are instead stored above the diagonal.
+//
+// The reduction to bidiagonal form is stored in d and e, where d are the diagonal
+// elements, and e are the off-diagonal elements.
+//
+// The matrices Q and P are products of elementary reflectors
+//
+//	Q = H_0 * H_1 * ... * H_{nb-1}
+//	P = G_0 * G_1 * ... * G_{nb-1}
+//
+// where
+//
+//	H_i = I - tauQ[i] * v_i * v_iᵀ
+//	G_i = I - tauP[i] * u_i * u_iᵀ
+//
+// As an example, on exit the entries of A when m = 6, n = 5, and nb = 2
+//
+//	[ 1   1  u1  u1  u1]
+//	[v1   1   1  u2  u2]
+//	[v1  v2   a   a   a]
+//	[v1  v2   a   a   a]
+//	[v1  v2   a   a   a]
+//	[v1  v2   a   a   a]
+//
+// and when m = 5, n = 6, and nb = 2
+//
+//	[ 1  u1  u1  u1  u1  u1]
+//	[ 1   1  u2  u2  u2  u2]
+//	[v1   1   a   a   a   a]
+//	[v1  v2   a   a   a   a]
+//	[v1  v2   a   a   a   a]
+//
+// Dlabrd also returns the matrices X and Y which are used with U and V to
+// apply the transformation to the unreduced part of the matrix
+//
+//	A := A - V*Yᵀ - X*Uᵀ
+//
+// X is an m×nb matrix, Y is an n×nb matrix. d, e, tauP, and tauQ must all have
+// length at least nb. Dlabrd will panic if these size constraints are violated.
+//
+// Dlabrd is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlabrd(m, n, nb int, a []float64, lda int, d, e, tauQ, tauP, x []float64, ldx int, y []float64, ldy int) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case nb < 0:
+		panic(nbLT0)
+	case nb > n:
+		panic(nbGTN)
+	case nb > m:
+		panic(nbGTM)
+	case lda < max(1, n):
+		panic(badLdA)
+	case ldx < max(1, nb):
+		panic(badLdX)
+	case ldy < max(1, nb):
+		panic(badLdY)
+	}
+
+	if m == 0 || n == 0 || nb == 0 {
+		return
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(d) < nb:
+		panic(shortD)
+	case len(e) < nb:
+		panic(shortE)
+	case len(tauQ) < nb:
+		panic(shortTauQ)
+	case len(tauP) < nb:
+		panic(shortTauP)
+	case len(x) < (m-1)*ldx+nb:
+		panic(shortX)
+	case len(y) < (n-1)*ldy+nb:
+		panic(shortY)
+	}
+
+	bi := blas64.Implementation()
+
+	if m >= n {
+		// Reduce to upper bidiagonal form.
+		for i := 0; i < nb; i++ {
+			bi.Dgemv(blas.NoTrans, m-i, i, -1, a[i*lda:], lda, y[i*ldy:], 1, 1, a[i*lda+i:], lda)
+			bi.Dgemv(blas.NoTrans, m-i, i, -1, x[i*ldx:], ldx, a[i:], lda, 1, a[i*lda+i:], lda)
+
+			a[i*lda+i], tauQ[i] = impl.Dlarfg(m-i, a[i*lda+i], a[min(i+1, m-1)*lda+i:], lda)
+			d[i] = a[i*lda+i]
+			if i < n-1 {
+				// Compute Y[i+1:n, i].
+				a[i*lda+i] = 1
+				bi.Dgemv(blas.Trans, m-i, n-i-1, 1, a[i*lda+i+1:], lda, a[i*lda+i:], lda, 0, y[(i+1)*ldy+i:], ldy)
+				bi.Dgemv(blas.Trans, m-i, i, 1, a[i*lda:], lda, a[i*lda+i:], lda, 0, y[i:], ldy)
+				bi.Dgemv(blas.NoTrans, n-i-1, i, -1, y[(i+1)*ldy:], ldy, y[i:], ldy, 1, y[(i+1)*ldy+i:], ldy)
+				bi.Dgemv(blas.Trans, m-i, i, 1, x[i*ldx:], ldx, a[i*lda+i:], lda, 0, y[i:], ldy)
+				bi.Dgemv(blas.Trans, i, n-i-1, -1, a[i+1:], lda, y[i:], ldy, 1, y[(i+1)*ldy+i:], ldy)
+				bi.Dscal(n-i-1, tauQ[i], y[(i+1)*ldy+i:], ldy)
+
+				// Update A[i, i+1:n].
+				bi.Dgemv(blas.NoTrans, n-i-1, i+1, -1, y[(i+1)*ldy:], ldy, a[i*lda:], 1, 1, a[i*lda+i+1:], 1)
+				bi.Dgemv(blas.Trans, i, n-i-1, -1, a[i+1:], lda, x[i*ldx:], 1, 1, a[i*lda+i+1:], 1)
+
+				// Generate reflection P[i] to annihilate A[i, i+2:n].
+				a[i*lda+i+1], tauP[i] = impl.Dlarfg(n-i-1, a[i*lda+i+1], a[i*lda+min(i+2, n-1):], 1)
+				e[i] = a[i*lda+i+1]
+				a[i*lda+i+1] = 1
+
+				// Compute X[i+1:m, i].
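+				// (The Dgemv sequence below accumulates column i of X from A, Y, and the current X, and the final Dscal applies tauP[i].)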
+ bi.Dgemv(blas.NoTrans, m-i-1, n-i-1, 1, a[(i+1)*lda+i+1:], lda, a[i*lda+i+1:], 1, 0, x[(i+1)*ldx+i:], ldx) + bi.Dgemv(blas.Trans, n-i-1, i+1, 1, y[(i+1)*ldy:], ldy, a[i*lda+i+1:], 1, 0, x[i:], ldx) + bi.Dgemv(blas.NoTrans, m-i-1, i+1, -1, a[(i+1)*lda:], lda, x[i:], ldx, 1, x[(i+1)*ldx+i:], ldx) + bi.Dgemv(blas.NoTrans, i, n-i-1, 1, a[i+1:], lda, a[i*lda+i+1:], 1, 0, x[i:], ldx) + bi.Dgemv(blas.NoTrans, m-i-1, i, -1, x[(i+1)*ldx:], ldx, x[i:], ldx, 1, x[(i+1)*ldx+i:], ldx) + bi.Dscal(m-i-1, tauP[i], x[(i+1)*ldx+i:], ldx) + } + } + return + } + // Reduce to lower bidiagonal form. + for i := 0; i < nb; i++ { + // Update A[i,i:n] + bi.Dgemv(blas.NoTrans, n-i, i, -1, y[i*ldy:], ldy, a[i*lda:], 1, 1, a[i*lda+i:], 1) + bi.Dgemv(blas.Trans, i, n-i, -1, a[i:], lda, x[i*ldx:], 1, 1, a[i*lda+i:], 1) + + // Generate reflection P[i] to annihilate A[i, i+1:n] + a[i*lda+i], tauP[i] = impl.Dlarfg(n-i, a[i*lda+i], a[i*lda+min(i+1, n-1):], 1) + d[i] = a[i*lda+i] + if i < m-1 { + a[i*lda+i] = 1 + // Compute X[i+1:m, i]. + bi.Dgemv(blas.NoTrans, m-i-1, n-i, 1, a[(i+1)*lda+i:], lda, a[i*lda+i:], 1, 0, x[(i+1)*ldx+i:], ldx) + bi.Dgemv(blas.Trans, n-i, i, 1, y[i*ldy:], ldy, a[i*lda+i:], 1, 0, x[i:], ldx) + bi.Dgemv(blas.NoTrans, m-i-1, i, -1, a[(i+1)*lda:], lda, x[i:], ldx, 1, x[(i+1)*ldx+i:], ldx) + bi.Dgemv(blas.NoTrans, i, n-i, 1, a[i:], lda, a[i*lda+i:], 1, 0, x[i:], ldx) + bi.Dgemv(blas.NoTrans, m-i-1, i, -1, x[(i+1)*ldx:], ldx, x[i:], ldx, 1, x[(i+1)*ldx+i:], ldx) + bi.Dscal(m-i-1, tauP[i], x[(i+1)*ldx+i:], ldx) + + // Update A[i+1:m, i]. + bi.Dgemv(blas.NoTrans, m-i-1, i, -1, a[(i+1)*lda:], lda, y[i*ldy:], 1, 1, a[(i+1)*lda+i:], lda) + bi.Dgemv(blas.NoTrans, m-i-1, i+1, -1, x[(i+1)*ldx:], ldx, a[i:], lda, 1, a[(i+1)*lda+i:], lda) + + // Generate reflection Q[i] to annihilate A[i+2:m, i]. + a[(i+1)*lda+i], tauQ[i] = impl.Dlarfg(m-i-1, a[(i+1)*lda+i], a[min(i+2, m-1)*lda+i:], lda) + e[i] = a[(i+1)*lda+i] + a[(i+1)*lda+i] = 1 + + // Compute Y[i+1:n, i]. + bi.Dgemv(blas.Trans, m-i-1, n-i-1, 1, a[(i+1)*lda+i+1:], lda, a[(i+1)*lda+i:], lda, 0, y[(i+1)*ldy+i:], ldy) + bi.Dgemv(blas.Trans, m-i-1, i, 1, a[(i+1)*lda:], lda, a[(i+1)*lda+i:], lda, 0, y[i:], ldy) + bi.Dgemv(blas.NoTrans, n-i-1, i, -1, y[(i+1)*ldy:], ldy, y[i:], ldy, 1, y[(i+1)*ldy+i:], ldy) + bi.Dgemv(blas.Trans, m-i-1, i+1, 1, x[(i+1)*ldx:], ldx, a[(i+1)*lda+i:], lda, 0, y[i:], ldy) + bi.Dgemv(blas.Trans, i+1, n-i-1, -1, a[i+1:], lda, y[i:], ldy, 1, y[(i+1)*ldy+i:], ldy) + bi.Dscal(n-i-1, tauQ[i], y[(i+1)*ldy+i:], ldy) + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlacn2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlacn2.go new file mode 100644 index 0000000000..cd6cf719d5 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlacn2.go @@ -0,0 +1,136 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas/blas64" +) + +// Dlacn2 estimates the 1-norm of an n×n matrix A using sequential updates with +// matrix-vector products provided externally. +// +// Dlacn2 is called sequentially and it returns the value of est and kase to be +// used on the next call. +// On the initial call, kase must be 0. +// In between calls, x must be overwritten by +// +// A * X if kase was returned as 1, +// Aᵀ * X if kase was returned as 2, +// +// and all other parameters must not be changed. 
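+//
+// A typical reverse-communication loop looks like this sketch, where v, x and
+// isgn have length n and mulVec (a hypothetical caller-supplied helper, named
+// here only for illustration) overwrites x as requested:
+//
+//	var isave [3]int
+//	var est float64
+//	kase := 0
+//	for {
+//		est, kase = impl.Dlacn2(n, v, x, isgn, est, kase, &isave)
+//		if kase == 0 {
+//			break
+//		}
+//		mulVec(kase, x) // kase == 1: x = A*x;  kase == 2: x = Aᵀ*x.
+//	}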
+// On the final return, kase is returned as 0, v contains A*W where W is a +// vector, and est = norm(V)/norm(W) is a lower bound for 1-norm of A. +// +// v, x, and isgn must all have length n and n must be at least 1, otherwise +// Dlacn2 will panic. isave is used for temporary storage. +// +// Dlacn2 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlacn2(n int, v, x []float64, isgn []int, est float64, kase int, isave *[3]int) (float64, int) { + switch { + case n < 1: + panic(nLT1) + case len(v) < n: + panic(shortV) + case len(x) < n: + panic(shortX) + case len(isgn) < n: + panic(shortIsgn) + case isave[0] < 0 || 5 < isave[0]: + panic(badIsave) + case isave[0] == 0 && kase != 0: + panic(badIsave) + } + + const itmax = 5 + bi := blas64.Implementation() + + if kase == 0 { + for i := 0; i < n; i++ { + x[i] = 1 / float64(n) + } + kase = 1 + isave[0] = 1 + return est, kase + } + switch isave[0] { + case 1: + if n == 1 { + v[0] = x[0] + est = math.Abs(v[0]) + kase = 0 + return est, kase + } + est = bi.Dasum(n, x, 1) + for i := 0; i < n; i++ { + x[i] = math.Copysign(1, x[i]) + isgn[i] = int(x[i]) + } + kase = 2 + isave[0] = 2 + return est, kase + case 2: + isave[1] = bi.Idamax(n, x, 1) + isave[2] = 2 + for i := 0; i < n; i++ { + x[i] = 0 + } + x[isave[1]] = 1 + kase = 1 + isave[0] = 3 + return est, kase + case 3: + bi.Dcopy(n, x, 1, v, 1) + estold := est + est = bi.Dasum(n, v, 1) + sameSigns := true + for i := 0; i < n; i++ { + if int(math.Copysign(1, x[i])) != isgn[i] { + sameSigns = false + break + } + } + if !sameSigns && est > estold { + for i := 0; i < n; i++ { + x[i] = math.Copysign(1, x[i]) + isgn[i] = int(x[i]) + } + kase = 2 + isave[0] = 4 + return est, kase + } + case 4: + jlast := isave[1] + isave[1] = bi.Idamax(n, x, 1) + if x[jlast] != math.Abs(x[isave[1]]) && isave[2] < itmax { + isave[2] += 1 + for i := 0; i < n; i++ { + x[i] = 0 + } + x[isave[1]] = 1 + kase = 1 + isave[0] = 3 + return est, kase + } + case 5: + tmp := 2 * (bi.Dasum(n, x, 1)) / float64(3*n) + if tmp > est { + bi.Dcopy(n, x, 1, v, 1) + est = tmp + } + kase = 0 + return est, kase + } + // Iteration complete. Final stage + altsgn := 1.0 + for i := 0; i < n; i++ { + x[i] = altsgn * (1 + float64(i)/float64(n-1)) + altsgn *= -1 + } + kase = 1 + isave[0] = 5 + return est, kase +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlacpy.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlacpy.go new file mode 100644 index 0000000000..793bb8c7ca --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlacpy.go @@ -0,0 +1,59 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "gonum.org/v1/gonum/blas" + +// Dlacpy copies the elements of A specified by uplo into B. Uplo can specify +// a triangular portion with blas.Upper or blas.Lower, or can specify all of the +// elements with blas.All. +// +// Dlacpy is an internal routine. It is exported for testing purposes. 
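+//
+// For example, with uplo == blas.Upper and m = n = 2, only the elements
+// A[0,0], A[0,1] and A[1,1] are copied to B; B[1,0] is left untouched.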
+func (impl Implementation) Dlacpy(uplo blas.Uplo, m, n int, a []float64, lda int, b []float64, ldb int) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower && uplo != blas.All:
+		panic(badUplo)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case ldb < max(1, n):
+		panic(badLdB)
+	}
+
+	if m == 0 || n == 0 {
+		return
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(b) < (m-1)*ldb+n:
+		panic(shortB)
+	}
+
+	switch uplo {
+	case blas.Upper:
+		for i := 0; i < m; i++ {
+			for j := i; j < n; j++ {
+				b[i*ldb+j] = a[i*lda+j]
+			}
+		}
+	case blas.Lower:
+		for i := 0; i < m; i++ {
+			for j := 0; j < min(i+1, n); j++ {
+				b[i*ldb+j] = a[i*lda+j]
+			}
+		}
+	case blas.All:
+		for i := 0; i < m; i++ {
+			for j := 0; j < n; j++ {
+				b[i*ldb+j] = a[i*lda+j]
+			}
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlae2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlae2.go
new file mode 100644
index 0000000000..2eda3a18fe
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlae2.go
@@ -0,0 +1,51 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlae2 computes the eigenvalues of a 2×2 symmetric matrix
+//
+//	[a b]
+//	[b c]
+//
+// and returns the eigenvalue with the larger absolute value as rt1 and the
+// smaller as rt2.
+//
+// Dlae2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlae2(a, b, c float64) (rt1, rt2 float64) {
+	sm := a + c
+	df := a - c
+	adf := math.Abs(df)
+	tb := b + b
+	ab := math.Abs(tb)
+	acmx := c
+	acmn := a
+	if math.Abs(a) > math.Abs(c) {
+		acmx = a
+		acmn = c
+	}
+	var rt float64
+	if adf > ab {
+		rt = adf * math.Sqrt(1+(ab/adf)*(ab/adf))
+	} else if adf < ab {
+		rt = ab * math.Sqrt(1+(adf/ab)*(adf/ab))
+	} else {
+		rt = ab * math.Sqrt2
+	}
+	if sm < 0 {
+		rt1 = 0.5 * (sm - rt)
+		rt2 = (acmx/rt1)*acmn - (b/rt1)*b
+		return rt1, rt2
+	}
+	if sm > 0 {
+		rt1 = 0.5 * (sm + rt)
+		rt2 = (acmx/rt1)*acmn - (b/rt1)*b
+		return rt1, rt2
+	}
+	rt1 = 0.5 * rt
+	rt2 = -0.5 * rt
+	return rt1, rt2
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaev2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaev2.go
new file mode 100644
index 0000000000..56923f51d3
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaev2.go
@@ -0,0 +1,85 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlaev2 computes the eigendecomposition of a symmetric 2×2 matrix.
+// The matrix is given by
+//
+//	[a b]
+//	[b c]
+//
+// Dlaev2 returns rt1 and rt2, the eigenvalues of the matrix where |rt1| > |rt2|,
+// and [cs1, sn1] which is the unit right eigenvector for rt1.
+//
+//	[ cs1 sn1] [a b] [cs1 -sn1] = [rt1   0]
+//	[-sn1 cs1] [b c] [sn1  cs1]   [  0 rt2]
+//
+// Dlaev2 is an internal routine. It is exported for testing purposes.
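+//
+// For example, for the matrix
+//
+//	[2 1]
+//	[1 2]
+//
+// Dlaev2 returns rt1 = 3, rt2 = 1 and cs1 = sn1 = 1/√2, corresponding to the
+// normalized eigenvector [1, 1]/√2 of the eigenvalue 3.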
+func (impl Implementation) Dlaev2(a, b, c float64) (rt1, rt2, cs1, sn1 float64) {
+	sm := a + c
+	df := a - c
+	adf := math.Abs(df)
+	tb := b + b
+	ab := math.Abs(tb)
+	acmx := c
+	acmn := a
+	if math.Abs(a) > math.Abs(c) {
+		acmx = a
+		acmn = c
+	}
+	var rt float64
+	if adf > ab {
+		rt = adf * math.Sqrt(1+(ab/adf)*(ab/adf))
+	} else if adf < ab {
+		rt = ab * math.Sqrt(1+(adf/ab)*(adf/ab))
+	} else {
+		rt = ab * math.Sqrt(2)
+	}
+	var sgn1 float64
+	if sm < 0 {
+		rt1 = 0.5 * (sm - rt)
+		sgn1 = -1
+		rt2 = (acmx/rt1)*acmn - (b/rt1)*b
+	} else if sm > 0 {
+		rt1 = 0.5 * (sm + rt)
+		sgn1 = 1
+		rt2 = (acmx/rt1)*acmn - (b/rt1)*b
+	} else {
+		rt1 = 0.5 * rt
+		rt2 = -0.5 * rt
+		sgn1 = 1
+	}
+	var cs, sgn2 float64
+	if df >= 0 {
+		cs = df + rt
+		sgn2 = 1
+	} else {
+		cs = df - rt
+		sgn2 = -1
+	}
+	acs := math.Abs(cs)
+	if acs > ab {
+		ct := -tb / cs
+		sn1 = 1 / math.Sqrt(1+ct*ct)
+		cs1 = ct * sn1
+	} else {
+		if ab == 0 {
+			cs1 = 1
+			sn1 = 0
+		} else {
+			tn := -cs / tb
+			cs1 = 1 / math.Sqrt(1+tn*tn)
+			sn1 = tn * cs1
+		}
+	}
+	if sgn1 == sgn2 {
+		tn := cs1
+		cs1 = -sn1
+		sn1 = tn
+	}
+	return rt1, rt2, cs1, sn1
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaexc.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaexc.go
new file mode 100644
index 0000000000..2b79bd8ae7
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaexc.go
@@ -0,0 +1,269 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlaexc swaps two adjacent diagonal blocks of order 1 or 2 in an n×n upper
+// quasi-triangular matrix T by an orthogonal similarity transformation.
+//
+// T must be in Schur canonical form, that is, block upper triangular with 1×1
+// and 2×2 diagonal blocks; each 2×2 diagonal block has its diagonal elements
+// equal and its off-diagonal elements of opposite sign. On return, T will
+// contain the updated matrix again in Schur canonical form.
+//
+// If wantq is true, the transformation is accumulated in the n×n matrix Q,
+// otherwise Q is not referenced.
+//
+// j1 is the index of the first row of the first block. n1 and n2 are the order
+// of the first and second block, respectively.
+//
+// work must have length at least n, otherwise Dlaexc will panic.
+//
+// If ok is false, the transformed matrix T would be too far from Schur form.
+// The blocks are not swapped, and T and Q are not modified.
+//
+// If n1 and n2 are both equal to 1, Dlaexc will always return true.
+//
+// Dlaexc is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlaexc(wantq bool, n int, t []float64, ldt int, q []float64, ldq int, j1, n1, n2 int, work []float64) (ok bool) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case ldt < max(1, n):
+		panic(badLdT)
+	case wantq && ldq < max(1, n):
+		panic(badLdQ)
+	case j1 < 0 || n <= j1:
+		panic(badJ1)
+	case len(work) < n:
+		panic(shortWork)
+	case n1 < 0 || 2 < n1:
+		panic(badN1)
+	case n2 < 0 || 2 < n2:
+		panic(badN2)
+	}
+
+	if n == 0 || n1 == 0 || n2 == 0 {
+		return true
+	}
+
+	switch {
+	case len(t) < (n-1)*ldt+n:
+		panic(shortT)
+	case wantq && len(q) < (n-1)*ldq+n:
+		panic(shortQ)
+	}
+
+	if j1+n1 >= n {
+		// TODO(vladimir-ch): Reference LAPACK does this check whether
+		// the start of the second block is in the matrix T.
It returns + // true if it is not and moreover it does not check whether the + // whole second block fits into T. This does not feel + // satisfactory. The only caller of Dlaexc is Dtrexc, so if the + // caller makes sure that this does not happen, we could be + // stricter here. + return true + } + + j2 := j1 + 1 + j3 := j1 + 2 + + bi := blas64.Implementation() + + if n1 == 1 && n2 == 1 { + // Swap two 1×1 blocks. + t11 := t[j1*ldt+j1] + t22 := t[j2*ldt+j2] + + // Determine the transformation to perform the interchange. + cs, sn, _ := impl.Dlartg(t[j1*ldt+j2], t22-t11) + + // Apply transformation to the matrix T. + if n-j3 > 0 { + bi.Drot(n-j3, t[j1*ldt+j3:], 1, t[j2*ldt+j3:], 1, cs, sn) + } + if j1 > 0 { + bi.Drot(j1, t[j1:], ldt, t[j2:], ldt, cs, sn) + } + + t[j1*ldt+j1] = t22 + t[j2*ldt+j2] = t11 + + if wantq { + // Accumulate transformation in the matrix Q. + bi.Drot(n, q[j1:], ldq, q[j2:], ldq, cs, sn) + } + + return true + } + + // Swapping involves at least one 2×2 block. + // + // Copy the diagonal block of order n1+n2 to the local array d and + // compute its norm. + nd := n1 + n2 + var d [16]float64 + const ldd = 4 + impl.Dlacpy(blas.All, nd, nd, t[j1*ldt+j1:], ldt, d[:], ldd) + dnorm := impl.Dlange(lapack.MaxAbs, nd, nd, d[:], ldd, work) + + // Compute machine-dependent threshold for test for accepting swap. + eps := dlamchP + thresh := math.Max(10*eps*dnorm, dlamchS/eps) + + // Solve T11*X - X*T22 = scale*T12 for X. + var x [4]float64 + const ldx = 2 + scale, _, _ := impl.Dlasy2(false, false, -1, n1, n2, d[:], ldd, d[n1*ldd+n1:], ldd, d[n1:], ldd, x[:], ldx) + + // Swap the adjacent diagonal blocks. + switch { + case n1 == 1 && n2 == 2: + // Generate elementary reflector H so that + // ( scale, X11, X12 ) H = ( 0, 0, * ) + u := [3]float64{scale, x[0], 1} + _, tau := impl.Dlarfg(3, x[1], u[:2], 1) + t11 := t[j1*ldt+j1] + + // Perform swap provisionally on diagonal block in d. + impl.Dlarfx(blas.Left, 3, 3, u[:], tau, d[:], ldd, work) + impl.Dlarfx(blas.Right, 3, 3, u[:], tau, d[:], ldd, work) + + // Test whether to reject swap. + if math.Max(math.Abs(d[2*ldd]), math.Max(math.Abs(d[2*ldd+1]), math.Abs(d[2*ldd+2]-t11))) > thresh { + return false + } + + // Accept swap: apply transformation to the entire matrix T. + impl.Dlarfx(blas.Left, 3, n-j1, u[:], tau, t[j1*ldt+j1:], ldt, work) + impl.Dlarfx(blas.Right, j2+1, 3, u[:], tau, t[j1:], ldt, work) + + t[j3*ldt+j1] = 0 + t[j3*ldt+j2] = 0 + t[j3*ldt+j3] = t11 + + if wantq { + // Accumulate transformation in the matrix Q. + impl.Dlarfx(blas.Right, n, 3, u[:], tau, q[j1:], ldq, work) + } + + case n1 == 2 && n2 == 1: + // Generate elementary reflector H so that: + // H ( -X11 ) = ( * ) + // ( -X21 ) = ( 0 ) + // ( scale ) = ( 0 ) + u := [3]float64{1, -x[ldx], scale} + _, tau := impl.Dlarfg(3, -x[0], u[1:], 1) + t33 := t[j3*ldt+j3] + + // Perform swap provisionally on diagonal block in D. + impl.Dlarfx(blas.Left, 3, 3, u[:], tau, d[:], ldd, work) + impl.Dlarfx(blas.Right, 3, 3, u[:], tau, d[:], ldd, work) + + // Test whether to reject swap. + if math.Max(math.Abs(d[ldd]), math.Max(math.Abs(d[2*ldd]), math.Abs(d[0]-t33))) > thresh { + return false + } + + // Accept swap: apply transformation to the entire matrix T. + impl.Dlarfx(blas.Right, j3+1, 3, u[:], tau, t[j1:], ldt, work) + impl.Dlarfx(blas.Left, 3, n-j1-1, u[:], tau, t[j1*ldt+j2:], ldt, work) + + t[j1*ldt+j1] = t33 + t[j2*ldt+j1] = 0 + t[j3*ldt+j1] = 0 + + if wantq { + // Accumulate transformation in the matrix Q. 
+ impl.Dlarfx(blas.Right, n, 3, u[:], tau, q[j1:], ldq, work) + } + + default: // n1 == 2 && n2 == 2 + // Generate elementary reflectors H_1 and H_2 so that: + // H_2 H_1 ( -X11 -X12 ) = ( * * ) + // ( -X21 -X22 ) ( 0 * ) + // ( scale 0 ) ( 0 0 ) + // ( 0 scale ) ( 0 0 ) + u1 := [3]float64{1, -x[ldx], scale} + _, tau1 := impl.Dlarfg(3, -x[0], u1[1:], 1) + + temp := -tau1 * (x[1] + u1[1]*x[ldx+1]) + u2 := [3]float64{1, -temp * u1[2], scale} + _, tau2 := impl.Dlarfg(3, -temp*u1[1]-x[ldx+1], u2[1:], 1) + + // Perform swap provisionally on diagonal block in D. + impl.Dlarfx(blas.Left, 3, 4, u1[:], tau1, d[:], ldd, work) + impl.Dlarfx(blas.Right, 4, 3, u1[:], tau1, d[:], ldd, work) + impl.Dlarfx(blas.Left, 3, 4, u2[:], tau2, d[ldd:], ldd, work) + impl.Dlarfx(blas.Right, 4, 3, u2[:], tau2, d[1:], ldd, work) + + // Test whether to reject swap. + m1 := math.Max(math.Abs(d[2*ldd]), math.Abs(d[2*ldd+1])) + m2 := math.Max(math.Abs(d[3*ldd]), math.Abs(d[3*ldd+1])) + if math.Max(m1, m2) > thresh { + return false + } + + // Accept swap: apply transformation to the entire matrix T. + j4 := j1 + 3 + impl.Dlarfx(blas.Left, 3, n-j1, u1[:], tau1, t[j1*ldt+j1:], ldt, work) + impl.Dlarfx(blas.Right, j4+1, 3, u1[:], tau1, t[j1:], ldt, work) + impl.Dlarfx(blas.Left, 3, n-j1, u2[:], tau2, t[j2*ldt+j1:], ldt, work) + impl.Dlarfx(blas.Right, j4+1, 3, u2[:], tau2, t[j2:], ldt, work) + + t[j3*ldt+j1] = 0 + t[j3*ldt+j2] = 0 + t[j4*ldt+j1] = 0 + t[j4*ldt+j2] = 0 + + if wantq { + // Accumulate transformation in the matrix Q. + impl.Dlarfx(blas.Right, n, 3, u1[:], tau1, q[j1:], ldq, work) + impl.Dlarfx(blas.Right, n, 3, u2[:], tau2, q[j2:], ldq, work) + } + } + + if n2 == 2 { + // Standardize new 2×2 block T11. + a, b := t[j1*ldt+j1], t[j1*ldt+j2] + c, d := t[j2*ldt+j1], t[j2*ldt+j2] + var cs, sn float64 + t[j1*ldt+j1], t[j1*ldt+j2], t[j2*ldt+j1], t[j2*ldt+j2], _, _, _, _, cs, sn = impl.Dlanv2(a, b, c, d) + if n-j1-2 > 0 { + bi.Drot(n-j1-2, t[j1*ldt+j1+2:], 1, t[j2*ldt+j1+2:], 1, cs, sn) + } + if j1 > 0 { + bi.Drot(j1, t[j1:], ldt, t[j2:], ldt, cs, sn) + } + if wantq { + bi.Drot(n, q[j1:], ldq, q[j2:], ldq, cs, sn) + } + } + if n1 == 2 { + // Standardize new 2×2 block T22. + j3 := j1 + n2 + j4 := j3 + 1 + a, b := t[j3*ldt+j3], t[j3*ldt+j4] + c, d := t[j4*ldt+j3], t[j4*ldt+j4] + var cs, sn float64 + t[j3*ldt+j3], t[j3*ldt+j4], t[j4*ldt+j3], t[j4*ldt+j4], _, _, _, _, cs, sn = impl.Dlanv2(a, b, c, d) + if n-j3-2 > 0 { + bi.Drot(n-j3-2, t[j3*ldt+j3+2:], 1, t[j4*ldt+j3+2:], 1, cs, sn) + } + bi.Drot(j3, t[j3:], ldt, t[j4:], ldt, cs, sn) + if wantq { + bi.Drot(n, q[j3:], ldq, q[j4:], ldq, cs, sn) + } + } + + return true +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlag2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlag2.go new file mode 100644 index 0000000000..cd644b65bb --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlag2.go @@ -0,0 +1,237 @@ +// Copyright ©2021 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "math" + +// Dlag2 computes the eigenvalues of a 2×2 generalized eigenvalue problem +// +// A - w*B +// +// where B is an upper triangular matrix. +// +// Dlag2 uses scaling as necessary to avoid over-/underflow. Scaling results in +// a modified eigenvalue problem +// +// s*A - w*B +// +// where s is a non-negative scaling factor chosen so that w, w*B, and s*A do +// not overflow and, if possible, do not underflow, either. 
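+//
+// In exact arithmetic the computed quantities satisfy, for nonzero scale1,
+//
+//	det(scale1*A - (wr1 + wi*i)*B) = 0,
+//
+// and, when the eigenvalues are real, det(scale2*A - wr2*B) = 0 as well.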
+// +// scale1 and scale2 are used to avoid over-/underflow in the eigenvalue +// equation which defines the first and second eigenvalue respectively. Note +// that scale1 and scale2 may be zero or less than the underflow threshold if +// the corresponding exact eigenvalue is sufficiently large. +// +// If the eigenvalues are real, then: +// - wi is zero, +// - the eigenvalues are wr1/scale1 and wr2/scale2. +// +// If the eigenvalues are complex, then: +// - wi is non-negative, +// - the eigenvalues are (wr1 ± wi*i)/scale1, +// - wr1 = wr2, +// - scale1 = scale2. +// +// Dlag2 assumes that the one-norm of A and B is less than 1/dlamchS. Entries of +// A less than sqrt(dlamchS)*norm(A) are subject to being treated as zero. The +// diagonals of B should be at least sqrt(dlamchS) times the largest element of +// B (in absolute value); if a diagonal is smaller than that, then +// ±sqrt(dlamchS) will be used instead of that diagonal. +// +// Dlag2 is an internal routine. It is exported for testing purposes. +func (Implementation) Dlag2(a []float64, lda int, b []float64, ldb int) (scale1, scale2, wr1, wr2, wi float64) { + switch { + case lda < 2: + panic(badLdA) + case ldb < 2: + panic(badLdB) + case len(a) < lda+2: + panic(shortA) + case len(b) < ldb+2: + panic(shortB) + } + + const ( + safmin = dlamchS + safmax = 1 / safmin + fuzzy1 = 1 + 1e-5 + ) + rtmin := math.Sqrt(safmin) + rtmax := 1 / rtmin + + // Scale A. + anorm := math.Max(math.Abs(a[0])+math.Abs(a[lda]), + math.Abs(a[1])+math.Abs(a[lda+1])) + anorm = math.Max(anorm, safmin) + ascale := 1 / anorm + a11 := ascale * a[0] + a21 := ascale * a[lda] + a12 := ascale * a[1] + a22 := ascale * a[lda+1] + + // Perturb B if necessary to insure non-singularity. + b11 := b[0] + b12 := b[1] + b22 := b[ldb+1] + bmin := rtmin * math.Max(math.Max(math.Abs(b11), math.Abs(b12)), + math.Max(math.Abs(b22), rtmin)) + if math.Abs(b11) < bmin { + b11 = math.Copysign(bmin, b11) + } + if math.Abs(b22) < bmin { + b22 = math.Copysign(bmin, b22) + } + + // Scale B. + bnorm := math.Max(math.Max(math.Abs(b11), math.Abs(b12)+math.Abs(b22)), safmin) + bsize := math.Max(math.Abs(b11), math.Abs(b22)) + bscale := 1 / bsize + b11 *= bscale + b12 *= bscale + b22 *= bscale + + // Compute larger eigenvalue by method described by C. van Loan. + var ( + as12, abi22 float64 + pp, qq, shift float64 + ) + binv11 := 1 / b11 + binv22 := 1 / b22 + s1 := a11 * binv11 + s2 := a22 * binv22 + // AS is A shifted by -shift*B. + if math.Abs(s1) <= math.Abs(s2) { + shift = s1 + as12 = a12 - shift*b12 + as22 := a22 - shift*b22 + ss := a21 * (binv11 * binv22) + abi22 = as22*binv22 - ss*b12 + pp = 0.5 * abi22 + qq = ss * as12 + } else { + shift = s2 + as12 = a12 - shift*b12 + as11 := a11 - shift*b11 + ss := a21 * (binv11 * binv22) + abi22 = -ss * b12 + pp = 0.5 * (as11*binv11 + abi22) + qq = ss * as12 + } + var discr, r float64 + if math.Abs(pp*rtmin) >= 1 { + tmp := rtmin * pp + discr = tmp*tmp + qq*safmin + r = math.Sqrt(math.Abs(discr)) * rtmax + } else { + pp2 := pp * pp + if pp2+math.Abs(qq) <= safmin { + tmp := rtmax * pp + discr = tmp*tmp + qq*safmax + r = math.Sqrt(math.Abs(discr)) * rtmin + } else { + discr = pp2 + qq + r = math.Sqrt(math.Abs(discr)) + } + } + + // TODO(vladimir-ch): Is the following comment from the reference needed in + // a Go implementation? + // + // Note: the test of r in the following `if` is to cover the case when discr + // is small and negative and is flushed to zero during the calculation of r. 
+ // On machines which have a consistent flush-to-zero threshold and handle + // numbers above that threshold correctly, it would not be necessary. + if discr >= 0 || r == 0 { + sum := pp + math.Copysign(r, pp) + diff := pp - math.Copysign(r, pp) + wbig := shift + sum + + // Compute smaller eigenvalue. + wsmall := shift + diff + if 0.5*math.Abs(wbig) > math.Max(math.Abs(wsmall), safmin) { + wdet := (a11*a22 - a12*a21) * (binv11 * binv22) + wsmall = wdet / wbig + } + // Choose (real) eigenvalue closest to 2,2 element of A*B^{-1} for wr1. + if pp > abi22 { + wr1 = math.Min(wbig, wsmall) + wr2 = math.Max(wbig, wsmall) + } else { + wr1 = math.Max(wbig, wsmall) + wr2 = math.Min(wbig, wsmall) + } + } else { + // Complex eigenvalues. + wr1 = shift + pp + wr2 = wr1 + wi = r + } + + // Further scaling to avoid underflow and overflow in computing + // scale1 and overflow in computing w*B. + // + // This scale factor (wscale) is bounded from above using c1 and c2, + // and from below using c3 and c4: + // - c1 implements the condition s*A must never overflow. + // - c2 implements the condition w*B must never overflow. + // - c3, with c2, implement the condition that s*A - w*B must never overflow. + // - c4 implements the condition s should not underflow. + // - c5 implements the condition max(s,|w|) should be at least 2. + c1 := bsize * (safmin * math.Max(1, ascale)) + c2 := safmin * math.Max(1, bnorm) + c3 := bsize * safmin + c4 := 1.0 + c5 := 1.0 + if ascale <= 1 || bsize <= 1 { + c5 = math.Min(1, ascale*bsize) + if ascale <= 1 && bsize <= 1 { + c4 = math.Min(1, (ascale/safmin)*bsize) + } + } + + // Scale first eigenvalue. + wabs := math.Abs(wr1) + math.Abs(wi) + wsize := math.Max(math.Max(safmin, c1), math.Max(fuzzy1*(wabs*c2+c3), + math.Min(c4, 0.5*math.Max(wabs, c5)))) + maxABsize := math.Max(ascale, bsize) + minABsize := math.Min(ascale, bsize) + if wsize != 1 { + wscale := 1 / wsize + if wsize > 1 { + scale1 = (maxABsize * wscale) * minABsize + } else { + scale1 = (minABsize * wscale) * maxABsize + } + wr1 *= wscale + if wi != 0 { + wi *= wscale + wr2 = wr1 + scale2 = scale1 + } + } else { + scale1 = ascale * bsize + scale2 = scale1 + } + + // Scale second eigenvalue if real. + if wi == 0 { + wsize = math.Max(math.Max(safmin, c1), math.Max(fuzzy1*(math.Abs(wr2)*c2+c3), + math.Min(c4, 0.5*math.Max(math.Abs(wr2), c5)))) + if wsize != 1 { + wscale := 1 / wsize + if wsize > 1 { + scale2 = (maxABsize * wscale) * minABsize + } else { + scale2 = (minABsize * wscale) * maxABsize + } + wr2 *= wscale + } else { + scale2 = ascale * bsize + } + } + + return scale1, scale2, wr1, wr2, wi +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlags2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlags2.go new file mode 100644 index 0000000000..7bd4f21970 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlags2.go @@ -0,0 +1,186 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "math" + +// Dlags2 computes 2-by-2 orthogonal matrices U, V and Q with the +// triangles of A and B specified by upper. +// +// If upper is true +// +// Uᵀ*A*Q = Uᵀ*[ a1 a2 ]*Q = [ x 0 ] +// [ 0 a3 ] [ x x ] +// +// and +// +// Vᵀ*B*Q = Vᵀ*[ b1 b2 ]*Q = [ x 0 ] +// [ 0 b3 ] [ x x ] +// +// otherwise +// +// Uᵀ*A*Q = Uᵀ*[ a1 0 ]*Q = [ x x ] +// [ a2 a3 ] [ 0 x ] +// +// and +// +// Vᵀ*B*Q = Vᵀ*[ b1 0 ]*Q = [ x x ] +// [ b2 b3 ] [ 0 x ]. 
+// +// The rows of the transformed A and B are parallel, where +// +// U = [ csu snu ], V = [ csv snv ], Q = [ csq snq ] +// [ -snu csu ] [ -snv csv ] [ -snq csq ] +// +// Dlags2 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlags2(upper bool, a1, a2, a3, b1, b2, b3 float64) (csu, snu, csv, snv, csq, snq float64) { + if upper { + // Input matrices A and B are upper triangular matrices. + // + // Form matrix C = A*adj(B) = [ a b ] + // [ 0 d ] + a := a1 * b3 + d := a3 * b1 + b := a2*b1 - a1*b2 + + // The SVD of real 2-by-2 triangular C. + // + // [ csl -snl ]*[ a b ]*[ csr snr ] = [ r 0 ] + // [ snl csl ] [ 0 d ] [ -snr csr ] [ 0 t ] + _, _, snr, csr, snl, csl := impl.Dlasv2(a, b, d) + + if math.Abs(csl) >= math.Abs(snl) || math.Abs(csr) >= math.Abs(snr) { + // Compute the [0, 0] and [0, 1] elements of Uᵀ*A and Vᵀ*B, + // and [0, 1] element of |U|ᵀ*|A| and |V|ᵀ*|B|. + + ua11r := csl * a1 + ua12 := csl*a2 + snl*a3 + + vb11r := csr * b1 + vb12 := csr*b2 + snr*b3 + + aua12 := math.Abs(csl)*math.Abs(a2) + math.Abs(snl)*math.Abs(a3) + avb12 := math.Abs(csr)*math.Abs(b2) + math.Abs(snr)*math.Abs(b3) + + // Zero [0, 1] elements of Uᵀ*A and Vᵀ*B. + if math.Abs(ua11r)+math.Abs(ua12) != 0 { + if aua12/(math.Abs(ua11r)+math.Abs(ua12)) <= avb12/(math.Abs(vb11r)+math.Abs(vb12)) { + csq, snq, _ = impl.Dlartg(-ua11r, ua12) + } else { + csq, snq, _ = impl.Dlartg(-vb11r, vb12) + } + } else { + csq, snq, _ = impl.Dlartg(-vb11r, vb12) + } + + csu = csl + snu = -snl + csv = csr + snv = -snr + } else { + // Compute the [1, 0] and [1, 1] elements of Uᵀ*A and Vᵀ*B, + // and [1, 1] element of |U|ᵀ*|A| and |V|ᵀ*|B|. + + ua21 := -snl * a1 + ua22 := -snl*a2 + csl*a3 + + vb21 := -snr * b1 + vb22 := -snr*b2 + csr*b3 + + aua22 := math.Abs(snl)*math.Abs(a2) + math.Abs(csl)*math.Abs(a3) + avb22 := math.Abs(snr)*math.Abs(b2) + math.Abs(csr)*math.Abs(b3) + + // Zero [1, 1] elements of Uᵀ*A and Vᵀ*B, and then swap. + if math.Abs(ua21)+math.Abs(ua22) != 0 { + if aua22/(math.Abs(ua21)+math.Abs(ua22)) <= avb22/(math.Abs(vb21)+math.Abs(vb22)) { + csq, snq, _ = impl.Dlartg(-ua21, ua22) + } else { + csq, snq, _ = impl.Dlartg(-vb21, vb22) + } + } else { + csq, snq, _ = impl.Dlartg(-vb21, vb22) + } + + csu = snl + snu = csl + csv = snr + snv = csr + } + } else { + // Input matrices A and B are lower triangular matrices + // + // Form matrix C = A*adj(B) = [ a 0 ] + // [ c d ] + a := a1 * b3 + d := a3 * b1 + c := a2*b3 - a3*b2 + + // The SVD of real 2-by-2 triangular C + // + // [ csl -snl ]*[ a 0 ]*[ csr snr ] = [ r 0 ] + // [ snl csl ] [ c d ] [ -snr csr ] [ 0 t ] + _, _, snr, csr, snl, csl := impl.Dlasv2(a, c, d) + + if math.Abs(csr) >= math.Abs(snr) || math.Abs(csl) >= math.Abs(snl) { + // Compute the [1, 0] and [1, 1] elements of Uᵀ*A and Vᵀ*B, + // and [1, 0] element of |U|ᵀ*|A| and |V|ᵀ*|B|. + + ua21 := -snr*a1 + csr*a2 + ua22r := csr * a3 + + vb21 := -snl*b1 + csl*b2 + vb22r := csl * b3 + + aua21 := math.Abs(snr)*math.Abs(a1) + math.Abs(csr)*math.Abs(a2) + avb21 := math.Abs(snl)*math.Abs(b1) + math.Abs(csl)*math.Abs(b2) + + // Zero [1, 0] elements of Uᵀ*A and Vᵀ*B. 
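+			// The rotation is computed from the row of Uᵀ*A or
+			// Vᵀ*B whose [1, 0] entry carries the smaller relative
+			// rounding-error bound (aua21 and avb21 above), which
+			// gives the more accurate annihilation.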
+ if (math.Abs(ua21) + math.Abs(ua22r)) != 0 { + if aua21/(math.Abs(ua21)+math.Abs(ua22r)) <= avb21/(math.Abs(vb21)+math.Abs(vb22r)) { + csq, snq, _ = impl.Dlartg(ua22r, ua21) + } else { + csq, snq, _ = impl.Dlartg(vb22r, vb21) + } + } else { + csq, snq, _ = impl.Dlartg(vb22r, vb21) + } + + csu = csr + snu = -snr + csv = csl + snv = -snl + } else { + // Compute the [0, 0] and [0, 1] elements of Uᵀ *A and Vᵀ *B, + // and [0, 0] element of |U|ᵀ*|A| and |V|ᵀ*|B|. + + ua11 := csr*a1 + snr*a2 + ua12 := snr * a3 + + vb11 := csl*b1 + snl*b2 + vb12 := snl * b3 + + aua11 := math.Abs(csr)*math.Abs(a1) + math.Abs(snr)*math.Abs(a2) + avb11 := math.Abs(csl)*math.Abs(b1) + math.Abs(snl)*math.Abs(b2) + + // Zero [0, 0] elements of Uᵀ*A and Vᵀ*B, and then swap. + if (math.Abs(ua11) + math.Abs(ua12)) != 0 { + if aua11/(math.Abs(ua11)+math.Abs(ua12)) <= avb11/(math.Abs(vb11)+math.Abs(vb12)) { + csq, snq, _ = impl.Dlartg(ua12, ua11) + } else { + csq, snq, _ = impl.Dlartg(vb12, vb11) + } + } else { + csq, snq, _ = impl.Dlartg(vb12, vb11) + } + + csu = snr + snu = csr + csv = snl + snv = csl + } + } + + return csu, snu, csv, snv, csq, snq +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlagtm.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlagtm.go new file mode 100644 index 0000000000..fc8c8eb403 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlagtm.go @@ -0,0 +1,111 @@ +// Copyright ©2020 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "gonum.org/v1/gonum/blas" + +// Dlagtm performs one of the matrix-matrix operations +// +// C = alpha * A * B + beta * C if trans == blas.NoTrans +// C = alpha * Aᵀ * B + beta * C if trans == blas.Trans or blas.ConjTrans +// +// where A is an m×m tridiagonal matrix represented by its diagonals dl, d, du, +// B and C are m×n dense matrices, and alpha and beta are scalars. 
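+//
+// For m = 3, the tridiagonal matrix A has the layout
+//
+//	[ d[0]  du[0]  0     ]
+//	[ dl[0] d[1]   du[1] ]
+//	[ 0     dl[1]  d[2]  ]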
+func (impl Implementation) Dlagtm(trans blas.Transpose, m, n int, alpha float64, dl, d, du []float64, b []float64, ldb int, beta float64, c []float64, ldc int) { + switch { + case trans != blas.NoTrans && trans != blas.Trans && trans != blas.ConjTrans: + panic(badTrans) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case ldb < max(1, n): + panic(badLdB) + case ldc < max(1, n): + panic(badLdC) + } + + if m == 0 || n == 0 { + return + } + + switch { + case len(dl) < m-1: + panic(shortDL) + case len(d) < m: + panic(shortD) + case len(du) < m-1: + panic(shortDU) + case len(b) < (m-1)*ldb+n: + panic(shortB) + case len(c) < (m-1)*ldc+n: + panic(shortC) + } + + if beta != 1 { + if beta == 0 { + for i := 0; i < m; i++ { + ci := c[i*ldc : i*ldc+n] + for j := range ci { + ci[j] = 0 + } + } + } else { + for i := 0; i < m; i++ { + ci := c[i*ldc : i*ldc+n] + for j := range ci { + ci[j] *= beta + } + } + } + } + + if alpha == 0 { + return + } + + if m == 1 { + if alpha == 1 { + for j := 0; j < n; j++ { + c[j] += d[0] * b[j] + } + } else { + for j := 0; j < n; j++ { + c[j] += alpha * d[0] * b[j] + } + } + return + } + + if trans != blas.NoTrans { + dl, du = du, dl + } + + if alpha == 1 { + for j := 0; j < n; j++ { + c[j] += d[0]*b[j] + du[0]*b[ldb+j] + } + for i := 1; i < m-1; i++ { + for j := 0; j < n; j++ { + c[i*ldc+j] += dl[i-1]*b[(i-1)*ldb+j] + d[i]*b[i*ldb+j] + du[i]*b[(i+1)*ldb+j] + } + } + for j := 0; j < n; j++ { + c[(m-1)*ldc+j] += dl[m-2]*b[(m-2)*ldb+j] + d[m-1]*b[(m-1)*ldb+j] + } + } else { + for j := 0; j < n; j++ { + c[j] += alpha * (d[0]*b[j] + du[0]*b[ldb+j]) + } + for i := 1; i < m-1; i++ { + for j := 0; j < n; j++ { + c[i*ldc+j] += alpha * (dl[i-1]*b[(i-1)*ldb+j] + d[i]*b[i*ldb+j] + du[i]*b[(i+1)*ldb+j]) + } + } + for j := 0; j < n; j++ { + c[(m-1)*ldc+j] += alpha * (dl[m-2]*b[(m-2)*ldb+j] + d[m-1]*b[(m-1)*ldb+j]) + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlahqr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlahqr.go new file mode 100644 index 0000000000..6f1202547e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlahqr.go @@ -0,0 +1,449 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas/blas64" +) + +// Dlahqr computes the eigenvalues and Schur factorization of a block of an n×n +// upper Hessenberg matrix H, using the double-shift/single-shift QR algorithm. +// +// h and ldh represent the matrix H. Dlahqr works primarily with the Hessenberg +// submatrix H[ilo:ihi+1,ilo:ihi+1], but applies transformations to all of H if +// wantt is true. It is assumed that H[ihi+1:n,ihi+1:n] is already upper +// quasi-triangular, although this is not checked. +// +// It must hold that +// +// 0 <= ilo <= max(0,ihi), and ihi < n, +// +// and that +// +// H[ilo,ilo-1] == 0, if ilo > 0, +// +// otherwise Dlahqr will panic. +// +// If unconverged is zero on return, wr[ilo:ihi+1] and wi[ilo:ihi+1] will contain +// respectively the real and imaginary parts of the computed eigenvalues ilo +// to ihi. If two eigenvalues are computed as a complex conjugate pair, they are +// stored in consecutive elements of wr and wi, say the i-th and (i+1)th, with +// wi[i] > 0 and wi[i+1] < 0. 
If wantt is true, the eigenvalues are stored in +// the same order as on the diagonal of the Schur form returned in H, with +// wr[i] = H[i,i], and, if H[i:i+2,i:i+2] is a 2×2 diagonal block, +// wi[i] = sqrt(abs(H[i+1,i]*H[i,i+1])) and wi[i+1] = -wi[i]. +// +// wr and wi must have length ihi+1. +// +// z and ldz represent an n×n matrix Z. If wantz is true, the transformations +// will be applied to the submatrix Z[iloz:ihiz+1,ilo:ihi+1] and it must hold that +// +// 0 <= iloz <= ilo, and ihi <= ihiz < n. +// +// If wantz is false, z is not referenced. +// +// unconverged indicates whether Dlahqr computed all the eigenvalues ilo to ihi +// in a total of 30 iterations per eigenvalue. +// +// If unconverged is zero, all the eigenvalues ilo to ihi have been computed and +// will be stored on return in wr[ilo:ihi+1] and wi[ilo:ihi+1]. +// +// If unconverged is zero and wantt is true, H[ilo:ihi+1,ilo:ihi+1] will be +// overwritten on return by upper quasi-triangular full Schur form with any +// 2×2 diagonal blocks in standard form. +// +// If unconverged is zero and if wantt is false, the contents of h on return is +// unspecified. +// +// If unconverged is positive, some eigenvalues have not converged, and +// wr[unconverged:ihi+1] and wi[unconverged:ihi+1] contain those eigenvalues +// which have been successfully computed. +// +// If unconverged is positive and wantt is true, then on return +// +// (initial H)*U = U*(final H), (*) +// +// where U is an orthogonal matrix. The final H is upper Hessenberg and +// H[unconverged:ihi+1,unconverged:ihi+1] is upper quasi-triangular. +// +// If unconverged is positive and wantt is false, on return the remaining +// unconverged eigenvalues are the eigenvalues of the upper Hessenberg matrix +// H[ilo:unconverged,ilo:unconverged]. +// +// If unconverged is positive and wantz is true, then on return +// +// (final Z) = (initial Z)*U, +// +// where U is the orthogonal matrix in (*) regardless of the value of wantt. +// +// Dlahqr is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlahqr(wantt, wantz bool, n, ilo, ihi int, h []float64, ldh int, wr, wi []float64, iloz, ihiz int, z []float64, ldz int) (unconverged int) { + switch { + case n < 0: + panic(nLT0) + case ilo < 0, max(0, ihi) < ilo: + panic(badIlo) + case ihi >= n: + panic(badIhi) + case ldh < max(1, n): + panic(badLdH) + case wantz && (iloz < 0 || ilo < iloz): + panic(badIloz) + case wantz && (ihiz < ihi || n <= ihiz): + panic(badIhiz) + case ldz < 1, wantz && ldz < n: + panic(badLdZ) + } + + // Quick return if possible. + if n == 0 { + return 0 + } + + switch { + case len(h) < (n-1)*ldh+n: + panic(shortH) + case len(wr) != ihi+1: + panic(shortWr) + case len(wi) != ihi+1: + panic(shortWi) + case wantz && len(z) < (n-1)*ldz+n: + panic(shortZ) + case ilo > 0 && h[ilo*ldh+ilo-1] != 0: + panic(notIsolated) + } + + if ilo == ihi { + wr[ilo] = h[ilo*ldh+ilo] + wi[ilo] = 0 + return 0 + } + + // Clear out the trash. + for j := ilo; j < ihi-2; j++ { + h[(j+2)*ldh+j] = 0 + h[(j+3)*ldh+j] = 0 + } + if ilo <= ihi-2 { + h[ihi*ldh+ihi-2] = 0 + } + + nh := ihi - ilo + 1 + nz := ihiz - iloz + 1 + + // Set machine-dependent constants for the stopping criterion. + ulp := dlamchP + smlnum := float64(nh) / ulp * dlamchS + + // i1 and i2 are the indices of the first row and last column of H to + // which transformations must be applied. If eigenvalues only are being + // computed, i1 and i2 are set inside the main loop. 
+ var i1, i2 int + if wantt { + i1 = 0 + i2 = n - 1 + } + + itmax := 30 * max(10, nh) // Total number of QR iterations allowed. + + // kdefl counts the number of iterations since a deflation. + kdefl := 0 + + // The main loop begins here. i is the loop index and decreases from ihi + // to ilo in steps of 1 or 2. Each iteration of the loop works with the + // active submatrix in rows and columns l to i. Eigenvalues i+1 to ihi + // have already converged. Either l = ilo or H[l,l-1] is negligible so + // that the matrix splits. + bi := blas64.Implementation() + i := ihi + for i >= ilo { + l := ilo + + // Perform QR iterations on rows and columns ilo to i until a + // submatrix of order 1 or 2 splits off at the bottom because a + // subdiagonal element has become negligible. + converged := false + for its := 0; its <= itmax; its++ { + // Look for a single small subdiagonal element. + var k int + for k = i; k > l; k-- { + if math.Abs(h[k*ldh+k-1]) <= smlnum { + break + } + tst := math.Abs(h[(k-1)*ldh+k-1]) + math.Abs(h[k*ldh+k]) + if tst == 0 { + if k-2 >= ilo { + tst += math.Abs(h[(k-1)*ldh+k-2]) + } + if k+1 <= ihi { + tst += math.Abs(h[(k+1)*ldh+k]) + } + } + // The following is a conservative small + // subdiagonal deflation criterion due to Ahues + // & Tisseur (LAWN 122, 1997). It has better + // mathematical foundation and improves accuracy + // in some cases. + if math.Abs(h[k*ldh+k-1]) <= ulp*tst { + ab := math.Max(math.Abs(h[k*ldh+k-1]), math.Abs(h[(k-1)*ldh+k])) + ba := math.Min(math.Abs(h[k*ldh+k-1]), math.Abs(h[(k-1)*ldh+k])) + aa := math.Max(math.Abs(h[k*ldh+k]), math.Abs(h[(k-1)*ldh+k-1]-h[k*ldh+k])) + bb := math.Min(math.Abs(h[k*ldh+k]), math.Abs(h[(k-1)*ldh+k-1]-h[k*ldh+k])) + s := aa + ab + if ab/s*ba <= math.Max(smlnum, aa/s*bb*ulp) { + break + } + } + } + l = k + if l > ilo { + // H[l,l-1] is negligible. + h[l*ldh+l-1] = 0 + } + if l >= i-1 { + // Break the loop because a submatrix of order 1 + // or 2 has split off. + converged = true + break + } + kdefl++ + + // Now the active submatrix is in rows and columns l to + // i. If eigenvalues only are being computed, only the + // active submatrix need be transformed. + if !wantt { + i1 = l + i2 = i + } + + const ( + dat1 = 0.75 + dat2 = -0.4375 + kexsh = 10 + ) + var h11, h21, h12, h22 float64 + switch { + case kdefl%(2*kexsh) == 0: // Exceptional shift. + s := math.Abs(h[i*ldh+i-1]) + math.Abs(h[(i-1)*ldh+i-2]) + h11 = dat1*s + h[i*ldh+i] + h12 = dat2 * s + h21 = s + h22 = h11 + case kdefl%kexsh == 0: // Exceptional shift. + s := math.Abs(h[(l+1)*ldh+l]) + math.Abs(h[(l+2)*ldh+l+1]) + h11 = dat1*s + h[l*ldh+l] + h12 = dat2 * s + h21 = s + h22 = h11 + default: // Prepare to use Francis' double shift (i.e., + // 2nd degree generalized Rayleigh quotient). + h11 = h[(i-1)*ldh+i-1] + h21 = h[i*ldh+i-1] + h12 = h[(i-1)*ldh+i] + h22 = h[i*ldh+i] + } + s := math.Abs(h11) + math.Abs(h12) + math.Abs(h21) + math.Abs(h22) + var ( + rt1r, rt1i float64 + rt2r, rt2i float64 + ) + if s != 0 { + h11 /= s + h21 /= s + h12 /= s + h22 /= s + tr := (h11 + h22) / 2 + det := (h11-tr)*(h22-tr) - h12*h21 + rtdisc := math.Sqrt(math.Abs(det)) + if det >= 0 { + // Complex conjugate shifts. + rt1r = tr * s + rt2r = rt1r + rt1i = rtdisc * s + rt2i = -rt1i + } else { + // Real shifts (use only one of them). + rt1r = tr + rtdisc + rt2r = tr - rtdisc + if math.Abs(rt1r-h22) <= math.Abs(rt2r-h22) { + rt1r *= s + rt2r = rt1r + } else { + rt2r *= s + rt1r = rt2r + } + rt1i = 0 + rt2i = 0 + } + } + + // Look for two consecutive small subdiagonal elements. 
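+			// The loop below determines the row m, closest to the
+			// bottom of the active submatrix, at which starting the
+			// double-shift QR sweep would make H[m,m-1] negligible,
+			// and builds in v the first column of the shifted
+			// polynomial that seeds the bulge.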
+ var m int + var v [3]float64 + for m = i - 2; m >= l; m-- { + // Determine the effect of starting the + // double-shift QR iteration at row m, and see + // if this would make H[m,m-1] negligible. The + // following uses scaling to avoid overflows and + // most underflows. + h21s := h[(m+1)*ldh+m] + s := math.Abs(h[m*ldh+m]-rt2r) + math.Abs(rt2i) + math.Abs(h21s) + h21s /= s + v[0] = h21s*h[m*ldh+m+1] + (h[m*ldh+m]-rt1r)*((h[m*ldh+m]-rt2r)/s) - rt2i/s*rt1i + v[1] = h21s * (h[m*ldh+m] + h[(m+1)*ldh+m+1] - rt1r - rt2r) + v[2] = h21s * h[(m+2)*ldh+m+1] + s = math.Abs(v[0]) + math.Abs(v[1]) + math.Abs(v[2]) + v[0] /= s + v[1] /= s + v[2] /= s + if m == l { + break + } + dsum := math.Abs(h[(m-1)*ldh+m-1]) + math.Abs(h[m*ldh+m]) + math.Abs(h[(m+1)*ldh+m+1]) + if math.Abs(h[m*ldh+m-1])*(math.Abs(v[1])+math.Abs(v[2])) <= ulp*math.Abs(v[0])*dsum { + break + } + } + + // Double-shift QR step. + for k := m; k < i; k++ { + // The first iteration of this loop determines a + // reflection G from the vector V and applies it + // from left and right to H, thus creating a + // non-zero bulge below the subdiagonal. + // + // Each subsequent iteration determines a + // reflection G to restore the Hessenberg form + // in the (k-1)th column, and thus chases the + // bulge one step toward the bottom of the + // active submatrix. nr is the order of G. + + nr := min(3, i-k+1) + if k > m { + bi.Dcopy(nr, h[k*ldh+k-1:], ldh, v[:], 1) + } + var t0 float64 + v[0], t0 = impl.Dlarfg(nr, v[0], v[1:], 1) + if k > m { + h[k*ldh+k-1] = v[0] + h[(k+1)*ldh+k-1] = 0 + if k < i-1 { + h[(k+2)*ldh+k-1] = 0 + } + } else if m > l { + // Use the following instead of H[k,k-1] = -H[k,k-1] + // to avoid a bug when v[1] and v[2] underflow. + h[k*ldh+k-1] *= 1 - t0 + } + t1 := t0 * v[1] + if nr == 3 { + t2 := t0 * v[2] + + // Apply G from the left to transform + // the rows of the matrix in columns k + // to i2. + for j := k; j <= i2; j++ { + sum := h[k*ldh+j] + v[1]*h[(k+1)*ldh+j] + v[2]*h[(k+2)*ldh+j] + h[k*ldh+j] -= sum * t0 + h[(k+1)*ldh+j] -= sum * t1 + h[(k+2)*ldh+j] -= sum * t2 + } + + // Apply G from the right to transform + // the columns of the matrix in rows i1 + // to min(k+3,i). + for j := i1; j <= min(k+3, i); j++ { + sum := h[j*ldh+k] + v[1]*h[j*ldh+k+1] + v[2]*h[j*ldh+k+2] + h[j*ldh+k] -= sum * t0 + h[j*ldh+k+1] -= sum * t1 + h[j*ldh+k+2] -= sum * t2 + } + + if wantz { + // Accumulate transformations in the matrix Z. + for j := iloz; j <= ihiz; j++ { + sum := z[j*ldz+k] + v[1]*z[j*ldz+k+1] + v[2]*z[j*ldz+k+2] + z[j*ldz+k] -= sum * t0 + z[j*ldz+k+1] -= sum * t1 + z[j*ldz+k+2] -= sum * t2 + } + } + } else if nr == 2 { + // Apply G from the left to transform + // the rows of the matrix in columns k + // to i2. + for j := k; j <= i2; j++ { + sum := h[k*ldh+j] + v[1]*h[(k+1)*ldh+j] + h[k*ldh+j] -= sum * t0 + h[(k+1)*ldh+j] -= sum * t1 + } + + // Apply G from the right to transform + // the columns of the matrix in rows i1 + // to min(k+3,i). + for j := i1; j <= i; j++ { + sum := h[j*ldh+k] + v[1]*h[j*ldh+k+1] + h[j*ldh+k] -= sum * t0 + h[j*ldh+k+1] -= sum * t1 + } + + if wantz { + // Accumulate transformations in the matrix Z. + for j := iloz; j <= ihiz; j++ { + sum := z[j*ldz+k] + v[1]*z[j*ldz+k+1] + z[j*ldz+k] -= sum * t0 + z[j*ldz+k+1] -= sum * t1 + } + } + } + } + } + + if !converged { + // The QR iteration finished without splitting off a + // submatrix of order 1 or 2. + return i + 1 + } + + if l == i { + // H[i,i-1] is negligible: one eigenvalue has converged. 
+ wr[i] = h[i*ldh+i] + wi[i] = 0 + } else if l == i-1 { + // H[i-1,i-2] is negligible: a pair of eigenvalues have converged. + + // Transform the 2×2 submatrix to standard Schur form, + // and compute and store the eigenvalues. + var cs, sn float64 + a, b := h[(i-1)*ldh+i-1], h[(i-1)*ldh+i] + c, d := h[i*ldh+i-1], h[i*ldh+i] + a, b, c, d, wr[i-1], wi[i-1], wr[i], wi[i], cs, sn = impl.Dlanv2(a, b, c, d) + h[(i-1)*ldh+i-1], h[(i-1)*ldh+i] = a, b + h[i*ldh+i-1], h[i*ldh+i] = c, d + + if wantt { + // Apply the transformation to the rest of H. + if i2 > i { + bi.Drot(i2-i, h[(i-1)*ldh+i+1:], 1, h[i*ldh+i+1:], 1, cs, sn) + } + bi.Drot(i-i1-1, h[i1*ldh+i-1:], ldh, h[i1*ldh+i:], ldh, cs, sn) + } + + if wantz { + // Apply the transformation to Z. + bi.Drot(nz, z[iloz*ldz+i-1:], ldz, z[iloz*ldz+i:], ldz, cs, sn) + } + } + + // Reset deflation counter. + kdefl = 0 + + // Return to start of the main loop with new value of i. + i = l - 1 + } + return 0 +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlahr2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlahr2.go new file mode 100644 index 0000000000..5921473342 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlahr2.go @@ -0,0 +1,202 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dlahr2 reduces the first nb columns of a real general n×(n-k+1) matrix A so +// that elements below the k-th subdiagonal are zero. The reduction is performed +// by an orthogonal similarity transformation Qᵀ * A * Q. Dlahr2 returns the +// matrices V and T which determine Q as a block reflector I - V*T*Vᵀ, and +// also the matrix Y = A * V * T. +// +// The matrix Q is represented as a product of nb elementary reflectors +// +// Q = H_0 * H_1 * ... * H_{nb-1}. +// +// Each H_i has the form +// +// H_i = I - tau[i] * v * vᵀ, +// +// where v is a real vector with v[0:i+k-1] = 0 and v[i+k-1] = 1. v[i+k:n] is +// stored on exit in A[i+k+1:n,i]. +// +// The elements of the vectors v together form the (n-k+1)×nb matrix +// V which is needed, with T and Y, to apply the transformation to the +// unreduced part of the matrix, using an update of the form +// +// A = (I - V*T*Vᵀ) * (A - Y*Vᵀ). +// +// On entry, a contains the n×(n-k+1) general matrix A. On return, the elements +// on and above the k-th subdiagonal in the first nb columns are overwritten +// with the corresponding elements of the reduced matrix; the elements below the +// k-th subdiagonal, with the slice tau, represent the matrix Q as a product of +// elementary reflectors. The other columns of A are unchanged. +// +// The contents of A on exit are illustrated by the following example +// with n = 7, k = 3 and nb = 2: +// +// [ a a a a a ] +// [ a a a a a ] +// [ a a a a a ] +// [ h h a a a ] +// [ v0 h a a a ] +// [ v0 v1 a a a ] +// [ v0 v1 a a a ] +// +// where a denotes an element of the original matrix A, h denotes a +// modified element of the upper Hessenberg matrix H, and vi denotes an +// element of the vector defining H_i. +// +// k is the offset for the reduction. Elements below the k-th subdiagonal in the +// first nb columns are reduced to zero. +// +// nb is the number of columns to be reduced. +// +// On entry, a represents the n×(n-k+1) matrix A. 
On return, the elements on and +// above the k-th subdiagonal in the first nb columns are overwritten with the +// corresponding elements of the reduced matrix. The elements below the k-th +// subdiagonal, with the slice tau, represent the matrix Q as a product of +// elementary reflectors. The other columns of A are unchanged. +// +// tau will contain the scalar factors of the elementary reflectors. It must +// have length at least nb. +// +// t and ldt represent the nb×nb upper triangular matrix T, and y and ldy +// represent the n×nb matrix Y. +// +// Dlahr2 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlahr2(n, k, nb int, a []float64, lda int, tau, t []float64, ldt int, y []float64, ldy int) { + switch { + case n < 0: + panic(nLT0) + case k < 0: + panic(kLT0) + case nb < 0: + panic(nbLT0) + case nb > n: + panic(nbGTN) + case lda < max(1, n-k+1): + panic(badLdA) + case ldt < max(1, nb): + panic(badLdT) + case ldy < max(1, nb): + panic(badLdY) + } + + // Quick return if possible. + if n < 0 { + return + } + + switch { + case len(a) < (n-1)*lda+n-k+1: + panic(shortA) + case len(tau) < nb: + panic(shortTau) + case len(t) < (nb-1)*ldt+nb: + panic(shortT) + case len(y) < (n-1)*ldy+nb: + panic(shortY) + } + + // Quick return if possible. + if n == 1 { + return + } + + bi := blas64.Implementation() + var ei float64 + for i := 0; i < nb; i++ { + if i > 0 { + // Update A[k:n,i]. + + // Update i-th column of A - Y * Vᵀ. + bi.Dgemv(blas.NoTrans, n-k, i, + -1, y[k*ldy:], ldy, + a[(k+i-1)*lda:], 1, + 1, a[k*lda+i:], lda) + + // Apply I - V * Tᵀ * Vᵀ to this column (call it b) + // from the left, using the last column of T as + // workspace. + // Let V = [ V1 ] and b = [ b1 ] (first i rows) + // [ V2 ] [ b2 ] + // where V1 is unit lower triangular. + // + // w := V1ᵀ * b1. + bi.Dcopy(i, a[k*lda+i:], lda, t[nb-1:], ldt) + bi.Dtrmv(blas.Lower, blas.Trans, blas.Unit, i, + a[k*lda:], lda, t[nb-1:], ldt) + + // w := w + V2ᵀ * b2. + bi.Dgemv(blas.Trans, n-k-i, i, + 1, a[(k+i)*lda:], lda, + a[(k+i)*lda+i:], lda, + 1, t[nb-1:], ldt) + + // w := Tᵀ * w. + bi.Dtrmv(blas.Upper, blas.Trans, blas.NonUnit, i, + t, ldt, t[nb-1:], ldt) + + // b2 := b2 - V2*w. + bi.Dgemv(blas.NoTrans, n-k-i, i, + -1, a[(k+i)*lda:], lda, + t[nb-1:], ldt, + 1, a[(k+i)*lda+i:], lda) + + // b1 := b1 - V1*w. + bi.Dtrmv(blas.Lower, blas.NoTrans, blas.Unit, i, + a[k*lda:], lda, t[nb-1:], ldt) + bi.Daxpy(i, -1, t[nb-1:], ldt, a[k*lda+i:], lda) + + a[(k+i-1)*lda+i-1] = ei + } + + // Generate the elementary reflector H_i to annihilate + // A[k+i+1:n,i]. + ei, tau[i] = impl.Dlarfg(n-k-i, a[(k+i)*lda+i], a[min(k+i+1, n-1)*lda+i:], lda) + a[(k+i)*lda+i] = 1 + + // Compute Y[k:n,i]. + bi.Dgemv(blas.NoTrans, n-k, n-k-i, + 1, a[k*lda+i+1:], lda, + a[(k+i)*lda+i:], lda, + 0, y[k*ldy+i:], ldy) + bi.Dgemv(blas.Trans, n-k-i, i, + 1, a[(k+i)*lda:], lda, + a[(k+i)*lda+i:], lda, + 0, t[i:], ldt) + bi.Dgemv(blas.NoTrans, n-k, i, + -1, y[k*ldy:], ldy, + t[i:], ldt, + 1, y[k*ldy+i:], ldy) + bi.Dscal(n-k, tau[i], y[k*ldy+i:], ldy) + + // Compute T[0:i,i]. + bi.Dscal(i, -tau[i], t[i:], ldt) + bi.Dtrmv(blas.Upper, blas.NoTrans, blas.NonUnit, i, + t, ldt, t[i:], ldt) + + t[i*ldt+i] = tau[i] + } + a[(k+nb-1)*lda+nb-1] = ei + + // Compute Y[0:k,0:nb]. 
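+	// Y[0:k] = (A[0:k,1:nb+1]*V1 + A[0:k,nb+1:n-k+1]*V2) * T, where V1 is the
+	// unit lower triangular top block of V and V2 holds the remaining rows;
+	// the two Dtrmm calls apply V1 and T in place.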
+ impl.Dlacpy(blas.All, k, nb, a[1:], lda, y, ldy) + bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, k, nb, + 1, a[k*lda:], lda, y, ldy) + if n > k+nb { + bi.Dgemm(blas.NoTrans, blas.NoTrans, k, nb, n-k-nb, + 1, a[1+nb:], lda, + a[(k+nb)*lda:], lda, + 1, y, ldy) + } + bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.NonUnit, k, nb, + 1, t, ldt, y, ldy) +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaln2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaln2.go new file mode 100644 index 0000000000..54d443988b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaln2.go @@ -0,0 +1,407 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "math" + +// Dlaln2 solves a linear equation or a system of 2 linear equations of the form +// +// (ca A - w D) X = scale B if trans == false, +// (ca Aᵀ - w D) X = scale B if trans == true, +// +// where A is a na×na real matrix, ca is a real scalar, D is a na×na diagonal +// real matrix, w is a scalar, real if nw == 1, complex if nw == 2, and X and B +// are na×1 matrices, real if w is real, complex if w is complex. +// +// If w is complex, X and B are represented as na×2 matrices, the first column +// of each being the real part and the second being the imaginary part. +// +// na and nw must be 1 or 2, otherwise Dlaln2 will panic. +// +// d1 and d2 are the diagonal elements of D. d2 is not used if na == 1. +// +// wr and wi represent the real and imaginary part, respectively, of the scalar +// w. wi is not used if nw == 1. +// +// smin is the desired lower bound on the singular values of A. This should be +// a safe distance away from underflow or overflow, say, between +// (underflow/machine precision) and (overflow*machine precision). +// +// If both singular values of (ca A - w D) are less than smin, smin*identity +// will be used instead of (ca A - w D). If only one singular value is less than +// smin, one element of (ca A - w D) will be perturbed enough to make the +// smallest singular value roughly smin. If both singular values are at least +// smin, (ca A - w D) will not be perturbed. In any case, the perturbation will +// be at most some small multiple of max(smin, ulp*norm(ca A - w D)). The +// singular values are computed by infinity-norm approximations, and thus will +// only be correct to a factor of 2 or so. +// +// All input quantities are assumed to be smaller than overflow by a reasonable +// factor. +// +// scale is a scaling factor less than or equal to 1 which is chosen so that X +// can be computed without overflow. X is further scaled if necessary to assure +// that norm(ca A - w D)*norm(X) is less than overflow. +// +// xnorm contains the infinity-norm of X when X is regarded as a na×nw real +// matrix. +// +// ok will be false if (ca A - w D) had to be perturbed to make its smallest +// singular value greater than smin, otherwise ok will be true. +// +// Dlaln2 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlaln2(trans bool, na, nw int, smin, ca float64, a []float64, lda int, d1, d2 float64, b []float64, ldb int, wr, wi float64, x []float64, ldx int) (scale, xnorm float64, ok bool) { + // TODO(vladimir-ch): Consider splitting this function into two, one + // handling the real case (nw == 1) and the other handling the complex + // case (nw == 2). 
Given that Go has complex types, their signatures + // would be simpler and more natural, and the implementation not as + // convoluted. + + switch { + case na != 1 && na != 2: + panic(badNa) + case nw != 1 && nw != 2: + panic(badNw) + case lda < na: + panic(badLdA) + case len(a) < (na-1)*lda+na: + panic(shortA) + case ldb < nw: + panic(badLdB) + case len(b) < (na-1)*ldb+nw: + panic(shortB) + case ldx < nw: + panic(badLdX) + case len(x) < (na-1)*ldx+nw: + panic(shortX) + } + + smlnum := 2 * dlamchS + bignum := 1 / smlnum + smini := math.Max(smin, smlnum) + + ok = true + scale = 1 + + if na == 1 { + // 1×1 (i.e., scalar) system C X = B. + + if nw == 1 { + // Real 1×1 system. + + // C = ca A - w D. + csr := ca*a[0] - wr*d1 + cnorm := math.Abs(csr) + + // If |C| < smini, use C = smini. + if cnorm < smini { + csr = smini + cnorm = smini + ok = false + } + + // Check scaling for X = B / C. + bnorm := math.Abs(b[0]) + if cnorm < 1 && bnorm > math.Max(1, bignum*cnorm) { + scale = 1 / bnorm + } + + // Compute X. + x[0] = b[0] * scale / csr + xnorm = math.Abs(x[0]) + + return scale, xnorm, ok + } + + // Complex 1×1 system (w is complex). + + // C = ca A - w D. + csr := ca*a[0] - wr*d1 + csi := -wi * d1 + cnorm := math.Abs(csr) + math.Abs(csi) + + // If |C| < smini, use C = smini. + if cnorm < smini { + csr = smini + csi = 0 + cnorm = smini + ok = false + } + + // Check scaling for X = B / C. + bnorm := math.Abs(b[0]) + math.Abs(b[1]) + if cnorm < 1 && bnorm > math.Max(1, bignum*cnorm) { + scale = 1 / bnorm + } + + // Compute X. + cx := complex(scale*b[0], scale*b[1]) / complex(csr, csi) + x[0], x[1] = real(cx), imag(cx) + xnorm = math.Abs(x[0]) + math.Abs(x[1]) + + return scale, xnorm, ok + } + + // 2×2 system. + + // Compute the real part of + // C = ca A - w D + // or + // C = ca Aᵀ - w D. + crv := [4]float64{ + ca*a[0] - wr*d1, + ca * a[1], + ca * a[lda], + ca*a[lda+1] - wr*d2, + } + if trans { + crv[1] = ca * a[lda] + crv[2] = ca * a[1] + } + + pivot := [4][4]int{ + {0, 1, 2, 3}, + {1, 0, 3, 2}, + {2, 3, 0, 1}, + {3, 2, 1, 0}, + } + + if nw == 1 { + // Real 2×2 system (w is real). + + // Find the largest element in C. + var cmax float64 + var icmax int + for j, v := range crv { + v = math.Abs(v) + if v > cmax { + cmax = v + icmax = j + } + } + + // If norm(C) < smini, use smini*identity. + if cmax < smini { + bnorm := math.Max(math.Abs(b[0]), math.Abs(b[ldb])) + if smini < 1 && bnorm > math.Max(1, bignum*smini) { + scale = 1 / bnorm + } + temp := scale / smini + x[0] = temp * b[0] + x[ldx] = temp * b[ldb] + xnorm = temp * bnorm + ok = false + + return scale, xnorm, ok + } + + // Gaussian elimination with complete pivoting. + // Form upper triangular matrix + // [ur11 ur12] + // [ 0 ur22] + ur11 := crv[icmax] + ur12 := crv[pivot[icmax][1]] + cr21 := crv[pivot[icmax][2]] + cr22 := crv[pivot[icmax][3]] + ur11r := 1 / ur11 + lr21 := ur11r * cr21 + ur22 := cr22 - ur12*lr21 + + // If smaller pivot < smini, use smini. + if math.Abs(ur22) < smini { + ur22 = smini + ok = false + } + + var br1, br2 float64 + if icmax > 1 { + // If the pivot lies in the second row, swap the rows. + br1 = b[ldb] + br2 = b[0] + } else { + br1 = b[0] + br2 = b[ldb] + } + br2 -= lr21 * br1 // Apply the Gaussian elimination step to the right-hand side. + + bbnd := math.Max(math.Abs(ur22*ur11r*br1), math.Abs(br2)) + if bbnd > 1 && math.Abs(ur22) < 1 && bbnd >= bignum*math.Abs(ur22) { + scale = 1 / bbnd + } + + // Solve the linear system ur*xr=br. 
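+		// Back substitution on the pivoted upper triangular system
+		//
+		//	[ur11 ur12] [xr1]           [br1]
+		//	[   0 ur22] [xr2] = scale * [br2]
+		//
+		// gives xr2 from the second row and then xr1 from the first.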
+		xr2 := br2 * scale / ur22
+		xr1 := scale*br1*ur11r - ur11r*ur12*xr2
+		if icmax&0x1 != 0 {
+			// If the pivot lies in the second column, swap the components of the solution.
+			x[0] = xr2
+			x[ldx] = xr1
+		} else {
+			x[0] = xr1
+			x[ldx] = xr2
+		}
+		xnorm = math.Max(math.Abs(xr1), math.Abs(xr2))
+
+		// Further scaling if norm(A)*norm(X) > overflow.
+		if xnorm > 1 && cmax > 1 && xnorm > bignum/cmax {
+			temp := cmax / bignum
+			x[0] *= temp
+			x[ldx] *= temp
+			xnorm *= temp
+			scale *= temp
+		}
+
+		return scale, xnorm, ok
+	}
+
+	// Complex 2×2 system (w is complex).
+
+	// Find the largest element in C.
+	civ := [4]float64{
+		-wi * d1,
+		0,
+		0,
+		-wi * d2,
+	}
+	var cmax float64
+	var icmax int
+	for j, v := range crv {
+		v := math.Abs(v)
+		if v+math.Abs(civ[j]) > cmax {
+			cmax = v + math.Abs(civ[j])
+			icmax = j
+		}
+	}
+
+	// If norm(C) < smini, use smini*identity.
+	if cmax < smini {
+		br1 := math.Abs(b[0]) + math.Abs(b[1])
+		br2 := math.Abs(b[ldb]) + math.Abs(b[ldb+1])
+		bnorm := math.Max(br1, br2)
+		if smini < 1 && bnorm > 1 && bnorm > bignum*smini {
+			scale = 1 / bnorm
+		}
+		temp := scale / smini
+		x[0] = temp * b[0]
+		x[1] = temp * b[1]
+		x[ldx] = temp * b[ldb]
+		x[ldx+1] = temp * b[ldb+1]
+		xnorm = temp * bnorm
+		ok = false
+
+		return scale, xnorm, ok
+	}
+
+	// Gaussian elimination with complete pivoting.
+	ur11 := crv[icmax]
+	ui11 := civ[icmax]
+	ur12 := crv[pivot[icmax][1]]
+	ui12 := civ[pivot[icmax][1]]
+	cr21 := crv[pivot[icmax][2]]
+	ci21 := civ[pivot[icmax][2]]
+	cr22 := crv[pivot[icmax][3]]
+	ci22 := civ[pivot[icmax][3]]
+	var (
+		ur11r, ui11r float64
+		lr21, li21   float64
+		ur12s, ui12s float64
+		ur22, ui22   float64
+	)
+	if icmax == 0 || icmax == 3 {
+		// Off-diagonals of pivoted C are real.
+		if math.Abs(ur11) > math.Abs(ui11) {
+			temp := ui11 / ur11
+			ur11r = 1 / (ur11 * (1 + temp*temp))
+			ui11r = -temp * ur11r
+		} else {
+			temp := ur11 / ui11
+			ui11r = -1 / (ui11 * (1 + temp*temp))
+			ur11r = -temp * ui11r
+		}
+		lr21 = cr21 * ur11r
+		li21 = cr21 * ui11r
+		ur12s = ur12 * ur11r
+		ui12s = ur12 * ui11r
+		ur22 = cr22 - ur12*lr21
+		ui22 = ci22 - ur12*li21
+	} else {
+		// Diagonals of pivoted C are real.
+		ur11r = 1 / ur11
+		// ui11r is already 0.
+		lr21 = cr21 * ur11r
+		li21 = ci21 * ur11r
+		ur12s = ur12 * ur11r
+		ui12s = ui12 * ur11r
+		ur22 = cr22 - ur12*lr21 + ui12*li21
+		ui22 = -ur12*li21 - ui12*lr21
+	}
+	u22abs := math.Abs(ur22) + math.Abs(ui22)
+
+	// If smaller pivot < smini, use smini.
+	if u22abs < smini {
+		ur22 = smini
+		ui22 = 0
+		ok = false
+	}
+
+	var br1, bi1 float64
+	var br2, bi2 float64
+	if icmax > 1 {
+		// If the pivot lies in the second row, swap the rows.
+		br1 = b[ldb]
+		bi1 = b[ldb+1]
+		br2 = b[0]
+		bi2 = b[1]
+	} else {
+		br1 = b[0]
+		bi1 = b[1]
+		br2 = b[ldb]
+		bi2 = b[ldb+1]
+	}
+	br2 += -lr21*br1 + li21*bi1
+	bi2 += -li21*br1 - lr21*bi1
+
+	bbnd1 := u22abs * (math.Abs(ur11r) + math.Abs(ui11r)) * (math.Abs(br1) + math.Abs(bi1))
+	bbnd2 := math.Abs(br2) + math.Abs(bi2)
+	bbnd := math.Max(bbnd1, bbnd2)
+	if bbnd > 1 && u22abs < 1 && bbnd >= bignum*u22abs {
+		scale = 1 / bbnd
+		br1 *= scale
+		bi1 *= scale
+		br2 *= scale
+		bi2 *= scale
+	}
+
+	cx2 := complex(br2, bi2) / complex(ur22, ui22)
+	xr2, xi2 := real(cx2), imag(cx2)
+	xr1 := ur11r*br1 - ui11r*bi1 - ur12s*xr2 + ui12s*xi2
+	xi1 := ui11r*br1 + ur11r*bi1 - ui12s*xr2 - ur12s*xi2
+	if icmax&0x1 != 0 {
+		// If the pivot lies in the second column, swap the components of the solution.
+ x[0] = xr2 + x[1] = xi2 + x[ldx] = xr1 + x[ldx+1] = xi1 + } else { + x[0] = xr1 + x[1] = xi1 + x[ldx] = xr2 + x[ldx+1] = xi2 + } + xnorm = math.Max(math.Abs(xr1)+math.Abs(xi1), math.Abs(xr2)+math.Abs(xi2)) + + // Further scaling if norm(A)*norm(X) > overflow. + if xnorm > 1 && cmax > 1 && xnorm > bignum/cmax { + temp := cmax / bignum + x[0] *= temp + x[1] *= temp + x[ldx] *= temp + x[ldx+1] *= temp + xnorm *= temp + scale *= temp + } + + return scale, xnorm, ok +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlangb.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlangb.go new file mode 100644 index 0000000000..4b7b449f63 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlangb.go @@ -0,0 +1,87 @@ +// Copyright ©2021 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/internal/asm/f64" + "gonum.org/v1/gonum/lapack" +) + +// Dlangb returns the given norm of an m×n band matrix with kl sub-diagonals and +// ku super-diagonals. +func (impl Implementation) Dlangb(norm lapack.MatrixNorm, m, n, kl, ku int, ab []float64, ldab int) float64 { + ncol := kl + 1 + ku + switch { + case norm != lapack.MaxAbs && norm != lapack.MaxRowSum && norm != lapack.MaxColumnSum && norm != lapack.Frobenius: + panic(badNorm) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case kl < 0: + panic(klLT0) + case ku < 0: + panic(kuLT0) + case ldab < ncol: + panic(badLdA) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return 0 + } + + switch { + case len(ab) < min(m, n+kl)*ldab: + panic(shortAB) + } + + var value float64 + switch norm { + case lapack.MaxAbs: + for i := 0; i < min(m, n+kl); i++ { + l := max(0, kl-i) + u := min(n+kl-i, ncol) + for _, aij := range ab[i*ldab+l : i*ldab+u] { + aij = math.Abs(aij) + if aij > value || math.IsNaN(aij) { + value = aij + } + } + } + case lapack.MaxRowSum: + for i := 0; i < min(m, n+kl); i++ { + l := max(0, kl-i) + u := min(n+kl-i, ncol) + sum := f64.L1Norm(ab[i*ldab+l : i*ldab+u]) + if sum > value || math.IsNaN(sum) { + value = sum + } + } + case lapack.MaxColumnSum: + for j := 0; j < min(m+ku, n); j++ { + jb := min(kl+j, ncol-1) + ib := max(0, j-ku) + jlen := min(j+kl, m-1) - ib + 1 + sum := f64.L1NormInc(ab[ib*ldab+jb:], jlen, max(1, ldab-1)) + if sum > value || math.IsNaN(sum) { + value = sum + } + } + case lapack.Frobenius: + scale := 0.0 + sum := 1.0 + for i := 0; i < min(m, n+kl); i++ { + l := max(0, kl-i) + u := min(n+kl-i, ncol) + ilen := u - l + scale, sum = impl.Dlassq(ilen, ab[i*ldab+l:], 1, scale, sum) + } + value = scale * math.Sqrt(sum) + } + return value +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlange.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlange.go new file mode 100644 index 0000000000..3a00dce1da --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlange.go @@ -0,0 +1,89 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/lapack" +) + +// Dlange returns the value of the specified norm of a general m×n matrix A: +// +// lapack.MaxAbs: the maximum absolute value of any element. +// lapack.MaxColumnSum: the maximum column sum of the absolute values of the elements (1-norm). +// lapack.MaxRowSum: the maximum row sum of the absolute values of the elements (infinity-norm). 
+//	lapack.Frobenius: the square root of the sum of the squares of the elements (Frobenius norm).
+//
+// If norm == lapack.MaxColumnSum, work must be of length n, and this function will
+// panic otherwise. There are no restrictions on work for the other matrix norms.
func (impl Implementation) Dlange(norm lapack.MatrixNorm, m, n int, a []float64, lda int, work []float64) float64 {
+	// TODO(btracey): These should probably be refactored to use BLAS calls.
+	switch {
+	case norm != lapack.MaxRowSum && norm != lapack.MaxColumnSum && norm != lapack.Frobenius && norm != lapack.MaxAbs:
+		panic(badNorm)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return 0
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case norm == lapack.MaxColumnSum && len(work) < n:
+		panic(shortWork)
+	}
+
+	switch norm {
+	case lapack.MaxAbs:
+		var value float64
+		for i := 0; i < m; i++ {
+			for j := 0; j < n; j++ {
+				value = math.Max(value, math.Abs(a[i*lda+j]))
+			}
+		}
+		return value
+	case lapack.MaxColumnSum:
+		for i := 0; i < n; i++ {
+			work[i] = 0
+		}
+		for i := 0; i < m; i++ {
+			for j := 0; j < n; j++ {
+				work[j] += math.Abs(a[i*lda+j])
+			}
+		}
+		var value float64
+		for i := 0; i < n; i++ {
+			value = math.Max(value, work[i])
+		}
+		return value
+	case lapack.MaxRowSum:
+		var value float64
+		for i := 0; i < m; i++ {
+			var sum float64
+			for j := 0; j < n; j++ {
+				sum += math.Abs(a[i*lda+j])
+			}
+			value = math.Max(value, sum)
+		}
+		return value
+	default:
+		// lapack.Frobenius
+		scale := 0.0
+		sum := 1.0
+		for i := 0; i < m; i++ {
+			scale, sum = impl.Dlassq(n, a[i*lda:], 1, scale, sum)
+		}
+		return scale * math.Sqrt(sum)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlangt.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlangt.go
new file mode 100644
index 0000000000..cd1c49b5c3
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlangt.go
@@ -0,0 +1,115 @@
+// Copyright ©2020 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlangt returns the value of the given norm of an n×n tridiagonal matrix
+// represented by the three diagonals.
+//
+// d must have length at least n and dl and du must have length at least n-1.
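+//
+// A minimal usage sketch (illustrative only; assumes an Implementation
+// value named impl):
+//
+//	// | 4 3 0 |
+//	// | 1 5 7 |  with dl = [1 2], d = [4 5 6], du = [3 7].
+//	// | 0 2 6 |
+//	dl := []float64{1, 2}
+//	d := []float64{4, 5, 6}
+//	du := []float64{3, 7}
+//	maxabs := impl.Dlangt(lapack.MaxAbs, 3, dl, d, du) // maxabs == 7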
+func (impl Implementation) Dlangt(norm lapack.MatrixNorm, n int, dl, d, du []float64) float64 { + switch { + case norm != lapack.MaxAbs && norm != lapack.MaxRowSum && norm != lapack.MaxColumnSum && norm != lapack.Frobenius: + panic(badNorm) + case n < 0: + panic(nLT0) + } + + if n == 0 { + return 0 + } + + switch { + case len(dl) < n-1: + panic(shortDL) + case len(d) < n: + panic(shortD) + case len(du) < n-1: + panic(shortDU) + } + + dl = dl[:n-1] + d = d[:n] + du = du[:n-1] + + var anorm float64 + switch norm { + case lapack.MaxAbs: + for _, diag := range [][]float64{dl, d, du} { + for _, di := range diag { + if math.IsNaN(di) { + return di + } + di = math.Abs(di) + if di > anorm { + anorm = di + } + } + } + case lapack.MaxColumnSum: + if n == 1 { + return math.Abs(d[0]) + } + anorm = math.Abs(d[0]) + math.Abs(dl[0]) + if math.IsNaN(anorm) { + return anorm + } + tmp := math.Abs(du[n-2]) + math.Abs(d[n-1]) + if math.IsNaN(tmp) { + return tmp + } + if tmp > anorm { + anorm = tmp + } + for i := 1; i < n-1; i++ { + tmp = math.Abs(du[i-1]) + math.Abs(d[i]) + math.Abs(dl[i]) + if math.IsNaN(tmp) { + return tmp + } + if tmp > anorm { + anorm = tmp + } + } + case lapack.MaxRowSum: + if n == 1 { + return math.Abs(d[0]) + } + anorm = math.Abs(d[0]) + math.Abs(du[0]) + if math.IsNaN(anorm) { + return anorm + } + tmp := math.Abs(dl[n-2]) + math.Abs(d[n-1]) + if math.IsNaN(tmp) { + return tmp + } + if tmp > anorm { + anorm = tmp + } + for i := 1; i < n-1; i++ { + tmp = math.Abs(dl[i-1]) + math.Abs(d[i]) + math.Abs(du[i]) + if math.IsNaN(tmp) { + return tmp + } + if tmp > anorm { + anorm = tmp + } + } + case lapack.Frobenius: + scale := 0.0 + ssq := 1.0 + scale, ssq = impl.Dlassq(n, d, 1, scale, ssq) + if n > 1 { + scale, ssq = impl.Dlassq(n-1, dl, 1, scale, ssq) + scale, ssq = impl.Dlassq(n-1, du, 1, scale, ssq) + } + anorm = scale * math.Sqrt(ssq) + } + return anorm +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlanhs.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlanhs.go new file mode 100644 index 0000000000..054b90f02b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlanhs.go @@ -0,0 +1,78 @@ +// Copyright ©2023 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" +) + +// Dlanhs returns the value of the one norm, or the Frobenius norm, or the +// infinity norm, or the element of largest absolute value of a Hessenberg +// matrix A. +// +// If norm is lapack.MaxColumnSum, work must have length at least n. 
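+//
+// A minimal usage sketch (illustrative only; assumes an Implementation
+// value named impl):
+//
+//	// 3×3 upper Hessenberg matrix in row-major order.
+//	a := []float64{
+//		1, 2, 3,
+//		4, 5, 6,
+//		0, 7, 8,
+//	}
+//	// The infinity norm is the largest row sum of absolute values.
+//	inf := impl.Dlanhs(lapack.MaxRowSum, 3, a, 3, nil) // inf == 15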
+func (impl Implementation) Dlanhs(norm lapack.MatrixNorm, n int, a []float64, lda int, work []float64) float64 { + switch { + case norm != lapack.MaxRowSum && norm != lapack.MaxAbs && norm != lapack.MaxColumnSum && norm != lapack.Frobenius: + panic(badNorm) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + if n == 0 { + return 0 + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case norm == lapack.MaxColumnSum && len(work) < n: + panic(shortWork) + } + + bi := blas64.Implementation() + var value float64 + switch norm { + case lapack.MaxAbs: + for i := 0; i < n; i++ { + minj := max(0, i-1) + for _, v := range a[i*lda+minj : i*lda+n] { + value = math.Max(value, math.Abs(v)) + } + } + case lapack.MaxColumnSum: + for i := 0; i < n; i++ { + work[i] = 0 + } + for i := 0; i < n; i++ { + for j := max(0, i-1); j < n; j++ { + work[j] += math.Abs(a[i*lda+j]) + } + } + for _, v := range work[:n] { + value = math.Max(value, v) + } + case lapack.MaxRowSum: + for i := 0; i < n; i++ { + minj := max(0, i-1) + sum := bi.Dasum(n-minj, a[i*lda+minj:], 1) + value = math.Max(value, sum) + } + case lapack.Frobenius: + scale := 0.0 + sum := 1.0 + for i := 0; i < n; i++ { + minj := max(0, i-1) + scale, sum = impl.Dlassq(n-minj, a[i*lda+minj:], 1, scale, sum) + } + value = scale * math.Sqrt(sum) + } + return value +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlansb.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlansb.go new file mode 100644 index 0000000000..17801f84b6 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlansb.go @@ -0,0 +1,131 @@ +// Copyright ©2019 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/lapack" +) + +// Dlansb returns the given norm of an n×n symmetric band matrix with kd +// super-diagonals. +// +// When norm is lapack.MaxColumnSum or lapack.MaxRowSum, the length of work must +// be at least n. +func (impl Implementation) Dlansb(norm lapack.MatrixNorm, uplo blas.Uplo, n, kd int, ab []float64, ldab int, work []float64) float64 { + switch { + case norm != lapack.MaxAbs && norm != lapack.MaxRowSum && norm != lapack.MaxColumnSum && norm != lapack.Frobenius: + panic(badNorm) + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case n < 0: + panic(nLT0) + case kd < 0: + panic(kdLT0) + case ldab < kd+1: + panic(badLdA) + } + + // Quick return if possible. 
+ if n == 0 { + return 0 + } + + switch { + case len(ab) < (n-1)*ldab+kd+1: + panic(shortAB) + case len(work) < n && (norm == lapack.MaxColumnSum || norm == lapack.MaxRowSum): + panic(shortWork) + } + + var value float64 + switch norm { + case lapack.MaxAbs: + if uplo == blas.Upper { + for i := 0; i < n; i++ { + for j := 0; j < min(n-i, kd+1); j++ { + aij := math.Abs(ab[i*ldab+j]) + if aij > value || math.IsNaN(aij) { + value = aij + } + } + } + } else { + for i := 0; i < n; i++ { + for j := max(0, kd-i); j < kd+1; j++ { + aij := math.Abs(ab[i*ldab+j]) + if aij > value || math.IsNaN(aij) { + value = aij + } + } + } + } + case lapack.MaxColumnSum, lapack.MaxRowSum: + work = work[:n] + var sum float64 + if uplo == blas.Upper { + for i := range work { + work[i] = 0 + } + for i := 0; i < n; i++ { + sum := work[i] + math.Abs(ab[i*ldab]) + for j := i + 1; j < min(i+kd+1, n); j++ { + aij := math.Abs(ab[i*ldab+j-i]) + sum += aij + work[j] += aij + } + if sum > value || math.IsNaN(sum) { + value = sum + } + } + } else { + for i := 0; i < n; i++ { + sum = 0 + for j := max(0, i-kd); j < i; j++ { + aij := math.Abs(ab[i*ldab+kd+j-i]) + sum += aij + work[j] += aij + } + work[i] = sum + math.Abs(ab[i*ldab+kd]) + } + for _, sum := range work { + if sum > value || math.IsNaN(sum) { + value = sum + } + } + } + case lapack.Frobenius: + scale := 0.0 + sum := 1.0 + if uplo == blas.Upper { + if kd > 0 { + // Sum off-diagonals. + for i := 0; i < n-1; i++ { + ilen := min(n-i-1, kd) + scale, sum = impl.Dlassq(ilen, ab[i*ldab+1:], 1, scale, sum) + } + sum *= 2 + } + // Sum diagonal. + scale, sum = impl.Dlassq(n, ab, ldab, scale, sum) + } else { + if kd > 0 { + // Sum off-diagonals. + for i := 1; i < n; i++ { + ilen := min(i, kd) + scale, sum = impl.Dlassq(ilen, ab[i*ldab+kd-ilen:], 1, scale, sum) + } + sum *= 2 + } + // Sum diagonal. + scale, sum = impl.Dlassq(n, ab[kd:], ldab, scale, sum) + } + value = scale * math.Sqrt(sum) + } + + return value +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlanst.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlanst.go new file mode 100644 index 0000000000..9ca1897e34 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlanst.go @@ -0,0 +1,75 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/lapack" +) + +// Dlanst computes the specified norm of a symmetric tridiagonal matrix A. +// The diagonal elements of A are stored in d and the off-diagonal elements +// are stored in e. 
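+//
+// A minimal usage sketch (illustrative only; assumes an Implementation
+// value named impl):
+//
+//	// Symmetric tridiagonal matrix with diagonal d and off-diagonal e:
+//	// | 2 1 0 |
+//	// | 1 3 1 |
+//	// | 0 1 4 |
+//	d := []float64{2, 3, 4}
+//	e := []float64{1, 1}
+//	one := impl.Dlanst(lapack.MaxColumnSum, 3, d, e) // one == 5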
+func (impl Implementation) Dlanst(norm lapack.MatrixNorm, n int, d, e []float64) float64 { + switch { + case norm != lapack.MaxRowSum && norm != lapack.MaxColumnSum && norm != lapack.Frobenius && norm != lapack.MaxAbs: + panic(badNorm) + case n < 0: + panic(nLT0) + } + if n == 0 { + return 0 + } + switch { + case len(d) < n: + panic(shortD) + case len(e) < n-1: + panic(shortE) + } + + switch norm { + default: + panic(badNorm) + case lapack.MaxAbs: + anorm := math.Abs(d[n-1]) + for i := 0; i < n-1; i++ { + sum := math.Abs(d[i]) + if anorm < sum || math.IsNaN(sum) { + anorm = sum + } + sum = math.Abs(e[i]) + if anorm < sum || math.IsNaN(sum) { + anorm = sum + } + } + return anorm + case lapack.MaxColumnSum, lapack.MaxRowSum: + if n == 1 { + return math.Abs(d[0]) + } + anorm := math.Abs(d[0]) + math.Abs(e[0]) + sum := math.Abs(e[n-2]) + math.Abs(d[n-1]) + if anorm < sum || math.IsNaN(sum) { + anorm = sum + } + for i := 1; i < n-1; i++ { + sum := math.Abs(d[i]) + math.Abs(e[i]) + math.Abs(e[i-1]) + if anorm < sum || math.IsNaN(sum) { + anorm = sum + } + } + return anorm + case lapack.Frobenius: + var scale float64 + sum := 1.0 + if n > 1 { + scale, sum = impl.Dlassq(n-1, e, 1, scale, sum) + sum = 2 * sum + } + scale, sum = impl.Dlassq(n, d, 1, scale, sum) + return scale * math.Sqrt(sum) + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlansy.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlansy.go new file mode 100644 index 0000000000..b972c72e55 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlansy.go @@ -0,0 +1,125 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/lapack" +) + +// Dlansy returns the value of the specified norm of an n×n symmetric matrix. If +// norm == lapack.MaxColumnSum or norm == lapack.MaxRowSum, work must have length +// at least n, otherwise work is unused. +func (impl Implementation) Dlansy(norm lapack.MatrixNorm, uplo blas.Uplo, n int, a []float64, lda int, work []float64) float64 { + switch { + case norm != lapack.MaxRowSum && norm != lapack.MaxColumnSum && norm != lapack.Frobenius && norm != lapack.MaxAbs: + panic(badNorm) + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + // Quick return if possible. + if n == 0 { + return 0 + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case (norm == lapack.MaxColumnSum || norm == lapack.MaxRowSum) && len(work) < n: + panic(shortWork) + } + + switch norm { + case lapack.MaxAbs: + if uplo == blas.Upper { + var max float64 + for i := 0; i < n; i++ { + for j := i; j < n; j++ { + v := math.Abs(a[i*lda+j]) + if math.IsNaN(v) { + return math.NaN() + } + if v > max { + max = v + } + } + } + return max + } + var max float64 + for i := 0; i < n; i++ { + for j := 0; j <= i; j++ { + v := math.Abs(a[i*lda+j]) + if math.IsNaN(v) { + return math.NaN() + } + if v > max { + max = v + } + } + } + return max + case lapack.MaxRowSum, lapack.MaxColumnSum: + // A symmetric matrix has the same 1-norm and ∞-norm. 
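+		// (Row i of A equals column i of Aᵀ = A, so the largest row sum
+		// and the largest column sum coincide; a single accumulation
+		// over the stored triangle therefore serves both norms.)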
+ for i := 0; i < n; i++ { + work[i] = 0 + } + if uplo == blas.Upper { + for i := 0; i < n; i++ { + work[i] += math.Abs(a[i*lda+i]) + for j := i + 1; j < n; j++ { + v := math.Abs(a[i*lda+j]) + work[i] += v + work[j] += v + } + } + } else { + for i := 0; i < n; i++ { + for j := 0; j < i; j++ { + v := math.Abs(a[i*lda+j]) + work[i] += v + work[j] += v + } + work[i] += math.Abs(a[i*lda+i]) + } + } + var max float64 + for i := 0; i < n; i++ { + v := work[i] + if math.IsNaN(v) { + return math.NaN() + } + if v > max { + max = v + } + } + return max + default: + // lapack.Frobenius: + scale := 0.0 + sum := 1.0 + // Sum off-diagonals. + if uplo == blas.Upper { + for i := 0; i < n-1; i++ { + scale, sum = impl.Dlassq(n-i-1, a[i*lda+i+1:], 1, scale, sum) + } + } else { + for i := 1; i < n; i++ { + scale, sum = impl.Dlassq(i, a[i*lda:], 1, scale, sum) + } + } + sum *= 2 + // Sum diagonal. + scale, sum = impl.Dlassq(n, a, lda+1, scale, sum) + return scale * math.Sqrt(sum) + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlantb.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlantb.go new file mode 100644 index 0000000000..ceab2a6af3 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlantb.go @@ -0,0 +1,209 @@ +// Copyright ©2020 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/lapack" +) + +// Dlantb returns the value of the given norm of an n×n triangular band matrix A +// with k+1 diagonals. +// +// When norm is lapack.MaxColumnSum, the length of work must be at least n. +func (impl Implementation) Dlantb(norm lapack.MatrixNorm, uplo blas.Uplo, diag blas.Diag, n, k int, a []float64, lda int, work []float64) float64 { + switch { + case norm != lapack.MaxAbs && norm != lapack.MaxRowSum && norm != lapack.MaxColumnSum && norm != lapack.Frobenius: + panic(badNorm) + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case n < 0: + panic(nLT0) + case k < 0: + panic(kdLT0) + case lda < k+1: + panic(badLdA) + } + + // Quick return if possible. 
+ if n == 0 { + return 0 + } + + switch { + case len(a) < (n-1)*lda+k+1: + panic(shortAB) + case len(work) < n && norm == lapack.MaxColumnSum: + panic(shortWork) + } + + var value float64 + switch norm { + case lapack.MaxAbs: + if uplo == blas.Upper { + var jfirst int + if diag == blas.Unit { + value = 1 + jfirst = 1 + } + for i := 0; i < n; i++ { + for _, aij := range a[i*lda+jfirst : i*lda+min(n-i, k+1)] { + if math.IsNaN(aij) { + return aij + } + aij = math.Abs(aij) + if aij > value { + value = aij + } + } + } + } else { + jlast := k + 1 + if diag == blas.Unit { + value = 1 + jlast = k + } + for i := 0; i < n; i++ { + for _, aij := range a[i*lda+max(0, k-i) : i*lda+jlast] { + if math.IsNaN(aij) { + return math.NaN() + } + aij = math.Abs(aij) + if aij > value { + value = aij + } + } + } + } + case lapack.MaxRowSum: + var sum float64 + if uplo == blas.Upper { + var jfirst int + if diag == blas.Unit { + jfirst = 1 + } + for i := 0; i < n; i++ { + sum = 0 + if diag == blas.Unit { + sum = 1 + } + for _, aij := range a[i*lda+jfirst : i*lda+min(n-i, k+1)] { + sum += math.Abs(aij) + } + if math.IsNaN(sum) { + return math.NaN() + } + if sum > value { + value = sum + } + } + } else { + jlast := k + 1 + if diag == blas.Unit { + jlast = k + } + for i := 0; i < n; i++ { + sum = 0 + if diag == blas.Unit { + sum = 1 + } + for _, aij := range a[i*lda+max(0, k-i) : i*lda+jlast] { + sum += math.Abs(aij) + } + if math.IsNaN(sum) { + return math.NaN() + } + if sum > value { + value = sum + } + } + } + case lapack.MaxColumnSum: + work = work[:n] + if diag == blas.Unit { + for i := range work { + work[i] = 1 + } + } else { + for i := range work { + work[i] = 0 + } + } + if uplo == blas.Upper { + var jfirst int + if diag == blas.Unit { + jfirst = 1 + } + for i := 0; i < n; i++ { + for j, aij := range a[i*lda+jfirst : i*lda+min(n-i, k+1)] { + work[i+jfirst+j] += math.Abs(aij) + } + } + } else { + jlast := k + 1 + if diag == blas.Unit { + jlast = k + } + for i := 0; i < n; i++ { + off := max(0, k-i) + for j, aij := range a[i*lda+off : i*lda+jlast] { + work[i+j+off-k] += math.Abs(aij) + } + } + } + for _, wi := range work { + if math.IsNaN(wi) { + return math.NaN() + } + if wi > value { + value = wi + } + } + case lapack.Frobenius: + var scale, sum float64 + switch uplo { + case blas.Upper: + if diag == blas.Unit { + scale = 1 + sum = float64(n) + if k > 0 { + for i := 0; i < n-1; i++ { + ilen := min(n-i-1, k) + scale, sum = impl.Dlassq(ilen, a[i*lda+1:], 1, scale, sum) + } + } + } else { + scale = 0 + sum = 1 + for i := 0; i < n; i++ { + ilen := min(n-i, k+1) + scale, sum = impl.Dlassq(ilen, a[i*lda:], 1, scale, sum) + } + } + case blas.Lower: + if diag == blas.Unit { + scale = 1 + sum = float64(n) + if k > 0 { + for i := 1; i < n; i++ { + ilen := min(i, k) + scale, sum = impl.Dlassq(ilen, a[i*lda+k-ilen:], 1, scale, sum) + } + } + } else { + scale = 0 + sum = 1 + for i := 0; i < n; i++ { + ilen := min(i, k) + 1 + scale, sum = impl.Dlassq(ilen, a[i*lda+k+1-ilen:], 1, scale, sum) + } + } + } + value = scale * math.Sqrt(sum) + } + return value +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlantr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlantr.go new file mode 100644 index 0000000000..33569832fd --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlantr.go @@ -0,0 +1,252 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/lapack" +) + +// Dlantr computes the specified norm of an m×n trapezoidal matrix A. If +// norm == lapack.MaxColumnSum work must have length at least n, otherwise work +// is unused. +func (impl Implementation) Dlantr(norm lapack.MatrixNorm, uplo blas.Uplo, diag blas.Diag, m, n int, a []float64, lda int, work []float64) float64 { + switch { + case norm != lapack.MaxRowSum && norm != lapack.MaxColumnSum && norm != lapack.Frobenius && norm != lapack.MaxAbs: + panic(badNorm) + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case diag != blas.Unit && diag != blas.NonUnit: + panic(badDiag) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + // Quick return if possible. + minmn := min(m, n) + if minmn == 0 { + return 0 + } + + switch { + case len(a) < (m-1)*lda+n: + panic(shortA) + case norm == lapack.MaxColumnSum && len(work) < n: + panic(shortWork) + } + + switch norm { + case lapack.MaxAbs: + if diag == blas.Unit { + value := 1.0 + if uplo == blas.Upper { + for i := 0; i < m; i++ { + for j := i + 1; j < n; j++ { + tmp := math.Abs(a[i*lda+j]) + if math.IsNaN(tmp) { + return tmp + } + if tmp > value { + value = tmp + } + } + } + return value + } + for i := 1; i < m; i++ { + for j := 0; j < min(i, n); j++ { + tmp := math.Abs(a[i*lda+j]) + if math.IsNaN(tmp) { + return tmp + } + if tmp > value { + value = tmp + } + } + } + return value + } + var value float64 + if uplo == blas.Upper { + for i := 0; i < m; i++ { + for j := i; j < n; j++ { + tmp := math.Abs(a[i*lda+j]) + if math.IsNaN(tmp) { + return tmp + } + if tmp > value { + value = tmp + } + } + } + return value + } + for i := 0; i < m; i++ { + for j := 0; j <= min(i, n-1); j++ { + tmp := math.Abs(a[i*lda+j]) + if math.IsNaN(tmp) { + return tmp + } + if tmp > value { + value = tmp + } + } + } + return value + case lapack.MaxColumnSum: + if diag == blas.Unit { + for i := 0; i < minmn; i++ { + work[i] = 1 + } + for i := minmn; i < n; i++ { + work[i] = 0 + } + if uplo == blas.Upper { + for i := 0; i < m; i++ { + for j := i + 1; j < n; j++ { + work[j] += math.Abs(a[i*lda+j]) + } + } + } else { + for i := 1; i < m; i++ { + for j := 0; j < min(i, n); j++ { + work[j] += math.Abs(a[i*lda+j]) + } + } + } + } else { + for i := 0; i < n; i++ { + work[i] = 0 + } + if uplo == blas.Upper { + for i := 0; i < m; i++ { + for j := i; j < n; j++ { + work[j] += math.Abs(a[i*lda+j]) + } + } + } else { + for i := 0; i < m; i++ { + for j := 0; j <= min(i, n-1); j++ { + work[j] += math.Abs(a[i*lda+j]) + } + } + } + } + var max float64 + for _, v := range work[:n] { + if math.IsNaN(v) { + return math.NaN() + } + if v > max { + max = v + } + } + return max + case lapack.MaxRowSum: + var maxsum float64 + if diag == blas.Unit { + if uplo == blas.Upper { + for i := 0; i < m; i++ { + var sum float64 + if i < minmn { + sum = 1 + } + for j := i + 1; j < n; j++ { + sum += math.Abs(a[i*lda+j]) + } + if math.IsNaN(sum) { + return math.NaN() + } + if sum > maxsum { + maxsum = sum + } + } + return maxsum + } else { + for i := 0; i < m; i++ { + var sum float64 + if i < minmn { + sum = 1 + } + for j := 0; j < min(i, n); j++ { + sum += math.Abs(a[i*lda+j]) + } + if math.IsNaN(sum) { + return math.NaN() + } + if sum > maxsum { + maxsum = sum + } + } + return maxsum + } + } else { + if uplo == blas.Upper { + for i := 0; i < m; i++ { + var sum float64 + for j := i; j < n; j++ { + sum += math.Abs(a[i*lda+j]) + } + if 
math.IsNaN(sum) { + return sum + } + if sum > maxsum { + maxsum = sum + } + } + return maxsum + } else { + for i := 0; i < m; i++ { + var sum float64 + for j := 0; j <= min(i, n-1); j++ { + sum += math.Abs(a[i*lda+j]) + } + if math.IsNaN(sum) { + return sum + } + if sum > maxsum { + maxsum = sum + } + } + return maxsum + } + } + default: + // lapack.Frobenius: + var scale, sum float64 + if diag == blas.Unit { + scale = 1 + sum = float64(min(m, n)) + if uplo == blas.Upper { + for i := 0; i < min(m, n); i++ { + scale, sum = impl.Dlassq(n-i-1, a[i*lda+i+1:], 1, scale, sum) + } + } else { + for i := 1; i < m; i++ { + scale, sum = impl.Dlassq(min(i, n), a[i*lda:], 1, scale, sum) + } + } + } else { + scale = 0 + sum = 1 + if uplo == blas.Upper { + for i := 0; i < min(m, n); i++ { + scale, sum = impl.Dlassq(n-i, a[i*lda+i:], 1, scale, sum) + } + } else { + for i := 0; i < m; i++ { + scale, sum = impl.Dlassq(min(i+1, n), a[i*lda:], 1, scale, sum) + } + } + } + return scale * math.Sqrt(sum) + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlanv2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlanv2.go new file mode 100644 index 0000000000..360f71b1d3 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlanv2.go @@ -0,0 +1,151 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "math" + +// Dlanv2 computes the Schur factorization of a real 2×2 matrix: +// +// [ a b ] = [ cs -sn ] * [ aa bb ] * [ cs sn ] +// [ c d ] [ sn cs ] [ cc dd ] * [-sn cs ] +// +// If cc is zero, aa and dd are real eigenvalues of the matrix. Otherwise it +// holds that aa = dd and bb*cc < 0, and aa ± sqrt(bb*cc) are complex conjugate +// eigenvalues. The real and imaginary parts of the eigenvalues are returned in +// (rt1r,rt1i) and (rt2r,rt2i). +func (impl Implementation) Dlanv2(a, b, c, d float64) (aa, bb, cc, dd float64, rt1r, rt1i, rt2r, rt2i float64, cs, sn float64) { + switch { + case c == 0: // Matrix is already upper triangular. + aa = a + bb = b + cc = 0 + dd = d + cs = 1 + sn = 0 + case b == 0: // Matrix is lower triangular, swap rows and columns. + aa = d + bb = -c + cc = 0 + dd = a + cs = 0 + sn = 1 + case a == d && math.Signbit(b) != math.Signbit(c): // Matrix is already in the standard Schur form. + aa = a + bb = b + cc = c + dd = d + cs = 1 + sn = 0 + default: + temp := a - d + p := temp / 2 + bcmax := math.Max(math.Abs(b), math.Abs(c)) + bcmis := math.Min(math.Abs(b), math.Abs(c)) + if b*c < 0 { + bcmis *= -1 + } + scale := math.Max(math.Abs(p), bcmax) + z := p/scale*p + bcmax/scale*bcmis + eps := dlamchP + + if z >= 4*eps { + // Real eigenvalues. Compute aa and dd. + if p > 0 { + z = p + math.Sqrt(scale)*math.Sqrt(z) + } else { + z = p - math.Sqrt(scale)*math.Sqrt(z) + } + aa = d + z + dd = d - bcmax/z*bcmis + // Compute bb and the rotation matrix. + tau := impl.Dlapy2(c, z) + cs = z / tau + sn = c / tau + bb = b - c + cc = 0 + } else { + // Complex eigenvalues, or real (almost) equal eigenvalues. + // Make diagonal elements equal. 
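+			// (safmn2 below is approximately sqrt(safe minimum / eps);
+			// sigma and temp are rescaled by powers of it so that the
+			// hypot-style computation of tau can neither overflow nor
+			// underflow.)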
+ safmn2 := math.Pow(dlamchB, math.Log(dlamchS/dlamchE)/math.Log(dlamchB)/2) + safmx2 := 1 / safmn2 + sigma := b + c + loop: + for iter := 0; iter < 20; iter++ { + scale = math.Max(math.Abs(temp), math.Abs(sigma)) + switch { + case scale >= safmx2: + sigma *= safmn2 + temp *= safmn2 + case scale <= safmn2: + sigma *= safmx2 + temp *= safmx2 + default: + break loop + } + } + p = temp / 2 + tau := impl.Dlapy2(sigma, temp) + cs = math.Sqrt((1 + math.Abs(sigma)/tau) / 2) + sn = -p / (tau * cs) + if sigma < 0 { + sn *= -1 + } + // Compute [ aa bb ] = [ a b ] [ cs -sn ] + // [ cc dd ] [ c d ] [ sn cs ] + aa = a*cs + b*sn + bb = -a*sn + b*cs + cc = c*cs + d*sn + dd = -c*sn + d*cs + // Compute [ a b ] = [ cs sn ] [ aa bb ] + // [ c d ] [-sn cs ] [ cc dd ] + a = aa*cs + cc*sn + b = bb*cs + dd*sn + c = -aa*sn + cc*cs + d = -bb*sn + dd*cs + + temp = (a + d) / 2 + aa = temp + bb = b + cc = c + dd = temp + + if cc != 0 { + if bb != 0 { + if math.Signbit(bb) == math.Signbit(cc) { + // Real eigenvalues, reduce to + // upper triangular form. + sab := math.Sqrt(math.Abs(bb)) + sac := math.Sqrt(math.Abs(cc)) + p = sab * sac + if cc < 0 { + p *= -1 + } + tau = 1 / math.Sqrt(math.Abs(bb+cc)) + aa = temp + p + bb = bb - cc + cc = 0 + dd = temp - p + cs1 := sab * tau + sn1 := sac * tau + cs, sn = cs*cs1-sn*sn1, cs*sn1+sn*cs1 + } + } else { + bb = -cc + cc = 0 + cs, sn = -sn, cs + } + } + } + } + + // Store eigenvalues in (rt1r,rt1i) and (rt2r,rt2i). + rt1r = aa + rt2r = dd + if cc != 0 { + rt1i = math.Sqrt(math.Abs(bb)) * math.Sqrt(math.Abs(cc)) + rt2i = -rt1i + } + return +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlapll.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlapll.go new file mode 100644 index 0000000000..bf98c338eb --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlapll.go @@ -0,0 +1,55 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "gonum.org/v1/gonum/blas/blas64" + +// Dlapll returns the smallest singular value of the n×2 matrix A = [ x y ]. +// The function first computes the QR factorization of A = Q*R, and then computes +// the SVD of the 2-by-2 upper triangular matrix r. +// +// The contents of x and y are overwritten during the call. +// +// Dlapll is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlapll(n int, x []float64, incX int, y []float64, incY int) float64 { + switch { + case n < 0: + panic(nLT0) + case incX <= 0: + panic(badIncX) + case incY <= 0: + panic(badIncY) + } + + // Quick return if possible. + if n == 0 { + return 0 + } + + switch { + case len(x) < 1+(n-1)*incX: + panic(shortX) + case len(y) < 1+(n-1)*incY: + panic(shortY) + } + + // Quick return if possible. + if n == 1 { + return 0 + } + + // Compute the QR factorization of the N-by-2 matrix [ X Y ]. + a00, tau := impl.Dlarfg(n, x[0], x[incX:], incX) + x[0] = 1 + + bi := blas64.Implementation() + c := -tau * bi.Ddot(n, x, incX, y, incY) + bi.Daxpy(n, c, x, incX, y, incY) + a11, _ := impl.Dlarfg(n-1, y[incY], y[2*incY:], incY) + + // Compute the SVD of 2-by-2 upper triangular matrix. 
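+	// The triangular factor is
+	//
+	//	R = [ a00 y[0] ]
+	//	    [ 0   a11  ]
+	//
+	// and Dlas2 computes its singular values; only the smaller is needed.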
+ ssmin, _ := impl.Dlas2(a00, y[0], a11) + return ssmin +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlapmr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlapmr.go new file mode 100644 index 0000000000..73cd82db96 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlapmr.go @@ -0,0 +1,88 @@ +// Copyright ©2022 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "gonum.org/v1/gonum/blas/blas64" + +// Dlapmr rearranges the rows of the m×n matrix X as specified by the permutation +// k[0],k[1],...,k[m-1] of the integers 0,...,m-1. +// +// If forward is true, a forward permutation is applied: +// +// X[k[i],0:n] is moved to X[i,0:n] for i=0,1,...,m-1. +// +// If forward is false, a backward permutation is applied: +// +// X[i,0:n] is moved to X[k[i],0:n] for i=0,1,...,m-1. +// +// k must have length m, otherwise Dlapmr will panic. +func (impl Implementation) Dlapmr(forward bool, m, n int, x []float64, ldx int, k []int) { + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case ldx < max(1, n): + panic(badLdX) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + switch { + case len(x) < (m-1)*ldx+n: + panic(shortX) + case len(k) != m: + panic(badLenK) + } + + // Quick return if possible. + if m == 1 { + return + } + + bi := blas64.Implementation() + + for i, ki := range k { + k[i] = -(ki + 1) + } + if forward { + for i, ki := range k { + if ki >= 0 { + continue + } + j := i + k[j] = -k[j] - 1 + in := k[j] + for { + if k[in] >= 0 { + break + } + bi.Dswap(n, x[j*ldx:], 1, x[in*ldx:], 1) + k[in] = -k[in] - 1 + j = in + in = k[in] + } + } + } else { + for i, ki := range k { + if ki >= 0 { + continue + } + k[i] = -ki - 1 + j := k[i] + for { + if j == i { + break + } + bi.Dswap(n, x[i*ldx:], 1, x[j*ldx:], 1) + k[j] = -k[j] - 1 + j = k[j] + } + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlapmt.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlapmt.go new file mode 100644 index 0000000000..4a70e68f04 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlapmt.go @@ -0,0 +1,89 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "gonum.org/v1/gonum/blas/blas64" + +// Dlapmt rearranges the columns of the m×n matrix X as specified by the +// permutation k_0, k_1, ..., k_n-1 of the integers 0, ..., n-1. +// +// If forward is true a forward permutation is performed: +// +// X[0:m, k[j]] is moved to X[0:m, j] for j = 0, 1, ..., n-1. +// +// otherwise a backward permutation is performed: +// +// X[0:m, j] is moved to X[0:m, k[j]] for j = 0, 1, ..., n-1. +// +// k must have length n, otherwise Dlapmt will panic. k is zero-indexed. +func (impl Implementation) Dlapmt(forward bool, m, n int, x []float64, ldx int, k []int) { + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case ldx < max(1, n): + panic(badLdX) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + switch { + case len(x) < (m-1)*ldx+n: + panic(shortX) + case len(k) != n: + panic(badLenK) + } + + // Quick return if possible. 
+ if n == 1 { + return + } + + for i, v := range k { + v++ + k[i] = -v + } + + bi := blas64.Implementation() + + if forward { + for j, v := range k { + if v >= 0 { + continue + } + k[j] = -v + i := -v - 1 + for k[i] < 0 { + bi.Dswap(m, x[j:], ldx, x[i:], ldx) + + k[i] = -k[i] + j = i + i = k[i] - 1 + } + } + } else { + for i, v := range k { + if v >= 0 { + continue + } + k[i] = -v + j := -v - 1 + for j != i { + bi.Dswap(m, x[j:], ldx, x[i:], ldx) + + k[j] = -k[j] + j = k[j] - 1 + } + } + } + + for i := range k { + k[i]-- + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlapy2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlapy2.go new file mode 100644 index 0000000000..19f73ffabd --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlapy2.go @@ -0,0 +1,14 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "math" + +// Dlapy2 is the LAPACK version of math.Hypot. +// +// Dlapy2 is an internal routine. It is exported for testing purposes. +func (Implementation) Dlapy2(x, y float64) float64 { + return math.Hypot(x, y) +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqp2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqp2.go new file mode 100644 index 0000000000..cc3bc06db6 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqp2.go @@ -0,0 +1,127 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dlaqp2 computes a QR factorization with column pivoting of the block A[offset:m, 0:n] +// of the m×n matrix A. The block A[0:offset, 0:n] is accordingly pivoted, but not factorized. +// +// On exit, the upper triangle of block A[offset:m, 0:n] is the triangular factor obtained. +// The elements in block A[offset:m, 0:n] below the diagonal, together with tau, represent +// the orthogonal matrix Q as a product of elementary reflectors. +// +// offset is number of rows of the matrix A that must be pivoted but not factorized. +// offset must not be negative otherwise Dlaqp2 will panic. +// +// On exit, jpvt holds the permutation that was applied; the jth column of A*P was the +// jpvt[j] column of A. jpvt must have length n, otherwise Dlaqp2 will panic. +// +// On exit tau holds the scalar factors of the elementary reflectors. It must have length +// at least min(m-offset, n) otherwise Dlaqp2 will panic. +// +// vn1 and vn2 hold the partial and complete column norms respectively. They must have length n, +// otherwise Dlaqp2 will panic. +// +// work must have length n, otherwise Dlaqp2 will panic. +// +// Dlaqp2 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlaqp2(m, n, offset int, a []float64, lda int, jpvt []int, tau, vn1, vn2, work []float64) { + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case offset < 0: + panic(offsetLT0) + case offset > m: + panic(offsetGTM) + case lda < max(1, n): + panic(badLdA) + } + + // Quick return if possible. 
+ if m == 0 || n == 0 { + return + } + + mn := min(m-offset, n) + switch { + case len(a) < (m-1)*lda+n: + panic(shortA) + case len(jpvt) != n: + panic(badLenJpvt) + case len(tau) < mn: + panic(shortTau) + case len(vn1) < n: + panic(shortVn1) + case len(vn2) < n: + panic(shortVn2) + case len(work) < n: + panic(shortWork) + } + + tol3z := math.Sqrt(dlamchE) + + bi := blas64.Implementation() + + // Compute factorization. + for i := 0; i < mn; i++ { + offpi := offset + i + + // Determine ith pivot column and swap if necessary. + p := i + bi.Idamax(n-i, vn1[i:], 1) + if p != i { + bi.Dswap(m, a[p:], lda, a[i:], lda) + jpvt[p], jpvt[i] = jpvt[i], jpvt[p] + vn1[p] = vn1[i] + vn2[p] = vn2[i] + } + + // Generate elementary reflector H_i. + if offpi < m-1 { + a[offpi*lda+i], tau[i] = impl.Dlarfg(m-offpi, a[offpi*lda+i], a[(offpi+1)*lda+i:], lda) + } else { + tau[i] = 0 + } + + if i < n-1 { + // Apply H_iᵀ to A[offset+i:m, i:n] from the left. + aii := a[offpi*lda+i] + a[offpi*lda+i] = 1 + impl.Dlarf(blas.Left, m-offpi, n-i-1, a[offpi*lda+i:], lda, tau[i], a[offpi*lda+i+1:], lda, work) + a[offpi*lda+i] = aii + } + + // Update partial column norms. + for j := i + 1; j < n; j++ { + if vn1[j] == 0 { + continue + } + + // The following marked lines follow from the + // analysis in Lapack Working Note 176. + r := math.Abs(a[offpi*lda+j]) / vn1[j] // * + temp := math.Max(0, 1-r*r) // * + r = vn1[j] / vn2[j] // * + temp2 := temp * r * r // * + if temp2 < tol3z { + var v float64 + if offpi < m-1 { + v = bi.Dnrm2(m-offpi-1, a[(offpi+1)*lda+j:], lda) + } + vn1[j] = v + vn2[j] = v + } else { + vn1[j] *= math.Sqrt(temp) // * + } + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqps.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqps.go new file mode 100644 index 0000000000..da1a60e5cf --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqps.go @@ -0,0 +1,244 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dlaqps computes a step of QR factorization with column pivoting +// of an m×n matrix A by using Blas-3. It tries to factorize nb +// columns from A starting from the row offset, and updates all +// of the matrix with Dgemm. +// +// In some cases, due to catastrophic cancellations, it cannot +// factorize nb columns. Hence, the actual number of factorized +// columns is returned in kb. +// +// Dlaqps computes a QR factorization with column pivoting of the +// block A[offset:m, 0:nb] of the m×n matrix A. The block +// A[0:offset, 0:n] is accordingly pivoted, but not factorized. +// +// On exit, the upper triangle of block A[offset:m, 0:kb] is the +// triangular factor obtained. The elements in block A[offset:m, 0:n] +// below the diagonal, together with tau, represent the orthogonal +// matrix Q as a product of elementary reflectors. +// +// offset is number of rows of the matrix A that must be pivoted but +// not factorized. offset must not be negative otherwise Dlaqps will panic. +// +// On exit, jpvt holds the permutation that was applied; the jth column +// of A*P was the jpvt[j] column of A. jpvt must have length n, +// otherwise Dlapqs will panic. +// +// On exit tau holds the scalar factors of the elementary reflectors. +// It must have length nb, otherwise Dlapqs will panic. +// +// vn1 and vn2 hold the partial and complete column norms respectively. 
+// They must have length n, otherwise Dlapqs will panic. +// +// auxv must have length nb, otherwise Dlaqps will panic. +// +// f and ldf represent an n×nb matrix F that is overwritten during the +// call. +// +// Dlaqps is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlaqps(m, n, offset, nb int, a []float64, lda int, jpvt []int, tau, vn1, vn2, auxv, f []float64, ldf int) (kb int) { + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case offset < 0: + panic(offsetLT0) + case offset > m: + panic(offsetGTM) + case nb < 0: + panic(nbLT0) + case nb > n: + panic(nbGTN) + case lda < max(1, n): + panic(badLdA) + case ldf < max(1, nb): + panic(badLdF) + } + + if m == 0 || n == 0 { + return 0 + } + + switch { + case len(a) < (m-1)*lda+n: + panic(shortA) + case len(jpvt) != n: + panic(badLenJpvt) + case len(vn1) < n: + panic(shortVn1) + case len(vn2) < n: + panic(shortVn2) + } + + if nb == 0 { + return 0 + } + + switch { + case len(tau) < nb: + panic(shortTau) + case len(auxv) < nb: + panic(shortAuxv) + case len(f) < (n-1)*ldf+nb: + panic(shortF) + } + + if offset == m { + return 0 + } + + lastrk := min(m, n+offset) + lsticc := -1 + tol3z := math.Sqrt(dlamchE) + + bi := blas64.Implementation() + + var k, rk int + for ; k < nb && lsticc == -1; k++ { + rk = offset + k + + // Determine kth pivot column and swap if necessary. + p := k + bi.Idamax(n-k, vn1[k:], 1) + if p != k { + bi.Dswap(m, a[p:], lda, a[k:], lda) + bi.Dswap(k, f[p*ldf:], 1, f[k*ldf:], 1) + jpvt[p], jpvt[k] = jpvt[k], jpvt[p] + vn1[p] = vn1[k] + vn2[p] = vn2[k] + } + + // Apply previous Householder reflectors to column K: + // + // A[rk:m, k] = A[rk:m, k] - A[rk:m, 0:k-1]*F[k, 0:k-1]ᵀ. + if k > 0 { + bi.Dgemv(blas.NoTrans, m-rk, k, -1, + a[rk*lda:], lda, + f[k*ldf:], 1, + 1, + a[rk*lda+k:], lda) + } + + // Generate elementary reflector H_k. + if rk < m-1 { + a[rk*lda+k], tau[k] = impl.Dlarfg(m-rk, a[rk*lda+k], a[(rk+1)*lda+k:], lda) + } else { + tau[k] = 0 + } + + akk := a[rk*lda+k] + a[rk*lda+k] = 1 + + // Compute kth column of F: + // + // Compute F[k+1:n, k] = tau[k]*A[rk:m, k+1:n]ᵀ*A[rk:m, k]. + if k < n-1 { + bi.Dgemv(blas.Trans, m-rk, n-k-1, tau[k], + a[rk*lda+k+1:], lda, + a[rk*lda+k:], lda, + 0, + f[(k+1)*ldf+k:], ldf) + } + + // Padding F[0:k, k] with zeros. + for j := 0; j < k; j++ { + f[j*ldf+k] = 0 + } + + // Incremental updating of F: + // + // F[0:n, k] := F[0:n, k] - tau[k]*F[0:n, 0:k-1]*A[rk:m, 0:k-1]ᵀ*A[rk:m,k]. + if k > 0 { + bi.Dgemv(blas.Trans, m-rk, k, -tau[k], + a[rk*lda:], lda, + a[rk*lda+k:], lda, + 0, + auxv, 1) + bi.Dgemv(blas.NoTrans, n, k, 1, + f, ldf, + auxv, 1, + 1, + f[k:], ldf) + } + + // Update the current row of A: + // + // A[rk, k+1:n] = A[rk, k+1:n] - A[rk, 0:k]*F[k+1:n, 0:k]ᵀ. + if k < n-1 { + bi.Dgemv(blas.NoTrans, n-k-1, k+1, -1, + f[(k+1)*ldf:], ldf, + a[rk*lda:], 1, + 1, + a[rk*lda+k+1:], 1) + } + + // Update partial column norms. + if rk < lastrk-1 { + for j := k + 1; j < n; j++ { + if vn1[j] == 0 { + continue + } + + // The following marked lines follow from the + // analysis in Lapack Working Note 176. + r := math.Abs(a[rk*lda+j]) / vn1[j] // * + temp := math.Max(0, 1-r*r) // * + r = vn1[j] / vn2[j] // * + temp2 := temp * r * r // * + if temp2 < tol3z { + // vn2 is used here as a collection of + // indices into vn2 and also a collection + // of column norms. 
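+					// (The overwritten entries form a
+					// linked list: vn2[j] stores the
+					// previously deferred column and
+					// lsticc its head. The list is
+					// unwound in the recomputation
+					// loop after the block update.)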
+ vn2[j] = float64(lsticc) + lsticc = j + } else { + vn1[j] *= math.Sqrt(temp) // * + } + } + } + + a[rk*lda+k] = akk + } + kb = k + rk = offset + kb + + // Apply the block reflector to the rest of the matrix: + // + // A[offset+kb+1:m, kb+1:n] := A[offset+kb+1:m, kb+1:n] - A[offset+kb+1:m, 1:kb]*F[kb+1:n, 1:kb]ᵀ. + if kb < min(n, m-offset) { + bi.Dgemm(blas.NoTrans, blas.Trans, + m-rk, n-kb, kb, -1, + a[rk*lda:], lda, + f[kb*ldf:], ldf, + 1, + a[rk*lda+kb:], lda) + } + + // Recomputation of difficult columns. + for lsticc >= 0 { + itemp := int(vn2[lsticc]) + + // NOTE: The computation of vn1[lsticc] relies on the fact that + // Dnrm2 does not fail on vectors with norm below the value of + // sqrt(dlamchS) + v := bi.Dnrm2(m-rk, a[rk*lda+lsticc:], lda) + vn1[lsticc] = v + vn2[lsticc] = v + + lsticc = itemp + } + + return kb +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr04.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr04.go new file mode 100644 index 0000000000..8e4b266b85 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr04.go @@ -0,0 +1,493 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" +) + +// Dlaqr04 computes the eigenvalues of a block of an n×n upper Hessenberg matrix +// H, and optionally the matrices T and Z from the Schur decomposition +// +// H = Z T Zᵀ +// +// where T is an upper quasi-triangular matrix (the Schur form), and Z is the +// orthogonal matrix of Schur vectors. +// +// wantt indicates whether the full Schur form T is required. If wantt is false, +// then only enough of H will be updated to preserve the eigenvalues. +// +// wantz indicates whether the n×n matrix of Schur vectors Z is required. If it +// is true, the orthogonal similarity transformation will be accumulated into +// Z[iloz:ihiz+1,ilo:ihi+1], otherwise Z will not be referenced. +// +// ilo and ihi determine the block of H on which Dlaqr04 operates. It must hold that +// +// 0 <= ilo <= ihi < n if n > 0, +// ilo == 0 and ihi == -1 if n == 0, +// +// and the block must be isolated, that is, +// +// ilo == 0 or H[ilo,ilo-1] == 0, +// ihi == n-1 or H[ihi+1,ihi] == 0, +// +// otherwise Dlaqr04 will panic. +// +// wr and wi must have length ihi+1. +// +// iloz and ihiz specify the rows of Z to which transformations will be applied +// if wantz is true. It must hold that +// +// 0 <= iloz <= ilo, and ihi <= ihiz < n, +// +// otherwise Dlaqr04 will panic. +// +// work must have length at least lwork and lwork must be +// +// lwork >= 1 if n <= 11, +// lwork >= n if n > 11, +// +// otherwise Dlaqr04 will panic. lwork as large as 6*n may be required for +// optimal performance. On return, work[0] will contain the optimal value of +// lwork. +// +// If lwork is -1, instead of performing Dlaqr04, the function only estimates the +// optimal workspace size and stores it into work[0]. Neither h nor z are +// accessed. +// +// recur is the non-negative recursion depth. For recur > 0, Dlaqr04 behaves +// as DLAQR0, for recur == 0 it behaves as DLAQR4. +// +// unconverged indicates whether Dlaqr04 computed all the eigenvalues of H[ilo:ihi+1,ilo:ihi+1]. +// +// If unconverged is zero and wantt is true, H will contain on return the upper +// quasi-triangular matrix T from the Schur decomposition. 
2×2 diagonal blocks +// (corresponding to complex conjugate pairs of eigenvalues) will be returned in +// standard form, with H[i,i] == H[i+1,i+1] and H[i+1,i]*H[i,i+1] < 0. +// +// If unconverged is zero and if wantt is false, the contents of h on return is +// unspecified. +// +// If unconverged is zero, all the eigenvalues have been computed and their real +// and imaginary parts will be stored on return in wr[ilo:ihi+1] and +// wi[ilo:ihi+1], respectively. If two eigenvalues are computed as a complex +// conjugate pair, they are stored in consecutive elements of wr and wi, say the +// i-th and (i+1)th, with wi[i] > 0 and wi[i+1] < 0. If wantt is true, then the +// eigenvalues are stored in the same order as on the diagonal of the Schur form +// returned in H, with wr[i] = H[i,i] and, if H[i:i+2,i:i+2] is a 2×2 diagonal +// block, wi[i] = sqrt(-H[i+1,i]*H[i,i+1]) and wi[i+1] = -wi[i]. +// +// If unconverged is positive, some eigenvalues have not converged, and +// wr[unconverged:ihi+1] and wi[unconverged:ihi+1] will contain those +// eigenvalues which have been successfully computed. Failures are rare. +// +// If unconverged is positive and wantt is true, then on return +// +// (initial H)*U = U*(final H), (*) +// +// where U is an orthogonal matrix. The final H is upper Hessenberg and +// H[unconverged:ihi+1,unconverged:ihi+1] is upper quasi-triangular. +// +// If unconverged is positive and wantt is false, on return the remaining +// unconverged eigenvalues are the eigenvalues of the upper Hessenberg matrix +// H[ilo:unconverged,ilo:unconverged]. +// +// If unconverged is positive and wantz is true, then on return +// +// (final Z) = (initial Z)*U, +// +// where U is the orthogonal matrix in (*) regardless of the value of wantt. +// +// References: +// +// [1] K. Braman, R. Byers, R. Mathias. The Multishift QR Algorithm. Part I: +// Maintaining Well-Focused Shifts and Level 3 Performance. SIAM J. Matrix +// Anal. Appl. 23(4) (2002), pp. 929—947 +// URL: http://dx.doi.org/10.1137/S0895479801384573 +// [2] K. Braman, R. Byers, R. Mathias. The Multishift QR Algorithm. Part II: +// Aggressive Early Deflation. SIAM J. Matrix Anal. Appl. 23(4) (2002), pp. 948—973 +// URL: http://dx.doi.org/10.1137/S0895479801384585 +// +// Dlaqr04 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlaqr04(wantt, wantz bool, n, ilo, ihi int, h []float64, ldh int, wr, wi []float64, iloz, ihiz int, z []float64, ldz int, work []float64, lwork int, recur int) (unconverged int) { + const ( + // Matrices of order ntiny or smaller must be processed by + // Dlahqr because of insufficient subdiagonal scratch space. + // This is a hard limit. + ntiny = 15 + // Exceptional deflation windows: try to cure rare slow + // convergence by varying the size of the deflation window after + // kexnw iterations. + kexnw = 5 + // Exceptional shifts: try to cure rare slow convergence with + // ad-hoc exceptional shifts every kexsh iterations. + kexsh = 6 + + // See https://github.com/gonum/lapack/pull/151#discussion_r68162802 + // and the surrounding discussion for an explanation where these + // constants come from. + // TODO(vladimir-ch): Similar constants for exceptional shifts + // are used also in dlahqr.go. The first constant is different + // there, it is equal to 3. Why? And does it matter? 
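+		// wilk1 and wilk2 parametrize the ad-hoc exceptional shifts
+		// applied in the main loop below when ndfl%kexsh == 0.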
+ wilk1 = 0.75 + wilk2 = -0.4375 + ) + + switch { + case n < 0: + panic(nLT0) + case ilo < 0 || max(0, n-1) < ilo: + panic(badIlo) + case ihi < min(ilo, n-1) || n <= ihi: + panic(badIhi) + case ldh < max(1, n): + panic(badLdH) + case wantz && (iloz < 0 || ilo < iloz): + panic(badIloz) + case wantz && (ihiz < ihi || n <= ihiz): + panic(badIhiz) + case ldz < 1, wantz && ldz < n: + panic(badLdZ) + case lwork < 1 && lwork != -1: + panic(badLWork) + // TODO(vladimir-ch): Enable if and when we figure out what the minimum + // necessary lwork value is. Dlaqr04 says that the minimum is n which + // clashes with Dlaqr23's opinion about optimal work when nw <= 2 + // (independent of n). + // case lwork < n && n > ntiny && lwork != -1: + // panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + case recur < 0: + panic(recurLT0) + } + + // Quick return. + if n == 0 { + work[0] = 1 + return 0 + } + + if lwork != -1 { + switch { + case len(h) < (n-1)*ldh+n: + panic(shortH) + case len(wr) != ihi+1: + panic(badLenWr) + case len(wi) != ihi+1: + panic(badLenWi) + case wantz && len(z) < (n-1)*ldz+n: + panic(shortZ) + case ilo > 0 && h[ilo*ldh+ilo-1] != 0: + panic(notIsolated) + case ihi+1 < n && h[(ihi+1)*ldh+ihi] != 0: + panic(notIsolated) + } + } + + if n <= ntiny { + // Tiny matrices must use Dlahqr. + if lwork == -1 { + work[0] = 1 + return 0 + } + return impl.Dlahqr(wantt, wantz, n, ilo, ihi, h, ldh, wr, wi, iloz, ihiz, z, ldz) + } + + // Use small bulge multi-shift QR with aggressive early deflation on + // larger-than-tiny matrices. + var jbcmpz string + if wantt { + jbcmpz = "S" + } else { + jbcmpz = "E" + } + if wantz { + jbcmpz += "V" + } else { + jbcmpz += "N" + } + + var fname string + if recur > 0 { + fname = "DLAQR0" + } else { + fname = "DLAQR4" + } + // nwr is the recommended deflation window size. n is greater than ntiny, + // so there is enough subdiagonal workspace for nwr >= 2 as required. + // (In fact, there is enough subdiagonal space for nwr >= 4.) + // TODO(vladimir-ch): If there is enough space for nwr >= 4, should we + // use it? + nwr := impl.Ilaenv(13, fname, jbcmpz, n, ilo, ihi, lwork) + nwr = max(2, nwr) + nwr = min(ihi-ilo+1, min((n-1)/3, nwr)) + + // nsr is the recommended number of simultaneous shifts. n is greater than + // ntiny, so there is enough subdiagonal workspace for nsr to be even and + // greater than or equal to two as required. + nsr := impl.Ilaenv(15, fname, jbcmpz, n, ilo, ihi, lwork) + nsr = min(nsr, min((n-3)/6, ihi-ilo)) + nsr = max(2, nsr&^1) + + // Workspace query call to Dlaqr23. + impl.Dlaqr23(wantt, wantz, n, ilo, ihi, nwr+1, h, ldh, iloz, ihiz, z, ldz, + wr, wi, h, ldh, n, h, ldh, n, h, ldh, work, -1, recur) + // Optimal workspace is max(Dlaqr5, Dlaqr23). + lwkopt := max(3*nsr/2, int(work[0])) + // Quick return in case of workspace query. + if lwork == -1 { + work[0] = float64(lwkopt) + return 0 + } + + // Dlahqr/Dlaqr04 crossover point. + nmin := impl.Ilaenv(12, fname, jbcmpz, n, ilo, ihi, lwork) + nmin = max(ntiny, nmin) + + // Nibble determines when to skip a multi-shift QR sweep (Dlaqr5). + nibble := impl.Ilaenv(14, fname, jbcmpz, n, ilo, ihi, lwork) + nibble = max(0, nibble) + + // Computation mode of far-from-diagonal orthogonal updates in Dlaqr5. + kacc22 := impl.Ilaenv(16, fname, jbcmpz, n, ilo, ihi, lwork) + kacc22 = max(0, min(kacc22, 2)) + + // nwmax is the largest possible deflation window for which there is + // sufficient workspace. 
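+	// (Two limits apply: the scratch space below the subdiagonal of H,
+	// which caps the window at (n-1)/3, and the supplied workspace,
+	// which caps it at lwork/2.)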
+ nwmax := min((n-1)/3, lwork/2) + nw := nwmax // Start with maximum deflation window size. + + // nsmax is the largest number of simultaneous shifts for which there is + // sufficient workspace. + nsmax := min((n-3)/6, 2*lwork/3) &^ 1 + + ndfl := 1 // Number of iterations since last deflation. + ndec := 0 // Deflation window size decrement. + + // Main loop. + var ( + itmax = max(30, 2*kexsh) * max(10, (ihi-ilo+1)) + it = 0 + ) + for kbot := ihi; kbot >= ilo; { + if it == itmax { + unconverged = kbot + 1 + break + } + it++ + + // Locate active block. + ktop := ilo + for k := kbot; k >= ilo+1; k-- { + if h[k*ldh+k-1] == 0 { + ktop = k + break + } + } + + // Select deflation window size nw. + // + // Typical Case: + // If possible and advisable, nibble the entire active block. + // If not, use size min(nwr,nwmax) or min(nwr+1,nwmax) + // depending upon which has the smaller corresponding + // subdiagonal entry (a heuristic). + // + // Exceptional Case: + // If there have been no deflations in kexnw or more + // iterations, then vary the deflation window size. At first, + // because larger windows are, in general, more powerful than + // smaller ones, rapidly increase the window to the maximum + // possible. Then, gradually reduce the window size. + nh := kbot - ktop + 1 + nwupbd := min(nh, nwmax) + if ndfl < kexnw { + nw = min(nwupbd, nwr) + } else { + nw = min(nwupbd, 2*nw) + } + if nw < nwmax { + if nw >= nh-1 { + nw = nh + } else { + kwtop := kbot - nw + 1 + if math.Abs(h[kwtop*ldh+kwtop-1]) > math.Abs(h[(kwtop-1)*ldh+kwtop-2]) { + nw++ + } + } + } + if ndfl < kexnw { + ndec = -1 + } else if ndec >= 0 || nw >= nwupbd { + ndec++ + if nw-ndec < 2 { + ndec = 0 + } + nw -= ndec + } + + // Split workspace under the subdiagonal of H into: + // - an nw×nw work array V in the lower left-hand corner, + // - an nw×nhv horizontal work array along the bottom edge (nhv + // must be at least nw but more is better), + // - an nve×nw vertical work array along the left-hand-edge + // (nhv can be any positive integer but more is better). + kv := n - nw + kt := nw + kwv := nw + 1 + nhv := n - kwv - kt + // Aggressive early deflation. + ls, ld := impl.Dlaqr23(wantt, wantz, n, ktop, kbot, nw, + h, ldh, iloz, ihiz, z, ldz, wr[:kbot+1], wi[:kbot+1], + h[kv*ldh:], ldh, nhv, h[kv*ldh+kt:], ldh, nhv, h[kwv*ldh:], ldh, work, lwork, recur) + + // Adjust kbot accounting for new deflations. + kbot -= ld + // ks points to the shifts. + ks := kbot - ls + 1 + + // Skip an expensive QR sweep if there is a (partly heuristic) + // reason to expect that many eigenvalues will deflate without + // it. Here, the QR sweep is skipped if many eigenvalues have + // just been deflated or if the remaining active block is small. + if ld > 0 && (100*ld > nw*nibble || kbot-ktop+1 <= min(nmin, nwmax)) { + // ld is positive, note progress. + ndfl = 1 + continue + } + + // ns is the nominal number of simultaneous shifts. This may be + // lowered (slightly) if Dlaqr23 did not provide that many + // shifts. + ns := min(min(nsmax, nsr), max(2, kbot-ktop)) &^ 1 + + // If there have been no deflations in a multiple of kexsh + // iterations, then try exceptional shifts. Otherwise use shifts + // provided by Dlaqr23 above or from the eigenvalues of a + // trailing principal submatrix. 
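+ // ndfl is reset to 1 whenever a deflation occurs, so the test
+ // below fires on every kexsh-th consecutive iteration without
+ // progress.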
+ if ndfl%kexsh == 0 { + ks = kbot - ns + 1 + for i := kbot; i > max(ks, ktop+1); i -= 2 { + ss := math.Abs(h[i*ldh+i-1]) + math.Abs(h[(i-1)*ldh+i-2]) + aa := wilk1*ss + h[i*ldh+i] + _, _, _, _, wr[i-1], wi[i-1], wr[i], wi[i], _, _ = + impl.Dlanv2(aa, ss, wilk2*ss, aa) + } + if ks == ktop { + wr[ks+1] = h[(ks+1)*ldh+ks+1] + wi[ks+1] = 0 + wr[ks] = wr[ks+1] + wi[ks] = wi[ks+1] + } + } else { + // If we got ns/2 or fewer shifts, use Dlahqr or recur + // into Dlaqr04 on a trailing principal submatrix to get + // more. Since ns <= nsmax <=(n+6)/9, there is enough + // space below the subdiagonal to fit an ns×ns scratch + // array. + if kbot-ks+1 <= ns/2 { + ks = kbot - ns + 1 + kt = n - ns + impl.Dlacpy(blas.All, ns, ns, h[ks*ldh+ks:], ldh, h[kt*ldh:], ldh) + if ns > nmin && recur > 0 { + ks += impl.Dlaqr04(false, false, ns, 1, ns-1, h[kt*ldh:], ldh, + wr[ks:ks+ns], wi[ks:ks+ns], 0, 0, nil, 0, work, lwork, recur-1) + } else { + ks += impl.Dlahqr(false, false, ns, 0, ns-1, h[kt*ldh:], ldh, + wr[ks:ks+ns], wi[ks:ks+ns], 0, 0, nil, 1) + } + // In case of a rare QR failure use eigenvalues + // of the trailing 2×2 principal submatrix. + if ks >= kbot { + aa := h[(kbot-1)*ldh+kbot-1] + bb := h[(kbot-1)*ldh+kbot] + cc := h[kbot*ldh+kbot-1] + dd := h[kbot*ldh+kbot] + _, _, _, _, wr[kbot-1], wi[kbot-1], wr[kbot], wi[kbot], _, _ = + impl.Dlanv2(aa, bb, cc, dd) + ks = kbot - 1 + } + } + + if kbot-ks+1 > ns { + // Sorting the shifts helps a little. Bubble + // sort keeps complex conjugate pairs together. + sorted := false + for k := kbot; k > ks; k-- { + if sorted { + break + } + sorted = true + for i := ks; i < k; i++ { + if math.Abs(wr[i])+math.Abs(wi[i]) >= math.Abs(wr[i+1])+math.Abs(wi[i+1]) { + continue + } + sorted = false + wr[i], wr[i+1] = wr[i+1], wr[i] + wi[i], wi[i+1] = wi[i+1], wi[i] + } + } + } + + // Shuffle shifts into pairs of real shifts and pairs of + // complex conjugate shifts using the fact that complex + // conjugate shifts are already adjacent to one another. + // TODO(vladimir-ch): The shuffling here could probably + // be removed but I'm not sure right now and it's safer + // to leave it. + for i := kbot; i > ks+1; i -= 2 { + if wi[i] == -wi[i-1] { + continue + } + wr[i], wr[i-1], wr[i-2] = wr[i-1], wr[i-2], wr[i] + wi[i], wi[i-1], wi[i-2] = wi[i-1], wi[i-2], wi[i] + } + } + + // If there are only two shifts and both are real, then use only one. + if kbot-ks+1 == 2 && wi[kbot] == 0 { + if math.Abs(wr[kbot]-h[kbot*ldh+kbot]) < math.Abs(wr[kbot-1]-h[kbot*ldh+kbot]) { + wr[kbot-1] = wr[kbot] + } else { + wr[kbot] = wr[kbot-1] + } + } + + // Use up to ns of the smallest magnitude shifts. If there + // aren't ns shifts available, then use them all, possibly + // dropping one to make the number of shifts even. + ns = min(ns, kbot-ks+1) &^ 1 + ks = kbot - ns + 1 + + // Split workspace under the subdiagonal into: + // - a kdu×kdu work array U in the lower left-hand-corner, + // - a kdu×nhv horizontal work array WH along the bottom edge + // (nhv must be at least kdu but more is better), + // - an nhv×kdu vertical work array WV along the left-hand-edge + // (nhv must be at least kdu but more is better). + kdu := 2 * ns + ku := n - kdu + kwh := kdu + kwv = kdu + 3 + nhv = n - kwv - kdu + // Small-bulge multi-shift QR sweep. + impl.Dlaqr5(wantt, wantz, kacc22, n, ktop, kbot, ns, + wr[ks:ks+ns], wi[ks:ks+ns], h, ldh, iloz, ihiz, z, ldz, + work, 3, h[ku*ldh:], ldh, nhv, h[kwv*ldh:], ldh, nhv, h[ku*ldh+kwh:], ldh) + + // Note progress (or the lack of it). 
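+ // Both the deflation window schedule and the exceptional shift
+ // trigger above key off ndfl.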
+ if ld > 0 { + ndfl = 1 + } else { + ndfl++ + } + } + + work[0] = float64(lwkopt) + return unconverged +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr1.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr1.go new file mode 100644 index 0000000000..c20c88fdb4 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr1.go @@ -0,0 +1,61 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "math" + +// Dlaqr1 sets v to a scalar multiple of the first column of the product +// +// (H - (sr1 + i*si1)*I)*(H - (sr2 + i*si2)*I) +// +// where H is a 2×2 or 3×3 matrix, I is the identity matrix of the same size, +// and i is the imaginary unit. Scaling is done to avoid overflows and most +// underflows. +// +// n is the order of H and must be either 2 or 3. It must hold that either sr1 = +// sr2 and si1 = -si2, or si1 = si2 = 0. The length of v must be equal to n. If +// any of these conditions is not met, Dlaqr1 will panic. +// +// Dlaqr1 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlaqr1(n int, h []float64, ldh int, sr1, si1, sr2, si2 float64, v []float64) { + switch { + case n != 2 && n != 3: + panic("lapack: n must be 2 or 3") + case ldh < n: + panic(badLdH) + case len(h) < (n-1)*ldh+n: + panic(shortH) + case !((sr1 == sr2 && si1 == -si2) || (si1 == 0 && si2 == 0)): + panic(badShifts) + case len(v) != n: + panic(shortV) + } + + if n == 2 { + s := math.Abs(h[0]-sr2) + math.Abs(si2) + math.Abs(h[ldh]) + if s == 0 { + v[0] = 0 + v[1] = 0 + } else { + h21s := h[ldh] / s + v[0] = h21s*h[1] + (h[0]-sr1)*((h[0]-sr2)/s) - si1*(si2/s) + v[1] = h21s * (h[0] + h[ldh+1] - sr1 - sr2) + } + return + } + + s := math.Abs(h[0]-sr2) + math.Abs(si2) + math.Abs(h[ldh]) + math.Abs(h[2*ldh]) + if s == 0 { + v[0] = 0 + v[1] = 0 + v[2] = 0 + } else { + h21s := h[ldh] / s + h31s := h[2*ldh] / s + v[0] = (h[0]-sr1)*((h[0]-sr2)/s) - si1*(si2/s) + h[1]*h21s + h[2]*h31s + v[1] = h21s*(h[0]+h[ldh+1]-sr1-sr2) + h[ldh+2]*h31s + v[2] = h31s*(h[0]+h[2*ldh+2]-sr1-sr2) + h21s*h[2*ldh+1] + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr23.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr23.go new file mode 100644 index 0000000000..a3fa6661c6 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr23.go @@ -0,0 +1,423 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" +) + +// Dlaqr23 performs the orthogonal similarity transformation of an n×n upper +// Hessenberg matrix to detect and deflate fully converged eigenvalues from a +// trailing principal submatrix using aggressive early deflation [1]. +// +// On return, H will be overwritten by a new Hessenberg matrix that is a +// perturbation of an orthogonal similarity transformation of H. It is hoped +// that on output H will have many zero subdiagonal entries. +// +// If wantt is true, the matrix H will be fully updated so that the +// quasi-triangular Schur factor can be computed. If wantt is false, then only +// enough of H will be updated to preserve the eigenvalues. 
+// +// If wantz is true, the orthogonal similarity transformation will be +// accumulated into Z[iloz:ihiz+1,ktop:kbot+1], otherwise Z is not referenced. +// +// ktop and kbot determine a block [ktop:kbot+1,ktop:kbot+1] along the diagonal +// of H. It must hold that +// +// 0 <= ilo <= ihi < n if n > 0, +// ilo == 0 and ihi == -1 if n == 0, +// +// and the block must be isolated, that is, it must hold that +// +// ktop == 0 or H[ktop,ktop-1] == 0, +// kbot == n-1 or H[kbot+1,kbot] == 0, +// +// otherwise Dlaqr23 will panic. +// +// nw is the deflation window size. It must hold that +// +// 0 <= nw <= kbot-ktop+1, +// +// otherwise Dlaqr23 will panic. +// +// iloz and ihiz specify the rows of the n×n matrix Z to which transformations +// will be applied if wantz is true. It must hold that +// +// 0 <= iloz <= ktop, and kbot <= ihiz < n, +// +// otherwise Dlaqr23 will panic. +// +// sr and si must have length kbot+1, otherwise Dlaqr23 will panic. +// +// v and ldv represent an nw×nw work matrix. +// t and ldt represent an nw×nh work matrix, and nh must be at least nw. +// wv and ldwv represent an nv×nw work matrix. +// +// work must have length at least lwork and lwork must be at least max(1,2*nw), +// otherwise Dlaqr23 will panic. Larger values of lwork may result in greater +// efficiency. On return, work[0] will contain the optimal value of lwork. +// +// If lwork is -1, instead of performing Dlaqr23, the function only estimates the +// optimal workspace size and stores it into work[0]. Neither h nor z are +// accessed. +// +// recur is the non-negative recursion depth. For recur > 0, Dlaqr23 behaves +// as DLAQR3, for recur == 0 it behaves as DLAQR2. +// +// On return, ns and nd will contain respectively the number of unconverged +// (i.e., approximate) eigenvalues and converged eigenvalues that are stored in +// sr and si. +// +// On return, the real and imaginary parts of approximate eigenvalues that may +// be used for shifts will be stored respectively in sr[kbot-nd-ns+1:kbot-nd+1] +// and si[kbot-nd-ns+1:kbot-nd+1]. +// +// On return, the real and imaginary parts of converged eigenvalues will be +// stored respectively in sr[kbot-nd+1:kbot+1] and si[kbot-nd+1:kbot+1]. +// +// References: +// +// [1] K. Braman, R. Byers, R. Mathias. The Multishift QR Algorithm. Part II: +// Aggressive Early Deflation. SIAM J. Matrix Anal. Appl 23(4) (2002), pp. 948—973 +// URL: http://dx.doi.org/10.1137/S0895479801384585 +func (impl Implementation) Dlaqr23(wantt, wantz bool, n, ktop, kbot, nw int, h []float64, ldh int, iloz, ihiz int, z []float64, ldz int, sr, si []float64, v []float64, ldv int, nh int, t []float64, ldt int, nv int, wv []float64, ldwv int, work []float64, lwork int, recur int) (ns, nd int) { + switch { + case n < 0: + panic(nLT0) + case ktop < 0 || max(0, n-1) < ktop: + panic(badKtop) + case kbot < min(ktop, n-1) || n <= kbot: + panic(badKbot) + case nw < 0 || kbot-ktop+1+1 < nw: + panic(badNw) + case ldh < max(1, n): + panic(badLdH) + case wantz && (iloz < 0 || ktop < iloz): + panic(badIloz) + case wantz && (ihiz < kbot || n <= ihiz): + panic(badIhiz) + case ldz < 1, wantz && ldz < n: + panic(badLdZ) + case ldv < max(1, nw): + panic(badLdV) + case nh < nw: + panic(badNh) + case ldt < max(1, nh): + panic(badLdT) + case nv < 0: + panic(nvLT0) + case ldwv < max(1, nw): + panic(badLdWV) + case lwork < max(1, 2*nw) && lwork != -1: + panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + case recur < 0: + panic(recurLT0) + } + + // Quick return for zero window size. 
+ if nw == 0 { + work[0] = 1 + return 0, 0 + } + + // LAPACK code does not enforce the documented behavior + // nw <= kbot-ktop+1 + // but we do (we panic above). + jw := nw + lwkopt := max(1, 2*nw) + if jw > 2 { + // Workspace query call to Dgehrd. + impl.Dgehrd(jw, 0, jw-2, t, ldt, work, work, -1) + lwk1 := int(work[0]) + // Workspace query call to Dormhr. + impl.Dormhr(blas.Right, blas.NoTrans, jw, jw, 0, jw-2, t, ldt, work, v, ldv, work, -1) + lwk2 := int(work[0]) + if recur > 0 { + // Workspace query call to Dlaqr04. + impl.Dlaqr04(true, true, jw, 0, jw-1, t, ldt, sr, si, 0, jw-1, v, ldv, work, -1, recur-1) + lwk3 := int(work[0]) + // Optimal workspace. + lwkopt = max(jw+max(lwk1, lwk2), lwk3) + } else { + // Optimal workspace. + lwkopt = jw + max(lwk1, lwk2) + } + } + // Quick return in case of workspace query. + if lwork == -1 { + work[0] = float64(lwkopt) + return 0, 0 + } + + // Check input slices only if not doing workspace query. + switch { + case len(h) < (n-1)*ldh+n: + panic(shortH) + case len(v) < (nw-1)*ldv+nw: + panic(shortV) + case len(t) < (nw-1)*ldt+nh: + panic(shortT) + case len(wv) < (nv-1)*ldwv+nw: + panic(shortWV) + case wantz && len(z) < (n-1)*ldz+n: + panic(shortZ) + case len(sr) != kbot+1: + panic(badLenSr) + case len(si) != kbot+1: + panic(badLenSi) + case ktop > 0 && h[ktop*ldh+ktop-1] != 0: + panic(notIsolated) + case kbot+1 < n && h[(kbot+1)*ldh+kbot] != 0: + panic(notIsolated) + } + + // Machine constants. + ulp := dlamchP + smlnum := float64(n) / ulp * dlamchS + + // Setup deflation window. + var s float64 + kwtop := kbot - jw + 1 + if kwtop != ktop { + s = h[kwtop*ldh+kwtop-1] + } + if kwtop == kbot { + // 1×1 deflation window. + sr[kwtop] = h[kwtop*ldh+kwtop] + si[kwtop] = 0 + ns = 1 + nd = 0 + if math.Abs(s) <= math.Max(smlnum, ulp*math.Abs(h[kwtop*ldh+kwtop])) { + ns = 0 + nd = 1 + if kwtop > ktop { + h[kwtop*ldh+kwtop-1] = 0 + } + } + work[0] = 1 + return ns, nd + } + + // Convert to spike-triangular form. In case of a rare QR failure, this + // routine continues to do aggressive early deflation using that part of + // the deflation window that converged using infqr here and there to + // keep track. + impl.Dlacpy(blas.Upper, jw, jw, h[kwtop*ldh+kwtop:], ldh, t, ldt) + bi := blas64.Implementation() + bi.Dcopy(jw-1, h[(kwtop+1)*ldh+kwtop:], ldh+1, t[ldt:], ldt+1) + impl.Dlaset(blas.All, jw, jw, 0, 1, v, ldv) + nmin := impl.Ilaenv(12, "DLAQR3", "SV", jw, 0, jw-1, lwork) + var infqr int + if recur > 0 && jw > nmin { + infqr = impl.Dlaqr04(true, true, jw, 0, jw-1, t, ldt, sr[kwtop:], si[kwtop:], 0, jw-1, v, ldv, work, lwork, recur-1) + } else { + infqr = impl.Dlahqr(true, true, jw, 0, jw-1, t, ldt, sr[kwtop:], si[kwtop:], 0, jw-1, v, ldv) + } + // Note that ilo == 0 which conveniently coincides with the success + // value of infqr, that is, infqr as an index always points to the first + // converged eigenvalue. + + // Dtrexc needs a clean margin near the diagonal. + for j := 0; j < jw-3; j++ { + t[(j+2)*ldt+j] = 0 + t[(j+3)*ldt+j] = 0 + } + if jw >= 3 { + t[(jw-1)*ldt+jw-3] = 0 + } + + ns = jw + ilst := infqr + // Deflation detection loop. + for ilst < ns { + bulge := false + if ns >= 2 { + bulge = t[(ns-1)*ldt+ns-2] != 0 + } + if !bulge { + // Real eigenvalue. + abst := math.Abs(t[(ns-1)*ldt+ns-1]) + if abst == 0 { + abst = math.Abs(s) + } + if math.Abs(s*v[ns-1]) <= math.Max(smlnum, ulp*abst) { + // Deflatable. + ns-- + } else { + // Undeflatable, move it up out of the way. + // Dtrexc can not fail in this case. 
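+ // (The test above compares the spike entry s*v[ns-1], which
+ // couples this eigenvalue to the rest of H, against the local
+ // scale abst.)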
+ _, ilst, _ = impl.Dtrexc(lapack.UpdateSchur, jw, t, ldt, v, ldv, ns-1, ilst, work) + ilst++ + } + continue + } + // Complex conjugate pair. + abst := math.Abs(t[(ns-1)*ldt+ns-1]) + math.Sqrt(math.Abs(t[(ns-1)*ldt+ns-2]))*math.Sqrt(math.Abs(t[(ns-2)*ldt+ns-1])) + if abst == 0 { + abst = math.Abs(s) + } + if math.Max(math.Abs(s*v[ns-1]), math.Abs(s*v[ns-2])) <= math.Max(smlnum, ulp*abst) { + // Deflatable. + ns -= 2 + } else { + // Undeflatable, move them up out of the way. + // Dtrexc does the right thing with ilst in case of a + // rare exchange failure. + _, ilst, _ = impl.Dtrexc(lapack.UpdateSchur, jw, t, ldt, v, ldv, ns-1, ilst, work) + ilst += 2 + } + } + + // Return to Hessenberg form. + if ns == 0 { + s = 0 + } + if ns < jw { + // Sorting diagonal blocks of T improves accuracy for graded + // matrices. Bubble sort deals well with exchange failures. + sorted := false + i := ns + for !sorted { + sorted = true + kend := i - 1 + i = infqr + var k int + if i == ns-1 || t[(i+1)*ldt+i] == 0 { + k = i + 1 + } else { + k = i + 2 + } + for k <= kend { + var evi float64 + if k == i+1 { + evi = math.Abs(t[i*ldt+i]) + } else { + evi = math.Abs(t[i*ldt+i]) + math.Sqrt(math.Abs(t[(i+1)*ldt+i]))*math.Sqrt(math.Abs(t[i*ldt+i+1])) + } + + var evk float64 + if k == kend || t[(k+1)*ldt+k] == 0 { + evk = math.Abs(t[k*ldt+k]) + } else { + evk = math.Abs(t[k*ldt+k]) + math.Sqrt(math.Abs(t[(k+1)*ldt+k]))*math.Sqrt(math.Abs(t[k*ldt+k+1])) + } + + if evi >= evk { + i = k + } else { + sorted = false + _, ilst, ok := impl.Dtrexc(lapack.UpdateSchur, jw, t, ldt, v, ldv, i, k, work) + if ok { + i = ilst + } else { + i = k + } + } + if i == kend || t[(i+1)*ldt+i] == 0 { + k = i + 1 + } else { + k = i + 2 + } + } + } + } + + // Restore shift/eigenvalue array from T. + for i := jw - 1; i >= infqr; { + if i == infqr || t[i*ldt+i-1] == 0 { + sr[kwtop+i] = t[i*ldt+i] + si[kwtop+i] = 0 + i-- + continue + } + aa := t[(i-1)*ldt+i-1] + bb := t[(i-1)*ldt+i] + cc := t[i*ldt+i-1] + dd := t[i*ldt+i] + _, _, _, _, sr[kwtop+i-1], si[kwtop+i-1], sr[kwtop+i], si[kwtop+i], _, _ = impl.Dlanv2(aa, bb, cc, dd) + i -= 2 + } + + if ns < jw || s == 0 { + if ns > 1 && s != 0 { + // Reflect spike back into lower triangle. + bi.Dcopy(ns, v[:ns], 1, work[:ns], 1) + _, tau := impl.Dlarfg(ns, work[0], work[1:ns], 1) + work[0] = 1 + impl.Dlaset(blas.Lower, jw-2, jw-2, 0, 0, t[2*ldt:], ldt) + impl.Dlarf(blas.Left, ns, jw, work[:ns], 1, tau, t, ldt, work[jw:]) + impl.Dlarf(blas.Right, ns, ns, work[:ns], 1, tau, t, ldt, work[jw:]) + impl.Dlarf(blas.Right, jw, ns, work[:ns], 1, tau, v, ldv, work[jw:]) + impl.Dgehrd(jw, 0, ns-1, t, ldt, work[:jw-1], work[jw:], lwork-jw) + } + + // Copy updated reduced window into place. + if kwtop > 0 { + h[kwtop*ldh+kwtop-1] = s * v[0] + } + impl.Dlacpy(blas.Upper, jw, jw, t, ldt, h[kwtop*ldh+kwtop:], ldh) + bi.Dcopy(jw-1, t[ldt:], ldt+1, h[(kwtop+1)*ldh+kwtop:], ldh+1) + + // Accumulate orthogonal matrix in order to update H and Z, if + // requested. + if ns > 1 && s != 0 { + // work[:ns-1] contains the elementary reflectors stored + // by a call to Dgehrd above. + impl.Dormhr(blas.Right, blas.NoTrans, jw, ns, 0, ns-1, + t, ldt, work[:ns-1], v, ldv, work[jw:], lwork-jw) + } + + // Update vertical slab in H. 
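+ // Only rows above the deflation window need updating from the
+ // right; when wantt is false the update can start at ktop instead
+ // of row 0.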
+ var ltop int + if !wantt { + ltop = ktop + } + for krow := ltop; krow < kwtop; krow += nv { + kln := min(nv, kwtop-krow) + bi.Dgemm(blas.NoTrans, blas.NoTrans, kln, jw, jw, + 1, h[krow*ldh+kwtop:], ldh, v, ldv, + 0, wv, ldwv) + impl.Dlacpy(blas.All, kln, jw, wv, ldwv, h[krow*ldh+kwtop:], ldh) + } + + // Update horizontal slab in H. + if wantt { + for kcol := kbot + 1; kcol < n; kcol += nh { + kln := min(nh, n-kcol) + bi.Dgemm(blas.Trans, blas.NoTrans, jw, kln, jw, + 1, v, ldv, h[kwtop*ldh+kcol:], ldh, + 0, t, ldt) + impl.Dlacpy(blas.All, jw, kln, t, ldt, h[kwtop*ldh+kcol:], ldh) + } + } + + // Update vertical slab in Z. + if wantz { + for krow := iloz; krow <= ihiz; krow += nv { + kln := min(nv, ihiz-krow+1) + bi.Dgemm(blas.NoTrans, blas.NoTrans, kln, jw, jw, + 1, z[krow*ldz+kwtop:], ldz, v, ldv, + 0, wv, ldwv) + impl.Dlacpy(blas.All, kln, jw, wv, ldwv, z[krow*ldz+kwtop:], ldz) + } + } + } + + // The number of deflations. + nd = jw - ns + // Shifts are converged eigenvalues that could not be deflated. + // Subtracting infqr from the spike length takes care of the case of a + // rare QR failure while calculating eigenvalues of the deflation + // window. + ns -= infqr + work[0] = float64(lwkopt) + return ns, nd +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr5.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr5.go new file mode 100644 index 0000000000..443a53d5c4 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr5.go @@ -0,0 +1,560 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dlaqr5 performs a single small-bulge multi-shift QR sweep on an isolated +// block of a Hessenberg matrix. +// +// wantt and wantz determine whether the quasi-triangular Schur factor and the +// orthogonal Schur factor, respectively, will be computed. +// +// kacc22 specifies the computation mode of far-from-diagonal orthogonal +// updates. Permitted values are: +// +// 0: Dlaqr5 will not accumulate reflections and will not use matrix-matrix +// multiply to update far-from-diagonal matrix entries. +// 1: Dlaqr5 will accumulate reflections and use matrix-matrix multiply to +// update far-from-diagonal matrix entries. +// 2: Same as kacc22=1. This option used to enable exploiting the 2×2 structure +// during matrix multiplications, but this is no longer supported. +// +// For other values of kacc2 Dlaqr5 will panic. +// +// n is the order of the Hessenberg matrix H. +// +// ktop and kbot are indices of the first and last row and column of an isolated +// diagonal block upon which the QR sweep will be applied. It must hold that +// +// ktop == 0, or 0 < ktop <= n-1 and H[ktop, ktop-1] == 0, and +// kbot == n-1, or 0 <= kbot < n-1 and H[kbot+1, kbot] == 0, +// +// otherwise Dlaqr5 will panic. +// +// nshfts is the number of simultaneous shifts. It must be positive and even, +// otherwise Dlaqr5 will panic. +// +// sr and si contain the real and imaginary parts, respectively, of the shifts +// of origin that define the multi-shift QR sweep. On return both slices may be +// reordered by Dlaqr5. Their length must be equal to nshfts, otherwise Dlaqr5 +// will panic. +// +// h and ldh represent the Hessenberg matrix H of size n×n. 
On return +// multi-shift QR sweep with shifts sr+i*si has been applied to the isolated +// diagonal block in rows and columns ktop through kbot, inclusive. +// +// iloz and ihiz specify the rows of Z to which transformations will be applied +// if wantz is true. It must hold that 0 <= iloz <= ihiz < n, otherwise Dlaqr5 +// will panic. +// +// z and ldz represent the matrix Z of size n×n. If wantz is true, the QR sweep +// orthogonal similarity transformation is accumulated into +// z[iloz:ihiz,iloz:ihiz] from the right, otherwise z not referenced. +// +// v and ldv represent an auxiliary matrix V of size (nshfts/2)×3. Note that V +// is transposed with respect to the reference netlib implementation. +// +// u and ldu represent an auxiliary matrix of size (2*nshfts)×(2*nshfts). +// +// wh and ldwh represent an auxiliary matrix of size (2*nshfts-1)×nh. +// +// wv and ldwv represent an auxiliary matrix of size nv×(2*nshfts-1). +// +// Dlaqr5 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlaqr5(wantt, wantz bool, kacc22 int, n, ktop, kbot, nshfts int, sr, si []float64, h []float64, ldh int, iloz, ihiz int, z []float64, ldz int, v []float64, ldv int, u []float64, ldu int, nv int, wv []float64, ldwv int, nh int, wh []float64, ldwh int) { + switch { + case kacc22 != 0 && kacc22 != 1 && kacc22 != 2: + panic(badKacc22) + case n < 0: + panic(nLT0) + case ktop < 0 || n <= ktop: + panic(badKtop) + case kbot < 0 || n <= kbot: + panic(badKbot) + + case nshfts < 0: + panic(nshftsLT0) + case nshfts&0x1 != 0: + panic(nshftsOdd) + case len(sr) != nshfts: + panic(badLenSr) + case len(si) != nshfts: + panic(badLenSi) + + case ldh < max(1, n): + panic(badLdH) + case len(h) < (n-1)*ldh+n: + panic(shortH) + + case wantz && ihiz >= n: + panic(badIhiz) + case wantz && iloz < 0 || ihiz < iloz: + panic(badIloz) + case ldz < 1, wantz && ldz < n: + panic(badLdZ) + case wantz && len(z) < (n-1)*ldz+n: + panic(shortZ) + + case ldv < 3: + // V is transposed w.r.t. reference lapack. + panic(badLdV) + case len(v) < (nshfts/2-1)*ldv+3: + panic(shortV) + + case ldu < max(1, 2*nshfts): + panic(badLdU) + case len(u) < (2*nshfts-1)*ldu+2*nshfts: + panic(shortU) + + case nv < 0: + panic(nvLT0) + case ldwv < max(1, 2*nshfts): + panic(badLdWV) + case len(wv) < (nv-1)*ldwv+2*nshfts: + panic(shortWV) + + case nh < 0: + panic(nhLT0) + case ldwh < max(1, nh): + panic(badLdWH) + case len(wh) < (2*nshfts-1)*ldwh+nh: + panic(shortWH) + + case ktop > 0 && h[ktop*ldh+ktop-1] != 0: + panic(notIsolated) + case kbot < n-1 && h[(kbot+1)*ldh+kbot] != 0: + panic(notIsolated) + } + + // If there are no shifts, then there is nothing to do. + if nshfts < 2 { + return + } + // If the active block is empty or 1×1, then there is nothing to do. + if ktop >= kbot { + return + } + + // Shuffle shifts into pairs of real shifts and pairs of complex + // conjugate shifts assuming complex conjugate shifts are already + // adjacent to one another. + for i := 0; i < nshfts-2; i += 2 { + if si[i] == -si[i+1] { + continue + } + sr[i], sr[i+1], sr[i+2] = sr[i+1], sr[i+2], sr[i] + si[i], si[i+1], si[i+2] = si[i+1], si[i+2], si[i] + } + + // Note: lapack says that nshfts must be even but allows it to be odd + // anyway. We panic above if nshfts is not even, so reducing it by one + // is unnecessary. The only caller Dlaqr04 uses only even nshfts. + // + // The original comment and code from lapack-3.6.0/SRC/dlaqr5.f:341: + // * ==== NSHFTS is supposed to be even, but if it is odd, + // * . then simply reduce it by one. 
The shuffle above + // * . ensures that the dropped shift is real and that + // * . the remaining shifts are paired. ==== + // * + // NS = NSHFTS - MOD( NSHFTS, 2 ) + ns := nshfts + + safmin := dlamchS + ulp := dlamchP + smlnum := safmin * float64(n) / ulp + + // Use accumulated reflections to update far-from-diagonal entries? + accum := kacc22 == 1 || kacc22 == 2 + + // Clear trash. + if ktop+2 <= kbot { + h[(ktop+2)*ldh+ktop] = 0 + } + + // nbmps = number of 2-shift bulges in the chain. + nbmps := ns / 2 + + // kdu = width of slab. + kdu := 4 * nbmps + + // Create and chase chains of nbmps bulges. + for incol := ktop - 2*nbmps + 1; incol <= kbot-2; incol += 2 * nbmps { + // jtop is an index from which updates from the right start. + var jtop int + switch { + case accum: + jtop = max(ktop, incol) + case wantt: + default: + jtop = ktop + } + ndcol := incol + kdu + if accum { + impl.Dlaset(blas.All, kdu, kdu, 0, 1, u, ldu) + } + // Near-the-diagonal bulge chase. The following loop performs + // the near-the-diagonal part of a small bulge multi-shift QR + // sweep. Each 4*nbmps column diagonal chunk extends from + // column incol to column ndcol (including both column incol and + // column ndcol). The following loop chases a 2*nbmps+1 column + // long chain of nbmps bulges 2*nbmps columns to the right. + // (incol may be less than ktop and ndcol may be greater than + // kbot indicating phantom columns from which to chase bulges + // before they are actually introduced or to which to chase + // bulges beyond column kbot.) + for krcol := incol; krcol <= min(incol+2*nbmps-1, kbot-2); krcol++ { + // Bulges number mtop to mbot are active double implicit + // shift bulges. There may or may not also be small 2×2 + // bulge, if there is room. The inactive bulges (if any) + // must wait until the active bulges have moved down the + // diagonal to make room. The phantom matrix paradigm + // described above helps keep track. + mtop := max(0, (ktop-krcol)/2) + mbot := min(nbmps, (kbot-krcol-1)/2) - 1 + m22 := mbot + 1 + bmp22 := (mbot < nbmps-1) && (krcol+2*m22 == kbot-2) + // Generate reflections to chase the chain right one column. + // The minimum value of k is ktop-1. + if bmp22 { + // Special case: 2×2 reflection at bottom treated separately. + k := krcol + 2*m22 + if k == ktop-1 { + impl.Dlaqr1(2, h[(k+1)*ldh+k+1:], ldh, + sr[2*m22], si[2*m22], sr[2*m22+1], si[2*m22+1], + v[m22*ldv:m22*ldv+2]) + beta := v[m22*ldv] + _, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1) + } else { + beta := h[(k+1)*ldh+k] + v[m22*ldv+1] = h[(k+2)*ldh+k] + beta, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1) + h[(k+1)*ldh+k] = beta + h[(k+2)*ldh+k] = 0 + } + // Perform update from right within computational window. + t1 := v[m22*ldv] + t2 := t1 * v[m22*ldv+1] + for j := jtop; j <= min(kbot, k+3); j++ { + refsum := h[j*ldh+k+1] + v[m22*ldv+1]*h[j*ldh+k+2] + h[j*ldh+k+1] -= refsum * t1 + h[j*ldh+k+2] -= refsum * t2 + } + // Perform update from left within computational window. + var jbot int + switch { + case accum: + jbot = min(ndcol, kbot) + case wantt: + jbot = n - 1 + default: + jbot = kbot + } + t1 = v[m22*ldv] + t2 = t1 * v[m22*ldv+1] + for j := k + 1; j <= jbot; j++ { + refsum := h[(k+1)*ldh+j] + v[m22*ldv+1]*h[(k+2)*ldh+j] + h[(k+1)*ldh+j] -= refsum * t1 + h[(k+2)*ldh+j] -= refsum * t2 + } + // The following convergence test requires that the traditional + // small-compared-to-nearby-diagonals criterion and the Ahues & + // Tisseur (LAWN 122, 1997) criteria both be satisfied. 
The latter + // improves accuracy in some examples. Falling back on an alternate + // convergence criterion when tst1 or tst2 is zero (as done here) is + // traditional but probably unnecessary. + if k >= ktop && h[(k+1)*ldh+k] != 0 { + tst1 := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) + if tst1 == 0 { + if k >= ktop+1 { + tst1 += math.Abs(h[k*ldh+k-1]) + } + if k >= ktop+2 { + tst1 += math.Abs(h[k*ldh+k-2]) + } + if k >= ktop+3 { + tst1 += math.Abs(h[k*ldh+k-3]) + } + if k <= kbot-2 { + tst1 += math.Abs(h[(k+2)*ldh+k+1]) + } + if k <= kbot-3 { + tst1 += math.Abs(h[(k+3)*ldh+k+1]) + } + if k <= kbot-4 { + tst1 += math.Abs(h[(k+4)*ldh+k+1]) + } + } + if math.Abs(h[(k+1)*ldh+k]) <= math.Max(smlnum, ulp*tst1) { + h12 := math.Max(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1])) + h21 := math.Min(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1])) + h11 := math.Max(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1])) + h22 := math.Min(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1])) + scl := h11 + h12 + tst2 := h22 * (h11 / scl) + if tst2 == 0 || h21*(h12/scl) <= math.Max(smlnum, ulp*tst2) { + h[(k+1)*ldh+k] = 0 + } + } + } + // Accumulate orthogonal transformations. + if accum { + kms := k - incol - 1 + t1 = v[m22*ldv] + t2 = t1 * v[m22*ldv+1] + for j := max(0, ktop-incol-1); j < kdu; j++ { + refsum := u[j*ldu+kms+1] + v[m22*ldv+1]*u[j*ldu+kms+2] + u[j*ldu+kms+1] -= refsum * t1 + u[j*ldu+kms+2] -= refsum * t2 + } + } else if wantz { + t1 = v[m22*ldv] + t2 = t1 * v[m22*ldv+1] + for j := iloz; j <= ihiz; j++ { + refsum := z[j*ldz+k+1] + v[m22*ldv+1]*z[j*ldz+k+2] + z[j*ldz+k+1] -= refsum * t1 + z[j*ldz+k+2] -= refsum * t2 + } + } + } + // Normal case: Chain of 3×3 reflections. + for m := mbot; m >= mtop; m-- { + k := krcol + 2*m + if k == ktop-1 { + impl.Dlaqr1(3, h[ktop*ldh+ktop:], ldh, + sr[2*m], si[2*m], sr[2*m+1], si[2*m+1], + v[m*ldv:m*ldv+3]) + alpha := v[m*ldv] + _, v[m*ldv] = impl.Dlarfg(3, alpha, v[m*ldv+1:m*ldv+3], 1) + } else { + // Perform delayed transformation of row below m-th bulge. + // Exploit fact that first two elements of row are actually + // zero. + t1 := v[m*ldv] + t2 := t1 * v[m*ldv+1] + t3 := t1 * v[m*ldv+2] + refsum := v[m*ldv+2] * h[(k+3)*ldh+k+2] + h[(k+3)*ldh+k] = -refsum * t1 + h[(k+3)*ldh+k+1] = -refsum * t2 + h[(k+3)*ldh+k+2] -= refsum * t3 + // Calculate reflection to move m-th bulge one step. + beta := h[(k+1)*ldh+k] + v[m*ldv+1] = h[(k+2)*ldh+k] + v[m*ldv+2] = h[(k+3)*ldh+k] + beta, v[m*ldv] = impl.Dlarfg(3, beta, v[m*ldv+1:m*ldv+3], 1) + // A bulge may collapse because of vigilant deflation or + // destructive underflow. In the underflow case, try the + // two-small-subdiagonals trick to try to reinflate the + // bulge. + if h[(k+3)*ldh+k] != 0 || h[(k+3)*ldh+k+1] != 0 || h[(k+3)*ldh+k+2] == 0 { + // Typical case: not collapsed (yet). + h[(k+1)*ldh+k] = beta + h[(k+2)*ldh+k] = 0 + h[(k+3)*ldh+k] = 0 + } else { + // Atypical case: collapsed. Attempt to reintroduce + // ignoring H[k+1,k] and H[k+2,k]. If the fill resulting + // from the new reflector is too large, then abandon it. + // Otherwise, use the new one. 
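+ // Measure the fill that the replacement reflector would
+ // leave in column k against the local diagonal scale dsum
+ // before committing to it.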
+ var vt [3]float64 + impl.Dlaqr1(3, h[(k+1)*ldh+k+1:], ldh, + sr[2*m], si[2*m], sr[2*m+1], si[2*m+1], + vt[:]) + _, vt[0] = impl.Dlarfg(3, vt[0], vt[1:3], 1) + t1 = vt[0] + t2 = t1 * vt[1] + t3 = t1 * vt[2] + refsum = h[(k+1)*ldh+k] + vt[1]*h[(k+2)*ldh+k] + dsum := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) + math.Abs(h[(k+2)*ldh+k+2]) + if math.Abs(h[(k+2)*ldh+k]-refsum*t2)+math.Abs(refsum*t3) > ulp*dsum { + // Starting a new bulge here would create + // non-negligible fill. Use the old one with + // trepidation. + h[(k+1)*ldh+k] = beta + h[(k+2)*ldh+k] = 0 + h[(k+3)*ldh+k] = 0 + } else { + // Starting a new bulge here would create only + // negligible fill. Replace the old reflector with + // the new one. + h[(k+1)*ldh+k] -= refsum * t1 + h[(k+2)*ldh+k] = 0 + h[(k+3)*ldh+k] = 0 + v[m*ldv] = vt[0] + v[m*ldv+1] = vt[1] + v[m*ldv+2] = vt[2] + } + } + } + // Apply reflection from the right and the first column of + // update from the left. These updates are required for the + // vigilant deflation check. We still delay most of the updates + // from the left for efficiency. + t1 := v[m*ldv] + t2 := t1 * v[m*ldv+1] + t3 := t1 * v[m*ldv+2] + for j := jtop; j <= min(kbot, k+3); j++ { + refsum := h[j*ldh+k+1] + v[m*ldv+1]*h[j*ldh+k+2] + v[m*ldv+2]*h[j*ldh+k+3] + h[j*ldh+k+1] -= refsum * t1 + h[j*ldh+k+2] -= refsum * t2 + h[j*ldh+k+3] -= refsum * t3 + } + // Perform update from left for subsequent column. + refsum := h[(k+1)*ldh+k+1] + v[m*ldv+1]*h[(k+2)*ldh+k+1] + v[m*ldv+2]*h[(k+3)*ldh+k+1] + h[(k+1)*ldh+k+1] -= refsum * t1 + h[(k+2)*ldh+k+1] -= refsum * t2 + h[(k+3)*ldh+k+1] -= refsum * t3 + // The following convergence test requires that the tradition + // small-compared-to-nearby-diagonals criterion and the Ahues & + // Tisseur (LAWN 122, 1997) criteria both be satisfied. The + // latter improves accuracy in some examples. Falling back on an + // alternate convergence criterion when tst1 or tst2 is zero (as + // done here) is traditional but probably unnecessary. + if k < ktop { + continue + } + if h[(k+1)*ldh+k] != 0 { + tst1 := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) + if tst1 == 0 { + if k >= ktop+1 { + tst1 += math.Abs(h[k*ldh+k-1]) + } + if k >= ktop+2 { + tst1 += math.Abs(h[k*ldh+k-2]) + } + if k >= ktop+3 { + tst1 += math.Abs(h[k*ldh+k-3]) + } + if k <= kbot-2 { + tst1 += math.Abs(h[(k+2)*ldh+k+1]) + } + if k <= kbot-3 { + tst1 += math.Abs(h[(k+3)*ldh+k+1]) + } + if k <= kbot-4 { + tst1 += math.Abs(h[(k+4)*ldh+k+1]) + } + } + if math.Abs(h[(k+1)*ldh+k]) <= math.Max(smlnum, ulp*tst1) { + h12 := math.Max(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1])) + h21 := math.Min(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1])) + h11 := math.Max(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1])) + h22 := math.Min(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1])) + scl := h11 + h12 + tst2 := h22 * (h11 / scl) + if tst2 == 0 || h21*(h12/scl) <= math.Max(smlnum, ulp*tst2) { + h[(k+1)*ldh+k] = 0 + } + } + } + } + // Multiply H by reflections from the left. 
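+ // The column range of the left updates depends on how
+ // far-from-diagonal entries are handled: up to ndcol when they
+ // are accumulated into U, up to n-1 when the full Schur factor
+ // is wanted, and up to kbot otherwise.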
+ var jbot int + switch { + case accum: + jbot = min(ndcol, kbot) + case wantt: + jbot = n - 1 + default: + jbot = kbot + } + for m := mbot; m >= mtop; m-- { + k := krcol + 2*m + t1 := v[m*ldv] + t2 := t1 * v[m*ldv+1] + t3 := t1 * v[m*ldv+2] + for j := max(ktop, krcol+2*(m+1)); j <= jbot; j++ { + refsum := h[(k+1)*ldh+j] + v[m*ldv+1]*h[(k+2)*ldh+j] + v[m*ldv+2]*h[(k+3)*ldh+j] + h[(k+1)*ldh+j] -= refsum * t1 + h[(k+2)*ldh+j] -= refsum * t2 + h[(k+3)*ldh+j] -= refsum * t3 + } + } + // Accumulate orthogonal transformations. + if accum { + // Accumulate U. If necessary, update Z later with an + // efficient matrix-matrix multiply. + for m := mbot; m >= mtop; m-- { + k := krcol + 2*m + kms := k - incol - 1 + i2 := max(0, ktop-incol-1) + i2 = max(i2, kms-(krcol-incol)) + i4 := min(kdu, krcol+2*mbot-incol+5) + t1 := v[m*ldv] + t2 := t1 * v[m*ldv+1] + t3 := t1 * v[m*ldv+2] + for j := i2; j < i4; j++ { + refsum := u[j*ldu+kms+1] + v[m*ldv+1]*u[j*ldu+kms+2] + v[m*ldv+2]*u[j*ldu+kms+3] + u[j*ldu+kms+1] -= refsum * t1 + u[j*ldu+kms+2] -= refsum * t2 + u[j*ldu+kms+3] -= refsum * t3 + } + } + } else if wantz { + // U is not accumulated, so update Z now by multiplying by + // reflections from the right. + for m := mbot; m >= mtop; m-- { + k := krcol + 2*m + t1 := v[m*ldv] + t2 := t1 * v[m*ldv+1] + t3 := t1 * v[m*ldv+2] + for j := iloz; j <= ihiz; j++ { + refsum := z[j*ldz+k+1] + v[m*ldv+1]*z[j*ldz+k+2] + v[m*ldv+2]*z[j*ldz+k+3] + z[j*ldz+k+1] -= refsum * t1 + z[j*ldz+k+2] -= refsum * t2 + z[j*ldz+k+3] -= refsum * t3 + } + } + } + } + // Use U (if accumulated) to update far-from-diagonal entries in H. + // If required, use U to update Z as well. + if !accum { + continue + } + jtop, jbot := ktop, kbot + if wantt { + jtop = 0 + jbot = n - 1 + } + bi := blas64.Implementation() + k1 := max(0, ktop-incol-1) + nu := kdu - max(0, ndcol-kbot) - k1 + // Horizontal multiply. + for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh { + jlen := min(nh, jbot-jcol+1) + bi.Dgemm(blas.Trans, blas.NoTrans, nu, jlen, nu, + 1, u[k1*ldu+k1:], ldu, + h[(incol+k1+1)*ldh+jcol:], ldh, + 0, wh, ldwh) + impl.Dlacpy(blas.All, nu, jlen, wh, ldwh, h[(incol+k1+1)*ldh+jcol:], ldh) + } + // Vertical multiply. + for jrow := jtop; jrow < max(ktop, incol); jrow += nv { + jlen := min(nv, max(ktop, incol)-jrow) + bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu, + 1, h[jrow*ldh+incol+k1+1:], ldh, + u[k1*ldu+k1:], ldu, + 0, wv, ldwv) + impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, h[jrow*ldh+incol+k1+1:], ldh) + } + // Z multiply (also vertical). + if wantz { + for jrow := iloz; jrow <= ihiz; jrow += nv { + jlen := min(nv, ihiz-jrow+1) + bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu, + 1, z[jrow*ldz+incol+k1+1:], ldz, + u[k1*ldu+k1:], ldu, + 0, wv, ldwv) + impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, z[jrow*ldz+incol+k1+1:], ldz) + } + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlarf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarf.go new file mode 100644 index 0000000000..16581a1b4e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarf.go @@ -0,0 +1,102 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dlarf applies an elementary reflector H to an m×n matrix C: +// +// C = H * C if side == blas.Left +// C = C * H if side == blas.Right +// +// H is represented in the form +// +// H = I - tau * v * vᵀ +// +// where tau is a scalar and v is a vector. +// +// work must have length at least m if side == blas.Left and +// at least n if side == blas.Right. +// +// Dlarf is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlarf(side blas.Side, m, n int, v []float64, incv int, tau float64, c []float64, ldc int, work []float64) { + switch { + case side != blas.Left && side != blas.Right: + panic(badSide) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case incv == 0: + panic(zeroIncV) + case ldc < max(1, n): + panic(badLdC) + } + + if m == 0 || n == 0 { + return + } + + applyleft := side == blas.Left + lenV := n + if applyleft { + lenV = m + } + + switch { + case len(v) < 1+(lenV-1)*abs(incv): + panic(shortV) + case len(c) < (m-1)*ldc+n: + panic(shortC) + case (applyleft && len(work) < n) || (!applyleft && len(work) < m): + panic(shortWork) + } + + lastv := -1 // last non-zero element of v + lastc := -1 // last non-zero row/column of C + if tau != 0 { + if applyleft { + lastv = m - 1 + } else { + lastv = n - 1 + } + var i int + if incv > 0 { + i = lastv * incv + } + // Look for the last non-zero row in v. + for lastv >= 0 && v[i] == 0 { + lastv-- + i -= incv + } + if applyleft { + // Scan for the last non-zero column in C[0:lastv, :] + lastc = impl.Iladlc(lastv+1, n, c, ldc) + } else { + // Scan for the last non-zero row in C[:, 0:lastv] + lastc = impl.Iladlr(m, lastv+1, c, ldc) + } + } + if lastv == -1 || lastc == -1 { + return + } + bi := blas64.Implementation() + if applyleft { + // Form H * C + // w[0:lastc+1] = c[1:lastv+1, 1:lastc+1]ᵀ * v[1:lastv+1,1] + bi.Dgemv(blas.Trans, lastv+1, lastc+1, 1, c, ldc, v, incv, 0, work, 1) + // c[0: lastv, 0: lastc] = c[...] - w[0:lastv, 1] * v[1:lastc, 1]ᵀ + bi.Dger(lastv+1, lastc+1, -tau, v, incv, work, 1, c, ldc) + } else { + // Form C * H + // w[0:lastc+1,1] := c[0:lastc+1,0:lastv+1] * v[0:lastv+1,1] + bi.Dgemv(blas.NoTrans, lastc+1, lastv+1, 1, c, ldc, v, incv, 0, work, 1) + // c[0:lastc+1,0:lastv+1] = c[...] - w[0:lastc+1,0] * v[0:lastv+1,0]ᵀ + bi.Dger(lastc+1, lastv+1, -tau, work, 1, v, incv, c, ldc) + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlarfb.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarfb.go new file mode 100644 index 0000000000..eb43ca74ce --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarfb.go @@ -0,0 +1,461 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" +) + +// Dlarfb applies a block reflector to a matrix. +// +// In the call to Dlarfb, the mxn c is multiplied by the implicitly defined matrix h as follows: +// +// c = h * c if side == Left and trans == NoTrans +// c = c * h if side == Right and trans == NoTrans +// c = hᵀ * c if side == Left and trans == Trans +// c = c * hᵀ if side == Right and trans == Trans +// +// h is a product of elementary reflectors. direct sets the direction of multiplication +// +// h = h_1 * h_2 * ... * h_k if direct == Forward +// h = h_k * h_k-1 * ... 
* h_1 if direct == Backward +// +// The combination of direct and store defines the orientation of the elementary +// reflectors. In all cases the ones on the diagonal are implicitly represented. +// +// If direct == lapack.Forward and store == lapack.ColumnWise +// +// V = [ 1 ] +// [v1 1 ] +// [v1 v2 1] +// [v1 v2 v3] +// [v1 v2 v3] +// +// If direct == lapack.Forward and store == lapack.RowWise +// +// V = [ 1 v1 v1 v1 v1] +// [ 1 v2 v2 v2] +// [ 1 v3 v3] +// +// If direct == lapack.Backward and store == lapack.ColumnWise +// +// V = [v1 v2 v3] +// [v1 v2 v3] +// [ 1 v2 v3] +// [ 1 v3] +// [ 1] +// +// If direct == lapack.Backward and store == lapack.RowWise +// +// V = [v1 v1 1 ] +// [v2 v2 v2 1 ] +// [v3 v3 v3 v3 1] +// +// An elementary reflector can be explicitly constructed by extracting the +// corresponding elements of v, placing a 1 where the diagonal would be, and +// placing zeros in the remaining elements. +// +// t is a k×k matrix containing the block reflector, and this function will panic +// if t is not of sufficient size. See Dlarft for more information. +// +// work is a temporary storage matrix with stride ldwork. +// work must be of size at least n×k side == Left and m×k if side == Right, and +// this function will panic if this size is not met. +// +// Dlarfb is an internal routine. It is exported for testing purposes. +func (Implementation) Dlarfb(side blas.Side, trans blas.Transpose, direct lapack.Direct, store lapack.StoreV, m, n, k int, v []float64, ldv int, t []float64, ldt int, c []float64, ldc int, work []float64, ldwork int) { + nv := m + if side == blas.Right { + nv = n + } + switch { + case side != blas.Left && side != blas.Right: + panic(badSide) + case trans != blas.Trans && trans != blas.NoTrans: + panic(badTrans) + case direct != lapack.Forward && direct != lapack.Backward: + panic(badDirect) + case store != lapack.ColumnWise && store != lapack.RowWise: + panic(badStoreV) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case k < 0: + panic(kLT0) + case store == lapack.ColumnWise && ldv < max(1, k): + panic(badLdV) + case store == lapack.RowWise && ldv < max(1, nv): + panic(badLdV) + case ldt < max(1, k): + panic(badLdT) + case ldc < max(1, n): + panic(badLdC) + case ldwork < max(1, k): + panic(badLdWork) + } + + if m == 0 || n == 0 { + return + } + + nw := n + if side == blas.Right { + nw = m + } + switch { + case store == lapack.ColumnWise && len(v) < (nv-1)*ldv+k: + panic(shortV) + case store == lapack.RowWise && len(v) < (k-1)*ldv+nv: + panic(shortV) + case len(t) < (k-1)*ldt+k: + panic(shortT) + case len(c) < (m-1)*ldc+n: + panic(shortC) + case len(work) < (nw-1)*ldwork+k: + panic(shortWork) + } + + bi := blas64.Implementation() + + transt := blas.Trans + if trans == blas.Trans { + transt = blas.NoTrans + } + // TODO(btracey): This follows the original Lapack code where the + // elements are copied into the columns of the working array. The + // loops should go in the other direction so the data is written + // into the rows of work so the copy is not strided. A bigger change + // would be to replace work with workᵀ, but benchmarks would be + // needed to see if the change is merited. + if store == lapack.ColumnWise { + if direct == lapack.Forward { + // V1 is the first k rows of C. V2 is the remaining rows. + if side == blas.Left { + // W = Cᵀ V = C1ᵀ V1 + C2ᵀ V2 (stored in work). + + // W = C1. + for j := 0; j < k; j++ { + bi.Dcopy(n, c[j*ldc:], 1, work[j:], ldwork) + } + // W = W * V1. 
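+ // V1 is unit lower triangular, so this is a single
+ // triangular multiply.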
+ bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, + n, k, 1, + v, ldv, + work, ldwork) + if m > k { + // W = W + C2ᵀ V2. + bi.Dgemm(blas.Trans, blas.NoTrans, n, k, m-k, + 1, c[k*ldc:], ldc, v[k*ldv:], ldv, + 1, work, ldwork) + } + // W = W * Tᵀ or W * T. + bi.Dtrmm(blas.Right, blas.Upper, transt, blas.NonUnit, n, k, + 1, t, ldt, + work, ldwork) + // C -= V * Wᵀ. + if m > k { + // C2 -= V2 * Wᵀ. + bi.Dgemm(blas.NoTrans, blas.Trans, m-k, n, k, + -1, v[k*ldv:], ldv, work, ldwork, + 1, c[k*ldc:], ldc) + } + // W *= V1ᵀ. + bi.Dtrmm(blas.Right, blas.Lower, blas.Trans, blas.Unit, n, k, + 1, v, ldv, + work, ldwork) + // C1 -= Wᵀ. + // TODO(btracey): This should use blas.Axpy. + for i := 0; i < n; i++ { + for j := 0; j < k; j++ { + c[j*ldc+i] -= work[i*ldwork+j] + } + } + return + } + // Form C = C * H or C * Hᵀ, where C = (C1 C2). + + // W = C1. + for i := 0; i < k; i++ { + bi.Dcopy(m, c[i:], ldc, work[i:], ldwork) + } + // W *= V1. + bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, m, k, + 1, v, ldv, + work, ldwork) + if n > k { + bi.Dgemm(blas.NoTrans, blas.NoTrans, m, k, n-k, + 1, c[k:], ldc, v[k*ldv:], ldv, + 1, work, ldwork) + } + // W *= T or Tᵀ. + bi.Dtrmm(blas.Right, blas.Upper, trans, blas.NonUnit, m, k, + 1, t, ldt, + work, ldwork) + if n > k { + bi.Dgemm(blas.NoTrans, blas.Trans, m, n-k, k, + -1, work, ldwork, v[k*ldv:], ldv, + 1, c[k:], ldc) + } + // C -= W * Vᵀ. + bi.Dtrmm(blas.Right, blas.Lower, blas.Trans, blas.Unit, m, k, + 1, v, ldv, + work, ldwork) + // C -= W. + // TODO(btracey): This should use blas.Axpy. + for i := 0; i < m; i++ { + for j := 0; j < k; j++ { + c[i*ldc+j] -= work[i*ldwork+j] + } + } + return + } + // V = (V1) + // = (V2) (last k rows) + // Where V2 is unit upper triangular. + if side == blas.Left { + // Form H * C or + // W = Cᵀ V. + + // W = C2ᵀ. + for j := 0; j < k; j++ { + bi.Dcopy(n, c[(m-k+j)*ldc:], 1, work[j:], ldwork) + } + // W *= V2. + bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.Unit, n, k, + 1, v[(m-k)*ldv:], ldv, + work, ldwork) + if m > k { + // W += C1ᵀ * V1. + bi.Dgemm(blas.Trans, blas.NoTrans, n, k, m-k, + 1, c, ldc, v, ldv, + 1, work, ldwork) + } + // W *= T or Tᵀ. + bi.Dtrmm(blas.Right, blas.Lower, transt, blas.NonUnit, n, k, + 1, t, ldt, + work, ldwork) + // C -= V * Wᵀ. + if m > k { + bi.Dgemm(blas.NoTrans, blas.Trans, m-k, n, k, + -1, v, ldv, work, ldwork, + 1, c, ldc) + } + // W *= V2ᵀ. + bi.Dtrmm(blas.Right, blas.Upper, blas.Trans, blas.Unit, n, k, + 1, v[(m-k)*ldv:], ldv, + work, ldwork) + // C2 -= Wᵀ. + // TODO(btracey): This should use blas.Axpy. + for i := 0; i < n; i++ { + for j := 0; j < k; j++ { + c[(m-k+j)*ldc+i] -= work[i*ldwork+j] + } + } + return + } + // Form C * H or C * Hᵀ where C = (C1 C2). + // W = C * V. + + // W = C2. + for j := 0; j < k; j++ { + bi.Dcopy(m, c[n-k+j:], ldc, work[j:], ldwork) + } + + // W = W * V2. + bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.Unit, m, k, + 1, v[(n-k)*ldv:], ldv, + work, ldwork) + if n > k { + bi.Dgemm(blas.NoTrans, blas.NoTrans, m, k, n-k, + 1, c, ldc, v, ldv, + 1, work, ldwork) + } + // W *= T or Tᵀ. + bi.Dtrmm(blas.Right, blas.Lower, trans, blas.NonUnit, m, k, + 1, t, ldt, + work, ldwork) + // C -= W * Vᵀ. + if n > k { + // C1 -= W * V1ᵀ. + bi.Dgemm(blas.NoTrans, blas.Trans, m, n-k, k, + -1, work, ldwork, v, ldv, + 1, c, ldc) + } + // W *= V2ᵀ. + bi.Dtrmm(blas.Right, blas.Upper, blas.Trans, blas.Unit, m, k, + 1, v[(n-k)*ldv:], ldv, + work, ldwork) + // C2 -= W. + // TODO(btracey): This should use blas.Axpy. 
+ for i := 0; i < m; i++ { + for j := 0; j < k; j++ { + c[i*ldc+n-k+j] -= work[i*ldwork+j] + } + } + return + } + // Store = Rowwise. + if direct == lapack.Forward { + // V = (V1 V2) where v1 is unit upper triangular. + if side == blas.Left { + // Form H * C or Hᵀ * C where C = (C1; C2). + // W = Cᵀ * Vᵀ. + + // W = C1ᵀ. + for j := 0; j < k; j++ { + bi.Dcopy(n, c[j*ldc:], 1, work[j:], ldwork) + } + // W *= V1ᵀ. + bi.Dtrmm(blas.Right, blas.Upper, blas.Trans, blas.Unit, n, k, + 1, v, ldv, + work, ldwork) + if m > k { + bi.Dgemm(blas.Trans, blas.Trans, n, k, m-k, + 1, c[k*ldc:], ldc, v[k:], ldv, + 1, work, ldwork) + } + // W *= T or Tᵀ. + bi.Dtrmm(blas.Right, blas.Upper, transt, blas.NonUnit, n, k, + 1, t, ldt, + work, ldwork) + // C -= Vᵀ * Wᵀ. + if m > k { + bi.Dgemm(blas.Trans, blas.Trans, m-k, n, k, + -1, v[k:], ldv, work, ldwork, + 1, c[k*ldc:], ldc) + } + // W *= V1. + bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.Unit, n, k, + 1, v, ldv, + work, ldwork) + // C1 -= Wᵀ. + // TODO(btracey): This should use blas.Axpy. + for i := 0; i < n; i++ { + for j := 0; j < k; j++ { + c[j*ldc+i] -= work[i*ldwork+j] + } + } + return + } + // Form C * H or C * Hᵀ where C = (C1 C2). + // W = C * Vᵀ. + + // W = C1. + for j := 0; j < k; j++ { + bi.Dcopy(m, c[j:], ldc, work[j:], ldwork) + } + // W *= V1ᵀ. + bi.Dtrmm(blas.Right, blas.Upper, blas.Trans, blas.Unit, m, k, + 1, v, ldv, + work, ldwork) + if n > k { + bi.Dgemm(blas.NoTrans, blas.Trans, m, k, n-k, + 1, c[k:], ldc, v[k:], ldv, + 1, work, ldwork) + } + // W *= T or Tᵀ. + bi.Dtrmm(blas.Right, blas.Upper, trans, blas.NonUnit, m, k, + 1, t, ldt, + work, ldwork) + // C -= W * V. + if n > k { + bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n-k, k, + -1, work, ldwork, v[k:], ldv, + 1, c[k:], ldc) + } + // W *= V1. + bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.Unit, m, k, + 1, v, ldv, + work, ldwork) + // C1 -= W. + // TODO(btracey): This should use blas.Axpy. + for i := 0; i < m; i++ { + for j := 0; j < k; j++ { + c[i*ldc+j] -= work[i*ldwork+j] + } + } + return + } + // V = (V1 V2) where V2 is the last k columns and is lower unit triangular. + if side == blas.Left { + // Form H * C or Hᵀ C where C = (C1 ; C2). + // W = Cᵀ * Vᵀ. + + // W = C2ᵀ. + for j := 0; j < k; j++ { + bi.Dcopy(n, c[(m-k+j)*ldc:], 1, work[j:], ldwork) + } + // W *= V2ᵀ. + bi.Dtrmm(blas.Right, blas.Lower, blas.Trans, blas.Unit, n, k, + 1, v[m-k:], ldv, + work, ldwork) + if m > k { + bi.Dgemm(blas.Trans, blas.Trans, n, k, m-k, + 1, c, ldc, v, ldv, + 1, work, ldwork) + } + // W *= T or Tᵀ. + bi.Dtrmm(blas.Right, blas.Lower, transt, blas.NonUnit, n, k, + 1, t, ldt, + work, ldwork) + // C -= Vᵀ * Wᵀ. + if m > k { + bi.Dgemm(blas.Trans, blas.Trans, m-k, n, k, + -1, v, ldv, work, ldwork, + 1, c, ldc) + } + // W *= V2. + bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, n, k, + 1, v[m-k:], ldv, + work, ldwork) + // C2 -= Wᵀ. + // TODO(btracey): This should use blas.Axpy. + for i := 0; i < n; i++ { + for j := 0; j < k; j++ { + c[(m-k+j)*ldc+i] -= work[i*ldwork+j] + } + } + return + } + // Form C * H or C * Hᵀ where C = (C1 C2). + // W = C * Vᵀ. + // W = C2. + for j := 0; j < k; j++ { + bi.Dcopy(m, c[n-k+j:], ldc, work[j:], ldwork) + } + // W *= V2ᵀ. + bi.Dtrmm(blas.Right, blas.Lower, blas.Trans, blas.Unit, m, k, + 1, v[n-k:], ldv, + work, ldwork) + if n > k { + bi.Dgemm(blas.NoTrans, blas.Trans, m, k, n-k, + 1, c, ldc, v, ldv, + 1, work, ldwork) + } + // W *= T or Tᵀ. + bi.Dtrmm(blas.Right, blas.Lower, trans, blas.NonUnit, m, k, + 1, t, ldt, + work, ldwork) + // C -= W * V. 
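+ // The product W * V splits into a general part against V1
+ // (below) and a unit triangular part against V2.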
+ if n > k { + bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n-k, k, + -1, work, ldwork, v, ldv, + 1, c, ldc) + } + // W *= V2. + bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, m, k, + 1, v[n-k:], ldv, + work, ldwork) + // C1 -= W. + // TODO(btracey): This should use blas.Axpy. + for i := 0; i < m; i++ { + for j := 0; j < k; j++ { + c[i*ldc+n-k+j] -= work[i*ldwork+j] + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlarfg.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarfg.go new file mode 100644 index 0000000000..74ad111d41 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarfg.go @@ -0,0 +1,75 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas/blas64" +) + +// Dlarfg generates an elementary reflector for a Householder matrix. It creates +// a real elementary reflector of order n such that +// +// H * (alpha) = (beta) +// ( x) ( 0) +// Hᵀ * H = I +// +// H is represented in the form +// +// H = 1 - tau * (1; v) * (1 vᵀ) +// +// where tau is a real scalar. +// +// On entry, x contains the vector x, on exit it contains v. +// +// Dlarfg is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlarfg(n int, alpha float64, x []float64, incX int) (beta, tau float64) { + switch { + case n < 0: + panic(nLT0) + case incX <= 0: + panic(badIncX) + } + + if n <= 1 { + return alpha, 0 + } + + if len(x) < 1+(n-2)*abs(incX) { + panic(shortX) + } + + bi := blas64.Implementation() + + xnorm := bi.Dnrm2(n-1, x, incX) + if xnorm == 0 { + return alpha, 0 + } + beta = -math.Copysign(impl.Dlapy2(alpha, xnorm), alpha) + safmin := dlamchS / dlamchE + knt := 0 + if math.Abs(beta) < safmin { + // xnorm and beta may be inaccurate, scale x and recompute. + rsafmn := 1 / safmin + for { + knt++ + bi.Dscal(n-1, rsafmn, x, incX) + beta *= rsafmn + alpha *= rsafmn + if math.Abs(beta) >= safmin { + break + } + } + xnorm = bi.Dnrm2(n-1, x, incX) + beta = -math.Copysign(impl.Dlapy2(alpha, xnorm), alpha) + } + tau = (beta - alpha) / beta + bi.Dscal(n-1, 1/(alpha-beta), x, incX) + for j := 0; j < knt; j++ { + beta *= safmin + } + return beta, tau +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlarft.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarft.go new file mode 100644 index 0000000000..921a5a3d21 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarft.go @@ -0,0 +1,169 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" +) + +// Dlarft forms the triangular factor T of a block reflector H, storing the answer +// in t. +// +// H = I - V * T * Vᵀ if store == lapack.ColumnWise +// H = I - Vᵀ * T * V if store == lapack.RowWise +// +// H is defined by a product of the elementary reflectors where +// +// H = H_0 * H_1 * ... * H_{k-1} if direct == lapack.Forward +// H = H_{k-1} * ... * H_1 * H_0 if direct == lapack.Backward +// +// t is a k×k triangular matrix. t is upper triangular if direct = lapack.Forward +// and lower triangular otherwise. This function will panic if t is not of +// sufficient size. +// +// store describes the storage of the elementary reflectors in v. See +// Dlarfb for a description of layout. 
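+//
+// The triangular factor lets a product of k reflectors be applied with
+// matrix-matrix products (see Dlarfb) rather than k separate rank-1 updates.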
+// +// tau contains the scalar factors of the elementary reflectors H_i. +// +// Dlarft is an internal routine. It is exported for testing purposes. +func (Implementation) Dlarft(direct lapack.Direct, store lapack.StoreV, n, k int, v []float64, ldv int, tau []float64, t []float64, ldt int) { + mv, nv := n, k + if store == lapack.RowWise { + mv, nv = k, n + } + switch { + case direct != lapack.Forward && direct != lapack.Backward: + panic(badDirect) + case store != lapack.RowWise && store != lapack.ColumnWise: + panic(badStoreV) + case n < 0: + panic(nLT0) + case k < 1: + panic(kLT1) + case ldv < max(1, nv): + panic(badLdV) + case len(tau) < k: + panic(shortTau) + case ldt < max(1, k): + panic(shortT) + } + + if n == 0 { + return + } + + switch { + case len(v) < (mv-1)*ldv+nv: + panic(shortV) + case len(t) < (k-1)*ldt+k: + panic(shortT) + } + + bi := blas64.Implementation() + + // TODO(btracey): There are a number of minor obvious loop optimizations here. + // TODO(btracey): It may be possible to rearrange some of the code so that + // index of 1 is more common in the Dgemv. + if direct == lapack.Forward { + prevlastv := n - 1 + for i := 0; i < k; i++ { + prevlastv = max(i, prevlastv) + if tau[i] == 0 { + for j := 0; j <= i; j++ { + t[j*ldt+i] = 0 + } + continue + } + var lastv int + if store == lapack.ColumnWise { + // skip trailing zeros + for lastv = n - 1; lastv >= i+1; lastv-- { + if v[lastv*ldv+i] != 0 { + break + } + } + for j := 0; j < i; j++ { + t[j*ldt+i] = -tau[i] * v[i*ldv+j] + } + j := min(lastv, prevlastv) + bi.Dgemv(blas.Trans, j-i, i, + -tau[i], v[(i+1)*ldv:], ldv, v[(i+1)*ldv+i:], ldv, + 1, t[i:], ldt) + } else { + for lastv = n - 1; lastv >= i+1; lastv-- { + if v[i*ldv+lastv] != 0 { + break + } + } + for j := 0; j < i; j++ { + t[j*ldt+i] = -tau[i] * v[j*ldv+i] + } + j := min(lastv, prevlastv) + bi.Dgemv(blas.NoTrans, i, j-i, + -tau[i], v[i+1:], ldv, v[i*ldv+i+1:], 1, + 1, t[i:], ldt) + } + bi.Dtrmv(blas.Upper, blas.NoTrans, blas.NonUnit, i, t, ldt, t[i:], ldt) + t[i*ldt+i] = tau[i] + if i > 1 { + prevlastv = max(prevlastv, lastv) + } else { + prevlastv = lastv + } + } + return + } + prevlastv := 0 + for i := k - 1; i >= 0; i-- { + if tau[i] == 0 { + for j := i; j < k; j++ { + t[j*ldt+i] = 0 + } + continue + } + var lastv int + if i < k-1 { + if store == lapack.ColumnWise { + for lastv = 0; lastv < i; lastv++ { + if v[lastv*ldv+i] != 0 { + break + } + } + for j := i + 1; j < k; j++ { + t[j*ldt+i] = -tau[i] * v[(n-k+i)*ldv+j] + } + j := max(lastv, prevlastv) + bi.Dgemv(blas.Trans, n-k+i-j, k-i-1, + -tau[i], v[j*ldv+i+1:], ldv, v[j*ldv+i:], ldv, + 1, t[(i+1)*ldt+i:], ldt) + } else { + for lastv = 0; lastv < i; lastv++ { + if v[i*ldv+lastv] != 0 { + break + } + } + for j := i + 1; j < k; j++ { + t[j*ldt+i] = -tau[i] * v[j*ldv+n-k+i] + } + j := max(lastv, prevlastv) + bi.Dgemv(blas.NoTrans, k-i-1, n-k+i-j, + -tau[i], v[(i+1)*ldv+j:], ldv, v[i*ldv+j:], 1, + 1, t[(i+1)*ldt+i:], ldt) + } + bi.Dtrmv(blas.Lower, blas.NoTrans, blas.NonUnit, k-i-1, + t[(i+1)*ldt+i+1:], ldt, + t[(i+1)*ldt+i:], ldt) + if i > 0 { + prevlastv = min(prevlastv, lastv) + } else { + prevlastv = lastv + } + } + t[i*ldt+i] = tau[i] + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlarfx.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarfx.go new file mode 100644 index 0000000000..4e40dad188 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarfx.go @@ -0,0 +1,552 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "gonum.org/v1/gonum/blas" + +// Dlarfx applies an elementary reflector H to a real m×n matrix C, from either +// the left or the right, with loop unrolling when the reflector has order less +// than 11. +// +// H is represented in the form +// +// H = I - tau * v * vᵀ, +// +// where tau is a real scalar and v is a real vector. If tau = 0, then H is +// taken to be the identity matrix. +// +// v must have length equal to m if side == blas.Left, and equal to n if side == +// blas.Right, otherwise Dlarfx will panic. +// +// c and ldc represent the m×n matrix C. On return, C is overwritten by the +// matrix H * C if side == blas.Left, or C * H if side == blas.Right. +// +// work must have length at least n if side == blas.Left, and at least m if side +// == blas.Right, otherwise Dlarfx will panic. work is not referenced if H has +// order < 11. +// +// Dlarfx is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlarfx(side blas.Side, m, n int, v []float64, tau float64, c []float64, ldc int, work []float64) { + switch { + case side != blas.Left && side != blas.Right: + panic(badSide) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case ldc < max(1, n): + panic(badLdC) + } + + // Quick return if possible. + if m == 0 || n == 0 { + return + } + + nh := m + lwork := n + if side == blas.Right { + nh = n + lwork = m + } + switch { + case len(v) < nh: + panic(shortV) + case len(c) < (m-1)*ldc+n: + panic(shortC) + case nh > 10 && len(work) < lwork: + panic(shortWork) + } + + if tau == 0 { + return + } + + if side == blas.Left { + // Form H * C, where H has order m. + switch m { + default: // Code for general m. + impl.Dlarf(side, m, n, v, 1, tau, c, ldc, work) + return + + case 0: // No-op for zero size matrix. + return + + case 1: // Special code for 1×1 Householder matrix. + t0 := 1 - tau*v[0]*v[0] + for j := 0; j < n; j++ { + c[j] *= t0 + } + return + + case 2: // Special code for 2×2 Householder matrix. + v0 := v[0] + t0 := tau * v0 + v1 := v[1] + t1 := tau * v1 + for j := 0; j < n; j++ { + sum := v0*c[j] + v1*c[ldc+j] + c[j] -= sum * t0 + c[ldc+j] -= sum * t1 + } + return + + case 3: // Special code for 3×3 Householder matrix. + v0 := v[0] + t0 := tau * v0 + v1 := v[1] + t1 := tau * v1 + v2 := v[2] + t2 := tau * v2 + for j := 0; j < n; j++ { + sum := v0*c[j] + v1*c[ldc+j] + v2*c[2*ldc+j] + c[j] -= sum * t0 + c[ldc+j] -= sum * t1 + c[2*ldc+j] -= sum * t2 + } + return + + case 4: // Special code for 4×4 Householder matrix. + v0 := v[0] + t0 := tau * v0 + v1 := v[1] + t1 := tau * v1 + v2 := v[2] + t2 := tau * v2 + v3 := v[3] + t3 := tau * v3 + for j := 0; j < n; j++ { + sum := v0*c[j] + v1*c[ldc+j] + v2*c[2*ldc+j] + v3*c[3*ldc+j] + c[j] -= sum * t0 + c[ldc+j] -= sum * t1 + c[2*ldc+j] -= sum * t2 + c[3*ldc+j] -= sum * t3 + } + return + + case 5: // Special code for 5×5 Householder matrix. + v0 := v[0] + t0 := tau * v0 + v1 := v[1] + t1 := tau * v1 + v2 := v[2] + t2 := tau * v2 + v3 := v[3] + t3 := tau * v3 + v4 := v[4] + t4 := tau * v4 + for j := 0; j < n; j++ { + sum := v0*c[j] + v1*c[ldc+j] + v2*c[2*ldc+j] + v3*c[3*ldc+j] + v4*c[4*ldc+j] + c[j] -= sum * t0 + c[ldc+j] -= sum * t1 + c[2*ldc+j] -= sum * t2 + c[3*ldc+j] -= sum * t3 + c[4*ldc+j] -= sum * t4 + } + return + + case 6: // Special code for 6×6 Householder matrix. 
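+		// Each unrolled case below follows the same pattern as cases 2-5
+		// above: for every column j, sum = vᵀ C[:,j] is accumulated with
+		// the elements of v and tau*v held in scalars, and then
+		// C[:,j] -= sum * tau * v is applied term by term.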
+ v0 := v[0] + t0 := tau * v0 + v1 := v[1] + t1 := tau * v1 + v2 := v[2] + t2 := tau * v2 + v3 := v[3] + t3 := tau * v3 + v4 := v[4] + t4 := tau * v4 + v5 := v[5] + t5 := tau * v5 + for j := 0; j < n; j++ { + sum := v0*c[j] + v1*c[ldc+j] + v2*c[2*ldc+j] + v3*c[3*ldc+j] + v4*c[4*ldc+j] + + v5*c[5*ldc+j] + c[j] -= sum * t0 + c[ldc+j] -= sum * t1 + c[2*ldc+j] -= sum * t2 + c[3*ldc+j] -= sum * t3 + c[4*ldc+j] -= sum * t4 + c[5*ldc+j] -= sum * t5 + } + return + + case 7: // Special code for 7×7 Householder matrix. + v0 := v[0] + t0 := tau * v0 + v1 := v[1] + t1 := tau * v1 + v2 := v[2] + t2 := tau * v2 + v3 := v[3] + t3 := tau * v3 + v4 := v[4] + t4 := tau * v4 + v5 := v[5] + t5 := tau * v5 + v6 := v[6] + t6 := tau * v6 + for j := 0; j < n; j++ { + sum := v0*c[j] + v1*c[ldc+j] + v2*c[2*ldc+j] + v3*c[3*ldc+j] + v4*c[4*ldc+j] + + v5*c[5*ldc+j] + v6*c[6*ldc+j] + c[j] -= sum * t0 + c[ldc+j] -= sum * t1 + c[2*ldc+j] -= sum * t2 + c[3*ldc+j] -= sum * t3 + c[4*ldc+j] -= sum * t4 + c[5*ldc+j] -= sum * t5 + c[6*ldc+j] -= sum * t6 + } + return + + case 8: // Special code for 8×8 Householder matrix. + v0 := v[0] + t0 := tau * v0 + v1 := v[1] + t1 := tau * v1 + v2 := v[2] + t2 := tau * v2 + v3 := v[3] + t3 := tau * v3 + v4 := v[4] + t4 := tau * v4 + v5 := v[5] + t5 := tau * v5 + v6 := v[6] + t6 := tau * v6 + v7 := v[7] + t7 := tau * v7 + for j := 0; j < n; j++ { + sum := v0*c[j] + v1*c[ldc+j] + v2*c[2*ldc+j] + v3*c[3*ldc+j] + v4*c[4*ldc+j] + + v5*c[5*ldc+j] + v6*c[6*ldc+j] + v7*c[7*ldc+j] + c[j] -= sum * t0 + c[ldc+j] -= sum * t1 + c[2*ldc+j] -= sum * t2 + c[3*ldc+j] -= sum * t3 + c[4*ldc+j] -= sum * t4 + c[5*ldc+j] -= sum * t5 + c[6*ldc+j] -= sum * t6 + c[7*ldc+j] -= sum * t7 + } + return + + case 9: // Special code for 9×9 Householder matrix. + v0 := v[0] + t0 := tau * v0 + v1 := v[1] + t1 := tau * v1 + v2 := v[2] + t2 := tau * v2 + v3 := v[3] + t3 := tau * v3 + v4 := v[4] + t4 := tau * v4 + v5 := v[5] + t5 := tau * v5 + v6 := v[6] + t6 := tau * v6 + v7 := v[7] + t7 := tau * v7 + v8 := v[8] + t8 := tau * v8 + for j := 0; j < n; j++ { + sum := v0*c[j] + v1*c[ldc+j] + v2*c[2*ldc+j] + v3*c[3*ldc+j] + v4*c[4*ldc+j] + + v5*c[5*ldc+j] + v6*c[6*ldc+j] + v7*c[7*ldc+j] + v8*c[8*ldc+j] + c[j] -= sum * t0 + c[ldc+j] -= sum * t1 + c[2*ldc+j] -= sum * t2 + c[3*ldc+j] -= sum * t3 + c[4*ldc+j] -= sum * t4 + c[5*ldc+j] -= sum * t5 + c[6*ldc+j] -= sum * t6 + c[7*ldc+j] -= sum * t7 + c[8*ldc+j] -= sum * t8 + } + return + + case 10: // Special code for 10×10 Householder matrix. + v0 := v[0] + t0 := tau * v0 + v1 := v[1] + t1 := tau * v1 + v2 := v[2] + t2 := tau * v2 + v3 := v[3] + t3 := tau * v3 + v4 := v[4] + t4 := tau * v4 + v5 := v[5] + t5 := tau * v5 + v6 := v[6] + t6 := tau * v6 + v7 := v[7] + t7 := tau * v7 + v8 := v[8] + t8 := tau * v8 + v9 := v[9] + t9 := tau * v9 + for j := 0; j < n; j++ { + sum := v0*c[j] + v1*c[ldc+j] + v2*c[2*ldc+j] + v3*c[3*ldc+j] + v4*c[4*ldc+j] + + v5*c[5*ldc+j] + v6*c[6*ldc+j] + v7*c[7*ldc+j] + v8*c[8*ldc+j] + v9*c[9*ldc+j] + c[j] -= sum * t0 + c[ldc+j] -= sum * t1 + c[2*ldc+j] -= sum * t2 + c[3*ldc+j] -= sum * t3 + c[4*ldc+j] -= sum * t4 + c[5*ldc+j] -= sum * t5 + c[6*ldc+j] -= sum * t6 + c[7*ldc+j] -= sum * t7 + c[8*ldc+j] -= sum * t8 + c[9*ldc+j] -= sum * t9 + } + return + } + } + + // Form C * H, where H has order n. + switch n { + default: // Code for general n. + impl.Dlarf(side, m, n, v, 1, tau, c, ldc, work) + return + + case 0: // No-op for zero size matrix. + return + + case 1: // Special code for 1×1 Householder matrix. 
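+		// For a 1×1 Householder matrix, H is the scalar 1 - tau*v0*v0, so
+		// forming C * H reduces to scaling the single column of C.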
+ t0 := 1 - tau*v[0]*v[0] + for j := 0; j < m; j++ { + c[j*ldc] *= t0 + } + return + + case 2: // Special code for 2×2 Householder matrix. + v0 := v[0] + t0 := tau * v0 + v1 := v[1] + t1 := tau * v1 + for j := 0; j < m; j++ { + cs := c[j*ldc:] + sum := v0*cs[0] + v1*cs[1] + cs[0] -= sum * t0 + cs[1] -= sum * t1 + } + return + + case 3: // Special code for 3×3 Householder matrix. + v0 := v[0] + t0 := tau * v0 + v1 := v[1] + t1 := tau * v1 + v2 := v[2] + t2 := tau * v2 + for j := 0; j < m; j++ { + cs := c[j*ldc:] + sum := v0*cs[0] + v1*cs[1] + v2*cs[2] + cs[0] -= sum * t0 + cs[1] -= sum * t1 + cs[2] -= sum * t2 + } + return + + case 4: // Special code for 4×4 Householder matrix. + v0 := v[0] + t0 := tau * v0 + v1 := v[1] + t1 := tau * v1 + v2 := v[2] + t2 := tau * v2 + v3 := v[3] + t3 := tau * v3 + for j := 0; j < m; j++ { + cs := c[j*ldc:] + sum := v0*cs[0] + v1*cs[1] + v2*cs[2] + v3*cs[3] + cs[0] -= sum * t0 + cs[1] -= sum * t1 + cs[2] -= sum * t2 + cs[3] -= sum * t3 + } + return + + case 5: // Special code for 5×5 Householder matrix. + v0 := v[0] + t0 := tau * v0 + v1 := v[1] + t1 := tau * v1 + v2 := v[2] + t2 := tau * v2 + v3 := v[3] + t3 := tau * v3 + v4 := v[4] + t4 := tau * v4 + for j := 0; j < m; j++ { + cs := c[j*ldc:] + sum := v0*cs[0] + v1*cs[1] + v2*cs[2] + v3*cs[3] + v4*cs[4] + cs[0] -= sum * t0 + cs[1] -= sum * t1 + cs[2] -= sum * t2 + cs[3] -= sum * t3 + cs[4] -= sum * t4 + } + return + + case 6: // Special code for 6×6 Householder matrix. + v0 := v[0] + t0 := tau * v0 + v1 := v[1] + t1 := tau * v1 + v2 := v[2] + t2 := tau * v2 + v3 := v[3] + t3 := tau * v3 + v4 := v[4] + t4 := tau * v4 + v5 := v[5] + t5 := tau * v5 + for j := 0; j < m; j++ { + cs := c[j*ldc:] + sum := v0*cs[0] + v1*cs[1] + v2*cs[2] + v3*cs[3] + v4*cs[4] + v5*cs[5] + cs[0] -= sum * t0 + cs[1] -= sum * t1 + cs[2] -= sum * t2 + cs[3] -= sum * t3 + cs[4] -= sum * t4 + cs[5] -= sum * t5 + } + return + + case 7: // Special code for 7×7 Householder matrix. + v0 := v[0] + t0 := tau * v0 + v1 := v[1] + t1 := tau * v1 + v2 := v[2] + t2 := tau * v2 + v3 := v[3] + t3 := tau * v3 + v4 := v[4] + t4 := tau * v4 + v5 := v[5] + t5 := tau * v5 + v6 := v[6] + t6 := tau * v6 + for j := 0; j < m; j++ { + cs := c[j*ldc:] + sum := v0*cs[0] + v1*cs[1] + v2*cs[2] + v3*cs[3] + v4*cs[4] + + v5*cs[5] + v6*cs[6] + cs[0] -= sum * t0 + cs[1] -= sum * t1 + cs[2] -= sum * t2 + cs[3] -= sum * t3 + cs[4] -= sum * t4 + cs[5] -= sum * t5 + cs[6] -= sum * t6 + } + return + + case 8: // Special code for 8×8 Householder matrix. + v0 := v[0] + t0 := tau * v0 + v1 := v[1] + t1 := tau * v1 + v2 := v[2] + t2 := tau * v2 + v3 := v[3] + t3 := tau * v3 + v4 := v[4] + t4 := tau * v4 + v5 := v[5] + t5 := tau * v5 + v6 := v[6] + t6 := tau * v6 + v7 := v[7] + t7 := tau * v7 + for j := 0; j < m; j++ { + cs := c[j*ldc:] + sum := v0*cs[0] + v1*cs[1] + v2*cs[2] + v3*cs[3] + v4*cs[4] + + v5*cs[5] + v6*cs[6] + v7*cs[7] + cs[0] -= sum * t0 + cs[1] -= sum * t1 + cs[2] -= sum * t2 + cs[3] -= sum * t3 + cs[4] -= sum * t4 + cs[5] -= sum * t5 + cs[6] -= sum * t6 + cs[7] -= sum * t7 + } + return + + case 9: // Special code for 9×9 Householder matrix. 
+ v0 := v[0] + t0 := tau * v0 + v1 := v[1] + t1 := tau * v1 + v2 := v[2] + t2 := tau * v2 + v3 := v[3] + t3 := tau * v3 + v4 := v[4] + t4 := tau * v4 + v5 := v[5] + t5 := tau * v5 + v6 := v[6] + t6 := tau * v6 + v7 := v[7] + t7 := tau * v7 + v8 := v[8] + t8 := tau * v8 + for j := 0; j < m; j++ { + cs := c[j*ldc:] + sum := v0*cs[0] + v1*cs[1] + v2*cs[2] + v3*cs[3] + v4*cs[4] + + v5*cs[5] + v6*cs[6] + v7*cs[7] + v8*cs[8] + cs[0] -= sum * t0 + cs[1] -= sum * t1 + cs[2] -= sum * t2 + cs[3] -= sum * t3 + cs[4] -= sum * t4 + cs[5] -= sum * t5 + cs[6] -= sum * t6 + cs[7] -= sum * t7 + cs[8] -= sum * t8 + } + return + + case 10: // Special code for 10×10 Householder matrix. + v0 := v[0] + t0 := tau * v0 + v1 := v[1] + t1 := tau * v1 + v2 := v[2] + t2 := tau * v2 + v3 := v[3] + t3 := tau * v3 + v4 := v[4] + t4 := tau * v4 + v5 := v[5] + t5 := tau * v5 + v6 := v[6] + t6 := tau * v6 + v7 := v[7] + t7 := tau * v7 + v8 := v[8] + t8 := tau * v8 + v9 := v[9] + t9 := tau * v9 + for j := 0; j < m; j++ { + cs := c[j*ldc:] + sum := v0*cs[0] + v1*cs[1] + v2*cs[2] + v3*cs[3] + v4*cs[4] + + v5*cs[5] + v6*cs[6] + v7*cs[7] + v8*cs[8] + v9*cs[9] + cs[0] -= sum * t0 + cs[1] -= sum * t1 + cs[2] -= sum * t2 + cs[3] -= sum * t3 + cs[4] -= sum * t4 + cs[5] -= sum * t5 + cs[6] -= sum * t6 + cs[7] -= sum * t7 + cs[8] -= sum * t8 + cs[9] -= sum * t9 + } + return + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlartg.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlartg.go new file mode 100644 index 0000000000..93416c6f5f --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlartg.go @@ -0,0 +1,73 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "math" + +// Dlartg generates a plane rotation so that +// +// [ cs sn] * [f] = [r] +// [-sn cs] [g] = [0] +// +// where cs*cs + sn*sn = 1. +// +// This is a more accurate version of BLAS Drotg that uses scaling to avoid +// overflow or underflow, with the other differences that +// - cs >= 0 +// - if g = 0, then cs = 1 and sn = 0 +// - if f = 0 and g != 0, then cs = 0 and sn = sign(1,g) +// +// Dlartg is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlartg(f, g float64) (cs, sn, r float64) { + // Implementation based on Supplemental Material to: + // + // Edward Anderson + // Algorithm 978: Safe Scaling in the Level 1 BLAS + // ACM Trans. Math. Softw. 44, 1, Article 12 (2017) + // DOI: https://doi.org/10.1145/3061665 + // + // For further details see: + // + // W. Pereira, A. Lotfi, J. 
Langou + // Numerical analysis of Givens rotation + // DOI: https://doi.org/10.48550/arXiv.2211.04010 + + if g == 0 { + return 1, 0, f + } + + g1 := math.Abs(g) + + if f == 0 { + return 0, math.Copysign(1, g), g1 + } + + const safmin = dlamchS + const safmax = 1 / safmin + rtmin := math.Sqrt(safmin) + rtmax := math.Sqrt(safmax / 2) + + f1 := math.Abs(f) + + if rtmin < f1 && f1 < rtmax && rtmin < g1 && g1 < rtmax { + d := math.Sqrt(f*f + g*g) + cs = f1 / d + r = math.Copysign(d, f) + sn = g / r + + return cs, sn, r + } + + u := math.Min(math.Max(safmin, math.Max(f1, g1)), safmax) + fs := f / u + gs := g / u + d := math.Sqrt(fs*fs + gs*gs) + cs = math.Abs(fs) / d + r = math.Copysign(d, f) + sn = gs / r + r *= u + + return cs, sn, r +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlas2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlas2.go new file mode 100644 index 0000000000..a819fa3536 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlas2.go @@ -0,0 +1,45 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "math" + +// Dlas2 computes the singular values of the 2×2 matrix defined by +// +// [F G] +// [0 H] +// +// The smaller and larger singular values are returned in that order. +// +// Dlas2 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlas2(f, g, h float64) (ssmin, ssmax float64) { + fa := math.Abs(f) + ga := math.Abs(g) + ha := math.Abs(h) + fhmin := math.Min(fa, ha) + fhmax := math.Max(fa, ha) + if fhmin == 0 { + if fhmax == 0 { + return 0, ga + } + v := math.Min(fhmax, ga) / math.Max(fhmax, ga) + return 0, math.Max(fhmax, ga) * math.Sqrt(1+v*v) + } + if ga < fhmax { + as := 1 + fhmin/fhmax + at := (fhmax - fhmin) / fhmax + au := (ga / fhmax) * (ga / fhmax) + c := 2 / (math.Sqrt(as*as+au) + math.Sqrt(at*at+au)) + return fhmin * c, fhmax / c + } + au := fhmax / ga + if au == 0 { + return fhmin * fhmax / ga, ga + } + as := 1 + fhmin/fhmax + at := (fhmax - fhmin) / fhmax + c := 1 / (math.Sqrt(1+(as*au)*(as*au)) + math.Sqrt(1+(at*au)*(at*au))) + return 2 * (fhmin * c) * au, ga / (c + c) +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlascl.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlascl.go new file mode 100644 index 0000000000..61c4eb79cb --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlascl.go @@ -0,0 +1,111 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/lapack" +) + +// Dlascl multiplies an m×n matrix by the scalar cto/cfrom. +// +// cfrom must not be zero, and cto and cfrom must not be NaN, otherwise Dlascl +// will panic. +// +// Dlascl is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlascl(kind lapack.MatrixType, kl, ku int, cfrom, cto float64, m, n int, a []float64, lda int) { + switch kind { + default: + panic(badMatrixType) + case 'H', 'B', 'Q', 'Z': // See dlascl.f. 
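+		// These are the Hessenberg and band matrix types accepted by the
+		// reference LAPACK dlascl; this port implements only the General,
+		// UpperTri and LowerTri kinds handled below.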
+ panic("not implemented") + case lapack.General, lapack.UpperTri, lapack.LowerTri: + if lda < max(1, n) { + panic(badLdA) + } + } + switch { + case cfrom == 0: + panic(zeroCFrom) + case math.IsNaN(cfrom): + panic(nanCFrom) + case math.IsNaN(cto): + panic(nanCTo) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + } + + if n == 0 || m == 0 { + return + } + + switch kind { + case lapack.General, lapack.UpperTri, lapack.LowerTri: + if len(a) < (m-1)*lda+n { + panic(shortA) + } + } + + smlnum := dlamchS + bignum := 1 / smlnum + cfromc := cfrom + ctoc := cto + cfrom1 := cfromc * smlnum + for { + var done bool + var mul, ctol float64 + if cfrom1 == cfromc { + // cfromc is inf. + mul = ctoc / cfromc + done = true + ctol = ctoc + } else { + ctol = ctoc / bignum + if ctol == ctoc { + // ctoc is either 0 or inf. + mul = ctoc + done = true + cfromc = 1 + } else if math.Abs(cfrom1) > math.Abs(ctoc) && ctoc != 0 { + mul = smlnum + done = false + cfromc = cfrom1 + } else if math.Abs(ctol) > math.Abs(cfromc) { + mul = bignum + done = false + ctoc = ctol + } else { + mul = ctoc / cfromc + done = true + } + } + switch kind { + case lapack.General: + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + a[i*lda+j] = a[i*lda+j] * mul + } + } + case lapack.UpperTri: + for i := 0; i < m; i++ { + for j := i; j < n; j++ { + a[i*lda+j] = a[i*lda+j] * mul + } + } + case lapack.LowerTri: + for i := 0; i < m; i++ { + for j := 0; j <= min(i, n-1); j++ { + a[i*lda+j] = a[i*lda+j] * mul + } + } + } + if done { + break + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaset.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaset.go new file mode 100644 index 0000000000..b8b6b0f4db --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaset.go @@ -0,0 +1,58 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "gonum.org/v1/gonum/blas" + +// Dlaset sets the off-diagonal elements of A to alpha, and the diagonal +// elements to beta. If uplo == blas.Upper, only the elements in the upper +// triangular part are set. If uplo == blas.Lower, only the elements in the +// lower triangular part are set. If uplo is otherwise, all of the elements of A +// are set. +// +// Dlaset is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlaset(uplo blas.Uplo, m, n int, alpha, beta float64, a []float64, lda int) { + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + minmn := min(m, n) + if minmn == 0 { + return + } + + if len(a) < (m-1)*lda+n { + panic(shortA) + } + + switch uplo { + case blas.Upper: + for i := 0; i < m; i++ { + for j := i + 1; j < n; j++ { + a[i*lda+j] = alpha + } + } + case blas.Lower: + for i := 0; i < m; i++ { + for j := 0; j < min(i, n); j++ { + a[i*lda+j] = alpha + } + } + default: + for i := 0; i < m; i++ { + for j := 0; j < n; j++ { + a[i*lda+j] = alpha + } + } + } + for i := 0; i < minmn; i++ { + a[i*lda+i] = beta + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq1.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq1.go new file mode 100644 index 0000000000..1f1d1dc42e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq1.go @@ -0,0 +1,100 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlasq1 computes the singular values of an n×n bidiagonal matrix with diagonal
+// d and off-diagonal e. On exit, d contains the singular values in decreasing
+// order, and e is overwritten. d must have length at least n, e must have
+// length at least n-1, and the input work must have length at least 4*n. Dlasq1
+// will panic if these conditions are not met.
+//
+// Dlasq1 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlasq1(n int, d, e, work []float64) (info int) {
+	if n < 0 {
+		panic(nLT0)
+	}
+
+	if n == 0 {
+		return info
+	}
+
+	switch {
+	case len(d) < n:
+		panic(shortD)
+	case len(e) < n-1:
+		panic(shortE)
+	case len(work) < 4*n:
+		panic(shortWork)
+	}
+
+	if n == 1 {
+		d[0] = math.Abs(d[0])
+		return info
+	}
+
+	if n == 2 {
+		d[1], d[0] = impl.Dlas2(d[0], e[0], d[1])
+		return info
+	}
+
+	// Estimate the largest singular value.
+	var sigmx float64
+	for i := 0; i < n-1; i++ {
+		d[i] = math.Abs(d[i])
+		sigmx = math.Max(sigmx, math.Abs(e[i]))
+	}
+	d[n-1] = math.Abs(d[n-1])
+	// Early return if sigmx is zero (matrix is already diagonal).
+	if sigmx == 0 {
+		impl.Dlasrt(lapack.SortDecreasing, n, d)
+		return info
+	}
+
+	for i := 0; i < n; i++ {
+		sigmx = math.Max(sigmx, d[i])
+	}
+
+	// Copy D and E into WORK (in the Z format) and scale (squaring the
+	// input data makes scaling by a power of the radix pointless).
+
+	eps := dlamchP
+	safmin := dlamchS
+	scale := math.Sqrt(eps / safmin)
+	bi := blas64.Implementation()
+	bi.Dcopy(n, d, 1, work, 2)
+	bi.Dcopy(n-1, e, 1, work[1:], 2)
+	impl.Dlascl(lapack.General, 0, 0, sigmx, scale, 2*n-1, 1, work, 1)
+
+	// Compute the q's and e's.
+	for i := 0; i < 2*n-1; i++ {
+		work[i] *= work[i]
+	}
+	work[2*n-1] = 0
+
+	info = impl.Dlasq2(n, work)
+	if info == 0 {
+		for i := 0; i < n; i++ {
+			d[i] = math.Sqrt(work[i])
+		}
+		impl.Dlascl(lapack.General, 0, 0, scale, sigmx, n, 1, d, 1)
+	} else if info == 2 {
+		// Maximum number of iterations exceeded. Move data from work
+		// into D and E so the calling subroutine can try to finish.
+		for i := 0; i < n; i++ {
+			d[i] = math.Sqrt(work[2*i])
+			e[i] = math.Sqrt(work[2*i+1])
+		}
+		impl.Dlascl(lapack.General, 0, 0, scale, sigmx, n, 1, d, 1)
+		impl.Dlascl(lapack.General, 0, 0, scale, sigmx, n, 1, e, 1)
+	}
+	return info
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq2.go
new file mode 100644
index 0000000000..e3870b1d96
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq2.go
@@ -0,0 +1,370 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlasq2 computes all the eigenvalues of the symmetric positive
+// definite tridiagonal matrix associated with the qd array Z. Eigenvalues
+// are computed to high relative accuracy avoiding denormalization, underflow
+// and overflow.
+//
+// To see the relation of Z to the tridiagonal matrix, let L be a
+// unit lower bidiagonal matrix with sub-diagonals Z(2,4,6,...) and
+// let U be an upper bidiagonal matrix with 1's above and diagonal
+// Z(1,3,5,...). The tridiagonal is L*U or, if you prefer, the
+// symmetric tridiagonal to which it is similar.
+//
+// info returns a status error. 
The return codes mean as follows: +// +// 0: The algorithm completed successfully. +// 1: A split was marked by a positive value in e. +// 2: Current block of Z not diagonalized after 100*n iterations (in inner +// while loop). On exit Z holds a qd array with the same eigenvalues as +// the given Z. +// 3: Termination criterion of outer while loop not met (program created more +// than N unreduced blocks). +// +// z must have length at least 4*n, and must not contain any negative elements. +// Dlasq2 will panic otherwise. +// +// Dlasq2 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlasq2(n int, z []float64) (info int) { + if n < 0 { + panic(nLT0) + } + + if n == 0 { + return info + } + + if len(z) < 4*n { + panic(shortZ) + } + + if n == 1 { + if z[0] < 0 { + panic(negZ) + } + return info + } + + const cbias = 1.5 + + eps := dlamchP + safmin := dlamchS + tol := eps * 100 + tol2 := tol * tol + if n == 2 { + if z[1] < 0 || z[2] < 0 { + panic(negZ) + } else if z[2] > z[0] { + z[0], z[2] = z[2], z[0] + } + z[4] = z[0] + z[1] + z[2] + if z[1] > z[2]*tol2 { + t := 0.5 * (z[0] - z[2] + z[1]) + s := z[2] * (z[1] / t) + if s <= t { + s = z[2] * (z[1] / (t * (1 + math.Sqrt(1+s/t)))) + } else { + s = z[2] * (z[1] / (t + math.Sqrt(t)*math.Sqrt(t+s))) + } + t = z[0] + s + z[1] + z[2] *= z[0] / t + z[0] = t + } + z[1] = z[2] + z[5] = z[1] + z[0] + return info + } + // Check for negative data and compute sums of q's and e's. + z[2*n-1] = 0 + emin := z[1] + var d, e, qmax float64 + var i1, n1 int + for k := 0; k < 2*(n-1); k += 2 { + if z[k] < 0 || z[k+1] < 0 { + panic(negZ) + } + d += z[k] + e += z[k+1] + qmax = math.Max(qmax, z[k]) + emin = math.Min(emin, z[k+1]) + } + if z[2*(n-1)] < 0 { + panic(negZ) + } + d += z[2*(n-1)] + // Check for diagonality. + if e == 0 { + for k := 1; k < n; k++ { + z[k] = z[2*k] + } + impl.Dlasrt(lapack.SortDecreasing, n, z) + z[2*(n-1)] = d + return info + } + trace := d + e + // Check for zero data. + if trace == 0 { + z[2*(n-1)] = 0 + return info + } + // Rearrange data for locality: Z=(q1,qq1,e1,ee1,q2,qq2,e2,ee2,...). + for k := 2 * n; k >= 2; k -= 2 { + z[2*k-1] = 0 + z[2*k-2] = z[k-1] + z[2*k-3] = 0 + z[2*k-4] = z[k-2] + } + i0 := 0 + n0 := n - 1 + + // Reverse the qd-array, if warranted. + // z[4*i0-3] --> z[4*(i0+1)-3-1] --> z[4*i0] + if cbias*z[4*i0] < z[4*n0] { + ipn4Out := 4 * (i0 + n0 + 2) + for i4loop := 4 * (i0 + 1); i4loop <= 2*(i0+n0+1); i4loop += 4 { + i4 := i4loop - 1 + ipn4 := ipn4Out - 1 + z[i4-3], z[ipn4-i4-4] = z[ipn4-i4-4], z[i4-3] + z[i4-1], z[ipn4-i4-6] = z[ipn4-i4-6], z[i4-1] + } + } + + // Initial split checking via dqd and Li's test. + pp := 0 + for k := 0; k < 2; k++ { + d = z[4*n0+pp] + for i4loop := 4*n0 + pp; i4loop >= 4*(i0+1)+pp; i4loop -= 4 { + i4 := i4loop - 1 + if z[i4-1] <= tol2*d { + z[i4-1] = math.Copysign(0, -1) + d = z[i4-3] + } else { + d = z[i4-3] * (d / (d + z[i4-1])) + } + } + // dqd maps Z to ZZ plus Li's test. 
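+		// Sketch of the pass below: with q's and e's interleaved in z, one
+		// unshifted dqd step computes q'_i = d_i + e_i,
+		// e'_i = e_i * (q_{i+1} / q'_i) and d_{i+1} = d_i * (q_{i+1} / q'_i),
+		// while Li's test flushes negligible e_i values to (negatively
+		// signed) zero.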
+ emin = z[4*(i0+1)+pp] + d = z[4*i0+pp] + for i4loop := 4*(i0+1) + pp; i4loop <= 4*n0+pp; i4loop += 4 { + i4 := i4loop - 1 + z[i4-2*pp-2] = d + z[i4-1] + if z[i4-1] <= tol2*d { + z[i4-1] = math.Copysign(0, -1) + z[i4-2*pp-2] = d + z[i4-2*pp] = 0 + d = z[i4+1] + } else if safmin*z[i4+1] < z[i4-2*pp-2] && safmin*z[i4-2*pp-2] < z[i4+1] { + tmp := z[i4+1] / z[i4-2*pp-2] + z[i4-2*pp] = z[i4-1] * tmp + d *= tmp + } else { + z[i4-2*pp] = z[i4+1] * (z[i4-1] / z[i4-2*pp-2]) + d = z[i4+1] * (d / z[i4-2*pp-2]) + } + emin = math.Min(emin, z[i4-2*pp]) + } + z[4*(n0+1)-pp-3] = d + + // Now find qmax. + qmax = z[4*(i0+1)-pp-3] + for i4loop := 4*(i0+1) - pp + 2; i4loop <= 4*(n0+1)+pp-2; i4loop += 4 { + i4 := i4loop - 1 + qmax = math.Max(qmax, z[i4]) + } + // Prepare for the next iteration on K. + pp = 1 - pp + } + + // Initialise variables to pass to DLASQ3. + var ttype int + var dmin1, dmin2, dn, dn1, dn2, g, tau float64 + var tempq float64 + iter := 2 + var nFail int + nDiv := 2 * (n0 - i0) + var i4 int +outer: + for iwhila := 1; iwhila <= n+1; iwhila++ { + // Test for completion. + if n0 < 0 { + // Move q's to the front. + for k := 1; k < n; k++ { + z[k] = z[4*k] + } + // Sort and compute sum of eigenvalues. + impl.Dlasrt(lapack.SortDecreasing, n, z) + e = 0 + for k := n - 1; k >= 0; k-- { + e += z[k] + } + // Store trace, sum(eigenvalues) and information on performance. + z[2*n] = trace + z[2*n+1] = e + z[2*n+2] = float64(iter) + z[2*n+3] = float64(nDiv) / float64(n*n) + z[2*n+4] = 100 * float64(nFail) / float64(iter) + return info + } + + // While array unfinished do + // e[n0] holds the value of sigma when submatrix in i0:n0 + // splits from the rest of the array, but is negated. + var desig float64 + var sigma float64 + if n0 != n-1 { + sigma = -z[4*(n0+1)-2] + } + if sigma < 0 { + info = 1 + return info + } + // Find last unreduced submatrix's top index i0, find qmax and + // emin. Find Gershgorin-type bound if Q's much greater than E's. + var emax float64 + if n0 > i0 { + emin = math.Abs(z[4*(n0+1)-6]) + } else { + emin = 0 + } + qmin := z[4*(n0+1)-4] + qmax = qmin + zSmall := false + for i4loop := 4 * (n0 + 1); i4loop >= 8; i4loop -= 4 { + i4 = i4loop - 1 + if z[i4-5] <= 0 { + zSmall = true + break + } + if qmin >= 4*emax { + qmin = math.Min(qmin, z[i4-3]) + emax = math.Max(emax, z[i4-5]) + } + qmax = math.Max(qmax, z[i4-7]+z[i4-5]) + emin = math.Min(emin, z[i4-5]) + } + if !zSmall { + i4 = 3 + } + i0 = (i4+1)/4 - 1 + pp = 0 + if n0-i0 > 1 { + dee := z[4*i0] + deemin := dee + kmin := i0 + for i4loop := 4*(i0+1) + 1; i4loop <= 4*(n0+1)-3; i4loop += 4 { + i4 := i4loop - 1 + dee = z[i4] * (dee / (dee + z[i4-2])) + if dee <= deemin { + deemin = dee + kmin = (i4+4)/4 - 1 + } + } + if (kmin-i0)*2 < n0-kmin && deemin <= 0.5*z[4*n0] { + ipn4Out := 4 * (i0 + n0 + 2) + pp = 2 + for i4loop := 4 * (i0 + 1); i4loop <= 2*(i0+n0+1); i4loop += 4 { + i4 := i4loop - 1 + ipn4 := ipn4Out - 1 + z[i4-3], z[ipn4-i4-4] = z[ipn4-i4-4], z[i4-3] + z[i4-2], z[ipn4-i4-3] = z[ipn4-i4-3], z[i4-2] + z[i4-1], z[ipn4-i4-6] = z[ipn4-i4-6], z[i4-1] + z[i4], z[ipn4-i4-5] = z[ipn4-i4-5], z[i4] + } + } + } + // Put -(initial shift) into DMIN. + dmin := -math.Max(0, qmin-2*math.Sqrt(qmin)*math.Sqrt(emax)) + + // Now i0:n0 is unreduced. + // PP = 0 for ping, PP = 1 for pong. + // PP = 2 indicates that flipping was applied to the Z array and + // that the tests for deflation upon entry in Dlasq3 should + // not be performed. 
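+		// nbig caps the inner dqds loop at 100 iterations per eigenvalue of
+		// the current unreduced block; if the block is not diagonalized
+		// within this budget, control falls through to the info == 2
+		// recovery code below.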
+ nbig := 100 * (n0 - i0 + 1) + for iwhilb := 0; iwhilb < nbig; iwhilb++ { + if i0 > n0 { + continue outer + } + + // While submatrix unfinished take a good dqds step. + i0, n0, pp, dmin, sigma, desig, qmax, nFail, iter, nDiv, ttype, dmin1, dmin2, dn, dn1, dn2, g, tau = + impl.Dlasq3(i0, n0, z, pp, dmin, sigma, desig, qmax, nFail, iter, nDiv, ttype, dmin1, dmin2, dn, dn1, dn2, g, tau) + + pp = 1 - pp + // When emin is very small check for splits. + if pp == 0 && n0-i0 >= 3 { + if z[4*(n0+1)-1] <= tol2*qmax || z[4*(n0+1)-2] <= tol2*sigma { + splt := i0 - 1 + qmax = z[4*i0] + emin = z[4*(i0+1)-2] + oldemn := z[4*(i0+1)-1] + for i4loop := 4 * (i0 + 1); i4loop <= 4*(n0-2); i4loop += 4 { + i4 := i4loop - 1 + if z[i4] <= tol2*z[i4-3] || z[i4-1] <= tol2*sigma { + z[i4-1] = -sigma + splt = i4 / 4 + qmax = 0 + emin = z[i4+3] + oldemn = z[i4+4] + } else { + qmax = math.Max(qmax, z[i4+1]) + emin = math.Min(emin, z[i4-1]) + oldemn = math.Min(oldemn, z[i4]) + } + } + z[4*(n0+1)-2] = emin + z[4*(n0+1)-1] = oldemn + i0 = splt + 1 + } + } + } + // Maximum number of iterations exceeded, restore the shift + // sigma and place the new d's and e's in a qd array. + // This might need to be done for several blocks. + info = 2 + i1 = i0 + for { + tempq = z[4*i0] + z[4*i0] += sigma + for k := i0 + 1; k <= n0; k++ { + tempe := z[4*(k+1)-6] + z[4*(k+1)-6] *= tempq / z[4*(k+1)-8] + tempq = z[4*k] + z[4*k] += sigma + tempe - z[4*(k+1)-6] + } + // Prepare to do this on the previous block if there is one. + if i1 <= 0 { + break + } + n1 = i1 - 1 + for i1 >= 1 && z[4*(i1+1)-6] >= 0 { + i1 -= 1 + } + sigma = -z[4*(n1+1)-2] + } + for k := 0; k < n; k++ { + z[2*k] = z[4*k] + // Only the block 1..N0 is unfinished. The rest of the e's + // must be essentially zero, although sometimes other data + // has been stored in them. + if k < n0 { + z[2*(k+1)-1] = z[4*(k+1)-1] + } else { + z[2*(k+1)] = 0 + } + } + return info + } + info = 3 + return info +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq3.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq3.go new file mode 100644 index 0000000000..a05e94ef17 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq3.go @@ -0,0 +1,172 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "math" + +// Dlasq3 checks for deflation, computes a shift (tau) and calls dqds. +// In case of failure it changes shifts, and tries again until output +// is positive. +// +// Dlasq3 is an internal routine. It is exported for testing purposes. 
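+//
+// i0 and n0 are zero-based indices bounding the current unreduced block of
+// the qd array z. The remaining arguments thread the solver state between
+// successive calls from Dlasq2 -- the accumulated shift sigma, the counters
+// nFail, iter and nDiv, the shift type ttype, and the previous d values --
+// and all of them are returned updated.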
+func (impl Implementation) Dlasq3(i0, n0 int, z []float64, pp int, dmin, sigma, desig, qmax float64, nFail, iter, nDiv int, ttype int, dmin1, dmin2, dn, dn1, dn2, g, tau float64) ( + i0Out, n0Out, ppOut int, dminOut, sigmaOut, desigOut, qmaxOut float64, nFailOut, iterOut, nDivOut, ttypeOut int, dmin1Out, dmin2Out, dnOut, dn1Out, dn2Out, gOut, tauOut float64) { + switch { + case i0 < 0: + panic(i0LT0) + case n0 < 0: + panic(n0LT0) + case len(z) < 4*n0: + panic(shortZ) + case pp != 0 && pp != 1 && pp != 2: + panic(badPp) + } + + const cbias = 1.5 + + n0in := n0 + eps := dlamchP + tol := eps * 100 + tol2 := tol * tol + var nn int + var t float64 + for { + if n0 < i0 { + return i0, n0, pp, dmin, sigma, desig, qmax, nFail, iter, nDiv, ttype, dmin1, dmin2, dn, dn1, dn2, g, tau + } + if n0 == i0 { + z[4*(n0+1)-4] = z[4*(n0+1)+pp-4] + sigma + n0-- + continue + } + nn = 4*(n0+1) + pp - 1 + if n0 != i0+1 { + // Check whether e[n0-1] is negligible, 1 eigenvalue. + if z[nn-5] > tol2*(sigma+z[nn-3]) && z[nn-2*pp-4] > tol2*z[nn-7] { + // Check whether e[n0-2] is negligible, 2 eigenvalues. + if z[nn-9] > tol2*sigma && z[nn-2*pp-8] > tol2*z[nn-11] { + break + } + } else { + z[4*(n0+1)-4] = z[4*(n0+1)+pp-4] + sigma + n0-- + continue + } + } + if z[nn-3] > z[nn-7] { + z[nn-3], z[nn-7] = z[nn-7], z[nn-3] + } + t = 0.5 * (z[nn-7] - z[nn-3] + z[nn-5]) + if z[nn-5] > z[nn-3]*tol2 && t != 0 { + s := z[nn-3] * (z[nn-5] / t) + if s <= t { + s = z[nn-3] * (z[nn-5] / (t * (1 + math.Sqrt(1+s/t)))) + } else { + s = z[nn-3] * (z[nn-5] / (t + math.Sqrt(t)*math.Sqrt(t+s))) + } + t = z[nn-7] + (s + z[nn-5]) + z[nn-3] *= z[nn-7] / t + z[nn-7] = t + } + z[4*(n0+1)-8] = z[nn-7] + sigma + z[4*(n0+1)-4] = z[nn-3] + sigma + n0 -= 2 + } + if pp == 2 { + pp = 0 + } + + // Reverse the qd-array, if warranted. + if dmin <= 0 || n0 < n0in { + if cbias*z[4*(i0+1)+pp-4] < z[4*(n0+1)+pp-4] { + ipn4Out := 4 * (i0 + n0 + 2) + for j4loop := 4 * (i0 + 1); j4loop <= 2*((i0+1)+(n0+1)-1); j4loop += 4 { + ipn4 := ipn4Out - 1 + j4 := j4loop - 1 + + z[j4-3], z[ipn4-j4-4] = z[ipn4-j4-4], z[j4-3] + z[j4-2], z[ipn4-j4-3] = z[ipn4-j4-3], z[j4-2] + z[j4-1], z[ipn4-j4-6] = z[ipn4-j4-6], z[j4-1] + z[j4], z[ipn4-j4-5] = z[ipn4-j4-5], z[j4] + } + if n0-i0 <= 4 { + z[4*(n0+1)+pp-2] = z[4*(i0+1)+pp-2] + z[4*(n0+1)-pp-1] = z[4*(i0+1)-pp-1] + } + dmin2 = math.Min(dmin2, z[4*(i0+1)-pp-2]) + z[4*(n0+1)+pp-2] = math.Min(math.Min(z[4*(n0+1)+pp-2], z[4*(i0+1)+pp-2]), z[4*(i0+1)+pp+2]) + z[4*(n0+1)-pp-1] = math.Min(math.Min(z[4*(n0+1)-pp-1], z[4*(i0+1)-pp-1]), z[4*(i0+1)-pp+3]) + qmax = math.Max(math.Max(qmax, z[4*(i0+1)+pp-4]), z[4*(i0+1)+pp]) + dmin = math.Copysign(0, -1) // Fortran code has -zero, but -0 in go is 0 + } + } + + // Choose a shift. + tau, ttype, g = impl.Dlasq4(i0, n0, z, pp, n0in, dmin, dmin1, dmin2, dn, dn1, dn2, tau, ttype, g) + + // Call dqds until dmin > 0. +loop: + for { + i0, n0, pp, tau, sigma, dmin, dmin1, dmin2, dn, dn1, dn2 = impl.Dlasq5(i0, n0, z, pp, tau, sigma) + + nDiv += n0 - i0 + 2 + iter++ + switch { + case dmin >= 0 && dmin1 >= 0: + // Success. + goto done + + case dmin < 0 && dmin1 > 0 && z[4*n0-pp-1] < tol*(sigma+dn1) && math.Abs(dn) < tol*sigma: + // Convergence hidden by negative dn. + z[4*n0-pp+1] = 0 + dmin = 0 + goto done + + case dmin < 0: + // Tau too big. Select new Tau and try again. + nFail++ + if ttype < -22 { + // Failed twice. Play it safe. + tau = 0 + } else if dmin1 > 0 { + // Late failure. Gives excellent shift. + tau = (tau + dmin) * (1 - 2*eps) + ttype -= 11 + } else { + // Early failure. Divide by 4. 
+ tau = tau / 4 + ttype -= 12 + } + + case math.IsNaN(dmin): + if tau == 0 { + break loop + } + tau = 0 + + default: + // Possible underflow. Play it safe. + break loop + } + } + + // Risk of underflow. + dmin, dmin1, dmin2, dn, dn1, dn2 = impl.Dlasq6(i0, n0, z, pp) + nDiv += n0 - i0 + 2 + iter++ + tau = 0 + +done: + if tau < sigma { + desig += tau + t = sigma + desig + desig -= t - sigma + } else { + t = sigma + tau + desig += sigma - (t - tau) + } + sigma = t + return i0, n0, pp, dmin, sigma, desig, qmax, nFail, iter, nDiv, ttype, dmin1, dmin2, dn, dn1, dn2, g, tau +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq4.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq4.go new file mode 100644 index 0000000000..f6dbb31b98 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq4.go @@ -0,0 +1,249 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "math" + +// Dlasq4 computes an approximation to the smallest eigenvalue using values of d +// from the previous transform. +// i0, n0, and n0in are zero-indexed. +// +// Dlasq4 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlasq4(i0, n0 int, z []float64, pp int, n0in int, dmin, dmin1, dmin2, dn, dn1, dn2, tau float64, ttype int, g float64) (tauOut float64, ttypeOut int, gOut float64) { + switch { + case i0 < 0: + panic(i0LT0) + case n0 < 0: + panic(n0LT0) + case len(z) < 4*n0: + panic(shortZ) + case pp != 0 && pp != 1: + panic(badPp) + } + + const ( + cnst1 = 0.563 + cnst2 = 1.01 + cnst3 = 1.05 + + cnstthird = 0.333 // TODO(btracey): Fix? + ) + // A negative dmin forces the shift to take that absolute value + // ttype records the type of shift. + if dmin <= 0 { + tau = -dmin + ttype = -1 + return tau, ttype, g + } + nn := 4*(n0+1) + pp - 1 // -1 for zero indexing + s := math.NaN() // Poison s so that failure to take a path below is obvious + if n0in == n0 { + // No eigenvalues deflated. + if dmin == dn || dmin == dn1 { + b1 := math.Sqrt(z[nn-3]) * math.Sqrt(z[nn-5]) + b2 := math.Sqrt(z[nn-7]) * math.Sqrt(z[nn-9]) + a2 := z[nn-7] + z[nn-5] + if dmin == dn && dmin1 == dn1 { + gap2 := dmin2 - a2 - dmin2/4 + var gap1 float64 + if gap2 > 0 && gap2 > b2 { + gap1 = a2 - dn - (b2/gap2)*b2 + } else { + gap1 = a2 - dn - (b1 + b2) + } + if gap1 > 0 && gap1 > b1 { + s = math.Max(dn-(b1/gap1)*b1, 0.5*dmin) + ttype = -2 + } else { + s = 0 + if dn > b1 { + s = dn - b1 + } + if a2 > b1+b2 { + s = math.Min(s, a2-(b1+b2)) + } + s = math.Max(s, cnstthird*dmin) + ttype = -3 + } + } else { + ttype = -4 + s = dmin / 4 + var gam float64 + var np int + if dmin == dn { + gam = dn + a2 = 0 + if z[nn-5] > z[nn-7] { + return tau, ttype, g + } + b2 = z[nn-5] / z[nn-7] + np = nn - 9 + } else { + np = nn - 2*pp + gam = dn1 + if z[np-4] > z[np-2] { + return tau, ttype, g + } + a2 = z[np-4] / z[np-2] + if z[nn-9] > z[nn-11] { + return tau, ttype, g + } + b2 = z[nn-9] / z[nn-11] + np = nn - 13 + } + // Approximate contribution to norm squared from i < nn-1. + a2 += b2 + for i4loop := np + 1; i4loop >= 4*(i0+1)-1+pp; i4loop -= 4 { + i4 := i4loop - 1 + if b2 == 0 { + break + } + b1 = b2 + if z[i4] > z[i4-2] { + return tau, ttype, g + } + b2 *= z[i4] / z[i4-2] + a2 += b2 + if 100*math.Max(b2, b1) < a2 || cnst1 < a2 { + break + } + } + a2 *= cnst3 + // Rayleigh quotient residual bound. 
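+				// Here a2 bounds the squared residual of gam as an
+				// approximate eigenvalue; when the bound is small enough
+				// (a2 < cnst1), this sharper shift replaces the dmin/4
+				// default chosen above.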
+ if a2 < cnst1 { + s = gam * (1 - math.Sqrt(a2)) / (1 + a2) + } + } + } else if dmin == dn2 { + ttype = -5 + s = dmin / 4 + // Compute contribution to norm squared from i > nn-2. + np := nn - 2*pp + b1 := z[np-2] + b2 := z[np-6] + gam := dn2 + if z[np-8] > b2 || z[np-4] > b1 { + return tau, ttype, g + } + a2 := (z[np-8] / b2) * (1 + z[np-4]/b1) + // Approximate contribution to norm squared from i < nn-2. + if n0-i0 > 2 { + b2 = z[nn-13] / z[nn-15] + a2 += b2 + for i4loop := (nn + 1) - 17; i4loop >= 4*(i0+1)-1+pp; i4loop -= 4 { + i4 := i4loop - 1 + if b2 == 0 { + break + } + b1 = b2 + if z[i4] > z[i4-2] { + return tau, ttype, g + } + b2 *= z[i4] / z[i4-2] + a2 += b2 + if 100*math.Max(b2, b1) < a2 || cnst1 < a2 { + break + } + } + a2 *= cnst3 + } + if a2 < cnst1 { + s = gam * (1 - math.Sqrt(a2)) / (1 + a2) + } + } else { + // Case 6, no information to guide us. + if ttype == -6 { + g += cnstthird * (1 - g) + } else if ttype == -18 { + g = cnstthird / 4 + } else { + g = 1.0 / 4 + } + s = g * dmin + ttype = -6 + } + } else if n0in == (n0 + 1) { + // One eigenvalue just deflated. Use DMIN1, DN1 for DMIN and DN. + if dmin1 == dn1 && dmin2 == dn2 { + ttype = -7 + s = cnstthird * dmin1 + if z[nn-5] > z[nn-7] { + return tau, ttype, g + } + b1 := z[nn-5] / z[nn-7] + b2 := b1 + if b2 != 0 { + for i4loop := 4*(n0+1) - 9 + pp; i4loop >= 4*(i0+1)-1+pp; i4loop -= 4 { + i4 := i4loop - 1 + a2 := b1 + if z[i4] > z[i4-2] { + return tau, ttype, g + } + b1 *= z[i4] / z[i4-2] + b2 += b1 + if 100*math.Max(b1, a2) < b2 { + break + } + } + } + b2 = math.Sqrt(cnst3 * b2) + a2 := dmin1 / (1 + b2*b2) + gap2 := 0.5*dmin2 - a2 + if gap2 > 0 && gap2 > b2*a2 { + s = math.Max(s, a2*(1-cnst2*a2*(b2/gap2)*b2)) + } else { + s = math.Max(s, a2*(1-cnst2*b2)) + ttype = -8 + } + } else { + s = dmin1 / 4 + if dmin1 == dn1 { + s = 0.5 * dmin1 + } + ttype = -9 + } + } else if n0in == (n0 + 2) { + // Two eigenvalues deflated. Use DMIN2, DN2 for DMIN and DN. + if dmin2 == dn2 && 2*z[nn-5] < z[nn-7] { + ttype = -10 + s = cnstthird * dmin2 + if z[nn-5] > z[nn-7] { + return tau, ttype, g + } + b1 := z[nn-5] / z[nn-7] + b2 := b1 + if b2 != 0 { + for i4loop := 4*(n0+1) - 9 + pp; i4loop >= 4*(i0+1)-1+pp; i4loop -= 4 { + i4 := i4loop - 1 + if z[i4] > z[i4-2] { + return tau, ttype, g + } + b1 *= z[i4] / z[i4-2] + b2 += b1 + if 100*b1 < b2 { + break + } + } + } + b2 = math.Sqrt(cnst3 * b2) + a2 := dmin2 / (1 + b2*b2) + gap2 := z[nn-7] + z[nn-9] - math.Sqrt(z[nn-11])*math.Sqrt(z[nn-9]) - a2 + if gap2 > 0 && gap2 > b2*a2 { + s = math.Max(s, a2*(1-cnst2*a2*(b2/gap2)*b2)) + } else { + s = math.Max(s, a2*(1-cnst2*b2)) + } + } else { + s = dmin2 / 4 + ttype = -11 + } + } else if n0in > n0+2 { + // Case 12, more than two eigenvalues deflated. No information. + s = 0 + ttype = -12 + } + tau = s + return tau, ttype, g +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq5.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq5.go new file mode 100644 index 0000000000..d3826d9186 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq5.go @@ -0,0 +1,140 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "math" + +// Dlasq5 computes one dqds transform in ping-pong form. +// i0 and n0 are zero-indexed. +// +// Dlasq5 is an internal routine. It is exported for testing purposes. 
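+//
+// One call advances the factorization by a single shifted dqds step: the qd
+// array z is read in the layout selected by pp (the ping-pong storage) and
+// written back in the other layout with the shift tau folded into the
+// running d values. The returned dmin, dmin1, dmin2, dn, dnm1 and dnm2 feed
+// the next shift selection in Dlasq4.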
+func (impl Implementation) Dlasq5(i0, n0 int, z []float64, pp int, tau, sigma float64) (i0Out, n0Out, ppOut int, tauOut, sigmaOut, dmin, dmin1, dmin2, dn, dnm1, dnm2 float64) { + // The lapack function has inputs for ieee and eps, but Go requires ieee so + // these are unnecessary. + + switch { + case i0 < 0: + panic(i0LT0) + case n0 < 0: + panic(n0LT0) + case len(z) < 4*n0: + panic(shortZ) + case pp != 0 && pp != 1: + panic(badPp) + } + + if n0-i0-1 <= 0 { + return i0, n0, pp, tau, sigma, dmin, dmin1, dmin2, dn, dnm1, dnm2 + } + + eps := dlamchP + dthresh := eps * (sigma + tau) + if tau < dthresh*0.5 { + tau = 0 + } + var j4 int + var emin float64 + if tau != 0 { + j4 = 4*i0 + pp + emin = z[j4+4] + d := z[j4] - tau + dmin = d + // In the reference there are code paths that actually return this value. + // dmin1 = -z[j4] + if pp == 0 { + for j4loop := 4 * (i0 + 1); j4loop <= 4*((n0+1)-3); j4loop += 4 { + j4 := j4loop - 1 + z[j4-2] = d + z[j4-1] + tmp := z[j4+1] / z[j4-2] + d = d*tmp - tau + dmin = math.Min(dmin, d) + z[j4] = z[j4-1] * tmp + emin = math.Min(z[j4], emin) + } + } else { + for j4loop := 4 * (i0 + 1); j4loop <= 4*((n0+1)-3); j4loop += 4 { + j4 := j4loop - 1 + z[j4-3] = d + z[j4] + tmp := z[j4+2] / z[j4-3] + d = d*tmp - tau + dmin = math.Min(dmin, d) + z[j4-1] = z[j4] * tmp + emin = math.Min(z[j4-1], emin) + } + } + // Unroll the last two steps. + dnm2 = d + dmin2 = dmin + j4 = 4*((n0+1)-2) - pp - 1 + j4p2 := j4 + 2*pp - 1 + z[j4-2] = dnm2 + z[j4p2] + z[j4] = z[j4p2+2] * (z[j4p2] / z[j4-2]) + dnm1 = z[j4p2+2]*(dnm2/z[j4-2]) - tau + dmin = math.Min(dmin, dnm1) + + dmin1 = dmin + j4 += 4 + j4p2 = j4 + 2*pp - 1 + z[j4-2] = dnm1 + z[j4p2] + z[j4] = z[j4p2+2] * (z[j4p2] / z[j4-2]) + dn = z[j4p2+2]*(dnm1/z[j4-2]) - tau + dmin = math.Min(dmin, dn) + } else { + // This is the version that sets d's to zero if they are small enough. + j4 = 4*(i0+1) + pp - 4 + emin = z[j4+4] + d := z[j4] - tau + dmin = d + // In the reference there are code paths that actually return this value. + // dmin1 = -z[j4] + if pp == 0 { + for j4loop := 4 * (i0 + 1); j4loop <= 4*((n0+1)-3); j4loop += 4 { + j4 := j4loop - 1 + z[j4-2] = d + z[j4-1] + tmp := z[j4+1] / z[j4-2] + d = d*tmp - tau + if d < dthresh { + d = 0 + } + dmin = math.Min(dmin, d) + z[j4] = z[j4-1] * tmp + emin = math.Min(z[j4], emin) + } + } else { + for j4loop := 4 * (i0 + 1); j4loop <= 4*((n0+1)-3); j4loop += 4 { + j4 := j4loop - 1 + z[j4-3] = d + z[j4] + tmp := z[j4+2] / z[j4-3] + d = d*tmp - tau + if d < dthresh { + d = 0 + } + dmin = math.Min(dmin, d) + z[j4-1] = z[j4] * tmp + emin = math.Min(z[j4-1], emin) + } + } + // Unroll the last two steps. + dnm2 = d + dmin2 = dmin + j4 = 4*((n0+1)-2) - pp - 1 + j4p2 := j4 + 2*pp - 1 + z[j4-2] = dnm2 + z[j4p2] + z[j4] = z[j4p2+2] * (z[j4p2] / z[j4-2]) + dnm1 = z[j4p2+2]*(dnm2/z[j4-2]) - tau + dmin = math.Min(dmin, dnm1) + + dmin1 = dmin + j4 += 4 + j4p2 = j4 + 2*pp - 1 + z[j4-2] = dnm1 + z[j4p2] + z[j4] = z[j4p2+2] * (z[j4p2] / z[j4-2]) + dn = z[j4p2+2]*(dnm1/z[j4-2]) - tau + dmin = math.Min(dmin, dn) + } + z[j4+2] = dn + z[4*(n0+1)-pp-1] = emin + return i0, n0, pp, tau, sigma, dmin, dmin1, dmin2, dn, dnm1, dnm2 +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq6.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq6.go new file mode 100644 index 0000000000..54bf587562 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq6.go @@ -0,0 +1,118 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "math" + +// Dlasq6 computes one dqd transform in ping-pong form with protection against +// overflow and underflow. z has length at least 4*(n0+1) and holds the qd array. +// i0 is the zero-based first index. +// n0 is the zero-based last index. +// +// Dlasq6 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlasq6(i0, n0 int, z []float64, pp int) (dmin, dmin1, dmin2, dn, dnm1, dnm2 float64) { + switch { + case i0 < 0: + panic(i0LT0) + case n0 < 0: + panic(n0LT0) + case len(z) < 4*n0: + panic(shortZ) + case pp != 0 && pp != 1: + panic(badPp) + } + + if n0-i0-1 <= 0 { + return dmin, dmin1, dmin2, dn, dnm1, dnm2 + } + + safmin := dlamchS + j4 := 4*(i0+1) + pp - 4 // -4 rather than -3 for zero indexing + emin := z[j4+4] + d := z[j4] + dmin = d + if pp == 0 { + for j4loop := 4 * (i0 + 1); j4loop <= 4*((n0+1)-3); j4loop += 4 { + j4 := j4loop - 1 // Translate back to zero-indexed. + z[j4-2] = d + z[j4-1] + if z[j4-2] == 0 { + z[j4] = 0 + d = z[j4+1] + dmin = d + emin = 0 + } else if safmin*z[j4+1] < z[j4-2] && safmin*z[j4-2] < z[j4+1] { + tmp := z[j4+1] / z[j4-2] + z[j4] = z[j4-1] * tmp + d *= tmp + } else { + z[j4] = z[j4+1] * (z[j4-1] / z[j4-2]) + d = z[j4+1] * (d / z[j4-2]) + } + dmin = math.Min(dmin, d) + emin = math.Min(emin, z[j4]) + } + } else { + for j4loop := 4 * (i0 + 1); j4loop <= 4*((n0+1)-3); j4loop += 4 { + j4 := j4loop - 1 + z[j4-3] = d + z[j4] + if z[j4-3] == 0 { + z[j4-1] = 0 + d = z[j4+2] + dmin = d + emin = 0 + } else if safmin*z[j4+2] < z[j4-3] && safmin*z[j4-3] < z[j4+2] { + tmp := z[j4+2] / z[j4-3] + z[j4-1] = z[j4] * tmp + d *= tmp + } else { + z[j4-1] = z[j4+2] * (z[j4] / z[j4-3]) + d = z[j4+2] * (d / z[j4-3]) + } + dmin = math.Min(dmin, d) + emin = math.Min(emin, z[j4-1]) + } + } + // Unroll last two steps. + dnm2 = d + dmin2 = dmin + j4 = 4*(n0-1) - pp - 1 + j4p2 := j4 + 2*pp - 1 + z[j4-2] = dnm2 + z[j4p2] + if z[j4-2] == 0 { + z[j4] = 0 + dnm1 = z[j4p2+2] + dmin = dnm1 + emin = 0 + } else if safmin*z[j4p2+2] < z[j4-2] && safmin*z[j4-2] < z[j4p2+2] { + tmp := z[j4p2+2] / z[j4-2] + z[j4] = z[j4p2] * tmp + dnm1 = dnm2 * tmp + } else { + z[j4] = z[j4p2+2] * (z[j4p2] / z[j4-2]) + dnm1 = z[j4p2+2] * (dnm2 / z[j4-2]) + } + dmin = math.Min(dmin, dnm1) + dmin1 = dmin + j4 += 4 + j4p2 = j4 + 2*pp - 1 + z[j4-2] = dnm1 + z[j4p2] + if z[j4-2] == 0 { + z[j4] = 0 + dn = z[j4p2+2] + dmin = dn + emin = 0 + } else if safmin*z[j4p2+2] < z[j4-2] && safmin*z[j4-2] < z[j4p2+2] { + tmp := z[j4p2+2] / z[j4-2] + z[j4] = z[j4p2] * tmp + dn = dnm1 * tmp + } else { + z[j4] = z[j4p2+2] * (z[j4p2] / z[j4-2]) + dn = z[j4p2+2] * (dnm1 / z[j4-2]) + } + dmin = math.Min(dmin, dn) + z[j4+2] = dn + z[4*(n0+1)-pp-1] = emin + return dmin, dmin1, dmin2, dn, dnm1, dnm2 +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasr.go new file mode 100644 index 0000000000..3aab41f8e3 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasr.go @@ -0,0 +1,287 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/lapack" +) + +// Dlasr applies a sequence of plane rotations to the m×n matrix A. This series +// of plane rotations is implicitly represented by a matrix P. 
P is applied to A
+// on the side given by side -- A = P * A if side == lapack.Left,
+// A = A * Pᵀ if side == lapack.Right.
+//
+// The exact value of P depends on the value of pivot, but in all cases P is
+// implicitly represented by a series of 2×2 rotation matrices. The entries of
+// rotation matrix k are defined by s[k] and c[k]
+//
+//	R(k) = [ c[k] s[k]]
+//	       [-s[k] c[k]]
+//
+// If direct == lapack.Forward, the rotation matrices are applied as
+// P = P(z-1) * ... * P(2) * P(1), while if direct == lapack.Backward they are
+// applied as P = P(1) * P(2) * ... * P(z-1).
+//
+// pivot defines the mapping of the elements in R(k) to P(k).
+// If pivot == lapack.Variable, the rotation is performed for the (k, k+1) plane.
+//
+//	P(k) = [1                        ]
+//	       [    ...                  ]
+//	       [         1               ]
+//	       [           c[k]  s[k]    ]
+//	       [          -s[k]  c[k]    ]
+//	       [                      1  ]
+//	       [                    ...  ]
+//	       [                        1]
+//
+// if pivot == lapack.Top, the rotation is performed for the (1, k+1) plane,
+//
+//	P(k) = [c[k]        s[k]         ]
+//	       [     1                   ]
+//	       [       ...               ]
+//	       [           1             ]
+//	       [-s[k]       c[k]         ]
+//	       [                  1      ]
+//	       [                    ...  ]
+//	       [                        1]
+//
+// and if pivot == lapack.Bottom, the rotation is performed for the (k, z) plane.
+//
+//	P(k) = [1                        ]
+//	       [  ...                    ]
+//	       [      1                  ]
+//	       [        c[k]        s[k] ]
+//	       [              1          ]
+//	       [                ...      ]
+//	       [                    1    ]
+//	       [       -s[k]        c[k] ]
+//
+// s and c have length m - 1 if side == blas.Left, and n - 1 if side == blas.Right.
+//
+// Dlasr is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlasr(side blas.Side, pivot lapack.Pivot, direct lapack.Direct, m, n int, c, s, a []float64, lda int) {
+	switch {
+	case side != blas.Left && side != blas.Right:
+		panic(badSide)
+	case pivot != lapack.Variable && pivot != lapack.Top && pivot != lapack.Bottom:
+		panic(badPivot)
+	case direct != lapack.Forward && direct != lapack.Backward:
+		panic(badDirect)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	if side == blas.Left {
+		if len(c) < m-1 {
+			panic(shortC)
+		}
+		if len(s) < m-1 {
+			panic(shortS)
+		}
+	} else {
+		if len(c) < n-1 {
+			panic(shortC)
+		}
+		if len(s) < n-1 {
+			panic(shortS)
+		}
+	}
+	if len(a) < (m-1)*lda+n {
+		panic(shortA)
+	}
+
+	if side == blas.Left {
+		if pivot == lapack.Variable {
+			if direct == lapack.Forward {
+				for j := 0; j < m-1; j++ {
+					ctmp := c[j]
+					stmp := s[j]
+					if ctmp != 1 || stmp != 0 {
+						for i := 0; i < n; i++ {
+							tmp2 := a[j*lda+i]
+							tmp := a[(j+1)*lda+i]
+							a[(j+1)*lda+i] = ctmp*tmp - stmp*tmp2
+							a[j*lda+i] = stmp*tmp + ctmp*tmp2
+						}
+					}
+				}
+				return
+			}
+			for j := m - 2; j >= 0; j-- {
+				ctmp := c[j]
+				stmp := s[j]
+				if ctmp != 1 || stmp != 0 {
+					for i := 0; i < n; i++ {
+						tmp2 := a[j*lda+i]
+						tmp := a[(j+1)*lda+i]
+						a[(j+1)*lda+i] = ctmp*tmp - stmp*tmp2
+						a[j*lda+i] = stmp*tmp + ctmp*tmp2
+					}
+				}
+			}
+			return
+		} else if pivot == lapack.Top {
+			if direct == lapack.Forward {
+				for j := 1; j < m; j++ {
+					ctmp := c[j-1]
+					stmp := s[j-1]
+					if ctmp != 1 || stmp != 0 {
+						for i := 0; i < n; i++ {
+							tmp := a[j*lda+i]
+							tmp2 := a[i]
+							a[j*lda+i] = ctmp*tmp - stmp*tmp2
+							a[i] = stmp*tmp + ctmp*tmp2
+						}
+					}
+				}
+				return
+			}
+			for j := m - 1; j >= 1; j-- {
+				ctmp := c[j-1]
+				stmp := s[j-1]
+				if ctmp != 1 || stmp != 0 {
+					for i := 0; i < n; i++ {
+						tmp := a[j*lda+i]
+						tmp2 := a[i]
+						a[j*lda+i] = ctmp*tmp - stmp*tmp2
+						a[i] = stmp*tmp + ctmp*tmp2
+					}
+				}
+			}
+			return
+		}
+		if direct == lapack.Forward {
+			for j := 0; j < m-1; j++ {
+				ctmp := c[j]
+				stmp := s[j]
+				if ctmp != 1 || stmp != 0 {
+					for i := 0; i < n; i++ {
+						tmp := a[j*lda+i]
+						tmp2 := a[(m-1)*lda+i]
+						a[j*lda+i] = stmp*tmp2 + ctmp*tmp
+						a[(m-1)*lda+i] = ctmp*tmp2 - stmp*tmp
+					}
+				}
+			}
+			return
+		}
+		for j := m - 2; j >= 0; j-- {
+			ctmp := c[j]
+			stmp := s[j]
+			if ctmp != 1 || stmp != 0 {
+				for i := 0; i < n; i++ {
+					tmp := a[j*lda+i]
+					tmp2 := a[(m-1)*lda+i]
+					a[j*lda+i] = stmp*tmp2 + ctmp*tmp
+					a[(m-1)*lda+i] = ctmp*tmp2 - stmp*tmp
+				}
+			}
+		}
+		return
+	}
+	if pivot == lapack.Variable {
+		if direct == lapack.Forward {
+			for j := 0; j < n-1; j++ {
+				ctmp := c[j]
+				stmp := s[j]
+				if ctmp != 1 || stmp != 0 {
+					for i := 0; i < m; i++ {
+						tmp := a[i*lda+j+1]
+						tmp2 := a[i*lda+j]
+						a[i*lda+j+1] = ctmp*tmp - stmp*tmp2
+						a[i*lda+j] = stmp*tmp + ctmp*tmp2
+					}
+				}
+			}
+			return
+		}
+		for j := n - 2; j >= 0; j-- {
+			ctmp := c[j]
+			stmp := s[j]
+			if ctmp != 1 || stmp != 0 {
+				for i := 0; i < m; i++ {
+					tmp := a[i*lda+j+1]
+					tmp2 := a[i*lda+j]
+					a[i*lda+j+1] = ctmp*tmp - stmp*tmp2
+					a[i*lda+j] = stmp*tmp + ctmp*tmp2
+				}
+			}
+		}
+		return
+	} else if pivot == lapack.Top {
+		if direct == lapack.Forward {
+			for j := 1; j < n; j++ {
+				ctmp := c[j-1]
+				stmp := s[j-1]
+				if ctmp != 1 || stmp != 0 {
+					for i := 0; i < m; i++ {
+						tmp := a[i*lda+j]
+						tmp2 := a[i*lda]
+						a[i*lda+j] = ctmp*tmp - stmp*tmp2
+						a[i*lda] = stmp*tmp + ctmp*tmp2
+					}
+				}
+			}
+			return
+		}
+		for j := n - 1; j >= 1; j-- {
+			ctmp := c[j-1]
+			stmp := s[j-1]
+			if ctmp != 1 || stmp != 0 {
+				for i := 0; i < m; i++ {
+					tmp := a[i*lda+j]
+					tmp2 := a[i*lda]
+					a[i*lda+j] = ctmp*tmp - stmp*tmp2
+					a[i*lda] = stmp*tmp + ctmp*tmp2
+				}
+			}
+		}
+		return
+	}
+	if direct == lapack.Forward {
+		for j := 0; j < n-1; j++ {
+			ctmp := c[j]
+			stmp := s[j]
+			if ctmp != 1 || stmp != 0 {
+				for i := 0; i < m; i++ {
+					tmp := a[i*lda+j]
+					tmp2 := a[i*lda+n-1]
+					a[i*lda+j] = stmp*tmp2 + ctmp*tmp
+					a[i*lda+n-1] = ctmp*tmp2 - stmp*tmp
+				}
+			}
+		}
+		
return + } + for j := n - 2; j >= 0; j-- { + ctmp := c[j] + stmp := s[j] + if ctmp != 1 || stmp != 0 { + for i := 0; i < m; i++ { + tmp := a[i*lda+j] + tmp2 := a[i*lda+n-1] + a[i*lda+j] = stmp*tmp2 + ctmp*tmp + a[i*lda+n-1] = ctmp*tmp2 - stmp*tmp + } + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasrt.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasrt.go new file mode 100644 index 0000000000..be472805bf --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasrt.go @@ -0,0 +1,36 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "sort" + + "gonum.org/v1/gonum/lapack" +) + +// Dlasrt sorts the numbers in the input slice d. If s == lapack.SortIncreasing, +// the elements are sorted in increasing order. If s == lapack.SortDecreasing, +// the elements are sorted in decreasing order. For other values of s Dlasrt +// will panic. +// +// Dlasrt is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlasrt(s lapack.Sort, n int, d []float64) { + switch { + case n < 0: + panic(nLT0) + case len(d) < n: + panic(shortD) + } + + d = d[:n] + switch s { + default: + panic(badSort) + case lapack.SortIncreasing: + sort.Float64s(d) + case lapack.SortDecreasing: + sort.Sort(sort.Reverse(sort.Float64Slice(d))) + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlassq.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlassq.go new file mode 100644 index 0000000000..8f8cda8789 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlassq.go @@ -0,0 +1,131 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "math" + +// Dlassq updates a sum of squares represented in scaled form. Dlassq returns +// the values scl and smsq such that +// +// scl^2*smsq = X[0]^2 + ... + X[n-1]^2 + scale^2*sumsq +// +// The value of sumsq is assumed to be non-negative. +// +// Dlassq is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlassq(n int, x []float64, incx int, scale float64, sumsq float64) (scl, smsq float64) { + // Implementation based on Supplemental Material to: + // Edward Anderson. 2017. Algorithm 978: Safe Scaling in the Level 1 BLAS. + // ACM Trans. Math. Softw. 44, 1, Article 12 (July 2017), 28 pages. 
+	// DOI: https://doi.org/10.1145/3061665
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case incx <= 0:
+		panic(badIncX)
+	case len(x) < 1+(n-1)*incx:
+		panic(shortX)
+	}
+
+	if math.IsNaN(scale) || math.IsNaN(sumsq) {
+		return scale, sumsq
+	}
+
+	if sumsq == 0 {
+		scale = 1
+	}
+	if scale == 0 {
+		scale = 1
+		sumsq = 0
+	}
+
+	if n == 0 {
+		return scale, sumsq
+	}
+
+	// Compute the sum of squares in 3 accumulators:
+	//  - abig: sum of squares scaled down to avoid overflow
+	//  - asml: sum of squares scaled up to avoid underflow
+	//  - amed: sum of squares that do not require scaling
+	// The thresholds and multipliers are:
+	//  - values bigger than dtbig are scaled down by dsbig
+	//  - values smaller than dtsml are scaled up by dssml
+	var (
+		isBig            bool
+		asml, amed, abig float64
+	)
+	for i, ix := 0, 0; i < n; i++ {
+		ax := math.Abs(x[ix])
+		switch {
+		case ax > dtbig:
+			ax *= dsbig
+			abig += ax * ax
+			isBig = true
+		case ax < dtsml:
+			if !isBig {
+				ax *= dssml
+				asml += ax * ax
+			}
+		default:
+			amed += ax * ax
+		}
+		ix += incx
+	}
+	// Put the existing sum of squares into one of the accumulators.
+	if sumsq > 0 {
+		ax := scale * math.Sqrt(sumsq)
+		switch {
+		case ax > dtbig:
+			if scale > 1 {
+				scale *= dsbig
+				abig += scale * (scale * sumsq)
+			} else {
+				// sumsq > dtbig^2 => (dsbig * (dsbig * sumsq)) is representable.
+				abig += scale * (scale * (dsbig * (dsbig * sumsq)))
+			}
+		case ax < dtsml:
+			if !isBig {
+				if scale < 1 {
+					scale *= dssml
+					asml += scale * (scale * sumsq)
+				} else {
+					// sumsq < dtsml^2 => (dssml * (dssml * sumsq)) is representable.
+					asml += scale * (scale * (dssml * (dssml * sumsq)))
+				}
+			}
+		default:
+			amed += scale * (scale * sumsq)
+		}
+	}
+	// Combine abig and amed or amed and asml if more than one accumulator was
+	// used.
+	switch {
+	case abig > 0:
+		// Combine abig and amed:
+		if amed > 0 || math.IsNaN(amed) {
+			abig += (amed * dsbig) * dsbig
+		}
+		scale = 1 / dsbig
+		sumsq = abig
+	case asml > 0:
+		// Combine amed and asml:
+		if amed > 0 || math.IsNaN(amed) {
+			amed = math.Sqrt(amed)
+			asml = math.Sqrt(asml) / dssml
+			ymin, ymax := asml, amed
+			if asml > amed {
+				ymin, ymax = amed, asml
+			}
+			scale = 1
+			sumsq = ymax * ymax * (1 + (ymin/ymax)*(ymin/ymax))
+		} else {
+			scale = 1 / dssml
+			sumsq = asml
+		}
+	default:
+		scale = 1
+		sumsq = amed
+	}
+	return scale, sumsq
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasv2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasv2.go
new file mode 100644
index 0000000000..cc7ceea0b8
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasv2.go
@@ -0,0 +1,117 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlasv2 computes the singular value decomposition of a 2×2 matrix.
+//
+//	[ csl snl] [f g] [csr -snr] = [ssmax     0]
+//	[-snl csl] [0 h] [snr  csr] = [    0 ssmin]
+//
+// ssmax is the larger absolute singular value, and ssmin is the smaller absolute
+// singular value. [csl, snl] and [csr, snr] are the left and right singular vectors.
+//
+// Dlasv2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlasv2(f, g, h float64) (ssmin, ssmax, snr, csr, snl, csl float64) {
+	ft := f
+	fa := math.Abs(ft)
+	ht := h
+	ha := math.Abs(h)
+	// pmax points to the largest element of the matrix in terms of absolute value.
+	// 1 if F, 2 if G, 3 if H.
+ pmax := 1 + swap := ha > fa + if swap { + pmax = 3 + ft, ht = ht, ft + fa, ha = ha, fa + } + gt := g + ga := math.Abs(gt) + var clt, crt, slt, srt float64 + if ga == 0 { + ssmin = ha + ssmax = fa + clt = 1 + crt = 1 + slt = 0 + srt = 0 + } else { + gasmall := true + if ga > fa { + pmax = 2 + if (fa / ga) < dlamchE { + gasmall = false + ssmax = ga + if ha > 1 { + ssmin = fa / (ga / ha) + } else { + ssmin = (fa / ga) * ha + } + clt = 1 + slt = ht / gt + srt = 1 + crt = ft / gt + } + } + if gasmall { + d := fa - ha + l := d / fa + if d == fa { // deal with inf + l = 1 + } + m := gt / ft + t := 2 - l + s := math.Hypot(t, m) + var r float64 + if l == 0 { + r = math.Abs(m) + } else { + r = math.Hypot(l, m) + } + a := 0.5 * (s + r) + ssmin = ha / a + ssmax = fa * a + if m == 0 { + if l == 0 { + t = math.Copysign(2, ft) * math.Copysign(1, gt) + } else { + t = gt/math.Copysign(d, ft) + m/t + } + } else { + t = (m/(s+t) + m/(r+l)) * (1 + a) + } + l = math.Hypot(t, 2) + crt = 2 / l + srt = t / l + clt = (crt + srt*m) / a + slt = (ht / ft) * srt / a + } + } + if swap { + csl = srt + snl = crt + csr = slt + snr = clt + } else { + csl = clt + snl = slt + csr = crt + snr = srt + } + var tsign float64 + switch pmax { + case 1: + tsign = math.Copysign(1, csr) * math.Copysign(1, csl) * math.Copysign(1, f) + case 2: + tsign = math.Copysign(1, snr) * math.Copysign(1, csl) * math.Copysign(1, g) + case 3: + tsign = math.Copysign(1, snr) * math.Copysign(1, snl) * math.Copysign(1, h) + } + ssmax = math.Copysign(ssmax, tsign) + ssmin = math.Copysign(ssmin, tsign*math.Copysign(1, f)*math.Copysign(1, h)) + return ssmin, ssmax, snr, csr, snl, csl +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaswp.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaswp.go new file mode 100644 index 0000000000..88600ac17b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaswp.go @@ -0,0 +1,58 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "gonum.org/v1/gonum/blas/blas64" + +// Dlaswp swaps the rows k1 to k2 of a rectangular matrix A according to the +// indices in ipiv so that row k is swapped with ipiv[k]. +// +// n is the number of columns of A and incX is the increment for ipiv. If incX +// is 1, the swaps are applied from k1 to k2. If incX is -1, the swaps are +// applied in reverse order from k2 to k1. For other values of incX Dlaswp will +// panic. ipiv must have length k2+1, otherwise Dlaswp will panic. +// +// The indices k1, k2, and the elements of ipiv are zero-based. +// +// Dlaswp is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlaswp(n int, a []float64, lda int, k1, k2 int, ipiv []int, incX int) { + switch { + case n < 0: + panic(nLT0) + case k1 < 0: + panic(badK1) + case k2 < k1: + panic(badK2) + case lda < max(1, n): + panic(badLdA) + case len(a) < k2*lda+n: // A must have at least k2+1 rows. 
+ panic(shortA) + case len(ipiv) != k2+1: + panic(badLenIpiv) + case incX != 1 && incX != -1: + panic(absIncNotOne) + } + + if n == 0 { + return + } + + bi := blas64.Implementation() + if incX == 1 { + for k := k1; k <= k2; k++ { + if k == ipiv[k] { + continue + } + bi.Dswap(n, a[k*lda:], 1, a[ipiv[k]*lda:], 1) + } + return + } + for k := k2; k >= k1; k-- { + if k == ipiv[k] { + continue + } + bi.Dswap(n, a[k*lda:], 1, a[ipiv[k]*lda:], 1) + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasy2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasy2.go new file mode 100644 index 0000000000..160b68b84a --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasy2.go @@ -0,0 +1,292 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas/blas64" +) + +// Dlasy2 solves the Sylvester matrix equation where the matrices are of order 1 +// or 2. It computes the unknown n1×n2 matrix X so that +// +// TL*X + sgn*X*TR = scale*B if tranl == false and tranr == false, +// TLᵀ*X + sgn*X*TR = scale*B if tranl == true and tranr == false, +// TL*X + sgn*X*TRᵀ = scale*B if tranl == false and tranr == true, +// TLᵀ*X + sgn*X*TRᵀ = scale*B if tranl == true and tranr == true, +// +// where TL is n1×n1, TR is n2×n2, B is n1×n2, and 1 <= n1,n2 <= 2. +// +// isgn must be 1 or -1, and n1 and n2 must be 0, 1, or 2, but these conditions +// are not checked. +// +// Dlasy2 returns three values, a scale factor that is chosen less than or equal +// to 1 to prevent the solution overflowing, the infinity norm of the solution, +// and an indicator of success. If ok is false, TL and TR have eigenvalues that +// are too close, so TL or TR is perturbed to get a non-singular equation. +// +// Dlasy2 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlasy2(tranl, tranr bool, isgn, n1, n2 int, tl []float64, ldtl int, tr []float64, ldtr int, b []float64, ldb int, x []float64, ldx int) (scale, xnorm float64, ok bool) { + // TODO(vladimir-ch): Add input validation checks conditionally skipped + // using the build tag mechanism. + + ok = true + // Quick return if possible. + if n1 == 0 || n2 == 0 { + return scale, xnorm, ok + } + + // Set constants to control overflow. + eps := dlamchP + smlnum := dlamchS / eps + sgn := float64(isgn) + + if n1 == 1 && n2 == 1 { + // 1×1 case: TL11*X + sgn*X*TR11 = B11. + tau1 := tl[0] + sgn*tr[0] + bet := math.Abs(tau1) + if bet <= smlnum { + tau1 = smlnum + bet = smlnum + ok = false + } + scale = 1 + gam := math.Abs(b[0]) + if smlnum*gam > bet { + scale = 1 / gam + } + x[0] = b[0] * scale / tau1 + xnorm = math.Abs(x[0]) + return scale, xnorm, ok + } + + if n1+n2 == 3 { + // 1×2 or 2×1 case. + var ( + smin float64 + tmp [4]float64 // tmp is used as a 2×2 row-major matrix. + btmp [2]float64 + ) + if n1 == 1 && n2 == 2 { + // 1×2 case: TL11*[X11 X12] + sgn*[X11 X12]*op[TR11 TR12] = [B11 B12]. 
+ // [TR21 TR22] + smin = math.Abs(tl[0]) + smin = math.Max(smin, math.Max(math.Abs(tr[0]), math.Abs(tr[1]))) + smin = math.Max(smin, math.Max(math.Abs(tr[ldtr]), math.Abs(tr[ldtr+1]))) + smin = math.Max(eps*smin, smlnum) + tmp[0] = tl[0] + sgn*tr[0] + tmp[3] = tl[0] + sgn*tr[ldtr+1] + if tranr { + tmp[1] = sgn * tr[1] + tmp[2] = sgn * tr[ldtr] + } else { + tmp[1] = sgn * tr[ldtr] + tmp[2] = sgn * tr[1] + } + btmp[0] = b[0] + btmp[1] = b[1] + } else { + // 2×1 case: op[TL11 TL12]*[X11] + sgn*[X11]*TR11 = [B11]. + // [TL21 TL22]*[X21] [X21] [B21] + smin = math.Abs(tr[0]) + smin = math.Max(smin, math.Max(math.Abs(tl[0]), math.Abs(tl[1]))) + smin = math.Max(smin, math.Max(math.Abs(tl[ldtl]), math.Abs(tl[ldtl+1]))) + smin = math.Max(eps*smin, smlnum) + tmp[0] = tl[0] + sgn*tr[0] + tmp[3] = tl[ldtl+1] + sgn*tr[0] + if tranl { + tmp[1] = tl[ldtl] + tmp[2] = tl[1] + } else { + tmp[1] = tl[1] + tmp[2] = tl[ldtl] + } + btmp[0] = b[0] + btmp[1] = b[ldb] + } + + // Solve 2×2 system using complete pivoting. + // Set pivots less than smin to smin. + + bi := blas64.Implementation() + ipiv := bi.Idamax(len(tmp), tmp[:], 1) + // Compute the upper triangular matrix [u11 u12]. + // [ 0 u22] + u11 := tmp[ipiv] + if math.Abs(u11) <= smin { + ok = false + u11 = smin + } + locu12 := [4]int{1, 0, 3, 2} // Index in tmp of the element on the same row as the pivot. + u12 := tmp[locu12[ipiv]] + locl21 := [4]int{2, 3, 0, 1} // Index in tmp of the element on the same column as the pivot. + l21 := tmp[locl21[ipiv]] / u11 + locu22 := [4]int{3, 2, 1, 0} // Index in tmp of the remaining element. + u22 := tmp[locu22[ipiv]] - l21*u12 + if math.Abs(u22) <= smin { + ok = false + u22 = smin + } + if ipiv&0x2 != 0 { // true for ipiv equal to 2 and 3. + // The pivot was in the second row, swap the elements of + // the right-hand side. + btmp[0], btmp[1] = btmp[1], btmp[0]-l21*btmp[1] + } else { + btmp[1] -= l21 * btmp[0] + } + scale = 1 + if 2*smlnum*math.Abs(btmp[1]) > math.Abs(u22) || 2*smlnum*math.Abs(btmp[0]) > math.Abs(u11) { + scale = 0.5 / math.Max(math.Abs(btmp[0]), math.Abs(btmp[1])) + btmp[0] *= scale + btmp[1] *= scale + } + // Solve the system [u11 u12] [x21] = [ btmp[0] ]. + // [ 0 u22] [x22] [ btmp[1] ] + x22 := btmp[1] / u22 + x21 := btmp[0]/u11 - (u12/u11)*x22 + if ipiv&0x1 != 0 { // true for ipiv equal to 1 and 3. + // The pivot was in the second column, swap the elements + // of the solution. + x21, x22 = x22, x21 + } + x[0] = x21 + if n1 == 1 { + x[1] = x22 + xnorm = math.Abs(x[0]) + math.Abs(x[1]) + } else { + x[ldx] = x22 + xnorm = math.Max(math.Abs(x[0]), math.Abs(x[ldx])) + } + return scale, xnorm, ok + } + + // 2×2 case: op[TL11 TL12]*[X11 X12] + SGN*[X11 X12]*op[TR11 TR12] = [B11 B12]. + // [TL21 TL22] [X21 X22] [X21 X22] [TR21 TR22] [B21 B22] + // + // Solve equivalent 4×4 system using complete pivoting. + // Set pivots less than smin to smin. 
+ + smin := math.Max(math.Abs(tr[0]), math.Abs(tr[1])) + smin = math.Max(smin, math.Max(math.Abs(tr[ldtr]), math.Abs(tr[ldtr+1]))) + smin = math.Max(smin, math.Max(math.Abs(tl[0]), math.Abs(tl[1]))) + smin = math.Max(smin, math.Max(math.Abs(tl[ldtl]), math.Abs(tl[ldtl+1]))) + smin = math.Max(eps*smin, smlnum) + + var t [4][4]float64 + t[0][0] = tl[0] + sgn*tr[0] + t[1][1] = tl[0] + sgn*tr[ldtr+1] + t[2][2] = tl[ldtl+1] + sgn*tr[0] + t[3][3] = tl[ldtl+1] + sgn*tr[ldtr+1] + if tranl { + t[0][2] = tl[ldtl] + t[1][3] = tl[ldtl] + t[2][0] = tl[1] + t[3][1] = tl[1] + } else { + t[0][2] = tl[1] + t[1][3] = tl[1] + t[2][0] = tl[ldtl] + t[3][1] = tl[ldtl] + } + if tranr { + t[0][1] = sgn * tr[1] + t[1][0] = sgn * tr[ldtr] + t[2][3] = sgn * tr[1] + t[3][2] = sgn * tr[ldtr] + } else { + t[0][1] = sgn * tr[ldtr] + t[1][0] = sgn * tr[1] + t[2][3] = sgn * tr[ldtr] + t[3][2] = sgn * tr[1] + } + + var btmp [4]float64 + btmp[0] = b[0] + btmp[1] = b[1] + btmp[2] = b[ldb] + btmp[3] = b[ldb+1] + + // Perform elimination. + var jpiv [4]int // jpiv records any column swaps for pivoting. + for i := 0; i < 3; i++ { + var ( + xmax float64 + ipsv, jpsv int + ) + for ip := i; ip < 4; ip++ { + for jp := i; jp < 4; jp++ { + if math.Abs(t[ip][jp]) >= xmax { + xmax = math.Abs(t[ip][jp]) + ipsv = ip + jpsv = jp + } + } + } + if ipsv != i { + // The pivot is not in the top row of the unprocessed + // block, swap rows ipsv and i of t and btmp. + t[ipsv], t[i] = t[i], t[ipsv] + btmp[ipsv], btmp[i] = btmp[i], btmp[ipsv] + } + if jpsv != i { + // The pivot is not in the left column of the + // unprocessed block, swap columns jpsv and i of t. + for k := 0; k < 4; k++ { + t[k][jpsv], t[k][i] = t[k][i], t[k][jpsv] + } + } + jpiv[i] = jpsv + if math.Abs(t[i][i]) < smin { + ok = false + t[i][i] = smin + } + for k := i + 1; k < 4; k++ { + t[k][i] /= t[i][i] + btmp[k] -= t[k][i] * btmp[i] + for j := i + 1; j < 4; j++ { + t[k][j] -= t[k][i] * t[i][j] + } + } + } + if math.Abs(t[3][3]) < smin { + ok = false + t[3][3] = smin + } + scale = 1 + if 8*smlnum*math.Abs(btmp[0]) > math.Abs(t[0][0]) || + 8*smlnum*math.Abs(btmp[1]) > math.Abs(t[1][1]) || + 8*smlnum*math.Abs(btmp[2]) > math.Abs(t[2][2]) || + 8*smlnum*math.Abs(btmp[3]) > math.Abs(t[3][3]) { + + maxbtmp := math.Max(math.Abs(btmp[0]), math.Abs(btmp[1])) + maxbtmp = math.Max(maxbtmp, math.Max(math.Abs(btmp[2]), math.Abs(btmp[3]))) + scale = (1.0 / 8.0) / maxbtmp + btmp[0] *= scale + btmp[1] *= scale + btmp[2] *= scale + btmp[3] *= scale + } + // Compute the solution of the upper triangular system t * tmp = btmp. + var tmp [4]float64 + for i := 3; i >= 0; i-- { + temp := 1 / t[i][i] + tmp[i] = btmp[i] * temp + for j := i + 1; j < 4; j++ { + tmp[i] -= temp * t[i][j] * tmp[j] + } + } + for i := 2; i >= 0; i-- { + if jpiv[i] != i { + tmp[i], tmp[jpiv[i]] = tmp[jpiv[i]], tmp[i] + } + } + x[0] = tmp[0] + x[1] = tmp[1] + x[ldx] = tmp[2] + x[ldx+1] = tmp[3] + xnorm = math.Max(math.Abs(tmp[0])+math.Abs(tmp[1]), math.Abs(tmp[2])+math.Abs(tmp[3])) + return scale, xnorm, ok +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlatbs.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlatbs.go new file mode 100644 index 0000000000..e0e809cf90 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlatbs.go @@ -0,0 +1,454 @@ +// Copyright ©2019 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
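The 1×1 branch of Dlasy2 above makes its calling convention concrete. A usage sketch (illustrative only; it assumes the vendored import path added by this patch):

	package main

	import (
		"fmt"

		"gonum.org/v1/gonum/lapack/gonum"
	)

	func main() {
		var impl gonum.Implementation
		// Solve TL*X + sgn*X*TR = scale*B in the 1×1 case:
		// 2*x + 1*x*3 = scale*10, so x = 2 with scale = 1.
		tl := []float64{2}
		tr := []float64{3}
		b := []float64{10}
		x := make([]float64, 1)
		scale, xnorm, ok := impl.Dlasy2(false, false, 1, 1, 1, tl, 1, tr, 1, b, 1, x, 1)
		fmt.Println(x[0], scale, xnorm, ok) // 2 1 2 true
	}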
+ +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dlatbs solves a triangular banded system of equations +// +// A * x = s*b if trans == blas.NoTrans +// Aᵀ * x = s*b if trans == blas.Trans or blas.ConjTrans +// +// where A is an upper or lower triangular band matrix, x and b are n-element +// vectors, and s is a scaling factor chosen so that the components of x will be +// less than the overflow threshold. +// +// On entry, x contains the right-hand side b of the triangular system. +// On return, x is overwritten by the solution vector x. +// +// normin specifies whether the cnorm parameter contains the column norms of A on +// entry. If it is true, cnorm[j] contains the norm of the off-diagonal part of +// the j-th column of A. If it is false, the norms will be computed and stored +// in cnorm. +// +// Dlatbs returns the scaling factor s for the triangular system. If the matrix +// A is singular (A[j,j]==0 for some j), then scale is set to 0 and a +// non-trivial solution to A*x = 0 is returned. +// +// Dlatbs is an internal routine. It is exported for testing purposes. +func (Implementation) Dlatbs(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, normin bool, n, kd int, ab []float64, ldab int, x, cnorm []float64) (scale float64) { + noTran := trans == blas.NoTrans + switch { + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case !noTran && trans != blas.Trans && trans != blas.ConjTrans: + panic(badTrans) + case diag != blas.NonUnit && diag != blas.Unit: + panic(badDiag) + case n < 0: + panic(nLT0) + case kd < 0: + panic(kdLT0) + case ldab < kd+1: + panic(badLdA) + } + + // Quick return if possible. + if n == 0 { + return 1 + } + + switch { + case len(ab) < (n-1)*ldab+kd+1: + panic(shortAB) + case len(x) < n: + panic(shortX) + case len(cnorm) < n: + panic(shortCNorm) + } + + // Parameters to control overflow. + smlnum := dlamchS / dlamchP + bignum := 1 / smlnum + + bi := blas64.Implementation() + kld := max(1, ldab-1) + if !normin { + // Compute the 1-norm of each column, not including the diagonal. + if uplo == blas.Upper { + for j := 0; j < n; j++ { + jlen := min(j, kd) + if jlen > 0 { + cnorm[j] = bi.Dasum(jlen, ab[(j-jlen)*ldab+jlen:], kld) + } else { + cnorm[j] = 0 + } + } + } else { + for j := 0; j < n; j++ { + jlen := min(n-j-1, kd) + if jlen > 0 { + cnorm[j] = bi.Dasum(jlen, ab[(j+1)*ldab+kd-1:], kld) + } else { + cnorm[j] = 0 + } + } + } + } + + // Set up indices and increments for loops below. + var ( + jFirst, jLast, jInc int + maind int + ) + if noTran { + if uplo == blas.Upper { + jFirst = n - 1 + jLast = -1 + jInc = -1 + maind = 0 + } else { + jFirst = 0 + jLast = n + jInc = 1 + maind = kd + } + } else { + if uplo == blas.Upper { + jFirst = 0 + jLast = n + jInc = 1 + maind = 0 + } else { + jFirst = n - 1 + jLast = -1 + jInc = -1 + maind = kd + } + } + + // Scale the column norms by tscal if the maximum element in cnorm is + // greater than bignum. + tmax := cnorm[bi.Idamax(n, cnorm, 1)] + tscal := 1.0 + if tmax > bignum { + tscal = 1 / (smlnum * tmax) + bi.Dscal(n, tscal, cnorm, 1) + } + + // Compute a bound on the computed solution vector to see if the Level 2 + // BLAS routine Dtbsv can be used. + + xMax := math.Abs(x[bi.Idamax(n, x, 1)]) + xBnd := xMax + grow := 0.0 + // Compute the growth only if the maximum element in cnorm is NOT greater + // than bignum. + if tscal != 1 { + goto skipComputeGrow + } + if noTran { + // Compute the growth in A * x = b. 
+ if diag == blas.NonUnit { + // A is non-unit triangular. + // + // Compute grow = 1/G_j and xBnd = 1/M_j. + // Initially, G_0 = max{x(i), i=1,...,n}. + grow = 1 / math.Max(xBnd, smlnum) + xBnd = grow + for j := jFirst; j != jLast; j += jInc { + if grow <= smlnum { + // Exit the loop because the growth factor is too small. + goto skipComputeGrow + } + // M_j = G_{j-1} / abs(A[j,j]) + tjj := math.Abs(ab[j*ldab+maind]) + xBnd = math.Min(xBnd, math.Min(1, tjj)*grow) + if tjj+cnorm[j] >= smlnum { + // G_j = G_{j-1}*( 1 + cnorm[j] / abs(A[j,j]) ) + grow *= tjj / (tjj + cnorm[j]) + } else { + // G_j could overflow, set grow to 0. + grow = 0 + } + } + grow = xBnd + } else { + // A is unit triangular. + // + // Compute grow = 1/G_j, where G_0 = max{x(i), i=1,...,n}. + grow = math.Min(1, 1/math.Max(xBnd, smlnum)) + for j := jFirst; j != jLast; j += jInc { + if grow <= smlnum { + // Exit the loop because the growth factor is too small. + goto skipComputeGrow + } + // G_j = G_{j-1}*( 1 + cnorm[j] ) + grow /= 1 + cnorm[j] + } + } + } else { + // Compute the growth in Aᵀ * x = b. + if diag == blas.NonUnit { + // A is non-unit triangular. + // + // Compute grow = 1/G_j and xBnd = 1/M_j. + // Initially, G_0 = max{x(i), i=1,...,n}. + grow = 1 / math.Max(xBnd, smlnum) + xBnd = grow + for j := jFirst; j != jLast; j += jInc { + if grow <= smlnum { + // Exit the loop because the growth factor is too small. + goto skipComputeGrow + } + // G_j = max( G_{j-1}, M_{j-1}*( 1 + cnorm[j] ) ) + xj := 1 + cnorm[j] + grow = math.Min(grow, xBnd/xj) + // M_j = M_{j-1}*( 1 + cnorm[j] ) / abs(A[j,j]) + tjj := math.Abs(ab[j*ldab+maind]) + if xj > tjj { + xBnd *= tjj / xj + } + } + grow = math.Min(grow, xBnd) + } else { + // A is unit triangular. + // + // Compute grow = 1/G_j, where G_0 = max{x(i), i=1,...,n}. + grow = math.Min(1, 1/math.Max(xBnd, smlnum)) + for j := jFirst; j != jLast; j += jInc { + if grow <= smlnum { + // Exit the loop because the growth factor is too small. + goto skipComputeGrow + } + // G_j = G_{j-1}*( 1 + cnorm[j] ) + grow /= 1 + cnorm[j] + } + } + } +skipComputeGrow: + + if grow*tscal > smlnum { + // The reciprocal of the bound on elements of X is not too small, use + // the Level 2 BLAS solve. + bi.Dtbsv(uplo, trans, diag, n, kd, ab, ldab, x, 1) + // Scale the column norms by 1/tscal for return. + if tscal != 1 { + bi.Dscal(n, 1/tscal, cnorm, 1) + } + return 1 + } + + // Use a Level 1 BLAS solve, scaling intermediate results. + + scale = 1 + if xMax > bignum { + // Scale x so that its components are less than or equal to bignum in + // absolute value. + scale = bignum / xMax + bi.Dscal(n, scale, x, 1) + xMax = bignum + } + + if noTran { + // Solve A * x = b. + for j := jFirst; j != jLast; j += jInc { + // Compute x[j] = b[j] / A[j,j], scaling x if necessary. + xj := math.Abs(x[j]) + tjjs := tscal + if diag == blas.NonUnit { + tjjs *= ab[j*ldab+maind] + } + tjj := math.Abs(tjjs) + switch { + case tjj > smlnum: + // smlnum < abs(A[j,j]) + if tjj < 1 && xj > tjj*bignum { + // Scale x by 1/b[j]. + rec := 1 / xj + bi.Dscal(n, rec, x, 1) + scale *= rec + xMax *= rec + } + x[j] /= tjjs + xj = math.Abs(x[j]) + case tjj > 0: + // 0 < abs(A[j,j]) <= smlnum + if xj > tjj*bignum { + // Scale x by (1/abs(x[j]))*abs(A[j,j])*bignum to avoid + // overflow when dividing by A[j,j]. + rec := tjj * bignum / xj + if cnorm[j] > 1 { + // Scale by 1/cnorm[j] to avoid overflow when + // multiplying x[j] times column j. 
+ rec /= cnorm[j] + } + bi.Dscal(n, rec, x, 1) + scale *= rec + xMax *= rec + } + x[j] /= tjjs + xj = math.Abs(x[j]) + default: + // A[j,j] == 0: Set x[0:n] = 0, x[j] = 1, and scale = 0, and + // compute a solution to A*x = 0. + for i := range x[:n] { + x[i] = 0 + } + x[j] = 1 + xj = 1 + scale = 0 + xMax = 0 + } + + // Scale x if necessary to avoid overflow when adding a multiple of + // column j of A. + switch { + case xj > 1: + rec := 1 / xj + if cnorm[j] > (bignum-xMax)*rec { + // Scale x by 1/(2*abs(x[j])). + rec *= 0.5 + bi.Dscal(n, rec, x, 1) + scale *= rec + } + case xj*cnorm[j] > bignum-xMax: + // Scale x by 1/2. + bi.Dscal(n, 0.5, x, 1) + scale *= 0.5 + } + + if uplo == blas.Upper { + if j > 0 { + // Compute the update + // x[max(0,j-kd):j] := x[max(0,j-kd):j] - x[j] * A[max(0,j-kd):j,j] + jlen := min(j, kd) + if jlen > 0 { + bi.Daxpy(jlen, -x[j]*tscal, ab[(j-jlen)*ldab+jlen:], kld, x[j-jlen:], 1) + } + i := bi.Idamax(j, x, 1) + xMax = math.Abs(x[i]) + } + } else if j < n-1 { + // Compute the update + // x[j+1:min(j+kd,n)] := x[j+1:min(j+kd,n)] - x[j] * A[j+1:min(j+kd,n),j] + jlen := min(kd, n-j-1) + if jlen > 0 { + bi.Daxpy(jlen, -x[j]*tscal, ab[(j+1)*ldab+kd-1:], kld, x[j+1:], 1) + } + i := j + 1 + bi.Idamax(n-j-1, x[j+1:], 1) + xMax = math.Abs(x[i]) + } + } + } else { + // Solve Aᵀ * x = b. + for j := jFirst; j != jLast; j += jInc { + // Compute x[j] = b[j] - sum A[k,j]*x[k]. + // k!=j + xj := math.Abs(x[j]) + tjjs := tscal + if diag == blas.NonUnit { + tjjs *= ab[j*ldab+maind] + } + tjj := math.Abs(tjjs) + rec := 1 / math.Max(1, xMax) + uscal := tscal + if cnorm[j] > (bignum-xj)*rec { + // If x[j] could overflow, scale x by 1/(2*xMax). + rec *= 0.5 + if tjj > 1 { + // Divide by A[j,j] when scaling x if A[j,j] > 1. + rec = math.Min(1, rec*tjj) + uscal /= tjjs + } + if rec < 1 { + bi.Dscal(n, rec, x, 1) + scale *= rec + xMax *= rec + } + } + + var sumj float64 + if uscal == 1 { + // If the scaling needed for A in the dot product is 1, call + // Ddot to perform the dot product... + if uplo == blas.Upper { + jlen := min(j, kd) + if jlen > 0 { + sumj = bi.Ddot(jlen, ab[(j-jlen)*ldab+jlen:], kld, x[j-jlen:], 1) + } + } else { + jlen := min(n-j-1, kd) + if jlen > 0 { + sumj = bi.Ddot(jlen, ab[(j+1)*ldab+kd-1:], kld, x[j+1:], 1) + } + } + } else { + // ...otherwise, use in-line code for the dot product. + if uplo == blas.Upper { + jlen := min(j, kd) + for i := 0; i < jlen; i++ { + sumj += (ab[(j-jlen+i)*ldab+jlen-i] * uscal) * x[j-jlen+i] + } + } else { + jlen := min(n-j-1, kd) + for i := 0; i < jlen; i++ { + sumj += (ab[(j+1+i)*ldab+kd-1-i] * uscal) * x[j+i+1] + } + } + } + + if uscal == tscal { + // Compute x[j] := ( x[j] - sumj ) / A[j,j] + // if 1/A[j,j] was not used to scale the dot product. + x[j] -= sumj + xj = math.Abs(x[j]) + // Compute x[j] = x[j] / A[j,j], scaling if necessary. + // Note: the reference implementation skips this step for blas.Unit matrices + // when tscal is equal to 1 but it complicates the logic and only saves + // the comparison and division in the first switch-case. Not skipping it + // is also consistent with the NoTrans case above. + switch { + case tjj > smlnum: + // smlnum < abs(A[j,j]): + if tjj < 1 && xj > tjj*bignum { + // Scale x by 1/abs(x[j]). + rec := 1 / xj + bi.Dscal(n, rec, x, 1) + scale *= rec + xMax *= rec + } + x[j] /= tjjs + case tjj > 0: + // 0 < abs(A[j,j]) <= smlnum: + if xj > tjj*bignum { + // Scale x by (1/abs(x[j]))*abs(A[j,j])*bignum. 
+ rec := (tjj * bignum) / xj + bi.Dscal(n, rec, x, 1) + scale *= rec + xMax *= rec + } + x[j] /= tjjs + default: + // A[j,j] == 0: Set x[0:n] = 0, x[j] = 1, and scale = 0, and + // compute a solution Aᵀ * x = 0. + for i := range x[:n] { + x[i] = 0 + } + x[j] = 1 + scale = 0 + xMax = 0 + } + } else { + // Compute x[j] := x[j] / A[j,j] - sumj + // if the dot product has already been divided by 1/A[j,j]. + x[j] = x[j]/tjjs - sumj + } + xMax = math.Max(xMax, math.Abs(x[j])) + } + scale /= tscal + } + + // Scale the column norms by 1/tscal for return. + if tscal != 1 { + bi.Dscal(n, 1/tscal, cnorm, 1) + } + return scale +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlatdf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlatdf.go new file mode 100644 index 0000000000..83422912b9 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlatdf.go @@ -0,0 +1,175 @@ +// Copyright ©2021 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" +) + +// Dlatdf computes a contribution to the reciprocal Dif-estimate by solving +// +// Z * x = h - f +// +// and choosing the vector h such that the norm of x is as large as possible. +// +// The n×n matrix Z is represented by its LU factorization as computed by Dgetc2 +// and has the form +// +// Z = P * L * U * Q +// +// where P and Q are permutation matrices, L is lower triangular with unit +// diagonal elements and U is upper triangular. +// +// job specifies the heuristic method for computing the contribution. +// +// If job is lapack.LocalLookAhead, all entries of h are chosen as either +1 or +// -1. +// +// If job is lapack.NormalizedNullVector, an approximate null-vector e of Z is +// computed using Dgecon and normalized. h is chosen as ±e with the sign giving +// the greater value of 2-norm(x). This strategy is about 5 times as expensive +// as LocalLookAhead. +// +// On entry, rhs holds the contribution f from earlier solved sub-systems. On +// return, rhs holds the solution x. +// +// ipiv and jpiv contain the pivot indices as returned by Dgetc2: row i of the +// matrix has been interchanged with row ipiv[i] and column j of the matrix has +// been interchanged with column jpiv[j]. +// +// n must be at most 8, ipiv and jpiv must have length n, and rhs must have +// length at least n, otherwise Dlatdf will panic. +// +// rdsum and rdscal represent the sum of squares of computed contributions to +// the Dif-estimate from earlier solved sub-systems. rdscal is the scaling +// factor used to prevent overflow in rdsum. Dlatdf returns this sum of squares +// updated with the contributions from the current sub-system. +// +// Dlatdf is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlatdf(job lapack.MaximizeNormXJob, n int, z []float64, ldz int, rhs []float64, rdsum, rdscal float64, ipiv, jpiv []int) (scale, sum float64) { + switch { + case job != lapack.LocalLookAhead && job != lapack.NormalizedNullVector: + panic(badMaximizeNormXJob) + case n < 0: + panic(nLT0) + case n > 8: + panic("lapack: n > 8") + case ldz < max(1, n): + panic(badLdZ) + } + + // Quick return if possible. 
+	if n == 0 {
+		return
+	}
+
+	switch {
+	case len(z) < (n-1)*ldz+n:
+		panic(shortZ)
+	case len(rhs) < n:
+		panic(shortRHS)
+	case len(ipiv) != n:
+		panic(badLenIpiv)
+	case len(jpiv) != n:
+		panic(badLenJpiv)
+	}
+
+	const maxdim = 8
+	var (
+		xps   [maxdim]float64
+		xms   [maxdim]float64
+		work  [4 * maxdim]float64
+		iwork [maxdim]int
+	)
+	bi := blas64.Implementation()
+	xp := xps[:n]
+	xm := xms[:n]
+	if job == lapack.NormalizedNullVector {
+		// Compute approximate nullvector xm of Z.
+		_ = impl.Dgecon(lapack.MaxRowSum, n, z, ldz, 1, work[:], iwork[:])
+		// This relies on undocumented content in work[n:2*n] stored by Dgecon.
+		bi.Dcopy(n, work[n:], 1, xm, 1)
+
+		// Compute rhs.
+		impl.Dlaswp(1, xm, 1, 0, n-2, ipiv[:n-1], -1)
+		tmp := 1 / bi.Dnrm2(n, xm, 1)
+		bi.Dscal(n, tmp, xm, 1)
+		bi.Dcopy(n, xm, 1, xp, 1)
+		bi.Daxpy(n, 1, rhs, 1, xp, 1)
+		bi.Daxpy(n, -1.0, xm, 1, rhs, 1)
+		_ = impl.Dgesc2(n, z, ldz, rhs, ipiv, jpiv)
+		_ = impl.Dgesc2(n, z, ldz, xp, ipiv, jpiv)
+		if bi.Dasum(n, xp, 1) > bi.Dasum(n, rhs, 1) {
+			bi.Dcopy(n, xp, 1, rhs, 1)
+		}
+
+		// Compute and return the updated sum of squares.
+		return impl.Dlassq(n, rhs, 1, rdscal, rdsum)
+	}
+
+	// Apply permutations ipiv to rhs.
+	impl.Dlaswp(1, rhs, 1, 0, n-2, ipiv[:n-1], 1)
+
+	// Solve for L-part choosing rhs either to +1 or -1.
+	pmone := -1.0
+	for j := 0; j < n-2; j++ {
+		bp := rhs[j] + 1
+		bm := rhs[j] - 1
+
+		// Look-ahead for L-part rhs[0:n-2] = +1 or -1, splus and sminu computed
+		// more efficiently than in https://doi.org/10.1109/9.29404.
+		splus := 1 + bi.Ddot(n-j-1, z[(j+1)*ldz+j:], ldz, z[(j+1)*ldz+j:], ldz)
+		sminu := bi.Ddot(n-j-1, z[(j+1)*ldz+j:], ldz, rhs[j+1:], 1)
+		splus *= rhs[j]
+		switch {
+		case splus > sminu:
+			rhs[j] = bp
+		case sminu > splus:
+			rhs[j] = bm
+		default:
+			// In this case the updating sums are equal and we can choose rhs[j]
+			// +1 or -1. The first time this happens we choose -1, thereafter
+			// +1. This is a simple way to get good estimates of matrices like
+			// Byers well-known example (see https://doi.org/10.1109/9.29404).
+			rhs[j] += pmone
+			pmone = 1
+		}
+
+		// Compute remaining rhs.
+		bi.Daxpy(n-j-1, -rhs[j], z[(j+1)*ldz+j:], ldz, rhs[j+1:], 1)
+	}
+
+	// Solve for U-part, look-ahead for rhs[n-1] = ±1. This is not done in
+	// Bsolve and will hopefully give us a better estimate because any
+	// ill-conditioning of the original matrix is transferred to U and not to L.
+	// U[n-1,n-1] is an approximation to sigma_min(LU).
+	bi.Dcopy(n-1, rhs, 1, xp, 1)
+	xp[n-1] = rhs[n-1] + 1
+	rhs[n-1] -= 1
+	var splus, sminu float64
+	for i := n - 1; i >= 0; i-- {
+		tmp := 1 / z[i*ldz+i]
+		xp[i] *= tmp
+		rhs[i] *= tmp
+		for k := i + 1; k < n; k++ {
+			xp[i] -= xp[k] * (z[i*ldz+k] * tmp)
+			rhs[i] -= rhs[k] * (z[i*ldz+k] * tmp)
+		}
+		splus += math.Abs(xp[i])
+		sminu += math.Abs(rhs[i])
+	}
+	if splus > sminu {
+		bi.Dcopy(n, xp, 1, rhs, 1)
+	}
+
+	// Apply the permutations jpiv to the computed solution (rhs).
+	impl.Dlaswp(1, rhs, 1, 0, n-2, jpiv[:n-1], -1)
+
+	// Compute and return the updated sum of squares.
+	return impl.Dlassq(n, rhs, 1, rdscal, rdsum)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlatrd.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlatrd.go
new file mode 100644
index 0000000000..195be09c9b
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlatrd.go
@@ -0,0 +1,176 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
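Dlatdf above hands its result back through Dlassq's scaled representation, in which a sum of squares is carried as scl²·smsq to avoid overflow and underflow. A small sketch of that invariant (assuming the vendored package as above):

	package main

	import (
		"fmt"
		"math"

		"gonum.org/v1/gonum/lapack/gonum"
	)

	func main() {
		var impl gonum.Implementation
		// Dlassq returns scl and smsq with scl^2*smsq = 3^2 + 4^2,
		// starting from the neutral representation scale=1, sumsq=0.
		x := []float64{3, 4}
		scl, smsq := impl.Dlassq(2, x, 1, 1, 0)
		fmt.Println(scl * math.Sqrt(smsq)) // 5, the Euclidean norm of x
	}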
+ +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dlatrd reduces nb rows and columns of a real n×n symmetric matrix A to symmetric +// tridiagonal form. It computes the orthonormal similarity transformation +// +// Qᵀ * A * Q +// +// and returns the matrices V and W to apply to the unreduced part of A. If +// uplo == blas.Upper, the upper triangle is supplied and the last nb rows are +// reduced. If uplo == blas.Lower, the lower triangle is supplied and the first +// nb rows are reduced. +// +// a contains the symmetric matrix on entry with active triangular half specified +// by uplo. On exit, the nb columns have been reduced to tridiagonal form. The +// diagonal contains the diagonal of the reduced matrix, the off-diagonal is +// set to 1, and the remaining elements contain the data to construct Q. +// +// If uplo == blas.Upper, with n = 5 and nb = 2 on exit a is +// +// [ a a a v4 v5] +// [ a a v4 v5] +// [ a 1 v5] +// [ d 1] +// [ d] +// +// If uplo == blas.Lower, with n = 5 and nb = 2, on exit a is +// +// [ d ] +// [ 1 d ] +// [v1 1 a ] +// [v1 v2 a a ] +// [v1 v2 a a a] +// +// e contains the superdiagonal elements of the reduced matrix. If uplo == blas.Upper, +// e[n-nb:n-1] contains the last nb columns of the reduced matrix, while if +// uplo == blas.Lower, e[:nb] contains the first nb columns of the reduced matrix. +// e must have length at least n-1, and Dlatrd will panic otherwise. +// +// tau contains the scalar factors of the elementary reflectors needed to construct Q. +// The reflectors are stored in tau[n-nb:n-1] if uplo == blas.Upper, and in +// tau[:nb] if uplo == blas.Lower. tau must have length n-1, and Dlatrd will panic +// otherwise. +// +// w is an n×nb matrix. On exit it contains the data to update the unreduced part +// of A. +// +// The matrix Q is represented as a product of elementary reflectors. Each reflector +// H has the form +// +// I - tau * v * vᵀ +// +// If uplo == blas.Upper, +// +// Q = H_{n-1} * H_{n-2} * ... * H_{n-nb} +// +// where v[:i-1] is stored in A[:i-1,i], v[i-1] = 1, and v[i:n] = 0. +// +// If uplo == blas.Lower, +// +// Q = H_0 * H_1 * ... * H_{nb-1} +// +// where v[:i+1] = 0, v[i+1] = 1, and v[i+2:n] is stored in A[i+2:n,i]. +// +// The vectors v form the n×nb matrix V which is used with W to apply a +// symmetric rank-2 update to the unreduced part of A +// +// A = A - V * Wᵀ - W * Vᵀ +// +// Dlatrd is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dlatrd(uplo blas.Uplo, n, nb int, a []float64, lda int, e, tau, w []float64, ldw int) { + switch { + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case n < 0: + panic(nLT0) + case nb < 0: + panic(nbLT0) + case nb > n: + panic(nbGTN) + case lda < max(1, n): + panic(badLdA) + case ldw < max(1, nb): + panic(badLdW) + } + + if n == 0 { + return + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case len(w) < (n-1)*ldw+nb: + panic(shortW) + case len(e) < n-1: + panic(shortE) + case len(tau) < n-1: + panic(shortTau) + } + + bi := blas64.Implementation() + + if uplo == blas.Upper { + for i := n - 1; i >= n-nb; i-- { + iw := i - n + nb + if i < n-1 { + // Update A(0:i, i). + bi.Dgemv(blas.NoTrans, i+1, n-i-1, -1, a[i+1:], lda, + w[i*ldw+iw+1:], 1, 1, a[i:], lda) + bi.Dgemv(blas.NoTrans, i+1, n-i-1, -1, w[iw+1:], ldw, + a[i*lda+i+1:], 1, 1, a[i:], lda) + } + if i > 0 { + // Generate elementary reflector H_i to annihilate A(0:i-2,i). 
+ e[i-1], tau[i-1] = impl.Dlarfg(i, a[(i-1)*lda+i], a[i:], lda) + a[(i-1)*lda+i] = 1 + + // Compute W(0:i-1, i). + bi.Dsymv(blas.Upper, i, 1, a, lda, a[i:], lda, 0, w[iw:], ldw) + if i < n-1 { + bi.Dgemv(blas.Trans, i, n-i-1, 1, w[iw+1:], ldw, + a[i:], lda, 0, w[(i+1)*ldw+iw:], ldw) + bi.Dgemv(blas.NoTrans, i, n-i-1, -1, a[i+1:], lda, + w[(i+1)*ldw+iw:], ldw, 1, w[iw:], ldw) + bi.Dgemv(blas.Trans, i, n-i-1, 1, a[i+1:], lda, + a[i:], lda, 0, w[(i+1)*ldw+iw:], ldw) + bi.Dgemv(blas.NoTrans, i, n-i-1, -1, w[iw+1:], ldw, + w[(i+1)*ldw+iw:], ldw, 1, w[iw:], ldw) + } + bi.Dscal(i, tau[i-1], w[iw:], ldw) + alpha := -0.5 * tau[i-1] * bi.Ddot(i, w[iw:], ldw, a[i:], lda) + bi.Daxpy(i, alpha, a[i:], lda, w[iw:], ldw) + } + } + } else { + // Reduce first nb columns of lower triangle. + for i := 0; i < nb; i++ { + // Update A(i:n, i) + bi.Dgemv(blas.NoTrans, n-i, i, -1, a[i*lda:], lda, + w[i*ldw:], 1, 1, a[i*lda+i:], lda) + bi.Dgemv(blas.NoTrans, n-i, i, -1, w[i*ldw:], ldw, + a[i*lda:], 1, 1, a[i*lda+i:], lda) + if i < n-1 { + // Generate elementary reflector H_i to annihilate A(i+2:n,i). + e[i], tau[i] = impl.Dlarfg(n-i-1, a[(i+1)*lda+i], a[min(i+2, n-1)*lda+i:], lda) + a[(i+1)*lda+i] = 1 + + // Compute W(i+1:n,i). + bi.Dsymv(blas.Lower, n-i-1, 1, a[(i+1)*lda+i+1:], lda, + a[(i+1)*lda+i:], lda, 0, w[(i+1)*ldw+i:], ldw) + bi.Dgemv(blas.Trans, n-i-1, i, 1, w[(i+1)*ldw:], ldw, + a[(i+1)*lda+i:], lda, 0, w[i:], ldw) + bi.Dgemv(blas.NoTrans, n-i-1, i, -1, a[(i+1)*lda:], lda, + w[i:], ldw, 1, w[(i+1)*ldw+i:], ldw) + bi.Dgemv(blas.Trans, n-i-1, i, 1, a[(i+1)*lda:], lda, + a[(i+1)*lda+i:], lda, 0, w[i:], ldw) + bi.Dgemv(blas.NoTrans, n-i-1, i, -1, w[(i+1)*ldw:], ldw, + w[i:], ldw, 1, w[(i+1)*ldw+i:], ldw) + bi.Dscal(n-i-1, tau[i], w[(i+1)*ldw+i:], ldw) + alpha := -0.5 * tau[i] * bi.Ddot(n-i-1, w[(i+1)*ldw+i:], ldw, + a[(i+1)*lda+i:], lda) + bi.Daxpy(n-i-1, alpha, a[(i+1)*lda+i:], lda, + w[(i+1)*ldw+i:], ldw) + } + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlatrs.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlatrs.go new file mode 100644 index 0000000000..f13b7d57c0 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlatrs.go @@ -0,0 +1,410 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" +) + +// Dlatrs solves a triangular system of equations scaled to prevent overflow. It +// solves +// +// A * x = scale * b if trans == blas.NoTrans +// Aᵀ * x = scale * b if trans == blas.Trans +// +// where the scale s is set for numeric stability. +// +// A is an n×n triangular matrix. On entry, the slice x contains the values of +// b, and on exit it contains the solution vector x. +// +// If normin == true, cnorm is an input and cnorm[j] contains the norm of the off-diagonal +// part of the j^th column of A. If trans == blas.NoTrans, cnorm[j] must be greater +// than or equal to the infinity norm, and greater than or equal to the one-norm +// otherwise. If normin == false, then cnorm is treated as an output, and is set +// to contain the 1-norm of the off-diagonal part of the j^th column of A. +// +// Dlatrs is an internal routine. It is exported for testing purposes. 
+func (impl Implementation) Dlatrs(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, normin bool, n int, a []float64, lda int, x []float64, cnorm []float64) (scale float64) { + switch { + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case trans != blas.NoTrans && trans != blas.Trans && trans != blas.ConjTrans: + panic(badTrans) + case diag != blas.Unit && diag != blas.NonUnit: + panic(badDiag) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + // Quick return if possible. + if n == 0 { + return 1 + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case len(x) < n: + panic(shortX) + case len(cnorm) < n: + panic(shortCNorm) + } + + upper := uplo == blas.Upper + nonUnit := diag == blas.NonUnit + + smlnum := dlamchS / dlamchP + bignum := 1 / smlnum + scale = 1 + + bi := blas64.Implementation() + + if !normin { + if upper { + cnorm[0] = 0 + for j := 1; j < n; j++ { + cnorm[j] = bi.Dasum(j, a[j:], lda) + } + } else { + for j := 0; j < n-1; j++ { + cnorm[j] = bi.Dasum(n-j-1, a[(j+1)*lda+j:], lda) + } + cnorm[n-1] = 0 + } + } + // Scale the column norms by tscal if the maximum element in cnorm is greater than bignum. + imax := bi.Idamax(n, cnorm, 1) + var tscal float64 + if cnorm[imax] <= bignum { + tscal = 1 + } else { + tmax := cnorm[imax] + // Avoid NaN generation if entries in cnorm exceed the overflow + // threshold. + if tmax <= math.MaxFloat64 { + // Case 1: All entries in cnorm are valid floating-point numbers. + tscal = 1 / (smlnum * tmax) + bi.Dscal(n, tscal, cnorm, 1) + } else { + // Case 2: At least one column norm of A cannot be represented as + // floating-point number. Find the offdiagonal entry A[i,j] with the + // largest absolute value. If this entry is not +/- Infinity, use + // this value as tscal. + tmax = 0 + if upper { + // A is upper triangular. + for j := 1; j < n; j++ { + tmax = math.Max(impl.Dlange(lapack.MaxAbs, j, 1, a[j:], lda, nil), tmax) + } + } else { + // A is lower triangular. + for j := 0; j < n-1; j++ { + tmax = math.Max(impl.Dlange(lapack.MaxAbs, n-j-1, 1, a[(j+1)*lda+j:], lda, nil), tmax) + } + } + if tmax <= math.MaxFloat64 { + tscal = 1 / (smlnum * tmax) + for j := 0; j < n; j++ { + if cnorm[j] <= math.MaxFloat64 { + cnorm[j] *= tscal + } else { + // Recompute the 1-norm without introducing Infinity in + // the summation. + cnorm[j] = 0 + if upper { + for i := 0; i < j; i++ { + cnorm[j] += tscal * math.Abs(a[i*lda+j]) + } + } else { + for i := j + 1; i < n; i++ { + cnorm[j] += tscal * math.Abs(a[i*lda+j]) + } + } + } + } + } else { + // At least one entry of A is not a valid floating-point entry. + // Rely on Dtrsv to propagate Inf and NaN. + bi.Dtrsv(uplo, trans, diag, n, a, lda, x, 1) + return + } + } + } + + // Compute a bound on the computed solution vector to see if bi.Dtrsv can be used. + j := bi.Idamax(n, x, 1) + xmax := math.Abs(x[j]) + xbnd := xmax + var grow float64 + var jfirst, jlast, jinc int + if trans == blas.NoTrans { + if upper { + jfirst = n - 1 + jlast = -1 + jinc = -1 + } else { + jfirst = 0 + jlast = n + jinc = 1 + } + // Compute the growth in A * x = b. 
+ if tscal != 1 { + grow = 0 + goto Solve + } + if nonUnit { + grow = 1 / math.Max(xbnd, smlnum) + xbnd = grow + for j := jfirst; j != jlast; j += jinc { + if grow <= smlnum { + goto Solve + } + tjj := math.Abs(a[j*lda+j]) + xbnd = math.Min(xbnd, math.Min(1, tjj)*grow) + if tjj+cnorm[j] >= smlnum { + grow *= tjj / (tjj + cnorm[j]) + } else { + grow = 0 + } + } + grow = xbnd + } else { + grow = math.Min(1, 1/math.Max(xbnd, smlnum)) + for j := jfirst; j != jlast; j += jinc { + if grow <= smlnum { + goto Solve + } + grow *= 1 / (1 + cnorm[j]) + } + } + } else { + if upper { + jfirst = 0 + jlast = n + jinc = 1 + } else { + jfirst = n - 1 + jlast = -1 + jinc = -1 + } + if tscal != 1 { + grow = 0 + goto Solve + } + if nonUnit { + grow = 1 / (math.Max(xbnd, smlnum)) + xbnd = grow + for j := jfirst; j != jlast; j += jinc { + if grow <= smlnum { + goto Solve + } + xj := 1 + cnorm[j] + grow = math.Min(grow, xbnd/xj) + tjj := math.Abs(a[j*lda+j]) + if xj > tjj { + xbnd *= tjj / xj + } + } + grow = math.Min(grow, xbnd) + } else { + grow = math.Min(1, 1/math.Max(xbnd, smlnum)) + for j := jfirst; j != jlast; j += jinc { + if grow <= smlnum { + goto Solve + } + xj := 1 + cnorm[j] + grow /= xj + } + } + } + +Solve: + if grow*tscal > smlnum { + // Use the Level 2 BLAS solve if the reciprocal of the bound on + // elements of X is not too small. + bi.Dtrsv(uplo, trans, diag, n, a, lda, x, 1) + if tscal != 1 { + bi.Dscal(n, 1/tscal, cnorm, 1) + } + return scale + } + + // Use a Level 1 BLAS solve, scaling intermediate results. + if xmax > bignum { + scale = bignum / xmax + bi.Dscal(n, scale, x, 1) + xmax = bignum + } + if trans == blas.NoTrans { + for j := jfirst; j != jlast; j += jinc { + xj := math.Abs(x[j]) + var tjj, tjjs float64 + if nonUnit { + tjjs = a[j*lda+j] * tscal + } else { + tjjs = tscal + if tscal == 1 { + goto Skip1 + } + } + tjj = math.Abs(tjjs) + if tjj > smlnum { + if tjj < 1 { + if xj > tjj*bignum { + rec := 1 / xj + bi.Dscal(n, rec, x, 1) + scale *= rec + xmax *= rec + } + } + x[j] /= tjjs + xj = math.Abs(x[j]) + } else if tjj > 0 { + if xj > tjj*bignum { + rec := (tjj * bignum) / xj + if cnorm[j] > 1 { + rec /= cnorm[j] + } + bi.Dscal(n, rec, x, 1) + scale *= rec + xmax *= rec + } + x[j] /= tjjs + xj = math.Abs(x[j]) + } else { + for i := 0; i < n; i++ { + x[i] = 0 + } + x[j] = 1 + xj = 1 + scale = 0 + xmax = 0 + } + Skip1: + if xj > 1 { + rec := 1 / xj + if cnorm[j] > (bignum-xmax)*rec { + rec *= 0.5 + bi.Dscal(n, rec, x, 1) + scale *= rec + } + } else if xj*cnorm[j] > bignum-xmax { + bi.Dscal(n, 0.5, x, 1) + scale *= 0.5 + } + if upper { + if j > 0 { + bi.Daxpy(j, -x[j]*tscal, a[j:], lda, x, 1) + i := bi.Idamax(j, x, 1) + xmax = math.Abs(x[i]) + } + } else { + if j < n-1 { + bi.Daxpy(n-j-1, -x[j]*tscal, a[(j+1)*lda+j:], lda, x[j+1:], 1) + i := j + bi.Idamax(n-j-1, x[j+1:], 1) + xmax = math.Abs(x[i]) + } + } + } + } else { + for j := jfirst; j != jlast; j += jinc { + xj := math.Abs(x[j]) + uscal := tscal + rec := 1 / math.Max(xmax, 1) + var tjjs float64 + if cnorm[j] > (bignum-xj)*rec { + rec *= 0.5 + if nonUnit { + tjjs = a[j*lda+j] * tscal + } else { + tjjs = tscal + } + tjj := math.Abs(tjjs) + if tjj > 1 { + rec = math.Min(1, rec*tjj) + uscal /= tjjs + } + if rec < 1 { + bi.Dscal(n, rec, x, 1) + scale *= rec + xmax *= rec + } + } + var sumj float64 + if uscal == 1 { + if upper { + sumj = bi.Ddot(j, a[j:], lda, x, 1) + } else if j < n-1 { + sumj = bi.Ddot(n-j-1, a[(j+1)*lda+j:], lda, x[j+1:], 1) + } + } else { + if upper { + for i := 0; i < j; i++ { + sumj += (a[i*lda+j] * uscal) * 
x[i] + } + } else if j < n { + for i := j + 1; i < n; i++ { + sumj += (a[i*lda+j] * uscal) * x[i] + } + } + } + if uscal == tscal { + x[j] -= sumj + xj := math.Abs(x[j]) + var tjjs float64 + if nonUnit { + tjjs = a[j*lda+j] * tscal + } else { + tjjs = tscal + if tscal == 1 { + goto Skip2 + } + } + tjj := math.Abs(tjjs) + if tjj > smlnum { + if tjj < 1 { + if xj > tjj*bignum { + rec = 1 / xj + bi.Dscal(n, rec, x, 1) + scale *= rec + xmax *= rec + } + } + x[j] /= tjjs + } else if tjj > 0 { + if xj > tjj*bignum { + rec = (tjj * bignum) / xj + bi.Dscal(n, rec, x, 1) + scale *= rec + xmax *= rec + } + x[j] /= tjjs + } else { + for i := 0; i < n; i++ { + x[i] = 0 + } + x[j] = 1 + scale = 0 + xmax = 0 + } + } else { + x[j] = x[j]/tjjs - sumj + } + Skip2: + xmax = math.Max(xmax, math.Abs(x[j])) + } + } + scale /= tscal + if tscal != 1 { + bi.Dscal(n, 1/tscal, cnorm, 1) + } + return scale +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlauu2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlauu2.go new file mode 100644 index 0000000000..b70a8420d5 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlauu2.go @@ -0,0 +1,66 @@ +// Copyright ©2018 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dlauu2 computes the product +// +// U * Uᵀ if uplo is blas.Upper +// Lᵀ * L if uplo is blas.Lower +// +// where U or L is stored in the upper or lower triangular part of A. +// Only the upper or lower triangle of the result is stored, overwriting +// the corresponding factor in A. +func (impl Implementation) Dlauu2(uplo blas.Uplo, n int, a []float64, lda int) { + switch { + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + // Quick return if possible. + if n == 0 { + return + } + + if len(a) < (n-1)*lda+n { + panic(shortA) + } + + bi := blas64.Implementation() + + if uplo == blas.Upper { + // Compute the product U*Uᵀ. + for i := 0; i < n; i++ { + aii := a[i*lda+i] + if i < n-1 { + a[i*lda+i] = bi.Ddot(n-i, a[i*lda+i:], 1, a[i*lda+i:], 1) + bi.Dgemv(blas.NoTrans, i, n-i-1, 1, a[i+1:], lda, a[i*lda+i+1:], 1, + aii, a[i:], lda) + } else { + bi.Dscal(i+1, aii, a[i:], lda) + } + } + } else { + // Compute the product Lᵀ*L. + for i := 0; i < n; i++ { + aii := a[i*lda+i] + if i < n-1 { + a[i*lda+i] = bi.Ddot(n-i, a[i*lda+i:], lda, a[i*lda+i:], lda) + bi.Dgemv(blas.Trans, n-i-1, i, 1, a[(i+1)*lda:], lda, a[(i+1)*lda+i:], lda, + aii, a[i*lda:], 1) + } else { + bi.Dscal(i+1, aii, a[i*lda:], 1) + } + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlauum.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlauum.go new file mode 100644 index 0000000000..575ed7c88f --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlauum.go @@ -0,0 +1,83 @@ +// Copyright ©2018 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dlauum computes the product +// +// U * Uᵀ if uplo is blas.Upper +// Lᵀ * L if uplo is blas.Lower +// +// where U or L is stored in the upper or lower triangular part of A. +// Only the upper or lower triangle of the result is stored, overwriting +// the corresponding factor in A. 
+func (impl Implementation) Dlauum(uplo blas.Uplo, n int, a []float64, lda int) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	if len(a) < (n-1)*lda+n {
+		panic(shortA)
+	}
+
+	// Determine the block size.
+	opts := "U"
+	if uplo == blas.Lower {
+		opts = "L"
+	}
+	nb := impl.Ilaenv(1, "DLAUUM", opts, n, -1, -1, -1)
+
+	if nb <= 1 || n <= nb {
+		// Use unblocked code.
+		impl.Dlauu2(uplo, n, a, lda)
+		return
+	}
+
+	// Use blocked code.
+	bi := blas64.Implementation()
+	if uplo == blas.Upper {
+		// Compute the product U*Uᵀ.
+		for i := 0; i < n; i += nb {
+			ib := min(nb, n-i)
+			bi.Dtrmm(blas.Right, blas.Upper, blas.Trans, blas.NonUnit,
+				i, ib, 1, a[i*lda+i:], lda, a[i:], lda)
+			impl.Dlauu2(blas.Upper, ib, a[i*lda+i:], lda)
+			if n-i-ib > 0 {
+				bi.Dgemm(blas.NoTrans, blas.Trans, i, ib, n-i-ib,
+					1, a[i+ib:], lda, a[i*lda+i+ib:], lda, 1, a[i:], lda)
+				bi.Dsyrk(blas.Upper, blas.NoTrans, ib, n-i-ib,
+					1, a[i*lda+i+ib:], lda, 1, a[i*lda+i:], lda)
+			}
+		}
+	} else {
+		// Compute the product Lᵀ*L.
+		for i := 0; i < n; i += nb {
+			ib := min(nb, n-i)
+			bi.Dtrmm(blas.Left, blas.Lower, blas.Trans, blas.NonUnit,
+				ib, i, 1, a[i*lda+i:], lda, a[i*lda:], lda)
+			impl.Dlauu2(blas.Lower, ib, a[i*lda+i:], lda)
+			if n-i-ib > 0 {
+				bi.Dgemm(blas.Trans, blas.NoTrans, ib, i, n-i-ib,
+					1, a[(i+ib)*lda+i:], lda, a[(i+ib)*lda:], lda, 1, a[i*lda:], lda)
+				bi.Dsyrk(blas.Lower, blas.Trans, ib, n-i-ib,
+					1, a[(i+ib)*lda+i:], lda, 1, a[i*lda+i:], lda)
+			}
+		}
+	}
+}
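For a concrete view of what Dlauum computes, here is a hedged usage sketch (it assumes the vendored import path added by this patch): with U = [1 2; 0 3] stored row-major, the upper triangle of A is overwritten by the upper triangle of U*Uᵀ.

	package main

	import (
		"fmt"

		"gonum.org/v1/gonum/blas"
		"gonum.org/v1/gonum/lapack/gonum"
	)

	func main() {
		var impl gonum.Implementation
		// U = [1 2; 0 3] in row-major order with lda = 2.
		a := []float64{
			1, 2,
			0, 3,
		}
		// U*Uᵀ = [5 6; 6 9]; only the upper triangle is written back.
		impl.Dlauum(blas.Upper, 2, a, 2)
		fmt.Println(a) // [5 6 0 9]
	}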
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/doc.go b/vendor/gonum.org/v1/gonum/lapack/gonum/doc.go
new file mode 100644
index 0000000000..087f63cc6e
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/doc.go
@@ -0,0 +1,28 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package gonum is a pure-go implementation of the LAPACK API. The LAPACK API defines
+// a set of algorithms for advanced matrix operations.
+//
+// The function definitions and implementations follow that of the netlib reference
+// implementation. See http://www.netlib.org/lapack/explore-html/ for more
+// information, and http://www.netlib.org/lapack/explore-html/d4/de1/_l_i_c_e_n_s_e_source.html
+// for more license information.
+//
+// Slice function arguments frequently represent vectors and matrices. The data
+// layout is identical to that found in https://pkg.go.dev/gonum.org/v1/gonum/blas/gonum.
+//
+// Most LAPACK functions are built on top of the routines defined in the BLAS API,
+// and as such the computation time for many LAPACK functions is
+// dominated by BLAS calls. Here, BLAS is accessed through the
+// blas64 package (https://pkg.go.dev/gonum.org/v1/gonum/blas/blas64). In particular,
+// this implies that an external BLAS library will be used if it is
+// registered in blas64.
+//
+// The full LAPACK capability has not been implemented at present. The full
+// API is very large, containing approximately 200 functions for double precision
+// alone. Future additions will be focused on supporting the Gonum matrix
+// package (https://pkg.go.dev/gonum.org/v1/gonum/mat), though pull requests
+// with implementations and tests for LAPACK functions are encouraged.
+package gonum // import "gonum.org/v1/gonum/lapack/gonum"
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorg2l.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorg2l.go
new file mode 100644
index 0000000000..fdb37af2a7
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorg2l.go
@@ -0,0 +1,78 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dorg2l generates an m×n matrix Q with orthonormal columns which is defined
+// as the last n columns of a product of k elementary reflectors of order m.
+//
+//	Q = H_{k-1} * ... * H_1 * H_0
+//
+// See Dgeqlf for more information. It must be that m >= n >= k.
+//
+// tau contains the scalar reflectors computed by Dgeqlf. tau must have length
+// at least k, and Dorg2l will panic otherwise.
+//
+// work contains temporary memory, and must have length at least n. Dorg2l will
+// panic otherwise.
+//
+// Dorg2l is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dorg2l(m, n, k int, a []float64, lda int, tau, work []float64) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case n > m:
+		panic(nGTM)
+	case k < 0:
+		panic(kLT0)
+	case k > n:
+		panic(kGTN)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	if n == 0 {
+		return
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(tau) < k:
+		panic(shortTau)
+	case len(work) < n:
+		panic(shortWork)
+	}
+
+	// Initialize columns 0:n-k to columns of the unit matrix.
+	for j := 0; j < n-k; j++ {
+		for l := 0; l < m; l++ {
+			a[l*lda+j] = 0
+		}
+		a[(m-n+j)*lda+j] = 1
+	}
+
+	bi := blas64.Implementation()
+	for i := 0; i < k; i++ {
+		ii := n - k + i
+
+		// Apply H_i to A[0:m-k+i, 0:n-k+i] from the left.
+		a[(m-n+ii)*lda+ii] = 1
+		impl.Dlarf(blas.Left, m-n+ii+1, ii, a[ii:], lda, tau[i], a, lda, work)
+		bi.Dscal(m-n+ii, -tau[i], a[ii:], lda)
+		a[(m-n+ii)*lda+ii] = 1 - tau[i]
+
+		// Set A[m-k+i:m, n-k+i+1] to zero.
+		for l := m - n + ii + 1; l < m; l++ {
+			a[l*lda+ii] = 0
+		}
+	}
+}
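Dorg2l (and the Dorg* routines that follow) assemble Q by repeatedly applying elementary reflectors H = I - tau*v*vᵀ. A self-contained sketch of that single building block (the helper applyReflector is illustrative, not part of gonum):

	package main

	import "fmt"

	// applyReflector overwrites x with (I - tau*v*vᵀ)*x, the action of one
	// elementary reflector.
	func applyReflector(tau float64, v, x []float64) {
		var dot float64
		for i := range v {
			dot += v[i] * x[i]
		}
		for i := range v {
			x[i] -= tau * dot * v[i]
		}
	}

	func main() {
		// With v = [1, 0] and tau = 2, H = I - 2*v*vᵀ negates the first
		// component of x.
		x := []float64{3, 4}
		applyReflector(2, []float64{1, 0}, x)
		fmt.Println(x) // [-3 4]
	}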
+func (impl Implementation) Dorg2r(m, n, k int, a []float64, lda int, tau []float64, work []float64) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case n > m:
+		panic(nGTM)
+	case k < 0:
+		panic(kLT0)
+	case k > n:
+		panic(kGTN)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	if n == 0 {
+		return
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(tau) != k:
+		panic(badLenTau)
+	case len(work) < n:
+		panic(shortWork)
+	}
+
+	bi := blas64.Implementation()
+
+	// Initialize columns k+1:n to columns of the unit matrix.
+	for l := 0; l < m; l++ {
+		for j := k; j < n; j++ {
+			a[l*lda+j] = 0
+		}
+	}
+	for j := k; j < n; j++ {
+		a[j*lda+j] = 1
+	}
+	for i := k - 1; i >= 0; i-- {
+		for i := range work {
+			work[i] = 0
+		}
+		if i < n-1 {
+			a[i*lda+i] = 1
+			impl.Dlarf(blas.Left, m-i, n-i-1, a[i*lda+i:], lda, tau[i], a[i*lda+i+1:], lda, work)
+		}
+		if i < m-1 {
+			bi.Dscal(m-i-1, -tau[i], a[(i+1)*lda+i:], lda)
+		}
+		a[i*lda+i] = 1 - tau[i]
+		for l := 0; l < i; l++ {
+			a[l*lda+i] = 0
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorgbr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgbr.go
new file mode 100644
index 0000000000..35535100b6
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgbr.go
@@ -0,0 +1,138 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/lapack"
+
+// Dorgbr generates one of the matrices Q or Pᵀ from the bidiagonal
+// decomposition computed by Dgebrd. See Dgebd2 for the description of
+// Q and Pᵀ.
+//
+// If vect == lapack.GenerateQ, then a is assumed to have been an m×k matrix and
+// Q is of order m. If m >= k, then Dorgbr returns the first n columns of Q
+// where m >= n >= k. If m < k, then Dorgbr returns Q as an m×m matrix.
+//
+// If vect == lapack.GeneratePT, then A is assumed to have been a k×n matrix, and
+// Pᵀ is of order n. If k < n, then Dorgbr returns the first m rows of Pᵀ,
+// where n >= m >= k. If k >= n, then Dorgbr returns Pᵀ as an n×n matrix.
+//
+// Dorgbr is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dorgbr(vect lapack.GenOrtho, m, n, k int, a []float64, lda int, tau, work []float64, lwork int) {
+	wantq := vect == lapack.GenerateQ
+	mn := min(m, n)
+	switch {
+	case vect != lapack.GenerateQ && vect != lapack.GeneratePT:
+		panic(badGenOrtho)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case wantq && n > m:
+		panic(nGTM)
+	case wantq && n < min(m, k):
+		panic("lapack: n < min(m,k)")
+	case !wantq && m > n:
+		panic(mGTN)
+	case !wantq && m < min(n, k):
+		panic("lapack: m < min(n,k)")
+	case lda < max(1, n) && lwork != -1:
+		// Normally, we follow the reference and require the leading
+		// dimension to be always valid, even in case of workspace
+		// queries. However, if a caller provided a placeholder value
+		// for lda (and a) when doing a workspace query that didn't
+		// fulfill the condition here, it would cause a panic. This is
+		// exactly what Dgesvd does.
+		panic(badLdA)
+	case lwork < max(1, mn) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+ work[0] = 1 + if m == 0 || n == 0 { + return + } + + if wantq { + if m >= k { + impl.Dorgqr(m, n, k, a, lda, tau, work, -1) + } else if m > 1 { + impl.Dorgqr(m-1, m-1, m-1, a[lda+1:], lda, tau, work, -1) + } + } else { + if k < n { + impl.Dorglq(m, n, k, a, lda, tau, work, -1) + } else if n > 1 { + impl.Dorglq(n-1, n-1, n-1, a[lda+1:], lda, tau, work, -1) + } + } + lworkopt := int(work[0]) + lworkopt = max(lworkopt, mn) + if lwork == -1 { + work[0] = float64(lworkopt) + return + } + + switch { + case len(a) < (m-1)*lda+n: + panic(shortA) + case wantq && len(tau) < min(m, k): + panic(shortTau) + case !wantq && len(tau) < min(n, k): + panic(shortTau) + } + + if wantq { + // Form Q, determined by a call to Dgebrd to reduce an m×k matrix. + if m >= k { + impl.Dorgqr(m, n, k, a, lda, tau[:k], work, lwork) + } else { + // Shift the vectors which define the elementary reflectors one + // column to the right, and set the first row and column of Q to + // those of the unit matrix. + for j := m - 1; j >= 1; j-- { + a[j] = 0 + for i := j + 1; i < m; i++ { + a[i*lda+j] = a[i*lda+j-1] + } + } + a[0] = 1 + for i := 1; i < m; i++ { + a[i*lda] = 0 + } + if m > 1 { + // Form Q[1:m-1, 1:m-1] + impl.Dorgqr(m-1, m-1, m-1, a[lda+1:], lda, tau[:m-1], work, lwork) + } + } + } else { + // Form Pᵀ, determined by a call to Dgebrd to reduce a k×n matrix. + if k < n { + impl.Dorglq(m, n, k, a, lda, tau, work, lwork) + } else { + // Shift the vectors which define the elementary reflectors one + // row downward, and set the first row and column of Pᵀ to + // those of the unit matrix. + a[0] = 1 + for i := 1; i < n; i++ { + a[i*lda] = 0 + } + for j := 1; j < n; j++ { + for i := j - 1; i >= 1; i-- { + a[i*lda+j] = a[(i-1)*lda+j] + } + a[j] = 0 + } + if n > 1 { + impl.Dorglq(n-1, n-1, n-1, a[lda+1:], lda, tau, work, lwork) + } + } + } + work[0] = float64(lworkopt) +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorghr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorghr.go new file mode 100644 index 0000000000..8f0dd452ec --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorghr.go @@ -0,0 +1,103 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +// Dorghr generates an n×n orthogonal matrix Q which is defined as the product +// of ihi-ilo elementary reflectors: +// +// Q = H_{ilo} H_{ilo+1} ... H_{ihi-1}. +// +// a and lda represent an n×n matrix that contains the elementary reflectors, as +// returned by Dgehrd. On return, a is overwritten by the n×n orthogonal matrix +// Q. Q will be equal to the identity matrix except in the submatrix +// Q[ilo+1:ihi+1,ilo+1:ihi+1]. +// +// ilo and ihi must have the same values as in the previous call of Dgehrd. It +// must hold that +// +// 0 <= ilo <= ihi < n if n > 0, +// ilo = 0, ihi = -1 if n == 0. +// +// tau contains the scalar factors of the elementary reflectors, as returned by +// Dgehrd. tau must have length n-1. +// +// work must have length at least max(1,lwork) and lwork must be at least +// ihi-ilo. For optimum performance lwork must be at least (ihi-ilo)*nb where nb +// is the optimal blocksize. On return, work[0] will contain the optimal value +// of lwork. +// +// If lwork == -1, instead of performing Dorghr, only the optimal value of lwork +// will be stored into work[0]. +// +// If any requirement on input sizes is not met, Dorghr will panic. +// +// Dorghr is an internal routine. 
It is exported for testing purposes. +func (impl Implementation) Dorghr(n, ilo, ihi int, a []float64, lda int, tau, work []float64, lwork int) { + nh := ihi - ilo + switch { + case ilo < 0 || max(1, n) <= ilo: + panic(badIlo) + case ihi < min(ilo, n-1) || n <= ihi: + panic(badIhi) + case lda < max(1, n): + panic(badLdA) + case lwork < max(1, nh) && lwork != -1: + panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + } + + // Quick return if possible. + if n == 0 { + work[0] = 1 + return + } + + lwkopt := max(1, nh) * impl.Ilaenv(1, "DORGQR", " ", nh, nh, nh, -1) + if lwork == -1 { + work[0] = float64(lwkopt) + return + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case len(tau) < n-1: + panic(shortTau) + } + + // Shift the vectors which define the elementary reflectors one column + // to the right. + for i := ilo + 2; i < ihi+1; i++ { + copy(a[i*lda+ilo+1:i*lda+i], a[i*lda+ilo:i*lda+i-1]) + } + // Set the first ilo+1 and the last n-ihi-1 rows and columns to those of + // the identity matrix. + for i := 0; i < ilo+1; i++ { + for j := 0; j < n; j++ { + a[i*lda+j] = 0 + } + a[i*lda+i] = 1 + } + for i := ilo + 1; i < ihi+1; i++ { + for j := 0; j <= ilo; j++ { + a[i*lda+j] = 0 + } + for j := i; j < n; j++ { + a[i*lda+j] = 0 + } + } + for i := ihi + 1; i < n; i++ { + for j := 0; j < n; j++ { + a[i*lda+j] = 0 + } + a[i*lda+i] = 1 + } + if nh > 0 { + // Generate Q[ilo+1:ihi+1,ilo+1:ihi+1]. + impl.Dorgqr(nh, nh, nh, a[(ilo+1)*lda+ilo+1:], lda, tau[ilo:ihi], work, lwork) + } + work[0] = float64(lwkopt) +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorgl2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgl2.go new file mode 100644 index 0000000000..6dd9a88863 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgl2.go @@ -0,0 +1,79 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dorgl2 generates an m×n matrix Q with orthonormal rows defined as the first m +// rows of a product of k elementary reflectors of order n +// +// Q = H_{k-1} * ... * H_0 +// +// as returned by Dgelqf. +// +// On entry, tau and the first k rows of A must contain the scalar factors and +// the vectors, respectively, which define the elementary reflectors H_i, +// i=0,...,k-1, as returned by Dgelqf. On return, A contains the matrix Q. +// +// tau must have length at least k, work must have length at least m, and it +// must hold that 0 <= k <= m <= n, otherwise Dorgl2 will panic. +// +// Dorgl2 is an internal routine. It is exported for testing purposes. 
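+//
+// An illustrative pairing (assumed here, not upstream documentation), with
+// m <= n and k = min(m, n) = m: factor with Dgelqf, then overwrite the first
+// m rows of a with the rows of Q. The blocked driver Dorglq is normally
+// preferred over calling Dorgl2 directly:
+//
+//	impl.Dgelqf(m, n, a, lda, tau, work, lwork) // A = L * Q
+//	impl.Dorgl2(m, n, m, a, lda, tau, work2)    // len(work2) >= m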
+func (impl Implementation) Dorgl2(m, n, k int, a []float64, lda int, tau, work []float64) { + switch { + case m < 0: + panic(mLT0) + case n < m: + panic(nLTM) + case k < 0: + panic(kLT0) + case k > m: + panic(kGTM) + case lda < max(1, n): + panic(badLdA) + } + + if m == 0 { + return + } + + switch { + case len(a) < (m-1)*lda+n: + panic(shortA) + case len(tau) < k: + panic(shortTau) + case len(work) < m: + panic(shortWork) + } + + bi := blas64.Implementation() + + if k < m { + for i := k; i < m; i++ { + for j := 0; j < n; j++ { + a[i*lda+j] = 0 + } + } + for j := k; j < m; j++ { + a[j*lda+j] = 1 + } + } + for i := k - 1; i >= 0; i-- { + if i < n-1 { + if i < m-1 { + a[i*lda+i] = 1 + impl.Dlarf(blas.Right, m-i-1, n-i, a[i*lda+i:], 1, tau[i], a[(i+1)*lda+i:], lda, work) + } + bi.Dscal(n-i-1, -tau[i], a[i*lda+i+1:], 1) + } + a[i*lda+i] = 1 - tau[i] + for l := 0; l < i; l++ { + a[i*lda+l] = 0 + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorglq.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorglq.go new file mode 100644 index 0000000000..d6b3aadfca --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorglq.go @@ -0,0 +1,125 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/lapack" +) + +// Dorglq generates an m×n matrix Q with orthonormal rows defined as the first m +// rows of a product of k elementary reflectors of order n +// +// Q = H_{k-1} * ... * H_0 +// +// as returned by Dgelqf. +// +// On entry, tau and the first k rows of A must contain the scalar factors and +// the vectors, respectively, which define the elementary reflectors H_i, +// i=0,...,k-1, as returned by Dgelqf. On return, A contains the matrix Q. +// +// tau must have length at least k, work must have length at least lwork and +// lwork must be at least max(1,m). On return, optimal value of lwork will be +// stored in work[0]. It must also hold that 0 <= k <= m <= n, otherwise Dorglq +// will panic. +// +// If lwork == -1, instead of performing Dorglq, the function only calculates +// the optimal value of lwork and stores it into work[0]. +func (impl Implementation) Dorglq(m, n, k int, a []float64, lda int, tau, work []float64, lwork int) { + switch { + case m < 0: + panic(mLT0) + case n < m: + panic(nLTM) + case k < 0: + panic(kLT0) + case k > m: + panic(kGTM) + case lda < max(1, n): + panic(badLdA) + case lwork < max(1, m) && lwork != -1: + panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + } + + if m == 0 { + work[0] = 1 + return + } + + nb := impl.Ilaenv(1, "DORGLQ", " ", m, n, k, -1) + if lwork == -1 { + work[0] = float64(m * nb) + return + } + + switch { + case len(a) < (m-1)*lda+n: + panic(shortA) + case len(tau) < k: + panic(shortTau) + } + + nbmin := 2 // Minimum block size + var nx int // Crossover size from blocked to unblocked code + iws := m // Length of work needed + var ldwork int + if 1 < nb && nb < k { + nx = max(0, impl.Ilaenv(3, "DORGLQ", " ", m, n, k, -1)) + if nx < k { + ldwork = nb + iws = m * ldwork + if lwork < iws { + nb = lwork / m + ldwork = nb + nbmin = max(2, impl.Ilaenv(2, "DORGLQ", " ", m, n, k, -1)) + } + } + } + + var ki, kk int + if nbmin <= nb && nb < k && nx < k { + // The first kk rows are handled by the blocked method. 
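+		// (ki is the starting row index of the last block of the
+		// backward blocked sweep below; rows kk:m are first formed by
+		// the unblocked Dorgl2 call, then the blocks are applied from
+		// ki down to 0.)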
+ ki = ((k - nx - 1) / nb) * nb + kk = min(k, ki+nb) + for i := kk; i < m; i++ { + for j := 0; j < kk; j++ { + a[i*lda+j] = 0 + } + } + } + if kk < m { + // Perform the operation on columns kk to the end. + impl.Dorgl2(m-kk, n-kk, k-kk, a[kk*lda+kk:], lda, tau[kk:], work) + } + if kk > 0 { + // Perform the operation on column-blocks + for i := ki; i >= 0; i -= nb { + ib := min(nb, k-i) + if i+ib < m { + impl.Dlarft(lapack.Forward, lapack.RowWise, + n-i, ib, + a[i*lda+i:], lda, + tau[i:], + work, ldwork) + + impl.Dlarfb(blas.Right, blas.Trans, lapack.Forward, lapack.RowWise, + m-i-ib, n-i, ib, + a[i*lda+i:], lda, + work, ldwork, + a[(i+ib)*lda+i:], lda, + work[ib*ldwork:], ldwork) + } + impl.Dorgl2(ib, n-i, ib, a[i*lda+i:], lda, tau[i:], work) + for l := i; l < i+ib; l++ { + for j := 0; j < i; j++ { + a[l*lda+j] = 0 + } + } + } + } + work[0] = float64(iws) +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorgql.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgql.go new file mode 100644 index 0000000000..d5ef17f3b6 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgql.go @@ -0,0 +1,139 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/lapack" +) + +// Dorgql generates the m×n matrix Q with orthonormal columns defined as the +// last n columns of a product of k elementary reflectors of order m +// +// Q = H_{k-1} * ... * H_1 * H_0. +// +// It must hold that +// +// 0 <= k <= n <= m, +// +// and Dorgql will panic otherwise. +// +// On entry, the (n-k+i)-th column of A must contain the vector which defines +// the elementary reflector H_i, for i=0,...,k-1, and tau[i] must contain its +// scalar factor. On return, a contains the m×n matrix Q. +// +// tau must have length at least k, and Dorgql will panic otherwise. +// +// work must have length at least max(1,lwork), and lwork must be at least +// max(1,n), otherwise Dorgql will panic. For optimum performance lwork must +// be a sufficiently large multiple of n. +// +// If lwork == -1, instead of computing Dorgql the optimal work length is stored +// into work[0]. +// +// Dorgql is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dorgql(m, n, k int, a []float64, lda int, tau, work []float64, lwork int) { + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case n > m: + panic(nGTM) + case k < 0: + panic(kLT0) + case k > n: + panic(kGTN) + case lda < max(1, n): + panic(badLdA) + case lwork < max(1, n) && lwork != -1: + panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + } + + // Quick return if possible. + if n == 0 { + work[0] = 1 + return + } + + nb := impl.Ilaenv(1, "DORGQL", " ", m, n, k, -1) + if lwork == -1 { + work[0] = float64(n * nb) + return + } + + switch { + case len(a) < (m-1)*lda+n: + panic(shortA) + case len(tau) < k: + panic(shortTau) + } + + nbmin := 2 + var nx, ldwork int + iws := n + if 1 < nb && nb < k { + // Determine when to cross over from blocked to unblocked code. + nx = max(0, impl.Ilaenv(3, "DORGQL", " ", m, n, k, -1)) + if nx < k { + // Determine if workspace is large enough for blocked code. + iws = n * nb + if lwork < iws { + // Not enough workspace to use optimal nb: reduce nb and determine + // the minimum value of nb. 
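+				// (work is used as an n×nb matrix, so the
+				// largest block size that fits in lwork is
+				// lwork/n.)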
+ nb = lwork / n + nbmin = max(2, impl.Ilaenv(2, "DORGQL", " ", m, n, k, -1)) + } + ldwork = nb + } + } + + var kk int + if nbmin <= nb && nb < k && nx < k { + // Use blocked code after the first block. The last kk columns are handled + // by the block method. + kk = min(k, ((k-nx+nb-1)/nb)*nb) + + // Set A(m-kk:m, 0:n-kk) to zero. + for i := m - kk; i < m; i++ { + for j := 0; j < n-kk; j++ { + a[i*lda+j] = 0 + } + } + } + + // Use unblocked code for the first or only block. + impl.Dorg2l(m-kk, n-kk, k-kk, a, lda, tau, work) + if kk > 0 { + // Use blocked code. + for i := k - kk; i < k; i += nb { + ib := min(nb, k-i) + if n-k+i > 0 { + // Form the triangular factor of the block reflector + // H = H_{i+ib-1} * ... * H_{i+1} * H_i. + impl.Dlarft(lapack.Backward, lapack.ColumnWise, m-k+i+ib, ib, + a[n-k+i:], lda, tau[i:], work, ldwork) + + // Apply H to A[0:m-k+i+ib, 0:n-k+i] from the left. + impl.Dlarfb(blas.Left, blas.NoTrans, lapack.Backward, lapack.ColumnWise, + m-k+i+ib, n-k+i, ib, a[n-k+i:], lda, work, ldwork, + a, lda, work[ib*ldwork:], ldwork) + } + + // Apply H to rows 0:m-k+i+ib of current block. + impl.Dorg2l(m-k+i+ib, ib, ib, a[n-k+i:], lda, tau[i:], work) + + // Set rows m-k+i+ib:m of current block to zero. + for j := n - k + i; j < n-k+i+ib; j++ { + for l := m - k + i + ib; l < m; l++ { + a[l*lda+j] = 0 + } + } + } + } + work[0] = float64(iws) +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorgqr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgqr.go new file mode 100644 index 0000000000..a1e0fa8716 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgqr.go @@ -0,0 +1,136 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/lapack" +) + +// Dorgqr generates an m×n matrix Q with orthonormal columns defined by the +// product of elementary reflectors +// +// Q = H_0 * H_1 * ... * H_{k-1} +// +// as computed by Dgeqrf. +// Dorgqr is the blocked version of Dorg2r that makes greater use of level-3 BLAS +// routines. +// +// The length of tau must be k, and the length of work must be at least n. +// It also must be that 0 <= k <= n and 0 <= n <= m. +// +// work is temporary storage, and lwork specifies the usable memory length. At +// minimum, lwork >= n, and the amount of blocking is limited by the usable +// length. If lwork == -1, instead of computing Dorgqr the optimal work length +// is stored into work[0]. +// +// Dorgqr will panic if the conditions on input values are not met. +// +// Dorgqr is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dorgqr(m, n, k int, a []float64, lda int, tau, work []float64, lwork int) { + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case n > m: + panic(nGTM) + case k < 0: + panic(kLT0) + case k > n: + panic(kGTN) + case lda < max(1, n) && lwork != -1: + // Normally, we follow the reference and require the leading + // dimension to be always valid, even in case of workspace + // queries. However, if a caller provided a placeholder value + // for lda (and a) when doing a workspace query that didn't + // fulfill the condition here, it would cause a panic. This is + // exactly what Dgesvd does. 
+		panic(badLdA)
+	case lwork < max(1, n) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	if n == 0 {
+		work[0] = 1
+		return
+	}
+
+	nb := impl.Ilaenv(1, "DORGQR", " ", m, n, k, -1)
+	// work is treated as an n×nb matrix
+	if lwork == -1 {
+		work[0] = float64(n * nb)
+		return
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(tau) != k:
+		panic(badLenTau)
+	}
+
+	nbmin := 2 // Minimum block size
+	var nx int // Crossover size from blocked to unblocked code
+	iws := n   // Length of work needed
+	var ldwork int
+	if 1 < nb && nb < k {
+		nx = max(0, impl.Ilaenv(3, "DORGQR", " ", m, n, k, -1))
+		if nx < k {
+			ldwork = nb
+			iws = n * ldwork
+			if lwork < iws {
+				nb = lwork / n
+				ldwork = nb
+				nbmin = max(2, impl.Ilaenv(2, "DORGQR", " ", m, n, k, -1))
+			}
+		}
+	}
+	var ki, kk int
+	if nbmin <= nb && nb < k && nx < k {
+		// The first kk columns are handled by the blocked method.
+		ki = ((k - nx - 1) / nb) * nb
+		kk = min(k, ki+nb)
+		for i := 0; i < kk; i++ {
+			for j := kk; j < n; j++ {
+				a[i*lda+j] = 0
+			}
+		}
+	}
+	if kk < n {
+		// Perform the operation on columns kk to the end.
+		impl.Dorg2r(m-kk, n-kk, k-kk, a[kk*lda+kk:], lda, tau[kk:], work)
+	}
+	if kk > 0 {
+		// Perform the operation on column-blocks.
+		for i := ki; i >= 0; i -= nb {
+			ib := min(nb, k-i)
+			if i+ib < n {
+				impl.Dlarft(lapack.Forward, lapack.ColumnWise,
+					m-i, ib,
+					a[i*lda+i:], lda,
+					tau[i:],
+					work, ldwork)
+
+				impl.Dlarfb(blas.Left, blas.NoTrans, lapack.Forward, lapack.ColumnWise,
+					m-i, n-i-ib, ib,
+					a[i*lda+i:], lda,
+					work, ldwork,
+					a[i*lda+i+ib:], lda,
+					work[ib*ldwork:], ldwork)
+			}
+			impl.Dorg2r(m-i, ib, ib, a[i*lda+i:], lda, tau[i:i+ib], work)
+			// Set rows 0:i-1 of current block to zero.
+			for j := i; j < i+ib; j++ {
+				for l := 0; l < i; l++ {
+					a[l*lda+j] = 0
+				}
+			}
+		}
+	}
+	work[0] = float64(iws)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorgr2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgr2.go
new file mode 100644
index 0000000000..6f2790cb8f
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgr2.go
@@ -0,0 +1,83 @@
+// Copyright ©2021 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dorgr2 generates an m×n real matrix Q with orthonormal rows, which is defined
+// as the last m rows of a product of k elementary reflectors of order n
+//
+//	Q = H_0 * H_1 * ... * H_{k-1}
+//
+// as returned by Dgerqf.
+//
+// On entry, the (m-k+i)-th row of A must contain the vector which defines the
+// elementary reflector H_i, for i = 0,1,...,k-1, as returned by Dgerqf. On
+// return, A will contain the m×n matrix Q.
+//
+// The i-th element of tau must contain the scalar factor of the elementary
+// reflector H_i, as returned by Dgerqf.
+//
+// It must hold that
+//
+//	n >= m >= k >= 0,
+//
+// the length of tau must be k and the length of work must be m, otherwise
+// Dorgr2 will panic.
+//
+// Dorgr2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dorgr2(m, n, k int, a []float64, lda int, tau, work []float64) {
+	switch {
+	case k < 0:
+		panic(kLT0)
+	case m < k:
+		panic(kGTM)
+	case n < m:
+		panic(mGTN)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
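+	// (m == 0 means Q has no rows, so there is nothing to form.)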
+ if m == 0 { + return + } + + switch { + case len(tau) != k: + panic(badLenTau) + case len(a) < (m-1)*lda+n: + panic(shortA) + case len(work) < m: + panic(shortWork) + } + + // Initialise rows 0:m-k to rows of the unit matrix. + for l := 0; l < m-k; l++ { + row := a[l*lda : l*lda+n] + for j := range row { + row[j] = 0 + } + a[l*lda+n-m+l] = 1 + } + bi := blas64.Implementation() + for i := 0; i < k; i++ { + ii := m - k + i + + // Apply H_i to A[0:m-k+i+1, 0:n-k+i+1] from the right. + a[ii*lda+n-m+ii] = 1 + impl.Dlarf(blas.Right, ii, n-m+ii+1, a[ii*lda:], 1, tau[i], a, lda, work) + bi.Dscal(n-m+ii, -tau[i], a[ii*lda:], 1) + a[ii*lda+n-m+ii] = 1 - tau[i] + + // Set A[m-k+i, n-k+i:n] to zero. + for l := n - m + ii + 1; l < n; l++ { + a[ii*lda+l] = 0 + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorgtr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgtr.go new file mode 100644 index 0000000000..7021ae53d3 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgtr.go @@ -0,0 +1,106 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "gonum.org/v1/gonum/blas" + +// Dorgtr generates a real orthogonal matrix Q which is defined as the product +// of n-1 elementary reflectors of order n as returned by Dsytrd. +// +// The construction of Q depends on the value of uplo: +// +// Q = H_{n-1} * ... * H_1 * H_0 if uplo == blas.Upper +// Q = H_0 * H_1 * ... * H_{n-1} if uplo == blas.Lower +// +// where H_i is constructed from the elementary reflectors as computed by Dsytrd. +// See the documentation for Dsytrd for more information. +// +// tau must have length at least n-1, and Dorgtr will panic otherwise. +// +// work is temporary storage, and lwork specifies the usable memory length. At +// minimum, lwork >= max(1,n-1), and Dorgtr will panic otherwise. The amount of blocking +// is limited by the usable length. +// If lwork == -1, instead of computing Dorgtr the optimal work length is stored +// into work[0]. +// +// Dorgtr is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dorgtr(uplo blas.Uplo, n int, a []float64, lda int, tau, work []float64, lwork int) { + switch { + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + case lwork < max(1, n-1) && lwork != -1: + panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + } + + if n == 0 { + work[0] = 1 + return + } + + var nb int + if uplo == blas.Upper { + nb = impl.Ilaenv(1, "DORGQL", " ", n-1, n-1, n-1, -1) + } else { + nb = impl.Ilaenv(1, "DORGQR", " ", n-1, n-1, n-1, -1) + } + lworkopt := max(1, n-1) * nb + if lwork == -1 { + work[0] = float64(lworkopt) + return + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case len(tau) < n-1: + panic(shortTau) + } + + if uplo == blas.Upper { + // Q was determined by a call to Dsytrd with uplo == blas.Upper. + // Shift the vectors which define the elementary reflectors one column + // to the left, and set the last row and column of Q to those of the unit + // matrix. + for j := 0; j < n-1; j++ { + for i := 0; i < j; i++ { + a[i*lda+j] = a[i*lda+j+1] + } + a[(n-1)*lda+j] = 0 + } + for i := 0; i < n-1; i++ { + a[i*lda+n-1] = 0 + } + a[(n-1)*lda+n-1] = 1 + + // Generate Q[0:n-1, 0:n-1]. 
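+		// (With uplo == blas.Upper the reflectors from Dsytrd compose
+		// in QL order, Q = H_{n-1} * ... * H_0, so the leading
+		// (n-1)×(n-1) block is formed with Dorgql rather than Dorgqr.)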
+		impl.Dorgql(n-1, n-1, n-1, a, lda, tau, work, lwork)
+	} else {
+		// Q was determined by a call to Dsytrd with uplo == blas.Lower.
+		// Shift the vectors which define the elementary reflectors one column
+		// to the right, and set the first row and column of Q to those of the unit
+		// matrix.
+		for j := n - 1; j > 0; j-- {
+			a[j] = 0
+			for i := j + 1; i < n; i++ {
+				a[i*lda+j] = a[i*lda+j-1]
+			}
+		}
+		a[0] = 1
+		for i := 1; i < n; i++ {
+			a[i*lda] = 0
+		}
+		if n > 1 {
+			// Generate Q[1:n, 1:n].
+			impl.Dorgqr(n-1, n-1, n-1, a[lda+1:], lda, tau[:n-1], work, lwork)
+		}
+	}
+	work[0] = float64(lworkopt)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorm2r.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorm2r.go
new file mode 100644
index 0000000000..aea77a70d2
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorm2r.go
@@ -0,0 +1,103 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dorm2r multiplies a general matrix C by an orthogonal matrix from a QR factorization
+// determined by Dgeqrf.
+//
+//	C = Q * C   if side == blas.Left and trans == blas.NoTrans
+//	C = Qᵀ * C  if side == blas.Left and trans == blas.Trans
+//	C = C * Q   if side == blas.Right and trans == blas.NoTrans
+//	C = C * Qᵀ  if side == blas.Right and trans == blas.Trans
+//
+// If side == blas.Left, a is a matrix of size m×k, and if side == blas.Right
+// a is of size n×k.
+//
+// tau contains the Householder factors and must have length k and this function
+// will panic otherwise.
+//
+// work is temporary storage of length at least n if side == blas.Left
+// and at least m if side == blas.Right and this function will panic otherwise.
+//
+// Dorm2r is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dorm2r(side blas.Side, trans blas.Transpose, m, n, k int, a []float64, lda int, tau, c []float64, ldc int, work []float64) {
+	left := side == blas.Left
+	switch {
+	case !left && side != blas.Right:
+		panic(badSide)
+	case trans != blas.Trans && trans != blas.NoTrans:
+		panic(badTrans)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case left && k > m:
+		panic(kGTM)
+	case !left && k > n:
+		panic(kGTN)
+	case lda < max(1, k):
+		panic(badLdA)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
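+	// (m == 0 or n == 0 means C is empty, and k == 0 means Q is the
+	// identity; in either case C is unchanged.)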
+ if m == 0 || n == 0 || k == 0 { + return + } + + switch { + case left && len(a) < (m-1)*lda+k: + panic(shortA) + case !left && len(a) < (n-1)*lda+k: + panic(shortA) + case len(c) < (m-1)*ldc+n: + panic(shortC) + case len(tau) != k: + panic(badLenTau) + case left && len(work) < n: + panic(shortWork) + case !left && len(work) < m: + panic(shortWork) + } + + if left { + if trans == blas.NoTrans { + for i := k - 1; i >= 0; i-- { + aii := a[i*lda+i] + a[i*lda+i] = 1 + impl.Dlarf(side, m-i, n, a[i*lda+i:], lda, tau[i], c[i*ldc:], ldc, work) + a[i*lda+i] = aii + } + return + } + for i := 0; i < k; i++ { + aii := a[i*lda+i] + a[i*lda+i] = 1 + impl.Dlarf(side, m-i, n, a[i*lda+i:], lda, tau[i], c[i*ldc:], ldc, work) + a[i*lda+i] = aii + } + return + } + if trans == blas.NoTrans { + for i := 0; i < k; i++ { + aii := a[i*lda+i] + a[i*lda+i] = 1 + impl.Dlarf(side, m, n-i, a[i*lda+i:], lda, tau[i], c[i:], ldc, work) + a[i*lda+i] = aii + } + return + } + for i := k - 1; i >= 0; i-- { + aii := a[i*lda+i] + a[i*lda+i] = 1 + impl.Dlarf(side, m, n-i, a[i*lda+i:], lda, tau[i], c[i:], ldc, work) + a[i*lda+i] = aii + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dormbr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dormbr.go new file mode 100644 index 0000000000..8be7040c92 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dormbr.go @@ -0,0 +1,180 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/lapack" +) + +// Dormbr applies a multiplicative update to the matrix C based on a +// decomposition computed by Dgebrd. +// +// Dormbr overwrites the m×n matrix C with +// +// Q * C if vect == lapack.ApplyQ, side == blas.Left, and trans == blas.NoTrans +// C * Q if vect == lapack.ApplyQ, side == blas.Right, and trans == blas.NoTrans +// Qᵀ * C if vect == lapack.ApplyQ, side == blas.Left, and trans == blas.Trans +// C * Qᵀ if vect == lapack.ApplyQ, side == blas.Right, and trans == blas.Trans +// +// P * C if vect == lapack.ApplyP, side == blas.Left, and trans == blas.NoTrans +// C * P if vect == lapack.ApplyP, side == blas.Right, and trans == blas.NoTrans +// Pᵀ * C if vect == lapack.ApplyP, side == blas.Left, and trans == blas.Trans +// C * Pᵀ if vect == lapack.ApplyP, side == blas.Right, and trans == blas.Trans +// +// where P and Q are the orthogonal matrices determined by Dgebrd when reducing +// a matrix A to bidiagonal form: A = Q * B * Pᵀ. See Dgebrd for the +// definitions of Q and P. +// +// If vect == lapack.ApplyQ, A is assumed to have been an nq×k matrix, while if +// vect == lapack.ApplyP, A is assumed to have been a k×nq matrix. nq = m if +// side == blas.Left, while nq = n if side == blas.Right. +// +// tau must have length min(nq,k), and Dormbr will panic otherwise. tau contains +// the elementary reflectors to construct Q or P depending on the value of +// vect. +// +// work must have length at least max(1,lwork), and lwork must be either -1 or +// at least max(1,n) if side == blas.Left, and at least max(1,m) if side == +// blas.Right. For optimum performance lwork should be at least n*nb if side == +// blas.Left, and at least m*nb if side == blas.Right, where nb is the optimal +// block size. On return, work[0] will contain the optimal value of lwork. +// +// If lwork == -1, the function only calculates the optimal value of lwork and +// returns it in work[0]. 
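+//
+// An illustrative sketch (assumed here, not upstream documentation): after a
+// bidiagonal reduction of an m×n matrix held in a,
+//
+//	impl.Dgebrd(m, n, a, lda, d, e, tauQ, tauP, work, lwork)
+//
+// the update C ← Qᵀ * C of an m×nc matrix C (nc, c and ldc being the caller's)
+// would be applied with
+//
+//	impl.Dormbr(lapack.ApplyQ, blas.Left, blas.Trans, m, nc, n, a, lda, tauQ, c, ldc, work, lwork)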
+// +// Dormbr is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dormbr(vect lapack.ApplyOrtho, side blas.Side, trans blas.Transpose, m, n, k int, a []float64, lda int, tau, c []float64, ldc int, work []float64, lwork int) { + nq := n + nw := m + if side == blas.Left { + nq = m + nw = n + } + applyQ := vect == lapack.ApplyQ + switch { + case !applyQ && vect != lapack.ApplyP: + panic(badApplyOrtho) + case side != blas.Left && side != blas.Right: + panic(badSide) + case trans != blas.NoTrans && trans != blas.Trans: + panic(badTrans) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case k < 0: + panic(kLT0) + case applyQ && lda < max(1, min(nq, k)): + panic(badLdA) + case !applyQ && lda < max(1, nq): + panic(badLdA) + case ldc < max(1, n): + panic(badLdC) + case lwork < max(1, nw) && lwork != -1: + panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + } + + // Quick return if possible. + if m == 0 || n == 0 { + work[0] = 1 + return + } + + // The current implementation does not use opts, but a future change may + // use these options so construct them. + var opts string + if side == blas.Left { + opts = "L" + } else { + opts = "R" + } + if trans == blas.Trans { + opts += "T" + } else { + opts += "N" + } + var nb int + if applyQ { + if side == blas.Left { + nb = impl.Ilaenv(1, "DORMQR", opts, m-1, n, m-1, -1) + } else { + nb = impl.Ilaenv(1, "DORMQR", opts, m, n-1, n-1, -1) + } + } else { + if side == blas.Left { + nb = impl.Ilaenv(1, "DORMLQ", opts, m-1, n, m-1, -1) + } else { + nb = impl.Ilaenv(1, "DORMLQ", opts, m, n-1, n-1, -1) + } + } + lworkopt := max(1, nw) * nb + if lwork == -1 { + work[0] = float64(lworkopt) + return + } + + minnqk := min(nq, k) + switch { + case applyQ && len(a) < (nq-1)*lda+minnqk: + panic(shortA) + case !applyQ && len(a) < (minnqk-1)*lda+nq: + panic(shortA) + case len(tau) < minnqk: + panic(shortTau) + case len(c) < (m-1)*ldc+n: + panic(shortC) + } + + if applyQ { + // Change the operation to get Q depending on the size of the initial + // matrix to Dgebrd. The size matters due to the storage location of + // the off-diagonal elements. + if nq >= k { + impl.Dormqr(side, trans, m, n, k, a, lda, tau[:k], c, ldc, work, lwork) + } else if nq > 1 { + mi := m + ni := n - 1 + i1 := 0 + i2 := 1 + if side == blas.Left { + mi = m - 1 + ni = n + i1 = 1 + i2 = 0 + } + impl.Dormqr(side, trans, mi, ni, nq-1, a[lda:], lda, tau[:nq-1], c[i1*ldc+i2:], ldc, work, lwork) + } + work[0] = float64(lworkopt) + return + } + + transt := blas.Trans + if trans == blas.Trans { + transt = blas.NoTrans + } + + // Change the operation to get P depending on the size of the initial + // matrix to Dgebrd. The size matters due to the storage location of + // the off-diagonal elements. + if nq > k { + impl.Dormlq(side, transt, m, n, k, a, lda, tau, c, ldc, work, lwork) + } else if nq > 1 { + mi := m + ni := n - 1 + i1 := 0 + i2 := 1 + if side == blas.Left { + mi = m - 1 + ni = n + i1 = 1 + i2 = 0 + } + impl.Dormlq(side, transt, mi, ni, nq-1, a[1:], lda, tau, c[i1*ldc+i2:], ldc, work, lwork) + } + work[0] = float64(lworkopt) +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dormhr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dormhr.go new file mode 100644 index 0000000000..318a57adca --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dormhr.go @@ -0,0 +1,134 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package gonum + +import "gonum.org/v1/gonum/blas" + +// Dormhr multiplies an m×n general matrix C with an nq×nq orthogonal matrix Q +// +// Q * C if side == blas.Left and trans == blas.NoTrans, +// Qᵀ * C if side == blas.Left and trans == blas.Trans, +// C * Q if side == blas.Right and trans == blas.NoTrans, +// C * Qᵀ if side == blas.Right and trans == blas.Trans, +// +// where nq == m if side == blas.Left and nq == n if side == blas.Right. +// +// Q is defined implicitly as the product of ihi-ilo elementary reflectors, as +// returned by Dgehrd: +// +// Q = H_{ilo} H_{ilo+1} ... H_{ihi-1}. +// +// Q is equal to the identity matrix except in the submatrix +// Q[ilo+1:ihi+1,ilo+1:ihi+1]. +// +// ilo and ihi must have the same values as in the previous call of Dgehrd. It +// must hold that +// +// 0 <= ilo <= ihi < m if m > 0 and side == blas.Left, +// ilo = 0 and ihi = -1 if m = 0 and side == blas.Left, +// 0 <= ilo <= ihi < n if n > 0 and side == blas.Right, +// ilo = 0 and ihi = -1 if n = 0 and side == blas.Right. +// +// a and lda represent an m×m matrix if side == blas.Left and an n×n matrix if +// side == blas.Right. The matrix contains vectors which define the elementary +// reflectors, as returned by Dgehrd. +// +// tau contains the scalar factors of the elementary reflectors, as returned by +// Dgehrd. tau must have length m-1 if side == blas.Left and n-1 if side == +// blas.Right. +// +// c and ldc represent the m×n matrix C. On return, c is overwritten by the +// product with Q. +// +// work must have length at least max(1,lwork), and lwork must be at least +// max(1,n), if side == blas.Left, and max(1,m), if side == blas.Right. For +// optimum performance lwork should be at least n*nb if side == blas.Left and +// m*nb if side == blas.Right, where nb is the optimal block size. On return, +// work[0] will contain the optimal value of lwork. +// +// If lwork == -1, instead of performing Dormhr, only the optimal value of lwork +// will be stored in work[0]. +// +// If any requirement on input sizes is not met, Dormhr will panic. +// +// Dormhr is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dormhr(side blas.Side, trans blas.Transpose, m, n, ilo, ihi int, a []float64, lda int, tau, c []float64, ldc int, work []float64, lwork int) { + nq := n // The order of Q. + nw := m // The minimum length of work. + if side == blas.Left { + nq = m + nw = n + } + switch { + case side != blas.Left && side != blas.Right: + panic(badSide) + case trans != blas.NoTrans && trans != blas.Trans: + panic(badTrans) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case ilo < 0 || max(1, nq) <= ilo: + panic(badIlo) + case ihi < min(ilo, nq-1) || nq <= ihi: + panic(badIhi) + case lda < max(1, nq): + panic(badLdA) + case lwork < max(1, nw) && lwork != -1: + panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + } + + // Quick return if possible. 
+	if m == 0 || n == 0 {
+		work[0] = 1
+		return
+	}
+
+	nh := ihi - ilo
+	var nb int
+	if side == blas.Left {
+		opts := "LN"
+		if trans == blas.Trans {
+			opts = "LT"
+		}
+		nb = impl.Ilaenv(1, "DORMQR", opts, nh, n, nh, -1)
+	} else {
+		opts := "RN"
+		if trans == blas.Trans {
+			opts = "RT"
+		}
+		nb = impl.Ilaenv(1, "DORMQR", opts, m, nh, nh, -1)
+	}
+	lwkopt := max(1, nw) * nb
+	if lwork == -1 {
+		work[0] = float64(lwkopt)
+		return
+	}
+
+	if nh == 0 {
+		work[0] = 1
+		return
+	}
+
+	switch {
+	case len(a) < (nq-1)*lda+nq:
+		panic(shortA)
+	case len(c) < (m-1)*ldc+n:
+		panic(shortC)
+	case len(tau) != nq-1:
+		panic(badLenTau)
+	}
+
+	if side == blas.Left {
+		impl.Dormqr(side, trans, nh, n, nh, a[(ilo+1)*lda+ilo:], lda,
+			tau[ilo:ihi], c[(ilo+1)*ldc:], ldc, work, lwork)
+	} else {
+		impl.Dormqr(side, trans, m, nh, nh, a[(ilo+1)*lda+ilo:], lda,
+			tau[ilo:ihi], c[ilo+1:], ldc, work, lwork)
+	}
+	work[0] = float64(lwkopt)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorml2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorml2.go
new file mode 100644
index 0000000000..665e2102c8
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorml2.go
@@ -0,0 +1,104 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dorml2 multiplies a general matrix C by an orthogonal matrix from an LQ factorization
+// determined by Dgelqf.
+//
+//	C = Q * C   if side == blas.Left and trans == blas.NoTrans
+//	C = Qᵀ * C  if side == blas.Left and trans == blas.Trans
+//	C = C * Q   if side == blas.Right and trans == blas.NoTrans
+//	C = C * Qᵀ  if side == blas.Right and trans == blas.Trans
+//
+// If side == blas.Left, a is a matrix of size k×m, and if side == blas.Right
+// a is of size k×n.
+//
+// tau contains the Householder factors and is of length at least k and this function will
+// panic otherwise.
+//
+// work is temporary storage of length at least n if side == blas.Left
+// and at least m if side == blas.Right and this function will panic otherwise.
+//
+// Dorml2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dorml2(side blas.Side, trans blas.Transpose, m, n, k int, a []float64, lda int, tau, c []float64, ldc int, work []float64) {
+	left := side == blas.Left
+	switch {
+	case !left && side != blas.Right:
+		panic(badSide)
+	case trans != blas.Trans && trans != blas.NoTrans:
+		panic(badTrans)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case left && k > m:
+		panic(kGTM)
+	case !left && k > n:
+		panic(kGTN)
+	case left && lda < max(1, m):
+		panic(badLdA)
+	case !left && lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 || k == 0 {
+		return
+	}
+
+	switch {
+	case left && len(a) < (k-1)*lda+m:
+		panic(shortA)
+	case !left && len(a) < (k-1)*lda+n:
+		panic(shortA)
+	case len(tau) < k:
+		panic(shortTau)
+	case len(c) < (m-1)*ldc+n:
+		panic(shortC)
+	case left && len(work) < n:
+		panic(shortWork)
+	case !left && len(work) < m:
+		panic(shortWork)
+	}
+
+	notrans := trans == blas.NoTrans
+	switch {
+	case left && notrans:
+		for i := 0; i < k; i++ {
+			aii := a[i*lda+i]
+			a[i*lda+i] = 1
+			impl.Dlarf(side, m-i, n, a[i*lda+i:], 1, tau[i], c[i*ldc:], ldc, work)
+			a[i*lda+i] = aii
+		}
+
+	case left && !notrans:
+		for i := k - 1; i >= 0; i-- {
+			aii := a[i*lda+i]
+			a[i*lda+i] = 1
+			impl.Dlarf(side, m-i, n, a[i*lda+i:], 1, tau[i], c[i*ldc:], ldc, work)
+			a[i*lda+i] = aii
+		}
+
+	case !left && notrans:
+		for i := k - 1; i >= 0; i-- {
+			aii := a[i*lda+i]
+			a[i*lda+i] = 1
+			impl.Dlarf(side, m, n-i, a[i*lda+i:], 1, tau[i], c[i:], ldc, work)
+			a[i*lda+i] = aii
+		}
+
+	case !left && !notrans:
+		for i := 0; i < k; i++ {
+			aii := a[i*lda+i]
+			a[i*lda+i] = 1
+			impl.Dlarf(side, m, n-i, a[i*lda+i:], 1, tau[i], c[i:], ldc, work)
+			a[i*lda+i] = aii
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dormlq.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dormlq.go
new file mode 100644
index 0000000000..37b499739a
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dormlq.go
@@ -0,0 +1,176 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dormlq multiplies the matrix C by the orthogonal matrix Q defined by the
+// slices a and tau. A and tau are as returned from Dgelqf.
+//
+//	C = Q * C   if side == blas.Left and trans == blas.NoTrans
+//	C = Qᵀ * C  if side == blas.Left and trans == blas.Trans
+//	C = C * Q   if side == blas.Right and trans == blas.NoTrans
+//	C = C * Qᵀ  if side == blas.Right and trans == blas.Trans
+//
+// If side == blas.Left, A is a matrix of size k×m, and if side == blas.Right
+// A is of size k×n. This uses a blocked algorithm.
+//
+// work is temporary storage, and lwork specifies the usable memory length.
+// At minimum, lwork >= n if side == blas.Left and lwork >= m if side == blas.Right,
+// and this function will panic otherwise.
+// Dormlq uses a block algorithm, but the block size is limited
+// by the temporary space available. If lwork == -1, instead of performing Dormlq,
+// the optimal work length will be stored into work[0].
+//
+// tau contains the Householder scales and must have length at least k, and
+// this function will panic otherwise.
+func (impl Implementation) Dormlq(side blas.Side, trans blas.Transpose, m, n, k int, a []float64, lda int, tau, c []float64, ldc int, work []float64, lwork int) {
+	left := side == blas.Left
+	nw := m
+	if left {
+		nw = n
+	}
+	switch {
+	case !left && side != blas.Right:
+		panic(badSide)
+	case trans != blas.Trans && trans != blas.NoTrans:
+		panic(badTrans)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case left && k > m:
+		panic(kGTM)
+	case !left && k > n:
+		panic(kGTN)
+	case left && lda < max(1, m):
+		panic(badLdA)
+	case !left && lda < max(1, n):
+		panic(badLdA)
+	case lwork < max(1, nw) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+ if m == 0 || n == 0 || k == 0 { + work[0] = 1 + return + } + + const ( + nbmax = 64 + ldt = nbmax + tsize = nbmax * ldt + ) + opts := string(side) + string(trans) + nb := min(nbmax, impl.Ilaenv(1, "DORMLQ", opts, m, n, k, -1)) + lworkopt := max(1, nw)*nb + tsize + if lwork == -1 { + work[0] = float64(lworkopt) + return + } + + switch { + case left && len(a) < (k-1)*lda+m: + panic(shortA) + case !left && len(a) < (k-1)*lda+n: + panic(shortA) + case len(tau) < k: + panic(shortTau) + case len(c) < (m-1)*ldc+n: + panic(shortC) + } + + nbmin := 2 + if 1 < nb && nb < k { + iws := nw*nb + tsize + if lwork < iws { + nb = (lwork - tsize) / nw + nbmin = max(2, impl.Ilaenv(2, "DORMLQ", opts, m, n, k, -1)) + } + } + if nb < nbmin || k <= nb { + // Call unblocked code. + impl.Dorml2(side, trans, m, n, k, a, lda, tau, c, ldc, work) + work[0] = float64(lworkopt) + return + } + + t := work[:tsize] + wrk := work[tsize:] + ldwrk := nb + + notrans := trans == blas.NoTrans + transt := blas.NoTrans + if notrans { + transt = blas.Trans + } + + switch { + case left && notrans: + for i := 0; i < k; i += nb { + ib := min(nb, k-i) + impl.Dlarft(lapack.Forward, lapack.RowWise, m-i, ib, + a[i*lda+i:], lda, + tau[i:], + t, ldt) + impl.Dlarfb(side, transt, lapack.Forward, lapack.RowWise, m-i, n, ib, + a[i*lda+i:], lda, + t, ldt, + c[i*ldc:], ldc, + wrk, ldwrk) + } + + case left && !notrans: + for i := ((k - 1) / nb) * nb; i >= 0; i -= nb { + ib := min(nb, k-i) + impl.Dlarft(lapack.Forward, lapack.RowWise, m-i, ib, + a[i*lda+i:], lda, + tau[i:], + t, ldt) + impl.Dlarfb(side, transt, lapack.Forward, lapack.RowWise, m-i, n, ib, + a[i*lda+i:], lda, + t, ldt, + c[i*ldc:], ldc, + wrk, ldwrk) + } + + case !left && notrans: + for i := ((k - 1) / nb) * nb; i >= 0; i -= nb { + ib := min(nb, k-i) + impl.Dlarft(lapack.Forward, lapack.RowWise, n-i, ib, + a[i*lda+i:], lda, + tau[i:], + t, ldt) + impl.Dlarfb(side, transt, lapack.Forward, lapack.RowWise, m, n-i, ib, + a[i*lda+i:], lda, + t, ldt, + c[i:], ldc, + wrk, ldwrk) + } + + case !left && !notrans: + for i := 0; i < k; i += nb { + ib := min(nb, k-i) + impl.Dlarft(lapack.Forward, lapack.RowWise, n-i, ib, + a[i*lda+i:], lda, + tau[i:], + t, ldt) + impl.Dlarfb(side, transt, lapack.Forward, lapack.RowWise, m, n-i, ib, + a[i*lda+i:], lda, + t, ldt, + c[i:], ldc, + wrk, ldwrk) + } + } + work[0] = float64(lworkopt) +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dormqr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dormqr.go new file mode 100644 index 0000000000..c1e5668be5 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dormqr.go @@ -0,0 +1,180 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/lapack" +) + +// Dormqr multiplies an m×n matrix C by an orthogonal matrix Q as +// +// C = Q * C if side == blas.Left and trans == blas.NoTrans, +// C = Qᵀ * C if side == blas.Left and trans == blas.Trans, +// C = C * Q if side == blas.Right and trans == blas.NoTrans, +// C = C * Qᵀ if side == blas.Right and trans == blas.Trans, +// +// where Q is defined as the product of k elementary reflectors +// +// Q = H_0 * H_1 * ... * H_{k-1}. +// +// If side == blas.Left, A is an m×k matrix and 0 <= k <= m. +// If side == blas.Right, A is an n×k matrix and 0 <= k <= n. 
+// The ith column of A contains the vector which defines the elementary
+// reflector H_i and tau[i] contains its scalar factor. tau must have length k
+// and Dormqr will panic otherwise. Dgeqrf returns A and tau in the required
+// form.
+//
+// work must have length at least max(1,lwork), and lwork must be at least n if
+// side == blas.Left and at least m if side == blas.Right, otherwise Dormqr will
+// panic.
+//
+// work is temporary storage, and lwork specifies the usable memory length. At
+// minimum, lwork >= n if side == blas.Left and lwork >= m if side ==
+// blas.Right, and this function will panic otherwise. Larger values of lwork
+// will generally give better performance. On return, work[0] will contain the
+// optimal value of lwork.
+//
+// If lwork is -1, instead of performing Dormqr, the optimal workspace size will
+// be stored into work[0].
+func (impl Implementation) Dormqr(side blas.Side, trans blas.Transpose, m, n, k int, a []float64, lda int, tau, c []float64, ldc int, work []float64, lwork int) {
+	left := side == blas.Left
+	nq := n
+	nw := m
+	if left {
+		nq = m
+		nw = n
+	}
+	switch {
+	case !left && side != blas.Right:
+		panic(badSide)
+	case trans != blas.NoTrans && trans != blas.Trans:
+		panic(badTrans)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case left && k > m:
+		panic(kGTM)
+	case !left && k > n:
+		panic(kGTN)
+	case lda < max(1, k):
+		panic(badLdA)
+	case ldc < max(1, n):
+		panic(badLdC)
+	case lwork < max(1, nw) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 || k == 0 {
+		work[0] = 1
+		return
+	}
+
+	const (
+		nbmax = 64
+		ldt   = nbmax
+		tsize = nbmax * ldt
+	)
+	opts := string(side) + string(trans)
+	nb := min(nbmax, impl.Ilaenv(1, "DORMQR", opts, m, n, k, -1))
+	lworkopt := max(1, nw)*nb + tsize
+	if lwork == -1 {
+		work[0] = float64(lworkopt)
+		return
+	}
+
+	switch {
+	case len(a) < (nq-1)*lda+k:
+		panic(shortA)
+	case len(tau) != k:
+		panic(badLenTau)
+	case len(c) < (m-1)*ldc+n:
+		panic(shortC)
+	}
+
+	nbmin := 2
+	if 1 < nb && nb < k {
+		if lwork < nw*nb+tsize {
+			nb = (lwork - tsize) / nw
+			nbmin = max(2, impl.Ilaenv(2, "DORMQR", opts, m, n, k, -1))
+		}
+	}
+
+	if nb < nbmin || k <= nb {
+		// Call unblocked code.
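+		// (Either the workspace caps nb below nbmin or k fits in a
+		// single block; the unblocked Dorm2r then performs the whole
+		// update.)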
+ impl.Dorm2r(side, trans, m, n, k, a, lda, tau, c, ldc, work) + work[0] = float64(lworkopt) + return + } + + var ( + ldwork = nb + notrans = trans == blas.NoTrans + ) + switch { + case left && notrans: + for i := ((k - 1) / nb) * nb; i >= 0; i -= nb { + ib := min(nb, k-i) + impl.Dlarft(lapack.Forward, lapack.ColumnWise, m-i, ib, + a[i*lda+i:], lda, + tau[i:], + work[:tsize], ldt) + impl.Dlarfb(side, trans, lapack.Forward, lapack.ColumnWise, m-i, n, ib, + a[i*lda+i:], lda, + work[:tsize], ldt, + c[i*ldc:], ldc, + work[tsize:], ldwork) + } + + case left && !notrans: + for i := 0; i < k; i += nb { + ib := min(nb, k-i) + impl.Dlarft(lapack.Forward, lapack.ColumnWise, m-i, ib, + a[i*lda+i:], lda, + tau[i:], + work[:tsize], ldt) + impl.Dlarfb(side, trans, lapack.Forward, lapack.ColumnWise, m-i, n, ib, + a[i*lda+i:], lda, + work[:tsize], ldt, + c[i*ldc:], ldc, + work[tsize:], ldwork) + } + + case !left && notrans: + for i := 0; i < k; i += nb { + ib := min(nb, k-i) + impl.Dlarft(lapack.Forward, lapack.ColumnWise, n-i, ib, + a[i*lda+i:], lda, + tau[i:], + work[:tsize], ldt) + impl.Dlarfb(side, trans, lapack.Forward, lapack.ColumnWise, m, n-i, ib, + a[i*lda+i:], lda, + work[:tsize], ldt, + c[i:], ldc, + work[tsize:], ldwork) + } + + case !left && !notrans: + for i := ((k - 1) / nb) * nb; i >= 0; i -= nb { + ib := min(nb, k-i) + impl.Dlarft(lapack.Forward, lapack.ColumnWise, n-i, ib, + a[i*lda+i:], lda, + tau[i:], + work[:tsize], ldt) + impl.Dlarfb(side, trans, lapack.Forward, lapack.ColumnWise, m, n-i, ib, + a[i*lda+i:], lda, + work[:tsize], ldt, + c[i:], ldc, + work[tsize:], ldwork) + } + } + work[0] = float64(lworkopt) +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dormr2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dormr2.go new file mode 100644 index 0000000000..59d4d4f17e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dormr2.go @@ -0,0 +1,105 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "gonum.org/v1/gonum/blas" + +// Dormr2 multiplies a general matrix C by an orthogonal matrix from a RQ factorization +// determined by Dgerqf. +// +// C = Q * C if side == blas.Left and trans == blas.NoTrans +// C = Qᵀ * C if side == blas.Left and trans == blas.Trans +// C = C * Q if side == blas.Right and trans == blas.NoTrans +// C = C * Qᵀ if side == blas.Right and trans == blas.Trans +// +// If side == blas.Left, a is a matrix of size k×m, and if side == blas.Right +// a is of size k×n. +// +// tau contains the Householder factors and is of length at least k and this function +// will panic otherwise. +// +// work is temporary storage of length at least n if side == blas.Left +// and at least m if side == blas.Right and this function will panic otherwise. +// +// Dormr2 is an internal routine. It is exported for testing purposes. 
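+//
+// An illustrative pairing (assumed here, not upstream documentation): for an
+// m×n matrix factored as A = R * Q by Dgerqf with m <= n (so k = m), Q can be
+// applied from the left to an n×nc matrix C (nc, c and ldc being the caller's)
+// with
+//
+//	impl.Dgerqf(m, n, a, lda, tau, work, lwork)
+//	impl.Dormr2(blas.Left, blas.NoTrans, n, nc, m, a, lda, tau, c, ldc, work2)
+//
+// where len(work2) >= nc.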
+func (impl Implementation) Dormr2(side blas.Side, trans blas.Transpose, m, n, k int, a []float64, lda int, tau, c []float64, ldc int, work []float64) { + left := side == blas.Left + nq := n + nw := m + if left { + nq = m + nw = n + } + switch { + case !left && side != blas.Right: + panic(badSide) + case trans != blas.NoTrans && trans != blas.Trans: + panic(badTrans) + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case k < 0: + panic(kLT0) + case left && k > m: + panic(kGTM) + case !left && k > n: + panic(kGTN) + case lda < max(1, nq): + panic(badLdA) + case ldc < max(1, n): + panic(badLdC) + } + + // Quick return if possible. + if m == 0 || n == 0 || k == 0 { + return + } + + switch { + case len(a) < (k-1)*lda+nq: + panic(shortA) + case len(tau) < k: + panic(shortTau) + case len(c) < (m-1)*ldc+n: + panic(shortC) + case len(work) < nw: + panic(shortWork) + } + + if left { + if trans == blas.NoTrans { + for i := k - 1; i >= 0; i-- { + aii := a[i*lda+(m-k+i)] + a[i*lda+(m-k+i)] = 1 + impl.Dlarf(side, m-k+i+1, n, a[i*lda:], 1, tau[i], c, ldc, work) + a[i*lda+(m-k+i)] = aii + } + return + } + for i := 0; i < k; i++ { + aii := a[i*lda+(m-k+i)] + a[i*lda+(m-k+i)] = 1 + impl.Dlarf(side, m-k+i+1, n, a[i*lda:], 1, tau[i], c, ldc, work) + a[i*lda+(m-k+i)] = aii + } + return + } + if trans == blas.NoTrans { + for i := 0; i < k; i++ { + aii := a[i*lda+(n-k+i)] + a[i*lda+(n-k+i)] = 1 + impl.Dlarf(side, m, n-k+i+1, a[i*lda:], 1, tau[i], c, ldc, work) + a[i*lda+(n-k+i)] = aii + } + return + } + for i := k - 1; i >= 0; i-- { + aii := a[i*lda+(n-k+i)] + a[i*lda+(n-k+i)] = 1 + impl.Dlarf(side, m, n-k+i+1, a[i*lda:], 1, tau[i], c, ldc, work) + a[i*lda+(n-k+i)] = aii + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpbcon.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpbcon.go new file mode 100644 index 0000000000..0ed63e62dd --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpbcon.go @@ -0,0 +1,111 @@ +// Copyright ©2019 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dpbcon returns an estimate of the reciprocal of the condition number (in the +// 1-norm) of an n×n symmetric positive definite band matrix using the Cholesky +// factorization +// +// A = Uᵀ*U if uplo == blas.Upper +// A = L*Lᵀ if uplo == blas.Lower +// +// computed by Dpbtrf. The estimate is obtained for norm(inv(A)), and the +// reciprocal of the condition number is computed as +// +// rcond = 1 / (anorm * norm(inv(A))). +// +// The length of work must be at least 3*n and the length of iwork must be at +// least n. +func (impl Implementation) Dpbcon(uplo blas.Uplo, n, kd int, ab []float64, ldab int, anorm float64, work []float64, iwork []int) (rcond float64) { + switch { + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case n < 0: + panic(nLT0) + case kd < 0: + panic(kdLT0) + case ldab < kd+1: + panic(badLdA) + case anorm < 0: + panic(badNorm) + } + + // Quick return if possible. + if n == 0 { + return 1 + } + + switch { + case len(ab) < (n-1)*ldab+kd+1: + panic(shortAB) + case len(work) < 3*n: + panic(shortWork) + case len(iwork) < n: + panic(shortIWork) + } + + // Quick return if possible. + if anorm == 0 { + return 0 + } + + const smlnum = dlamchS + + var ( + ainvnm float64 + kase int + isave [3]int + normin bool + + // Denote work slices. 
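+		// (x is the vector iterated on by the norm estimator Dlacn2,
+		// v is the estimator's workspace, and cnorm caches column
+		// norms for the scaled triangular solves in Dlatbs.)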
+		x     = work[:n]
+		v     = work[n : 2*n]
+		cnorm = work[2*n : 3*n]
+	)
+	// Estimate the 1-norm of the inverse.
+	bi := blas64.Implementation()
+	for {
+		ainvnm, kase = impl.Dlacn2(n, v, x, iwork, ainvnm, kase, &isave)
+		if kase == 0 {
+			break
+		}
+		var op1, op2 blas.Transpose
+		if uplo == blas.Upper {
+			// Multiply x by inv(Uᵀ),
+			op1 = blas.Trans
+			// then by inv(U).
+			op2 = blas.NoTrans
+		} else {
+			// Multiply x by inv(L),
+			op1 = blas.NoTrans
+			// then by inv(Lᵀ).
+			op2 = blas.Trans
+		}
+		scaleL := impl.Dlatbs(uplo, op1, blas.NonUnit, normin, n, kd, ab, ldab, x, cnorm)
+		normin = true
+		scaleU := impl.Dlatbs(uplo, op2, blas.NonUnit, normin, n, kd, ab, ldab, x, cnorm)
+		// Multiply x by 1/scale if doing so will not cause overflow.
+		scale := scaleL * scaleU
+		if scale != 1 {
+			ix := bi.Idamax(n, x, 1)
+			if scale < math.Abs(x[ix])*smlnum || scale == 0 {
+				return 0
+			}
+			impl.Drscl(n, scale, x, 1)
+		}
+	}
+	if ainvnm == 0 {
+		return 0
+	}
+	// Return the estimate of the reciprocal condition number.
+	return (1 / ainvnm) / anorm
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpbtf2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpbtf2.go
new file mode 100644
index 0000000000..8150e56802
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpbtf2.go
@@ -0,0 +1,114 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dpbtf2 computes the Cholesky factorization of a symmetric positive banded
+// matrix ab. The matrix ab is n×n with kd diagonal bands. The Cholesky
+// factorization computed is
+//
+//	A = Uᵀ * U  if ul == blas.Upper
+//	A = L * Lᵀ  if ul == blas.Lower
+//
+// ul also specifies the storage of ab. If ul == blas.Upper, then
+// ab is stored as an upper-triangular banded matrix with kd super-diagonals,
+// and if ul == blas.Lower, ab is stored as a lower-triangular banded matrix
+// with kd sub-diagonals. On exit, the banded matrix U or L is stored in-place
+// into ab depending on the value of ul. Dpbtf2 returns whether the factorization
+// was successfully completed.
+//
+// The band storage scheme is illustrated below when n = 6, and kd = 2.
+// The resulting Cholesky decomposition is stored in the same elements as the
+// input band matrix (a11 becomes u11 or l11, etc.).
+//
+//	ul = blas.Upper
+//	a11 a12 a13
+//	a22 a23 a24
+//	a33 a34 a35
+//	a44 a45 a46
+//	a55 a56  *
+//	a66  *   *
+//
+//	ul = blas.Lower
+//	 *   *  a11
+//	 *  a21 a22
+//	a31 a32 a33
+//	a42 a43 a44
+//	a53 a54 a55
+//	a64 a65 a66
+//
+// Dpbtf2 is the unblocked version of the algorithm, see Dpbtrf for the blocked
+// version.
+//
+// Dpbtf2 is an internal routine, exported for testing purposes.
+func (Implementation) Dpbtf2(uplo blas.Uplo, n, kd int, ab []float64, ldab int) (ok bool) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case kd < 0:
+		panic(kdLT0)
+	case ldab < kd+1:
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return true
+	}
+
+	if len(ab) < (n-1)*ldab+kd+1 {
+		panic(shortAB)
+	}
+
+	bi := blas64.Implementation()
+
+	kld := max(1, ldab-1)
+	if uplo == blas.Upper {
+		// Compute the Cholesky factorization A = Uᵀ * U.
+		for j := 0; j < n; j++ {
+			// Compute U(j,j) and test for non-positive-definiteness.
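+			// In row-major upper band storage the diagonal element
+			// A[j,j] is the first entry of row j of ab, i.e. ab[j*ldab].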
+ ajj := ab[j*ldab] + if ajj <= 0 { + return false + } + ajj = math.Sqrt(ajj) + ab[j*ldab] = ajj + // Compute elements j+1:j+kn of row j and update the trailing submatrix + // within the band. + kn := min(kd, n-j-1) + if kn > 0 { + bi.Dscal(kn, 1/ajj, ab[j*ldab+1:], 1) + bi.Dsyr(blas.Upper, kn, -1, ab[j*ldab+1:], 1, ab[(j+1)*ldab:], kld) + } + } + return true + } + // Compute the Cholesky factorization A = L * Lᵀ. + for j := 0; j < n; j++ { + // Compute L(j,j) and test for non-positive-definiteness. + ajj := ab[j*ldab+kd] + if ajj <= 0 { + return false + } + ajj = math.Sqrt(ajj) + ab[j*ldab+kd] = ajj + // Compute elements j+1:j+kn of column j and update the trailing submatrix + // within the band. + kn := min(kd, n-j-1) + if kn > 0 { + bi.Dscal(kn, 1/ajj, ab[(j+1)*ldab+kd-1:], kld) + bi.Dsyr(blas.Lower, kn, -1, ab[(j+1)*ldab+kd-1:], kld, ab[(j+1)*ldab+kd:], kld) + } + } + return true +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpbtrf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpbtrf.go new file mode 100644 index 0000000000..12cdfc0fab --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpbtrf.go @@ -0,0 +1,216 @@ +// Copyright ©2019 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dpbtrf computes the Cholesky factorization of an n×n symmetric positive +// definite band matrix +// +// A = Uᵀ * U if uplo == blas.Upper +// A = L * Lᵀ if uplo == blas.Lower +// +// where U is an upper triangular band matrix and L is lower triangular. kd is +// the number of super- or sub-diagonals of A. +// +// The band storage scheme is illustrated below when n = 6 and kd = 2. Elements +// marked * are not used by the function. +// +// uplo == blas.Upper +// On entry: On return: +// a00 a01 a02 u00 u01 u02 +// a11 a12 a13 u11 u12 u13 +// a22 a23 a24 u22 u23 u24 +// a33 a34 a35 u33 u34 u35 +// a44 a45 * u44 u45 * +// a55 * * u55 * * +// +// uplo == blas.Lower +// On entry: On return: +// * * a00 * * l00 +// * a10 a11 * l10 l11 +// a20 a21 a22 l20 l21 l22 +// a31 a32 a33 l31 l32 l33 +// a42 a43 a44 l42 l43 l44 +// a53 a54 a55 l53 l54 l55 +func (impl Implementation) Dpbtrf(uplo blas.Uplo, n, kd int, ab []float64, ldab int) (ok bool) { + const nbmax = 32 + + switch { + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case n < 0: + panic(nLT0) + case kd < 0: + panic(kdLT0) + case ldab < kd+1: + panic(badLdA) + } + + // Quick return if possible. + if n == 0 { + return true + } + + if len(ab) < (n-1)*ldab+kd+1 { + panic(shortAB) + } + + opts := string(blas.Upper) + if uplo == blas.Lower { + opts = string(blas.Lower) + } + nb := impl.Ilaenv(1, "DPBTRF", opts, n, kd, -1, -1) + // The block size must not exceed the semi-bandwidth kd, and must not + // exceed the limit set by the size of the local array work. + nb = min(nb, nbmax) + + if nb <= 1 || kd < nb { + // Use unblocked code. + return impl.Dpbtf2(uplo, n, kd, ab, ldab) + } + + // Use blocked code. + ldwork := nb + work := make([]float64, nb*ldwork) + bi := blas64.Implementation() + if uplo == blas.Upper { + // Compute the Cholesky factorization of a symmetric band + // matrix, given the upper triangle of the matrix in band + // storage. + + // Process the band matrix one diagonal block at a time. + for i := 0; i < n; i += nb { + ib := min(nb, n-i) + // Factorize the diagonal block. 
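+			// Viewed with leading dimension ldab-1, the ib×ib diagonal
+			// block of the band becomes a dense triangular matrix, so
+			// the unblocked Dpotf2 can factor it in place.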
+			ok := impl.Dpotf2(uplo, ib, ab[i*ldab:], ldab-1)
+			if !ok {
+				return false
+			}
+			if i+ib >= n {
+				continue
+			}
+			// Update the relevant part of the trailing submatrix.
+			// If A11 denotes the diagonal block which has just been
+			// factorized, then we need to update the remaining
+			// blocks in the diagram:
+			//
+			//	A11 A12 A13
+			//	    A22 A23
+			//	        A33
+			//
+			// The numbers of rows and columns in the partitioning
+			// are ib, i2, i3 respectively. The blocks A12, A22 and
+			// A23 are empty if ib = kd. The upper triangle of A13
+			// lies outside the band.
+			i2 := min(kd-ib, n-i-ib)
+			if i2 > 0 {
+				// Update A12.
+				bi.Dtrsm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, ib, i2,
+					1, ab[i*ldab:], ldab-1, ab[i*ldab+ib:], ldab-1)
+				// Update A22.
+				bi.Dsyrk(blas.Upper, blas.Trans, i2, ib,
+					-1, ab[i*ldab+ib:], ldab-1, 1, ab[(i+ib)*ldab:], ldab-1)
+			}
+			i3 := min(ib, n-i-kd)
+			if i3 > 0 {
+				// Copy the lower triangle of A13 into the work array.
+				for ii := 0; ii < ib; ii++ {
+					for jj := 0; jj <= min(ii, i3-1); jj++ {
+						work[ii*ldwork+jj] = ab[(i+ii)*ldab+kd-ii+jj]
+					}
+				}
+				// Update A13 (in the work array).
+				bi.Dtrsm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, ib, i3,
+					1, ab[i*ldab:], ldab-1, work, ldwork)
+				// Update A23.
+				if i2 > 0 {
+					bi.Dgemm(blas.Trans, blas.NoTrans, i2, i3, ib,
+						-1, ab[i*ldab+ib:], ldab-1, work, ldwork,
+						1, ab[(i+ib)*ldab+kd-ib:], ldab-1)
+				}
+				// Update A33.
+				bi.Dsyrk(blas.Upper, blas.Trans, i3, ib,
+					-1, work, ldwork, 1, ab[(i+kd)*ldab:], ldab-1)
+				// Copy the lower triangle of A13 back into place.
+				for ii := 0; ii < ib; ii++ {
+					for jj := 0; jj <= min(ii, i3-1); jj++ {
+						ab[(i+ii)*ldab+kd-ii+jj] = work[ii*ldwork+jj]
+					}
+				}
+			}
+		}
+	} else {
+		// Compute the Cholesky factorization of a symmetric band
+		// matrix, given the lower triangle of the matrix in band
+		// storage.
+
+		// Process the band matrix one diagonal block at a time.
+		for i := 0; i < n; i += nb {
+			ib := min(nb, n-i)
+			// Factorize the diagonal block.
+			ok := impl.Dpotf2(uplo, ib, ab[i*ldab+kd:], ldab-1)
+			if !ok {
+				return false
+			}
+			if i+ib >= n {
+				continue
+			}
+			// Update the relevant part of the trailing submatrix.
+			// If A11 denotes the diagonal block which has just been
+			// factorized, then we need to update the remaining
+			// blocks in the diagram:
+			//
+			//	A11
+			//	A21 A22
+			//	A31 A32 A33
+			//
+			// The numbers of rows and columns in the partitioning
+			// are ib, i2, i3 respectively. The blocks A21, A22 and
+			// A32 are empty if ib = kd. The lower triangle of A31
+			// lies outside the band.
+			i2 := min(kd-ib, n-i-ib)
+			if i2 > 0 {
+				// Update A21.
+				bi.Dtrsm(blas.Right, blas.Lower, blas.Trans, blas.NonUnit, i2, ib,
+					1, ab[i*ldab+kd:], ldab-1, ab[(i+ib)*ldab+kd-ib:], ldab-1)
+				// Update A22.
+				bi.Dsyrk(blas.Lower, blas.NoTrans, i2, ib,
+					-1, ab[(i+ib)*ldab+kd-ib:], ldab-1, 1, ab[(i+ib)*ldab+kd:], ldab-1)
+			}
+			i3 := min(ib, n-i-kd)
+			if i3 > 0 {
+				// Copy the upper triangle of A31 into the work array.
+				for ii := 0; ii < i3; ii++ {
+					for jj := ii; jj < ib; jj++ {
+						work[ii*ldwork+jj] = ab[(ii+i+kd)*ldab+jj-ii]
+					}
+				}
+				// Update A31 (in the work array).
+				bi.Dtrsm(blas.Right, blas.Lower, blas.Trans, blas.NonUnit, i3, ib,
+					1, ab[i*ldab+kd:], ldab-1, work, ldwork)
+				// Update A32.
+				if i2 > 0 {
+					bi.Dgemm(blas.NoTrans, blas.Trans, i3, i2, ib,
+						-1, work, ldwork, ab[(i+ib)*ldab+kd-ib:], ldab-1,
+						1, ab[(i+kd)*ldab+ib:], ldab-1)
+				}
+				// Update A33.
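+				// A33 receives a symmetric rank-ib update from the
+				// solved A31 panel held in work.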
+ bi.Dsyrk(blas.Lower, blas.NoTrans, i3, ib, + -1, work, ldwork, 1, ab[(i+kd)*ldab+kd:], ldab-1) + // Copy the upper triangle of A31 back into place. + for ii := 0; ii < i3; ii++ { + for jj := ii; jj < ib; jj++ { + ab[(ii+i+kd)*ldab+jj-ii] = work[ii*ldwork+jj] + } + } + } + } + } + return true +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpbtrs.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpbtrs.go new file mode 100644 index 0000000000..97c9ada00b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpbtrs.go @@ -0,0 +1,69 @@ +// Copyright ©2019 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dpbtrs solves a system of linear equations A*X = B with an n×n symmetric +// positive definite band matrix A using the Cholesky factorization +// +// A = Uᵀ * U if uplo == blas.Upper +// A = L * Lᵀ if uplo == blas.Lower +// +// computed by Dpbtrf. kd is the number of super- or sub-diagonals of A. See the +// documentation for Dpbtrf for a description of the band storage format of A. +// +// On entry, b contains the n×nrhs right hand side matrix B. On return, it is +// overwritten with the solution matrix X. +func (Implementation) Dpbtrs(uplo blas.Uplo, n, kd, nrhs int, ab []float64, ldab int, b []float64, ldb int) { + switch { + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case n < 0: + panic(nLT0) + case kd < 0: + panic(kdLT0) + case nrhs < 0: + panic(nrhsLT0) + case ldab < kd+1: + panic(badLdA) + case ldb < max(1, nrhs): + panic(badLdB) + } + + // Quick return if possible. + if n == 0 || nrhs == 0 { + return + } + + if len(ab) < (n-1)*ldab+kd+1 { + panic(shortAB) + } + if len(b) < (n-1)*ldb+nrhs { + panic(shortB) + } + + bi := blas64.Implementation() + if uplo == blas.Upper { + // Solve A*X = B where A = Uᵀ*U. + for j := 0; j < nrhs; j++ { + // Solve Uᵀ*Y = B, overwriting B with Y. + bi.Dtbsv(blas.Upper, blas.Trans, blas.NonUnit, n, kd, ab, ldab, b[j:], ldb) + // Solve U*X = Y, overwriting Y with X. + bi.Dtbsv(blas.Upper, blas.NoTrans, blas.NonUnit, n, kd, ab, ldab, b[j:], ldb) + } + } else { + // Solve A*X = B where A = L*Lᵀ. + for j := 0; j < nrhs; j++ { + // Solve L*Y = B, overwriting B with Y. + bi.Dtbsv(blas.Lower, blas.NoTrans, blas.NonUnit, n, kd, ab, ldab, b[j:], ldb) + // Solve Lᵀ*X = Y, overwriting Y with X. + bi.Dtbsv(blas.Lower, blas.Trans, blas.NonUnit, n, kd, ab, ldab, b[j:], ldb) + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpocon.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpocon.go new file mode 100644 index 0000000000..7af4c18728 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpocon.go @@ -0,0 +1,90 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dpocon estimates the reciprocal of the condition number of a positive-definite +// matrix A given the Cholesky decomposition of A. The condition number computed +// is based on the 1-norm and the ∞-norm. +// +// anorm is the 1-norm and the ∞-norm of the original matrix A. +// +// work is a temporary data slice of length at least 3*n and Dpocon will panic otherwise. 
+// +// iwork is a temporary data slice of length at least n and Dpocon will panic otherwise. +func (impl Implementation) Dpocon(uplo blas.Uplo, n int, a []float64, lda int, anorm float64, work []float64, iwork []int) float64 { + switch { + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + case anorm < 0: + panic(negANorm) + } + + // Quick return if possible. + if n == 0 { + return 1 + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case len(work) < 3*n: + panic(shortWork) + case len(iwork) < n: + panic(shortIWork) + } + + if anorm == 0 { + return 0 + } + + bi := blas64.Implementation() + + var ( + smlnum = dlamchS + rcond float64 + sl, su float64 + normin bool + ainvnm float64 + kase int + isave [3]int + ) + for { + ainvnm, kase = impl.Dlacn2(n, work[n:], work, iwork, ainvnm, kase, &isave) + if kase == 0 { + if ainvnm != 0 { + rcond = (1 / ainvnm) / anorm + } + return rcond + } + if uplo == blas.Upper { + sl = impl.Dlatrs(blas.Upper, blas.Trans, blas.NonUnit, normin, n, a, lda, work, work[2*n:]) + normin = true + su = impl.Dlatrs(blas.Upper, blas.NoTrans, blas.NonUnit, normin, n, a, lda, work, work[2*n:]) + } else { + sl = impl.Dlatrs(blas.Lower, blas.NoTrans, blas.NonUnit, normin, n, a, lda, work, work[2*n:]) + normin = true + su = impl.Dlatrs(blas.Lower, blas.Trans, blas.NonUnit, normin, n, a, lda, work, work[2*n:]) + } + scale := sl * su + if scale != 1 { + ix := bi.Idamax(n, work, 1) + if scale == 0 || scale < math.Abs(work[ix])*smlnum { + return rcond + } + impl.Drscl(n, scale, work, 1) + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpotf2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpotf2.go new file mode 100644 index 0000000000..83411f1cf1 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpotf2.go @@ -0,0 +1,82 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dpotf2 computes the Cholesky decomposition of the symmetric positive definite +// matrix a. If ul == blas.Upper, then a is stored as an upper-triangular matrix, +// and a = Uᵀ U is stored in place into a. If ul == blas.Lower, then a = L Lᵀ +// is computed and stored in-place into a. If a is not positive definite, false +// is returned. This is the unblocked version of the algorithm. +// +// Dpotf2 is an internal routine. It is exported for testing purposes. +func (Implementation) Dpotf2(ul blas.Uplo, n int, a []float64, lda int) (ok bool) { + switch { + case ul != blas.Upper && ul != blas.Lower: + panic(badUplo) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + // Quick return if possible. 
+ if n == 0 { + return true + } + + if len(a) < (n-1)*lda+n { + panic(shortA) + } + + bi := blas64.Implementation() + + if ul == blas.Upper { + for j := 0; j < n; j++ { + ajj := a[j*lda+j] + if j != 0 { + ajj -= bi.Ddot(j, a[j:], lda, a[j:], lda) + } + if ajj <= 0 || math.IsNaN(ajj) { + a[j*lda+j] = ajj + return false + } + ajj = math.Sqrt(ajj) + a[j*lda+j] = ajj + if j < n-1 { + bi.Dgemv(blas.Trans, j, n-j-1, + -1, a[j+1:], lda, a[j:], lda, + 1, a[j*lda+j+1:], 1) + bi.Dscal(n-j-1, 1/ajj, a[j*lda+j+1:], 1) + } + } + return true + } + for j := 0; j < n; j++ { + ajj := a[j*lda+j] + if j != 0 { + ajj -= bi.Ddot(j, a[j*lda:], 1, a[j*lda:], 1) + } + if ajj <= 0 || math.IsNaN(ajj) { + a[j*lda+j] = ajj + return false + } + ajj = math.Sqrt(ajj) + a[j*lda+j] = ajj + if j < n-1 { + bi.Dgemv(blas.NoTrans, n-j-1, j, + -1, a[(j+1)*lda:], lda, a[j*lda:], 1, + 1, a[(j+1)*lda+j:], lda) + bi.Dscal(n-j-1, 1/ajj, a[(j+1)*lda+j:], lda) + } + } + return true +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpotrf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpotrf.go new file mode 100644 index 0000000000..7c81680166 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpotrf.go @@ -0,0 +1,81 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dpotrf computes the Cholesky decomposition of the symmetric positive definite +// matrix a. If ul == blas.Upper, then a is stored as an upper-triangular matrix, +// and a = Uᵀ U is stored in place into a. If ul == blas.Lower, then a = L Lᵀ +// is computed and stored in-place into a. If a is not positive definite, false +// is returned. This is the blocked version of the algorithm. +func (impl Implementation) Dpotrf(ul blas.Uplo, n int, a []float64, lda int) (ok bool) { + switch { + case ul != blas.Upper && ul != blas.Lower: + panic(badUplo) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + // Quick return if possible. 
+ if n == 0 { + return true + } + + if len(a) < (n-1)*lda+n { + panic(shortA) + } + + nb := impl.Ilaenv(1, "DPOTRF", string(ul), n, -1, -1, -1) + if nb <= 1 || n <= nb { + return impl.Dpotf2(ul, n, a, lda) + } + bi := blas64.Implementation() + if ul == blas.Upper { + for j := 0; j < n; j += nb { + jb := min(nb, n-j) + bi.Dsyrk(blas.Upper, blas.Trans, jb, j, + -1, a[j:], lda, + 1, a[j*lda+j:], lda) + ok = impl.Dpotf2(blas.Upper, jb, a[j*lda+j:], lda) + if !ok { + return ok + } + if j+jb < n { + bi.Dgemm(blas.Trans, blas.NoTrans, jb, n-j-jb, j, + -1, a[j:], lda, a[j+jb:], lda, + 1, a[j*lda+j+jb:], lda) + bi.Dtrsm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, jb, n-j-jb, + 1, a[j*lda+j:], lda, + a[j*lda+j+jb:], lda) + } + } + return true + } + for j := 0; j < n; j += nb { + jb := min(nb, n-j) + bi.Dsyrk(blas.Lower, blas.NoTrans, jb, j, + -1, a[j*lda:], lda, + 1, a[j*lda+j:], lda) + ok := impl.Dpotf2(blas.Lower, jb, a[j*lda+j:], lda) + if !ok { + return ok + } + if j+jb < n { + bi.Dgemm(blas.NoTrans, blas.Trans, n-j-jb, jb, j, + -1, a[(j+jb)*lda:], lda, a[j*lda:], lda, + 1, a[(j+jb)*lda+j:], lda) + bi.Dtrsm(blas.Right, blas.Lower, blas.Trans, blas.NonUnit, n-j-jb, jb, + 1, a[j*lda+j:], lda, + a[(j+jb)*lda+j:], lda) + } + } + return true +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpotri.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpotri.go new file mode 100644 index 0000000000..6fa981c130 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpotri.go @@ -0,0 +1,44 @@ +// Copyright ©2019 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "gonum.org/v1/gonum/blas" + +// Dpotri computes the inverse of a real symmetric positive definite matrix A +// using its Cholesky factorization. +// +// On entry, a contains the triangular factor U or L from the Cholesky +// factorization A = Uᵀ*U or A = L*Lᵀ, as computed by Dpotrf. +// On return, a contains the upper or lower triangle of the (symmetric) +// inverse of A, overwriting the input factor U or L. +func (impl Implementation) Dpotri(uplo blas.Uplo, n int, a []float64, lda int) (ok bool) { + switch { + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + // Quick return if possible. + if n == 0 { + return true + } + + if len(a) < (n-1)*lda+n { + panic(shortA) + } + + // Invert the triangular Cholesky factor U or L. + ok = impl.Dtrtri(uplo, blas.NonUnit, n, a, lda) + if !ok { + return false + } + + // Form inv(U)*inv(U)ᵀ or inv(L)ᵀ*inv(L). + impl.Dlauum(uplo, n, a, lda) + return true +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpotrs.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpotrs.go new file mode 100644 index 0000000000..77d070001a --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpotrs.go @@ -0,0 +1,64 @@ +// Copyright ©2018 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dpotrs solves a system of n linear equations A*X = B where A is an n×n +// symmetric positive definite matrix and B is an n×nrhs matrix. The matrix A is +// represented by its Cholesky factorization +// +// A = Uᵀ*U if uplo == blas.Upper +// A = L*Lᵀ if uplo == blas.Lower +// +// as computed by Dpotrf. 
On entry, B contains the right-hand side matrix B, on +// return it contains the solution matrix X. +func (Implementation) Dpotrs(uplo blas.Uplo, n, nrhs int, a []float64, lda int, b []float64, ldb int) { + switch { + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case n < 0: + panic(nLT0) + case nrhs < 0: + panic(nrhsLT0) + case lda < max(1, n): + panic(badLdA) + case ldb < max(1, nrhs): + panic(badLdB) + } + + // Quick return if possible. + if n == 0 || nrhs == 0 { + return + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case len(b) < (n-1)*ldb+nrhs: + panic(shortB) + } + + bi := blas64.Implementation() + + if uplo == blas.Upper { + // Solve Uᵀ * U * X = B where U is stored in the upper triangle of A. + + // Solve Uᵀ * X = B, overwriting B with X. + bi.Dtrsm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, n, nrhs, 1, a, lda, b, ldb) + // Solve U * X = B, overwriting B with X. + bi.Dtrsm(blas.Left, blas.Upper, blas.NoTrans, blas.NonUnit, n, nrhs, 1, a, lda, b, ldb) + } else { + // Solve L * Lᵀ * X = B where L is stored in the lower triangle of A. + + // Solve L * X = B, overwriting B with X. + bi.Dtrsm(blas.Left, blas.Lower, blas.NoTrans, blas.NonUnit, n, nrhs, 1, a, lda, b, ldb) + // Solve Lᵀ * X = B, overwriting B with X. + bi.Dtrsm(blas.Left, blas.Lower, blas.Trans, blas.NonUnit, n, nrhs, 1, a, lda, b, ldb) + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpstf2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpstf2.go new file mode 100644 index 0000000000..79b607ddc9 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpstf2.go @@ -0,0 +1,202 @@ +// Copyright ©2021 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dpstf2 computes the Cholesky factorization with complete pivoting of an n×n +// symmetric positive semidefinite matrix A. +// +// The factorization has the form +// +// Pᵀ * A * P = Uᵀ * U , if uplo = blas.Upper, +// Pᵀ * A * P = L * Lᵀ, if uplo = blas.Lower, +// +// where U is an upper triangular matrix, L is lower triangular, and P is a +// permutation matrix. +// +// tol is a user-defined tolerance. The algorithm terminates if the pivot is +// less than or equal to tol. If tol is negative, then n*eps*max(A[k,k]) will be +// used instead. +// +// On return, A contains the factor U or L from the Cholesky factorization and +// piv contains P stored such that P[piv[k],k] = 1. +// +// Dpstf2 returns the computed rank of A and whether the factorization can be +// used to solve a system. Dpstf2 does not attempt to check that A is positive +// semi-definite, so if ok is false, the matrix A is either rank deficient or is +// not positive semidefinite. +// +// The length of piv must be n and the length of work must be at least 2*n, +// otherwise Dpstf2 will panic. +// +// Dpstf2 is an internal routine. It is exported for testing purposes. +func (Implementation) Dpstf2(uplo blas.Uplo, n int, a []float64, lda int, piv []int, tol float64, work []float64) (rank int, ok bool) { + switch { + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + // Quick return if possible. 
+ if n == 0 { + return 0, true + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case len(piv) != n: + panic(badLenPiv) + case len(work) < 2*n: + panic(shortWork) + } + + // Initialize piv. + for i := range piv[:n] { + piv[i] = i + } + + // Compute the first pivot. + pvt := 0 + ajj := a[0] + for i := 1; i < n; i++ { + aii := a[i*lda+i] + if aii > ajj { + pvt = i + ajj = aii + } + } + if ajj <= 0 || math.IsNaN(ajj) { + return 0, false + } + + // Compute stopping value if not supplied. + dstop := tol + if dstop < 0 { + dstop = float64(n) * dlamchE * ajj + } + + // Set first half of work to zero, holds dot products. + dots := work[:n] + for i := range dots { + dots[i] = 0 + } + work2 := work[n : 2*n] + + bi := blas64.Implementation() + if uplo == blas.Upper { + // Compute the Cholesky factorization Pᵀ * A * P = Uᵀ * U. + for j := 0; j < n; j++ { + // Update dot products and compute possible pivots which are stored + // in the second half of work. + for i := j; i < n; i++ { + if j > 0 { + tmp := a[(j-1)*lda+i] + dots[i] += tmp * tmp + } + work2[i] = a[i*lda+i] - dots[i] + } + if j > 0 { + // Find the pivot. + pvt = j + ajj = work2[pvt] + for k := j + 1; k < n; k++ { + wk := work2[k] + if wk > ajj { + pvt = k + ajj = wk + } + } + // Test for exit. + if ajj <= dstop || math.IsNaN(ajj) { + a[j*lda+j] = ajj + return j, false + } + } + if j != pvt { + // Swap pivot rows and columns. + a[pvt*lda+pvt] = a[j*lda+j] + bi.Dswap(j, a[j:], lda, a[pvt:], lda) + if pvt < n-1 { + bi.Dswap(n-pvt-1, a[j*lda+(pvt+1):], 1, a[pvt*lda+(pvt+1):], 1) + } + bi.Dswap(pvt-j-1, a[j*lda+(j+1):], 1, a[(j+1)*lda+pvt:], lda) + // Swap dot products and piv. + dots[j], dots[pvt] = dots[pvt], dots[j] + piv[j], piv[pvt] = piv[pvt], piv[j] + } + ajj = math.Sqrt(ajj) + a[j*lda+j] = ajj + // Compute elements j+1:n of row j. + if j < n-1 { + bi.Dgemv(blas.Trans, j, n-j-1, + -1, a[j+1:], lda, a[j:], lda, + 1, a[j*lda+j+1:], 1) + bi.Dscal(n-j-1, 1/ajj, a[j*lda+j+1:], 1) + } + } + } else { + // Compute the Cholesky factorization Pᵀ * A * P = L * Lᵀ. + for j := 0; j < n; j++ { + // Update dot products and compute possible pivots which are stored + // in the second half of work. + for i := j; i < n; i++ { + if j > 0 { + tmp := a[i*lda+(j-1)] + dots[i] += tmp * tmp + } + work2[i] = a[i*lda+i] - dots[i] + } + if j > 0 { + // Find the pivot. + pvt = j + ajj = work2[pvt] + for k := j + 1; k < n; k++ { + wk := work2[k] + if wk > ajj { + pvt = k + ajj = wk + } + } + // Test for exit. + if ajj <= dstop || math.IsNaN(ajj) { + a[j*lda+j] = ajj + return j, false + } + } + if j != pvt { + // Swap pivot rows and columns. + a[pvt*lda+pvt] = a[j*lda+j] + bi.Dswap(j, a[j*lda:], 1, a[pvt*lda:], 1) + if pvt < n-1 { + bi.Dswap(n-pvt-1, a[(pvt+1)*lda+j:], lda, a[(pvt+1)*lda+pvt:], lda) + } + bi.Dswap(pvt-j-1, a[(j+1)*lda+j:], lda, a[pvt*lda+(j+1):], 1) + // Swap dot products and piv. + dots[j], dots[pvt] = dots[pvt], dots[j] + piv[j], piv[pvt] = piv[pvt], piv[j] + } + ajj = math.Sqrt(ajj) + a[j*lda+j] = ajj + // Compute elements j+1:n of column j. + if j < n-1 { + bi.Dgemv(blas.NoTrans, n-j-1, j, + -1, a[(j+1)*lda:], lda, a[j*lda:], 1, + 1, a[(j+1)*lda+j:], lda) + bi.Dscal(n-j-1, 1/ajj, a[(j+1)*lda+j:], lda) + } + } + } + return n, true +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpstrf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpstrf.go new file mode 100644 index 0000000000..46a2fd4b77 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpstrf.go @@ -0,0 +1,233 @@ +// Copyright ©2021 The Gonum Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dpstrf computes the Cholesky factorization with complete pivoting of an n×n +// symmetric positive semidefinite matrix A. +// +// The factorization has the form +// +// Pᵀ * A * P = Uᵀ * U , if uplo = blas.Upper, +// Pᵀ * A * P = L * Lᵀ, if uplo = blas.Lower, +// +// where U is an upper triangular matrix, L is lower triangular, and P is a +// permutation matrix. +// +// tol is a user-defined tolerance. The algorithm terminates if the pivot is +// less than or equal to tol. If tol is negative, then n*eps*max(A[k,k]) will be +// used instead. +// +// On return, A contains the factor U or L from the Cholesky factorization and +// piv contains P stored such that P[piv[k],k] = 1. +// +// Dpstrf returns the computed rank of A and whether the factorization can be +// used to solve a system. Dpstrf does not attempt to check that A is positive +// semi-definite, so if ok is false, the matrix A is either rank deficient or is +// not positive semidefinite. +// +// The length of piv must be n and the length of work must be at least 2*n, +// otherwise Dpstrf will panic. +// +// Dpstrf is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dpstrf(uplo blas.Uplo, n int, a []float64, lda int, piv []int, tol float64, work []float64) (rank int, ok bool) { + switch { + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + // Quick return if possible. + if n == 0 { + return 0, true + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case len(piv) != n: + panic(badLenPiv) + case len(work) < 2*n: + panic(shortWork) + } + + // Get block size. + nb := impl.Ilaenv(1, "DPOTRF", string(uplo), n, -1, -1, -1) + if nb <= 1 || n <= nb { + // Use unblocked code. + return impl.Dpstf2(uplo, n, a, lda, piv, tol, work) + } + + // Initialize piv. + for i := range piv[:n] { + piv[i] = i + } + + // Compute the first pivot. + pvt := 0 + ajj := a[0] + for i := 1; i < n; i++ { + aii := a[i*lda+i] + if aii > ajj { + pvt = i + ajj = aii + } + } + if ajj <= 0 || math.IsNaN(ajj) { + return 0, false + } + + // Compute stopping value if not supplied. + dstop := tol + if dstop < 0 { + dstop = float64(n) * dlamchE * ajj + } + + bi := blas64.Implementation() + // Split work in half, the first half holds dot products. + dots := work[:n] + work2 := work[n : 2*n] + if uplo == blas.Upper { + // Compute the Cholesky factorization Pᵀ * A * P = Uᵀ * U. + for k := 0; k < n; k += nb { + // Account for last block not being nb wide. + jb := min(nb, n-k) + // Set relevant part of dot products to zero. + for i := k; i < n; i++ { + dots[i] = 0 + } + for j := k; j < k+jb; j++ { + // Update dot products and compute possible pivots which are stored + // in the second half of work. + for i := j; i < n; i++ { + if j > k { + tmp := a[(j-1)*lda+i] + dots[i] += tmp * tmp + } + work2[i] = a[i*lda+i] - dots[i] + } + if j > 0 { + // Find the pivot. + pvt = j + ajj = work2[pvt] + for l := j + 1; l < n; l++ { + wl := work2[l] + if wl > ajj { + pvt = l + ajj = wl + } + } + // Test for exit. + if ajj <= dstop || math.IsNaN(ajj) { + a[j*lda+j] = ajj + return j, false + } + } + if j != pvt { + // Swap pivot rows and columns. 
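+					// Only one triangle of A is stored, so the
+					// symmetric interchange of row/column j with
+					// row/column pvt is done piecewise: the diagonal
+					// entries, the segment above j, the segment
+					// beyond pvt, and the middle piece, which is a
+					// row on one side and a column on the other.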
+ a[pvt*lda+pvt] = a[j*lda+j] + bi.Dswap(j, a[j:], lda, a[pvt:], lda) + if pvt < n-1 { + bi.Dswap(n-pvt-1, a[j*lda+(pvt+1):], 1, a[pvt*lda+(pvt+1):], 1) + } + bi.Dswap(pvt-j-1, a[j*lda+(j+1):], 1, a[(j+1)*lda+pvt:], lda) + // Swap dot products and piv. + dots[j], dots[pvt] = dots[pvt], dots[j] + piv[j], piv[pvt] = piv[pvt], piv[j] + } + ajj = math.Sqrt(ajj) + a[j*lda+j] = ajj + // Compute elements j+1:n of row j. + if j < n-1 { + bi.Dgemv(blas.Trans, j-k, n-j-1, + -1, a[k*lda+j+1:], lda, a[k*lda+j:], lda, + 1, a[j*lda+j+1:], 1) + bi.Dscal(n-j-1, 1/ajj, a[j*lda+j+1:], 1) + } + } + // Update trailing matrix. + if k+jb < n { + j := k + jb + bi.Dsyrk(blas.Upper, blas.Trans, n-j, jb, + -1, a[k*lda+j:], lda, 1, a[j*lda+j:], lda) + } + } + } else { + // Compute the Cholesky factorization Pᵀ * A * P = L * Lᵀ. + for k := 0; k < n; k += nb { + // Account for last block not being nb wide. + jb := min(nb, n-k) + // Set relevant part of dot products to zero. + for i := k; i < n; i++ { + dots[i] = 0 + } + for j := k; j < k+jb; j++ { + // Update dot products and compute possible pivots which are stored + // in the second half of work. + for i := j; i < n; i++ { + if j > k { + tmp := a[i*lda+(j-1)] + dots[i] += tmp * tmp + } + work2[i] = a[i*lda+i] - dots[i] + } + if j > 0 { + // Find the pivot. + pvt = j + ajj = work2[pvt] + for l := j + 1; l < n; l++ { + wl := work2[l] + if wl > ajj { + pvt = l + ajj = wl + } + } + // Test for exit. + if ajj <= dstop || math.IsNaN(ajj) { + a[j*lda+j] = ajj + return j, false + } + } + if j != pvt { + // Swap pivot rows and columns. + a[pvt*lda+pvt] = a[j*lda+j] + bi.Dswap(j, a[j*lda:], 1, a[pvt*lda:], 1) + if pvt < n-1 { + bi.Dswap(n-pvt-1, a[(pvt+1)*lda+j:], lda, a[(pvt+1)*lda+pvt:], lda) + } + bi.Dswap(pvt-j-1, a[(j+1)*lda+j:], lda, a[pvt*lda+(j+1):], 1) + // Swap dot products and piv. + dots[j], dots[pvt] = dots[pvt], dots[j] + piv[j], piv[pvt] = piv[pvt], piv[j] + } + ajj = math.Sqrt(ajj) + a[j*lda+j] = ajj + // Compute elements j+1:n of column j. + if j < n-1 { + bi.Dgemv(blas.NoTrans, n-j-1, j-k, + -1, a[(j+1)*lda+k:], lda, a[j*lda+k:], 1, + 1, a[(j+1)*lda+j:], lda) + bi.Dscal(n-j-1, 1/ajj, a[(j+1)*lda+j:], lda) + } + } + // Update trailing matrix. + if k+jb < n { + j := k + jb + bi.Dsyrk(blas.Lower, blas.NoTrans, n-j, jb, + -1, a[j*lda+k:], lda, 1, a[j*lda+j:], lda) + } + } + } + return n, true +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dptcon.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dptcon.go new file mode 100644 index 0000000000..cd41e3175a --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dptcon.go @@ -0,0 +1,99 @@ +// Copyright ©2023 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas/blas64" +) + +// Dptcon computes and returns the reciprocal of the condition number (in the +// 1-norm) of a symmetric positive definite tridiagonal matrix A using the +// factorization A = L*D*Lᵀ or A = Uᵀ*D*U computed by Dpttrf. +// +// The reciprocal of the condition number is computed as +// +// rcond = 1 / (anorm * ‖A⁻¹‖) +// +// and ‖A⁻¹‖ is computed by a direct method. +// +// d and e contain, respectively, the n diagonal elements of the diagonal matrix +// D and the (n-1) off-diagonal elements of the unit bidiagonal factor U or L +// from the factorization of A, as computed by Dpttrf. +// +// anorm is the 1-norm of the original matrix A. 
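+//
+// anorm can be obtained, for example, by applying Dlanst with
+// lapack.MaxColumnSum to the d and e of the original matrix before
+// factorization.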
+//
+// work must have length n, otherwise Dptcon will panic.
+func (impl Implementation) Dptcon(n int, d, e []float64, anorm float64, work []float64) (rcond float64) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case anorm < 0:
+		panic(badNorm)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return 1
+	}
+
+	switch {
+	case len(d) < n:
+		panic(shortD)
+	case len(e) < n-1:
+		panic(shortE)
+	case len(work) < n:
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	switch {
+	case anorm == 0:
+		return 0
+	case math.IsNaN(anorm):
+		// Propagate NaN.
+		return anorm
+	case math.IsInf(anorm, 1):
+		return 0
+	}
+
+	// Check that d[0:n] is positive.
+	for _, di := range d[:n] {
+		if di <= 0 {
+			return 0
+		}
+	}
+
+	// Solve M(A) * x = e, where M(A) = (m[i,j]) is given by
+	//
+	//	m[i,j] =  abs(A[i,j]), i == j,
+	//	m[i,j] = -abs(A[i,j]), i != j,
+	//
+	// and e = [1,1,...,1]ᵀ. Note M(A) = M(L)*D*M(L)ᵀ.
+
+	// Solve M(L) * b = e.
+	work[0] = 1
+	for i := 1; i < n; i++ {
+		work[i] = 1 + work[i-1]*math.Abs(e[i-1])
+	}
+
+	// Solve D * M(L)ᵀ * x = b.
+	work[n-1] /= d[n-1]
+	for i := n - 2; i >= 0; i-- {
+		work[i] = work[i]/d[i] + work[i+1]*math.Abs(e[i])
+	}
+
+	// Compute ainvnm = max(x[i]), 0 <= i < n.
+	ix := blas64.Implementation().Idamax(n, work, 1)
+	ainvnm := work[ix]
+
+	// Compute the reciprocal condition number.
+	if ainvnm > 0 {
+		rcond = (1 / ainvnm) / anorm
+	}
+	return rcond
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpttrs.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpttrs.go
new file mode 100644
index 0000000000..7bdee6f937
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpttrs.go
@@ -0,0 +1,51 @@
+// Copyright ©2023 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+// Dpttrs solves a tridiagonal system of the form
+//
+//	A * X = B
+//
+// using the L*D*Lᵀ factorization of A computed by Dpttrf. D is a diagonal
+// matrix specified in d, L is a unit bidiagonal matrix whose subdiagonal is
+// specified in e, and X and B are n×nrhs matrices.
+func (impl Implementation) Dpttrs(n, nrhs int, d, e []float64, b []float64, ldb int) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case nrhs < 0:
+		panic(nrhsLT0)
+	case ldb < max(1, nrhs):
+		panic(badLdB)
+	}
+
+	// Quick return if possible.
+	if n == 0 || nrhs == 0 {
+		return
+	}
+
+	switch {
+	case len(d) < n:
+		panic(shortD)
+	case len(e) < n-1:
+		panic(shortE)
+	case len(b) < (n-1)*ldb+nrhs:
+		panic(shortB)
+	}
+
+	nb := 1
+	if nrhs > 1 {
+		nb = max(1, impl.Ilaenv(1, "DPTTRS", " ", n, nrhs, -1, -1))
+	}
+
+	if nb >= nrhs {
+		impl.dptts2(n, nrhs, d, e, b, ldb)
+	} else {
+		for j := 0; j < nrhs; j += nb {
+			jb := min(nrhs-j, nb)
+			impl.dptts2(n, jb, d, e, b[j:], ldb)
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dptts2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dptts2.go
new file mode 100644
index 0000000000..ff1df168f2
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dptts2.go
@@ -0,0 +1,39 @@
+// Copyright ©2023 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas/blas64"
+
+// dptts2 solves a tridiagonal system of the form
+//
+//	A * X = B
+//
+// using the L*D*Lᵀ factorization of A computed by Dpttrf. D is a diagonal
+// matrix specified in d, L is a unit bidiagonal matrix whose subdiagonal is
+// specified in e, and X and B are n×nrhs matrices.
+func (impl Implementation) dptts2(n, nrhs int, d, e []float64, b []float64, ldb int) {
+	// Quick return if possible.
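+	// For n == 1, A reduces to the scalar d[0] and each right-hand
+	// side is simply scaled by 1/d[0].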
+ if n <= 1 { + if n == 1 { + bi := blas64.Implementation() + bi.Dscal(nrhs, 1/d[0], b, 1) + } + return + } + + // Solve A * X = B using the factorization A = L*D*Lᵀ, overwriting each + // right hand side vector with its solution. + for j := 0; j < nrhs; j++ { + // Solve L * x = b. + for i := 1; i < n; i++ { + b[i*ldb+j] -= b[(i-1)*ldb+j] * e[i-1] + } + // Solve D * Lᵀ * x = b. + b[(n-1)*ldb+j] /= d[n-1] + for i := n - 2; i >= 0; i-- { + b[i*ldb+j] = b[i*ldb+j]/d[i] - b[(i+1)*ldb+j]*e[i] + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/drscl.go b/vendor/gonum.org/v1/gonum/lapack/gonum/drscl.go new file mode 100644 index 0000000000..b2772dbc22 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/drscl.go @@ -0,0 +1,63 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas/blas64" +) + +// Drscl multiplies the vector x by 1/a being careful to avoid overflow or +// underflow where possible. +// +// Drscl is an internal routine. It is exported for testing purposes. +func (impl Implementation) Drscl(n int, a float64, x []float64, incX int) { + switch { + case n < 0: + panic(nLT0) + case incX <= 0: + panic(badIncX) + } + + // Quick return if possible. + if n == 0 { + return + } + + if len(x) < 1+(n-1)*incX { + panic(shortX) + } + + bi := blas64.Implementation() + + cden := a + cnum := 1.0 + smlnum := dlamchS + bignum := 1 / smlnum + for { + cden1 := cden * smlnum + cnum1 := cnum / bignum + var mul float64 + var done bool + switch { + case cnum != 0 && math.Abs(cden1) > math.Abs(cnum): + mul = smlnum + done = false + cden = cden1 + case math.Abs(cnum1) > math.Abs(cden): + mul = bignum + done = false + cnum = cnum1 + default: + mul = cnum / cden + done = true + } + bi.Dscal(n, mul, x, incX) + if done { + break + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dsteqr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dsteqr.go new file mode 100644 index 0000000000..d6c7861ab5 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dsteqr.go @@ -0,0 +1,376 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" +) + +// Dsteqr computes the eigenvalues and optionally the eigenvectors of a symmetric +// tridiagonal matrix using the implicit QL or QR method. The eigenvectors of a +// full or band symmetric matrix can also be found if Dsytrd, Dsptrd, or Dsbtrd +// have been used to reduce this matrix to tridiagonal form. +// +// d, on entry, contains the diagonal elements of the tridiagonal matrix. On exit, +// d contains the eigenvalues in ascending order. d must have length n and +// Dsteqr will panic otherwise. +// +// e, on entry, contains the off-diagonal elements of the tridiagonal matrix on +// entry, and is overwritten during the call to Dsteqr. e must have length n-1 and +// Dsteqr will panic otherwise. +// +// z, on entry, contains the n×n orthogonal matrix used in the reduction to +// tridiagonal form if compz == lapack.EVOrig. 
On exit, if +// compz == lapack.EVOrig, z contains the orthonormal eigenvectors of the +// original symmetric matrix, and if compz == lapack.EVTridiag, z contains the +// orthonormal eigenvectors of the symmetric tridiagonal matrix. z is not used +// if compz == lapack.EVCompNone. +// +// work must have length at least max(1, 2*n-2) if the eigenvectors are computed, +// and Dsteqr will panic otherwise. +// +// Dsteqr is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dsteqr(compz lapack.EVComp, n int, d, e, z []float64, ldz int, work []float64) (ok bool) { + switch { + case compz != lapack.EVCompNone && compz != lapack.EVTridiag && compz != lapack.EVOrig: + panic(badEVComp) + case n < 0: + panic(nLT0) + case ldz < 1, compz != lapack.EVCompNone && ldz < n: + panic(badLdZ) + } + + // Quick return if possible. + if n == 0 { + return true + } + + switch { + case len(d) < n: + panic(shortD) + case len(e) < n-1: + panic(shortE) + case compz != lapack.EVCompNone && len(z) < (n-1)*ldz+n: + panic(shortZ) + case compz != lapack.EVCompNone && len(work) < max(1, 2*n-2): + panic(shortWork) + } + + var icompz int + if compz == lapack.EVOrig { + icompz = 1 + } else if compz == lapack.EVTridiag { + icompz = 2 + } + + if n == 1 { + if icompz == 2 { + z[0] = 1 + } + return true + } + + bi := blas64.Implementation() + + eps := dlamchE + eps2 := eps * eps + safmin := dlamchS + safmax := 1 / safmin + ssfmax := math.Sqrt(safmax) / 3 + ssfmin := math.Sqrt(safmin) / eps2 + + // Compute the eigenvalues and eigenvectors of the tridiagonal matrix. + if icompz == 2 { + impl.Dlaset(blas.All, n, n, 0, 1, z, ldz) + } + const maxit = 30 + nmaxit := n * maxit + + jtot := 0 + + // Determine where the matrix splits and choose QL or QR iteration for each + // block, according to whether top or bottom diagonal element is smaller. + l1 := 0 + nm1 := n - 1 + + type scaletype int + const ( + down scaletype = iota + 1 + up + ) + var iscale scaletype + + for { + if l1 > n-1 { + // Order eigenvalues and eigenvectors. + if icompz == 0 { + impl.Dlasrt(lapack.SortIncreasing, n, d) + } else { + // TODO(btracey): Consider replacing this sort with a call to sort.Sort. + for ii := 1; ii < n; ii++ { + i := ii - 1 + k := i + p := d[i] + for j := ii; j < n; j++ { + if d[j] < p { + k = j + p = d[j] + } + } + if k != i { + d[k] = d[i] + d[i] = p + bi.Dswap(n, z[i:], ldz, z[k:], ldz) + } + } + } + return true + } + if l1 > 0 { + e[l1-1] = 0 + } + var m int + if l1 <= nm1 { + for m = l1; m < nm1; m++ { + test := math.Abs(e[m]) + if test == 0 { + break + } + if test <= (math.Sqrt(math.Abs(d[m]))*math.Sqrt(math.Abs(d[m+1])))*eps { + e[m] = 0 + break + } + } + } + l := l1 + lsv := l + lend := m + lendsv := lend + l1 = m + 1 + if lend == l { + continue + } + + // Scale submatrix in rows and columns L to Lend + anorm := impl.Dlanst(lapack.MaxAbs, lend-l+1, d[l:], e[l:]) + switch { + case anorm == 0: + continue + case anorm > ssfmax: + iscale = down + // Pretend that d and e are matrices with 1 column. + impl.Dlascl(lapack.General, 0, 0, anorm, ssfmax, lend-l+1, 1, d[l:], 1) + impl.Dlascl(lapack.General, 0, 0, anorm, ssfmax, lend-l, 1, e[l:], 1) + case anorm < ssfmin: + iscale = up + impl.Dlascl(lapack.General, 0, 0, anorm, ssfmin, lend-l+1, 1, d[l:], 1) + impl.Dlascl(lapack.General, 0, 0, anorm, ssfmin, lend-l, 1, e[l:], 1) + } + + // Choose between QL and QR. + if math.Abs(d[lend]) < math.Abs(d[l]) { + lend = lsv + l = lendsv + } + if lend > l { + // QL Iteration. Look for small subdiagonal element. 
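+			// A subdiagonal entry e[m] is treated as negligible, and
+			// the problem deflated at row m, once e[m]² is small
+			// relative to |d[m]|·|d[m+1]| at working precision
+			// (safmin guards against underflow in the product).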
+ for { + if l != lend { + for m = l; m < lend; m++ { + v := math.Abs(e[m]) + if v*v <= (eps2*math.Abs(d[m]))*math.Abs(d[m+1])+safmin { + break + } + } + } else { + m = lend + } + if m < lend { + e[m] = 0 + } + p := d[l] + if m == l { + // Eigenvalue found. + l++ + if l > lend { + break + } + continue + } + + // If remaining matrix is 2×2, use Dlae2 to compute its eigensystem. + if m == l+1 { + if icompz > 0 { + d[l], d[l+1], work[l], work[n-1+l] = impl.Dlaev2(d[l], e[l], d[l+1]) + impl.Dlasr(blas.Right, lapack.Variable, lapack.Backward, + n, 2, work[l:], work[n-1+l:], z[l:], ldz) + } else { + d[l], d[l+1] = impl.Dlae2(d[l], e[l], d[l+1]) + } + e[l] = 0 + l += 2 + if l > lend { + break + } + continue + } + + if jtot == nmaxit { + break + } + jtot++ + + // Form shift + g := (d[l+1] - p) / (2 * e[l]) + r := impl.Dlapy2(g, 1) + g = d[m] - p + e[l]/(g+math.Copysign(r, g)) + s := 1.0 + c := 1.0 + p = 0.0 + + // Inner loop + for i := m - 1; i >= l; i-- { + f := s * e[i] + b := c * e[i] + c, s, r = impl.Dlartg(g, f) + if i != m-1 { + e[i+1] = r + } + g = d[i+1] - p + r = (d[i]-g)*s + 2*c*b + p = s * r + d[i+1] = g + p + g = c*r - b + + // If eigenvectors are desired, then save rotations. + if icompz > 0 { + work[i] = c + work[n-1+i] = -s + } + } + // If eigenvectors are desired, then apply saved rotations. + if icompz > 0 { + mm := m - l + 1 + impl.Dlasr(blas.Right, lapack.Variable, lapack.Backward, + n, mm, work[l:], work[n-1+l:], z[l:], ldz) + } + d[l] -= p + e[l] = g + } + } else { + // QR Iteration. + // Look for small superdiagonal element. + for { + if l != lend { + for m = l; m > lend; m-- { + v := math.Abs(e[m-1]) + if v*v <= (eps2*math.Abs(d[m])*math.Abs(d[m-1]) + safmin) { + break + } + } + } else { + m = lend + } + if m > lend { + e[m-1] = 0 + } + p := d[l] + if m == l { + // Eigenvalue found + l-- + if l < lend { + break + } + continue + } + + // If remaining matrix is 2×2, use Dlae2 to compute its eigenvalues. + if m == l-1 { + if icompz > 0 { + d[l-1], d[l], work[m], work[n-1+m] = impl.Dlaev2(d[l-1], e[l-1], d[l]) + impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward, + n, 2, work[m:], work[n-1+m:], z[l-1:], ldz) + } else { + d[l-1], d[l] = impl.Dlae2(d[l-1], e[l-1], d[l]) + } + e[l-1] = 0 + l -= 2 + if l < lend { + break + } + continue + } + if jtot == nmaxit { + break + } + jtot++ + + // Form shift. + g := (d[l-1] - p) / (2 * e[l-1]) + r := impl.Dlapy2(g, 1) + g = d[m] - p + (e[l-1])/(g+math.Copysign(r, g)) + s := 1.0 + c := 1.0 + p = 0.0 + + // Inner loop. + for i := m; i < l; i++ { + f := s * e[i] + b := c * e[i] + c, s, r = impl.Dlartg(g, f) + if i != m { + e[i-1] = r + } + g = d[i] - p + r = (d[i+1]-g)*s + 2*c*b + p = s * r + d[i] = g + p + g = c*r - b + + // If eigenvectors are desired, then save rotations. + if icompz > 0 { + work[i] = c + work[n-1+i] = s + } + } + + // If eigenvectors are desired, then apply saved rotations. + if icompz > 0 { + mm := l - m + 1 + impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward, + n, mm, work[m:], work[n-1+m:], z[m:], ldz) + } + d[l] -= p + e[l-1] = g + } + } + + // Undo scaling if necessary. + switch iscale { + case down: + // Pretend that d and e are matrices with 1 column. 
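+		// Dlascl multiplies d and e by anorm/ssfmax without
+		// intermediate overflow, undoing the earlier scale-down of
+		// this submatrix.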
+ impl.Dlascl(lapack.General, 0, 0, ssfmax, anorm, lendsv-lsv+1, 1, d[lsv:], 1) + impl.Dlascl(lapack.General, 0, 0, ssfmax, anorm, lendsv-lsv, 1, e[lsv:], 1) + case up: + impl.Dlascl(lapack.General, 0, 0, ssfmin, anorm, lendsv-lsv+1, 1, d[lsv:], 1) + impl.Dlascl(lapack.General, 0, 0, ssfmin, anorm, lendsv-lsv, 1, e[lsv:], 1) + } + + // Check for no convergence to an eigenvalue after a total of n*maxit iterations. + if jtot >= nmaxit { + break + } + } + for i := 0; i < n-1; i++ { + if e[i] != 0 { + return false + } + } + return true +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dsterf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dsterf.go new file mode 100644 index 0000000000..dc1e178dfa --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dsterf.go @@ -0,0 +1,285 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/lapack" +) + +// Dsterf computes all eigenvalues of a symmetric tridiagonal matrix using the +// Pal-Walker-Kahan variant of the QL or QR algorithm. +// +// d contains the diagonal elements of the tridiagonal matrix on entry, and +// contains the eigenvalues in ascending order on exit. d must have length at +// least n, or Dsterf will panic. +// +// e contains the off-diagonal elements of the tridiagonal matrix on entry, and is +// overwritten during the call to Dsterf. e must have length of at least n-1 or +// Dsterf will panic. +// +// Dsterf is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dsterf(n int, d, e []float64) (ok bool) { + if n < 0 { + panic(nLT0) + } + + // Quick return if possible. + if n == 0 { + return true + } + + switch { + case len(d) < n: + panic(shortD) + case len(e) < n-1: + panic(shortE) + } + + if n == 1 { + return true + } + + const ( + none = 0 // The values are not scaled. + down = 1 // The values are scaled below ssfmax threshold. + up = 2 // The values are scaled below ssfmin threshold. + ) + + // Determine the unit roundoff for this environment. + eps := dlamchE + eps2 := eps * eps + safmin := dlamchS + safmax := 1 / safmin + ssfmax := math.Sqrt(safmax) / 3 + ssfmin := math.Sqrt(safmin) / eps2 + + // Compute the eigenvalues of the tridiagonal matrix. + maxit := 30 + nmaxit := n * maxit + jtot := 0 + + l1 := 0 + + for { + if l1 > n-1 { + impl.Dlasrt(lapack.SortIncreasing, n, d) + return true + } + if l1 > 0 { + e[l1-1] = 0 + } + var m int + for m = l1; m < n-1; m++ { + if math.Abs(e[m]) <= math.Sqrt(math.Abs(d[m]))*math.Sqrt(math.Abs(d[m+1]))*eps { + e[m] = 0 + break + } + } + + l := l1 + lsv := l + lend := m + lendsv := lend + l1 = m + 1 + if lend == 0 { + continue + } + + // Scale submatrix in rows and columns l to lend. + anorm := impl.Dlanst(lapack.MaxAbs, lend-l+1, d[l:], e[l:]) + iscale := none + if anorm == 0 { + continue + } + if anorm > ssfmax { + iscale = down + impl.Dlascl(lapack.General, 0, 0, anorm, ssfmax, lend-l+1, 1, d[l:], n) + impl.Dlascl(lapack.General, 0, 0, anorm, ssfmax, lend-l, 1, e[l:], n) + } else if anorm < ssfmin { + iscale = up + impl.Dlascl(lapack.General, 0, 0, anorm, ssfmin, lend-l+1, 1, d[l:], n) + impl.Dlascl(lapack.General, 0, 0, anorm, ssfmin, lend-l, 1, e[l:], n) + } + + el := e[l:lend] + for i, v := range el { + el[i] *= v + } + + // Choose between QL and QR iteration. + if math.Abs(d[lend]) < math.Abs(d[l]) { + lend = lsv + l = lendsv + } + if lend >= l { + // QL Iteration. 
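+			// The entries of e were squared above, so the deflation
+			// tests below compare e[m] (already e[m]²) against
+			// eps²·|d[m]·d[m+1]|, and the shift recovers the original
+			// off-diagonal via math.Sqrt(e[l]).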
+ // Look for small sub-diagonal element. + for { + if l != lend { + for m = l; m < lend; m++ { + if math.Abs(e[m]) <= eps2*(math.Abs(d[m]*d[m+1])) { + break + } + } + } else { + m = lend + } + if m < lend { + e[m] = 0 + } + p := d[l] + if m == l { + // Eigenvalue found. + l++ + if l > lend { + break + } + continue + } + // If remaining matrix is 2 by 2, use Dlae2 to compute its eigenvalues. + if m == l+1 { + d[l], d[l+1] = impl.Dlae2(d[l], math.Sqrt(e[l]), d[l+1]) + e[l] = 0 + l += 2 + if l > lend { + break + } + continue + } + if jtot == nmaxit { + break + } + jtot++ + + // Form shift. + rte := math.Sqrt(e[l]) + sigma := (d[l+1] - p) / (2 * rte) + r := impl.Dlapy2(sigma, 1) + sigma = p - (rte / (sigma + math.Copysign(r, sigma))) + + c := 1.0 + s := 0.0 + gamma := d[m] - sigma + p = gamma * gamma + + // Inner loop. + for i := m - 1; i >= l; i-- { + bb := e[i] + r := p + bb + if i != m-1 { + e[i+1] = s * r + } + oldc := c + c = p / r + s = bb / r + oldgam := gamma + alpha := d[i] + gamma = c*(alpha-sigma) - s*oldgam + d[i+1] = oldgam + (alpha - gamma) + if c != 0 { + p = (gamma * gamma) / c + } else { + p = oldc * bb + } + } + e[l] = s * p + d[l] = sigma + gamma + } + } else { + for { + // QR Iteration. + // Look for small super-diagonal element. + for m = l; m > lend; m-- { + if math.Abs(e[m-1]) <= eps2*math.Abs(d[m]*d[m-1]) { + break + } + } + if m > lend { + e[m-1] = 0 + } + p := d[l] + if m == l { + // Eigenvalue found. + l-- + if l < lend { + break + } + continue + } + + // If remaining matrix is 2 by 2, use Dlae2 to compute its eigenvalues. + if m == l-1 { + d[l], d[l-1] = impl.Dlae2(d[l], math.Sqrt(e[l-1]), d[l-1]) + e[l-1] = 0 + l -= 2 + if l < lend { + break + } + continue + } + if jtot == nmaxit { + break + } + jtot++ + + // Form shift. + rte := math.Sqrt(e[l-1]) + sigma := (d[l-1] - p) / (2 * rte) + r := impl.Dlapy2(sigma, 1) + sigma = p - (rte / (sigma + math.Copysign(r, sigma))) + + c := 1.0 + s := 0.0 + gamma := d[m] - sigma + p = gamma * gamma + + // Inner loop. + for i := m; i < l; i++ { + bb := e[i] + r := p + bb + if i != m { + e[i-1] = s * r + } + oldc := c + c = p / r + s = bb / r + oldgam := gamma + alpha := d[i+1] + gamma = c*(alpha-sigma) - s*oldgam + d[i] = oldgam + alpha - gamma + if c != 0 { + p = (gamma * gamma) / c + } else { + p = oldc * bb + } + } + e[l-1] = s * p + d[l] = sigma + gamma + } + } + + // Undo scaling if necessary + switch iscale { + case down: + impl.Dlascl(lapack.General, 0, 0, ssfmax, anorm, lendsv-lsv+1, 1, d[lsv:], n) + case up: + impl.Dlascl(lapack.General, 0, 0, ssfmin, anorm, lendsv-lsv+1, 1, d[lsv:], n) + } + + // Check for no convergence to an eigenvalue after a total of n*maxit iterations. + if jtot >= nmaxit { + break + } + } + for _, v := range e[:n-1] { + if v != 0 { + return false + } + } + impl.Dlasrt(lapack.SortIncreasing, n, d) + return true +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dsyev.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dsyev.go new file mode 100644 index 0000000000..5f57f3a5c9 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dsyev.go @@ -0,0 +1,130 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" +) + +// Dsyev computes all eigenvalues and, optionally, the eigenvectors of a real +// symmetric matrix A. 
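+//
+// Dsyev first reduces A to symmetric tridiagonal form with Dsytrd, then
+// computes the eigenvalues with Dsterf (jobz == lapack.EVNone) or the full
+// eigendecomposition with Dorgtr followed by Dsteqr (jobz == lapack.EVCompute).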
+// +// w contains the eigenvalues in ascending order upon return. w must have length +// at least n, and Dsyev will panic otherwise. +// +// On entry, a contains the elements of the symmetric matrix A in the triangular +// portion specified by uplo. If jobz == lapack.EVCompute, a contains the +// orthonormal eigenvectors of A on exit, otherwise jobz must be lapack.EVNone +// and on exit the specified triangular region is overwritten. +// +// work is temporary storage, and lwork specifies the usable memory length. At minimum, +// lwork >= 3*n-1, and Dsyev will panic otherwise. The amount of blocking is +// limited by the usable length. If lwork == -1, instead of computing Dsyev the +// optimal work length is stored into work[0]. +func (impl Implementation) Dsyev(jobz lapack.EVJob, uplo blas.Uplo, n int, a []float64, lda int, w, work []float64, lwork int) (ok bool) { + switch { + case jobz != lapack.EVNone && jobz != lapack.EVCompute: + panic(badEVJob) + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + case lwork < max(1, 3*n-1) && lwork != -1: + panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + } + + // Quick return if possible. + if n == 0 { + return true + } + + var opts string + if uplo == blas.Upper { + opts = "U" + } else { + opts = "L" + } + nb := impl.Ilaenv(1, "DSYTRD", opts, n, -1, -1, -1) + lworkopt := max(1, (nb+2)*n) + if lwork == -1 { + work[0] = float64(lworkopt) + return + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case len(w) < n: + panic(shortW) + } + + if n == 1 { + w[0] = a[0] + work[0] = 2 + if jobz == lapack.EVCompute { + a[0] = 1 + } + return true + } + + safmin := dlamchS + eps := dlamchP + smlnum := safmin / eps + bignum := 1 / smlnum + rmin := math.Sqrt(smlnum) + rmax := math.Sqrt(bignum) + + // Scale matrix to allowable range, if necessary. + anrm := impl.Dlansy(lapack.MaxAbs, uplo, n, a, lda, work) + scaled := false + var sigma float64 + if anrm > 0 && anrm < rmin { + scaled = true + sigma = rmin / anrm + } else if anrm > rmax { + scaled = true + sigma = rmax / anrm + } + if scaled { + kind := lapack.LowerTri + if uplo == blas.Upper { + kind = lapack.UpperTri + } + impl.Dlascl(kind, 0, 0, 1, sigma, n, n, a, lda) + } + var inde int + indtau := inde + n + indwork := indtau + n + llwork := lwork - indwork + impl.Dsytrd(uplo, n, a, lda, w, work[inde:], work[indtau:], work[indwork:], llwork) + + // For eigenvalues only, call Dsterf. For eigenvectors, first call Dorgtr + // to generate the orthogonal matrix, then call Dsteqr. + if jobz == lapack.EVNone { + ok = impl.Dsterf(n, w, work[inde:]) + } else { + impl.Dorgtr(uplo, n, a, lda, work[indtau:], work[indwork:], llwork) + ok = impl.Dsteqr(lapack.EVComp(jobz), n, w, work[inde:], a, lda, work[indtau:]) + } + if !ok { + return false + } + + // If the matrix was scaled, then rescale eigenvalues appropriately. + if scaled { + bi := blas64.Implementation() + bi.Dscal(n, 1/sigma, w, 1) + } + work[0] = float64(lworkopt) + return true +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dsytd2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dsytd2.go new file mode 100644 index 0000000000..03e7cc07b0 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dsytd2.go @@ -0,0 +1,147 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+
+package gonum
+
+import (
+ "gonum.org/v1/gonum/blas"
+ "gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dsytd2 reduces a symmetric n×n matrix A to symmetric tridiagonal form T by
+// an orthogonal similarity transformation
+//
+// Qᵀ * A * Q = T
+//
+// On entry, the matrix is contained in the specified triangle of a. On exit,
+// if uplo == blas.Upper, the diagonal and first super-diagonal of a are
+// overwritten with the elements of T. The elements above the first super-diagonal
+// are overwritten with the elementary reflectors that are used with
+// the elements written to tau in order to construct Q. If uplo == blas.Lower,
+// the elements are written in the lower triangular region.
+//
+// d must have length at least n. e and tau must have length at least n-1. Dsytd2
+// will panic if these sizes are not met.
+//
+// Q is represented as a product of elementary reflectors.
+// If uplo == blas.Upper
+//
+// Q = H_{n-2} * ... * H_1 * H_0
+//
+// and if uplo == blas.Lower
+//
+// Q = H_0 * H_1 * ... * H_{n-2}
+//
+// where
+//
+// H_i = I - tau * v * vᵀ
+//
+// where tau is stored in tau[i], and v is stored in a.
+//
+// If uplo == blas.Upper, v[0:i-1] is stored in A[0:i-1,i+1], v[i] = 1, and
+// v[i+1:] = 0. The elements of a are
+//
+// [ d e v2 v3 v4]
+// [ d e v3 v4]
+// [ d e v4]
+// [ d e]
+// [ d]
+//
+// If uplo == blas.Lower, v[0:i+1] = 0, v[i+1] = 1, and v[i+2:] is stored in
+// A[i+2:n,i].
+// The elements of a are
+//
+// [ d ]
+// [ e d ]
+// [v1 e d ]
+// [v1 v2 e d ]
+// [v1 v2 v3 e d]
+//
+// Dsytd2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dsytd2(uplo blas.Uplo, n int, a []float64, lda int, d, e, tau []float64) {
+ switch {
+ case uplo != blas.Upper && uplo != blas.Lower:
+ panic(badUplo)
+ case n < 0:
+ panic(nLT0)
+ case lda < max(1, n):
+ panic(badLdA)
+ }
+
+ // Quick return if possible.
+ if n == 0 {
+ return
+ }
+
+ switch {
+ case len(a) < (n-1)*lda+n:
+ panic(shortA)
+ case len(d) < n:
+ panic(shortD)
+ case len(e) < n-1:
+ panic(shortE)
+ case len(tau) < n-1:
+ panic(shortTau)
+ }
+
+ bi := blas64.Implementation()
+
+ if uplo == blas.Upper {
+ // Reduce the upper triangle of A.
+ for i := n - 2; i >= 0; i-- {
+ // Generate elementary reflector H_i = I - tau * v * vᵀ to
+ // annihilate A[0:i, i+1].
+ var taui float64
+ a[i*lda+i+1], taui = impl.Dlarfg(i+1, a[i*lda+i+1], a[i+1:], lda)
+ e[i] = a[i*lda+i+1]
+ if taui != 0 {
+ // Apply H_i from both sides to A[0:i,0:i].
+ a[i*lda+i+1] = 1
+
+ // Compute x := tau * A * v storing x in tau[0:i].
+ bi.Dsymv(uplo, i+1, taui, a, lda, a[i+1:], lda, 0, tau, 1)
+
+ // Compute w := x - 1/2 * tau * (xᵀ * v) * v.
+ alpha := -0.5 * taui * bi.Ddot(i+1, tau, 1, a[i+1:], lda)
+ bi.Daxpy(i+1, alpha, a[i+1:], lda, tau, 1)
+
+ // Apply the transformation as a rank-2 update
+ // A = A - v * wᵀ - w * vᵀ.
+ bi.Dsyr2(uplo, i+1, -1, a[i+1:], lda, tau, 1, a, lda)
+ a[i*lda+i+1] = e[i]
+ }
+ d[i+1] = a[(i+1)*lda+i+1]
+ tau[i] = taui
+ }
+ d[0] = a[0]
+ return
+ }
+ // Reduce the lower triangle of A.
+ for i := 0; i < n-1; i++ {
+ // Generate elementary reflector H_i = I - tau * v * vᵀ to
+ // annihilate A[i+2:n, i].
+ var taui float64
+ a[(i+1)*lda+i], taui = impl.Dlarfg(n-i-1, a[(i+1)*lda+i], a[min(i+2, n-1)*lda+i:], lda)
+ e[i] = a[(i+1)*lda+i]
+ if taui != 0 {
+ // Apply H_i from both sides to A[i+1:n, i+1:n].
+ a[(i+1)*lda+i] = 1
+
+ // Compute x := tau * A * v, storing x in tau[i:n-1].
+ bi.Dsymv(uplo, n-i-1, taui, a[(i+1)*lda+i+1:], lda, a[(i+1)*lda+i:], lda, 0, tau[i:], 1)
+
+ // Compute w := x - 1/2 * tau * (xᵀ * v) * v.
+ alpha := -0.5 * taui * bi.Ddot(n-i-1, tau[i:], 1, a[(i+1)*lda+i:], lda)
+ bi.Daxpy(n-i-1, alpha, a[(i+1)*lda+i:], lda, tau[i:], 1)
+
+ // Apply the transformation as a rank-2 update
+ // A = A - v * wᵀ - w * vᵀ.
+ bi.Dsyr2(uplo, n-i-1, -1, a[(i+1)*lda+i:], lda, tau[i:], 1, a[(i+1)*lda+i+1:], lda)
+ a[(i+1)*lda+i] = e[i]
+ }
+ d[i] = a[i*lda+i]
+ tau[i] = taui
+ }
+ d[n-1] = a[(n-1)*lda+n-1]
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dsytrd.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dsytrd.go
new file mode 100644
index 0000000000..74d2287ed2
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dsytrd.go
@@ -0,0 +1,184 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+ "gonum.org/v1/gonum/blas"
+ "gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dsytrd reduces a symmetric n×n matrix A to symmetric tridiagonal form by an
+// orthogonal similarity transformation
+//
+// Qᵀ * A * Q = T
+//
+// where Q is an orthogonal matrix and T is symmetric and tridiagonal.
+//
+// On entry, a contains the elements of the input matrix in the triangle specified
+// by uplo. On exit, the diagonal and sub/super-diagonal are overwritten by the
+// corresponding elements of the tridiagonal matrix T. The remaining elements in
+// the triangle, along with the array tau, contain the data to construct Q as
+// the product of elementary reflectors.
+//
+// If uplo == blas.Upper, Q is constructed with
+//
+// Q = H_{n-2} * ... * H_1 * H_0
+//
+// where
+//
+// H_i = I - tau_i * v * vᵀ
+//
+// v is constructed as v[i+1:n] = 0, v[i] = 1, v[0:i-1] is stored in A[0:i-1, i+1].
+// The elements of A are
+//
+// [ d e v1 v2 v3]
+// [ d e v2 v3]
+// [ d e v3]
+// [ d e]
+// [ d]
+//
+// If uplo == blas.Lower, Q is constructed with
+//
+// Q = H_0 * H_1 * ... * H_{n-2}
+//
+// where
+//
+// H_i = I - tau_i * v * vᵀ
+//
+// v is constructed as v[0:i+1] = 0, v[i+1] = 1, v[i+2:n] is stored in A[i+2:n, i].
+// The elements of A are
+//
+// [ d ]
+// [ e d ]
+// [v0 e d ]
+// [v0 v1 e d ]
+// [v0 v1 v2 e d]
+//
+// d must have length n, and e and tau must have length n-1. Dsytrd will panic if
+// these conditions are not met.
+//
+// work is temporary storage, and lwork specifies the usable memory length. At minimum,
+// lwork >= 1, and Dsytrd will panic otherwise. The amount of blocking is
+// limited by the usable length.
+// If lwork == -1, instead of computing Dsytrd the optimal work length is stored
+// into work[0].
+//
+// Dsytrd is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dsytrd(uplo blas.Uplo, n int, a []float64, lda int, d, e, tau, work []float64, lwork int) {
+ switch {
+ case uplo != blas.Upper && uplo != blas.Lower:
+ panic(badUplo)
+ case n < 0:
+ panic(nLT0)
+ case lda < max(1, n):
+ panic(badLdA)
+ case lwork < 1 && lwork != -1:
+ panic(badLWork)
+ case len(work) < max(1, lwork):
+ panic(shortWork)
+ }
+
+ // Quick return if possible.
+ if n == 0 {
+ work[0] = 1
+ return
+ }
+
+ nb := impl.Ilaenv(1, "DSYTRD", string(uplo), n, -1, -1, -1)
+ lworkopt := n * nb
+ if lwork == -1 {
+ work[0] = float64(lworkopt)
+ return
+ }
+
+ switch {
+ case len(a) < (n-1)*lda+n:
+ panic(shortA)
+ case len(d) < n:
+ panic(shortD)
+ case len(e) < n-1:
+ panic(shortE)
+ case len(tau) < n-1:
+ panic(shortTau)
+ }
+
+ bi := blas64.Implementation()
+
+ nx := n
+ iws := 1
+ var ldwork int
+ if 1 < nb && nb < n {
+ // Determine when to cross over from blocked to unblocked code. The last
+ // block is always handled by unblocked code.
+ nx = max(nb, impl.Ilaenv(3, "DSYTRD", string(uplo), n, -1, -1, -1))
+ if nx < n {
+ // Determine if workspace is large enough for blocked code.
+ ldwork = nb
+ iws = n * ldwork
+ if lwork < iws {
+ // Not enough workspace to use optimal nb: determine the minimum
+ // value of nb and reduce nb or force use of unblocked code by
+ // setting nx = n.
+ nb = max(lwork/n, 1)
+ nbmin := impl.Ilaenv(2, "DSYTRD", string(uplo), n, -1, -1, -1)
+ if nb < nbmin {
+ nx = n
+ }
+ }
+ } else {
+ nx = n
+ }
+ } else {
+ nb = 1
+ }
+ ldwork = nb
+
+ if uplo == blas.Upper {
+ // Reduce the upper triangle of A. Columns 0:kk are handled by the
+ // unblocked method.
+ var i int
+ kk := n - ((n-nx+nb-1)/nb)*nb
+ for i = n - nb; i >= kk; i -= nb {
+ // Reduce columns i:i+nb to tridiagonal form and form the matrix W
+ // which is needed to update the unreduced part of the matrix.
+ impl.Dlatrd(uplo, i+nb, nb, a, lda, e, tau, work, ldwork)
+
+ // Update the unreduced submatrix A[0:i-1,0:i-1], using an update
+ // of the form A = A - V*Wᵀ - W*Vᵀ.
+ bi.Dsyr2k(uplo, blas.NoTrans, i, nb, -1, a[i:], lda, work, ldwork, 1, a, lda)
+
+ // Copy superdiagonal elements back into A, and diagonal elements into D.
+ for j := i; j < i+nb; j++ {
+ a[(j-1)*lda+j] = e[j-1]
+ d[j] = a[j*lda+j]
+ }
+ }
+ // Use unblocked code to reduce the last or only block
+ // check that i == kk.
+ impl.Dsytd2(uplo, kk, a, lda, d, e, tau)
+ } else {
+ var i int
+ // Reduce the lower triangle of A.
+ for i = 0; i < n-nx; i += nb {
+ // Reduce columns i:i+nb to tridiagonal form and form the matrix W
+ // which is needed to update the unreduced part of the matrix.
+ impl.Dlatrd(uplo, n-i, nb, a[i*lda+i:], lda, e[i:], tau[i:], work, ldwork)
+
+ // Update the unreduced submatrix A[i+ib:n, i+ib:n], using an update
+ // of the form A = A - V*Wᵀ - W*Vᵀ.
+ bi.Dsyr2k(uplo, blas.NoTrans, n-i-nb, nb, -1, a[(i+nb)*lda+i:], lda,
+ work[nb*ldwork:], ldwork, 1, a[(i+nb)*lda+i+nb:], lda)
+
+ // Copy subdiagonal elements back into A, and diagonal elements into D.
+ for j := i; j < i+nb; j++ {
+ a[(j+1)*lda+j] = e[j]
+ d[j] = a[j*lda+j]
+ }
+ }
+ // Use unblocked code to reduce the last or only block.
+ impl.Dsytd2(uplo, n-i, a[i*lda+i:], lda, d[i:], e[i:], tau[i:])
+ }
+ work[0] = float64(iws)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dtbtrs.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dtbtrs.go
new file mode 100644
index 0000000000..6b56d9e0f0
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dtbtrs.go
@@ -0,0 +1,77 @@
+// Copyright ©2020 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+ +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dtbtrs solves a triangular system of the form +// +// A * X = B if trans == blas.NoTrans +// Aᵀ * X = B if trans == blas.Trans or blas.ConjTrans +// +// where A is an n×n triangular band matrix with kd super- or subdiagonals, and +// B is an n×nrhs matrix. +// +// Dtbtrs returns whether A is non-singular. If A is singular, no solution X is +// computed. +func (impl Implementation) Dtbtrs(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n, kd, nrhs int, a []float64, lda int, b []float64, ldb int) (ok bool) { + switch { + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case trans != blas.NoTrans && trans != blas.Trans && trans != blas.ConjTrans: + panic(badTrans) + case diag != blas.NonUnit && diag != blas.Unit: + panic(badDiag) + case n < 0: + panic(nLT0) + case kd < 0: + panic(kdLT0) + case nrhs < 0: + panic(nrhsLT0) + case lda < kd+1: + panic(badLdA) + case ldb < max(1, nrhs): + panic(badLdB) + } + + // Quick return if possible. + if n == 0 { + return true + } + + switch { + case len(a) < (n-1)*lda+kd+1: + panic(shortA) + case len(b) < (n-1)*ldb+nrhs: + panic(shortB) + } + + // Check for singularity. + if diag == blas.NonUnit { + if uplo == blas.Upper { + for i := 0; i < n; i++ { + if a[i*lda] == 0 { + return false + } + } + } else { + for i := 0; i < n; i++ { + if a[i*lda+kd] == 0 { + return false + } + } + } + } + + // Solve A * X = B or Aᵀ * X = B. + bi := blas64.Implementation() + for j := 0; j < nrhs; j++ { + bi.Dtbsv(uplo, trans, diag, n, kd, a, lda, b[j:], ldb) + } + return true +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dtgsja.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dtgsja.go new file mode 100644 index 0000000000..b3f0208a35 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dtgsja.go @@ -0,0 +1,389 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" +) + +// Dtgsja computes the generalized singular value decomposition (GSVD) +// of two real upper triangular or trapezoidal matrices A and B. +// +// A and B have the following forms, which may be obtained by the +// preprocessing subroutine Dggsvp from a general m×n matrix A and p×n +// matrix B: +// +// n-k-l k l +// A = k [ 0 A12 A13 ] if m-k-l >= 0; +// l [ 0 0 A23 ] +// m-k-l [ 0 0 0 ] +// +// n-k-l k l +// A = k [ 0 A12 A13 ] if m-k-l < 0; +// m-k [ 0 0 A23 ] +// +// n-k-l k l +// B = l [ 0 0 B13 ] +// p-l [ 0 0 0 ] +// +// where the k×k matrix A12 and l×l matrix B13 are non-singular +// upper triangular. A23 is l×l upper triangular if m-k-l >= 0, +// otherwise A23 is (m-k)×l upper trapezoidal. +// +// On exit, +// +// Uᵀ*A*Q = D1*[ 0 R ], Vᵀ*B*Q = D2*[ 0 R ], +// +// where U, V and Q are orthogonal matrices. +// R is a non-singular upper triangular matrix, and D1 and D2 are +// diagonal matrices, which are of the following structures: +// +// If m-k-l >= 0, +// +// k l +// D1 = k [ I 0 ] +// l [ 0 C ] +// m-k-l [ 0 0 ] +// +// k l +// D2 = l [ 0 S ] +// p-l [ 0 0 ] +// +// n-k-l k l +// [ 0 R ] = k [ 0 R11 R12 ] k +// l [ 0 0 R22 ] l +// +// where +// +// C = diag( alpha_k, ... , alpha_{k+l} ), +// S = diag( beta_k, ... , beta_{k+l} ), +// C^2 + S^2 = I. +// +// R is stored in +// +// A[0:k+l, n-k-l:n] +// +// on exit. 
+// +// If m-k-l < 0, +// +// k m-k k+l-m +// D1 = k [ I 0 0 ] +// m-k [ 0 C 0 ] +// +// k m-k k+l-m +// D2 = m-k [ 0 S 0 ] +// k+l-m [ 0 0 I ] +// p-l [ 0 0 0 ] +// +// n-k-l k m-k k+l-m +// [ 0 R ] = k [ 0 R11 R12 R13 ] +// m-k [ 0 0 R22 R23 ] +// k+l-m [ 0 0 0 R33 ] +// +// where +// +// C = diag( alpha_k, ... , alpha_m ), +// S = diag( beta_k, ... , beta_m ), +// C^2 + S^2 = I. +// +// R = [ R11 R12 R13 ] is stored in A[0:m, n-k-l:n] +// [ 0 R22 R23 ] +// +// and R33 is stored in +// +// B[m-k:l, n+m-k-l:n] on exit. +// +// The computation of the orthogonal transformation matrices U, V or Q +// is optional. These matrices may either be formed explicitly, or they +// may be post-multiplied into input matrices U1, V1, or Q1. +// +// Dtgsja essentially uses a variant of Kogbetliantz algorithm to reduce +// min(l,m-k)×l triangular or trapezoidal matrix A23 and l×l +// matrix B13 to the form: +// +// U1ᵀ*A13*Q1 = C1*R1; V1ᵀ*B13*Q1 = S1*R1, +// +// where U1, V1 and Q1 are orthogonal matrices. C1 and S1 are diagonal +// matrices satisfying +// +// C1^2 + S1^2 = I, +// +// and R1 is an l×l non-singular upper triangular matrix. +// +// jobU, jobV and jobQ are options for computing the orthogonal matrices. The behavior +// is as follows +// +// jobU == lapack.GSVDU Compute orthogonal matrix U +// jobU == lapack.GSVDUnit Use unit-initialized matrix +// jobU == lapack.GSVDNone Do not compute orthogonal matrix. +// +// The behavior is the same for jobV and jobQ with the exception that instead of +// lapack.GSVDU these accept lapack.GSVDV and lapack.GSVDQ respectively. +// The matrices U, V and Q must be m×m, p×p and n×n respectively unless the +// relevant job parameter is lapack.GSVDNone. +// +// k and l specify the sub-blocks in the input matrices A and B: +// +// A23 = A[k:min(k+l,m), n-l:n) and B13 = B[0:l, n-l:n] +// +// of A and B, whose GSVD is going to be computed by Dtgsja. +// +// tola and tolb are the convergence criteria for the Jacobi-Kogbetliantz +// iteration procedure. Generally, they are the same as used in the preprocessing +// step, for example, +// +// tola = max(m, n)*norm(A)*eps, +// tolb = max(p, n)*norm(B)*eps, +// +// where eps is the machine epsilon. +// +// work must have length at least 2*n, otherwise Dtgsja will panic. +// +// alpha and beta must have length n or Dtgsja will panic. On exit, alpha and +// beta contain the generalized singular value pairs of A and B +// +// alpha[0:k] = 1, +// beta[0:k] = 0, +// +// if m-k-l >= 0, +// +// alpha[k:k+l] = diag(C), +// beta[k:k+l] = diag(S), +// +// if m-k-l < 0, +// +// alpha[k:m]= C, alpha[m:k+l]= 0 +// beta[k:m] = S, beta[m:k+l] = 1. +// +// if k+l < n, +// +// alpha[k+l:n] = 0 and +// beta[k+l:n] = 0. +// +// On exit, A[n-k:n, 0:min(k+l,m)] contains the triangular matrix R or part of R +// and if necessary, B[m-k:l, n+m-k-l:n] contains a part of R. +// +// Dtgsja returns whether the routine converged and the number of iteration cycles +// that were run. +// +// Dtgsja is an internal routine. It is exported for testing purposes. 
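+//
+// An illustrative call sequence (a sketch only; it assumes m, p, n, k and l
+// come from a preceding Dggsvp-style preprocessing step and that all slices
+// are sized as described above):
+//
+// work := make([]float64, 2*n)
+// cycles, ok := impl.Dtgsja(lapack.GSVDUnit, lapack.GSVDUnit, lapack.GSVDUnit,
+// m, p, n, k, l, a, lda, b, ldb, tola, tolb,
+// alpha, beta, u, ldu, v, ldv, q, ldq, work)
+// if ok {
+// // The generalized singular value pairs are (alpha[i], beta[i]); for
+// // i in [k, min(k+l, m)) with beta[i] != 0 the generalized singular
+// // values are alpha[i]/beta[i].
+// }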
+func (impl Implementation) Dtgsja(jobU, jobV, jobQ lapack.GSVDJob, m, p, n, k, l int, a []float64, lda int, b []float64, ldb int, tola, tolb float64, alpha, beta, u []float64, ldu int, v []float64, ldv int, q []float64, ldq int, work []float64) (cycles int, ok bool) { + const maxit = 40 + + initu := jobU == lapack.GSVDUnit + wantu := initu || jobU == lapack.GSVDU + + initv := jobV == lapack.GSVDUnit + wantv := initv || jobV == lapack.GSVDV + + initq := jobQ == lapack.GSVDUnit + wantq := initq || jobQ == lapack.GSVDQ + + switch { + case !initu && !wantu && jobU != lapack.GSVDNone: + panic(badGSVDJob + "U") + case !initv && !wantv && jobV != lapack.GSVDNone: + panic(badGSVDJob + "V") + case !initq && !wantq && jobQ != lapack.GSVDNone: + panic(badGSVDJob + "Q") + case m < 0: + panic(mLT0) + case p < 0: + panic(pLT0) + case n < 0: + panic(nLT0) + + case lda < max(1, n): + panic(badLdA) + case len(a) < (m-1)*lda+n: + panic(shortA) + + case ldb < max(1, n): + panic(badLdB) + case len(b) < (p-1)*ldb+n: + panic(shortB) + + case len(alpha) != n: + panic(badLenAlpha) + case len(beta) != n: + panic(badLenBeta) + + case ldu < 1, wantu && ldu < m: + panic(badLdU) + case wantu && len(u) < (m-1)*ldu+m: + panic(shortU) + + case ldv < 1, wantv && ldv < p: + panic(badLdV) + case wantv && len(v) < (p-1)*ldv+p: + panic(shortV) + + case ldq < 1, wantq && ldq < n: + panic(badLdQ) + case wantq && len(q) < (n-1)*ldq+n: + panic(shortQ) + + case len(work) < 2*n: + panic(shortWork) + } + + // Initialize U, V and Q, if necessary + if initu { + impl.Dlaset(blas.All, m, m, 0, 1, u, ldu) + } + if initv { + impl.Dlaset(blas.All, p, p, 0, 1, v, ldv) + } + if initq { + impl.Dlaset(blas.All, n, n, 0, 1, q, ldq) + } + + bi := blas64.Implementation() + minTol := math.Min(tola, tolb) + + // Loop until convergence. + upper := false + for cycles = 1; cycles <= maxit; cycles++ { + upper = !upper + + for i := 0; i < l-1; i++ { + for j := i + 1; j < l; j++ { + var a1, a2, a3 float64 + if k+i < m { + a1 = a[(k+i)*lda+n-l+i] + } + if k+j < m { + a3 = a[(k+j)*lda+n-l+j] + } + + b1 := b[i*ldb+n-l+i] + b3 := b[j*ldb+n-l+j] + + var b2 float64 + if upper { + if k+i < m { + a2 = a[(k+i)*lda+n-l+j] + } + b2 = b[i*ldb+n-l+j] + } else { + if k+j < m { + a2 = a[(k+j)*lda+n-l+i] + } + b2 = b[j*ldb+n-l+i] + } + + csu, snu, csv, snv, csq, snq := impl.Dlags2(upper, a1, a2, a3, b1, b2, b3) + + // Update (k+i)-th and (k+j)-th rows of matrix A: Uᵀ*A. + if k+j < m { + bi.Drot(l, a[(k+j)*lda+n-l:], 1, a[(k+i)*lda+n-l:], 1, csu, snu) + } + + // Update i-th and j-th rows of matrix B: Vᵀ*B. + bi.Drot(l, b[j*ldb+n-l:], 1, b[i*ldb+n-l:], 1, csv, snv) + + // Update (n-l+i)-th and (n-l+j)-th columns of matrices + // A and B: A*Q and B*Q. + bi.Drot(min(k+l, m), a[n-l+j:], lda, a[n-l+i:], lda, csq, snq) + bi.Drot(l, b[n-l+j:], ldb, b[n-l+i:], ldb, csq, snq) + + if upper { + if k+i < m { + a[(k+i)*lda+n-l+j] = 0 + } + b[i*ldb+n-l+j] = 0 + } else { + if k+j < m { + a[(k+j)*lda+n-l+i] = 0 + } + b[j*ldb+n-l+i] = 0 + } + + // Update orthogonal matrices U, V, Q, if desired. + if wantu && k+j < m { + bi.Drot(m, u[k+j:], ldu, u[k+i:], ldu, csu, snu) + } + if wantv { + bi.Drot(p, v[j:], ldv, v[i:], ldv, csv, snv) + } + if wantq { + bi.Drot(n, q[n-l+j:], ldq, q[n-l+i:], ldq, csq, snq) + } + } + } + + if !upper { + // The matrices A13 and B13 were lower triangular at the start + // of the cycle, and are now upper triangular. + // + // Convergence test: test the parallelism of the corresponding + // rows of A and B. 
+ var error float64
+ for i := 0; i < min(l, m-k); i++ {
+ bi.Dcopy(l-i, a[(k+i)*lda+n-l+i:], 1, work, 1)
+ bi.Dcopy(l-i, b[i*ldb+n-l+i:], 1, work[l:], 1)
+ ssmin := impl.Dlapll(l-i, work, 1, work[l:], 1)
+ error = math.Max(error, ssmin)
+ }
+ if error <= minTol {
+ // The algorithm has converged.
+ // Compute the generalized singular value pairs (alpha, beta)
+ // and set the triangular matrix R to array A.
+ for i := 0; i < k; i++ {
+ alpha[i] = 1
+ beta[i] = 0
+ }
+
+ for i := 0; i < min(l, m-k); i++ {
+ a1 := a[(k+i)*lda+n-l+i]
+ b1 := b[i*ldb+n-l+i]
+ if a1 != 0 {
+ gamma := b1 / a1
+ // Change sign if necessary.
+ if gamma < 0 {
+ bi.Dscal(l-i, -1, b[i*ldb+n-l+i:], 1)
+ if wantv {
+ bi.Dscal(p, -1, v[i:], ldv)
+ }
+ }
+ beta[k+i], alpha[k+i], _ = impl.Dlartg(math.Abs(gamma), 1)
+
+ if alpha[k+i] >= beta[k+i] {
+ bi.Dscal(l-i, 1/alpha[k+i], a[(k+i)*lda+n-l+i:], 1)
+ } else {
+ bi.Dscal(l-i, 1/beta[k+i], b[i*ldb+n-l+i:], 1)
+ bi.Dcopy(l-i, b[i*ldb+n-l+i:], 1, a[(k+i)*lda+n-l+i:], 1)
+ }
+ } else {
+ alpha[k+i] = 0
+ beta[k+i] = 1
+ bi.Dcopy(l-i, b[i*ldb+n-l+i:], 1, a[(k+i)*lda+n-l+i:], 1)
+ }
+ }
+
+ for i := m; i < k+l; i++ {
+ alpha[i] = 0
+ beta[i] = 1
+ }
+ if k+l < n {
+ for i := k + l; i < n; i++ {
+ alpha[i] = 0
+ beta[i] = 0
+ }
+ }
+
+ return cycles, true
+ }
+ }
+ }
+
+ // The algorithm has not converged after maxit cycles.
+ return cycles, false
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dtrcon.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrcon.go
new file mode 100644
index 0000000000..899c95dd58
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrcon.go
@@ -0,0 +1,90 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+ "math"
+
+ "gonum.org/v1/gonum/blas"
+ "gonum.org/v1/gonum/blas/blas64"
+ "gonum.org/v1/gonum/lapack"
+)
+
+// Dtrcon estimates the reciprocal of the condition number of a triangular matrix A.
+// The condition number computed may be based on the 1-norm or the ∞-norm.
+//
+// work is a temporary data slice of length at least 3*n and Dtrcon will panic otherwise.
+//
+// iwork is a temporary data slice of length at least n and Dtrcon will panic otherwise.
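+//
+// A minimal calling sketch (assuming a holds an n×n non-unit upper triangular
+// matrix in row-major layout with stride lda):
+//
+// work := make([]float64, 3*n)
+// iwork := make([]int, n)
+// rcond := impl.Dtrcon(lapack.MaxRowSum, blas.Upper, blas.NonUnit, n, a, lda, work, iwork)
+//
+// rcond is near 0 for an ill-conditioned A and near 1 for a well-conditioned A.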
+func (impl Implementation) Dtrcon(norm lapack.MatrixNorm, uplo blas.Uplo, diag blas.Diag, n int, a []float64, lda int, work []float64, iwork []int) float64 { + switch { + case norm != lapack.MaxColumnSum && norm != lapack.MaxRowSum: + panic(badNorm) + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case diag != blas.NonUnit && diag != blas.Unit: + panic(badDiag) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + if n == 0 { + return 1 + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case len(work) < 3*n: + panic(shortWork) + case len(iwork) < n: + panic(shortIWork) + } + + bi := blas64.Implementation() + + var rcond float64 + smlnum := dlamchS * float64(n) + + anorm := impl.Dlantr(norm, uplo, diag, n, n, a, lda, work) + + if anorm <= 0 { + return rcond + } + var ainvnm float64 + var normin bool + kase1 := 2 + if norm == lapack.MaxColumnSum { + kase1 = 1 + } + var kase int + isave := new([3]int) + var scale float64 + for { + ainvnm, kase = impl.Dlacn2(n, work[n:], work, iwork, ainvnm, kase, isave) + if kase == 0 { + if ainvnm != 0 { + rcond = (1 / anorm) / ainvnm + } + return rcond + } + if kase == kase1 { + scale = impl.Dlatrs(uplo, blas.NoTrans, diag, normin, n, a, lda, work, work[2*n:]) + } else { + scale = impl.Dlatrs(uplo, blas.Trans, diag, normin, n, a, lda, work, work[2*n:]) + } + normin = true + if scale != 1 { + ix := bi.Idamax(n, work, 1) + xnorm := math.Abs(work[ix]) + if scale == 0 || scale < xnorm*smlnum { + return rcond + } + impl.Drscl(n, scale, work, 1) + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dtrevc3.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrevc3.go new file mode 100644 index 0000000000..86197d3af5 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrevc3.go @@ -0,0 +1,894 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" +) + +// Dtrevc3 computes some or all of the right and/or left eigenvectors of an n×n +// upper quasi-triangular matrix T in Schur canonical form. Matrices of this +// type are produced by the Schur factorization of a real general matrix A +// +// A = Q T Qᵀ, +// +// as computed by Dhseqr. +// +// The right eigenvector x of T corresponding to an +// eigenvalue λ is defined by +// +// T x = λ x, +// +// and the left eigenvector y is defined by +// +// yᵀ T = λ yᵀ. +// +// The eigenvalues are read directly from the diagonal blocks of T. +// +// This routine returns the matrices X and/or Y of right and left eigenvectors +// of T, or the products Q*X and/or Q*Y, where Q is an input matrix. If Q is the +// orthogonal factor that reduces a matrix A to Schur form T, then Q*X and Q*Y +// are the matrices of right and left eigenvectors of A. +// +// If side == lapack.EVRight, only right eigenvectors will be computed. +// If side == lapack.EVLeft, only left eigenvectors will be computed. +// If side == lapack.EVBoth, both right and left eigenvectors will be computed. +// For other values of side, Dtrevc3 will panic. +// +// If howmny == lapack.EVAll, all right and/or left eigenvectors will be +// computed. +// If howmny == lapack.EVAllMulQ, all right and/or left eigenvectors will be +// computed and multiplied from left by the matrices in VR and/or VL. 
+// If howmny == lapack.EVSelected, right and/or left eigenvectors will be
+// computed as indicated by selected.
+// For other values of howmny, Dtrevc3 will panic.
+//
+// selected specifies which eigenvectors will be computed. It must have length n
+// if howmny == lapack.EVSelected, and it is not referenced otherwise.
+// If w_j is a real eigenvalue, the corresponding real eigenvector will be
+// computed if selected[j] is true.
+// If w_j and w_{j+1} are the real and imaginary parts of a complex eigenvalue,
+// the corresponding complex eigenvector is computed if either selected[j] or
+// selected[j+1] is true, and on return selected[j] will be set to true and
+// selected[j+1] will be set to false.
+//
+// VL and VR are n×mm matrices. If howmny is lapack.EVAll or
+// lapack.EVAllMulQ, mm must be at least n. If howmny is
+// lapack.EVSelected, mm must be large enough to store the selected
+// eigenvectors. Each selected real eigenvector occupies one column and each
+// selected complex eigenvector occupies two columns. If mm is not sufficiently
+// large, Dtrevc3 will panic.
+//
+// On entry, if howmny is lapack.EVAllMulQ, it is assumed that VL (if side
+// is lapack.EVLeft or lapack.EVBoth) contains an n×n matrix QL,
+// and that VR (if side is lapack.EVRight or lapack.EVBoth) contains
+// an n×n matrix QR. QL and QR are typically the orthogonal matrix Q of Schur
+// vectors returned by Dhseqr.
+//
+// On return, if side is lapack.EVLeft or lapack.EVBoth,
+// VL will contain:
+//
+// if howmny == lapack.EVAll, the matrix Y of left eigenvectors of T,
+// if howmny == lapack.EVAllMulQ, the matrix Q*Y,
+// if howmny == lapack.EVSelected, the left eigenvectors of T specified by
+// selected, stored consecutively in the
+// columns of VL, in the same order as their
+// eigenvalues.
+//
+// VL is not referenced if side == lapack.EVRight.
+//
+// On return, if side is lapack.EVRight or lapack.EVBoth,
+// VR will contain:
+//
+// if howmny == lapack.EVAll, the matrix X of right eigenvectors of T,
+// if howmny == lapack.EVAllMulQ, the matrix Q*X,
+// if howmny == lapack.EVSelected, the right eigenvectors of T specified by
+// selected, stored consecutively in the
+// columns of VR, in the same order as their
+// eigenvalues.
+//
+// VR is not referenced if side == lapack.EVLeft.
+//
+// Complex eigenvectors corresponding to a complex eigenvalue are stored in VL
+// and VR in two consecutive columns, the first holding the real part, and the
+// second the imaginary part.
+//
+// Each eigenvector will be normalized so that the element of largest magnitude
+// has magnitude 1. Here the magnitude of a complex number (x,y) is taken to be
+// |x| + |y|.
+//
+// work must have length at least lwork and lwork must be at least max(1,3*n),
+// otherwise Dtrevc3 will panic. For optimum performance, lwork should be at
+// least n+2*n*nb, where nb is the optimal blocksize.
+//
+// If lwork == -1, instead of performing Dtrevc3, the function only estimates
+// the optimal workspace size based on n and stores it into work[0].
+//
+// Dtrevc3 returns the number of columns in VL and/or VR actually used to store
+// the eigenvectors.
+//
+// Dtrevc3 is an internal routine. It is exported for testing purposes.
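+//
+// A typical call sequence (a sketch; t, ldt, vr and ldvr are assumed to come
+// from a preceding Dhseqr call, with vr holding the matrix Q of Schur vectors)
+// first queries the optimal workspace size and then computes all right
+// eigenvectors of the original matrix:
+//
+// var query [1]float64
+// impl.Dtrevc3(lapack.EVRight, lapack.EVAllMulQ, nil, n, t, ldt, nil, 1, vr, ldvr, n, query[:], -1)
+// work := make([]float64, int(query[0]))
+// impl.Dtrevc3(lapack.EVRight, lapack.EVAllMulQ, nil, n, t, ldt, nil, 1, vr, ldvr, n, work, len(work))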
+func (impl Implementation) Dtrevc3(side lapack.EVSide, howmny lapack.EVHowMany, selected []bool, n int, t []float64, ldt int, vl []float64, ldvl int, vr []float64, ldvr int, mm int, work []float64, lwork int) (m int) { + bothv := side == lapack.EVBoth + rightv := side == lapack.EVRight || bothv + leftv := side == lapack.EVLeft || bothv + switch { + case !rightv && !leftv: + panic(badEVSide) + case howmny != lapack.EVAll && howmny != lapack.EVAllMulQ && howmny != lapack.EVSelected: + panic(badEVHowMany) + case n < 0: + panic(nLT0) + case ldt < max(1, n): + panic(badLdT) + case mm < 0: + panic(mmLT0) + case ldvl < 1: + // ldvl and ldvr are also checked below after the computation of + // m (number of columns of VL and VR) in case of howmny == EVSelected. + panic(badLdVL) + case ldvr < 1: + panic(badLdVR) + case lwork < max(1, 3*n) && lwork != -1: + panic(badLWork) + case len(work) < max(1, lwork): + panic(shortWork) + } + + // Quick return if possible. + if n == 0 { + work[0] = 1 + return 0 + } + + // Normally we don't check slice lengths until after the workspace + // query. However, even in case of the workspace query we need to + // compute and return the value of m, and since the computation accesses t, + // we put the length check of t here. + if len(t) < (n-1)*ldt+n { + panic(shortT) + } + + if howmny == lapack.EVSelected { + if len(selected) != n { + panic(badLenSelected) + } + // Set m to the number of columns required to store the selected + // eigenvectors, and standardize the slice selected. + // Each selected real eigenvector occupies one column and each + // selected complex eigenvector occupies two columns. + for j := 0; j < n; { + if j == n-1 || t[(j+1)*ldt+j] == 0 { + // Diagonal 1×1 block corresponding to a + // real eigenvalue. + if selected[j] { + m++ + } + j++ + } else { + // Diagonal 2×2 block corresponding to a + // complex eigenvalue. + if selected[j] || selected[j+1] { + selected[j] = true + selected[j+1] = false + m += 2 + } + j += 2 + } + } + } else { + m = n + } + if mm < m { + panic(badMm) + } + + // Quick return in case of a workspace query. + nb := impl.Ilaenv(1, "DTREVC", string(side)+string(howmny), n, -1, -1, -1) + if lwork == -1 { + work[0] = float64(n + 2*n*nb) + return m + } + + // Quick return if no eigenvectors were selected. + if m == 0 { + return 0 + } + + switch { + case leftv && ldvl < mm: + panic(badLdVL) + case leftv && len(vl) < (n-1)*ldvl+mm: + panic(shortVL) + + case rightv && ldvr < mm: + panic(badLdVR) + case rightv && len(vr) < (n-1)*ldvr+mm: + panic(shortVR) + } + + // Use blocked version of back-transformation if sufficient workspace. + // Zero-out the workspace to avoid potential NaN propagation. + const ( + nbmin = 8 + nbmax = 128 + ) + if howmny == lapack.EVAllMulQ && lwork >= n+2*n*nbmin { + nb = min((lwork-n)/(2*n), nbmax) + impl.Dlaset(blas.All, n, 1+2*nb, 0, 0, work[:n+2*nb*n], 1+2*nb) + } else { + nb = 1 + } + + // Set the constants to control overflow. + ulp := dlamchP + smlnum := float64(n) / ulp * dlamchS + bignum := (1 - ulp) / smlnum + + // Split work into a vector of column norms and an n×2*nb matrix b. + norms := work[:n] + ldb := 2 * nb + b := work[n : n+n*ldb] + + // Compute 1-norm of each column of strictly upper triangular part of T + // to control overflow in triangular solver. + norms[0] = 0 + for j := 1; j < n; j++ { + var cn float64 + for i := 0; i < j; i++ { + cn += math.Abs(t[i*ldt+j]) + } + norms[j] = cn + } + + bi := blas64.Implementation() + + var ( + x [4]float64 + + iv int // Index of column in current block. 
+ is int + + // ip is used below to specify the real or complex eigenvalue: + // ip == 0, real eigenvalue, + // 1, first of conjugate complex pair (wr,wi), + // -1, second of conjugate complex pair (wr,wi). + ip int + iscomplex [nbmax]int // Stores ip for each column in current block. + ) + + if side == lapack.EVLeft { + goto leftev + } + + // Compute right eigenvectors. + + // For complex right vector, iv-1 is for real part and iv for complex + // part. Non-blocked version always uses iv=1, blocked version starts + // with iv=nb-1 and goes down to 0 or 1. + iv = max(2, nb) - 1 + ip = 0 + is = m - 1 + for ki := n - 1; ki >= 0; ki-- { + if ip == -1 { + // Previous iteration (ki+1) was second of + // conjugate pair, so this ki is first of + // conjugate pair. + ip = 1 + continue + } + + if ki == 0 || t[ki*ldt+ki-1] == 0 { + // Last column or zero on sub-diagonal, so this + // ki must be real eigenvalue. + ip = 0 + } else { + // Non-zero on sub-diagonal, so this ki is + // second of conjugate pair. + ip = -1 + } + + if howmny == lapack.EVSelected { + if ip == 0 { + if !selected[ki] { + continue + } + } else if !selected[ki-1] { + continue + } + } + + // Compute the ki-th eigenvalue (wr,wi). + wr := t[ki*ldt+ki] + var wi float64 + if ip != 0 { + wi = math.Sqrt(math.Abs(t[ki*ldt+ki-1])) * math.Sqrt(math.Abs(t[(ki-1)*ldt+ki])) + } + smin := math.Max(ulp*(math.Abs(wr)+math.Abs(wi)), smlnum) + + if ip == 0 { + // Real right eigenvector. + + b[ki*ldb+iv] = 1 + // Form right-hand side. + for k := 0; k < ki; k++ { + b[k*ldb+iv] = -t[k*ldt+ki] + } + // Solve upper quasi-triangular system: + // [ T[0:ki,0:ki] - wr ]*X = scale*b. + for j := ki - 1; j >= 0; { + if j == 0 || t[j*ldt+j-1] == 0 { + // 1×1 diagonal block. + scale, xnorm, _ := impl.Dlaln2(false, 1, 1, smin, 1, t[j*ldt+j:], ldt, + 1, 1, b[j*ldb+iv:], ldb, wr, 0, x[:1], 2) + // Scale X[0,0] to avoid overflow when updating the + // right-hand side. + if xnorm > 1 && norms[j] > bignum/xnorm { + x[0] /= xnorm + scale /= xnorm + } + // Scale if necessary. + if scale != 1 { + bi.Dscal(ki+1, scale, b[iv:], ldb) + } + b[j*ldb+iv] = x[0] + // Update right-hand side. + bi.Daxpy(j, -x[0], t[j:], ldt, b[iv:], ldb) + j-- + } else { + // 2×2 diagonal block. + scale, xnorm, _ := impl.Dlaln2(false, 2, 1, smin, 1, t[(j-1)*ldt+j-1:], ldt, + 1, 1, b[(j-1)*ldb+iv:], ldb, wr, 0, x[:3], 2) + // Scale X[0,0] and X[1,0] to avoid overflow + // when updating the right-hand side. + if xnorm > 1 { + beta := math.Max(norms[j-1], norms[j]) + if beta > bignum/xnorm { + x[0] /= xnorm + x[2] /= xnorm + scale /= xnorm + } + } + // Scale if necessary. + if scale != 1 { + bi.Dscal(ki+1, scale, b[iv:], ldb) + } + b[(j-1)*ldb+iv] = x[0] + b[j*ldb+iv] = x[2] + // Update right-hand side. + bi.Daxpy(j-1, -x[0], t[j-1:], ldt, b[iv:], ldb) + bi.Daxpy(j-1, -x[2], t[j:], ldt, b[iv:], ldb) + j -= 2 + } + } + // Copy the vector x or Q*x to VR and normalize. + switch { + case howmny != lapack.EVAllMulQ: + // No back-transform: copy x to VR and normalize. + bi.Dcopy(ki+1, b[iv:], ldb, vr[is:], ldvr) + ii := bi.Idamax(ki+1, vr[is:], ldvr) + remax := 1 / math.Abs(vr[ii*ldvr+is]) + bi.Dscal(ki+1, remax, vr[is:], ldvr) + for k := ki + 1; k < n; k++ { + vr[k*ldvr+is] = 0 + } + case nb == 1: + // Version 1: back-transform each vector with GEMV, Q*x. 
+ if ki > 0 { + bi.Dgemv(blas.NoTrans, n, ki, 1, vr, ldvr, b[iv:], ldb, + b[ki*ldb+iv], vr[ki:], ldvr) + } + ii := bi.Idamax(n, vr[ki:], ldvr) + remax := 1 / math.Abs(vr[ii*ldvr+ki]) + bi.Dscal(n, remax, vr[ki:], ldvr) + default: + // Version 2: back-transform block of vectors with GEMM. + // Zero out below vector. + for k := ki + 1; k < n; k++ { + b[k*ldb+iv] = 0 + } + iscomplex[iv] = ip + // Back-transform and normalization is done below. + } + } else { + // Complex right eigenvector. + + // Initial solve + // [ ( T[ki-1,ki-1] T[ki-1,ki] ) - (wr + i*wi) ]*X = 0. + // [ ( T[ki, ki-1] T[ki, ki] ) ] + if math.Abs(t[(ki-1)*ldt+ki]) >= math.Abs(t[ki*ldt+ki-1]) { + b[(ki-1)*ldb+iv-1] = 1 + b[ki*ldb+iv] = wi / t[(ki-1)*ldt+ki] + } else { + b[(ki-1)*ldb+iv-1] = -wi / t[ki*ldt+ki-1] + b[ki*ldb+iv] = 1 + } + b[ki*ldb+iv-1] = 0 + b[(ki-1)*ldb+iv] = 0 + // Form right-hand side. + for k := 0; k < ki-1; k++ { + b[k*ldb+iv-1] = -b[(ki-1)*ldb+iv-1] * t[k*ldt+ki-1] + b[k*ldb+iv] = -b[ki*ldb+iv] * t[k*ldt+ki] + } + // Solve upper quasi-triangular system: + // [ T[0:ki-1,0:ki-1] - (wr+i*wi) ]*X = scale*(b1+i*b2) + for j := ki - 2; j >= 0; { + if j == 0 || t[j*ldt+j-1] == 0 { + // 1×1 diagonal block. + + scale, xnorm, _ := impl.Dlaln2(false, 1, 2, smin, 1, t[j*ldt+j:], ldt, + 1, 1, b[j*ldb+iv-1:], ldb, wr, wi, x[:2], 2) + // Scale X[0,0] and X[0,1] to avoid + // overflow when updating the right-hand side. + if xnorm > 1 && norms[j] > bignum/xnorm { + x[0] /= xnorm + x[1] /= xnorm + scale /= xnorm + } + // Scale if necessary. + if scale != 1 { + bi.Dscal(ki+1, scale, b[iv-1:], ldb) + bi.Dscal(ki+1, scale, b[iv:], ldb) + } + b[j*ldb+iv-1] = x[0] + b[j*ldb+iv] = x[1] + // Update the right-hand side. + bi.Daxpy(j, -x[0], t[j:], ldt, b[iv-1:], ldb) + bi.Daxpy(j, -x[1], t[j:], ldt, b[iv:], ldb) + j-- + } else { + // 2×2 diagonal block. + + scale, xnorm, _ := impl.Dlaln2(false, 2, 2, smin, 1, t[(j-1)*ldt+j-1:], ldt, + 1, 1, b[(j-1)*ldb+iv-1:], ldb, wr, wi, x[:], 2) + // Scale X to avoid overflow when updating + // the right-hand side. + if xnorm > 1 { + beta := math.Max(norms[j-1], norms[j]) + if beta > bignum/xnorm { + rec := 1 / xnorm + x[0] *= rec + x[1] *= rec + x[2] *= rec + x[3] *= rec + scale *= rec + } + } + // Scale if necessary. + if scale != 1 { + bi.Dscal(ki+1, scale, b[iv-1:], ldb) + bi.Dscal(ki+1, scale, b[iv:], ldb) + } + b[(j-1)*ldb+iv-1] = x[0] + b[(j-1)*ldb+iv] = x[1] + b[j*ldb+iv-1] = x[2] + b[j*ldb+iv] = x[3] + // Update the right-hand side. + bi.Daxpy(j-1, -x[0], t[j-1:], ldt, b[iv-1:], ldb) + bi.Daxpy(j-1, -x[1], t[j-1:], ldt, b[iv:], ldb) + bi.Daxpy(j-1, -x[2], t[j:], ldt, b[iv-1:], ldb) + bi.Daxpy(j-1, -x[3], t[j:], ldt, b[iv:], ldb) + j -= 2 + } + } + + // Copy the vector x or Q*x to VR and normalize. + switch { + case howmny != lapack.EVAllMulQ: + // No back-transform: copy x to VR and normalize. + bi.Dcopy(ki+1, b[iv-1:], ldb, vr[is-1:], ldvr) + bi.Dcopy(ki+1, b[iv:], ldb, vr[is:], ldvr) + emax := 0.0 + for k := 0; k <= ki; k++ { + emax = math.Max(emax, math.Abs(vr[k*ldvr+is-1])+math.Abs(vr[k*ldvr+is])) + } + remax := 1 / emax + bi.Dscal(ki+1, remax, vr[is-1:], ldvr) + bi.Dscal(ki+1, remax, vr[is:], ldvr) + for k := ki + 1; k < n; k++ { + vr[k*ldvr+is-1] = 0 + vr[k*ldvr+is] = 0 + } + case nb == 1: + // Version 1: back-transform each vector with GEMV, Q*x. 
+ if ki-1 > 0 { + bi.Dgemv(blas.NoTrans, n, ki-1, 1, vr, ldvr, b[iv-1:], ldb, + b[(ki-1)*ldb+iv-1], vr[ki-1:], ldvr) + bi.Dgemv(blas.NoTrans, n, ki-1, 1, vr, ldvr, b[iv:], ldb, + b[ki*ldb+iv], vr[ki:], ldvr) + } else { + bi.Dscal(n, b[(ki-1)*ldb+iv-1], vr[ki-1:], ldvr) + bi.Dscal(n, b[ki*ldb+iv], vr[ki:], ldvr) + } + emax := 0.0 + for k := 0; k < n; k++ { + emax = math.Max(emax, math.Abs(vr[k*ldvr+ki-1])+math.Abs(vr[k*ldvr+ki])) + } + remax := 1 / emax + bi.Dscal(n, remax, vr[ki-1:], ldvr) + bi.Dscal(n, remax, vr[ki:], ldvr) + default: + // Version 2: back-transform block of vectors with GEMM. + // Zero out below vector. + for k := ki + 1; k < n; k++ { + b[k*ldb+iv-1] = 0 + b[k*ldb+iv] = 0 + } + iscomplex[iv-1] = -ip + iscomplex[iv] = ip + iv-- + // Back-transform and normalization is done below. + } + } + if nb > 1 { + // Blocked version of back-transform. + + // For complex case, ki2 includes both vectors (ki-1 and ki). + ki2 := ki + if ip != 0 { + ki2-- + } + // Columns iv:nb of b are valid vectors. + // When the number of vectors stored reaches nb-1 or nb, + // or if this was last vector, do the Gemm. + if iv < 2 || ki2 == 0 { + bi.Dgemm(blas.NoTrans, blas.NoTrans, n, nb-iv, ki2+nb-iv, + 1, vr, ldvr, b[iv:], ldb, + 0, b[nb+iv:], ldb) + // Normalize vectors. + var remax float64 + for k := iv; k < nb; k++ { + if iscomplex[k] == 0 { + // Real eigenvector. + ii := bi.Idamax(n, b[nb+k:], ldb) + remax = 1 / math.Abs(b[ii*ldb+nb+k]) + } else if iscomplex[k] == 1 { + // First eigenvector of conjugate pair. + emax := 0.0 + for ii := 0; ii < n; ii++ { + emax = math.Max(emax, math.Abs(b[ii*ldb+nb+k])+math.Abs(b[ii*ldb+nb+k+1])) + } + remax = 1 / emax + // Second eigenvector of conjugate pair + // will reuse this value of remax. + } + bi.Dscal(n, remax, b[nb+k:], ldb) + } + impl.Dlacpy(blas.All, n, nb-iv, b[nb+iv:], ldb, vr[ki2:], ldvr) + iv = nb - 1 + } else { + iv-- + } + } + is-- + if ip != 0 { + is-- + } + } + + if side == lapack.EVRight { + return m + } + +leftev: + // Compute left eigenvectors. + + // For complex left vector, iv is for real part and iv+1 for complex + // part. Non-blocked version always uses iv=0. Blocked version starts + // with iv=0, goes up to nb-2 or nb-1. + iv = 0 + ip = 0 + is = 0 + for ki := 0; ki < n; ki++ { + if ip == 1 { + // Previous iteration ki-1 was first of conjugate pair, + // so this ki is second of conjugate pair. + ip = -1 + continue + } + + if ki == n-1 || t[(ki+1)*ldt+ki] == 0 { + // Last column or zero on sub-diagonal, so this ki must + // be real eigenvalue. + ip = 0 + } else { + // Non-zero on sub-diagonal, so this ki is first of + // conjugate pair. + ip = 1 + } + if howmny == lapack.EVSelected && !selected[ki] { + continue + } + + // Compute the ki-th eigenvalue (wr,wi). + wr := t[ki*ldt+ki] + var wi float64 + if ip != 0 { + wi = math.Sqrt(math.Abs(t[ki*ldt+ki+1])) * math.Sqrt(math.Abs(t[(ki+1)*ldt+ki])) + } + smin := math.Max(ulp*(math.Abs(wr)+math.Abs(wi)), smlnum) + + if ip == 0 { + // Real left eigenvector. + + b[ki*ldb+iv] = 1 + // Form right-hand side. + for k := ki + 1; k < n; k++ { + b[k*ldb+iv] = -t[ki*ldt+k] + } + // Solve transposed quasi-triangular system: + // [ T[ki+1:n,ki+1:n] - wr ]ᵀ * X = scale*b + vmax := 1.0 + vcrit := bignum + for j := ki + 1; j < n; { + if j == n-1 || t[(j+1)*ldt+j] == 0 { + // 1×1 diagonal block. + + // Scale if necessary to avoid overflow + // when forming the right-hand side. 
+ if norms[j] > vcrit { + rec := 1 / vmax + bi.Dscal(n-ki, rec, b[ki*ldb+iv:], ldb) + vmax = 1 + } + b[j*ldb+iv] -= bi.Ddot(j-ki-1, t[(ki+1)*ldt+j:], ldt, b[(ki+1)*ldb+iv:], ldb) + // Solve [ T[j,j] - wr ]ᵀ * X = b. + scale, _, _ := impl.Dlaln2(false, 1, 1, smin, 1, t[j*ldt+j:], ldt, + 1, 1, b[j*ldb+iv:], ldb, wr, 0, x[:1], 2) + // Scale if necessary. + if scale != 1 { + bi.Dscal(n-ki, scale, b[ki*ldb+iv:], ldb) + } + b[j*ldb+iv] = x[0] + vmax = math.Max(math.Abs(b[j*ldb+iv]), vmax) + vcrit = bignum / vmax + j++ + } else { + // 2×2 diagonal block. + + // Scale if necessary to avoid overflow + // when forming the right-hand side. + beta := math.Max(norms[j], norms[j+1]) + if beta > vcrit { + bi.Dscal(n-ki, 1/vmax, b[ki*ldb+iv:], ldb) + vmax = 1 + } + b[j*ldb+iv] -= bi.Ddot(j-ki-1, t[(ki+1)*ldt+j:], ldt, b[(ki+1)*ldb+iv:], ldb) + b[(j+1)*ldb+iv] -= bi.Ddot(j-ki-1, t[(ki+1)*ldt+j+1:], ldt, b[(ki+1)*ldb+iv:], ldb) + // Solve + // [ T[j,j]-wr T[j,j+1] ]ᵀ * X = scale*[ b1 ] + // [ T[j+1,j] T[j+1,j+1]-wr ] [ b2 ] + scale, _, _ := impl.Dlaln2(true, 2, 1, smin, 1, t[j*ldt+j:], ldt, + 1, 1, b[j*ldb+iv:], ldb, wr, 0, x[:3], 2) + // Scale if necessary. + if scale != 1 { + bi.Dscal(n-ki, scale, b[ki*ldb+iv:], ldb) + } + b[j*ldb+iv] = x[0] + b[(j+1)*ldb+iv] = x[2] + vmax = math.Max(vmax, math.Max(math.Abs(b[j*ldb+iv]), math.Abs(b[(j+1)*ldb+iv]))) + vcrit = bignum / vmax + j += 2 + } + } + // Copy the vector x or Q*x to VL and normalize. + switch { + case howmny != lapack.EVAllMulQ: + // No back-transform: copy x to VL and normalize. + bi.Dcopy(n-ki, b[ki*ldb+iv:], ldb, vl[ki*ldvl+is:], ldvl) + ii := bi.Idamax(n-ki, vl[ki*ldvl+is:], ldvl) + ki + remax := 1 / math.Abs(vl[ii*ldvl+is]) + bi.Dscal(n-ki, remax, vl[ki*ldvl+is:], ldvl) + for k := 0; k < ki; k++ { + vl[k*ldvl+is] = 0 + } + case nb == 1: + // Version 1: back-transform each vector with Gemv, Q*x. + if n-ki-1 > 0 { + bi.Dgemv(blas.NoTrans, n, n-ki-1, + 1, vl[ki+1:], ldvl, b[(ki+1)*ldb+iv:], ldb, + b[ki*ldb+iv], vl[ki:], ldvl) + } + ii := bi.Idamax(n, vl[ki:], ldvl) + remax := 1 / math.Abs(vl[ii*ldvl+ki]) + bi.Dscal(n, remax, vl[ki:], ldvl) + default: + // Version 2: back-transform block of vectors with Gemm + // zero out above vector. + for k := 0; k < ki; k++ { + b[k*ldb+iv] = 0 + } + iscomplex[iv] = ip + // Back-transform and normalization is done below. + } + } else { + // Complex left eigenvector. + + // Initial solve: + // [ [ T[ki,ki] T[ki,ki+1] ]ᵀ - (wr - i* wi) ]*X = 0. + // [ [ T[ki+1,ki] T[ki+1,ki+1] ] ] + if math.Abs(t[ki*ldt+ki+1]) >= math.Abs(t[(ki+1)*ldt+ki]) { + b[ki*ldb+iv] = wi / t[ki*ldt+ki+1] + b[(ki+1)*ldb+iv+1] = 1 + } else { + b[ki*ldb+iv] = 1 + b[(ki+1)*ldb+iv+1] = -wi / t[(ki+1)*ldt+ki] + } + b[(ki+1)*ldb+iv] = 0 + b[ki*ldb+iv+1] = 0 + // Form right-hand side. + for k := ki + 2; k < n; k++ { + b[k*ldb+iv] = -b[ki*ldb+iv] * t[ki*ldt+k] + b[k*ldb+iv+1] = -b[(ki+1)*ldb+iv+1] * t[(ki+1)*ldt+k] + } + // Solve transposed quasi-triangular system: + // [ T[ki+2:n,ki+2:n]ᵀ - (wr-i*wi) ]*X = b1+i*b2 + vmax := 1.0 + vcrit := bignum + for j := ki + 2; j < n; { + if j == n-1 || t[(j+1)*ldt+j] == 0 { + // 1×1 diagonal block. + + // Scale if necessary to avoid overflow + // when forming the right-hand side elements. 
+ if norms[j] > vcrit { + rec := 1 / vmax + bi.Dscal(n-ki, rec, b[ki*ldb+iv:], ldb) + bi.Dscal(n-ki, rec, b[ki*ldb+iv+1:], ldb) + vmax = 1 + } + b[j*ldb+iv] -= bi.Ddot(j-ki-2, t[(ki+2)*ldt+j:], ldt, b[(ki+2)*ldb+iv:], ldb) + b[j*ldb+iv+1] -= bi.Ddot(j-ki-2, t[(ki+2)*ldt+j:], ldt, b[(ki+2)*ldb+iv+1:], ldb) + // Solve [ T[j,j]-(wr-i*wi) ]*(X11+i*X12) = b1+i*b2. + scale, _, _ := impl.Dlaln2(false, 1, 2, smin, 1, t[j*ldt+j:], ldt, + 1, 1, b[j*ldb+iv:], ldb, wr, -wi, x[:2], 2) + // Scale if necessary. + if scale != 1 { + bi.Dscal(n-ki, scale, b[ki*ldb+iv:], ldb) + bi.Dscal(n-ki, scale, b[ki*ldb+iv+1:], ldb) + } + b[j*ldb+iv] = x[0] + b[j*ldb+iv+1] = x[1] + vmax = math.Max(vmax, math.Max(math.Abs(b[j*ldb+iv]), math.Abs(b[j*ldb+iv+1]))) + vcrit = bignum / vmax + j++ + } else { + // 2×2 diagonal block. + + // Scale if necessary to avoid overflow + // when forming the right-hand side elements. + if math.Max(norms[j], norms[j+1]) > vcrit { + rec := 1 / vmax + bi.Dscal(n-ki, rec, b[ki*ldb+iv:], ldb) + bi.Dscal(n-ki, rec, b[ki*ldb+iv+1:], ldb) + vmax = 1 + } + b[j*ldb+iv] -= bi.Ddot(j-ki-2, t[(ki+2)*ldt+j:], ldt, b[(ki+2)*ldb+iv:], ldb) + b[j*ldb+iv+1] -= bi.Ddot(j-ki-2, t[(ki+2)*ldt+j:], ldt, b[(ki+2)*ldb+iv+1:], ldb) + b[(j+1)*ldb+iv] -= bi.Ddot(j-ki-2, t[(ki+2)*ldt+j+1:], ldt, b[(ki+2)*ldb+iv:], ldb) + b[(j+1)*ldb+iv+1] -= bi.Ddot(j-ki-2, t[(ki+2)*ldt+j+1:], ldt, b[(ki+2)*ldb+iv+1:], ldb) + // Solve 2×2 complex linear equation + // [ [T[j,j] T[j,j+1] ]ᵀ - (wr-i*wi)*I ]*X = scale*b + // [ [T[j+1,j] T[j+1,j+1]] ] + scale, _, _ := impl.Dlaln2(true, 2, 2, smin, 1, t[j*ldt+j:], ldt, + 1, 1, b[j*ldb+iv:], ldb, wr, -wi, x[:], 2) + // Scale if necessary. + if scale != 1 { + bi.Dscal(n-ki, scale, b[ki*ldb+iv:], ldb) + bi.Dscal(n-ki, scale, b[ki*ldb+iv+1:], ldb) + } + b[j*ldb+iv] = x[0] + b[j*ldb+iv+1] = x[1] + b[(j+1)*ldb+iv] = x[2] + b[(j+1)*ldb+iv+1] = x[3] + vmax01 := math.Max(math.Abs(x[0]), math.Abs(x[1])) + vmax23 := math.Max(math.Abs(x[2]), math.Abs(x[3])) + vmax = math.Max(vmax, math.Max(vmax01, vmax23)) + vcrit = bignum / vmax + j += 2 + } + } + // Copy the vector x or Q*x to VL and normalize. + switch { + case howmny != lapack.EVAllMulQ: + // No back-transform: copy x to VL and normalize. + bi.Dcopy(n-ki, b[ki*ldb+iv:], ldb, vl[ki*ldvl+is:], ldvl) + bi.Dcopy(n-ki, b[ki*ldb+iv+1:], ldb, vl[ki*ldvl+is+1:], ldvl) + emax := 0.0 + for k := ki; k < n; k++ { + emax = math.Max(emax, math.Abs(vl[k*ldvl+is])+math.Abs(vl[k*ldvl+is+1])) + } + remax := 1 / emax + bi.Dscal(n-ki, remax, vl[ki*ldvl+is:], ldvl) + bi.Dscal(n-ki, remax, vl[ki*ldvl+is+1:], ldvl) + for k := 0; k < ki; k++ { + vl[k*ldvl+is] = 0 + vl[k*ldvl+is+1] = 0 + } + case nb == 1: + // Version 1: back-transform each vector with GEMV, Q*x. + if n-ki-2 > 0 { + bi.Dgemv(blas.NoTrans, n, n-ki-2, + 1, vl[ki+2:], ldvl, b[(ki+2)*ldb+iv:], ldb, + b[ki*ldb+iv], vl[ki:], ldvl) + bi.Dgemv(blas.NoTrans, n, n-ki-2, + 1, vl[ki+2:], ldvl, b[(ki+2)*ldb+iv+1:], ldb, + b[(ki+1)*ldb+iv+1], vl[ki+1:], ldvl) + } else { + bi.Dscal(n, b[ki*ldb+iv], vl[ki:], ldvl) + bi.Dscal(n, b[(ki+1)*ldb+iv+1], vl[ki+1:], ldvl) + } + emax := 0.0 + for k := 0; k < n; k++ { + emax = math.Max(emax, math.Abs(vl[k*ldvl+ki])+math.Abs(vl[k*ldvl+ki+1])) + } + remax := 1 / emax + bi.Dscal(n, remax, vl[ki:], ldvl) + bi.Dscal(n, remax, vl[ki+1:], ldvl) + default: + // Version 2: back-transform block of vectors with GEMM. + // Zero out above vector. + // Could go from ki-nv+1 to ki-1. 
+ for k := 0; k < ki; k++ { + b[k*ldb+iv] = 0 + b[k*ldb+iv+1] = 0 + } + iscomplex[iv] = ip + iscomplex[iv+1] = -ip + iv++ + // Back-transform and normalization is done below. + } + } + if nb > 1 { + // Blocked version of back-transform. + // For complex case, ki2 includes both vectors ki and ki+1. + ki2 := ki + if ip != 0 { + ki2++ + } + // Columns [0:iv] of work are valid vectors. When the + // number of vectors stored reaches nb-1 or nb, or if + // this was last vector, do the Gemm. + if iv >= nb-2 || ki2 == n-1 { + bi.Dgemm(blas.NoTrans, blas.NoTrans, n, iv+1, n-ki2+iv, + 1, vl[ki2-iv:], ldvl, b[(ki2-iv)*ldb:], ldb, + 0, b[nb:], ldb) + // Normalize vectors. + var remax float64 + for k := 0; k <= iv; k++ { + if iscomplex[k] == 0 { + // Real eigenvector. + ii := bi.Idamax(n, b[nb+k:], ldb) + remax = 1 / math.Abs(b[ii*ldb+nb+k]) + } else if iscomplex[k] == 1 { + // First eigenvector of conjugate pair. + emax := 0.0 + for ii := 0; ii < n; ii++ { + emax = math.Max(emax, math.Abs(b[ii*ldb+nb+k])+math.Abs(b[ii*ldb+nb+k+1])) + } + remax = 1 / emax + // Second eigenvector of conjugate pair + // will reuse this value of remax. + } + bi.Dscal(n, remax, b[nb+k:], ldb) + } + impl.Dlacpy(blas.All, n, iv+1, b[nb:], ldb, vl[ki2-iv:], ldvl) + iv = 0 + } else { + iv++ + } + } + is++ + if ip != 0 { + is++ + } + } + + return m +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dtrexc.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrexc.go new file mode 100644 index 0000000000..2a0a5e7c6d --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrexc.go @@ -0,0 +1,230 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "gonum.org/v1/gonum/lapack" + +// Dtrexc reorders the real Schur factorization of a n×n real matrix +// +// A = Q*T*Qᵀ +// +// so that the diagonal block of T with row index ifst is moved to row ilst. +// +// On entry, T must be in Schur canonical form, that is, block upper triangular +// with 1×1 and 2×2 diagonal blocks; each 2×2 diagonal block has its diagonal +// elements equal and its off-diagonal elements of opposite sign. +// +// On return, T will be reordered by an orthogonal similarity transformation Z +// as Zᵀ*T*Z, and will be again in Schur canonical form. +// +// If compq is lapack.UpdateSchur, on return the matrix Q of Schur vectors will be +// updated by post-multiplying it with Z. +// If compq is lapack.UpdateSchurNone, the matrix Q is not referenced and will not be +// updated. +// For other values of compq Dtrexc will panic. +// +// ifst and ilst specify the reordering of the diagonal blocks of T. The block +// with row index ifst is moved to row ilst, by a sequence of transpositions +// between adjacent blocks. +// +// If ifst points to the second row of a 2×2 block, ifstOut will point to the +// first row, otherwise it will be equal to ifst. +// +// ilstOut will point to the first row of the block in its final position. If ok +// is true, ilstOut may differ from ilst by +1 or -1. +// +// It must hold that +// +// 0 <= ifst < n, and 0 <= ilst < n, +// +// otherwise Dtrexc will panic. +// +// If ok is false, two adjacent blocks were too close to swap because the +// problem is very ill-conditioned. T may have been partially reordered, and +// ilstOut will point to the first row of the block at the position to which it +// has been moved. +// +// work must have length at least n, otherwise Dtrexc will panic. 
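+//
+// For example (a sketch assuming t and q hold the factors of a prior real
+// Schur decomposition), the block starting at row 0 can be moved to the
+// bottom of T, accumulating the swaps into q:
+//
+// work := make([]float64, n)
+// ifstOut, ilstOut, ok := impl.Dtrexc(lapack.UpdateSchur, n, t, ldt, q, ldq, 0, n-1, work)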
+// +// Dtrexc is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dtrexc(compq lapack.UpdateSchurComp, n int, t []float64, ldt int, q []float64, ldq int, ifst, ilst int, work []float64) (ifstOut, ilstOut int, ok bool) { + switch { + case compq != lapack.UpdateSchur && compq != lapack.UpdateSchurNone: + panic(badUpdateSchurComp) + case n < 0: + panic(nLT0) + case ldt < max(1, n): + panic(badLdT) + case ldq < 1, compq == lapack.UpdateSchur && ldq < n: + panic(badLdQ) + case (ifst < 0 || n <= ifst) && n > 0: + panic(badIfst) + case (ilst < 0 || n <= ilst) && n > 0: + panic(badIlst) + } + + // Quick return if possible. + if n == 0 { + return ifst, ilst, true + } + + switch { + case len(t) < (n-1)*ldt+n: + panic(shortT) + case compq == lapack.UpdateSchur && len(q) < (n-1)*ldq+n: + panic(shortQ) + case len(work) < n: + panic(shortWork) + } + + // Quick return if possible. + if n == 1 { + return ifst, ilst, true + } + + // Determine the first row of specified block + // and find out it is 1×1 or 2×2. + if ifst > 0 && t[ifst*ldt+ifst-1] != 0 { + ifst-- + } + nbf := 1 // Size of the first block. + if ifst+1 < n && t[(ifst+1)*ldt+ifst] != 0 { + nbf = 2 + } + // Determine the first row of the final block + // and find out it is 1×1 or 2×2. + if ilst > 0 && t[ilst*ldt+ilst-1] != 0 { + ilst-- + } + nbl := 1 // Size of the last block. + if ilst+1 < n && t[(ilst+1)*ldt+ilst] != 0 { + nbl = 2 + } + + ok = true + wantq := compq == lapack.UpdateSchur + + switch { + case ifst == ilst: + return ifst, ilst, true + + case ifst < ilst: + // Update ilst. + switch { + case nbf == 2 && nbl == 1: + ilst-- + case nbf == 1 && nbl == 2: + ilst++ + } + here := ifst + for here < ilst { + // Swap block with next one below. + if nbf == 1 || nbf == 2 { + // Current block either 1×1 or 2×2. + nbnext := 1 // Size of the next block. + if here+nbf+1 < n && t[(here+nbf+1)*ldt+here+nbf] != 0 { + nbnext = 2 + } + ok = impl.Dlaexc(wantq, n, t, ldt, q, ldq, here, nbf, nbnext, work) + if !ok { + return ifst, here, false + } + here += nbnext + // Test if 2×2 block breaks into two 1×1 blocks. + if nbf == 2 && t[(here+1)*ldt+here] == 0 { + nbf = 3 + } + continue + } + + // Current block consists of two 1×1 blocks each of + // which must be swapped individually. + nbnext := 1 // Size of the next block. + if here+3 < n && t[(here+3)*ldt+here+2] != 0 { + nbnext = 2 + } + ok = impl.Dlaexc(wantq, n, t, ldt, q, ldq, here+1, 1, nbnext, work) + if !ok { + return ifst, here, false + } + if nbnext == 1 { + // Swap two 1×1 blocks, no problems possible. + impl.Dlaexc(wantq, n, t, ldt, q, ldq, here, 1, nbnext, work) + here++ + continue + } + // Recompute nbnext in case 2×2 split. + if t[(here+2)*ldt+here+1] == 0 { + nbnext = 1 + } + if nbnext == 2 { + // 2×2 block did not split. + ok = impl.Dlaexc(wantq, n, t, ldt, q, ldq, here, 1, nbnext, work) + if !ok { + return ifst, here, false + } + } else { + // 2×2 block did split. + impl.Dlaexc(wantq, n, t, ldt, q, ldq, here, 1, 1, work) + impl.Dlaexc(wantq, n, t, ldt, q, ldq, here+1, 1, 1, work) + } + here += 2 + } + return ifst, here, true + + default: // ifst > ilst + here := ifst + for here > ilst { + // Swap block with next one above. + nbnext := 1 + if here >= 2 && t[(here-1)*ldt+here-2] != 0 { + nbnext = 2 + } + if nbf == 1 || nbf == 2 { + // Current block either 1×1 or 2×2. + ok = impl.Dlaexc(wantq, n, t, ldt, q, ldq, here-nbnext, nbnext, nbf, work) + if !ok { + return ifst, here, false + } + here -= nbnext + // Test if 2×2 block breaks into two 1×1 blocks. 
+ if nbf == 2 && t[(here+1)*ldt+here] == 0 { + nbf = 3 + } + continue + } + + // Current block consists of two 1×1 blocks each of + // which must be swapped individually. + ok = impl.Dlaexc(wantq, n, t, ldt, q, ldq, here-nbnext, nbnext, 1, work) + if !ok { + return ifst, here, false + } + if nbnext == 1 { + // Swap two 1×1 blocks, no problems possible. + impl.Dlaexc(wantq, n, t, ldt, q, ldq, here, nbnext, 1, work) + here-- + continue + } + // Recompute nbnext in case 2×2 split. + if t[here*ldt+here-1] == 0 { + nbnext = 1 + } + if nbnext == 2 { + // 2×2 block did not split. + ok = impl.Dlaexc(wantq, n, t, ldt, q, ldq, here-1, 2, 1, work) + if !ok { + return ifst, here, false + } + } else { + // 2×2 block did split. + impl.Dlaexc(wantq, n, t, ldt, q, ldq, here, 1, 1, work) + impl.Dlaexc(wantq, n, t, ldt, q, ldq, here-1, 1, 1, work) + } + here -= 2 + } + return ifst, here, true + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dtrti2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrti2.go new file mode 100644 index 0000000000..efc24b65ea --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrti2.go @@ -0,0 +1,69 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dtrti2 computes the inverse of a triangular matrix, storing the result in place +// into a. This is the BLAS level 2 version of the algorithm. +// +// Dtrti2 is an internal routine. It is exported for testing purposes. +func (impl Implementation) Dtrti2(uplo blas.Uplo, diag blas.Diag, n int, a []float64, lda int) { + switch { + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case diag != blas.NonUnit && diag != blas.Unit: + panic(badDiag) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + if n == 0 { + return + } + + if len(a) < (n-1)*lda+n { + panic(shortA) + } + + bi := blas64.Implementation() + + nonUnit := diag == blas.NonUnit + // TODO(btracey): Replace this with a row-major ordering. + if uplo == blas.Upper { + for j := 0; j < n; j++ { + var ajj float64 + if nonUnit { + ajj = 1 / a[j*lda+j] + a[j*lda+j] = ajj + ajj *= -1 + } else { + ajj = -1 + } + bi.Dtrmv(blas.Upper, blas.NoTrans, diag, j, a, lda, a[j:], lda) + bi.Dscal(j, ajj, a[j:], lda) + } + return + } + for j := n - 1; j >= 0; j-- { + var ajj float64 + if nonUnit { + ajj = 1 / a[j*lda+j] + a[j*lda+j] = ajj + ajj *= -1 + } else { + ajj = -1 + } + if j < n-1 { + bi.Dtrmv(blas.Lower, blas.NoTrans, diag, n-j-1, a[(j+1)*lda+j+1:], lda, a[(j+1)*lda+j:], lda) + bi.Dscal(n-j-1, ajj, a[(j+1)*lda+j:], lda) + } + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dtrtri.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrtri.go new file mode 100644 index 0000000000..6ec3663c35 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrtri.go @@ -0,0 +1,72 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dtrtri computes the inverse of a triangular matrix, storing the result in place +// into a. This is the BLAS level 3 version of the algorithm which builds upon +// Dtrti2 to operate on matrix blocks instead of only individual columns. 
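+//
+// A hedged usage sketch (illustrative only, not part of the vendored
+// sources; impl is assumed to be an Implementation value): inverting a 3×3
+// non-unit upper triangular matrix stored in row-major order,
+//
+//	a := []float64{
+//		2, 1, 1,
+//		0, 3, 1,
+//		0, 0, 4,
+//	}
+//	ok := impl.Dtrtri(blas.Upper, blas.NonUnit, 3, a, 3)
+//	// On success, a holds the upper triangle of the inverse.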
+// +// Dtrtri will not perform the inversion if the matrix is singular, and returns +// a boolean indicating whether the inversion was successful. +func (impl Implementation) Dtrtri(uplo blas.Uplo, diag blas.Diag, n int, a []float64, lda int) (ok bool) { + switch { + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case diag != blas.NonUnit && diag != blas.Unit: + panic(badDiag) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + if n == 0 { + return true + } + + if len(a) < (n-1)*lda+n { + panic(shortA) + } + + if diag == blas.NonUnit { + for i := 0; i < n; i++ { + if a[i*lda+i] == 0 { + return false + } + } + } + + bi := blas64.Implementation() + + nb := impl.Ilaenv(1, "DTRTRI", "UD", n, -1, -1, -1) + if nb <= 1 || nb > n { + impl.Dtrti2(uplo, diag, n, a, lda) + return true + } + if uplo == blas.Upper { + for j := 0; j < n; j += nb { + jb := min(nb, n-j) + bi.Dtrmm(blas.Left, blas.Upper, blas.NoTrans, diag, j, jb, 1, a, lda, a[j:], lda) + bi.Dtrsm(blas.Right, blas.Upper, blas.NoTrans, diag, j, jb, -1, a[j*lda+j:], lda, a[j:], lda) + impl.Dtrti2(blas.Upper, diag, jb, a[j*lda+j:], lda) + } + return true + } + nn := ((n - 1) / nb) * nb + for j := nn; j >= 0; j -= nb { + jb := min(nb, n-j) + if j+jb <= n-1 { + bi.Dtrmm(blas.Left, blas.Lower, blas.NoTrans, diag, n-j-jb, jb, 1, a[(j+jb)*lda+j+jb:], lda, a[(j+jb)*lda+j:], lda) + bi.Dtrsm(blas.Right, blas.Lower, blas.NoTrans, diag, n-j-jb, jb, -1, a[j*lda+j:], lda, a[(j+jb)*lda+j:], lda) + } + impl.Dtrti2(blas.Lower, diag, jb, a[j*lda+j:], lda) + } + return true +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dtrtrs.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrtrs.go new file mode 100644 index 0000000000..2145fbd5fd --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrtrs.go @@ -0,0 +1,55 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +// Dtrtrs solves a triangular system of the form A * X = B or Aᵀ * X = B. Dtrtrs +// returns whether the solve completed successfully. If A is singular, no solve is performed. +func (impl Implementation) Dtrtrs(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n, nrhs int, a []float64, lda int, b []float64, ldb int) (ok bool) { + switch { + case uplo != blas.Upper && uplo != blas.Lower: + panic(badUplo) + case trans != blas.NoTrans && trans != blas.Trans && trans != blas.ConjTrans: + panic(badTrans) + case diag != blas.NonUnit && diag != blas.Unit: + panic(badDiag) + case n < 0: + panic(nLT0) + case nrhs < 0: + panic(nrhsLT0) + case lda < max(1, n): + panic(badLdA) + case ldb < max(1, nrhs): + panic(badLdB) + } + + if n == 0 { + return true + } + + switch { + case len(a) < (n-1)*lda+n: + panic(shortA) + case len(b) < (n-1)*ldb+nrhs: + panic(shortB) + } + + // Check for singularity. + nounit := diag == blas.NonUnit + if nounit { + for i := 0; i < n; i++ { + if a[i*lda+i] == 0 { + return false + } + } + } + bi := blas64.Implementation() + bi.Dtrsm(blas.Left, uplo, trans, diag, n, nrhs, 1, a, lda, b, ldb) + return true +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/errors.go b/vendor/gonum.org/v1/gonum/lapack/gonum/errors.go new file mode 100644 index 0000000000..711cc2d5ad --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/errors.go @@ -0,0 +1,183 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +// This list is duplicated in netlib/lapack/netlib. Keep in sync. +const ( + // Panic strings for bad enumeration values. + badApplyOrtho = "lapack: bad ApplyOrtho" + badBalanceJob = "lapack: bad BalanceJob" + badDiag = "lapack: bad Diag" + badDirect = "lapack: bad Direct" + badEVComp = "lapack: bad EVComp" + badEVHowMany = "lapack: bad EVHowMany" + badEVJob = "lapack: bad EVJob" + badEVSide = "lapack: bad EVSide" + badGSVDJob = "lapack: bad GSVDJob" + badGenOrtho = "lapack: bad GenOrtho" + badLeftEVJob = "lapack: bad LeftEVJob" + badMatrixType = "lapack: bad MatrixType" + badMaximizeNormXJob = "lapack: bad MaximizeNormXJob" + badNorm = "lapack: bad Norm" + badOrthoComp = "lapack: bad OrthoComp" + badPivot = "lapack: bad Pivot" + badRightEVJob = "lapack: bad RightEVJob" + badSVDJob = "lapack: bad SVDJob" + badSchurComp = "lapack: bad SchurComp" + badSchurJob = "lapack: bad SchurJob" + badSide = "lapack: bad Side" + badSort = "lapack: bad Sort" + badStoreV = "lapack: bad StoreV" + badTrans = "lapack: bad Trans" + badUpdateSchurComp = "lapack: bad UpdateSchurComp" + badUplo = "lapack: bad Uplo" + bothSVDOver = "lapack: both jobU and jobVT are lapack.SVDOverwrite" + + // Panic strings for bad numerical and string values. + badIfst = "lapack: ifst out of range" + badIhi = "lapack: ihi out of range" + badIhiz = "lapack: ihiz out of range" + badIlo = "lapack: ilo out of range" + badIloz = "lapack: iloz out of range" + badIlst = "lapack: ilst out of range" + badIsave = "lapack: bad isave value" + badIspec = "lapack: bad ispec value" + badJ1 = "lapack: j1 out of range" + badJpvt = "lapack: bad element of jpvt" + badK1 = "lapack: k1 out of range" + badK2 = "lapack: k2 out of range" + badKacc22 = "lapack: invalid value of kacc22" + badKbot = "lapack: kbot out of range" + badKtop = "lapack: ktop out of range" + badLWork = "lapack: insufficient declared workspace length" + badMm = "lapack: mm out of range" + badN1 = "lapack: bad value of n1" + badN2 = "lapack: bad value of n2" + badNa = "lapack: bad value of na" + badName = "lapack: bad name" + badNh = "lapack: bad value of nh" + badNw = "lapack: bad value of nw" + badPp = "lapack: bad value of pp" + badShifts = "lapack: bad shifts" + i0LT0 = "lapack: i0 < 0" + kGTM = "lapack: k > m" + kGTN = "lapack: k > n" + kLT0 = "lapack: k < 0" + kLT1 = "lapack: k < 1" + kdLT0 = "lapack: kd < 0" + klLT0 = "lapack: kl < 0" + kuLT0 = "lapack: ku < 0" + mGTN = "lapack: m > n" + mLT0 = "lapack: m < 0" + mmLT0 = "lapack: mm < 0" + n0LT0 = "lapack: n0 < 0" + nGTM = "lapack: n > m" + nLT0 = "lapack: n < 0" + nLT1 = "lapack: n < 1" + nLTM = "lapack: n < m" + nanCFrom = "lapack: cfrom is NaN" + nanCTo = "lapack: cto is NaN" + nbGTM = "lapack: nb > m" + nbGTN = "lapack: nb > n" + nbLT0 = "lapack: nb < 0" + nccLT0 = "lapack: ncc < 0" + ncvtLT0 = "lapack: ncvt < 0" + negANorm = "lapack: anorm < 0" + negZ = "lapack: negative z value" + nhLT0 = "lapack: nh < 0" + notIsolated = "lapack: block is not isolated" + nrhsLT0 = "lapack: nrhs < 0" + nruLT0 = "lapack: nru < 0" + nshftsLT0 = "lapack: nshfts < 0" + nshftsOdd = "lapack: nshfts must be even" + nvLT0 = "lapack: nv < 0" + offsetGTM = "lapack: offset > m" + offsetLT0 = "lapack: offset < 0" + pLT0 = "lapack: p < 0" + recurLT0 = "lapack: recur < 0" + zeroCFrom = "lapack: zero cfrom" + + // Panic strings for bad slice lengths. 
+ badLenAlpha = "lapack: bad length of alpha" + badLenBeta = "lapack: bad length of beta" + badLenIpiv = "lapack: bad length of ipiv" + badLenJpiv = "lapack: bad length of jpiv" + badLenJpvt = "lapack: bad length of jpvt" + badLenK = "lapack: bad length of k" + badLenPiv = "lapack: bad length of piv" + badLenSelected = "lapack: bad length of selected" + badLenSi = "lapack: bad length of si" + badLenSr = "lapack: bad length of sr" + badLenTau = "lapack: bad length of tau" + badLenWi = "lapack: bad length of wi" + badLenWr = "lapack: bad length of wr" + + // Panic strings for insufficient slice lengths. + shortA = "lapack: insufficient length of a" + shortAB = "lapack: insufficient length of ab" + shortAuxv = "lapack: insufficient length of auxv" + shortB = "lapack: insufficient length of b" + shortC = "lapack: insufficient length of c" + shortCNorm = "lapack: insufficient length of cnorm" + shortD = "lapack: insufficient length of d" + shortDL = "lapack: insufficient length of dl" + shortDU = "lapack: insufficient length of du" + shortE = "lapack: insufficient length of e" + shortF = "lapack: insufficient length of f" + shortH = "lapack: insufficient length of h" + shortIWork = "lapack: insufficient length of iwork" + shortIsgn = "lapack: insufficient length of isgn" + shortQ = "lapack: insufficient length of q" + shortRHS = "lapack: insufficient length of rhs" + shortS = "lapack: insufficient length of s" + shortScale = "lapack: insufficient length of scale" + shortT = "lapack: insufficient length of t" + shortTau = "lapack: insufficient length of tau" + shortTauP = "lapack: insufficient length of tauP" + shortTauQ = "lapack: insufficient length of tauQ" + shortU = "lapack: insufficient length of u" + shortV = "lapack: insufficient length of v" + shortVL = "lapack: insufficient length of vl" + shortVR = "lapack: insufficient length of vr" + shortVT = "lapack: insufficient length of vt" + shortVn1 = "lapack: insufficient length of vn1" + shortVn2 = "lapack: insufficient length of vn2" + shortW = "lapack: insufficient length of w" + shortWH = "lapack: insufficient length of wh" + shortWV = "lapack: insufficient length of wv" + shortWi = "lapack: insufficient length of wi" + shortWork = "lapack: insufficient length of work" + shortWr = "lapack: insufficient length of wr" + shortX = "lapack: insufficient length of x" + shortY = "lapack: insufficient length of y" + shortZ = "lapack: insufficient length of z" + + // Panic strings for bad leading dimensions of matrices. + badLdA = "lapack: bad leading dimension of A" + badLdB = "lapack: bad leading dimension of B" + badLdC = "lapack: bad leading dimension of C" + badLdF = "lapack: bad leading dimension of F" + badLdH = "lapack: bad leading dimension of H" + badLdQ = "lapack: bad leading dimension of Q" + badLdT = "lapack: bad leading dimension of T" + badLdU = "lapack: bad leading dimension of U" + badLdV = "lapack: bad leading dimension of V" + badLdVL = "lapack: bad leading dimension of VL" + badLdVR = "lapack: bad leading dimension of VR" + badLdVT = "lapack: bad leading dimension of VT" + badLdW = "lapack: bad leading dimension of W" + badLdWH = "lapack: bad leading dimension of WH" + badLdWV = "lapack: bad leading dimension of WV" + badLdWork = "lapack: bad leading dimension of Work" + badLdX = "lapack: bad leading dimension of X" + badLdY = "lapack: bad leading dimension of Y" + badLdZ = "lapack: bad leading dimension of Z" + + // Panic strings for bad vector increments. 
+ absIncNotOne = "lapack: increment not one or negative one" + badIncX = "lapack: incX <= 0" + badIncY = "lapack: incY <= 0" + zeroIncV = "lapack: incv == 0" +) diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/iladlc.go b/vendor/gonum.org/v1/gonum/lapack/gonum/iladlc.go new file mode 100644 index 0000000000..b251d72691 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/iladlc.go @@ -0,0 +1,45 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +// Iladlc scans a matrix for its last non-zero column. Returns -1 if the matrix +// is all zeros. +// +// Iladlc is an internal routine. It is exported for testing purposes. +func (Implementation) Iladlc(m, n int, a []float64, lda int) int { + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + if n == 0 || m == 0 { + return -1 + } + + if len(a) < (m-1)*lda+n { + panic(shortA) + } + + // Test common case where corner is non-zero. + if a[n-1] != 0 || a[(m-1)*lda+(n-1)] != 0 { + return n - 1 + } + + // Scan each row tracking the highest column seen. + highest := -1 + for i := 0; i < m; i++ { + for j := n - 1; j >= 0; j-- { + if a[i*lda+j] != 0 { + highest = max(highest, j) + break + } + } + } + return highest +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/iladlr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/iladlr.go new file mode 100644 index 0000000000..b73fe18ea2 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/iladlr.go @@ -0,0 +1,41 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +// Iladlr scans a matrix for its last non-zero row. Returns -1 if the matrix +// is all zeros. +// +// Iladlr is an internal routine. It is exported for testing purposes. +func (Implementation) Iladlr(m, n int, a []float64, lda int) int { + switch { + case m < 0: + panic(mLT0) + case n < 0: + panic(nLT0) + case lda < max(1, n): + panic(badLdA) + } + + if n == 0 || m == 0 { + return -1 + } + + if len(a) < (m-1)*lda+n { + panic(shortA) + } + + // Check the common case where the corner is non-zero + if a[(m-1)*lda] != 0 || a[(m-1)*lda+n-1] != 0 { + return m - 1 + } + for i := m - 1; i >= 0; i-- { + for j := 0; j < n; j++ { + if a[i*lda+j] != 0 { + return i + } + } + } + return -1 +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/ilaenv.go b/vendor/gonum.org/v1/gonum/lapack/gonum/ilaenv.go new file mode 100644 index 0000000000..fc70806c45 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/ilaenv.go @@ -0,0 +1,395 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +// Ilaenv returns algorithm tuning parameters for the algorithm given by the +// input string. ispec specifies the parameter to return: +// +// 1: The optimal block size for a blocked algorithm. +// 2: The minimum block size for a blocked algorithm. +// 3: The block size of unprocessed data at which a blocked algorithm should +// crossover to an unblocked version. +// 4: The number of shifts. +// 5: The minimum column dimension for blocking to be used. +// 6: The crossover point for SVD (to use QR factorization or not). +// 7: The number of processors. 
+// 8: The crossover point for multi-shift in QR and QZ methods for non-symmetric eigenvalue problems. +// 9: Maximum size of the subproblems in divide-and-conquer algorithms. +// 10: ieee infinity and NaN arithmetic can be trusted not to trap. +// 11: ieee infinity arithmetic can be trusted not to trap. +// 12...16: parameters for Dhseqr and related functions. See Iparmq for more +// information. +// +// Ilaenv is an internal routine. It is exported for testing purposes. +func (impl Implementation) Ilaenv(ispec int, name string, opts string, n1, n2, n3, n4 int) int { + // TODO(btracey): Replace this with a constant lookup? A list of constants? + sname := name[0] == 'S' || name[0] == 'D' + cname := name[0] == 'C' || name[0] == 'Z' + if !sname && !cname { + panic(badName) + } + c2 := name[1:3] + c3 := name[3:6] + c4 := c3[1:3] + + switch ispec { + default: + panic(badIspec) + case 1: + switch c2 { + default: + panic(badName) + case "GE": + switch c3 { + default: + panic(badName) + case "TRF": + if sname { + return 64 + } + return 64 + case "QRF", "RQF", "LQF", "QLF": + if sname { + return 32 + } + return 32 + case "HRD": + if sname { + return 32 + } + return 32 + case "BRD": + if sname { + return 32 + } + return 32 + case "TRI": + if sname { + return 64 + } + return 64 + } + case "PO": + switch c3 { + default: + panic(badName) + case "TRF": + if sname { + return 64 + } + return 64 + } + case "SY": + switch c3 { + default: + panic(badName) + case "TRF": + if sname { + return 64 + } + return 64 + case "TRD": + return 32 + case "GST": + return 64 + } + case "HE": + switch c3 { + default: + panic(badName) + case "TRF": + return 64 + case "TRD": + return 32 + case "GST": + return 64 + } + case "OR": + switch c3[0] { + default: + panic(badName) + case 'G': + switch c3[1:] { + default: + panic(badName) + case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR": + return 32 + } + case 'M': + switch c3[1:] { + default: + panic(badName) + case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR": + return 32 + } + } + case "UN": + switch c3[0] { + default: + panic(badName) + case 'G': + switch c3[1:] { + default: + panic(badName) + case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR": + return 32 + } + case 'M': + switch c3[1:] { + default: + panic(badName) + case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR": + return 32 + } + } + case "GB": + switch c3 { + default: + panic(badName) + case "TRF": + if sname { + if n4 <= 64 { + return 1 + } + return 32 + } + if n4 <= 64 { + return 1 + } + return 32 + } + case "PB": + switch c3 { + default: + panic(badName) + case "TRF": + if sname { + if n2 <= 64 { + return 1 + } + return 32 + } + if n2 <= 64 { + return 1 + } + return 32 + } + case "PT": + switch c3 { + default: + panic(badName) + case "TRS": + return 1 + } + case "TR": + switch c3 { + default: + panic(badName) + case "TRI": + if sname { + return 64 + } + return 64 + case "EVC": + if sname { + return 64 + } + return 64 + } + case "LA": + switch c3 { + default: + panic(badName) + case "UUM": + if sname { + return 64 + } + return 64 + } + case "ST": + if sname && c3 == "EBZ" { + return 1 + } + panic(badName) + } + case 2: + switch c2 { + default: + panic(badName) + case "GE": + switch c3 { + default: + panic(badName) + case "QRF", "RQF", "LQF", "QLF": + if sname { + return 2 + } + return 2 + case "HRD": + if sname { + return 2 + } + return 2 + case "BRD": + if sname { + return 2 + } + return 2 + case "TRI": + if sname { + return 2 + } + return 2 + } + case "SY": + switch c3 { + default: + panic(badName) + case "TRF": + if sname { + 
return 8 + } + return 8 + case "TRD": + if sname { + return 2 + } + panic(badName) + } + case "HE": + if c3 == "TRD" { + return 2 + } + panic(badName) + case "OR": + if !sname { + panic(badName) + } + switch c3[0] { + default: + panic(badName) + case 'G': + switch c4 { + default: + panic(badName) + case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR": + return 2 + } + case 'M': + switch c4 { + default: + panic(badName) + case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR": + return 2 + } + } + case "UN": + switch c3[0] { + default: + panic(badName) + case 'G': + switch c4 { + default: + panic(badName) + case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR": + return 2 + } + case 'M': + switch c4 { + default: + panic(badName) + case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR": + return 2 + } + } + } + case 3: + switch c2 { + default: + panic(badName) + case "GE": + switch c3 { + default: + panic(badName) + case "QRF", "RQF", "LQF", "QLF": + if sname { + return 128 + } + return 128 + case "HRD": + if sname { + return 128 + } + return 128 + case "BRD": + if sname { + return 128 + } + return 128 + } + case "SY": + if sname && c3 == "TRD" { + return 32 + } + panic(badName) + case "HE": + if c3 == "TRD" { + return 32 + } + panic(badName) + case "OR": + switch c3[0] { + default: + panic(badName) + case 'G': + switch c4 { + default: + panic(badName) + case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR": + return 128 + } + } + case "UN": + switch c3[0] { + default: + panic(badName) + case 'G': + switch c4 { + default: + panic(badName) + case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR": + return 128 + } + } + } + case 4: + // Used by xHSEQR + return 6 + case 5: + // Not used + return 2 + case 6: + // Used by xGELSS and xGESVD + return int(float64(min(n1, n2)) * 1.6) + case 7: + // Not used + return 1 + case 8: + // Used by xHSEQR + return 50 + case 9: + // used by xGELSD and xGESDD + return 25 + case 10: + // Go guarantees ieee + return 1 + case 11: + // Go guarantees ieee + return 1 + case 12, 13, 14, 15, 16: + // Dhseqr and related functions for eigenvalue problems. + return impl.Iparmq(ispec, name, opts, n1, n2, n3, n4) + } +} diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/iparmq.go b/vendor/gonum.org/v1/gonum/lapack/gonum/iparmq.go new file mode 100644 index 0000000000..65d105245e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/gonum/iparmq.go @@ -0,0 +1,117 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import "math" + +// Iparmq returns problem and machine dependent parameters useful for Dhseqr and +// related subroutines for eigenvalue problems. +// +// ispec specifies the parameter to return: +// +// 12: Crossover point between Dlahqr and Dlaqr0. Will be at least 11. +// 13: Deflation window size. +// 14: Nibble crossover point. Determines when to skip a multi-shift QR sweep. +// 15: Number of simultaneous shifts in a multishift QR iteration. +// 16: Select structured matrix multiply. +// +// For other values of ispec Iparmq will panic. +// +// name is the name of the calling function. name must be in uppercase but this +// is not checked. +// +// opts is not used and exists for future use. +// +// n is the order of the Hessenberg matrix H. +// +// ilo and ihi specify the block [ilo:ihi+1,ilo:ihi+1] that is being processed. +// +// lwork is the amount of workspace available. +// +// Except for ispec input parameters are not checked. +// +// Iparmq is an internal routine. 
It is exported for testing purposes.
+func (Implementation) Iparmq(ispec int, name, opts string, n, ilo, ihi, lwork int) int {
+	nh := ihi - ilo + 1
+	ns := 2
+	// Set ns, the number of simultaneous shifts. The thresholds must be
+	// checked from the largest down so that the largest matching range
+	// wins, mirroring the sequential IF statements in LAPACK's IPARMQ.
+	switch {
+	case nh >= 6000:
+		ns = 256
+	case nh >= 3000:
+		ns = 128
+	case nh >= 590:
+		ns = 64
+	case nh >= 150:
+		ns = max(10, nh/int(math.Log(float64(nh))/math.Ln2))
+	case nh >= 60:
+		ns = 10
+	case nh >= 30:
+		ns = 4
+	}
+	ns = max(2, ns-(ns%2))
+
+	switch ispec {
+	default:
+		panic(badIspec)
+
+	case 12:
+		// Matrices of order smaller than nmin get sent to Dlahqr, the
+		// classic double shift algorithm. This must be at least 11.
+		const nmin = 75
+		return nmin
+
+	case 13:
+		const knwswp = 500
+		if nh <= knwswp {
+			return ns
+		}
+		return 3 * ns / 2
+
+	case 14:
+		// Skip a computationally expensive multi-shift QR sweep with
+		// Dlaqr5 whenever aggressive early deflation finds at least
+		// nibble*(window size)/100 deflations. The default, small,
+		// value reflects the expectation that the cost of looking
+		// through the deflation window with Dlaqr3 will be
+		// substantially smaller.
+		const nibble = 14
+		return nibble
+
+	case 15:
+		return ns
+
+	case 16:
+		if len(name) != 6 {
+			panic(badName)
+		}
+		const (
+			k22min = 14
+			kacmin = 14
+		)
+		var acc22 int
+		switch {
+		case name[1:] == "GGHRD" || name[1:] == "GGHD3":
+			acc22 = 1
+			if nh >= k22min {
+				acc22 = 2
+			}
+		case name[3:] == "EXC":
+			if nh >= kacmin {
+				acc22 = 1
+			}
+			if nh >= k22min {
+				acc22 = 2
+			}
+		case name[1:] == "HSEQR" || name[1:5] == "LAQR":
+			if ns >= kacmin {
+				acc22 = 1
+			}
+			if ns >= k22min {
+				acc22 = 2
+			}
+		}
+		return acc22
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/lapack.go b/vendor/gonum.org/v1/gonum/lapack/gonum/lapack.go
new file mode 100644
index 0000000000..5daefc584d
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/lapack.go
@@ -0,0 +1,64 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/lapack"
+
+// Implementation is the native Go implementation of LAPACK routines. It
+// is built on top of calls to the return of blas64.Implementation(), so while
+// this code is in pure Go, the underlying BLAS implementation may not be.
+type Implementation struct{}
+
+var _ lapack.Float64 = Implementation{}
+
+func abs(a int) int {
+	if a < 0 {
+		return -a
+	}
+	return a
+}
+
+const (
+	// dlamchE is the machine epsilon. For IEEE this is 2^{-53}.
+	dlamchE = 0x1p-53
+
+	// dlamchB is the radix of the machine (the base of the number system).
+	dlamchB = 2
+
+	// dlamchP is base * eps.
+	dlamchP = dlamchB * dlamchE
+
+	// dlamchS is the "safe minimum", that is, the lowest number such that
+	// 1/dlamchS does not overflow, or also the smallest normal number.
+	// For IEEE this is 2^{-1022}.
+	dlamchS = 0x1p-1022
+
+	// Blue's scaling constants
+	//
+	// An n-vector x is well-scaled if
+	//	dtsml ≤ |xᵢ| ≤ dtbig for 0 ≤ i < n and n ≤ 1/dlamchP,
+	// where
+	//	dtsml = 2^ceil((expmin-1)/2) = 2^ceil((-1021-1)/2) = 2^{-511} = 1.4916681462400413e-154
+	//	dtbig = 2^floor((expmax-digits+1)/2) = 2^floor((1024-53+1)/2) = 2^{486} = 1.997919072202235e+146
+	// If any xᵢ is not well-scaled, then multiplying small values by dssml and
+	// large values by dsbig avoids underflow or overflow when computing the sum
+	// of squares \sum_0^{n-1} (xᵢ)².
+ // dssml = 2^{-floor((expmin-digits)/2)} = 2^{-floor((-1021-53)/2)} = 2^537 = 4.4989137945431964e+161 + // dsbig = 2^{-ceil((expmax+digits-1)/2)} = 2^{-ceil((1024+53-1)/2)} = 2^{-538} = 1.1113793747425387e-162 + // + // References: + // - Anderson E. (2017) + // Algorithm 978: Safe Scaling in the Level 1 BLAS + // ACM Trans Math Softw 44:1--28 + // https://doi.org/10.1145/3061665 + // - Blue, James L. (1978) + // A Portable Fortran Program to Find the Euclidean Norm of a Vector + // ACM Trans Math Softw 4:15--23 + // https://doi.org/10.1145/355769.355771 + dtsml = 0x1p-511 + dtbig = 0x1p486 + dssml = 0x1p537 + dsbig = 0x1p-538 +) diff --git a/vendor/gonum.org/v1/gonum/lapack/lapack.go b/vendor/gonum.org/v1/gonum/lapack/lapack.go new file mode 100644 index 0000000000..60ef1c244a --- /dev/null +++ b/vendor/gonum.org/v1/gonum/lapack/lapack.go @@ -0,0 +1,240 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package lapack + +import "gonum.org/v1/gonum/blas" + +// Complex128 defines the public complex128 LAPACK API supported by gonum/lapack. +type Complex128 interface{} + +// Float64 defines the public float64 LAPACK API supported by gonum/lapack. +type Float64 interface { + Dgecon(norm MatrixNorm, n int, a []float64, lda int, anorm float64, work []float64, iwork []int) float64 + Dgeev(jobvl LeftEVJob, jobvr RightEVJob, n int, a []float64, lda int, wr, wi []float64, vl []float64, ldvl int, vr []float64, ldvr int, work []float64, lwork int) (first int) + Dgels(trans blas.Transpose, m, n, nrhs int, a []float64, lda int, b []float64, ldb int, work []float64, lwork int) bool + Dgelqf(m, n int, a []float64, lda int, tau, work []float64, lwork int) + Dgeqp3(m, n int, a []float64, lda int, jpvt []int, tau, work []float64, lwork int) + Dgeqrf(m, n int, a []float64, lda int, tau, work []float64, lwork int) + Dgesvd(jobU, jobVT SVDJob, m, n int, a []float64, lda int, s, u []float64, ldu int, vt []float64, ldvt int, work []float64, lwork int) (ok bool) + Dgetrf(m, n int, a []float64, lda int, ipiv []int) (ok bool) + Dgetri(n int, a []float64, lda int, ipiv []int, work []float64, lwork int) (ok bool) + Dgetrs(trans blas.Transpose, n, nrhs int, a []float64, lda int, ipiv []int, b []float64, ldb int) + Dggsvd3(jobU, jobV, jobQ GSVDJob, m, n, p int, a []float64, lda int, b []float64, ldb int, alpha, beta, u []float64, ldu int, v []float64, ldv int, q []float64, ldq int, work []float64, lwork int, iwork []int) (k, l int, ok bool) + Dlantr(norm MatrixNorm, uplo blas.Uplo, diag blas.Diag, m, n int, a []float64, lda int, work []float64) float64 + Dlange(norm MatrixNorm, m, n int, a []float64, lda int, work []float64) float64 + Dlansy(norm MatrixNorm, uplo blas.Uplo, n int, a []float64, lda int, work []float64) float64 + Dlapmr(forward bool, m, n int, x []float64, ldx int, k []int) + Dlapmt(forward bool, m, n int, x []float64, ldx int, k []int) + Dorgqr(m, n, k int, a []float64, lda int, tau, work []float64, lwork int) + Dormqr(side blas.Side, trans blas.Transpose, m, n, k int, a []float64, lda int, tau, c []float64, ldc int, work []float64, lwork int) + Dorglq(m, n, k int, a []float64, lda int, tau, work []float64, lwork int) + Dormlq(side blas.Side, trans blas.Transpose, m, n, k int, a []float64, lda int, tau, c []float64, ldc int, work []float64, lwork int) + Dpbcon(uplo blas.Uplo, n, kd int, ab []float64, ldab int, anorm float64, work []float64, iwork []int) float64 + Dpbtrf(uplo 
blas.Uplo, n, kd int, ab []float64, ldab int) (ok bool) + Dpbtrs(uplo blas.Uplo, n, kd, nrhs int, ab []float64, ldab int, b []float64, ldb int) + Dpocon(uplo blas.Uplo, n int, a []float64, lda int, anorm float64, work []float64, iwork []int) float64 + Dpotrf(ul blas.Uplo, n int, a []float64, lda int) (ok bool) + Dpotri(ul blas.Uplo, n int, a []float64, lda int) (ok bool) + Dpotrs(ul blas.Uplo, n, nrhs int, a []float64, lda int, b []float64, ldb int) + Dpstrf(uplo blas.Uplo, n int, a []float64, lda int, piv []int, tol float64, work []float64) (rank int, ok bool) + Dsyev(jobz EVJob, uplo blas.Uplo, n int, a []float64, lda int, w, work []float64, lwork int) (ok bool) + Dtbtrs(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n, kd, nrhs int, a []float64, lda int, b []float64, ldb int) (ok bool) + Dtrcon(norm MatrixNorm, uplo blas.Uplo, diag blas.Diag, n int, a []float64, lda int, work []float64, iwork []int) float64 + Dtrtri(uplo blas.Uplo, diag blas.Diag, n int, a []float64, lda int) (ok bool) + Dtrtrs(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n, nrhs int, a []float64, lda int, b []float64, ldb int) (ok bool) +} + +// Direct specifies the direction of the multiplication for the Householder matrix. +type Direct byte + +const ( + Forward Direct = 'F' // Reflectors are right-multiplied, H_0 * H_1 * ... * H_{k-1}. + Backward Direct = 'B' // Reflectors are left-multiplied, H_{k-1} * ... * H_1 * H_0. +) + +// Sort is the sorting order. +type Sort byte + +const ( + SortIncreasing Sort = 'I' + SortDecreasing Sort = 'D' +) + +// StoreV indicates the storage direction of elementary reflectors. +type StoreV byte + +const ( + ColumnWise StoreV = 'C' // Reflector stored in a column of the matrix. + RowWise StoreV = 'R' // Reflector stored in a row of the matrix. +) + +// MatrixNorm represents the kind of matrix norm to compute. +type MatrixNorm byte + +const ( + MaxAbs MatrixNorm = 'M' // max(abs(A(i,j))) + MaxColumnSum MatrixNorm = 'O' // Maximum absolute column sum (one norm) + MaxRowSum MatrixNorm = 'I' // Maximum absolute row sum (infinity norm) + Frobenius MatrixNorm = 'F' // Frobenius norm (sqrt of sum of squares) +) + +// MatrixType represents the kind of matrix represented in the data. +type MatrixType byte + +const ( + General MatrixType = 'G' // A general dense matrix. + UpperTri MatrixType = 'U' // An upper triangular matrix. + LowerTri MatrixType = 'L' // A lower triangular matrix. +) + +// Pivot specifies the pivot type for plane rotations. +type Pivot byte + +const ( + Variable Pivot = 'V' + Top Pivot = 'T' + Bottom Pivot = 'B' +) + +// ApplyOrtho specifies which orthogonal matrix is applied in Dormbr. +type ApplyOrtho byte + +const ( + ApplyP ApplyOrtho = 'P' // Apply P or Pᵀ. + ApplyQ ApplyOrtho = 'Q' // Apply Q or Qᵀ. +) + +// GenOrtho specifies which orthogonal matrix is generated in Dorgbr. +type GenOrtho byte + +const ( + GeneratePT GenOrtho = 'P' // Generate Pᵀ. + GenerateQ GenOrtho = 'Q' // Generate Q. +) + +// SVDJob specifies the singular vector computation type for SVD. +type SVDJob byte + +const ( + SVDAll SVDJob = 'A' // Compute all columns of the orthogonal matrix U or V. + SVDStore SVDJob = 'S' // Compute the singular vectors and store them in the orthogonal matrix U or V. + SVDOverwrite SVDJob = 'O' // Compute the singular vectors and overwrite them on the input matrix A. + SVDNone SVDJob = 'N' // Do not compute singular vectors. +) + +// GSVDJob specifies the singular vector computation type for Generalized SVD. 
+type GSVDJob byte + +const ( + GSVDU GSVDJob = 'U' // Compute orthogonal matrix U. + GSVDV GSVDJob = 'V' // Compute orthogonal matrix V. + GSVDQ GSVDJob = 'Q' // Compute orthogonal matrix Q. + GSVDUnit GSVDJob = 'I' // Use unit-initialized matrix. + GSVDNone GSVDJob = 'N' // Do not compute orthogonal matrix. +) + +// EVComp specifies how eigenvectors are computed in Dsteqr. +type EVComp byte + +const ( + EVOrig EVComp = 'V' // Compute eigenvectors of the original symmetric matrix. + EVTridiag EVComp = 'I' // Compute eigenvectors of the tridiagonal matrix. + EVCompNone EVComp = 'N' // Do not compute eigenvectors. +) + +// EVJob specifies whether eigenvectors are computed in Dsyev. +type EVJob byte + +const ( + EVCompute EVJob = 'V' // Compute eigenvectors. + EVNone EVJob = 'N' // Do not compute eigenvectors. +) + +// LeftEVJob specifies whether left eigenvectors are computed in Dgeev. +type LeftEVJob byte + +const ( + LeftEVCompute LeftEVJob = 'V' // Compute left eigenvectors. + LeftEVNone LeftEVJob = 'N' // Do not compute left eigenvectors. +) + +// RightEVJob specifies whether right eigenvectors are computed in Dgeev. +type RightEVJob byte + +const ( + RightEVCompute RightEVJob = 'V' // Compute right eigenvectors. + RightEVNone RightEVJob = 'N' // Do not compute right eigenvectors. +) + +// BalanceJob specifies matrix balancing operation. +type BalanceJob byte + +const ( + Permute BalanceJob = 'P' + Scale BalanceJob = 'S' + PermuteScale BalanceJob = 'B' + BalanceNone BalanceJob = 'N' +) + +// SchurJob specifies whether the Schur form is computed in Dhseqr. +type SchurJob byte + +const ( + EigenvaluesOnly SchurJob = 'E' + EigenvaluesAndSchur SchurJob = 'S' +) + +// SchurComp specifies whether and how the Schur vectors are computed in Dhseqr. +type SchurComp byte + +const ( + SchurOrig SchurComp = 'V' // Compute Schur vectors of the original matrix. + SchurHess SchurComp = 'I' // Compute Schur vectors of the upper Hessenberg matrix. + SchurNone SchurComp = 'N' // Do not compute Schur vectors. +) + +// UpdateSchurComp specifies whether the matrix of Schur vectors is updated in Dtrexc. +type UpdateSchurComp byte + +const ( + UpdateSchur UpdateSchurComp = 'V' // Update the matrix of Schur vectors. + UpdateSchurNone UpdateSchurComp = 'N' // Do not update the matrix of Schur vectors. +) + +// EVSide specifies what eigenvectors are computed in Dtrevc3. +type EVSide byte + +const ( + EVRight EVSide = 'R' // Compute only right eigenvectors. + EVLeft EVSide = 'L' // Compute only left eigenvectors. + EVBoth EVSide = 'B' // Compute both right and left eigenvectors. +) + +// EVHowMany specifies which eigenvectors are computed in Dtrevc3 and how. +type EVHowMany byte + +const ( + EVAll EVHowMany = 'A' // Compute all right and/or left eigenvectors. + EVAllMulQ EVHowMany = 'B' // Compute all right and/or left eigenvectors multiplied by an input matrix. + EVSelected EVHowMany = 'S' // Compute selected right and/or left eigenvectors. +) + +// MaximizeNormXJob specifies the heuristic method for computing a contribution to +// the reciprocal Dif-estimate in Dlatdf. +type MaximizeNormXJob byte + +const ( + LocalLookAhead MaximizeNormXJob = 0 // Solve Z*x=h-f where h is a vector of ±1. + NormalizedNullVector MaximizeNormXJob = 2 // Compute an approximate null-vector e of Z, normalize e and solve Z*x=±e-f. +) + +// OrthoComp specifies whether and how the orthogonal matrix is computed in Dgghrd. +type OrthoComp byte + +const ( + OrthoNone OrthoComp = 'N' // Do not compute the orthogonal matrix. 
+	OrthoExplicit OrthoComp = 'I' // The orthogonal matrix is formed explicitly and returned in the argument.
+	OrthoPostmul  OrthoComp = 'V' // The orthogonal matrix is post-multiplied into the matrix stored in the argument on entry.
+)
diff --git a/vendor/gonum.org/v1/gonum/lapack/lapack64/doc.go b/vendor/gonum.org/v1/gonum/lapack/lapack64/doc.go
new file mode 100644
index 0000000000..da19e3ec78
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/lapack64/doc.go
@@ -0,0 +1,20 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package lapack64 provides a set of convenient wrapper functions for LAPACK
+// calls, as specified in the netlib standard (www.netlib.org).
+//
+// The native Go routines are used by default, and the Use function can be used
+// to set an alternative implementation.
+//
+// If the type of matrix (General, Symmetric, etc.) is known and fixed, it is
+// used in the wrapper signature. In many cases, however, the type of the matrix
+// changes during the call to the routine, for example the matrix is symmetric on
+// entry and is triangular on exit. In these cases the correct types should be checked
+// in the documentation.
+//
+// The full set of LAPACK functions is very large, and it is not clear that a
+// full implementation is desirable, let alone feasible. Please open up an issue
+// if there is a specific function you need and/or are willing to implement.
package lapack64 // import "gonum.org/v1/gonum/lapack/lapack64"
diff --git a/vendor/gonum.org/v1/gonum/lapack/lapack64/lapack64.go b/vendor/gonum.org/v1/gonum/lapack/lapack64/lapack64.go
new file mode 100644
index 0000000000..1b4c1734a1
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/lapack64/lapack64.go
@@ -0,0 +1,908 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package lapack64
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+	"gonum.org/v1/gonum/lapack/gonum"
+)
+
+var lapack64 lapack.Float64 = gonum.Implementation{}
+
+// Use sets the LAPACK float64 implementation to be used by subsequent LAPACK calls.
+// The default implementation is gonum.Implementation.
+func Use(l lapack.Float64) {
+	lapack64 = l
+}
+
+// Tridiagonal represents a tridiagonal matrix using its three diagonals.
+type Tridiagonal struct {
+	N  int
+	DL []float64
+	D  []float64
+	DU []float64
+}
+
+// Potrf computes the Cholesky factorization of a.
+// The factorization has the form
+//
+//	A = Uᵀ * U  if a.Uplo == blas.Upper, or
+//	A = L * Lᵀ  if a.Uplo == blas.Lower,
+//
+// where U is an upper triangular matrix and L is lower triangular.
+// The triangular matrix is returned in t, and the underlying data between
+// a and t is shared. The returned bool indicates whether a is positive
+// definite and the factorization could be finished.
+func Potrf(a blas64.Symmetric) (t blas64.Triangular, ok bool) {
+	ok = lapack64.Dpotrf(a.Uplo, a.N, a.Data, max(1, a.Stride))
+	t.Uplo = a.Uplo
+	t.N = a.N
+	t.Data = a.Data
+	t.Stride = a.Stride
+	t.Diag = blas.NonUnit
+	return
+}
+
+// Potri computes the inverse of a real symmetric positive definite matrix A
+// using its Cholesky factorization.
+//
+// On entry, t contains the triangular factor U or L from the Cholesky
+// factorization A = Uᵀ*U or A = L*Lᵀ, as computed by Potrf.
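+//
+// A hedged sketch of the Potrf/Potri flow (illustrative only, not part of
+// the vendored sources; a is assumed to be a valid blas64.Symmetric):
+//
+//	t, ok := Potrf(a)
+//	if ok {
+//		inv, posdef := Potri(t) // inv shares Data with a and t.
+//		_, _ = inv, posdef
+//	}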
+// +// On return, the upper or lower triangle of the (symmetric) inverse of A is +// stored in t, overwriting the input factor U or L, and also returned in a. The +// underlying data between a and t is shared. +// +// The returned bool indicates whether the inverse was computed successfully. +func Potri(t blas64.Triangular) (a blas64.Symmetric, ok bool) { + ok = lapack64.Dpotri(t.Uplo, t.N, t.Data, max(1, t.Stride)) + a.Uplo = t.Uplo + a.N = t.N + a.Data = t.Data + a.Stride = t.Stride + return +} + +// Potrs solves a system of n linear equations A*X = B where A is an n×n +// symmetric positive definite matrix and B is an n×nrhs matrix, using the +// Cholesky factorization A = Uᵀ*U or A = L*Lᵀ. t contains the corresponding +// triangular factor as returned by Potrf. On entry, B contains the right-hand +// side matrix B, on return it contains the solution matrix X. +func Potrs(t blas64.Triangular, b blas64.General) { + lapack64.Dpotrs(t.Uplo, t.N, b.Cols, t.Data, max(1, t.Stride), b.Data, max(1, b.Stride)) +} + +// Pbcon returns an estimate of the reciprocal of the condition number (in the +// 1-norm) of an n×n symmetric positive definite band matrix using the Cholesky +// factorization +// +// A = Uᵀ*U if uplo == blas.Upper +// A = L*Lᵀ if uplo == blas.Lower +// +// computed by Pbtrf. The estimate is obtained for norm(inv(A)), and the +// reciprocal of the condition number is computed as +// +// rcond = 1 / (anorm * norm(inv(A))). +// +// The length of work must be at least 3*n and the length of iwork must be at +// least n. +func Pbcon(a blas64.SymmetricBand, anorm float64, work []float64, iwork []int) float64 { + return lapack64.Dpbcon(a.Uplo, a.N, a.K, a.Data, a.Stride, anorm, work, iwork) +} + +// Pbtrf computes the Cholesky factorization of an n×n symmetric positive +// definite band matrix +// +// A = Uᵀ * U if a.Uplo == blas.Upper +// A = L * Lᵀ if a.Uplo == blas.Lower +// +// where U and L are upper, respectively lower, triangular band matrices. +// +// The triangular matrix U or L is returned in t, and the underlying data +// between a and t is shared. The returned bool indicates whether A is positive +// definite and the factorization could be finished. +func Pbtrf(a blas64.SymmetricBand) (t blas64.TriangularBand, ok bool) { + ok = lapack64.Dpbtrf(a.Uplo, a.N, a.K, a.Data, max(1, a.Stride)) + t.Uplo = a.Uplo + t.Diag = blas.NonUnit + t.N = a.N + t.K = a.K + t.Data = a.Data + t.Stride = a.Stride + return t, ok +} + +// Pbtrs solves a system of linear equations A*X = B with an n×n symmetric +// positive definite band matrix A using the Cholesky factorization +// +// A = Uᵀ * U if t.Uplo == blas.Upper +// A = L * Lᵀ if t.Uplo == blas.Lower +// +// t contains the corresponding triangular factor as returned by Pbtrf. +// +// On entry, b contains the right hand side matrix B. On return, it is +// overwritten with the solution matrix X. +func Pbtrs(t blas64.TriangularBand, b blas64.General) { + lapack64.Dpbtrs(t.Uplo, t.N, t.K, b.Cols, t.Data, max(1, t.Stride), b.Data, max(1, b.Stride)) +} + +// Pstrf computes the Cholesky factorization with complete pivoting of an n×n +// symmetric positive semidefinite matrix A. +// +// The factorization has the form +// +// Pᵀ * A * P = Uᵀ * U , if a.Uplo = blas.Upper, +// Pᵀ * A * P = L * Lᵀ, if a.Uplo = blas.Lower, +// +// where U is an upper triangular matrix, L is lower triangular, and P is a +// permutation matrix. +// +// tol is a user-defined tolerance. The algorithm terminates if the pivot is +// less than or equal to tol. 
If tol is negative, then n*eps*max(A[k,k]) will be
+// used instead.
+//
+// The triangular factor U or L from the Cholesky factorization is returned in t
+// and the underlying data between a and t is shared. P is stored on return in
+// vector piv such that P[piv[k],k] = 1.
+//
+// Pstrf returns the computed rank of A and whether the factorization can be
+// used to solve a system. Pstrf does not attempt to check that A is positive
+// semi-definite, so if ok is false, the matrix A is either rank deficient or is
+// not positive semidefinite.
+//
+// The length of piv must be n and the length of work must be at least 2*n,
+// otherwise Pstrf will panic.
+func Pstrf(a blas64.Symmetric, piv []int, tol float64, work []float64) (t blas64.Triangular, rank int, ok bool) {
+	rank, ok = lapack64.Dpstrf(a.Uplo, a.N, a.Data, max(1, a.Stride), piv, tol, work)
+	t.Uplo = a.Uplo
+	t.Diag = blas.NonUnit
+	t.N = a.N
+	t.Data = a.Data
+	t.Stride = a.Stride
+	return t, rank, ok
+}
+
+// Gecon estimates the reciprocal of the condition number of the n×n matrix A
+// given the LU decomposition of the matrix. The condition number computed may
+// be based on the 1-norm or the ∞-norm.
+//
+// a contains the result of the LU decomposition of A as computed by Getrf.
+//
+// anorm is the corresponding 1-norm or ∞-norm of the original matrix A.
+//
+// work is a temporary data slice of length at least 4*n and Gecon will panic otherwise.
+//
+// iwork is a temporary data slice of length at least n and Gecon will panic otherwise.
+func Gecon(norm lapack.MatrixNorm, a blas64.General, anorm float64, work []float64, iwork []int) float64 {
+	return lapack64.Dgecon(norm, a.Cols, a.Data, max(1, a.Stride), anorm, work, iwork)
+}
+
+// Gels finds a minimum-norm solution based on the matrices A and B using the
+// QR or LQ factorization. Gels returns false if the matrix
+// A is singular, and true if this solution was successfully found.
+//
+// The minimization problem solved depends on the input parameters.
+//
+//  1. If m >= n and trans == blas.NoTrans, Gels finds X such that || A*X - B||_2
+//     is minimized.
+//  2. If m < n and trans == blas.NoTrans, Gels finds the minimum norm solution of
+//     A * X = B.
+//  3. If m >= n and trans == blas.Trans, Gels finds the minimum norm solution of
+//     Aᵀ * X = B.
+//  4. If m < n and trans == blas.Trans, Gels finds X such that || A*X - B||_2
+//     is minimized.
+//
+// Note that the least-squares solutions (cases 1 and 4) perform the minimization
+// per column of B. This is not the same as finding the minimum-norm matrix.
+//
+// The matrix A is a general matrix of size m×n and is modified during this call.
+// The input matrix B is of size max(m,n)×nrhs, and serves two purposes. On entry,
+// the elements of b specify the input matrix B. B has size m×nrhs if
+// trans == blas.NoTrans, and n×nrhs if trans == blas.Trans. On exit, the
+// leading submatrix of b contains the solution vectors X. If trans == blas.NoTrans,
+// this submatrix is of size n×nrhs, and of size m×nrhs otherwise.
+//
+// Work is temporary storage, and lwork specifies the usable memory length.
+// At minimum, lwork >= max(m,n) + max(m,n,nrhs), and this function will panic
+// otherwise. A longer work will enable blocked algorithms to be called.
+// In the special case that lwork == -1, work[0] will be set to the optimal working
+// length.
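+//
+// A hedged sketch of the lwork query described above (illustrative only,
+// not part of the vendored sources; a and b are assumed to be appropriately
+// sized blas64.General matrices):
+//
+//	var query [1]float64
+//	Gels(blas.NoTrans, a, b, query[:], -1) // workspace size query
+//	work := make([]float64, int(query[0]))
+//	ok := Gels(blas.NoTrans, a, b, work, len(work))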
+func Gels(trans blas.Transpose, a blas64.General, b blas64.General, work []float64, lwork int) bool { + return lapack64.Dgels(trans, a.Rows, a.Cols, b.Cols, a.Data, max(1, a.Stride), b.Data, max(1, b.Stride), work, lwork) +} + +// Geqp3 computes a QR factorization with column pivoting of the m×n matrix A: +// +// A*P = Q*R +// +// where P is a permutation matrix, Q is an orthogonal matrix and R is a +// min(m,n)×n upper trapezoidal matrix. +// +// On return, the upper triangle of A contains the matrix R. The elements below +// the diagonal together with tau represent the matrix Q as a product of +// elementary reflectors +// +// Q = H_0 * H_1 * ... * H_{k-1}, where k = min(m,n). +// +// Each H_i has the form +// +// H_i = I - tau * v * vᵀ +// +// where tau is a scalar and v is a vector with v[0:i] = 0 and v[i] = 1; +// v[i+1:m] is stored on exit in A[i+1:m,i], and tau in tau[i]. +// +// jpvt specifies a column pivot to be applied to A. On entry, if jpvt[j] is at +// least zero, the jth column of A is permuted to the front of A*P (a leading +// column), if jpvt[j] is -1 the jth column of A is a free column. If jpvt[j] < +// -1, Geqp3 will panic. On return, jpvt holds the permutation that was applied; +// the jth column of A*P was the jpvt[j] column of A. jpvt must have length n or +// Geqp3 will panic. +// +// tau holds the scalar factors of the elementary reflectors. It must have +// length min(m,n), otherwise Geqp3 will panic. +// +// work must have length at least max(1,lwork), and lwork must be at least +// 3*n+1, otherwise Geqp3 will panic. For optimal performance lwork must be at +// least 2*n+(n+1)*nb, where nb is the optimal blocksize. On return, work[0] +// will contain the optimal value of lwork. +// +// If lwork == -1, instead of performing Geqp3, only the optimal value of lwork +// will be stored in work[0]. +func Geqp3(a blas64.General, jpvt []int, tau, work []float64, lwork int) { + lapack64.Dgeqp3(a.Rows, a.Cols, a.Data, max(1, a.Stride), jpvt, tau, work, lwork) +} + +// Geqrf computes the QR factorization of the m×n matrix A using a blocked +// algorithm. A is modified to contain the information to construct Q and R. +// The upper triangle of a contains the matrix R. The lower triangular elements +// (not including the diagonal) contain the elementary reflectors. tau is modified +// to contain the reflector scales. tau must have length min(m,n), and +// this function will panic otherwise. +// +// The ith elementary reflector can be explicitly constructed by first extracting +// the +// +// v[j] = 0 j < i +// v[j] = 1 j == i +// v[j] = a[j*lda+i] j > i +// +// and computing H_i = I - tau[i] * v * vᵀ. +// +// The orthonormal matrix Q can be constructed from a product of these elementary +// reflectors, Q = H_0 * H_1 * ... * H_{k-1}, where k = min(m,n). +// +// Work is temporary storage, and lwork specifies the usable memory length. +// At minimum, lwork >= m and this function will panic otherwise. +// Geqrf is a blocked QR factorization, but the block size is limited +// by the temporary space available. If lwork == -1, instead of performing Geqrf, +// the optimal work length will be stored into work[0]. +func Geqrf(a blas64.General, tau, work []float64, lwork int) { + lapack64.Dgeqrf(a.Rows, a.Cols, a.Data, max(1, a.Stride), tau, work, lwork) +} + +// Gelqf computes the LQ factorization of the m×n matrix A using a blocked +// algorithm. A is modified to contain the information to construct L and Q. The +// lower triangle of a contains the matrix L. 
The elements above the diagonal +// and the slice tau represent the matrix Q. tau is modified to contain the +// reflector scales. tau must have length at least min(m,n), and this function +// will panic otherwise. +// +// See Geqrf for a description of the elementary reflectors and orthonormal +// matrix Q. Q is constructed as a product of these elementary reflectors, +// Q = H_{k-1} * ... * H_1 * H_0. +// +// Work is temporary storage, and lwork specifies the usable memory length. +// At minimum, lwork >= m and this function will panic otherwise. +// Gelqf is a blocked LQ factorization, but the block size is limited +// by the temporary space available. If lwork == -1, instead of performing Gelqf, +// the optimal work length will be stored into work[0]. +func Gelqf(a blas64.General, tau, work []float64, lwork int) { + lapack64.Dgelqf(a.Rows, a.Cols, a.Data, max(1, a.Stride), tau, work, lwork) +} + +// Gesvd computes the singular value decomposition of the input matrix A. +// +// The singular value decomposition is +// +// A = U * Sigma * Vᵀ +// +// where Sigma is an m×n diagonal matrix containing the singular values of A, +// U is an m×m orthogonal matrix and V is an n×n orthogonal matrix. The first +// min(m,n) columns of U and V are the left and right singular vectors of A +// respectively. +// +// jobU and jobVT are options for computing the singular vectors. The behavior +// is as follows +// +// jobU == lapack.SVDAll All m columns of U are returned in u +// jobU == lapack.SVDStore The first min(m,n) columns are returned in u +// jobU == lapack.SVDOverwrite The first min(m,n) columns of U are written into a +// jobU == lapack.SVDNone The columns of U are not computed. +// +// The behavior is the same for jobVT and the rows of Vᵀ. At most one of jobU +// and jobVT can equal lapack.SVDOverwrite, and Gesvd will panic otherwise. +// +// On entry, a contains the data for the m×n matrix A. During the call to Gesvd +// the data is overwritten. On exit, A contains the appropriate singular vectors +// if either job is lapack.SVDOverwrite. +// +// s is a slice of length at least min(m,n) and on exit contains the singular +// values in decreasing order. +// +// u contains the left singular vectors on exit, stored columnwise. If +// jobU == lapack.SVDAll, u is of size m×m. If jobU == lapack.SVDStore u is +// of size m×min(m,n). If jobU == lapack.SVDOverwrite or lapack.SVDNone, u is +// not used. +// +// vt contains the left singular vectors on exit, stored rowwise. If +// jobV == lapack.SVDAll, vt is of size n×m. If jobVT == lapack.SVDStore vt is +// of size min(m,n)×n. If jobVT == lapack.SVDOverwrite or lapack.SVDNone, vt is +// not used. +// +// work is a slice for storing temporary memory, and lwork is the usable size of +// the slice. lwork must be at least max(5*min(m,n), 3*min(m,n)+max(m,n)). +// If lwork == -1, instead of performing Gesvd, the optimal work length will be +// stored into work[0]. Gesvd will panic if the working memory has insufficient +// storage. +// +// Gesvd returns whether the decomposition successfully completed. +func Gesvd(jobU, jobVT lapack.SVDJob, a, u, vt blas64.General, s, work []float64, lwork int) (ok bool) { + return lapack64.Dgesvd(jobU, jobVT, a.Rows, a.Cols, a.Data, max(1, a.Stride), s, u.Data, max(1, u.Stride), vt.Data, max(1, vt.Stride), work, lwork) +} + +// Getrf computes the LU decomposition of an m×n matrix A using partial +// pivoting with row interchanges. 
+// +// The LU decomposition is a factorization of A into +// +// A = P * L * U +// +// where P is a permutation matrix, L is a lower triangular with unit diagonal +// elements (lower trapezoidal if m > n), and U is upper triangular (upper +// trapezoidal if m < n). +// +// On entry, a contains the matrix A. On return, L and U are stored in place +// into a, and P is represented by ipiv. +// +// ipiv contains a sequence of row swaps. It indicates that row i of the matrix +// was interchanged with ipiv[i]. ipiv must have length min(m,n), and Getrf will +// panic otherwise. ipiv is zero-indexed. +// +// Getrf returns whether the matrix A is nonsingular. The LU decomposition will +// be computed regardless of the singularity of A, but the result should not be +// used to solve a system of equation. +func Getrf(a blas64.General, ipiv []int) bool { + return lapack64.Dgetrf(a.Rows, a.Cols, a.Data, max(1, a.Stride), ipiv) +} + +// Getri computes the inverse of the matrix A using the LU factorization computed +// by Getrf. On entry, a contains the PLU decomposition of A as computed by +// Getrf and on exit contains the reciprocal of the original matrix. +// +// Getri will not perform the inversion if the matrix is singular, and returns +// a boolean indicating whether the inversion was successful. +// +// Work is temporary storage, and lwork specifies the usable memory length. +// At minimum, lwork >= n and this function will panic otherwise. +// Getri is a blocked inversion, but the block size is limited +// by the temporary space available. If lwork == -1, instead of performing Getri, +// the optimal work length will be stored into work[0]. +func Getri(a blas64.General, ipiv []int, work []float64, lwork int) (ok bool) { + return lapack64.Dgetri(a.Cols, a.Data, max(1, a.Stride), ipiv, work, lwork) +} + +// Getrs solves a system of equations using an LU factorization. +// The system of equations solved is +// +// A * X = B if trans == blas.Trans +// Aᵀ * X = B if trans == blas.NoTrans +// +// A is a general n×n matrix with stride lda. B is a general matrix of size n×nrhs. +// +// On entry b contains the elements of the matrix B. On exit, b contains the +// elements of X, the solution to the system of equations. +// +// a and ipiv contain the LU factorization of A and the permutation indices as +// computed by Getrf. ipiv is zero-indexed. +func Getrs(trans blas.Transpose, a blas64.General, b blas64.General, ipiv []int) { + lapack64.Dgetrs(trans, a.Cols, b.Cols, a.Data, max(1, a.Stride), ipiv, b.Data, max(1, b.Stride)) +} + +// Ggsvd3 computes the generalized singular value decomposition (GSVD) +// of an m×n matrix A and p×n matrix B: +// +// Uᵀ*A*Q = D1*[ 0 R ] +// +// Vᵀ*B*Q = D2*[ 0 R ] +// +// where U, V and Q are orthogonal matrices. +// +// Ggsvd3 returns k and l, the dimensions of the sub-blocks. k+l +// is the effective numerical rank of the (m+p)×n matrix [ Aᵀ Bᵀ ]ᵀ. +// R is a (k+l)×(k+l) nonsingular upper triangular matrix, D1 and +// D2 are m×(k+l) and p×(k+l) diagonal matrices and of the following +// structures, respectively: +// +// If m-k-l >= 0, +// +// k l +// D1 = k [ I 0 ] +// l [ 0 C ] +// m-k-l [ 0 0 ] +// +// k l +// D2 = l [ 0 S ] +// p-l [ 0 0 ] +// +// n-k-l k l +// [ 0 R ] = k [ 0 R11 R12 ] k +// l [ 0 0 R22 ] l +// +// where +// +// C = diag( alpha_k, ... , alpha_{k+l} ), +// S = diag( beta_k, ... , beta_{k+l} ), +// C^2 + S^2 = I. +// +// R is stored in +// +// A[0:k+l, n-k-l:n] +// +// on exit. 
+//
+// If m-k-l < 0,
+//
+// k m-k k+l-m
+// D1 = k [ I 0 0 ]
+// m-k [ 0 C 0 ]
+//
+// k m-k k+l-m
+// D2 = m-k [ 0 S 0 ]
+// k+l-m [ 0 0 I ]
+// p-l [ 0 0 0 ]
+//
+// n-k-l k m-k k+l-m
+// [ 0 R ] = k [ 0 R11 R12 R13 ]
+// m-k [ 0 0 R22 R23 ]
+// k+l-m [ 0 0 0 R33 ]
+//
+// where
+//
+// C = diag( alpha_k, ... , alpha_m ),
+// S = diag( beta_k, ... , beta_m ),
+// C^2 + S^2 = I.
+//
+// R = [ R11 R12 R13 ] is stored in A[0:m, n-k-l:n]
+// [ 0 R22 R23 ]
+//
+// and R33 is stored in
+//
+// B[m-k:l, n+m-k-l:n] on exit.
+//
+// Ggsvd3 computes C, S, R, and optionally the orthogonal transformation
+// matrices U, V and Q.
+//
+// jobU, jobV and jobQ are options for computing the orthogonal matrices. The behavior
+// is as follows
+//
+// jobU == lapack.GSVDU Compute orthogonal matrix U
+// jobU == lapack.GSVDNone Do not compute orthogonal matrix.
+//
+// The behavior is the same for jobV and jobQ with the exception that instead of
+// lapack.GSVDU these accept lapack.GSVDV and lapack.GSVDQ respectively.
+// The matrices U, V and Q must be m×m, p×p and n×n respectively unless the
+// relevant job parameter is lapack.GSVDNone.
+//
+// alpha and beta must have length n or Ggsvd3 will panic. On exit, alpha and
+// beta contain the generalized singular value pairs of A and B
+//
+// alpha[0:k] = 1,
+// beta[0:k] = 0,
+//
+// if m-k-l >= 0,
+//
+// alpha[k:k+l] = diag(C),
+// beta[k:k+l] = diag(S),
+//
+// if m-k-l < 0,
+//
+// alpha[k:m] = C, alpha[m:k+l] = 0
+// beta[k:m] = S, beta[m:k+l] = 1.
+//
+// if k+l < n,
+//
+// alpha[k+l:n] = 0 and
+// beta[k+l:n] = 0.
+//
+// On exit, iwork contains the permutation required to sort alpha descending.
+//
+// iwork must have length n, work must have length at least max(1, lwork), and
+// lwork must be -1 or greater than n, otherwise Ggsvd3 will panic. If
+// lwork is -1, work[0] holds the optimal lwork on return, but Ggsvd3 does
+// not perform the GSVD.
+func Ggsvd3(jobU, jobV, jobQ lapack.GSVDJob, a, b blas64.General, alpha, beta []float64, u, v, q blas64.General, work []float64, lwork int, iwork []int) (k, l int, ok bool) {
+ return lapack64.Dggsvd3(jobU, jobV, jobQ, a.Rows, a.Cols, b.Rows, a.Data, max(1, a.Stride), b.Data, max(1, b.Stride), alpha, beta, u.Data, max(1, u.Stride), v.Data, max(1, v.Stride), q.Data, max(1, q.Stride), work, lwork, iwork)
+}
+
+// Gtsv solves one of the equations
+//
+// A * X = B if trans == blas.NoTrans
+// Aᵀ * X = B if trans == blas.Trans or blas.ConjTrans
+//
+// where A is an n×n tridiagonal matrix. It uses Gaussian elimination with
+// partial pivoting.
+//
+// On entry, a contains the matrix A, on return it will be overwritten.
+//
+// On entry, b contains the n×nrhs right-hand side matrix B. On return, it will
+// be overwritten. If ok is true, it will be overwritten by the solution matrix X.
+//
+// Gtsv returns whether the solution X has been successfully computed.
+//
+// Dgtsv is not part of the lapack.Float64 interface and so calls to Gtsv are
+// always executed by the Gonum implementation.
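+//
+// An editorial sketch of typical usage (not part of the vendored gonum
+// source; the literal values are assumed for illustration only): solving a
+// 3×3 tridiagonal system A * X = B in place.
+//
+// a := Tridiagonal{N: 3, DL: []float64{1, 1}, D: []float64{4, 4, 4}, DU: []float64{2, 2}}
+// b := blas64.General{Rows: 3, Cols: 1, Stride: 1, Data: []float64{1, 2, 3}}
+// ok := Gtsv(blas.NoTrans, a, b) // on success b.Data holds the solution X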
+func Gtsv(trans blas.Transpose, a Tridiagonal, b blas64.General) (ok bool) { + if trans != blas.NoTrans { + a.DL, a.DU = a.DU, a.DL + } + return gonum.Implementation{}.Dgtsv(a.N, b.Cols, a.DL, a.D, a.DU, b.Data, max(1, b.Stride)) +} + +// Lagtm performs one of the matrix-matrix operations +// +// C = alpha * A * B + beta * C if trans == blas.NoTrans +// C = alpha * Aᵀ * B + beta * C if trans == blas.Trans or blas.ConjTrans +// +// where A is an m×m tridiagonal matrix represented by its diagonals dl, d, du, +// B and C are m×n dense matrices, and alpha and beta are scalars. +// +// Dlagtm is not part of the lapack.Float64 interface and so calls to Lagtm are +// always executed by the Gonum implementation. +func Lagtm(trans blas.Transpose, alpha float64, a Tridiagonal, b blas64.General, beta float64, c blas64.General) { + gonum.Implementation{}.Dlagtm(trans, c.Rows, c.Cols, alpha, a.DL, a.D, a.DU, b.Data, max(1, b.Stride), beta, c.Data, max(1, c.Stride)) +} + +// Lange computes the matrix norm of the general m×n matrix A. The input norm +// specifies the norm computed. +// +// lapack.MaxAbs: the maximum absolute value of an element. +// lapack.MaxColumnSum: the maximum column sum of the absolute values of the entries. +// lapack.MaxRowSum: the maximum row sum of the absolute values of the entries. +// lapack.Frobenius: the square root of the sum of the squares of the entries. +// +// If norm == lapack.MaxColumnSum, work must be of length n, and this function will panic otherwise. +// There are no restrictions on work for the other matrix norms. +func Lange(norm lapack.MatrixNorm, a blas64.General, work []float64) float64 { + return lapack64.Dlange(norm, a.Rows, a.Cols, a.Data, max(1, a.Stride), work) +} + +// Langb returns the given norm of a general m×n band matrix with kl sub-diagonals and +// ku super-diagonals. +// +// Dlangb is not part of the lapack.Float64 interface and so calls to Langb are always +// executed by the Gonum implementation. +func Langb(norm lapack.MatrixNorm, a blas64.Band) float64 { + return gonum.Implementation{}.Dlangb(norm, a.Rows, a.Cols, a.KL, a.KU, a.Data, max(1, a.Stride)) +} + +// Langt computes the specified norm of an n×n tridiagonal matrix. +// +// Dlangt is not part of the lapack.Float64 interface and so calls to Langt are +// always executed by the Gonum implementation. +func Langt(norm lapack.MatrixNorm, a Tridiagonal) float64 { + return gonum.Implementation{}.Dlangt(norm, a.N, a.DL, a.D, a.DU) +} + +// Lansb computes the specified norm of an n×n symmetric band matrix. If +// norm == lapack.MaxColumnSum or norm == lapack.MaxRowSum, work must have length +// at least n and this function will panic otherwise. +// There are no restrictions on work for the other matrix norms. +// +// Dlansb is not part of the lapack.Float64 interface and so calls to Lansb are always +// executed by the Gonum implementation. +func Lansb(norm lapack.MatrixNorm, a blas64.SymmetricBand, work []float64) float64 { + return gonum.Implementation{}.Dlansb(norm, a.Uplo, a.N, a.K, a.Data, max(1, a.Stride), work) +} + +// Lansy computes the specified norm of an n×n symmetric matrix. If +// norm == lapack.MaxColumnSum or norm == lapack.MaxRowSum, work must have length +// at least n and this function will panic otherwise. +// There are no restrictions on work for the other matrix norms. 
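+//
+// An editorial sketch (not part of the vendored gonum source; values assumed
+// for illustration): the max-row-sum norm of the symmetric matrix
+// [[2 1], [1 3]], stored in its upper triangle.
+//
+// a := blas64.Symmetric{Uplo: blas.Upper, N: 2, Stride: 2, Data: []float64{2, 1, 0, 3}}
+// work := make([]float64, a.N)
+// rowSum := Lansy(lapack.MaxRowSum, a, work) // rowSum == 4 (second row: |1|+|3|)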
+func Lansy(norm lapack.MatrixNorm, a blas64.Symmetric, work []float64) float64 {
+ return lapack64.Dlansy(norm, a.Uplo, a.N, a.Data, max(1, a.Stride), work)
+}
+
+// Lantr computes the specified norm of an m×n trapezoidal matrix A. If
+// norm == lapack.MaxColumnSum work must have length at least n and this function
+// will panic otherwise. There are no restrictions on work for the other matrix norms.
+func Lantr(norm lapack.MatrixNorm, a blas64.Triangular, work []float64) float64 {
+ return lapack64.Dlantr(norm, a.Uplo, a.Diag, a.N, a.N, a.Data, max(1, a.Stride), work)
+}
+
+// Lantb computes the specified norm of an n×n triangular band matrix A. If
+// norm == lapack.MaxColumnSum work must have length at least n and this function
+// will panic otherwise. There are no restrictions on work for the other matrix
+// norms.
+func Lantb(norm lapack.MatrixNorm, a blas64.TriangularBand, work []float64) float64 {
+ return gonum.Implementation{}.Dlantb(norm, a.Uplo, a.Diag, a.N, a.K, a.Data, max(1, a.Stride), work)
+}
+
+// Lapmr rearranges the rows of the m×n matrix X as specified by the permutation
+// k[0],k[1],...,k[m-1] of the integers 0,...,m-1.
+//
+// If forward is true, a forward permutation is applied:
+//
+// X[k[i],0:n] is moved to X[i,0:n] for i=0,1,...,m-1.
+//
+// If forward is false, a backward permutation is applied:
+//
+// X[i,0:n] is moved to X[k[i],0:n] for i=0,1,...,m-1.
+//
+// k must have length m, otherwise Lapmr will panic. k is zero-indexed.
+func Lapmr(forward bool, x blas64.General, k []int) {
+ lapack64.Dlapmr(forward, x.Rows, x.Cols, x.Data, max(1, x.Stride), k)
+}
+
+// Lapmt rearranges the columns of the m×n matrix X as specified by the
+// permutation k[0],k[1],...,k[n-1] of the integers 0,...,n-1.
+//
+// If forward is true, a forward permutation is applied:
+//
+// X[0:m,k[j]] is moved to X[0:m,j] for j=0,1,...,n-1.
+//
+// If forward is false, a backward permutation is applied:
+//
+// X[0:m,j] is moved to X[0:m,k[j]] for j=0,1,...,n-1.
+//
+// k must have length n, otherwise Lapmt will panic. k is zero-indexed.
+func Lapmt(forward bool, x blas64.General, k []int) {
+ lapack64.Dlapmt(forward, x.Rows, x.Cols, x.Data, max(1, x.Stride), k)
+}
+
+// Orglq generates an m×n matrix Q with orthonormal rows defined as the first m
+// rows of a product of k elementary reflectors of order n
+//
+// Q = H_{k-1} * ... * H_0
+//
+// as returned by Dgelqf.
+//
+// k is determined by the length of tau.
+//
+// On entry, tau and the first k rows of A must contain the scalar factors and
+// the vectors, respectively, which define the elementary reflectors H_i,
+// i=0,...,k-1, as returned by Dgelqf. On return, A contains the matrix Q.
+//
+// work must have length at least lwork and lwork must be at least max(1,m). On
+// return, optimal value of lwork will be stored in work[0]. It must also hold
+// that 0 <= k <= m <= n, otherwise Orglq will panic.
+//
+// If lwork == -1, instead of performing Orglq, the function only calculates the
+// optimal value of lwork and stores it into work[0].
+func Orglq(a blas64.General, tau, work []float64, lwork int) {
+ lapack64.Dorglq(a.Rows, a.Cols, len(tau), a.Data, a.Stride, tau, work, lwork)
+}
+
+// Ormlq multiplies the matrix C by the orthogonal matrix Q defined by
+// A and tau. A and tau are as returned from Gelqf.
+//
+// C = Q * C if side == blas.Left and trans == blas.NoTrans
+// C = Qᵀ * C if side == blas.Left and trans == blas.Trans
+// C = C * Q if side == blas.Right and trans == blas.NoTrans
+// C = C * Qᵀ if side == blas.Right and trans == blas.Trans
+//
+// If side == blas.Left, A is a matrix of size k×m, and if side == blas.Right
+// A is of size k×n. This uses a blocked algorithm.
+//
+// Work is temporary storage, and lwork specifies the usable memory length.
+// At minimum, lwork >= n if side == blas.Left and lwork >= m if side == blas.Right,
+// and this function will panic otherwise.
+// Ormlq uses a block algorithm, but the block size is limited
+// by the temporary space available. If lwork == -1, instead of performing Ormlq,
+// the optimal work length will be stored into work[0].
+//
+// Tau contains the Householder scales and must have length at least k, and
+// this function will panic otherwise.
+func Ormlq(side blas.Side, trans blas.Transpose, a blas64.General, tau []float64, c blas64.General, work []float64, lwork int) {
+ lapack64.Dormlq(side, trans, c.Rows, c.Cols, a.Rows, a.Data, max(1, a.Stride), tau, c.Data, max(1, c.Stride), work, lwork)
+}
+
+// Orgqr generates an m×n matrix Q with orthonormal columns defined by the
+// product of elementary reflectors
+//
+// Q = H_0 * H_1 * ... * H_{k-1}
+//
+// as computed by Geqrf.
+//
+// k is determined by the length of tau.
+//
+// The length of work must be at least n and it also must be that 0 <= k <= n
+// and 0 <= n <= m.
+//
+// work is temporary storage, and lwork specifies the usable memory length. At
+// minimum, lwork >= n, and the amount of blocking is limited by the usable
+// length. If lwork == -1, instead of computing Orgqr the optimal work length
+// is stored into work[0].
+//
+// Orgqr will panic if the conditions on input values are not met.
+func Orgqr(a blas64.General, tau []float64, work []float64, lwork int) {
+ lapack64.Dorgqr(a.Rows, a.Cols, len(tau), a.Data, a.Stride, tau, work, lwork)
+}
+
+// Ormqr multiplies an m×n matrix C by an orthogonal matrix Q as
+//
+// C = Q * C if side == blas.Left and trans == blas.NoTrans,
+// C = Qᵀ * C if side == blas.Left and trans == blas.Trans,
+// C = C * Q if side == blas.Right and trans == blas.NoTrans,
+// C = C * Qᵀ if side == blas.Right and trans == blas.Trans,
+//
+// where Q is defined as the product of k elementary reflectors
+//
+// Q = H_0 * H_1 * ... * H_{k-1}.
+//
+// k is determined by the length of tau.
+//
+// If side == blas.Left, A is an m×k matrix and 0 <= k <= m.
+// If side == blas.Right, A is an n×k matrix and 0 <= k <= n.
+// The ith column of A contains the vector which defines the elementary
+// reflector H_i and tau[i] contains its scalar factor. Geqrf returns A and tau
+// in the required form.
+//
+// work must have length at least max(1,lwork), and lwork must be at least n if
+// side == blas.Left and at least m if side == blas.Right, otherwise Ormqr will
+// panic.
+//
+// work is temporary storage, and lwork specifies the usable memory length. At
+// minimum, lwork >= n if side == blas.Left and lwork >= m if side ==
+// blas.Right, and this function will panic otherwise. Larger values of lwork
+// will generally give better performance. On return, work[0] will contain the
+// optimal value of lwork.
+//
+// If lwork is -1, instead of performing Ormqr, the optimal workspace size will
+// be stored into work[0].
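+//
+// An editorial sketch (not part of the vendored gonum source): applying Qᵀ
+// from a QR factorization to a right-hand side, the usual first step of a
+// least-squares solve. a and b are assumed to be m×n (m >= n) and m×nrhs
+// blas64.General values; a production caller would also query Ormqr's own
+// optimal lwork with lwork == -1 rather than reusing the Geqrf workspace.
+//
+// tau := make([]float64, a.Cols)
+// work := make([]float64, 1)
+// Geqrf(a, tau, work, -1) // workspace query
+// work = make([]float64, int(work[0]))
+// Geqrf(a, tau, work, len(work)) // A = Q * R
+// Ormqr(blas.Left, blas.Trans, a, tau, b, work, len(work)) // B <- Qᵀ * B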
+func Ormqr(side blas.Side, trans blas.Transpose, a blas64.General, tau []float64, c blas64.General, work []float64, lwork int) { + lapack64.Dormqr(side, trans, c.Rows, c.Cols, len(tau), a.Data, max(1, a.Stride), tau, c.Data, max(1, c.Stride), work, lwork) +} + +// Pocon estimates the reciprocal of the condition number of a positive-definite +// matrix A given the Cholesky decomposition of A. The condition number computed +// is based on the 1-norm and the ∞-norm. +// +// anorm is the 1-norm and the ∞-norm of the original matrix A. +// +// work is a temporary data slice of length at least 3*n and Pocon will panic otherwise. +// +// iwork is a temporary data slice of length at least n and Pocon will panic otherwise. +func Pocon(a blas64.Symmetric, anorm float64, work []float64, iwork []int) float64 { + return lapack64.Dpocon(a.Uplo, a.N, a.Data, max(1, a.Stride), anorm, work, iwork) +} + +// Syev computes all eigenvalues and, optionally, the eigenvectors of a real +// symmetric matrix A. +// +// w contains the eigenvalues in ascending order upon return. w must have length +// at least n, and Syev will panic otherwise. +// +// On entry, a contains the elements of the symmetric matrix A in the triangular +// portion specified by uplo. If jobz == lapack.EVCompute, a contains the +// orthonormal eigenvectors of A on exit, otherwise jobz must be lapack.EVNone +// and on exit the specified triangular region is overwritten. +// +// Work is temporary storage, and lwork specifies the usable memory length. At minimum, +// lwork >= 3*n-1, and Syev will panic otherwise. The amount of blocking is +// limited by the usable length. If lwork == -1, instead of computing Syev the +// optimal work length is stored into work[0]. +func Syev(jobz lapack.EVJob, a blas64.Symmetric, w, work []float64, lwork int) (ok bool) { + return lapack64.Dsyev(jobz, a.Uplo, a.N, a.Data, max(1, a.Stride), w, work, lwork) +} + +// Tbtrs solves a triangular system of the form +// +// A * X = B if trans == blas.NoTrans +// Aᵀ * X = B if trans == blas.Trans or blas.ConjTrans +// +// where A is an n×n triangular band matrix, and B is an n×nrhs matrix. +// +// Tbtrs returns whether A is non-singular. If A is singular, no solutions X +// are computed. +func Tbtrs(trans blas.Transpose, a blas64.TriangularBand, b blas64.General) (ok bool) { + return lapack64.Dtbtrs(a.Uplo, trans, a.Diag, a.N, a.K, b.Cols, a.Data, max(1, a.Stride), b.Data, max(1, b.Stride)) +} + +// Trcon estimates the reciprocal of the condition number of a triangular matrix A. +// The condition number computed may be based on the 1-norm or the ∞-norm. +// +// work is a temporary data slice of length at least 3*n and Trcon will panic otherwise. +// +// iwork is a temporary data slice of length at least n and Trcon will panic otherwise. +func Trcon(norm lapack.MatrixNorm, a blas64.Triangular, work []float64, iwork []int) float64 { + return lapack64.Dtrcon(norm, a.Uplo, a.Diag, a.N, a.Data, max(1, a.Stride), work, iwork) +} + +// Trtri computes the inverse of a triangular matrix, storing the result in place +// into a. +// +// Trtri will not perform the inversion if the matrix is singular, and returns +// a boolean indicating whether the inversion was successful. +func Trtri(a blas64.Triangular) (ok bool) { + return lapack64.Dtrtri(a.Uplo, a.Diag, a.N, a.Data, max(1, a.Stride)) +} + +// Trtrs solves a triangular system of the form A * X = B or Aᵀ * X = B. Trtrs +// returns whether the solve completed successfully. If A is singular, no solve is performed. 
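+//
+// An editorial sketch (not part of the vendored gonum source; values assumed
+// for illustration): solving the upper triangular system
+// [[2 1], [0 3]] * X = [4 6]ᵀ.
+//
+// a := blas64.Triangular{Uplo: blas.Upper, Diag: blas.NonUnit, N: 2, Stride: 2, Data: []float64{2, 1, 0, 3}}
+// b := blas64.General{Rows: 2, Cols: 1, Stride: 1, Data: []float64{4, 6}}
+// ok := Trtrs(blas.NoTrans, a, b) // on success b.Data holds X = [1 2]ᵀ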
+func Trtrs(trans blas.Transpose, a blas64.Triangular, b blas64.General) (ok bool) { + return lapack64.Dtrtrs(a.Uplo, trans, a.Diag, a.N, b.Cols, a.Data, max(1, a.Stride), b.Data, max(1, b.Stride)) +} + +// Geev computes the eigenvalues and, optionally, the left and/or right +// eigenvectors for an n×n real nonsymmetric matrix A. +// +// The right eigenvector v_j of A corresponding to an eigenvalue λ_j +// is defined by +// +// A v_j = λ_j v_j, +// +// and the left eigenvector u_j corresponding to an eigenvalue λ_j is defined by +// +// u_jᴴ A = λ_j u_jᴴ, +// +// where u_jᴴ is the conjugate transpose of u_j. +// +// On return, A will be overwritten and the left and right eigenvectors will be +// stored, respectively, in the columns of the n×n matrices VL and VR in the +// same order as their eigenvalues. If the j-th eigenvalue is real, then +// +// u_j = VL[:,j], +// v_j = VR[:,j], +// +// and if it is not real, then j and j+1 form a complex conjugate pair and the +// eigenvectors can be recovered as +// +// u_j = VL[:,j] + i*VL[:,j+1], +// u_{j+1} = VL[:,j] - i*VL[:,j+1], +// v_j = VR[:,j] + i*VR[:,j+1], +// v_{j+1} = VR[:,j] - i*VR[:,j+1], +// +// where i is the imaginary unit. The computed eigenvectors are normalized to +// have Euclidean norm equal to 1 and largest component real. +// +// Left eigenvectors will be computed only if jobvl == lapack.LeftEVCompute, +// otherwise jobvl must be lapack.LeftEVNone. +// Right eigenvectors will be computed only if jobvr == lapack.RightEVCompute, +// otherwise jobvr must be lapack.RightEVNone. +// For other values of jobvl and jobvr Geev will panic. +// +// On return, wr and wi will contain the real and imaginary parts, respectively, +// of the computed eigenvalues. Complex conjugate pairs of eigenvalues appear +// consecutively with the eigenvalue having the positive imaginary part first. +// wr and wi must have length n, and Geev will panic otherwise. +// +// work must have length at least lwork and lwork must be at least max(1,4*n) if +// the left or right eigenvectors are computed, and at least max(1,3*n) if no +// eigenvectors are computed. For good performance, lwork must generally be +// larger. On return, optimal value of lwork will be stored in work[0]. +// +// If lwork == -1, instead of performing Geev, the function only calculates the +// optimal value of lwork and stores it into work[0]. +// +// On return, first will be the index of the first valid eigenvalue. +// If first == 0, all eigenvalues and eigenvectors have been computed. +// If first is positive, Geev failed to compute all the eigenvalues, no +// eigenvectors have been computed and wr[first:] and wi[first:] contain those +// eigenvalues which have converged. 
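+//
+// An editorial sketch (not part of the vendored gonum source): computing the
+// eigenvalues of a square blas64.General matrix a without eigenvectors, using
+// the lwork == -1 query convention described above.
+//
+// n := a.Rows
+// wr, wi := make([]float64, n), make([]float64, n)
+// work := make([]float64, 1)
+// Geev(lapack.LeftEVNone, lapack.RightEVNone, a, wr, wi, blas64.General{}, blas64.General{}, work, -1)
+// work = make([]float64, int(work[0]))
+// first := Geev(lapack.LeftEVNone, lapack.RightEVNone, a, wr, wi, blas64.General{}, blas64.General{}, work, len(work))
+// // first == 0 means all eigenvalues converged; they are in wr and wi.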
+func Geev(jobvl lapack.LeftEVJob, jobvr lapack.RightEVJob, a blas64.General, wr, wi []float64, vl, vr blas64.General, work []float64, lwork int) (first int) { + n := a.Rows + if a.Cols != n { + panic("lapack64: matrix not square") + } + if jobvl == lapack.LeftEVCompute && (vl.Rows != n || vl.Cols != n) { + panic("lapack64: bad size of VL") + } + if jobvr == lapack.RightEVCompute && (vr.Rows != n || vr.Cols != n) { + panic("lapack64: bad size of VR") + } + return lapack64.Dgeev(jobvl, jobvr, n, a.Data, max(1, a.Stride), wr, wi, vl.Data, max(1, vl.Stride), vr.Data, max(1, vr.Stride), work, lwork) +} diff --git a/vendor/gonum.org/v1/gonum/mat/README.md b/vendor/gonum.org/v1/gonum/mat/README.md new file mode 100644 index 0000000000..5e7be6b234 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/README.md @@ -0,0 +1,6 @@ +# Gonum matrix + +[![go.dev reference](https://pkg.go.dev/badge/gonum.org/v1/gonum/mat)](https://pkg.go.dev/gonum.org/v1/gonum/mat) +[![GoDoc](https://godocs.io/gonum.org/v1/gonum/mat?status.svg)](https://godocs.io/gonum.org/v1/gonum/mat) + +Package mat is a matrix package for the Go language. diff --git a/vendor/gonum.org/v1/gonum/mat/band.go b/vendor/gonum.org/v1/gonum/mat/band.go new file mode 100644 index 0000000000..7660cdaa8e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/band.go @@ -0,0 +1,368 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" + "gonum.org/v1/gonum/lapack/lapack64" +) + +var ( + bandDense *BandDense + _ Matrix = bandDense + _ allMatrix = bandDense + _ denseMatrix = bandDense + _ Banded = bandDense + _ RawBander = bandDense + + _ NonZeroDoer = bandDense + _ RowNonZeroDoer = bandDense + _ ColNonZeroDoer = bandDense +) + +// BandDense represents a band matrix in dense storage format. +type BandDense struct { + mat blas64.Band +} + +// Banded is a band matrix representation. +type Banded interface { + Matrix + // Bandwidth returns the lower and upper bandwidth values for + // the matrix. The total bandwidth of the matrix is kl+ku+1. + Bandwidth() (kl, ku int) + + // TBand is the equivalent of the T() method in the Matrix + // interface but guarantees the transpose is of banded type. + TBand() Banded +} + +// A RawBander can return a blas64.Band representation of the receiver. +// Changes to the blas64.Band.Data slice will be reflected in the original +// matrix, changes to the Rows, Cols, KL, KU and Stride fields will not. +type RawBander interface { + RawBand() blas64.Band +} + +// A MutableBanded can set elements of a band matrix. +type MutableBanded interface { + Banded + + // SetBand sets the element at row i, column j to the value v. + // It panics if the location is outside the appropriate region of the matrix. + SetBand(i, j int, v float64) +} + +var ( + _ Matrix = TransposeBand{} + _ Banded = TransposeBand{} + _ UntransposeBander = TransposeBand{} +) + +// TransposeBand is a type for performing an implicit transpose of a band +// matrix. It implements the Banded interface, returning values from the +// transpose of the matrix within. +type TransposeBand struct { + Banded Banded +} + +// At returns the value of the element at row i and column j of the transposed +// matrix, that is, row j and column i of the Banded field. 
+func (t TransposeBand) At(i, j int) float64 {
+ return t.Banded.At(j, i)
+}
+
+// Dims returns the dimensions of the transposed matrix.
+func (t TransposeBand) Dims() (r, c int) {
+ c, r = t.Banded.Dims()
+ return r, c
+}
+
+// T performs an implicit transpose by returning the Banded field.
+func (t TransposeBand) T() Matrix {
+ return t.Banded
+}
+
+// Bandwidth returns the lower and upper bandwidth values for
+// the transposed matrix.
+func (t TransposeBand) Bandwidth() (kl, ku int) {
+ kl, ku = t.Banded.Bandwidth()
+ return ku, kl
+}
+
+// TBand performs an implicit transpose by returning the Banded field.
+func (t TransposeBand) TBand() Banded {
+ return t.Banded
+}
+
+// Untranspose returns the Banded field.
+func (t TransposeBand) Untranspose() Matrix {
+ return t.Banded
+}
+
+// UntransposeBand returns the Banded field.
+func (t TransposeBand) UntransposeBand() Banded {
+ return t.Banded
+}
+
+// NewBandDense creates a new Band matrix with r rows and c columns. If data == nil,
+// a new slice is allocated for the backing slice. If len(data) == min(r, c+kl)*(kl+ku+1),
+// data is used as the backing slice, and changes to the elements of the returned
+// BandDense will be reflected in data. If neither of these is true, NewBandDense
+// will panic. kl must be at least zero and less than r, and ku must be at least
+// zero and less than c, otherwise NewBandDense will panic.
+// NewBandDense will panic if either r or c is zero.
+//
+// The data must be arranged in row-major order constructed by removing the zeros
+// from the rows outside the band and aligning the diagonals. For example, the matrix
+//
+// 1 2 3 0 0 0
+// 4 5 6 7 0 0
+// 0 8 9 10 11 0
+// 0 0 12 13 14 15
+// 0 0 0 16 17 18
+// 0 0 0 0 19 20
+//
+// becomes (* entries are never accessed)
+//
+// * 1 2 3
+// 4 5 6 7
+// 8 9 10 11
+// 12 13 14 15
+// 16 17 18 *
+// 19 20 * *
+//
+// which is passed to NewBandDense as []float64{*, 1, 2, 3, 4, ...} with kl=1 and ku=2.
+// Only the values in the band portion of the matrix are used.
+func NewBandDense(r, c, kl, ku int, data []float64) *BandDense {
+ if r <= 0 || c <= 0 || kl < 0 || ku < 0 {
+ if r == 0 || c == 0 {
+ panic(ErrZeroLength)
+ }
+ panic(ErrNegativeDimension)
+ }
+ if kl+1 > r || ku+1 > c {
+ panic(ErrBandwidth)
+ }
+ bc := kl + ku + 1
+ if data != nil && len(data) != min(r, c+kl)*bc {
+ panic(ErrShape)
+ }
+ if data == nil {
+ data = make([]float64, min(r, c+kl)*bc)
+ }
+ return &BandDense{
+ mat: blas64.Band{
+ Rows: r,
+ Cols: c,
+ KL: kl,
+ KU: ku,
+ Stride: bc,
+ Data: data,
+ },
+ }
+}
+
+// NewDiagonalRect is a convenience function that returns a diagonal matrix represented by a
+// BandDense. The length of data must be min(r, c) otherwise NewDiagonalRect will panic.
+func NewDiagonalRect(r, c int, data []float64) *BandDense {
+ return NewBandDense(r, c, 0, 0, data)
+}
+
+// Dims returns the number of rows and columns in the matrix.
+func (b *BandDense) Dims() (r, c int) {
+ return b.mat.Rows, b.mat.Cols
+}
+
+// Bandwidth returns the lower and upper bandwidths of the matrix.
+func (b *BandDense) Bandwidth() (kl, ku int) {
+ return b.mat.KL, b.mat.KU
+}
+
+// T performs an implicit transpose by returning the receiver inside a Transpose.
+func (b *BandDense) T() Matrix {
+ return Transpose{b}
+}
+
+// TBand performs an implicit transpose by returning the receiver inside a TransposeBand.
+func (b *BandDense) TBand() Banded {
+ return TransposeBand{b}
+}
+
+// RawBand returns the underlying blas64.Band used by the receiver.
+// Changes to elements in the receiver following the call will be reflected +// in returned blas64.Band. +func (b *BandDense) RawBand() blas64.Band { + return b.mat +} + +// SetRawBand sets the underlying blas64.Band used by the receiver. +// Changes to elements in the receiver following the call will be reflected +// in the input. +func (b *BandDense) SetRawBand(mat blas64.Band) { + b.mat = mat +} + +// IsEmpty returns whether the receiver is empty. Empty matrices can be the +// receiver for size-restricted operations. The receiver can be zeroed using Reset. +func (b *BandDense) IsEmpty() bool { + return b.mat.Stride == 0 +} + +// Reset empties the matrix so that it can be reused as the +// receiver of a dimensionally restricted operation. +// +// Reset should not be used when the matrix shares backing data. +// See the Reseter interface for more information. +func (b *BandDense) Reset() { + b.mat.Rows = 0 + b.mat.Cols = 0 + b.mat.KL = 0 + b.mat.KU = 0 + b.mat.Stride = 0 + b.mat.Data = b.mat.Data[:0] +} + +// DiagView returns the diagonal as a matrix backed by the original data. +func (b *BandDense) DiagView() Diagonal { + n := min(b.mat.Rows, b.mat.Cols) + return &DiagDense{ + mat: blas64.Vector{ + N: n, + Inc: b.mat.Stride, + Data: b.mat.Data[b.mat.KL : (n-1)*b.mat.Stride+b.mat.KL+1], + }, + } +} + +// DoNonZero calls the function fn for each of the non-zero elements of b. The function fn +// takes a row/column index and the element value of b at (i, j). +func (b *BandDense) DoNonZero(fn func(i, j int, v float64)) { + for i := 0; i < min(b.mat.Rows, b.mat.Cols+b.mat.KL); i++ { + for j := max(0, i-b.mat.KL); j < min(b.mat.Cols, i+b.mat.KU+1); j++ { + v := b.at(i, j) + if v != 0 { + fn(i, j, v) + } + } + } +} + +// DoRowNonZero calls the function fn for each of the non-zero elements of row i of b. The function fn +// takes a row/column index and the element value of b at (i, j). +func (b *BandDense) DoRowNonZero(i int, fn func(i, j int, v float64)) { + if i < 0 || b.mat.Rows <= i { + panic(ErrRowAccess) + } + for j := max(0, i-b.mat.KL); j < min(b.mat.Cols, i+b.mat.KU+1); j++ { + v := b.at(i, j) + if v != 0 { + fn(i, j, v) + } + } +} + +// DoColNonZero calls the function fn for each of the non-zero elements of column j of b. The function fn +// takes a row/column index and the element value of b at (i, j). +func (b *BandDense) DoColNonZero(j int, fn func(i, j int, v float64)) { + if j < 0 || b.mat.Cols <= j { + panic(ErrColAccess) + } + for i := 0; i < min(b.mat.Rows, b.mat.Cols+b.mat.KL); i++ { + if i-b.mat.KL <= j && j < i+b.mat.KU+1 { + v := b.at(i, j) + if v != 0 { + fn(i, j, v) + } + } + } +} + +// Zero sets all of the matrix elements to zero. +func (b *BandDense) Zero() { + m := b.mat.Rows + kL := b.mat.KL + nCol := b.mat.KU + 1 + kL + for i := 0; i < m; i++ { + l := max(0, kL-i) + u := min(nCol, m+kL-i) + zero(b.mat.Data[i*b.mat.Stride+l : i*b.mat.Stride+u]) + } +} + +// Norm returns the specified norm of the receiver. Valid norms are: +// +// 1 - The maximum absolute column sum +// 2 - The Frobenius norm, the square root of the sum of the squares of the elements +// Inf - The maximum absolute row sum +// +// Norm will panic with ErrNormOrder if an illegal norm is specified and with +// ErrZeroLength if the matrix has zero size. 
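+//
+// An editorial sketch (not part of the vendored gonum source; values assumed
+// for illustration): the infinity norm of a 3×3 tridiagonal matrix stored in
+// band form (kl = ku = 1), using the storage layout described at NewBandDense.
+//
+// b := NewBandDense(3, 3, 1, 1, []float64{
+// 0, 1, 2, // leading slot of row 0 is never accessed
+// 3, 4, 5,
+// 6, 7, 0, // trailing slot of row 2 is never accessed
+// })
+// inf := b.Norm(math.Inf(1)) // max(1+2, 3+4+5, 6+7) = 13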
+func (b *BandDense) Norm(norm float64) float64 {
+ if b.IsEmpty() {
+ panic(ErrZeroLength)
+ }
+ lnorm := normLapack(norm, false)
+ return lapack64.Langb(lnorm, b.mat)
+}
+
+// Trace returns the trace of the matrix.
+//
+// Trace will panic with ErrSquare if the matrix is not square and with
+// ErrZeroLength if the matrix has zero size.
+func (b *BandDense) Trace() float64 {
+ r, c := b.Dims()
+ if r != c {
+ panic(ErrSquare)
+ }
+ if b.IsEmpty() {
+ panic(ErrZeroLength)
+ }
+ rb := b.RawBand()
+ var tr float64
+ for i := 0; i < r; i++ {
+ tr += rb.Data[rb.KL+i*rb.Stride]
+ }
+ return tr
+}
+
+// MulVecTo computes B⋅x or Bᵀ⋅x storing the result into dst.
+func (b *BandDense) MulVecTo(dst *VecDense, trans bool, x Vector) {
+ m, n := b.Dims()
+ if trans {
+ m, n = n, m
+ }
+ if x.Len() != n {
+ panic(ErrShape)
+ }
+ dst.reuseAsNonZeroed(m)
+
+ t := blas.NoTrans
+ if trans {
+ t = blas.Trans
+ }
+
+ xMat, _ := untransposeExtract(x)
+ if xVec, ok := xMat.(*VecDense); ok {
+ if dst != xVec {
+ dst.checkOverlap(xVec.mat)
+ blas64.Gbmv(t, 1, b.mat, xVec.mat, 0, dst.mat)
+ } else {
+ xCopy := getVecDenseWorkspace(n, false)
+ xCopy.CloneFromVec(xVec)
+ blas64.Gbmv(t, 1, b.mat, xCopy.mat, 0, dst.mat)
+ putVecDenseWorkspace(xCopy)
+ }
+ } else {
+ xCopy := getVecDenseWorkspace(n, false)
+ xCopy.CloneFromVec(x)
+ blas64.Gbmv(t, 1, b.mat, xCopy.mat, 0, dst.mat)
+ putVecDenseWorkspace(xCopy)
+ }
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/cdense.go b/vendor/gonum.org/v1/gonum/mat/cdense.go
new file mode 100644
index 0000000000..86f0423c58
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/cdense.go
@@ -0,0 +1,368 @@
+// Copyright ©2019 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+ "math/cmplx"
+
+ "gonum.org/v1/gonum/blas/cblas128"
+)
+
+var (
+ cDense *CDense
+
+ _ CMatrix = cDense
+ _ allMatrix = cDense
+)
+
+// CDense is a dense matrix representation with complex data.
+type CDense struct {
+ mat cblas128.General
+
+ capRows, capCols int
+}
+
+// Dims returns the number of rows and columns in the matrix.
+func (m *CDense) Dims() (r, c int) {
+ return m.mat.Rows, m.mat.Cols
+}
+
+// Caps returns the number of rows and columns in the backing matrix.
+func (m *CDense) Caps() (r, c int) { return m.capRows, m.capCols }
+
+// H performs an implicit conjugate transpose by returning the receiver inside a
+// ConjTranspose.
+func (m *CDense) H() CMatrix {
+ return ConjTranspose{m}
+}
+
+// T performs an implicit transpose by returning the receiver inside a
+// CTranspose.
+func (m *CDense) T() CMatrix {
+ return CTranspose{m}
+}
+
+// Conj calculates the element-wise conjugate of a and stores the result in the
+// receiver.
+// Conj will panic if m and a do not have the same dimension unless m is empty.
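+//
+// An editorial sketch (not part of the vendored gonum source): conjugating a
+// 1×2 complex matrix into an empty receiver, which is sized automatically.
+//
+// a := NewCDense(1, 2, []complex128{1 + 2i, 3 - 4i})
+// var c CDense
+// c.Conj(a) // c now holds 1-2i and 3+4i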
+func (m *CDense) Conj(a CMatrix) {
+ ar, ac := a.Dims()
+ aU, aTrans, aConj := untransposeExtractCmplx(a)
+ m.reuseAsNonZeroed(ar, ac)
+
+ if arm, ok := a.(*CDense); ok {
+ amat := arm.mat
+ if m != aU {
+ m.checkOverlap(amat)
+ }
+ for ja, jm := 0, 0; ja < ar*amat.Stride; ja, jm = ja+amat.Stride, jm+m.mat.Stride {
+ for i, v := range amat.Data[ja : ja+ac] {
+ m.mat.Data[i+jm] = cmplx.Conj(v)
+ }
+ }
+ return
+ }
+
+ m.checkOverlapMatrix(aU)
+ if aTrans != aConj && m == aU {
+ // Only make workspace if the destination is transposed
+ // with respect to the source and they are the same
+ // matrix.
+ var restore func()
+ m, restore = m.isolatedWorkspace(aU)
+ defer restore()
+ }
+
+ for r := 0; r < ar; r++ {
+ for c := 0; c < ac; c++ {
+ m.set(r, c, cmplx.Conj(a.At(r, c)))
+ }
+ }
+}
+
+// Slice returns a new CMatrix that shares backing data with the receiver.
+// The returned matrix starts at {i,j} of the receiver and extends k-i rows
+// and l-j columns. The final row in the resulting matrix is k-1 and the
+// final column is l-1.
+// Slice panics with ErrIndexOutOfRange if the slice is outside the capacity
+// of the receiver.
+func (m *CDense) Slice(i, k, j, l int) CMatrix {
+ return m.slice(i, k, j, l)
+}
+
+func (m *CDense) slice(i, k, j, l int) *CDense {
+ mr, mc := m.Caps()
+ if i < 0 || mr <= i || j < 0 || mc <= j || k < i || mr < k || l < j || mc < l {
+ if i == k || j == l {
+ panic(ErrZeroLength)
+ }
+ panic(ErrIndexOutOfRange)
+ }
+ t := *m
+ t.mat.Data = t.mat.Data[i*t.mat.Stride+j : (k-1)*t.mat.Stride+l]
+ t.mat.Rows = k - i
+ t.mat.Cols = l - j
+ t.capRows -= i
+ t.capCols -= j
+ return &t
+}
+
+// NewCDense creates a new complex Dense matrix with r rows and c columns.
+// If data == nil, a new slice is allocated for the backing slice.
+// If len(data) == r*c, data is used as the backing slice, and changes to the
+// elements of the returned CDense will be reflected in data.
+// If neither of these is true, NewCDense will panic.
+// NewCDense will panic if either r or c is zero.
+//
+// The data must be arranged in row-major order, i.e. the (i*c + j)-th
+// element in the data slice is the {i, j}-th element in the matrix.
+func NewCDense(r, c int, data []complex128) *CDense {
+ if r <= 0 || c <= 0 {
+ if r == 0 || c == 0 {
+ panic(ErrZeroLength)
+ }
+ panic("mat: negative dimension")
+ }
+ if data != nil && r*c != len(data) {
+ panic(ErrShape)
+ }
+ if data == nil {
+ data = make([]complex128, r*c)
+ }
+ return &CDense{
+ mat: cblas128.General{
+ Rows: r,
+ Cols: c,
+ Stride: c,
+ Data: data,
+ },
+ capRows: r,
+ capCols: c,
+ }
+}
+
+// ReuseAs changes the receiver if it IsEmpty() to be of size r×c.
+//
+// ReuseAs re-uses the backing data slice if it has sufficient capacity,
+// otherwise a new slice is allocated. The backing data is zero on return.
+//
+// ReuseAs panics if the receiver is not empty, and panics if
+// the input sizes are less than one. To empty the receiver for re-use,
+// Reset should be used.
+func (m *CDense) ReuseAs(r, c int) {
+ if r <= 0 || c <= 0 {
+ if r == 0 || c == 0 {
+ panic(ErrZeroLength)
+ }
+ panic(ErrNegativeDimension)
+ }
+ if !m.IsEmpty() {
+ panic(ErrReuseNonEmpty)
+ }
+ m.reuseAsZeroed(r, c)
+}
+
+// reuseAsNonZeroed resizes an empty matrix to a r×c matrix,
+// or checks that a non-empty matrix is r×c.
+//
+// reuseAsNonZeroed must be kept in sync with reuseAsZeroed.
+func (m *CDense) reuseAsNonZeroed(r, c int) {
+ if m.mat.Rows > m.capRows || m.mat.Cols > m.capCols {
+ // Panic as a string, not a mat.Error.
+ panic(badCap)
+ }
+ if r == 0 || c == 0 {
+ panic(ErrZeroLength)
+ }
+ if m.IsEmpty() {
+ m.mat = cblas128.General{
+ Rows: r,
+ Cols: c,
+ Stride: c,
+ Data: useC(m.mat.Data, r*c),
+ }
+ m.capRows = r
+ m.capCols = c
+ return
+ }
+ if r != m.mat.Rows || c != m.mat.Cols {
+ panic(ErrShape)
+ }
+}
+
+func (m *CDense) reuseAsZeroed(r, c int) {
+ // This must be kept in sync with reuseAsNonZeroed.
+ if m.mat.Rows > m.capRows || m.mat.Cols > m.capCols {
+ // Panic as a string, not a mat.Error.
+ panic(badCap)
+ }
+ if r == 0 || c == 0 {
+ panic(ErrZeroLength)
+ }
+ if m.IsEmpty() {
+ m.mat = cblas128.General{
+ Rows: r,
+ Cols: c,
+ Stride: c,
+ Data: useZeroedC(m.mat.Data, r*c),
+ }
+ m.capRows = r
+ m.capCols = c
+ return
+ }
+ if r != m.mat.Rows || c != m.mat.Cols {
+ panic(ErrShape)
+ }
+ m.Zero()
+}
+
+// isolatedWorkspace returns a new dense matrix w with the size of a and
+// returns a callback to defer which performs cleanup at the return of the call.
+// This should be used when a method receiver is the same pointer as an input argument.
+func (m *CDense) isolatedWorkspace(a CMatrix) (w *CDense, restore func()) {
+ r, c := a.Dims()
+ if r == 0 || c == 0 {
+ panic(ErrZeroLength)
+ }
+ w = getCDenseWorkspace(r, c, false)
+ return w, func() {
+ m.Copy(w)
+ putCDenseWorkspace(w)
+ }
+}
+
+// Reset zeros the dimensions of the matrix so that it can be reused as the
+// receiver of a dimensionally restricted operation.
+//
+// Reset should not be used when the matrix shares backing data.
+// See the Reseter interface for more information.
+func (m *CDense) Reset() {
+ // Row, Cols and Stride must be zeroed in unison.
+ m.mat.Rows, m.mat.Cols, m.mat.Stride = 0, 0, 0
+ m.capRows, m.capCols = 0, 0
+ m.mat.Data = m.mat.Data[:0]
+}
+
+// IsEmpty returns whether the receiver is empty. Empty matrices can be the
+// receiver for size-restricted operations. The receiver can be zeroed using Reset.
+func (m *CDense) IsEmpty() bool {
+ // It must be the case that m.Dims() returns
+ // zeros in this case. See comment in Reset().
+ return m.mat.Stride == 0
+}
+
+// Zero sets all of the matrix elements to zero.
+func (m *CDense) Zero() {
+ r := m.mat.Rows
+ c := m.mat.Cols
+ for i := 0; i < r; i++ {
+ zeroC(m.mat.Data[i*m.mat.Stride : i*m.mat.Stride+c])
+ }
+}
+
+// Copy makes a copy of elements of a into the receiver. It is similar to the
+// built-in copy; it copies as much as the overlap between the two matrices and
+// returns the number of rows and columns it copied. If a aliases the receiver
+// and is a transposed Dense or VecDense, with a non-unitary increment, Copy will
+// panic.
+//
+// See the Copier interface for more information.
+func (m *CDense) Copy(a CMatrix) (r, c int) {
+ r, c = a.Dims()
+ if a == m {
+ return r, c
+ }
+ r = min(r, m.mat.Rows)
+ c = min(c, m.mat.Cols)
+ if r == 0 || c == 0 {
+ return 0, 0
+ }
+ // TODO(btracey): Check for overlap when complex version exists.
+ // TODO(btracey): Add fast-paths.
+ for i := 0; i < r; i++ {
+ for j := 0; j < c; j++ {
+ m.set(i, j, a.At(i, j))
+ }
+ }
+ return r, c
+}
+
+// SetRawCMatrix sets the underlying cblas128.General used by the receiver.
+// Changes to elements in the receiver following the call will be reflected
+// in b.
+func (m *CDense) SetRawCMatrix(b cblas128.General) {
+ m.capRows, m.capCols = b.Rows, b.Cols
+ m.mat = b
+}
+
+// RawCMatrix returns the underlying cblas128.General used by the receiver.
+// Changes to elements in the receiver following the call will be reflected
+// in returned cblas128.General.
+func (m *CDense) RawCMatrix() cblas128.General { return m.mat } + +// Grow returns the receiver expanded by r rows and c columns. If the dimensions +// of the expanded matrix are outside the capacities of the receiver a new +// allocation is made, otherwise not. Note the receiver itself is not modified +// during the call to Grow. +func (m *CDense) Grow(r, c int) CMatrix { + if r < 0 || c < 0 { + panic(ErrIndexOutOfRange) + } + if r == 0 && c == 0 { + return m + } + + r += m.mat.Rows + c += m.mat.Cols + + var t CDense + switch { + case m.mat.Rows == 0 || m.mat.Cols == 0: + t.mat = cblas128.General{ + Rows: r, + Cols: c, + Stride: c, + // We zero because we don't know how the matrix will be used. + // In other places, the mat is immediately filled with a result; + // this is not the case here. + Data: useZeroedC(m.mat.Data, r*c), + } + case r > m.capRows || c > m.capCols: + cr := max(r, m.capRows) + cc := max(c, m.capCols) + t.mat = cblas128.General{ + Rows: r, + Cols: c, + Stride: cc, + Data: make([]complex128, cr*cc), + } + t.capRows = cr + t.capCols = cc + // Copy the complete matrix over to the new matrix. + // Including elements not currently visible. Use a temporary structure + // to avoid modifying the receiver. + var tmp CDense + tmp.mat = cblas128.General{ + Rows: m.mat.Rows, + Cols: m.mat.Cols, + Stride: m.mat.Stride, + Data: m.mat.Data, + } + tmp.capRows = m.capRows + tmp.capCols = m.capCols + t.Copy(&tmp) + return &t + default: + t.mat = cblas128.General{ + Data: m.mat.Data[:(r-1)*m.mat.Stride+c], + Rows: r, + Cols: c, + Stride: m.mat.Stride, + } + } + t.capRows = r + t.capCols = c + return &t +} diff --git a/vendor/gonum.org/v1/gonum/mat/cholesky.go b/vendor/gonum.org/v1/gonum/mat/cholesky.go new file mode 100644 index 0000000000..f11948d0f8 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/cholesky.go @@ -0,0 +1,1203 @@ +// Copyright ©2013 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack/lapack64" +) + +const ( + badTriangle = "mat: invalid triangle" + badCholesky = "mat: invalid Cholesky factorization" +) + +var ( + _ Matrix = (*Cholesky)(nil) + _ Symmetric = (*Cholesky)(nil) + + _ Matrix = (*BandCholesky)(nil) + _ Symmetric = (*BandCholesky)(nil) + _ Banded = (*BandCholesky)(nil) + _ SymBanded = (*BandCholesky)(nil) + + _ Matrix = (*PivotedCholesky)(nil) + _ Symmetric = (*PivotedCholesky)(nil) +) + +// Cholesky is a symmetric positive definite matrix represented by its +// Cholesky decomposition. +// +// The decomposition can be constructed using the Factorize method. The +// factorization itself can be extracted using the UTo or LTo methods, and the +// original symmetric matrix can be recovered with ToSym. +// +// Note that this matrix representation is useful for certain operations, in +// particular finding solutions to linear equations. It is very inefficient +// at other operations, in particular At is slow. +// +// Cholesky methods may only be called on a value that has been successfully +// initialized by a call to Factorize that has returned true. Calls to methods +// of an unsuccessful Cholesky factorization will panic. +type Cholesky struct { + // The chol pointer must never be retained as a pointer outside the Cholesky + // struct, either by returning chol outside the struct or by setting it to + // a pointer coming from outside. 
The same prohibition applies to the data
+ // slice within chol.
+ chol *TriDense
+ cond float64
+}
+
+// updateCond updates the condition number of the Cholesky decomposition. If
+// norm > 0, then that norm is used as the norm of the original matrix A, otherwise
+// the norm is estimated from the decomposition.
+func (c *Cholesky) updateCond(norm float64) {
+ n := c.chol.mat.N
+ work := getFloat64s(3*n, false)
+ defer putFloat64s(work)
+ if norm < 0 {
+ // This is an approximation. By the definition of a norm,
+ // |AB| <= |A| |B|.
+ // Since A = Uᵀ*U, we get for the condition number κ that
+ // κ(A) := |A| |A^-1| = |Uᵀ*U| |A^-1| <= |Uᵀ| |U| |A^-1|,
+ // so this will overestimate the condition number somewhat.
+ // The norm of the original factorized matrix cannot be stored
+ // because of update possibilities.
+ unorm := lapack64.Lantr(CondNorm, c.chol.mat, work)
+ lnorm := lapack64.Lantr(CondNormTrans, c.chol.mat, work)
+ norm = unorm * lnorm
+ }
+ sym := c.chol.asSymBlas()
+ iwork := getInts(n, false)
+ v := lapack64.Pocon(sym, norm, work, iwork)
+ putInts(iwork)
+ c.cond = 1 / v
+}
+
+// Dims returns the dimensions of the matrix.
+func (ch *Cholesky) Dims() (r, c int) {
+ n := ch.SymmetricDim()
+ return n, n
+}
+
+// At returns the element at row i, column j.
+func (c *Cholesky) At(i, j int) float64 {
+ n := c.SymmetricDim()
+ if uint(i) >= uint(n) {
+ panic(ErrRowAccess)
+ }
+ if uint(j) >= uint(n) {
+ panic(ErrColAccess)
+ }
+
+ var val float64
+ for k := 0; k <= min(i, j); k++ {
+ val += c.chol.at(k, i) * c.chol.at(k, j)
+ }
+ return val
+}
+
+// T returns the receiver, the transpose of a symmetric matrix.
+func (c *Cholesky) T() Matrix {
+ return c
+}
+
+// SymmetricDim implements the Symmetric interface and returns the number of rows
+// in the matrix (this is also the number of columns).
+func (c *Cholesky) SymmetricDim() int {
+ if c.chol == nil {
+ return 0
+ }
+ n, _ := c.chol.Triangle()
+ return n
+}
+
+// Cond returns the condition number of the factorized matrix.
+func (c *Cholesky) Cond() float64 {
+ if !c.valid() {
+ panic(badCholesky)
+ }
+ return c.cond
+}
+
+// Factorize calculates the Cholesky decomposition of the matrix A and returns
+// whether the matrix is positive definite. If Factorize returns false, the
+// factorization must not be used.
+func (c *Cholesky) Factorize(a Symmetric) (ok bool) {
+ n := a.SymmetricDim()
+ if c.chol == nil {
+ c.chol = NewTriDense(n, Upper, nil)
+ } else {
+ c.chol.Reset()
+ c.chol.reuseAsNonZeroed(n, Upper)
+ }
+ copySymIntoTriangle(c.chol, a)
+
+ sym := c.chol.asSymBlas()
+ work := getFloat64s(c.chol.mat.N, false)
+ norm := lapack64.Lansy(CondNorm, sym, work)
+ putFloat64s(work)
+ _, ok = lapack64.Potrf(sym)
+ if ok {
+ c.updateCond(norm)
+ } else {
+ c.Reset()
+ }
+ return ok
+}
+
+// Reset resets the factorization so that it can be reused as the receiver of a
+// dimensionally restricted operation.
+func (c *Cholesky) Reset() {
+ if c.chol != nil {
+ c.chol.Reset()
+ }
+ c.cond = math.Inf(1)
+}
+
+// IsEmpty returns whether the receiver is empty. Empty matrices can be the
+// receiver for size-restricted operations. The receiver can be emptied using
+// Reset.
+func (c *Cholesky) IsEmpty() bool {
+ return c.chol == nil || c.chol.IsEmpty()
+}
+
+// SetFromU sets the Cholesky decomposition from the given triangular matrix.
+// SetFromU panics if t is not upper triangular. If the receiver is empty it
+// is resized to be n×n, the size of t. If the receiver is non-empty, SetFromU
+// panics if it is not of size n×n.
Note that t is copied into, not stored inside, the +// receiver. +func (c *Cholesky) SetFromU(t Triangular) { + n, kind := t.Triangle() + if kind != Upper { + panic("cholesky: matrix must be upper triangular") + } + if c.chol == nil { + c.chol = NewTriDense(n, Upper, nil) + } else { + c.chol.reuseAsNonZeroed(n, Upper) + } + c.chol.Copy(t) + c.updateCond(-1) +} + +// Clone makes a copy of the input Cholesky into the receiver, overwriting the +// previous value of the receiver. Clone does not place any restrictions on receiver +// shape. Clone panics if the input Cholesky is not the result of a valid decomposition. +func (c *Cholesky) Clone(chol *Cholesky) { + if !chol.valid() { + panic(badCholesky) + } + n := chol.SymmetricDim() + if c.chol == nil { + c.chol = NewTriDense(n, Upper, nil) + } else { + c.chol = NewTriDense(n, Upper, use(c.chol.mat.Data, n*n)) + } + c.chol.Copy(chol.chol) + c.cond = chol.cond +} + +// Det returns the determinant of the matrix that has been factorized. +func (c *Cholesky) Det() float64 { + if !c.valid() { + panic(badCholesky) + } + return math.Exp(c.LogDet()) +} + +// LogDet returns the log of the determinant of the matrix that has been factorized. +func (c *Cholesky) LogDet() float64 { + if !c.valid() { + panic(badCholesky) + } + var det float64 + for i := 0; i < c.chol.mat.N; i++ { + det += 2 * math.Log(c.chol.mat.Data[i*c.chol.mat.Stride+i]) + } + return det +} + +// SolveTo finds the matrix X that solves A * X = B where A is represented +// by the Cholesky decomposition. The result is stored in-place into dst. +// If the Cholesky decomposition is singular or near-singular a Condition error +// is returned. See the documentation for Condition for more information. +func (c *Cholesky) SolveTo(dst *Dense, b Matrix) error { + if !c.valid() { + panic(badCholesky) + } + n := c.chol.mat.N + bm, bn := b.Dims() + if n != bm { + panic(ErrShape) + } + + dst.reuseAsNonZeroed(bm, bn) + if b != dst { + dst.Copy(b) + } + lapack64.Potrs(c.chol.mat, dst.mat) + if c.cond > ConditionTolerance { + return Condition(c.cond) + } + return nil +} + +// SolveCholTo finds the matrix X that solves A * X = B where A and B are represented +// by their Cholesky decompositions a and b. The result is stored in-place into +// dst. +// If the Cholesky decomposition is singular or near-singular a Condition error +// is returned. See the documentation for Condition for more information. +func (a *Cholesky) SolveCholTo(dst *Dense, b *Cholesky) error { + if !a.valid() || !b.valid() { + panic(badCholesky) + } + bn := b.chol.mat.N + if a.chol.mat.N != bn { + panic(ErrShape) + } + + dst.reuseAsZeroed(bn, bn) + dst.Copy(b.chol.T()) + blas64.Trsm(blas.Left, blas.Trans, 1, a.chol.mat, dst.mat) + blas64.Trsm(blas.Left, blas.NoTrans, 1, a.chol.mat, dst.mat) + blas64.Trmm(blas.Right, blas.NoTrans, 1, b.chol.mat, dst.mat) + if a.cond > ConditionTolerance { + return Condition(a.cond) + } + return nil +} + +// SolveVecTo finds the vector x that solves A * x = b where A is represented +// by the Cholesky decomposition. The result is stored in-place into +// dst. +// If the Cholesky decomposition is singular or near-singular a Condition error +// is returned. See the documentation for Condition for more information. 
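+//
+// An editorial sketch (not part of the vendored gonum source; values assumed
+// for illustration): factorizing a 2×2 positive definite matrix and solving
+// A * x = b.
+//
+// a := NewSymDense(2, []float64{4, 2, 2, 3})
+// var ch Cholesky
+// if ch.Factorize(a) {
+// var x VecDense
+// err := ch.SolveVecTo(&x, NewVecDense(2, []float64{2, 1}))
+// // x now holds [0.5 0]ᵀ; err reports near-singularity via Condition.
+// }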
+func (c *Cholesky) SolveVecTo(dst *VecDense, b Vector) error {
+ if !c.valid() {
+ panic(badCholesky)
+ }
+ n := c.chol.mat.N
+ if br, bc := b.Dims(); br != n || bc != 1 {
+ panic(ErrShape)
+ }
+ switch rv := b.(type) {
+ default:
+ dst.reuseAsNonZeroed(n)
+ return c.SolveTo(dst.asDense(), b)
+ case RawVectorer:
+ bmat := rv.RawVector()
+ if dst != b {
+ dst.checkOverlap(bmat)
+ }
+ dst.reuseAsNonZeroed(n)
+ if dst != b {
+ dst.CopyVec(b)
+ }
+ lapack64.Potrs(c.chol.mat, dst.asGeneral())
+ if c.cond > ConditionTolerance {
+ return Condition(c.cond)
+ }
+ return nil
+ }
+}
+
+// RawU returns the Triangular matrix used to store the Cholesky factorization
+// of the original matrix A. If the returned matrix is modified, the
+// factorization is invalid and should not be used.
+//
+// If Factorize has not been called, RawU will return nil.
+func (c *Cholesky) RawU() Triangular {
+ if !c.valid() {
+ return nil
+ }
+ return c.chol
+}
+
+// UTo stores into dst the n×n upper triangular matrix U from a Cholesky
+// decomposition
+//
+// A = Uᵀ * U.
+//
+// If dst is empty, it is resized to be an n×n upper triangular matrix. When dst
+// is non-empty, UTo panics if dst is not n×n or not Upper. UTo will also panic
+// if the receiver does not contain a successful factorization.
+func (c *Cholesky) UTo(dst *TriDense) {
+ if !c.valid() {
+ panic(badCholesky)
+ }
+ n := c.chol.mat.N
+ if dst.IsEmpty() {
+ dst.ReuseAsTri(n, Upper)
+ } else {
+ n2, kind := dst.Triangle()
+ if n != n2 {
+ panic(ErrShape)
+ }
+ if kind != Upper {
+ panic(ErrTriangle)
+ }
+ }
+ dst.Copy(c.chol)
+}
+
+// LTo stores into dst the n×n lower triangular matrix L from a Cholesky
+// decomposition
+//
+// A = L * Lᵀ.
+//
+// If dst is empty, it is resized to be an n×n lower triangular matrix. When dst
+// is non-empty, LTo panics if dst is not n×n or not Lower. LTo will also panic
+// if the receiver does not contain a successful factorization.
+func (c *Cholesky) LTo(dst *TriDense) {
+ if !c.valid() {
+ panic(badCholesky)
+ }
+ n := c.chol.mat.N
+ if dst.IsEmpty() {
+ dst.ReuseAsTri(n, Lower)
+ } else {
+ n2, kind := dst.Triangle()
+ if n != n2 {
+ panic(ErrShape)
+ }
+ if kind != Lower {
+ panic(ErrTriangle)
+ }
+ }
+ dst.Copy(c.chol.TTri())
+}
+
+// ToSym reconstructs the original positive definite matrix from its
+// Cholesky decomposition, storing the result into dst. If dst is
+// empty it is resized to be n×n. If dst is non-empty, ToSym panics
+// if dst is not of size n×n. ToSym will also panic if the receiver
+// does not contain a successful factorization.
+func (c *Cholesky) ToSym(dst *SymDense) {
+ if !c.valid() {
+ panic(badCholesky)
+ }
+ n := c.chol.mat.N
+ if dst.IsEmpty() {
+ dst.ReuseAsSym(n)
+ } else {
+ n2 := dst.SymmetricDim()
+ if n != n2 {
+ panic(ErrShape)
+ }
+ }
+ // Create a TriDense representing the Cholesky factor U with dst's
+ // backing slice.
+ // Operations on u are reflected in dst.
+ u := &TriDense{
+ mat: blas64.Triangular{
+ Uplo: blas.Upper,
+ Diag: blas.NonUnit,
+ N: n,
+ Data: dst.mat.Data,
+ Stride: dst.mat.Stride,
+ },
+ cap: n,
+ }
+ u.Copy(c.chol)
+ // Compute the product Uᵀ*U using the algorithm from LAPACK/TESTING/LIN/dpot01.f
+ a := u.mat.Data
+ lda := u.mat.Stride
+ bi := blas64.Implementation()
+ for k := n - 1; k >= 0; k-- {
+ a[k*lda+k] = bi.Ddot(k+1, a[k:], lda, a[k:], lda)
+ if k > 0 {
+ bi.Dtrmv(blas.Upper, blas.Trans, blas.NonUnit, k, a, lda, a[k:], lda)
+ }
+ }
+}
+
+// InverseTo computes the inverse of the matrix represented by its Cholesky
+// factorization and stores the result into dst. If the factorized
+// matrix is ill-conditioned, a Condition error will be returned.
+// Note that matrix inversion is numerically unstable, and should generally be
+// avoided where possible, for example by using the Solve routines.
+func (c *Cholesky) InverseTo(dst *SymDense) error {
+ if !c.valid() {
+ panic(badCholesky)
+ }
+ dst.reuseAsNonZeroed(c.chol.mat.N)
+ // Create a TriDense representing the Cholesky factor U with the backing
+ // slice from dst.
+ // Operations on u are reflected in dst.
+ u := &TriDense{
+ mat: blas64.Triangular{
+ Uplo: blas.Upper,
+ Diag: blas.NonUnit,
+ N: dst.mat.N,
+ Data: dst.mat.Data,
+ Stride: dst.mat.Stride,
+ },
+ cap: dst.mat.N,
+ }
+ u.Copy(c.chol)
+
+ _, ok := lapack64.Potri(u.mat)
+ if !ok {
+ return Condition(math.Inf(1))
+ }
+ if c.cond > ConditionTolerance {
+ return Condition(c.cond)
+ }
+ return nil
+}
+
+// Scale multiplies the original matrix A by a positive constant using
+// its Cholesky decomposition, storing the result in-place into the receiver.
+// That is, if the original Cholesky factorization is
+//
+// Uᵀ * U = A
+//
+// the updated factorization is
+//
+// U'ᵀ * U' = f A = A'
+//
+// Scale panics if the constant is non-positive, or if the receiver is non-empty
+// and is of a different size from the input.
+func (c *Cholesky) Scale(f float64, orig *Cholesky) {
+ if !orig.valid() {
+ panic(badCholesky)
+ }
+ if f <= 0 {
+ panic("cholesky: scaling by a non-positive constant")
+ }
+ n := orig.SymmetricDim()
+ if c.chol == nil {
+ c.chol = NewTriDense(n, Upper, nil)
+ } else if c.chol.mat.N != n {
+ panic(ErrShape)
+ }
+ c.chol.ScaleTri(math.Sqrt(f), orig.chol)
+ c.cond = orig.cond // Scaling by a positive constant does not change the condition number.
+}
+
+// ExtendVecSym computes the Cholesky decomposition of the original matrix A,
+// whose Cholesky decomposition is in a, extended by the n×1 vector v according to
+//
+// [A w]
+// [w' k]
+//
+// where k = v[n-1] and w = v[:n-1]. The result is stored into the receiver.
+// In order for the updated matrix to be positive definite, it must be the case
+// that k > w' A^-1 w. If this condition does not hold then ExtendVecSym will
+// return false and the receiver will not be updated.
+//
+// ExtendVecSym will panic if v.Len() != a.SymmetricDim()+1 or if a does not contain
+// a valid decomposition.
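+//
+// An editorial sketch (not part of the vendored gonum source; values assumed
+// for illustration): extending the factorization of the 1×1 matrix A = [4]
+// by v = (w, k) = (2, 5).
+//
+// var ch Cholesky
+// ch.Factorize(NewSymDense(1, []float64{4}))
+// var ext Cholesky
+// ok := ext.ExtendVecSym(&ch, NewVecDense(2, []float64{2, 5}))
+// // ok is true since k = 5 > w'*A^-1*w = 1; ext factors [[4 2], [2 5]].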
+func (c *Cholesky) ExtendVecSym(a *Cholesky, v Vector) (ok bool) {
+ n := a.SymmetricDim()
+
+ if v.Len() != n+1 {
+ panic(badSliceLength)
+ }
+ if !a.valid() {
+ panic(badCholesky)
+ }
+
+ // The algorithm is commented here, but see also
+ // https://math.stackexchange.com/questions/955874/cholesky-factor-when-adding-a-row-and-column-to-already-factorized-matrix
+ // We have A and want to compute the Cholesky of
+ // [A w]
+ // [w' k]
+ // We want
+ // [U c]
+ // [0 d]
+ // to be the updated Cholesky, and so it must be that
+ // [A w] = [U' 0] [U c]
+ // [w' k] [c' d] [0 d]
+ // Thus, we need
+ // 1) A = U'U (true by the original decomposition being valid),
+ // 2) U' * c = w => c = U'^-1 w
+ // 3) c'*c + d'*d = k => d = sqrt(k-c'*c)
+
+ // First, compute c = U'^-1 w
+ w := NewVecDense(n, nil)
+ w.CopyVec(v)
+ k := v.At(n, 0)
+
+ var t VecDense
+ _ = t.SolveVec(a.chol.T(), w)
+
+ dot := Dot(&t, &t)
+ if dot >= k {
+ return false
+ }
+ d := math.Sqrt(k - dot)
+
+ newU := NewTriDense(n+1, Upper, nil)
+ newU.Copy(a.chol)
+ for i := 0; i < n; i++ {
+ newU.SetTri(i, n, t.At(i, 0))
+ }
+ newU.SetTri(n, n, d)
+ c.chol = newU
+ c.updateCond(-1)
+ return true
+}
+
+// SymRankOne performs a rank-1 update of the original matrix A and refactorizes
+// its Cholesky factorization, storing the result into the receiver. That is, if
+// in the original Cholesky factorization
+//
+// Uᵀ * U = A,
+//
+// in the updated factorization
+//
+// U'ᵀ * U' = A + alpha * x * xᵀ = A'.
+//
+// Note that when alpha is negative, the updating problem may be ill-conditioned
+// and the results may be inaccurate, or the updated matrix A' may not be
+// positive definite and not have a Cholesky factorization. SymRankOne returns
+// whether the updated matrix A' is positive definite. If the update fails
+// the receiver is left unchanged.
+//
+// SymRankOne updates a Cholesky factorization in O(n²) time. The Cholesky
+// factorization computation from scratch is O(n³).
+func (c *Cholesky) SymRankOne(orig *Cholesky, alpha float64, x Vector) (ok bool) {
+ if !orig.valid() {
+ panic(badCholesky)
+ }
+ n := orig.SymmetricDim()
+ if r, c := x.Dims(); r != n || c != 1 {
+ panic(ErrShape)
+ }
+ if orig != c {
+ if c.chol == nil {
+ c.chol = NewTriDense(n, Upper, nil)
+ } else if c.chol.mat.N != n {
+ panic(ErrShape)
+ }
+ c.chol.Copy(orig.chol)
+ }
+
+ if alpha == 0 {
+ return true
+ }
+
+ // Algorithms for updating and downdating the Cholesky factorization are
+ // described, for example, in
+ // - J. J. Dongarra, J. R. Bunch, C. B. Moler, G. W. Stewart: LINPACK
+ // Users' Guide. SIAM (1979), pages 10.10--10.14
+ // or
+ // - P. E. Gill, G. H. Golub, W. Murray, and M. A. Saunders: Methods for
+ // modifying matrix factorizations. Mathematics of Computation 28(126)
+ // (1974), Method C3 on page 521
+ //
+ // The implementation is based on LINPACK code
+ // http://www.netlib.org/linpack/dchud.f
+ // http://www.netlib.org/linpack/dchdd.f
+ // and
+ // https://icl.cs.utk.edu/lapack-forum/viewtopic.php?f=2&t=2646
+ //
+ // According to http://icl.cs.utk.edu/lapack-forum/archives/lapack/msg00301.html
+ // LINPACK is released under BSD license.
+ //
+ // See also:
+ // - M. A. Saunders: Large-scale Linear Programming Using the Cholesky
+ // Factorization. Technical Report Stanford University (1972)
+ // http://i.stanford.edu/pub/cstr/reports/cs/tr/72/252/CS-TR-72-252.pdf
+ // - Matthias Seeger: Low rank updates for the Cholesky decomposition.
+	//   EPFL Technical Report 161468 (2004)
+	//   http://infoscience.epfl.ch/record/161468
+
+	work := getFloat64s(n, false)
+	defer putFloat64s(work)
+	var xmat blas64.Vector
+	if rv, ok := x.(RawVectorer); ok {
+		xmat = rv.RawVector()
+	} else {
+		// Use a value, not a nil pointer, so that CopyVec has a usable receiver.
+		var tmp VecDense
+		tmp.CopyVec(x)
+		xmat = tmp.RawVector()
+	}
+	blas64.Copy(xmat, blas64.Vector{N: n, Data: work, Inc: 1})
+
+	if alpha > 0 {
+		// Compute rank-1 update.
+		if alpha != 1 {
+			blas64.Scal(math.Sqrt(alpha), blas64.Vector{N: n, Data: work, Inc: 1})
+		}
+		umat := c.chol.mat
+		stride := umat.Stride
+		for i := 0; i < n; i++ {
+			// Compute parameters of the Givens matrix that zeroes
+			// the i-th element of x.
+			c, s, r, _ := blas64.Rotg(umat.Data[i*stride+i], work[i])
+			if r < 0 {
+				// Multiply by -1 to have positive diagonal
+				// elements.
+				r *= -1
+				c *= -1
+				s *= -1
+			}
+			umat.Data[i*stride+i] = r
+			if i < n-1 {
+				// Multiply the extended factorization matrix by
+				// the Givens matrix from the left. Only
+				// the i-th row and x are modified.
+				blas64.Rot(
+					blas64.Vector{N: n - i - 1, Data: umat.Data[i*stride+i+1 : i*stride+n], Inc: 1},
+					blas64.Vector{N: n - i - 1, Data: work[i+1 : n], Inc: 1},
+					c, s)
+			}
+		}
+		c.updateCond(-1)
+		return true
+	}
+
+	// Compute rank-1 downdate.
+	alpha = math.Sqrt(-alpha)
+	if alpha != 1 {
+		blas64.Scal(alpha, blas64.Vector{N: n, Data: work, Inc: 1})
+	}
+	// Solve Uᵀ * p = x storing the result into work.
+	ok = lapack64.Trtrs(blas.Trans, c.chol.RawTriangular(), blas64.General{
+		Rows:   n,
+		Cols:   1,
+		Stride: 1,
+		Data:   work,
+	})
+	if !ok {
+		// The original matrix is singular. Should not happen, because
+		// the factorization is valid.
+		panic(badCholesky)
+	}
+	norm := blas64.Nrm2(blas64.Vector{N: n, Data: work, Inc: 1})
+	if norm >= 1 {
+		// The updated matrix is not positive definite.
+		return false
+	}
+	norm = math.Sqrt((1 + norm) * (1 - norm))
+	cos := getFloat64s(n, false)
+	defer putFloat64s(cos)
+	sin := getFloat64s(n, false)
+	defer putFloat64s(sin)
+	for i := n - 1; i >= 0; i-- {
+		// Compute parameters of Givens matrices that zero elements of p
+		// backwards.
+		cos[i], sin[i], norm, _ = blas64.Rotg(norm, work[i])
+		if norm < 0 {
+			norm *= -1
+			cos[i] *= -1
+			sin[i] *= -1
+		}
+	}
+	workMat := getTriDenseWorkspace(c.chol.mat.N, c.chol.triKind(), false)
+	defer putTriWorkspace(workMat)
+	workMat.Copy(c.chol)
+	umat := workMat.mat
+	stride := workMat.mat.Stride
+	for i := n - 1; i >= 0; i-- {
+		work[i] = 0
+		// Apply Givens matrices to U.
+		blas64.Rot(
+			blas64.Vector{N: n - i, Data: work[i:n], Inc: 1},
+			blas64.Vector{N: n - i, Data: umat.Data[i*stride+i : i*stride+n], Inc: 1},
+			cos[i], sin[i])
+		if umat.Data[i*stride+i] == 0 {
+			// The matrix is singular (may rarely happen due to
+			// floating-point effects?).
+			ok = false
+		} else if umat.Data[i*stride+i] < 0 {
+			// Diagonal elements should be positive. If it happens
+			// that on the i-th row the diagonal is negative,
+			// multiply U from the left by an identity matrix that
+			// has -1 on the i-th row.
+			blas64.Scal(-1, blas64.Vector{N: n - i, Data: umat.Data[i*stride+i : i*stride+n], Inc: 1})
+		}
+	}
+	if ok {
+		c.chol.Copy(workMat)
+		c.updateCond(-1)
+	}
+	return ok
+}
+
+func (c *Cholesky) valid() bool {
+	return c.chol != nil && !c.chol.IsEmpty()
+}
+
+// BandCholesky is a symmetric positive-definite band matrix represented by its
+// Cholesky decomposition.
+//
+// Note that this matrix representation is useful for certain operations, in
+// particular finding solutions to linear equations. 
It is very inefficient at +// other operations, in particular At is slow. +// +// BandCholesky methods may only be called on a value that has been successfully +// initialized by a call to Factorize that has returned true. Calls to methods +// of an unsuccessful Cholesky factorization will panic. +type BandCholesky struct { + // The chol pointer must never be retained as a pointer outside the Cholesky + // struct, either by returning chol outside the struct or by setting it to + // a pointer coming from outside. The same prohibition applies to the data + // slice within chol. + chol *TriBandDense + cond float64 +} + +// Factorize calculates the Cholesky decomposition of the matrix A and returns +// whether the matrix is positive definite. If Factorize returns false, the +// factorization must not be used. +func (ch *BandCholesky) Factorize(a SymBanded) (ok bool) { + n, k := a.SymBand() + if ch.chol == nil { + ch.chol = NewTriBandDense(n, k, Upper, nil) + } else { + ch.chol.Reset() + ch.chol.ReuseAsTriBand(n, k, Upper) + } + copySymBandIntoTriBand(ch.chol, a) + cSym := blas64.SymmetricBand{ + Uplo: blas.Upper, + N: n, + K: k, + Data: ch.chol.RawTriBand().Data, + Stride: ch.chol.RawTriBand().Stride, + } + _, ok = lapack64.Pbtrf(cSym) + if !ok { + ch.Reset() + return false + } + work := getFloat64s(3*n, false) + iwork := getInts(n, false) + aNorm := lapack64.Lansb(CondNorm, cSym, work) + ch.cond = 1 / lapack64.Pbcon(cSym, aNorm, work, iwork) + putInts(iwork) + putFloat64s(work) + return true +} + +// SolveTo finds the matrix X that solves A * X = B where A is represented by +// the Cholesky decomposition. The result is stored in-place into dst. +// If the Cholesky decomposition is singular or near-singular a Condition error +// is returned. See the documentation for Condition for more information. +func (ch *BandCholesky) SolveTo(dst *Dense, b Matrix) error { + if !ch.valid() { + panic(badCholesky) + } + br, bc := b.Dims() + if br != ch.chol.mat.N { + panic(ErrShape) + } + dst.reuseAsNonZeroed(br, bc) + if b != dst { + dst.Copy(b) + } + lapack64.Pbtrs(ch.chol.mat, dst.mat) + if ch.cond > ConditionTolerance { + return Condition(ch.cond) + } + return nil +} + +// SolveVecTo finds the vector x that solves A * x = b where A is represented by +// the Cholesky decomposition. The result is stored in-place into dst. +// If the Cholesky decomposition is singular or near-singular a Condition error +// is returned. See the documentation for Condition for more information. +func (ch *BandCholesky) SolveVecTo(dst *VecDense, b Vector) error { + if !ch.valid() { + panic(badCholesky) + } + n := ch.chol.mat.N + if br, bc := b.Dims(); br != n || bc != 1 { + panic(ErrShape) + } + if b, ok := b.(RawVectorer); ok && dst != b { + dst.checkOverlap(b.RawVector()) + } + dst.reuseAsNonZeroed(n) + if dst != b { + dst.CopyVec(b) + } + lapack64.Pbtrs(ch.chol.mat, dst.asGeneral()) + if ch.cond > ConditionTolerance { + return Condition(ch.cond) + } + return nil +} + +// Cond returns the condition number of the factorized matrix. +func (ch *BandCholesky) Cond() float64 { + if !ch.valid() { + panic(badCholesky) + } + return ch.cond +} + +// Reset resets the factorization so that it can be reused as the receiver of +// a dimensionally restricted operation. +func (ch *BandCholesky) Reset() { + if ch.chol != nil { + ch.chol.Reset() + } + ch.cond = math.Inf(1) +} + +// Dims returns the dimensions of the matrix. 
+func (ch *BandCholesky) Dims() (r, c int) {
+	n := ch.SymmetricDim()
+	return n, n
+}
+
+// At returns the element at row i, column j.
+func (ch *BandCholesky) At(i, j int) float64 {
+	n, k := ch.SymBand()
+	if uint(i) >= uint(n) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(n) {
+		panic(ErrColAccess)
+	}
+
+	if i > j {
+		i, j = j, i
+	}
+	if j-i > k {
+		return 0
+	}
+	var aij float64
+	// Use a distinct loop variable so that the bandwidth k is not shadowed.
+	for l := max(0, j-k); l <= i; l++ {
+		aij += ch.chol.at(l, i) * ch.chol.at(l, j)
+	}
+	return aij
+}
+
+// T returns the receiver, the transpose of a symmetric matrix.
+func (ch *BandCholesky) T() Matrix {
+	return ch
+}
+
+// TBand returns the receiver, the transpose of a symmetric band matrix.
+func (ch *BandCholesky) TBand() Banded {
+	return ch
+}
+
+// SymmetricDim implements the Symmetric interface and returns the number of rows
+// in the matrix (this is also the number of columns).
+func (ch *BandCholesky) SymmetricDim() int {
+	if ch.chol == nil {
+		return 0
+	}
+	n, _ := ch.chol.Triangle()
+	return n
+}
+
+// Bandwidth returns the lower and upper bandwidth values for the matrix.
+// The total bandwidth of the matrix is kl+ku+1.
+func (ch *BandCholesky) Bandwidth() (kl, ku int) {
+	_, k, _ := ch.chol.TriBand()
+	return k, k
+}
+
+// SymBand returns the number of rows/columns in the matrix, and the size of the
+// bandwidth. The total bandwidth of the matrix is 2*k+1.
+func (ch *BandCholesky) SymBand() (n, k int) {
+	n, k, _ = ch.chol.TriBand()
+	return n, k
+}
+
+// IsEmpty returns whether the receiver is empty. Empty matrices can be the
+// receiver for dimensionally restricted operations. The receiver can be emptied
+// using Reset.
+func (ch *BandCholesky) IsEmpty() bool {
+	return ch == nil || ch.chol.IsEmpty()
+}
+
+// Det returns the determinant of the matrix that has been factorized.
+func (ch *BandCholesky) Det() float64 {
+	if !ch.valid() {
+		panic(badCholesky)
+	}
+	return math.Exp(ch.LogDet())
+}
+
+// LogDet returns the log of the determinant of the matrix that has been factorized.
+func (ch *BandCholesky) LogDet() float64 {
+	if !ch.valid() {
+		panic(badCholesky)
+	}
+	var det float64
+	for i := 0; i < ch.chol.mat.N; i++ {
+		det += 2 * math.Log(ch.chol.mat.Data[i*ch.chol.mat.Stride])
+	}
+	return det
+}
+
+func (ch *BandCholesky) valid() bool {
+	return ch.chol != nil && !ch.chol.IsEmpty()
+}
+
+// PivotedCholesky is a symmetric positive semi-definite matrix represented by
+// its Cholesky factorization with complete pivoting.
+//
+// The factorization has the form
+//
+//	Pᵀ * A * P = Uᵀ * U
+//
+// where U is an upper triangular matrix and P is a permutation matrix.
+//
+// Cholesky methods may only be called on a receiver that has been initialized
+// by a call to Factorize. SolveTo and SolveVecTo methods may only be called if
+// Factorize has returned true.
+//
+// If the matrix A is certainly positive definite, then the unpivoted Cholesky
+// could be more efficient, especially for smaller matrices.
+type PivotedCholesky struct {
+	chol          *TriDense // The factor U
+	piv, pivTrans []int     // The permutation matrices P and Pᵀ
+	rank          int       // The computed rank of A
+
+	ok   bool    // Indicates whether the factorization can be used for solving linear systems
+	cond float64 // The condition number when ok is true
+}
+
+// Factorize computes the Cholesky factorization of the symmetric positive
+// semi-definite matrix A and returns whether the matrix is positive definite.
+// If Factorize returns false, the SolveTo methods must not be used.
+//
+// tol is a tolerance used to determine the computed rank of A. If it is
+// negative, a default value will be used.
+func (c *PivotedCholesky) Factorize(a Symmetric, tol float64) (ok bool) {
+	n := a.SymmetricDim()
+	c.reset(n)
+	copySymIntoTriangle(c.chol, a)
+
+	work := getFloat64s(3*c.chol.mat.N, false)
+	defer putFloat64s(work)
+
+	sym := c.chol.asSymBlas()
+	aNorm := lapack64.Lansy(CondNorm, sym, work)
+	_, c.rank, c.ok = lapack64.Pstrf(sym, c.piv, tol, work)
+	if c.ok {
+		iwork := getInts(n, false)
+		defer putInts(iwork)
+		c.cond = 1 / lapack64.Pocon(sym, aNorm, work, iwork)
+	} else {
+		for i := c.rank; i < n; i++ {
+			zero(sym.Data[i*sym.Stride+i : i*sym.Stride+n])
+		}
+	}
+	for i, p := range c.piv {
+		c.pivTrans[p] = i
+	}
+
+	return c.ok
+}
+
+// reset prepares the receiver for factorization of matrices of size n.
+func (c *PivotedCholesky) reset(n int) {
+	if c.chol == nil {
+		c.chol = NewTriDense(n, Upper, nil)
+	} else {
+		c.chol.Reset()
+		c.chol.reuseAsNonZeroed(n, Upper)
+	}
+	c.piv = useInt(c.piv, n)
+	c.pivTrans = useInt(c.pivTrans, n)
+	c.rank = 0
+	c.ok = false
+	c.cond = math.Inf(1)
+}
+
+// Dims returns the dimensions of the matrix A.
+func (ch *PivotedCholesky) Dims() (r, c int) {
+	n := ch.SymmetricDim()
+	return n, n
+}
+
+// At returns the element of A at row i, column j.
+func (c *PivotedCholesky) At(i, j int) float64 {
+	n := c.SymmetricDim()
+	if uint(i) >= uint(n) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(n) {
+		panic(ErrColAccess)
+	}
+
+	i = c.pivTrans[i]
+	j = c.pivTrans[j]
+	minij := min(min(i+1, j+1), c.rank)
+	var val float64
+	for k := 0; k < minij; k++ {
+		val += c.chol.at(k, i) * c.chol.at(k, j)
+	}
+	return val
+}
+
+// T returns the receiver, the transpose of a symmetric matrix.
+func (c *PivotedCholesky) T() Matrix {
+	return c
+}
+
+// SymmetricDim implements the Symmetric interface and returns the number of
+// rows (or columns) in the matrix.
+func (c *PivotedCholesky) SymmetricDim() int {
+	if c.chol == nil {
+		return 0
+	}
+	n, _ := c.chol.Triangle()
+	return n
+}
+
+// Rank returns the computed rank of the matrix A.
+func (c *PivotedCholesky) Rank() int {
+	if c.chol == nil {
+		panic(badCholesky)
+	}
+	return c.rank
+}
+
+// Cond returns the condition number of the factorized matrix.
+func (c *PivotedCholesky) Cond() float64 {
+	if c.chol == nil {
+		panic(badCholesky)
+	}
+	return c.cond
+}
+
+// RawU returns the Triangular matrix used to store the Cholesky factorization
+// of the original matrix A. If the returned matrix is modified, the
+// factorization is invalid and should not be used.
+//
+// If Factorize returned false, the rows of U from Rank to n will contain zeros
+// and so U will be upper trapezoidal.
+//
+// If Factorize has not been called, RawU will return nil.
+func (c *PivotedCholesky) RawU() Triangular {
+	if c.chol == nil {
+		return nil
+	}
+	return c.chol
+}
+
+// UTo stores the n×n upper triangular matrix U from the Cholesky factorization
+//
+//	Pᵀ * A * P = Uᵀ * U.
+//
+// into dst. If dst is empty, it is resized to be an n×n upper triangular
+// matrix. When dst is non-empty, UTo panics if dst is not n×n or not Upper.
+//
+// If Factorize returned false, the rows of U from Rank to n will contain zeros
+// and so U will be upper trapezoidal.
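+//
+// A minimal usage sketch (illustrative only; pc is assumed to have been
+// factorized successfully):
+//
+//	var u TriDense
+//	pc.UTo(&u) // u now holds the factor U.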
+func (c *PivotedCholesky) UTo(dst *TriDense) { + if c.chol == nil { + panic(badCholesky) + } + n := c.chol.mat.N + if dst.IsEmpty() { + dst.ReuseAsTri(n, Upper) + } else { + n2, kind := dst.Triangle() + if n != n2 { + panic(ErrShape) + } + if kind != Upper { + panic(ErrTriangle) + } + } + dst.Copy(c.chol) +} + +// ColumnPivots returns the column permutation p that represents the permutation +// matrix P from the Cholesky factorization +// +// Pᵀ * A * P = Uᵀ * U +// +// such that the nonzero entries are P[p[k],k] = 1. +func (c *PivotedCholesky) ColumnPivots(dst []int) []int { + if c.chol == nil { + panic(badCholesky) + } + n := c.chol.mat.N + if dst == nil { + dst = make([]int, n) + } + if len(dst) != n { + panic(badSliceLength) + } + copy(dst, c.piv) + return dst +} + +// SolveTo finds the matrix X that solves A * X = B where A is represented by +// the Cholesky decomposition. The result is stored in-place into dst. If the +// Cholesky decomposition is singular or near-singular, a Condition error is +// returned. See the documentation for Condition for more information. +// +// If Factorize returned false, SolveTo will panic. +func (c *PivotedCholesky) SolveTo(dst *Dense, b Matrix) error { + if !c.ok { + panic(badCholesky) + } + n := c.chol.mat.N + bm, bn := b.Dims() + if n != bm { + panic(ErrShape) + } + + dst.reuseAsNonZeroed(bm, bn) + if dst != b { + dst.Copy(b) + } + + // Permute rows of B: D = Pᵀ * B. + lapack64.Lapmr(true, dst.mat, c.piv) + // Solve Uᵀ * U * Y = D. + lapack64.Potrs(c.chol.mat, dst.mat) + // Permute rows of Y to recover the solution: X = P * Y. + lapack64.Lapmr(false, dst.mat, c.piv) + + if c.cond > ConditionTolerance { + return Condition(c.cond) + } + return nil +} + +// SolveVecTo finds the vector x that solves A * x = b where A is represented by +// the Cholesky decomposition. The result is stored in-place into dst. If the +// Cholesky decomposition is singular or near-singular, a Condition error is +// returned. See the documentation for Condition for more information. +// +// If Factorize returned false, SolveVecTo will panic. +func (c *PivotedCholesky) SolveVecTo(dst *VecDense, b Vector) error { + if !c.ok { + panic(badCholesky) + } + n := c.chol.mat.N + if br, bc := b.Dims(); br != n || bc != 1 { + panic(ErrShape) + } + if b, ok := b.(RawVectorer); ok && dst != b { + dst.checkOverlap(b.RawVector()) + } + + dst.reuseAsNonZeroed(n) + if dst != b { + dst.CopyVec(b) + } + + // Permute rows of B: D = Pᵀ * B. + lapack64.Lapmr(true, dst.asGeneral(), c.piv) + // Solve Uᵀ * U * Y = D. + lapack64.Potrs(c.chol.mat, dst.asGeneral()) + // Permute rows of Y to recover the solution: X = P * Y. + lapack64.Lapmr(false, dst.asGeneral(), c.piv) + + if c.cond > ConditionTolerance { + return Condition(c.cond) + } + return nil +} diff --git a/vendor/gonum.org/v1/gonum/mat/cmatrix.go b/vendor/gonum.org/v1/gonum/mat/cmatrix.go new file mode 100644 index 0000000000..336645751d --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/cmatrix.go @@ -0,0 +1,314 @@ +// Copyright ©2013 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "math" + "math/cmplx" + + "gonum.org/v1/gonum/blas/cblas128" + "gonum.org/v1/gonum/floats/scalar" +) + +// CMatrix is the basic matrix interface type for complex matrices. +type CMatrix interface { + // Dims returns the dimensions of a CMatrix. + Dims() (r, c int) + + // At returns the value of a matrix element at row i, column j. 
+	// It will panic if i or j are out of bounds for the matrix.
+	At(i, j int) complex128
+
+	// H returns the conjugate transpose of the CMatrix. Whether H
+	// returns a copy of the underlying data is implementation dependent.
+	// This method may be implemented using the ConjTranspose type, which
+	// provides an implicit matrix conjugate transpose.
+	H() CMatrix
+
+	// T returns the transpose of the CMatrix. Whether T returns a copy of the
+	// underlying data is implementation dependent.
+	// This method may be implemented using the CTranspose type, which
+	// provides an implicit matrix transpose.
+	T() CMatrix
+}
+
+// A RawCMatrixer can return a cblas128.General representation of the receiver. Changes to the cblas128.General.Data
+// slice will be reflected in the original matrix, changes to the Rows, Cols and Stride fields will not.
+type RawCMatrixer interface {
+	RawCMatrix() cblas128.General
+}
+
+var (
+	_ CMatrix          = ConjTranspose{}
+	_ UnConjTransposer = ConjTranspose{}
+)
+
+// ConjTranspose is a type for performing an implicit matrix conjugate transpose.
+// It implements the CMatrix interface, returning values from the conjugate
+// transpose of the matrix within.
+type ConjTranspose struct {
+	CMatrix CMatrix
+}
+
+// At returns the value of the element at row i and column j of the conjugate
+// transposed matrix, that is, row j and column i of the CMatrix field.
+func (t ConjTranspose) At(i, j int) complex128 {
+	z := t.CMatrix.At(j, i)
+	return cmplx.Conj(z)
+}
+
+// Dims returns the dimensions of the transposed matrix. The number of rows returned
+// is the number of columns in the CMatrix field, and the number of columns is
+// the number of rows in the CMatrix field.
+func (t ConjTranspose) Dims() (r, c int) {
+	c, r = t.CMatrix.Dims()
+	return r, c
+}
+
+// H performs an implicit conjugate transpose by returning the CMatrix field.
+func (t ConjTranspose) H() CMatrix {
+	return t.CMatrix
+}
+
+// T performs an implicit transpose by returning the receiver inside a
+// CTranspose.
+func (t ConjTranspose) T() CMatrix {
+	return CTranspose{t}
+}
+
+// UnConjTranspose returns the CMatrix field.
+func (t ConjTranspose) UnConjTranspose() CMatrix {
+	return t.CMatrix
+}
+
+// CTranspose is a type for performing an implicit matrix transpose.
+// It implements the CMatrix interface, returning values from the
+// transpose of the matrix within.
+type CTranspose struct {
+	CMatrix CMatrix
+}
+
+// At returns the value of the element at row i and column j of the
+// transposed matrix, that is, row j and column i of the CMatrix field.
+func (t CTranspose) At(i, j int) complex128 {
+	return t.CMatrix.At(j, i)
+}
+
+// Dims returns the dimensions of the transposed matrix. The number of rows returned
+// is the number of columns in the CMatrix field, and the number of columns is
+// the number of rows in the CMatrix field.
+func (t CTranspose) Dims() (r, c int) {
+	c, r = t.CMatrix.Dims()
+	return r, c
+}
+
+// H performs an implicit conjugate transpose by returning the receiver inside a
+// ConjTranspose.
+func (t CTranspose) H() CMatrix {
+	return ConjTranspose{t}
+}
+
+// T performs an implicit transpose by returning the CMatrix field.
+func (t CTranspose) T() CMatrix {
+	return t.CMatrix
+}
+
+// Untranspose returns the CMatrix field.
+func (t CTranspose) Untranspose() CMatrix {
+	return t.CMatrix
+}
+
+// UnConjTransposer is a type that can undo an implicit conjugate transpose.
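+//
+// A typical use is recovering the underlying matrix from a conjugate
+// transpose (illustrative only; m is any CMatrix value):
+//
+//	if uc, ok := m.(UnConjTransposer); ok {
+//		m = uc.UnConjTranspose()
+//	}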
+type UnConjTransposer interface {
+	// UnConjTranspose returns the underlying CMatrix stored for the implicit
+	// conjugate transpose.
+	UnConjTranspose() CMatrix
+
+	// Note: This interface is needed to unify all of the Conjugate types. In
+	// the cmat128 methods, we need to test if the CMatrix has been implicitly
+	// transposed. If this is checked by testing for the specific Conjugate type
+	// then the behavior will be different if the user uses H() or HTri() for a
+	// triangular matrix.
+}
+
+// CUntransposer is a type that can undo an implicit transpose.
+type CUntransposer interface {
+	// Untranspose returns the underlying CMatrix stored for the implicit
+	// transpose.
+	Untranspose() CMatrix
+
+	// Note: This interface is needed to unify all of the CTranspose types. In
+	// the cmat128 methods, we need to test if the CMatrix has been implicitly
+	// transposed. If this is checked by testing for the specific CTranspose type
+	// then the behavior will be different if the user uses T() or TTri() for a
+	// triangular matrix.
+}
+
+// useC returns a complex128 slice with l elements, using c if it
+// has the necessary capacity, otherwise creating a new slice.
+func useC(c []complex128, l int) []complex128 {
+	if l <= cap(c) {
+		return c[:l]
+	}
+	return make([]complex128, l)
+}
+
+// useZeroedC returns a complex128 slice with l elements, using c if it
+// has the necessary capacity, otherwise creating a new slice. The
+// elements of the returned slice are guaranteed to be zero.
+func useZeroedC(c []complex128, l int) []complex128 {
+	if l <= cap(c) {
+		c = c[:l]
+		zeroC(c)
+		return c
+	}
+	return make([]complex128, l)
+}
+
+// zeroC zeros the given slice's elements.
+func zeroC(c []complex128) {
+	for i := range c {
+		c[i] = 0
+	}
+}
+
+// untransposeCmplx untransposes a matrix if applicable. If a is a CUntransposer
+// or an UnConjTransposer, then untransposeCmplx returns the underlying matrix and true for
+// the kind of transpose (potentially both).
+// If it is not, then it returns the input matrix and false for trans and conj.
+func untransposeCmplx(a CMatrix) (u CMatrix, trans, conj bool) {
+	switch ut := a.(type) {
+	case CUntransposer:
+		trans = true
+		u := ut.Untranspose()
+		if uc, ok := u.(UnConjTransposer); ok {
+			return uc.UnConjTranspose(), trans, true
+		}
+		return u, trans, false
+	case UnConjTransposer:
+		conj = true
+		u := ut.UnConjTranspose()
+		if ut, ok := u.(CUntransposer); ok {
+			return ut.Untranspose(), true, conj
+		}
+		return u, false, conj
+	default:
+		return a, false, false
+	}
+}
+
+// untransposeExtractCmplx returns an untransposed matrix in a built-in matrix type.
+//
+// The untransposed matrix is returned unaltered if it is a built-in matrix type.
+// Otherwise, if it implements a Raw method, an appropriate built-in type value
+// is returned holding the raw matrix value of the input. If neither of these
+// is possible, the untransposed matrix is returned.
+func untransposeExtractCmplx(a CMatrix) (u CMatrix, trans, conj bool) {
+	ut, trans, conj := untransposeCmplx(a)
+	switch m := ut.(type) {
+	case *CDense:
+		return m, trans, conj
+	case RawCMatrixer:
+		var d CDense
+		d.SetRawCMatrix(m.RawCMatrix())
+		return &d, trans, conj
+	default:
+		return ut, trans, conj
+	}
+}
+
+// CEqual returns whether the matrices a and b have the same size
+// and are element-wise equal.
+func CEqual(a, b CMatrix) bool {
+	ar, ac := a.Dims()
+	br, bc := b.Dims()
+	if ar != br || ac != bc {
+		return false
+	}
+	// TODO(btracey): Add in fast-paths. 
+ for i := 0; i < ar; i++ { + for j := 0; j < ac; j++ { + if a.At(i, j) != b.At(i, j) { + return false + } + } + } + return true +} + +// CEqualApprox returns whether the matrices a and b have the same size and contain all equal +// elements with tolerance for element-wise equality specified by epsilon. Matrices +// with non-equal shapes are not equal. +func CEqualApprox(a, b CMatrix, epsilon float64) bool { + // TODO(btracey): + ar, ac := a.Dims() + br, bc := b.Dims() + if ar != br || ac != bc { + return false + } + for i := 0; i < ar; i++ { + for j := 0; j < ac; j++ { + if !cEqualWithinAbsOrRel(a.At(i, j), b.At(i, j), epsilon, epsilon) { + return false + } + } + } + return true +} + +// TODO(btracey): Move these into a cmplxs if/when we have one. + +func cEqualWithinAbsOrRel(a, b complex128, absTol, relTol float64) bool { + if cEqualWithinAbs(a, b, absTol) { + return true + } + return cEqualWithinRel(a, b, relTol) +} + +// cEqualWithinAbs returns true if a and b have an absolute +// difference of less than tol. +func cEqualWithinAbs(a, b complex128, tol float64) bool { + return a == b || cmplx.Abs(a-b) <= tol +} + +const minNormalFloat64 = 2.2250738585072014e-308 + +// cEqualWithinRel returns true if the difference between a and b +// is not greater than tol times the greater value. +func cEqualWithinRel(a, b complex128, tol float64) bool { + if a == b { + return true + } + if cmplx.IsNaN(a) || cmplx.IsNaN(b) { + return false + } + // Cannot play the same trick as in floats/scalar because there are multiple + // possible infinities. + if cmplx.IsInf(a) { + if !cmplx.IsInf(b) { + return false + } + ra := real(a) + if math.IsInf(ra, 0) { + if ra == real(b) { + return scalar.EqualWithinRel(imag(a), imag(b), tol) + } + return false + } + if imag(a) == imag(b) { + return scalar.EqualWithinRel(ra, real(b), tol) + } + return false + } + if cmplx.IsInf(b) { + return false + } + + delta := cmplx.Abs(a - b) + if delta <= minNormalFloat64 { + return delta <= tol*minNormalFloat64 + } + return delta/math.Max(cmplx.Abs(a), cmplx.Abs(b)) <= tol +} diff --git a/vendor/gonum.org/v1/gonum/mat/consts.go b/vendor/gonum.org/v1/gonum/mat/consts.go new file mode 100644 index 0000000000..3de3f5bf47 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/consts.go @@ -0,0 +1,15 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +// TriKind represents the triangularity of the matrix. +type TriKind bool + +const ( + // Upper specifies an upper triangular matrix. + Upper TriKind = true + // Lower specifies a lower triangular matrix. + Lower TriKind = false +) diff --git a/vendor/gonum.org/v1/gonum/mat/dense.go b/vendor/gonum.org/v1/gonum/mat/dense.go new file mode 100644 index 0000000000..b08360cc70 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/dense.go @@ -0,0 +1,670 @@ +// Copyright ©2013 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package mat + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" + "gonum.org/v1/gonum/lapack/lapack64" +) + +var ( + dense *Dense + + _ Matrix = dense + _ allMatrix = dense + _ denseMatrix = dense + _ Mutable = dense + + _ ClonerFrom = dense + _ RowViewer = dense + _ ColViewer = dense + _ RawRowViewer = dense + _ Grower = dense + + _ RawMatrixSetter = dense + _ RawMatrixer = dense + + _ Reseter = dense +) + +// Dense is a dense matrix representation. +type Dense struct { + mat blas64.General + + capRows, capCols int +} + +// NewDense creates a new Dense matrix with r rows and c columns. If data == nil, +// a new slice is allocated for the backing slice. If len(data) == r*c, data is +// used as the backing slice, and changes to the elements of the returned Dense +// will be reflected in data. If neither of these is true, NewDense will panic. +// NewDense will panic if either r or c is zero. +// +// The data must be arranged in row-major order, i.e. the (i*c + j)-th +// element in the data slice is the {i, j}-th element in the matrix. +func NewDense(r, c int, data []float64) *Dense { + if r <= 0 || c <= 0 { + if r == 0 || c == 0 { + panic(ErrZeroLength) + } + panic(ErrNegativeDimension) + } + if data != nil && r*c != len(data) { + panic(ErrShape) + } + if data == nil { + data = make([]float64, r*c) + } + return &Dense{ + mat: blas64.General{ + Rows: r, + Cols: c, + Stride: c, + Data: data, + }, + capRows: r, + capCols: c, + } +} + +// ReuseAs changes the receiver if it IsEmpty() to be of size r×c. +// +// ReuseAs re-uses the backing data slice if it has sufficient capacity, +// otherwise a new slice is allocated. The backing data is zero on return. +// +// ReuseAs panics if the receiver is not empty, and panics if +// the input sizes are less than one. To empty the receiver for re-use, +// Reset should be used. +func (m *Dense) ReuseAs(r, c int) { + if r <= 0 || c <= 0 { + if r == 0 || c == 0 { + panic(ErrZeroLength) + } + panic(ErrNegativeDimension) + } + if !m.IsEmpty() { + panic(ErrReuseNonEmpty) + } + m.reuseAsZeroed(r, c) +} + +// reuseAsNonZeroed resizes an empty matrix to a r×c matrix, +// or checks that a non-empty matrix is r×c. It does not zero +// the data in the receiver. +func (m *Dense) reuseAsNonZeroed(r, c int) { + // reuseAs must be kept in sync with reuseAsZeroed. + if m.mat.Rows > m.capRows || m.mat.Cols > m.capCols { + // Panic as a string, not a mat.Error. + panic(badCap) + } + if r == 0 || c == 0 { + panic(ErrZeroLength) + } + if m.IsEmpty() { + m.mat = blas64.General{ + Rows: r, + Cols: c, + Stride: c, + Data: use(m.mat.Data, r*c), + } + m.capRows = r + m.capCols = c + return + } + if r != m.mat.Rows || c != m.mat.Cols { + panic(ErrShape) + } +} + +// reuseAsZeroed resizes an empty matrix to a r×c matrix, +// or checks that a non-empty matrix is r×c. It zeroes +// all the elements of the matrix. +func (m *Dense) reuseAsZeroed(r, c int) { + // reuseAsZeroed must be kept in sync with reuseAsNonZeroed. + if m.mat.Rows > m.capRows || m.mat.Cols > m.capCols { + // Panic as a string, not a mat.Error. + panic(badCap) + } + if r == 0 || c == 0 { + panic(ErrZeroLength) + } + if m.IsEmpty() { + m.mat = blas64.General{ + Rows: r, + Cols: c, + Stride: c, + Data: useZeroed(m.mat.Data, r*c), + } + m.capRows = r + m.capCols = c + return + } + if r != m.mat.Rows || c != m.mat.Cols { + panic(ErrShape) + } + m.Zero() +} + +// Zero sets all of the matrix elements to zero. 
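+//
+// For example:
+//
+//	m := NewDense(2, 2, []float64{1, 2, 3, 4})
+//	m.Zero() // m is now the 2×2 zero matrix.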
+func (m *Dense) Zero() { + r := m.mat.Rows + c := m.mat.Cols + for i := 0; i < r; i++ { + zero(m.mat.Data[i*m.mat.Stride : i*m.mat.Stride+c]) + } +} + +// isolatedWorkspace returns a new dense matrix w with the size of a and +// returns a callback to defer which performs cleanup at the return of the call. +// This should be used when a method receiver is the same pointer as an input argument. +func (m *Dense) isolatedWorkspace(a Matrix) (w *Dense, restore func()) { + r, c := a.Dims() + if r == 0 || c == 0 { + panic(ErrZeroLength) + } + w = getDenseWorkspace(r, c, false) + return w, func() { + m.Copy(w) + putDenseWorkspace(w) + } +} + +// Reset empties the matrix so that it can be reused as the +// receiver of a dimensionally restricted operation. +// +// Reset should not be used when the matrix shares backing data. +// See the Reseter interface for more information. +func (m *Dense) Reset() { + // Row, Cols and Stride must be zeroed in unison. + m.mat.Rows, m.mat.Cols, m.mat.Stride = 0, 0, 0 + m.capRows, m.capCols = 0, 0 + m.mat.Data = m.mat.Data[:0] +} + +// IsEmpty returns whether the receiver is empty. Empty matrices can be the +// receiver for size-restricted operations. The receiver can be emptied using +// Reset. +func (m *Dense) IsEmpty() bool { + // It must be the case that m.Dims() returns + // zeros in this case. See comment in Reset(). + return m.mat.Stride == 0 +} + +// asTriDense returns a TriDense with the given size and side. The backing data +// of the TriDense is the same as the receiver. +func (m *Dense) asTriDense(n int, diag blas.Diag, uplo blas.Uplo) *TriDense { + return &TriDense{ + mat: blas64.Triangular{ + N: n, + Stride: m.mat.Stride, + Data: m.mat.Data, + Uplo: uplo, + Diag: diag, + }, + cap: n, + } +} + +// DenseCopyOf returns a newly allocated copy of the elements of a. +func DenseCopyOf(a Matrix) *Dense { + d := &Dense{} + d.CloneFrom(a) + return d +} + +// SetRawMatrix sets the underlying blas64.General used by the receiver. +// Changes to elements in the receiver following the call will be reflected +// in b. +func (m *Dense) SetRawMatrix(b blas64.General) { + m.capRows, m.capCols = b.Rows, b.Cols + m.mat = b +} + +// RawMatrix returns the underlying blas64.General used by the receiver. +// Changes to elements in the receiver following the call will be reflected +// in returned blas64.General. +func (m *Dense) RawMatrix() blas64.General { return m.mat } + +// Dims returns the number of rows and columns in the matrix. +func (m *Dense) Dims() (r, c int) { return m.mat.Rows, m.mat.Cols } + +// Caps returns the number of rows and columns in the backing matrix. +func (m *Dense) Caps() (r, c int) { return m.capRows, m.capCols } + +// T performs an implicit transpose by returning the receiver inside a Transpose. +func (m *Dense) T() Matrix { + return Transpose{m} +} + +// ColView returns a Vector reflecting the column j, backed by the matrix data. +// +// See ColViewer for more information. +func (m *Dense) ColView(j int) Vector { + var v VecDense + v.ColViewOf(m, j) + return &v +} + +// SetCol sets the values in the specified column of the matrix to the values +// in src. len(src) must equal the number of rows in the receiver. 
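+//
+// A minimal usage sketch:
+//
+//	m := NewDense(2, 3, nil)
+//	m.SetCol(1, []float64{5, 6}) // Column 1 becomes (5, 6)ᵀ.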
+func (m *Dense) SetCol(j int, src []float64) {
+	if j >= m.mat.Cols || j < 0 {
+		panic(ErrColAccess)
+	}
+	if len(src) != m.mat.Rows {
+		panic(ErrColLength)
+	}
+
+	blas64.Copy(
+		blas64.Vector{N: m.mat.Rows, Inc: 1, Data: src},
+		blas64.Vector{N: m.mat.Rows, Inc: m.mat.Stride, Data: m.mat.Data[j:]},
+	)
+}
+
+// SetRow sets the values in the specified row of the matrix to the values
+// in src. len(src) must equal the number of columns in the receiver.
+func (m *Dense) SetRow(i int, src []float64) {
+	if i >= m.mat.Rows || i < 0 {
+		panic(ErrRowAccess)
+	}
+	if len(src) != m.mat.Cols {
+		panic(ErrRowLength)
+	}
+
+	copy(m.rawRowView(i), src)
+}
+
+// RowView returns row i of the matrix data represented as a column vector,
+// backed by the matrix data.
+//
+// See RowViewer for more information.
+func (m *Dense) RowView(i int) Vector {
+	var v VecDense
+	v.RowViewOf(m, i)
+	return &v
+}
+
+// RawRowView returns a slice backed by the same array as backing the
+// receiver.
+func (m *Dense) RawRowView(i int) []float64 {
+	if i >= m.mat.Rows || i < 0 {
+		panic(ErrRowAccess)
+	}
+	return m.rawRowView(i)
+}
+
+func (m *Dense) rawRowView(i int) []float64 {
+	return m.mat.Data[i*m.mat.Stride : i*m.mat.Stride+m.mat.Cols]
+}
+
+// DiagView returns the diagonal as a matrix backed by the original data.
+func (m *Dense) DiagView() Diagonal {
+	n := min(m.mat.Rows, m.mat.Cols)
+	return &DiagDense{
+		mat: blas64.Vector{
+			N:    n,
+			Inc:  m.mat.Stride + 1,
+			Data: m.mat.Data[:(n-1)*m.mat.Stride+n],
+		},
+	}
+}
+
+// Slice returns a new Matrix that shares backing data with the receiver.
+// The returned matrix starts at {i,j} of the receiver and extends k-i rows
+// and l-j columns. The final row in the resulting matrix is k-1 and the
+// final column is l-1.
+// Slice panics with ErrIndexOutOfRange if the slice is outside the capacity
+// of the receiver.
+func (m *Dense) Slice(i, k, j, l int) Matrix {
+	return m.slice(i, k, j, l)
+}
+
+func (m *Dense) slice(i, k, j, l int) *Dense {
+	mr, mc := m.Caps()
+	if i < 0 || mr <= i || j < 0 || mc <= j || k < i || mr < k || l < j || mc < l {
+		if i == k || j == l {
+			panic(ErrZeroLength)
+		}
+		panic(ErrIndexOutOfRange)
+	}
+	t := *m
+	t.mat.Data = t.mat.Data[i*t.mat.Stride+j : (k-1)*t.mat.Stride+l]
+	t.mat.Rows = k - i
+	t.mat.Cols = l - j
+	t.capRows -= i
+	t.capCols -= j
+	return &t
+}
+
+// Grow returns the receiver expanded by r rows and c columns. If the dimensions
+// of the expanded matrix are outside the capacities of the receiver a new
+// allocation is made, otherwise not. Note the receiver itself is not modified
+// during the call to Grow.
+func (m *Dense) Grow(r, c int) Matrix {
+	if r < 0 || c < 0 {
+		panic(ErrIndexOutOfRange)
+	}
+	if r == 0 && c == 0 {
+		return m
+	}
+
+	r += m.mat.Rows
+	c += m.mat.Cols
+
+	var t Dense
+	switch {
+	case m.mat.Rows == 0 || m.mat.Cols == 0:
+		t.mat = blas64.General{
+			Rows:   r,
+			Cols:   c,
+			Stride: c,
+			// We zero because we don't know how the matrix will be used.
+			// In other places, the mat is immediately filled with a result;
+			// this is not the case here.
+			Data: useZeroed(m.mat.Data, r*c),
+		}
+	case r > m.capRows || c > m.capCols:
+		cr := max(r, m.capRows)
+		cc := max(c, m.capCols)
+		t.mat = blas64.General{
+			Rows:   r,
+			Cols:   c,
+			Stride: cc,
+			Data:   make([]float64, cr*cc),
+		}
+		t.capRows = cr
+		t.capCols = cc
+		// Copy the complete matrix over to the new matrix.
+		// Including elements not currently visible. Use a temporary structure
+		// to avoid modifying the receiver.
+ var tmp Dense + tmp.mat = blas64.General{ + Rows: m.mat.Rows, + Cols: m.mat.Cols, + Stride: m.mat.Stride, + Data: m.mat.Data, + } + tmp.capRows = m.capRows + tmp.capCols = m.capCols + t.Copy(&tmp) + return &t + default: + t.mat = blas64.General{ + Data: m.mat.Data[:(r-1)*m.mat.Stride+c], + Rows: r, + Cols: c, + Stride: m.mat.Stride, + } + } + t.capRows = r + t.capCols = c + return &t +} + +// CloneFrom makes a copy of a into the receiver, overwriting the previous value of +// the receiver. The clone from operation does not make any restriction on shape and +// will not cause shadowing. +// +// See the ClonerFrom interface for more information. +func (m *Dense) CloneFrom(a Matrix) { + r, c := a.Dims() + mat := blas64.General{ + Rows: r, + Cols: c, + Stride: c, + } + m.capRows, m.capCols = r, c + + aU, trans := untransposeExtract(a) + switch aU := aU.(type) { + case *Dense: + amat := aU.mat + mat.Data = make([]float64, r*c) + if trans { + for i := 0; i < r; i++ { + blas64.Copy(blas64.Vector{N: c, Inc: amat.Stride, Data: amat.Data[i : i+(c-1)*amat.Stride+1]}, + blas64.Vector{N: c, Inc: 1, Data: mat.Data[i*c : (i+1)*c]}) + } + } else { + for i := 0; i < r; i++ { + copy(mat.Data[i*c:(i+1)*c], amat.Data[i*amat.Stride:i*amat.Stride+c]) + } + } + case *VecDense: + amat := aU.mat + mat.Data = make([]float64, aU.mat.N) + blas64.Copy(blas64.Vector{N: aU.mat.N, Inc: amat.Inc, Data: amat.Data}, + blas64.Vector{N: aU.mat.N, Inc: 1, Data: mat.Data}) + default: + mat.Data = make([]float64, r*c) + w := *m + w.mat = mat + for i := 0; i < r; i++ { + for j := 0; j < c; j++ { + w.set(i, j, a.At(i, j)) + } + } + *m = w + return + } + m.mat = mat +} + +// Copy makes a copy of elements of a into the receiver. It is similar to the +// built-in copy; it copies as much as the overlap between the two matrices and +// returns the number of rows and columns it copied. If a aliases the receiver +// and is a transposed Dense or VecDense, with a non-unitary increment, Copy will +// panic. +// +// See the Copier interface for more information. +func (m *Dense) Copy(a Matrix) (r, c int) { + r, c = a.Dims() + if a == m { + return r, c + } + r = min(r, m.mat.Rows) + c = min(c, m.mat.Cols) + if r == 0 || c == 0 { + return 0, 0 + } + + aU, trans := untransposeExtract(a) + switch aU := aU.(type) { + case *Dense: + amat := aU.mat + if trans { + if amat.Stride != 1 { + m.checkOverlap(amat) + } + for i := 0; i < r; i++ { + blas64.Copy(blas64.Vector{N: c, Inc: amat.Stride, Data: amat.Data[i : i+(c-1)*amat.Stride+1]}, + blas64.Vector{N: c, Inc: 1, Data: m.mat.Data[i*m.mat.Stride : i*m.mat.Stride+c]}) + } + } else { + switch o := offset(m.mat.Data, amat.Data); { + case o < 0: + for i := r - 1; i >= 0; i-- { + copy(m.mat.Data[i*m.mat.Stride:i*m.mat.Stride+c], amat.Data[i*amat.Stride:i*amat.Stride+c]) + } + case o > 0: + for i := 0; i < r; i++ { + copy(m.mat.Data[i*m.mat.Stride:i*m.mat.Stride+c], amat.Data[i*amat.Stride:i*amat.Stride+c]) + } + default: + // Nothing to do. 
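+				// An offset of zero means m.mat.Data and amat.Data
+				// share the same leading element, so the two matrices
+				// are taken to be identical views and no copy is needed.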
+			}
+		}
+	case *VecDense:
+		var n, stride int
+		amat := aU.mat
+		if trans {
+			if amat.Inc != 1 {
+				m.checkOverlap(aU.asGeneral())
+			}
+			n = c
+			stride = 1
+		} else {
+			n = r
+			stride = m.mat.Stride
+		}
+		if amat.Inc == 1 && stride == 1 {
+			copy(m.mat.Data, amat.Data[:n])
+			break
+		}
+		switch o := offset(m.mat.Data, amat.Data); {
+		case o < 0:
+			blas64.Copy(blas64.Vector{N: n, Inc: -amat.Inc, Data: amat.Data},
+				blas64.Vector{N: n, Inc: -stride, Data: m.mat.Data})
+		case o > 0:
+			blas64.Copy(blas64.Vector{N: n, Inc: amat.Inc, Data: amat.Data},
+				blas64.Vector{N: n, Inc: stride, Data: m.mat.Data})
+		default:
+			// Nothing to do.
+		}
+	default:
+		m.checkOverlapMatrix(aU)
+		for i := 0; i < r; i++ {
+			for j := 0; j < c; j++ {
+				m.set(i, j, a.At(i, j))
+			}
+		}
+	}
+
+	return r, c
+}
+
+// Stack appends the rows of b onto the rows of a, placing the result into the
+// receiver with b placed in the greater indexed rows. Stack will panic if the
+// two input matrices do not have the same number of columns or the constructed
+// stacked matrix is not the same shape as the receiver.
+func (m *Dense) Stack(a, b Matrix) {
+	ar, ac := a.Dims()
+	br, bc := b.Dims()
+	if ac != bc || m == a || m == b {
+		panic(ErrShape)
+	}
+
+	m.reuseAsNonZeroed(ar+br, ac)
+
+	m.Copy(a)
+	w := m.slice(ar, ar+br, 0, bc)
+	w.Copy(b)
+}
+
+// Augment creates the augmented matrix of a and b, where b is placed in the
+// greater indexed columns. Augment will panic if the two input matrices do
+// not have the same number of rows or the constructed augmented matrix is
+// not the same shape as the receiver.
+func (m *Dense) Augment(a, b Matrix) {
+	ar, ac := a.Dims()
+	br, bc := b.Dims()
+	if ar != br || m == a || m == b {
+		panic(ErrShape)
+	}
+
+	m.reuseAsNonZeroed(ar, ac+bc)
+
+	m.Copy(a)
+	w := m.slice(0, br, ac, ac+bc)
+	w.Copy(b)
+}
+
+// Trace returns the trace of the matrix.
+//
+// Trace will panic with ErrSquare if the matrix is not square and with
+// ErrZeroLength if the matrix has zero size.
+func (m *Dense) Trace() float64 {
+	r, c := m.Dims()
+	if r != c {
+		panic(ErrSquare)
+	}
+	if m.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	// TODO(btracey): could use internal asm sum routine.
+	var v float64
+	for i := 0; i < m.mat.Rows; i++ {
+		v += m.mat.Data[i*m.mat.Stride+i]
+	}
+	return v
+}
+
+// Norm returns the specified norm of the receiver. Valid norms are:
+//
+//	1 - The maximum absolute column sum
+//	2 - The Frobenius norm, the square root of the sum of the squares of the elements
+//	Inf - The maximum absolute row sum
+//
+// Norm will panic with ErrNormOrder if an illegal norm is specified and with
+// ErrZeroLength if the matrix has zero size.
+func (m *Dense) Norm(norm float64) float64 {
+	if m.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	lnorm := normLapack(norm, false)
+	if lnorm == lapack.MaxColumnSum {
+		work := getFloat64s(m.mat.Cols, false)
+		defer putFloat64s(work)
+		return lapack64.Lange(lnorm, m.mat, work)
+	}
+	return lapack64.Lange(lnorm, m.mat, nil)
+}
+
+// Permutation constructs an n×n permutation matrix P from the given
+// row permutation such that the nonzero entries are P[i,p[i]] = 1.
+func (m *Dense) Permutation(n int, p []int) {
+	if len(p) != n {
+		panic(badSliceLength)
+	}
+	m.reuseAsZeroed(n, n)
+	for i, v := range p {
+		if v < 0 || v >= n {
+			panic(ErrRowAccess)
+		}
+		m.mat.Data[i*m.mat.Stride+v] = 1
+	}
+}
+
+// PermuteRows rearranges the rows of the m×n matrix A in the receiver as
+// specified by the permutation p[0],p[1],...,p[m-1] of the integers 0,...,m-1. 
+//
+// If inverse is false, the given permutation is applied:
+//
+//	A[p[i],0:n] is moved to A[i,0:n] for i=0,1,...,m-1.
+//
+// If inverse is true, the inverse permutation is applied:
+//
+//	A[i,0:n] is moved to A[p[i],0:n] for i=0,1,...,m-1.
+//
+// p must have length m, otherwise PermuteRows will panic.
+func (m *Dense) PermuteRows(p []int, inverse bool) {
+	r, _ := m.Dims()
+	if len(p) != r {
+		panic(badSliceLength)
+	}
+	lapack64.Lapmr(!inverse, m.mat, p)
+}
+
+// PermuteCols rearranges the columns of the m×n matrix A in the receiver as
+// specified by the permutation p[0],p[1],...,p[n-1] of the integers 0,...,n-1.
+//
+// If inverse is false, the given permutation is applied:
+//
+//	A[0:m,p[j]] is moved to A[0:m,j] for j = 0, 1, ..., n-1.
+//
+// If inverse is true, the inverse permutation is applied:
+//
+//	A[0:m,j] is moved to A[0:m,p[j]] for j = 0, 1, ..., n-1.
+//
+// p must have length n, otherwise PermuteCols will panic.
+func (m *Dense) PermuteCols(p []int, inverse bool) {
+	_, c := m.Dims()
+	if len(p) != c {
+		panic(badSliceLength)
+	}
+	lapack64.Lapmt(!inverse, m.mat, p)
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/dense_arithmetic.go b/vendor/gonum.org/v1/gonum/mat/dense_arithmetic.go
new file mode 100644
index 0000000000..259ee13d51
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/dense_arithmetic.go
@@ -0,0 +1,877 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack/lapack64"
+)
+
+// Add adds a and b element-wise, placing the result in the receiver. Add
+// will panic if the two matrices do not have the same shape.
+func (m *Dense) Add(a, b Matrix) {
+	ar, ac := a.Dims()
+	br, bc := b.Dims()
+	if ar != br || ac != bc {
+		panic(ErrShape)
+	}
+
+	aU, aTrans := untransposeExtract(a)
+	bU, bTrans := untransposeExtract(b)
+	m.reuseAsNonZeroed(ar, ac)
+
+	if arm, ok := a.(*Dense); ok {
+		if brm, ok := b.(*Dense); ok {
+			amat, bmat := arm.mat, brm.mat
+			if m != aU {
+				m.checkOverlap(amat)
+			}
+			if m != bU {
+				m.checkOverlap(bmat)
+			}
+			for ja, jb, jm := 0, 0, 0; ja < ar*amat.Stride; ja, jb, jm = ja+amat.Stride, jb+bmat.Stride, jm+m.mat.Stride {
+				for i, v := range amat.Data[ja : ja+ac] {
+					m.mat.Data[i+jm] = v + bmat.Data[i+jb]
+				}
+			}
+			return
+		}
+	}
+
+	m.checkOverlapMatrix(aU)
+	m.checkOverlapMatrix(bU)
+	var restore func()
+	if aTrans && m == aU {
+		m, restore = m.isolatedWorkspace(aU)
+		defer restore()
+	} else if bTrans && m == bU {
+		m, restore = m.isolatedWorkspace(bU)
+		defer restore()
+	}
+
+	for r := 0; r < ar; r++ {
+		for c := 0; c < ac; c++ {
+			m.set(r, c, a.At(r, c)+b.At(r, c))
+		}
+	}
+}
+
+// Sub subtracts the matrix b from a, placing the result in the receiver. Sub
+// will panic if the two matrices do not have the same shape.
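+//
+// A minimal usage sketch:
+//
+//	a := NewDense(2, 2, []float64{1, 2, 3, 4})
+//	b := NewDense(2, 2, []float64{4, 3, 2, 1})
+//	var c Dense
+//	c.Sub(a, b) // c is now [[-3 -1] [1 3]].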
+func (m *Dense) Sub(a, b Matrix) { + ar, ac := a.Dims() + br, bc := b.Dims() + if ar != br || ac != bc { + panic(ErrShape) + } + + aU, aTrans := untransposeExtract(a) + bU, bTrans := untransposeExtract(b) + m.reuseAsNonZeroed(ar, ac) + + if arm, ok := a.(*Dense); ok { + if brm, ok := b.(*Dense); ok { + amat, bmat := arm.mat, brm.mat + if m != aU { + m.checkOverlap(amat) + } + if m != bU { + m.checkOverlap(bmat) + } + for ja, jb, jm := 0, 0, 0; ja < ar*amat.Stride; ja, jb, jm = ja+amat.Stride, jb+bmat.Stride, jm+m.mat.Stride { + for i, v := range amat.Data[ja : ja+ac] { + m.mat.Data[i+jm] = v - bmat.Data[i+jb] + } + } + return + } + } + + m.checkOverlapMatrix(aU) + m.checkOverlapMatrix(bU) + var restore func() + if aTrans && m == aU { + m, restore = m.isolatedWorkspace(aU) + defer restore() + } else if bTrans && m == bU { + m, restore = m.isolatedWorkspace(bU) + defer restore() + } + + for r := 0; r < ar; r++ { + for c := 0; c < ac; c++ { + m.set(r, c, a.At(r, c)-b.At(r, c)) + } + } +} + +// MulElem performs element-wise multiplication of a and b, placing the result +// in the receiver. MulElem will panic if the two matrices do not have the same +// shape. +func (m *Dense) MulElem(a, b Matrix) { + ar, ac := a.Dims() + br, bc := b.Dims() + if ar != br || ac != bc { + panic(ErrShape) + } + + aU, aTrans := untransposeExtract(a) + bU, bTrans := untransposeExtract(b) + m.reuseAsNonZeroed(ar, ac) + + if arm, ok := a.(*Dense); ok { + if brm, ok := b.(*Dense); ok { + amat, bmat := arm.mat, brm.mat + if m != aU { + m.checkOverlap(amat) + } + if m != bU { + m.checkOverlap(bmat) + } + for ja, jb, jm := 0, 0, 0; ja < ar*amat.Stride; ja, jb, jm = ja+amat.Stride, jb+bmat.Stride, jm+m.mat.Stride { + for i, v := range amat.Data[ja : ja+ac] { + m.mat.Data[i+jm] = v * bmat.Data[i+jb] + } + } + return + } + } + + m.checkOverlapMatrix(aU) + m.checkOverlapMatrix(bU) + var restore func() + if aTrans && m == aU { + m, restore = m.isolatedWorkspace(aU) + defer restore() + } else if bTrans && m == bU { + m, restore = m.isolatedWorkspace(bU) + defer restore() + } + + for r := 0; r < ar; r++ { + for c := 0; c < ac; c++ { + m.set(r, c, a.At(r, c)*b.At(r, c)) + } + } +} + +// DivElem performs element-wise division of a by b, placing the result +// in the receiver. DivElem will panic if the two matrices do not have the same +// shape. +func (m *Dense) DivElem(a, b Matrix) { + ar, ac := a.Dims() + br, bc := b.Dims() + if ar != br || ac != bc { + panic(ErrShape) + } + + aU, aTrans := untransposeExtract(a) + bU, bTrans := untransposeExtract(b) + m.reuseAsNonZeroed(ar, ac) + + if arm, ok := a.(*Dense); ok { + if brm, ok := b.(*Dense); ok { + amat, bmat := arm.mat, brm.mat + if m != aU { + m.checkOverlap(amat) + } + if m != bU { + m.checkOverlap(bmat) + } + for ja, jb, jm := 0, 0, 0; ja < ar*amat.Stride; ja, jb, jm = ja+amat.Stride, jb+bmat.Stride, jm+m.mat.Stride { + for i, v := range amat.Data[ja : ja+ac] { + m.mat.Data[i+jm] = v / bmat.Data[i+jb] + } + } + return + } + } + + m.checkOverlapMatrix(aU) + m.checkOverlapMatrix(bU) + var restore func() + if aTrans && m == aU { + m, restore = m.isolatedWorkspace(aU) + defer restore() + } else if bTrans && m == bU { + m, restore = m.isolatedWorkspace(bU) + defer restore() + } + + for r := 0; r < ar; r++ { + for c := 0; c < ac; c++ { + m.set(r, c, a.At(r, c)/b.At(r, c)) + } + } +} + +// Inverse computes the inverse of the matrix a, storing the result into the +// receiver. If a is ill-conditioned, a Condition error will be returned. 
+// Note that matrix inversion is numerically unstable, and should generally +// be avoided where possible, for example by using the Solve routines. +func (m *Dense) Inverse(a Matrix) error { + // TODO(btracey): Special case for RawTriangular, etc. + r, c := a.Dims() + if r != c { + panic(ErrSquare) + } + m.reuseAsNonZeroed(a.Dims()) + aU, aTrans := untransposeExtract(a) + switch rm := aU.(type) { + case *Dense: + if m != aU || aTrans { + if m == aU || m.checkOverlap(rm.mat) { + tmp := getDenseWorkspace(r, c, false) + tmp.Copy(a) + m.Copy(tmp) + putDenseWorkspace(tmp) + break + } + m.Copy(a) + } + default: + m.Copy(a) + } + // Compute the norm of A. + work := getFloat64s(4*r, false) // Length must be at least 4*r for Gecon. + norm := lapack64.Lange(CondNorm, m.mat, work) + // Compute the LU factorization of A. + ipiv := getInts(r, false) + defer putInts(ipiv) + ok := lapack64.Getrf(m.mat, ipiv) + if !ok { + // A is exactly singular. + return Condition(math.Inf(1)) + } + // Compute the condition number of A using the LU factorization. + iwork := getInts(r, false) + defer putInts(iwork) + rcond := lapack64.Gecon(CondNorm, m.mat, norm, work, iwork) + // Compute A^{-1} from the LU factorization regardless of the value of rcond. + lapack64.Getri(m.mat, ipiv, work, -1) + if int(work[0]) > len(work) { + l := int(work[0]) + putFloat64s(work) + work = getFloat64s(l, false) + } + defer putFloat64s(work) + ok = lapack64.Getri(m.mat, ipiv, work, len(work)) + if !ok || rcond == 0 { + // A is exactly singular. + return Condition(math.Inf(1)) + } + // Check whether A is singular for computational purposes. + cond := 1 / rcond + if cond > ConditionTolerance { + return Condition(cond) + } + return nil +} + +// Mul takes the matrix product of a and b, placing the result in the receiver. +// If the number of columns in a does not equal the number of rows in b, Mul will panic. +func (m *Dense) Mul(a, b Matrix) { + ar, ac := a.Dims() + br, bc := b.Dims() + + if ac != br { + panic(ErrShape) + } + + aU, aTrans := untransposeExtract(a) + bU, bTrans := untransposeExtract(b) + m.reuseAsNonZeroed(ar, bc) + var restore func() + if m == aU { + m, restore = m.isolatedWorkspace(aU) + defer restore() + } else if m == bU { + m, restore = m.isolatedWorkspace(bU) + defer restore() + } + aT := blas.NoTrans + if aTrans { + aT = blas.Trans + } + bT := blas.NoTrans + if bTrans { + bT = blas.Trans + } + + // Some of the cases do not have a transpose option, so create + // temporary memory. + // C = Aᵀ * B = (Bᵀ * A)ᵀ + // Cᵀ = Bᵀ * A. + if aU, ok := aU.(*Dense); ok { + if restore == nil { + m.checkOverlap(aU.mat) + } + switch bU := bU.(type) { + case *Dense: + if restore == nil { + m.checkOverlap(bU.mat) + } + blas64.Gemm(aT, bT, 1, aU.mat, bU.mat, 0, m.mat) + return + + case *SymDense: + if aTrans { + c := getDenseWorkspace(ac, ar, false) + blas64.Symm(blas.Left, 1, bU.mat, aU.mat, 0, c.mat) + strictCopy(m, c.T()) + putDenseWorkspace(c) + return + } + blas64.Symm(blas.Right, 1, bU.mat, aU.mat, 0, m.mat) + return + + case *TriDense: + // Trmm updates in place, so copy aU first. 
+ if aTrans { + c := getDenseWorkspace(ac, ar, false) + var tmp Dense + tmp.SetRawMatrix(aU.mat) + c.Copy(&tmp) + bT := blas.Trans + if bTrans { + bT = blas.NoTrans + } + blas64.Trmm(blas.Left, bT, 1, bU.mat, c.mat) + strictCopy(m, c.T()) + putDenseWorkspace(c) + return + } + m.Copy(a) + blas64.Trmm(blas.Right, bT, 1, bU.mat, m.mat) + return + + case *VecDense: + m.checkOverlap(bU.asGeneral()) + bvec := bU.RawVector() + if bTrans { + // {ar,1} x {1,bc}, which is not a vector. + // Instead, construct B as a General. + bmat := blas64.General{ + Rows: bc, + Cols: 1, + Stride: bvec.Inc, + Data: bvec.Data, + } + blas64.Gemm(aT, bT, 1, aU.mat, bmat, 0, m.mat) + return + } + cvec := blas64.Vector{ + Inc: m.mat.Stride, + Data: m.mat.Data, + } + blas64.Gemv(aT, 1, aU.mat, bvec, 0, cvec) + return + } + } + if bU, ok := bU.(*Dense); ok { + if restore == nil { + m.checkOverlap(bU.mat) + } + switch aU := aU.(type) { + case *SymDense: + if bTrans { + c := getDenseWorkspace(bc, br, false) + blas64.Symm(blas.Right, 1, aU.mat, bU.mat, 0, c.mat) + strictCopy(m, c.T()) + putDenseWorkspace(c) + return + } + blas64.Symm(blas.Left, 1, aU.mat, bU.mat, 0, m.mat) + return + + case *TriDense: + // Trmm updates in place, so copy bU first. + if bTrans { + c := getDenseWorkspace(bc, br, false) + var tmp Dense + tmp.SetRawMatrix(bU.mat) + c.Copy(&tmp) + aT := blas.Trans + if aTrans { + aT = blas.NoTrans + } + blas64.Trmm(blas.Right, aT, 1, aU.mat, c.mat) + strictCopy(m, c.T()) + putDenseWorkspace(c) + return + } + m.Copy(b) + blas64.Trmm(blas.Left, aT, 1, aU.mat, m.mat) + return + + case *VecDense: + m.checkOverlap(aU.asGeneral()) + avec := aU.RawVector() + if aTrans { + // {1,ac} x {ac, bc} + // Transpose B so that the vector is on the right. + cvec := blas64.Vector{ + Inc: 1, + Data: m.mat.Data, + } + bT := blas.Trans + if bTrans { + bT = blas.NoTrans + } + blas64.Gemv(bT, 1, bU.mat, avec, 0, cvec) + return + } + // {ar,1} x {1,bc} which is not a vector result. + // Instead, construct A as a General. + amat := blas64.General{ + Rows: ar, + Cols: 1, + Stride: avec.Inc, + Data: avec.Data, + } + blas64.Gemm(aT, bT, 1, amat, bU.mat, 0, m.mat) + return + } + } + + m.checkOverlapMatrix(aU) + m.checkOverlapMatrix(bU) + row := getFloat64s(ac, false) + defer putFloat64s(row) + for r := 0; r < ar; r++ { + for i := range row { + row[i] = a.At(r, i) + } + for c := 0; c < bc; c++ { + var v float64 + for i, e := range row { + v += e * b.At(i, c) + } + m.mat.Data[r*m.mat.Stride+c] = v + } + } +} + +// strictCopy copies a into m panicking if the shape of a and m differ. +func strictCopy(m *Dense, a Matrix) { + r, c := m.Copy(a) + if r != m.mat.Rows || c != m.mat.Cols { + // Panic with a string since this + // is not a user-facing panic. + panic(ErrShape.Error()) + } +} + +// Exp calculates the exponential of the matrix a, e^a, placing the result +// in the receiver. Exp will panic with ErrShape if a is not square. +func (m *Dense) Exp(a Matrix) { + // The implementation used here is from Functions of Matrices: Theory and Computation + // Chapter 10, Algorithm 10.20. 
https://doi.org/10.1137/1.9780898717778.ch10
+
+	r, c := a.Dims()
+	if r != c {
+		panic(ErrShape)
+	}
+
+	m.reuseAsNonZeroed(r, r)
+	if r == 1 {
+		m.mat.Data[0] = math.Exp(a.At(0, 0))
+		return
+	}
+
+	pade := []struct {
+		theta float64
+		b     []float64
+	}{
+		{theta: 0.015, b: []float64{
+			120, 60, 12, 1,
+		}},
+		{theta: 0.25, b: []float64{
+			30240, 15120, 3360, 420, 30, 1,
+		}},
+		{theta: 0.95, b: []float64{
+			17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1,
+		}},
+		{theta: 2.1, b: []float64{
+			17643225600, 8821612800, 2075673600, 302702400, 30270240, 2162160, 110880, 3960, 90, 1,
+		}},
+	}
+
+	a1 := m
+	a1.Copy(a)
+	v := getDenseWorkspace(r, r, true)
+	vraw := v.RawMatrix()
+	n := r * r
+	vvec := blas64.Vector{N: n, Inc: 1, Data: vraw.Data}
+	defer putDenseWorkspace(v)
+
+	u := getDenseWorkspace(r, r, true)
+	uraw := u.RawMatrix()
+	uvec := blas64.Vector{N: n, Inc: 1, Data: uraw.Data}
+	defer putDenseWorkspace(u)
+
+	a2 := getDenseWorkspace(r, r, false)
+	defer putDenseWorkspace(a2)
+
+	n1 := Norm(a, 1)
+	for i, t := range pade {
+		if n1 > t.theta {
+			continue
+		}
+
+		// This loop only executes once, so
+		// this is not as horrible as it looks.
+		p := getDenseWorkspace(r, r, true)
+		praw := p.RawMatrix()
+		pvec := blas64.Vector{N: n, Inc: 1, Data: praw.Data}
+		defer putDenseWorkspace(p)
+
+		for k := 0; k < r; k++ {
+			p.set(k, k, 1)
+			v.set(k, k, t.b[0])
+			u.set(k, k, t.b[1])
+		}
+
+		a2.Mul(a1, a1)
+		for j := 0; j <= i; j++ {
+			p.Mul(p, a2)
+			blas64.Axpy(t.b[2*j+2], pvec, vvec)
+			blas64.Axpy(t.b[2*j+3], pvec, uvec)
+		}
+		u.Mul(a1, u)
+
+		// Use p as a workspace here and
+		// rename u for the second call's
+		// receiver.
+		vmu, vpu := u, p
+		vpu.Add(v, u)
+		vmu.Sub(v, u)
+
+		_ = m.Solve(vmu, vpu)
+		return
+	}
+
+	// Remaining Padé table line.
+	const theta13 = 5.4
+	b := [...]float64{
+		64764752532480000, 32382376266240000, 7771770303897600, 1187353796428800,
+		129060195264000, 10559470521600, 670442572800, 33522128640,
+		1323241920, 40840800, 960960, 16380, 182, 1,
+	}
+
+	s := math.Log2(n1 / theta13)
+	if s >= 0 {
+		s = math.Ceil(s)
+		a1.Scale(1/math.Pow(2, s), a1)
+	}
+	a2.Mul(a1, a1)
+
+	i := getDenseWorkspace(r, r, true)
+	for j := 0; j < r; j++ {
+		i.set(j, j, 1)
+	}
+	iraw := i.RawMatrix()
+	ivec := blas64.Vector{N: n, Inc: 1, Data: iraw.Data}
+	defer putDenseWorkspace(i)
+
+	a2raw := a2.RawMatrix()
+	a2vec := blas64.Vector{N: n, Inc: 1, Data: a2raw.Data}
+
+	a4 := getDenseWorkspace(r, r, false)
+	a4raw := a4.RawMatrix()
+	a4vec := blas64.Vector{N: n, Inc: 1, Data: a4raw.Data}
+	defer putDenseWorkspace(a4)
+	a4.Mul(a2, a2)
+
+	a6 := getDenseWorkspace(r, r, false)
+	a6raw := a6.RawMatrix()
+	a6vec := blas64.Vector{N: n, Inc: 1, Data: a6raw.Data}
+	defer putDenseWorkspace(a6)
+	a6.Mul(a2, a4)
+
+	// V = A_6(b_12*A_6 + b_10*A_4 + b_8*A_2) + b_6*A_6 + b_4*A_4 + b_2*A_2 + b_0*I
+	blas64.Axpy(b[12], a6vec, vvec)
+	blas64.Axpy(b[10], a4vec, vvec)
+	blas64.Axpy(b[8], a2vec, vvec)
+	v.Mul(v, a6)
+	blas64.Axpy(b[6], a6vec, vvec)
+	blas64.Axpy(b[4], a4vec, vvec)
+	blas64.Axpy(b[2], a2vec, vvec)
+	blas64.Axpy(b[0], ivec, vvec)
+
+	// U = A(A_6(b_13*A_6 + b_11*A_4 + b_9*A_2) + b_7*A_6 + b_5*A_4 + b_3*A_2 + b_1*I)
+	blas64.Axpy(b[13], a6vec, uvec)
+	blas64.Axpy(b[11], a4vec, uvec)
+	blas64.Axpy(b[9], a2vec, uvec)
+	u.Mul(u, a6)
+	blas64.Axpy(b[7], a6vec, uvec)
+	blas64.Axpy(b[5], a4vec, uvec)
+	blas64.Axpy(b[3], a2vec, uvec)
+	blas64.Axpy(b[1], ivec, uvec)
+	u.Mul(u, a1)
+
+	// Use i as a workspace here and
+	// rename u for the second call's
+	// receiver. 
+ vmu, vpu := u, i + vpu.Add(v, u) + vmu.Sub(v, u) + + _ = m.Solve(vmu, vpu) + + for ; s > 0; s-- { + m.Mul(m, m) + } +} + +// Pow calculates the integral power of the matrix a to n, placing the result +// in the receiver. Pow will panic if n is negative or if a is not square. +func (m *Dense) Pow(a Matrix, n int) { + if n < 0 { + panic("mat: illegal power") + } + r, c := a.Dims() + if r != c { + panic(ErrShape) + } + + m.reuseAsNonZeroed(r, c) + + // Take possible fast paths. + switch n { + case 0: + for i := 0; i < r; i++ { + zero(m.mat.Data[i*m.mat.Stride : i*m.mat.Stride+c]) + m.mat.Data[i*m.mat.Stride+i] = 1 + } + return + case 1: + m.Copy(a) + return + case 2: + m.Mul(a, a) + return + } + + // Perform iterative exponentiation by squaring in work space. + w := getDenseWorkspace(r, r, false) + w.Copy(a) + s := getDenseWorkspace(r, r, false) + s.Copy(a) + x := getDenseWorkspace(r, r, false) + for n--; n > 0; n >>= 1 { + if n&1 != 0 { + x.Mul(w, s) + w, x = x, w + } + if n != 1 { + x.Mul(s, s) + s, x = x, s + } + } + m.Copy(w) + putDenseWorkspace(w) + putDenseWorkspace(s) + putDenseWorkspace(x) +} + +// Kronecker calculates the Kronecker product of a and b, placing the result in +// the receiver. +func (m *Dense) Kronecker(a, b Matrix) { + ra, ca := a.Dims() + rb, cb := b.Dims() + + m.reuseAsNonZeroed(ra*rb, ca*cb) + for i := 0; i < ra; i++ { + for j := 0; j < ca; j++ { + m.slice(i*rb, (i+1)*rb, j*cb, (j+1)*cb).Scale(a.At(i, j), b) + } + } +} + +// Scale multiplies the elements of a by f, placing the result in the receiver. +// +// See the Scaler interface for more information. +func (m *Dense) Scale(f float64, a Matrix) { + ar, ac := a.Dims() + + m.reuseAsNonZeroed(ar, ac) + + aU, aTrans := untransposeExtract(a) + if rm, ok := aU.(*Dense); ok { + amat := rm.mat + if m == aU || m.checkOverlap(amat) { + var restore func() + m, restore = m.isolatedWorkspace(a) + defer restore() + } + if !aTrans { + for ja, jm := 0, 0; ja < ar*amat.Stride; ja, jm = ja+amat.Stride, jm+m.mat.Stride { + for i, v := range amat.Data[ja : ja+ac] { + m.mat.Data[i+jm] = v * f + } + } + } else { + for ja, jm := 0, 0; ja < ac*amat.Stride; ja, jm = ja+amat.Stride, jm+1 { + for i, v := range amat.Data[ja : ja+ar] { + m.mat.Data[i*m.mat.Stride+jm] = v * f + } + } + } + return + } + + m.checkOverlapMatrix(a) + for r := 0; r < ar; r++ { + for c := 0; c < ac; c++ { + m.set(r, c, f*a.At(r, c)) + } + } +} + +// Apply applies the function fn to each of the elements of a, placing the +// resulting matrix in the receiver. The function fn takes a row/column +// index and element value and returns some function of that tuple. 
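+//
+// For example, a minimal sketch that clips every element to be non-negative
+// (the names a and clipped are assumed for illustration, not part of this
+// package):
+//
+//	var clipped mat.Dense
+//	clipped.Apply(func(_, _ int, v float64) float64 {
+//		return math.Max(0, v)
+//	}, a)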
+func (m *Dense) Apply(fn func(i, j int, v float64) float64, a Matrix) { + ar, ac := a.Dims() + + m.reuseAsNonZeroed(ar, ac) + + aU, aTrans := untransposeExtract(a) + if rm, ok := aU.(*Dense); ok { + amat := rm.mat + if m == aU || m.checkOverlap(amat) { + var restore func() + m, restore = m.isolatedWorkspace(a) + defer restore() + } + if !aTrans { + for j, ja, jm := 0, 0, 0; ja < ar*amat.Stride; j, ja, jm = j+1, ja+amat.Stride, jm+m.mat.Stride { + for i, v := range amat.Data[ja : ja+ac] { + m.mat.Data[i+jm] = fn(j, i, v) + } + } + } else { + for j, ja, jm := 0, 0, 0; ja < ac*amat.Stride; j, ja, jm = j+1, ja+amat.Stride, jm+1 { + for i, v := range amat.Data[ja : ja+ar] { + m.mat.Data[i*m.mat.Stride+jm] = fn(i, j, v) + } + } + } + return + } + + m.checkOverlapMatrix(a) + for r := 0; r < ar; r++ { + for c := 0; c < ac; c++ { + m.set(r, c, fn(r, c, a.At(r, c))) + } + } +} + +// RankOne performs a rank-one update to the matrix a with the vectors x and +// y, where x and y are treated as column vectors. The result is stored in the +// receiver. The Outer method can be used instead of RankOne if a is not needed. +// +// m = a + alpha * x * yᵀ +func (m *Dense) RankOne(a Matrix, alpha float64, x, y Vector) { + ar, ac := a.Dims() + if x.Len() != ar { + panic(ErrShape) + } + if y.Len() != ac { + panic(ErrShape) + } + + if a != m { + aU, _ := untransposeExtract(a) + if rm, ok := aU.(*Dense); ok { + m.checkOverlap(rm.RawMatrix()) + } + } + + var xmat, ymat blas64.Vector + fast := true + xU, _ := untransposeExtract(x) + if rv, ok := xU.(*VecDense); ok { + r, c := xU.Dims() + xmat = rv.mat + m.checkOverlap(generalFromVector(xmat, r, c)) + } else { + fast = false + } + yU, _ := untransposeExtract(y) + if rv, ok := yU.(*VecDense); ok { + r, c := yU.Dims() + ymat = rv.mat + m.checkOverlap(generalFromVector(ymat, r, c)) + } else { + fast = false + } + + if fast { + if m != a { + m.reuseAsNonZeroed(ar, ac) + m.Copy(a) + } + blas64.Ger(alpha, xmat, ymat, m.mat) + return + } + + m.reuseAsNonZeroed(ar, ac) + for i := 0; i < ar; i++ { + for j := 0; j < ac; j++ { + m.set(i, j, a.At(i, j)+alpha*x.AtVec(i)*y.AtVec(j)) + } + } +} + +// Outer calculates the outer product of the vectors x and y, where x and y +// are treated as column vectors, and stores the result in the receiver. +// +// m = alpha * x * yᵀ +// +// In order to update an existing matrix, see RankOne. +func (m *Dense) Outer(alpha float64, x, y Vector) { + r, c := x.Len(), y.Len() + + m.reuseAsZeroed(r, c) + + var xmat, ymat blas64.Vector + fast := true + xU, _ := untransposeExtract(x) + if rv, ok := xU.(*VecDense); ok { + r, c := xU.Dims() + xmat = rv.mat + m.checkOverlap(generalFromVector(xmat, r, c)) + } else { + fast = false + } + yU, _ := untransposeExtract(y) + if rv, ok := yU.(*VecDense); ok { + r, c := yU.Dims() + ymat = rv.mat + m.checkOverlap(generalFromVector(ymat, r, c)) + } else { + fast = false + } + + if fast { + for i := 0; i < r; i++ { + zero(m.mat.Data[i*m.mat.Stride : i*m.mat.Stride+c]) + } + blas64.Ger(alpha, xmat, ymat, m.mat) + return + } + + for i := 0; i < r; i++ { + for j := 0; j < c; j++ { + m.set(i, j, alpha*x.AtVec(i)*y.AtVec(j)) + } + } +} diff --git a/vendor/gonum.org/v1/gonum/mat/diagonal.go b/vendor/gonum.org/v1/gonum/mat/diagonal.go new file mode 100644 index 0000000000..c42f70c831 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/diagonal.go @@ -0,0 +1,342 @@ +// Copyright ©2018 The Gonum Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" +) + +var ( + diagDense *DiagDense + _ Matrix = diagDense + _ allMatrix = diagDense + _ denseMatrix = diagDense + _ Diagonal = diagDense + _ MutableDiagonal = diagDense + _ Triangular = diagDense + _ TriBanded = diagDense + _ Symmetric = diagDense + _ SymBanded = diagDense + _ Banded = diagDense + _ RawBander = diagDense + _ RawSymBander = diagDense + + diag Diagonal + _ Matrix = diag + _ Diagonal = diag + _ Triangular = diag + _ TriBanded = diag + _ Symmetric = diag + _ SymBanded = diag + _ Banded = diag +) + +// Diagonal represents a diagonal matrix, that is a square matrix that only +// has non-zero terms on the diagonal. +type Diagonal interface { + Matrix + // Diag returns the number of rows/columns in the matrix. + Diag() int + + // The following interfaces are included in the Diagonal + // interface to allow the use of Diagonal types in + // functions operating on these types. + Banded + SymBanded + Symmetric + Triangular + TriBanded +} + +// MutableDiagonal is a Diagonal matrix whose elements can be set. +type MutableDiagonal interface { + Diagonal + SetDiag(i int, v float64) +} + +// DiagDense represents a diagonal matrix in dense storage format. +type DiagDense struct { + mat blas64.Vector +} + +// NewDiagDense creates a new Diagonal matrix with n rows and n columns. +// The length of data must be n or data must be nil, otherwise NewDiagDense +// will panic. NewDiagDense will panic if n is zero. +func NewDiagDense(n int, data []float64) *DiagDense { + if n <= 0 { + if n == 0 { + panic(ErrZeroLength) + } + panic("mat: negative dimension") + } + if data == nil { + data = make([]float64, n) + } + if len(data) != n { + panic(ErrShape) + } + return &DiagDense{ + mat: blas64.Vector{N: n, Data: data, Inc: 1}, + } +} + +// Diag returns the dimension of the receiver. +func (d *DiagDense) Diag() int { + return d.mat.N +} + +// Dims returns the dimensions of the matrix. +func (d *DiagDense) Dims() (r, c int) { + return d.mat.N, d.mat.N +} + +// T returns the transpose of the matrix. +func (d *DiagDense) T() Matrix { + return d +} + +// TTri returns the transpose of the matrix. Note that Diagonal matrices are +// Upper by default. +func (d *DiagDense) TTri() Triangular { + return TransposeTri{d} +} + +// TBand performs an implicit transpose by returning the receiver inside a +// TransposeBand. +func (d *DiagDense) TBand() Banded { + return TransposeBand{d} +} + +// TTriBand performs an implicit transpose by returning the receiver inside a +// TransposeTriBand. Note that Diagonal matrices are Upper by default. +func (d *DiagDense) TTriBand() TriBanded { + return TransposeTriBand{d} +} + +// Bandwidth returns the upper and lower bandwidths of the matrix. +// These values are always zero for diagonal matrices. +func (d *DiagDense) Bandwidth() (kl, ku int) { + return 0, 0 +} + +// SymmetricDim implements the Symmetric interface. +func (d *DiagDense) SymmetricDim() int { + return d.mat.N +} + +// SymBand returns the number of rows/columns in the matrix, and the size of +// the bandwidth. +func (d *DiagDense) SymBand() (n, k int) { + return d.mat.N, 0 +} + +// Triangle implements the Triangular interface. +func (d *DiagDense) Triangle() (int, TriKind) { + return d.mat.N, Upper +} + +// TriBand returns the number of rows/columns in the matrix, the +// size of the bandwidth, and the orientation. 
Note that Diagonal matrices are +// Upper by default. +func (d *DiagDense) TriBand() (n, k int, kind TriKind) { + return d.mat.N, 0, Upper +} + +// Reset empties the matrix so that it can be reused as the +// receiver of a dimensionally restricted operation. +// +// Reset should not be used when the matrix shares backing data. +// See the Reseter interface for more information. +func (d *DiagDense) Reset() { + // No change of Inc or n to 0 may be + // made unless both are set to 0. + d.mat.Inc = 0 + d.mat.N = 0 + d.mat.Data = d.mat.Data[:0] +} + +// Zero sets all of the matrix elements to zero. +func (d *DiagDense) Zero() { + for i := 0; i < d.mat.N; i++ { + d.mat.Data[d.mat.Inc*i] = 0 + } +} + +// DiagView returns the diagonal as a matrix backed by the original data. +func (d *DiagDense) DiagView() Diagonal { + return d +} + +// DiagFrom copies the diagonal of m into the receiver. The receiver must +// be min(r, c) long or empty, otherwise DiagFrom will panic. +func (d *DiagDense) DiagFrom(m Matrix) { + n := min(m.Dims()) + d.reuseAsNonZeroed(n) + + var vec blas64.Vector + switch r := m.(type) { + case *DiagDense: + vec = r.mat + case RawBander: + mat := r.RawBand() + vec = blas64.Vector{ + N: n, + Inc: mat.Stride, + Data: mat.Data[mat.KL : (n-1)*mat.Stride+mat.KL+1], + } + case RawMatrixer: + mat := r.RawMatrix() + vec = blas64.Vector{ + N: n, + Inc: mat.Stride + 1, + Data: mat.Data[:(n-1)*mat.Stride+n], + } + case RawSymBander: + mat := r.RawSymBand() + vec = blas64.Vector{ + N: n, + Inc: mat.Stride, + Data: mat.Data[:(n-1)*mat.Stride+1], + } + case RawSymmetricer: + mat := r.RawSymmetric() + vec = blas64.Vector{ + N: n, + Inc: mat.Stride + 1, + Data: mat.Data[:(n-1)*mat.Stride+n], + } + case RawTriBander: + mat := r.RawTriBand() + data := mat.Data + if mat.Uplo == blas.Lower { + data = data[mat.K:] + } + vec = blas64.Vector{ + N: n, + Inc: mat.Stride, + Data: data[:(n-1)*mat.Stride+1], + } + case RawTriangular: + mat := r.RawTriangular() + if mat.Diag == blas.Unit { + for i := 0; i < n; i += d.mat.Inc { + d.mat.Data[i] = 1 + } + return + } + vec = blas64.Vector{ + N: n, + Inc: mat.Stride + 1, + Data: mat.Data[:(n-1)*mat.Stride+n], + } + case RawVectorer: + d.mat.Data[0] = r.RawVector().Data[0] + return + default: + for i := 0; i < n; i++ { + d.setDiag(i, m.At(i, i)) + } + return + } + blas64.Copy(vec, d.mat) +} + +// RawBand returns the underlying data used by the receiver represented +// as a blas64.Band. +// Changes to elements in the receiver following the call will be reflected +// in returned blas64.Band. +func (d *DiagDense) RawBand() blas64.Band { + return blas64.Band{ + Rows: d.mat.N, + Cols: d.mat.N, + KL: 0, + KU: 0, + Stride: d.mat.Inc, + Data: d.mat.Data, + } +} + +// RawSymBand returns the underlying data used by the receiver represented +// as a blas64.SymmetricBand. +// Changes to elements in the receiver following the call will be reflected +// in returned blas64.Band. +func (d *DiagDense) RawSymBand() blas64.SymmetricBand { + return blas64.SymmetricBand{ + N: d.mat.N, + K: 0, + Stride: d.mat.Inc, + Uplo: blas.Upper, + Data: d.mat.Data, + } +} + +// reuseAsNonZeroed resizes an empty diagonal to a r×r diagonal, +// or checks that a non-empty matrix is r×r. +func (d *DiagDense) reuseAsNonZeroed(r int) { + if r == 0 { + panic(ErrZeroLength) + } + if d.IsEmpty() { + d.mat = blas64.Vector{ + Inc: 1, + Data: use(d.mat.Data, r), + } + d.mat.N = r + return + } + if r != d.mat.N { + panic(ErrShape) + } +} + +// IsEmpty returns whether the receiver is empty. 
Empty matrices can be the +// receiver for size-restricted operations. The receiver can be emptied using +// Reset. +func (d *DiagDense) IsEmpty() bool { + // It must be the case that d.Dims() returns + // zeros in this case. See comment in Reset(). + return d.mat.Inc == 0 +} + +// Trace returns the trace of the matrix. +// +// Trace will panic with ErrZeroLength if the matrix has zero size. +func (d *DiagDense) Trace() float64 { + if d.IsEmpty() { + panic(ErrZeroLength) + } + rb := d.RawBand() + var tr float64 + for i := 0; i < rb.Rows; i++ { + tr += rb.Data[rb.KL+i*rb.Stride] + } + return tr +} + +// Norm returns the specified norm of the receiver. Valid norms are: +// +// 1 or Inf - The maximum diagonal element magnitude +// 2 - The Frobenius norm, the square root of the sum of the squares of +// the diagonal elements +// +// Norm will panic with ErrNormOrder if an illegal norm is specified and with +// ErrZeroLength if the receiver has zero size. +func (d *DiagDense) Norm(norm float64) float64 { + if d.IsEmpty() { + panic(ErrZeroLength) + } + switch norm { + default: + panic(ErrNormOrder) + case 1, math.Inf(1): + imax := blas64.Iamax(d.mat) + return math.Abs(d.at(imax, imax)) + case 2: + return blas64.Nrm2(d.mat) + } +} diff --git a/vendor/gonum.org/v1/gonum/mat/doc.go b/vendor/gonum.org/v1/gonum/mat/doc.go new file mode 100644 index 0000000000..f8c078cfef --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/doc.go @@ -0,0 +1,200 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package mat provides implementations of float64 and complex128 matrix +// structures and linear algebra operations on them. +// +// # Overview +// +// This section provides a quick overview of the mat package. The following +// sections provide more in depth commentary. +// +// mat provides: +// - Interfaces for Matrix classes (Matrix, Symmetric, Triangular) +// - Concrete implementations (Dense, SymDense, TriDense, VecDense) +// - Methods and functions for using matrix data (Add, Trace, SymRankOne) +// - Types for constructing and using matrix factorizations (QR, LU, etc.) +// - The complementary types for complex matrices, CMatrix, CSymDense, etc. +// +// In the documentation below, we use "matrix" as a short-hand for all of +// the FooDense types implemented in this package. We use "Matrix" to +// refer to the Matrix interface. +// +// A matrix may be constructed through the corresponding New function. If no +// backing array is provided the matrix will be initialized to all zeros. +// +// // Allocate a zeroed real matrix of size 3×5 +// zero := mat.NewDense(3, 5, nil) +// +// If a backing data slice is provided, the matrix will have those elements. +// All matrices are stored in row-major format and users should consider +// this when expressing matrix arithmetic to ensure optimal performance. +// +// // Generate a 6×6 matrix of random values. +// data := make([]float64, 36) +// for i := range data { +// data[i] = rand.NormFloat64() +// } +// a := mat.NewDense(6, 6, data) +// +// Operations involving matrix data are implemented as functions when the values +// of the matrix remain unchanged +// +// tr := mat.Trace(a) +// +// and are implemented as methods when the operation modifies the receiver. 
+//
+//	zero.Copy(a)
+//
+// Note that the input arguments to most functions and methods are interfaces
+// rather than concrete types (`func Trace(Matrix)` rather than
+// `func Trace(*Dense)`), allowing flexible use of internal and external
+// Matrix types.
+//
+// When a matrix is the destination or receiver for a function or method,
+// the operation will panic if the matrix is not the correct size.
+// An exception to this is when the destination is empty (see below).
+//
+// # Empty matrix
+//
+// An empty matrix is one that has zero size. Empty matrices are used to allow
+// the destination of a matrix operation to assume the correct size automatically.
+// This operation will re-use the backing data, if available, or will allocate
+// new data if necessary. The IsEmpty method returns whether the given matrix
+// is empty. The zero-value of a matrix is empty, and is useful for easily
+// getting the result of matrix operations.
+//
+//	var c mat.Dense // construct a new zero-value matrix
+//	c.Mul(a, a)     // c is automatically adjusted to be the right size
+//
+// The Reset method can be used to revert a matrix to an empty matrix.
+// Reset should not be used when multiple different matrices share the same backing
+// data slice. This can cause unexpected data modifications after being resized.
+// An empty matrix can not be sliced even if it does have an adequately sized
+// backing data slice, but can be expanded using its Grow method if it exists.
+//
+// # The Matrix Interfaces
+//
+// The Matrix interface is the common link between the concrete types of real
+// matrices. The Matrix interface is defined by three functions: Dims, which
+// returns the dimensions of the Matrix, At, which returns the element in the
+// specified location, and T for returning a Transpose (discussed later). All of
+// the matrix types can perform these behaviors and so implement the interface.
+// Methods and functions are designed to use this interface, so in particular the method
+//
+//	func (m *Dense) Mul(a, b Matrix)
+//
+// constructs a *Dense from the result of a multiplication with any Matrix types,
+// not just *Dense. Where more restrictive requirements must be met, there are also
+// additional interfaces like Symmetric and Triangular. For example, in
+//
+//	func (s *SymDense) AddSym(a, b Symmetric)
+//
+// the Symmetric interface guarantees a symmetric result.
+//
+// The CMatrix interface plays the same role for complex matrices. The difference
+// is that the CMatrix type has the H method instead of T, for returning the
+// conjugate transpose.
+//
+// # (Conjugate) Transposes
+//
+// The T method is used for transposition on real matrices, and H is used for
+// conjugate transposition on complex matrices. For example, c.Mul(a.T(), b) computes
+// c = aᵀ * b. The mat types implement this method implicitly —
+// see the Transpose and Conjugate types for more details. Note that some
+// operations have a transpose as part of their definition, as in *SymDense.SymOuterK.
+//
+// # Matrix Factorization
+//
+// Matrix factorizations, such as the LU decomposition, typically have their own
+// specific data storage, and so are each implemented as a specific type. The
+// factorization can be computed through a call to Factorize
+//
+//	var lu mat.LU
+//	lu.Factorize(a)
+//
+// The elements of the factorization can be extracted through methods on the
+// factorized type, for example *LU.UTo. The factorization types can also be used
+// directly, as in *Cholesky.SolveTo.
Some factorizations can be updated directly, +// without needing to update the original matrix and refactorize, for example with +// *LU.RankOne. +// +// # BLAS and LAPACK +// +// BLAS and LAPACK are the standard APIs for linear algebra routines. Many +// operations in mat are implemented using calls to the wrapper functions +// in gonum/blas/blas64 and gonum/lapack/lapack64 and their complex equivalents. +// By default, blas64 and lapack64 call the native Go implementations of the +// routines. Alternatively, it is possible to use C-based implementations of the +// APIs through the respective cgo packages and the wrapper packages' "Use" +// functions. The Go implementation of LAPACK makes calls through blas64, so if +// a cgo BLAS implementation is registered, the lapack64 calls will be partially +// executed in Go and partially executed in C. +// +// # Type Switching +// +// The Matrix abstraction enables efficiency as well as interoperability. Go's +// type reflection capabilities are used to choose the most efficient routine +// given the specific concrete types. For example, in +// +// c.Mul(a, b) +// +// if a and b both implement RawMatrixer, that is, they can be represented as a +// blas64.General, blas64.Gemm (general matrix multiplication) is called, while +// instead if b is a RawSymmetricer blas64.Symm is used (general-symmetric +// multiplication), and if b is a *VecDense blas64.Gemv is used. +// +// There are many possible type combinations and special cases. No specific guarantees +// are made about the performance of any method, and in particular, note that an +// abstract matrix type may be copied into a concrete type of the corresponding +// value. If there are specific special cases that are needed, please submit a +// pull-request or file an issue. +// +// # Invariants +// +// Matrix input arguments to package functions are never directly modified. If an +// operation changes Matrix data, the mutated matrix will be the receiver of a +// method, or will be the first, dst, argument to a method named with a To suffix. +// +// For convenience, a matrix may be used as both a receiver and as an input, e.g. +// +// a.Pow(a, 6) +// v.SolveVec(a.T(), v) +// +// though in many cases this will cause an allocation (see Element Aliasing). +// An exception to this rule is Copy, which does not allow a.Copy(a.T()). +// +// # Element Aliasing +// +// Most methods in mat modify receiver data. It is forbidden for the modified +// data region of the receiver to overlap the used data area of the input +// arguments. The exception to this rule is when the method receiver is equal to one +// of the input arguments, as in the a.Pow(a, 6) call above, or its implicit transpose. +// +// This prohibition is to help avoid subtle mistakes when the method needs to read +// from and write to the same data region. There are ways to make mistakes using the +// mat API, and mat functions will detect and complain about those. +// There are many ways to make mistakes by excursion from the mat API via +// interaction with raw matrix values. +// +// If you need to read the rest of this section to understand the behavior of +// your program, you are being clever. Don't be clever. If you must be clever, +// blas64 and lapack64 may be used to call the behavior directly. 
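+//
+// For example, continuing with the 6×6 matrix a constructed in the Overview
+// above (a sketch), two views of a that share columns alias each other, and
+// using one as the destination for an operation on the other is detected:
+//
+//	b := a.Slice(0, 6, 0, 5).(*mat.Dense)
+//	c := a.Slice(0, 6, 1, 6).(*mat.Dense)
+//	c.Scale(2, b) // panics: the used regions of b and c overlap.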
+//
+// mat will use the following rules to detect overlap between the receiver and one
+// of the inputs:
+//  - the input implements one of the Raw methods, and
+//  - the address ranges of the backing data slices overlap, and
+//  - the strides differ or there is an overlap in the used data elements.
+//
+// If such an overlap is detected, the method will panic.
+//
+// The following cases will not panic:
+//  - the data slices do not overlap,
+//  - there is pointer identity between the receiver and input values after
+//    the value has been untransposed if necessary.
+//
+// mat will not attempt to detect element overlap if the input does not implement a
+// Raw method. Method behavior is undefined if there is undetected overlap.
+package mat // import "gonum.org/v1/gonum/mat"
diff --git a/vendor/gonum.org/v1/gonum/mat/eigen.go b/vendor/gonum.org/v1/gonum/mat/eigen.go
new file mode 100644
index 0000000000..859247d880
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/eigen.go
@@ -0,0 +1,450 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"gonum.org/v1/gonum/lapack"
+	"gonum.org/v1/gonum/lapack/lapack64"
+)
+
+const (
+	badFact   = "mat: use without successful factorization"
+	noVectors = "mat: eigenvectors not computed"
+)
+
+// EigenSym is a type for computing all eigenvalues and, optionally,
+// eigenvectors of a symmetric matrix A.
+//
+// It is a Symmetric matrix represented by its spectral factorization. Once
+// computed, this representation is useful for extracting eigenvalues and
+// eigenvectors, but At is slow.
+type EigenSym struct {
+	vectorsComputed bool
+
+	values  []float64
+	vectors *Dense
+}
+
+// Dims returns the dimensions of the matrix.
+func (e *EigenSym) Dims() (r, c int) {
+	n := e.SymmetricDim()
+	return n, n
+}
+
+// SymmetricDim implements the Symmetric interface.
+func (e *EigenSym) SymmetricDim() int {
+	return len(e.values)
+}
+
+// At returns the element at row i, column j of the matrix A.
+//
+// At will panic if the eigenvectors have not been computed.
+func (e *EigenSym) At(i, j int) float64 {
+	if !e.vectorsComputed {
+		panic(noVectors)
+	}
+	n, _ := e.Dims()
+	if uint(i) >= uint(n) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(n) {
+		panic(ErrColAccess)
+	}
+
+	var val float64
+	for k := 0; k < n; k++ {
+		val += e.values[k] * e.vectors.at(i, k) * e.vectors.at(j, k)
+	}
+	return val
+}
+
+// T returns the receiver, the transpose of a symmetric matrix.
+func (e *EigenSym) T() Matrix {
+	return e
+}
+
+// Factorize computes the spectral factorization (eigendecomposition) of the
+// symmetric matrix A.
+//
+// The spectral factorization of A can be written as
+//
+//	A = Q * Λ * Qᵀ
+//
+// where Λ is a diagonal matrix whose entries are the eigenvalues, and Q is an
+// orthogonal matrix whose columns are the eigenvectors.
+//
+// If vectors is false, the eigenvectors are not computed and later calls to
+// VectorsTo and At will panic.
+//
+// Factorize returns whether the factorization succeeded. If it returns false,
+// methods that require a successful factorization will panic.
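+//
+// A minimal usage sketch (s is an assumed Symmetric matrix):
+//
+//	var es mat.EigenSym
+//	if ok := es.Factorize(s, true); !ok {
+//		// handle the failed factorization
+//	}
+//	vals := es.Values(nil) // eigenvalues in ascending order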
+func (e *EigenSym) Factorize(a Symmetric, vectors bool) (ok bool) {
+	// kill previous decomposition
+	e.vectorsComputed = false
+	e.values = e.values[:0]
+
+	n := a.SymmetricDim()
+	sd := NewSymDense(n, nil)
+	sd.CopySym(a)
+
+	jobz := lapack.EVNone
+	if vectors {
+		jobz = lapack.EVCompute
+	}
+	w := make([]float64, n)
+	work := []float64{0}
+	lapack64.Syev(jobz, sd.mat, w, work, -1)
+
+	work = getFloat64s(int(work[0]), false)
+	ok = lapack64.Syev(jobz, sd.mat, w, work, len(work))
+	putFloat64s(work)
+	if !ok {
+		e.vectorsComputed = false
+		e.values = nil
+		e.vectors = nil
+		return false
+	}
+	e.vectorsComputed = vectors
+	e.values = w
+	e.vectors = NewDense(n, n, sd.mat.Data)
+	return true
+}
+
+// succFact returns whether the receiver contains a successful factorization.
+func (e *EigenSym) succFact() bool {
+	return len(e.values) != 0
+}
+
+// Values extracts the eigenvalues of the factorized n×n matrix A in ascending
+// order.
+//
+// If dst is not nil, the values are stored in-place into dst and returned,
+// otherwise a new slice is allocated first. If dst is not nil, it must have
+// length equal to n.
+//
+// If the receiver does not contain a successful factorization, Values will
+// panic.
+func (e *EigenSym) Values(dst []float64) []float64 {
+	if !e.succFact() {
+		panic(badFact)
+	}
+	if dst == nil {
+		dst = make([]float64, len(e.values))
+	}
+	if len(dst) != len(e.values) {
+		panic(ErrSliceLengthMismatch)
+	}
+	copy(dst, e.values)
+	return dst
+}
+
+// RawValues returns the slice storing the eigenvalues of A in ascending order.
+//
+// If the returned slice is modified, the factorization is invalid and should
+// not be used.
+//
+// If the receiver does not contain a successful factorization, RawValues will
+// return nil.
+func (e *EigenSym) RawValues() []float64 {
+	if !e.succFact() {
+		return nil
+	}
+	return e.values
+}
+
+// VectorsTo stores the orthonormal eigenvectors of the factorized n×n matrix A
+// into the columns of dst.
+//
+// If dst is empty, VectorsTo will resize dst to be n×n. When dst is non-empty,
+// VectorsTo will panic if dst is not n×n. VectorsTo will also panic if the
+// eigenvectors were not computed during the factorization, or if the receiver
+// does not contain a successful factorization.
+func (e *EigenSym) VectorsTo(dst *Dense) {
+	if !e.succFact() {
+		panic(badFact)
+	}
+	if !e.vectorsComputed {
+		panic(noVectors)
+	}
+	r, c := e.vectors.Dims()
+	if dst.IsEmpty() {
+		dst.ReuseAs(r, c)
+	} else {
+		r2, c2 := dst.Dims()
+		if r != r2 || c != c2 {
+			panic(ErrShape)
+		}
+	}
+	dst.Copy(e.vectors)
+}
+
+// RawQ returns the orthogonal matrix Q from the spectral factorization of the
+// original matrix A
+//
+//	A = Q * Λ * Qᵀ
+//
+// The columns of Q contain the eigenvectors of A.
+//
+// If the returned matrix is modified, the factorization is invalid and should
+// not be used.
+//
+// If the receiver does not contain a successful factorization or the
+// eigenvectors were not computed, RawQ will return nil.
+func (e *EigenSym) RawQ() Matrix {
+	if !e.succFact() || !e.vectorsComputed {
+		return nil
+	}
+	return e.vectors
+}
+
+// EigenKind specifies the computation of eigenvectors during factorization.
+type EigenKind int
+
+const (
+	// EigenNone specifies to not compute any eigenvectors.
+	EigenNone EigenKind = 0
+	// EigenLeft specifies to compute the left eigenvectors.
+	EigenLeft EigenKind = 1 << iota
+	// EigenRight specifies to compute the right eigenvectors.
+	EigenRight
+	// EigenBoth is a convenience value for computing both eigenvectors.
+	EigenBoth EigenKind = EigenLeft | EigenRight
+)
+
+// Eigen is a type for creating and using the eigenvalue decomposition of a dense matrix.
+type Eigen struct {
+	n int // The size of the factorized matrix.
+
+	kind EigenKind
+
+	values   []complex128
+	rVectors *CDense
+	lVectors *CDense
+}
+
+// succFact returns whether the receiver contains a successful factorization.
+func (e *Eigen) succFact() bool {
+	return e.n != 0
+}
+
+// Factorize computes the eigenvalues of the square matrix a, and optionally
+// the eigenvectors.
+//
+// A right eigenvalue/eigenvector combination is defined by
+//
+//	A * x_r = λ * x_r
+//
+// where x_r is the column vector called an eigenvector, and λ is the corresponding
+// eigenvalue.
+//
+// Similarly, a left eigenvalue/eigenvector combination is defined by
+//
+//	x_l * A = λ * x_l
+//
+// The eigenvalues, but not the eigenvectors, are the same for both decompositions.
+//
+// Typically eigenvectors refer to right eigenvectors.
+//
+// In all cases, Factorize computes the eigenvalues of the matrix. kind
+// specifies which of the eigenvectors, if any, to compute. See the EigenKind
+// documentation for more information.
+// Factorize panics if the input matrix is not square.
+//
+// Factorize returns whether the decomposition succeeded. If the decomposition
+// failed, methods that require a successful factorization will panic.
+func (e *Eigen) Factorize(a Matrix, kind EigenKind) (ok bool) {
+	// kill previous factorization.
+	e.n = 0
+	e.kind = 0
+	// Copy a because it is modified during the Lapack call.
+	r, c := a.Dims()
+	if r != c {
+		panic(ErrShape)
+	}
+	var sd Dense
+	sd.CloneFrom(a)
+
+	left := kind&EigenLeft != 0
+	right := kind&EigenRight != 0
+
+	var vl, vr Dense
+	jobvl := lapack.LeftEVNone
+	jobvr := lapack.RightEVNone
+	if left {
+		vl = *NewDense(r, r, nil)
+		jobvl = lapack.LeftEVCompute
+	}
+	if right {
+		vr = *NewDense(c, c, nil)
+		jobvr = lapack.RightEVCompute
+	}
+
+	wr := getFloat64s(c, false)
+	defer putFloat64s(wr)
+	wi := getFloat64s(c, false)
+	defer putFloat64s(wi)
+
+	work := []float64{0}
+	lapack64.Geev(jobvl, jobvr, sd.mat, wr, wi, vl.mat, vr.mat, work, -1)
+	work = getFloat64s(int(work[0]), false)
+	first := lapack64.Geev(jobvl, jobvr, sd.mat, wr, wi, vl.mat, vr.mat, work, len(work))
+	putFloat64s(work)
+
+	if first != 0 {
+		e.values = nil
+		return false
+	}
+	e.n = r
+	e.kind = kind
+
+	// Construct complex eigenvalues from float64 data.
+	values := make([]complex128, r)
+	for i, v := range wr {
+		values[i] = complex(v, wi[i])
+	}
+	e.values = values
+
+	// Construct complex eigenvectors from float64 data.
+	var cvl, cvr CDense
+	if left {
+		cvl = *NewCDense(r, r, nil)
+		e.complexEigenTo(&cvl, &vl)
+		e.lVectors = &cvl
+	} else {
+		e.lVectors = nil
+	}
+	if right {
+		cvr = *NewCDense(c, c, nil)
+		e.complexEigenTo(&cvr, &vr)
+		e.rVectors = &cvr
+	} else {
+		e.rVectors = nil
+	}
+	return true
+}
+
+// Kind returns the EigenKind of the decomposition. If no decomposition has been
+// computed, Kind returns -1.
+func (e *Eigen) Kind() EigenKind {
+	if !e.succFact() {
+		return -1
+	}
+	return e.kind
+}
+
+// Values extracts the eigenvalues of the factorized matrix. If dst is
+// non-nil, the values are stored in-place into dst. In this case
+// dst must have length n, otherwise Values will panic. If dst is
+// nil, then a new slice will be allocated of the proper length and
+// filled with the eigenvalues.
+//
+// Values panics if the Eigen decomposition was not successful.
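+//
+// A minimal sketch (a is an assumed square *Dense):
+//
+//	var eig mat.Eigen
+//	if ok := eig.Factorize(a, mat.EigenRight); ok {
+//		vals := eig.Values(nil) // complex eigenvalues
+//		_ = vals
+//	}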
+func (e *Eigen) Values(dst []complex128) []complex128 {
+	if !e.succFact() {
+		panic(badFact)
+	}
+	if dst == nil {
+		dst = make([]complex128, e.n)
+	}
+	if len(dst) != e.n {
+		panic(ErrSliceLengthMismatch)
+	}
+	copy(dst, e.values)
+	return dst
+}
+
+// complexEigenTo extracts the complex eigenvectors from the real matrix d
+// and stores them into the complex matrix dst.
+//
+// The columns of the n×n matrix dst contain the eigenvectors of the
+// decomposition in the same order as the eigenvalues.
+// If the j-th eigenvalue is real, then
+//
+//	dst[:,j] = d[:,j],
+//
+// and if it is not real, then the elements of the j-th and (j+1)-th columns of d
+// form complex conjugate pairs and the eigenvectors are recovered as
+//
+//	dst[:,j]   = d[:,j] + i*d[:,j+1],
+//	dst[:,j+1] = d[:,j] - i*d[:,j+1],
+//
+// where i is the imaginary unit.
+func (e *Eigen) complexEigenTo(dst *CDense, d *Dense) {
+	r, c := d.Dims()
+	cr, cc := dst.Dims()
+	if r != cr {
+		panic("size mismatch")
+	}
+	if c != cc {
+		panic("size mismatch")
+	}
+	for j := 0; j < c; j++ {
+		if imag(e.values[j]) == 0 {
+			for i := 0; i < r; i++ {
+				dst.set(i, j, complex(d.at(i, j), 0))
+			}
+			continue
+		}
+		for i := 0; i < r; i++ {
+			real := d.at(i, j)
+			imag := d.at(i, j+1)
+			dst.set(i, j, complex(real, imag))
+			dst.set(i, j+1, complex(real, -imag))
+		}
+		j++
+	}
+}
+
+// VectorsTo stores the right eigenvectors of the decomposition into the columns
+// of dst. The computed eigenvectors are normalized to have Euclidean norm equal
+// to 1 and largest component real.
+//
+// If dst is empty, VectorsTo will resize dst to be n×n. When dst is
+// non-empty, VectorsTo will panic if dst is not n×n. VectorsTo will also
+// panic if the eigenvectors were not computed during the factorization,
+// or if the receiver does not contain a successful factorization.
+func (e *Eigen) VectorsTo(dst *CDense) {
+	if !e.succFact() {
+		panic(badFact)
+	}
+	if e.kind&EigenRight == 0 {
+		panic(noVectors)
+	}
+	if dst.IsEmpty() {
+		dst.ReuseAs(e.n, e.n)
+	} else {
+		r, c := dst.Dims()
+		if r != e.n || c != e.n {
+			panic(ErrShape)
+		}
+	}
+	dst.Copy(e.rVectors)
+}
+
+// LeftVectorsTo stores the left eigenvectors of the decomposition into the
+// columns of dst. The computed eigenvectors are normalized to have Euclidean
+// norm equal to 1 and largest component real.
+//
+// If dst is empty, LeftVectorsTo will resize dst to be n×n. When dst is
+// non-empty, LeftVectorsTo will panic if dst is not n×n. LeftVectorsTo will also
+// panic if the left eigenvectors were not computed during the factorization,
+// or if the receiver does not contain a successful factorization.
+func (e *Eigen) LeftVectorsTo(dst *CDense) {
+	if !e.succFact() {
+		panic(badFact)
+	}
+	if e.kind&EigenLeft == 0 {
+		panic(noVectors)
+	}
+	if dst.IsEmpty() {
+		dst.ReuseAs(e.n, e.n)
+	} else {
+		r, c := dst.Dims()
+		if r != e.n || c != e.n {
+			panic(ErrShape)
+		}
+	}
+	dst.Copy(e.lVectors)
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/errors.go b/vendor/gonum.org/v1/gonum/mat/errors.go
new file mode 100644
index 0000000000..641d816219
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/errors.go
@@ -0,0 +1,154 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"fmt"
+	"runtime"
+
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Condition is the condition number of a matrix. The condition
+// number is defined as |A| * |A^-1|.
+// +// One important use of Condition is during linear solve routines (finding x such +// that A * x = b). The condition number of A indicates the accuracy of +// the computed solution. A Condition error will be returned if the condition +// number of A is sufficiently large. If A is exactly singular to working precision, +// Condition == ∞, and the solve algorithm may have completed early. If Condition +// is large and finite the solve algorithm will be performed, but the computed +// solution may be inaccurate. Due to the nature of finite precision arithmetic, +// the value of Condition is only an approximate test of singularity. +type Condition float64 + +func (c Condition) Error() string { + return fmt.Sprintf("matrix singular or near-singular with condition number %.4e", c) +} + +// ConditionTolerance is the tolerance limit of the condition number. If the +// condition number is above this value, the matrix is considered singular. +const ConditionTolerance = 1e16 + +const ( + // CondNorm is the matrix norm used for computing the condition number by routines + // in the matrix packages. + CondNorm = lapack.MaxRowSum + + // CondNormTrans is the norm used to compute on Aᵀ to get the same result as + // computing CondNorm on A. + CondNormTrans = lapack.MaxColumnSum +) + +const stackTraceBufferSize = 1 << 20 + +// Maybe will recover a panic with a type mat.Error from fn, and return this error +// as the Err field of an ErrorStack. The stack trace for the panicking function will be +// recovered and placed in the StackTrace field. Any other error is re-panicked. +func Maybe(fn func()) (err error) { + defer func() { + if r := recover(); r != nil { + if e, ok := r.(Error); ok { + if e.string == "" { + panic("mat: invalid error") + } + buf := make([]byte, stackTraceBufferSize) + n := runtime.Stack(buf, false) + err = ErrorStack{Err: e, StackTrace: string(buf[:n])} + return + } + panic(r) + } + }() + fn() + return +} + +// MaybeFloat will recover a panic with a type mat.Error from fn, and return this error +// as the Err field of an ErrorStack. The stack trace for the panicking function will be +// recovered and placed in the StackTrace field. Any other error is re-panicked. +func MaybeFloat(fn func() float64) (f float64, err error) { + defer func() { + if r := recover(); r != nil { + if e, ok := r.(Error); ok { + if e.string == "" { + panic("mat: invalid error") + } + buf := make([]byte, stackTraceBufferSize) + n := runtime.Stack(buf, false) + err = ErrorStack{Err: e, StackTrace: string(buf[:n])} + return + } + panic(r) + } + }() + return fn(), nil +} + +// MaybeComplex will recover a panic with a type mat.Error from fn, and return this error +// as the Err field of an ErrorStack. The stack trace for the panicking function will be +// recovered and placed in the StackTrace field. Any other error is re-panicked. +func MaybeComplex(fn func() complex128) (f complex128, err error) { + defer func() { + if r := recover(); r != nil { + if e, ok := r.(Error); ok { + if e.string == "" { + panic("mat: invalid error") + } + buf := make([]byte, stackTraceBufferSize) + n := runtime.Stack(buf, false) + err = ErrorStack{Err: e, StackTrace: string(buf[:n])} + return + } + panic(r) + } + }() + return fn(), nil +} + +// Error represents matrix handling errors. These errors can be recovered by Maybe wrappers. 
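+//
+// For example (a sketch; a and b are assumed, incompatibly sized matrices),
+// a panic from inside the package is converted into an error:
+//
+//	err := mat.Maybe(func() {
+//		var c mat.Dense
+//		c.Mul(a, b) // panics with ErrShape
+//	})
+//	// err now wraps the recovered mat.Error and a stack trace.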
+type Error struct{ string } + +func (err Error) Error() string { return err.string } + +var ( + ErrNegativeDimension = Error{"mat: negative dimension"} + ErrIndexOutOfRange = Error{"mat: index out of range"} + ErrReuseNonEmpty = Error{"mat: reuse of non-empty matrix"} + ErrRowAccess = Error{"mat: row index out of range"} + ErrColAccess = Error{"mat: column index out of range"} + ErrVectorAccess = Error{"mat: vector index out of range"} + ErrZeroLength = Error{"mat: zero length in matrix dimension"} + ErrRowLength = Error{"mat: row length mismatch"} + ErrColLength = Error{"mat: col length mismatch"} + ErrSquare = Error{"mat: expect square matrix"} + ErrNormOrder = Error{"mat: invalid norm order for matrix"} + ErrSingular = Error{"mat: matrix is singular"} + ErrShape = Error{"mat: dimension mismatch"} + ErrIllegalStride = Error{"mat: illegal stride"} + ErrPivot = Error{"mat: malformed pivot list"} + ErrTriangle = Error{"mat: triangular storage mismatch"} + ErrTriangleSet = Error{"mat: triangular set out of bounds"} + ErrBandwidth = Error{"mat: bandwidth out of range"} + ErrBandSet = Error{"mat: band set out of bounds"} + ErrDiagSet = Error{"mat: diagonal set out of bounds"} + ErrSliceLengthMismatch = Error{"mat: input slice length mismatch"} + ErrNotPSD = Error{"mat: input not positive symmetric definite"} + ErrFailedEigen = Error{"mat: eigendecomposition not successful"} +) + +// ErrorStack represents matrix handling errors that have been recovered by Maybe wrappers. +type ErrorStack struct { + Err error + + // StackTrace is the stack trace + // recovered by Maybe, MaybeFloat + // or MaybeComplex. + StackTrace string +} + +func (err ErrorStack) Error() string { return err.Err.Error() } + +const badCap = "mat: bad capacity" diff --git a/vendor/gonum.org/v1/gonum/mat/format.go b/vendor/gonum.org/v1/gonum/mat/format.go new file mode 100644 index 0000000000..c239ddd363 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/format.go @@ -0,0 +1,516 @@ +// Copyright ©2013 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "fmt" + "strconv" + "strings" +) + +// Formatted returns a fmt.Formatter for the matrix m using the given options. +func Formatted(m Matrix, options ...FormatOption) fmt.Formatter { + f := formatter{ + matrix: m, + dot: '.', + } + for _, o := range options { + o(&f) + } + return f +} + +type formatter struct { + matrix Matrix + prefix string + margin int + dot byte + squeeze bool + + format func(m Matrix, prefix string, margin int, dot byte, squeeze bool, fs fmt.State, c rune) +} + +// FormatOption is a functional option for matrix formatting. +type FormatOption func(*formatter) + +// Prefix sets the formatted prefix to the string p. Prefix is a string that is prepended to +// each line of output after the first line. +func Prefix(p string) FormatOption { + return func(f *formatter) { f.prefix = p } +} + +// Excerpt sets the maximum number of rows and columns to print at the margins of the matrix +// to m. If m is zero or less all elements are printed. +func Excerpt(m int) FormatOption { + return func(f *formatter) { f.margin = m } +} + +// DotByte sets the dot character to b. The dot character is used to replace zero elements +// if the result is printed with the fmt ' ' verb flag. Without a DotByte option, the default +// dot character is '.'. 
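+//
+// For example (a sketch, with m an assumed Matrix), printing with the ' '
+// verb flag replaces zero elements with '*':
+//
+//	fmt.Printf("% v\n", mat.Formatted(m, mat.DotByte('*')))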
+func DotByte(b byte) FormatOption { + return func(f *formatter) { f.dot = b } +} + +// Squeeze sets the printing behavior to minimise column width for each individual column. +func Squeeze() FormatOption { + return func(f *formatter) { f.squeeze = true } +} + +// FormatMATLAB sets the printing behavior to output MATLAB syntax. If MATLAB syntax is +// specified, the ' ' verb flag and Excerpt option are ignored. If the alternative syntax +// verb flag, '#' is used the matrix is formatted in rows and columns. +func FormatMATLAB() FormatOption { + return func(f *formatter) { f.format = formatMATLAB } +} + +// FormatPython sets the printing behavior to output Python syntax. If Python syntax is +// specified, the ' ' verb flag and Excerpt option are ignored. If the alternative syntax +// verb flag, '#' is used the matrix is formatted in rows and columns. +func FormatPython() FormatOption { + return func(f *formatter) { f.format = formatPython } +} + +// Format satisfies the fmt.Formatter interface. +func (f formatter) Format(fs fmt.State, c rune) { + if c == 'v' && fs.Flag('#') && f.format == nil { + fmt.Fprintf(fs, "%#v", f.matrix) + return + } + if f.format == nil { + f.format = format + } + f.format(f.matrix, f.prefix, f.margin, f.dot, f.squeeze, fs, c) +} + +// format prints a pretty representation of m to the fs io.Writer. The format character c +// specifies the numerical representation of elements; valid values are those for float64 +// specified in the fmt package, with their associated flags. In addition to this, a space +// preceding a verb indicates that zero values should be represented by the dot character. +// The printed range of the matrix can be limited by specifying a positive value for margin; +// If margin is greater than zero, only the first and last margin rows/columns of the matrix +// are output. If squeeze is true, column widths are determined on a per-column basis. +// +// format will not provide Go syntax output. +func format(m Matrix, prefix string, margin int, dot byte, squeeze bool, fs fmt.State, c rune) { + rows, cols := m.Dims() + + var printed int + if margin <= 0 { + printed = rows + if cols > printed { + printed = cols + } + } else { + printed = margin + } + + prec, pOk := fs.Precision() + if !pOk { + prec = -1 + } + + var ( + maxWidth int + widths widther + buf, pad []byte + ) + if squeeze { + widths = make(columnWidth, cols) + } else { + widths = new(uniformWidth) + } + switch c { + case 'v', 'e', 'E', 'f', 'F', 'g', 'G': + if c == 'v' { + buf, maxWidth = maxCellWidth(m, 'g', printed, prec, widths) + } else { + buf, maxWidth = maxCellWidth(m, c, printed, prec, widths) + } + default: + fmt.Fprintf(fs, "%%!%c(%T=Dims(%d, %d))", c, m, rows, cols) + return + } + width, _ := fs.Width() + width = max(width, maxWidth) + pad = make([]byte, max(width, 2)) + for i := range pad { + pad[i] = ' ' + } + + first := true + if rows > 2*printed || cols > 2*printed { + first = false + fmt.Fprintf(fs, "Dims(%d, %d)\n", rows, cols) + } + + skipZero := fs.Flag(' ') + for i := 0; i < rows; i++ { + if !first { + fmt.Fprint(fs, prefix) + } + first = false + var el string + switch { + case rows == 1: + fmt.Fprint(fs, "[") + el = "]" + case i == 0: + fmt.Fprint(fs, "⎡") + el = "⎤\n" + case i < rows-1: + fmt.Fprint(fs, "⎢") + el = "⎥\n" + default: + fmt.Fprint(fs, "⎣") + el = "⎦" + } + + for j := 0; j < cols; j++ { + if j >= printed && j < cols-printed { + j = cols - printed - 1 + if i == 0 || i == rows-1 { + fmt.Fprint(fs, "... ... 
") + } else { + fmt.Fprint(fs, " ") + } + continue + } + + v := m.At(i, j) + if v == 0 && skipZero { + buf = buf[:1] + buf[0] = dot + } else { + if c == 'v' { + buf = strconv.AppendFloat(buf[:0], v, 'g', prec, 64) + } else { + buf = strconv.AppendFloat(buf[:0], v, byte(c), prec, 64) + } + } + if fs.Flag('-') { + fs.Write(buf) + fs.Write(pad[:widths.width(j)-len(buf)]) + } else { + fs.Write(pad[:widths.width(j)-len(buf)]) + fs.Write(buf) + } + + if j < cols-1 { + fs.Write(pad[:2]) + } + } + + fmt.Fprint(fs, el) + + if i >= printed-1 && i < rows-printed && 2*printed < rows { + i = rows - printed - 1 + fmt.Fprintf(fs, "%s .\n%[1]s .\n%[1]s .\n", prefix) + continue + } + } +} + +// formatMATLAB prints a MATLAB representation of m to the fs io.Writer. The format character c +// specifies the numerical representation of elements; valid values are those for float64 +// specified in the fmt package, with their associated flags. +// The printed range of the matrix can be limited by specifying a positive value for margin; +// If squeeze is true, column widths are determined on a per-column basis. +// +// formatMATLAB will not provide Go syntax output. +func formatMATLAB(m Matrix, prefix string, _ int, _ byte, squeeze bool, fs fmt.State, c rune) { + rows, cols := m.Dims() + + prec, pOk := fs.Precision() + width, _ := fs.Width() + if !fs.Flag('#') { + switch c { + case 'v', 'e', 'E', 'f', 'F', 'g', 'G': + default: + fmt.Fprintf(fs, "%%!%c(%T=Dims(%d, %d))", c, m, rows, cols) + return + } + format := fmtString(fs, c, prec, width) + fs.Write([]byte{'['}) + for i := 0; i < rows; i++ { + if i != 0 { + fs.Write([]byte("; ")) + } + for j := 0; j < cols; j++ { + if j != 0 { + fs.Write([]byte{' '}) + } + fmt.Fprintf(fs, format, m.At(i, j)) + } + } + fs.Write([]byte{']'}) + return + } + + if !pOk { + prec = -1 + } + + printed := rows + if cols > printed { + printed = cols + } + + var ( + maxWidth int + widths widther + buf, pad []byte + ) + if squeeze { + widths = make(columnWidth, cols) + } else { + widths = new(uniformWidth) + } + switch c { + case 'v', 'e', 'E', 'f', 'F', 'g', 'G': + if c == 'v' { + buf, maxWidth = maxCellWidth(m, 'g', printed, prec, widths) + } else { + buf, maxWidth = maxCellWidth(m, c, printed, prec, widths) + } + default: + fmt.Fprintf(fs, "%%!%c(%T=Dims(%d, %d))", c, m, rows, cols) + return + } + width = max(width, maxWidth) + pad = make([]byte, max(width, 1)) + for i := range pad { + pad[i] = ' ' + } + + for i := 0; i < rows; i++ { + var el string + switch { + case rows == 1: + fmt.Fprint(fs, "[") + el = "]" + case i == 0: + fmt.Fprint(fs, "[\n"+prefix+" ") + el = "\n" + case i < rows-1: + fmt.Fprint(fs, prefix+" ") + el = "\n" + default: + fmt.Fprint(fs, prefix+" ") + el = "\n" + prefix + "]" + } + + for j := 0; j < cols; j++ { + v := m.At(i, j) + if c == 'v' { + buf = strconv.AppendFloat(buf[:0], v, 'g', prec, 64) + } else { + buf = strconv.AppendFloat(buf[:0], v, byte(c), prec, 64) + } + if fs.Flag('-') { + fs.Write(buf) + fs.Write(pad[:widths.width(j)-len(buf)]) + } else { + fs.Write(pad[:widths.width(j)-len(buf)]) + fs.Write(buf) + } + + if j < cols-1 { + fs.Write(pad[:1]) + } + } + + fmt.Fprint(fs, el) + } +} + +// formatPython prints a Python representation of m to the fs io.Writer. The format character c +// specifies the numerical representation of elements; valid values are those for float64 +// specified in the fmt package, with their associated flags. 
+// The printed range of the matrix can be limited by specifying a positive value for margin; +// If squeeze is true, column widths are determined on a per-column basis. +// +// formatPython will not provide Go syntax output. +func formatPython(m Matrix, prefix string, _ int, _ byte, squeeze bool, fs fmt.State, c rune) { + rows, cols := m.Dims() + + prec, pOk := fs.Precision() + width, _ := fs.Width() + if !fs.Flag('#') { + switch c { + case 'v', 'e', 'E', 'f', 'F', 'g', 'G': + default: + fmt.Fprintf(fs, "%%!%c(%T=Dims(%d, %d))", c, m, rows, cols) + return + } + format := fmtString(fs, c, prec, width) + fs.Write([]byte{'['}) + if rows > 1 { + fs.Write([]byte{'['}) + } + for i := 0; i < rows; i++ { + if i != 0 { + fs.Write([]byte("], [")) + } + for j := 0; j < cols; j++ { + if j != 0 { + fs.Write([]byte(", ")) + } + fmt.Fprintf(fs, format, m.At(i, j)) + } + } + if rows > 1 { + fs.Write([]byte{']'}) + } + fs.Write([]byte{']'}) + return + } + + if !pOk { + prec = -1 + } + + printed := rows + if cols > printed { + printed = cols + } + + var ( + maxWidth int + widths widther + buf, pad []byte + ) + if squeeze { + widths = make(columnWidth, cols) + } else { + widths = new(uniformWidth) + } + switch c { + case 'v', 'e', 'E', 'f', 'F', 'g', 'G': + if c == 'v' { + buf, maxWidth = maxCellWidth(m, 'g', printed, prec, widths) + } else { + buf, maxWidth = maxCellWidth(m, c, printed, prec, widths) + } + default: + fmt.Fprintf(fs, "%%!%c(%T=Dims(%d, %d))", c, m, rows, cols) + return + } + width = max(width, maxWidth) + pad = make([]byte, max(width, 1)) + for i := range pad { + pad[i] = ' ' + } + + for i := 0; i < rows; i++ { + if i != 0 { + fmt.Fprint(fs, prefix) + } + var el string + switch { + case rows == 1: + fmt.Fprint(fs, "[") + el = "]" + case i == 0: + fmt.Fprint(fs, "[[") + el = "],\n" + case i < rows-1: + fmt.Fprint(fs, " [") + el = "],\n" + default: + fmt.Fprint(fs, " [") + el = "]]" + } + + for j := 0; j < cols; j++ { + v := m.At(i, j) + if c == 'v' { + buf = strconv.AppendFloat(buf[:0], v, 'g', prec, 64) + } else { + buf = strconv.AppendFloat(buf[:0], v, byte(c), prec, 64) + } + if fs.Flag('-') { + fs.Write(buf) + fs.Write(pad[:widths.width(j)-len(buf)]) + } else { + fs.Write(pad[:widths.width(j)-len(buf)]) + fs.Write(buf) + } + + if j < cols-1 { + fs.Write([]byte{','}) + fs.Write(pad[:1]) + } + } + + fmt.Fprint(fs, el) + } +} + +// This is horrible, but it's what we have. 
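+// fmtString rebuilds a Printf-style format string (for example "%+6.3f")
+// from the flags, width and precision recorded in fs, finishing with the
+// verb c.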
+func fmtString(fs fmt.State, c rune, prec, width int) string { + var b strings.Builder + b.WriteByte('%') + for _, f := range "0+- " { + if fs.Flag(int(f)) { + b.WriteByte(byte(f)) + } + } + if width >= 0 { + fmt.Fprint(&b, width) + } + if prec >= 0 { + b.WriteByte('.') + if prec > 0 { + fmt.Fprint(&b, prec) + } + } + b.WriteRune(c) + return b.String() +} + +func maxCellWidth(m Matrix, c rune, printed, prec int, w widther) ([]byte, int) { + var ( + buf = make([]byte, 0, 64) + rows, cols = m.Dims() + max int + ) + for i := 0; i < rows; i++ { + if i >= printed-1 && i < rows-printed && 2*printed < rows { + i = rows - printed - 1 + continue + } + for j := 0; j < cols; j++ { + if j >= printed && j < cols-printed { + continue + } + + buf = strconv.AppendFloat(buf, m.At(i, j), byte(c), prec, 64) + if len(buf) > max { + max = len(buf) + } + if len(buf) > w.width(j) { + w.setWidth(j, len(buf)) + } + buf = buf[:0] + } + } + return buf, max +} + +type widther interface { + width(i int) int + setWidth(i, w int) +} + +type uniformWidth int + +func (u *uniformWidth) width(_ int) int { return int(*u) } +func (u *uniformWidth) setWidth(_, w int) { *u = uniformWidth(w) } + +type columnWidth []int + +func (c columnWidth) width(i int) int { return c[i] } +func (c columnWidth) setWidth(i, w int) { c[i] = w } diff --git a/vendor/gonum.org/v1/gonum/mat/gsvd.go b/vendor/gonum.org/v1/gonum/mat/gsvd.go new file mode 100644 index 0000000000..02286207cf --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/gsvd.go @@ -0,0 +1,436 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/floats" + "gonum.org/v1/gonum/lapack" + "gonum.org/v1/gonum/lapack/lapack64" +) + +// GSVDKind specifies the treatment of singular vectors during a GSVD +// factorization. +type GSVDKind int + +const ( + // GSVDNone specifies that no singular vectors should be computed during + // the decomposition. + GSVDNone GSVDKind = 0 + + // GSVDU specifies that the U singular vectors should be computed during + // the decomposition. + GSVDU GSVDKind = 1 << iota + // GSVDV specifies that the V singular vectors should be computed during + // the decomposition. + GSVDV + // GSVDQ specifies that the Q singular vectors should be computed during + // the decomposition. + GSVDQ + + // GSVDAll is a convenience value for computing all of the singular vectors. + GSVDAll = GSVDU | GSVDV | GSVDQ +) + +// GSVD is a type for creating and using the Generalized Singular Value Decomposition +// (GSVD) of a matrix. +// +// The factorization is a linear transformation of the data sets from the given +// variable×sample spaces to reduced and diagonalized "eigenvariable"×"eigensample" +// spaces. +type GSVD struct { + kind GSVDKind + + r, p, c, k, l int + s1, s2 []float64 + a, b, u, v, q blas64.General + + work []float64 + iwork []int +} + +// succFact returns whether the receiver contains a successful factorization. +func (gsvd *GSVD) succFact() bool { + return gsvd.r != 0 +} + +// Factorize computes the generalized singular value decomposition (GSVD) of the input +// the r×c matrix A and the p×c matrix B. The singular values of A and B are computed +// in all cases, while the singular vectors are optionally computed depending on the +// input kind. 
+//
+// The full singular value decomposition (kind == GSVDAll) deconstructs A and B as
+//
+//	A = U * Σ₁ * [ 0 R ] * Qᵀ
+//
+//	B = V * Σ₂ * [ 0 R ] * Qᵀ
+//
+// where Σ₁ and Σ₂ are r×(k+l) and p×(k+l) diagonal matrices of singular values, and
+// U, V and Q are r×r, p×p and c×c orthogonal matrices of singular vectors. k+l is the
+// effective numerical rank of the matrix [ Aᵀ Bᵀ ]ᵀ.
+//
+// It is frequently not necessary to compute the full GSVD. Computation time and
+// storage costs can be reduced using the appropriate kind. Either only the singular
+// values can be computed (kind == GSVDNone), or in conjunction with specific singular
+// vectors (kind bit set according to GSVDU, GSVDV and GSVDQ).
+//
+// Factorize returns whether the decomposition succeeded. If the decomposition
+// failed, routines that require a successful factorization will panic.
+func (gsvd *GSVD) Factorize(a, b Matrix, kind GSVDKind) (ok bool) {
+	// kill the previous decomposition
+	gsvd.r = 0
+	gsvd.kind = 0
+
+	r, c := a.Dims()
+	gsvd.r, gsvd.c = r, c
+	p, c := b.Dims()
+	gsvd.p = p
+	if gsvd.c != c {
+		panic(ErrShape)
+	}
+	var jobU, jobV, jobQ lapack.GSVDJob
+	switch {
+	default:
+		panic("gsvd: bad input kind")
+	case kind == GSVDNone:
+		jobU = lapack.GSVDNone
+		jobV = lapack.GSVDNone
+		jobQ = lapack.GSVDNone
+	case GSVDAll&kind != 0:
+		if GSVDU&kind != 0 {
+			jobU = lapack.GSVDU
+			gsvd.u = blas64.General{
+				Rows:   r,
+				Cols:   r,
+				Stride: r,
+				Data:   use(gsvd.u.Data, r*r),
+			}
+		}
+		if GSVDV&kind != 0 {
+			jobV = lapack.GSVDV
+			gsvd.v = blas64.General{
+				Rows:   p,
+				Cols:   p,
+				Stride: p,
+				Data:   use(gsvd.v.Data, p*p),
+			}
+		}
+		if GSVDQ&kind != 0 {
+			jobQ = lapack.GSVDQ
+			gsvd.q = blas64.General{
+				Rows:   c,
+				Cols:   c,
+				Stride: c,
+				Data:   use(gsvd.q.Data, c*c),
+			}
+		}
+	}
+
+	// A and B are destroyed on call, so copy the matrices.
+	aCopy := DenseCopyOf(a)
+	bCopy := DenseCopyOf(b)
+
+	gsvd.s1 = use(gsvd.s1, c)
+	gsvd.s2 = use(gsvd.s2, c)
+
+	gsvd.iwork = useInt(gsvd.iwork, c)
+
+	gsvd.work = use(gsvd.work, 1)
+	lapack64.Ggsvd3(jobU, jobV, jobQ, aCopy.mat, bCopy.mat, gsvd.s1, gsvd.s2, gsvd.u, gsvd.v, gsvd.q, gsvd.work, -1, gsvd.iwork)
+	gsvd.work = use(gsvd.work, int(gsvd.work[0]))
+	gsvd.k, gsvd.l, ok = lapack64.Ggsvd3(jobU, jobV, jobQ, aCopy.mat, bCopy.mat, gsvd.s1, gsvd.s2, gsvd.u, gsvd.v, gsvd.q, gsvd.work, len(gsvd.work), gsvd.iwork)
+	if ok {
+		gsvd.a = aCopy.mat
+		gsvd.b = bCopy.mat
+		gsvd.kind = kind
+	}
+	return ok
+}
+
+// Kind returns the GSVDKind of the decomposition. If no decomposition has been
+// computed, Kind returns -1.
+func (gsvd *GSVD) Kind() GSVDKind {
+	if !gsvd.succFact() {
+		return -1
+	}
+	return gsvd.kind
+}
+
+// Rank returns the k and l terms of the rank of [ Aᵀ Bᵀ ]ᵀ.
+func (gsvd *GSVD) Rank() (k, l int) {
+	return gsvd.k, gsvd.l
+}
+
+// GeneralizedValues returns the generalized singular values of the factorized matrices.
+// If the input slice is non-nil, the values will be stored in-place into the slice.
+// In this case, the slice must have length min(r,c)-k, and GeneralizedValues will
+// panic with ErrSliceLengthMismatch otherwise. If the input slice is nil,
+// a new slice of the appropriate length will be allocated and returned.
+//
+// GeneralizedValues will panic if the receiver does not contain a successful factorization.
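+//
+// A minimal sketch (a and b are assumed matrices with equal column counts):
+//
+//	var gsvd mat.GSVD
+//	if ok := gsvd.Factorize(a, b, mat.GSVDNone); ok {
+//		gv := gsvd.GeneralizedValues(nil)
+//		_ = gv
+//	}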
+func (gsvd *GSVD) GeneralizedValues(v []float64) []float64 {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	r := gsvd.r
+	c := gsvd.c
+	k := gsvd.k
+	d := min(r, c)
+	if v == nil {
+		v = make([]float64, d-k)
+	}
+	if len(v) != d-k {
+		panic(ErrSliceLengthMismatch)
+	}
+	floats.DivTo(v, gsvd.s1[k:d], gsvd.s2[k:d])
+	return v
+}
+
+// ValuesA returns the singular values of the factorized A matrix.
+// If the input slice is non-nil, the values will be stored in-place into the slice.
+// In this case, the slice must have length min(r,c)-k, and ValuesA will panic with
+// ErrSliceLengthMismatch otherwise. If the input slice is nil,
+// a new slice of the appropriate length will be allocated and returned.
+//
+// ValuesA will panic if the receiver does not contain a successful factorization.
+func (gsvd *GSVD) ValuesA(s []float64) []float64 {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	r := gsvd.r
+	c := gsvd.c
+	k := gsvd.k
+	d := min(r, c)
+	if s == nil {
+		s = make([]float64, d-k)
+	}
+	if len(s) != d-k {
+		panic(ErrSliceLengthMismatch)
+	}
+	copy(s, gsvd.s1[k:min(r, c)])
+	return s
+}
+
+// ValuesB returns the singular values of the factorized B matrix.
+// If the input slice is non-nil, the values will be stored in-place into the slice.
+// In this case, the slice must have length min(r,c)-k, and ValuesB will panic with
+// ErrSliceLengthMismatch otherwise. If the input slice is nil,
+// a new slice of the appropriate length will be allocated and returned.
+//
+// ValuesB will panic if the receiver does not contain a successful factorization.
+func (gsvd *GSVD) ValuesB(s []float64) []float64 {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	r := gsvd.r
+	c := gsvd.c
+	k := gsvd.k
+	d := min(r, c)
+	if s == nil {
+		s = make([]float64, d-k)
+	}
+	if len(s) != d-k {
+		panic(ErrSliceLengthMismatch)
+	}
+	copy(s, gsvd.s2[k:d])
+	return s
+}
+
+// ZeroRTo extracts the matrix [ 0 R ] from the singular value decomposition,
+// storing the result into dst. [ 0 R ] is of size (k+l)×c.
+//
+// If dst is empty, ZeroRTo will resize dst to be (k+l)×c. When dst is
+// non-empty, ZeroRTo will panic if dst is not (k+l)×c. ZeroRTo will also panic
+// if the receiver does not contain a successful factorization.
+func (gsvd *GSVD) ZeroRTo(dst *Dense) {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	r := gsvd.r
+	c := gsvd.c
+	k := gsvd.k
+	l := gsvd.l
+	h := min(k+l, r)
+	if dst.IsEmpty() {
+		dst.ReuseAs(k+l, c)
+	} else {
+		r2, c2 := dst.Dims()
+		if r2 != k+l || c != c2 {
+			panic(ErrShape)
+		}
+		dst.Zero()
+	}
+	a := Dense{
+		mat:     gsvd.a,
+		capRows: r,
+		capCols: c,
+	}
+	dst.slice(0, h, c-k-l, c).Copy(a.Slice(0, h, c-k-l, c))
+	if r < k+l {
+		b := Dense{
+			mat:     gsvd.b,
+			capRows: gsvd.p,
+			capCols: c,
+		}
+		dst.slice(r, k+l, c+r-k-l, c).Copy(b.Slice(r-k, l, c+r-k-l, c))
+	}
+}
+
+// SigmaATo extracts the matrix Σ₁ from the singular value decomposition, storing
+// the result into dst. Σ₁ is size r×(k+l).
+//
+// If dst is empty, SigmaATo will resize dst to be r×(k+l). When dst is
+// non-empty, SigmaATo will panic if dst is not r×(k+l). SigmaATo will also
+// panic if the receiver does not contain a successful factorization.
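+//
+// A reconstruction sketch, assuming the factorization was computed with
+// GSVDAll so that all extraction methods are available; the product below
+// recovers A up to floating-point rounding:
+//
+//	var u, s1, zr, q mat.Dense
+//	gsvd.UTo(&u)
+//	gsvd.SigmaATo(&s1)
+//	gsvd.ZeroRTo(&zr)
+//	gsvd.QTo(&q)
+//	var t1, t2, aRec mat.Dense
+//	t1.Mul(&u, &s1)      // U * Σ₁
+//	t2.Mul(&t1, &zr)     // U * Σ₁ * [ 0 R ]
+//	aRec.Mul(&t2, q.T()) // ≈ A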
+func (gsvd *GSVD) SigmaATo(dst *Dense) {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	r := gsvd.r
+	k := gsvd.k
+	l := gsvd.l
+	if dst.IsEmpty() {
+		dst.ReuseAs(r, k+l)
+	} else {
+		r2, c := dst.Dims()
+		if r2 != r || c != k+l {
+			panic(ErrShape)
+		}
+		dst.Zero()
+	}
+	for i := 0; i < k; i++ {
+		dst.set(i, i, 1)
+	}
+	for i := k; i < min(r, k+l); i++ {
+		dst.set(i, i, gsvd.s1[i])
+	}
+}
+
+// SigmaBTo extracts the matrix Σ₂ from the singular value decomposition, storing
+// the result into dst. Σ₂ is size p×(k+l).
+//
+// If dst is empty, SigmaBTo will resize dst to be p×(k+l). When dst is
+// non-empty, SigmaBTo will panic if dst is not p×(k+l). SigmaBTo will also
+// panic if the receiver does not contain a successful factorization.
+func (gsvd *GSVD) SigmaBTo(dst *Dense) {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	r := gsvd.r
+	p := gsvd.p
+	k := gsvd.k
+	l := gsvd.l
+	if dst.IsEmpty() {
+		dst.ReuseAs(p, k+l)
+	} else {
+		r, c := dst.Dims()
+		if r != p || c != k+l {
+			panic(ErrShape)
+		}
+		dst.Zero()
+	}
+	for i := 0; i < min(l, r-k); i++ {
+		dst.set(i, i+k, gsvd.s2[k+i])
+	}
+	for i := r - k; i < l; i++ {
+		dst.set(i, i+k, 1)
+	}
+}
+
+// UTo extracts the matrix U from the singular value decomposition, storing
+// the result into dst. U is size r×r.
+//
+// If dst is empty, UTo will resize dst to be r×r. When dst is
+// non-empty, UTo will panic if dst is not r×r. UTo will also
+// panic if the receiver does not contain a successful factorization.
+func (gsvd *GSVD) UTo(dst *Dense) {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	if gsvd.kind&GSVDU == 0 {
+		panic("mat: improper GSVD kind")
+	}
+	r := gsvd.u.Rows
+	c := gsvd.u.Cols
+	if dst.IsEmpty() {
+		dst.ReuseAs(r, c)
+	} else {
+		r2, c2 := dst.Dims()
+		if r != r2 || c != c2 {
+			panic(ErrShape)
+		}
+	}
+
+	tmp := &Dense{
+		mat:     gsvd.u,
+		capRows: r,
+		capCols: c,
+	}
+	dst.Copy(tmp)
+}
+
+// VTo extracts the matrix V from the singular value decomposition, storing
+// the result into dst. V is size p×p.
+//
+// If dst is empty, VTo will resize dst to be p×p. When dst is
+// non-empty, VTo will panic if dst is not p×p. VTo will also
+// panic if the receiver does not contain a successful factorization.
+func (gsvd *GSVD) VTo(dst *Dense) {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	if gsvd.kind&GSVDV == 0 {
+		panic("mat: improper GSVD kind")
+	}
+	r := gsvd.v.Rows
+	c := gsvd.v.Cols
+	if dst.IsEmpty() {
+		dst.ReuseAs(r, c)
+	} else {
+		r2, c2 := dst.Dims()
+		if r != r2 || c != c2 {
+			panic(ErrShape)
+		}
+	}
+
+	tmp := &Dense{
+		mat:     gsvd.v,
+		capRows: r,
+		capCols: c,
+	}
+	dst.Copy(tmp)
+}
+
+// QTo extracts the matrix Q from the singular value decomposition, storing
+// the result into dst. Q is size c×c.
+//
+// If dst is empty, QTo will resize dst to be c×c. When dst is
+// non-empty, QTo will panic if dst is not c×c. QTo will also
+// panic if the receiver does not contain a successful factorization.
+func (gsvd *GSVD) QTo(dst *Dense) {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	if gsvd.kind&GSVDQ == 0 {
+		panic("mat: improper GSVD kind")
+	}
+	r := gsvd.q.Rows
+	c := gsvd.q.Cols
+	if dst.IsEmpty() {
+		dst.ReuseAs(r, c)
+	} else {
+		r2, c2 := dst.Dims()
+		if r != r2 || c != c2 {
+			panic(ErrShape)
+		}
+	}
+
+	tmp := &Dense{
+		mat:     gsvd.q,
+		capRows: r,
+		capCols: c,
+	}
+	dst.Copy(tmp)
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/hogsvd.go b/vendor/gonum.org/v1/gonum/mat/hogsvd.go
new file mode 100644
index 0000000000..40a03315b9
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/hogsvd.go
@@ -0,0 +1,239 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"errors"
+
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// HOGSVD is a type for creating and using the Higher Order Generalized Singular Value
+// Decomposition (HOGSVD) of a set of matrices.
+//
+// The factorization is a linear transformation of the data sets from the given
+// variable×sample spaces to reduced and diagonalized "eigenvariable"×"eigensample"
+// spaces.
+type HOGSVD struct {
+	n int
+	v *Dense
+	b []Dense
+
+	err error
+}
+
+// succFact returns whether the receiver contains a successful factorization.
+func (gsvd *HOGSVD) succFact() bool {
+	return gsvd.n != 0
+}
+
+// Factorize computes the higher order generalized singular value decomposition (HOGSVD)
+// of the n input r_i×c column tall matrices in m. HOGSVD extends the GSVD case from 2 to n
+// input matrices.
+//
+//	M_0 = U_0 * Σ_0 * Vᵀ
+//	M_1 = U_1 * Σ_1 * Vᵀ
+//	.
+//	.
+//	.
+//	M_{n-1} = U_{n-1} * Σ_{n-1} * Vᵀ
+//
+// where U_i are r_i×c matrices of singular vectors, Σ_i are c×c matrices of singular
+// values, and V is a c×c matrix of singular vectors.
+//
+// Factorize returns whether the decomposition succeeded. If the decomposition
+// failed, routines that require a successful factorization will panic.
+func (gsvd *HOGSVD) Factorize(m ...Matrix) (ok bool) {
+	// Factorize performs the HOGSVD factorisation
+	// essentially as described by Ponnapalli et al.
+	// https://doi.org/10.1371/journal.pone.0028072
+
+	if len(m) < 2 {
+		panic("hogsvd: too few matrices")
+	}
+	gsvd.n = 0
+
+	r, c := m[0].Dims()
+	a := make([]Cholesky, len(m))
+	var ts SymDense
+	for i, d := range m {
+		rd, cd := d.Dims()
+		if rd < cd {
+			gsvd.err = ErrShape
+			return false
+		}
+		if rd > r {
+			r = rd
+		}
+		if cd != c {
+			panic(ErrShape)
+		}
+		ts.Reset()
+		ts.SymOuterK(1, d.T())
+		ok = a[i].Factorize(&ts)
+		if !ok {
+			gsvd.err = errors.New("hogsvd: cholesky decomposition failed")
+			return false
+		}
+	}
+
+	s := getDenseWorkspace(c, c, true)
+	defer putDenseWorkspace(s)
+	sij := getDenseWorkspace(c, c, false)
+	defer putDenseWorkspace(sij)
+	for i, ai := range a {
+		for _, aj := range a[i+1:] {
+			gsvd.err = ai.SolveCholTo(sij, &aj)
+			if gsvd.err != nil {
+				return false
+			}
+			s.Add(s, sij)
+
+			gsvd.err = aj.SolveCholTo(sij, &ai)
+			if gsvd.err != nil {
+				return false
+			}
+			s.Add(s, sij)
+		}
+	}
+	s.Scale(1/float64(len(m)*(len(m)-1)), s)
+
+	var eig Eigen
+	ok = eig.Factorize(s.T(), EigenRight)
+	if !ok {
+		gsvd.err = errors.New("hogsvd: eigen decomposition failed")
+		return false
+	}
+	var vc CDense
+	eig.VectorsTo(&vc)
+	// vc is guaranteed to have real eigenvalues.
+ rc, cc := vc.Dims() + v := NewDense(rc, cc, nil) + for i := 0; i < rc; i++ { + for j := 0; j < cc; j++ { + a := vc.At(i, j) + v.set(i, j, real(a)) + } + } + // Rescale the columns of v by their Frobenius norms. + // Work done in cv is reflected in v. + var cv VecDense + for j := 0; j < c; j++ { + cv.ColViewOf(v, j) + cv.ScaleVec(1/blas64.Nrm2(cv.mat), &cv) + } + + b := make([]Dense, len(m)) + biT := getDenseWorkspace(c, r, false) + defer putDenseWorkspace(biT) + for i, d := range m { + // All calls to reset will leave an emptied + // matrix with capacity to store the result + // without additional allocation. + biT.Reset() + gsvd.err = biT.Solve(v, d.T()) + if gsvd.err != nil { + return false + } + b[i].CloneFrom(biT.T()) + } + + gsvd.n = len(m) + gsvd.v = v + gsvd.b = b + return true +} + +// Err returns the reason for a factorization failure. +func (gsvd *HOGSVD) Err() error { + return gsvd.err +} + +// Len returns the number of matrices that have been factorized. If Len returns +// zero, the factorization was not successful. +func (gsvd *HOGSVD) Len() int { + return gsvd.n +} + +// UTo extracts the matrix U_n from the singular value decomposition, storing +// the result in-place into dst. U_n is size r×c. +// +// If dst is empty, UTo will resize dst to be r×c. When dst is +// non-empty, UTo will panic if dst is not r×c. UTo will also +// panic if the receiver does not contain a successful factorization. +func (gsvd *HOGSVD) UTo(dst *Dense, n int) { + if !gsvd.succFact() { + panic(badFact) + } + if n < 0 || gsvd.n <= n { + panic("hogsvd: invalid index") + } + r, c := gsvd.b[n].Dims() + if dst.IsEmpty() { + dst.ReuseAs(r, c) + } else { + r2, c2 := dst.Dims() + if r != r2 || c != c2 { + panic(ErrShape) + } + } + dst.Copy(&gsvd.b[n]) + var v VecDense + for j, f := range gsvd.Values(nil, n) { + v.ColViewOf(dst, j) + v.ScaleVec(1/f, &v) + } +} + +// Values returns the nth set of singular values of the factorized system. +// If the input slice is non-nil, the values will be stored in-place into the slice. +// In this case, the slice must have length c, and Values will panic with +// ErrSliceLengthMismatch otherwise. If the input slice is nil, +// a new slice of the appropriate length will be allocated and returned. +// +// Values will panic if the receiver does not contain a successful factorization. +func (gsvd *HOGSVD) Values(s []float64, n int) []float64 { + if !gsvd.succFact() { + panic(badFact) + } + if n < 0 || gsvd.n <= n { + panic("hogsvd: invalid index") + } + + _, c := gsvd.b[n].Dims() + if s == nil { + s = make([]float64, c) + } else if len(s) != c { + panic(ErrSliceLengthMismatch) + } + var v VecDense + for j := 0; j < c; j++ { + v.ColViewOf(&gsvd.b[n], j) + s[j] = blas64.Nrm2(v.mat) + } + return s +} + +// VTo extracts the matrix V from the singular value decomposition, storing +// the result in-place into dst. V is size c×c. +// +// If dst is empty, VTo will resize dst to be c×c. When dst is +// non-empty, VTo will panic if dst is not c×c. VTo will also +// panic if the receiver does not contain a successful factorization. 
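+//
+// A minimal usage sketch, assuming m0, m1 and m2 are column-tall matrices
+// sharing the same column count (illustrative names):
+//
+//	var gsvd mat.HOGSVD
+//	if !gsvd.Factorize(m0, m1, m2) {
+//		// inspect gsvd.Err() for the failure reason
+//	}
+//	var v mat.Dense
+//	gsvd.VTo(&v)                 // shared right factor V
+//	vals0 := gsvd.Values(nil, 0) // singular values of the first matrix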
+func (gsvd *HOGSVD) VTo(dst *Dense) { + if !gsvd.succFact() { + panic(badFact) + } + r, c := gsvd.v.Dims() + if dst.IsEmpty() { + dst.ReuseAs(r, c) + } else { + r2, c2 := dst.Dims() + if r != r2 || c != c2 { + panic(ErrShape) + } + } + dst.Copy(gsvd.v) +} diff --git a/vendor/gonum.org/v1/gonum/mat/index_bound_checks.go b/vendor/gonum.org/v1/gonum/mat/index_bound_checks.go new file mode 100644 index 0000000000..59a9e04788 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/index_bound_checks.go @@ -0,0 +1,398 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file must be kept in sync with index_no_bound_checks.go. + +//go:build bounds +// +build bounds + +package mat + +// At returns the element at row i, column j. +func (m *Dense) At(i, j int) float64 { + return m.at(i, j) +} + +func (m *Dense) at(i, j int) float64 { + if uint(i) >= uint(m.mat.Rows) { + panic(ErrRowAccess) + } + if uint(j) >= uint(m.mat.Cols) { + panic(ErrColAccess) + } + return m.mat.Data[i*m.mat.Stride+j] +} + +// Set sets the element at row i, column j to the value v. +func (m *Dense) Set(i, j int, v float64) { + m.set(i, j, v) +} + +func (m *Dense) set(i, j int, v float64) { + if uint(i) >= uint(m.mat.Rows) { + panic(ErrRowAccess) + } + if uint(j) >= uint(m.mat.Cols) { + panic(ErrColAccess) + } + m.mat.Data[i*m.mat.Stride+j] = v +} + +// At returns the element at row i, column j. +func (m *CDense) At(i, j int) complex128 { + return m.at(i, j) +} + +func (m *CDense) at(i, j int) complex128 { + if uint(i) >= uint(m.mat.Rows) { + panic(ErrRowAccess) + } + if uint(j) >= uint(m.mat.Cols) { + panic(ErrColAccess) + } + return m.mat.Data[i*m.mat.Stride+j] +} + +// Set sets the element at row i, column j to the value v. +func (m *CDense) Set(i, j int, v complex128) { + m.set(i, j, v) +} + +func (m *CDense) set(i, j int, v complex128) { + if uint(i) >= uint(m.mat.Rows) { + panic(ErrRowAccess) + } + if uint(j) >= uint(m.mat.Cols) { + panic(ErrColAccess) + } + m.mat.Data[i*m.mat.Stride+j] = v +} + +// At returns the element at row i. +// It panics if i is out of bounds or if j is not zero. +func (v *VecDense) At(i, j int) float64 { + if j != 0 { + panic(ErrColAccess) + } + return v.at(i) +} + +// AtVec returns the element at row i. +// It panics if i is out of bounds. +func (v *VecDense) AtVec(i int) float64 { + return v.at(i) +} + +func (v *VecDense) at(i int) float64 { + if uint(i) >= uint(v.mat.N) { + panic(ErrRowAccess) + } + return v.mat.Data[i*v.mat.Inc] +} + +// SetVec sets the element at row i to the value val. +// It panics if i is out of bounds. +func (v *VecDense) SetVec(i int, val float64) { + v.setVec(i, val) +} + +func (v *VecDense) setVec(i int, val float64) { + if uint(i) >= uint(v.mat.N) { + panic(ErrVectorAccess) + } + v.mat.Data[i*v.mat.Inc] = val +} + +// At returns the element at row i and column j. +func (t *SymDense) At(i, j int) float64 { + return t.at(i, j) +} + +func (t *SymDense) at(i, j int) float64 { + if uint(i) >= uint(t.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(t.mat.N) { + panic(ErrColAccess) + } + if i > j { + i, j = j, i + } + return t.mat.Data[i*t.mat.Stride+j] +} + +// SetSym sets the elements at (i,j) and (j,i) to the value v. 
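+//
+// For example, assuming a freshly allocated matrix:
+//
+//	s := mat.NewSymDense(2, nil)
+//	s.SetSym(0, 1, 3)
+//	// Both s.At(0, 1) and s.At(1, 0) now return 3.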
+func (t *SymDense) SetSym(i, j int, v float64) { + t.set(i, j, v) +} + +func (t *SymDense) set(i, j int, v float64) { + if uint(i) >= uint(t.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(t.mat.N) { + panic(ErrColAccess) + } + if i > j { + i, j = j, i + } + t.mat.Data[i*t.mat.Stride+j] = v +} + +// At returns the element at row i, column j. +func (t *TriDense) At(i, j int) float64 { + return t.at(i, j) +} + +func (t *TriDense) at(i, j int) float64 { + if uint(i) >= uint(t.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(t.mat.N) { + panic(ErrColAccess) + } + isUpper := t.isUpper() + if (isUpper && i > j) || (!isUpper && i < j) { + return 0 + } + return t.mat.Data[i*t.mat.Stride+j] +} + +// SetTri sets the element of the triangular matrix at row i, column j to the value v. +// It panics if the location is outside the appropriate half of the matrix. +func (t *TriDense) SetTri(i, j int, v float64) { + t.set(i, j, v) +} + +func (t *TriDense) set(i, j int, v float64) { + if uint(i) >= uint(t.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(t.mat.N) { + panic(ErrColAccess) + } + isUpper := t.isUpper() + if (isUpper && i > j) || (!isUpper && i < j) { + panic(ErrTriangleSet) + } + t.mat.Data[i*t.mat.Stride+j] = v +} + +// At returns the element at row i, column j. +func (b *BandDense) At(i, j int) float64 { + return b.at(i, j) +} + +func (b *BandDense) at(i, j int) float64 { + if uint(i) >= uint(b.mat.Rows) { + panic(ErrRowAccess) + } + if uint(j) >= uint(b.mat.Cols) { + panic(ErrColAccess) + } + pj := j + b.mat.KL - i + if pj < 0 || b.mat.KL+b.mat.KU+1 <= pj { + return 0 + } + return b.mat.Data[i*b.mat.Stride+pj] +} + +// SetBand sets the element at row i, column j to the value v. +// It panics if the location is outside the appropriate region of the matrix. +func (b *BandDense) SetBand(i, j int, v float64) { + b.set(i, j, v) +} + +func (b *BandDense) set(i, j int, v float64) { + if uint(i) >= uint(b.mat.Rows) { + panic(ErrRowAccess) + } + if uint(j) >= uint(b.mat.Cols) { + panic(ErrColAccess) + } + pj := j + b.mat.KL - i + if pj < 0 || b.mat.KL+b.mat.KU+1 <= pj { + panic(ErrBandSet) + } + b.mat.Data[i*b.mat.Stride+pj] = v +} + +// At returns the element at row i, column j. +func (s *SymBandDense) At(i, j int) float64 { + return s.at(i, j) +} + +func (s *SymBandDense) at(i, j int) float64 { + if uint(i) >= uint(s.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(s.mat.N) { + panic(ErrColAccess) + } + if i > j { + i, j = j, i + } + pj := j - i + if s.mat.K+1 <= pj { + return 0 + } + return s.mat.Data[i*s.mat.Stride+pj] +} + +// SetSymBand sets the element at row i, column j to the value v. +// It panics if the location is outside the appropriate region of the matrix. +func (s *SymBandDense) SetSymBand(i, j int, v float64) { + s.set(i, j, v) +} + +func (s *SymBandDense) set(i, j int, v float64) { + if uint(i) >= uint(s.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(s.mat.N) { + panic(ErrColAccess) + } + if i > j { + i, j = j, i + } + pj := j - i + if s.mat.K+1 <= pj { + panic(ErrBandSet) + } + s.mat.Data[i*s.mat.Stride+pj] = v +} + +func (t *TriBandDense) At(i, j int) float64 { + return t.at(i, j) +} + +func (t *TriBandDense) at(i, j int) float64 { + // TODO(btracey): Support Diag field, see #692. 
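+	// A triangular band matrix stores element (i, j) in compact band form at
+	// column j+kl-i of row i in the data slice, where kl counts sub-diagonals
+	// (lower triangular case) and ku counts super-diagonals (upper triangular
+	// case); indices that fall outside the band read as zero.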
+ if uint(i) >= uint(t.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(t.mat.N) { + panic(ErrColAccess) + } + isUpper := t.isUpper() + if (isUpper && i > j) || (!isUpper && i < j) { + return 0 + } + kl, ku := t.mat.K, 0 + if isUpper { + kl, ku = 0, t.mat.K + } + pj := j + kl - i + if pj < 0 || kl+ku+1 <= pj { + return 0 + } + return t.mat.Data[i*t.mat.Stride+pj] +} + +func (t *TriBandDense) SetTriBand(i, j int, v float64) { + t.setTriBand(i, j, v) +} + +func (t *TriBandDense) setTriBand(i, j int, v float64) { + if uint(i) >= uint(t.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(t.mat.N) { + panic(ErrColAccess) + } + isUpper := t.isUpper() + if (isUpper && i > j) || (!isUpper && i < j) { + panic(ErrTriangleSet) + } + kl, ku := t.mat.K, 0 + if isUpper { + kl, ku = 0, t.mat.K + } + pj := j + kl - i + if pj < 0 || kl+ku+1 <= pj { + panic(ErrBandSet) + } + // TODO(btracey): Support Diag field, see #692. + t.mat.Data[i*t.mat.Stride+pj] = v +} + +// At returns the element at row i, column j. +func (d *DiagDense) At(i, j int) float64 { + return d.at(i, j) +} + +func (d *DiagDense) at(i, j int) float64 { + if uint(i) >= uint(d.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(d.mat.N) { + panic(ErrColAccess) + } + if i != j { + return 0 + } + return d.mat.Data[i*d.mat.Inc] +} + +// SetDiag sets the element at row i, column i to the value v. +// It panics if the location is outside the appropriate region of the matrix. +func (d *DiagDense) SetDiag(i int, v float64) { + d.setDiag(i, v) +} + +func (d *DiagDense) setDiag(i int, v float64) { + if uint(i) >= uint(d.mat.N) { + panic(ErrRowAccess) + } + d.mat.Data[i*d.mat.Inc] = v +} + +// At returns the element at row i, column j. +func (a *Tridiag) At(i, j int) float64 { + return a.at(i, j) +} + +func (a *Tridiag) at(i, j int) float64 { + if uint(i) >= uint(a.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(a.mat.N) { + panic(ErrColAccess) + } + switch i - j { + case -1: + return a.mat.DU[i] + case 0: + return a.mat.D[i] + case 1: + return a.mat.DL[j] + default: + return 0 + } +} + +// SetBand sets the element at row i, column j to the value v. +// It panics if the location is outside the appropriate region of the matrix. +func (a *Tridiag) SetBand(i, j int, v float64) { + a.set(i, j, v) +} + +func (a *Tridiag) set(i, j int, v float64) { + if uint(i) >= uint(a.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(a.mat.N) { + panic(ErrColAccess) + } + switch i - j { + case -1: + a.mat.DU[i] = v + case 0: + a.mat.D[i] = v + case 1: + a.mat.DL[j] = v + default: + panic(ErrBandSet) + } +} diff --git a/vendor/gonum.org/v1/gonum/mat/index_no_bound_checks.go b/vendor/gonum.org/v1/gonum/mat/index_no_bound_checks.go new file mode 100644 index 0000000000..335128806f --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/index_no_bound_checks.go @@ -0,0 +1,400 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file must be kept in sync with index_bound_checks.go. + +//go:build !bounds +// +build !bounds + +package mat + +// At returns the element at row i, column j. +func (m *Dense) At(i, j int) float64 { + if uint(i) >= uint(m.mat.Rows) { + panic(ErrRowAccess) + } + if uint(j) >= uint(m.mat.Cols) { + panic(ErrColAccess) + } + return m.at(i, j) +} + +func (m *Dense) at(i, j int) float64 { + return m.mat.Data[i*m.mat.Stride+j] +} + +// Set sets the element at row i, column j to the value v. 
+func (m *Dense) Set(i, j int, v float64) { + if uint(i) >= uint(m.mat.Rows) { + panic(ErrRowAccess) + } + if uint(j) >= uint(m.mat.Cols) { + panic(ErrColAccess) + } + m.set(i, j, v) +} + +func (m *Dense) set(i, j int, v float64) { + m.mat.Data[i*m.mat.Stride+j] = v +} + +// At returns the element at row i, column j. +func (m *CDense) At(i, j int) complex128 { + if uint(i) >= uint(m.mat.Rows) { + panic(ErrRowAccess) + } + if uint(j) >= uint(m.mat.Cols) { + panic(ErrColAccess) + } + return m.at(i, j) +} + +func (m *CDense) at(i, j int) complex128 { + return m.mat.Data[i*m.mat.Stride+j] +} + +// Set sets the element at row i, column j to the value v. +func (m *CDense) Set(i, j int, v complex128) { + if uint(i) >= uint(m.mat.Rows) { + panic(ErrRowAccess) + } + if uint(j) >= uint(m.mat.Cols) { + panic(ErrColAccess) + } + m.set(i, j, v) +} + +func (m *CDense) set(i, j int, v complex128) { + m.mat.Data[i*m.mat.Stride+j] = v +} + +// At returns the element at row i. +// It panics if i is out of bounds or if j is not zero. +func (v *VecDense) At(i, j int) float64 { + if uint(i) >= uint(v.mat.N) { + panic(ErrRowAccess) + } + if j != 0 { + panic(ErrColAccess) + } + return v.at(i) +} + +// AtVec returns the element at row i. +// It panics if i is out of bounds. +func (v *VecDense) AtVec(i int) float64 { + if uint(i) >= uint(v.mat.N) { + panic(ErrRowAccess) + } + return v.at(i) +} + +func (v *VecDense) at(i int) float64 { + return v.mat.Data[i*v.mat.Inc] +} + +// SetVec sets the element at row i to the value val. +// It panics if i is out of bounds. +func (v *VecDense) SetVec(i int, val float64) { + if uint(i) >= uint(v.mat.N) { + panic(ErrVectorAccess) + } + v.setVec(i, val) +} + +func (v *VecDense) setVec(i int, val float64) { + v.mat.Data[i*v.mat.Inc] = val +} + +// At returns the element at row i and column j. +func (s *SymDense) At(i, j int) float64 { + if uint(i) >= uint(s.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(s.mat.N) { + panic(ErrColAccess) + } + return s.at(i, j) +} + +func (s *SymDense) at(i, j int) float64 { + if i > j { + i, j = j, i + } + return s.mat.Data[i*s.mat.Stride+j] +} + +// SetSym sets the elements at (i,j) and (j,i) to the value v. +func (s *SymDense) SetSym(i, j int, v float64) { + if uint(i) >= uint(s.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(s.mat.N) { + panic(ErrColAccess) + } + s.set(i, j, v) +} + +func (s *SymDense) set(i, j int, v float64) { + if i > j { + i, j = j, i + } + s.mat.Data[i*s.mat.Stride+j] = v +} + +// At returns the element at row i, column j. +func (t *TriDense) At(i, j int) float64 { + if uint(i) >= uint(t.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(t.mat.N) { + panic(ErrColAccess) + } + return t.at(i, j) +} + +func (t *TriDense) at(i, j int) float64 { + isUpper := t.triKind() + if (isUpper && i > j) || (!isUpper && i < j) { + return 0 + } + return t.mat.Data[i*t.mat.Stride+j] +} + +// SetTri sets the element at row i, column j to the value v. +// It panics if the location is outside the appropriate half of the matrix. +func (t *TriDense) SetTri(i, j int, v float64) { + if uint(i) >= uint(t.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(t.mat.N) { + panic(ErrColAccess) + } + isUpper := t.isUpper() + if (isUpper && i > j) || (!isUpper && i < j) { + panic(ErrTriangleSet) + } + t.set(i, j, v) +} + +func (t *TriDense) set(i, j int, v float64) { + t.mat.Data[i*t.mat.Stride+j] = v +} + +// At returns the element at row i, column j. 
+func (b *BandDense) At(i, j int) float64 { + if uint(i) >= uint(b.mat.Rows) { + panic(ErrRowAccess) + } + if uint(j) >= uint(b.mat.Cols) { + panic(ErrColAccess) + } + return b.at(i, j) +} + +func (b *BandDense) at(i, j int) float64 { + pj := j + b.mat.KL - i + if pj < 0 || b.mat.KL+b.mat.KU+1 <= pj { + return 0 + } + return b.mat.Data[i*b.mat.Stride+pj] +} + +// SetBand sets the element at row i, column j to the value v. +// It panics if the location is outside the appropriate region of the matrix. +func (b *BandDense) SetBand(i, j int, v float64) { + if uint(i) >= uint(b.mat.Rows) { + panic(ErrRowAccess) + } + if uint(j) >= uint(b.mat.Cols) { + panic(ErrColAccess) + } + pj := j + b.mat.KL - i + if pj < 0 || b.mat.KL+b.mat.KU+1 <= pj { + panic(ErrBandSet) + } + b.set(i, j, v) +} + +func (b *BandDense) set(i, j int, v float64) { + pj := j + b.mat.KL - i + b.mat.Data[i*b.mat.Stride+pj] = v +} + +// At returns the element at row i, column j. +func (s *SymBandDense) At(i, j int) float64 { + if uint(i) >= uint(s.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(s.mat.N) { + panic(ErrColAccess) + } + return s.at(i, j) +} + +func (s *SymBandDense) at(i, j int) float64 { + if i > j { + i, j = j, i + } + pj := j - i + if s.mat.K+1 <= pj { + return 0 + } + return s.mat.Data[i*s.mat.Stride+pj] +} + +// SetSymBand sets the element at row i, column j to the value v. +// It panics if the location is outside the appropriate region of the matrix. +func (s *SymBandDense) SetSymBand(i, j int, v float64) { + if uint(i) >= uint(s.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(s.mat.N) { + panic(ErrColAccess) + } + s.set(i, j, v) +} + +func (s *SymBandDense) set(i, j int, v float64) { + if i > j { + i, j = j, i + } + pj := j - i + if s.mat.K+1 <= pj { + panic(ErrBandSet) + } + s.mat.Data[i*s.mat.Stride+pj] = v +} + +func (t *TriBandDense) At(i, j int) float64 { + if uint(i) >= uint(t.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(t.mat.N) { + panic(ErrColAccess) + } + return t.at(i, j) +} + +func (t *TriBandDense) at(i, j int) float64 { + // TODO(btracey): Support Diag field, see #692. + isUpper := t.isUpper() + if (isUpper && i > j) || (!isUpper && i < j) { + return 0 + } + kl := t.mat.K + ku := 0 + if isUpper { + ku = t.mat.K + kl = 0 + } + pj := j + kl - i + if pj < 0 || kl+ku+1 <= pj { + return 0 + } + return t.mat.Data[i*t.mat.Stride+pj] +} + +func (t *TriBandDense) SetTriBand(i, j int, v float64) { + if uint(i) >= uint(t.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(t.mat.N) { + panic(ErrColAccess) + } + isUpper := t.isUpper() + if (isUpper && i > j) || (!isUpper && i < j) { + panic(ErrTriangleSet) + } + kl, ku := t.mat.K, 0 + if isUpper { + kl, ku = 0, t.mat.K + } + pj := j + kl - i + if pj < 0 || kl+ku+1 <= pj { + panic(ErrBandSet) + } + // TODO(btracey): Support Diag field, see #692. + t.mat.Data[i*t.mat.Stride+pj] = v +} + +// At returns the element at row i, column j. +func (d *DiagDense) At(i, j int) float64 { + if uint(i) >= uint(d.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(d.mat.N) { + panic(ErrColAccess) + } + return d.at(i, j) +} + +func (d *DiagDense) at(i, j int) float64 { + if i != j { + return 0 + } + return d.mat.Data[i*d.mat.Inc] +} + +// SetDiag sets the element at row i, column i to the value v. +// It panics if the location is outside the appropriate region of the matrix. 
+func (d *DiagDense) SetDiag(i int, v float64) { + if uint(i) >= uint(d.mat.N) { + panic(ErrRowAccess) + } + d.setDiag(i, v) +} + +func (d *DiagDense) setDiag(i int, v float64) { + d.mat.Data[i*d.mat.Inc] = v +} + +// At returns the element at row i, column j. +func (a *Tridiag) At(i, j int) float64 { + if uint(i) >= uint(a.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(a.mat.N) { + panic(ErrColAccess) + } + return a.at(i, j) +} + +func (a *Tridiag) at(i, j int) float64 { + switch i - j { + case -1: + return a.mat.DU[i] + case 0: + return a.mat.D[i] + case 1: + return a.mat.DL[j] + default: + return 0 + } +} + +// SetBand sets the element at row i, column j to the value v. +// It panics if the location is outside the appropriate region of the matrix. +func (a *Tridiag) SetBand(i, j int, v float64) { + if uint(i) >= uint(a.mat.N) { + panic(ErrRowAccess) + } + if uint(j) >= uint(a.mat.N) { + panic(ErrColAccess) + } + a.set(i, j, v) +} + +func (a *Tridiag) set(i, j int, v float64) { + switch i - j { + case -1: + a.mat.DU[i] = v + case 0: + a.mat.D[i] = v + case 1: + a.mat.DL[j] = v + default: + panic(ErrBandSet) + } +} diff --git a/vendor/gonum.org/v1/gonum/mat/inner.go b/vendor/gonum.org/v1/gonum/mat/inner.go new file mode 100644 index 0000000000..4f94a96a6b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/inner.go @@ -0,0 +1,126 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/internal/asm/f64" +) + +// Inner computes the generalized inner product +// +// xᵀ A y +// +// between the vectors x and y with matrix A, where x and y are treated as +// column vectors. +// +// This is only a true inner product if A is symmetric positive definite, though +// the operation works for any matrix A. +// +// Inner panics if x.Len != m or y.Len != n when A is an m x n matrix. +func Inner(x Vector, a Matrix, y Vector) float64 { + m, n := a.Dims() + if x.Len() != m { + panic(ErrShape) + } + if y.Len() != n { + panic(ErrShape) + } + if m == 0 || n == 0 { + return 0 + } + + var sum float64 + + switch a := a.(type) { + case RawSymmetricer: + amat := a.RawSymmetric() + if amat.Uplo != blas.Upper { + // Panic as a string not a mat.Error. 
+ panic(badSymTriangle) + } + var xmat, ymat blas64.Vector + if xrv, ok := x.(RawVectorer); ok { + xmat = xrv.RawVector() + } else { + break + } + if yrv, ok := y.(RawVectorer); ok { + ymat = yrv.RawVector() + } else { + break + } + for i := 0; i < x.Len(); i++ { + xi := x.AtVec(i) + if xi != 0 { + if ymat.Inc == 1 { + sum += xi * f64.DotUnitary( + amat.Data[i*amat.Stride+i:i*amat.Stride+n], + ymat.Data[i:], + ) + } else { + sum += xi * f64.DotInc( + amat.Data[i*amat.Stride+i:i*amat.Stride+n], + ymat.Data[i*ymat.Inc:], uintptr(n-i), + 1, uintptr(ymat.Inc), + 0, 0, + ) + } + } + yi := y.AtVec(i) + if i != n-1 && yi != 0 { + if xmat.Inc == 1 { + sum += yi * f64.DotUnitary( + amat.Data[i*amat.Stride+i+1:i*amat.Stride+n], + xmat.Data[i+1:], + ) + } else { + sum += yi * f64.DotInc( + amat.Data[i*amat.Stride+i+1:i*amat.Stride+n], + xmat.Data[(i+1)*xmat.Inc:], uintptr(n-i-1), + 1, uintptr(xmat.Inc), + 0, 0, + ) + } + } + } + return sum + case RawMatrixer: + amat := a.RawMatrix() + var ymat blas64.Vector + if yrv, ok := y.(RawVectorer); ok { + ymat = yrv.RawVector() + } else { + break + } + for i := 0; i < x.Len(); i++ { + xi := x.AtVec(i) + if xi != 0 { + if ymat.Inc == 1 { + sum += xi * f64.DotUnitary( + amat.Data[i*amat.Stride:i*amat.Stride+n], + ymat.Data, + ) + } else { + sum += xi * f64.DotInc( + amat.Data[i*amat.Stride:i*amat.Stride+n], + ymat.Data, uintptr(n), + 1, uintptr(ymat.Inc), + 0, 0, + ) + } + } + } + return sum + } + for i := 0; i < x.Len(); i++ { + xi := x.AtVec(i) + for j := 0; j < y.Len(); j++ { + sum += xi * a.At(i, j) * y.AtVec(j) + } + } + return sum +} diff --git a/vendor/gonum.org/v1/gonum/mat/io.go b/vendor/gonum.org/v1/gonum/mat/io.go new file mode 100644 index 0000000000..0641fa28b6 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/io.go @@ -0,0 +1,495 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "io" + "math" +) + +// version is the current on-disk codec version. +const version uint32 = 0x1 + +// maxLen is the biggest slice/array len one can create on a 32/64b platform. +const maxLen = int64(int(^uint(0) >> 1)) + +var ( + headerSize = binary.Size(storage{}) + sizeFloat64 = binary.Size(float64(0)) + + errWrongType = errors.New("mat: wrong data type") + + errTooBig = errors.New("mat: resulting data slice too big") + errTooSmall = errors.New("mat: input slice too small") + errBadBuffer = errors.New("mat: data buffer size mismatch") + errBadSize = errors.New("mat: invalid dimension") +) + +// Type encoding scheme: +// +// Type Form Packing Uplo Unit Rows Columns kU kL +// uint8 [GST] uint8 [BPF] uint8 [AUL] bool int64 int64 int64 int64 +// General 'G' 'F' 'A' false r c 0 0 +// Band 'G' 'B' 'A' false r c kU kL +// Symmetric 'S' 'F' ul false n n 0 0 +// SymmetricBand 'S' 'B' ul false n n k k +// SymmetricPacked 'S' 'P' ul false n n 0 0 +// Triangular 'T' 'F' ul Diag==Unit n n 0 0 +// TriangularBand 'T' 'B' ul Diag==Unit n n k k +// TriangularPacked 'T' 'P' ul Diag==Unit n n 0 0 +// +// G - general, S - symmetric, T - triangular +// F - full, B - band, P - packed +// A - all, U - upper, L - lower + +// MarshalBinary encodes the receiver into a binary form and returns the result. 
+// +// Dense is little-endian encoded as follows: +// +// 0 - 3 Version = 1 (uint32) +// 4 'G' (byte) +// 5 'F' (byte) +// 6 'A' (byte) +// 7 0 (byte) +// 8 - 15 number of rows (int64) +// 16 - 23 number of columns (int64) +// 24 - 31 0 (int64) +// 32 - 39 0 (int64) +// 40 - .. matrix data elements (float64) +// [0,0] [0,1] ... [0,ncols-1] +// [1,0] [1,1] ... [1,ncols-1] +// ... +// [nrows-1,0] ... [nrows-1,ncols-1] +func (m Dense) MarshalBinary() ([]byte, error) { + bufLen := int64(headerSize) + int64(m.mat.Rows)*int64(m.mat.Cols)*int64(sizeFloat64) + if bufLen <= 0 { + // bufLen is too big and has wrapped around. + return nil, errTooBig + } + + header := storage{ + Form: 'G', Packing: 'F', Uplo: 'A', + Rows: int64(m.mat.Rows), Cols: int64(m.mat.Cols), + Version: version, + } + buf := make([]byte, bufLen) + n, err := header.marshalBinaryTo(bytes.NewBuffer(buf[:0])) + if err != nil { + return buf[:n], err + } + + p := headerSize + r, c := m.Dims() + for i := 0; i < r; i++ { + for j := 0; j < c; j++ { + binary.LittleEndian.PutUint64(buf[p:p+sizeFloat64], math.Float64bits(m.at(i, j))) + p += sizeFloat64 + } + } + + return buf, nil +} + +// MarshalBinaryTo encodes the receiver into a binary form and writes it into w. +// MarshalBinaryTo returns the number of bytes written into w and an error, if any. +// +// See MarshalBinary for the on-disk layout. +func (m Dense) MarshalBinaryTo(w io.Writer) (int, error) { + header := storage{ + Form: 'G', Packing: 'F', Uplo: 'A', + Rows: int64(m.mat.Rows), Cols: int64(m.mat.Cols), + Version: version, + } + n, err := header.marshalBinaryTo(w) + if err != nil { + return n, err + } + + r, c := m.Dims() + var b [8]byte + for i := 0; i < r; i++ { + for j := 0; j < c; j++ { + binary.LittleEndian.PutUint64(b[:], math.Float64bits(m.at(i, j))) + nn, err := w.Write(b[:]) + n += nn + if err != nil { + return n, err + } + } + } + + return n, nil +} + +// UnmarshalBinary decodes the binary form into the receiver. +// It panics if the receiver is a non-empty Dense matrix. +// +// See MarshalBinary for the on-disk layout. +// +// Limited checks on the validity of the binary input are performed: +// - ErrShape is returned if the number of rows or columns is negative, +// - an error is returned if the resulting Dense matrix is too +// big for the current architecture (e.g. a 16GB matrix written by a +// 64b application and read back from a 32b application.) +// +// UnmarshalBinary does not limit the size of the unmarshaled matrix, and so +// it should not be used on untrusted data. 
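+//
+// A round-trip sketch, assuming the data came from MarshalBinary:
+//
+//	src := mat.NewDense(2, 2, []float64{1, 2, 3, 4})
+//	buf, err := src.MarshalBinary()
+//	if err != nil {
+//		// handle the encoding error
+//	}
+//	var dst mat.Dense // the receiver must be empty
+//	err = dst.UnmarshalBinary(buf)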
+func (m *Dense) UnmarshalBinary(data []byte) error { + if !m.IsEmpty() { + panic("mat: unmarshal into non-empty matrix") + } + + if len(data) < headerSize { + return errTooSmall + } + + var header storage + err := header.unmarshalBinary(data[:headerSize]) + if err != nil { + return err + } + rows := header.Rows + cols := header.Cols + header.Version = 0 + header.Rows = 0 + header.Cols = 0 + if (header != storage{Form: 'G', Packing: 'F', Uplo: 'A'}) { + return errWrongType + } + if rows < 0 || cols < 0 { + return errBadSize + } + size := rows * cols + if size == 0 { + return ErrZeroLength + } + if int(size) < 0 || size > maxLen { + return errTooBig + } + if len(data) != headerSize+int(rows*cols)*sizeFloat64 { + return errBadBuffer + } + + p := headerSize + m.reuseAsNonZeroed(int(rows), int(cols)) + for i := range m.mat.Data { + m.mat.Data[i] = math.Float64frombits(binary.LittleEndian.Uint64(data[p : p+sizeFloat64])) + p += sizeFloat64 + } + + return nil +} + +// UnmarshalBinaryFrom decodes the binary form into the receiver and returns +// the number of bytes read and an error if any. +// It panics if the receiver is a non-empty Dense matrix. +// +// See MarshalBinary for the on-disk layout. +// +// Limited checks on the validity of the binary input are performed: +// - ErrShape is returned if the number of rows or columns is negative, +// - an error is returned if the resulting Dense matrix is too +// big for the current architecture (e.g. a 16GB matrix written by a +// 64b application and read back from a 32b application.) +// +// UnmarshalBinary does not limit the size of the unmarshaled matrix, and so +// it should not be used on untrusted data. +func (m *Dense) UnmarshalBinaryFrom(r io.Reader) (int, error) { + if !m.IsEmpty() { + panic("mat: unmarshal into non-empty matrix") + } + + var header storage + n, err := header.unmarshalBinaryFrom(r) + if err != nil { + return n, err + } + rows := header.Rows + cols := header.Cols + header.Version = 0 + header.Rows = 0 + header.Cols = 0 + if (header != storage{Form: 'G', Packing: 'F', Uplo: 'A'}) { + return n, errWrongType + } + if rows < 0 || cols < 0 { + return n, errBadSize + } + size := rows * cols + if size == 0 { + return n, ErrZeroLength + } + if int(size) < 0 || size > maxLen { + return n, errTooBig + } + + m.reuseAsNonZeroed(int(rows), int(cols)) + var b [8]byte + for i := range m.mat.Data { + nn, err := readFull(r, b[:]) + n += nn + if err != nil { + if err == io.EOF { + return n, io.ErrUnexpectedEOF + } + return n, err + } + m.mat.Data[i] = math.Float64frombits(binary.LittleEndian.Uint64(b[:])) + } + + return n, nil +} + +// MarshalBinary encodes the receiver into a binary form and returns the result. +// +// VecDense is little-endian encoded as follows: +// +// 0 - 3 Version = 1 (uint32) +// 4 'G' (byte) +// 5 'F' (byte) +// 6 'A' (byte) +// 7 0 (byte) +// 8 - 15 number of elements (int64) +// 16 - 23 1 (int64) +// 24 - 31 0 (int64) +// 32 - 39 0 (int64) +// 40 - .. vector's data elements (float64) +func (v VecDense) MarshalBinary() ([]byte, error) { + bufLen := int64(headerSize) + int64(v.mat.N)*int64(sizeFloat64) + if bufLen <= 0 { + // bufLen is too big and has wrapped around. 
+ return nil, errTooBig + } + + header := storage{ + Form: 'G', Packing: 'F', Uplo: 'A', + Rows: int64(v.mat.N), Cols: 1, + Version: version, + } + buf := make([]byte, bufLen) + n, err := header.marshalBinaryTo(bytes.NewBuffer(buf[:0])) + if err != nil { + return buf[:n], err + } + + p := headerSize + for i := 0; i < v.mat.N; i++ { + binary.LittleEndian.PutUint64(buf[p:p+sizeFloat64], math.Float64bits(v.at(i))) + p += sizeFloat64 + } + + return buf, nil +} + +// MarshalBinaryTo encodes the receiver into a binary form, writes it to w and +// returns the number of bytes written and an error if any. +// +// See MarshalBinary for the on-disk format. +func (v VecDense) MarshalBinaryTo(w io.Writer) (int, error) { + header := storage{ + Form: 'G', Packing: 'F', Uplo: 'A', + Rows: int64(v.mat.N), Cols: 1, + Version: version, + } + n, err := header.marshalBinaryTo(w) + if err != nil { + return n, err + } + + var buf [8]byte + for i := 0; i < v.mat.N; i++ { + binary.LittleEndian.PutUint64(buf[:], math.Float64bits(v.at(i))) + nn, err := w.Write(buf[:]) + n += nn + if err != nil { + return n, err + } + } + + return n, nil +} + +// UnmarshalBinary decodes the binary form into the receiver. +// It panics if the receiver is a non-empty VecDense. +// +// See MarshalBinary for the on-disk layout. +// +// Limited checks on the validity of the binary input are performed: +// - ErrShape is returned if the number of rows is negative, +// - an error is returned if the resulting VecDense is too +// big for the current architecture (e.g. a 16GB vector written by a +// 64b application and read back from a 32b application.) +// +// UnmarshalBinary does not limit the size of the unmarshaled vector, and so +// it should not be used on untrusted data. +func (v *VecDense) UnmarshalBinary(data []byte) error { + if !v.IsEmpty() { + panic("mat: unmarshal into non-empty vector") + } + + if len(data) < headerSize { + return errTooSmall + } + + var header storage + err := header.unmarshalBinary(data[:headerSize]) + if err != nil { + return err + } + if header.Cols != 1 { + return ErrShape + } + n := header.Rows + header.Version = 0 + header.Rows = 0 + header.Cols = 0 + if (header != storage{Form: 'G', Packing: 'F', Uplo: 'A'}) { + return errWrongType + } + if n == 0 { + return ErrZeroLength + } + if n < 0 { + return errBadSize + } + if int64(maxLen) < n { + return errTooBig + } + if len(data) != headerSize+int(n)*sizeFloat64 { + return errBadBuffer + } + + p := headerSize + v.reuseAsNonZeroed(int(n)) + for i := range v.mat.Data { + v.mat.Data[i] = math.Float64frombits(binary.LittleEndian.Uint64(data[p : p+sizeFloat64])) + p += sizeFloat64 + } + + return nil +} + +// UnmarshalBinaryFrom decodes the binary form into the receiver, from the +// io.Reader and returns the number of bytes read and an error if any. +// It panics if the receiver is a non-empty VecDense. +// +// See MarshalBinary for the on-disk layout. +// See UnmarshalBinary for the list of sanity checks performed on the input. 
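+//
+// A stream round-trip sketch, assuming the bytes were produced by
+// MarshalBinaryTo:
+//
+//	v := mat.NewVecDense(3, []float64{1, 2, 3})
+//	var buf bytes.Buffer
+//	if _, err := v.MarshalBinaryTo(&buf); err != nil {
+//		// handle the encoding error
+//	}
+//	var got mat.VecDense
+//	_, err := got.UnmarshalBinaryFrom(&buf)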
+func (v *VecDense) UnmarshalBinaryFrom(r io.Reader) (int, error) { + if !v.IsEmpty() { + panic("mat: unmarshal into non-empty vector") + } + + var header storage + n, err := header.unmarshalBinaryFrom(r) + if err != nil { + return n, err + } + if header.Cols != 1 { + return n, ErrShape + } + l := header.Rows + header.Version = 0 + header.Rows = 0 + header.Cols = 0 + if (header != storage{Form: 'G', Packing: 'F', Uplo: 'A'}) { + return n, errWrongType + } + if l == 0 { + return n, ErrZeroLength + } + if l < 0 { + return n, errBadSize + } + if int64(maxLen) < l { + return n, errTooBig + } + + v.reuseAsNonZeroed(int(l)) + var b [8]byte + for i := range v.mat.Data { + nn, err := readFull(r, b[:]) + n += nn + if err != nil { + if err == io.EOF { + return n, io.ErrUnexpectedEOF + } + return n, err + } + v.mat.Data[i] = math.Float64frombits(binary.LittleEndian.Uint64(b[:])) + } + + return n, nil +} + +// storage is the internal representation of the storage format of a +// serialised matrix. +type storage struct { + Version uint32 // Keep this first. + Form byte // [GST] + Packing byte // [BPF] + Uplo byte // [AUL] + Unit bool + Rows int64 + Cols int64 + KU int64 + KL int64 +} + +// TODO(kortschak): Consider replacing these with calls to direct +// encoding/decoding of fields rather than to binary.Write/binary.Read. + +func (s storage) marshalBinaryTo(w io.Writer) (int, error) { + buf := bytes.NewBuffer(make([]byte, 0, headerSize)) + err := binary.Write(buf, binary.LittleEndian, s) + if err != nil { + return 0, err + } + return w.Write(buf.Bytes()) +} + +func (s *storage) unmarshalBinary(buf []byte) error { + err := binary.Read(bytes.NewReader(buf), binary.LittleEndian, s) + if err != nil { + return err + } + if s.Version != version { + return fmt.Errorf("mat: incorrect version: %d", s.Version) + } + return nil +} + +func (s *storage) unmarshalBinaryFrom(r io.Reader) (int, error) { + buf := make([]byte, headerSize) + n, err := readFull(r, buf) + if err != nil { + return n, err + } + return n, s.unmarshalBinary(buf[:n]) +} + +// readFull reads from r into buf until it has read len(buf). +// It returns the number of bytes copied and an error if fewer bytes were read. +// If an EOF happens after reading fewer than len(buf) bytes, io.ErrUnexpectedEOF is returned. +func readFull(r io.Reader, buf []byte) (int, error) { + var n int + var err error + for n < len(buf) && err == nil { + var nn int + nn, err = r.Read(buf[n:]) + n += nn + } + if n == len(buf) { + return n, nil + } + if err == io.EOF { + return n, io.ErrUnexpectedEOF + } + return n, err +} diff --git a/vendor/gonum.org/v1/gonum/mat/lq.go b/vendor/gonum.org/v1/gonum/mat/lq.go new file mode 100644 index 0000000000..a3b3543b08 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/lq.go @@ -0,0 +1,305 @@ +// Copyright ©2013 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" + "gonum.org/v1/gonum/lapack/lapack64" +) + +const badLQ = "mat: invalid LQ factorization" + +// LQ is a type for creating and using the LQ factorization of a matrix. +type LQ struct { + lq *Dense + q *Dense + tau []float64 + cond float64 +} + +// Dims returns the dimensions of the matrix. +func (lq *LQ) Dims() (r, c int) { + if lq.lq == nil { + return 0, 0 + } + return lq.lq.Dims() +} + +// At returns the element at row i, column j. 
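+// The element is reconstructed on the fly as the dot product of row i of L
+// with column j of Q, so each access costs O(i) multiplications; for repeated
+// access it is cheaper to extract the factors once with LTo and QTo.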
+func (lq *LQ) At(i, j int) float64 {
+	m, n := lq.Dims()
+	if uint(i) >= uint(m) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(n) {
+		panic(ErrColAccess)
+	}
+
+	var val float64
+	for k := 0; k <= i; k++ {
+		val += lq.lq.at(i, k) * lq.q.at(k, j)
+	}
+	return val
+}
+
+// T performs an implicit transpose by returning the receiver inside a
+// Transpose.
+func (lq *LQ) T() Matrix {
+	return Transpose{lq}
+}
+
+func (lq *LQ) updateCond(norm lapack.MatrixNorm) {
+	// Since A = L*Q, and Q is orthogonal, we get for the condition number κ
+	//	κ(A) := |A| |A^-1| = |L*Q| |(L*Q)^-1| = |L| |Qᵀ * L^-1|
+	//	      = |L| |L^-1| = κ(L),
+	// where we used the fact that Q^-1 = Qᵀ. However, this assumes that
+	// the matrix norm is invariant under orthogonal transformations which
+	// is not the case for CondNorm. Hopefully the error is negligible: κ
+	// is only a qualitative measure anyway.
+	m := lq.lq.mat.Rows
+	work := getFloat64s(3*m, false)
+	iwork := getInts(m, false)
+	l := lq.lq.asTriDense(m, blas.NonUnit, blas.Lower)
+	v := lapack64.Trcon(norm, l.mat, work, iwork)
+	lq.cond = 1 / v
+	putFloat64s(work)
+	putInts(iwork)
+}
+
+// Factorize computes the LQ factorization of an m×n matrix a where m <= n. The LQ
+// factorization always exists even if A is singular.
+//
+// The LQ decomposition is a factorization of the matrix A such that A = L * Q.
+// The matrix Q is an orthonormal n×n matrix, and L is an m×n lower triangular matrix.
+// L and Q can be extracted using the LTo and QTo methods.
+func (lq *LQ) Factorize(a Matrix) {
+	lq.factorize(a, CondNorm)
+}
+
+func (lq *LQ) factorize(a Matrix, norm lapack.MatrixNorm) {
+	m, n := a.Dims()
+	if m > n {
+		panic(ErrShape)
+	}
+	if lq.lq == nil {
+		lq.lq = &Dense{}
+	}
+	lq.lq.CloneFrom(a)
+	work := []float64{0}
+	lq.tau = make([]float64, m)
+	lapack64.Gelqf(lq.lq.mat, lq.tau, work, -1)
+	work = getFloat64s(int(work[0]), false)
+	lapack64.Gelqf(lq.lq.mat, lq.tau, work, len(work))
+	putFloat64s(work)
+	lq.updateCond(norm)
+	lq.updateQ()
+}
+
+func (lq *LQ) updateQ() {
+	_, n := lq.Dims()
+	if lq.q == nil {
+		lq.q = NewDense(n, n, nil)
+	} else {
+		lq.q.reuseAsNonZeroed(n, n)
+	}
+	// Construct Q from the elementary reflectors.
+	lq.q.Copy(lq.lq)
+	work := []float64{0}
+	lapack64.Orglq(lq.q.mat, lq.tau, work, -1)
+	work = getFloat64s(int(work[0]), false)
+	lapack64.Orglq(lq.q.mat, lq.tau, work, len(work))
+	putFloat64s(work)
+}
+
+// isValid returns whether the receiver contains a factorization.
+func (lq *LQ) isValid() bool {
+	return lq.lq != nil && !lq.lq.IsEmpty()
+}
+
+// Cond returns the condition number for the factorized matrix.
+// Cond will panic if the receiver does not contain a factorization.
+func (lq *LQ) Cond() float64 {
+	if !lq.isValid() {
+		panic(badLQ)
+	}
+	return lq.cond
+}
+
+// TODO(btracey): Add in the "Reduced" forms for extracting the m×m orthogonal
+// and upper triangular matrices.
+
+// LTo extracts the m×n lower trapezoidal matrix from an LQ decomposition.
+//
+// If dst is empty, LTo will resize dst to be r×c. When dst is
+// non-empty, LTo will panic if dst is not r×c. LTo will also panic
+// if the receiver does not contain a successful factorization.
+func (lq *LQ) LTo(dst *Dense) {
+	if !lq.isValid() {
+		panic(badLQ)
+	}
+
+	r, c := lq.lq.Dims()
+	if dst.IsEmpty() {
+		dst.ReuseAs(r, c)
+	} else {
+		r2, c2 := dst.Dims()
+		if r != r2 || c != c2 {
+			panic(ErrShape)
+		}
+	}
+
+	// Disguise the LQ as a lower triangular.
+	t := &TriDense{
+		mat: blas64.Triangular{
+			N:      r,
+			Stride: lq.lq.mat.Stride,
+			Data:   lq.lq.mat.Data,
+			Uplo:   blas.Lower,
+			Diag:   blas.NonUnit,
+		},
+		cap: lq.lq.capCols,
+	}
+	dst.Copy(t)
+
+	if r == c {
+		return
+	}
+	// Zero right of the triangular.
+	for i := 0; i < r; i++ {
+		zero(dst.mat.Data[i*dst.mat.Stride+r : i*dst.mat.Stride+c])
+	}
+}
+
+// QTo extracts the n×n orthonormal matrix Q from an LQ decomposition.
+//
+// If dst is empty, QTo will resize dst to be n×n. When dst is
+// non-empty, QTo will panic if dst is not n×n. QTo will also panic
+// if the receiver does not contain a successful factorization.
+func (lq *LQ) QTo(dst *Dense) {
+	if !lq.isValid() {
+		panic(badLQ)
+	}
+
+	_, n := lq.lq.Dims()
+	if dst.IsEmpty() {
+		dst.ReuseAs(n, n)
+	} else {
+		m2, n2 := dst.Dims()
+		if n != m2 || n != n2 {
+			panic(ErrShape)
+		}
+	}
+	dst.Copy(lq.q)
+}
+
+// SolveTo finds a minimum-norm solution to a system of linear equations defined
+// by the matrices A and b, where A is an m×n matrix represented in its LQ factorized
+// form. If A is singular or near-singular a Condition error is returned.
+// See the documentation for Condition for more information.
+//
+// The minimization problem solved depends on the input parameters.
+//
+//	If trans == false, find the minimum norm solution of A * X = B.
+//	If trans == true, find X such that ||A*X - B||_2 is minimized.
+//
+// The solution matrix, X, is stored in place into dst.
+// SolveTo will panic if the receiver does not contain a factorization.
+func (lq *LQ) SolveTo(dst *Dense, trans bool, b Matrix) error {
+	if !lq.isValid() {
+		panic(badLQ)
+	}
+
+	r, c := lq.lq.Dims()
+	br, bc := b.Dims()
+
+	// The LQ solve algorithm stores the result in-place into the right hand side.
+	// The storage for the answer must be large enough to hold both b and x.
+	// However, this method's receiver must be the size of x. Copy b, and then
+	// copy the result into x at the end.
+	if trans {
+		if c != br {
+			panic(ErrShape)
+		}
+		dst.reuseAsNonZeroed(r, bc)
+	} else {
+		if r != br {
+			panic(ErrShape)
+		}
+		dst.reuseAsNonZeroed(c, bc)
+	}
+	// Do not need to worry about overlap between x and b because w has its own
+	// independent storage.
+	w := getDenseWorkspace(max(r, c), bc, false)
+	w.Copy(b)
+	t := lq.lq.asTriDense(lq.lq.mat.Rows, blas.NonUnit, blas.Lower).mat
+	if trans {
+		work := []float64{0}
+		lapack64.Ormlq(blas.Left, blas.NoTrans, lq.lq.mat, lq.tau, w.mat, work, -1)
+		work = getFloat64s(int(work[0]), false)
+		lapack64.Ormlq(blas.Left, blas.NoTrans, lq.lq.mat, lq.tau, w.mat, work, len(work))
+		putFloat64s(work)
+
+		ok := lapack64.Trtrs(blas.Trans, t, w.mat)
+		if !ok {
+			return Condition(math.Inf(1))
+		}
+	} else {
+		ok := lapack64.Trtrs(blas.NoTrans, t, w.mat)
+		if !ok {
+			return Condition(math.Inf(1))
+		}
+		for i := r; i < c; i++ {
+			zero(w.mat.Data[i*w.mat.Stride : i*w.mat.Stride+bc])
+		}
+		work := []float64{0}
+		lapack64.Ormlq(blas.Left, blas.Trans, lq.lq.mat, lq.tau, w.mat, work, -1)
+		work = getFloat64s(int(work[0]), false)
+		lapack64.Ormlq(blas.Left, blas.Trans, lq.lq.mat, lq.tau, w.mat, work, len(work))
+		putFloat64s(work)
+	}
+	// x was set above to be the correct size for the result.
+	dst.Copy(w)
+	putDenseWorkspace(w)
+	if lq.cond > ConditionTolerance {
+		return Condition(lq.cond)
+	}
+	return nil
+}
+
+// SolveVecTo finds a minimum-norm solution to a system of linear equations.
+// See LQ.SolveTo for the full documentation.
+// SolveVecTo will panic if the receiver does not contain a factorization.
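+//
+// A minimal solve sketch, assuming a is m×n with m <= n and b is a vector of
+// length m (illustrative names):
+//
+//	var lq mat.LQ
+//	lq.Factorize(a)
+//	var x mat.VecDense
+//	if err := lq.SolveVecTo(&x, false, b); err != nil {
+//		// a is singular or near-singular; err reports the condition number
+//	}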
+func (lq *LQ) SolveVecTo(dst *VecDense, trans bool, b Vector) error { + if !lq.isValid() { + panic(badLQ) + } + + r, c := lq.lq.Dims() + if _, bc := b.Dims(); bc != 1 { + panic(ErrShape) + } + + // The Solve implementation is non-trivial, so rather than duplicate the code, + // instead recast the VecDenses as Dense and call the matrix code. + bm := Matrix(b) + if rv, ok := b.(RawVectorer); ok { + bmat := rv.RawVector() + if dst != b { + dst.checkOverlap(bmat) + } + b := VecDense{mat: bmat} + bm = b.asDense() + } + if trans { + dst.reuseAsNonZeroed(r) + } else { + dst.reuseAsNonZeroed(c) + } + return lq.SolveTo(dst.asDense(), trans, bm) +} diff --git a/vendor/gonum.org/v1/gonum/mat/lu.go b/vendor/gonum.org/v1/gonum/mat/lu.go new file mode 100644 index 0000000000..b530ada7e5 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/lu.go @@ -0,0 +1,487 @@ +// Copyright ©2013 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/floats" + "gonum.org/v1/gonum/lapack" + "gonum.org/v1/gonum/lapack/lapack64" +) + +const ( + badSliceLength = "mat: improper slice length" + badLU = "mat: invalid LU factorization" +) + +// LU is a square n×n matrix represented by its LU factorization with partial +// pivoting. +// +// The factorization has the form +// +// A = P * L * U +// +// where P is a permutation matrix, L is lower triangular with unit diagonal +// elements, and U is upper triangular. +// +// Note that this matrix representation is useful for certain operations, in +// particular for solving linear systems of equations. It is very inefficient at +// other operations, in particular At is slow. +type LU struct { + lu *Dense + swaps []int + piv []int + cond float64 + ok bool // Whether A is nonsingular +} + +var _ Matrix = (*LU)(nil) + +// Dims returns the dimensions of the matrix A. +func (lu *LU) Dims() (r, c int) { + if lu.lu == nil { + return 0, 0 + } + return lu.lu.Dims() +} + +// At returns the element of A at row i, column j. +func (lu *LU) At(i, j int) float64 { + n, _ := lu.Dims() + if uint(i) >= uint(n) { + panic(ErrRowAccess) + } + if uint(j) >= uint(n) { + panic(ErrColAccess) + } + + i = lu.piv[i] + var val float64 + for k := 0; k < min(i, j+1); k++ { + val += lu.lu.at(i, k) * lu.lu.at(k, j) + } + if i <= j { + val += lu.lu.at(i, j) + } + return val +} + +// T performs an implicit transpose by returning the receiver inside a +// Transpose. +func (lu *LU) T() Matrix { + return Transpose{lu} +} + +// updateCond updates the stored condition number of the matrix. anorm is the +// norm of the original matrix. If anorm is negative it will be estimated. +func (lu *LU) updateCond(anorm float64, norm lapack.MatrixNorm) { + n := lu.lu.mat.Cols + work := getFloat64s(4*n, false) + defer putFloat64s(work) + iwork := getInts(n, false) + defer putInts(iwork) + if anorm < 0 { + // This is an approximation. By the definition of a norm, + // |AB| <= |A| |B|. + // Since A = L*U, we get for the condition number κ that + // κ(A) := |A| |A^-1| = |L*U| |A^-1| <= |L| |U| |A^-1|, + // so this will overestimate the condition number somewhat. + // The norm of the original factorized matrix cannot be stored + // because of update possibilities. 
+ u := lu.lu.asTriDense(n, blas.NonUnit, blas.Upper) + l := lu.lu.asTriDense(n, blas.Unit, blas.Lower) + unorm := lapack64.Lantr(norm, u.mat, work) + lnorm := lapack64.Lantr(norm, l.mat, work) + anorm = unorm * lnorm + } + v := lapack64.Gecon(norm, lu.lu.mat, anorm, work, iwork) + lu.cond = 1 / v +} + +// Factorize computes the LU factorization of the square matrix A and stores the +// result in the receiver. The LU decomposition will complete regardless of the +// singularity of a. +// +// The L and U matrix factors can be extracted from the factorization using the +// LTo and UTo methods. The matrix P can be extracted as a row permutation using +// the RowPivots method and applied using Dense.PermuteRows. +func (lu *LU) Factorize(a Matrix) { + lu.factorize(a, CondNorm) +} + +func (lu *LU) factorize(a Matrix, norm lapack.MatrixNorm) { + m, n := a.Dims() + if m != n { + panic(ErrSquare) + } + if lu.lu == nil { + lu.lu = NewDense(n, n, nil) + } else { + lu.lu.Reset() + lu.lu.reuseAsNonZeroed(n, n) + } + lu.lu.Copy(a) + lu.swaps = useInt(lu.swaps, n) + lu.piv = useInt(lu.piv, n) + work := getFloat64s(n, false) + anorm := lapack64.Lange(norm, lu.lu.mat, work) + putFloat64s(work) + lu.ok = lapack64.Getrf(lu.lu.mat, lu.swaps) + lu.updatePivots(lu.swaps) + lu.updateCond(anorm, norm) +} + +func (lu *LU) updatePivots(swaps []int) { + // Replay the sequence of row swaps in order to find the row permutation. + for i := range lu.piv { + lu.piv[i] = i + } + n, _ := lu.Dims() + for i := n - 1; i >= 0; i-- { + v := swaps[i] + lu.piv[i], lu.piv[v] = lu.piv[v], lu.piv[i] + } +} + +// isValid returns whether the receiver contains a factorization. +func (lu *LU) isValid() bool { + return lu.lu != nil && !lu.lu.IsEmpty() +} + +// Cond returns the condition number for the factorized matrix. +// Cond will panic if the receiver does not contain a factorization. +func (lu *LU) Cond() float64 { + if !lu.isValid() { + panic(badLU) + } + return lu.cond +} + +// Reset resets the factorization so that it can be reused as the receiver of a +// dimensionally restricted operation. +func (lu *LU) Reset() { + if lu.lu != nil { + lu.lu.Reset() + } + lu.swaps = lu.swaps[:0] + lu.piv = lu.piv[:0] +} + +func (lu *LU) isZero() bool { + return len(lu.swaps) == 0 +} + +// Det returns the determinant of the matrix that has been factorized. In many +// expressions, using LogDet will be more numerically stable. +// Det will panic if the receiver does not contain a factorization. +func (lu *LU) Det() float64 { + if !lu.ok { + return 0 + } + det, sign := lu.LogDet() + return math.Exp(det) * sign +} + +// LogDet returns the log of the determinant and the sign of the determinant +// for the matrix that has been factorized. Numerical stability in product and +// division expressions is generally improved by working in log space. +// LogDet will panic if the receiver does not contain a factorization. +func (lu *LU) LogDet() (det float64, sign float64) { + if !lu.isValid() { + panic(badLU) + } + + _, n := lu.lu.Dims() + logDiag := getFloat64s(n, false) + defer putFloat64s(logDiag) + sign = 1.0 + for i := 0; i < n; i++ { + v := lu.lu.at(i, i) + if v < 0 { + sign *= -1 + } + if lu.swaps[i] != i { + sign *= -1 + } + logDiag[i] = math.Log(math.Abs(v)) + } + return floats.Sum(logDiag), sign +} + +// RowPivots returns the row permutation that represents the permutation matrix +// P from the LU factorization +// +// A = P * L * U. +// +// If dst is nil, a new slice is allocated and returned. 
If dst is not nil and +// the length of dst does not equal the size of the factorized matrix, RowPivots +// will panic. RowPivots will panic if the receiver does not contain a +// factorization. +func (lu *LU) RowPivots(dst []int) []int { + if !lu.isValid() { + panic(badLU) + } + _, n := lu.lu.Dims() + if dst == nil { + dst = make([]int, n) + } + if len(dst) != n { + panic(badSliceLength) + } + copy(dst, lu.piv) + return dst +} + +// Pivot returns the row pivots of the receiver. +// +// Deprecated: Use RowPivots instead. +func (lu *LU) Pivot(dst []int) []int { + return lu.RowPivots(dst) +} + +// RankOne updates an LU factorization as if a rank-one update had been applied to +// the original matrix A, storing the result into the receiver. That is, if in +// the original LU decomposition P * L * U = A, in the updated decomposition +// P * L' * U' = A + alpha * x * yᵀ. +// RankOne will panic if orig does not contain a factorization. +func (lu *LU) RankOne(orig *LU, alpha float64, x, y Vector) { + if !orig.isValid() { + panic(badLU) + } + + // RankOne uses algorithm a1 on page 28 of "Multiple-Rank Updates to Matrix + // Factorizations for Nonlinear Analysis and Circuit Design" by Linzhong Deng. + // http://web.stanford.edu/group/SOL/dissertations/Linzhong-Deng-thesis.pdf + _, n := orig.lu.Dims() + if r, c := x.Dims(); r != n || c != 1 { + panic(ErrShape) + } + if r, c := y.Dims(); r != n || c != 1 { + panic(ErrShape) + } + if orig != lu { + if lu.isZero() { + lu.swaps = useInt(lu.swaps, n) + lu.piv = useInt(lu.piv, n) + if lu.lu == nil { + lu.lu = NewDense(n, n, nil) + } else { + lu.lu.reuseAsNonZeroed(n, n) + } + } else if len(lu.swaps) != n { + panic(ErrShape) + } + copy(lu.swaps, orig.swaps) + lu.updatePivots(lu.swaps) + lu.lu.Copy(orig.lu) + } + + xs := getFloat64s(n, false) + defer putFloat64s(xs) + ys := getFloat64s(n, false) + defer putFloat64s(ys) + for i := 0; i < n; i++ { + xs[i] = x.AtVec(i) + ys[i] = y.AtVec(i) + } + + // Adjust for the pivoting in the LU factorization + for i, v := range lu.swaps { + xs[i], xs[v] = xs[v], xs[i] + } + + lum := lu.lu.mat + omega := alpha + for j := 0; j < n; j++ { + ujj := lum.Data[j*lum.Stride+j] + ys[j] /= ujj + theta := 1 + xs[j]*ys[j]*omega + beta := omega * ys[j] / theta + gamma := omega * xs[j] + omega -= beta * gamma + lum.Data[j*lum.Stride+j] *= theta + for i := j + 1; i < n; i++ { + xs[i] -= lum.Data[i*lum.Stride+j] * xs[j] + tmp := ys[i] + ys[i] -= lum.Data[j*lum.Stride+i] * ys[j] + lum.Data[i*lum.Stride+j] += beta * xs[i] + lum.Data[j*lum.Stride+i] += gamma * tmp + } + } + lu.updateCond(-1, CondNorm) +} + +// LTo extracts the lower triangular matrix from an LU factorization. +// +// If dst is empty, LTo will resize dst to be a lower-triangular n×n matrix. +// When dst is non-empty, LTo will panic if dst is not n×n or not Lower. +// LTo will also panic if the receiver does not contain a successful +// factorization. +func (lu *LU) LTo(dst *TriDense) *TriDense { + if !lu.isValid() { + panic(badLU) + } + + _, n := lu.lu.Dims() + if dst.IsEmpty() { + dst.ReuseAsTri(n, Lower) + } else { + n2, kind := dst.Triangle() + if n != n2 { + panic(ErrShape) + } + if kind != Lower { + panic(ErrTriangle) + } + } + // Extract the lower triangular elements. + for i := 1; i < n; i++ { + copy(dst.mat.Data[i*dst.mat.Stride:i*dst.mat.Stride+i], lu.lu.mat.Data[i*lu.lu.mat.Stride:i*lu.lu.mat.Stride+i]) + } + // Set ones on the diagonal. 
+ for i := 0; i < n; i++ { + dst.mat.Data[i*dst.mat.Stride+i] = 1 + } + return dst +} + +// UTo extracts the upper triangular matrix from an LU factorization. +// +// If dst is empty, UTo will resize dst to be an upper-triangular n×n matrix. +// When dst is non-empty, UTo will panic if dst is not n×n or not Upper. +// UTo will also panic if the receiver does not contain a successful +// factorization. +func (lu *LU) UTo(dst *TriDense) { + if !lu.isValid() { + panic(badLU) + } + + _, n := lu.lu.Dims() + if dst.IsEmpty() { + dst.ReuseAsTri(n, Upper) + } else { + n2, kind := dst.Triangle() + if n != n2 { + panic(ErrShape) + } + if kind != Upper { + panic(ErrTriangle) + } + } + // Extract the upper triangular elements. + for i := 0; i < n; i++ { + copy(dst.mat.Data[i*dst.mat.Stride+i:i*dst.mat.Stride+n], lu.lu.mat.Data[i*lu.lu.mat.Stride+i:i*lu.lu.mat.Stride+n]) + } +} + +// SolveTo solves a system of linear equations +// +// A * X = B if trans == false +// Aᵀ * X = B if trans == true +// +// using the LU factorization of A stored in the receiver. The solution matrix X +// is stored into dst. +// +// If A is singular or near-singular a Condition error is returned. See the +// documentation for Condition for more information. SolveTo will panic if the +// receiver does not contain a factorization. +func (lu *LU) SolveTo(dst *Dense, trans bool, b Matrix) error { + if !lu.isValid() { + panic(badLU) + } + + _, n := lu.lu.Dims() + br, bc := b.Dims() + if br != n { + panic(ErrShape) + } + + if !lu.ok { + return Condition(math.Inf(1)) + } + + dst.reuseAsNonZeroed(n, bc) + bU, _ := untranspose(b) + if dst == bU { + var restore func() + dst, restore = dst.isolatedWorkspace(bU) + defer restore() + } else if rm, ok := bU.(RawMatrixer); ok { + dst.checkOverlap(rm.RawMatrix()) + } + + dst.Copy(b) + t := blas.NoTrans + if trans { + t = blas.Trans + } + lapack64.Getrs(t, lu.lu.mat, dst.mat, lu.swaps) + if lu.cond > ConditionTolerance { + return Condition(lu.cond) + } + return nil +} + +// SolveVecTo solves a system of linear equations +// +// A * x = b if trans == false +// Aᵀ * x = b if trans == true +// +// using the LU factorization of A stored in the receiver. The solution matrix x +// is stored into dst. +// +// If A is singular or near-singular a Condition error is returned. See the +// documentation for Condition for more information. SolveVecTo will panic if the +// receiver does not contain a factorization. 
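A hedged sketch of how the LU pieces above combine in practice (not vendored code; values are arbitrary):

	package main

	import (
		"fmt"

		"gonum.org/v1/gonum/mat"
	)

	func main() {
		a := mat.NewDense(2, 2, []float64{4, 2, 1, 3})

		// Factorize once; the LU can then solve several right-hand
		// sides and also yields Det, LogDet and Cond cheaply.
		var lu mat.LU
		lu.Factorize(a)
		fmt.Println(lu.Det()) // 10: 4*3 - 2*1

		var x mat.VecDense
		b := mat.NewVecDense(2, []float64{1, 2})
		if err := lu.SolveVecTo(&x, false, b); err != nil {
			fmt.Println(err) // Condition error for a near-singular A
			return
		}
		fmt.Println(mat.Formatted(&x))
	}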
+func (lu *LU) SolveVecTo(dst *VecDense, trans bool, b Vector) error { + if !lu.isValid() { + panic(badLU) + } + + _, n := lu.lu.Dims() + if br, bc := b.Dims(); br != n || bc != 1 { + panic(ErrShape) + } + + switch rv := b.(type) { + default: + dst.reuseAsNonZeroed(n) + return lu.SolveTo(dst.asDense(), trans, b) + case RawVectorer: + if dst != b { + dst.checkOverlap(rv.RawVector()) + } + + if !lu.ok { + return Condition(math.Inf(1)) + } + + dst.reuseAsNonZeroed(n) + var restore func() + if dst == b { + dst, restore = dst.isolatedWorkspace(b) + defer restore() + } + dst.CopyVec(b) + vMat := blas64.General{ + Rows: n, + Cols: 1, + Stride: dst.mat.Inc, + Data: dst.mat.Data, + } + t := blas.NoTrans + if trans { + t = blas.Trans + } + lapack64.Getrs(t, lu.lu.mat, vMat, lu.swaps) + if lu.cond > ConditionTolerance { + return Condition(lu.cond) + } + return nil + } +} diff --git a/vendor/gonum.org/v1/gonum/mat/matrix.go b/vendor/gonum.org/v1/gonum/mat/matrix.go new file mode 100644 index 0000000000..2d67bbe081 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/matrix.go @@ -0,0 +1,1000 @@ +// Copyright ©2013 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/floats/scalar" + "gonum.org/v1/gonum/lapack" +) + +// Matrix is the basic matrix interface type. +type Matrix interface { + // Dims returns the dimensions of a Matrix. + Dims() (r, c int) + + // At returns the value of a matrix element at row i, column j. + // It will panic if i or j are out of bounds for the matrix. + At(i, j int) float64 + + // T returns the transpose of the Matrix. Whether T returns a copy of the + // underlying data is implementation dependent. + // This method may be implemented using the Transpose type, which + // provides an implicit matrix transpose. + T() Matrix +} + +// allMatrix represents the extra set of methods that all mat Matrix types +// should satisfy. This is used to enforce compile-time consistency between the +// Dense types, especially helpful when adding new features. +type allMatrix interface { + Reseter + IsEmpty() bool + Zero() +} + +// denseMatrix represents the extra set of methods that all Dense Matrix types +// should satisfy. This is used to enforce compile-time consistency between the +// Dense types, especially helpful when adding new features. +type denseMatrix interface { + DiagView() Diagonal + Tracer + Normer +} + +var ( + _ Matrix = Transpose{} + _ Untransposer = Transpose{} +) + +// Transpose is a type for performing an implicit matrix transpose. It implements +// the Matrix interface, returning values from the transpose of the matrix within. +type Transpose struct { + Matrix Matrix +} + +// At returns the value of the element at row i and column j of the transposed +// matrix, that is, row j and column i of the Matrix field. +func (t Transpose) At(i, j int) float64 { + return t.Matrix.At(j, i) +} + +// Dims returns the dimensions of the transposed matrix. The number of rows returned +// is the number of columns in the Matrix field, and the number of columns is +// the number of rows in the Matrix field. +func (t Transpose) Dims() (r, c int) { + c, r = t.Matrix.Dims() + return r, c +} + +// T performs an implicit transpose by returning the Matrix field. +func (t Transpose) T() Matrix { + return t.Matrix +} + +// Untranspose returns the Matrix field. 
+func (t Transpose) Untranspose() Matrix {
+	return t.Matrix
+}
+
+// Untransposer is a type that can undo an implicit transpose.
+type Untransposer interface {
+	// Note: This interface is needed to unify all of the Transpose types. In
+	// the mat methods, we need to test if the Matrix has been implicitly
+	// transposed. If this is checked by testing for the specific Transpose type
+	// then the behavior will be different if the user uses T() or TTri() for a
+	// triangular matrix.
+
+	// Untranspose returns the underlying Matrix stored for the implicit transpose.
+	Untranspose() Matrix
+}
+
+// UntransposeBander is a type that can undo an implicit band transpose.
+type UntransposeBander interface {
+	// Untranspose returns the underlying Banded stored for the implicit transpose.
+	UntransposeBand() Banded
+}
+
+// UntransposeTrier is a type that can undo an implicit triangular transpose.
+type UntransposeTrier interface {
+	// Untranspose returns the underlying Triangular stored for the implicit transpose.
+	UntransposeTri() Triangular
+}
+
+// UntransposeTriBander is a type that can undo an implicit triangular banded
+// transpose.
+type UntransposeTriBander interface {
+	// Untranspose returns the underlying Triangular stored for the implicit transpose.
+	UntransposeTriBand() TriBanded
+}
+
+// Mutable is a matrix interface type that allows elements to be altered.
+type Mutable interface {
+	// Set alters the matrix element at row i, column j to v.
+	// It will panic if i or j are out of bounds for the matrix.
+	Set(i, j int, v float64)
+
+	Matrix
+}
+
+// A RowViewer can return a Vector reflecting a row that is backed by the matrix
+// data. The Vector returned will have length equal to the number of columns.
+type RowViewer interface {
+	RowView(i int) Vector
+}
+
+// A RawRowViewer can return a slice of float64 reflecting a row that is backed by the matrix
+// data.
+type RawRowViewer interface {
+	RawRowView(i int) []float64
+}
+
+// A ColViewer can return a Vector reflecting a column that is backed by the matrix
+// data. The Vector returned will have length equal to the number of rows.
+type ColViewer interface {
+	ColView(j int) Vector
+}
+
+// A RawColViewer can return a slice of float64 reflecting a column that is backed by the matrix
+// data.
+type RawColViewer interface {
+	RawColView(j int) []float64
+}
+
+// A ClonerFrom can make a copy of a into the receiver, overwriting the previous value of the
+// receiver. The clone operation does not make any restriction on shape and will not cause
+// shadowing.
+type ClonerFrom interface {
+	CloneFrom(a Matrix)
+}
+
+// A Reseter can reset the matrix so that it can be reused as the receiver of a dimensionally
+// restricted operation. This is commonly used when the matrix is being used as a workspace
+// or temporary matrix.
+//
+// If the matrix is a view, using Reset may result in data corruption in elements outside
+// the view. Similarly, if the matrix shares backing data with another variable, using
+// Reset may lead to unexpected changes in data values.
+type Reseter interface {
+	Reset()
+}
+
+// A Copier can make a copy of elements of a into the receiver. The submatrix copied
+// starts at row and column 0 and has dimensions equal to the minimum dimensions of
+// the two matrices. The number of rows and columns copied is returned.
+// Copy will copy from a source that aliases the receiver unless the source is transposed; +// an aliasing transpose copy will panic with the exception for a special case when +// the source data has a unitary increment or stride. +type Copier interface { + Copy(a Matrix) (r, c int) +} + +// A Grower can grow the size of the represented matrix by the given number of rows and columns. +// Growing beyond the size given by the Caps method will result in the allocation of a new +// matrix and copying of the elements. If Grow is called with negative increments it will +// panic with ErrIndexOutOfRange. +type Grower interface { + Caps() (r, c int) + Grow(r, c int) Matrix +} + +// A RawMatrixSetter can set the underlying blas64.General used by the receiver. There is no restriction +// on the shape of the receiver. Changes to the receiver's elements will be reflected in the blas64.General.Data. +type RawMatrixSetter interface { + SetRawMatrix(a blas64.General) +} + +// A RawMatrixer can return a blas64.General representation of the receiver. Changes to the blas64.General.Data +// slice will be reflected in the original matrix, changes to the Rows, Cols and Stride fields will not. +type RawMatrixer interface { + RawMatrix() blas64.General +} + +// A RawVectorer can return a blas64.Vector representation of the receiver. Changes to the blas64.Vector.Data +// slice will be reflected in the original matrix, changes to the Inc field will not. +type RawVectorer interface { + RawVector() blas64.Vector +} + +// A NonZeroDoer can call a function for each non-zero element of the receiver. +// The parameters of the function are the element indices and its value. +type NonZeroDoer interface { + DoNonZero(func(i, j int, v float64)) +} + +// A RowNonZeroDoer can call a function for each non-zero element of a row of the receiver. +// The parameters of the function are the element indices and its value. +type RowNonZeroDoer interface { + DoRowNonZero(i int, fn func(i, j int, v float64)) +} + +// A ColNonZeroDoer can call a function for each non-zero element of a column of the receiver. +// The parameters of the function are the element indices and its value. +type ColNonZeroDoer interface { + DoColNonZero(j int, fn func(i, j int, v float64)) +} + +// A SolveToer can solve a linear system A⋅X = B or Aᵀ⋅X = B where A is a matrix +// represented by the receiver and B is a given matrix, storing the result into +// dst. +// +// If dst is empty, SolveTo will resize it to the correct size, otherwise it +// must have the correct size. Individual implementations may impose other +// restrictions on the input parameters, for example that A is a square matrix. +type SolveToer interface { + SolveTo(dst *Dense, trans bool, b Matrix) error +} + +// untranspose untransposes a matrix if applicable. If a is an Untransposer, then +// untranspose returns the underlying matrix and true. If it is not, then it returns +// the input matrix and false. +func untranspose(a Matrix) (Matrix, bool) { + if ut, ok := a.(Untransposer); ok { + return ut.Untranspose(), true + } + return a, false +} + +// untransposeExtract returns an untransposed matrix in a built-in matrix type. +// +// The untransposed matrix is returned unaltered if it is a built-in matrix type. +// Otherwise, if it implements a Raw method, an appropriate built-in type value +// is returned holding the raw matrix value of the input. If neither of these +// is possible, the untransposed matrix is returned. 
+func untransposeExtract(a Matrix) (Matrix, bool) {
+	ut, trans := untranspose(a)
+	switch m := ut.(type) {
+	case *DiagDense, *SymBandDense, *TriBandDense, *BandDense, *TriDense, *SymDense, *Dense, *VecDense, *Tridiag:
+		return m, trans
+	// TODO(btracey): Add here if we ever have an equivalent of RawDiagDense.
+	case RawSymBander:
+		rsb := m.RawSymBand()
+		if rsb.Uplo != blas.Upper {
+			return ut, trans
+		}
+		var sb SymBandDense
+		sb.SetRawSymBand(rsb)
+		return &sb, trans
+	case RawTriBander:
+		rtb := m.RawTriBand()
+		if rtb.Diag == blas.Unit {
+			return ut, trans
+		}
+		var tb TriBandDense
+		tb.SetRawTriBand(rtb)
+		return &tb, trans
+	case RawBander:
+		var b BandDense
+		b.SetRawBand(m.RawBand())
+		return &b, trans
+	case RawTriangular:
+		rt := m.RawTriangular()
+		if rt.Diag == blas.Unit {
+			return ut, trans
+		}
+		var t TriDense
+		t.SetRawTriangular(rt)
+		return &t, trans
+	case RawSymmetricer:
+		rs := m.RawSymmetric()
+		if rs.Uplo != blas.Upper {
+			return ut, trans
+		}
+		var s SymDense
+		s.SetRawSymmetric(rs)
+		return &s, trans
+	case RawMatrixer:
+		var d Dense
+		d.SetRawMatrix(m.RawMatrix())
+		return &d, trans
+	case RawVectorer:
+		var v VecDense
+		v.SetRawVector(m.RawVector())
+		return &v, trans
+	case RawTridiagonaler:
+		var d Tridiag
+		d.SetRawTridiagonal(m.RawTridiagonal())
+		return &d, trans
+	default:
+		return ut, trans
+	}
+}
+
+// TODO(btracey): Consider adding CopyCol/CopyRow if the behavior seems useful.
+// TODO(btracey): Add in fast paths to Row/Col for the other concrete types
+// (TriDense, etc.) as well as relevant interfaces (RowColer, RawRowViewer, etc.)
+
+// Col copies the elements in the jth column of the matrix into the slice dst.
+// The length of the provided slice must equal the number of rows, unless the
+// slice is nil in which case a new slice is first allocated.
+func Col(dst []float64, j int, a Matrix) []float64 {
+	r, c := a.Dims()
+	if j < 0 || j >= c {
+		panic(ErrColAccess)
+	}
+	if dst == nil {
+		dst = make([]float64, r)
+	} else {
+		if len(dst) != r {
+			panic(ErrColLength)
+		}
+	}
+	aU, aTrans := untranspose(a)
+	if rm, ok := aU.(RawMatrixer); ok {
+		m := rm.RawMatrix()
+		if aTrans {
+			copy(dst, m.Data[j*m.Stride:j*m.Stride+m.Cols])
+			return dst
+		}
+		blas64.Copy(blas64.Vector{N: r, Inc: m.Stride, Data: m.Data[j:]},
+			blas64.Vector{N: r, Inc: 1, Data: dst},
+		)
+		return dst
+	}
+	for i := 0; i < r; i++ {
+		dst[i] = a.At(i, j)
+	}
+	return dst
+}
+
+// Row copies the elements in the ith row of the matrix into the slice dst.
+// The length of the provided slice must equal the number of columns, unless the
+// slice is nil in which case a new slice is first allocated.
+func Row(dst []float64, i int, a Matrix) []float64 {
+	r, c := a.Dims()
+	if i < 0 || i >= r {
+		panic(ErrRowAccess)
+	}
+	if dst == nil {
+		dst = make([]float64, c)
+	} else {
+		if len(dst) != c {
+			panic(ErrRowLength)
+		}
+	}
+	aU, aTrans := untranspose(a)
+	if rm, ok := aU.(RawMatrixer); ok {
+		m := rm.RawMatrix()
+		if aTrans {
+			blas64.Copy(blas64.Vector{N: c, Inc: m.Stride, Data: m.Data[i:]},
+				blas64.Vector{N: c, Inc: 1, Data: dst},
+			)
+			return dst
+		}
+		copy(dst, m.Data[i*m.Stride:i*m.Stride+m.Cols])
+		return dst
+	}
+	for j := 0; j < c; j++ {
+		dst[j] = a.At(i, j)
+	}
+	return dst
+}
+
+// Cond returns the condition number of the given matrix under the given norm.
+// The condition number must be based on the 1-norm, 2-norm or ∞-norm.
+// Cond will panic with ErrZeroLength if the matrix has zero size.
+//
+// BUG(btracey): The computation of the 1-norm and ∞-norm for non-square matrices
+// is inaccurate, although it is typically the right order of magnitude. See
+// https://github.com/xianyi/OpenBLAS/issues/636. While the value returned will
+// change with the resolution of this bug, the result from Cond will match the
+// condition number used internally.
+func Cond(a Matrix, norm float64) float64 {
+	m, n := a.Dims()
+	if m == 0 || n == 0 {
+		panic(ErrZeroLength)
+	}
+	var lnorm lapack.MatrixNorm
+	switch norm {
+	default:
+		panic("mat: bad norm value")
+	case 1:
+		lnorm = lapack.MaxColumnSum
+	case 2:
+		var svd SVD
+		ok := svd.Factorize(a, SVDNone)
+		if !ok {
+			return math.Inf(1)
+		}
+		return svd.Cond()
+	case math.Inf(1):
+		lnorm = lapack.MaxRowSum
+	}
+
+	if m == n {
+		// Use the LU decomposition to compute the condition number.
+		var lu LU
+		lu.factorize(a, lnorm)
+		return lu.Cond()
+	}
+	if m > n {
+		// Use the QR factorization to compute the condition number.
+		var qr QR
+		qr.factorize(a, lnorm)
+		return qr.Cond()
+	}
+	// Use the LQ factorization to compute the condition number.
+	var lq LQ
+	lq.factorize(a, lnorm)
+	return lq.Cond()
+}
+
+// Det returns the determinant of the square matrix a. In many expressions using
+// LogDet will be more numerically stable.
+//
+// Det panics with ErrSquare if a is not square and with ErrZeroLength if a has
+// zero size.
+func Det(a Matrix) float64 {
+	det, sign := LogDet(a)
+	return math.Exp(det) * sign
+}
+
+// Dot returns the sum of the element-wise product of a and b.
+//
+// Dot panics with ErrShape if the vector sizes are unequal and with
+// ErrZeroLength if the sizes are zero.
+func Dot(a, b Vector) float64 {
+	la := a.Len()
+	lb := b.Len()
+	if la != lb {
+		panic(ErrShape)
+	}
+	if la == 0 {
+		panic(ErrZeroLength)
+	}
+	if arv, ok := a.(RawVectorer); ok {
+		if brv, ok := b.(RawVectorer); ok {
+			return blas64.Dot(arv.RawVector(), brv.RawVector())
+		}
+	}
+	var sum float64
+	for i := 0; i < la; i++ {
+		sum += a.At(i, 0) * b.At(i, 0)
+	}
+	return sum
+}
+
+// Equal returns whether the matrices a and b have the same size
+// and are element-wise equal.
+func Equal(a, b Matrix) bool {
+	ar, ac := a.Dims()
+	br, bc := b.Dims()
+	if ar != br || ac != bc {
+		return false
+	}
+	aU, aTrans := untranspose(a)
+	bU, bTrans := untranspose(b)
+	if rma, ok := aU.(RawMatrixer); ok {
+		if rmb, ok := bU.(RawMatrixer); ok {
+			ra := rma.RawMatrix()
+			rb := rmb.RawMatrix()
+			if aTrans == bTrans {
+				for i := 0; i < ra.Rows; i++ {
+					for j := 0; j < ra.Cols; j++ {
+						if ra.Data[i*ra.Stride+j] != rb.Data[i*rb.Stride+j] {
+							return false
+						}
+					}
+				}
+				return true
+			}
+			for i := 0; i < ra.Rows; i++ {
+				for j := 0; j < ra.Cols; j++ {
+					if ra.Data[i*ra.Stride+j] != rb.Data[j*rb.Stride+i] {
+						return false
+					}
+				}
+			}
+			return true
+		}
+	}
+	if rma, ok := aU.(RawSymmetricer); ok {
+		if rmb, ok := bU.(RawSymmetricer); ok {
+			ra := rma.RawSymmetric()
+			rb := rmb.RawSymmetric()
+			// Symmetric matrices are always upper and equal to their transpose.
+			for i := 0; i < ra.N; i++ {
+				for j := i; j < ra.N; j++ {
+					if ra.Data[i*ra.Stride+j] != rb.Data[i*rb.Stride+j] {
+						return false
+					}
+				}
+			}
+			return true
+		}
+	}
+	if ra, ok := aU.(*VecDense); ok {
+		if rb, ok := bU.(*VecDense); ok {
+			// If the raw vectors are the same length they must either both be
+			// transposed or both not transposed (or have length 1).
+			for i := 0; i < ra.mat.N; i++ {
+				if ra.mat.Data[i*ra.mat.Inc] != rb.mat.Data[i*rb.mat.Inc] {
+					return false
+				}
+			}
+			return true
+		}
+	}
+	for i := 0; i < ar; i++ {
+		for j := 0; j < ac; j++ {
+			if a.At(i, j) != b.At(i, j) {
+				return false
+			}
+		}
+	}
+	return true
+}
+
+// EqualApprox returns whether the matrices a and b have the same size and contain all equal
+// elements with tolerance for element-wise equality specified by epsilon. Matrices
+// with non-equal shapes are not equal.
+func EqualApprox(a, b Matrix, epsilon float64) bool {
+	ar, ac := a.Dims()
+	br, bc := b.Dims()
+	if ar != br || ac != bc {
+		return false
+	}
+	aU, aTrans := untranspose(a)
+	bU, bTrans := untranspose(b)
+	if rma, ok := aU.(RawMatrixer); ok {
+		if rmb, ok := bU.(RawMatrixer); ok {
+			ra := rma.RawMatrix()
+			rb := rmb.RawMatrix()
+			if aTrans == bTrans {
+				for i := 0; i < ra.Rows; i++ {
+					for j := 0; j < ra.Cols; j++ {
+						if !scalar.EqualWithinAbsOrRel(ra.Data[i*ra.Stride+j], rb.Data[i*rb.Stride+j], epsilon, epsilon) {
+							return false
+						}
+					}
+				}
+				return true
+			}
+			for i := 0; i < ra.Rows; i++ {
+				for j := 0; j < ra.Cols; j++ {
+					if !scalar.EqualWithinAbsOrRel(ra.Data[i*ra.Stride+j], rb.Data[j*rb.Stride+i], epsilon, epsilon) {
+						return false
+					}
+				}
+			}
+			return true
+		}
+	}
+	if rma, ok := aU.(RawSymmetricer); ok {
+		if rmb, ok := bU.(RawSymmetricer); ok {
+			ra := rma.RawSymmetric()
+			rb := rmb.RawSymmetric()
+			// Symmetric matrices are always upper and equal to their transpose.
+			for i := 0; i < ra.N; i++ {
+				for j := i; j < ra.N; j++ {
+					if !scalar.EqualWithinAbsOrRel(ra.Data[i*ra.Stride+j], rb.Data[i*rb.Stride+j], epsilon, epsilon) {
+						return false
+					}
+				}
+			}
+			return true
+		}
+	}
+	if ra, ok := aU.(*VecDense); ok {
+		if rb, ok := bU.(*VecDense); ok {
+			// If the raw vectors are the same length they must either both be
+			// transposed or both not transposed (or have length 1).
+			for i := 0; i < ra.mat.N; i++ {
+				if !scalar.EqualWithinAbsOrRel(ra.mat.Data[i*ra.mat.Inc], rb.mat.Data[i*rb.mat.Inc], epsilon, epsilon) {
+					return false
+				}
+			}
+			return true
+		}
+	}
+	for i := 0; i < ar; i++ {
+		for j := 0; j < ac; j++ {
+			if !scalar.EqualWithinAbsOrRel(a.At(i, j), b.At(i, j), epsilon, epsilon) {
+				return false
+			}
+		}
+	}
+	return true
+}
+
+// LogDet returns the log of the determinant and the sign of the determinant
+// for the matrix a. Numerical stability in product and
+// division expressions is generally improved by working in log space.
+//
+// LogDet panics with ErrSquare if a is not square and with ErrZeroLength if a
+// has zero size.
+func LogDet(a Matrix) (det float64, sign float64) {
+	// TODO(btracey): Add specialized routines for TriDense, etc.
+	var lu LU
+	lu.Factorize(a)
+	return lu.LogDet()
+}
+
+// Max returns the largest element value of the matrix A.
+//
+// Max will panic with ErrZeroLength if the matrix has zero size.
+func Max(a Matrix) float64 {
+	r, c := a.Dims()
+	if r == 0 || c == 0 {
+		panic(ErrZeroLength)
+	}
+	// Max(A) = Max(Aᵀ)
+	aU, _ := untranspose(a)
+	switch m := aU.(type) {
+	case RawMatrixer:
+		rm := m.RawMatrix()
+		max := math.Inf(-1)
+		for i := 0; i < rm.Rows; i++ {
+			for _, v := range rm.Data[i*rm.Stride : i*rm.Stride+rm.Cols] {
+				if v > max {
+					max = v
+				}
+			}
+		}
+		return max
+	case RawTriangular:
+		rm := m.RawTriangular()
+		// The max of a triangular is at least 0 unless the size is 1.
+		if rm.N == 1 {
+			return rm.Data[0]
+		}
+		max := 0.0
+		if rm.Uplo == blas.Upper {
+			for i := 0; i < rm.N; i++ {
+				for _, v := range rm.Data[i*rm.Stride+i : i*rm.Stride+rm.N] {
+					if v > max {
+						max = v
+					}
+				}
+			}
+			return max
+		}
+		for i := 0; i < rm.N; i++ {
+			for _, v := range rm.Data[i*rm.Stride : i*rm.Stride+i+1] {
+				if v > max {
+					max = v
+				}
+			}
+		}
+		return max
+	case RawSymmetricer:
+		rm := m.RawSymmetric()
+		if rm.Uplo != blas.Upper {
+			panic(badSymTriangle)
+		}
+		max := math.Inf(-1)
+		for i := 0; i < rm.N; i++ {
+			for _, v := range rm.Data[i*rm.Stride+i : i*rm.Stride+rm.N] {
+				if v > max {
+					max = v
+				}
+			}
+		}
+		return max
+	default:
+		r, c := aU.Dims()
+		max := math.Inf(-1)
+		for i := 0; i < r; i++ {
+			for j := 0; j < c; j++ {
+				v := aU.At(i, j)
+				if v > max {
+					max = v
+				}
+			}
+		}
+		return max
+	}
+}
+
+// Min returns the smallest element value of the matrix A.
+//
+// Min will panic with ErrZeroLength if the matrix has zero size.
+func Min(a Matrix) float64 {
+	r, c := a.Dims()
+	if r == 0 || c == 0 {
+		panic(ErrZeroLength)
+	}
+	// Min(A) = Min(Aᵀ)
+	aU, _ := untranspose(a)
+	switch m := aU.(type) {
+	case RawMatrixer:
+		rm := m.RawMatrix()
+		min := math.Inf(1)
+		for i := 0; i < rm.Rows; i++ {
+			for _, v := range rm.Data[i*rm.Stride : i*rm.Stride+rm.Cols] {
+				if v < min {
+					min = v
+				}
+			}
+		}
+		return min
+	case RawTriangular:
+		rm := m.RawTriangular()
+		// The min of a triangular is at most 0 unless the size is 1.
+		if rm.N == 1 {
+			return rm.Data[0]
+		}
+		min := 0.0
+		if rm.Uplo == blas.Upper {
+			for i := 0; i < rm.N; i++ {
+				for _, v := range rm.Data[i*rm.Stride+i : i*rm.Stride+rm.N] {
+					if v < min {
+						min = v
+					}
+				}
+			}
+			return min
+		}
+		for i := 0; i < rm.N; i++ {
+			for _, v := range rm.Data[i*rm.Stride : i*rm.Stride+i+1] {
+				if v < min {
+					min = v
+				}
+			}
+		}
+		return min
+	case RawSymmetricer:
+		rm := m.RawSymmetric()
+		if rm.Uplo != blas.Upper {
+			panic(badSymTriangle)
+		}
+		min := math.Inf(1)
+		for i := 0; i < rm.N; i++ {
+			for _, v := range rm.Data[i*rm.Stride+i : i*rm.Stride+rm.N] {
+				if v < min {
+					min = v
+				}
+			}
+		}
+		return min
+	default:
+		r, c := aU.Dims()
+		min := math.Inf(1)
+		for i := 0; i < r; i++ {
+			for j := 0; j < c; j++ {
+				v := aU.At(i, j)
+				if v < min {
+					min = v
+				}
+			}
+		}
+		return min
+	}
+}
+
+// A Normer can compute a norm of the matrix. Valid norms are:
+//
+// 1 - The maximum absolute column sum
+// 2 - The Frobenius norm, the square root of the sum of the squares of the elements
+// Inf - The maximum absolute row sum
+type Normer interface {
+	Norm(norm float64) float64
+}
+
+// Norm returns the specified norm of the matrix A. Valid norms are:
+//
+// 1 - The maximum absolute column sum
+// 2 - The Frobenius norm, the square root of the sum of the squares of the elements
+// Inf - The maximum absolute row sum
+//
+// If a is a Normer, its Norm method will be used to calculate the norm.
+//
+// Norm will panic with ErrNormOrder if an illegal norm is specified and with
+// ErrZeroLength if the matrix has zero size.
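A quick sketch of the three norms on a concrete matrix (illustrative only, not part of the vendored file):

	package main

	import (
		"fmt"
		"math"

		"gonum.org/v1/gonum/mat"
	)

	func main() {
		a := mat.NewDense(2, 2, []float64{1, -2, -3, 4})

		fmt.Println(mat.Norm(a, 1))           // max column sum: |-2|+|4| = 6
		fmt.Println(mat.Norm(a, 2))           // Frobenius: sqrt(1+4+9+16) ≈ 5.477
		fmt.Println(mat.Norm(a, math.Inf(1))) // max row sum: |-3|+|4| = 7
	}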
+func Norm(a Matrix, norm float64) float64 { + r, c := a.Dims() + if r == 0 || c == 0 { + panic(ErrZeroLength) + } + m, trans := untransposeExtract(a) + if m, ok := m.(Normer); ok { + if trans { + switch norm { + case 1: + norm = math.Inf(1) + case math.Inf(1): + norm = 1 + } + } + return m.Norm(norm) + } + switch norm { + default: + panic(ErrNormOrder) + case 1: + var max float64 + for j := 0; j < c; j++ { + var sum float64 + for i := 0; i < r; i++ { + sum += math.Abs(a.At(i, j)) + } + if sum > max { + max = sum + } + } + return max + case 2: + var sum float64 + for i := 0; i < r; i++ { + for j := 0; j < c; j++ { + v := a.At(i, j) + sum += v * v + } + } + return math.Sqrt(sum) + case math.Inf(1): + var max float64 + for i := 0; i < r; i++ { + var sum float64 + for j := 0; j < c; j++ { + sum += math.Abs(a.At(i, j)) + } + if sum > max { + max = sum + } + } + return max + } +} + +// normLapack converts the float64 norm input in Norm to a lapack.MatrixNorm. +func normLapack(norm float64, aTrans bool) lapack.MatrixNorm { + switch norm { + case 1: + n := lapack.MaxColumnSum + if aTrans { + n = lapack.MaxRowSum + } + return n + case 2: + return lapack.Frobenius + case math.Inf(1): + n := lapack.MaxRowSum + if aTrans { + n = lapack.MaxColumnSum + } + return n + default: + panic(ErrNormOrder) + } +} + +// Sum returns the sum of the elements of the matrix. +// +// Sum will panic with ErrZeroLength if the matrix has zero size. +func Sum(a Matrix) float64 { + r, c := a.Dims() + if r == 0 || c == 0 { + panic(ErrZeroLength) + } + var sum float64 + aU, _ := untranspose(a) + switch rma := aU.(type) { + case RawSymmetricer: + rm := rma.RawSymmetric() + for i := 0; i < rm.N; i++ { + // Diagonals count once while off-diagonals count twice. + sum += rm.Data[i*rm.Stride+i] + var s float64 + for _, v := range rm.Data[i*rm.Stride+i+1 : i*rm.Stride+rm.N] { + s += v + } + sum += 2 * s + } + return sum + case RawTriangular: + rm := rma.RawTriangular() + var startIdx, endIdx int + for i := 0; i < rm.N; i++ { + // Start and end index for this triangle-row. + switch rm.Uplo { + case blas.Upper: + startIdx = i + endIdx = rm.N + case blas.Lower: + startIdx = 0 + endIdx = i + 1 + default: + panic(badTriangle) + } + for _, v := range rm.Data[i*rm.Stride+startIdx : i*rm.Stride+endIdx] { + sum += v + } + } + return sum + case RawMatrixer: + rm := rma.RawMatrix() + for i := 0; i < rm.Rows; i++ { + for _, v := range rm.Data[i*rm.Stride : i*rm.Stride+rm.Cols] { + sum += v + } + } + return sum + case *VecDense: + rm := rma.RawVector() + for i := 0; i < rm.N; i++ { + sum += rm.Data[i*rm.Inc] + } + return sum + default: + r, c := a.Dims() + for i := 0; i < r; i++ { + for j := 0; j < c; j++ { + sum += a.At(i, j) + } + } + return sum + } +} + +// A Tracer can compute the trace of the matrix. Trace must panic with ErrSquare +// if the matrix is not square. +type Tracer interface { + Trace() float64 +} + +// Trace returns the trace of the matrix. If a is a Tracer, its Trace method +// will be used to calculate the matrix trace. +// +// Trace will panic with ErrSquare if the matrix is not square and with +// ErrZeroLength if the matrix has zero size. 
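And similarly for the element aggregations defined above, a minimal sketch (not vendored code):

	package main

	import (
		"fmt"

		"gonum.org/v1/gonum/mat"
	)

	func main() {
		a := mat.NewDense(2, 2, []float64{1, 2, 3, 4})

		fmt.Println(mat.Sum(a))   // 10: every element
		fmt.Println(mat.Trace(a)) // 5: diagonal only, 1 + 4
	}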
+func Trace(a Matrix) float64 { + r, c := a.Dims() + if r == 0 || c == 0 { + panic(ErrZeroLength) + } + m, _ := untransposeExtract(a) + if t, ok := m.(Tracer); ok { + return t.Trace() + } + if r != c { + panic(ErrSquare) + } + var v float64 + for i := 0; i < r; i++ { + v += a.At(i, i) + } + return v +} + +// use returns a float64 slice with l elements, using f if it +// has the necessary capacity, otherwise creating a new slice. +func use(f []float64, l int) []float64 { + if l <= cap(f) { + return f[:l] + } + return make([]float64, l) +} + +// useZeroed returns a float64 slice with l elements, using f if it +// has the necessary capacity, otherwise creating a new slice. The +// elements of the returned slice are guaranteed to be zero. +func useZeroed(f []float64, l int) []float64 { + if l <= cap(f) { + f = f[:l] + zero(f) + return f + } + return make([]float64, l) +} + +// zero zeros the given slice's elements. +func zero(f []float64) { + for i := range f { + f[i] = 0 + } +} + +// useInt returns an int slice with l elements, using i if it +// has the necessary capacity, otherwise creating a new slice. +func useInt(i []int, l int) []int { + if l <= cap(i) { + return i[:l] + } + return make([]int, l) +} diff --git a/vendor/gonum.org/v1/gonum/mat/offset.go b/vendor/gonum.org/v1/gonum/mat/offset.go new file mode 100644 index 0000000000..26c80a4c8f --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/offset.go @@ -0,0 +1,32 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !safe +// +build !safe + +package mat + +import "unsafe" + +// offset returns the number of float64 values b[0] is after a[0]. +func offset(a, b []float64) int { + if &a[0] == &b[0] { + return 0 + } + // This expression must be atomic with respect to GC moves. + // At this stage this is true, because the GC does not + // move. See https://golang.org/issue/12445. + return int(uintptr(unsafe.Pointer(&b[0]))-uintptr(unsafe.Pointer(&a[0]))) / int(unsafe.Sizeof(float64(0))) +} + +// offsetComplex returns the number of complex128 values b[0] is after a[0]. +func offsetComplex(a, b []complex128) int { + if &a[0] == &b[0] { + return 0 + } + // This expression must be atomic with respect to GC moves. + // At this stage this is true, because the GC does not + // move. See https://golang.org/issue/12445. + return int(uintptr(unsafe.Pointer(&b[0]))-uintptr(unsafe.Pointer(&a[0]))) / int(unsafe.Sizeof(complex128(0))) +} diff --git a/vendor/gonum.org/v1/gonum/mat/offset_appengine.go b/vendor/gonum.org/v1/gonum/mat/offset_appengine.go new file mode 100644 index 0000000000..be2ca78cba --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/offset_appengine.go @@ -0,0 +1,40 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build safe +// +build safe + +package mat + +import "reflect" + +var sizeOfFloat64 = int(reflect.TypeOf(float64(0)).Size()) + +// offset returns the number of float64 values b[0] is after a[0]. +func offset(a, b []float64) int { + va0 := reflect.ValueOf(a).Index(0) + vb0 := reflect.ValueOf(b).Index(0) + if va0.Addr() == vb0.Addr() { + return 0 + } + // This expression must be atomic with respect to GC moves. + // At this stage this is true, because the GC does not + // move. See https://golang.org/issue/12445. 
+ return int(vb0.UnsafeAddr()-va0.UnsafeAddr()) / sizeOfFloat64 +} + +var sizeOfComplex128 = int(reflect.TypeOf(complex128(0)).Size()) + +// offsetComplex returns the number of complex128 values b[0] is after a[0]. +func offsetComplex(a, b []complex128) int { + va0 := reflect.ValueOf(a).Index(0) + vb0 := reflect.ValueOf(b).Index(0) + if va0.Addr() == vb0.Addr() { + return 0 + } + // This expression must be atomic with respect to GC moves. + // At this stage this is true, because the GC does not + // move. See https://golang.org/issue/12445. + return int(vb0.UnsafeAddr()-va0.UnsafeAddr()) / sizeOfComplex128 +} diff --git a/vendor/gonum.org/v1/gonum/mat/pool.go b/vendor/gonum.org/v1/gonum/mat/pool.go new file mode 100644 index 0000000000..b9dce1c45b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/pool.go @@ -0,0 +1,260 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "math/bits" + "sync" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/blas/cblas128" +) + +// poolFor returns the ceiling of base 2 log of size. It provides an index +// into a pool array to a sync.Pool that will return values able to hold +// size elements. +func poolFor(size uint) int { + if size == 0 { + return 0 + } + return bits.Len(size - 1) +} + +var ( + // poolDense contains size stratified workspace Dense pools. + // Each poolDense element i returns sized matrices with a data + // slice capped at 1< 2. + if !m.IsEmpty() { + if fr != r { + panic(ErrShape) + } + if _, lc := factors[len(factors)-1].Dims(); lc != c { + panic(ErrShape) + } + } + + dims := make([]int, len(factors)+1) + dims[0] = r + dims[len(dims)-1] = c + pc := fc + for i, f := range factors[1:] { + cr, cc := f.Dims() + dims[i+1] = cr + if pc != cr { + panic(ErrShape) + } + pc = cc + } + + return &multiplier{ + factors: factors, + dims: dims, + table: newTable(len(factors)), + } +} + +// optimize determines an optimal matrix multiply operation order. +func (p *multiplier) optimize() { + if debugProductWalk { + fmt.Printf("chain dims: %v\n", p.dims) + } + const maxInt = int(^uint(0) >> 1) + for f := 1; f < len(p.factors); f++ { + for i := 0; i < len(p.factors)-f; i++ { + j := i + f + p.table.set(i, j, entry{cost: maxInt}) + for k := i; k < j; k++ { + cost := p.table.at(i, k).cost + p.table.at(k+1, j).cost + p.dims[i]*p.dims[k+1]*p.dims[j+1] + if cost < p.table.at(i, j).cost { + p.table.set(i, j, entry{cost: cost, k: k}) + } + } + } + } +} + +// multiply walks the optimal operation tree found by optimize, +// leaving the final result in the stack. It returns the +// product, which may be copied but should be returned to +// the workspace pool. +func (p *multiplier) multiply() *Dense { + result, _ := p.multiplySubchain(0, len(p.factors)-1) + if debugProductWalk { + r, c := result.Dims() + fmt.Printf("\tpop result (%d×%d) cost=%d\n", r, c, p.table.at(0, len(p.factors)-1).cost) + } + return result.(*Dense) +} + +func (p *multiplier) multiplySubchain(i, j int) (m Matrix, intermediate bool) { + if i == j { + return p.factors[i], false + } + + a, aTmp := p.multiplySubchain(i, p.table.at(i, j).k) + b, bTmp := p.multiplySubchain(p.table.at(i, j).k+1, j) + + ar, ac := a.Dims() + br, bc := b.Dims() + if ac != br { + // Panic with a string since this + // is not a user-facing panic. 
+		panic(ErrShape.Error())
+	}
+
+	if debugProductWalk {
+		fmt.Printf("\tpush f[%d] (%d×%d)%s * f[%d] (%d×%d)%s\n",
+			i, ar, ac, result(aTmp), j, br, bc, result(bTmp))
+	}
+
+	r := getDenseWorkspace(ar, bc, false)
+	r.Mul(a, b)
+	if aTmp {
+		putDenseWorkspace(a.(*Dense))
+	}
+	if bTmp {
+		putDenseWorkspace(b.(*Dense))
+	}
+	return r, true
+}
+
+type entry struct {
+	k    int // k is the chain subdivision index.
+	cost int // cost is the cost of the operation.
+}
+
+// table is a row major n×n dynamic programming table.
+type table struct {
+	n       int
+	entries []entry
+}
+
+func newTable(n int) table {
+	return table{n: n, entries: make([]entry, n*n)}
+}
+
+func (t table) at(i, j int) entry     { return t.entries[i*t.n+j] }
+func (t table) set(i, j int, e entry) { t.entries[i*t.n+j] = e }
+
+type result bool
+
+func (r result) String() string {
+	if r {
+		return " (popped result)"
+	}
+	return ""
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/qr.go b/vendor/gonum.org/v1/gonum/mat/qr.go
new file mode 100644
index 0000000000..7f8fec8f6f
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/qr.go
@@ -0,0 +1,349 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+	"gonum.org/v1/gonum/lapack/lapack64"
+)
+
+const badQR = "mat: invalid QR factorization"
+
+// QR is a type for creating and using the QR factorization of a matrix.
+type QR struct {
+	qr   *Dense
+	q    *Dense
+	tau  []float64
+	cond float64
+}
+
+// Dims returns the dimensions of the matrix.
+func (qr *QR) Dims() (r, c int) {
+	if qr.qr == nil {
+		return 0, 0
+	}
+	return qr.qr.Dims()
+}
+
+// At returns the element at row i, column j. At will panic if the receiver
+// does not contain a successful factorization.
+func (qr *QR) At(i, j int) float64 {
+	if !qr.isValid() {
+		panic(badQR)
+	}
+
+	m, n := qr.Dims()
+	if uint(i) >= uint(m) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(n) {
+		panic(ErrColAccess)
+	}
+
+	if qr.q == nil || qr.q.IsEmpty() {
+		// Calculate Qi, Q i-th row
+		qi := getFloat64s(m, true)
+		qr.qRowTo(i, qi)
+
+		// Compute QR(i,j)
+		var val float64
+		for k := 0; k <= j; k++ {
+			val += qi[k] * qr.qr.at(k, j)
+		}
+		putFloat64s(qi)
+		return val
+	}
+
+	var val float64
+	for k := 0; k <= j; k++ {
+		val += qr.q.at(i, k) * qr.qr.at(k, j)
+	}
+	return val
+}
+
+// qRowTo extracts the i-th row of the orthonormal matrix Q from a QR
+// decomposition.
+func (qr *QR) qRowTo(i int, dst []float64) {
+	c := blas64.General{
+		Rows:   1,
+		Cols:   len(dst),
+		Stride: len(dst),
+		Data:   dst,
+	}
+	c.Data[i] = 1 // C is the i-th unit vector
+
+	// Construct Qi from the elementary reflectors: Qi = C * (H(1) H(2) ... H(nTau))
+	work := []float64{0}
+	lapack64.Ormqr(blas.Right, blas.NoTrans, qr.qr.mat, qr.tau, c, work, -1)
+	work = getFloat64s(int(work[0]), false)
+	lapack64.Ormqr(blas.Right, blas.NoTrans, qr.qr.mat, qr.tau, c, work, len(work))
+	putFloat64s(work)
+}
+
+// T performs an implicit transpose by returning the receiver inside a
+// Transpose.
+func (qr *QR) T() Matrix {
+	return Transpose{qr}
+}
+
+func (qr *QR) updateCond(norm lapack.MatrixNorm) {
+	// Since A = Q*R, and Q is orthogonal, we get for the condition number κ
+	// κ(A) := |A| |A^-1| = |Q*R| |(Q*R)^-1| = |R| |R^-1 * Qᵀ|
+	// = |R| |R^-1| = κ(R),
+	// where we used the fact that Q^-1 = Qᵀ. However, this assumes that
+	// the matrix norm is invariant under orthogonal transformations which
+	// is not the case for CondNorm. Hopefully the error is negligible: κ
+	// is only a qualitative measure anyway.
+	n := qr.qr.mat.Cols
+	work := getFloat64s(3*n, false)
+	iwork := getInts(n, false)
+	r := qr.qr.asTriDense(n, blas.NonUnit, blas.Upper)
+	v := lapack64.Trcon(norm, r.mat, work, iwork)
+	putFloat64s(work)
+	putInts(iwork)
+	qr.cond = 1 / v
+}
+
+// Factorize computes the QR factorization of an m×n matrix a where m >= n. The QR
+// factorization always exists even if A is singular.
+//
+// The QR decomposition is a factorization of the matrix A such that A = Q * R.
+// The matrix Q is an orthonormal m×m matrix, and R is an m×n upper triangular matrix.
+// Q and R can be extracted using the QTo and RTo methods.
+func (qr *QR) Factorize(a Matrix) {
+	qr.factorize(a, CondNorm)
+}
+
+func (qr *QR) factorize(a Matrix, norm lapack.MatrixNorm) {
+	m, n := a.Dims()
+	if m < n {
+		panic(ErrShape)
+	}
+	if qr.qr == nil {
+		qr.qr = &Dense{}
+	}
+	qr.qr.CloneFrom(a)
+	work := []float64{0}
+	qr.tau = make([]float64, n)
+	lapack64.Geqrf(qr.qr.mat, qr.tau, work, -1)
+	work = getFloat64s(int(work[0]), false)
+	lapack64.Geqrf(qr.qr.mat, qr.tau, work, len(work))
+	putFloat64s(work)
+	qr.updateCond(norm)
+	if qr.q != nil {
+		qr.q.Reset()
+	}
+}
+
+func (qr *QR) updateQ() {
+	m, _ := qr.Dims()
+	if qr.q == nil {
+		qr.q = NewDense(m, m, nil)
+	} else {
+		qr.q.reuseAsNonZeroed(m, m)
+	}
+	// Construct Q from the elementary reflectors.
+	qr.q.Copy(qr.qr)
+	work := []float64{0}
+	lapack64.Orgqr(qr.q.mat, qr.tau, work, -1)
+	work = getFloat64s(int(work[0]), false)
+	lapack64.Orgqr(qr.q.mat, qr.tau, work, len(work))
+	putFloat64s(work)
+}
+
+// isValid returns whether the receiver contains a factorization.
+func (qr *QR) isValid() bool {
+	return qr.qr != nil && !qr.qr.IsEmpty()
+}
+
+// Cond returns the condition number for the factorized matrix.
+// Cond will panic if the receiver does not contain a factorization.
+func (qr *QR) Cond() float64 {
+	if !qr.isValid() {
+		panic(badQR)
+	}
+	return qr.cond
+}
+
+// TODO(btracey): Add in the "Reduced" forms for extracting the n×n orthogonal
+// and upper triangular matrices.
+
+// RTo extracts the m×n upper trapezoidal matrix from a QR decomposition.
+//
+// If dst is empty, RTo will resize dst to be r×c. When dst is non-empty,
+// RTo will panic if dst is not r×c. RTo will also panic if the receiver
+// does not contain a successful factorization.
+func (qr *QR) RTo(dst *Dense) {
+	if !qr.isValid() {
+		panic(badQR)
+	}
+
+	r, c := qr.qr.Dims()
+	if dst.IsEmpty() {
+		dst.ReuseAs(r, c)
+	} else {
+		r2, c2 := dst.Dims()
+		if r != r2 || c != c2 {
+			panic(ErrShape)
+		}
+	}
+
+	// Disguise the QR as an upper triangular
+	t := &TriDense{
+		mat: blas64.Triangular{
+			N:      c,
+			Stride: qr.qr.mat.Stride,
+			Data:   qr.qr.mat.Data,
+			Uplo:   blas.Upper,
+			Diag:   blas.NonUnit,
+		},
+		cap: qr.qr.capCols,
+	}
+	dst.Copy(t)
+
+	// Zero below the triangular.
+	for i := c; i < r; i++ {
+		zero(dst.mat.Data[i*dst.mat.Stride : i*dst.mat.Stride+c])
+	}
+}
+
+// QTo extracts the r×r orthonormal matrix Q from a QR decomposition.
+//
+// If dst is empty, QTo will resize dst to be r×r. When dst is non-empty,
+// QTo will panic if dst is not r×r. QTo will also panic if the receiver
+// does not contain a successful factorization.
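A round-trip sketch exercising the extraction methods above (illustrative only, not part of the vendored file):

	package main

	import (
		"fmt"

		"gonum.org/v1/gonum/mat"
	)

	func main() {
		a := mat.NewDense(3, 2, []float64{1, 2, 3, 4, 5, 6})

		var qr mat.QR
		qr.Factorize(a)

		var q, r mat.Dense
		qr.QTo(&q) // 3×3 orthonormal Q
		qr.RTo(&r) // 3×2 upper trapezoidal R

		var got mat.Dense
		got.Mul(&q, &r)
		fmt.Println(mat.EqualApprox(a, &got, 1e-12)) // true: A = Q*R
	}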
+func (qr *QR) QTo(dst *Dense) {
+	if !qr.isValid() {
+		panic(badQR)
+	}
+
+	r, _ := qr.qr.Dims()
+	if dst.IsEmpty() {
+		dst.ReuseAs(r, r)
+	} else {
+		r2, c2 := dst.Dims()
+		if r != r2 || r != c2 {
+			panic(ErrShape)
+		}
+	}
+
+	if qr.q == nil || qr.q.IsEmpty() {
+		qr.updateQ()
+	}
+	dst.Copy(qr.q)
+}
+
+// SolveTo finds a minimum-norm solution to a system of linear equations defined
+// by the matrices A and b, where A is an m×n matrix represented in its QR factorized
+// form. If A is singular or near-singular a Condition error is returned.
+// See the documentation for Condition for more information.
+//
+// The minimization problem solved depends on the input parameters.
+//
+// If trans == false, find X such that ||A*X - B||_2 is minimized.
+// If trans == true, find the minimum norm solution of Aᵀ * X = B.
+//
+// The solution matrix, X, is stored in place into dst.
+// SolveTo will panic if the receiver does not contain a factorization.
+func (qr *QR) SolveTo(dst *Dense, trans bool, b Matrix) error {
+	if !qr.isValid() {
+		panic(badQR)
+	}
+
+	r, c := qr.qr.Dims()
+	br, bc := b.Dims()
+
+	// The QR solve algorithm stores the result in-place into the right hand side.
+	// The storage for the answer must be large enough to hold both b and x.
+	// However, this method's receiver must be the size of x. Copy b, and then
+	// copy the result into x at the end.
+	if trans {
+		if c != br {
+			panic(ErrShape)
+		}
+		dst.reuseAsNonZeroed(r, bc)
+	} else {
+		if r != br {
+			panic(ErrShape)
+		}
+		dst.reuseAsNonZeroed(c, bc)
+	}
+	// Do not need to worry about overlap between x and b because w has its own
+	// independent storage.
+	w := getDenseWorkspace(max(r, c), bc, false)
+	w.Copy(b)
+	t := qr.qr.asTriDense(qr.qr.mat.Cols, blas.NonUnit, blas.Upper).mat
+	if trans {
+		ok := lapack64.Trtrs(blas.Trans, t, w.mat)
+		if !ok {
+			return Condition(math.Inf(1))
+		}
+		for i := c; i < r; i++ {
+			zero(w.mat.Data[i*w.mat.Stride : i*w.mat.Stride+bc])
+		}
+		work := []float64{0}
+		lapack64.Ormqr(blas.Left, blas.NoTrans, qr.qr.mat, qr.tau, w.mat, work, -1)
+		work = getFloat64s(int(work[0]), false)
+		lapack64.Ormqr(blas.Left, blas.NoTrans, qr.qr.mat, qr.tau, w.mat, work, len(work))
+		putFloat64s(work)
+	} else {
+		work := []float64{0}
+		lapack64.Ormqr(blas.Left, blas.Trans, qr.qr.mat, qr.tau, w.mat, work, -1)
+		work = getFloat64s(int(work[0]), false)
+		lapack64.Ormqr(blas.Left, blas.Trans, qr.qr.mat, qr.tau, w.mat, work, len(work))
+		putFloat64s(work)
+
+		ok := lapack64.Trtrs(blas.NoTrans, t, w.mat)
+		if !ok {
+			return Condition(math.Inf(1))
+		}
+	}
+	// x was set above to be the correct size for the result.
+	dst.Copy(w)
+	putDenseWorkspace(w)
+	if qr.cond > ConditionTolerance {
+		return Condition(qr.cond)
+	}
+	return nil
+}
+
+// SolveVecTo finds a minimum-norm solution to a system of linear equations,
+//
+// Ax = b.
+//
+// See QR.SolveTo for the full documentation.
+// SolveVecTo will panic if the receiver does not contain a factorization.
+func (qr *QR) SolveVecTo(dst *VecDense, trans bool, b Vector) error {
+	if !qr.isValid() {
+		panic(badQR)
+	}
+
+	r, c := qr.qr.Dims()
+	if _, bc := b.Dims(); bc != 1 {
+		panic(ErrShape)
+	}
+
+	// The Solve implementation is non-trivial, so rather than duplicate the code,
+	// instead recast the VecDenses as Dense and call the matrix code.
+ bm := Matrix(b) + if rv, ok := b.(RawVectorer); ok { + bmat := rv.RawVector() + if dst != b { + dst.checkOverlap(bmat) + } + b := VecDense{mat: bmat} + bm = b.asDense() + } + if trans { + dst.reuseAsNonZeroed(r) + } else { + dst.reuseAsNonZeroed(c) + } + return qr.SolveTo(dst.asDense(), trans, bm) +} diff --git a/vendor/gonum.org/v1/gonum/mat/shadow.go b/vendor/gonum.org/v1/gonum/mat/shadow.go new file mode 100644 index 0000000000..4fc24c3466 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/shadow.go @@ -0,0 +1,243 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import "gonum.org/v1/gonum/blas/blas64" + +// checkOverlap returns false if the receiver does not overlap data elements +// referenced by the parameter and panics otherwise. +// +// checkOverlap methods return a boolean to allow the check call to be added to a +// boolean expression, making use of short-circuit operators. +func checkOverlap(a, b blas64.General) bool { + if cap(a.Data) == 0 || cap(b.Data) == 0 { + return false + } + + off := offset(a.Data[:1], b.Data[:1]) + + if off == 0 { + // At least one element overlaps. + if a.Cols == b.Cols && a.Rows == b.Rows && a.Stride == b.Stride { + panic(regionIdentity) + } + panic(regionOverlap) + } + + if off > 0 && len(a.Data) <= off { + // We know a is completely before b. + return false + } + if off < 0 && len(b.Data) <= -off { + // We know a is completely after b. + return false + } + + if a.Stride != b.Stride && a.Stride != 1 && b.Stride != 1 { + // Too hard, so assume the worst; if either stride + // is one it will be caught in rectanglesOverlap. + panic(mismatchedStrides) + } + + if off < 0 { + off = -off + a.Cols, b.Cols = b.Cols, a.Cols + } + if rectanglesOverlap(off, a.Cols, b.Cols, min(a.Stride, b.Stride)) { + panic(regionOverlap) + } + return false +} + +func (m *Dense) checkOverlap(a blas64.General) bool { + return checkOverlap(m.RawMatrix(), a) +} + +func (m *Dense) checkOverlapMatrix(a Matrix) bool { + if m == a { + return false + } + var amat blas64.General + switch ar := a.(type) { + default: + return false + case RawMatrixer: + amat = ar.RawMatrix() + case RawSymmetricer: + amat = generalFromSymmetric(ar.RawSymmetric()) + case RawSymBander: + amat = generalFromSymmetricBand(ar.RawSymBand()) + case RawTriangular: + amat = generalFromTriangular(ar.RawTriangular()) + case RawVectorer: + r, c := a.Dims() + amat = generalFromVector(ar.RawVector(), r, c) + } + return m.checkOverlap(amat) +} + +func (s *SymDense) checkOverlap(a blas64.General) bool { + return checkOverlap(generalFromSymmetric(s.RawSymmetric()), a) +} + +func (s *SymDense) checkOverlapMatrix(a Matrix) bool { + if s == a { + return false + } + var amat blas64.General + switch ar := a.(type) { + default: + return false + case RawMatrixer: + amat = ar.RawMatrix() + case RawSymmetricer: + amat = generalFromSymmetric(ar.RawSymmetric()) + case RawSymBander: + amat = generalFromSymmetricBand(ar.RawSymBand()) + case RawTriangular: + amat = generalFromTriangular(ar.RawTriangular()) + case RawVectorer: + r, c := a.Dims() + amat = generalFromVector(ar.RawVector(), r, c) + } + return s.checkOverlap(amat) +} + +// generalFromSymmetric returns a blas64.General with the backing +// data and dimensions of a. 
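The generalFrom* helpers below all share one idea: re-describe the same backing slice under a different shape so that the overlap checks can compare raw Data regions. A hedged sketch of that aliasing (not vendored code, arbitrary values):

	package main

	import (
		"fmt"

		"gonum.org/v1/gonum/blas"
		"gonum.org/v1/gonum/blas/blas64"
	)

	func main() {
		s := blas64.Symmetric{
			N:      2,
			Stride: 2,
			Uplo:   blas.Upper,
			Data:   []float64{1, 2, 0, 3},
		}
		// The same backing slice viewed as a general matrix; no copy
		// is made, which is what lets checkOverlap reason about it.
		g := blas64.General{Rows: s.N, Cols: s.N, Stride: s.Stride, Data: s.Data}

		g.Data[1] = 9
		fmt.Println(s.Data[1]) // 9: writes through g are visible through s
	}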
+func generalFromSymmetric(a blas64.Symmetric) blas64.General {
+	return blas64.General{
+		Rows:   a.N,
+		Cols:   a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+}
+
+func (t *TriDense) checkOverlap(a blas64.General) bool {
+	return checkOverlap(generalFromTriangular(t.RawTriangular()), a)
+}
+
+func (t *TriDense) checkOverlapMatrix(a Matrix) bool {
+	if t == a {
+		return false
+	}
+	var amat blas64.General
+	switch ar := a.(type) {
+	default:
+		return false
+	case RawMatrixer:
+		amat = ar.RawMatrix()
+	case RawSymmetricer:
+		amat = generalFromSymmetric(ar.RawSymmetric())
+	case RawSymBander:
+		amat = generalFromSymmetricBand(ar.RawSymBand())
+	case RawTriangular:
+		amat = generalFromTriangular(ar.RawTriangular())
+	case RawVectorer:
+		r, c := a.Dims()
+		amat = generalFromVector(ar.RawVector(), r, c)
+	}
+	return t.checkOverlap(amat)
+}
+
+// generalFromTriangular returns a blas64.General with the backing
+// data and dimensions of a.
+func generalFromTriangular(a blas64.Triangular) blas64.General {
+	return blas64.General{
+		Rows:   a.N,
+		Cols:   a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+}
+
+func (v *VecDense) checkOverlap(a blas64.Vector) bool {
+	mat := v.mat
+	if cap(mat.Data) == 0 || cap(a.Data) == 0 {
+		return false
+	}
+
+	off := offset(mat.Data[:1], a.Data[:1])
+
+	if off == 0 {
+		// At least one element overlaps.
+		if mat.Inc == a.Inc && len(mat.Data) == len(a.Data) {
+			panic(regionIdentity)
+		}
+		panic(regionOverlap)
+	}
+
+	if off > 0 && len(mat.Data) <= off {
+		// We know v is completely before a.
+		return false
+	}
+	if off < 0 && len(a.Data) <= -off {
+		// We know v is completely after a.
+		return false
+	}
+
+	if mat.Inc != a.Inc && mat.Inc != 1 && a.Inc != 1 {
+		// Too hard, so assume the worst; if either
+		// increment is one it will be caught below.
+		panic(mismatchedStrides)
+	}
+	inc := min(mat.Inc, a.Inc)
+
+	if inc == 1 || off%inc == 0 {
+		panic(regionOverlap)
+	}
+	return false
+}
+
+// generalFromVector returns a blas64.General with the backing
+// data and dimensions of a.
+func generalFromVector(a blas64.Vector, r, c int) blas64.General {
+	return blas64.General{
+		Rows:   r,
+		Cols:   c,
+		Stride: a.Inc,
+		Data:   a.Data,
+	}
+}
+
+func (s *SymBandDense) checkOverlap(a blas64.General) bool {
+	return checkOverlap(generalFromSymmetricBand(s.RawSymBand()), a)
+}
+
+//lint:ignore U1000 This will be used when we do shadow checks for banded matrices.
+func (s *SymBandDense) checkOverlapMatrix(a Matrix) bool {
+	if s == a {
+		return false
+	}
+	var amat blas64.General
+	switch ar := a.(type) {
+	default:
+		return false
+	case RawMatrixer:
+		amat = ar.RawMatrix()
+	case RawSymmetricer:
+		amat = generalFromSymmetric(ar.RawSymmetric())
+	case RawSymBander:
+		amat = generalFromSymmetricBand(ar.RawSymBand())
+	case RawTriangular:
+		amat = generalFromTriangular(ar.RawTriangular())
+	case RawVectorer:
+		r, c := a.Dims()
+		amat = generalFromVector(ar.RawVector(), r, c)
+	}
+	return s.checkOverlap(amat)
+}
+
+// generalFromSymmetricBand returns a blas64.General with the backing
+// data and dimensions of a.
+func generalFromSymmetricBand(a blas64.SymmetricBand) blas64.General {
+	return blas64.General{
+		Rows:   a.N,
+		Cols:   a.K + 1,
+		Data:   a.Data,
+		Stride: a.Stride,
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/shadow_common.go b/vendor/gonum.org/v1/gonum/mat/shadow_common.go
new file mode 100644
index 0000000000..e4cdf4ddee
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/shadow_common.go
@@ -0,0 +1,54 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+const (
+	// regionOverlap is the panic string used for the general case
+	// of a matrix region overlap between a source and destination.
+	regionOverlap = "mat: bad region: overlap"
+
+	// regionIdentity is the panic string used for the specific
+	// case of complete agreement between a source and a destination.
+	regionIdentity = "mat: bad region: identical"
+
+	// mismatchedStrides is the panic string used for overlapping
+	// data slices with differing strides.
+	mismatchedStrides = "mat: bad region: different strides"
+)
+
+// rectanglesOverlap returns whether the strided rectangles a and b overlap
+// when b is offset by off elements after a but has at least one element before
+// the end of a. off must be positive. a and b have aCols and bCols columns
+// respectively.
+//
+// rectanglesOverlap works by shifting both matrices left such that the left
+// column of a is at 0. The column indexes are flattened by obtaining the shifted
+// relative left and right column positions modulo the common stride. This allows
+// direct comparison of the column offsets when the matrix backing data slices
+// are known to overlap.
+func rectanglesOverlap(off, aCols, bCols, stride int) bool {
+	if stride == 1 {
+		// Unit stride means overlapping data
+		// slices must overlap as matrices.
+		return true
+	}
+
+	// Flatten the shifted matrix column positions
+	// so a starts at 0, modulo the common stride.
+	aTo := aCols
+	// The mod stride operations here make the from
+	// and to indexes comparable between a and b when
+	// the data slices of a and b overlap.
+	bFrom := off % stride
+	bTo := (bFrom + bCols) % stride
+
+	if bTo == 0 || bFrom < bTo {
+		// b matrix is not wrapped: compare for
+		// simple overlap.
+		return bFrom < aTo
+	}
+
+	// b strictly wraps and so must overlap with a.
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/shadow_complex.go b/vendor/gonum.org/v1/gonum/mat/shadow_complex.go
new file mode 100644
index 0000000000..1a3f3fc231
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/shadow_complex.go
@@ -0,0 +1,72 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// TODO(kortschak): Generate this file from shadow.go when all complex types are available.
+
+package mat
+
+import "gonum.org/v1/gonum/blas/cblas128"
+
+// checkOverlapComplex returns false if the receiver does not overlap data elements
+// referenced by the parameter and panics otherwise.
+//
+// checkOverlapComplex methods return a boolean to allow the check call to be added to a
+// boolean expression, making use of short-circuit operators.
+func checkOverlapComplex(a, b cblas128.General) bool {
+	if cap(a.Data) == 0 || cap(b.Data) == 0 {
+		return false
+	}
+
+	off := offsetComplex(a.Data[:1], b.Data[:1])
+
+	if off == 0 {
+		// At least one element overlaps.
+		if a.Cols == b.Cols && a.Rows == b.Rows && a.Stride == b.Stride {
+			panic(regionIdentity)
+		}
+		panic(regionOverlap)
+	}
+
+	if off > 0 && len(a.Data) <= off {
+		// We know a is completely before b.
+		return false
+	}
+	if off < 0 && len(b.Data) <= -off {
+		// We know a is completely after b.
+		return false
+	}
+
+	if a.Stride != b.Stride && a.Stride != 1 && b.Stride != 1 {
+		// Too hard, so assume the worst; if either stride
+		// is one it will be caught in rectanglesOverlap.
+ panic(mismatchedStrides) + } + + if off < 0 { + off = -off + a.Cols, b.Cols = b.Cols, a.Cols + } + if rectanglesOverlap(off, a.Cols, b.Cols, min(a.Stride, b.Stride)) { + panic(regionOverlap) + } + return false +} + +func (m *CDense) checkOverlap(a cblas128.General) bool { + return checkOverlapComplex(m.RawCMatrix(), a) +} + +func (m *CDense) checkOverlapMatrix(a CMatrix) bool { + if m == a { + return false + } + var amat cblas128.General + switch ar := a.(type) { + default: + return false + case RawCMatrixer: + amat = ar.RawCMatrix() + } + return m.checkOverlap(amat) +} diff --git a/vendor/gonum.org/v1/gonum/mat/solve.go b/vendor/gonum.org/v1/gonum/mat/solve.go new file mode 100644 index 0000000000..ffccce8c45 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/solve.go @@ -0,0 +1,124 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +// Solve solves the linear least squares problem +// +// minimize over x |b - A*x|_2 +// +// where A is an m×n matrix, b is a given m element vector and x is n element +// solution vector. Solve assumes that A has full rank, that is +// +// rank(A) = min(m,n) +// +// If m >= n, Solve finds the unique least squares solution of an overdetermined +// system. +// +// If m < n, there is an infinite number of solutions that satisfy b-A*x=0. In +// this case Solve finds the unique solution of an underdetermined system that +// minimizes |x|_2. +// +// Several right-hand side vectors b and solution vectors x can be handled in a +// single call. Vectors b are stored in the columns of the m×k matrix B. Vectors +// x will be stored in-place into the n×k receiver. +// +// If the underlying matrix of a is a SolveToer, its SolveTo method is used, +// otherwise a Dense copy of a will be used for the solution. +// +// If A does not have full rank, a Condition error is returned. See the +// documentation for Condition for more information. +func (m *Dense) Solve(a, b Matrix) error { + aU, aTrans := untransposeExtract(a) + if a, ok := aU.(SolveToer); ok { + return a.SolveTo(m, aTrans, b) + } + + ar, ac := a.Dims() + br, bc := b.Dims() + if ar != br { + panic(ErrShape) + } + m.reuseAsNonZeroed(ac, bc) + + switch { + case ar == ac: + if a == b { + // x = I. + if ar == 1 { + m.mat.Data[0] = 1 + return nil + } + for i := 0; i < ar; i++ { + v := m.mat.Data[i*m.mat.Stride : i*m.mat.Stride+ac] + zero(v) + v[i] = 1 + } + return nil + } + var lu LU + lu.Factorize(a) + return lu.SolveTo(m, false, b) + case ar > ac: + var qr QR + qr.Factorize(a) + return qr.SolveTo(m, false, b) + default: + var lq LQ + lq.Factorize(a) + return lq.SolveTo(m, false, b) + } +} + +// SolveVec solves the linear least squares problem +// +// minimize over x |b - A*x|_2 +// +// where A is an m×n matrix, b is a given m element vector and x is n element +// solution vector. Solve assumes that A has full rank, that is +// +// rank(A) = min(m,n) +// +// If m >= n, Solve finds the unique least squares solution of an overdetermined +// system. +// +// If m < n, there is an infinite number of solutions that satisfy b-A*x=0. In +// this case Solve finds the unique solution of an underdetermined system that +// minimizes |x|_2. +// +// The solution vector x will be stored in-place into the receiver. +// +// If A does not have full rank, a Condition error is returned. See the +// documentation for Condition for more information. 
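+//
+// A minimal usage sketch (values illustrative only): the least squares
+// solution of an overdetermined 3×2 system.
+//
+//	a := NewDense(3, 2, []float64{0, 1, 1, 1, 2, 1})
+//	b := NewVecDense(3, []float64{1, 2, 2})
+//	var x VecDense
+//	if err := x.SolveVec(a, b); err != nil {
+//		// A Condition error reports an ill-conditioned system.
+//	}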
+func (v *VecDense) SolveVec(a Matrix, b Vector) error { + if _, bc := b.Dims(); bc != 1 { + panic(ErrShape) + } + _, c := a.Dims() + + // The Solve implementation is non-trivial, so rather than duplicate the code, + // instead recast the VecDenses as Dense and call the matrix code. + + if rv, ok := b.(RawVectorer); ok { + bmat := rv.RawVector() + if v != b { + v.checkOverlap(bmat) + } + v.reuseAsNonZeroed(c) + m := v.asDense() + // We conditionally create bm as m when b and v are identical + // to prevent the overlap detection code from identifying m + // and bm as overlapping but not identical. + bm := m + if v != b { + b := VecDense{mat: bmat} + bm = b.asDense() + } + return m.Solve(a, bm) + } + + v.reuseAsNonZeroed(c) + m := v.asDense() + return m.Solve(a, b) +} diff --git a/vendor/gonum.org/v1/gonum/mat/svd.go b/vendor/gonum.org/v1/gonum/mat/svd.go new file mode 100644 index 0000000000..5244d9f67d --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/svd.go @@ -0,0 +1,425 @@ +// Copyright ©2013 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" + "gonum.org/v1/gonum/lapack/lapack64" +) + +const badRcond = "mat: invalid rcond value" + +// SVD is a type for creating and using the Singular Value Decomposition +// of a matrix. +type SVD struct { + kind SVDKind + + s []float64 + u blas64.General + vt blas64.General +} + +// SVDKind specifies the treatment of singular vectors during an SVD +// factorization. +type SVDKind int + +const ( + // SVDNone specifies that no singular vectors should be computed during + // the decomposition. + SVDNone SVDKind = 0 + + // SVDThinU specifies the thin decomposition for U should be computed. + SVDThinU SVDKind = 1 << (iota - 1) + // SVDFullU specifies the full decomposition for U should be computed. + SVDFullU + // SVDThinV specifies the thin decomposition for V should be computed. + SVDThinV + // SVDFullV specifies the full decomposition for V should be computed. + SVDFullV + + // SVDThin is a convenience value for computing both thin vectors. + SVDThin SVDKind = SVDThinU | SVDThinV + // SVDFull is a convenience value for computing both full vectors. + SVDFull SVDKind = SVDFullU | SVDFullV +) + +// succFact returns whether the receiver contains a successful factorization. +func (svd *SVD) succFact() bool { + return len(svd.s) != 0 +} + +// Factorize computes the singular value decomposition (SVD) of the input matrix A. +// The singular values of A are computed in all cases, while the singular +// vectors are optionally computed depending on the input kind. +// +// The full singular value decomposition (kind == SVDFull) is a factorization +// of an m×n matrix A of the form +// +// A = U * Σ * Vᵀ +// +// where Σ is an m×n diagonal matrix, U is an m×m orthogonal matrix, and V is an +// n×n orthogonal matrix. The diagonal elements of Σ are the singular values of A. +// The first min(m,n) columns of U and V are, respectively, the left and right +// singular vectors of A. +// +// Significant storage space can be saved by using the thin representation of +// the SVD (kind == SVDThin) instead of the full SVD, especially if +// m >> n or m << n. The thin SVD finds +// +// A = U~ * Σ * V~ᵀ +// +// where U~ is of size m×min(m,n), Σ is a diagonal matrix of size min(m,n)×min(m,n) +// and V~ is of size n×min(m,n). +// +// Factorize returns whether the decomposition succeeded. 
If the decomposition +// failed, routines that require a successful factorization will panic. +func (svd *SVD) Factorize(a Matrix, kind SVDKind) (ok bool) { + // kill previous factorization + svd.s = svd.s[:0] + svd.kind = kind + + m, n := a.Dims() + var jobU, jobVT lapack.SVDJob + + // TODO(btracey): This code should be modified to have the smaller + // matrix written in-place into aCopy when the lapack/native/dgesvd + // implementation is complete. + switch { + case kind&SVDFullU != 0: + jobU = lapack.SVDAll + svd.u = blas64.General{ + Rows: m, + Cols: m, + Stride: m, + Data: use(svd.u.Data, m*m), + } + case kind&SVDThinU != 0: + jobU = lapack.SVDStore + svd.u = blas64.General{ + Rows: m, + Cols: min(m, n), + Stride: min(m, n), + Data: use(svd.u.Data, m*min(m, n)), + } + default: + jobU = lapack.SVDNone + } + switch { + case kind&SVDFullV != 0: + svd.vt = blas64.General{ + Rows: n, + Cols: n, + Stride: n, + Data: use(svd.vt.Data, n*n), + } + jobVT = lapack.SVDAll + case kind&SVDThinV != 0: + svd.vt = blas64.General{ + Rows: min(m, n), + Cols: n, + Stride: n, + Data: use(svd.vt.Data, min(m, n)*n), + } + jobVT = lapack.SVDStore + default: + jobVT = lapack.SVDNone + } + + // A is destroyed on call, so copy the matrix. + aCopy := DenseCopyOf(a) + svd.kind = kind + svd.s = use(svd.s, min(m, n)) + + work := []float64{0} + lapack64.Gesvd(jobU, jobVT, aCopy.mat, svd.u, svd.vt, svd.s, work, -1) + work = getFloat64s(int(work[0]), false) + ok = lapack64.Gesvd(jobU, jobVT, aCopy.mat, svd.u, svd.vt, svd.s, work, len(work)) + putFloat64s(work) + if !ok { + svd.kind = 0 + } + return ok +} + +// Kind returns the SVDKind of the decomposition. If no decomposition has been +// computed, Kind returns -1. +func (svd *SVD) Kind() SVDKind { + if !svd.succFact() { + return -1 + } + return svd.kind +} + +// Rank returns the rank of A based on the count of singular values greater than +// rcond scaled by the largest singular value. +// Rank will panic if the receiver does not contain a successful factorization or +// rcond is negative. +func (svd *SVD) Rank(rcond float64) int { + if rcond < 0 { + panic(badRcond) + } + if !svd.succFact() { + panic(badFact) + } + s0 := svd.s[0] + for i, v := range svd.s { + if v <= rcond*s0 { + return i + } + } + return len(svd.s) +} + +// Cond returns the 2-norm condition number for the factorized matrix. Cond will +// panic if the receiver does not contain a successful factorization. +func (svd *SVD) Cond() float64 { + if !svd.succFact() { + panic(badFact) + } + return svd.s[0] / svd.s[len(svd.s)-1] +} + +// Values returns the singular values of the factorized matrix in descending order. +// +// If the input slice is non-nil, the values will be stored in-place into +// the slice. In this case, the slice must have length min(m,n), and Values will +// panic with ErrSliceLengthMismatch otherwise. If the input slice is nil, a new +// slice of the appropriate length will be allocated and returned. +// +// Values will panic if the receiver does not contain a successful factorization. +func (svd *SVD) Values(s []float64) []float64 { + if !svd.succFact() { + panic(badFact) + } + if s == nil { + s = make([]float64, len(svd.s)) + } + if len(s) != len(svd.s) { + panic(ErrSliceLengthMismatch) + } + copy(s, svd.s) + return s +} + +// UTo extracts the matrix U from the singular value decomposition. The first +// min(m,n) columns are the left singular vectors and correspond to the singular +// values as returned from SVD.Values. 
+// +// If dst is empty, UTo will resize dst to be m×m if the full U was computed +// and size m×min(m,n) if the thin U was computed. When dst is non-empty, then +// UTo will panic if dst is not the appropriate size. UTo will also panic if +// the receiver does not contain a successful factorization, or if U was +// not computed during factorization. +func (svd *SVD) UTo(dst *Dense) { + if !svd.succFact() { + panic(badFact) + } + kind := svd.kind + if kind&SVDThinU == 0 && kind&SVDFullU == 0 { + panic("svd: u not computed during factorization") + } + r := svd.u.Rows + c := svd.u.Cols + if dst.IsEmpty() { + dst.ReuseAs(r, c) + } else { + r2, c2 := dst.Dims() + if r != r2 || c != c2 { + panic(ErrShape) + } + } + + tmp := &Dense{ + mat: svd.u, + capRows: r, + capCols: c, + } + dst.Copy(tmp) +} + +// VTo extracts the matrix V from the singular value decomposition. The first +// min(m,n) columns are the right singular vectors and correspond to the singular +// values as returned from SVD.Values. +// +// If dst is empty, VTo will resize dst to be n×n if the full V was computed +// and size n×min(m,n) if the thin V was computed. When dst is non-empty, then +// VTo will panic if dst is not the appropriate size. VTo will also panic if +// the receiver does not contain a successful factorization, or if V was +// not computed during factorization. +func (svd *SVD) VTo(dst *Dense) { + if !svd.succFact() { + panic(badFact) + } + kind := svd.kind + if kind&SVDThinV == 0 && kind&SVDFullV == 0 { + panic("svd: v not computed during factorization") + } + r := svd.vt.Rows + c := svd.vt.Cols + if dst.IsEmpty() { + dst.ReuseAs(c, r) + } else { + r2, c2 := dst.Dims() + if c != r2 || r != c2 { + panic(ErrShape) + } + } + + tmp := &Dense{ + mat: svd.vt, + capRows: r, + capCols: c, + } + dst.Copy(tmp.T()) +} + +// SolveTo calculates the minimum-norm solution to a linear least squares problem +// +// minimize over n-element vectors x: |b - A*x|_2 and |x|_2 +// +// where b is a given m-element vector, using the SVD of m×n matrix A stored in +// the receiver. A may be rank-deficient, that is, the given effective rank can be +// +// rank ≤ min(m,n) +// +// The rank can be computed using SVD.Rank. +// +// Several right-hand side vectors b and solution vectors x can be handled in a +// single call. Vectors b are stored in the columns of the m×k matrix B and the +// resulting vectors x will be stored in the columns of dst. dst must be either +// empty or have the size equal to n×k. +// +// The decomposition must have been factorized computing both the U and V +// singular vectors. +// +// SolveTo returns the residuals calculated from the complete SVD. For this +// value to be valid the factorization must have been performed with at least +// SVDFullU. 
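+//
+// A usage sketch, assuming a and b are suitably sized matrices defined
+// elsewhere; SVDFullU is requested so that the returned residuals are valid:
+//
+//	var svd SVD
+//	if !svd.Factorize(a, SVDFullU|SVDThinV) {
+//		// handle the failed factorization
+//	}
+//	var x Dense
+//	res := svd.SolveTo(&x, b, svd.Rank(1e-15))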
+func (svd *SVD) SolveTo(dst *Dense, b Matrix, rank int) []float64 { + if !svd.succFact() { + panic(badFact) + } + if rank < 1 || len(svd.s) < rank { + panic("svd: rank out of range") + } + kind := svd.kind + if kind&SVDThinU == 0 && kind&SVDFullU == 0 { + panic("svd: u not computed during factorization") + } + if kind&SVDThinV == 0 && kind&SVDFullV == 0 { + panic("svd: v not computed during factorization") + } + + u := Dense{ + mat: svd.u, + capRows: svd.u.Rows, + capCols: svd.u.Cols, + } + vt := Dense{ + mat: svd.vt, + capRows: svd.vt.Rows, + capCols: svd.vt.Cols, + } + s := svd.s[:rank] + + _, bc := b.Dims() + c := getDenseWorkspace(svd.u.Cols, bc, false) + defer putDenseWorkspace(c) + c.Mul(u.T(), b) + + y := getDenseWorkspace(rank, bc, false) + defer putDenseWorkspace(y) + y.DivElem(c.slice(0, rank, 0, bc), repVector{vec: s, cols: bc}) + dst.Mul(vt.slice(0, rank, 0, svd.vt.Cols).T(), y) + + res := make([]float64, bc) + if rank < svd.u.Cols { + c = c.slice(len(s), svd.u.Cols, 0, bc) + for j := range res { + col := c.ColView(j) + res[j] = Dot(col, col) + } + } + return res +} + +type repVector struct { + vec []float64 + cols int +} + +func (m repVector) Dims() (r, c int) { return len(m.vec), m.cols } +func (m repVector) At(i, j int) float64 { + if i < 0 || len(m.vec) <= i || j < 0 || m.cols <= j { + panic(ErrIndexOutOfRange.string) // Panic with string to prevent mat.Error recovery. + } + return m.vec[i] +} +func (m repVector) T() Matrix { return Transpose{m} } + +// SolveVecTo calculates the minimum-norm solution to a linear least squares problem +// +// minimize over n-element vectors x: |b - A*x|_2 and |x|_2 +// +// where b is a given m-element vector, using the SVD of m×n matrix A stored in +// the receiver. A may be rank-deficient, that is, the given effective rank can be +// +// rank ≤ min(m,n) +// +// The rank can be computed using SVD.Rank. +// +// The resulting vector x will be stored in dst. dst must be either empty or +// have length equal to n. +// +// The decomposition must have been factorized computing both the U and V +// singular vectors. +// +// SolveVecTo returns the residuals calculated from the complete SVD. For this +// value to be valid the factorization must have been performed with at least +// SVDFullU. +func (svd *SVD) SolveVecTo(dst *VecDense, b Vector, rank int) float64 { + if !svd.succFact() { + panic(badFact) + } + if rank < 1 || len(svd.s) < rank { + panic("svd: rank out of range") + } + kind := svd.kind + if kind&SVDThinU == 0 && kind&SVDFullU == 0 { + panic("svd: u not computed during factorization") + } + if kind&SVDThinV == 0 && kind&SVDFullV == 0 { + panic("svd: v not computed during factorization") + } + + u := Dense{ + mat: svd.u, + capRows: svd.u.Rows, + capCols: svd.u.Cols, + } + vt := Dense{ + mat: svd.vt, + capRows: svd.vt.Rows, + capCols: svd.vt.Cols, + } + s := svd.s[:rank] + + c := getVecDenseWorkspace(svd.u.Cols, false) + defer putVecDenseWorkspace(c) + c.MulVec(u.T(), b) + + y := getVecDenseWorkspace(rank, false) + defer putVecDenseWorkspace(y) + y.DivElemVec(c.sliceVec(0, rank), NewVecDense(rank, s)) + dst.MulVec(vt.slice(0, rank, 0, svd.vt.Cols).T(), y) + + var res float64 + if rank < c.Len() { + c = c.sliceVec(rank, c.Len()) + res = Dot(c, c) + } + return res +} diff --git a/vendor/gonum.org/v1/gonum/mat/symband.go b/vendor/gonum.org/v1/gonum/mat/symband.go new file mode 100644 index 0000000000..63638ea912 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/symband.go @@ -0,0 +1,312 @@ +// Copyright ©2017 The Gonum Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" + "gonum.org/v1/gonum/lapack/lapack64" +) + +var ( + symBandDense *SymBandDense + _ Matrix = symBandDense + _ allMatrix = symBandDense + _ denseMatrix = symBandDense + _ Symmetric = symBandDense + _ Banded = symBandDense + _ SymBanded = symBandDense + _ RawSymBander = symBandDense + _ MutableSymBanded = symBandDense + + _ NonZeroDoer = symBandDense + _ RowNonZeroDoer = symBandDense + _ ColNonZeroDoer = symBandDense +) + +// SymBandDense represents a symmetric band matrix in dense storage format. +type SymBandDense struct { + mat blas64.SymmetricBand +} + +// SymBanded is a symmetric band matrix interface type. +type SymBanded interface { + Banded + + // SymmetricDim returns the number of rows/columns in the matrix. + SymmetricDim() int + + // SymBand returns the number of rows/columns in the matrix, and the size of + // the bandwidth. + SymBand() (n, k int) +} + +// MutableSymBanded is a symmetric band matrix interface type that allows elements +// to be altered. +type MutableSymBanded interface { + SymBanded + SetSymBand(i, j int, v float64) +} + +// A RawSymBander can return a blas64.SymmetricBand representation of the receiver. +// Changes to the blas64.SymmetricBand.Data slice will be reflected in the original +// matrix, changes to the N, K, Stride and Uplo fields will not. +type RawSymBander interface { + RawSymBand() blas64.SymmetricBand +} + +// NewSymBandDense creates a new SymBand matrix with n rows and columns. If data == nil, +// a new slice is allocated for the backing slice. If len(data) == n*(k+1), +// data is used as the backing slice, and changes to the elements of the returned +// SymBandDense will be reflected in data. If neither of these is true, NewSymBandDense +// will panic. k must be at least zero and less than n, otherwise NewSymBandDense will panic. +// +// The data must be arranged in row-major order constructed by removing the zeros +// from the rows outside the band and aligning the diagonals. SymBandDense matrices +// are stored in the upper triangle. For example, the matrix +// +// 1 2 3 0 0 0 +// 2 4 5 6 0 0 +// 3 5 7 8 9 0 +// 0 6 8 10 11 12 +// 0 0 9 11 13 14 +// 0 0 0 12 14 15 +// +// becomes (* entries are never accessed) +// +// 1 2 3 +// 4 5 6 +// 7 8 9 +// 10 11 12 +// 13 14 * +// 15 * * +// +// which is passed to NewSymBandDense as []float64{1, 2, ..., 15, *, *, *} with k=2. +// Only the values in the band portion of the matrix are used. +func NewSymBandDense(n, k int, data []float64) *SymBandDense { + if n <= 0 || k < 0 { + if n == 0 { + panic(ErrZeroLength) + } + panic("mat: negative dimension") + } + if k+1 > n { + panic("mat: band out of range") + } + bc := k + 1 + if data != nil && len(data) != n*bc { + panic(ErrShape) + } + if data == nil { + data = make([]float64, n*bc) + } + return &SymBandDense{ + mat: blas64.SymmetricBand{ + N: n, + K: k, + Stride: bc, + Uplo: blas.Upper, + Data: data, + }, + } +} + +// Dims returns the number of rows and columns in the matrix. +func (s *SymBandDense) Dims() (r, c int) { + return s.mat.N, s.mat.N +} + +// SymmetricDim returns the size of the receiver. +func (s *SymBandDense) SymmetricDim() int { + return s.mat.N +} + +// Bandwidth returns the bandwidths of the matrix. 
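+//
+// For a symmetric band matrix both bandwidths equal the halfwidth k. A
+// sketch using the 6×6, k=2 layout shown above (data is assumed to hold
+// the n*(k+1) banded elements):
+//
+//	s := NewSymBandDense(6, 2, data)
+//	kl, ku := s.Bandwidth() // kl == 2, ku == 2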
+func (s *SymBandDense) Bandwidth() (kl, ku int) { + return s.mat.K, s.mat.K +} + +// SymBand returns the number of rows/columns in the matrix, and the size of +// the bandwidth. +func (s *SymBandDense) SymBand() (n, k int) { + return s.mat.N, s.mat.K +} + +// T implements the Matrix interface. Symmetric matrices, by definition, are +// equal to their transpose, and this is a no-op. +func (s *SymBandDense) T() Matrix { + return s +} + +// TBand implements the Banded interface. +func (s *SymBandDense) TBand() Banded { + return s +} + +// RawSymBand returns the underlying blas64.SymBand used by the receiver. +// Changes to elements in the receiver following the call will be reflected +// in returned blas64.SymBand. +func (s *SymBandDense) RawSymBand() blas64.SymmetricBand { + return s.mat +} + +// SetRawSymBand sets the underlying blas64.SymmetricBand used by the receiver. +// Changes to elements in the receiver following the call will be reflected +// in the input. +// +// The supplied SymmetricBand must use blas.Upper storage format. +func (s *SymBandDense) SetRawSymBand(mat blas64.SymmetricBand) { + if mat.Uplo != blas.Upper { + panic("mat: blas64.SymmetricBand does not have blas.Upper storage") + } + s.mat = mat +} + +// IsEmpty returns whether the receiver is empty. Empty matrices can be the +// receiver for size-restricted operations. The receiver can be emptied using +// Reset. +func (s *SymBandDense) IsEmpty() bool { + return s.mat.Stride == 0 +} + +// Reset empties the matrix so that it can be reused as the +// receiver of a dimensionally restricted operation. +// +// Reset should not be used when the matrix shares backing data. +// See the Reseter interface for more information. +func (s *SymBandDense) Reset() { + s.mat.N = 0 + s.mat.K = 0 + s.mat.Stride = 0 + s.mat.Uplo = 0 + s.mat.Data = s.mat.Data[:0] +} + +// Zero sets all of the matrix elements to zero. +func (s *SymBandDense) Zero() { + for i := 0; i < s.mat.N; i++ { + u := min(1+s.mat.K, s.mat.N-i) + zero(s.mat.Data[i*s.mat.Stride : i*s.mat.Stride+u]) + } +} + +// DiagView returns the diagonal as a matrix backed by the original data. +func (s *SymBandDense) DiagView() Diagonal { + n := s.mat.N + return &DiagDense{ + mat: blas64.Vector{ + N: n, + Inc: s.mat.Stride, + Data: s.mat.Data[:(n-1)*s.mat.Stride+1], + }, + } +} + +// DoNonZero calls the function fn for each of the non-zero elements of s. The function fn +// takes a row/column index and the element value of s at (i, j). +func (s *SymBandDense) DoNonZero(fn func(i, j int, v float64)) { + for i := 0; i < s.mat.N; i++ { + for j := max(0, i-s.mat.K); j < min(s.mat.N, i+s.mat.K+1); j++ { + v := s.at(i, j) + if v != 0 { + fn(i, j, v) + } + } + } +} + +// DoRowNonZero calls the function fn for each of the non-zero elements of row i of s. The function fn +// takes a row/column index and the element value of s at (i, j). +func (s *SymBandDense) DoRowNonZero(i int, fn func(i, j int, v float64)) { + if i < 0 || s.mat.N <= i { + panic(ErrRowAccess) + } + for j := max(0, i-s.mat.K); j < min(s.mat.N, i+s.mat.K+1); j++ { + v := s.at(i, j) + if v != 0 { + fn(i, j, v) + } + } +} + +// DoColNonZero calls the function fn for each of the non-zero elements of column j of s. The function fn +// takes a row/column index and the element value of s at (i, j). 
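+//
+// A sketch of visiting a single column (the printing is illustrative only):
+//
+//	s.DoColNonZero(3, func(i, j int, v float64) {
+//		fmt.Println(i, j, v) // only in-band, non-zero entries of column 3
+//	})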
+func (s *SymBandDense) DoColNonZero(j int, fn func(i, j int, v float64)) { + if j < 0 || s.mat.N <= j { + panic(ErrColAccess) + } + for i := 0; i < s.mat.N; i++ { + if i-s.mat.K <= j && j < i+s.mat.K+1 { + v := s.at(i, j) + if v != 0 { + fn(i, j, v) + } + } + } +} + +// Norm returns the specified norm of the receiver. Valid norms are: +// +// 1 - The maximum absolute column sum +// 2 - The Frobenius norm, the square root of the sum of the squares of the elements +// Inf - The maximum absolute row sum +// +// Norm will panic with ErrNormOrder if an illegal norm is specified and with +// ErrZeroLength if the matrix has zero size. +func (s *SymBandDense) Norm(norm float64) float64 { + if s.IsEmpty() { + panic(ErrZeroLength) + } + lnorm := normLapack(norm, false) + if lnorm == lapack.MaxColumnSum || lnorm == lapack.MaxRowSum { + work := getFloat64s(s.mat.N, false) + defer putFloat64s(work) + return lapack64.Lansb(lnorm, s.mat, work) + } + return lapack64.Lansb(lnorm, s.mat, nil) +} + +// Trace returns the trace of the matrix. +// +// Trace will panic with ErrZeroLength if the matrix has zero size. +func (s *SymBandDense) Trace() float64 { + if s.IsEmpty() { + panic(ErrZeroLength) + } + rb := s.RawSymBand() + var tr float64 + for i := 0; i < rb.N; i++ { + tr += rb.Data[i*rb.Stride] + } + return tr +} + +// MulVecTo computes S⋅x storing the result into dst. +func (s *SymBandDense) MulVecTo(dst *VecDense, _ bool, x Vector) { + n := s.mat.N + if x.Len() != n { + panic(ErrShape) + } + dst.reuseAsNonZeroed(n) + + xMat, _ := untransposeExtract(x) + if xVec, ok := xMat.(*VecDense); ok { + if dst != xVec { + dst.checkOverlap(xVec.mat) + blas64.Sbmv(1, s.mat, xVec.mat, 0, dst.mat) + } else { + xCopy := getVecDenseWorkspace(n, false) + xCopy.CloneFromVec(xVec) + blas64.Sbmv(1, s.mat, xCopy.mat, 0, dst.mat) + putVecDenseWorkspace(xCopy) + } + } else { + xCopy := getVecDenseWorkspace(n, false) + xCopy.CloneFromVec(x) + blas64.Sbmv(1, s.mat, xCopy.mat, 0, dst.mat) + putVecDenseWorkspace(xCopy) + } +} diff --git a/vendor/gonum.org/v1/gonum/mat/symmetric.go b/vendor/gonum.org/v1/gonum/mat/symmetric.go new file mode 100644 index 0000000000..e38e4c7b6f --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/symmetric.go @@ -0,0 +1,698 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" + "gonum.org/v1/gonum/lapack/lapack64" +) + +var ( + symDense *SymDense + + _ Matrix = symDense + _ allMatrix = symDense + _ denseMatrix = symDense + _ Symmetric = symDense + _ RawSymmetricer = symDense + _ MutableSymmetric = symDense +) + +const badSymTriangle = "mat: blas64.Symmetric not upper" + +// SymDense is a symmetric matrix that uses dense storage. SymDense +// matrices are stored in the upper triangle. +type SymDense struct { + mat blas64.Symmetric + cap int +} + +// Symmetric represents a symmetric matrix (where the element at {i, j} equals +// the element at {j, i}). Symmetric matrices are always square. +type Symmetric interface { + Matrix + // SymmetricDim returns the number of rows/columns in the matrix. + SymmetricDim() int +} + +// A RawSymmetricer can return a view of itself as a BLAS Symmetric matrix. +type RawSymmetricer interface { + RawSymmetric() blas64.Symmetric +} + +// A MutableSymmetric can set elements of a symmetric matrix. 
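+//
+// Setting the {i, j} element also determines the {j, i} element. A sketch
+// with illustrative values:
+//
+//	s := NewSymDense(3, nil)
+//	s.SetSym(0, 2, 5)
+//	// now s.At(0, 2) == 5 and s.At(2, 0) == 5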
+type MutableSymmetric interface { + Symmetric + SetSym(i, j int, v float64) +} + +// NewSymDense creates a new Symmetric matrix with n rows and columns. If data == nil, +// a new slice is allocated for the backing slice. If len(data) == n*n, data is +// used as the backing slice, and changes to the elements of the returned SymDense +// will be reflected in data. If neither of these is true, NewSymDense will panic. +// NewSymDense will panic if n is zero. +// +// The data must be arranged in row-major order, i.e. the (i*c + j)-th +// element in the data slice is the {i, j}-th element in the matrix. +// Only the values in the upper triangular portion of the matrix are used. +func NewSymDense(n int, data []float64) *SymDense { + if n <= 0 { + if n == 0 { + panic(ErrZeroLength) + } + panic("mat: negative dimension") + } + if data != nil && n*n != len(data) { + panic(ErrShape) + } + if data == nil { + data = make([]float64, n*n) + } + return &SymDense{ + mat: blas64.Symmetric{ + N: n, + Stride: n, + Data: data, + Uplo: blas.Upper, + }, + cap: n, + } +} + +// Dims returns the number of rows and columns in the matrix. +func (s *SymDense) Dims() (r, c int) { + return s.mat.N, s.mat.N +} + +// Caps returns the number of rows and columns in the backing matrix. +func (s *SymDense) Caps() (r, c int) { + return s.cap, s.cap +} + +// T returns the receiver, the transpose of a symmetric matrix. +func (s *SymDense) T() Matrix { + return s +} + +// SymmetricDim implements the Symmetric interface and returns the number of rows +// and columns in the matrix. +func (s *SymDense) SymmetricDim() int { + return s.mat.N +} + +// RawSymmetric returns the matrix as a blas64.Symmetric. The returned +// value must be stored in upper triangular format. +func (s *SymDense) RawSymmetric() blas64.Symmetric { + return s.mat +} + +// SetRawSymmetric sets the underlying blas64.Symmetric used by the receiver. +// Changes to elements in the receiver following the call will be reflected +// in the input. +// +// The supplied Symmetric must use blas.Upper storage format. +func (s *SymDense) SetRawSymmetric(mat blas64.Symmetric) { + if mat.Uplo != blas.Upper { + panic(badSymTriangle) + } + s.cap = mat.N + s.mat = mat +} + +// Reset empties the matrix so that it can be reused as the +// receiver of a dimensionally restricted operation. +// +// Reset should not be used when the matrix shares backing data. +// See the Reseter interface for more information. +func (s *SymDense) Reset() { + // N and Stride must be zeroed in unison. + s.mat.N, s.mat.Stride = 0, 0 + s.mat.Data = s.mat.Data[:0] +} + +// ReuseAsSym changes the receiver if it IsEmpty() to be of size n×n. +// +// ReuseAsSym re-uses the backing data slice if it has sufficient capacity, +// otherwise a new slice is allocated. The backing data is zero on return. +// +// ReuseAsSym panics if the receiver is not empty, and panics if +// the input size is less than one. To empty the receiver for re-use, +// Reset should be used. +func (s *SymDense) ReuseAsSym(n int) { + if n <= 0 { + if n == 0 { + panic(ErrZeroLength) + } + panic(ErrNegativeDimension) + } + if !s.IsEmpty() { + panic(ErrReuseNonEmpty) + } + s.reuseAsZeroed(n) +} + +// Zero sets all of the matrix elements to zero. +func (s *SymDense) Zero() { + for i := 0; i < s.mat.N; i++ { + zero(s.mat.Data[i*s.mat.Stride+i : i*s.mat.Stride+s.mat.N]) + } +} + +// IsEmpty returns whether the receiver is empty. Empty matrices can be the +// receiver for size-restricted operations. The receiver can be emptied using +// Reset. 
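+//
+// A sketch of the reuse pattern (a and b are assumed to be equally sized
+// Symmetric values defined elsewhere):
+//
+//	var s SymDense // the zero value is empty
+//	s.AddSym(a, b) // and may therefore receive a sized result
+//	s.Reset()      // empty again, ready for a different size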
+func (s *SymDense) IsEmpty() bool {
+	// It must be the case that s.Dims() returns
+	// zeros in this case. See comment in Reset().
+	return s.mat.N == 0
+}
+
+// reuseAsNonZeroed resizes an empty matrix to an n×n matrix,
+// or checks that a non-empty matrix is n×n.
+func (s *SymDense) reuseAsNonZeroed(n int) {
+	// reuseAsNonZeroed must be kept in sync with reuseAsZeroed.
+	if n == 0 {
+		panic(ErrZeroLength)
+	}
+	if s.mat.N > s.cap {
+		// Panic as a string, not a mat.Error.
+		panic(badCap)
+	}
+	if s.IsEmpty() {
+		s.mat = blas64.Symmetric{
+			N:      n,
+			Stride: n,
+			Data:   use(s.mat.Data, n*n),
+			Uplo:   blas.Upper,
+		}
+		s.cap = n
+		return
+	}
+	if s.mat.Uplo != blas.Upper {
+		panic(badSymTriangle)
+	}
+	if s.mat.N != n {
+		panic(ErrShape)
+	}
+}
+
+// reuseAsZeroed resizes an empty matrix to an n×n matrix,
+// or checks that a non-empty matrix is n×n. It then zeros the
+// elements of the matrix.
+func (s *SymDense) reuseAsZeroed(n int) {
+	// reuseAsZeroed must be kept in sync with reuseAsNonZeroed.
+	if n == 0 {
+		panic(ErrZeroLength)
+	}
+	if s.mat.N > s.cap {
+		// Panic as a string, not a mat.Error.
+		panic(badCap)
+	}
+	if s.IsEmpty() {
+		s.mat = blas64.Symmetric{
+			N:      n,
+			Stride: n,
+			Data:   useZeroed(s.mat.Data, n*n),
+			Uplo:   blas.Upper,
+		}
+		s.cap = n
+		return
+	}
+	if s.mat.Uplo != blas.Upper {
+		panic(badSymTriangle)
+	}
+	if s.mat.N != n {
+		panic(ErrShape)
+	}
+	s.Zero()
+}
+
+func (s *SymDense) isolatedWorkspace(a Symmetric) (w *SymDense, restore func()) {
+	n := a.SymmetricDim()
+	if n == 0 {
+		panic(ErrZeroLength)
+	}
+	w = getSymDenseWorkspace(n, false)
+	return w, func() {
+		s.CopySym(w)
+		putSymDenseWorkspace(w)
+	}
+}
+
+// DiagView returns the diagonal as a matrix backed by the original data.
+func (s *SymDense) DiagView() Diagonal {
+	n := s.mat.N
+	return &DiagDense{
+		mat: blas64.Vector{
+			N:    n,
+			Inc:  s.mat.Stride + 1,
+			Data: s.mat.Data[:(n-1)*s.mat.Stride+n],
+		},
+	}
+}
+
+func (s *SymDense) AddSym(a, b Symmetric) {
+	n := a.SymmetricDim()
+	if n != b.SymmetricDim() {
+		panic(ErrShape)
+	}
+	s.reuseAsNonZeroed(n)
+
+	if a, ok := a.(RawSymmetricer); ok {
+		if b, ok := b.(RawSymmetricer); ok {
+			amat, bmat := a.RawSymmetric(), b.RawSymmetric()
+			if s != a {
+				s.checkOverlap(generalFromSymmetric(amat))
+			}
+			if s != b {
+				s.checkOverlap(generalFromSymmetric(bmat))
+			}
+			for i := 0; i < n; i++ {
+				btmp := bmat.Data[i*bmat.Stride+i : i*bmat.Stride+n]
+				stmp := s.mat.Data[i*s.mat.Stride+i : i*s.mat.Stride+n]
+				for j, v := range amat.Data[i*amat.Stride+i : i*amat.Stride+n] {
+					stmp[j] = v + btmp[j]
+				}
+			}
+			return
+		}
+	}
+
+	s.checkOverlapMatrix(a)
+	s.checkOverlapMatrix(b)
+	for i := 0; i < n; i++ {
+		stmp := s.mat.Data[i*s.mat.Stride : i*s.mat.Stride+n]
+		for j := i; j < n; j++ {
+			stmp[j] = a.At(i, j) + b.At(i, j)
+		}
+	}
+}
+
+func (s *SymDense) CopySym(a Symmetric) int {
+	n := a.SymmetricDim()
+	n = min(n, s.mat.N)
+	if n == 0 {
+		return 0
+	}
+	switch a := a.(type) {
+	case RawSymmetricer:
+		amat := a.RawSymmetric()
+		if amat.Uplo != blas.Upper {
+			panic(badSymTriangle)
+		}
+		for i := 0; i < n; i++ {
+			copy(s.mat.Data[i*s.mat.Stride+i:i*s.mat.Stride+n], amat.Data[i*amat.Stride+i:i*amat.Stride+n])
+		}
+	default:
+		for i := 0; i < n; i++ {
+			stmp := s.mat.Data[i*s.mat.Stride : i*s.mat.Stride+n]
+			for j := i; j < n; j++ {
+				stmp[j] = a.At(i, j)
+			}
+		}
+	}
+	return n
+}
+
+// SymRankOne performs a symmetric rank-one update to the matrix a with x,
+// which is treated as a column vector, and stores the result in the receiver
+//
+//	s = a + alpha * x * xᵀ
+func (s
*SymDense) SymRankOne(a Symmetric, alpha float64, x Vector) { + n := x.Len() + if a.SymmetricDim() != n { + panic(ErrShape) + } + s.reuseAsNonZeroed(n) + + if s != a { + if rs, ok := a.(RawSymmetricer); ok { + s.checkOverlap(generalFromSymmetric(rs.RawSymmetric())) + } + s.CopySym(a) + } + + xU, _ := untransposeExtract(x) + if rv, ok := xU.(*VecDense); ok { + r, c := xU.Dims() + xmat := rv.mat + s.checkOverlap(generalFromVector(xmat, r, c)) + blas64.Syr(alpha, xmat, s.mat) + return + } + + for i := 0; i < n; i++ { + for j := i; j < n; j++ { + s.set(i, j, s.at(i, j)+alpha*x.AtVec(i)*x.AtVec(j)) + } + } +} + +// SymRankK performs a symmetric rank-k update to the matrix a and stores the +// result into the receiver. If a is zero, see SymOuterK. +// +// s = a + alpha * x * x' +func (s *SymDense) SymRankK(a Symmetric, alpha float64, x Matrix) { + n := a.SymmetricDim() + r, _ := x.Dims() + if r != n { + panic(ErrShape) + } + xMat, aTrans := untransposeExtract(x) + var g blas64.General + if rm, ok := xMat.(*Dense); ok { + g = rm.mat + } else { + g = DenseCopyOf(x).mat + aTrans = false + } + if a != s { + if rs, ok := a.(RawSymmetricer); ok { + s.checkOverlap(generalFromSymmetric(rs.RawSymmetric())) + } + s.reuseAsNonZeroed(n) + s.CopySym(a) + } + t := blas.NoTrans + if aTrans { + t = blas.Trans + } + blas64.Syrk(t, alpha, g, 1, s.mat) +} + +// SymOuterK calculates the outer product of x with itself and stores +// the result into the receiver. It is equivalent to the matrix +// multiplication +// +// s = alpha * x * x'. +// +// In order to update an existing matrix, see SymRankOne. +func (s *SymDense) SymOuterK(alpha float64, x Matrix) { + n, _ := x.Dims() + switch { + case s.IsEmpty(): + s.mat = blas64.Symmetric{ + N: n, + Stride: n, + Data: useZeroed(s.mat.Data, n*n), + Uplo: blas.Upper, + } + s.cap = n + s.SymRankK(s, alpha, x) + case s.mat.Uplo != blas.Upper: + panic(badSymTriangle) + case s.mat.N == n: + if s == x { + w := getSymDenseWorkspace(n, true) + w.SymRankK(w, alpha, x) + s.CopySym(w) + putSymDenseWorkspace(w) + } else { + switch r := x.(type) { + case RawMatrixer: + s.checkOverlap(r.RawMatrix()) + case RawSymmetricer: + s.checkOverlap(generalFromSymmetric(r.RawSymmetric())) + case RawTriangular: + s.checkOverlap(generalFromTriangular(r.RawTriangular())) + } + // Only zero the upper triangle. 
+ for i := 0; i < n; i++ { + ri := i * s.mat.Stride + zero(s.mat.Data[ri+i : ri+n]) + } + s.SymRankK(s, alpha, x) + } + default: + panic(ErrShape) + } +} + +// RankTwo performs a symmetric rank-two update to the matrix a with the +// vectors x and y, which are treated as column vectors, and stores the +// result in the receiver +// +// m = a + alpha * (x * yᵀ + y * xᵀ) +func (s *SymDense) RankTwo(a Symmetric, alpha float64, x, y Vector) { + n := s.mat.N + if x.Len() != n { + panic(ErrShape) + } + if y.Len() != n { + panic(ErrShape) + } + + if s != a { + if rs, ok := a.(RawSymmetricer); ok { + s.checkOverlap(generalFromSymmetric(rs.RawSymmetric())) + } + } + + var xmat, ymat blas64.Vector + fast := true + xU, _ := untransposeExtract(x) + if rv, ok := xU.(*VecDense); ok { + r, c := xU.Dims() + xmat = rv.mat + s.checkOverlap(generalFromVector(xmat, r, c)) + } else { + fast = false + } + yU, _ := untransposeExtract(y) + if rv, ok := yU.(*VecDense); ok { + r, c := yU.Dims() + ymat = rv.mat + s.checkOverlap(generalFromVector(ymat, r, c)) + } else { + fast = false + } + + if s != a { + if rs, ok := a.(RawSymmetricer); ok { + s.checkOverlap(generalFromSymmetric(rs.RawSymmetric())) + } + s.reuseAsNonZeroed(n) + s.CopySym(a) + } + + if fast { + if s != a { + s.reuseAsNonZeroed(n) + s.CopySym(a) + } + blas64.Syr2(alpha, xmat, ymat, s.mat) + return + } + + for i := 0; i < n; i++ { + s.reuseAsNonZeroed(n) + for j := i; j < n; j++ { + s.set(i, j, a.At(i, j)+alpha*(x.AtVec(i)*y.AtVec(j)+y.AtVec(i)*x.AtVec(j))) + } + } +} + +// ScaleSym multiplies the elements of a by f, placing the result in the receiver. +func (s *SymDense) ScaleSym(f float64, a Symmetric) { + n := a.SymmetricDim() + s.reuseAsNonZeroed(n) + if a, ok := a.(RawSymmetricer); ok { + amat := a.RawSymmetric() + if s != a { + s.checkOverlap(generalFromSymmetric(amat)) + } + for i := 0; i < n; i++ { + for j := i; j < n; j++ { + s.mat.Data[i*s.mat.Stride+j] = f * amat.Data[i*amat.Stride+j] + } + } + return + } + for i := 0; i < n; i++ { + for j := i; j < n; j++ { + s.mat.Data[i*s.mat.Stride+j] = f * a.At(i, j) + } + } +} + +// SubsetSym extracts a subset of the rows and columns of the matrix a and stores +// the result in-place into the receiver. The resulting matrix size is +// len(set)×len(set). Specifically, at the conclusion of SubsetSym, +// s.At(i, j) equals a.At(set[i], set[j]). Note that the supplied set does not +// have to be a strict subset, dimension repeats are allowed. +func (s *SymDense) SubsetSym(a Symmetric, set []int) { + n := len(set) + na := a.SymmetricDim() + s.reuseAsNonZeroed(n) + var restore func() + if a == s { + s, restore = s.isolatedWorkspace(a) + defer restore() + } + + if a, ok := a.(RawSymmetricer); ok { + raw := a.RawSymmetric() + if s != a { + s.checkOverlap(generalFromSymmetric(raw)) + } + for i := 0; i < n; i++ { + ssub := s.mat.Data[i*s.mat.Stride : i*s.mat.Stride+n] + r := set[i] + rsub := raw.Data[r*raw.Stride : r*raw.Stride+na] + for j := i; j < n; j++ { + c := set[j] + if r <= c { + ssub[j] = rsub[c] + } else { + ssub[j] = raw.Data[c*raw.Stride+r] + } + } + } + return + } + for i := 0; i < n; i++ { + for j := i; j < n; j++ { + s.mat.Data[i*s.mat.Stride+j] = a.At(set[i], set[j]) + } + } +} + +// SliceSym returns a new Matrix that shares backing data with the receiver. +// The returned matrix starts at {i,i} of the receiver and extends k-i rows +// and columns. The final row and column in the resulting matrix is k-1. 
+// SliceSym panics with ErrIndexOutOfRange if the slice is outside the
+// capacity of the receiver.
+func (s *SymDense) SliceSym(i, k int) Symmetric {
+	return s.sliceSym(i, k)
+}
+
+func (s *SymDense) sliceSym(i, k int) *SymDense {
+	sz := s.cap
+	if i < 0 || sz < i || k < i || sz < k {
+		panic(ErrIndexOutOfRange)
+	}
+	v := *s
+	v.mat.Data = s.mat.Data[i*s.mat.Stride+i : (k-1)*s.mat.Stride+k]
+	v.mat.N = k - i
+	v.cap = s.cap - i
+	return &v
+}
+
+// Norm returns the specified norm of the receiver. Valid norms are:
+//
+//	1 - The maximum absolute column sum
+//	2 - The Frobenius norm, the square root of the sum of the squares of the elements
+//	Inf - The maximum absolute row sum
+//
+// Norm will panic with ErrNormOrder if an illegal norm is specified and with
+// ErrZeroLength if the matrix has zero size.
+func (s *SymDense) Norm(norm float64) float64 {
+	if s.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	lnorm := normLapack(norm, false)
+	if lnorm == lapack.MaxColumnSum || lnorm == lapack.MaxRowSum {
+		work := getFloat64s(s.mat.N, false)
+		defer putFloat64s(work)
+		return lapack64.Lansy(lnorm, s.mat, work)
+	}
+	return lapack64.Lansy(lnorm, s.mat, nil)
+}
+
+// Trace returns the trace of the matrix.
+//
+// Trace will panic with ErrZeroLength if the matrix has zero size.
+func (s *SymDense) Trace() float64 {
+	if s.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	// TODO(btracey): could use internal asm sum routine.
+	var v float64
+	for i := 0; i < s.mat.N; i++ {
+		v += s.mat.Data[i*s.mat.Stride+i]
+	}
+	return v
+}
+
+// GrowSym returns the receiver expanded by n rows and n columns. If the
+// dimensions of the expanded matrix are outside the capacity of the receiver
+// a new allocation is made, otherwise not. Note that the receiver itself is
+// not modified during the call to GrowSym.
+func (s *SymDense) GrowSym(n int) Symmetric {
+	if n < 0 {
+		panic(ErrIndexOutOfRange)
+	}
+	if n == 0 {
+		return s
+	}
+	var v SymDense
+	n += s.mat.N
+	if s.IsEmpty() || n > s.cap {
+		v.mat = blas64.Symmetric{
+			N:      n,
+			Stride: n,
+			Uplo:   blas.Upper,
+			Data:   make([]float64, n*n),
+		}
+		v.cap = n
+		// Copy elements, including those not currently visible. Use a temporary
+		// structure to avoid modifying the receiver.
+		var tmp SymDense
+		tmp.mat = blas64.Symmetric{
+			N:      s.cap,
+			Stride: s.mat.Stride,
+			Data:   s.mat.Data,
+			Uplo:   s.mat.Uplo,
+		}
+		tmp.cap = s.cap
+		v.CopySym(&tmp)
+		return &v
+	}
+	v.mat = blas64.Symmetric{
+		N:      n,
+		Stride: s.mat.Stride,
+		Uplo:   blas.Upper,
+		Data:   s.mat.Data[:(n-1)*s.mat.Stride+n],
+	}
+	v.cap = s.cap
+	return &v
+}
+
+// PowPSD computes a^pow where a is a symmetric positive definite matrix.
+//
+// PowPSD returns an error if the matrix is not symmetric positive definite
+// or the Eigen decomposition is not successful.
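+//
+// A sketch (illustrative values): the matrix square root of a symmetric
+// positive definite matrix.
+//
+//	a := NewSymDense(2, []float64{2, 1, 1, 2})
+//	var root SymDense
+//	if err := root.PowPSD(a, 0.5); err != nil {
+//		// ErrFailedEigen or ErrNotPSD
+//	}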
+func (s *SymDense) PowPSD(a Symmetric, pow float64) error { + dim := a.SymmetricDim() + s.reuseAsNonZeroed(dim) + + var eigen EigenSym + ok := eigen.Factorize(a, true) + if !ok { + return ErrFailedEigen + } + values := eigen.Values(nil) + for i, v := range values { + if v <= 0 { + return ErrNotPSD + } + values[i] = math.Pow(v, pow) + } + var u Dense + eigen.VectorsTo(&u) + + s.SymOuterK(values[0], u.ColView(0)) + + var v VecDense + for i := 1; i < dim; i++ { + v.ColViewOf(&u, i) + s.SymRankOne(s, values[i], &v) + } + return nil +} diff --git a/vendor/gonum.org/v1/gonum/mat/triangular.go b/vendor/gonum.org/v1/gonum/mat/triangular.go new file mode 100644 index 0000000000..0e37fb0102 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/triangular.go @@ -0,0 +1,832 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" + "gonum.org/v1/gonum/lapack/lapack64" +) + +var ( + triDense *TriDense + _ Matrix = triDense + _ allMatrix = triDense + _ denseMatrix = triDense + _ Triangular = triDense + _ RawTriangular = triDense + _ MutableTriangular = triDense + + _ NonZeroDoer = triDense + _ RowNonZeroDoer = triDense + _ ColNonZeroDoer = triDense +) + +// TriDense represents an upper or lower triangular matrix in dense storage +// format. +type TriDense struct { + mat blas64.Triangular + cap int +} + +// Triangular represents a triangular matrix. Triangular matrices are always square. +type Triangular interface { + Matrix + // Triangle returns the number of rows/columns in the matrix and its + // orientation. + Triangle() (n int, kind TriKind) + + // TTri is the equivalent of the T() method in the Matrix interface but + // guarantees the transpose is of triangular type. + TTri() Triangular +} + +// A RawTriangular can return a blas64.Triangular representation of the receiver. +// Changes to the blas64.Triangular.Data slice will be reflected in the original +// matrix, changes to the N, Stride, Uplo and Diag fields will not. +type RawTriangular interface { + RawTriangular() blas64.Triangular +} + +// A MutableTriangular can set elements of a triangular matrix. +type MutableTriangular interface { + Triangular + SetTri(i, j int, v float64) +} + +var ( + _ Matrix = TransposeTri{} + _ Triangular = TransposeTri{} + _ UntransposeTrier = TransposeTri{} +) + +// TransposeTri is a type for performing an implicit transpose of a Triangular +// matrix. It implements the Triangular interface, returning values from the +// transpose of the matrix within. +type TransposeTri struct { + Triangular Triangular +} + +// At returns the value of the element at row i and column j of the transposed +// matrix, that is, row j and column i of the Triangular field. +func (t TransposeTri) At(i, j int) float64 { + return t.Triangular.At(j, i) +} + +// Dims returns the dimensions of the transposed matrix. Triangular matrices are +// square and thus this is the same size as the original Triangular. +func (t TransposeTri) Dims() (r, c int) { + c, r = t.Triangular.Dims() + return r, c +} + +// T performs an implicit transpose by returning the Triangular field. +func (t TransposeTri) T() Matrix { + return t.Triangular +} + +// Triangle returns the number of rows/columns in the matrix and its orientation. 
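+//
+// Transposition flips the orientation. A sketch with illustrative values:
+//
+//	u := NewTriDense(3, Upper, nil)
+//	_, kind := TransposeTri{Triangular: u}.Triangle() // kind == Lower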
+func (t TransposeTri) Triangle() (int, TriKind) { + n, upper := t.Triangular.Triangle() + return n, !upper +} + +// TTri performs an implicit transpose by returning the Triangular field. +func (t TransposeTri) TTri() Triangular { + return t.Triangular +} + +// Untranspose returns the Triangular field. +func (t TransposeTri) Untranspose() Matrix { + return t.Triangular +} + +func (t TransposeTri) UntransposeTri() Triangular { + return t.Triangular +} + +// NewTriDense creates a new Triangular matrix with n rows and columns. If data == nil, +// a new slice is allocated for the backing slice. If len(data) == n*n, data is +// used as the backing slice, and changes to the elements of the returned TriDense +// will be reflected in data. If neither of these is true, NewTriDense will panic. +// NewTriDense will panic if n is zero. +// +// The data must be arranged in row-major order, i.e. the (i*c + j)-th +// element in the data slice is the {i, j}-th element in the matrix. +// Only the values in the triangular portion corresponding to kind are used. +func NewTriDense(n int, kind TriKind, data []float64) *TriDense { + if n <= 0 { + if n == 0 { + panic(ErrZeroLength) + } + panic("mat: negative dimension") + } + if data != nil && len(data) != n*n { + panic(ErrShape) + } + if data == nil { + data = make([]float64, n*n) + } + uplo := blas.Lower + if kind == Upper { + uplo = blas.Upper + } + return &TriDense{ + mat: blas64.Triangular{ + N: n, + Stride: n, + Data: data, + Uplo: uplo, + Diag: blas.NonUnit, + }, + cap: n, + } +} + +func (t *TriDense) Dims() (r, c int) { + return t.mat.N, t.mat.N +} + +// Triangle returns the dimension of t and its orientation. The returned +// orientation is only valid when n is not empty. +func (t *TriDense) Triangle() (n int, kind TriKind) { + return t.mat.N, t.triKind() +} + +func (t *TriDense) isUpper() bool { + return isUpperUplo(t.mat.Uplo) +} + +func (t *TriDense) triKind() TriKind { + return TriKind(isUpperUplo(t.mat.Uplo)) +} + +func isUpperUplo(u blas.Uplo) bool { + switch u { + case blas.Upper: + return true + case blas.Lower: + return false + default: + panic(badTriangle) + } +} + +// asSymBlas returns the receiver restructured as a blas64.Symmetric with the +// same backing memory. Panics if the receiver is unit. +// This returns a blas64.Symmetric and not a *SymDense because SymDense can only +// be upper triangular. +func (t *TriDense) asSymBlas() blas64.Symmetric { + if t.mat.Diag == blas.Unit { + panic("mat: cannot convert unit TriDense into blas64.Symmetric") + } + return blas64.Symmetric{ + N: t.mat.N, + Stride: t.mat.Stride, + Data: t.mat.Data, + Uplo: t.mat.Uplo, + } +} + +// T performs an implicit transpose by returning the receiver inside a Transpose. +func (t *TriDense) T() Matrix { + return Transpose{t} +} + +// TTri performs an implicit transpose by returning the receiver inside a TransposeTri. +func (t *TriDense) TTri() Triangular { + return TransposeTri{t} +} + +func (t *TriDense) RawTriangular() blas64.Triangular { + return t.mat +} + +// SetRawTriangular sets the underlying blas64.Triangular used by the receiver. +// Changes to elements in the receiver following the call will be reflected +// in the input. +// +// The supplied Triangular must not use blas.Unit storage format. 
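+//
+// A sketch of adopting existing BLAS storage (values illustrative only):
+//
+//	var t TriDense
+//	t.SetRawTriangular(blas64.Triangular{
+//		N: 2, Stride: 2, Uplo: blas.Upper, Diag: blas.NonUnit,
+//		Data: []float64{1, 2, 0, 3},
+//	})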
+func (t *TriDense) SetRawTriangular(mat blas64.Triangular) { + if mat.Diag == blas.Unit { + panic("mat: cannot set TriDense with Unit storage format") + } + t.cap = mat.N + t.mat = mat +} + +// Reset empties the matrix so that it can be reused as the +// receiver of a dimensionally restricted operation. +// +// Reset should not be used when the matrix shares backing data. +// See the Reseter interface for more information. +func (t *TriDense) Reset() { + // N and Stride must be zeroed in unison. + t.mat.N, t.mat.Stride = 0, 0 + // Defensively zero Uplo to ensure + // it is set correctly later. + t.mat.Uplo = 0 + t.mat.Data = t.mat.Data[:0] +} + +// Zero sets all of the matrix elements to zero. +func (t *TriDense) Zero() { + if t.isUpper() { + for i := 0; i < t.mat.N; i++ { + zero(t.mat.Data[i*t.mat.Stride+i : i*t.mat.Stride+t.mat.N]) + } + return + } + for i := 0; i < t.mat.N; i++ { + zero(t.mat.Data[i*t.mat.Stride : i*t.mat.Stride+i+1]) + } +} + +// IsEmpty returns whether the receiver is empty. Empty matrices can be the +// receiver for size-restricted operations. The receiver can be emptied using +// Reset. +func (t *TriDense) IsEmpty() bool { + // It must be the case that t.Dims() returns + // zeros in this case. See comment in Reset(). + return t.mat.Stride == 0 +} + +// untransposeTri untransposes a matrix if applicable. If a is an UntransposeTrier, then +// untransposeTri returns the underlying matrix and true. If it is not, then it returns +// the input matrix and false. +func untransposeTri(a Triangular) (Triangular, bool) { + if ut, ok := a.(UntransposeTrier); ok { + return ut.UntransposeTri(), true + } + return a, false +} + +// ReuseAsTri changes the receiver if it IsEmpty() to be of size n×n. +// +// ReuseAsTri re-uses the backing data slice if it has sufficient capacity, +// otherwise a new slice is allocated. The backing data is zero on return. +// +// ReuseAsTri panics if the receiver is not empty, and panics if +// the input size is less than one. To empty the receiver for re-use, +// Reset should be used. +func (t *TriDense) ReuseAsTri(n int, kind TriKind) { + if n <= 0 { + if n == 0 { + panic(ErrZeroLength) + } + panic(ErrNegativeDimension) + } + if !t.IsEmpty() { + panic(ErrReuseNonEmpty) + } + t.reuseAsZeroed(n, kind) +} + +// reuseAsNonZeroed resizes an empty receiver to an n×n triangular matrix with the given +// orientation. If the receiver is not empty, reuseAsNonZeroed checks that the receiver +// is the correct size and orientation. +func (t *TriDense) reuseAsNonZeroed(n int, kind TriKind) { + // reuseAsNonZeroed must be kept in sync with reuseAsZeroed. + if n == 0 { + panic(ErrZeroLength) + } + ul := blas.Lower + if kind == Upper { + ul = blas.Upper + } + if t.mat.N > t.cap { + // Panic as a string, not a mat.Error. + panic(badCap) + } + if t.IsEmpty() { + t.mat = blas64.Triangular{ + N: n, + Stride: n, + Diag: blas.NonUnit, + Data: use(t.mat.Data, n*n), + Uplo: ul, + } + t.cap = n + return + } + if t.mat.N != n { + panic(ErrShape) + } + if t.mat.Uplo != ul { + panic(ErrTriangle) + } +} + +// reuseAsZeroed resizes an empty receiver to an n×n triangular matrix with the given +// orientation. If the receiver is not empty, reuseAsZeroed checks that the receiver +// is the correct size and orientation. It then zeros out the matrix data. +func (t *TriDense) reuseAsZeroed(n int, kind TriKind) { + // reuseAsZeroed must be kept in sync with reuseAsNonZeroed. 
+ if n == 0 { + panic(ErrZeroLength) + } + ul := blas.Lower + if kind == Upper { + ul = blas.Upper + } + if t.mat.N > t.cap { + // Panic as a string, not a mat.Error. + panic(badCap) + } + if t.IsEmpty() { + t.mat = blas64.Triangular{ + N: n, + Stride: n, + Diag: blas.NonUnit, + Data: useZeroed(t.mat.Data, n*n), + Uplo: ul, + } + t.cap = n + return + } + if t.mat.N != n { + panic(ErrShape) + } + if t.mat.Uplo != ul { + panic(ErrTriangle) + } + t.Zero() +} + +// isolatedWorkspace returns a new TriDense matrix w with the size of a and +// returns a callback to defer which performs cleanup at the return of the call. +// This should be used when a method receiver is the same pointer as an input argument. +func (t *TriDense) isolatedWorkspace(a Triangular) (w *TriDense, restore func()) { + n, kind := a.Triangle() + if n == 0 { + panic(ErrZeroLength) + } + w = getTriDenseWorkspace(n, kind, false) + return w, func() { + t.Copy(w) + putTriWorkspace(w) + } +} + +// DiagView returns the diagonal as a matrix backed by the original data. +func (t *TriDense) DiagView() Diagonal { + if t.mat.Diag == blas.Unit { + panic("mat: cannot take view of Unit diagonal") + } + n := t.mat.N + return &DiagDense{ + mat: blas64.Vector{ + N: n, + Inc: t.mat.Stride + 1, + Data: t.mat.Data[:(n-1)*t.mat.Stride+n], + }, + } +} + +// Copy makes a copy of elements of a into the receiver. It is similar to the +// built-in copy; it copies as much as the overlap between the two matrices and +// returns the number of rows and columns it copied. Only elements within the +// receiver's non-zero triangle are set. +// +// See the Copier interface for more information. +func (t *TriDense) Copy(a Matrix) (r, c int) { + r, c = a.Dims() + r = min(r, t.mat.N) + c = min(c, t.mat.N) + if r == 0 || c == 0 { + return 0, 0 + } + + switch a := a.(type) { + case RawMatrixer: + amat := a.RawMatrix() + if t.isUpper() { + for i := 0; i < r; i++ { + copy(t.mat.Data[i*t.mat.Stride+i:i*t.mat.Stride+c], amat.Data[i*amat.Stride+i:i*amat.Stride+c]) + } + } else { + for i := 0; i < r; i++ { + copy(t.mat.Data[i*t.mat.Stride:i*t.mat.Stride+i+1], amat.Data[i*amat.Stride:i*amat.Stride+i+1]) + } + } + case RawTriangular: + amat := a.RawTriangular() + aIsUpper := isUpperUplo(amat.Uplo) + tIsUpper := t.isUpper() + switch { + case tIsUpper && aIsUpper: + for i := 0; i < r; i++ { + copy(t.mat.Data[i*t.mat.Stride+i:i*t.mat.Stride+c], amat.Data[i*amat.Stride+i:i*amat.Stride+c]) + } + case !tIsUpper && !aIsUpper: + for i := 0; i < r; i++ { + copy(t.mat.Data[i*t.mat.Stride:i*t.mat.Stride+i+1], amat.Data[i*amat.Stride:i*amat.Stride+i+1]) + } + default: + for i := 0; i < r; i++ { + t.set(i, i, amat.Data[i*amat.Stride+i]) + } + } + default: + isUpper := t.isUpper() + for i := 0; i < r; i++ { + if isUpper { + for j := i; j < c; j++ { + t.set(i, j, a.At(i, j)) + } + } else { + for j := 0; j <= i; j++ { + t.set(i, j, a.At(i, j)) + } + } + } + } + + return r, c +} + +// InverseTri computes the inverse of the triangular matrix a, storing the result +// into the receiver. If a is ill-conditioned, a Condition error will be returned. +// Note that matrix inversion is numerically unstable, and should generally be +// avoided where possible, for example by using the Solve routines. 
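+//
+// A sketch, assuming a TriDense a defined elsewhere:
+//
+//	var ai TriDense
+//	if err := ai.InverseTri(a); err != nil {
+//		// A Condition error flags an ill-conditioned input.
+//	}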
+func (t *TriDense) InverseTri(a Triangular) error { + t.checkOverlapMatrix(a) + n, _ := a.Triangle() + t.reuseAsNonZeroed(a.Triangle()) + t.Copy(a) + work := getFloat64s(3*n, false) + iwork := getInts(n, false) + cond := lapack64.Trcon(CondNorm, t.mat, work, iwork) + putFloat64s(work) + putInts(iwork) + if math.IsInf(cond, 1) { + return Condition(cond) + } + ok := lapack64.Trtri(t.mat) + if !ok { + return Condition(math.Inf(1)) + } + if cond > ConditionTolerance { + return Condition(cond) + } + return nil +} + +// MulTri takes the product of triangular matrices a and b and places the result +// in the receiver. The size of a and b must match, and they both must have the +// same TriKind, or Mul will panic. +func (t *TriDense) MulTri(a, b Triangular) { + n, kind := a.Triangle() + nb, kindb := b.Triangle() + if n != nb { + panic(ErrShape) + } + if kind != kindb { + panic(ErrTriangle) + } + + aU, _ := untransposeTri(a) + bU, _ := untransposeTri(b) + t.checkOverlapMatrix(bU) + t.checkOverlapMatrix(aU) + t.reuseAsNonZeroed(n, kind) + var restore func() + if t == aU { + t, restore = t.isolatedWorkspace(aU) + defer restore() + } else if t == bU { + t, restore = t.isolatedWorkspace(bU) + defer restore() + } + + // Inspect types here, helps keep the loops later clean(er). + _, aDiag := aU.(Diagonal) + _, bDiag := bU.(Diagonal) + // If they are both diagonal only need 1 loop. + // All diagonal matrices are Upper. + // TODO: Add fast paths for DiagDense. + if aDiag && bDiag { + t.Zero() + for i := 0; i < n; i++ { + t.SetTri(i, i, a.At(i, i)*b.At(i, i)) + } + return + } + + // Now we know at least one matrix is non-diagonal. + // And all diagonal matrices are all Upper. + // The both-diagonal case is handled above. + // TODO: Add fast paths for Dense variants. + if kind == Upper { + for i := 0; i < n; i++ { + for j := i; j < n; j++ { + switch { + case aDiag: + t.SetTri(i, j, a.At(i, i)*b.At(i, j)) + case bDiag: + t.SetTri(i, j, a.At(i, j)*b.At(j, j)) + default: + var v float64 + for k := i; k <= j; k++ { + v += a.At(i, k) * b.At(k, j) + } + t.SetTri(i, j, v) + } + } + } + return + } + for i := 0; i < n; i++ { + for j := 0; j <= i; j++ { + var v float64 + for k := j; k <= i; k++ { + v += a.At(i, k) * b.At(k, j) + } + t.SetTri(i, j, v) + } + } +} + +// ScaleTri multiplies the elements of a by f, placing the result in the receiver. +// If the receiver is non-zero, the size and kind of the receiver must match +// the input, or ScaleTri will panic. +func (t *TriDense) ScaleTri(f float64, a Triangular) { + n, kind := a.Triangle() + t.reuseAsNonZeroed(n, kind) + + // TODO(btracey): Improve the set of fast-paths. + switch a := a.(type) { + case RawTriangular: + amat := a.RawTriangular() + if t != a { + t.checkOverlap(generalFromTriangular(amat)) + } + if kind == Upper { + for i := 0; i < n; i++ { + ts := t.mat.Data[i*t.mat.Stride+i : i*t.mat.Stride+n] + as := amat.Data[i*amat.Stride+i : i*amat.Stride+n] + for i, v := range as { + ts[i] = v * f + } + } + return + } + for i := 0; i < n; i++ { + ts := t.mat.Data[i*t.mat.Stride : i*t.mat.Stride+i+1] + as := amat.Data[i*amat.Stride : i*amat.Stride+i+1] + for i, v := range as { + ts[i] = v * f + } + } + return + default: + t.checkOverlapMatrix(a) + isUpper := kind == Upper + for i := 0; i < n; i++ { + if isUpper { + for j := i; j < n; j++ { + t.set(i, j, f*a.At(i, j)) + } + } else { + for j := 0; j <= i; j++ { + t.set(i, j, f*a.At(i, j)) + } + } + } + } +} + +// SliceTri returns a new Triangular that shares backing data with the receiver. 
+// The returned matrix starts at {i,i} of the receiver and extends k-i rows and +// columns. The final row and column in the resulting matrix is k-1. +// SliceTri panics with ErrIndexOutOfRange if the slice is outside the capacity +// of the receiver. +func (t *TriDense) SliceTri(i, k int) Triangular { + return t.sliceTri(i, k) +} + +func (t *TriDense) sliceTri(i, k int) *TriDense { + if i < 0 || t.cap < i || k < i || t.cap < k { + panic(ErrIndexOutOfRange) + } + v := *t + v.mat.Data = t.mat.Data[i*t.mat.Stride+i : (k-1)*t.mat.Stride+k] + v.mat.N = k - i + v.cap = t.cap - i + return &v +} + +// Norm returns the specified norm of the receiver. Valid norms are: +// +// 1 - The maximum absolute column sum +// 2 - The Frobenius norm, the square root of the sum of the squares of the elements +// Inf - The maximum absolute row sum +// +// Norm will panic with ErrNormOrder if an illegal norm is specified and with +// ErrZeroLength if the matrix has zero size. +func (t *TriDense) Norm(norm float64) float64 { + if t.IsEmpty() { + panic(ErrZeroLength) + } + lnorm := normLapack(norm, false) + if lnorm == lapack.MaxColumnSum { + work := getFloat64s(t.mat.N, false) + defer putFloat64s(work) + return lapack64.Lantr(lnorm, t.mat, work) + } + return lapack64.Lantr(lnorm, t.mat, nil) +} + +// Trace returns the trace of the matrix. +// +// Trace will panic with ErrZeroLength if the matrix has zero size. +func (t *TriDense) Trace() float64 { + if t.IsEmpty() { + panic(ErrZeroLength) + } + // TODO(btracey): could use internal asm sum routine. + var v float64 + for i := 0; i < t.mat.N; i++ { + v += t.mat.Data[i*t.mat.Stride+i] + } + return v +} + +// copySymIntoTriangle copies a symmetric matrix into a TriDense +func copySymIntoTriangle(t *TriDense, s Symmetric) { + n, upper := t.Triangle() + ns := s.SymmetricDim() + if n != ns { + panic("mat: triangle size mismatch") + } + ts := t.mat.Stride + if rs, ok := s.(RawSymmetricer); ok { + sd := rs.RawSymmetric() + ss := sd.Stride + if upper { + if sd.Uplo == blas.Upper { + for i := 0; i < n; i++ { + copy(t.mat.Data[i*ts+i:i*ts+n], sd.Data[i*ss+i:i*ss+n]) + } + return + } + for i := 0; i < n; i++ { + for j := i; j < n; j++ { + t.mat.Data[i*ts+j] = sd.Data[j*ss+i] + } + } + return + } + if sd.Uplo == blas.Upper { + for i := 0; i < n; i++ { + for j := 0; j <= i; j++ { + t.mat.Data[i*ts+j] = sd.Data[j*ss+i] + } + } + return + } + for i := 0; i < n; i++ { + copy(t.mat.Data[i*ts:i*ts+i+1], sd.Data[i*ss:i*ss+i+1]) + } + return + } + if upper { + for i := 0; i < n; i++ { + for j := i; j < n; j++ { + t.mat.Data[i*ts+j] = s.At(i, j) + } + } + return + } + for i := 0; i < n; i++ { + for j := 0; j <= i; j++ { + t.mat.Data[i*ts+j] = s.At(i, j) + } + } +} + +// DoNonZero calls the function fn for each of the non-zero elements of t. The function fn +// takes a row/column index and the element value of t at (i, j). +func (t *TriDense) DoNonZero(fn func(i, j int, v float64)) { + if t.isUpper() { + for i := 0; i < t.mat.N; i++ { + for j := i; j < t.mat.N; j++ { + v := t.at(i, j) + if v != 0 { + fn(i, j, v) + } + } + } + return + } + for i := 0; i < t.mat.N; i++ { + for j := 0; j <= i; j++ { + v := t.at(i, j) + if v != 0 { + fn(i, j, v) + } + } + } +} + +// DoRowNonZero calls the function fn for each of the non-zero elements of row i of t. The function fn +// takes a row/column index and the element value of t at (i, j). 
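+//
+// For example (an illustrative sketch; only the stored non-zero is visited):
+//
+//	t := NewTriDense(3, Lower, nil)
+//	t.SetTri(1, 0, 2.5)
+//	t.DoRowNonZero(1, func(i, j int, v float64) {
+//		// called once, with i=1, j=0, v=2.5
+//	})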
+func (t *TriDense) DoRowNonZero(i int, fn func(i, j int, v float64)) { + if i < 0 || t.mat.N <= i { + panic(ErrRowAccess) + } + if t.isUpper() { + for j := i; j < t.mat.N; j++ { + v := t.at(i, j) + if v != 0 { + fn(i, j, v) + } + } + return + } + for j := 0; j <= i; j++ { + v := t.at(i, j) + if v != 0 { + fn(i, j, v) + } + } +} + +// DoColNonZero calls the function fn for each of the non-zero elements of column j of t. The function fn +// takes a row/column index and the element value of t at (i, j). +func (t *TriDense) DoColNonZero(j int, fn func(i, j int, v float64)) { + if j < 0 || t.mat.N <= j { + panic(ErrColAccess) + } + if t.isUpper() { + for i := 0; i <= j; i++ { + v := t.at(i, j) + if v != 0 { + fn(i, j, v) + } + } + return + } + for i := j; i < t.mat.N; i++ { + v := t.at(i, j) + if v != 0 { + fn(i, j, v) + } + } +} + +// SolveTo solves a triangular system T * X = B or Tᵀ * X = B where T is an n×n +// triangular matrix represented by the receiver and B is a given n×nrhs matrix. +// If T is non-singular, the result will be stored into dst and nil will be +// returned. If T is singular, the contents of dst will be undefined and a +// Condition error will be returned. +// +// If dst is empty, SolveTo will resize it to n×nrhs. If dst is not empty, +// SolveTo will panic if dst is not n×nrhs. +func (t *TriDense) SolveTo(dst *Dense, trans bool, b Matrix) error { + n, nrhs := b.Dims() + if n != t.mat.N { + panic(ErrShape) + } + + dst.reuseAsNonZeroed(n, nrhs) + bU, bTrans := untranspose(b) + if dst == bU { + if bTrans { + work := getDenseWorkspace(n, nrhs, false) + defer putDenseWorkspace(work) + work.Copy(b) + dst.Copy(work) + } + } else { + if rm, ok := bU.(RawMatrixer); ok { + dst.checkOverlap(rm.RawMatrix()) + } + dst.Copy(b) + } + + transT := blas.NoTrans + if trans { + transT = blas.Trans + } + ok := lapack64.Trtrs(transT, t.mat, dst.mat) + if !ok { + return Condition(math.Inf(1)) + } + + work := getFloat64s(3*n, false) + iwork := getInts(n, false) + cond := lapack64.Trcon(CondNorm, t.mat, work, iwork) + putFloat64s(work) + putInts(iwork) + if cond > ConditionTolerance { + return Condition(cond) + } + + return nil +} diff --git a/vendor/gonum.org/v1/gonum/mat/triband.go b/vendor/gonum.org/v1/gonum/mat/triband.go new file mode 100644 index 0000000000..aa0b51d6f7 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/triband.go @@ -0,0 +1,694 @@ +// Copyright ©2018 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/lapack" + "gonum.org/v1/gonum/lapack/lapack64" +) + +var ( + triBand TriBanded + _ Banded = triBand + _ Triangular = triBand + + triBandDense *TriBandDense + _ Matrix = triBandDense + _ allMatrix = triBandDense + _ denseMatrix = triBandDense + _ Triangular = triBandDense + _ Banded = triBandDense + _ TriBanded = triBandDense + _ RawTriBander = triBandDense + _ MutableTriBanded = triBandDense +) + +// TriBanded is a triangular band matrix interface type. +type TriBanded interface { + Banded + + // Triangle returns the number of rows/columns in the matrix and its + // orientation. + Triangle() (n int, kind TriKind) + + // TTri is the equivalent of the T() method in the Matrix interface but + // guarantees the transpose is of triangular type. 
+ TTri() Triangular + + // TriBand returns the number of rows/columns in the matrix, the + // size of the bandwidth, and the orientation. + TriBand() (n, k int, kind TriKind) + + // TTriBand is the equivalent of the T() method in the Matrix interface but + // guarantees the transpose is of banded triangular type. + TTriBand() TriBanded +} + +// A RawTriBander can return a blas64.TriangularBand representation of the receiver. +// Changes to the blas64.TriangularBand.Data slice will be reflected in the original +// matrix, changes to the N, K, Stride, Uplo and Diag fields will not. +type RawTriBander interface { + RawTriBand() blas64.TriangularBand +} + +// MutableTriBanded is a triangular band matrix interface type that allows +// elements to be altered. +type MutableTriBanded interface { + TriBanded + SetTriBand(i, j int, v float64) +} + +var ( + tTriBand TransposeTriBand + _ Matrix = tTriBand + _ TriBanded = tTriBand + _ Untransposer = tTriBand + _ UntransposeTrier = tTriBand + _ UntransposeBander = tTriBand + _ UntransposeTriBander = tTriBand +) + +// TransposeTriBand is a type for performing an implicit transpose of a TriBanded +// matrix. It implements the TriBanded interface, returning values from the +// transpose of the matrix within. +type TransposeTriBand struct { + TriBanded TriBanded +} + +// At returns the value of the element at row i and column j of the transposed +// matrix, that is, row j and column i of the TriBanded field. +func (t TransposeTriBand) At(i, j int) float64 { + return t.TriBanded.At(j, i) +} + +// Dims returns the dimensions of the transposed matrix. TriBanded matrices are +// square and thus this is the same size as the original TriBanded. +func (t TransposeTriBand) Dims() (r, c int) { + c, r = t.TriBanded.Dims() + return r, c +} + +// T performs an implicit transpose by returning the TriBand field. +func (t TransposeTriBand) T() Matrix { + return t.TriBanded +} + +// Triangle returns the number of rows/columns in the matrix and its orientation. +func (t TransposeTriBand) Triangle() (int, TriKind) { + n, upper := t.TriBanded.Triangle() + return n, !upper +} + +// TTri performs an implicit transpose by returning the TriBand field. +func (t TransposeTriBand) TTri() Triangular { + return t.TriBanded +} + +// Bandwidth returns the upper and lower bandwidths of the matrix. +func (t TransposeTriBand) Bandwidth() (kl, ku int) { + kl, ku = t.TriBanded.Bandwidth() + return ku, kl +} + +// TBand performs an implicit transpose by returning the TriBand field. +func (t TransposeTriBand) TBand() Banded { + return t.TriBanded +} + +// TriBand returns the number of rows/columns in the matrix, the +// size of the bandwidth, and the orientation. +func (t TransposeTriBand) TriBand() (n, k int, kind TriKind) { + n, k, kind = t.TriBanded.TriBand() + return n, k, !kind +} + +// TTriBand performs an implicit transpose by returning the TriBand field. +func (t TransposeTriBand) TTriBand() TriBanded { + return t.TriBanded +} + +// Untranspose returns the Triangular field. +func (t TransposeTriBand) Untranspose() Matrix { + return t.TriBanded +} + +// UntransposeTri returns the underlying Triangular matrix. +func (t TransposeTriBand) UntransposeTri() Triangular { + return t.TriBanded +} + +// UntransposeBand returns the underlying Banded matrix. +func (t TransposeTriBand) UntransposeBand() Banded { + return t.TriBanded +} + +// UntransposeTriBand returns the underlying TriBanded matrix. 
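+//
+// A round-trip sketch (illustrative; assumes a concrete TriBandDense):
+//
+//	b := NewTriBandDense(4, 1, Upper, nil)
+//	bt := b.TTriBand()                              // implicit transpose; no data copied
+//	u := bt.(TransposeTriBand).UntransposeTriBand() // u is b again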
+func (t TransposeTriBand) UntransposeTriBand() TriBanded {
+	return t.TriBanded
+}
+
+// TriBandDense represents a triangular band matrix in dense storage format.
+type TriBandDense struct {
+	mat blas64.TriangularBand
+}
+
+// NewTriBandDense creates a new triangular banded matrix with n rows and columns,
+// k bands in the direction of the specified kind. If data == nil,
+// a new slice is allocated for the backing slice. If len(data) == n*(k+1),
+// data is used as the backing slice, and changes to the elements of the returned
+// TriBandDense will be reflected in data. If neither of these is true, NewTriBandDense
+// will panic. k must be at least zero and less than n, otherwise NewTriBandDense will panic.
+//
+// The data must be arranged in row-major order constructed by removing the zeros
+// from the rows outside the band and aligning the diagonals. For example, if
+// the upper-triangular banded matrix
+//
+//	1 2 3 0 0 0
+//	0 4 5 6 0 0
+//	0 0 7 8 9 0
+//	0 0 0 10 11 12
+//	0 0 0 0 13 14
+//	0 0 0 0 0 15
+//
+// becomes (* entries are never accessed)
+//
+//	1 2 3
+//	4 5 6
+//	7 8 9
+//	10 11 12
+//	13 14 *
+//	15 * *
+//
+// which is passed to NewTriBandDense as []float64{1, 2, ..., 15, *, *, *}
+// with k=2 and kind = mat.Upper.
+// The lower triangular banded matrix
+//
+//	1 0 0 0 0 0
+//	2 3 0 0 0 0
+//	4 5 6 0 0 0
+//	0 7 8 9 0 0
+//	0 0 10 11 12 0
+//	0 0 0 13 14 15
+//
+// becomes (* entries are never accessed)
+//
+//	* * 1
+//	* 2 3
+//	4 5 6
+//	7 8 9
+//	10 11 12
+//	13 14 15
+//
+// which is passed to NewTriBandDense as []float64{*, *, *, 1, 2, ..., 15}
+// with k=2 and kind = mat.Lower.
+// Only the values in the band portion of the matrix are used.
+func NewTriBandDense(n, k int, kind TriKind, data []float64) *TriBandDense {
+	if n <= 0 || k < 0 {
+		if n == 0 {
+			panic(ErrZeroLength)
+		}
+		panic(ErrNegativeDimension)
+	}
+	if k+1 > n {
+		panic(ErrBandwidth)
+	}
+	bc := k + 1
+	if data != nil && len(data) != n*bc {
+		panic(ErrShape)
+	}
+	if data == nil {
+		data = make([]float64, n*bc)
+	}
+	uplo := blas.Lower
+	if kind {
+		uplo = blas.Upper
+	}
+	return &TriBandDense{
+		mat: blas64.TriangularBand{
+			Uplo:   uplo,
+			Diag:   blas.NonUnit,
+			N:      n,
+			K:      k,
+			Data:   data,
+			Stride: bc,
+		},
+	}
+}
+
+// Dims returns the number of rows and columns in the matrix.
+func (t *TriBandDense) Dims() (r, c int) {
+	return t.mat.N, t.mat.N
+}
+
+// T performs an implicit transpose by returning the receiver inside a Transpose.
+func (t *TriBandDense) T() Matrix {
+	return Transpose{t}
+}
+
+// IsEmpty returns whether the receiver is empty. Empty matrices can be the
+// receiver for size-restricted operations. The receiver can be emptied using
+// Reset.
+func (t *TriBandDense) IsEmpty() bool {
+	// It must be the case that t.Dims() returns
+	// zeros in this case. See comment in Reset().
+	return t.mat.Stride == 0
+}
+
+// Reset empties the matrix so that it can be reused as the
+// receiver of a dimensionally restricted operation.
+//
+// Reset should not be used when the matrix shares backing data.
+// See the Reseter interface for more information.
+func (t *TriBandDense) Reset() {
+	t.mat.N = 0
+	t.mat.Stride = 0
+	t.mat.K = 0
+	t.mat.Data = t.mat.Data[:0]
+}
+
+// ReuseAsTriBand changes the receiver to be of size n×n, bandwidth k+1 and of
+// the given kind, re-using the backing data slice if it has sufficient capacity
+// and allocating a new slice otherwise. The backing data is zero on return.
+// +// The receiver must be empty, n must be positive and k must be non-negative and +// less than n, otherwise ReuseAsTriBand will panic. To empty the receiver for +// re-use, Reset should be used. +func (t *TriBandDense) ReuseAsTriBand(n, k int, kind TriKind) { + if n <= 0 || k < 0 { + if n == 0 { + panic(ErrZeroLength) + } + panic(ErrNegativeDimension) + } + if k+1 > n { + panic(ErrBandwidth) + } + if !t.IsEmpty() { + panic(ErrReuseNonEmpty) + } + t.reuseAsZeroed(n, k, kind) +} + +// reuseAsZeroed resizes an empty receiver to an n×n triangular band matrix with +// the given bandwidth and orientation. If the receiver is not empty, +// reuseAsZeroed checks that the receiver has the correct size, bandwidth and +// orientation. It then zeros out the matrix data. +func (t *TriBandDense) reuseAsZeroed(n, k int, kind TriKind) { + // reuseAsZeroed must be kept in sync with reuseAsNonZeroed. + if n == 0 { + panic(ErrZeroLength) + } + ul := blas.Lower + if kind == Upper { + ul = blas.Upper + } + if t.IsEmpty() { + t.mat = blas64.TriangularBand{ + Uplo: ul, + Diag: blas.NonUnit, + N: n, + K: k, + Data: useZeroed(t.mat.Data, n*(k+1)), + Stride: k + 1, + } + return + } + if t.mat.N != n || t.mat.K != k { + panic(ErrShape) + } + if t.mat.Uplo != ul { + panic(ErrTriangle) + } + t.Zero() +} + +// reuseAsNonZeroed resizes an empty receiver to an n×n triangular band matrix +// with the given bandwidth and orientation. If the receiver is not empty, +// reuseAsZeroed checks that the receiver has the correct size, bandwidth and +// orientation. +// +//lint:ignore U1000 This will be used later. +func (t *TriBandDense) reuseAsNonZeroed(n, k int, kind TriKind) { + // reuseAsNonZeroed must be kept in sync with reuseAsZeroed. + if n == 0 { + panic(ErrZeroLength) + } + ul := blas.Lower + if kind == Upper { + ul = blas.Upper + } + if t.IsEmpty() { + t.mat = blas64.TriangularBand{ + Uplo: ul, + Diag: blas.NonUnit, + N: n, + K: k, + Data: use(t.mat.Data, n*(k+1)), + Stride: k + 1, + } + return + } + if t.mat.N != n || t.mat.K != k { + panic(ErrShape) + } + if t.mat.Uplo != ul { + panic(ErrTriangle) + } +} + +// DoNonZero calls the function fn for each of the non-zero elements of t. The function fn +// takes a row/column index and the element value of t at (i, j). +func (t *TriBandDense) DoNonZero(fn func(i, j int, v float64)) { + if t.isUpper() { + for i := 0; i < t.mat.N; i++ { + for j := i; j < min(i+t.mat.K+1, t.mat.N); j++ { + v := t.at(i, j) + if v != 0 { + fn(i, j, v) + } + } + } + } else { + for i := 0; i < t.mat.N; i++ { + for j := max(0, i-t.mat.K); j <= i; j++ { + v := t.at(i, j) + if v != 0 { + fn(i, j, v) + } + } + } + } +} + +// DoRowNonZero calls the function fn for each of the non-zero elements of row i of t. The function fn +// takes a row/column index and the element value of t at (i, j). +func (t *TriBandDense) DoRowNonZero(i int, fn func(i, j int, v float64)) { + if i < 0 || t.mat.N <= i { + panic(ErrRowAccess) + } + if t.isUpper() { + for j := i; j < min(i+t.mat.K+1, t.mat.N); j++ { + v := t.at(i, j) + if v != 0 { + fn(i, j, v) + } + } + } else { + for j := max(0, i-t.mat.K); j <= i; j++ { + v := t.at(i, j) + if v != 0 { + fn(i, j, v) + } + } + } +} + +// DoColNonZero calls the function fn for each of the non-zero elements of column j of t. The function fn +// takes a row/column index and the element value of t at (i, j). 
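+//
+// For example (an illustrative sketch; only the stored non-zero is visited):
+//
+//	t := NewTriBandDense(4, 1, Upper, nil)
+//	t.SetTriBand(0, 1, 3)
+//	t.DoColNonZero(1, func(i, j int, v float64) {
+//		// called once, with i=0, j=1, v=3
+//	})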
+func (t *TriBandDense) DoColNonZero(j int, fn func(i, j int, v float64)) { + if j < 0 || t.mat.N <= j { + panic(ErrColAccess) + } + if t.isUpper() { + for i := 0; i < t.mat.N; i++ { + v := t.at(i, j) + if v != 0 { + fn(i, j, v) + } + } + } else { + for i := 0; i < t.mat.N; i++ { + v := t.at(i, j) + if v != 0 { + fn(i, j, v) + } + } + } +} + +// Zero sets all of the matrix elements to zero. +func (t *TriBandDense) Zero() { + if t.isUpper() { + for i := 0; i < t.mat.N; i++ { + u := min(1+t.mat.K, t.mat.N-i) + zero(t.mat.Data[i*t.mat.Stride : i*t.mat.Stride+u]) + } + return + } + for i := 0; i < t.mat.N; i++ { + l := max(0, t.mat.K-i) + zero(t.mat.Data[i*t.mat.Stride+l : i*t.mat.Stride+t.mat.K+1]) + } +} + +func (t *TriBandDense) isUpper() bool { + return isUpperUplo(t.mat.Uplo) +} + +func (t *TriBandDense) triKind() TriKind { + return TriKind(isUpperUplo(t.mat.Uplo)) +} + +// Triangle returns the dimension of t and its orientation. The returned +// orientation is only valid when n is not zero. +func (t *TriBandDense) Triangle() (n int, kind TriKind) { + return t.mat.N, t.triKind() +} + +// TTri performs an implicit transpose by returning the receiver inside a TransposeTri. +func (t *TriBandDense) TTri() Triangular { + return TransposeTri{t} +} + +// Bandwidth returns the upper and lower bandwidths of the matrix. +func (t *TriBandDense) Bandwidth() (kl, ku int) { + if t.isUpper() { + return 0, t.mat.K + } + return t.mat.K, 0 +} + +// TBand performs an implicit transpose by returning the receiver inside a TransposeBand. +func (t *TriBandDense) TBand() Banded { + return TransposeBand{t} +} + +// TriBand returns the number of rows/columns in the matrix, the +// size of the bandwidth, and the orientation. +func (t *TriBandDense) TriBand() (n, k int, kind TriKind) { + return t.mat.N, t.mat.K, TriKind(!t.IsEmpty()) && t.triKind() +} + +// TTriBand performs an implicit transpose by returning the receiver inside a TransposeTriBand. +func (t *TriBandDense) TTriBand() TriBanded { + return TransposeTriBand{t} +} + +// RawTriBand returns the underlying blas64.TriangularBand used by the receiver. +// Changes to the blas64.TriangularBand.Data slice will be reflected in the original +// matrix, changes to the N, K, Stride, Uplo and Diag fields will not. +func (t *TriBandDense) RawTriBand() blas64.TriangularBand { + return t.mat +} + +// SetRawTriBand sets the underlying blas64.TriangularBand used by the receiver. +// Changes to elements in the receiver following the call will be reflected +// in the input. +// +// The supplied TriangularBand must not use blas.Unit storage format. +func (t *TriBandDense) SetRawTriBand(mat blas64.TriangularBand) { + if mat.Diag == blas.Unit { + panic("mat: cannot set TriBand with Unit storage") + } + t.mat = mat +} + +// DiagView returns the diagonal as a matrix backed by the original data. +func (t *TriBandDense) DiagView() Diagonal { + if t.mat.Diag == blas.Unit { + panic("mat: cannot take view of Unit diagonal") + } + n := t.mat.N + data := t.mat.Data + if !t.isUpper() { + data = data[t.mat.K:] + } + return &DiagDense{ + mat: blas64.Vector{ + N: n, + Inc: t.mat.Stride, + Data: data[:(n-1)*t.mat.Stride+1], + }, + } +} + +// Norm returns the specified norm of the receiver. 
Valid norms are: +// +// 1 - The maximum absolute column sum +// 2 - The Frobenius norm, the square root of the sum of the squares of the elements +// Inf - The maximum absolute row sum +// +// Norm will panic with ErrNormOrder if an illegal norm is specified and with +// ErrZeroLength if the matrix has zero size. +func (t *TriBandDense) Norm(norm float64) float64 { + if t.IsEmpty() { + panic(ErrZeroLength) + } + lnorm := normLapack(norm, false) + if lnorm == lapack.MaxColumnSum { + work := getFloat64s(t.mat.N, false) + defer putFloat64s(work) + return lapack64.Lantb(lnorm, t.mat, work) + } + return lapack64.Lantb(lnorm, t.mat, nil) +} + +// Trace returns the trace of the matrix. +// +// Trace will panic with ErrZeroLength if the matrix has zero size. +func (t *TriBandDense) Trace() float64 { + if t.IsEmpty() { + panic(ErrZeroLength) + } + rb := t.RawTriBand() + var tr float64 + var offsetIndex int + if rb.Uplo == blas.Lower { + offsetIndex = rb.K + } + for i := 0; i < rb.N; i++ { + tr += rb.Data[offsetIndex+i*rb.Stride] + } + return tr +} + +// SolveTo solves a triangular system T * X = B or Tᵀ * X = B where T is an +// n×n triangular band matrix represented by the receiver and B is a given +// n×nrhs matrix. If T is non-singular, the result will be stored into dst and +// nil will be returned. If T is singular, the contents of dst will be undefined +// and a Condition error will be returned. +func (t *TriBandDense) SolveTo(dst *Dense, trans bool, b Matrix) error { + n, nrhs := b.Dims() + if n != t.mat.N { + panic(ErrShape) + } + + dst.reuseAsNonZeroed(n, nrhs) + bU, bTrans := untranspose(b) + if dst == bU { + if bTrans { + work := getDenseWorkspace(n, nrhs, false) + defer putDenseWorkspace(work) + work.Copy(b) + dst.Copy(work) + } + } else { + if rm, ok := bU.(RawMatrixer); ok { + dst.checkOverlap(rm.RawMatrix()) + } + dst.Copy(b) + } + + var ok bool + if trans { + ok = lapack64.Tbtrs(blas.Trans, t.mat, dst.mat) + } else { + ok = lapack64.Tbtrs(blas.NoTrans, t.mat, dst.mat) + } + if !ok { + return Condition(math.Inf(1)) + } + return nil +} + +// SolveVecTo solves a triangular system T * x = b or Tᵀ * x = b where T is an +// n×n triangular band matrix represented by the receiver and b is a given +// n-vector. If T is non-singular, the result will be stored into dst and nil +// will be returned. If T is singular, the contents of dst will be undefined and +// a Condition error will be returned. +func (t *TriBandDense) SolveVecTo(dst *VecDense, trans bool, b Vector) error { + n, nrhs := b.Dims() + if n != t.mat.N || nrhs != 1 { + panic(ErrShape) + } + if b, ok := b.(RawVectorer); ok && dst != b { + dst.checkOverlap(b.RawVector()) + } + dst.reuseAsNonZeroed(n) + if dst != b { + dst.CopyVec(b) + } + var ok bool + if trans { + ok = lapack64.Tbtrs(blas.Trans, t.mat, dst.asGeneral()) + } else { + ok = lapack64.Tbtrs(blas.NoTrans, t.mat, dst.asGeneral()) + } + if !ok { + return Condition(math.Inf(1)) + } + return nil +} + +func copySymBandIntoTriBand(dst *TriBandDense, s SymBanded) { + n, k, upper := dst.TriBand() + ns, ks := s.SymBand() + if n != ns { + panic("mat: triangle size mismatch") + } + if k != ks { + panic("mat: triangle bandwidth mismatch") + } + + // TODO(vladimir-ch): implement the missing cases below as needed. + t := dst.mat + sU, _ := untransposeExtract(s) + if sbd, ok := sU.(*SymBandDense); ok { + s := sbd.RawSymBand() + if upper { + if s.Uplo == blas.Upper { + // dst is upper triangular, s is stored in upper triangle. 
+ for i := 0; i < n; i++ { + ilen := min(k+1, n-i) + copy(t.Data[i*t.Stride:i*t.Stride+ilen], s.Data[i*s.Stride:i*s.Stride+ilen]) + } + } else { + // dst is upper triangular, s is stored in lower triangle. + // + // The following is a possible implementation for this case but + // is commented out due to lack of test coverage. + // for i := 0; i < n; i++ { + // ilen := min(k+1, n-i) + // for j := 0; j < ilen; j++ { + // t.Data[i*t.Stride+j] = s.Data[(i+j)*s.Stride+k-j] + // } + // } + panic("not implemented") + } + } else { + if s.Uplo == blas.Upper { + // dst is lower triangular, s is stored in upper triangle. + panic("not implemented") + } else { + // dst is lower triangular, s is stored in lower triangle. + panic("not implemented") + } + } + return + } + if upper { + for i := 0; i < n; i++ { + ilen := min(k+1, n-i) + for j := 0; j < ilen; j++ { + t.Data[i*t.Stride+j] = s.At(i, i+j) + } + } + } else { + panic("not implemented") + } +} diff --git a/vendor/gonum.org/v1/gonum/mat/tridiag.go b/vendor/gonum.org/v1/gonum/mat/tridiag.go new file mode 100644 index 0000000000..c001d48631 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/tridiag.go @@ -0,0 +1,417 @@ +// Copyright ©2020 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/internal/asm/f64" + "gonum.org/v1/gonum/lapack/lapack64" +) + +var ( + tridiagDense *Tridiag + _ Matrix = tridiagDense + _ allMatrix = tridiagDense + _ denseMatrix = tridiagDense + _ Banded = tridiagDense + _ MutableBanded = tridiagDense + _ RawTridiagonaler = tridiagDense +) + +// A RawTridiagonaler can return a lapack64.Tridiagonal representation of the +// receiver. Changes to the elements of DL, D, DU in lapack64.Tridiagonal will +// be reflected in the original matrix, changes to the N field will not. +type RawTridiagonaler interface { + RawTridiagonal() lapack64.Tridiagonal +} + +// Tridiag represents a tridiagonal matrix by its three diagonals. +type Tridiag struct { + mat lapack64.Tridiagonal +} + +// NewTridiag creates a new n×n tridiagonal matrix with the first sub-diagonal +// in dl, the main diagonal in d and the first super-diagonal in du. If all of +// dl, d, and du are nil, new backing slices will be allocated for them. If dl +// and du have length n-1 and d has length n, they will be used as backing +// slices, and changes to the elements of the returned Tridiag will be reflected +// in dl, d, du. If neither of these is true, NewTridiag will panic. +func NewTridiag(n int, dl, d, du []float64) *Tridiag { + if n <= 0 { + if n == 0 { + panic(ErrZeroLength) + } + panic(ErrNegativeDimension) + } + if dl != nil || d != nil || du != nil { + if len(dl) != n-1 || len(d) != n || len(du) != n-1 { + panic(ErrShape) + } + } else { + d = make([]float64, n) + if n > 1 { + dl = make([]float64, n-1) + du = make([]float64, n-1) + } + } + return &Tridiag{ + mat: lapack64.Tridiagonal{ + N: n, + DL: dl, + D: d, + DU: du, + }, + } +} + +// Dims returns the number of rows and columns in the matrix. +func (a *Tridiag) Dims() (r, c int) { + return a.mat.N, a.mat.N +} + +// Bandwidth returns 1, 1 - the upper and lower bandwidths of the matrix. +func (a *Tridiag) Bandwidth() (kl, ku int) { + return 1, 1 +} + +// T performs an implicit transpose by returning the receiver inside a Transpose. 
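+//
+// For example (an illustrative sketch):
+//
+//	a := NewTridiag(3, []float64{7, 8}, []float64{4, 5, 6}, []float64{1, 2})
+//	at := a.T() // a 3×3 view of a with the sub- and super-diagonals exchanged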
+func (a *Tridiag) T() Matrix { + // An alternative would be to return the receiver with DL,DU swapped; the + // untranspose function would then always return false. With Transpose the + // diagonal swapping will be done in tridiagonal routines in lapack like + // lapack64.Gtsv or gonum.Dlagtm based on the trans parameter. + return Transpose{a} +} + +// TBand performs an implicit transpose by returning the receiver inside a +// TransposeBand. +func (a *Tridiag) TBand() Banded { + // An alternative would be to return the receiver with DL,DU swapped; see + // explanation in T above. + return TransposeBand{a} +} + +// RawTridiagonal returns the underlying lapack64.Tridiagonal used by the +// receiver. Changes to elements in the receiver following the call will be +// reflected in the returned matrix. +func (a *Tridiag) RawTridiagonal() lapack64.Tridiagonal { + return a.mat +} + +// SetRawTridiagonal sets the underlying lapack64.Tridiagonal used by the +// receiver. Changes to elements in the receiver following the call will be +// reflected in the input. +func (a *Tridiag) SetRawTridiagonal(mat lapack64.Tridiagonal) { + a.mat = mat +} + +// IsEmpty returns whether the receiver is empty. Empty matrices can be the +// receiver for size-restricted operations. The receiver can be zeroed using +// Reset. +func (a *Tridiag) IsEmpty() bool { + return a.mat.N == 0 +} + +// Reset empties the matrix so that it can be reused as the receiver of a +// dimensionally restricted operation. +// +// Reset should not be used when the matrix shares backing data. See the Reseter +// interface for more information. +func (a *Tridiag) Reset() { + a.mat.N = 0 + a.mat.DL = a.mat.DL[:0] + a.mat.D = a.mat.D[:0] + a.mat.DU = a.mat.DU[:0] +} + +// CloneFromTridiag makes a copy of the input Tridiag into the receiver, +// overwriting the previous value of the receiver. CloneFromTridiag does not +// place any restrictions on receiver shape. +func (a *Tridiag) CloneFromTridiag(from *Tridiag) { + n := from.mat.N + switch n { + case 0: + panic(ErrZeroLength) + case 1: + a.mat = lapack64.Tridiagonal{ + N: 1, + DL: use(a.mat.DL, 0), + D: use(a.mat.D, 1), + DU: use(a.mat.DU, 0), + } + a.mat.D[0] = from.mat.D[0] + default: + a.mat = lapack64.Tridiagonal{ + N: n, + DL: use(a.mat.DL, n-1), + D: use(a.mat.D, n), + DU: use(a.mat.DU, n-1), + } + copy(a.mat.DL, from.mat.DL) + copy(a.mat.D, from.mat.D) + copy(a.mat.DU, from.mat.DU) + } +} + +// DiagView returns the diagonal as a matrix backed by the original data. +func (a *Tridiag) DiagView() Diagonal { + return &DiagDense{ + mat: blas64.Vector{ + N: a.mat.N, + Data: a.mat.D[:a.mat.N], + Inc: 1, + }, + } +} + +// Zero sets all of the matrix elements to zero. +func (a *Tridiag) Zero() { + zero(a.mat.DL) + zero(a.mat.D) + zero(a.mat.DU) +} + +// Trace returns the trace of the matrix. +// +// Trace will panic with ErrZeroLength if the matrix has zero size. +func (a *Tridiag) Trace() float64 { + if a.IsEmpty() { + panic(ErrZeroLength) + } + return f64.Sum(a.mat.D) +} + +// Norm returns the specified norm of the receiver. Valid norms are: +// +// 1 - The maximum absolute column sum +// 2 - The Frobenius norm, the square root of the sum of the squares of the elements +// Inf - The maximum absolute row sum +// +// Norm will panic with ErrNormOrder if an illegal norm is specified and with +// ErrZeroLength if the matrix has zero size. 
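+//
+// For example (an illustrative sketch):
+//
+//	a := NewTridiag(2, []float64{3}, []float64{1, 2}, []float64{4})
+//	frob := a.Norm(2) // sqrt(1²+2²+3²+4²) ≈ 5.48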
+func (a *Tridiag) Norm(norm float64) float64 { + if a.IsEmpty() { + panic(ErrZeroLength) + } + return lapack64.Langt(normLapack(norm, false), a.mat) +} + +// MulVecTo computes A⋅x or Aᵀ⋅x storing the result into dst. +func (a *Tridiag) MulVecTo(dst *VecDense, trans bool, x Vector) { + n := a.mat.N + if x.Len() != n { + panic(ErrShape) + } + dst.reuseAsNonZeroed(n) + t := blas.NoTrans + if trans { + t = blas.Trans + } + xMat, _ := untransposeExtract(x) + if xVec, ok := xMat.(*VecDense); ok && dst != xVec { + dst.checkOverlap(xVec.mat) + lapack64.Lagtm(t, 1, a.mat, xVec.asGeneral(), 0, dst.asGeneral()) + } else { + xCopy := getVecDenseWorkspace(n, false) + xCopy.CloneFromVec(x) + lapack64.Lagtm(t, 1, a.mat, xCopy.asGeneral(), 0, dst.asGeneral()) + putVecDenseWorkspace(xCopy) + } +} + +// SolveTo solves a tridiagonal system A⋅X = B or Aᵀ⋅X = B where A is an +// n×n tridiagonal matrix represented by the receiver and B is a given n×nrhs +// matrix. If A is non-singular, the result will be stored into dst and nil will +// be returned. If A is singular, the contents of dst will be undefined and a +// Condition error will be returned. +func (a *Tridiag) SolveTo(dst *Dense, trans bool, b Matrix) error { + n, nrhs := b.Dims() + if n != a.mat.N { + panic(ErrShape) + } + + dst.reuseAsNonZeroed(n, nrhs) + bU, bTrans := untranspose(b) + if dst == bU { + if bTrans { + work := getDenseWorkspace(n, nrhs, false) + defer putDenseWorkspace(work) + work.Copy(b) + dst.Copy(work) + } + } else { + if rm, ok := bU.(RawMatrixer); ok { + dst.checkOverlap(rm.RawMatrix()) + } + dst.Copy(b) + } + + var aCopy Tridiag + aCopy.CloneFromTridiag(a) + var ok bool + if trans { + ok = lapack64.Gtsv(blas.Trans, aCopy.mat, dst.mat) + } else { + ok = lapack64.Gtsv(blas.NoTrans, aCopy.mat, dst.mat) + } + if !ok { + return Condition(math.Inf(1)) + } + return nil +} + +// SolveVecTo solves a tridiagonal system A⋅X = B or Aᵀ⋅X = B where A is an +// n×n tridiagonal matrix represented by the receiver and b is a given n-vector. +// If A is non-singular, the result will be stored into dst and nil will be +// returned. If A is singular, the contents of dst will be undefined and a +// Condition error will be returned. +func (a *Tridiag) SolveVecTo(dst *VecDense, trans bool, b Vector) error { + n, nrhs := b.Dims() + if n != a.mat.N || nrhs != 1 { + panic(ErrShape) + } + if b, ok := b.(RawVectorer); ok && dst != b { + dst.checkOverlap(b.RawVector()) + } + dst.reuseAsNonZeroed(n) + if dst != b { + dst.CopyVec(b) + } + var aCopy Tridiag + aCopy.CloneFromTridiag(a) + var ok bool + if trans { + ok = lapack64.Gtsv(blas.Trans, aCopy.mat, dst.asGeneral()) + } else { + ok = lapack64.Gtsv(blas.NoTrans, aCopy.mat, dst.asGeneral()) + } + if !ok { + return Condition(math.Inf(1)) + } + return nil +} + +// DoNonZero calls the function fn for each of the non-zero elements of A. The +// function fn takes a row/column index and the element value of A at (i,j). +func (a *Tridiag) DoNonZero(fn func(i, j int, v float64)) { + for i, aij := range a.mat.DU { + if aij != 0 { + fn(i, i+1, aij) + } + } + for i, aii := range a.mat.D { + if aii != 0 { + fn(i, i, aii) + } + } + for i, aij := range a.mat.DL { + if aij != 0 { + fn(i+1, i, aij) + } + } +} + +// DoRowNonZero calls the function fn for each of the non-zero elements of row i +// of A. The function fn takes a row/column index and the element value of A at +// (i,j). 
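+//
+// For example (an illustrative sketch; DL holds the sub-diagonal and DU the
+// super-diagonal):
+//
+//	a := NewTridiag(3, []float64{7, 8}, []float64{4, 5, 6}, []float64{1, 2})
+//	a.DoRowNonZero(1, func(i, j int, v float64) {
+//		// visits (1,0)=7, (1,1)=5 and (1,2)=2
+//	})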
+func (a *Tridiag) DoRowNonZero(i int, fn func(i, j int, v float64)) { + n := a.mat.N + if uint(i) >= uint(n) { + panic(ErrRowAccess) + } + if n == 1 { + v := a.mat.D[0] + if v != 0 { + fn(0, 0, v) + } + return + } + switch i { + case 0: + v := a.mat.D[0] + if v != 0 { + fn(i, 0, v) + } + v = a.mat.DU[0] + if v != 0 { + fn(i, 1, v) + } + case n - 1: + v := a.mat.DL[n-2] + if v != 0 { + fn(n-1, n-2, v) + } + v = a.mat.D[n-1] + if v != 0 { + fn(n-1, n-1, v) + } + default: + v := a.mat.DL[i-1] + if v != 0 { + fn(i, i-1, v) + } + v = a.mat.D[i] + if v != 0 { + fn(i, i, v) + } + v = a.mat.DU[i] + if v != 0 { + fn(i, i+1, v) + } + } +} + +// DoColNonZero calls the function fn for each of the non-zero elements of +// column j of A. The function fn takes a row/column index and the element value +// of A at (i, j). +func (a *Tridiag) DoColNonZero(j int, fn func(i, j int, v float64)) { + n := a.mat.N + if uint(j) >= uint(n) { + panic(ErrColAccess) + } + if n == 1 { + v := a.mat.D[0] + if v != 0 { + fn(0, 0, v) + } + return + } + switch j { + case 0: + v := a.mat.D[0] + if v != 0 { + fn(0, 0, v) + } + v = a.mat.DL[0] + if v != 0 { + fn(1, 0, v) + } + case n - 1: + v := a.mat.DU[n-2] + if v != 0 { + fn(n-2, n-1, v) + } + v = a.mat.D[n-1] + if v != 0 { + fn(n-1, n-1, v) + } + default: + v := a.mat.DU[j-1] + if v != 0 { + fn(j-1, j, v) + } + v = a.mat.D[j] + if v != 0 { + fn(j, j, v) + } + v = a.mat.DL[j] + if v != 0 { + fn(j+1, j, v) + } + } +} diff --git a/vendor/gonum.org/v1/gonum/mat/vector.go b/vendor/gonum.org/v1/gonum/mat/vector.go new file mode 100644 index 0000000000..5c5d3ff749 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mat/vector.go @@ -0,0 +1,855 @@ +// Copyright ©2013 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mat + +import ( + "math" + + "gonum.org/v1/gonum/blas" + "gonum.org/v1/gonum/blas/blas64" + "gonum.org/v1/gonum/internal/asm/f64" +) + +var ( + vector *VecDense + + _ Matrix = vector + _ allMatrix = vector + _ Vector = vector + _ Reseter = vector + _ MutableVector = vector +) + +// Vector is a vector. +type Vector interface { + Matrix + AtVec(int) float64 + Len() int +} + +// A MutableVector can set elements of a vector. +type MutableVector interface { + Vector + SetVec(i int, v float64) +} + +// TransposeVec is a type for performing an implicit transpose of a Vector. +// It implements the Vector interface, returning values from the transpose +// of the vector within. +type TransposeVec struct { + Vector Vector +} + +// At returns the value of the element at row i and column j of the transposed +// matrix, that is, row j and column i of the Vector field. +func (t TransposeVec) At(i, j int) float64 { + return t.Vector.At(j, i) +} + +// AtVec returns the element at position i. It panics if i is out of bounds. +func (t TransposeVec) AtVec(i int) float64 { + return t.Vector.AtVec(i) +} + +// Dims returns the dimensions of the transposed vector. +func (t TransposeVec) Dims() (r, c int) { + c, r = t.Vector.Dims() + return r, c +} + +// T performs an implicit transpose by returning the Vector field. +func (t TransposeVec) T() Matrix { + return t.Vector +} + +// Len returns the number of columns in the vector. +func (t TransposeVec) Len() int { + return t.Vector.Len() +} + +// TVec performs an implicit transpose by returning the Vector field. +func (t TransposeVec) TVec() Vector { + return t.Vector +} + +// Untranspose returns the Vector field. 
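+//
+// A round-trip sketch (illustrative):
+//
+//	v := NewVecDense(3, []float64{1, 2, 3})
+//	vt := v.TVec()                          // a 1×3 row view; no data copied
+//	u := vt.(TransposeVec).UntransposeVec() // u is v again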
+func (t TransposeVec) Untranspose() Matrix { + return t.Vector +} + +func (t TransposeVec) UntransposeVec() Vector { + return t.Vector +} + +// VecDense represents a column vector. +type VecDense struct { + mat blas64.Vector + // A BLAS vector can have a negative increment, but allowing this + // in the mat type complicates a lot of code, and doesn't gain anything. + // VecDense must have positive increment in this package. +} + +// NewVecDense creates a new VecDense of length n. If data == nil, +// a new slice is allocated for the backing slice. If len(data) == n, data is +// used as the backing slice, and changes to the elements of the returned VecDense +// will be reflected in data. If neither of these is true, NewVecDense will panic. +// NewVecDense will panic if n is zero. +func NewVecDense(n int, data []float64) *VecDense { + if n <= 0 { + if n == 0 { + panic(ErrZeroLength) + } + panic("mat: negative dimension") + } + if len(data) != n && data != nil { + panic(ErrShape) + } + if data == nil { + data = make([]float64, n) + } + return &VecDense{ + mat: blas64.Vector{ + N: n, + Inc: 1, + Data: data, + }, + } +} + +// SliceVec returns a new Vector that shares backing data with the receiver. +// The returned matrix starts at i of the receiver and extends k-i elements. +// SliceVec panics with ErrIndexOutOfRange if the slice is outside the capacity +// of the receiver. +func (v *VecDense) SliceVec(i, k int) Vector { + return v.sliceVec(i, k) +} + +func (v *VecDense) sliceVec(i, k int) *VecDense { + if i < 0 || k <= i || v.Cap() < k { + panic(ErrIndexOutOfRange) + } + return &VecDense{ + mat: blas64.Vector{ + N: k - i, + Inc: v.mat.Inc, + Data: v.mat.Data[i*v.mat.Inc : (k-1)*v.mat.Inc+1], + }, + } +} + +// Dims returns the number of rows and columns in the matrix. Columns is always 1 +// for a non-Reset vector. +func (v *VecDense) Dims() (r, c int) { + if v.IsEmpty() { + return 0, 0 + } + return v.mat.N, 1 +} + +// Caps returns the number of rows and columns in the backing matrix. Columns is always 1 +// for a non-Reset vector. +func (v *VecDense) Caps() (r, c int) { + if v.IsEmpty() { + return 0, 0 + } + return v.Cap(), 1 +} + +// Len returns the length of the vector. +func (v *VecDense) Len() int { + return v.mat.N +} + +// Cap returns the capacity of the vector. +func (v *VecDense) Cap() int { + if v.IsEmpty() { + return 0 + } + return (cap(v.mat.Data)-1)/v.mat.Inc + 1 +} + +// T performs an implicit transpose by returning the receiver inside a Transpose. +func (v *VecDense) T() Matrix { + return Transpose{v} +} + +// TVec performs an implicit transpose by returning the receiver inside a TransposeVec. +func (v *VecDense) TVec() Vector { + return TransposeVec{v} +} + +// Reset empties the matrix so that it can be reused as the +// receiver of a dimensionally restricted operation. +// +// Reset should not be used when the matrix shares backing data. +// See the Reseter interface for more information. +func (v *VecDense) Reset() { + // No change of Inc or N to 0 may be + // made unless both are set to 0. + v.mat.Inc = 0 + v.mat.N = 0 + v.mat.Data = v.mat.Data[:0] +} + +// Zero sets all of the matrix elements to zero. +func (v *VecDense) Zero() { + for i := 0; i < v.mat.N; i++ { + v.mat.Data[v.mat.Inc*i] = 0 + } +} + +// CloneFromVec makes a copy of a into the receiver, overwriting the previous value +// of the receiver. 
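+//
+// For example (an illustrative sketch):
+//
+//	src := NewVecDense(3, []float64{1, 2, 3})
+//	var dst VecDense
+//	dst.CloneFromVec(src) // dst now owns an independent copy of the data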
+func (v *VecDense) CloneFromVec(a Vector) { + if v == a { + return + } + n := a.Len() + v.mat = blas64.Vector{ + N: n, + Inc: 1, + Data: use(v.mat.Data, n), + } + if r, ok := a.(RawVectorer); ok { + blas64.Copy(r.RawVector(), v.mat) + return + } + for i := 0; i < a.Len(); i++ { + v.setVec(i, a.AtVec(i)) + } +} + +// VecDenseCopyOf returns a newly allocated copy of the elements of a. +func VecDenseCopyOf(a Vector) *VecDense { + v := &VecDense{} + v.CloneFromVec(a) + return v +} + +// RawVector returns the underlying blas64.Vector used by the receiver. +// Changes to elements in the receiver following the call will be reflected +// in returned blas64.Vector. +func (v *VecDense) RawVector() blas64.Vector { + return v.mat +} + +// SetRawVector sets the underlying blas64.Vector used by the receiver. +// Changes to elements in the receiver following the call will be reflected +// in the input. +func (v *VecDense) SetRawVector(a blas64.Vector) { + v.mat = a +} + +// CopyVec makes a copy of elements of a into the receiver. It is similar to the +// built-in copy; it copies as much as the overlap between the two vectors and +// returns the number of elements it copied. +func (v *VecDense) CopyVec(a Vector) int { + n := min(v.Len(), a.Len()) + if v == a { + return n + } + if r, ok := a.(RawVectorer); ok { + src := r.RawVector() + src.N = n + dst := v.mat + dst.N = n + blas64.Copy(src, dst) + return n + } + for i := 0; i < n; i++ { + v.setVec(i, a.AtVec(i)) + } + return n +} + +// Norm returns the specified norm of the receiver. Valid norms are: +// +// 1 - The sum of the element magnitudes +// 2 - The Euclidean norm, the square root of the sum of the squares of the elements +// Inf - The maximum element magnitude +// +// Norm will panic with ErrNormOrder if an illegal norm is specified and with +// ErrZeroLength if the vector has zero size. +func (v *VecDense) Norm(norm float64) float64 { + if v.IsEmpty() { + panic(ErrZeroLength) + } + switch norm { + default: + panic(ErrNormOrder) + case 1: + return blas64.Asum(v.mat) + case 2: + return blas64.Nrm2(v.mat) + case math.Inf(1): + imax := blas64.Iamax(v.mat) + return math.Abs(v.at(imax)) + } +} + +// ScaleVec scales the vector a by alpha, placing the result in the receiver. +func (v *VecDense) ScaleVec(alpha float64, a Vector) { + n := a.Len() + + if v == a { + if v.mat.Inc == 1 { + f64.ScalUnitary(alpha, v.mat.Data) + return + } + f64.ScalInc(alpha, v.mat.Data, uintptr(n), uintptr(v.mat.Inc)) + return + } + + v.reuseAsNonZeroed(n) + + if rv, ok := a.(RawVectorer); ok { + mat := rv.RawVector() + v.checkOverlap(mat) + if v.mat.Inc == 1 && mat.Inc == 1 { + f64.ScalUnitaryTo(v.mat.Data, alpha, mat.Data) + return + } + f64.ScalIncTo(v.mat.Data, uintptr(v.mat.Inc), + alpha, mat.Data, uintptr(n), uintptr(mat.Inc)) + return + } + + for i := 0; i < n; i++ { + v.setVec(i, alpha*a.AtVec(i)) + } +} + +// AddScaledVec adds the vectors a and alpha*b, placing the result in the receiver. 
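+//
+// For example (an illustrative sketch):
+//
+//	a := NewVecDense(2, []float64{1, 1})
+//	b := NewVecDense(2, []float64{2, 3})
+//	var v VecDense
+//	v.AddScaledVec(a, 10, b) // v = a + 10*b = {21, 31}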
+func (v *VecDense) AddScaledVec(a Vector, alpha float64, b Vector) { + if alpha == 1 { + v.AddVec(a, b) + return + } + if alpha == -1 { + v.SubVec(a, b) + return + } + + ar := a.Len() + br := b.Len() + + if ar != br { + panic(ErrShape) + } + + var amat, bmat blas64.Vector + fast := true + aU, _ := untransposeExtract(a) + if rv, ok := aU.(*VecDense); ok { + amat = rv.mat + if v != a { + v.checkOverlap(amat) + } + } else { + fast = false + } + bU, _ := untransposeExtract(b) + if rv, ok := bU.(*VecDense); ok { + bmat = rv.mat + if v != b { + v.checkOverlap(bmat) + } + } else { + fast = false + } + + v.reuseAsNonZeroed(ar) + + switch { + case alpha == 0: // v <- a + if v == a { + return + } + v.CopyVec(a) + case v == a && v == b: // v <- v + alpha * v = (alpha + 1) * v + blas64.Scal(alpha+1, v.mat) + case !fast: // v <- a + alpha * b without blas64 support. + for i := 0; i < ar; i++ { + v.setVec(i, a.AtVec(i)+alpha*b.AtVec(i)) + } + case v == a && v != b: // v <- v + alpha * b + if v.mat.Inc == 1 && bmat.Inc == 1 { + // Fast path for a common case. + f64.AxpyUnitaryTo(v.mat.Data, alpha, bmat.Data, amat.Data) + } else { + f64.AxpyInc(alpha, bmat.Data, v.mat.Data, + uintptr(ar), uintptr(bmat.Inc), uintptr(v.mat.Inc), 0, 0) + } + default: // v <- a + alpha * b or v <- a + alpha * v + if v.mat.Inc == 1 && amat.Inc == 1 && bmat.Inc == 1 { + // Fast path for a common case. + f64.AxpyUnitaryTo(v.mat.Data, alpha, bmat.Data, amat.Data) + } else { + f64.AxpyIncTo(v.mat.Data, uintptr(v.mat.Inc), 0, + alpha, bmat.Data, amat.Data, + uintptr(ar), uintptr(bmat.Inc), uintptr(amat.Inc), 0, 0) + } + } +} + +// AddVec adds the vectors a and b, placing the result in the receiver. +func (v *VecDense) AddVec(a, b Vector) { + ar := a.Len() + br := b.Len() + + if ar != br { + panic(ErrShape) + } + + v.reuseAsNonZeroed(ar) + + aU, _ := untransposeExtract(a) + bU, _ := untransposeExtract(b) + + if arv, ok := aU.(*VecDense); ok { + if brv, ok := bU.(*VecDense); ok { + amat := arv.mat + bmat := brv.mat + + if v != a { + v.checkOverlap(amat) + } + if v != b { + v.checkOverlap(bmat) + } + + if v.mat.Inc == 1 && amat.Inc == 1 && bmat.Inc == 1 { + // Fast path for a common case. + f64.AxpyUnitaryTo(v.mat.Data, 1, bmat.Data, amat.Data) + return + } + f64.AxpyIncTo(v.mat.Data, uintptr(v.mat.Inc), 0, + 1, bmat.Data, amat.Data, + uintptr(ar), uintptr(bmat.Inc), uintptr(amat.Inc), 0, 0) + return + } + } + + for i := 0; i < ar; i++ { + v.setVec(i, a.AtVec(i)+b.AtVec(i)) + } +} + +// SubVec subtracts the vector b from a, placing the result in the receiver. +func (v *VecDense) SubVec(a, b Vector) { + ar := a.Len() + br := b.Len() + + if ar != br { + panic(ErrShape) + } + + v.reuseAsNonZeroed(ar) + + aU, _ := untransposeExtract(a) + bU, _ := untransposeExtract(b) + + if arv, ok := aU.(*VecDense); ok { + if brv, ok := bU.(*VecDense); ok { + amat := arv.mat + bmat := brv.mat + + if v != a { + v.checkOverlap(amat) + } + if v != b { + v.checkOverlap(bmat) + } + + if v.mat.Inc == 1 && amat.Inc == 1 && bmat.Inc == 1 { + // Fast path for a common case. + f64.AxpyUnitaryTo(v.mat.Data, -1, bmat.Data, amat.Data) + return + } + f64.AxpyIncTo(v.mat.Data, uintptr(v.mat.Inc), 0, + -1, bmat.Data, amat.Data, + uintptr(ar), uintptr(bmat.Inc), uintptr(amat.Inc), 0, 0) + return + } + } + + for i := 0; i < ar; i++ { + v.setVec(i, a.AtVec(i)-b.AtVec(i)) + } +} + +// MulElemVec performs element-wise multiplication of a and b, placing the result +// in the receiver. 
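+//
+// For example (an illustrative sketch):
+//
+//	a := NewVecDense(2, []float64{2, 3})
+//	b := NewVecDense(2, []float64{4, 5})
+//	var v VecDense
+//	v.MulElemVec(a, b) // v = {8, 15}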
+func (v *VecDense) MulElemVec(a, b Vector) { + ar := a.Len() + br := b.Len() + + if ar != br { + panic(ErrShape) + } + + v.reuseAsNonZeroed(ar) + + aU, _ := untransposeExtract(a) + bU, _ := untransposeExtract(b) + + if arv, ok := aU.(*VecDense); ok { + if brv, ok := bU.(*VecDense); ok { + amat := arv.mat + bmat := brv.mat + + if v != a { + v.checkOverlap(amat) + } + if v != b { + v.checkOverlap(bmat) + } + + if v.mat.Inc == 1 && amat.Inc == 1 && bmat.Inc == 1 { + // Fast path for a common case. + for i, a := range amat.Data { + v.mat.Data[i] = a * bmat.Data[i] + } + return + } + var ia, ib int + for i := 0; i < ar; i++ { + v.setVec(i, amat.Data[ia]*bmat.Data[ib]) + ia += amat.Inc + ib += bmat.Inc + } + return + } + } + + for i := 0; i < ar; i++ { + v.setVec(i, a.AtVec(i)*b.AtVec(i)) + } +} + +// DivElemVec performs element-wise division of a by b, placing the result +// in the receiver. +func (v *VecDense) DivElemVec(a, b Vector) { + ar := a.Len() + br := b.Len() + + if ar != br { + panic(ErrShape) + } + + v.reuseAsNonZeroed(ar) + + aU, _ := untransposeExtract(a) + bU, _ := untransposeExtract(b) + + if arv, ok := aU.(*VecDense); ok { + if brv, ok := bU.(*VecDense); ok { + amat := arv.mat + bmat := brv.mat + + if v != a { + v.checkOverlap(amat) + } + if v != b { + v.checkOverlap(bmat) + } + + if v.mat.Inc == 1 && amat.Inc == 1 && bmat.Inc == 1 { + // Fast path for a common case. + for i, a := range amat.Data { + v.setVec(i, a/bmat.Data[i]) + } + return + } + var ia, ib int + for i := 0; i < ar; i++ { + v.setVec(i, amat.Data[ia]/bmat.Data[ib]) + ia += amat.Inc + ib += bmat.Inc + } + } + } + + for i := 0; i < ar; i++ { + v.setVec(i, a.AtVec(i)/b.AtVec(i)) + } +} + +// MulVec computes a * b. The result is stored into the receiver. +// MulVec panics if the number of columns in a does not equal the number of rows in b +// or if the number of columns in b does not equal 1. +func (v *VecDense) MulVec(a Matrix, b Vector) { + r, c := a.Dims() + br, bc := b.Dims() + if c != br || bc != 1 { + panic(ErrShape) + } + + aU, trans := untransposeExtract(a) + var bmat blas64.Vector + fast := true + bU, _ := untransposeExtract(b) + if rv, ok := bU.(*VecDense); ok { + bmat = rv.mat + if v != b { + v.checkOverlap(bmat) + } + } else { + fast = false + } + + v.reuseAsNonZeroed(r) + var restore func() + if v == aU { + v, restore = v.isolatedWorkspace(aU.(*VecDense)) + defer restore() + } else if v == b { + v, restore = v.isolatedWorkspace(b) + defer restore() + } + + // TODO(kortschak): Improve the non-fast paths. + switch aU := aU.(type) { + case Vector: + if b.Len() == 1 { + // {n,1} x {1,1} + v.ScaleVec(b.AtVec(0), aU) + return + } + + // {1,n} x {n,1} + if fast { + if rv, ok := aU.(*VecDense); ok { + amat := rv.mat + if v != aU { + v.checkOverlap(amat) + } + + if amat.Inc == 1 && bmat.Inc == 1 { + // Fast path for a common case. 
+ v.setVec(0, f64.DotUnitary(amat.Data, bmat.Data)) + return + } + v.setVec(0, f64.DotInc(amat.Data, bmat.Data, + uintptr(c), uintptr(amat.Inc), uintptr(bmat.Inc), 0, 0)) + return + } + } + var sum float64 + for i := 0; i < c; i++ { + sum += aU.AtVec(i) * b.AtVec(i) + } + v.setVec(0, sum) + return + case *SymBandDense: + if fast { + aU.checkOverlap(v.asGeneral()) + blas64.Sbmv(1, aU.mat, bmat, 0, v.mat) + return + } + case *SymDense: + if fast { + aU.checkOverlap(v.asGeneral()) + blas64.Symv(1, aU.mat, bmat, 0, v.mat) + return + } + case *TriDense: + if fast { + v.CopyVec(b) + aU.checkOverlap(v.asGeneral()) + ta := blas.NoTrans + if trans { + ta = blas.Trans + } + blas64.Trmv(ta, aU.mat, v.mat) + return + } + case *Dense: + if fast { + aU.checkOverlap(v.asGeneral()) + t := blas.NoTrans + if trans { + t = blas.Trans + } + blas64.Gemv(t, 1, aU.mat, bmat, 0, v.mat) + return + } + default: + if fast { + for i := 0; i < r; i++ { + var f float64 + for j := 0; j < c; j++ { + f += a.At(i, j) * bmat.Data[j*bmat.Inc] + } + v.setVec(i, f) + } + return + } + } + + for i := 0; i < r; i++ { + var f float64 + for j := 0; j < c; j++ { + f += a.At(i, j) * b.AtVec(j) + } + v.setVec(i, f) + } +} + +// ReuseAsVec changes the receiver if it IsEmpty() to be of size n×1. +// +// ReuseAsVec re-uses the backing data slice if it has sufficient capacity, +// otherwise a new slice is allocated. The backing data is zero on return. +// +// ReuseAsVec panics if the receiver is not empty, and panics if +// the input size is less than one. To empty the receiver for re-use, +// Reset should be used. +func (v *VecDense) ReuseAsVec(n int) { + if n <= 0 { + if n == 0 { + panic(ErrZeroLength) + } + panic(ErrNegativeDimension) + } + if !v.IsEmpty() { + panic(ErrReuseNonEmpty) + } + v.reuseAsZeroed(n) +} + +// reuseAsNonZeroed resizes an empty vector to a r×1 vector, +// or checks that a non-empty matrix is r×1. +func (v *VecDense) reuseAsNonZeroed(r int) { + // reuseAsNonZeroed must be kept in sync with reuseAsZeroed. + if r == 0 { + panic(ErrZeroLength) + } + if v.IsEmpty() { + v.mat = blas64.Vector{ + N: r, + Inc: 1, + Data: use(v.mat.Data, r), + } + return + } + if r != v.mat.N { + panic(ErrShape) + } +} + +// reuseAsZeroed resizes an empty vector to a r×1 vector, +// or checks that a non-empty matrix is r×1. +func (v *VecDense) reuseAsZeroed(r int) { + // reuseAsZeroed must be kept in sync with reuseAsNonZeroed. + if r == 0 { + panic(ErrZeroLength) + } + if v.IsEmpty() { + v.mat = blas64.Vector{ + N: r, + Inc: 1, + Data: useZeroed(v.mat.Data, r), + } + return + } + if r != v.mat.N { + panic(ErrShape) + } + v.Zero() +} + +// IsEmpty returns whether the receiver is empty. Empty matrices can be the +// receiver for size-restricted operations. The receiver can be emptied using +// Reset. +func (v *VecDense) IsEmpty() bool { + // It must be the case that v.Dims() returns + // zeros in this case. See comment in Reset(). + return v.mat.Inc == 0 +} + +func (v *VecDense) isolatedWorkspace(a Vector) (n *VecDense, restore func()) { + l := a.Len() + if l == 0 { + panic(ErrZeroLength) + } + n = getVecDenseWorkspace(l, false) + return n, func() { + v.CopyVec(n) + putVecDenseWorkspace(n) + } +} + +// asDense returns a Dense representation of the receiver with the same +// underlying data. +func (v *VecDense) asDense() *Dense { + return &Dense{ + mat: v.asGeneral(), + capRows: v.mat.N, + capCols: 1, + } +} + +// asGeneral returns a blas64.General representation of the receiver with the +// same underlying data. 
+func (v *VecDense) asGeneral() blas64.General {
+	return blas64.General{
+		Rows:   v.mat.N,
+		Cols:   1,
+		Stride: v.mat.Inc,
+		Data:   v.mat.Data,
+	}
+}
+
+// ColViewOf reflects the column j of the RawMatrixer m, into the receiver
+// backed by the same underlying data. The receiver must either be empty or
+// have length equal to the number of rows of m.
+func (v *VecDense) ColViewOf(m RawMatrixer, j int) {
+	rm := m.RawMatrix()
+
+	if j >= rm.Cols || j < 0 {
+		panic(ErrColAccess)
+	}
+	if !v.IsEmpty() && v.mat.N != rm.Rows {
+		panic(ErrShape)
+	}
+
+	v.mat.Inc = rm.Stride
+	v.mat.Data = rm.Data[j : (rm.Rows-1)*rm.Stride+j+1]
+	v.mat.N = rm.Rows
+}
+
+// RowViewOf reflects the row i of the RawMatrixer m, into the receiver
+// backed by the same underlying data. The receiver must either be
+// empty or have length equal to the number of columns of m.
+func (v *VecDense) RowViewOf(m RawMatrixer, i int) {
+	rm := m.RawMatrix()
+
+	if i >= rm.Rows || i < 0 {
+		panic(ErrRowAccess)
+	}
+	if !v.IsEmpty() && v.mat.N != rm.Cols {
+		panic(ErrShape)
+	}
+
+	v.mat.Inc = 1
+	v.mat.Data = rm.Data[i*rm.Stride : i*rm.Stride+rm.Cols]
+	v.mat.N = rm.Cols
+}
+
+// Permute rearranges the elements of the n-vector v in the receiver as
+// specified by the permutation p[0],p[1],...,p[n-1] of the integers 0,...,n-1.
+//
+// If inverse is false, the given permutation is applied:
+//
+//	v[p[i]] is moved to v[i] for i=0,1,...,n-1.
+//
+// If inverse is true, the inverse permutation is applied:
+//
+//	v[i] is moved to v[p[i]] for i=0,1,...,n-1.
+//
+// p must have length n, otherwise Permute will panic.
+func (v *VecDense) Permute(p []int, inverse bool) {
+	v.asDense().PermuteRows(p, inverse)
+}
diff --git a/vendor/gonum.org/v1/gonum/mathext/README.md b/vendor/gonum.org/v1/gonum/mathext/README.md
new file mode 100644
index 0000000000..9f462f11f4
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mathext/README.md
@@ -0,0 +1,6 @@
+# mathext
+
+[![go.dev reference](https://pkg.go.dev/badge/gonum.org/v1/gonum/mathext)](https://pkg.go.dev/gonum.org/v1/gonum/mathext)
+[![GoDoc](https://godocs.io/gonum.org/v1/gonum/mathext?status.svg)](https://godocs.io/gonum.org/v1/gonum/mathext)
+
+Package mathext implements basic elementary functions not included in the Go standard library.
diff --git a/vendor/gonum.org/v1/gonum/mathext/airy.go b/vendor/gonum.org/v1/gonum/mathext/airy.go
new file mode 100644
index 0000000000..f2904b4766
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mathext/airy.go
@@ -0,0 +1,41 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mathext
+
+import "gonum.org/v1/gonum/mathext/internal/amos"
+
+// AiryAi returns the value of the Airy function at z. The Airy function here,
+// Ai(z), is one of the two linearly independent solutions to
+//
+//	y′′ - y*z = 0.
+//
+// See http://mathworld.wolfram.com/AiryFunctions.html for more detailed information.
+func AiryAi(z complex128) complex128 {
+	// id specifies the order of the derivative to compute,
+	// 0 for the function itself and 1 for the derivative.
+	// kode specifies the scaling option. See the function
+	// documentation for the exact behavior.
+	id := 0
+	kode := 1
+	air, aii, _, _ := amos.Zairy(real(z), imag(z), id, kode)
+	return complex(air, aii)
+}
+
+// AiryAiDeriv returns the value of the derivative of the Airy function at z. The
+// Airy function here, Ai(z), is one of the two linearly independent solutions to
+//
+//	y′′ - y*z = 0.
+//
+// See http://mathworld.wolfram.com/AiryFunctions.html for more detailed information.
+func AiryAiDeriv(z complex128) complex128 {
+	// id specifies the order of the derivative to compute,
+	// 0 for the function itself and 1 for the derivative.
+	// kode specifies the scaling option. See the function
+	// documentation for the exact behavior.
+	id := 1
+	kode := 1
+	air, aii, _, _ := amos.Zairy(real(z), imag(z), id, kode)
+	return complex(air, aii)
+}
diff --git a/vendor/gonum.org/v1/gonum/mathext/beta.go b/vendor/gonum.org/v1/gonum/mathext/beta.go
new file mode 100644
index 0000000000..2df51f3ddc
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mathext/beta.go
@@ -0,0 +1,40 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mathext
+
+import "gonum.org/v1/gonum/mathext/internal/gonum"
+
+// Beta returns the value of the complete beta function B(a, b). It is defined as
+//
+//	Γ(a)Γ(b) / Γ(a+b)
+//
+// Special cases are:
+//
+//	B(a,b) returns NaN if a or b is Inf
+//	B(a,b) returns NaN if a and b are 0
+//	B(a,b) returns NaN if a or b is NaN
+//	B(a,b) returns NaN if a or b is < 0
+//	B(a,b) returns +Inf if a xor b is 0.
+//
+// See http://mathworld.wolfram.com/BetaFunction.html for more detailed information.
+func Beta(a, b float64) float64 {
+	return gonum.Beta(a, b)
+}
+
+// Lbeta returns the natural logarithm of the complete beta function B(a,b).
+// Lbeta is defined as:
+//
+//	Ln(Γ(a)Γ(b)/Γ(a+b))
+//
+// Special cases are:
+//
+//	Lbeta(a,b) returns NaN if a or b is Inf
+//	Lbeta(a,b) returns NaN if a and b are 0
+//	Lbeta(a,b) returns NaN if a or b is NaN
+//	Lbeta(a,b) returns NaN if a or b is < 0
+//	Lbeta(a,b) returns +Inf if a xor b is 0.
+func Lbeta(a, b float64) float64 {
+	return gonum.Lbeta(a, b)
+}
diff --git a/vendor/gonum.org/v1/gonum/mathext/betainc.go b/vendor/gonum.org/v1/gonum/mathext/betainc.go
new file mode 100644
index 0000000000..9a0c61a9bb
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mathext/betainc.go
@@ -0,0 +1,33 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mathext
+
+import "gonum.org/v1/gonum/mathext/internal/cephes"
+
+// RegIncBeta returns the value of the regularized incomplete beta function
+// I(x;a,b). It is defined as
+//
+//	I(x;a,b) = B(x;a,b) / B(a,b)
+//	         = Γ(a+b) / (Γ(a)*Γ(b)) * int_0^x u^(a-1) * (1-u)^(b-1) du.
+//
+// The domain of definition is 0 <= x <= 1, and the parameters a and b must be positive.
+// For other values of x, a, and b RegIncBeta will panic.
+func RegIncBeta(a, b float64, x float64) float64 {
+	return cephes.Incbet(a, b, x)
+}
+
+// InvRegIncBeta computes the inverse of the regularized incomplete beta function.
+// It returns the x for which
+//
+//	y = I(x;a,b)
+//
+// The domain of definition is 0 <= y <= 1, and the parameters a and b must be
+// positive. For other values of y, a, and b InvRegIncBeta will panic.
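+//
+// Since I(x;a,b) is the CDF of a Beta(a,b) random variable, quantiles follow
+// directly; for example, the 0.95 quantile of Beta(2, 3) is
+//
+//	x := InvRegIncBeta(2, 3, 0.95)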
+func InvRegIncBeta(a, b float64, y float64) float64 {
+	if y < 0 || 1 < y {
+		panic("mathext: parameter out of range")
+	}
+	return cephes.Incbi(a, b, y)
+}
diff --git a/vendor/gonum.org/v1/gonum/mathext/digamma.go b/vendor/gonum.org/v1/gonum/mathext/digamma.go
new file mode 100644
index 0000000000..67ebf007ff
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mathext/digamma.go
@@ -0,0 +1,45 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mathext
+
+import (
+	"math"
+)
+
+// Digamma returns the logarithmic derivative of the gamma function at x.
+//
+//	ψ(x) = d/dx (Ln (Γ(x))).
+func Digamma(x float64) float64 {
+	// This is adapted from
+	// http://web.science.mq.edu.au/~mjohnson/code/digamma.c
+	var result float64
+	switch {
+	case math.IsNaN(x), math.IsInf(x, 1):
+		return x
+	case math.IsInf(x, -1):
+		return math.NaN()
+	case x == 0:
+		return math.Copysign(math.Inf(1), -x)
+	case x < 0:
+		if x == math.Floor(x) {
+			return math.NaN()
+		}
+		// Reflection formula, http://dlmf.nist.gov/5.5#E4
+		_, r := math.Modf(x)
+		result = -math.Pi / math.Tan(math.Pi*r)
+		x = 1 - x
+	}
+	for ; x < 7; x++ {
+		// Recurrence relation, http://dlmf.nist.gov/5.5#E2
+		result -= 1 / x
+	}
+	x -= 0.5
+	xx := 1 / x
+	xx2 := xx * xx
+	xx4 := xx2 * xx2
+	// Asymptotic expansion, http://dlmf.nist.gov/5.11#E2
+	result += math.Log(x) + (1.0/24.0)*xx2 - (7.0/960.0)*xx4 + (31.0/8064.0)*xx4*xx2 - (127.0/30720.0)*xx4*xx4
+	return result
+}
diff --git a/vendor/gonum.org/v1/gonum/mathext/doc.go b/vendor/gonum.org/v1/gonum/mathext/doc.go
new file mode 100644
index 0000000000..539622d4a8
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mathext/doc.go
@@ -0,0 +1,7 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package mathext implements special math functions not implemented by the
+// Go standard library.
+package mathext // import "gonum.org/v1/gonum/mathext"
diff --git a/vendor/gonum.org/v1/gonum/mathext/ell_carlson.go b/vendor/gonum.org/v1/gonum/mathext/ell_carlson.go
new file mode 100644
index 0000000000..1334f6b94a
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mathext/ell_carlson.go
@@ -0,0 +1,168 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mathext
+
+import (
+	"math"
+)
+
+// EllipticRF computes the symmetric elliptic integral R_F(x,y,z):
+//
+//	R_F(x,y,z) = (1/2)\int_{0}^{\infty}{1/s(t)} dt,
+//	s(t) = \sqrt{(t+x)(t+y)(t+z)}.
+//
+// The arguments x, y, z must satisfy the following conditions, otherwise the function returns math.NaN():
+//
+//	0 ≤ x,y,z ≤ upper,
+//	lower ≤ x+y,y+z,z+x,
+//
+// where:
+//
+//	lower = 5/(2^1022) = 1.112536929253601e-307,
+//	upper = (2^1022)/5 = 8.988465674311580e+306.
+//
+// The definition of the symmetric elliptic integral R_F can be found in NIST
+// Digital Library of Mathematical Functions (http://dlmf.nist.gov/19.16.E1).
+func EllipticRF(x, y, z float64) float64 {
+	// The original Fortran code was published as Algorithm 577 in ACM TOMS (http://doi.org/10.1145/355958.355970).
+	// This code is also available as a part of SLATEC Common Mathematical Library (http://netlib.org/slatec/index.html).
Later, Carlson described + // an improved version in http://dx.doi.org/10.1007/BF02198293 (also available at https://arxiv.org/abs/math/9409227). + const ( + lower = 5.0 / (1 << 256) / (1 << 256) / (1 << 256) / (1 << 254) // 5*2^-1022 + upper = 1 / lower + tol = 1.2674918778210762260320167734407048051023273568443e-02 // (3ε)^(1/8) + ) + if x < 0 || y < 0 || z < 0 || math.IsNaN(x) || math.IsNaN(y) || math.IsNaN(z) { + return math.NaN() + } + if upper < x || upper < y || upper < z { + return math.NaN() + } + if x+y < lower || y+z < lower || z+x < lower { + return math.NaN() + } + + A0 := (x + y + z) / 3 + An := A0 + Q := math.Max(math.Max(math.Abs(A0-x), math.Abs(A0-y)), math.Abs(A0-z)) / tol + xn, yn, zn := x, y, z + mul := 1.0 + + for Q >= mul*math.Abs(An) { + xnsqrt, ynsqrt, znsqrt := math.Sqrt(xn), math.Sqrt(yn), math.Sqrt(zn) + lambda := xnsqrt*ynsqrt + ynsqrt*znsqrt + znsqrt*xnsqrt + An = (An + lambda) * 0.25 + xn = (xn + lambda) * 0.25 + yn = (yn + lambda) * 0.25 + zn = (zn + lambda) * 0.25 + mul *= 4 + } + + X := (A0 - x) / (mul * An) + Y := (A0 - y) / (mul * An) + Z := -(X + Y) + E2 := X*Y - Z*Z + E3 := X * Y * Z + + // http://dlmf.nist.gov/19.36.E1 + return (1 - 1/10.0*E2 + 1/14.0*E3 + 1/24.0*E2*E2 - 3/44.0*E2*E3 - 5/208.0*E2*E2*E2 + 3/104.0*E3*E3 + 1/16.0*E2*E2*E3) / math.Sqrt(An) +} + +// EllipticRD computes the symmetric elliptic integral R_D(x,y,z): +// +// R_D(x,y,z) = (1/2)\int_{0}^{\infty}{1/(s(t)(t+z))} dt, +// s(t) = \sqrt{(t+x)(t+y)(t+z)}. +// +// The arguments x, y, z must satisfy the following conditions, otherwise the function returns math.NaN(): +// +// 0 ≤ x,y ≤ upper, +// lower ≤ z ≤ upper, +// lower ≤ x+y, +// +// where: +// +// lower = (5/(2^1022))^(1/3) = 4.809554074311679e-103, +// upper = ((2^1022)/5)^(1/3) = 2.079194837087086e+102. +// +// The definition of the symmetric elliptic integral R_D can be found in NIST +// Digital Library of Mathematical Functions (http://dlmf.nist.gov/19.16.E5). +func EllipticRD(x, y, z float64) float64 { + // The original Fortran code was published as Algorithm 577 in ACM TOMS (http://doi.org/10.1145/355958.355970). + // This code is also available as a part of SLATEC Common Mathematical Library (http://netlib.org/slatec/index.html). Later, Carlson described + // an improved version in http://dx.doi.org/10.1007/BF02198293 (also available at https://arxiv.org/abs/math/9409227). 
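+	// The computation follows Carlson's duplication algorithm: the arguments
+	// are repeatedly averaged through lambda while the partial sum s is
+	// accumulated, and the remainder is evaluated with the series expansion
+	// below (http://dlmf.nist.gov/19.36.E2).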
+ const ( + lower = 4.8095540743116787026618007863123676393525016818363e-103 // (5*2^-1022)^(1/3) + upper = 1 / lower + tol = 9.0351169339315770474760122547068324993857488849382e-03 // (ε/5)^(1/8) + ) + if x < 0 || y < 0 || math.IsNaN(x) || math.IsNaN(y) || math.IsNaN(z) { + return math.NaN() + } + if upper < x || upper < y || upper < z { + return math.NaN() + } + if x+y < lower || z < lower { + return math.NaN() + } + + A0 := (x + y + 3*z) / 5 + An := A0 + Q := math.Max(math.Max(math.Abs(A0-x), math.Abs(A0-y)), math.Abs(A0-z)) / tol + xn, yn, zn := x, y, z + mul, s := 1.0, 0.0 + + for Q >= mul*math.Abs(An) { + xnsqrt, ynsqrt, znsqrt := math.Sqrt(xn), math.Sqrt(yn), math.Sqrt(zn) + lambda := xnsqrt*ynsqrt + ynsqrt*znsqrt + znsqrt*xnsqrt + s += 1 / (mul * znsqrt * (zn + lambda)) + An = (An + lambda) * 0.25 + xn = (xn + lambda) * 0.25 + yn = (yn + lambda) * 0.25 + zn = (zn + lambda) * 0.25 + mul *= 4 + } + + X := (A0 - x) / (mul * An) + Y := (A0 - y) / (mul * An) + Z := -(X + Y) / 3 + E2 := X*Y - 6*Z*Z + E3 := (3*X*Y - 8*Z*Z) * Z + E4 := 3 * (X*Y - Z*Z) * Z * Z + E5 := X * Y * Z * Z * Z + + // http://dlmf.nist.gov/19.36.E2 + return (1-3/14.0*E2+1/6.0*E3+9/88.0*E2*E2-3/22.0*E4-9/52.0*E2*E3+3/26.0*E5-1/16.0*E2*E2*E2+3/40.0*E3*E3+3/20.0*E2*E4+45/272.0*E2*E2*E3-9/68.0*(E3*E4+E2*E5))/(mul*An*math.Sqrt(An)) + 3*s +} + +// EllipticF computes the Legendre's elliptic integral of the 1st kind F(phi,m), 0≤m<1: +// +// F(\phi,m) = \int_{0}^{\phi} 1 / \sqrt{1-m\sin^2(\theta)} d\theta +// +// Legendre's elliptic integrals can be expressed as symmetric elliptic integrals, in this case: +// +// F(\phi,m) = \sin\phi R_F(\cos^2\phi,1-m\sin^2\phi,1) +// +// The definition of F(phi,k) where k=sqrt(m) can be found in NIST Digital Library of Mathematical +// Functions (http://dlmf.nist.gov/19.2.E4). +func EllipticF(phi, m float64) float64 { + s, c := math.Sincos(phi) + return s * EllipticRF(c*c, 1-m*s*s, 1) +} + +// EllipticE computes the Legendre's elliptic integral of the 2nd kind E(phi,m), 0≤m<1: +// +// E(\phi,m) = \int_{0}^{\phi} \sqrt{1-m\sin^2(\theta)} d\theta +// +// Legendre's elliptic integrals can be expressed as symmetric elliptic integrals, in this case: +// +// E(\phi,m) = \sin\phi R_F(\cos^2\phi,1-m\sin^2\phi,1)-(m/3)\sin^3\phi R_D(\cos^2\phi,1-m\sin^2\phi,1) +// +// The definition of E(phi,k) where k=sqrt(m) can be found in NIST Digital Library of Mathematical +// Functions (http://dlmf.nist.gov/19.2.E5). +func EllipticE(phi, m float64) float64 { + s, c := math.Sincos(phi) + x, y := c*c, 1-m*s*s + return s * (EllipticRF(x, y, 1) - (m/3)*s*s*EllipticRD(x, y, 1)) +} diff --git a/vendor/gonum.org/v1/gonum/mathext/ell_complete.go b/vendor/gonum.org/v1/gonum/mathext/ell_complete.go new file mode 100644 index 0000000000..bdba081aad --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/ell_complete.go @@ -0,0 +1,355 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mathext + +import ( + "math" +) + +// CompleteK computes the complete elliptic integral of the 1st kind, 0≤m≤1. It returns math.NaN() if m is not in [0,1]. +// +// K(m) = \int_{0}^{π/2} 1/{\sqrt{1-m{\sin^2θ}}} dθ +func CompleteK(m float64) float64 { + // Reference: + // Toshio Fukushima, Precise and fast computation of complete elliptic integrals + // by piecewise minimax rational function approximation, + // Journal of Computational and Applied Mathematics, Volume 282, 2015, Pages 71-76. 
+ // https://doi.org/10.1016/j.cam.2014.12.038 + // Original Fortran code available at: + // https://www.researchgate.net/publication/295857819_xceitxt_F90_package_of_complete_elliptic_integral_computation + if m < 0 || 1 < m || math.IsNaN(m) { + return math.NaN() + } + + mc := 1 - m + + if mc > 0.592990 { + t := 2.45694208987494165*mc - 1.45694208987494165 + t2 := t * t + p := ((3703.75266375099019 + t2*(2744.82029097576810+t2*36.2381612593459565)) + t*(5462.47093231923466+t2*(543.839017382099411+t2*0.393188651542789784))) + q := ((2077.94377067058435 + t2*(1959.05960044399275+t2*43.5464368440078942)) + t*(3398.00069767755460+t2*(472.794455487539279+t2))) + return p / q + } + if mc > 0.350756 { + t := 4.12823963605439369*mc - 1.44800482178389491 + t2 := t * t + p := ((4264.28203103974630 + t2*(3214.59187442783167+t2*43.2589626155454993)) + t*(6341.90978213264024+t2*(642.790566685354573+t2*0.475223892294445943))) + q := ((2125.06914237062279 + t2*(2006.03187933518870+t2*44.1848041560412224)) + t*(3479.95663350926514+t2*(482.900172581418890+t2))) + return p / q + } + if mc > 0.206924 { + t := 6.95255575949719117*mc - 1.43865064797819679 + t2 := t * t + p := ((4870.25402224986382 + t2*(3738.29369283392307+t2*51.3609902253065926)) + t*(7307.18826377416591+t2*(754.928587580583704+t2*0.571948962277566451))) + q := ((2172.51745704102287 + t2*(2056.13612019430497+t2*44.9026847057686146)) + t*(3565.04737778032566+t2*(493.962405117599400+t2))) + return p / q + } + if mc > 0.121734 { + t := 11.7384669562155183*mc - 1.42897053644793990 + t2 := t * t + p := ((5514.8512729127464 + t2*(4313.60788246750934+t2*60.598720224393536)) + t*(8350.4595896779631+t2*(880.27903031894216+t2*0.68504458747933773))) + q := ((2218.41682813309737 + t2*(2107.97379949034285+t2*45.6911096775045314)) + t*(3650.41829123846319+t2*(505.74295207655096+t2))) + return p / q + } + if mc > 0.071412 { + t := 19.8720241643813839*mc - 1.41910098962680339 + t2 := t * t + p := ((6188.8743957372448 + t2*(4935.41351498551527+t2*70.981049144472361)) + t*(9459.3331440432847+t2*(1018.21910476032105+t2*0.81599895108245948))) + q := ((2260.73112539748448 + t2*(2159.68721749761492+t2*46.5298955058476510)) + t*(3732.66955095581621+t2*(517.86964191812384+t2))) + return p / q + } + if mc > 0.041770 { + t := 33.7359152553808785*mc - 1.40914918021725929 + t2 := t * t + p := ((6879.5170681289562 + t2*(5594.8381504799829+t2*82.452856129147838)) + t*(10615.0836403687221+t2*(1167.26108955935542+t2*0.96592719058503951))) + q := ((2296.88303450660439 + t2*(2208.74949754945558+t2*47.3844470709989137)) + t*(3807.37745652028212+t2*(529.79651353072921+t2))) + return p / q + } + if mc > 0.024360 { + t := 57.4382538770821367*mc - 1.39919586444572085 + t2 := t * t + p := ((7570.6827538712100 + t2*(6279.2661370014890+t2*94.886883830605940)) + t*(11792.9392624454532+t2*(1325.01058966228180+t2*1.13537029594409690))) + q := ((2324.04824540459984 + t2*(2252.22250562615338+t2*48.2089280211559345)) + t*(3869.56755306385732+t2*(540.85752251676412+t2))) + return p / q + } + if mc > 0.014165 { + t := 98.0872976949485042*mc - 1.38940657184894556 + t2 := t * t + p := ((8247.2601660137746 + t2*(6974.7495213178613+t2*108.098282908839979)) + t*(12967.7060124572914+t2*(1488.54008220335966+t2*1.32411616748380686))) + q := ((2340.47337508405427 + t2*(2287.70677154700516+t2*48.9575432570382154)) + t*(3915.63324533769906+t2*(550.45072377717361+t2))) + return p / q + } + if mc > 0.008213 { + t := 168.010752688172043*mc - 1.37987231182795699 + t2 := t * t + p := ((8894.2961573611293 + 
t2*(7666.5611739483371+t2*121.863474964652041)) + t*(14113.7038749808951+t2*(1654.60731579994159+t2*1.53112170837206117))) + q := ((2344.88618943372377 + t2*(2313.28396270968662+t2*49.5906602613891184)) + t*(3942.81065054556536+t2*(558.07615380622169+t2))) + return p / q + } + if mc > 0 { + t := 1.0 - 121.758188238159016*mc + p := -math.Log(mc*0.0625) * (34813.4518336350547 + t*(235.767716637974271+t*0.199792723884069485)) / (69483.5736412906324 + t*(614.265044703187382+t)) + q := -mc * (9382.53386835986099 + t*(51.6478985993381223+t*0.00410754154682816898)) / (37327.7262507318317 + t*(408.017247271148538+t)) + return p + q + } + + return math.Inf(1) +} + +// CompleteE computes the complete elliptic integral of the 2nd kind, 0≤m≤1. It returns math.NaN() if m is not in [0,1]. +// +// E(m) = \int_{0}^{π/2} {\sqrt{1-m{\sin^2θ}}} dθ +func CompleteE(m float64) float64 { + // Reference: + // Toshio Fukushima, Precise and fast computation of complete elliptic integrals + // by piecewise minimax rational function approximation, + // Journal of Computational and Applied Mathematics, Volume 282, 2015, Pages 71-76. + // https://doi.org/10.1016/j.cam.2014.12.038 + // Original Fortran code available at: + // https://www.researchgate.net/publication/295857819_xceitxt_F90_package_of_complete_elliptic_integral_computation + if m < 0 || 1 < m || math.IsNaN(m) { + return math.NaN() + } + + mc := 1 - m + + if mc > 0.566638 { + t := 2.30753965506897236*mc - 1.30753965506897236 + t2 := t * t + p := ((19702.2363352671642 + t2*(18177.1879313824040+t2*409.975559128654710)) + t*(31904.1559574281609+t2*(4362.94760768571862+t2*10.3244775335024885))) + q := ((14241.2135819448616 + t2*(10266.4884503526076+t2*117.162100771599098)) + t*(20909.9899599927367+t2*(1934.86289070792954+t2))) + return p / q + } + if mc > 0.315153 { + t := 3.97638030101198879*mc - 1.25316818100483130 + t2 := t * t + p := ((16317.0721393008221 + t2*(15129.4009798463159+t2*326.113727011739428)) + t*(26627.8852140835023+t2*(3574.15857605556033+t2*7.93163724081373477))) + q := ((13047.1505096551210 + t2*(9964.25173735060361+t2*117.670514069579649)) + t*(19753.5762165922376+t2*(1918.72232033637537+t2))) + return p / q + } + if mc > 0.171355 { + t := 6.95419964116329852*mc - 1.19163687951153702 + t2 := t * t + p := ((13577.3850240991520 + t2*(12871.9137872656293+t2*263.964361648520708)) + t*(22545.4744699553993+t2*(3000.74575264868572+t2*6.08522443139677663))) + q := ((11717.3306408059832 + t2*(9619.40382323874064+t2*118.690522739531267)) + t*(18431.1264424290258+t2*(1904.06010727307491+t2))) + return p / q + } + if mc > 0.090670 { + t := 12.3938774245522712*mc - 1.12375286608415443 + t2 := t * t + p := ((11307.9485341543712 + t2*(11208.6068472959372+t2*219.253495956962613)) + t*(19328.6173704569489+t2*(2596.54874477084334+t2*4.66931143174036616))) + q := ((10307.6837501971393 + t2*(9241.7604666150102+t2*120.498555754227847)) + t*(16982.2450249024383+t2*(1893.41905403040679+t2))) + return p / q + } + if mc > 0.046453 { + t := 22.6157360291290680*mc - 1.05056878576113260 + t2 := t * t + p := ((9383.1490856819874 + t2*(9977.2498973537718+t2*188.618148076418837)) + t*(16718.9730458676860+t2*(2323.49987246555537+t2*3.59313532204509922))) + q := ((8877.1964704758383 + t2*(8840.2771293410661+t2*123.422125687316355)) + t*(15450.0537230364062+t2*(1889.13672102820913+t2))) + return p / q + } + if mc > 0.022912 { + t := 42.4790790535661187*mc - 0.973280659275306911 + t2 := t * t + p := ((7719.1171817802054 + t2*(9045.3996063894006+t2*169.386557799782496)) + 
t*(14521.7363804934985+t2*(2149.92068078627829+t2*2.78515570453129137))) + q := ((7479.7539074698012 + t2*(8420.3848818926324+t2*127.802109608726363)) + t*(13874.4978011497847+t2*(1892.69753150329759+t2))) + return p / q + } + if mc > 0.010809 { + t := 82.6241427745187144*mc - 0.893084359249772784 + t2 := t * t + p := ((6261.6095608987273 + t2*(8304.3265605809870+t2*159.371262600702237)) + t*(12593.0874916293982+t2*(2048.68391263416822+t2*2.18867046462858104))) + q := ((6156.4532048239501 + t2*(7979.7435857665227+t2*133.911640385965187)) + t*(12283.8373999680518+t2*(1903.60556312663537+t2))) + return p / q + } + if mc > 0.004841 { + t := 167.560321715817694*mc - 0.811159517426273458 + t2 := t * t + p := ((4978.06146583586728 + t2*(7664.6703673290453+t2*156.689647694892782)) + t*(10831.7178150656694+t2*(1995.66437151562090+t2*1.75859085945198570))) + q := ((4935.56743322938333 + t2*(7506.8028283118051+t2*141.854303920116856)) + t*(10694.5510113880077+t2*(1918.38517009740321+t2))) + return p / q + } + if mc > 0 { + t := 1.0 - 206.568890725056806*mc + p := -mc * math.Log(mc*0.0625) * (41566.6612602868736 + t*(154.034981522913482+t*0.0618072471798575991)) / (165964.442527585615 + t*(917.589668642251803+t)) + q := (132232.803956682877 + t*(353.375480007017643-t*1.40105837312528026)) / (132393.665743088043 + t*(192.112635228732532-t)) + return p + q + } + + return 1 +} + +// CompleteB computes an associate complete elliptic integral of the 2nd kind, 0≤m≤1. It returns math.NaN() if m is not in [0,1]. +// +// B(m) = \int_{0}^{π/2} {\cos^2θ} / {\sqrt{1-m{\sin^2θ}}} dθ +func CompleteB(m float64) float64 { + // Reference: + // Toshio Fukushima, Precise and fast computation of complete elliptic integrals + // by piecewise minimax rational function approximation, + // Journal of Computational and Applied Mathematics, Volume 282, 2015, Pages 71-76. 
+ // https://doi.org/10.1016/j.cam.2014.12.038 + // Original Fortran code available at: + // https://www.researchgate.net/publication/295857819_xceitxt_F90_package_of_complete_elliptic_integral_computation + if m < 0 || 1 < m || math.IsNaN(m) { + return math.NaN() + } + + mc := 1 - m + + if mc > 0.555073 { + t := 2.24755971204264969*mc - 1.24755971204264969 + t2 := t * t + p := ((2030.25011505956379 + t2*(1727.60635612511943+t2*25.0715510300422010)) + t*(3223.16236100954529+t2*(361.164121995173076+t2*0.280355207707726826))) + q := ((2420.64907902774675 + t2*(2327.48464880306840+t2*47.9870997057202318)) + t*(4034.28168313496638+t2*(549.234220839203960+t2))) + return p / q + } + if mc > 0.302367 { + t := 3.95716761770595079*mc - 1.19651690106289522 + t2 := t * t + p := ((2209.26925068374373 + t2*(1981.37862223307242+t2*29.7612810087709299)) + t*(3606.58475322372526+t2*(422.693774742063054+t2*0.334623999861181980))) + q := ((2499.57898767250755 + t2*(2467.63998386656941+t2*50.0198090806651216)) + t*(4236.30953048456334+t2*(581.879599221457589+t2))) + return p / q + } + if mc > 0.161052 { + t := 7.07638962601280827*mc - 1.13966670204861480 + t2 := t * t + p := ((2359.14823394150129 + t2*(2254.30785457761760+t2*35.2259786264917876)) + t*(3983.28520266051676+t2*(492.601686517364701+t2*0.396605124984359783))) + q := ((2563.95563932625156 + t2*(2633.23323959119935+t2*52.6711647124832948)) + t*(4450.19076667898892+t2*(622.983787815718489+t2))) + return p / q + } + if mc > 0.083522 { + t := 12.8982329420869341*mc - 1.07728621178898491 + t2 := t * t + p := ((2464.65334987833736 + t2*(2541.68516994216007+t2*41.5832527504007778)) + t*(4333.38639187691528+t2*(571.53606797524881+t2*0.465975784547025267))) + q := ((2600.66956117247726 + t2*(2823.69445052534842+t2*56.136001230010910)) + t*(4661.64381841490914+t2*(674.25435972414302+t2))) + return p / q + } + if mc > 0.041966 { + t := 24.0639137549331023*mc - 1.00986620463952257 + t2 := t * t + p := ((2509.86724450741259 + t2*(2835.27071287535469+t2*48.9701196718008345)) + t*(4631.12336462339975+t2*(659.86172161727281+t2*0.54158304771955794))) + q := ((2594.15983397593723 + t2*(3034.20118545214106+t2*60.652838995496991)) + t*(4848.17491604384532+t2*(737.15143838356850+t2))) + return p / q + } + if mc > 0.020313 { + t := 46.1829769546944996*mc - 0.938114810880709371 + t2 := t * t + p := ((2480.58307884128017 + t2*(3122.00900554841322+t2*57.541132641218839)) + t*(4845.57861173250699+t2*(757.31633816400643+t2*0.62119950515996627))) + q := ((2528.85218300581396 + t2*(3253.86151324157460+t2*66.496093157522450)) + t*(4979.31783250484768+t2*(812.40556572486862+t2))) + return p / q + } + if mc > 0.009408 { + t := 91.7010545621274645*mc - 0.862723521320495186 + t2 := t * t + p := ((2365.25385348859592 + t2*(3381.09304915246175+t2*67.442026950538221)) + t*(4939.53925884558687+t2*(862.16657576129841+t2*0.70143698925710129))) + q := ((2390.48737882063755 + t2*(3462.34808443022907+t2*73.934680452209164)) + t*(5015.4675579215077+t2*(898.99542983710459+t2))) + return p / q + } + if mc > 0.004136 { + t := 189.681335356600910*mc - 0.784522003034901366 + t2 := t * t + p := ((2160.82916040868119 + t2*(3584.53058926175721+t2*78.769178005879162)) + t*(4877.14832623847052+t2*(970.53716686804832+t2*0.77797110431753920))) + q := ((2172.70451405048305 + t2*(3630.52345460629336+t2*83.173163222639080)) + t*(4916.35263668839769+t2*(993.36676027886685+t2))) + return p / q + } + if mc > 0 { + t := 1 - 106.292517006802721*mc + p := mc * math.Log(mc*0.0625) * (6607.46457640413908 + 
t*(19.0287633783211078-t*0.00625368946932704460)) / (26150.3443630974309 + t*(354.603981274536040+t)) + q := (26251.5678902584870 + t*(168.788023807915689+t*0.352150236262724288)) / (26065.7912239203873 + t*(353.916840382280456+t)) + return p + q + } + + return 1 +} + +// CompleteD computes an associate complete elliptic integral of the 2nd kind, 0≤m≤1. It returns math.NaN() if m is not in [0,1]. +// +// D(m) = \int_{0}^{π/2} {\sin^2θ} / {\sqrt{1-m{\sin^2θ}}} dθ +func CompleteD(m float64) float64 { + // Reference: + // Toshio Fukushima, Precise and fast computation of complete elliptic integrals + // by piecewise minimax rational function approximation, + // Journal of Computational and Applied Mathematics, Volume 282, 2015, Pages 71-76. + // https://doi.org/10.1016/j.cam.2014.12.038 + // Original Fortran code available at: + // https://www.researchgate.net/publication/295857819_xceitxt_F90_package_of_complete_elliptic_integral_computation + if m < 0 || 1 < m || math.IsNaN(m) { + return math.NaN() + } + + mc := 1 - m + + if mc > 0.599909 { + t := 2.49943137936119533*mc - 1.49943137936119533 + t2 := t * t + p := ((1593.39813781813498 + t2*(1058.56241259843217+t2*11.7584241242587571)) + t*(2233.25576544961714+t2*(195.247394601357872+t2*0.101486443490307517))) + q := ((1685.47865546030468 + t2*(1604.88100543517015+t2*38.6743012128666717)) + t*(2756.20968383181114+t2*(397.504162950935944+t2))) + return p / q + } + if mc > 0.359180 { + t := 4.15404874360795750*mc - 1.49205122772910617 + t2 := t * t + p := ((1967.01442513777287 + t2*(1329.30058268219177+t2*15.0447805948342760)) + t*(2779.87604145516343+t2*(247.475085945854673+t2*0.130547566005491628))) + q := ((1749.70634057327467 + t2*(1654.40804288486242+t2*39.1895256017535337)) + t*(2853.92630369567765+t2*(406.925098588378587+t2))) + return p / q + } + if mc > 0.214574 { + t := 6.91534237860116454*mc - 1.48385267554596628 + t2 := t * t + p := ((2409.64196912091452 + t2*(1659.30176823041376+t2*19.1942111405094383)) + t*(3436.40744503228691+t2*(312.186468430688790+t2*0.167847673021897479))) + q := ((1824.89205701262525 + t2*(1715.38574780156913+t2*39.8798253173462218)) + t*(2971.02216287936566+t2*(418.929791715319490+t2))) + return p / q + } + if mc > 0.127875 { + t := 11.5341584101316047*mc - 1.47493050669557896 + t2 := t * t + p := ((2926.81143179637839 + t2*(2056.45624281065334+t2*24.3811986813439843)) + t*(4214.52119721241319+t2*(391.420514384925370+t2*0.215574280659075512))) + q := ((1910.33091918583314 + t2*(1787.99942542734799+t2*40.7663012893484449)) + t*(3107.04531802441481+t2*(433.673494280825971+t2))) + return p / q + } + if mc > 0.076007 { + t := 19.2797100331611013*mc - 1.46539292049047582 + t2 := t * t + p := ((3520.63614251102960 + t2*(2526.67111759550923+t2*30.7739877519417978)) + t*(5121.2842239226937+t2*(486.926821696342529+t2*0.276315678908126399))) + q := ((2003.81997889501324 + t2*(1871.05914195570669+t2*41.8489850490387023)) + t*(3259.09205279874214+t2*(451.007555352632053+t2))) + return p / q + } + if mc > 0.045052 { + t := 32.3049588111775157*mc - 1.45540300436116944 + t2 := t * t + p := ((4188.00087087025347 + t2*(3072.05695847158556+t2*38.5070211470790031)) + t*(6156.0080960857764+t2*(599.76666155374012+t2*0.352955925261363680))) + q := ((2101.60113938424690 + t2*(1961.76794074710108+t2*43.0997999502743622)) + t*(3421.55151253792527+t2*(470.407158843118117+t2))) + return p / q + } + if mc > 0.026626 { + t := 54.2711386084880061*mc - 1.44502333658960165 + t2 := t * t + p := ((4916.74442376570733 + 
t2*(3688.12811638360551+t2*47.6447145147811350)) + t*(7304.6632479558695+t2*(729.75841970840314+t2*0.448422756936257635))) + q := ((2197.49982676612397 + t2*(2055.19657857622715+t2*44.4576261146308645)) + t*(3584.94502590860852+t2*(490.880160668822953+t2))) + return p / q + } + if mc > 0.015689 { + t := 91.4327512114839536*mc - 1.43448843375697175 + t2 := t * t + p := ((5688.7542903989517 + t2*(4364.21513060078954+t2*58.159468141567195)) + t*(8542.6096475195826+t2*(875.35992968472914+t2*0.56528145509695951))) + q := ((2285.44062680812883 + t2*(2145.80779422696555+t2*45.8427480379028781)) + t*(3739.30422133833258+t2*(511.23253971875808+t2))) + return p / q + } + if mc > 0.009216 { + t := 154.487872701992894*mc - 1.42376023482156651 + t2 := t * t + p := ((6475.3392225234969 + t2*(5081.2997108708577+t2*69.910123337464043)) + t*(9829.1138694605662+t2*(1033.32687775311981+t2*0.70526087421186325))) + q := ((2357.74885505777295 + t2*(2226.89527217032394+t2*47.1609071069631012)) + t*(3872.32565152553360+t2*(530.03943432061149+t2))) + return p / q + } + if mc > 0 { + t := 1 - 108.506944444444444*mc + p := -math.Log(mc*0.0625) * (6.2904323649908115e6 + t*(58565.284164780476+t*(131.176674599188545+t*0.0426826410911220304))) / (1.24937550257219890e7 + t*(203580.534005225410+t*(921.17729845011868+t))) + q := -(27356.1090344387530 + t*(107.767403612304371-t*0.0827769227048233593)) / (27104.0854889805978 + t*(358.708172147752755+t)) + return p + q + } + + return math.Inf(1) +} diff --git a/vendor/gonum.org/v1/gonum/mathext/erf.go b/vendor/gonum.org/v1/gonum/mathext/erf.go new file mode 100644 index 0000000000..793238b03a --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/erf.go @@ -0,0 +1,91 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mathext + +import "math" + +/* +Copyright (c) 2012 The Probab Authors. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. +* Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +// NormalQuantile computes the quantile function (inverse CDF) of the standard +// normal. 
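+// For reference, NormalQuantile(0.975) ≈ 1.96, the usual two-sided 95%
+// critical value.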
NormalQuantile panics if the input p is less than 0 or greater than 1. +func NormalQuantile(p float64) float64 { + switch { + case p < 0 || 1 < p: + panic("mathext: quantile out of bounds") + case p == 1: + return math.Inf(1) + case p == 0: + return math.Inf(-1) + } + // Compute rational approximation based on the value of p. + + dp := p - 0.5 + if math.Abs(dp) <= 0.425 { + z := 0.180625 - dp*dp + z1 := ((((((zQSA[0]*z+zQSA[1])*z+zQSA[2])*z+zQSA[3])*z+zQSA[4])*z+zQSA[5])*z+zQSA[6])*z + zQSA[7] + z2 := ((((((zQSB[0]*z+zQSB[1])*z+zQSB[2])*z+zQSB[3])*z+zQSB[4])*z+zQSB[5])*z+zQSB[6])*z + zQSB[7] + return dp * z1 / z2 + } + + if p < 0.5 { + r := math.Sqrt(-math.Log(p)) + if r <= 5.0 { + z := r - 1.6 + z1 := ((((((zQIA[0]*z+zQIA[1])*z+zQIA[2])*z+zQIA[3])*z+zQIA[4])*z+zQIA[5])*z+zQIA[6])*z + zQIA[7] + z2 := ((((((zQIB[0]*z+zQIB[1])*z+zQIB[2])*z+zQIB[3])*z+zQIB[4])*z+zQIB[5])*z+zQIB[6])*z + zQIB[7] + return -z1 / z2 + } + z := r - 5 + z1 := ((((((zQTA[0]*z+zQTA[1])*z+zQTA[2])*z+zQTA[3])*z+zQTA[4])*z+zQTA[5])*z+zQTA[6])*z + zQTA[7] + z2 := ((((((zQTB[0]*z+zQTB[1])*z+zQTB[2])*z+zQTB[3])*z+zQTB[4])*z+zQTB[5])*z+zQTB[6])*z + zQTB[7] + return -z1 / z2 + } + r := math.Sqrt(-math.Log(1 - p)) + if r <= 5.0 { + z := r - 1.6 + z1 := ((((((zQIA[0]*z+zQIA[1])*z+zQIA[2])*z+zQIA[3])*z+zQIA[4])*z+zQIA[5])*z+zQIA[6])*z + zQIA[7] + z2 := ((((((zQIB[0]*z+zQIB[1])*z+zQIB[2])*z+zQIB[3])*z+zQIB[4])*z+zQIB[5])*z+zQIB[6])*z + zQIB[7] + return z1 / z2 + } + + z := r - 5 + z1 := ((((((zQTA[0]*z+zQTA[1])*z+zQTA[2])*z+zQTA[3])*z+zQTA[4])*z+zQTA[5])*z+zQTA[6])*z + zQTA[7] + z2 := ((((((zQTB[0]*z+zQTB[1])*z+zQTB[2])*z+zQTB[3])*z+zQTB[4])*z+zQTB[5])*z+zQTB[6])*z + zQTB[7] + return z1 / z2 +} + +var ( + zQSA = [...]float64{2509.0809287301226727, 33430.575583588128105, 67265.770927008700853, 45921.953931549871457, 13731.693765509461125, 1971.5909503065514427, 133.14166789178437745, 3.387132872796366608} + zQSB = [...]float64{5226.495278852854561, 28729.085735721942674, 39307.89580009271061, 21213.794301586595867, 5394.1960214247511077, 687.1870074920579083, 42.313330701600911252, 1.0} + zQIA = [...]float64{7.7454501427834140764e-4, 0.0227238449892691845833, 0.24178072517745061177, 1.27045825245236838258, 3.64784832476320460504, 5.7694972214606914055, 4.6303378461565452959, 1.42343711074968357734} + zQIB = [...]float64{1.05075007164441684324e-9, 5.475938084995344946e-4, 0.0151986665636164571966, 0.14810397642748007459, 0.68976733498510000455, 1.6763848301838038494, 2.05319162663775882187, 1.0} + zQTA = [...]float64{2.01033439929228813265e-7, 2.71155556874348757815e-5, 0.0012426609473880784386, 0.026532189526576123093, 0.29656057182850489123, 1.7848265399172913358, 5.4637849111641143699, 6.6579046435011037772} + zQTB = [...]float64{2.04426310338993978564e-15, 1.4215117583164458887e-7, 1.8463183175100546818e-5, 7.868691311456132591e-4, 0.0148753612908506148525, 0.13692988092273580531, 0.59983220655588793769, 1.0} +) diff --git a/vendor/gonum.org/v1/gonum/mathext/gamma_inc.go b/vendor/gonum.org/v1/gonum/mathext/gamma_inc.go new file mode 100644 index 0000000000..c4abe2c2d1 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/gamma_inc.go @@ -0,0 +1,58 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mathext + +import ( + "gonum.org/v1/gonum/mathext/internal/cephes" +) + +// GammaIncReg computes the regularized incomplete Gamma integral. 
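+// For a = 1 it reduces to the exponential CDF: GammaIncReg(1, x) = 1 - exp(-x).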
+// +// GammaIncReg(a,x) = (1/ Γ(a)) \int_0^x e^{-t} t^{a-1} dt +// +// The input argument a must be positive and x must be non-negative or GammaIncReg +// will panic. +// +// See http://mathworld.wolfram.com/IncompleteGammaFunction.html +// or https://en.wikipedia.org/wiki/Incomplete_gamma_function for more detailed +// information. +func GammaIncReg(a, x float64) float64 { + return cephes.Igam(a, x) +} + +// GammaIncRegComp computes the complemented regularized incomplete Gamma integral. +// +// GammaIncRegComp(a,x) = 1 - GammaIncReg(a,x) +// = (1/ Γ(a)) \int_x^\infty e^{-t} t^{a-1} dt +// +// The input argument a must be positive and x must be non-negative or +// GammaIncRegComp will panic. +func GammaIncRegComp(a, x float64) float64 { + return cephes.IgamC(a, x) +} + +// GammaIncRegInv computes the inverse of the regularized incomplete Gamma integral. That is, +// it returns the x such that: +// +// GammaIncReg(a, x) = y +// +// The input argument a must be positive and y must be between 0 and 1 +// inclusive or GammaIncRegInv will panic. GammaIncRegInv should return a positive +// number, but can return NaN if there is a failure to converge. +func GammaIncRegInv(a, y float64) float64 { + return gammaIncRegInv(a, y) +} + +// GammaIncRegCompInv computes the inverse of the complemented regularized incomplete Gamma +// integral. That is, it returns the x such that: +// +// GammaIncRegComp(a, x) = y +// +// The input argument a must be positive and y must be between 0 and 1 +// inclusive or GammaIncRegCompInv will panic. GammaIncRegCompInv should return a +// positive number, but can return 0 even with non-zero y due to underflow. +func GammaIncRegCompInv(a, y float64) float64 { + return cephes.IgamI(a, y) +} diff --git a/vendor/gonum.org/v1/gonum/mathext/gamma_inc_inv.go b/vendor/gonum.org/v1/gonum/mathext/gamma_inc_inv.go new file mode 100644 index 0000000000..175cb6bc93 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/gamma_inc_inv.go @@ -0,0 +1,58 @@ +// Derived from SciPy's special/c_misc/gammaincinv.c +// https://github.com/scipy/scipy/blob/master/scipy/special/c_misc/gammaincinv.c + +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mathext + +import ( + "math" + + "gonum.org/v1/gonum/mathext/internal/cephes" +) + +const ( + allowedATol = 1e-306 + allowedRTol = 1e-6 +) + +func gammaIncReg(x float64, params []float64) float64 { + return cephes.Igam(params[0], x) - params[1] +} + +// gammaIncRegInv is the inverse of the regularized incomplete Gamma integral. That is, it +// returns x such that: +// +// Igam(a, x) = y +// +// The input argument a must be positive and y must be between 0 and 1 +// inclusive or gammaIncRegInv will panic. gammaIncRegInv should return a +// positive number, but can return NaN if there is a failure to converge. +func gammaIncRegInv(a, y float64) float64 { + // For y not small, we just use + // IgamI(a, 1-y) + // (inverse of the complemented incomplete Gamma integral). For y small, + // however, 1-y is about 1, and we lose digits. + if a <= 0 || y <= 0 || y >= 0.25 { + return cephes.IgamI(a, 1-y) + } + + lo := 0.0 + flo := -y + hi := cephes.IgamI(a, 0.75) + fhi := 0.25 - y + + params := []float64{a, y} + + // Also, after we generate a small interval by bisection above, false + // position will do a large step from an interval of width ~1e-4 to ~1e-14 + // in one step (a=10, x=0.05, but similar for other values). 
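+	// falsePosition searches [lo, hi] for a root of Igam(a, x)-y; if it stops
+	// on the iteration limit with an error estimate still above the allowed
+	// tolerance, the result is discarded as NaN below.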
+	result, bestX, _, errEst := falsePosition(lo, hi, flo, fhi, 2*machEp, 2*machEp, 1e-2*a, gammaIncReg, params)
+	if result == fSolveMaxIterations && errEst > allowedATol+allowedRTol*math.Abs(bestX) {
+		bestX = math.NaN()
+	}
+
+	return bestX
+}
diff --git a/vendor/gonum.org/v1/gonum/mathext/internal/amos/amos.go b/vendor/gonum.org/v1/gonum/mathext/internal/amos/amos.go
new file mode 100644
index 0000000000..6ea2d4668c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mathext/internal/amos/amos.go
@@ -0,0 +1,2136 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package amos
+
+import (
+	"math"
+	"math/cmplx"
+)
+
+/*
+The AMOS functions are included in SLATEC, and the SLATEC guide (http://www.netlib.org/slatec/guide) explicitly states:
+"The Library is in the public domain and distributed by the Energy
+Science and Technology Software Center."
+Mention of AMOS's inclusion in SLATEC goes back at least to this 1985 technical report from Sandia National Labs: http://infoserve.sandia.gov/sand_doc/1985/851018.pdf
+*/
+
+// math.NaN() are for padding to keep indexing easy.
+var imach = []int{-0, 5, 6, 0, 0, 32, 4, 2, 31, 2147483647, 2, 24, -125, 127, 53, -1021, 1023}
+
+// dmach[2] is the largest finite float64 (D1MACH(2)); note the positive exponent.
+var dmach = []float64{math.NaN(), 2.23e-308, 1.79e+308, 1.11e-16, 2.22e-16, 0.30103000998497009}
+
+func abs(a int) int {
+	if a >= 0 {
+		return a
+	}
+	return -a
+}
+
+func Zairy(ZR, ZI float64, ID, KODE int) (AIR, AII float64, NZ, IERR int) {
+	// zairy is adapted from the original Netlib code by Donald Amos.
+	// http://www.netlib.no/netlib/amos/zairy.f
+
+	// Original comment:
+	/*
+		C***BEGIN PROLOGUE ZAIRY
+		C***DATE WRITTEN 830501 (YYMMDD)
+		C***REVISION DATE 890801 (YYMMDD)
+		C***CATEGORY NO. B5K
+		C***KEYWORDS AIRY FUNCTION,BESSEL FUNCTIONS OF ORDER ONE THIRD
+		C***AUTHOR AMOS, DONALD E., SANDIA NATIONAL LABORATORIES
+		C***PURPOSE TO COMPUTE AIRY FUNCTIONS AI(Z) AND DAI(Z) FOR COMPLEX Z
+		C***DESCRIPTION
+		C
+		C ***A DOUBLE PRECISION ROUTINE***
+		C ON KODE=1, ZAIRY COMPUTES THE COMPLEX AIRY FUNCTION AI(Z) OR
+		C ITS DERIVATIVE DAI(Z)/DZ ON ID=0 OR ID=1 RESPECTIVELY. ON
+		C KODE=2, A SCALING OPTION CEXP(ZTA)*AI(Z) OR CEXP(ZTA)*
+		C DAI(Z)/DZ IS PROVIDED TO REMOVE THE EXPONENTIAL DECAY IN
+		C -PI/31.0 FROM THE K BESSEL
+		C FUNCTIONS BY
+		C
+		C AI(Z)=C*SQRT(Z)*K(1/3,ZTA) , DAI(Z)=-C*Z*K(2/3,ZTA)
+		C C=1.0/(PI*SQRT(3.0))
+		C ZTA=(2/3)*Z**(3/2)
+		C
+		C WITH THE POWER SERIES FOR CABS(Z)<=1.0.
+		C
+		C IN MOST COMPLEX VARIABLE COMPUTATION, ONE MUST EVALUATE ELE-
+		C MENTARY FUNCTIONS. WHEN THE MAGNITUDE OF Z IS LARGE, LOSSES
+		C OF SIGNIFICANCE BY ARGUMENT REDUCTION OCCUR. CONSEQUENTLY, IF
+		C THE MAGNITUDE OF ZETA=(2/3)*Z**1.5 EXCEEDS U1=SQRT(0.5/UR),
+		C THEN LOSSES EXCEEDING HALF PRECISION ARE LIKELY AND AN ERROR
+		C FLAG IERR=3 IS TRIGGERED WHERE UR=math.Max(dmach[4),1.0D-18) IS
+		C DOUBLE PRECISION UNIT ROUNDOFF LIMITED TO 18 DIGITS PRECISION.
+		C ALSO, if THE MAGNITUDE OF ZETA IS LARGER THAN U2=0.5/UR, THEN
+		C ALL SIGNIFICANCE IS LOST AND IERR=4. IN ORDER TO USE THE INT
+		C FUNCTION, ZETA MUST BE FURTHER RESTRICTED NOT TO EXCEED THE
+		C LARGEST INTEGER, U3=I1MACH(9). THUS, THE MAGNITUDE OF ZETA
+		C MUST BE RESTRICTED BY MIN(U2,U3). ON 32 BIT MACHINES, U1,U2,
+		C AND U3 ARE APPROXIMATELY 2.0E+3, 4.2E+6, 2.1E+9 IN SINGLE
+		C PRECISION ARITHMETIC AND 1.3E+8, 1.8E+16, 2.1E+9 IN DOUBLE
+		C PRECISION ARITHMETIC RESPECTIVELY. THIS MAKES U2 AND U3 LIMIT-
+		C ING IN THEIR RESPECTIVE ARITHMETICS.
THIS MEANS THAT THE MAG- + C NITUDE OF Z CANNOT EXCEED 3.1E+4 IN SINGLE AND 2.1E+6 IN + C DOUBLE PRECISION ARITHMETIC. THIS ALSO MEANS THAT ONE CAN + C EXPECT TO RETAIN, IN THE WORST CASES ON 32 BIT MACHINES, + C NO DIGITS IN SINGLE PRECISION AND ONLY 7 DIGITS IN DOUBLE + C PRECISION ARITHMETIC. SIMILAR CONSIDERATIONS HOLD FOR OTHER + C MACHINES. + C + C THE APPROXIMATE RELATIVE ERROR IN THE MAGNITUDE OF A COMPLEX + C BESSEL FUNCTION CAN BE EXPRESSED BY P*10**S WHERE P=MAX(UNIT + C ROUNDOFF,1.0E-18) IS THE NOMINAL PRECISION AND 10**S REPRE- + C SENTS THE INCREASE IN ERROR DUE TO ARGUMENT REDUCTION IN THE + C ELEMENTARY FUNCTIONS. HERE, S=MAX(1,ABS(LOG10(CABS(Z))), + C ABS(LOG10(FNU))) APPROXIMATELY (I.E. S=MAX(1,ABS(EXPONENT OF + C CABS(Z),ABS(EXPONENT OF FNU)) ). HOWEVER, THE PHASE ANGLE MAY + C HAVE ONLY ABSOLUTE ACCURACY. THIS IS MOST LIKELY TO OCCUR WHEN + C ONE COMPONENT (IN ABSOLUTE VALUE) IS LARGER THAN THE OTHER BY + C SEVERAL ORDERS OF MAGNITUDE. if ONE COMPONENT IS 10**K LARGER + C THAN THE OTHER, THEN ONE CAN EXPECT ONLY MAX(ABS(LOG10(P))-K, + C 0) SIGNIFICANT DIGITS; OR, STATED ANOTHER WAY, WHEN K EXCEEDS + C THE EXPONENT OF P, NO SIGNIFICANT DIGITS REMAIN IN THE SMALLER + C COMPONENT. HOWEVER, THE PHASE ANGLE RETAINS ABSOLUTE ACCURACY + C BECAUSE, IN COMPLEX ARITHMETIC WITH PRECISION P, THE SMALLER + C COMPONENT WILL NOT (AS A RULE) DECREASE BELOW P TIMES THE + C MAGNITUDE OF THE LARGER COMPONENT. IN THESE EXTREME CASES, + C THE PRINCIPAL PHASE ANGLE IS ON THE ORDER OF +P, -P, PI/2-P, + C OR -PI/2+P. + C + C***REFERENCES HANDBOOK OF MATHEMATICAL FUNCTIONS BY M. ABRAMOWITZ + C AND I. A. STEGUN, NBS AMS SERIES 55, U.S. DEPT. OF + C COMMERCE, 1955. + C + C COMPUTATION OF BESSEL FUNCTIONS OF COMPLEX ARGUMENT + C AND LARGE ORDER BY D. E. AMOS, SAND83-0643, MAY, 1983 + C + C A SUBROUTINE PACKAGE FOR BESSEL FUNCTIONS OF A COMPLEX + C ARGUMENT AND NONNEGATIVE ORDER BY D. E. AMOS, SAND85- + C 1018, MAY, 1985 + C + C A PORTABLE PACKAGE FOR BESSEL FUNCTIONS OF A COMPLEX + C ARGUMENT AND NONNEGATIVE ORDER BY D. E. AMOS, TRANS. + C MATH. SOFTWARE, 1986 + */ + var AI, CONE, CSQ, CY, S1, S2, TRM1, TRM2, Z, ZTA, Z3 complex128 + var AA, AD, AK, ALIM, ATRM, AZ, AZ3, BK, + CC, CK, COEF, CONEI, CONER, CSQI, CSQR, C1, C2, DIG, + DK, D1, D2, ELIM, FID, FNU, PTR, RL, R1M5, SFAC, STI, STR, + S1I, S1R, S2I, S2R, TOL, TRM1I, TRM1R, TRM2I, TRM2R, TTH, ZEROI, + ZEROR, ZTAI, ZTAR, Z3I, Z3R, ALAZ, BB float64 + var IFLAG, K, K1, K2, MR, NN int + var tmp complex128 + + // Extra element for padding. + CYR := []float64{math.NaN(), 0} + CYI := []float64{math.NaN(), 0} + + _ = AI + _ = CONE + _ = CSQ + _ = CY + _ = S1 + _ = S2 + _ = TRM1 + _ = TRM2 + _ = Z + _ = ZTA + _ = Z3 + + TTH = 6.66666666666666667e-01 + C1 = 3.55028053887817240e-01 + C2 = 2.58819403792806799e-01 + COEF = 1.83776298473930683e-01 + ZEROR = 0 + ZEROI = 0 + CONER = 1 + CONEI = 0 + + NZ = 0 + if ID < 0 || ID > 1 { + IERR = 1 + } + if KODE < 1 || KODE > 2 { + IERR = 1 + } + if IERR != 0 { + return + } + AZ = cmplx.Abs(complex(ZR, ZI)) + TOL = math.Max(dmach[4], 1.0e-18) + FID = float64(ID) + if AZ > 1.0e0 { + goto Seventy + } + + // POWER SERIES FOR CABS(Z)<=1. 
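+	// S1 and S2 accumulate the two power series term by term; the loop stops
+	// once the next term drops below TOL times the running denominator.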
+ S1R = CONER + S1I = CONEI + S2R = CONER + S2I = CONEI + if AZ < TOL { + goto OneSeventy + } + AA = AZ * AZ + if AA < TOL/AZ { + goto Forty + } + TRM1R = CONER + TRM1I = CONEI + TRM2R = CONER + TRM2I = CONEI + ATRM = 1.0e0 + STR = ZR*ZR - ZI*ZI + STI = ZR*ZI + ZI*ZR + Z3R = STR*ZR - STI*ZI + Z3I = STR*ZI + STI*ZR + AZ3 = AZ * AA + AK = 2.0e0 + FID + BK = 3.0e0 - FID - FID + CK = 4.0e0 - FID + DK = 3.0e0 + FID + FID + D1 = AK * DK + D2 = BK * CK + AD = math.Min(D1, D2) + AK = 24.0e0 + 9.0e0*FID + BK = 30.0e0 - 9.0e0*FID + for K = 1; K <= 25; K++ { + STR = (TRM1R*Z3R - TRM1I*Z3I) / D1 + TRM1I = (TRM1R*Z3I + TRM1I*Z3R) / D1 + TRM1R = STR + S1R = S1R + TRM1R + S1I = S1I + TRM1I + STR = (TRM2R*Z3R - TRM2I*Z3I) / D2 + TRM2I = (TRM2R*Z3I + TRM2I*Z3R) / D2 + TRM2R = STR + S2R = S2R + TRM2R + S2I = S2I + TRM2I + ATRM = ATRM * AZ3 / AD + D1 = D1 + AK + D2 = D2 + BK + AD = math.Min(D1, D2) + if ATRM < TOL*AD { + goto Forty + } + AK = AK + 18.0e0 + BK = BK + 18.0e0 + } +Forty: + if ID == 1 { + goto Fifty + } + AIR = S1R*C1 - C2*(ZR*S2R-ZI*S2I) + AII = S1I*C1 - C2*(ZR*S2I+ZI*S2R) + if KODE == 1 { + return + } + tmp = cmplx.Sqrt(complex(ZR, ZI)) + STR = real(tmp) + STI = imag(tmp) + ZTAR = TTH * (ZR*STR - ZI*STI) + ZTAI = TTH * (ZR*STI + ZI*STR) + tmp = cmplx.Exp(complex(ZTAR, ZTAI)) + STR = real(tmp) + STI = imag(tmp) + PTR = AIR*STR - AII*STI + AII = AIR*STI + AII*STR + AIR = PTR + return + +Fifty: + AIR = -S2R * C2 + AII = -S2I * C2 + if AZ <= TOL { + goto Sixty + } + STR = ZR*S1R - ZI*S1I + STI = ZR*S1I + ZI*S1R + CC = C1 / (1.0e0 + FID) + AIR = AIR + CC*(STR*ZR-STI*ZI) + AII = AII + CC*(STR*ZI+STI*ZR) + +Sixty: + if KODE == 1 { + return + } + tmp = cmplx.Sqrt(complex(ZR, ZI)) + STR = real(tmp) + STI = imag(tmp) + ZTAR = TTH * (ZR*STR - ZI*STI) + ZTAI = TTH * (ZR*STI + ZI*STR) + tmp = cmplx.Exp(complex(ZTAR, ZTAI)) + STR = real(tmp) + STI = imag(tmp) + PTR = STR*AIR - STI*AII + AII = STR*AII + STI*AIR + AIR = PTR + return + + // CASE FOR CABS(Z)>1.0. +Seventy: + FNU = (1.0e0 + FID) / 3.0e0 + + /* + SET PARAMETERS RELATED TO MACHINE CONSTANTS. + TOL IS THE APPROXIMATE UNIT ROUNDOFF LIMITED TO 1.0D-18. + ELIM IS THE APPROXIMATE EXPONENTIAL OVER-&&UNDERFLOW LIMIT. + EXP(-ELIM)EXP(ALIM)=EXP(ELIM)*TOL ARE INTERVALS NEAR + UNDERFLOW&&OVERFLOW LIMITS WHERE SCALED ARITHMETIC IS DONE. + RL IS THE LOWER BOUNDARY OF THE ASYMPTOTIC EXPANSION FOR LA>=Z. + DIG = NUMBER OF BASE 10 DIGITS IN TOL = 10**(-DIG). + */ + K1 = imach[15] + K2 = imach[16] + R1M5 = dmach[5] + + K = min(abs(K1), abs(K2)) + ELIM = 2.303e0 * (float64(K)*R1M5 - 3.0e0) + K1 = imach[14] - 1 + AA = R1M5 * float64(K1) + DIG = math.Min(AA, 18.0e0) + AA = AA * 2.303e0 + ALIM = ELIM + math.Max(-AA, -41.45e0) + RL = 1.2e0*DIG + 3.0e0 + ALAZ = math.Log(AZ) + + // TEST FOR PROPER RANGE. + AA = 0.5e0 / TOL + BB = float64(float32(imach[9])) * 0.5e0 + AA = math.Min(AA, BB) + AA = math.Pow(AA, TTH) + if AZ > AA { + goto TwoSixty + } + AA = math.Sqrt(AA) + if AZ > AA { + IERR = 3 + } + tmp = cmplx.Sqrt(complex(ZR, ZI)) + CSQR = real(tmp) + CSQI = imag(tmp) + ZTAR = TTH * (ZR*CSQR - ZI*CSQI) + ZTAI = TTH * (ZR*CSQI + ZI*CSQR) + + // RE(ZTA)<=0 WHEN RE(Z)<0, ESPECIALLY WHEN IM(Z) IS SMALL. + IFLAG = 0 + SFAC = 1.0e0 + AK = ZTAI + if ZR >= 0.0e0 { + goto Eighty + } + BK = ZTAR + CK = -math.Abs(BK) + ZTAR = CK + ZTAI = AK + +Eighty: + if ZI != 0.0e0 { + goto Ninety + } + if ZR > 0.0e0 { + goto Ninety + } + ZTAR = 0.0e0 + ZTAI = AK +Ninety: + AA = ZTAR + if AA >= 0.0e0 && ZR > 0.0e0 { + goto OneTen + } + if KODE == 2 { + goto OneHundred + } + + // OVERFLOW TEST. 
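+	// If the result risks overflow, the computation proceeds with scaled
+	// arithmetic (IFLAG=1, SFAC=TOL); IERR=2 is returned below when even
+	// scaling cannot represent the result.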
+ if AA > (-ALIM) { + goto OneHundred + } + AA = -AA + 0.25e0*ALAZ + IFLAG = 1 + SFAC = TOL + if AA > ELIM { + goto TwoSeventy + } + +OneHundred: + // CBKNU AND CACON return EXP(ZTA)*K(FNU,ZTA) ON KODE=2. + MR = 1 + if ZI < 0.0e0 { + MR = -1 + } + _, _, _, _, _, _, CYR, CYI, NN, _, _, _, _ = Zacai(ZTAR, ZTAI, FNU, KODE, MR, 1, CYR, CYI, RL, TOL, ELIM, ALIM) + if NN < 0 { + goto TwoEighty + } + NZ = NZ + NN + goto OneThirty + +OneTen: + if KODE == 2 { + goto OneTwenty + } + + // UNDERFLOW TEST. + if AA < ALIM { + goto OneTwenty + } + AA = -AA - 0.25e0*ALAZ + IFLAG = 2 + SFAC = 1.0e0 / TOL + if AA < (-ELIM) { + goto TwoTen + } +OneTwenty: + _, _, _, _, _, CYR, CYI, NZ, _, _, _ = Zbknu(ZTAR, ZTAI, FNU, KODE, 1, CYR, CYI, TOL, ELIM, ALIM) + +OneThirty: + S1R = CYR[1] * COEF + S1I = CYI[1] * COEF + if IFLAG != 0 { + goto OneFifty + } + if ID == 1 { + goto OneFourty + } + AIR = CSQR*S1R - CSQI*S1I + AII = CSQR*S1I + CSQI*S1R + return +OneFourty: + AIR = -(ZR*S1R - ZI*S1I) + AII = -(ZR*S1I + ZI*S1R) + return +OneFifty: + S1R = S1R * SFAC + S1I = S1I * SFAC + if ID == 1 { + goto OneSixty + } + STR = S1R*CSQR - S1I*CSQI + S1I = S1R*CSQI + S1I*CSQR + S1R = STR + AIR = S1R / SFAC + AII = S1I / SFAC + return +OneSixty: + STR = -(S1R*ZR - S1I*ZI) + S1I = -(S1R*ZI + S1I*ZR) + S1R = STR + AIR = S1R / SFAC + AII = S1I / SFAC + return +OneSeventy: + AA = 1.0e+3 * dmach[1] + S1R = ZEROR + S1I = ZEROI + if ID == 1 { + goto OneNinety + } + if AZ <= AA { + goto OneEighty + } + S1R = C2 * ZR + S1I = C2 * ZI +OneEighty: + AIR = C1 - S1R + AII = -S1I + return +OneNinety: + AIR = -C2 + AII = 0.0e0 + AA = math.Sqrt(AA) + if AZ <= AA { + goto TwoHundred + } + S1R = 0.5e0 * (ZR*ZR - ZI*ZI) + S1I = ZR * ZI +TwoHundred: + AIR = AIR + C1*S1R + AII = AII + C1*S1I + return +TwoTen: + NZ = 1 + AIR = ZEROR + AII = ZEROI + return +TwoSeventy: + NZ = 0 + IERR = 2 + return +TwoEighty: + if NN == (-1) { + goto TwoSeventy + } + NZ = 0 + IERR = 5 + return +TwoSixty: + IERR = 4 + NZ = 0 + return +} + +// sbknu computes the k bessel function in the right half z plane. +func Zbknu(ZR, ZI, FNU float64, KODE, N int, YR, YI []float64, TOL, ELIM, ALIM float64) (ZRout, ZIout, FNUout float64, KODEout, Nout int, YRout, YIout []float64, NZ int, TOLout, ELIMout, ALIMout float64) { + /* Old dimension comment. + DIMENSION YR(N), YI(N), CC(8), CSSR(3), CSRR(3), BRY(3), CYR(2), + * CYI(2) + */ + + // TODO(btracey): Find which of these are inputs/outputs/both and clean up + // the function call. 
+ // YR and YI have length n (but n+1 with better indexing) + var AA, AK, ASCLE, A1, A2, BB, BK, CAZ, + CBI, CBR, CCHI, CCHR, CKI, CKR, COEFI, COEFR, CONEI, CONER, + CRSCR, CSCLR, CSHI, CSHR, CSI, CSR, CTWOR, + CZEROI, CZEROR, CZI, CZR, DNU, DNU2, DPI, ETEST, FC, FHS, + FI, FK, FKS, FMUI, FMUR, FPI, FR, G1, G2, HPI, PI, PR, PTI, + PTR, P1I, P1R, P2I, P2M, P2R, QI, QR, RAK, RCAZ, RTHPI, RZI, + RZR, R1, S, SMUI, SMUR, SPI, STI, STR, S1I, S1R, S2I, S2R, TM, + TTH, T1, T2, ELM, CELMR, ZDR, ZDI, AS, ALAS, HELIM float64 + + var I, IFLAG, INU, K, KFLAG, KK, KMAX, KODED, IDUM, J, IC, INUB, NW int + + var sinh, cosh complex128 + //var sin, cos float64 + + var tmp, p complex128 + var CSSR, CSRR, BRY [4]float64 + var CYR, CYI [3]float64 + + KMAX = 30 + CZEROR = 0 + CZEROI = 0 + CONER = 1 + CONEI = 0 + CTWOR = 2 + R1 = 2 + + DPI = 3.14159265358979324e0 + RTHPI = 1.25331413731550025e0 + SPI = 1.90985931710274403e0 + HPI = 1.57079632679489662e0 + FPI = 1.89769999331517738e0 + TTH = 6.66666666666666666e-01 + + CC := [9]float64{math.NaN(), 5.77215664901532861e-01, -4.20026350340952355e-02, + -4.21977345555443367e-02, 7.21894324666309954e-03, + -2.15241674114950973e-04, -2.01348547807882387e-05, + 1.13302723198169588e-06, 6.11609510448141582e-09} + + CAZ = cmplx.Abs(complex(ZR, ZI)) + CSCLR = 1.0e0 / TOL + CRSCR = TOL + CSSR[1] = CSCLR + CSSR[2] = 1.0e0 + CSSR[3] = CRSCR + CSRR[1] = CRSCR + CSRR[2] = 1.0e0 + CSRR[3] = CSCLR + BRY[1] = 1.0e+3 * dmach[1] / TOL + BRY[2] = 1.0e0 / BRY[1] + BRY[3] = dmach[2] + IFLAG = 0 + KODED = KODE + RCAZ = 1.0e0 / CAZ + STR = ZR * RCAZ + STI = -ZI * RCAZ + RZR = (STR + STR) * RCAZ + RZI = (STI + STI) * RCAZ + INU = int(float32(FNU + 0.5)) + DNU = FNU - float64(INU) + if math.Abs(DNU) == 0.5e0 { + goto OneTen + } + DNU2 = 0.0e0 + if math.Abs(DNU) > TOL { + DNU2 = DNU * DNU + } + if CAZ > R1 { + goto OneTen + } + + // SERIES FOR CABS(Z)<=R1. + FC = 1.0e0 + tmp = cmplx.Log(complex(RZR, RZI)) + SMUR = real(tmp) + SMUI = imag(tmp) + FMUR = SMUR * DNU + FMUI = SMUI * DNU + tmp = complex(FMUR, FMUI) + sinh = cmplx.Sinh(tmp) + cosh = cmplx.Cosh(tmp) + CSHR = real(sinh) + CSHI = imag(sinh) + CCHR = real(cosh) + CCHI = imag(cosh) + if DNU == 0.0e0 { + goto Ten + } + FC = DNU * DPI + FC = FC / math.Sin(FC) + SMUR = CSHR / DNU + SMUI = CSHI / DNU +Ten: + A2 = 1.0e0 + DNU + + // GAM(1-Z)*GAM(1+Z)=PI*Z/SIN(PI*Z), T1=1/GAM(1-DNU), T2=1/GAM(1+DNU). + T2 = math.Exp(-dgamln(A2, IDUM)) + T1 = 1.0e0 / (T2 * FC) + if math.Abs(DNU) > 0.1e0 { + goto Forty + } + + // SERIES FOR F0 TO RESOLVE INDETERMINACY FOR SMALL ABS(DNU). + AK = 1.0e0 + S = CC[1] + for K = 2; K <= 8; K++ { + AK = AK * DNU2 + TM = CC[K] * AK + S = S + TM + if math.Abs(TM) < TOL { + goto Thirty + } + } +Thirty: + G1 = -S + goto Fifty +Forty: + G1 = (T1 - T2) / (DNU + DNU) +Fifty: + G2 = (T1 + T2) * 0.5e0 + FR = FC * (CCHR*G1 + SMUR*G2) + FI = FC * (CCHI*G1 + SMUI*G2) + tmp = cmplx.Exp(complex(FMUR, FMUI)) + STR = real(tmp) + STI = imag(tmp) + PR = 0.5e0 * STR / T2 + PI = 0.5e0 * STI / T2 + tmp = complex(0.5, 0) / complex(STR, STI) + PTR = real(tmp) + PTI = imag(tmp) + QR = PTR / T1 + QI = PTI / T1 + S1R = FR + S1I = FI + S2R = PR + S2I = PI + AK = 1.0e0 + A1 = 1.0e0 + CKR = CONER + CKI = CONEI + BK = 1.0e0 - DNU2 + if INU > 0 || N > 1 { + goto Eighty + } + + // GENERATE K(FNU,Z), 0.0E0 <= FNU < 0.5E0 AND N=1. 
+ if CAZ < TOL { + goto Seventy + } + tmp = complex(ZR, ZI) * complex(ZR, ZI) + CZR = real(tmp) + CZI = imag(tmp) + CZR = 0.25e0 * CZR + CZI = 0.25e0 * CZI + T1 = 0.25e0 * CAZ * CAZ +Sixty: + FR = (FR*AK + PR + QR) / BK + FI = (FI*AK + PI + QI) / BK + STR = 1.0e0 / (AK - DNU) + PR = PR * STR + PI = PI * STR + STR = 1.0e0 / (AK + DNU) + QR = QR * STR + QI = QI * STR + STR = CKR*CZR - CKI*CZI + RAK = 1.0e0 / AK + CKI = (CKR*CZI + CKI*CZR) * RAK + CKR = STR * RAK + S1R = CKR*FR - CKI*FI + S1R + S1I = CKR*FI + CKI*FR + S1I + A1 = A1 * T1 * RAK + BK = BK + AK + AK + 1.0e0 + AK = AK + 1.0e0 + if A1 > TOL { + goto Sixty + } +Seventy: + YR[1] = S1R + YI[1] = S1I + if KODED == 1 { + return ZR, ZI, FNU, KODE, N, YR, YI, NZ, TOL, ELIM, ALIM + } + tmp = cmplx.Exp(complex(ZR, ZI)) + STR = real(tmp) + STI = imag(tmp) + tmp = complex(S1R, S1I) * complex(STR, STI) + YR[1] = real(tmp) + YI[1] = imag(tmp) + return ZR, ZI, FNU, KODE, N, YR, YI, NZ, TOL, ELIM, ALIM + + // GENERATE K(DNU,Z) AND K(DNU+1,Z) FOR FORWARD RECURRENCE. +Eighty: + if CAZ < TOL { + goto OneHundred + } + tmp = complex(ZR, ZI) * complex(ZR, ZI) + CZR = real(tmp) + CZI = imag(tmp) + CZR = 0.25e0 * CZR + CZI = 0.25e0 * CZI + T1 = 0.25e0 * CAZ * CAZ +Ninety: + FR = (FR*AK + PR + QR) / BK + FI = (FI*AK + PI + QI) / BK + STR = 1.0e0 / (AK - DNU) + PR = PR * STR + PI = PI * STR + STR = 1.0e0 / (AK + DNU) + QR = QR * STR + QI = QI * STR + STR = CKR*CZR - CKI*CZI + RAK = 1.0e0 / AK + CKI = (CKR*CZI + CKI*CZR) * RAK + CKR = STR * RAK + S1R = CKR*FR - CKI*FI + S1R + S1I = CKR*FI + CKI*FR + S1I + STR = PR - FR*AK + STI = PI - FI*AK + S2R = CKR*STR - CKI*STI + S2R + S2I = CKR*STI + CKI*STR + S2I + A1 = A1 * T1 * RAK + BK = BK + AK + AK + 1.0e0 + AK = AK + 1.0e0 + if A1 > TOL { + goto Ninety + } +OneHundred: + KFLAG = 2 + A1 = FNU + 1.0e0 + AK = A1 * math.Abs(SMUR) + if AK > ALIM { + KFLAG = 3 + } + STR = CSSR[KFLAG] + P2R = S2R * STR + P2I = S2I * STR + tmp = complex(P2R, P2I) * complex(RZR, RZI) + S2R = real(tmp) + S2I = imag(tmp) + S1R = S1R * STR + S1I = S1I * STR + if KODED == 1 { + goto TwoTen + } + tmp = cmplx.Exp(complex(ZR, ZI)) + FR = real(tmp) + FI = imag(tmp) + tmp = complex(S1R, S1I) * complex(FR, FI) + S1R = real(tmp) + S1I = imag(tmp) + tmp = complex(S2R, S2I) * complex(FR, FI) + S2R = real(tmp) + S2I = imag(tmp) + goto TwoTen + + // IFLAG=0 MEANS NO UNDERFLOW OCCURRED + // IFLAG=1 MEANS AN UNDERFLOW OCCURRED- COMPUTATION PROCEEDS WITH + // KODED=2 AND A TEST FOR ON SCALE VALUES IS MADE DURING FORWARD RECURSION +OneTen: + tmp = cmplx.Sqrt(complex(ZR, ZI)) + STR = real(tmp) + STI = imag(tmp) + tmp = complex(RTHPI, CZEROI) / complex(STR, STI) + COEFR = real(tmp) + COEFI = imag(tmp) + KFLAG = 2 + if KODED == 2 { + goto OneTwenty + } + if ZR > ALIM { + goto TwoNinety + } + + STR = math.Exp(-ZR) * CSSR[KFLAG] + //sin, cos = math.Sincos(ZI) + STI = -STR * math.Sin(ZI) + STR = STR * math.Cos(ZI) + tmp = complex(COEFR, COEFI) * complex(STR, STI) + COEFR = real(tmp) + COEFI = imag(tmp) +OneTwenty: + if math.Abs(DNU) == 0.5e0 { + goto ThreeHundred + } + // MILLER ALGORITHM FOR CABS(Z)>R1. + AK = math.Cos(DPI * DNU) + AK = math.Abs(AK) + if AK == CZEROR { + goto ThreeHundred + } + FHS = math.Abs(0.25e0 - DNU2) + if FHS == CZEROR { + goto ThreeHundred + } + + // COMPUTE R2=F(E). if CABS(Z)>=R2, USE FORWARD RECURRENCE TO + // DETERMINE THE BACKWARD INDEX K. R2=F(E) IS A STRAIGHT LINE ON + // 12<=E<=60. E IS COMPUTED FROM 2**(-E)=B**(1-I1MACH(14))= + // TOL WHERE B IS THE BASE OF THE ARITHMETIC. 
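+	// For IEEE double precision (imach[14] = 53 significand bits and
+	// dmach[5] = log10(2)), T1 works out to 52, inside the clamp to [12, 60]
+	// below, giving a crossover radius of T2 = (2/3)*52 - 6, approximately
+	// 28.7.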
+	T1 = float64(imach[14] - 1)
+	T1 = T1 * dmach[5] * 3.321928094e0
+	T1 = math.Max(T1, 12.0e0)
+	T1 = math.Min(T1, 60.0e0)
+	T2 = TTH*T1 - 6.0e0
+	if ZR != 0.0e0 {
+		goto OneThirty
+	}
+	T1 = HPI
+	goto OneFourty
+OneThirty:
+	T1 = math.Atan(ZI / ZR)
+	T1 = math.Abs(T1)
+OneFourty:
+	if T2 > CAZ {
+		goto OneSeventy
+	}
+	// FORWARD RECURRENCE LOOP WHEN CABS(Z)>=R2.
+	ETEST = AK / (DPI * CAZ * TOL)
+	FK = CONER
+	if ETEST < CONER {
+		goto OneEighty
+	}
+	FKS = CTWOR
+	CKR = CAZ + CAZ + CTWOR
+	P1R = CZEROR
+	P2R = CONER
+	for I = 1; I <= KMAX; I++ {
+		AK = FHS / FKS
+		CBR = CKR / (FK + CONER)
+		PTR = P2R
+		P2R = CBR*P2R - P1R*AK
+		P1R = PTR
+		CKR = CKR + CTWOR
+		FKS = FKS + FK + FK + CTWOR
+		FHS = FHS + FK + FK
+		FK = FK + CONER
+		STR = math.Abs(P2R) * FK
+		if ETEST < STR {
+			goto OneSixty
+		}
+	}
+	goto ThreeTen
+OneSixty:
+	FK = FK + SPI*T1*math.Sqrt(T2/CAZ)
+	FHS = math.Abs(0.25 - DNU2)
+	goto OneEighty
+OneSeventy:
+	// COMPUTE BACKWARD INDEX K FOR CABS(Z)<R2.
+	A2 = math.Sqrt(CAZ)
+	AK = FPI * AK / (TOL * math.Sqrt(A2))
+	AA = 3.0e0 * T1 / (1.0e0 + CAZ)
+	BB = 14.7e0 * T1 / (28.0e0 + CAZ)
+	AK = (math.Log(AK) + CAZ*math.Cos(AA)/(1.0e0+0.008e0*CAZ)) / math.Cos(BB)
+	FK = 0.12125e0*AK*AK/CAZ + 1.5e0
+OneEighty:
+	// BACKWARD RECURRENCE LOOP FOR MILLER ALGORITHM.
+	K = int(float32(FK))
+	FK = float64(float32(K))
+	FKS = FK * FK
+	P1R = CZEROR
+	P1I = CZEROI
+	P2R = TOL
+	P2I = CZEROI
+	CSR = P2R
+	CSI = P2I
+	for I = 1; I <= K; I++ {
+		A1 = FKS - FK
+		AK = (FKS + FK) / (A1 + FHS)
+		RAK = 2.0e0 / (FK + CONER)
+		CBR = (FK + ZR) * RAK
+		CBI = ZI * RAK
+		PTR = P2R
+		PTI = P2I
+		P2R = (PTR*CBR - PTI*CBI - P1R) * AK
+		P2I = (PTI*CBR + PTR*CBI - P1I) * AK
+		P1R = PTR
+		P1I = PTI
+		CSR = CSR + P2R
+		CSI = CSI + P2I
+		FKS = A1 - FK + CONER
+		FK = FK - CONER
+	}
+
+	// COMPUTE (P2/CS)=(P2/CABS(CS))*(CONJG(CS)/CABS(CS)) FOR BETTER SCALING.
+	TM = cmplx.Abs(complex(CSR, CSI))
+	PTR = 1.0e0 / TM
+	S1R = P2R * PTR
+	S1I = P2I * PTR
+	CSR = CSR * PTR
+	CSI = -CSI * PTR
+	tmp = complex(COEFR, COEFI) * complex(S1R, S1I)
+	STR = real(tmp)
+	STI = imag(tmp)
+	tmp = complex(STR, STI) * complex(CSR, CSI)
+	S1R = real(tmp)
+	S1I = imag(tmp)
+	if INU > 0 || N > 1 {
+		goto TwoHundred
+	}
+	ZDR = ZR
+	ZDI = ZI
+	if IFLAG == 1 {
+		goto TwoSeventy
+	}
+	goto TwoFourty
+TwoHundred:
+	// COMPUTE P1/P2=(P1/CABS(P2))*(CONJG(P2)/CABS(P2)) FOR SCALING.
+	TM = cmplx.Abs(complex(P2R, P2I))
+	PTR = 1.0e0 / TM
+	P1R = P1R * PTR
+	P1I = P1I * PTR
+	P2R = P2R * PTR
+	P2I = -P2I * PTR
+	tmp = complex(P1R, P1I) * complex(P2R, P2I)
+	PTR = real(tmp)
+	PTI = imag(tmp)
+	STR = DNU + 0.5e0 - PTR
+	STI = -PTI
+	tmp = complex(STR, STI) / complex(ZR, ZI)
+	STR = real(tmp)
+	STI = imag(tmp)
+	STR = STR + 1.0e0
+	tmp = complex(STR, STI) * complex(S1R, S1I)
+	S2R = real(tmp)
+	S2I = imag(tmp)
+
+	// FORWARD RECURSION ON THE THREE TERM RECURSION RELATION WITH
+	// SCALING NEAR EXPONENT EXTREMES ON KFLAG=1 OR KFLAG=3
+TwoTen:
+	STR = DNU + 1.0e0
+	CKR = STR * RZR
+	CKI = STR * RZI
+	if N == 1 {
+		INU = INU - 1
+	}
+	if INU > 0 {
+		goto TwoTwenty
+	}
+	if N > 1 {
+		goto TwoFifteen
+	}
+	S1R = S2R
+	S1I = S2I
+TwoFifteen:
+	ZDR = ZR
+	ZDI = ZI
+	if IFLAG == 1 {
+		goto TwoSeventy
+	}
+	goto TwoFourty
+TwoTwenty:
+	INUB = 1
+	if IFLAG == 1 {
+		goto TwoSixtyOne
+	}
+TwoTwentyFive:
+	P1R = CSRR[KFLAG]
+	ASCLE = BRY[KFLAG]
+	for I = INUB; I <= INU; I++ {
+		STR = S2R
+		STI = S2I
+		S2R = CKR*STR - CKI*STI + S1R
+		S2I = CKR*STI + CKI*STR + S1I
+		S1R = STR
+		S1I = STI
+		CKR = CKR + RZR
+		CKI = CKI + RZI
+		if KFLAG >= 3 {
+			continue
+		}
+		P2R = S2R * P1R
+		P2I = S2I * P1R
+		STR = math.Abs(P2R)
+		STI = math.Abs(P2I)
+		P2M = math.Max(STR, STI)
+		if P2M <= ASCLE {
+			continue
+		}
+		KFLAG = KFLAG + 1
+		ASCLE = BRY[KFLAG]
+		S1R = S1R * P1R
+		S1I = S1I * P1R
+		S2R = P2R
+		S2I = P2I
+		STR = CSSR[KFLAG]
+		S1R = S1R * STR
+		S1I = S1I * STR
+		S2R = S2R * STR
+		S2I = S2I * STR
+		P1R = CSRR[KFLAG]
+	}
+	if N != 1 {
+		goto TwoFourty
+	}
+	S1R = S2R
+	S1I = S2I
+TwoFourty:
+	STR = CSRR[KFLAG]
+	YR[1] = S1R * STR
+	YI[1] = S1I * STR
+	if N == 1 {
+		return ZR, ZI, FNU, KODE, N, YR, YI, NZ, TOL, ELIM, ALIM
+	}
+	YR[2] = S2R * STR
+	YI[2] = S2I * STR
+	if N == 2 {
+		return ZR, ZI, FNU, KODE, N, YR, YI, NZ, TOL, ELIM, ALIM
+	}
+	KK = 2
+TwoFifty:
+	KK = KK + 1
+	if KK > N {
+		return ZR, ZI, FNU, KODE, N, YR, YI, NZ, TOL, ELIM, ALIM
+	}
+	P1R = CSRR[KFLAG]
+	ASCLE = BRY[KFLAG]
+	for I = KK; I <= N; I++ {
+		P2R = S2R
+		P2I = S2I
+		S2R = CKR*P2R - CKI*P2I + S1R
+		S2I = CKI*P2R + CKR*P2I + S1I
+		S1R = P2R
+		S1I = P2I
+		CKR = CKR + RZR
+		CKI = CKI + RZI
+		P2R = S2R * P1R
+		P2I = S2I * P1R
+		YR[I] = P2R
+		YI[I] = P2I
+		if KFLAG >= 3 {
+			continue
+		}
+		STR = math.Abs(P2R)
+		STI = math.Abs(P2I)
+		P2M = math.Max(STR, STI)
+		if P2M <= ASCLE {
+
continue + } + KFLAG = KFLAG + 1 + ASCLE = BRY[KFLAG] + S1R = S1R * P1R + S1I = S1I * P1R + S2R = P2R + S2I = P2I + STR = CSSR[KFLAG] + S1R = S1R * STR + S1I = S1I * STR + S2R = S2R * STR + S2I = S2I * STR + P1R = CSRR[KFLAG] + } + return ZR, ZI, FNU, KODE, N, YR, YI, NZ, TOL, ELIM, ALIM + + // IFLAG=1 CASES, FORWARD RECURRENCE ON SCALED VALUES ON UNDERFLOW. +TwoSixtyOne: + HELIM = 0.5e0 * ELIM + ELM = math.Exp(-ELIM) + CELMR = ELM + ASCLE = BRY[1] + ZDR = ZR + ZDI = ZI + IC = -1 + J = 2 + for I = 1; I <= INU; I++ { + STR = S2R + STI = S2I + S2R = STR*CKR - STI*CKI + S1R + S2I = STI*CKR + STR*CKI + S1I + S1R = STR + S1I = STI + CKR = CKR + RZR + CKI = CKI + RZI + AS = cmplx.Abs(complex(S2R, S2I)) + ALAS = math.Log(AS) + P2R = -ZDR + ALAS + if P2R < (-ELIM) { + goto TwoSixtyThree + } + tmp = cmplx.Log(complex(S2R, S2I)) + STR = real(tmp) + STI = imag(tmp) + P2R = -ZDR + STR + P2I = -ZDI + STI + P2M = math.Exp(P2R) / TOL + // sin, cos = math.Sincos(P2I) + P1R = P2M * math.Cos(P2I) + P1I = P2M * math.Sin(P2I) + p = complex(P1R, P1I) + NW = Zuchk(p, ASCLE, TOL) + if NW != 0 { + goto TwoSixtyThree + } + J = 3 - J + CYR[J] = P1R + CYI[J] = P1I + if IC == (I - 1) { + goto TwoSixtyFour + } + IC = I + continue + TwoSixtyThree: + if ALAS < HELIM { + continue + } + ZDR = ZDR - ELIM + S1R = S1R * CELMR + S1I = S1I * CELMR + S2R = S2R * CELMR + S2I = S2I * CELMR + } + if N != 1 { + goto TwoSeventy + } + S1R = S2R + S1I = S2I + goto TwoSeventy +TwoSixtyFour: + KFLAG = 1 + INUB = I + 1 + S2R = CYR[J] + S2I = CYI[J] + J = 3 - J + S1R = CYR[J] + S1I = CYI[J] + if INUB <= INU { + goto TwoTwentyFive + } + if N != 1 { + goto TwoFourty + } + S1R = S2R + S1I = S2I + goto TwoFourty +TwoSeventy: + YR[1] = S1R + YI[1] = S1I + if N == 1 { + goto TwoEighty + } + YR[2] = S2R + YI[2] = S2I +TwoEighty: + ASCLE = BRY[1] + _, _, FNU, N, YR, YI, NZ, RZR, RZI, _, TOL, ELIM = Zkscl(ZDR, ZDI, FNU, N, YR, YI, RZR, RZI, ASCLE, TOL, ELIM) + INU = N - NZ + if INU <= 0 { + return ZR, ZI, FNU, KODE, N, YR, YI, NZ, TOL, ELIM, ALIM + } + KK = NZ + 1 + S1R = YR[KK] + S1I = YI[KK] + YR[KK] = S1R * CSRR[1] + YI[KK] = S1I * CSRR[1] + if INU == 1 { + return ZR, ZI, FNU, KODE, N, YR, YI, NZ, TOL, ELIM, ALIM + } + KK = NZ + 2 + S2R = YR[KK] + S2I = YI[KK] + YR[KK] = S2R * CSRR[1] + YI[KK] = S2I * CSRR[1] + if INU == 2 { + return ZR, ZI, FNU, KODE, N, YR, YI, NZ, TOL, ELIM, ALIM + } + T2 = FNU + float64(float32(KK-1)) + CKR = T2 * RZR + CKI = T2 * RZI + KFLAG = 1 + goto TwoFifty +TwoNinety: + + // SCALE BY math.Exp(Z), IFLAG = 1 CASES. + + IFLAG = 1 + KFLAG = 2 + goto OneTwenty + + // FNU=HALF ODD INTEGER CASE, DNU=-0.5 +ThreeHundred: + S1R = COEFR + S1I = COEFI + S2R = COEFR + S2I = COEFI + goto TwoTen + +ThreeTen: + NZ = -2 + return ZR, ZI, FNU, KODE, N, YR, YI, NZ, TOL, ELIM, ALIM +} + +// SET K FUNCTIONS TO ZERO ON UNDERFLOW, CONTINUE RECURRENCE +// ON SCALED FUNCTIONS UNTIL TWO MEMBERS COME ON SCALE, THEN +// return WITH MIN(NZ+2,N) VALUES SCALED BY 1/TOL. 
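+//
+// NZ reports how many members of YR and YI were zeroed by underflow.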
+func Zkscl(ZRR, ZRI, FNU float64, N int, YR, YI []float64, RZR, RZI, ASCLE, TOL, ELIM float64) (
+	ZRRout, ZRIout, FNUout float64, Nout int, YRout, YIout []float64, NZ int, RZRout, RZIout, ASCLEout, TOLout, ELIMout float64) {
+	var ACS, AS, CKI, CKR, CSI, CSR, FN, STR, S1I, S1R, S2I,
+		S2R, ZEROI, ZEROR, ZDR, ZDI, CELMR, ELM, HELIM, ALAS float64
+
+	var I, IC, KK, NN, NW int
+	var tmp, c complex128
+	var CYR, CYI [3]float64
+	var sin, cos float64
+
+	// DIMENSION YR(N), YI(N), CYR(2), CYI(2)
+	ZEROR = 0
+	ZEROI = 0
+	IC = 0
+	NN = min(2, N)
+	for I = 1; I <= NN; I++ {
+		S1R = YR[I]
+		S1I = YI[I]
+		CYR[I] = S1R
+		CYI[I] = S1I
+		AS = cmplx.Abs(complex(S1R, S1I))
+		ACS = -ZRR + math.Log(AS)
+		NZ = NZ + 1
+		YR[I] = ZEROR
+		YI[I] = ZEROI
+		if ACS < (-ELIM) {
+			continue
+		}
+
+		tmp = cmplx.Log(complex(S1R, S1I))
+		CSR = real(tmp)
+		CSI = imag(tmp)
+		CSR = CSR - ZRR
+		CSI = CSI - ZRI
+		STR = math.Exp(CSR) / TOL
+		// sin, cos = math.Sincos(CSI)
+		CSR = STR * math.Cos(CSI)
+		CSI = STR * math.Sin(CSI)
+		c = complex(CSR, CSI)
+		NW = Zuchk(c, ASCLE, TOL)
+		if NW != 0 {
+			continue
+		}
+		YR[I] = CSR
+		YI[I] = CSI
+		IC = I
+		NZ = NZ - 1
+	}
+	if N == 1 {
+		return ZRR, ZRI, FNU, N, YR, YI, NZ, RZR, RZI, ASCLE, TOL, ELIM
+	}
+	if IC > 1 {
+		goto Twenty
+	}
+	YR[1] = ZEROR
+	YI[1] = ZEROI
+	NZ = 2
+Twenty:
+	if N == 2 {
+		return ZRR, ZRI, FNU, N, YR, YI, NZ, RZR, RZI, ASCLE, TOL, ELIM
+	}
+	if NZ == 0 {
+		return ZRR, ZRI, FNU, N, YR, YI, NZ, RZR, RZI, ASCLE, TOL, ELIM
+	}
+	FN = FNU + 1.0e0
+	CKR = FN * RZR
+	CKI = FN * RZI
+	S1R = CYR[1]
+	S1I = CYI[1]
+	S2R = CYR[2]
+	S2I = CYI[2]
+	HELIM = 0.5e0 * ELIM
+	ELM = math.Exp(-ELIM)
+	CELMR = ELM
+	ZDR = ZRR
+	ZDI = ZRI
+
+	// FIND TWO CONSECUTIVE Y VALUES ON SCALE. SCALE RECURRENCE IF
+	// S2 GETS LARGER THAN EXP(ELIM/2)
+	for I = 3; I <= N; I++ {
+		KK = I
+		CSR = S2R
+		CSI = S2I
+		S2R = CKR*CSR - CKI*CSI + S1R
+		S2I = CKI*CSR + CKR*CSI + S1I
+		S1R = CSR
+		S1I = CSI
+		CKR = CKR + RZR
+		CKI = CKI + RZI
+		AS = cmplx.Abs(complex(S2R, S2I))
+		ALAS = math.Log(AS)
+		ACS = -ZDR + ALAS
+		NZ = NZ + 1
+		YR[I] = ZEROR
+		YI[I] = ZEROI
+		if ACS < (-ELIM) {
+			goto TwentyFive
+		}
+		tmp = cmplx.Log(complex(S2R, S2I))
+		CSR = real(tmp)
+		CSI = imag(tmp)
+		CSR = CSR - ZDR
+		CSI = CSI - ZDI
+		STR = math.Exp(CSR) / TOL
+		sin, cos = math.Sincos(CSI)
+		CSR = STR * cos
+		CSI = STR * sin
+		c = complex(CSR, CSI)
+		NW = Zuchk(c, ASCLE, TOL)
+		if NW != 0 {
+			goto TwentyFive
+		}
+		YR[I] = CSR
+		YI[I] = CSI
+		NZ = NZ - 1
+		if IC == KK-1 {
+			goto Forty
+		}
+		IC = KK
+		continue
+	TwentyFive:
+		if ALAS < HELIM {
+			continue
+		}
+		ZDR = ZDR - ELIM
+		S1R = S1R * CELMR
+		S1I = S1I * CELMR
+		S2R = S2R * CELMR
+		S2I = S2I * CELMR
+	}
+	NZ = N
+	if IC == N {
+		NZ = N - 1
+	}
+	goto FourtyFive
+Forty:
+	NZ = KK - 2
+FourtyFive:
+	for I = 1; I <= NZ; I++ {
+		YR[I] = ZEROR
+		YI[I] = ZEROI
+	}
+	return ZRR, ZRI, FNU, N, YR, YI, NZ, RZR, RZI, ASCLE, TOL, ELIM
+}
+
+// Zuchk tests whether the magnitude of the real or imaginary part would
+// underflow when y is scaled by tol.
+//
+// y enters as a scaled quantity whose magnitude is greater than
+//
+//	1e3 * dmach[1] / tol
+//
+// y is accepted if the underflow is at least one precision below the magnitude
+// of the largest component. Otherwise an underflow is assumed as the phase angle
+// does not have sufficient accuracy.
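+//
+// For instance, with tol = 1e-14 and scale = 1e3*dmach[1]/tol, a value such
+// as complex(1e-300, 1e-305) is rejected (Zuchk returns 1): the smaller
+// component is below scale, yet within a factor 1/tol of the larger one, so
+// the phase would be inaccurate after rescaling.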
+func Zuchk(y complex128, scale, tol float64) int { + absR := math.Abs(real(y)) + absI := math.Abs(imag(y)) + minAbs := math.Min(absR, absI) + if minAbs > scale { + return 0 + } + maxAbs := math.Max(absR, absI) + minAbs /= tol + if maxAbs < minAbs { + return 1 + } + return 0 +} + +// ZACAI APPLIES THE ANALYTIC CONTINUATION FORMULA +// +// K(FNU,ZN*EXP(MP))=K(FNU,ZN)*EXP(-MP*FNU) - MP*I(FNU,ZN) +// MP=PI*MR*CMPLX(0.0,1.0) +// +// TO CONTINUE THE K FUNCTION FROM THE RIGHT HALF TO THE LEFT +// HALF Z PLANE FOR USE WITH ZAIRY WHERE FNU=1/3 OR 2/3 AND N=1. +// ZACAI IS THE SAME AS ZACON WITH THE PARTS FOR LARGER ORDERS AND +// RECURRENCE REMOVED. A RECURSIVE CALL TO ZACON CAN RESULT if ZACON +// IS CALLED FROM ZAIRY. +func Zacai(ZR, ZI, FNU float64, KODE, MR, N int, YR, YI []float64, RL, TOL, ELIM, ALIM float64) ( + ZRout, ZIout, FNUout float64, KODEout, MRout, Nout int, YRout, YIout []float64, NZ int, RLout, TOLout, ELIMout, ALIMout float64) { + var ARG, ASCLE, AZ, CSGNR, CSGNI, CSPNR, + CSPNI, C1R, C1I, C2R, C2I, DFNU, FMR, PI, + SGN, YY, ZNR, ZNI float64 + var INU, IUF, NN, NW int + var zn, c1, c2, z complex128 + var y []complex128 + //var sin, cos float64 + + CYR := []float64{math.NaN(), 0, 0} + CYI := []float64{math.NaN(), 0, 0} + + PI = math.Pi + ZNR = -ZR + ZNI = -ZI + AZ = cmplx.Abs(complex(ZR, ZI)) + NN = N + DFNU = FNU + float64(float32(N-1)) + if AZ <= 2.0e0 { + goto Ten + } + if AZ*AZ*0.25 > DFNU+1.0e0 { + goto Twenty + } +Ten: + // POWER SERIES FOR THE I FUNCTION. + z = complex(ZNR, ZNI) + y = make([]complex128, len(YR)) + for i, v := range YR { + y[i] = complex(v, YI[i]) + } + Zseri(z, FNU, KODE, NN, y[1:], TOL, ELIM, ALIM) + for i, v := range y { + YR[i] = real(v) + YI[i] = imag(v) + } + goto Forty +Twenty: + if AZ < RL { + goto Thirty + } + // ASYMPTOTIC EXPANSION FOR LARGE Z FOR THE I FUNCTION. + ZNR, ZNI, FNU, KODE, _, YR, YI, NW, RL, TOL, ELIM, ALIM = Zasyi(ZNR, ZNI, FNU, KODE, NN, YR, YI, RL, TOL, ELIM, ALIM) + if NW < 0 { + goto Eighty + } + goto Forty +Thirty: + // MILLER ALGORITHM NORMALIZED BY THE SERIES FOR THE I FUNCTION + ZNR, ZNI, FNU, KODE, _, YR, YI, NW, TOL = Zmlri(ZNR, ZNI, FNU, KODE, NN, YR, YI, TOL) + if NW < 0 { + goto Eighty + } +Forty: + // ANALYTIC CONTINUATION TO THE LEFT HALF PLANE FOR THE K FUNCTION. 
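+	// Zbknu supplies K(FNU, ZN) on the right half plane; below it is combined
+	// as CSPN*K(FNU,ZN) + CSGN*I(FNU,ZN), the continuation formula quoted in
+	// the function comment with MP = PI*MR*i.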
+ ZNR, ZNI, FNU, KODE, _, CYR, CYI, NW, TOL, ELIM, ALIM = Zbknu(ZNR, ZNI, FNU, KODE, 1, CYR, CYI, TOL, ELIM, ALIM) + if NW != 0 { + goto Eighty + } + FMR = float64(float32(MR)) + SGN = -math.Copysign(PI, FMR) + CSGNR = 0.0e0 + CSGNI = SGN + if KODE == 1 { + goto Fifty + } + YY = -ZNI + //sin, cos = math.Sincos(YY) + CSGNR = -CSGNI * math.Sin(YY) + CSGNI = CSGNI * math.Cos(YY) +Fifty: + // CALCULATE CSPN=EXP(FNU*PI*I) TO MINIMIZE LOSSES OF SIGNIFICANCE + // WHEN FNU IS LARGE + INU = int(float32(FNU)) + ARG = (FNU - float64(float32(INU))) * SGN + //sin, cos = math.Sincos(ARG) + CSPNR = math.Cos(ARG) + CSPNI = math.Sin(ARG) + if INU%2 == 0 { + goto Sixty + } + CSPNR = -CSPNR + CSPNI = -CSPNI +Sixty: + C1R = CYR[1] + C1I = CYI[1] + C2R = YR[1] + C2I = YI[1] + if KODE == 1 { + goto Seventy + } + IUF = 0 + ASCLE = 1.0e+3 * dmach[1] / TOL + zn = complex(ZNR, ZNI) + c1 = complex(C1R, C1I) + c2 = complex(C2R, C2I) + c1, c2, NW, _ = Zs1s2(zn, c1, c2, ASCLE, ALIM, IUF) + C1R = real(c1) + C1I = imag(c1) + C2R = real(c2) + C2I = imag(c2) + NZ = NZ + NW +Seventy: + YR[1] = CSPNR*C1R - CSPNI*C1I + CSGNR*C2R - CSGNI*C2I + YI[1] = CSPNR*C1I + CSPNI*C1R + CSGNR*C2I + CSGNI*C2R + return ZR, ZI, FNU, KODE, MR, N, YR, YI, NZ, RL, TOL, ELIM, ALIM +Eighty: + NZ = -1 + if NW == -2 { + NZ = -2 + } + return ZR, ZI, FNU, KODE, MR, N, YR, YI, NZ, RL, TOL, ELIM, ALIM +} + +// ZASYI COMPUTES THE I BESSEL FUNCTION FOR REAL(Z)>=0.0 BY +// MEANS OF THE ASYMPTOTIC EXPANSION FOR LARGE CABS(Z) IN THE +// REGION CABS(Z)>MAX(RL,FNU*FNU/2). NZ=0 IS A NORMAL return. +// NZ<0 INDICATES AN OVERFLOW ON KODE=1. +func Zasyi(ZR, ZI, FNU float64, KODE, N int, YR, YI []float64, RL, TOL, ELIM, ALIM float64) ( + ZRout, ZIout, FNUout float64, KODEout, Nout int, YRout, YIout []float64, NZ int, RLout, TOLout, ELIMout, ALIMout float64) { + var AA, AEZ, AK, AK1I, AK1R, ARG, ARM, ATOL, + AZ, BB, BK, CKI, CKR, CONEI, CONER, CS1I, CS1R, CS2I, CS2R, CZI, + CZR, DFNU, DKI, DKR, DNU2, EZI, EZR, FDN, PI, P1I, + P1R, RAZ, RTPI, RTR1, RZI, RZR, S, SGN, SQK, STI, STR, S2I, + S2R, TZI, TZR, ZEROI, ZEROR float64 + + var I, IB, IL, INU, J, JL, K, KODED, M, NN int + var tmp complex128 + // var sin, cos float64 + + PI = math.Pi + RTPI = 0.159154943091895336e0 + ZEROR = 0 + ZEROI = 0 + CONER = 1 + CONEI = 0 + + AZ = cmplx.Abs(complex(ZR, ZI)) + ARM = 1.0e3 * dmach[1] + RTR1 = math.Sqrt(ARM) + IL = min(2, N) + DFNU = FNU + float64(float32(N-IL)) + + // OVERFLOW TEST + RAZ = 1.0e0 / AZ + STR = ZR * RAZ + STI = -ZI * RAZ + AK1R = RTPI * STR * RAZ + AK1I = RTPI * STI * RAZ + tmp = cmplx.Sqrt(complex(AK1R, AK1I)) + AK1R = real(tmp) + AK1I = imag(tmp) + CZR = ZR + CZI = ZI + if KODE != 2 { + goto Ten + } + CZR = ZEROR + CZI = ZI +Ten: + if math.Abs(CZR) > ELIM { + goto OneHundred + } + DNU2 = DFNU + DFNU + KODED = 1 + if (math.Abs(CZR) > ALIM) && (N > 2) { + goto Twenty + } + KODED = 0 + tmp = cmplx.Exp(complex(CZR, CZI)) + STR = real(tmp) + STI = imag(tmp) + tmp = complex(AK1R, AK1I) * complex(STR, STI) + AK1R = real(tmp) + AK1I = imag(tmp) +Twenty: + FDN = 0.0e0 + if DNU2 > RTR1 { + FDN = DNU2 * DNU2 + } + EZR = ZR * 8.0e0 + EZI = ZI * 8.0e0 + + // WHEN Z IS IMAGINARY, THE ERROR TEST MUST BE MADE RELATIVE TO THE + // FIRST RECIPROCAL POWER SINCE THIS IS THE LEADING TERM OF THE + // EXPANSION FOR THE IMAGINARY PART. 
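+	// THE SUM BEING FORMED IS THE LARGE-|Z| EXPANSION
+	// I(FNU,Z) ~ EXP(Z)/SQRT(2*PI*Z) * SUM CK/(8*Z)**K, PLUS, THROUGH P1,
+	// THE EXPONENTIALLY SMALL REFLECTED SERIES; HENCE THE 8*AZ FACTORS BELOW.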
+ AEZ = 8.0e0 * AZ + S = TOL / AEZ + JL = int(float32(RL+RL)) + 2 + P1R = ZEROR + P1I = ZEROI + if ZI == 0.0e0 { + goto Thirty + } + + // CALCULATE EXP(PI*(0.5+FNU+N-IL)*I) TO MINIMIZE LOSSES OF + // SIGNIFICANCE WHEN FNU OR N IS LARGE + INU = int(float32(FNU)) + ARG = (FNU - float64(float32(INU))) * PI + INU = INU + N - IL + //sin, cos = math.Sincos(ARG) + AK = -math.Sin(ARG) + BK = math.Cos(ARG) + if ZI < 0.0e0 { + BK = -BK + } + P1R = AK + P1I = BK + if INU%2 == 0 { + goto Thirty + } + P1R = -P1R + P1I = -P1I +Thirty: + for K = 1; K <= IL; K++ { + SQK = FDN - 1.0e0 + ATOL = S * math.Abs(SQK) + SGN = 1.0e0 + CS1R = CONER + CS1I = CONEI + CS2R = CONER + CS2I = CONEI + CKR = CONER + CKI = CONEI + AK = 0.0e0 + AA = 1.0e0 + BB = AEZ + DKR = EZR + DKI = EZI + // TODO(btracey): This loop is executed tens of thousands of times. Why? + // is that really necessary? + for J = 1; J <= JL; J++ { + tmp = complex(CKR, CKI) / complex(DKR, DKI) + STR = real(tmp) + STI = imag(tmp) + CKR = STR * SQK + CKI = STI * SQK + CS2R = CS2R + CKR + CS2I = CS2I + CKI + SGN = -SGN + CS1R = CS1R + CKR*SGN + CS1I = CS1I + CKI*SGN + DKR = DKR + EZR + DKI = DKI + EZI + AA = AA * math.Abs(SQK) / BB + BB = BB + AEZ + AK = AK + 8.0e0 + SQK = SQK - AK + if AA <= ATOL { + goto Fifty + } + } + goto OneTen + Fifty: + S2R = CS1R + S2I = CS1I + if ZR+ZR >= ELIM { + goto Sixty + } + TZR = ZR + ZR + TZI = ZI + ZI + tmp = cmplx.Exp(complex(-TZR, -TZI)) + STR = real(tmp) + STI = imag(tmp) + tmp = complex(STR, STI) * complex(P1R, P1I) + STR = real(tmp) + STI = imag(tmp) + tmp = complex(STR, STI) * complex(CS2R, CS2I) + STR = real(tmp) + STI = imag(tmp) + S2R = S2R + STR + S2I = S2I + STI + Sixty: + FDN = FDN + 8.0e0*DFNU + 4.0e0 + P1R = -P1R + P1I = -P1I + M = N - IL + K + YR[M] = S2R*AK1R - S2I*AK1I + YI[M] = S2R*AK1I + S2I*AK1R + } + if N <= 2 { + return ZR, ZI, FNU, KODE, N, YR, YI, NZ, RL, TOL, ELIM, ALIM + } + NN = N + K = NN - 2 + AK = float64(float32(K)) + STR = ZR * RAZ + STI = -ZI * RAZ + RZR = (STR + STR) * RAZ + RZI = (STI + STI) * RAZ + IB = 3 + for I = IB; I <= NN; I++ { + YR[K] = (AK+FNU)*(RZR*YR[K+1]-RZI*YI[K+1]) + YR[K+2] + YI[K] = (AK+FNU)*(RZR*YI[K+1]+RZI*YR[K+1]) + YI[K+2] + AK = AK - 1.0e0 + K = K - 1 + } + if KODED == 0 { + return ZR, ZI, FNU, KODE, N, YR, YI, NZ, RL, TOL, ELIM, ALIM + } + tmp = cmplx.Exp(complex(CZR, CZI)) + CKR = real(tmp) + CKI = imag(tmp) + for I = 1; I <= NN; I++ { + STR = YR[I]*CKR - YI[I]*CKI + YI[I] = YR[I]*CKI + YI[I]*CKR + YR[I] = STR + } + return ZR, ZI, FNU, KODE, N, YR, YI, NZ, RL, TOL, ELIM, ALIM +OneHundred: + NZ = -1 + return ZR, ZI, FNU, KODE, N, YR, YI, NZ, RL, TOL, ELIM, ALIM +OneTen: + NZ = -2 + return ZR, ZI, FNU, KODE, N, YR, YI, NZ, RL, TOL, ELIM, ALIM +} + +// ZMLRI COMPUTES THE I BESSEL FUNCTION FOR RE(Z)>=0.0 BY THE +// MILLER ALGORITHM NORMALIZED BY A NEUMANN SERIES. 
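+//
+// The backward recurrence starts at an index chosen large enough that the
+// relative truncation error is below TOL; the unnormalized values are then
+// divided by a gamma-weighted sum (computed with dgamln) that enforces the
+// Neumann-series normalization.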
+func Zmlri(ZR, ZI, FNU float64, KODE, N int, YR, YI []float64, TOL float64) ( + ZRout, ZIout, FNUout float64, KODEout, Nout int, YRout, YIout []float64, NZ int, TOLout float64) { + var ACK, AK, AP, AT, AZ, BK, CKI, CKR, CNORMI, + CNORMR, CONEI, CONER, FKAP, FKK, FLAM, FNF, PTI, PTR, P1I, + P1R, P2I, P2R, RAZ, RHO, RHO2, RZI, RZR, SCLE, STI, STR, SUMI, + SUMR, TFNF, TST, ZEROI, ZEROR float64 + var I, IAZ, IDUM, IFNU, INU, ITIME, K, KK, KM, M int + var tmp complex128 + ZEROR = 0 + ZEROI = 0 + CONER = 1 + CONEI = 0 + + SCLE = dmach[1] / TOL + AZ = cmplx.Abs(complex(ZR, ZI)) + IAZ = int(float32(AZ)) + IFNU = int(float32(FNU)) + INU = IFNU + N - 1 + AT = float64(float32(IAZ)) + 1.0e0 + RAZ = 1.0e0 / AZ + STR = ZR * RAZ + STI = -ZI * RAZ + CKR = STR * AT * RAZ + CKI = STI * AT * RAZ + RZR = (STR + STR) * RAZ + RZI = (STI + STI) * RAZ + P1R = ZEROR + P1I = ZEROI + P2R = CONER + P2I = CONEI + ACK = (AT + 1.0e0) * RAZ + RHO = ACK + math.Sqrt(ACK*ACK-1.0e0) + RHO2 = RHO * RHO + TST = (RHO2 + RHO2) / ((RHO2 - 1.0e0) * (RHO - 1.0e0)) + TST = TST / TOL + + // COMPUTE RELATIVE TRUNCATION ERROR INDEX FOR SERIES. + //fmt.Println("before loop", P2R, P2I, CKR, CKI, RZR, RZI, TST, AK) + AK = AT + for I = 1; I <= 80; I++ { + PTR = P2R + PTI = P2I + P2R = P1R - (CKR*PTR - CKI*PTI) + P2I = P1I - (CKI*PTR + CKR*PTI) + P1R = PTR + P1I = PTI + CKR = CKR + RZR + CKI = CKI + RZI + AP = cmplx.Abs(complex(P2R, P2I)) + if AP > TST*AK*AK { + goto Twenty + } + AK = AK + 1.0e0 + } + goto OneTen +Twenty: + I = I + 1 + K = 0 + if INU < IAZ { + goto Forty + } + // COMPUTE RELATIVE TRUNCATION ERROR FOR RATIOS. + P1R = ZEROR + P1I = ZEROI + P2R = CONER + P2I = CONEI + AT = float64(float32(INU)) + 1.0e0 + STR = ZR * RAZ + STI = -ZI * RAZ + CKR = STR * AT * RAZ + CKI = STI * AT * RAZ + ACK = AT * RAZ + TST = math.Sqrt(ACK / TOL) + ITIME = 1 + for K = 1; K <= 80; K++ { + PTR = P2R + PTI = P2I + P2R = P1R - (CKR*PTR - CKI*PTI) + P2I = P1I - (CKR*PTI + CKI*PTR) + P1R = PTR + P1I = PTI + CKR = CKR + RZR + CKI = CKI + RZI + AP = cmplx.Abs(complex(P2R, P2I)) + if AP < TST { + continue + } + if ITIME == 2 { + goto Forty + } + ACK = cmplx.Abs(complex(CKR, CKI)) + FLAM = ACK + math.Sqrt(ACK*ACK-1.0e0) + FKAP = AP / cmplx.Abs(complex(P1R, P1I)) + RHO = math.Min(FLAM, FKAP) + TST = TST * math.Sqrt(RHO/(RHO*RHO-1.0e0)) + ITIME = 2 + } + goto OneTen +Forty: + // BACKWARD RECURRENCE AND SUM NORMALIZING RELATION. + K = K + 1 + KK = max(I+IAZ, K+INU) + FKK = float64(float32(KK)) + P1R = ZEROR + P1I = ZEROI + + // SCALE P2 AND SUM BY SCLE. 
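+	// SCLE = dmach[1]/TOL starts the recurrence just above the underflow
+	// limit; the factor cancels when CNORM divides through by SUM+P2 below.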
+ P2R = SCLE + P2I = ZEROI + FNF = FNU - float64(float32(IFNU)) + TFNF = FNF + FNF + BK = dgamln(FKK+TFNF+1.0e0, IDUM) - dgamln(FKK+1.0e0, IDUM) - dgamln(TFNF+1.0e0, IDUM) + BK = math.Exp(BK) + SUMR = ZEROR + SUMI = ZEROI + KM = KK - INU + for I = 1; I <= KM; I++ { + PTR = P2R + PTI = P2I + P2R = P1R + (FKK+FNF)*(RZR*PTR-RZI*PTI) + P2I = P1I + (FKK+FNF)*(RZI*PTR+RZR*PTI) + P1R = PTR + P1I = PTI + AK = 1.0e0 - TFNF/(FKK+TFNF) + ACK = BK * AK + SUMR = SUMR + (ACK+BK)*P1R + SUMI = SUMI + (ACK+BK)*P1I + BK = ACK + FKK = FKK - 1.0e0 + } + YR[N] = P2R + YI[N] = P2I + if N == 1 { + goto Seventy + } + for I = 2; I <= N; I++ { + PTR = P2R + PTI = P2I + P2R = P1R + (FKK+FNF)*(RZR*PTR-RZI*PTI) + P2I = P1I + (FKK+FNF)*(RZI*PTR+RZR*PTI) + P1R = PTR + P1I = PTI + AK = 1.0e0 - TFNF/(FKK+TFNF) + ACK = BK * AK + SUMR = SUMR + (ACK+BK)*P1R + SUMI = SUMI + (ACK+BK)*P1I + BK = ACK + FKK = FKK - 1.0e0 + M = N - I + 1 + YR[M] = P2R + YI[M] = P2I + } +Seventy: + if IFNU <= 0 { + goto Ninety + } + for I = 1; I <= IFNU; I++ { + PTR = P2R + PTI = P2I + P2R = P1R + (FKK+FNF)*(RZR*PTR-RZI*PTI) + P2I = P1I + (FKK+FNF)*(RZR*PTI+RZI*PTR) + P1R = PTR + P1I = PTI + AK = 1.0e0 - TFNF/(FKK+TFNF) + ACK = BK * AK + SUMR = SUMR + (ACK+BK)*P1R + SUMI = SUMI + (ACK+BK)*P1I + BK = ACK + FKK = FKK - 1.0e0 + } +Ninety: + PTR = ZR + PTI = ZI + if KODE == 2 { + PTR = ZEROR + } + tmp = cmplx.Log(complex(RZR, RZI)) + STR = real(tmp) + STI = imag(tmp) + P1R = -FNF*STR + PTR + P1I = -FNF*STI + PTI + AP = dgamln(1.0e0+FNF, IDUM) + PTR = P1R - AP + PTI = P1I + + // THE DIVISION CEXP(PT)/(SUM+P2) IS ALTERED TO AVOID OVERFLOW + // IN THE DENOMINATOR BY SQUARING LARGE QUANTITIES. + P2R = P2R + SUMR + P2I = P2I + SUMI + AP = cmplx.Abs(complex(P2R, P2I)) + P1R = 1.0e0 / AP + tmp = cmplx.Exp(complex(PTR, PTI)) + STR = real(tmp) + STI = imag(tmp) + CKR = STR * P1R + CKI = STI * P1R + PTR = P2R * P1R + PTI = -P2I * P1R + tmp = complex(CKR, CKI) * complex(PTR, PTI) + CNORMR = real(tmp) + CNORMI = imag(tmp) + for I = 1; I <= N; I++ { + STR = YR[I]*CNORMR - YI[I]*CNORMI + YI[I] = YR[I]*CNORMI + YI[I]*CNORMR + YR[I] = STR + } + return ZR, ZI, FNU, KODE, N, YR, YI, NZ, TOL +OneTen: + NZ = -2 + return ZR, ZI, FNU, KODE, N, YR, YI, NZ, TOL +} + +// Zseri computes the I bessel function for real(z) >= 0 by means of the power +// series for large |z| in the region |z| <= 2*sqrt(fnu+1). +// +// nz = 0 is a normal return. nz > 0 means that the last nz components were set +// to zero due to underflow. nz < 0 means that underflow occurred, but the +// condition |z| <= 2*sqrt(fnu+1) was violated and the computation must be +// completed in another routine with n -= abs(nz). +func Zseri(z complex128, fnu float64, kode, n int, y []complex128, tol, elim, alim float64) (nz int) { + // TODO(btracey): The original fortran line is "ARM = 1.0D+3*D1MACH(1)". Evidently, in Fortran + // this is interpreted as one to the power of +3*D1MACH(1). While it is possible + // this was intentional, it seems unlikely. + arm := 1000 * dmach[1] + az := cmplx.Abs(z) + if az < arm { + for i := 0; i < n; i++ { + y[i] = 0 + } + if fnu == 0 { + y[0] = 1 + n-- + } + if az == 0 { + return 0 + } + return n + } + hz := 0.5 * z + var cz complex128 + var acz float64 + if az > math.Sqrt(arm) { + cz = hz * hz + acz = cmplx.Abs(cz) + } + NN := n + ck := cmplx.Log(hz) + var ak1 complex128 + for { + dfnu := fnu + float64(NN-1) + // Underflow test. 
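+		// real(ak1) approximates the log of the leading series term,
+		// log((z/2)**dfnu / Γ(dfnu+1)) (minus real(z) on kode == 2); an
+		// order whose leading term would underflow past -elim is dropped
+		// and counted in nz.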
+		ak1 = ck * complex(dfnu, 0)
+		ak := dgamln(dfnu+1, 0)
+		ak1 -= complex(ak, 0)
+		if kode == 2 {
+			ak1 -= complex(real(z), 0)
+		}
+		if real(ak1) > -elim {
+			break
+		}
+		nz++
+		y[NN-1] = 0
+		if acz > dfnu {
+			// Return with nz < 0 if abs(z*z/4) > fnu+n-nz-1; complete the
+			// calculation in cbinu with n = n - abs(nz).
+			nz *= -1
+			return nz
+		}
+		NN--
+		if NN == 0 {
+			return nz
+		}
+	}
+	crscr := 1.0
+	var flag int
+	var scale float64
+	aa := real(ak1)
+	if aa <= -alim {
+		flag = 1
+		crscr = tol
+		scale = arm / tol
+		aa -= math.Log(tol)
+	}
+	var w [2]complex128
+	for {
+		coef := cmplx.Exp(complex(aa, imag(ak1)))
+		atol := tol * acz / (fnu + float64(NN))
+		for i := 0; i < min(2, NN); i++ {
+			FNUP := fnu + float64(NN-i)
+			s1 := 1 + 0i
+			if acz >= tol*FNUP {
+				ak2 := 1 + 0i
+				ak := FNUP + 2
+				S := FNUP
+				scl := 2.0
+				first := true
+				for first || scl > atol {
+					ak2 = ak2 * cz * complex(1/S, 0)
+					scl *= acz / S
+					s1 += ak2
+					S += ak
+					ak += 2
+					first = false
+				}
+			}
+			s2 := s1 * coef
+			w[i] = s2
+			if flag == 1 {
+				if Zuchk(s2, scale, tol) != 0 {
+					var full bool
+					var dfnu float64
+					// This code is similar to the code that exists above. The
+					// code copying is here because the original Fortran used
+					// a goto to solve the loop-and-a-half problem. Removing the
+					// goto makes the behavior of the function and variable scoping
+					// much clearer, but requires copying this code due to Go's
+					// goto rules.
+					for {
+						if full {
+							dfnu = fnu + float64(NN-1)
+							// Underflow test.
+							ak1 = ck * complex(dfnu, 0)
+							ak1 -= complex(dgamln(dfnu+1, 0), 0)
+							if kode == 2 {
+								ak1 -= complex(real(z), 0)
+							}
+							if real(ak1) > -elim {
+								break
+							}
+						} else {
+							full = true
+						}
+						nz++
+						y[NN-1] = 0
+						if acz > dfnu {
+							// Return with nz < 0 if abs(z*z/4) > fnu+n-nz-1;
+							// complete the calculation in cbinu with
+							// n = n - abs(nz).
+							nz *= -1
+							return nz
+						}
+						NN--
+						if NN == 0 {
+							return nz
+						}
+					}
+					continue
+				}
+			}
+			y[NN-i-1] = s2 * complex(crscr, 0)
+			coef /= hz
+			coef *= complex(FNUP-1, 0)
+		}
+		break
+	}
+	if NN <= 2 {
+		return nz
+	}
+	rz := complex(2*real(z)/(az*az), -2*imag(z)/(az*az))
+	if flag == 0 {
+		for i := NN - 3; i >= 0; i-- {
+			y[i] = complex(float64(i+1)+fnu, 0)*rz*y[i+1] + y[i+2]
+		}
+		return nz
+	}
+
+	// exp(-alim) = exp(-elim)/tol is approximately one digit of precision
+	// above the underflow limit, which equals scale = 1e3 * dmach[1] / tol.
+	s1 := w[0]
+	s2 := w[1]
+	for K := NN - 3; K >= 0; K-- {
+		s1, s2 = s2, s1+complex(float64(K+1)+fnu, 0)*(rz*s2)
+		ck := s2 * complex(crscr, 0)
+		y[K] = ck
+		if cmplx.Abs(ck) > scale {
+			for ; K >= 0; K-- {
+				y[K] = complex(float64(K+1)+fnu, 0)*rz*y[K+1] + y[K+2]
+			}
+			return nz
+		}
+	}
+	return nz
+}
+
+// Zs1s2 tests for a possible underflow resulting from the addition of the I and
+// K functions in the analytic continuation formula where s1 == K function and
+// s2 == I function.
+//
+// When kode == 1, the I and K functions are different orders of magnitude.
+//
+// When kode == 2, they may both be of the same order of magnitude, but the maximum
+// must be at least one precision above the underflow limit.
+func Zs1s2(zr, s1, s2 complex128, scale, lim float64, iuf int) (s1o, s2o complex128, nz, iufo int) {
+	if s1 == 0 || math.Log(cmplx.Abs(s1))-2*real(zr) < -lim {
+		if cmplx.Abs(s2) > scale {
+			return 0, s2, 0, iuf
+		}
+		return 0, 0, 1, 0
+	}
+	// TODO(btracey): Written like this for numerical rounding reasons.
+	// Fix once we're sure other changes are correct.
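+	// Computing s1*exp(-2*zr) through the log form below loses less accuracy
+	// near the underflow limit than a direct multiply would, which is the
+	// rounding concern the TODO above refers to.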
+ s1 = cmplx.Exp(cmplx.Log(s1) - zr - zr) + if math.Max(cmplx.Abs(s1), cmplx.Abs(s2)) > scale { + return s1, s2, 0, iuf + 1 + } + return 0, 0, 1, 0 +} + +func dgamln(z float64, ierr int) float64 { + //return amoslib.DgamlnFort(z) + // Go implementation. + if z < 0 { + return 0 + } + a2, _ := math.Lgamma(z) + return a2 +} diff --git a/vendor/gonum.org/v1/gonum/mathext/internal/amos/doc.go b/vendor/gonum.org/v1/gonum/mathext/internal/amos/doc.go new file mode 100644 index 0000000000..32f96c1474 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/internal/amos/doc.go @@ -0,0 +1,6 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package amos implements functions originally in the Netlib code by Donald Amos. +package amos // import "gonum.org/v1/gonum/mathext/internal/amos" diff --git a/vendor/gonum.org/v1/gonum/mathext/internal/amos/staticcheck.conf b/vendor/gonum.org/v1/gonum/mathext/internal/amos/staticcheck.conf new file mode 100644 index 0000000000..e7e254ff3f --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/internal/amos/staticcheck.conf @@ -0,0 +1 @@ +checks = [] diff --git a/vendor/gonum.org/v1/gonum/mathext/internal/cephes/cephes.go b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/cephes.go new file mode 100644 index 0000000000..20cac067ea --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/cephes.go @@ -0,0 +1,28 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cephes + +import "math" + +/* +Additional copyright information: + +Code in this package is adapted from the Cephes library (http://www.netlib.org/cephes/). +There is no explicit licence on Netlib, but the author has agreed to a BSD release. +See https://github.com/deepmind/torch-cephes/blob/master/LICENSE.txt and +https://lists.debian.org/debian-legal/2004/12/msg00295.html +*/ + +const ( + paramOutOfBounds = "cephes: parameter out of bounds" + errParamFunctionSingularity = "cephes: function singularity" +) + +const ( + machEp = 1.0 / (1 << 53) + maxLog = 1024 * math.Ln2 + minLog = -1075 * math.Ln2 + maxIter = 2000 +) diff --git a/vendor/gonum.org/v1/gonum/mathext/internal/cephes/doc.go b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/doc.go new file mode 100644 index 0000000000..086c46948b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/doc.go @@ -0,0 +1,6 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package cephes implements functions originally in the Netlib code by Stephen Mosher. +package cephes // import "gonum.org/v1/gonum/mathext/internal/cephes" diff --git a/vendor/gonum.org/v1/gonum/mathext/internal/cephes/igam.go b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/igam.go new file mode 100644 index 0000000000..4bc0bd1dcb --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/igam.go @@ -0,0 +1,320 @@ +// Derived from SciPy's special/cephes/igam.c and special/cephes/igam.h +// https://github.com/scipy/scipy/blob/master/scipy/special/cephes/igam.c +// https://github.com/scipy/scipy/blob/master/scipy/special/cephes/igam.h +// Made freely available by Stephen L. Moshier without support or guarantee. 
+ +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// Copyright ©1985, ©1987 by Stephen L. Moshier +// Portions Copyright ©2016 The Gonum Authors. All rights reserved. + +package cephes + +import "math" + +const ( + igamDimK = 25 + igamDimN = 25 + igam = 1 + igamC = 0 + igamSmall = 20 + igamLarge = 200 + igamSmallRatio = 0.3 + igamLargeRatio = 4.5 +) + +var igamCoefs = [igamDimK][igamDimN]float64{ + {-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2, 1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4, 3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6, 8.296711340953086e-7, -1.7665952736826079e-7, 6.7078535434014986e-9, 1.0261809784240308e-8, -4.3820360184533532e-9, 9.1476995822367902e-10, -2.551419399494625e-11, -5.8307721325504251e-11, 2.4361948020667416e-11, -5.0276692801141756e-12, 1.1004392031956135e-13, 3.3717632624009854e-13, -1.3923887224181621e-13, 2.8534893807047443e-14, -5.1391118342425726e-16, -1.9752288294349443e-15}, + {-1.8518518518518519e-3, -3.4722222222222222e-3, 2.6455026455026455e-3, -9.9022633744855967e-4, 2.0576131687242798e-4, -4.0187757201646091e-7, -1.8098550334489978e-5, 7.6491609160811101e-6, -1.6120900894563446e-6, 4.6471278028074343e-9, 1.378633446915721e-7, -5.752545603517705e-8, 1.1951628599778147e-8, -1.7543241719747648e-11, -1.0091543710600413e-9, 4.1627929918425826e-10, -8.5639070264929806e-11, 6.0672151016047586e-14, 7.1624989648114854e-12, -2.9331866437714371e-12, 5.9966963656836887e-13, -2.1671786527323314e-16, -4.9783399723692616e-14, 2.0291628823713425e-14, -4.13125571381061e-15}, + {4.1335978835978836e-3, -2.6813271604938272e-3, 7.7160493827160494e-4, 2.0093878600823045e-6, -1.0736653226365161e-4, 5.2923448829120125e-5, -1.2760635188618728e-5, 3.4235787340961381e-8, 1.3721957309062933e-6, -6.298992138380055e-7, 1.4280614206064242e-7, -2.0477098421990866e-10, -1.4092529910867521e-8, 6.228974084922022e-9, -1.3670488396617113e-9, 9.4283561590146782e-13, 1.2872252400089318e-10, -5.5645956134363321e-11, 1.1975935546366981e-11, -4.1689782251838635e-15, -1.0940640427884594e-12, 4.6622399463901357e-13, -9.905105763906906e-14, 1.8931876768373515e-17, 8.8592218725911273e-15}, + {6.4943415637860082e-4, 2.2947209362139918e-4, -4.6918949439525571e-4, 2.6772063206283885e-4, -7.5618016718839764e-5, -2.3965051138672967e-7, 1.1082654115347302e-5, -5.6749528269915966e-6, 1.4230900732435884e-6, -2.7861080291528142e-11, -1.6958404091930277e-7, 8.0994649053880824e-8, -1.9111168485973654e-8, 2.3928620439808118e-12, 2.0620131815488798e-9, -9.4604966618551322e-10, 2.1541049775774908e-10, -1.388823336813903e-14, -2.1894761681963939e-11, 9.7909989511716851e-12, -2.1782191880180962e-12, 6.2088195734079014e-17, 2.126978363279737e-13, -9.3446887915174333e-14, 2.0453671226782849e-14}, + {-8.618882909167117e-4, 7.8403922172006663e-4, -2.9907248030319018e-4, -1.4638452578843418e-6, 6.6414982154651222e-5, -3.9683650471794347e-5, 1.1375726970678419e-5, 2.5074972262375328e-10, -1.6954149536558306e-6, 8.9075075322053097e-7, -2.2929348340008049e-7, 2.956794137544049e-11, 2.8865829742708784e-8, -1.4189739437803219e-8, 3.4463580499464897e-9, -2.3024517174528067e-13, -3.9409233028046405e-10, 1.8602338968504502e-10, -4.356323005056618e-11, 1.2786001016296231e-15, 4.6792750266579195e-12, -2.1492464706134829e-12, 4.9088156148096522e-13, -6.3385914848915603e-18, -5.0453320690800944e-14}, + {-3.3679855336635815e-4, -6.9728137583658578e-5, 2.7727532449593921e-4, -1.9932570516188848e-4, 
6.7977804779372078e-5, 1.419062920643967e-7, -1.3594048189768693e-5, 8.0184702563342015e-6, -2.2914811765080952e-6, -3.252473551298454e-10, 3.4652846491085265e-7, -1.8447187191171343e-7, 4.8240967037894181e-8, -1.7989466721743515e-14, -6.3061945000135234e-9, 3.1624176287745679e-9, -7.8409242536974293e-10, 5.1926791652540407e-15, 9.3589442423067836e-11, -4.5134262161632782e-11, 1.0799129993116827e-11, -3.661886712685252e-17, -1.210902069055155e-12, 5.6807435849905643e-13, -1.3249659916340829e-13}, + {5.3130793646399222e-4, -5.9216643735369388e-4, 2.7087820967180448e-4, 7.9023532326603279e-7, -8.1539693675619688e-5, 5.6116827531062497e-5, -1.8329116582843376e-5, -3.0796134506033048e-9, 3.4651553688036091e-6, -2.0291327396058604e-6, 5.7887928631490037e-7, 2.338630673826657e-13, -8.8286007463304835e-8, 4.7435958880408128e-8, -1.2545415020710382e-8, 8.6496488580102925e-14, 1.6846058979264063e-9, -8.5754928235775947e-10, 2.1598224929232125e-10, -7.6132305204761539e-16, -2.6639822008536144e-11, 1.3065700536611057e-11, -3.1799163902367977e-12, 4.7109761213674315e-18, 3.6902800842763467e-13}, + {3.4436760689237767e-4, 5.1717909082605922e-5, -3.3493161081142236e-4, 2.812695154763237e-4, -1.0976582244684731e-4, -1.2741009095484485e-7, 2.7744451511563644e-5, -1.8263488805711333e-5, 5.7876949497350524e-6, 4.9387589339362704e-10, -1.0595367014026043e-6, 6.1667143761104075e-7, -1.7562973359060462e-7, -1.2974473287015439e-12, 2.695423606288966e-8, -1.4578352908731271e-8, 3.887645959386175e-9, -3.8810022510194121e-17, -5.3279941738772867e-10, 2.7437977643314845e-10, -6.9957960920705679e-11, 2.5899863874868481e-17, 8.8566890996696381e-12, -4.403168815871311e-12, 1.0865561947091654e-12}, + {-6.5262391859530942e-4, 8.3949872067208728e-4, -4.3829709854172101e-4, -6.969091458420552e-7, 1.6644846642067548e-4, -1.2783517679769219e-4, 4.6299532636913043e-5, 4.5579098679227077e-9, -1.0595271125805195e-5, 6.7833429048651666e-6, -2.1075476666258804e-6, -1.7213731432817145e-11, 3.7735877416110979e-7, -2.1867506700122867e-7, 6.2202288040189269e-8, 6.5977038267330006e-16, -9.5903864974256858e-9, 5.2132144922808078e-9, -1.3991589583935709e-9, 5.382058999060575e-16, 1.9484714275467745e-10, -1.0127287556389682e-10, 2.6077347197254926e-11, -5.0904186999932993e-18, -3.3721464474854592e-12}, + {-5.9676129019274625e-4, -7.2048954160200106e-5, 6.7823088376673284e-4, -6.4014752602627585e-4, 2.7750107634328704e-4, 1.8197008380465151e-7, -8.4795071170685032e-5, 6.105192082501531e-5, -2.1073920183404862e-5, -8.8585890141255994e-10, 4.5284535953805377e-6, -2.8427815022504408e-6, 8.7082341778646412e-7, 3.6886101871706965e-12, -1.5344695190702061e-7, 8.862466778790695e-8, -2.5184812301826817e-8, -1.0225912098215092e-14, 3.8969470758154777e-9, -2.1267304792235635e-9, 5.7370135528051385e-10, -1.887749850169741e-19, -8.0931538694657866e-11, 4.2382723283449199e-11, -1.1002224534207726e-11}, + {1.3324454494800656e-3, -1.9144384985654775e-3, 1.1089369134596637e-3, 9.932404122642299e-7, -5.0874501293093199e-4, 4.2735056665392884e-4, -1.6858853767910799e-4, -8.1301893922784998e-9, 4.5284402370562147e-5, -3.127053674781734e-5, 1.044986828530338e-5, 4.8435226265680926e-11, -2.1482565873456258e-6, 1.329369701097492e-6, -4.0295693092101029e-7, -1.7567877666323291e-13, 7.0145043163668257e-8, -4.040787734999483e-8, 1.1474026743371963e-8, 3.9642746853563325e-18, -1.7804938269892714e-9, 9.7480262548731646e-10, -2.6405338676507616e-10, 5.794875163403742e-18, 3.7647749553543836e-11}, + {1.579727660730835e-3, 1.6251626278391582e-4, 
-2.0633421035543276e-3, 2.1389686185689098e-3, -1.0108559391263003e-3, -3.9912705529919201e-7, 3.6235025084764691e-4, -2.8143901463712154e-4, 1.0449513336495887e-4, 2.1211418491830297e-9, -2.5779417251947842e-5, 1.7281818956040463e-5, -5.6413773872904282e-6, -1.1024320105776174e-11, 1.1223224418895175e-6, -6.8693396379526735e-7, 2.0653236975414887e-7, 4.6714772409838506e-14, -3.5609886164949055e-8, 2.0470855345905963e-8, -5.8091738633283358e-9, -1.332821287582869e-16, 9.0354604391335133e-10, -4.9598782517330834e-10, 1.3481607129399749e-10}, + {-4.0725121195140166e-3, 6.4033628338080698e-3, -4.0410161081676618e-3, -2.183732802866233e-6, 2.1740441801254639e-3, -1.9700440518418892e-3, 8.3595469747962458e-4, 1.9445447567109655e-8, -2.5779387120421696e-4, 1.9009987368139304e-4, -6.7696499937438965e-5, -1.4440629666426572e-10, 1.5712512518742269e-5, -1.0304008744776893e-5, 3.304517767401387e-6, 7.9829760242325709e-13, -6.4097794149313004e-7, 3.8894624761300056e-7, -1.1618347644948869e-7, -2.816808630596451e-15, 1.9878012911297093e-8, -1.1407719956357511e-8, 3.2355857064185555e-9, 4.1759468293455945e-20, -5.0423112718105824e-10}, + {-5.9475779383993003e-3, -5.4016476789260452e-4, 8.7910413550767898e-3, -9.8576315587856125e-3, 5.0134695031021538e-3, 1.2807521786221875e-6, -2.0626019342754683e-3, 1.7109128573523058e-3, -6.7695312714133799e-4, -6.9011545676562133e-9, 1.8855128143995902e-4, -1.3395215663491969e-4, 4.6263183033528039e-5, 4.0034230613321351e-11, -1.0255652921494033e-5, 6.612086372797651e-6, -2.0913022027253008e-6, -2.0951775649603837e-13, 3.9756029041993247e-7, -2.3956211978815887e-7, 7.1182883382145864e-8, 8.925574873053455e-16, -1.2101547235064676e-8, 6.9350618248334386e-9, -1.9661464453856102e-9}, + {1.7402027787522711e-2, -2.9527880945699121e-2, 2.0045875571402799e-2, 7.0289515966903407e-6, -1.2375421071343148e-2, 1.1976293444235254e-2, -5.4156038466518525e-3, -6.3290893396418616e-8, 1.8855118129005065e-3, -1.473473274825001e-3, 5.5515810097708387e-4, 5.2406834412550662e-10, -1.4357913535784836e-4, 9.9181293224943297e-5, -3.3460834749478311e-5, -3.5755837291098993e-12, 7.1560851960630076e-6, -4.5516802628155526e-6, 1.4236576649271475e-6, 1.8803149082089664e-14, -2.6623403898929211e-7, 1.5950642189595716e-7, -4.7187514673841102e-8, -6.5107872958755177e-17, 7.9795091026746235e-9}, + {3.0249124160905891e-2, 2.4817436002649977e-3, -4.9939134373457022e-2, 5.9915643009307869e-2, -3.2483207601623391e-2, -5.7212968652103441e-6, 1.5085251778569354e-2, -1.3261324005088445e-2, 5.5515262632426148e-3, 3.0263182257030016e-8, -1.7229548406756723e-3, 1.2893570099929637e-3, -4.6845138348319876e-4, -1.830259937893045e-10, 1.1449739014822654e-4, -7.7378565221244477e-5, 2.5625836246985201e-5, 1.0766165333192814e-12, -5.3246809282422621e-6, 3.349634863064464e-6, -1.0381253128684018e-6, -5.608909920621128e-15, 1.9150821930676591e-7, -1.1418365800203486e-7, 3.3654425209171788e-8}, + {-9.9051020880159045e-2, 1.7954011706123486e-1, -1.2989606383463778e-1, -3.1478872752284357e-5, 9.0510635276848131e-2, -9.2828824411184397e-2, 4.4412112839877808e-2, 2.7779236316835888e-7, -1.7229543805449697e-2, 1.4182925050891573e-2, -5.6214161633747336e-3, -2.39598509186381e-9, 1.6029634366079908e-3, -1.1606784674435773e-3, 4.1001337768153873e-4, 1.8365800754090661e-11, -9.5844256563655903e-5, 6.3643062337764708e-5, -2.076250624489065e-5, -1.1806020912804483e-13, 4.2131808239120649e-6, -2.6262241337012467e-6, 8.0770620494930662e-7, 6.0125912123632725e-16, -1.4729737374018841e-7}, + {-1.9994542198219728e-1, 
-1.5056113040026424e-2, 3.6470239469348489e-1, -4.6435192311733545e-1, 2.6640934719197893e-1, 3.4038266027147191e-5, -1.3784338709329624e-1, 1.276467178337056e-1, -5.6213828755200985e-2, -1.753150885483011e-7, 1.9235592956768113e-2, -1.5088821281095315e-2, 5.7401854451350123e-3, 1.0622382710310225e-9, -1.5335082692563998e-3, 1.0819320643228214e-3, -3.7372510193945659e-4, -6.6170909729031985e-12, 8.4263617380909628e-5, -5.5150706827483479e-5, 1.7769536448348069e-5, 3.8827923210205533e-14, -3.53513697488768e-6, 2.1865832130045269e-6, -6.6812849447625594e-7}, + {7.2438608504029431e-1, -1.3918010932653375, 1.0654143352413968, 1.876173868950258e-4, -8.2705501176152696e-1, 8.9352433347828414e-1, -4.4971003995291339e-1, -1.6107401567546652e-6, 1.9235590165271091e-1, -1.6597702160042609e-1, 6.8882222681814333e-2, 1.3910091724608687e-8, -2.146911561508663e-2, 1.6228980898865892e-2, -5.9796016172584256e-3, -1.1287469112826745e-10, 1.5167451119784857e-3, -1.0478634293553899e-3, 3.5539072889126421e-4, 8.1704322111801517e-13, -7.7773013442452395e-5, 5.0291413897007722e-5, -1.6035083867000518e-5, 1.2469354315487605e-14, 3.1369106244517615e-6}, + {1.6668949727276811, 1.165462765994632e-1, -3.3288393225018906, 4.4692325482864037, -2.6977693045875807, -2.600667859891061e-4, 1.5389017615694539, -1.4937962361134612, 6.8881964633233148e-1, 1.3077482004552385e-6, -2.5762963325596288e-1, 2.1097676102125449e-1, -8.3714408359219882e-2, -7.7920428881354753e-9, 2.4267923064833599e-2, -1.7813678334552311e-2, 6.3970330388900056e-3, 4.9430807090480523e-11, -1.5554602758465635e-3, 1.0561196919903214e-3, -3.5277184460472902e-4, 9.3002334645022459e-14, 7.5285855026557172e-5, -4.8186515569156351e-5, 1.5227271505597605e-5}, + {-6.6188298861372935, 1.3397985455142589e+1, -1.0789350606845146e+1, -1.4352254537875018e-3, 9.2333694596189809, -1.0456552819547769e+1, 5.5105526029033471, 1.2024439690716742e-5, -2.5762961164755816, 2.3207442745387179, -1.0045728797216284, -1.0207833290021914e-7, 3.3975092171169466e-1, -2.6720517450757468e-1, 1.0235252851562706e-1, 8.4329730484871625e-10, -2.7998284958442595e-2, 2.0066274144976813e-2, -7.0554368915086242e-3, 1.9402238183698188e-12, 1.6562888105449611e-3, -1.1082898580743683e-3, 3.654545161310169e-4, -5.1290032026971794e-11, -7.6340103696869031e-5}, + {-1.7112706061976095e+1, -1.1208044642899116, 3.7131966511885444e+1, -5.2298271025348962e+1, 3.3058589696624618e+1, 2.4791298976200222e-3, -2.061089403411526e+1, 2.088672775145582e+1, -1.0045703956517752e+1, -1.2238783449063012e-5, 4.0770134274221141, -3.473667358470195, 1.4329352617312006, 7.1359914411879712e-8, -4.4797257159115612e-1, 3.4112666080644461e-1, -1.2699786326594923e-1, -2.8953677269081528e-10, 3.3125776278259863e-2, -2.3274087021036101e-2, 8.0399993503648882e-3, -1.177805216235265e-9, -1.8321624891071668e-3, 1.2108282933588665e-3, -3.9479941246822517e-4}, + {7.389033153567425e+1, -1.5680141270402273e+2, 1.322177542759164e+2, 1.3692876877324546e-2, -1.2366496885920151e+2, 1.4620689391062729e+2, -8.0365587724865346e+1, -1.1259851148881298e-4, 4.0770132196179938e+1, -3.8210340013273034e+1, 1.719522294277362e+1, 9.3519707955168356e-7, -6.2716159907747034, 5.1168999071852637, -2.0319658112299095, -4.9507215582761543e-9, 5.9626397294332597e-1, -4.4220765337238094e-1, 1.6079998700166273e-1, -2.4733786203223402e-8, -4.0307574759979762e-2, 2.7849050747097869e-2, -9.4751858992054221e-3, 6.419922235909132e-6, 2.1250180774699461e-3}, + {2.1216837098382522e+2, 1.3107863022633868e+1, -4.9698285932871748e+2, 7.3121595266969204e+2, 
-4.8213821720890847e+2, -2.8817248692894889e-2, 3.2616720302947102e+2, -3.4389340280087117e+2, 1.7195193870816232e+2, 1.4038077378096158e-4, -7.52594195897599e+1, 6.651969984520934e+1, -2.8447519748152462e+1, -7.613702615875391e-7, 9.5402237105304373, -7.5175301113311376, 2.8943997568871961, -4.6612194999538201e-7, -8.0615149598794088e-1, 5.8483006570631029e-1, -2.0845408972964956e-1, 1.4765818959305817e-4, 5.1000433863753019e-2, -3.3066252141883665e-2, 1.5109265210467774e-2}, + {-9.8959643098322368e+2, 2.1925555360905233e+3, -1.9283586782723356e+3, -1.5925738122215253e-1, 1.9569985945919857e+3, -2.4072514765081556e+3, 1.3756149959336496e+3, 1.2920735237496668e-3, -7.525941715948055e+2, 7.3171668742208716e+2, -3.4137023466220065e+2, -9.9857390260608043e-6, 1.3356313181291573e+2, -1.1276295161252794e+2, 4.6310396098204458e+1, -7.9237387133614756e-6, -1.4510726927018646e+1, 1.1111771248100563e+1, -4.1690817945270892, 3.1008219800117808e-3, 1.1220095449981468, -7.6052379926149916e-1, 3.6262236505085254e-1, 2.216867741940747e-1, 4.8683443692930507e-1}, +} + +// Igam computes the incomplete Gamma integral. +// +// Igam(a,x) = (1/ Γ(a)) \int_0^x e^{-t} t^{a-1} dt +// +// The input argument a must be positive and x must be non-negative or Igam +// will panic. +func Igam(a, x float64) float64 { + // The integral is evaluated by either a power series or continued fraction + // expansion, depending on the relative values of a and x. + // Sources: + // [1] "The Digital Library of Mathematical Functions", dlmf.nist.gov + // [2] Maddock et. al., "Incomplete Gamma Functions", + // http://www.boost.org/doc/libs/1_61_0/libs/math/doc/html/math_toolkit/sf_gamma/igamma.html + + // Check zero integration limit first + if x == 0 { + return 0 + } + + if x < 0 || a <= 0 { + panic(paramOutOfBounds) + } + + // Asymptotic regime where a ~ x; see [2]. + absxmaA := math.Abs(x-a) / a + if (igamSmall < a && a < igamLarge && absxmaA < igamSmallRatio) || + (igamLarge < a && absxmaA < igamLargeRatio/math.Sqrt(a)) { + return asymptoticSeries(a, x, igam) + } + + if x > 1 && x > a { + return 1 - IgamC(a, x) + } + + return igamSeries(a, x) +} + +// IgamC computes the complemented incomplete Gamma integral. +// +// IgamC(a,x) = 1 - Igam(a,x) +// = (1/ Γ(a)) \int_0^\infty e^{-t} t^{a-1} dt +// +// The input argument a must be positive and x must be non-negative or +// IgamC will panic. +func IgamC(a, x float64) float64 { + // The integral is evaluated by either a power series or continued fraction + // expansion, depending on the relative values of a and x. + // Sources: + // [1] "The Digital Library of Mathematical Functions", dlmf.nist.gov + // [2] Maddock et. al., "Incomplete Gamma Functions", + // http://www.boost.org/doc/libs/1_61_0/libs/math/doc/html/math_toolkit/sf_gamma/igamma.html + + switch { + case x < 0, a <= 0: + panic(paramOutOfBounds) + case x == 0: + return 1 + case math.IsInf(x, 0): + return 0 + } + + // Asymptotic regime where a ~ x; see [2]. + absxmaA := math.Abs(x-a) / a + if (igamSmall < a && a < igamLarge && absxmaA < igamSmallRatio) || + (igamLarge < a && absxmaA < igamLargeRatio/math.Sqrt(a)) { + return asymptoticSeries(a, x, igamC) + } + + // Everywhere else; see [2]. 
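+	// The branch structure below follows the region map in [2]: a continued
+	// fraction when x is comfortably larger than a, the cancellation-safe
+	// igamCSeries when x is small, and 1 - igamSeries(a, x) otherwise.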
+ if x > 1.1 { + if x < a { + return 1 - igamSeries(a, x) + } + return igamCContinuedFraction(a, x) + } else if x <= 0.5 { + if -0.4/math.Log(x) < a { + return 1 - igamSeries(a, x) + } + return igamCSeries(a, x) + } + + if x*1.1 < a { + return 1 - igamSeries(a, x) + } + return igamCSeries(a, x) +} + +// igamFac computes +// +// x^a * e^{-x} / Γ(a) +// +// corrected from (15) and (16) in [2] by replacing +// +// e^{x - a} +// +// with +// +// e^{a - x} +func igamFac(a, x float64) float64 { + if math.Abs(a-x) > 0.4*math.Abs(a) { + ax := a*math.Log(x) - x - lgam(a) + return math.Exp(ax) + } + + fac := a + lanczosG - 0.5 + res := math.Sqrt(fac/math.Exp(1)) / lanczosSumExpgScaled(a) + + if a < 200 && x < 200 { + res *= math.Exp(a-x) * math.Pow(x/fac, a) + } else { + num := x - a - lanczosG + 0.5 + res *= math.Exp(a*log1pmx(num/fac) + x*(0.5-lanczosG)/fac) + } + + return res +} + +// igamCContinuedFraction computes IgamC using DLMF 8.9.2. +func igamCContinuedFraction(a, x float64) float64 { + ax := igamFac(a, x) + if ax == 0 { + return 0 + } + + // Continued fraction + y := 1 - a + z := x + y + 1 + c := 0.0 + pkm2 := 1.0 + qkm2 := x + pkm1 := x + 1.0 + qkm1 := z * x + ans := pkm1 / qkm1 + + for i := 0; i < maxIter; i++ { + c += 1.0 + y += 1.0 + z += 2.0 + yc := y * c + pk := pkm1*z - pkm2*yc + qk := qkm1*z - qkm2*yc + var t float64 + if qk != 0 { + r := pk / qk + t = math.Abs((ans - r) / r) + ans = r + } else { + t = 1.0 + } + pkm2 = pkm1 + pkm1 = pk + qkm2 = qkm1 + qkm1 = qk + if math.Abs(pk) > big { + pkm2 *= biginv + pkm1 *= biginv + qkm2 *= biginv + qkm1 *= biginv + } + if t <= machEp { + break + } + } + + return ans * ax +} + +// igamSeries computes Igam using DLMF 8.11.4. +func igamSeries(a, x float64) float64 { + ax := igamFac(a, x) + if ax == 0 { + return 0 + } + + // Power series + r := a + c := 1.0 + ans := 1.0 + + for i := 0; i < maxIter; i++ { + r += 1.0 + c *= x / r + ans += c + if c <= machEp*ans { + break + } + } + + return ans * ax / a +} + +// igamCSeries computes IgamC using DLMF 8.7.3. This is related to the series +// in igamSeries but extra care is taken to avoid cancellation. +func igamCSeries(a, x float64) float64 { + fac := 1.0 + sum := 0.0 + + for n := 1; n < maxIter; n++ { + fac *= -x / float64(n) + term := fac / (a + float64(n)) + sum += term + if math.Abs(term) <= machEp*math.Abs(sum) { + break + } + } + + logx := math.Log(x) + term := -expm1(a*logx - lgam1p(a)) + return term - math.Exp(a*logx-lgam(a))*sum +} + +// asymptoticSeries computes Igam/IgamC using DLMF 8.12.3/8.12.4. 
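+//
+// The expansion variable is eta, with eta^2/2 = (x-a)/a - log(x/a) and the
+// sign of x - a; the leading term is erfc(±eta*sqrt(a/2))/2 and igamCoefs
+// supplies the correction series in powers of eta and 1/a.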
+func asymptoticSeries(a, x float64, fun int) float64 { + maxpow := 0 + lambda := x / a + sigma := (x - a) / a + absoldterm := math.MaxFloat64 + etapow := [igamDimN]float64{1} + sum := 0.0 + afac := 1.0 + + var sgn float64 + if fun == igam { + sgn = -1 + } else { + sgn = 1 + } + + var eta float64 + if lambda > 1 { + eta = math.Sqrt(-2 * log1pmx(sigma)) + } else if lambda < 1 { + eta = -math.Sqrt(-2 * log1pmx(sigma)) + } else { + eta = 0 + } + res := 0.5 * math.Erfc(sgn*eta*math.Sqrt(a/2)) + + for k := 0; k < igamDimK; k++ { + ck := igamCoefs[k][0] + for n := 1; n < igamDimN; n++ { + if n > maxpow { + etapow[n] = eta * etapow[n-1] + maxpow++ + } + ckterm := igamCoefs[k][n] * etapow[n] + ck += ckterm + if math.Abs(ckterm) < machEp*math.Abs(ck) { + break + } + } + term := ck * afac + absterm := math.Abs(term) + if absterm > absoldterm { + break + } + sum += term + if absterm < machEp*math.Abs(sum) { + break + } + absoldterm = absterm + afac /= a + } + res += sgn * math.Exp(-0.5*a*eta*eta) * sum / math.Sqrt(2*math.Pi*a) + + return res +} diff --git a/vendor/gonum.org/v1/gonum/mathext/internal/cephes/igami.go b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/igami.go new file mode 100644 index 0000000000..bb80b9cf83 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/igami.go @@ -0,0 +1,155 @@ +// Derived from SciPy's special/cephes/igami.c +// https://github.com/scipy/scipy/blob/master/scipy/special/cephes/igami.c +// Made freely available by Stephen L. Moshier without support or guarantee. + +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// Copyright ©1984, ©1987, ©1995 by Stephen L. Moshier +// Portions Copyright ©2017 The Gonum Authors. All rights reserved. + +package cephes + +import "math" + +// IgamI computes the inverse of the incomplete Gamma function. That is, it +// returns the x such that: +// +// IgamC(a, x) = p +// +// The input argument a must be positive and p must be between 0 and 1 +// inclusive or IgamI will panic. IgamI should return a positive number, but +// can return 0 even with non-zero y due to underflow. 
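+//
+// For example, IgamI(2, 0.5) is the median of a Gamma(2, 1) distribution,
+// approximately 1.678, since IgamC(2, x) = exp(-x)*(1+x) equals 0.5 there.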
+func IgamI(a, p float64) float64 { + // Bound the solution + x0 := math.MaxFloat64 + yl := 0.0 + x1 := 0.0 + yh := 1.0 + dithresh := 5.0 * machEp + + if p < 0 || p > 1 || a <= 0 { + panic(paramOutOfBounds) + } + + if p == 0 { + return math.Inf(1) + } + + if p == 1 { + return 0.0 + } + + // Starting with the approximate value + // x = a y^3 + // where + // y = 1 - d - ndtri(p) sqrt(d) + // and + // d = 1/9a + // the routine performs up to 10 Newton iterations to find the root of + // IgamC(a, x) - p = 0 + d := 1.0 / (9.0 * a) + y := 1.0 - d - Ndtri(p)*math.Sqrt(d) + x := a * y * y * y + + lgm := lgam(a) + + for i := 0; i < 10; i++ { + if x > x0 || x < x1 { + break + } + + y = IgamC(a, x) + + if y < yl || y > yh { + break + } + + if y < p { + x0 = x + yl = y + } else { + x1 = x + yh = y + } + + // Compute the derivative of the function at this point + d = (a-1)*math.Log(x) - x - lgm + if d < -maxLog { + break + } + d = -math.Exp(d) + + // Compute the step to the next approximation of x + d = (y - p) / d + if math.Abs(d/x) < machEp { + return x + } + x = x - d + } + + d = 0.0625 + if x0 == math.MaxFloat64 { + if x <= 0 { + x = 1 + } + for x0 == math.MaxFloat64 { + x = (1 + d) * x + y = IgamC(a, x) + if y < p { + x0 = x + yl = y + break + } + d = d + d + } + } + + d = 0.5 + dir := 0 + for i := 0; i < 400; i++ { + x = x1 + d*(x0-x1) + y = IgamC(a, x) + + lgm = (x0 - x1) / (x1 + x0) + if math.Abs(lgm) < dithresh { + break + } + + lgm = (y - p) / p + if math.Abs(lgm) < dithresh { + break + } + + if x <= 0 { + break + } + + if y >= p { + x1 = x + yh = y + if dir < 0 { + dir = 0 + d = 0.5 + } else if dir > 1 { + d = 0.5*d + 0.5 + } else { + d = (p - yl) / (yh - yl) + } + dir++ + } else { + x0 = x + yl = y + if dir > 0 { + dir = 0 + d = 0.5 + } else if dir < -1 { + d = 0.5 * d + } else { + d = (p - yl) / (yh - yl) + } + dir-- + } + } + + return x +} diff --git a/vendor/gonum.org/v1/gonum/mathext/internal/cephes/incbeta.go b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/incbeta.go new file mode 100644 index 0000000000..6a818154f6 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/incbeta.go @@ -0,0 +1,312 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* + * Cephes Math Library, Release 2.3: March, 1995 + * Copyright 1984, 1995 by Stephen L. Moshier + */ + +package cephes + +import ( + "math" + + "gonum.org/v1/gonum/mathext/internal/gonum" +) + +const ( + maxGam = 171.624376956302725 + big = 4.503599627370496e15 + biginv = 2.22044604925031308085e-16 +) + +// Incbet computes the regularized incomplete beta function. +func Incbet(aa, bb, xx float64) float64 { + if aa <= 0 || bb <= 0 { + panic(paramOutOfBounds) + } + if xx <= 0 || xx >= 1 { + if xx == 0 { + return 0 + } + if xx == 1 { + return 1 + } + panic(paramOutOfBounds) + } + + var flag int + if bb*xx <= 1 && xx <= 0.95 { + t := pseries(aa, bb, xx) + return transformT(t, flag) + } + + w := 1 - xx + + // Reverse a and b if x is greater than the mean. + var a, b, xc, x float64 + if xx > aa/(aa+bb) { + flag = 1 + a = bb + b = aa + xc = xx + x = w + } else { + a = aa + b = bb + xc = w + x = xx + } + + if flag == 1 && (b*x) <= 1.0 && x <= 0.95 { + t := pseries(a, b, x) + return transformT(t, flag) + } + + // Choose expansion for better convergence. 
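+	// Both continued fractions converge for these arguments; the sign of y
+	// below picks the faster one: incbcf expands in x, incbd in x/(1-x).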
+ y := x*(a+b-2.0) - (a - 1.0) + if y < 0.0 { + w = incbcf(a, b, x) + } else { + w = incbd(a, b, x) / xc + } + + // Multiply w by the factor + // x^a * (1-x)^b * Γ(a+b) / (a*Γ(a)*Γ(b)) + var t float64 + y = a * math.Log(x) + t = b * math.Log(xc) + if (a+b) < maxGam && math.Abs(y) < maxLog && math.Abs(t) < maxLog { + t = math.Pow(xc, b) + t *= math.Pow(x, a) + t /= a + t *= w + t *= 1.0 / gonum.Beta(a, b) + return transformT(t, flag) + } + + // Resort to logarithms. + y += t - gonum.Lbeta(a, b) + y += math.Log(w / a) + if y < minLog { + t = 0.0 + } else { + t = math.Exp(y) + } + + return transformT(t, flag) +} + +func transformT(t float64, flag int) float64 { + if flag == 1 { + if t <= machEp { + t = 1.0 - machEp + } else { + t = 1.0 - t + } + } + return t +} + +// incbcf returns the incomplete beta integral evaluated by a continued fraction +// expansion. +func incbcf(a, b, x float64) float64 { + var xk, pk, pkm1, pkm2, qk, qkm1, qkm2 float64 + var k1, k2, k3, k4, k5, k6, k7, k8 float64 + var r, t, ans, thresh float64 + var n int + + k1 = a + k2 = a + b + k3 = a + k4 = a + 1.0 + k5 = 1.0 + k6 = b - 1.0 + k7 = k4 + k8 = a + 2.0 + + pkm2 = 0.0 + qkm2 = 1.0 + pkm1 = 1.0 + qkm1 = 1.0 + ans = 1.0 + r = 1.0 + thresh = 3.0 * machEp + + for n = 0; n <= 300; n++ { + + xk = -(x * k1 * k2) / (k3 * k4) + pk = pkm1 + pkm2*xk + qk = qkm1 + qkm2*xk + pkm2 = pkm1 + pkm1 = pk + qkm2 = qkm1 + qkm1 = qk + + xk = (x * k5 * k6) / (k7 * k8) + pk = pkm1 + pkm2*xk + qk = qkm1 + qkm2*xk + pkm2 = pkm1 + pkm1 = pk + qkm2 = qkm1 + qkm1 = qk + + if qk != 0 { + r = pk / qk + } + if r != 0 { + t = math.Abs((ans - r) / r) + ans = r + } else { + t = 1.0 + } + + if t < thresh { + return ans + } + + k1 += 1.0 + k2 += 1.0 + k3 += 2.0 + k4 += 2.0 + k5 += 1.0 + k6 -= 1.0 + k7 += 2.0 + k8 += 2.0 + + if (math.Abs(qk) + math.Abs(pk)) > big { + pkm2 *= biginv + pkm1 *= biginv + qkm2 *= biginv + qkm1 *= biginv + } + if (math.Abs(qk) < biginv) || (math.Abs(pk) < biginv) { + pkm2 *= big + pkm1 *= big + qkm2 *= big + qkm1 *= big + } + } + + return ans +} + +// incbd returns the incomplete beta integral evaluated by a continued fraction +// expansion. +func incbd(a, b, x float64) float64 { + var xk, pk, pkm1, pkm2, qk, qkm1, qkm2 float64 + var k1, k2, k3, k4, k5, k6, k7, k8 float64 + var r, t, ans, z, thresh float64 + var n int + + k1 = a + k2 = b - 1.0 + k3 = a + k4 = a + 1.0 + k5 = 1.0 + k6 = a + b + k7 = a + 1.0 + k8 = a + 2.0 + + pkm2 = 0.0 + qkm2 = 1.0 + pkm1 = 1.0 + qkm1 = 1.0 + z = x / (1.0 - x) + ans = 1.0 + r = 1.0 + thresh = 3.0 * machEp + for n = 0; n <= 300; n++ { + + xk = -(z * k1 * k2) / (k3 * k4) + pk = pkm1 + pkm2*xk + qk = qkm1 + qkm2*xk + pkm2 = pkm1 + pkm1 = pk + qkm2 = qkm1 + qkm1 = qk + + xk = (z * k5 * k6) / (k7 * k8) + pk = pkm1 + pkm2*xk + qk = qkm1 + qkm2*xk + pkm2 = pkm1 + pkm1 = pk + qkm2 = qkm1 + qkm1 = qk + + if qk != 0 { + r = pk / qk + } + if r != 0 { + t = math.Abs((ans - r) / r) + ans = r + } else { + t = 1.0 + } + + if t < thresh { + return ans + } + + k1 += 1.0 + k2 -= 1.0 + k3 += 2.0 + k4 += 2.0 + k5 += 1.0 + k6 += 1.0 + k7 += 2.0 + k8 += 2.0 + + if (math.Abs(qk) + math.Abs(pk)) > big { + pkm2 *= biginv + pkm1 *= biginv + qkm2 *= biginv + qkm1 *= biginv + } + if (math.Abs(qk) < biginv) || (math.Abs(pk) < biginv) { + pkm2 *= big + pkm1 *= big + qkm2 *= big + qkm1 *= big + } + } + return ans +} + +// pseries returns the incomplete beta integral evaluated by a power series. Use +// when b*x is small and x not too close to 1. 
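+//
+// Explicitly, the sum accumulated below is (a restatement for reference,
+// following the cephes documentation):
+//
+//	Ix(a,b) = x^a/Beta(a,b) * (1/a + (1-b)x/(a+1) + ... + (1-b)(2-b)...(n-b) x^n/(n! (a+n)) + ...)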
+func pseries(a, b, x float64) float64 { + var s, t, u, v, n, t1, z, ai float64 + ai = 1.0 / a + u = (1.0 - b) * x + v = u / (a + 1.0) + t1 = v + t = u + n = 2.0 + s = 0.0 + z = machEp * ai + for math.Abs(v) > z { + u = (n - b) * x / n + t *= u + v = t / (a + n) + s += v + n += 1.0 + } + s += t1 + s += ai + + u = a * math.Log(x) + if (a+b) < maxGam && math.Abs(u) < maxLog { + t = 1.0 / gonum.Beta(a, b) + s = s * t * math.Pow(x, a) + } else { + t = -gonum.Lbeta(a, b) + u + math.Log(s) + if t < minLog { + s = 0.0 + } else { + s = math.Exp(t) + } + } + return (s) +} diff --git a/vendor/gonum.org/v1/gonum/mathext/internal/cephes/incbi.go b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/incbi.go new file mode 100644 index 0000000000..2b612d83f9 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/incbi.go @@ -0,0 +1,247 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* + * Cephes Math Library Release 2.4: March,1996 + * Copyright 1984, 1996 by Stephen L. Moshier + */ + +package cephes + +import "math" + +// Incbi computes the inverse of the regularized incomplete beta integral. +func Incbi(aa, bb, yy0 float64) float64 { + var a, b, y0, d, y, x, x0, x1, lgm, yp, di, dithresh, yl, yh, xt float64 + var i, rflg, dir, nflg int + + if yy0 <= 0 { + return (0.0) + } + if yy0 >= 1.0 { + return (1.0) + } + x0 = 0.0 + yl = 0.0 + x1 = 1.0 + yh = 1.0 + nflg = 0 + + if aa <= 1.0 || bb <= 1.0 { + dithresh = 1.0e-6 + rflg = 0 + a = aa + b = bb + y0 = yy0 + x = a / (a + b) + y = Incbet(a, b, x) + goto ihalve + } else { + dithresh = 1.0e-4 + } + // Approximation to inverse function + yp = -Ndtri(yy0) + + if yy0 > 0.5 { + rflg = 1 + a = bb + b = aa + y0 = 1.0 - yy0 + yp = -yp + } else { + rflg = 0 + a = aa + b = bb + y0 = yy0 + } + + lgm = (yp*yp - 3.0) / 6.0 + x = 2.0 / (1.0/(2.0*a-1.0) + 1.0/(2.0*b-1.0)) + d = yp*math.Sqrt(x+lgm)/x - (1.0/(2.0*b-1.0)-1.0/(2.0*a-1.0))*(lgm+5.0/6.0-2.0/(3.0*x)) + d = 2.0 * d + if d < minLog { + // mtherr("incbi", UNDERFLOW) + x = 0 + goto done + } + x = a / (a + b*math.Exp(d)) + y = Incbet(a, b, x) + yp = (y - y0) / y0 + if math.Abs(yp) < 0.2 { + goto newt + } + + /* Resort to interval halving if not close enough. 
*/ +ihalve: + + dir = 0 + di = 0.5 + for i = 0; i < 100; i++ { + if i != 0 { + x = x0 + di*(x1-x0) + if x == 1.0 { + x = 1.0 - machEp + } + if x == 0.0 { + di = 0.5 + x = x0 + di*(x1-x0) + if x == 0.0 { + // mtherr("incbi", UNDERFLOW) + goto done + } + } + y = Incbet(a, b, x) + yp = (x1 - x0) / (x1 + x0) + if math.Abs(yp) < dithresh { + goto newt + } + yp = (y - y0) / y0 + if math.Abs(yp) < dithresh { + goto newt + } + } + if y < y0 { + x0 = x + yl = y + if dir < 0 { + dir = 0 + di = 0.5 + } else if dir > 3 { + di = 1.0 - (1.0-di)*(1.0-di) + } else if dir > 1 { + di = 0.5*di + 0.5 + } else { + di = (y0 - y) / (yh - yl) + } + dir += 1 + if x0 > 0.75 { + if rflg == 1 { + rflg = 0 + a = aa + b = bb + y0 = yy0 + } else { + rflg = 1 + a = bb + b = aa + y0 = 1.0 - yy0 + } + x = 1.0 - x + y = Incbet(a, b, x) + x0 = 0.0 + yl = 0.0 + x1 = 1.0 + yh = 1.0 + goto ihalve + } + } else { + x1 = x + if rflg == 1 && x1 < machEp { + x = 0.0 + goto done + } + yh = y + if dir > 0 { + dir = 0 + di = 0.5 + } else if dir < -3 { + di = di * di + } else if dir < -1 { + di = 0.5 * di + } else { + di = (y - y0) / (yh - yl) + } + dir -= 1 + } + } + // mtherr("incbi", PLOSS) + if x0 >= 1.0 { + x = 1.0 - machEp + goto done + } + if x <= 0.0 { + // mtherr("incbi", UNDERFLOW) + x = 0.0 + goto done + } + +newt: + if nflg > 0 { + goto done + } + nflg = 1 + lgm = lgam(a+b) - lgam(a) - lgam(b) + + for i = 0; i < 8; i++ { + /* Compute the function at this point. */ + if i != 0 { + y = Incbet(a, b, x) + } + if y < yl { + x = x0 + y = yl + } else if y > yh { + x = x1 + y = yh + } else if y < y0 { + x0 = x + yl = y + } else { + x1 = x + yh = y + } + if x == 1.0 || x == 0.0 { + break + } + /* Compute the derivative of the function at this point. */ + d = (a-1.0)*math.Log(x) + (b-1.0)*math.Log(1.0-x) + lgm + if d < minLog { + goto done + } + if d > maxLog { + break + } + d = math.Exp(d) + /* Compute the step to the next approximation of x. */ + d = (y - y0) / d + xt = x - d + if xt <= x0 { + y = (x - x0) / (x1 - x0) + xt = x0 + 0.5*y*(x-x0) + if xt <= 0.0 { + break + } + } + if xt >= x1 { + y = (x1 - x) / (x1 - x0) + xt = x1 - 0.5*y*(x1-x) + if xt >= 1.0 { + break + } + } + x = xt + if math.Abs(d/x) < 128.0*machEp { + goto done + } + } + /* Did not converge. */ + dithresh = 256.0 * machEp + goto ihalve + +done: + + if rflg > 0 { + if x <= machEp { + x = 1.0 - machEp + } else { + x = 1.0 - x + } + } + return (x) +} + +func lgam(a float64) float64 { + lg, _ := math.Lgamma(a) + return lg +} diff --git a/vendor/gonum.org/v1/gonum/mathext/internal/cephes/lanczos.go b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/lanczos.go new file mode 100644 index 0000000000..ec29161f3e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/lanczos.go @@ -0,0 +1,153 @@ +// Derived from SciPy's special/cephes/lanczos.c +// https://github.com/scipy/scipy/blob/master/scipy/special/cephes/lanczos.c + +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// Copyright ©2006 John Maddock +// Portions Copyright ©2003 Boost +// Portions Copyright ©2016 The Gonum Authors. All rights reserved. + +package cephes + +// Optimal values for G for each N are taken from +// http://web.mala.bc.ca/pughg/phdThesis/phdThesis.pdf, +// as are the theoretical error bounds. + +// Constants calculated using the method described by Godfrey +// http://my.fit.edu/~gabdo/gamma.txt and elaborated by Toth at +// http://www.rskey.org/gamma.htm using NTL::RR at 1000 bit precision. 
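+//
+// For orientation: in the Lanczos scheme the gamma function is approximated
+// as
+//
+//	Γ(z) ≈ sqrt(2π) (z+g-0.5)^(z-0.5) e^{-(z+g-0.5)} A_g(z)
+//
+// where A_g(z) is the rational function evaluated by lanczosSum below with
+// g = lanczosG; see the references above for the exact conventions used.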
+ +var lanczosNum = [...]float64{ + 2.506628274631000270164908177133837338626, + 210.8242777515793458725097339207133627117, + 8071.672002365816210638002902272250613822, + 186056.2653952234950402949897160456992822, + 2876370.628935372441225409051620849613599, + 31426415.58540019438061423162831820536287, + 248874557.8620541565114603864132294232163, + 1439720407.311721673663223072794912393972, + 6039542586.35202800506429164430729792107, + 17921034426.03720969991975575445893111267, + 35711959237.35566804944018545154716670596, + 42919803642.64909876895789904700198885093, + 23531376880.41075968857200767445163675473, +} + +var lanczosDenom = [...]float64{ + 1, + 66, + 1925, + 32670, + 357423, + 2637558, + 13339535, + 45995730, + 105258076, + 150917976, + 120543840, + 39916800, + 0, +} + +var lanczosSumExpgScaledNum = [...]float64{ + 0.006061842346248906525783753964555936883222, + 0.5098416655656676188125178644804694509993, + 19.51992788247617482847860966235652136208, + 449.9445569063168119446858607650988409623, + 6955.999602515376140356310115515198987526, + 75999.29304014542649875303443598909137092, + 601859.6171681098786670226533699352302507, + 3481712.15498064590882071018964774556468, + 14605578.08768506808414169982791359218571, + 43338889.32467613834773723740590533316085, + 86363131.28813859145546927288977868422342, + 103794043.1163445451906271053616070238554, + 56906521.91347156388090791033559122686859, +} + +var lanczosSumExpgScaledDenom = [...]float64{ + 1, + 66, + 1925, + 32670, + 357423, + 2637558, + 13339535, + 45995730, + 105258076, + 150917976, + 120543840, + 39916800, + 0, +} + +var lanczosSumNear1D = [...]float64{ + 0.3394643171893132535170101292240837927725e-9, + -0.2499505151487868335680273909354071938387e-8, + 0.8690926181038057039526127422002498960172e-8, + -0.1933117898880828348692541394841204288047e-7, + 0.3075580174791348492737947340039992829546e-7, + -0.2752907702903126466004207345038327818713e-7, + -0.1515973019871092388943437623825208095123e-5, + 0.004785200610085071473880915854204301886437, + -0.1993758927614728757314233026257810172008, + 1.483082862367253753040442933770164111678, + -3.327150580651624233553677113928873034916, + 2.208709979316623790862569924861841433016, +} + +var lanczosSumNear2D = [...]float64{ + 0.1009141566987569892221439918230042368112e-8, + -0.7430396708998719707642735577238449585822e-8, + 0.2583592566524439230844378948704262291927e-7, + -0.5746670642147041587497159649318454348117e-7, + 0.9142922068165324132060550591210267992072e-7, + -0.8183698410724358930823737982119474130069e-7, + -0.4506604409707170077136555010018549819192e-5, + 0.01422519127192419234315002746252160965831, + -0.5926941084905061794445733628891024027949, + 4.408830289125943377923077727900630927902, + -9.8907772644920670589288081640128194231, + 6.565936202082889535528455955485877361223, +} + +const lanczosG = 6.024680040776729583740234375 + +func lanczosSum(x float64) float64 { + return ratevl(x, + lanczosNum[:], + len(lanczosNum)-1, + lanczosDenom[:], + len(lanczosDenom)-1) +} + +func lanczosSumExpgScaled(x float64) float64 { + return ratevl(x, + lanczosSumExpgScaledNum[:], + len(lanczosSumExpgScaledNum)-1, + lanczosSumExpgScaledDenom[:], + len(lanczosSumExpgScaledDenom)-1) +} + +func lanczosSumNear1(dx float64) float64 { + var result float64 + + for i, val := range lanczosSumNear1D { + k := float64(i + 1) + result += (-val * dx) / (k*dx + k*k) + } + + return result +} + +func lanczosSumNear2(dx float64) float64 { + var result float64 + x := dx + 2 + + for i, val := range 
lanczosSumNear2D { + k := float64(i + 1) + result += (-val * dx) / (x + k*x + k*k - 1) + } + + return result +} diff --git a/vendor/gonum.org/v1/gonum/mathext/internal/cephes/ndtri.go b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/ndtri.go new file mode 100644 index 0000000000..03910ff8f4 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/ndtri.go @@ -0,0 +1,150 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* + * Cephes Math Library Release 2.1: January, 1989 + * Copyright 1984, 1987, 1989 by Stephen L. Moshier + * Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + +package cephes + +import "math" + +// TODO(btracey): There is currently an implementation of this functionality +// in gonum/stat/distuv. Find out which implementation is better, and rectify +// by having distuv call this, or moving this implementation into +// gonum/mathext/internal/gonum. + +// math.Sqrt(2*pi) +const s2pi = 2.50662827463100050242e0 + +// approximation for 0 <= |y - 0.5| <= 3/8 +var P0 = [5]float64{ + -5.99633501014107895267e1, + 9.80010754185999661536e1, + -5.66762857469070293439e1, + 1.39312609387279679503e1, + -1.23916583867381258016e0, +} + +var Q0 = [8]float64{ + /* 1.00000000000000000000E0, */ + 1.95448858338141759834e0, + 4.67627912898881538453e0, + 8.63602421390890590575e1, + -2.25462687854119370527e2, + 2.00260212380060660359e2, + -8.20372256168333339912e1, + 1.59056225126211695515e1, + -1.18331621121330003142e0, +} + +// Approximation for interval z = math.Sqrt(-2 log y ) between 2 and 8 +// i.e., y between exp(-2) = .135 and exp(-32) = 1.27e-14. +var P1 = [9]float64{ + 4.05544892305962419923e0, + 3.15251094599893866154e1, + 5.71628192246421288162e1, + 4.40805073893200834700e1, + 1.46849561928858024014e1, + 2.18663306850790267539e0, + -1.40256079171354495875e-1, + -3.50424626827848203418e-2, + -8.57456785154685413611e-4, +} + +var Q1 = [8]float64{ + /* 1.00000000000000000000E0, */ + 1.57799883256466749731e1, + 4.53907635128879210584e1, + 4.13172038254672030440e1, + 1.50425385692907503408e1, + 2.50464946208309415979e0, + -1.42182922854787788574e-1, + -3.80806407691578277194e-2, + -9.33259480895457427372e-4, +} + +// Approximation for interval z = math.Sqrt(-2 log y ) between 8 and 64 +// i.e., y between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890. +var P2 = [9]float64{ + 3.23774891776946035970e0, + 6.91522889068984211695e0, + 3.93881025292474443415e0, + 1.33303460815807542389e0, + 2.01485389549179081538e-1, + 1.23716634817820021358e-2, + 3.01581553508235416007e-4, + 2.65806974686737550832e-6, + 6.23974539184983293730e-9, +} + +var Q2 = [8]float64{ + /* 1.00000000000000000000E0, */ + 6.02427039364742014255e0, + 3.67983563856160859403e0, + 1.37702099489081330271e0, + 2.16236993594496635890e-1, + 1.34204006088543189037e-2, + 3.28014464682127739104e-4, + 2.89247864745380683936e-6, + 6.79019408009981274425e-9, +} + +// Ndtri returns the argument, x, for which the area under the +// Gaussian probability density function (integrated from +// minus infinity to x) is equal to y. +func Ndtri(y0 float64) float64 { + // For small arguments 0 < y < exp(-2), the program computes + // z = math.Sqrt( -2.0 * math.Log(y) ); then the approximation is + // x = z - math.Log(z)/z - (1/z) P(1/z) / Q(1/z). + // There are two rational functions P/Q, one for 0 < y < exp(-32) + // and the other for y up to exp(-2). 
For larger arguments,
+	// w = y - 0.5, and x/math.Sqrt(2pi) = w + w**3 R(w**2)/S(w**2).
+	var x, y, z, y2, x0, x1 float64
+	var code int
+
+	if y0 <= 0.0 {
+		if y0 < 0 {
+			panic(paramOutOfBounds)
+		}
+		return math.Inf(-1)
+	}
+	if y0 >= 1.0 {
+		if y0 > 1 {
+			panic(paramOutOfBounds)
+		}
+		return math.Inf(1)
+	}
+	code = 1
+	y = y0
+	if y > (1.0 - 0.13533528323661269189) { /* 0.135... = exp(-2) */
+		y = 1.0 - y
+		code = 0
+	}
+
+	if y > 0.13533528323661269189 {
+		y = y - 0.5
+		y2 = y * y
+		x = y + y*(y2*polevl(y2, P0[:], 4)/p1evl(y2, Q0[:], 8))
+		x = x * s2pi
+		return (x)
+	}
+
+	x = math.Sqrt(-2.0 * math.Log(y))
+	x0 = x - math.Log(x)/x
+
+	z = 1.0 / x
+	if x < 8.0 { /* y > exp(-32) = 1.2664165549e-14 */
+		x1 = z * polevl(z, P1[:], 8) / p1evl(z, Q1[:], 8)
+	} else {
+		x1 = z * polevl(z, P2[:], 8) / p1evl(z, Q2[:], 8)
+	}
+	x = x0 - x1
+	if code != 0 {
+		x = -x
+	}
+	return (x)
+}
diff --git a/vendor/gonum.org/v1/gonum/mathext/internal/cephes/polevl.go b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/polevl.go
new file mode 100644
index 0000000000..aec399f372
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/polevl.go
@@ -0,0 +1,84 @@
+// Derived from SciPy's special/cephes/polevl.h
+// https://github.com/scipy/scipy/blob/master/scipy/special/cephes/polevl.h
+// Made freely available by Stephen L. Moshier without support or guarantee.

+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// Copyright ©1984, ©1987, ©1988 by Stephen L. Moshier
+// Portions Copyright ©2016 The Gonum Authors. All rights reserved.
+
+package cephes
+
+import "math"
+
+// polevl evaluates a polynomial of degree N
+//
+//	y = c_0 + c_1 x + c_2 x^2 + ...
+//
+// where the coefficients are stored in reverse order, i.e. coef[0] = c_n and
+// coef[n] = c_0.
+func polevl(x float64, coef []float64, n int) float64 {
+	ans := coef[0]
+	for i := 1; i <= n; i++ {
+		ans = ans*x + coef[i]
+	}
+	return ans
+}
+
+// p1evl is the same as polevl, except c_n is assumed to be 1 and is not included
+// in the slice.
+func p1evl(x float64, coef []float64, n int) float64 {
+	ans := x + coef[0]
+	for i := 1; i <= n-1; i++ {
+		ans = ans*x + coef[i]
+	}
+	return ans
+}
+
+// ratevl evaluates a rational function
+func ratevl(x float64, num []float64, m int, denom []float64, n int) float64 {
+	// Source: Holin et.
al., "Polynomial and Rational Function Evaluation", + // http://www.boost.org/doc/libs/1_61_0/libs/math/doc/html/math_toolkit/roots/rational.html + absx := math.Abs(x) + + var dir, idx int + var y float64 + if absx > 1 { + // Evaluate as a polynomial in 1/x + dir = -1 + idx = m + y = 1 / x + } else { + dir = 1 + idx = 0 + y = x + } + + // Evaluate the numerator + numAns := num[idx] + idx += dir + for i := 0; i < m; i++ { + numAns = numAns*y + num[idx] + idx += dir + } + + // Evaluate the denominator + if absx > 1 { + idx = n + } else { + idx = 0 + } + + denomAns := denom[idx] + idx += dir + for i := 0; i < n; i++ { + denomAns = denomAns*y + denom[idx] + idx += dir + } + + if absx > 1 { + pow := float64(n - m) + return math.Pow(x, pow) * numAns / denomAns + } + return numAns / denomAns +} diff --git a/vendor/gonum.org/v1/gonum/mathext/internal/cephes/staticcheck.conf b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/staticcheck.conf new file mode 100644 index 0000000000..e7e254ff3f --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/staticcheck.conf @@ -0,0 +1 @@ +checks = [] diff --git a/vendor/gonum.org/v1/gonum/mathext/internal/cephes/unity.go b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/unity.go new file mode 100644 index 0000000000..3996e7e558 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/unity.go @@ -0,0 +1,184 @@ +// Derived from SciPy's special/cephes/unity.c +// https://github.com/scipy/scipy/blob/master/scipy/special/cephes/unity.c +// Made freely available by Stephen L. Moshier without support or guarantee. + +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// Copyright ©1984, ©1996 by Stephen L. Moshier +// Portions Copyright ©2016 The Gonum Authors. All rights reserved. + +package cephes + +import "math" + +// Relative error approximations for function arguments near unity. 
+// log1p(x) = log(1+x) +// expm1(x) = exp(x) - 1 +// cosm1(x) = cos(x) - 1 +// lgam1p(x) = lgam(1+x) + +const ( + invSqrt2 = 1 / math.Sqrt2 + pi4 = math.Pi / 4 + euler = 0.577215664901532860606512090082402431 // Euler constant +) + +// Coefficients for +// +// log(1+x) = x - \frac{x^2}{2} + \frac{x^3 lP(x)}{lQ(x)} +// +// for +// +// \frac{1}{\sqrt{2}} <= x < \sqrt{2} +// +// Theoretical peak relative error = 2.32e-20 +var lP = [...]float64{ + 4.5270000862445199635215e-5, + 4.9854102823193375972212e-1, + 6.5787325942061044846969e0, + 2.9911919328553073277375e1, + 6.0949667980987787057556e1, + 5.7112963590585538103336e1, + 2.0039553499201281259648e1, +} + +var lQ = [...]float64{ + 1.5062909083469192043167e1, + 8.3047565967967209469434e1, + 2.2176239823732856465394e2, + 3.0909872225312059774938e2, + 2.1642788614495947685003e2, + 6.0118660497603843919306e1, +} + +// log1p computes +// +// log(1 + x) +func log1p(x float64) float64 { + z := 1 + x + if z < invSqrt2 || z > math.Sqrt2 { + return math.Log(z) + } + z = x * x + z = -0.5*z + x*(z*polevl(x, lP[:], 6)/p1evl(x, lQ[:], 6)) + return x + z +} + +// log1pmx computes +// +// log(1 + x) - x +func log1pmx(x float64) float64 { + if math.Abs(x) < 0.5 { + xfac := x + res := 0.0 + + var term float64 + for n := 2; n < maxIter; n++ { + xfac *= -x + term = xfac / float64(n) + res += term + if math.Abs(term) < machEp*math.Abs(res) { + break + } + } + return res + } + return log1p(x) - x +} + +// Coefficients for +// +// e^x = 1 + \frac{2x eP(x^2)}{eQ(x^2) - eP(x^2)} +// +// for +// +// -0.5 <= x <= 0.5 +var eP = [...]float64{ + 1.2617719307481059087798e-4, + 3.0299440770744196129956e-2, + 9.9999999999999999991025e-1, +} + +var eQ = [...]float64{ + 3.0019850513866445504159e-6, + 2.5244834034968410419224e-3, + 2.2726554820815502876593e-1, + 2.0000000000000000000897e0, +} + +// expm1 computes +// +// expm1(x) = e^x - 1 +func expm1(x float64) float64 { + if math.IsInf(x, 0) { + if math.IsNaN(x) || x > 0 { + return x + } + return -1 + } + if x < -0.5 || x > 0.5 { + return math.Exp(x) - 1 + } + xx := x * x + r := x * polevl(xx, eP[:], 2) + r = r / (polevl(xx, eQ[:], 3) - r) + return r + r +} + +var coscof = [...]float64{ + 4.7377507964246204691685e-14, + -1.1470284843425359765671e-11, + 2.0876754287081521758361e-9, + -2.7557319214999787979814e-7, + 2.4801587301570552304991e-5, + -1.3888888888888872993737e-3, + 4.1666666666666666609054e-2, +} + +// cosm1 computes +// +// cosm1(x) = cos(x) - 1 +func cosm1(x float64) float64 { + if x < -pi4 || x > pi4 { + return math.Cos(x) - 1 + } + xx := x * x + xx = -0.5*xx + xx*xx*polevl(xx, coscof[:], 6) + return xx +} + +// lgam1pTayler computes +// +// lgam(x + 1) +// +// around x = 0 using its Taylor series. 
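+//
+// Explicitly, the loop below accumulates (with γ the Euler constant and
+// ζ the Riemann zeta function)
+//
+//	lgam(1+x) = -γx + Σ_{n=2}^{∞} (-1)^n ζ(n) x^n / n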
+func lgam1pTaylor(x float64) float64 {
+	if x == 0 {
+		return 0
+	}
+	res := -euler * x
+	xfac := -x
+	for n := 2; n < 42; n++ {
+		nf := float64(n)
+		xfac *= -x
+		coeff := Zeta(nf, 1) * xfac / nf
+		res += coeff
+		if math.Abs(coeff) < machEp*math.Abs(res) {
+			break
+		}
+	}
+
+	return res
+}
+
+// lgam1p computes
+//
+//	lgam(x + 1)
+func lgam1p(x float64) float64 {
+	if math.Abs(x) <= 0.5 {
+		return lgam1pTaylor(x)
+	} else if math.Abs(x-1) < 0.5 {
+		return math.Log(x) + lgam1pTaylor(x-1)
+	}
+	return lgam(x + 1)
+}
diff --git a/vendor/gonum.org/v1/gonum/mathext/internal/cephes/zeta.go b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/zeta.go
new file mode 100644
index 0000000000..0efeaa6045
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mathext/internal/cephes/zeta.go
@@ -0,0 +1,117 @@
+// Derived from SciPy's special/cephes/zeta.c
+// https://github.com/scipy/scipy/blob/master/scipy/special/cephes/zeta.c
+// Made freely available by Stephen L. Moshier without support or guarantee.

+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// Copyright ©1984, ©1987 by Stephen L. Moshier
+// Portions Copyright ©2016 The Gonum Authors. All rights reserved.
+
+package cephes
+
+import "math"
+
+// zetaCoefs are the expansion coefficients for the Euler-Maclaurin
+// summation formula:
+//
+//	\frac{(2k)!}{B_{2k}}
+//
+// where
+//
+//	B_{2k}
+//
+// are Bernoulli numbers.
+var zetaCoefs = [...]float64{
+	12.0,
+	-720.0,
+	30240.0,
+	-1209600.0,
+	47900160.0,
+	-1.307674368e12 / 691,
+	7.47242496e10,
+	-1.067062284288e16 / 3617,
+	5.109094217170944e18 / 43867,
+	-8.028576626982912e20 / 174611,
+	1.5511210043330985984e23 / 854513,
+	-1.6938241367317436694528e27 / 236364091,
+}
+
+// Zeta computes the Riemann zeta function of two arguments.
+//
+//	Zeta(x,q) = \sum_{k=0}^{\infty} (k+q)^{-x}
+//
+// Note that Zeta returns +Inf if x is 1 and will panic if x is less than 1,
+// q is either zero or a negative integer, or q is negative and x is not an
+// integer.
+//
+// Note that:
+//
+//	zeta(x,1) = zetac(x) + 1
+func Zeta(x, q float64) float64 {
+	// REFERENCE: Gradshteyn, I. S., and I. M. Ryzhik, Tables of Integrals, Series,
+	// and Products, p. 1073; Academic Press, 1980.
+	if x == 1 {
+		return math.Inf(1)
+	}
+
+	if x < 1 {
+		panic(paramOutOfBounds)
+	}
+
+	if q <= 0 {
+		if q == math.Floor(q) {
+			panic(errParamFunctionSingularity)
+		}
+		if x != math.Floor(x) {
+			panic(paramOutOfBounds) // Because q^-x not defined
+		}
+	}
+
+	// Asymptotic expansion: http://dlmf.nist.gov/25.11#E43
+	if q > 1e8 {
+		return (1/(x-1) + 1/(2*q)) * math.Pow(q, 1-x)
+	}
+
+	// The Euler-Maclaurin summation formula is used to obtain the expansion:
+	//	Zeta(x,q) = \sum_{k=1}^n (k+q)^{-x} + \frac{(n+q)^{1-x}}{x-1} - \frac{1}{2(n+q)^x} + \sum_{j=1}^{\infty} \frac{B_{2j}x(x+1)...(x+2j)}{(2j)! (n+q)^{x+2j+1}}
	// where
+	//	B_{2j}
+	// are Bernoulli numbers.
+	// Permit negative q but continue sum until n+q > 9. This case should be
+	// handled by a reflection formula. If q<0 and x is an integer, there is a
+	// relation to the polyGamma function.
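+	// The loop below sums (k+q)^{-x} directly for the leading terms; the
+	// tail of the series is then approximated with the Euler-Maclaurin
+	// correction terms built from zetaCoefs.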
+ s := math.Pow(q, -x) + a := q + i := 0 + b := 0.0 + for i < 9 || a <= 9 { + i++ + a += 1.0 + b = math.Pow(a, -x) + s += b + if math.Abs(b/s) < machEp { + return s + } + } + + w := a + s += b * w / (x - 1) + s -= 0.5 * b + a = 1.0 + k := 0.0 + for _, coef := range zetaCoefs { + a *= x + k + b /= w + t := a * b / coef + s = s + t + t = math.Abs(t / s) + if t < machEp { + return s + } + k += 1.0 + a *= x + k + b /= w + k += 1.0 + } + return s +} diff --git a/vendor/gonum.org/v1/gonum/mathext/internal/gonum/beta.go b/vendor/gonum.org/v1/gonum/mathext/internal/gonum/beta.go new file mode 100644 index 0000000000..f1fb3587f2 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/internal/gonum/beta.go @@ -0,0 +1,58 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gonum + +import ( + "math" +) + +// Beta returns the value of the complete beta function B(a, b). It is defined as +// +// Γ(a)Γ(b) / Γ(a+b) +// +// Special cases are: +// +// B(a,b) returns NaN if a or b is Inf +// B(a,b) returns NaN if a and b are 0 +// B(a,b) returns NaN if a or b is NaN +// B(a,b) returns NaN if a or b is < 0 +// B(a,b) returns +Inf if a xor b is 0. +// +// See http://mathworld.wolfram.com/BetaFunction.html for more detailed information. +func Beta(a, b float64) float64 { + return math.Exp(Lbeta(a, b)) +} + +// Lbeta returns the natural logarithm of the complete beta function B(a,b). +// Lbeta is defined as: +// +// Ln(Γ(a)Γ(b)/Γ(a+b)) +// +// Special cases are: +// +// Lbeta(a,b) returns NaN if a or b is Inf +// Lbeta(a,b) returns NaN if a and b are 0 +// Lbeta(a,b) returns NaN if a or b is NaN +// Lbeta(a,b) returns NaN if a or b is < 0 +// Lbeta(a,b) returns +Inf if a xor b is 0. +func Lbeta(a, b float64) float64 { + switch { + case math.IsInf(a, +1) || math.IsInf(b, +1): + return math.NaN() + case a == 0 && b == 0: + return math.NaN() + case a < 0 || b < 0: + return math.NaN() + case math.IsNaN(a) || math.IsNaN(b): + return math.NaN() + case a == 0 || b == 0: + return math.Inf(+1) + } + + la, _ := math.Lgamma(a) + lb, _ := math.Lgamma(b) + lab, _ := math.Lgamma(a + b) + return la + lb - lab +} diff --git a/vendor/gonum.org/v1/gonum/mathext/internal/gonum/doc.go b/vendor/gonum.org/v1/gonum/mathext/internal/gonum/doc.go new file mode 100644 index 0000000000..cbe6aa2381 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/internal/gonum/doc.go @@ -0,0 +1,7 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package gonum contains functions implemented by the gonum team. +// It is here to avoid circular imports and/or double coding of functions. +package gonum // import "gonum.org/v1/gonum/mathext/internal/gonum" diff --git a/vendor/gonum.org/v1/gonum/mathext/internal/gonum/gonum.go b/vendor/gonum.org/v1/gonum/mathext/internal/gonum/gonum.go new file mode 100644 index 0000000000..47e02ce386 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/internal/gonum/gonum.go @@ -0,0 +1,5 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+
+package gonum
diff --git a/vendor/gonum.org/v1/gonum/mathext/mvgamma.go b/vendor/gonum.org/v1/gonum/mathext/mvgamma.go
new file mode 100644
index 0000000000..99a9233de4
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mathext/mvgamma.go
@@ -0,0 +1,32 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mathext
+
+import "math"
+
+const (
+	logPi = 1.14472988584940017414342735135305871164729481 // http://oeis.org/A053510
+)
+
+// MvLgamma returns the log of the multivariate Gamma function. Dim
+// must be greater than zero, and MvLgamma will return NaN if v < (dim-1)/2.
+//
+// See https://en.wikipedia.org/wiki/Multivariate_gamma_function for more
+// information.
+func MvLgamma(v float64, dim int) float64 {
+	if dim < 1 {
+		panic("mathext: negative dimension")
+	}
+	df := float64(dim)
+	if v < (df-1)*0.5 {
+		return math.NaN()
+	}
+	ans := df * (df - 1) * 0.25 * logPi
+	for i := 1; i <= dim; i++ {
+		lg, _ := math.Lgamma(v + float64(1-i)*0.5)
+		ans += lg
+	}
+	return ans
+}
diff --git a/vendor/gonum.org/v1/gonum/mathext/roots.go b/vendor/gonum.org/v1/gonum/mathext/roots.go
new file mode 100644
index 0000000000..120ce6ef26
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mathext/roots.go
@@ -0,0 +1,181 @@
+// Derived from SciPy's special/c_misc/fsolve.c and special/c_misc/misc.h
+// https://github.com/scipy/scipy/blob/master/scipy/special/c_misc/fsolve.c
+// https://github.com/scipy/scipy/blob/master/scipy/special/c_misc/misc.h

+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mathext
+
+import "math"
+
+type objectiveFunc func(float64, []float64) float64
+
+type fSolveResult uint8
+
+const (
+	// An exact solution was found, in which case the first point on the
+	// interval is the value
+	fSolveExact fSolveResult = iota + 1
+	// Interval width is less than the tolerance
+	fSolveConverged
+	// Root-finding didn't converge in a set number of iterations
+	fSolveMaxIterations
+)
+
+const (
+	machEp = 1.0 / (1 << 53)
+)
+
+// falsePosition uses a combination of bisection and false position to find a
+// root of a function within a given interval. This is guaranteed to converge,
+// and always keeps a bounding interval, unlike Newton's method. Inputs are:
+//
+//	x1, x2: initial bounding interval
+//	f1, f2: value of f() at x1 and x2
+//	absErr, relErr: absolute and relative errors on the bounding interval
+//	bisectTil: if > 0.0, perform bisection until the width of the bounding
+//	interval is less than this
+//	f, fExtra: function to find root of is f(x, fExtra)
+//
+// Returns:
+//
+//	result: whether an exact root was found, the process converged to a
+//	bounding interval smaller than the required error, or the max number
+//	of iterations was hit
+//	bestX: best root approximation
+//	bestF: function value at bestX
+//	errEst: error estimation
func falsePosition(x1, x2, f1, f2, absErr, relErr, bisectTil float64, f objectiveFunc, fExtra []float64) (fSolveResult, float64, float64, float64) {
+	// The false position steps are either unmodified, or modified with the
+	// Anderson-Bjorck method as appropriate. Theoretically, this has a "speed of
+	// convergence" of 1.7 (bisection is 1, Newton is 2).
+	// Note that this routine was designed initially to work with gammaincinv, so
+	// it may not be tuned right for other problems.
Don't use it blindly. + + if f1*f2 >= 0 { + panic("Initial interval is not a bounding interval") + } + + const ( + maxIterations = 100 + bisectIter = 4 + bisectWidth = 4.0 + ) + + const ( + bisect = iota + 1 + falseP + ) + + var state uint8 + if bisectTil > 0 { + state = bisect + } else { + state = falseP + } + + gamma := 1.0 + + w := math.Abs(x2 - x1) + lastBisectWidth := w + + var nFalseP int + var x3, f3, bestX, bestF float64 + for i := 0; i < maxIterations; i++ { + switch state { + case bisect: + x3 = 0.5 * (x1 + x2) + if x3 == x1 || x3 == x2 { + // i.e., x1 and x2 are successive floating-point numbers + bestX = x3 + if x3 == x1 { + bestF = f1 + } else { + bestF = f2 + } + return fSolveConverged, bestX, bestF, w + } + + f3 = f(x3, fExtra) + if f3 == 0 { + return fSolveExact, x3, f3, w + } + + if f3*f2 < 0 { + x1 = x2 + f1 = f2 + } + x2 = x3 + f2 = f3 + w = math.Abs(x2 - x1) + lastBisectWidth = w + if bisectTil > 0 { + if w < bisectTil { + bisectTil = -1.0 + gamma = 1.0 + nFalseP = 0 + state = falseP + } + } else { + gamma = 1.0 + nFalseP = 0 + state = falseP + } + case falseP: + s12 := (f2 - gamma*f1) / (x2 - x1) + x3 = x2 - f2/s12 + f3 = f(x3, fExtra) + if f3 == 0 { + return fSolveExact, x3, f3, w + } + + nFalseP++ + if f3*f2 < 0 { + gamma = 1.0 + x1 = x2 + f1 = f2 + } else { + // Anderson-Bjorck method + g := 1.0 - f3/f2 + if g <= 0 { + g = 0.5 + } + gamma *= g + } + x2 = x3 + f2 = f3 + w = math.Abs(x2 - x1) + + // Sanity check. For every 4 false position checks, see if we really are + // decreasing the interval by comparing to what bisection would have + // achieved (or, rather, a bit more lenient than that -- interval + // decreased by 4 instead of by 16, as the fp could be decreasing gamma + // for a bit). Note that this should guarantee convergence, as it makes + // sure that we always end up decreasing the interval width with a + // bisection. + if nFalseP > bisectIter { + if w*bisectWidth > lastBisectWidth { + state = bisect + } + nFalseP = 0 + lastBisectWidth = w + } + } + + tol := absErr + relErr*math.Max(math.Max(math.Abs(x1), math.Abs(x2)), 1.0) + if w <= tol { + if math.Abs(f1) < math.Abs(f2) { + bestX = x1 + bestF = f1 + } else { + bestX = x2 + bestF = f2 + } + return fSolveConverged, bestX, bestF, w + } + } + + return fSolveMaxIterations, x3, f3, w +} diff --git a/vendor/gonum.org/v1/gonum/mathext/zeta.go b/vendor/gonum.org/v1/gonum/mathext/zeta.go new file mode 100644 index 0000000000..23a87fae8b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/mathext/zeta.go @@ -0,0 +1,22 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package mathext + +import "gonum.org/v1/gonum/mathext/internal/cephes" + +// Zeta computes the Riemann zeta function of two arguments. +// +// Zeta(x,q) = \sum_{k=0}^{\infty} (k+q)^{-x} +// +// Note that Zeta returns +Inf if x is 1 and will panic if x is less than 1, +// q is either zero or a negative integer, or q is negative and x is not an +// integer. +// +// See http://mathworld.wolfram.com/HurwitzZetaFunction.html +// or https://en.wikipedia.org/wiki/Multiple_zeta_function#Two_parameters_case +// for more detailed information. 
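+//
+// For example (a well-known special value, shown for orientation):
+//
+//	Zeta(2, 1) // = π²/6 ≈ 1.6449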
+func Zeta(x, q float64) float64 { + return cephes.Zeta(x, q) +} diff --git a/vendor/gonum.org/v1/gonum/optimize/README.md b/vendor/gonum.org/v1/gonum/optimize/README.md new file mode 100644 index 0000000000..70a8530616 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/optimize/README.md @@ -0,0 +1,6 @@ +# Gonum optimize + +[![go.dev reference](https://pkg.go.dev/badge/gonum.org/v1/gonum/optimize)](https://pkg.go.dev/gonum.org/v1/gonum/optimize) +[![GoDoc](https://godocs.io/gonum.org/v1/gonum/optimize?status.svg)](https://godocs.io/gonum.org/v1/gonum/optimize) + +Package optimize is an optimization package for the Go language. diff --git a/vendor/gonum.org/v1/gonum/optimize/backtracking.go b/vendor/gonum.org/v1/gonum/optimize/backtracking.go new file mode 100644 index 0000000000..2ab44e44db --- /dev/null +++ b/vendor/gonum.org/v1/gonum/optimize/backtracking.go @@ -0,0 +1,84 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package optimize + +const ( + defaultBacktrackingContraction = 0.5 + defaultBacktrackingDecrease = 1e-4 + minimumBacktrackingStepSize = 1e-20 +) + +var _ Linesearcher = (*Backtracking)(nil) + +// Backtracking is a Linesearcher that uses backtracking to find a point that +// satisfies the Armijo condition with the given decrease factor. If the Armijo +// condition has not been met, the step size is decreased by ContractionFactor. +// +// The Armijo condition only requires the gradient at the beginning of each +// major iteration (not at successive step locations), and so Backtracking may +// be a good linesearch for functions with expensive gradients. Backtracking is +// not appropriate for optimizers that require the Wolfe conditions to be met, +// such as BFGS. +// +// Both DecreaseFactor and ContractionFactor must be between zero and one, and +// Backtracking will panic otherwise. If either DecreaseFactor or +// ContractionFactor are zero, it will be set to a reasonable default. +type Backtracking struct { + DecreaseFactor float64 // Constant factor in the sufficient decrease (Armijo) condition. + ContractionFactor float64 // Step size multiplier at each iteration (step *= ContractionFactor). 
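+
+	// In the notation of the package's linesearch helpers, Iterate accepts a
+	// candidate step once
+	//
+	//	f(step) <= f(0) + DecreaseFactor*step*f'(0)
+	//
+	// holds, as tested by ArmijoConditionMet.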
+ + stepSize float64 + initF float64 + initG float64 + + lastOp Operation +} + +func (b *Backtracking) Init(f, g float64, step float64) Operation { + if step <= 0 { + panic("backtracking: bad step size") + } + if g >= 0 { + panic("backtracking: initial derivative is non-negative") + } + + if b.ContractionFactor == 0 { + b.ContractionFactor = defaultBacktrackingContraction + } + if b.DecreaseFactor == 0 { + b.DecreaseFactor = defaultBacktrackingDecrease + } + if b.ContractionFactor <= 0 || b.ContractionFactor >= 1 { + panic("backtracking: ContractionFactor must be between 0 and 1") + } + if b.DecreaseFactor <= 0 || b.DecreaseFactor >= 1 { + panic("backtracking: DecreaseFactor must be between 0 and 1") + } + + b.stepSize = step + b.initF = f + b.initG = g + + b.lastOp = FuncEvaluation + return b.lastOp +} + +func (b *Backtracking) Iterate(f, _ float64) (Operation, float64, error) { + if b.lastOp != FuncEvaluation { + panic("backtracking: Init has not been called") + } + + if ArmijoConditionMet(f, b.initF, b.initG, b.stepSize, b.DecreaseFactor) { + b.lastOp = MajorIteration + return b.lastOp, b.stepSize, nil + } + b.stepSize *= b.ContractionFactor + if b.stepSize < minimumBacktrackingStepSize { + b.lastOp = NoOperation + return b.lastOp, b.stepSize, ErrLinesearcherFailure + } + b.lastOp = FuncEvaluation + return b.lastOp, b.stepSize, nil +} diff --git a/vendor/gonum.org/v1/gonum/optimize/bfgs.go b/vendor/gonum.org/v1/gonum/optimize/bfgs.go new file mode 100644 index 0000000000..b44ef81ee0 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/optimize/bfgs.go @@ -0,0 +1,192 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package optimize + +import ( + "math" + + "gonum.org/v1/gonum/mat" +) + +var ( + _ Method = (*BFGS)(nil) + _ localMethod = (*BFGS)(nil) + _ NextDirectioner = (*BFGS)(nil) +) + +// BFGS implements the Broyden–Fletcher–Goldfarb–Shanno optimization method. It +// is a quasi-Newton method that performs successive rank-one updates to an +// estimate of the inverse Hessian of the objective function. It exhibits +// super-linear convergence when in proximity to a local minimum. It has memory +// cost that is O(n^2) relative to the input dimension. +type BFGS struct { + // Linesearcher selects suitable steps along the descent direction. + // Accepted steps should satisfy the strong Wolfe conditions. + // If Linesearcher == nil, an appropriate default is chosen. + Linesearcher Linesearcher + // GradStopThreshold sets the threshold for stopping if the gradient norm + // gets too small. If GradStopThreshold is 0 it is defaulted to 1e-12, and + // if it is NaN the setting is not used. + GradStopThreshold float64 + + ls *LinesearchMethod + + status Status + err error + + dim int + x mat.VecDense // Location of the last major iteration. + grad mat.VecDense // Gradient at the last major iteration. + s mat.VecDense // Difference between locations in this and the previous iteration. + y mat.VecDense // Difference between gradients in this and the previous iteration. + tmp mat.VecDense + + invHess *mat.SymDense + + first bool // Indicator of the first iteration. 
+} + +func (b *BFGS) Status() (Status, error) { + return b.status, b.err +} + +func (*BFGS) Uses(has Available) (uses Available, err error) { + return has.gradient() +} + +func (b *BFGS) Init(dim, tasks int) int { + b.status = NotTerminated + b.err = nil + return 1 +} + +func (b *BFGS) Run(operation chan<- Task, result <-chan Task, tasks []Task) { + b.status, b.err = localOptimizer{}.run(b, b.GradStopThreshold, operation, result, tasks) + close(operation) +} + +func (b *BFGS) initLocal(loc *Location) (Operation, error) { + if b.Linesearcher == nil { + b.Linesearcher = &Bisection{} + } + if b.ls == nil { + b.ls = &LinesearchMethod{} + } + b.ls.Linesearcher = b.Linesearcher + b.ls.NextDirectioner = b + + return b.ls.Init(loc) +} + +func (b *BFGS) iterateLocal(loc *Location) (Operation, error) { + return b.ls.Iterate(loc) +} + +func (b *BFGS) InitDirection(loc *Location, dir []float64) (stepSize float64) { + dim := len(loc.X) + b.dim = dim + b.first = true + + x := mat.NewVecDense(dim, loc.X) + grad := mat.NewVecDense(dim, loc.Gradient) + b.x.CloneFromVec(x) + b.grad.CloneFromVec(grad) + + b.y.Reset() + b.s.Reset() + b.tmp.Reset() + + if b.invHess == nil || cap(b.invHess.RawSymmetric().Data) < dim*dim { + b.invHess = mat.NewSymDense(dim, nil) + } else { + b.invHess = mat.NewSymDense(dim, b.invHess.RawSymmetric().Data[:dim*dim]) + } + // The values of the inverse Hessian are initialized in the first call to + // NextDirection. + + // Initial direction is just negative of the gradient because the Hessian + // is an identity matrix. + d := mat.NewVecDense(dim, dir) + d.ScaleVec(-1, grad) + return 1 / mat.Norm(d, 2) +} + +func (b *BFGS) NextDirection(loc *Location, dir []float64) (stepSize float64) { + dim := b.dim + if len(loc.X) != dim { + panic("bfgs: unexpected size mismatch") + } + if len(loc.Gradient) != dim { + panic("bfgs: unexpected size mismatch") + } + if len(dir) != dim { + panic("bfgs: unexpected size mismatch") + } + + x := mat.NewVecDense(dim, loc.X) + grad := mat.NewVecDense(dim, loc.Gradient) + + // s = x_{k+1} - x_{k} + b.s.SubVec(x, &b.x) + // y = g_{k+1} - g_{k} + b.y.SubVec(grad, &b.grad) + + sDotY := mat.Dot(&b.s, &b.y) + + if b.first { + // Rescale the initial Hessian. + // From: Nocedal, J., Wright, S.: Numerical Optimization (2nd ed). + // Springer (2006), page 143, eq. 6.20. + yDotY := mat.Dot(&b.y, &b.y) + scale := sDotY / yDotY + for i := 0; i < dim; i++ { + for j := i; j < dim; j++ { + if i == j { + b.invHess.SetSym(i, i, scale) + } else { + b.invHess.SetSym(i, j, 0) + } + } + } + b.first = false + } + + if math.Abs(sDotY) != 0 { + // Update the inverse Hessian according to the formula + // + // B_{k+1}^-1 = B_k^-1 + // + (s_kᵀ y_k + y_kᵀ B_k^-1 y_k) / (s_kᵀ y_k)^2 * (s_k s_kᵀ) + // - (B_k^-1 y_k s_kᵀ + s_k y_kᵀ B_k^-1) / (s_kᵀ y_k). + // + // Note that y_kᵀ B_k^-1 y_k is a scalar, and that the third term is a + // rank-two update where B_k^-1 y_k is one vector and s_k is the other. + yBy := mat.Inner(&b.y, b.invHess, &b.y) + b.tmp.MulVec(b.invHess, &b.y) + scale := (1 + yBy/sDotY) / sDotY + b.invHess.SymRankOne(b.invHess, scale, &b.s) + b.invHess.RankTwo(b.invHess, -1/sDotY, &b.tmp, &b.s) + } + + // Update the stored BFGS data. + b.x.CopyVec(x) + b.grad.CopyVec(grad) + + // New direction is stored in dir. 
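+	// That is, d = -B^{-1}∇f, the quasi-Newton step direction computed from
+	// the updated inverse Hessian estimate.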
+ d := mat.NewVecDense(dim, dir) + d.MulVec(b.invHess, grad) + d.ScaleVec(-1, d) + + return 1 +} + +func (*BFGS) needs() struct { + Gradient bool + Hessian bool +} { + return struct { + Gradient bool + Hessian bool + }{true, false} +} diff --git a/vendor/gonum.org/v1/gonum/optimize/bisection.go b/vendor/gonum.org/v1/gonum/optimize/bisection.go new file mode 100644 index 0000000000..b194a2090b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/optimize/bisection.go @@ -0,0 +1,146 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package optimize + +import "math" + +const defaultBisectionCurvature = 0.9 + +var _ Linesearcher = (*Bisection)(nil) + +// Bisection is a Linesearcher that uses a bisection to find a point that +// satisfies the strong Wolfe conditions with the given curvature factor and +// a decrease factor of zero. +type Bisection struct { + // CurvatureFactor is the constant factor in the curvature condition. + // Smaller values result in a more exact line search. + // A set value must be in the interval (0, 1), otherwise Init will panic. + // If it is zero, it will be defaulted to 0.9. + CurvatureFactor float64 + + minStep float64 + maxStep float64 + currStep float64 + + initF float64 + minF float64 + maxF float64 + lastF float64 + + initGrad float64 + + lastOp Operation +} + +func (b *Bisection) Init(f, g float64, step float64) Operation { + if step <= 0 { + panic("bisection: bad step size") + } + if g >= 0 { + panic("bisection: initial derivative is non-negative") + } + + if b.CurvatureFactor == 0 { + b.CurvatureFactor = defaultBisectionCurvature + } + if b.CurvatureFactor <= 0 || b.CurvatureFactor >= 1 { + panic("bisection: CurvatureFactor not between 0 and 1") + } + + b.minStep = 0 + b.maxStep = math.Inf(1) + b.currStep = step + + b.initF = f + b.minF = f + b.maxF = math.NaN() + + b.initGrad = g + + // Only evaluate the gradient when necessary. + b.lastOp = FuncEvaluation + return b.lastOp +} + +func (b *Bisection) Iterate(f, g float64) (Operation, float64, error) { + if b.lastOp != FuncEvaluation && b.lastOp != GradEvaluation { + panic("bisection: Init has not been called") + } + minF := b.initF + if b.maxF < minF { + minF = b.maxF + } + if b.minF < minF { + minF = b.minF + } + if b.lastOp == FuncEvaluation { + // See if the function value is good enough to make progress. If it is, + // evaluate the gradient. If not, set it to the upper bound if the bound + // has not yet been found, otherwise iterate toward the minimum location. + if f <= minF { + b.lastF = f + b.lastOp = GradEvaluation + return b.lastOp, b.currStep, nil + } + if math.IsInf(b.maxStep, 1) { + b.maxStep = b.currStep + b.maxF = f + return b.nextStep((b.minStep + b.maxStep) / 2) + } + if b.minF <= b.maxF { + b.maxStep = b.currStep + b.maxF = f + } else { + b.minStep = b.currStep + b.minF = f + } + return b.nextStep((b.minStep + b.maxStep) / 2) + } + f = b.lastF + // The function value was lower. Check if this location is sufficient to + // converge the linesearch, otherwise iterate. + if StrongWolfeConditionsMet(f, g, minF, b.initGrad, b.currStep, 0, b.CurvatureFactor) { + b.lastOp = MajorIteration + return b.lastOp, b.currStep, nil + } + if math.IsInf(b.maxStep, 1) { + // The function value is lower. If the gradient is positive, an upper bound + // of the minimum been found. If the gradient is negative, search farther + // in that direction. 
+ if g > 0 { + b.maxStep = b.currStep + b.maxF = f + return b.nextStep((b.minStep + b.maxStep) / 2) + } + b.minStep = b.currStep + b.minF = f + return b.nextStep(b.currStep * 2) + } + // The interval has been bounded, and we have found a new lowest value. Use + // the gradient to decide which direction. + if g < 0 { + b.minStep = b.currStep + b.minF = f + } else { + b.maxStep = b.currStep + b.maxF = f + } + return b.nextStep((b.minStep + b.maxStep) / 2) +} + +// nextStep checks if the new step is equal to the old step. +// This can happen if min and max are the same, or if the step size is infinity, +// both of which indicate the minimization must stop. If the steps are different, +// it sets the new step size and returns the evaluation type and the step. If the steps +// are the same, it returns an error. +func (b *Bisection) nextStep(step float64) (Operation, float64, error) { + if b.currStep == step { + b.lastOp = NoOperation + return b.lastOp, b.currStep, ErrLinesearcherFailure + } + b.currStep = step + b.lastOp = FuncEvaluation + return b.lastOp, b.currStep, nil +} diff --git a/vendor/gonum.org/v1/gonum/optimize/cg.go b/vendor/gonum.org/v1/gonum/optimize/cg.go new file mode 100644 index 0000000000..6474b03792 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/optimize/cg.go @@ -0,0 +1,368 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package optimize + +import ( + "math" + + "gonum.org/v1/gonum/floats" +) + +const ( + iterationRestartFactor = 6 + angleRestartThreshold = -0.9 +) + +var ( + _ Method = (*CG)(nil) + _ localMethod = (*CG)(nil) + _ NextDirectioner = (*CG)(nil) +) + +// CGVariant calculates the scaling parameter, β, used for updating the +// conjugate direction in the nonlinear conjugate gradient (CG) method. +type CGVariant interface { + // Init is called at the first iteration and provides a way to initialize + // any internal state. + Init(loc *Location) + // Beta returns the value of the scaling parameter that is computed + // according to the particular variant of the CG method. + Beta(grad, gradPrev, dirPrev []float64) float64 +} + +var ( + _ CGVariant = (*FletcherReeves)(nil) + _ CGVariant = (*PolakRibierePolyak)(nil) + _ CGVariant = (*HestenesStiefel)(nil) + _ CGVariant = (*DaiYuan)(nil) + _ CGVariant = (*HagerZhang)(nil) +) + +// CG implements the nonlinear conjugate gradient method for solving nonlinear +// unconstrained optimization problems. It is a line search method that +// generates the search directions d_k according to the formula +// +// d_{k+1} = -∇f_{k+1} + β_k*d_k, d_0 = -∇f_0. +// +// Variants of the conjugate gradient method differ in the choice of the +// parameter β_k. The conjugate gradient method usually requires fewer function +// evaluations than the gradient descent method and no matrix storage, but +// L-BFGS is usually more efficient. +// +// CG implements a restart strategy that takes the steepest descent direction +// (i.e., d_{k+1} = -∇f_{k+1}) whenever any of the following conditions holds: +// +// - A certain number of iterations has elapsed without a restart. This number +// is controllable via IterationRestartFactor and if equal to 0, it is set to +// a reasonable default based on the problem dimension. +// - The angle between the gradients at two consecutive iterations ∇f_k and +// ∇f_{k+1} is too large. +// - The direction d_{k+1} is not a descent direction. +// - β_k returned from CGVariant.Beta is equal to zero. 
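+//
+// As a minimal usage sketch (illustrative; it assumes the package's Minimize
+// entry point with default settings):
+//
+//	p := Problem{
+//		Func: func(x []float64) float64 { return x[0]*x[0] + x[1]*x[1] },
+//		Grad: func(grad, x []float64) {
+//			grad[0] = 2 * x[0]
+//			grad[1] = 2 * x[1]
+//		},
+//	}
+//	result, err := Minimize(p, []float64{1, 1}, nil, &CG{})
+//
+// after which, err permitting, result.X should be close to the origin.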
+// +// The line search for CG must yield step sizes that satisfy the strong Wolfe +// conditions at every iteration, otherwise the generated search direction +// might fail to be a descent direction. The line search should be more +// stringent compared with those for Newton-like methods, which can be achieved +// by setting the gradient constant in the strong Wolfe conditions to a small +// value. +// +// See also William Hager, Hongchao Zhang, A survey of nonlinear conjugate +// gradient methods. Pacific Journal of Optimization, 2 (2006), pp. 35-58, and +// references therein. +type CG struct { + // Linesearcher must satisfy the strong Wolfe conditions at every iteration. + // If Linesearcher == nil, an appropriate default is chosen. + Linesearcher Linesearcher + // Variant implements the particular CG formula for computing β_k. + // If Variant is nil, an appropriate default is chosen. + Variant CGVariant + // InitialStep estimates the initial line search step size, because the CG + // method does not generate well-scaled search directions. + // If InitialStep is nil, an appropriate default is chosen. + InitialStep StepSizer + + // IterationRestartFactor determines the frequency of restarts based on the + // problem dimension. The negative gradient direction is taken whenever + // ceil(IterationRestartFactor*(problem dimension)) iterations have elapsed + // without a restart. For medium and large-scale problems + // IterationRestartFactor should be set to 1, low-dimensional problems a + // larger value should be chosen. Note that if the ceil function returns 1, + // CG will be identical to gradient descent. + // If IterationRestartFactor is 0, it will be set to 6. + // CG will panic if IterationRestartFactor is negative. + IterationRestartFactor float64 + // AngleRestartThreshold sets the threshold angle for restart. The method + // is restarted if the cosine of the angle between two consecutive + // gradients is smaller than or equal to AngleRestartThreshold, that is, if + // ∇f_k·∇f_{k+1} / (|∇f_k| |∇f_{k+1}|) <= AngleRestartThreshold. + // A value of AngleRestartThreshold closer to -1 (successive gradients in + // exact opposite directions) will tend to reduce the number of restarts. + // If AngleRestartThreshold is 0, it will be set to -0.9. + // CG will panic if AngleRestartThreshold is not in the interval [-1, 0]. + AngleRestartThreshold float64 + // GradStopThreshold sets the threshold for stopping if the gradient norm + // gets too small. If GradStopThreshold is 0 it is defaulted to 1e-12, and + // if it is NaN the setting is not used. 
+ GradStopThreshold float64 + + ls *LinesearchMethod + + status Status + err error + + restartAfter int + iterFromRestart int + + dirPrev []float64 + gradPrev []float64 + gradPrevNorm float64 +} + +func (cg *CG) Status() (Status, error) { + return cg.status, cg.err +} + +func (*CG) Uses(has Available) (uses Available, err error) { + return has.gradient() +} + +func (cg *CG) Init(dim, tasks int) int { + cg.status = NotTerminated + cg.err = nil + return 1 +} + +func (cg *CG) Run(operation chan<- Task, result <-chan Task, tasks []Task) { + cg.status, cg.err = localOptimizer{}.run(cg, cg.GradStopThreshold, operation, result, tasks) + close(operation) +} + +func (cg *CG) initLocal(loc *Location) (Operation, error) { + if cg.IterationRestartFactor < 0 { + panic("cg: IterationRestartFactor is negative") + } + if cg.AngleRestartThreshold < -1 || cg.AngleRestartThreshold > 0 { + panic("cg: AngleRestartThreshold not in [-1, 0]") + } + + if cg.Linesearcher == nil { + cg.Linesearcher = &MoreThuente{CurvatureFactor: 0.1} + } + if cg.Variant == nil { + cg.Variant = &HestenesStiefel{} + } + if cg.InitialStep == nil { + cg.InitialStep = &FirstOrderStepSize{} + } + + if cg.IterationRestartFactor == 0 { + cg.IterationRestartFactor = iterationRestartFactor + } + if cg.AngleRestartThreshold == 0 { + cg.AngleRestartThreshold = angleRestartThreshold + } + + if cg.ls == nil { + cg.ls = &LinesearchMethod{} + } + cg.ls.Linesearcher = cg.Linesearcher + cg.ls.NextDirectioner = cg + + return cg.ls.Init(loc) +} + +func (cg *CG) iterateLocal(loc *Location) (Operation, error) { + return cg.ls.Iterate(loc) +} + +func (cg *CG) InitDirection(loc *Location, dir []float64) (stepSize float64) { + dim := len(loc.X) + + cg.restartAfter = int(math.Ceil(cg.IterationRestartFactor * float64(dim))) + cg.iterFromRestart = 0 + + // The initial direction is always the negative gradient. + copy(dir, loc.Gradient) + floats.Scale(-1, dir) + + cg.dirPrev = resize(cg.dirPrev, dim) + copy(cg.dirPrev, dir) + cg.gradPrev = resize(cg.gradPrev, dim) + copy(cg.gradPrev, loc.Gradient) + cg.gradPrevNorm = floats.Norm(loc.Gradient, 2) + + cg.Variant.Init(loc) + return cg.InitialStep.Init(loc, dir) +} + +func (cg *CG) NextDirection(loc *Location, dir []float64) (stepSize float64) { + copy(dir, loc.Gradient) + floats.Scale(-1, dir) + + cg.iterFromRestart++ + var restart bool + if cg.iterFromRestart == cg.restartAfter { + // Restart because too many iterations have been taken without a restart. + restart = true + } + + gDot := floats.Dot(loc.Gradient, cg.gradPrev) + gNorm := floats.Norm(loc.Gradient, 2) + if gDot <= cg.AngleRestartThreshold*gNorm*cg.gradPrevNorm { + // Restart because the angle between the last two gradients is too large. + restart = true + } + + // Compute the scaling factor β_k even when restarting, because cg.Variant + // may be keeping an inner state that needs to be updated at every iteration. + beta := cg.Variant.Beta(loc.Gradient, cg.gradPrev, cg.dirPrev) + if beta == 0 { + // β_k == 0 means that the steepest descent direction will be taken, so + // indicate that the method is in fact being restarted. + restart = true + } + if !restart { + // The method is not being restarted, so update the descent direction. + floats.AddScaled(dir, beta, cg.dirPrev) + if floats.Dot(loc.Gradient, dir) >= 0 { + // Restart because the new direction is not a descent direction. 
+ restart = true + copy(dir, loc.Gradient) + floats.Scale(-1, dir) + } + } + + // Get the initial line search step size from the StepSizer even if the + // method was restarted, because StepSizers need to see every iteration. + stepSize = cg.InitialStep.StepSize(loc, dir) + if restart { + // The method was restarted and since the steepest descent direction is + // not related to the previous direction, discard the estimated step + // size from cg.InitialStep and use step size of 1 instead. + stepSize = 1 + // Reset to 0 the counter of iterations taken since the last restart. + cg.iterFromRestart = 0 + } + + copy(cg.gradPrev, loc.Gradient) + copy(cg.dirPrev, dir) + cg.gradPrevNorm = gNorm + return stepSize +} + +func (*CG) needs() struct { + Gradient bool + Hessian bool +} { + return struct { + Gradient bool + Hessian bool + }{true, false} +} + +// FletcherReeves implements the Fletcher-Reeves variant of the CG method that +// computes the scaling parameter β_k according to the formula +// +// β_k = |∇f_{k+1}|^2 / |∇f_k|^2. +type FletcherReeves struct { + prevNorm float64 +} + +func (fr *FletcherReeves) Init(loc *Location) { + fr.prevNorm = floats.Norm(loc.Gradient, 2) +} + +func (fr *FletcherReeves) Beta(grad, _, _ []float64) (beta float64) { + norm := floats.Norm(grad, 2) + beta = (norm / fr.prevNorm) * (norm / fr.prevNorm) + fr.prevNorm = norm + return beta +} + +// PolakRibierePolyak implements the Polak-Ribiere-Polyak variant of the CG +// method that computes the scaling parameter β_k according to the formula +// +// β_k = max(0, ∇f_{k+1}·y_k / |∇f_k|^2), +// +// where y_k = ∇f_{k+1} - ∇f_k. +type PolakRibierePolyak struct { + prevNorm float64 +} + +func (pr *PolakRibierePolyak) Init(loc *Location) { + pr.prevNorm = floats.Norm(loc.Gradient, 2) +} + +func (pr *PolakRibierePolyak) Beta(grad, gradPrev, _ []float64) (beta float64) { + norm := floats.Norm(grad, 2) + dot := floats.Dot(grad, gradPrev) + beta = (norm*norm - dot) / (pr.prevNorm * pr.prevNorm) + pr.prevNorm = norm + return math.Max(0, beta) +} + +// HestenesStiefel implements the Hestenes-Stiefel variant of the CG method +// that computes the scaling parameter β_k according to the formula +// +// β_k = max(0, ∇f_{k+1}·y_k / d_k·y_k), +// +// where y_k = ∇f_{k+1} - ∇f_k. +type HestenesStiefel struct { + y []float64 +} + +func (hs *HestenesStiefel) Init(loc *Location) { + hs.y = resize(hs.y, len(loc.Gradient)) +} + +func (hs *HestenesStiefel) Beta(grad, gradPrev, dirPrev []float64) (beta float64) { + floats.SubTo(hs.y, grad, gradPrev) + beta = floats.Dot(grad, hs.y) / floats.Dot(dirPrev, hs.y) + return math.Max(0, beta) +} + +// DaiYuan implements the Dai-Yuan variant of the CG method that computes the +// scaling parameter β_k according to the formula +// +// β_k = |∇f_{k+1}|^2 / d_k·y_k, +// +// where y_k = ∇f_{k+1} - ∇f_k. +type DaiYuan struct { + y []float64 +} + +func (dy *DaiYuan) Init(loc *Location) { + dy.y = resize(dy.y, len(loc.Gradient)) +} + +func (dy *DaiYuan) Beta(grad, gradPrev, dirPrev []float64) (beta float64) { + floats.SubTo(dy.y, grad, gradPrev) + norm := floats.Norm(grad, 2) + return norm * norm / floats.Dot(dirPrev, dy.y) +} + +// HagerZhang implements the Hager-Zhang variant of the CG method that computes the +// scaling parameter β_k according to the formula +// +// β_k = (y_k - 2 d_k |y_k|^2/(d_k·y_k))·∇f_{k+1} / (d_k·y_k), +// +// where y_k = ∇f_{k+1} - ∇f_k. 
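+//
+// A minimal variant-selection sketch (illustrative only; it assumes a
+// Problem p with Func and Grad set and a starting point x0):
+//
+// method := &CG{Variant: &HagerZhang{}}
+// result, err := Minimize(p, x0, nil, method)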
+type HagerZhang struct {
+ y []float64
+}
+
+func (hz *HagerZhang) Init(loc *Location) {
+ hz.y = resize(hz.y, len(loc.Gradient))
+}
+
+func (hz *HagerZhang) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
+ floats.SubTo(hz.y, grad, gradPrev)
+ dirDotY := floats.Dot(dirPrev, hz.y)
+ gDotY := floats.Dot(grad, hz.y)
+ gDotDir := floats.Dot(grad, dirPrev)
+ yNorm := floats.Norm(hz.y, 2)
+ return (gDotY - 2*gDotDir*yNorm*yNorm/dirDotY) / dirDotY
+}
diff --git a/vendor/gonum.org/v1/gonum/optimize/cmaes.go b/vendor/gonum.org/v1/gonum/optimize/cmaes.go
new file mode 100644
index 0000000000..f635d1f000
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/optimize/cmaes.go
@@ -0,0 +1,468 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package optimize
+
+import (
+ "math"
+ "math/rand/v2"
+ "sort"
+
+ "gonum.org/v1/gonum/floats"
+ "gonum.org/v1/gonum/mat"
+ "gonum.org/v1/gonum/stat/distmv"
+)
+
+var _ Method = (*CmaEsChol)(nil)
+
+// TODO(btracey): If we ever implement the traditional CMA-ES algorithm, provide
+// the base explanation there, and modify this description to just
+// describe the differences.
+
+// CmaEsChol implements the covariance matrix adaptation evolution strategy (CMA-ES)
+// based on the Cholesky decomposition. The full algorithm is described in
+//
+// Krause, Oswin, Dídac Rodríguez Arbonès, and Christian Igel. "CMA-ES with
+// optimal covariance update and storage complexity." Advances in Neural
+// Information Processing Systems. 2016.
+// https://papers.nips.cc/paper/6457-cma-es-with-optimal-covariance-update-and-storage-complexity.pdf
+//
+// CMA-ES is a global optimization method that progressively adapts a population
+// of samples. CMA-ES combines techniques from local optimization with global
+// optimization. Specifically, the CMA-ES algorithm uses an initial multivariate
+// normal distribution to generate a population of input locations. The input locations
+// with the lowest function values are used to update the parameters of the normal
+// distribution, a new set of input locations is generated, and this procedure
+// is iterated until convergence. The initial sampling distribution will have
+// a mean specified by the initial x location, and a covariance specified by
+// the InitCholesky field.
+//
+// As the normal distribution is progressively updated according to the best samples,
+// the mean of the distribution may be updated in a gradient-descent-like
+// fashion, followed by a shrinking covariance.
+// It is recommended that the algorithm be run multiple times (with different
+// initial locations) to have a better chance of finding the global minimum.
+//
+// The CMA-ES-Chol algorithm differs from the standard CMA-ES algorithm in that
+// it directly updates the Cholesky decomposition of the normal distribution.
+// This changes the runtime from O(dimension^3) to O(dimension^2*population).
+// The evolution of the multi-variate normal will be similar to the baseline
+// CMA-ES algorithm, but the covariance update equation is not identical.
+//
+// For more information about the CMA-ES algorithm, see
+//
+// https://en.wikipedia.org/wiki/CMA-ES
+// https://arxiv.org/pdf/1604.00772.pdf
+type CmaEsChol struct {
+ // InitStepSize sets the initial size of the covariance matrix adaptation.
+ // If InitStepSize is 0, a default value of 0.3 is used. InitStepSize cannot
+ // be negative, or CmaEsChol will panic.
+ InitStepSize float64
+ // Population sets the population size for the algorithm. If Population is
+ // 0, a default value of 4 + math.Floor(3*math.Log(float64(dim))) is used.
+ // Population cannot be negative or CmaEsChol will panic.
+ Population int
+ // InitCholesky specifies the Cholesky decomposition of the covariance
+ // matrix for the initial sampling distribution. If InitCholesky is nil,
+ // a default value of I is used. If it is non-nil, then it must have
+ // InitCholesky.SymmetricDim() equal to the problem dimension.
+ InitCholesky *mat.Cholesky
+ // StopLogDet sets the threshold for stopping the optimization if the
+ // distribution becomes too peaked. The log determinant is a measure of the
+ // (log) "volume" of the normal distribution, and when it is too small
+ // the samples are almost the same. If the log determinant of the covariance
+ // matrix becomes less than StopLogDet, the optimization run is concluded.
+ // If StopLogDet is 0, a default value of dim*log(1e-16) is used.
+ // If StopLogDet is NaN, the stopping criterion is not used, though
+ // this can cause numeric instabilities in the algorithm.
+ StopLogDet float64
+ // ForgetBest, when true, does not track the best overall function value found,
+ // instead returning the new best sample in each iteration. If ForgetBest
+ // is false, then the minimum value returned will be the lowest across all
+ // iterations, regardless of when that sample was generated.
+ ForgetBest bool
+ // Src allows a random number generator to be supplied for generating samples.
+ // If Src is nil the default source of the math/rand/v2 package is used.
+ Src rand.Source
+
+ // Fixed algorithm parameters.
+ dim int
+ pop int
+ weights []float64
+ muEff float64
+ cc, cs, c1, cmu, ds float64
+ eChi float64
+
+ // Function data.
+ xs *mat.Dense
+ fs []float64
+
+ // Adaptive algorithm parameters.
+ invSigma float64 // inverse of the sigma parameter
+ pc, ps []float64
+ mean []float64
+ chol mat.Cholesky
+
+ // Overall best.
+ bestX []float64
+ bestF float64
+
+ // Synchronization.
+ sentIdx int
+ receivedIdx int
+ operation chan<- Task
+ updateErr error
+}
+
+var (
+ _ Statuser = (*CmaEsChol)(nil)
+ _ Method = (*CmaEsChol)(nil)
+)
+
+func (cma *CmaEsChol) methodConverged() Status {
+ sd := cma.StopLogDet
+ switch {
+ case math.IsNaN(sd):
+ return NotTerminated
+ case sd == 0:
+ sd = float64(cma.dim) * -36.8413614879 // ln(1e-16)
+ }
+ if cma.chol.LogDet() < sd {
+ return MethodConverge
+ }
+ return NotTerminated
+}
+
+// Status returns the status of the method.
+func (cma *CmaEsChol) Status() (Status, error) {
+ if cma.updateErr != nil {
+ return Failure, cma.updateErr
+ }
+ return cma.methodConverged(), nil
+}
+
+func (*CmaEsChol) Uses(has Available) (uses Available, err error) {
+ return has.function()
+}
+
+func (cma *CmaEsChol) Init(dim, tasks int) int {
+ if dim <= 0 {
+ panic(nonpositiveDimension)
+ }
+ if tasks < 0 {
+ panic(negativeTasks)
+ }
+
+ // Set fixed algorithm parameters.
+ // Parameter values are from https://arxiv.org/pdf/1604.00772.pdf .
+ cma.dim = dim
+ cma.pop = cma.Population
+ n := float64(dim)
+ if cma.pop == 0 {
+ cma.pop = 4 + int(3*math.Log(n)) // Note the implicit floor.
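+ // For example, dim = 10 gives a population of 4 + floor(3*ln(10)) = 10.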
+ } else if cma.pop < 0 { + panic("cma-es-chol: negative population size") + } + mu := cma.pop / 2 + cma.weights = resize(cma.weights, mu) + for i := range cma.weights { + v := math.Log(float64(mu)+0.5) - math.Log(float64(i)+1) + cma.weights[i] = v + } + floats.Scale(1/floats.Sum(cma.weights), cma.weights) + cma.muEff = 0 + for _, v := range cma.weights { + cma.muEff += v * v + } + cma.muEff = 1 / cma.muEff + + cma.cc = (4 + cma.muEff/n) / (n + 4 + 2*cma.muEff/n) + cma.cs = (cma.muEff + 2) / (n + cma.muEff + 5) + cma.c1 = 2 / ((n+1.3)*(n+1.3) + cma.muEff) + cma.cmu = math.Min(1-cma.c1, 2*(cma.muEff-2+1/cma.muEff)/((n+2)*(n+2)+cma.muEff)) + cma.ds = 1 + 2*math.Max(0, math.Sqrt((cma.muEff-1)/(n+1))-1) + cma.cs + // E[chi] is taken from https://en.wikipedia.org/wiki/CMA-ES (there + // listed as E[||N(0,1)||]). + cma.eChi = math.Sqrt(n) * (1 - 1.0/(4*n) + 1/(21*n*n)) + + // Allocate memory for function data. + cma.xs = mat.NewDense(cma.pop, dim, nil) + cma.fs = resize(cma.fs, cma.pop) + + // Allocate and initialize adaptive parameters. + cma.invSigma = 1 / cma.InitStepSize + if cma.InitStepSize == 0 { + cma.invSigma = 10.0 / 3 + } else if cma.InitStepSize < 0 { + panic("cma-es-chol: negative initial step size") + } + cma.pc = resize(cma.pc, dim) + for i := range cma.pc { + cma.pc[i] = 0 + } + cma.ps = resize(cma.ps, dim) + for i := range cma.ps { + cma.ps[i] = 0 + } + cma.mean = resize(cma.mean, dim) // mean location initialized at the start of Run + + if cma.InitCholesky != nil { + if cma.InitCholesky.SymmetricDim() != dim { + panic("cma-es-chol: incorrect InitCholesky size") + } + cma.chol.Clone(cma.InitCholesky) + } else { + // Set the initial Cholesky to I. + b := mat.NewDiagDense(dim, nil) + for i := 0; i < dim; i++ { + b.SetDiag(i, 1) + } + var chol mat.Cholesky + ok := chol.Factorize(b) + if !ok { + panic("cma-es-chol: bad cholesky. shouldn't happen") + } + cma.chol = chol + } + + cma.bestX = resize(cma.bestX, dim) + cma.bestF = math.Inf(1) + + cma.sentIdx = 0 + cma.receivedIdx = 0 + cma.operation = nil + cma.updateErr = nil + t := min(tasks, cma.pop) + return t +} + +func (cma *CmaEsChol) sendInitTasks(tasks []Task) { + for i, task := range tasks { + cma.sendTask(i, task) + } + cma.sentIdx = len(tasks) +} + +// sendTask generates a sample and sends the task. It does not update the cma index. +func (cma *CmaEsChol) sendTask(idx int, task Task) { + task.ID = idx + task.Op = FuncEvaluation + distmv.NormalRand(cma.xs.RawRowView(idx), cma.mean, &cma.chol, cma.Src) + copy(task.X, cma.xs.RawRowView(idx)) + cma.operation <- task +} + +// bestIdx returns the best index in the functions. Returns -1 if all values +// are NaN. +func (cma *CmaEsChol) bestIdx() int { + best := -1 + bestVal := math.Inf(1) + for i, v := range cma.fs { + if math.IsNaN(v) { + continue + } + // Use equality in case somewhere evaluates to +inf. + if v <= bestVal { + best = i + bestVal = v + } + } + return best +} + +// findBestAndUpdateTask finds the best task in the current list, updates the +// new best overall, and then stores the best location into task. +func (cma *CmaEsChol) findBestAndUpdateTask(task Task) Task { + // Find and update the best location. + // Don't use floats because there may be NaN values. 
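+ // bestIdx skips NaN entries and returns -1 if every sample is NaN,
+ // in which case bestF below remains NaN.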
+ best := cma.bestIdx()
+ bestF := math.NaN()
+ bestX := cma.xs.RawRowView(0)
+ if best != -1 {
+ bestF = cma.fs[best]
+ bestX = cma.xs.RawRowView(best)
+ }
+ if cma.ForgetBest {
+ task.F = bestF
+ copy(task.X, bestX)
+ } else {
+ if bestF < cma.bestF {
+ cma.bestF = bestF
+ copy(cma.bestX, bestX)
+ }
+ task.F = cma.bestF
+ copy(task.X, cma.bestX)
+ }
+ return task
+}
+
+func (cma *CmaEsChol) Run(operations chan<- Task, results <-chan Task, tasks []Task) {
+ copy(cma.mean, tasks[0].X)
+ cma.operation = operations
+ // Send the initial tasks. We know there are at most as many tasks as elements
+ // of the population.
+ cma.sendInitTasks(tasks)
+
+Loop:
+ for {
+ result := <-results
+ switch result.Op {
+ default:
+ panic("unknown operation")
+ case PostIteration:
+ break Loop
+ case MajorIteration:
+ // The last thing we did was update all of the tasks and send the
+ // major iteration. Now we can send a group of tasks again.
+ cma.sendInitTasks(tasks)
+ case FuncEvaluation:
+ cma.receivedIdx++
+ cma.fs[result.ID] = result.F
+ switch {
+ case cma.sentIdx < cma.pop:
+ // There are still tasks to evaluate. Send the next.
+ cma.sendTask(cma.sentIdx, result)
+ cma.sentIdx++
+ case cma.receivedIdx < cma.pop:
+ // All the tasks have been sent, but not all of them have been received.
+ // Need to wait until all are back.
+ continue Loop
+ default:
+ // All of the evaluations have been received.
+ if cma.receivedIdx != cma.pop {
+ panic("bad logic")
+ }
+ cma.receivedIdx = 0
+ cma.sentIdx = 0
+
+ task := cma.findBestAndUpdateTask(result)
+ // Update the parameters and send a MajorIteration or a convergence.
+ err := cma.update()
+ // Kill the existing data.
+ for i := range cma.fs {
+ cma.fs[i] = math.NaN()
+ cma.xs.Set(i, 0, math.NaN())
+ }
+ switch {
+ case err != nil:
+ cma.updateErr = err
+ task.Op = MethodDone
+ case cma.methodConverged() != NotTerminated:
+ task.Op = MethodDone
+ default:
+ task.Op = MajorIteration
+ task.ID = -1
+ }
+ operations <- task
+ }
+ }
+ }
+
+ // Been told to stop. Clean up: drain the remaining results and keep
+ // track of the best of the tasks evaluated so far.
+ for task := range results {
+ switch task.Op {
+ case MajorIteration:
+ case FuncEvaluation:
+ cma.fs[task.ID] = task.F
+ default:
+ panic("unknown operation")
+ }
+ }
+ // Send the new best value if the evaluation is better than any we've
+ // found so far. Keep this separate from findBestAndUpdateTask so that
+ // we only send an iteration if we find a better location.
+ if !cma.ForgetBest {
+ best := cma.bestIdx()
+ if best != -1 && cma.fs[best] < cma.bestF {
+ task := tasks[0]
+ task.F = cma.fs[best]
+ copy(task.X, cma.xs.RawRowView(best))
+ task.Op = MajorIteration
+ task.ID = -1
+ operations <- task
+ }
+ }
+ close(operations)
+}
+
+// update computes the new parameters (mean, cholesky, etc.). Does not update
+// any of the synchronization parameters (sentIdx, receivedIdx).
+func (cma *CmaEsChol) update() error {
+ // Sort the function values to find the elite samples.
+ ftmp := make([]float64, cma.pop)
+ copy(ftmp, cma.fs)
+ indexes := make([]int, cma.pop)
+ for i := range indexes {
+ indexes[i] = i
+ }
+ sort.Sort(bestSorter{F: ftmp, Idx: indexes})
+
+ meanOld := make([]float64, len(cma.mean))
+ copy(meanOld, cma.mean)
+
+ // m_{t+1} = \sum_{i=1}^mu w_i x_i
+ for i := range cma.mean {
+ cma.mean[i] = 0
+ }
+ for i, w := range cma.weights {
+ idx := indexes[i] // index of the i-th best (elite) sample.
+ floats.AddScaled(cma.mean, w, cma.xs.RawRowView(idx))
+ }
+ meanDiff := make([]float64, len(cma.mean))
+ floats.SubTo(meanDiff, cma.mean, meanOld)
+
+ // p_{c,t+1} = (1-c_c) p_{c,t} + \sqrt(c_c*(2-c_c)*mueff) (m_{t+1}-m_t)/sigma_t
+ floats.Scale(1-cma.cc, cma.pc)
+ scaleC := math.Sqrt(cma.cc*(2-cma.cc)*cma.muEff) * cma.invSigma
+ floats.AddScaled(cma.pc, scaleC, meanDiff)
+
+ // p_{sigma, t+1} = (1-c_sigma) p_{sigma,t} + \sqrt(c_s*(2-c_s)*mueff) A_t^-1 (m_{t+1}-m_t)/sigma_t
+ floats.Scale(1-cma.cs, cma.ps)
+ // First compute A_t^-1 (m_{t+1}-m_t), then add the scaled vector.
+ tmp := make([]float64, cma.dim)
+ tmpVec := mat.NewVecDense(cma.dim, tmp)
+ diffVec := mat.NewVecDense(cma.dim, meanDiff)
+ err := tmpVec.SolveVec(cma.chol.RawU().T(), diffVec)
+ if err != nil {
+ return err
+ }
+ scaleS := math.Sqrt(cma.cs*(2-cma.cs)*cma.muEff) * cma.invSigma
+ floats.AddScaled(cma.ps, scaleS, tmp)
+
+ // Compute the update to A.
+ scaleChol := 1 - cma.c1 - cma.cmu
+ if scaleChol == 0 {
+ scaleChol = math.SmallestNonzeroFloat64 // enough to kill the old data, but still non-zero.
+ }
+ cma.chol.Scale(scaleChol, &cma.chol)
+ cma.chol.SymRankOne(&cma.chol, cma.c1, mat.NewVecDense(cma.dim, cma.pc))
+ for i, w := range cma.weights {
+ idx := indexes[i]
+ floats.SubTo(tmp, cma.xs.RawRowView(idx), meanOld)
+ cma.chol.SymRankOne(&cma.chol, cma.cmu*w*cma.invSigma, tmpVec)
+ }
+
+ // sigma_{t+1} = sigma_t * exp(c_sigma/d_sigma * (|p_{sigma,t+1}|/E[chi] - 1))
+ normPs := floats.Norm(cma.ps, 2)
+ cma.invSigma /= math.Exp(cma.cs / cma.ds * (normPs/cma.eChi - 1))
+ return nil
+}
+
+type bestSorter struct {
+ F []float64
+ Idx []int
+}
+
+func (b bestSorter) Len() int {
+ return len(b.F)
+}
+func (b bestSorter) Less(i, j int) bool {
+ return b.F[i] < b.F[j]
+}
+func (b bestSorter) Swap(i, j int) {
+ b.F[i], b.F[j] = b.F[j], b.F[i]
+ b.Idx[i], b.Idx[j] = b.Idx[j], b.Idx[i]
+}
diff --git a/vendor/gonum.org/v1/gonum/optimize/doc.go b/vendor/gonum.org/v1/gonum/optimize/doc.go
new file mode 100644
index 0000000000..667e8f94e9
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/optimize/doc.go
@@ -0,0 +1,6 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package optimize implements algorithms for finding the optimum value of functions.
+package optimize // import "gonum.org/v1/gonum/optimize"
diff --git a/vendor/gonum.org/v1/gonum/optimize/errors.go b/vendor/gonum.org/v1/gonum/optimize/errors.go
new file mode 100644
index 0000000000..7d6f8aee02
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/optimize/errors.go
@@ -0,0 +1,78 @@
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package optimize
+
+import (
+ "errors"
+ "fmt"
+ "math"
+)
+
+var (
+ // ErrZeroDimensional signifies an optimization was called with an input of length 0.
+ ErrZeroDimensional = errors.New("optimize: zero dimensional input")
+
+ // ErrLinesearcherFailure signifies that a Linesearcher has iterated too
+ // many times. This may occur if the gradient tolerance is set too low.
+ ErrLinesearcherFailure = errors.New("linesearch: failed to converge")
+
+ // ErrNonDescentDirection signifies that LinesearchMethod has received a
+ // search direction from a NextDirectioner in which the function is not
+ // decreasing.
+ ErrNonDescentDirection = errors.New("linesearch: non-descent search direction") + + // ErrNoProgress signifies that LinesearchMethod cannot make further + // progress because there is no change in location after Linesearcher step + // due to floating-point arithmetic. + ErrNoProgress = errors.New("linesearch: no change in location after Linesearcher step") + + // ErrLinesearcherBound signifies that a Linesearcher reached a step that + // lies out of allowed bounds. + ErrLinesearcherBound = errors.New("linesearch: step out of bounds") + + // ErrMissingGrad signifies that a Method requires a Gradient function that + // is not supplied by Problem. + ErrMissingGrad = errors.New("optimize: problem does not provide needed Grad function") + + // ErrMissingHess signifies that a Method requires a Hessian function that + // is not supplied by Problem. + ErrMissingHess = errors.New("optimize: problem does not provide needed Hess function") +) + +// ErrFunc is returned when an initial function value is invalid. The error +// state may be either +Inf or NaN. ErrFunc satisfies the error interface. +type ErrFunc float64 + +func (err ErrFunc) Error() string { + switch { + case math.IsInf(float64(err), 1): + return "optimize: initial function value is infinite" + case math.IsNaN(float64(err)): + return "optimize: initial function value is NaN" + default: + panic("optimize: bad ErrFunc") + } +} + +// ErrGrad is returned when an initial gradient is invalid. The error gradient +// may be either ±Inf or NaN. ErrGrad satisfies the error interface. +type ErrGrad struct { + Grad float64 // Grad is the invalid gradient value. + Index int // Index is the position at which the invalid gradient was found. +} + +func (err ErrGrad) Error() string { + switch { + case math.IsInf(err.Grad, 0): + return fmt.Sprintf("optimize: initial gradient is infinite at position %d", err.Index) + case math.IsNaN(err.Grad): + return fmt.Sprintf("optimize: initial gradient is NaN at position %d", err.Index) + default: + panic("optimize: bad ErrGrad") + } +} + +// List of shared panic strings +const badProblem = "optimize: objective function is undefined" diff --git a/vendor/gonum.org/v1/gonum/optimize/functionconvergence.go b/vendor/gonum.org/v1/gonum/optimize/functionconvergence.go new file mode 100644 index 0000000000..d5b12c307d --- /dev/null +++ b/vendor/gonum.org/v1/gonum/optimize/functionconvergence.go @@ -0,0 +1,85 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package optimize + +import ( + "math" +) + +// Converger returns the convergence of the optimization based on +// locations found during optimization. Converger must not modify the value of +// the provided Location in any of the methods. +type Converger interface { + Init(dim int) + Converged(loc *Location) Status +} + +var ( + _ Converger = NeverTerminate{} + _ Converger = (*FunctionConverge)(nil) +) + +// NeverTerminate implements Converger, always reporting NotTerminated. +type NeverTerminate struct{} + +func (NeverTerminate) Init(dim int) {} + +func (NeverTerminate) Converged(loc *Location) Status { + return NotTerminated +} + +// FunctionConverge tests for insufficient improvement in the optimum value +// over the last iterations. A FunctionConvergence status is returned if +// there is no significant decrease for FunctionConverge.Iterations. 
A +// significant decrease is considered if +// +// f < f_best +// +// and +// +// f_best - f > FunctionConverge.Relative * maxabs(f, f_best) + FunctionConverge.Absolute +// +// If the decrease is significant, then the iteration counter is reset and +// f_best is updated. +// +// If FunctionConverge.Iterations == 0, it has no effect. +type FunctionConverge struct { + Absolute float64 + Relative float64 + Iterations int + + first bool + best float64 + iter int +} + +func (fc *FunctionConverge) Init(dim int) { + fc.first = true + fc.best = 0 + fc.iter = 0 +} + +func (fc *FunctionConverge) Converged(l *Location) Status { + f := l.F + if fc.first { + fc.best = f + fc.first = false + return NotTerminated + } + if fc.Iterations == 0 { + return NotTerminated + } + maxAbs := math.Max(math.Abs(f), math.Abs(fc.best)) + if f < fc.best && fc.best-f > fc.Relative*maxAbs+fc.Absolute { + fc.best = f + fc.iter = 0 + return NotTerminated + } + fc.iter++ + if fc.iter < fc.Iterations { + return NotTerminated + } + return FunctionConvergence +} diff --git a/vendor/gonum.org/v1/gonum/optimize/gradientdescent.go b/vendor/gonum.org/v1/gonum/optimize/gradientdescent.go new file mode 100644 index 0000000000..d11896594a --- /dev/null +++ b/vendor/gonum.org/v1/gonum/optimize/gradientdescent.go @@ -0,0 +1,95 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package optimize + +import "gonum.org/v1/gonum/floats" + +var ( + _ Method = (*GradientDescent)(nil) + _ localMethod = (*GradientDescent)(nil) + _ NextDirectioner = (*GradientDescent)(nil) +) + +// GradientDescent implements the steepest descent optimization method that +// performs successive steps along the direction of the negative gradient. +type GradientDescent struct { + // Linesearcher selects suitable steps along the descent direction. + // If Linesearcher is nil, a reasonable default will be chosen. + Linesearcher Linesearcher + // StepSizer determines the initial step size along each direction. + // If StepSizer is nil, a reasonable default will be chosen. + StepSizer StepSizer + // GradStopThreshold sets the threshold for stopping if the gradient norm + // gets too small. If GradStopThreshold is 0 it is defaulted to 1e-12, and + // if it is NaN the setting is not used. 
+ GradStopThreshold float64 + + ls *LinesearchMethod + + status Status + err error +} + +func (g *GradientDescent) Status() (Status, error) { + return g.status, g.err +} + +func (*GradientDescent) Uses(has Available) (uses Available, err error) { + return has.gradient() +} + +func (g *GradientDescent) Init(dim, tasks int) int { + g.status = NotTerminated + g.err = nil + return 1 +} + +func (g *GradientDescent) Run(operation chan<- Task, result <-chan Task, tasks []Task) { + g.status, g.err = localOptimizer{}.run(g, g.GradStopThreshold, operation, result, tasks) + close(operation) +} + +func (g *GradientDescent) initLocal(loc *Location) (Operation, error) { + if g.Linesearcher == nil { + g.Linesearcher = &Backtracking{} + } + if g.StepSizer == nil { + g.StepSizer = &QuadraticStepSize{} + } + + if g.ls == nil { + g.ls = &LinesearchMethod{} + } + g.ls.Linesearcher = g.Linesearcher + g.ls.NextDirectioner = g + + return g.ls.Init(loc) +} + +func (g *GradientDescent) iterateLocal(loc *Location) (Operation, error) { + return g.ls.Iterate(loc) +} + +func (g *GradientDescent) InitDirection(loc *Location, dir []float64) (stepSize float64) { + copy(dir, loc.Gradient) + floats.Scale(-1, dir) + return g.StepSizer.Init(loc, dir) +} + +func (g *GradientDescent) NextDirection(loc *Location, dir []float64) (stepSize float64) { + copy(dir, loc.Gradient) + floats.Scale(-1, dir) + return g.StepSizer.StepSize(loc, dir) +} + +func (*GradientDescent) needs() struct { + Gradient bool + Hessian bool +} { + return struct { + Gradient bool + Hessian bool + }{true, false} +} diff --git a/vendor/gonum.org/v1/gonum/optimize/guessandcheck.go b/vendor/gonum.org/v1/gonum/optimize/guessandcheck.go new file mode 100644 index 0000000000..35d2792681 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/optimize/guessandcheck.go @@ -0,0 +1,92 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package optimize + +import ( + "math" + + "gonum.org/v1/gonum/stat/distmv" +) + +var _ Method = (*GuessAndCheck)(nil) + +// GuessAndCheck is a global optimizer that evaluates the function at random +// locations. Not a good optimizer, but useful for comparison and debugging. +type GuessAndCheck struct { + Rander distmv.Rander + + bestF float64 + bestX []float64 +} + +func (*GuessAndCheck) Uses(has Available) (uses Available, err error) { + return has.function() +} + +func (g *GuessAndCheck) Init(dim, tasks int) int { + if dim <= 0 { + panic(nonpositiveDimension) + } + if tasks < 0 { + panic(negativeTasks) + } + g.bestF = math.Inf(1) + g.bestX = resize(g.bestX, dim) + return tasks +} + +func (g *GuessAndCheck) sendNewLoc(operation chan<- Task, task Task) { + g.Rander.Rand(task.X) + task.Op = FuncEvaluation + operation <- task +} + +func (g *GuessAndCheck) updateMajor(operation chan<- Task, task Task) { + // Update the best value seen so far, and send a MajorIteration. + if task.F < g.bestF { + g.bestF = task.F + copy(g.bestX, task.X) + } else { + task.F = g.bestF + copy(task.X, g.bestX) + } + task.Op = MajorIteration + operation <- task +} + +func (g *GuessAndCheck) Run(operation chan<- Task, result <-chan Task, tasks []Task) { + // Send initial tasks to evaluate + for _, task := range tasks { + g.sendNewLoc(operation, task) + } + + // Read from the channel until PostIteration is sent. 
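+ // Each FuncEvaluation result is folded into the running best by
+ // updateMajor, which replies with a MajorIteration; each MajorIteration
+ // result triggers a fresh random sample via sendNewLoc.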
+Loop: + for { + task := <-result + switch task.Op { + default: + panic("unknown operation") + case PostIteration: + break Loop + case MajorIteration: + g.sendNewLoc(operation, task) + case FuncEvaluation: + g.updateMajor(operation, task) + } + } + + // PostIteration was sent. Update the best new values. + for task := range result { + switch task.Op { + default: + panic("unknown operation") + case MajorIteration: + case FuncEvaluation: + g.updateMajor(operation, task) + } + } + close(operation) +} diff --git a/vendor/gonum.org/v1/gonum/optimize/interfaces.go b/vendor/gonum.org/v1/gonum/optimize/interfaces.go new file mode 100644 index 0000000000..09d395a2f8 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/optimize/interfaces.go @@ -0,0 +1,132 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package optimize + +// A localMethod can optimize an objective function. +// +// It uses a reverse-communication interface between the optimization method +// and the caller. Method acts as a client that asks the caller to perform +// needed operations via Operation returned from Init and Iterate methods. +// This provides independence of the optimization algorithm on user-supplied +// data and their representation, and enables automation of common operations +// like checking for (various types of) convergence and maintaining statistics. +// +// A Method can command an Evaluation, a MajorIteration or NoOperation operations. +// +// An evaluation operation is one or more of the Evaluation operations +// (FuncEvaluation, GradEvaluation, etc.) which can be combined with +// the bitwise or operator. In an evaluation operation, the requested fields of +// Problem will be evaluated at the point specified in Location.X. +// The corresponding fields of Location will be filled with the results that +// can be retrieved upon the next call to Iterate. The Method interface +// requires that entries of Location are not modified aside from the commanded +// evaluations. Thus, the type implementing Method may use multiple Operations +// to set the Location fields at a particular x value. +// +// Instead of an Evaluation, a Method may declare MajorIteration. In +// a MajorIteration, the values in the fields of Location are treated as +// a potential optimizer. The convergence of the optimization routine +// (GradientThreshold, etc.) is checked at this new best point. In +// a MajorIteration, the fields of Location must be valid and consistent. +// +// A Method must not return InitIteration and PostIteration operations. These are +// reserved for the clients to be passed to Recorders. A Method must also not +// combine the Evaluation operations with the Iteration operations. +type localMethod interface { + // Init initializes the method based on the initial data in loc, updates it + // and returns the first operation to be carried out by the caller. + // The initial location must be valid as specified by Needs. + initLocal(loc *Location) (Operation, error) + + // Iterate retrieves data from loc, performs one iteration of the method, + // updates loc and returns the next operation. + iterateLocal(loc *Location) (Operation, error) + + needser +} + +type needser interface { + // needs specifies information about the objective function needed by the + // optimizer beyond just the function value. 
The information is used + // internally for initialization and must match evaluation types returned + // by Init and Iterate during the optimization process. + needs() struct { + Gradient bool + Hessian bool + } +} + +// Statuser can report the status and any error. It is intended for methods as +// an additional error reporting mechanism apart from the errors returned from +// Init and Iterate. +type Statuser interface { + Status() (Status, error) +} + +// Linesearcher is a type that can perform a line search. It tries to find an +// (approximate) minimum of the objective function along the search direction +// dir_k starting at the most recent location x_k, i.e., it tries to minimize +// the function +// +// φ(step) := f(x_k + step * dir_k) where step > 0. +// +// Typically, a Linesearcher will be used in conjunction with LinesearchMethod +// for performing gradient-based optimization through sequential line searches. +type Linesearcher interface { + // Init initializes the Linesearcher and a new line search. Value and + // derivative contain φ(0) and φ'(0), respectively, and step contains the + // first trial step length. It returns an Operation that must be one of + // FuncEvaluation, GradEvaluation, FuncEvaluation|GradEvaluation. The + // caller must evaluate φ(step), φ'(step), or both, respectively, and pass + // the result to Linesearcher in value and derivative arguments to Iterate. + Init(value, derivative float64, step float64) Operation + + // Iterate takes in the values of φ and φ' evaluated at the previous step + // and returns the next operation. + // + // If op is one of FuncEvaluation, GradEvaluation, + // FuncEvaluation|GradEvaluation, the caller must evaluate φ(step), + // φ'(step), or both, respectively, and pass the result to Linesearcher in + // value and derivative arguments on the next call to Iterate. + // + // If op is MajorIteration, a sufficiently accurate minimum of φ has been + // found at the previous step and the line search has concluded. Init must + // be called again to initialize a new line search. + // + // If err is nil, op must not specify another operation. If err is not nil, + // the values of op and step are undefined. + Iterate(value, derivative float64) (op Operation, step float64, err error) +} + +// NextDirectioner implements a strategy for computing a new line search +// direction at each major iteration. Typically, a NextDirectioner will be +// used in conjunction with LinesearchMethod for performing gradient-based +// optimization through sequential line searches. +type NextDirectioner interface { + // InitDirection initializes the NextDirectioner at the given starting location, + // putting the initial direction in place into dir, and returning the initial + // step size. InitDirection must not modify Location. + InitDirection(loc *Location, dir []float64) (step float64) + + // NextDirection updates the search direction and step size. Location is + // the location seen at the conclusion of the most recent linesearch. The + // next search direction is put in place into dir, and the next step size + // is returned. NextDirection must not modify Location. + NextDirection(loc *Location, dir []float64) (step float64) +} + +// StepSizer can set the next step size of the optimization given the last Location. +// Returned step size must be positive. 
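+//
+// Implementations in this package include FirstOrderStepSize and
+// QuadraticStepSize.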
+type StepSizer interface { + Init(loc *Location, dir []float64) float64 + StepSize(loc *Location, dir []float64) float64 +} + +// A Recorder can record the progress of the optimization, for example to print +// the progress to StdOut or to a log file. A Recorder must not modify any data. +type Recorder interface { + Init() error + Record(*Location, Operation, *Stats) error +} diff --git a/vendor/gonum.org/v1/gonum/optimize/lbfgs.go b/vendor/gonum.org/v1/gonum/optimize/lbfgs.go new file mode 100644 index 0000000000..6caad9c330 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/optimize/lbfgs.go @@ -0,0 +1,199 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package optimize + +import ( + "gonum.org/v1/gonum/floats" +) + +var ( + _ Method = (*LBFGS)(nil) + _ localMethod = (*LBFGS)(nil) + _ NextDirectioner = (*LBFGS)(nil) +) + +// LBFGS implements the limited-memory BFGS method for gradient-based +// unconstrained minimization. +// +// It stores a modified version of the inverse Hessian approximation H +// implicitly from the last Store iterations while the normal BFGS method +// stores and manipulates H directly as a dense matrix. Therefore LBFGS is more +// appropriate than BFGS for large problems as the cost of LBFGS scales as +// O(Store * dim) while BFGS scales as O(dim^2). The "forgetful" nature of +// LBFGS may also make it perform better than BFGS for functions with Hessians +// that vary rapidly spatially. +type LBFGS struct { + // Linesearcher selects suitable steps along the descent direction. + // Accepted steps should satisfy the strong Wolfe conditions. + // If Linesearcher is nil, a reasonable default will be chosen. + Linesearcher Linesearcher + // Store is the size of the limited-memory storage. + // If Store is 0, it will be defaulted to 15. + Store int + // GradStopThreshold sets the threshold for stopping if the gradient norm + // gets too small. If GradStopThreshold is 0 it is defaulted to 1e-12, and + // if it is NaN the setting is not used. 
+ GradStopThreshold float64 + + status Status + err error + + ls *LinesearchMethod + + dim int // Dimension of the problem + x []float64 // Location at the last major iteration + grad []float64 // Gradient at the last major iteration + + // History + oldest int // Index of the oldest element of the history + y [][]float64 // Last Store values of y + s [][]float64 // Last Store values of s + rho []float64 // Last Store values of rho + a []float64 // Cache of Hessian updates +} + +func (l *LBFGS) Status() (Status, error) { + return l.status, l.err +} + +func (*LBFGS) Uses(has Available) (uses Available, err error) { + return has.gradient() +} + +func (l *LBFGS) Init(dim, tasks int) int { + l.status = NotTerminated + l.err = nil + return 1 +} + +func (l *LBFGS) Run(operation chan<- Task, result <-chan Task, tasks []Task) { + l.status, l.err = localOptimizer{}.run(l, l.GradStopThreshold, operation, result, tasks) + close(operation) +} + +func (l *LBFGS) initLocal(loc *Location) (Operation, error) { + if l.Linesearcher == nil { + l.Linesearcher = &Bisection{} + } + if l.Store == 0 { + l.Store = 15 + } + + if l.ls == nil { + l.ls = &LinesearchMethod{} + } + l.ls.Linesearcher = l.Linesearcher + l.ls.NextDirectioner = l + + return l.ls.Init(loc) +} + +func (l *LBFGS) iterateLocal(loc *Location) (Operation, error) { + return l.ls.Iterate(loc) +} + +func (l *LBFGS) InitDirection(loc *Location, dir []float64) (stepSize float64) { + dim := len(loc.X) + l.dim = dim + l.oldest = 0 + + l.a = resize(l.a, l.Store) + l.rho = resize(l.rho, l.Store) + l.y = l.initHistory(l.y) + l.s = l.initHistory(l.s) + + l.x = resize(l.x, dim) + copy(l.x, loc.X) + + l.grad = resize(l.grad, dim) + copy(l.grad, loc.Gradient) + + copy(dir, loc.Gradient) + floats.Scale(-1, dir) + return 1 / floats.Norm(dir, 2) +} + +func (l *LBFGS) initHistory(hist [][]float64) [][]float64 { + c := cap(hist) + if c < l.Store { + n := make([][]float64, l.Store-c) + hist = append(hist[:c], n...) + } + hist = hist[:l.Store] + for i := range hist { + hist[i] = resize(hist[i], l.dim) + for j := range hist[i] { + hist[i][j] = 0 + } + } + return hist +} + +func (l *LBFGS) NextDirection(loc *Location, dir []float64) (stepSize float64) { + // Uses two-loop correction as described in + // Nocedal, J., Wright, S.: Numerical Optimization (2nd ed). Springer (2006), chapter 7, page 178. + + if len(loc.X) != l.dim { + panic("lbfgs: unexpected size mismatch") + } + if len(loc.Gradient) != l.dim { + panic("lbfgs: unexpected size mismatch") + } + if len(dir) != l.dim { + panic("lbfgs: unexpected size mismatch") + } + + y := l.y[l.oldest] + floats.SubTo(y, loc.Gradient, l.grad) + s := l.s[l.oldest] + floats.SubTo(s, loc.X, l.x) + sDotY := floats.Dot(s, y) + l.rho[l.oldest] = 1 / sDotY + + l.oldest = (l.oldest + 1) % l.Store + + copy(l.x, loc.X) + copy(l.grad, loc.Gradient) + copy(dir, loc.Gradient) + + // Start with the most recent element and go backward, + for i := 0; i < l.Store; i++ { + idx := l.oldest - i - 1 + if idx < 0 { + idx += l.Store + } + l.a[idx] = l.rho[idx] * floats.Dot(l.s[idx], dir) + floats.AddScaled(dir, -l.a[idx], l.y[idx]) + } + + // Scale the initial Hessian. + gamma := sDotY / floats.Dot(y, y) + floats.Scale(gamma, dir) + + // Start with the oldest element and go forward. + for i := 0; i < l.Store; i++ { + idx := i + l.oldest + if idx >= l.Store { + idx -= l.Store + } + beta := l.rho[idx] * floats.Dot(l.y[idx], dir) + floats.AddScaled(dir, l.a[idx]-beta, l.s[idx]) + } + + // dir contains H^{-1} * g, so flip the direction for minimization. 
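+ // A unit step is returned: the gamma scaling above leaves quasi-Newton
+ // directions well scaled, so the Linesearcher starts from step 1.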
+ floats.Scale(-1, dir) + + return 1 +} + +func (*LBFGS) needs() struct { + Gradient bool + Hessian bool +} { + return struct { + Gradient bool + Hessian bool + }{true, false} +} diff --git a/vendor/gonum.org/v1/gonum/optimize/linesearch.go b/vendor/gonum.org/v1/gonum/optimize/linesearch.go new file mode 100644 index 0000000000..0fb1dd6ce5 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/optimize/linesearch.go @@ -0,0 +1,218 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package optimize + +import ( + "math" + + "gonum.org/v1/gonum/floats" +) + +// LinesearchMethod represents an abstract optimization method in which a +// function is optimized through successive line search optimizations. +type LinesearchMethod struct { + // NextDirectioner specifies the search direction of each linesearch. + NextDirectioner NextDirectioner + // Linesearcher performs a linesearch along the search direction. + Linesearcher Linesearcher + + x []float64 // Starting point for the current iteration. + dir []float64 // Search direction for the current iteration. + + first bool // Indicator of the first iteration. + nextMajor bool // Indicates that MajorIteration must be commanded at the next call to Iterate. + eval Operation // Indicator of valid fields in Location. + + lastStep float64 // Step taken from x in the previous call to Iterate. + lastOp Operation // Operation returned from the previous call to Iterate. +} + +func (ls *LinesearchMethod) Init(loc *Location) (Operation, error) { + if loc.Gradient == nil { + panic("linesearch: gradient is nil") + } + + dim := len(loc.X) + ls.x = resize(ls.x, dim) + ls.dir = resize(ls.dir, dim) + + ls.first = true + ls.nextMajor = false + + // Indicate that all fields of loc are valid. + ls.eval = FuncEvaluation | GradEvaluation + if loc.Hessian != nil { + ls.eval |= HessEvaluation + } + + ls.lastStep = math.NaN() + ls.lastOp = NoOperation + + return ls.initNextLinesearch(loc) +} + +func (ls *LinesearchMethod) Iterate(loc *Location) (Operation, error) { + switch ls.lastOp { + case NoOperation: + // TODO(vladimir-ch): Either Init has not been called, or the caller is + // trying to resume the optimization run after Iterate previously + // returned with an error. Decide what is the proper thing to do. See also #125. + + case MajorIteration: + // The previous updated location did not converge the full + // optimization. Initialize a new Linesearch. + return ls.initNextLinesearch(loc) + + default: + // Update the indicator of valid fields of loc. + ls.eval |= ls.lastOp + + if ls.nextMajor { + ls.nextMajor = false + + // Linesearcher previously finished, and the invalid fields of loc + // have now been validated. Announce MajorIteration. + ls.lastOp = MajorIteration + return ls.lastOp, nil + } + } + + // Continue the linesearch. + + f := math.NaN() + if ls.eval&FuncEvaluation != 0 { + f = loc.F + } + projGrad := math.NaN() + if ls.eval&GradEvaluation != 0 { + projGrad = floats.Dot(loc.Gradient, ls.dir) + } + op, step, err := ls.Linesearcher.Iterate(f, projGrad) + if err != nil { + return ls.error(err) + } + + switch op { + case MajorIteration: + // Linesearch has been finished. + + ls.lastOp = complementEval(loc, ls.eval) + if ls.lastOp == NoOperation { + // loc is complete, MajorIteration can be declared directly. + ls.lastOp = MajorIteration + } else { + // Declare MajorIteration on the next call to Iterate. 
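+ // This defers the announcement until the pending evaluations have
+ // completed, so that loc is fully valid when MajorIteration is seen.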
+ ls.nextMajor = true + } + + case FuncEvaluation, GradEvaluation, FuncEvaluation | GradEvaluation: + if step != ls.lastStep { + // We are moving to a new location, and not, say, evaluating extra + // information at the current location. + + // Compute the next evaluation point and store it in loc.X. + floats.AddScaledTo(loc.X, ls.x, step, ls.dir) + if floats.Equal(ls.x, loc.X) { + // Step size has become so small that the next evaluation point is + // indistinguishable from the starting point for the current + // iteration due to rounding errors. + return ls.error(ErrNoProgress) + } + ls.lastStep = step + ls.eval = NoOperation // Indicate all invalid fields of loc. + } + ls.lastOp = op + + default: + panic("linesearch: Linesearcher returned invalid operation") + } + + return ls.lastOp, nil +} + +func (ls *LinesearchMethod) error(err error) (Operation, error) { + ls.lastOp = NoOperation + return ls.lastOp, err +} + +// initNextLinesearch initializes the next linesearch using the previous +// complete location stored in loc. It fills loc.X and returns an evaluation +// to be performed at loc.X. +func (ls *LinesearchMethod) initNextLinesearch(loc *Location) (Operation, error) { + copy(ls.x, loc.X) + + var step float64 + if ls.first { + ls.first = false + step = ls.NextDirectioner.InitDirection(loc, ls.dir) + } else { + step = ls.NextDirectioner.NextDirection(loc, ls.dir) + } + + projGrad := floats.Dot(loc.Gradient, ls.dir) + if projGrad >= 0 { + return ls.error(ErrNonDescentDirection) + } + + op := ls.Linesearcher.Init(loc.F, projGrad, step) + switch op { + case FuncEvaluation, GradEvaluation, FuncEvaluation | GradEvaluation: + default: + panic("linesearch: Linesearcher returned invalid operation") + } + + floats.AddScaledTo(loc.X, ls.x, step, ls.dir) + if floats.Equal(ls.x, loc.X) { + // Step size is so small that the next evaluation point is + // indistinguishable from the starting point for the current iteration + // due to rounding errors. + return ls.error(ErrNoProgress) + } + + ls.lastStep = step + ls.eval = NoOperation // Invalidate all fields of loc. + + ls.lastOp = op + return ls.lastOp, nil +} + +// ArmijoConditionMet returns true if the Armijo condition (aka sufficient +// decrease) has been met. Under normal conditions, the following should be +// true, though this is not enforced: +// - initGrad < 0 +// - step > 0 +// - 0 < decrease < 1 +func ArmijoConditionMet(currObj, initObj, initGrad, step, decrease float64) bool { + return currObj <= initObj+decrease*step*initGrad +} + +// StrongWolfeConditionsMet returns true if the strong Wolfe conditions have been met. +// The strong Wolfe conditions ensure sufficient decrease in the function +// value, and sufficient decrease in the magnitude of the projected gradient. +// Under normal conditions, the following should be true, though this is not +// enforced: +// - initGrad < 0 +// - step > 0 +// - 0 <= decrease < curvature < 1 +func StrongWolfeConditionsMet(currObj, currGrad, initObj, initGrad, step, decrease, curvature float64) bool { + if currObj > initObj+decrease*step*initGrad { + return false + } + return math.Abs(currGrad) < curvature*math.Abs(initGrad) +} + +// WeakWolfeConditionsMet returns true if the weak Wolfe conditions have been met. +// The weak Wolfe conditions ensure sufficient decrease in the function value, +// and sufficient decrease in the value of the projected gradient. 
Under normal
+// conditions, the following should be true, though this is not enforced:
+// - initGrad < 0
+// - step > 0
+// - 0 <= decrease < curvature < 1
+func WeakWolfeConditionsMet(currObj, currGrad, initObj, initGrad, step, decrease, curvature float64) bool {
+ if currObj > initObj+decrease*step*initGrad {
+ return false
+ }
+ return currGrad >= curvature*initGrad
+}
diff --git a/vendor/gonum.org/v1/gonum/optimize/listsearch.go b/vendor/gonum.org/v1/gonum/optimize/listsearch.go
new file mode 100644
index 0000000000..1771892b79
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/optimize/listsearch.go
@@ -0,0 +1,123 @@
+// Copyright ©2018 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package optimize
+
+import (
+ "math"
+
+ "gonum.org/v1/gonum/mat"
+)
+
+var _ Method = (*ListSearch)(nil)
+
+// ListSearch finds the optimum location from a specified list of possible
+// optimum locations.
+type ListSearch struct {
+ // Locs is the list of locations to optimize. Each row of Locs is a location
+ // to optimize. The number of columns of Locs must match the dimensions
+ // passed to Init, and Locs must have at least one row.
+ Locs mat.Matrix
+
+ eval int
+ rows int
+ bestF float64
+ bestIdx int
+}
+
+func (*ListSearch) Uses(has Available) (uses Available, err error) {
+ return has.function()
+}
+
+// Init initializes the method for optimization. The input dimension
+// must match the number of columns of Locs.
+func (l *ListSearch) Init(dim, tasks int) int {
+ if dim <= 0 {
+ panic(nonpositiveDimension)
+ }
+ if tasks < 0 {
+ panic(negativeTasks)
+ }
+ r, c := l.Locs.Dims()
+ if r == 0 {
+ panic("listsearch: list matrix has no rows")
+ }
+ if c != dim {
+ panic("listsearch: supplied dimension does not match list columns")
+ }
+ l.eval = 0
+ l.rows = r
+ l.bestF = math.Inf(1)
+ l.bestIdx = -1
+ return min(r, tasks)
+}
+
+func (l *ListSearch) sendNewLoc(operation chan<- Task, task Task) {
+ task.Op = FuncEvaluation
+ task.ID = l.eval
+ mat.Row(task.X, l.eval, l.Locs)
+ l.eval++
+ operation <- task
+}
+
+func (l *ListSearch) updateMajor(operation chan<- Task, task Task) {
+ // Update the best value seen so far, and send a MajorIteration.
+ if task.F < l.bestF {
+ l.bestF = task.F
+ l.bestIdx = task.ID
+ } else {
+ task.F = l.bestF
+ mat.Row(task.X, l.bestIdx, l.Locs)
+ }
+ task.Op = MajorIteration
+ operation <- task
+}
+
+func (l *ListSearch) Status() (Status, error) {
+ if l.eval < l.rows {
+ return NotTerminated, nil
+ }
+ return MethodConverge, nil
+}
+
+func (l *ListSearch) Run(operation chan<- Task, result <-chan Task, tasks []Task) {
+ // Send initial tasks to evaluate
+ for _, task := range tasks {
+ l.sendNewLoc(operation, task)
+ }
+ // Read from the channel until PostIteration is sent or until the list of
+ // tasks is exhausted.
+Loop:
+ for {
+ task := <-result
+ switch task.Op {
+ default:
+ panic("unknown operation")
+ case PostIteration:
+ break Loop
+ case MajorIteration:
+ if l.eval == l.rows {
+ task.Op = MethodDone
+ operation <- task
+ continue
+ }
+ l.sendNewLoc(operation, task)
+ case FuncEvaluation:
+ l.updateMajor(operation, task)
+ }
+ }
+
+ // PostIteration was sent, or the list has been completed. Read in the final
+ // list of tasks.
+ for task := range result { + switch task.Op { + default: + panic("unknown operation") + case MajorIteration: + case FuncEvaluation: + l.updateMajor(operation, task) + } + } + close(operation) +} diff --git a/vendor/gonum.org/v1/gonum/optimize/local.go b/vendor/gonum.org/v1/gonum/optimize/local.go new file mode 100644 index 0000000000..27177e7273 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/optimize/local.go @@ -0,0 +1,146 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package optimize + +import ( + "math" + + "gonum.org/v1/gonum/floats" +) + +// localOptimizer is a helper type for running an optimization using a LocalMethod. +type localOptimizer struct{} + +// run controls the optimization run for a localMethod. The calling method +// must close the operation channel at the conclusion of the optimization. This +// provides a happens before relationship between the return of status and the +// closure of operation, and thus a call to method.Status (if necessary). +func (l localOptimizer) run(method localMethod, gradThresh float64, operation chan<- Task, result <-chan Task, tasks []Task) (Status, error) { + // Local methods start with a fully-specified initial location. + task := tasks[0] + task = l.initialLocation(operation, result, task, method) + if task.Op == PostIteration { + l.finish(operation, result) + return NotTerminated, nil + } + status, err := l.checkStartingLocation(task, gradThresh) + if err != nil { + l.finishMethodDone(operation, result, task) + return status, err + } + + // Send a major iteration with the starting location. + task.Op = MajorIteration + operation <- task + task = <-result + if task.Op == PostIteration { + l.finish(operation, result) + return NotTerminated, nil + } + op, err := method.initLocal(task.Location) + if err != nil { + l.finishMethodDone(operation, result, task) + return Failure, err + } + task.Op = op + operation <- task +Loop: + for { + r := <-result + switch r.Op { + case PostIteration: + break Loop + case MajorIteration: + // The last operation was a MajorIteration. Check if the gradient + // is below the threshold. + if status := l.checkGradientConvergence(r.Gradient, gradThresh); status != NotTerminated { + l.finishMethodDone(operation, result, task) + return GradientThreshold, nil + } + fallthrough + default: + op, err := method.iterateLocal(r.Location) + if err != nil { + l.finishMethodDone(operation, result, r) + return Failure, err + } + r.Op = op + operation <- r + } + } + l.finish(operation, result) + return NotTerminated, nil +} + +// initialOperation returns the Operation needed to fill the initial location +// based on the needs of the method and the values already supplied. +func (localOptimizer) initialOperation(task Task, n needser) Operation { + var newOp Operation + op := task.Op + if op&FuncEvaluation == 0 { + newOp |= FuncEvaluation + } + needs := n.needs() + if needs.Gradient && op&GradEvaluation == 0 { + newOp |= GradEvaluation + } + if needs.Hessian && op&HessEvaluation == 0 { + newOp |= HessEvaluation + } + return newOp +} + +// initialLocation fills the initial location based on the needs of the method. +// The task passed to initialLocation should be the first task sent in RunGlobal. 
+func (l localOptimizer) initialLocation(operation chan<- Task, result <-chan Task, task Task, needs needser) Task { + task.Op = l.initialOperation(task, needs) + operation <- task + return <-result +} + +func (l localOptimizer) checkStartingLocation(task Task, gradThresh float64) (Status, error) { + if math.IsInf(task.F, 1) || math.IsNaN(task.F) { + return Failure, ErrFunc(task.F) + } + for i, v := range task.Gradient { + if math.IsInf(v, 0) || math.IsNaN(v) { + return Failure, ErrGrad{Grad: v, Index: i} + } + } + status := l.checkGradientConvergence(task.Gradient, gradThresh) + return status, nil +} + +func (localOptimizer) checkGradientConvergence(gradient []float64, gradThresh float64) Status { + if gradient == nil || math.IsNaN(gradThresh) { + return NotTerminated + } + if gradThresh == 0 { + gradThresh = defaultGradientAbsTol + } + if norm := floats.Norm(gradient, math.Inf(1)); norm < gradThresh { + return GradientThreshold + } + return NotTerminated +} + +// finish completes the channel operations to finish an optimization. +func (localOptimizer) finish(operation chan<- Task, result <-chan Task) { + // Guarantee that result is closed before operation is closed. + for range result { + } +} + +// finishMethodDone sends a MethodDone signal on operation, reads the result, +// and completes the channel operations to finish an optimization. +func (l localOptimizer) finishMethodDone(operation chan<- Task, result <-chan Task, task Task) { + task.Op = MethodDone + operation <- task + task = <-result + if task.Op != PostIteration { + panic("optimize: task should have returned post iteration") + } + l.finish(operation, result) +} diff --git a/vendor/gonum.org/v1/gonum/optimize/minimize.go b/vendor/gonum.org/v1/gonum/optimize/minimize.go new file mode 100644 index 0000000000..ea962f258f --- /dev/null +++ b/vendor/gonum.org/v1/gonum/optimize/minimize.go @@ -0,0 +1,595 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package optimize + +import ( + "fmt" + "math" + "time" + + "gonum.org/v1/gonum/floats" + "gonum.org/v1/gonum/mat" +) + +const ( + nonpositiveDimension string = "optimize: non-positive input dimension" + negativeTasks string = "optimize: negative input number of tasks" +) + +// Task is a type to communicate between the Method and the outer +// calling script. +type Task struct { + ID int + Op Operation + *Location +} + +// Location represents a location in the optimization procedure. +type Location struct { + // X is the function input for the location. + X []float64 + // F is the result of evaluating the function at X. + F float64 + // Gradient holds the first-order partial derivatives + // of the function at X. + // The length of Gradient must match the length of X + // or be zero. If the capacity of Gradient is less + // than the length of X, a new slice will be allocated. + Gradient []float64 + // Hessian holds the second-order partial derivatives + // of the function at X. + // The dimensions of Hessian must match the length of X + // or Hessian must be nil or empty. If Hessian is nil + // a new mat.SymDense will be allocated, if it is empty + // it will be resized to match the length of X. + Hessian *mat.SymDense +} + +// Method is a type which can search for an optimum of an objective function. +type Method interface { + // Init initializes the method for optimization. The inputs are + // the problem dimension and number of available concurrent tasks. 
+	//
+	// Init returns the number of concurrent processes to use, which must be
+	// less than or equal to tasks.
+	Init(dim, tasks int) (concurrent int)
+	// Run runs an optimization. The method sends Tasks on
+	// the operation channel (for performing function evaluations, major
+	// iterations, etc.). The result of the tasks will be returned on result.
+	// See the documentation for Operation types for the possible operations.
+	//
+	// The caller of Run will signal the termination of the optimization
+	// (i.e. convergence from user settings) by sending a task with a PostIteration
+	// Op field on result. More tasks may still be sent on operation after this
+	// occurs, but only MajorIteration operations will still be conducted
+	// appropriately. Thus, it cannot be guaranteed that all Evaluations sent
+	// on operation will be evaluated; however, if an Evaluation is started,
+	// the results of that evaluation will be sent on result.
+	//
+	// The Method must read from the result channel until it is closed.
+	// During this, the Method may want to send new MajorIteration(s) on
+	// operation. The Method must then close operation and return from Run.
+	// These steps must establish a "happens-before" relationship between result
+	// being closed (externally) and Run closing operation, for example
+	// by using a range loop to read from result even if no results are expected.
+	//
+	// The last parameter to Run is a slice of tasks with length equal to
+	// the return from Init. Task has an ID field which may be
+	// set and modified by Method, and must not be modified by the caller.
+	// The first element of tasks contains information about the initial location.
+	// The Location.X field is always valid. The Operation field specifies which
+	// other values of Location are known. If Operation == NoOperation, none of
+	// the values should be used, otherwise the Evaluation operations will be
+	// composed to specify the valid fields. Methods are free to use or
+	// ignore these values.
+	//
+	// Successful execution of an Operation may require the Method to modify
+	// fields of a Location. MajorIteration calls will not modify the values in
+	// the Location, but Evaluation operations will. Methods are encouraged to
+	// leave Location fields untouched to allow memory re-use. If data needs to
+	// be stored, the respective field should be set to nil -- Methods should
+	// not allocate Location memory themselves.
+	//
+	// Method may have its own specific convergence criteria, which can
+	// be communicated using a MethodDone operation. This will trigger a
+	// PostIteration to be sent on result, and the MethodDone task will not be
+	// returned on result. The Method must implement Statuser, and the
+	// call to Status must return a Status other than NotTerminated.
+	//
+	// The operation and result tasks are guaranteed to have a buffer length
+	// equal to the return from Init.
+	Run(operation chan<- Task, result <-chan Task, tasks []Task)
+	// Uses checks if the Method is suited to the optimization problem. The
+	// input is the available functions in Problem to call, and the returns are
+	// the functions which may be used and an error if there is a mismatch
+	// between the Problem and the Method's capabilities.
+	Uses(has Available) (uses Available, err error)
+}
+
+// Minimize uses an optimizer to search for a minimum of a function. A
+// maximization problem can be transformed into a minimization problem by
+// multiplying the function by -1.
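+//
+// For example, to maximize a function f (an illustrative sketch; f and x0 are
+// assumed to be supplied by the caller):
+//
+//	p := Problem{Func: func(x []float64) float64 { return -f(x) }}
+//	res, err := Minimize(p, x0, nil, nil)
+//	// The maximizer is res.X and the maximum value is -res.F.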
+//
+// The first argument represents the problem to be minimized. Its fields are
+// routines that evaluate the objective function, gradient, and other
+// quantities related to the problem. The objective function, p.Func, must not
+// be nil. The optimization method used may require other fields to be non-nil,
+// as reported by the method's Uses; Minimize will panic if these are not met.
+// The method can be determined automatically from the supplied problem, as
+// described below.
+//
+// If p.Status is not nil, it is called before every evaluation. If the
+// returned Status is other than NotTerminated or if the error is not nil, the
+// optimization run is terminated.
+//
+// The second argument specifies the initial location for the optimization.
+// Some Methods do not require an initial location, but initX must still be
+// specified for the dimension of the optimization problem.
+//
+// The third argument contains the settings for the minimization. If settings
+// is nil, the zero value will be used; see the documentation of the Settings
+// type for more information, and see the warning below. All settings will be
+// honored for all Methods, even if that setting is counter-productive to the
+// method. Minimize cannot guarantee strict adherence to the evaluation bounds
+// specified when performing concurrent evaluations and updates.
+//
+// The final argument is the optimization method to use. If method == nil, then
+// an appropriate default is chosen based on the properties of the other arguments
+// (dimension, gradient-free or gradient-based, etc.). If method is not nil,
+// Minimize panics if the Problem is not consistent with the Method (Uses
+// returns an error).
+//
+// Minimize returns a Result struct and any error that occurred. See the
+// documentation of Result for more information.
+//
+// See the documentation for Method for the details on implementing a method.
+//
+// Be aware that the default settings of Minimize are to accurately find the
+// minimum. For certain functions and optimization methods, this can take many
+// function evaluations. The Settings input struct can be used to limit this,
+// for example by modifying the maximum function evaluations or gradient tolerance.
+func Minimize(p Problem, initX []float64, settings *Settings, method Method) (*Result, error) {
+	startTime := time.Now()
+	if method == nil {
+		method = getDefaultMethod(&p)
+	}
+	if settings == nil {
+		settings = &Settings{}
+	}
+	stats := &Stats{}
+	dim := len(initX)
+	err := checkOptimization(p, dim, settings.Recorder)
+	if err != nil {
+		return nil, err
+	}
+
+	optLoc := newLocation(dim) // This must have an allocated X field.
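+	// Start the reported optimum at +Inf until the first MajorIteration
+	// overwrites it.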
+ optLoc.F = math.Inf(1) + + initOp, initLoc := getInitLocation(dim, initX, settings.InitValues) + + converger := settings.Converger + if converger == nil { + converger = defaultFunctionConverge() + } + converger.Init(dim) + + stats.Runtime = time.Since(startTime) + + // Send initial location to Recorder + if settings.Recorder != nil { + err = settings.Recorder.Record(optLoc, InitIteration, stats) + if err != nil { + return nil, err + } + } + + // Run optimization + var status Status + status, err = minimize(&p, method, settings, converger, stats, initOp, initLoc, optLoc, startTime) + + // Cleanup and collect results + if settings.Recorder != nil && err == nil { + err = settings.Recorder.Record(optLoc, PostIteration, stats) + } + stats.Runtime = time.Since(startTime) + return &Result{ + Location: *optLoc, + Stats: *stats, + Status: status, + }, err +} + +func getDefaultMethod(p *Problem) Method { + if p.Grad != nil { + return &LBFGS{} + } + return &NelderMead{} +} + +// minimize performs an optimization. minimize updates the settings and optLoc, +// and returns the final Status and error. +func minimize(prob *Problem, method Method, settings *Settings, converger Converger, stats *Stats, initOp Operation, initLoc, optLoc *Location, startTime time.Time) (Status, error) { + dim := len(optLoc.X) + nTasks := settings.Concurrent + if nTasks == 0 { + nTasks = 1 + } + has := availFromProblem(*prob) + _, initErr := method.Uses(has) + if initErr != nil { + panic(fmt.Sprintf("optimize: specified method inconsistent with Problem: %v", initErr)) + } + newNTasks := method.Init(dim, nTasks) + if newNTasks > nTasks { + panic("optimize: too many tasks returned by Method") + } + nTasks = newNTasks + + // Launch the method. The method communicates tasks using the operations + // channel, and results is used to return the evaluated results. + operations := make(chan Task, nTasks) + results := make(chan Task, nTasks) + go func() { + tasks := make([]Task, nTasks) + tasks[0].Location = initLoc + tasks[0].Op = initOp + for i := 1; i < len(tasks); i++ { + tasks[i].Location = newLocation(dim) + } + method.Run(operations, results, tasks) + }() + + // Algorithmic Overview: + // There are three pieces to performing a concurrent optimization, + // the distributor, the workers, and the stats combiner. At a high level, + // the distributor reads in tasks sent by method, sending evaluations to the + // workers, and forwarding other operations to the statsCombiner. The workers + // read these forwarded evaluation tasks, evaluate the relevant parts of Problem + // and forward the results on to the stats combiner. The stats combiner reads + // in results from the workers, as well as tasks from the distributor, and + // uses them to update optimization statistics (function evaluations, etc.) + // and to check optimization convergence. + // + // The complicated part is correctly shutting down the optimization. The + // procedure is as follows. First, the stats combiner closes done and sends + // a PostIteration to the method. The distributor then reads that done has + // been closed, and closes the channel with the workers. At this point, no + // more evaluation operations will be executed. As the workers finish their + // evaluations, they forward the results onto the stats combiner, and then + // signal their shutdown to the stats combiner. When all workers have successfully + // finished, the stats combiner closes the results channel, signaling to the + // method that all results have been collected. 
At this point, the method + // may send MajorIteration(s) to update an optimum location based on these + // last returned results, and then the method will close the operations channel. + // The Method must ensure that the closing of results happens before the + // closing of operations in order to ensure proper shutdown order. + // Now that no more tasks will be commanded by the method, the distributor + // closes statsChan, and with no more statistics to update the optimization + // concludes. + + workerChan := make(chan Task) // Delegate tasks to the workers. + statsChan := make(chan Task) // Send evaluation updates. + done := make(chan struct{}) // Communicate the optimization is done. + + // Read tasks from the method and distribute as appropriate. + distributor := func() { + for { + select { + case task := <-operations: + switch task.Op { + case InitIteration: + panic("optimize: Method returned InitIteration") + case PostIteration: + panic("optimize: Method returned PostIteration") + case NoOperation, MajorIteration, MethodDone: + statsChan <- task + default: + if !task.Op.isEvaluation() { + panic("optimize: expecting evaluation operation") + } + workerChan <- task + } + case <-done: + // No more evaluations will be sent, shut down the workers, and + // read the final tasks. + close(workerChan) + for task := range operations { + if task.Op == MajorIteration { + statsChan <- task + } + } + close(statsChan) + return + } + } + } + go distributor() + + // Evaluate the Problem concurrently. + worker := func() { + x := make([]float64, dim) + for task := range workerChan { + evaluate(prob, task.Location, task.Op, x) + statsChan <- task + } + // Signal successful worker completion. + statsChan <- Task{Op: signalDone} + } + for i := 0; i < nTasks; i++ { + go worker() + } + + var ( + workersDone int // effective wg for the workers + status Status + err error + finalStatus Status + finalError error + ) + + // Update optimization statistics and check convergence. + var methodDone bool + for task := range statsChan { + switch task.Op { + default: + if !task.Op.isEvaluation() { + panic("minimize: evaluation task expected") + } + updateEvaluationStats(stats, task.Op) + status, err = checkEvaluationLimits(prob, stats, settings) + case signalDone: + workersDone++ + if workersDone == nTasks { + close(results) + } + continue + case NoOperation: + // Just send the task back. + case MajorIteration: + status = performMajorIteration(optLoc, task.Location, stats, converger, startTime, settings) + case MethodDone: + methodDone = true + status = MethodConverge + } + if settings.Recorder != nil && status == NotTerminated && err == nil { + stats.Runtime = time.Since(startTime) + // Allow err to be overloaded if the Recorder fails. + err = settings.Recorder.Record(task.Location, task.Op, stats) + if err != nil { + status = Failure + } + } + // If this is the first termination status, trigger the conclusion of + // the optimization. + if status != NotTerminated || err != nil { + select { + case <-done: + default: + finalStatus = status + finalError = err + results <- Task{ + Op: PostIteration, + } + close(done) + } + } + + // Send the result back to the Problem if there are still active workers. + if workersDone != nTasks && task.Op != MethodDone { + results <- task + } + } + // This code block is here rather than above to ensure Status() is not called + // before Method.Run closes operations. 
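+	// (The loop over statsChan above only ends after the distributor has
+	// drained operations, which in turn happens only once Method.Run has
+	// closed it.)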
+ if methodDone { + statuser, ok := method.(Statuser) + if !ok { + panic("optimize: method returned MethodDone but is not a Statuser") + } + finalStatus, finalError = statuser.Status() + if finalStatus == NotTerminated { + panic("optimize: method returned MethodDone but a NotTerminated status") + } + } + return finalStatus, finalError +} + +func defaultFunctionConverge() *FunctionConverge { + return &FunctionConverge{ + Absolute: 1e-10, + Iterations: 100, + } +} + +// newLocation allocates a new location structure with an X field of the +// appropriate size. +func newLocation(dim int) *Location { + return &Location{ + X: make([]float64, dim), + } +} + +// getInitLocation checks the validity of initLocation and initOperation and +// returns the initial values as a *Location. +func getInitLocation(dim int, initX []float64, initValues *Location) (Operation, *Location) { + loc := newLocation(dim) + if initX == nil { + if initValues != nil { + panic("optimize: initValues is non-nil but no initial location specified") + } + return NoOperation, loc + } + copy(loc.X, initX) + if initValues == nil { + return NoOperation, loc + } else { + if initValues.X != nil { + panic("optimize: location specified in InitValues (only use InitX)") + } + } + loc.F = initValues.F + op := FuncEvaluation + if initValues.Gradient != nil { + if len(initValues.Gradient) != dim { + panic("optimize: initial gradient does not match problem dimension") + } + loc.Gradient = initValues.Gradient + op |= GradEvaluation + } + if initValues.Hessian != nil { + if initValues.Hessian.SymmetricDim() != dim { + panic("optimize: initial Hessian does not match problem dimension") + } + loc.Hessian = initValues.Hessian + op |= HessEvaluation + } + return op, loc +} + +func checkOptimization(p Problem, dim int, recorder Recorder) error { + if p.Func == nil { + panic(badProblem) + } + if dim <= 0 { + panic("optimize: impossible problem dimension") + } + if p.Status != nil { + _, err := p.Status() + if err != nil { + return err + } + } + if recorder != nil { + err := recorder.Init() + if err != nil { + return err + } + } + return nil +} + +// evaluate evaluates the routines specified by the Operation at loc.X, and stores +// the answer into loc. loc.X is copied into x before evaluating in order to +// prevent the routines from modifying it. +func evaluate(p *Problem, loc *Location, op Operation, x []float64) { + if !op.isEvaluation() { + panic(fmt.Sprintf("optimize: invalid evaluation %v", op)) + } + copy(x, loc.X) + if op&FuncEvaluation != 0 { + loc.F = p.Func(x) + } + if op&GradEvaluation != 0 { + // Make sure we have a destination in which to place the gradient. + if len(loc.Gradient) == 0 { + if cap(loc.Gradient) < len(x) { + loc.Gradient = make([]float64, len(x)) + } else { + loc.Gradient = loc.Gradient[:len(x)] + } + } + p.Grad(loc.Gradient, x) + } + if op&HessEvaluation != 0 { + // Make sure we have a destination in which to place the Hessian. + switch { + case loc.Hessian == nil: + loc.Hessian = mat.NewSymDense(len(x), nil) + case loc.Hessian.IsEmpty(): + loc.Hessian.ReuseAsSym(len(x)) + } + p.Hess(loc.Hessian, x) + } +} + +// updateEvaluationStats updates the statistics based on the operation. 
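+// Operation is a bit set, so a compound evaluation such as
+// FuncEvaluation|GradEvaluation increments both counters.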
+func updateEvaluationStats(stats *Stats, op Operation) { + if op&FuncEvaluation != 0 { + stats.FuncEvaluations++ + } + if op&GradEvaluation != 0 { + stats.GradEvaluations++ + } + if op&HessEvaluation != 0 { + stats.HessEvaluations++ + } +} + +// checkLocationConvergence checks if the current optimal location satisfies +// any of the convergence criteria based on the function location. +// +// checkLocationConvergence returns NotTerminated if the Location does not satisfy +// the convergence criteria given by settings. Otherwise a corresponding status is +// returned. +// Unlike checkLimits, checkConvergence is called only at MajorIterations. +func checkLocationConvergence(loc *Location, settings *Settings, converger Converger) Status { + if math.IsInf(loc.F, -1) { + return FunctionNegativeInfinity + } + if loc.Gradient != nil && settings.GradientThreshold > 0 { + norm := floats.Norm(loc.Gradient, math.Inf(1)) + if norm < settings.GradientThreshold { + return GradientThreshold + } + } + return converger.Converged(loc) +} + +// checkEvaluationLimits checks the optimization limits after an evaluation +// Operation. It checks the number of evaluations (of various kinds) and checks +// the status of the Problem, if applicable. +func checkEvaluationLimits(p *Problem, stats *Stats, settings *Settings) (Status, error) { + if p.Status != nil { + status, err := p.Status() + if err != nil || status != NotTerminated { + return status, err + } + } + if settings.FuncEvaluations > 0 && stats.FuncEvaluations >= settings.FuncEvaluations { + return FunctionEvaluationLimit, nil + } + if settings.GradEvaluations > 0 && stats.GradEvaluations >= settings.GradEvaluations { + return GradientEvaluationLimit, nil + } + if settings.HessEvaluations > 0 && stats.HessEvaluations >= settings.HessEvaluations { + return HessianEvaluationLimit, nil + } + return NotTerminated, nil +} + +// checkIterationLimits checks the limits on iterations affected by MajorIteration. +func checkIterationLimits(loc *Location, stats *Stats, settings *Settings) Status { + if settings.MajorIterations > 0 && stats.MajorIterations >= settings.MajorIterations { + return IterationLimit + } + if settings.Runtime > 0 && stats.Runtime >= settings.Runtime { + return RuntimeLimit + } + return NotTerminated +} + +// performMajorIteration does all of the steps needed to perform a MajorIteration. +// It increments the iteration count, updates the optimal location, and checks +// the necessary convergence criteria. +func performMajorIteration(optLoc, loc *Location, stats *Stats, converger Converger, startTime time.Time, settings *Settings) Status { + optLoc.F = loc.F + copy(optLoc.X, loc.X) + if loc.Gradient == nil { + optLoc.Gradient = nil + } else { + if optLoc.Gradient == nil { + optLoc.Gradient = make([]float64, len(loc.Gradient)) + } + copy(optLoc.Gradient, loc.Gradient) + } + stats.MajorIterations++ + stats.Runtime = time.Since(startTime) + status := checkLocationConvergence(optLoc, settings, converger) + if status != NotTerminated { + return status + } + return checkIterationLimits(optLoc, stats, settings) +} diff --git a/vendor/gonum.org/v1/gonum/optimize/morethuente.go b/vendor/gonum.org/v1/gonum/optimize/morethuente.go new file mode 100644 index 0000000000..cb23890ca1 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/optimize/morethuente.go @@ -0,0 +1,387 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package optimize + +import "math" + +var _ Linesearcher = (*MoreThuente)(nil) + +// MoreThuente is a Linesearcher that finds steps that satisfy both the +// sufficient decrease and curvature conditions (the strong Wolfe conditions). +// +// References: +// - More, J.J. and D.J. Thuente: Line Search Algorithms with Guaranteed Sufficient +// Decrease. ACM Transactions on Mathematical Software 20(3) (1994), 286-307 +type MoreThuente struct { + // DecreaseFactor is the constant factor in the sufficient decrease + // (Armijo) condition. + // It must be in the interval [0, 1). The default value is 0. + DecreaseFactor float64 + // CurvatureFactor is the constant factor in the Wolfe conditions. Smaller + // values result in a more exact line search. + // A set value must be in the interval (0, 1). If it is zero, it will be + // defaulted to 0.9. + CurvatureFactor float64 + // StepTolerance sets the minimum acceptable width for the linesearch + // interval. If the relative interval length is less than this value, + // ErrLinesearcherFailure is returned. + // It must be non-negative. If it is zero, it will be defaulted to 1e-10. + StepTolerance float64 + + // MinimumStep is the minimum step that the linesearcher will take. + // It must be non-negative and less than MaximumStep. Defaults to no + // minimum (a value of 0). + MinimumStep float64 + // MaximumStep is the maximum step that the linesearcher will take. + // It must be greater than MinimumStep. If it is zero, it will be defaulted + // to 1e20. + MaximumStep float64 + + bracketed bool // Indicates if a minimum has been bracketed. + fInit float64 // Function value at step = 0. + gInit float64 // Derivative value at step = 0. + + // When stage is 1, the algorithm updates the interval given by x and y + // so that it contains a minimizer of the modified function + // psi(step) = f(step) - f(0) - DecreaseFactor * step * f'(0). + // When stage is 2, the interval is updated so that it contains a minimizer + // of f. + stage int + + step float64 // Current step. + lower, upper float64 // Lower and upper bounds on the next step. + x float64 // Endpoint of the interval with a lower function value. + fx, gx float64 // Data at x. + y float64 // The other endpoint. + fy, gy float64 // Data at y. + width [2]float64 // Width of the interval at two previous iterations. +} + +const ( + mtMinGrowthFactor float64 = 1.1 + mtMaxGrowthFactor float64 = 4 +) + +func (mt *MoreThuente) Init(f, g float64, step float64) Operation { + // Based on the original Fortran code that is available, for example, from + // http://ftp.mcs.anl.gov/pub/MINPACK-2/csrch/ + // as part of + // MINPACK-2 Project. November 1993. + // Argonne National Laboratory and University of Minnesota. + // Brett M. Averick, Richard G. Carter, and Jorge J. Moré. 
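+	//
+	// In the notation used here, Iterate accepts a step once both strong
+	// Wolfe conditions hold:
+	//
+	//	f(step) <= f(0) + DecreaseFactor*step*f'(0)
+	//	|f'(step)| <= CurvatureFactor*|f'(0)|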
+
+	if g >= 0 {
+		panic("morethuente: initial derivative is non-negative")
+	}
+	if step <= 0 {
+		panic("morethuente: invalid initial step")
+	}
+
+	if mt.CurvatureFactor == 0 {
+		mt.CurvatureFactor = 0.9
+	}
+	if mt.StepTolerance == 0 {
+		mt.StepTolerance = 1e-10
+	}
+	if mt.MaximumStep == 0 {
+		mt.MaximumStep = 1e20
+	}
+
+	if mt.MinimumStep < 0 {
+		panic("morethuente: minimum step is negative")
+	}
+	if mt.MaximumStep <= mt.MinimumStep {
+		panic("morethuente: maximum step is not greater than minimum step")
+	}
+	if mt.DecreaseFactor < 0 || mt.DecreaseFactor >= 1 {
+		panic("morethuente: invalid decrease factor")
+	}
+	if mt.CurvatureFactor <= 0 || mt.CurvatureFactor >= 1 {
+		panic("morethuente: invalid curvature factor")
+	}
+	if mt.StepTolerance <= 0 {
+		panic("morethuente: step tolerance is not positive")
+	}
+
+	if step < mt.MinimumStep {
+		step = mt.MinimumStep
+	}
+	if step > mt.MaximumStep {
+		step = mt.MaximumStep
+	}
+
+	mt.bracketed = false
+	mt.stage = 1
+	mt.fInit = f
+	mt.gInit = g
+
+	mt.x, mt.fx, mt.gx = 0, f, g
+	mt.y, mt.fy, mt.gy = 0, f, g
+
+	mt.lower = 0
+	mt.upper = step + mtMaxGrowthFactor*step
+
+	mt.width[0] = mt.MaximumStep - mt.MinimumStep
+	mt.width[1] = 2 * mt.width[0]
+
+	mt.step = step
+	return FuncEvaluation | GradEvaluation
+}
+
+func (mt *MoreThuente) Iterate(f, g float64) (Operation, float64, error) {
+	if mt.stage == 0 {
+		panic("morethuente: Init has not been called")
+	}
+
+	gTest := mt.DecreaseFactor * mt.gInit
+	fTest := mt.fInit + mt.step*gTest
+
+	if mt.bracketed {
+		if mt.step <= mt.lower || mt.step >= mt.upper || mt.upper-mt.lower <= mt.StepTolerance*mt.upper {
+			// step contains the best step found (see below).
+			return NoOperation, mt.step, ErrLinesearcherFailure
+		}
+	}
+	if mt.step == mt.MaximumStep && f <= fTest && g <= gTest {
+		return NoOperation, mt.step, ErrLinesearcherBound
+	}
+	if mt.step == mt.MinimumStep && (f > fTest || g >= gTest) {
+		return NoOperation, mt.step, ErrLinesearcherFailure
+	}
+
+	// Test for convergence.
+	if f <= fTest && math.Abs(g) <= mt.CurvatureFactor*(-mt.gInit) {
+		mt.stage = 0
+		return MajorIteration, mt.step, nil
+	}
+
+	if mt.stage == 1 && f <= fTest && g >= 0 {
+		mt.stage = 2
+	}
+
+	if mt.stage == 1 && f <= mt.fx && f > fTest {
+		// Lower function value but the decrease is not sufficient.
+
+		// Compute values and derivatives of the modified function at step, x, y.
+		fm := f - mt.step*gTest
+		fxm := mt.fx - mt.x*gTest
+		fym := mt.fy - mt.y*gTest
+		gm := g - gTest
+		gxm := mt.gx - gTest
+		gym := mt.gy - gTest
+		// Update x, y and step.
+		mt.nextStep(fxm, gxm, fym, gym, fm, gm)
+		// Recover values and derivatives of the non-modified function at x and y.
+		mt.fx = fxm + mt.x*gTest
+		mt.fy = fym + mt.y*gTest
+		mt.gx = gxm + gTest
+		mt.gy = gym + gTest
+	} else {
+		// Update x, y and step.
+		mt.nextStep(mt.fx, mt.gx, mt.fy, mt.gy, f, g)
+	}
+
+	if mt.bracketed {
+		// Monitor the length of the bracketing interval. If the interval has
+		// not been reduced sufficiently after two steps, use bisection to
+		// force its length to zero.
+		width := mt.y - mt.x
+		if math.Abs(width) >= 2.0/3*mt.width[1] {
+			mt.step = mt.x + 0.5*width
+		}
+		mt.width[0], mt.width[1] = math.Abs(width), mt.width[0]
+	}
+
+	if mt.bracketed {
+		mt.lower = math.Min(mt.x, mt.y)
+		mt.upper = math.Max(mt.x, mt.y)
+	} else {
+		mt.lower = mt.step + mtMinGrowthFactor*(mt.step-mt.x)
+		mt.upper = mt.step + mtMaxGrowthFactor*(mt.step-mt.x)
+	}
+
+	// Force the step to be in [MinimumStep, MaximumStep].
+ mt.step = math.Max(mt.MinimumStep, math.Min(mt.step, mt.MaximumStep)) + + if mt.bracketed { + if mt.step <= mt.lower || mt.step >= mt.upper || mt.upper-mt.lower <= mt.StepTolerance*mt.upper { + // If further progress is not possible, set step to the best step + // obtained during the search. + mt.step = mt.x + } + } + + return FuncEvaluation | GradEvaluation, mt.step, nil +} + +// nextStep computes the next safeguarded step and updates the interval that +// contains a step that satisfies the sufficient decrease and curvature +// conditions. +func (mt *MoreThuente) nextStep(fx, gx, fy, gy, f, g float64) { + x := mt.x + y := mt.y + step := mt.step + + gNeg := g < 0 + if gx < 0 { + gNeg = !gNeg + } + + var next float64 + var bracketed bool + switch { + case f > fx: + // A higher function value. The minimum is bracketed between x and step. + // We want the next step to be closer to x because the function value + // there is lower. + + theta := 3*(fx-f)/(step-x) + gx + g + s := math.Max(math.Abs(gx), math.Abs(g)) + s = math.Max(s, math.Abs(theta)) + gamma := s * math.Sqrt((theta/s)*(theta/s)-(gx/s)*(g/s)) + if step < x { + gamma *= -1 + } + p := gamma - gx + theta + q := gamma - gx + gamma + g + r := p / q + stpc := x + r*(step-x) + stpq := x + gx/((fx-f)/(step-x)+gx)/2*(step-x) + + if math.Abs(stpc-x) < math.Abs(stpq-x) { + // The cubic step is closer to x than the quadratic step. + // Take the cubic step. + next = stpc + } else { + // If f is much larger than fx, then the quadratic step may be too + // close to x. Therefore heuristically take the average of the + // cubic and quadratic steps. + next = stpc + (stpq-stpc)/2 + } + bracketed = true + + case gNeg: + // A lower function value and derivatives of opposite sign. The minimum + // is bracketed between x and step. If we choose a step that is far + // from step, the next iteration will also likely fall in this case. + + theta := 3*(fx-f)/(step-x) + gx + g + s := math.Max(math.Abs(gx), math.Abs(g)) + s = math.Max(s, math.Abs(theta)) + gamma := s * math.Sqrt((theta/s)*(theta/s)-(gx/s)*(g/s)) + if step > x { + gamma *= -1 + } + p := gamma - g + theta + q := gamma - g + gamma + gx + r := p / q + stpc := step + r*(x-step) + stpq := step + g/(g-gx)*(x-step) + + if math.Abs(stpc-step) > math.Abs(stpq-step) { + // The cubic step is farther from x than the quadratic step. + // Take the cubic step. + next = stpc + } else { + // Take the quadratic step. + next = stpq + } + bracketed = true + + case math.Abs(g) < math.Abs(gx): + // A lower function value, derivatives of the same sign, and the + // magnitude of the derivative decreases. Extrapolate function values + // at x and step so that the next step lies between step and y. + + theta := 3*(fx-f)/(step-x) + gx + g + s := math.Max(math.Abs(gx), math.Abs(g)) + s = math.Max(s, math.Abs(theta)) + gamma := s * math.Sqrt(math.Max(0, (theta/s)*(theta/s)-(gx/s)*(g/s))) + if step > x { + gamma *= -1 + } + p := gamma - g + theta + q := gamma + gx - g + gamma + r := p / q + var stpc float64 + switch { + case r < 0 && gamma != 0: + stpc = step + r*(x-step) + case step > x: + stpc = mt.upper + default: + stpc = mt.lower + } + stpq := step + g/(g-gx)*(x-step) + + if mt.bracketed { + // We are extrapolating so be cautious and take the step that + // is closer to step. + if math.Abs(stpc-step) < math.Abs(stpq-step) { + next = stpc + } else { + next = stpq + } + // Modify next if it is close to or beyond y. 
+			if step > x {
+				next = math.Min(step+2.0/3*(y-step), next)
+			} else {
+				next = math.Max(step+2.0/3*(y-step), next)
+			}
+		} else {
+			// Minimum has not been bracketed so take the larger step...
+			if math.Abs(stpc-step) > math.Abs(stpq-step) {
+				next = stpc
+			} else {
+				next = stpq
+			}
+			// ...but within reason.
+			next = math.Max(mt.lower, math.Min(next, mt.upper))
+		}
+
+	default:
+		// A lower function value, derivatives of the same sign, and the
+		// magnitude of the derivative does not decrease. The function seems to
+		// decrease rapidly in the direction of the step.
+
+		switch {
+		case mt.bracketed:
+			theta := 3*(f-fy)/(y-step) + gy + g
+			s := math.Max(math.Abs(gy), math.Abs(g))
+			s = math.Max(s, math.Abs(theta))
+			gamma := s * math.Sqrt((theta/s)*(theta/s)-(gy/s)*(g/s))
+			if step > y {
+				gamma *= -1
+			}
+			p := gamma - g + theta
+			q := gamma - g + gamma + gy
+			r := p / q
+			next = step + r*(y-step)
+		case step > x:
+			next = mt.upper
+		default:
+			next = mt.lower
+		}
+	}
+
+	if f > fx {
+		// x is still the best step.
+		mt.y = step
+		mt.fy = f
+		mt.gy = g
+	} else {
+		// step is the new best step.
+		if gNeg {
+			mt.y = x
+			mt.fy = fx
+			mt.gy = gx
+		}
+		mt.x = step
+		mt.fx = f
+		mt.gx = g
+	}
+	mt.bracketed = bracketed
+	mt.step = next
+}
diff --git a/vendor/gonum.org/v1/gonum/optimize/neldermead.go b/vendor/gonum.org/v1/gonum/optimize/neldermead.go
new file mode 100644
index 0000000000..5118fd4cd5
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/optimize/neldermead.go
@@ -0,0 +1,348 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package optimize
+
+import (
+	"math"
+	"sort"
+
+	"gonum.org/v1/gonum/floats"
+)
+
+// nmIterType is a Nelder-Mead evaluation kind.
+type nmIterType int
+
+const (
+	nmReflected = iota
+	nmExpanded
+	nmContractedInside
+	nmContractedOutside
+	nmInitialize
+	nmShrink
+	nmMajor
+)
+
+type nmVertexSorter struct {
+	vertices [][]float64
+	values   []float64
+}
+
+func (n nmVertexSorter) Len() int {
+	return len(n.values)
+}
+
+func (n nmVertexSorter) Less(i, j int) bool {
+	return n.values[i] < n.values[j]
+}
+
+func (n nmVertexSorter) Swap(i, j int) {
+	n.values[i], n.values[j] = n.values[j], n.values[i]
+	n.vertices[i], n.vertices[j] = n.vertices[j], n.vertices[i]
+}
+
+var _ Method = (*NelderMead)(nil)
+
+// NelderMead is an implementation of the Nelder-Mead simplex algorithm for
+// gradient-free nonlinear optimization (not to be confused with Dantzig's
+// simplex algorithm for linear programming). The implementation follows the
+// algorithm described in
+//
+//	http://epubs.siam.org/doi/pdf/10.1137/S1052623496303470
+//
+// If an initial simplex is provided, it is used and initLoc is ignored. If
+// InitialVertices and InitialValues are both nil, an initial simplex will be
+// generated automatically using the initial location as one vertex, and each
+// additional vertex as SimplexSize away in one dimension.
+//
+// If the simplex update parameters (Reflection, etc.)
+// are zero, they will be set automatically based on the dimension according to +// the recommendations in +// +// http://www.webpages.uidaho.edu/~fuchang/res/ANMS.pdf +type NelderMead struct { + InitialVertices [][]float64 + InitialValues []float64 + Reflection float64 // Reflection parameter (>0) + Expansion float64 // Expansion parameter (>1) + Contraction float64 // Contraction parameter (>0, <1) + Shrink float64 // Shrink parameter (>0, <1) + SimplexSize float64 // size of auto-constructed initial simplex + + status Status + err error + + reflection float64 + expansion float64 + contraction float64 + shrink float64 + + vertices [][]float64 // location of the vertices sorted in ascending f + values []float64 // function values at the vertices sorted in ascending f + centroid []float64 // centroid of all but the worst vertex + + fillIdx int // index for filling the simplex during initialization and shrinking + lastIter nmIterType // Last iteration + reflectedPoint []float64 // Storage of the reflected point location + reflectedValue float64 // Value at the last reflection point +} + +func (n *NelderMead) Status() (Status, error) { + return n.status, n.err +} + +func (*NelderMead) Uses(has Available) (uses Available, err error) { + return has.function() +} + +func (n *NelderMead) Init(dim, tasks int) int { + n.status = NotTerminated + n.err = nil + return 1 +} + +func (n *NelderMead) Run(operation chan<- Task, result <-chan Task, tasks []Task) { + n.status, n.err = localOptimizer{}.run(n, math.NaN(), operation, result, tasks) + close(operation) +} + +func (n *NelderMead) initLocal(loc *Location) (Operation, error) { + dim := len(loc.X) + if cap(n.vertices) < dim+1 { + n.vertices = make([][]float64, dim+1) + } + n.vertices = n.vertices[:dim+1] + for i := range n.vertices { + n.vertices[i] = resize(n.vertices[i], dim) + } + n.values = resize(n.values, dim+1) + n.centroid = resize(n.centroid, dim) + n.reflectedPoint = resize(n.reflectedPoint, dim) + + if n.SimplexSize == 0 { + n.SimplexSize = 0.05 + } + + // Default parameter choices are chosen in a dimension-dependent way + // from http://www.webpages.uidaho.edu/~fuchang/res/ANMS.pdf + n.reflection = n.Reflection + if n.reflection == 0 { + n.reflection = 1 + } + n.expansion = n.Expansion + if n.expansion == 0 { + n.expansion = 1 + 2/float64(dim) + if dim == 1 { + n.expansion = 2 + } + } + n.contraction = n.Contraction + if n.contraction == 0 { + n.contraction = 0.75 - 1/(2*float64(dim)) + if dim == 1 { + n.contraction = 0.5 + } + } + n.shrink = n.Shrink + if n.shrink == 0 { + n.shrink = 1 - 1/float64(dim) + if dim == 1 { + n.shrink = 0.5 + } + } + + if n.InitialVertices != nil { + // Initial simplex provided. Copy the locations and values, and sort them. + if len(n.InitialVertices) != dim+1 { + panic("neldermead: incorrect number of vertices in initial simplex") + } + if len(n.InitialValues) != dim+1 { + panic("neldermead: incorrect number of values in initial simplex") + } + for i := range n.InitialVertices { + if len(n.InitialVertices[i]) != dim { + panic("neldermead: vertex size mismatch") + } + copy(n.vertices[i], n.InitialVertices[i]) + } + copy(n.values, n.InitialValues) + sort.Sort(nmVertexSorter{n.vertices, n.values}) + computeCentroid(n.vertices, n.centroid) + return n.returnNext(nmMajor, loc) + } + + // No simplex provided. Begin initializing initial simplex. First simplex + // entry is the initial location, then step 1 in every direction. 
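+	// For example, in two dimensions with SimplexSize h and initial point
+	// (a, b), the vertices evaluated one at a time below are (a+h, b) and
+	// (a, b+h), while (a, b) itself is kept as the final vertex.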
+ copy(n.vertices[dim], loc.X) + n.values[dim] = loc.F + n.fillIdx = 0 + loc.X[n.fillIdx] += n.SimplexSize + n.lastIter = nmInitialize + return FuncEvaluation, nil +} + +// computeCentroid computes the centroid of all the simplex vertices except the +// final one +func computeCentroid(vertices [][]float64, centroid []float64) { + dim := len(centroid) + for i := range centroid { + centroid[i] = 0 + } + for i := 0; i < dim; i++ { + vertex := vertices[i] + for j, v := range vertex { + centroid[j] += v + } + } + for i := range centroid { + centroid[i] /= float64(dim) + } +} + +func (n *NelderMead) iterateLocal(loc *Location) (Operation, error) { + dim := len(loc.X) + switch n.lastIter { + case nmInitialize: + n.values[n.fillIdx] = loc.F + copy(n.vertices[n.fillIdx], loc.X) + n.fillIdx++ + if n.fillIdx == dim { + // Successfully finished building initial simplex. + sort.Sort(nmVertexSorter{n.vertices, n.values}) + computeCentroid(n.vertices, n.centroid) + return n.returnNext(nmMajor, loc) + } + copy(loc.X, n.vertices[dim]) + loc.X[n.fillIdx] += n.SimplexSize + return FuncEvaluation, nil + case nmMajor: + // Nelder Mead iterations start with Reflection step + return n.returnNext(nmReflected, loc) + case nmReflected: + n.reflectedValue = loc.F + switch { + case loc.F >= n.values[0] && loc.F < n.values[dim-1]: + n.replaceWorst(loc.X, loc.F) + return n.returnNext(nmMajor, loc) + case loc.F < n.values[0]: + return n.returnNext(nmExpanded, loc) + default: + if loc.F < n.values[dim] { + return n.returnNext(nmContractedOutside, loc) + } + return n.returnNext(nmContractedInside, loc) + } + case nmExpanded: + if loc.F < n.reflectedValue { + n.replaceWorst(loc.X, loc.F) + } else { + n.replaceWorst(n.reflectedPoint, n.reflectedValue) + } + return n.returnNext(nmMajor, loc) + case nmContractedOutside: + if loc.F <= n.reflectedValue { + n.replaceWorst(loc.X, loc.F) + return n.returnNext(nmMajor, loc) + } + n.fillIdx = 1 + return n.returnNext(nmShrink, loc) + case nmContractedInside: + if loc.F < n.values[dim] { + n.replaceWorst(loc.X, loc.F) + return n.returnNext(nmMajor, loc) + } + n.fillIdx = 1 + return n.returnNext(nmShrink, loc) + case nmShrink: + copy(n.vertices[n.fillIdx], loc.X) + n.values[n.fillIdx] = loc.F + n.fillIdx++ + if n.fillIdx != dim+1 { + return n.returnNext(nmShrink, loc) + } + sort.Sort(nmVertexSorter{n.vertices, n.values}) + computeCentroid(n.vertices, n.centroid) + return n.returnNext(nmMajor, loc) + default: + panic("unreachable") + } +} + +// returnNext updates the location based on the iteration type and the current +// simplex, and returns the next operation. +func (n *NelderMead) returnNext(iter nmIterType, loc *Location) (Operation, error) { + n.lastIter = iter + switch iter { + case nmMajor: + // Fill loc with the current best point and value, + // and command a convergence check. 
+		copy(loc.X, n.vertices[0])
+		loc.F = n.values[0]
+		return MajorIteration, nil
+	case nmReflected, nmExpanded, nmContractedOutside, nmContractedInside:
+		// x_new = x_centroid + scale * (x_centroid - x_worst)
+		var scale float64
+		switch iter {
+		case nmReflected:
+			scale = n.reflection
+		case nmExpanded:
+			scale = n.reflection * n.expansion
+		case nmContractedOutside:
+			scale = n.reflection * n.contraction
+		case nmContractedInside:
+			scale = -n.contraction
+		}
+		dim := len(loc.X)
+		floats.SubTo(loc.X, n.centroid, n.vertices[dim])
+		floats.Scale(scale, loc.X)
+		floats.Add(loc.X, n.centroid)
+		if iter == nmReflected {
+			copy(n.reflectedPoint, loc.X)
+		}
+		return FuncEvaluation, nil
+	case nmShrink:
+		// x_shrink = x_best + delta * (x_i - x_best)
+		floats.SubTo(loc.X, n.vertices[n.fillIdx], n.vertices[0])
+		floats.Scale(n.shrink, loc.X)
+		floats.Add(loc.X, n.vertices[0])
+		return FuncEvaluation, nil
+	default:
+		panic("unreachable")
+	}
+}
+
+// replaceWorst removes the worst location in the simplex and adds the new
+// {x, f} pair, maintaining the sorted order.
+func (n *NelderMead) replaceWorst(x []float64, f float64) {
+	dim := len(x)
+	if f >= n.values[dim] {
+		panic("increase in simplex value")
+	}
+	copy(n.vertices[dim], x)
+	n.values[dim] = f
+
+	// Sort the newly-added value.
+	for i := dim - 1; i >= 0; i-- {
+		if n.values[i] < f {
+			break
+		}
+		n.vertices[i], n.vertices[i+1] = n.vertices[i+1], n.vertices[i]
+		n.values[i], n.values[i+1] = n.values[i+1], n.values[i]
+	}
+
+	// Update the location of the centroid. Only one point has been replaced, so
+	// subtract the worst point and add the new one.
+	floats.AddScaled(n.centroid, -1/float64(dim), n.vertices[dim])
+	floats.AddScaled(n.centroid, 1/float64(dim), x)
+}
+
+func (*NelderMead) needs() struct {
+	Gradient bool
+	Hessian  bool
+} {
+	return struct {
+		Gradient bool
+		Hessian  bool
+	}{false, false}
+}
diff --git a/vendor/gonum.org/v1/gonum/optimize/newton.go b/vendor/gonum.org/v1/gonum/optimize/newton.go
new file mode 100644
index 0000000000..bd29a08be6
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/optimize/newton.go
@@ -0,0 +1,182 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package optimize
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/mat"
+)
+
+const maxNewtonModifications = 20
+
+var (
+	_ Method          = (*Newton)(nil)
+	_ localMethod     = (*Newton)(nil)
+	_ NextDirectioner = (*Newton)(nil)
+)
+
+// Newton implements a modified Newton's method for Hessian-based unconstrained
+// minimization. It applies regularization when the Hessian is not positive
+// definite, and it can converge to a local minimum from any starting point.
+//
+// Newton iteratively forms a quadratic model to the objective function f and
+// tries to minimize this approximate model. It generates a sequence of
+// locations x_k by means of
+//
+//	solve H_k d_k = -∇f_k for d_k,
+//	x_{k+1} = x_k + α_k d_k,
+//
+// where H_k is the Hessian matrix of f at x_k and α_k is a step size found by
+// a line search.
+//
+// Away from a minimizer H_k may not be positive definite and d_k may not be a
+// descent direction. Newton implements a Hessian modification strategy that
+// adds successively larger multiples of identity to H_k until it becomes
+// positive definite. Note that the repeated trial factorization of the
+// modified Hessian involved in this process can be computationally expensive.
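+//
+// As a concrete illustration, for H = [[0, 1], [1, 0]] the smallest diagonal
+// entry is zero, so on the first iteration tau starts at 0.001 and is grown
+// by the factor Increase (default 5) until H + tau*I admits a Cholesky
+// factorization, which first happens at tau = 3.125.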
+// +// If the Hessian matrix cannot be formed explicitly or if the computational +// cost of its factorization is prohibitive, BFGS or L-BFGS quasi-Newton method +// can be used instead. +type Newton struct { + // Linesearcher is used for selecting suitable steps along the descent + // direction d. Accepted steps should satisfy at least one of the Wolfe, + // Goldstein or Armijo conditions. + // If Linesearcher == nil, an appropriate default is chosen. + Linesearcher Linesearcher + // Increase is the factor by which a scalar tau is successively increased + // so that (H + tau*I) is positive definite. Larger values reduce the + // number of trial Hessian factorizations, but also reduce the second-order + // information in H. + // Increase must be greater than 1. If Increase is 0, it is defaulted to 5. + Increase float64 + // GradStopThreshold sets the threshold for stopping if the gradient norm + // gets too small. If GradStopThreshold is 0 it is defaulted to 1e-12, and + // if it is NaN the setting is not used. + GradStopThreshold float64 + + status Status + err error + + ls *LinesearchMethod + + hess *mat.SymDense // Storage for a copy of the Hessian matrix. + chol mat.Cholesky // Storage for the Cholesky factorization. + tau float64 +} + +func (n *Newton) Status() (Status, error) { + return n.status, n.err +} + +func (*Newton) Uses(has Available) (uses Available, err error) { + return has.hessian() +} + +func (n *Newton) Init(dim, tasks int) int { + n.status = NotTerminated + n.err = nil + return 1 +} + +func (n *Newton) Run(operation chan<- Task, result <-chan Task, tasks []Task) { + n.status, n.err = localOptimizer{}.run(n, n.GradStopThreshold, operation, result, tasks) + close(operation) +} + +func (n *Newton) initLocal(loc *Location) (Operation, error) { + if n.Increase == 0 { + n.Increase = 5 + } + if n.Increase <= 1 { + panic("optimize: Newton.Increase must be greater than 1") + } + if n.Linesearcher == nil { + n.Linesearcher = &Bisection{} + } + if n.ls == nil { + n.ls = &LinesearchMethod{} + } + n.ls.Linesearcher = n.Linesearcher + n.ls.NextDirectioner = n + return n.ls.Init(loc) +} + +func (n *Newton) iterateLocal(loc *Location) (Operation, error) { + return n.ls.Iterate(loc) +} + +func (n *Newton) InitDirection(loc *Location, dir []float64) (stepSize float64) { + dim := len(loc.X) + n.hess = resizeSymDense(n.hess, dim) + n.tau = 0 + return n.NextDirection(loc, dir) +} + +func (n *Newton) NextDirection(loc *Location, dir []float64) (stepSize float64) { + // This method implements Algorithm 3.3 (Cholesky with Added Multiple of + // the Identity) from Nocedal, Wright (2006), 2nd edition. + + dim := len(loc.X) + d := mat.NewVecDense(dim, dir) + grad := mat.NewVecDense(dim, loc.Gradient) + n.hess.CopySym(loc.Hessian) + + // Find the smallest diagonal entry of the Hessian. + minA := n.hess.At(0, 0) + for i := 1; i < dim; i++ { + a := n.hess.At(i, i) + if a < minA { + minA = a + } + } + // If the smallest diagonal entry is positive, the Hessian may be positive + // definite, and so first attempt to apply the Cholesky factorization to + // the un-modified Hessian. If the smallest entry is negative, use the + // final tau from the last iteration if regularization was needed, + // otherwise guess an appropriate value for tau. + if minA > 0 { + n.tau = 0 + } else if n.tau == 0 { + n.tau = -minA + 0.001 + } + + for k := 0; k < maxNewtonModifications; k++ { + if n.tau != 0 { + // Add a multiple of identity to the Hessian. 
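+			// The diagonal is rebuilt from the original Hessian on each
+			// trial, so successive values of tau replace, rather than
+			// accumulate on, the previous shift.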
+ for i := 0; i < dim; i++ { + n.hess.SetSym(i, i, loc.Hessian.At(i, i)+n.tau) + } + } + // Try to apply the Cholesky factorization. + pd := n.chol.Factorize(n.hess) + if pd { + // Store the solution in d's backing array, dir. + err := n.chol.SolveVecTo(d, grad) + if err == nil { + d.ScaleVec(-1, d) + return 1 + } + } + // Modified Hessian is not PD, so increase tau. + n.tau = math.Max(n.Increase*n.tau, 0.001) + } + + // Hessian modification failed to get a PD matrix. Return the negative + // gradient as the descent direction. + d.ScaleVec(-1, grad) + return 1 +} + +func (n *Newton) needs() struct { + Gradient bool + Hessian bool +} { + return struct { + Gradient bool + Hessian bool + }{true, true} +} diff --git a/vendor/gonum.org/v1/gonum/optimize/printer.go b/vendor/gonum.org/v1/gonum/optimize/printer.go new file mode 100644 index 0000000000..c4cc77bcbd --- /dev/null +++ b/vendor/gonum.org/v1/gonum/optimize/printer.go @@ -0,0 +1,108 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package optimize + +import ( + "fmt" + "io" + "math" + "os" + "time" + + "gonum.org/v1/gonum/floats" +) + +var printerHeadings = [...]string{ + "Iter", + "Runtime", + "FuncEvals", + "Func", + "GradEvals", + "|Gradient|∞", + "HessEvals", +} + +const ( + printerBaseTmpl = "%9v %16v %9v %22v" // Base template for headings and values that are always printed. + printerGradTmpl = " %9v %22v" // Appended to base template when loc.Gradient != nil. + printerHessTmpl = " %9v" // Appended to base template when loc.Hessian != nil. +) + +var _ Recorder = (*Printer)(nil) + +// Printer writes column-format output to the specified writer as the optimization +// progresses. By default, it writes to os.Stdout. +type Printer struct { + Writer io.Writer + HeadingInterval int + ValueInterval time.Duration + + lastHeading int + lastValue time.Time +} + +func NewPrinter() *Printer { + return &Printer{ + Writer: os.Stdout, + HeadingInterval: 30, + ValueInterval: 500 * time.Millisecond, + } +} + +func (p *Printer) Init() error { + p.lastHeading = p.HeadingInterval // So the headings are printed the first time. + p.lastValue = time.Now().Add(-p.ValueInterval) // So the values are printed the first time. + return nil +} + +func (p *Printer) Record(loc *Location, op Operation, stats *Stats) error { + if op != MajorIteration && op != InitIteration && op != PostIteration { + return nil + } + + // Print values always on PostIteration or when ValueInterval has elapsed. + printValues := time.Since(p.lastValue) > p.ValueInterval || op == PostIteration + if !printValues { + // Return early if not printing anything. + return nil + } + + // Print heading when HeadingInterval lines have been printed, but never on PostIteration. 
+ printHeading := p.lastHeading >= p.HeadingInterval && op != PostIteration + if printHeading { + p.lastHeading = 1 + } else { + p.lastHeading++ + } + + if printHeading { + headings := "\n" + fmt.Sprintf(printerBaseTmpl, printerHeadings[0], printerHeadings[1], printerHeadings[2], printerHeadings[3]) + if loc.Gradient != nil { + headings += fmt.Sprintf(printerGradTmpl, printerHeadings[4], printerHeadings[5]) + } + if loc.Hessian != nil { + headings += fmt.Sprintf(printerHessTmpl, printerHeadings[6]) + } + _, err := fmt.Fprintln(p.Writer, headings) + if err != nil { + return err + } + } + + values := fmt.Sprintf(printerBaseTmpl, stats.MajorIterations, stats.Runtime, stats.FuncEvaluations, loc.F) + if loc.Gradient != nil { + values += fmt.Sprintf(printerGradTmpl, stats.GradEvaluations, floats.Norm(loc.Gradient, math.Inf(1))) + } + if loc.Hessian != nil { + values += fmt.Sprintf(printerHessTmpl, stats.HessEvaluations) + } + _, err := fmt.Fprintln(p.Writer, values) + if err != nil { + return err + } + + p.lastValue = time.Now() + return nil +} diff --git a/vendor/gonum.org/v1/gonum/optimize/stepsizers.go b/vendor/gonum.org/v1/gonum/optimize/stepsizers.go new file mode 100644 index 0000000000..6508b573e9 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/optimize/stepsizers.go @@ -0,0 +1,194 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package optimize + +import ( + "math" + + "gonum.org/v1/gonum/floats" + "gonum.org/v1/gonum/floats/scalar" +) + +const ( + initialStepFactor = 1 + + quadraticMinimumStepSize = 1e-3 + quadraticMaximumStepSize = 1 + quadraticThreshold = 1e-12 + + firstOrderMinimumStepSize = quadraticMinimumStepSize + firstOrderMaximumStepSize = quadraticMaximumStepSize +) + +var ( + _ StepSizer = ConstantStepSize{} + _ StepSizer = (*QuadraticStepSize)(nil) + _ StepSizer = (*FirstOrderStepSize)(nil) +) + +// ConstantStepSize is a StepSizer that returns the same step size for +// every iteration. +type ConstantStepSize struct { + Size float64 +} + +func (c ConstantStepSize) Init(_ *Location, _ []float64) float64 { + return c.Size +} + +func (c ConstantStepSize) StepSize(_ *Location, _ []float64) float64 { + return c.Size +} + +// QuadraticStepSize estimates the initial line search step size as the minimum +// of a quadratic that interpolates f(x_{k-1}), f(x_k) and ∇f_k⋅p_k. +// This is useful for line search methods that do not produce well-scaled +// descent directions, such as gradient descent or conjugate gradient methods. +// The step size is bounded away from zero. +type QuadraticStepSize struct { + // Threshold determines that the initial step size should be estimated by + // quadratic interpolation when the relative change in the objective + // function is larger than Threshold. Otherwise the initial step size is + // set to 2*previous step size. + // If Threshold is zero, it will be set to 1e-12. + Threshold float64 + // InitialStepFactor sets the step size for the first iteration to be InitialStepFactor / |g|_∞. + // If InitialStepFactor is zero, it will be set to one. + InitialStepFactor float64 + // MinStepSize is the lower bound on the estimated step size. + // MinStepSize times GradientAbsTol should always be greater than machine epsilon. + // If MinStepSize is zero, it will be set to 1e-3. + MinStepSize float64 + // MaxStepSize is the upper bound on the estimated step size. + // If MaxStepSize is zero, it will be set to 1. 
+ MaxStepSize float64 + + fPrev float64 + dirPrevNorm float64 + projGradPrev float64 + xPrev []float64 +} + +func (q *QuadraticStepSize) Init(loc *Location, dir []float64) (stepSize float64) { + if q.Threshold == 0 { + q.Threshold = quadraticThreshold + } + if q.InitialStepFactor == 0 { + q.InitialStepFactor = initialStepFactor + } + if q.MinStepSize == 0 { + q.MinStepSize = quadraticMinimumStepSize + } + if q.MaxStepSize == 0 { + q.MaxStepSize = quadraticMaximumStepSize + } + if q.MaxStepSize <= q.MinStepSize { + panic("optimize: MinStepSize not smaller than MaxStepSize") + } + + gNorm := floats.Norm(loc.Gradient, math.Inf(1)) + stepSize = math.Max(q.MinStepSize, math.Min(q.InitialStepFactor/gNorm, q.MaxStepSize)) + + q.fPrev = loc.F + q.dirPrevNorm = floats.Norm(dir, 2) + q.projGradPrev = floats.Dot(loc.Gradient, dir) + q.xPrev = resize(q.xPrev, len(loc.X)) + copy(q.xPrev, loc.X) + return stepSize +} + +func (q *QuadraticStepSize) StepSize(loc *Location, dir []float64) (stepSize float64) { + stepSizePrev := floats.Distance(loc.X, q.xPrev, 2) / q.dirPrevNorm + projGrad := floats.Dot(loc.Gradient, dir) + + stepSize = 2 * stepSizePrev + if !scalar.EqualWithinRel(q.fPrev, loc.F, q.Threshold) { + // Two consecutive function values are not relatively equal, so + // computing the minimum of a quadratic interpolant might make sense + + df := (loc.F - q.fPrev) / stepSizePrev + quadTest := df - q.projGradPrev + if quadTest > 0 { + // There is a chance of approximating the function well by a + // quadratic only if the finite difference (f_k-f_{k-1})/stepSizePrev + // is larger than ∇f_{k-1}⋅p_{k-1} + + // Set the step size to the minimizer of the quadratic function that + // interpolates f_{k-1}, ∇f_{k-1}⋅p_{k-1} and f_k + stepSize = -q.projGradPrev * stepSizePrev / quadTest / 2 + } + } + // Bound the step size to lie in [MinStepSize, MaxStepSize] + stepSize = math.Max(q.MinStepSize, math.Min(stepSize, q.MaxStepSize)) + + q.fPrev = loc.F + q.dirPrevNorm = floats.Norm(dir, 2) + q.projGradPrev = projGrad + copy(q.xPrev, loc.X) + return stepSize +} + +// FirstOrderStepSize estimates the initial line search step size based on the +// assumption that the first-order change in the function will be the same as +// that obtained at the previous iteration. That is, the initial step size s^0_k +// is chosen so that +// +// s^0_k ∇f_k⋅p_k = s_{k-1} ∇f_{k-1}⋅p_{k-1} +// +// This is useful for line search methods that do not produce well-scaled +// descent directions, such as gradient descent or conjugate gradient methods. +type FirstOrderStepSize struct { + // InitialStepFactor sets the step size for the first iteration to be InitialStepFactor / |g|_∞. + // If InitialStepFactor is zero, it will be set to one. + InitialStepFactor float64 + // MinStepSize is the lower bound on the estimated step size. + // MinStepSize times GradientAbsTol should always be greater than machine epsilon. + // If MinStepSize is zero, it will be set to 1e-3. + MinStepSize float64 + // MaxStepSize is the upper bound on the estimated step size. + // If MaxStepSize is zero, it will be set to 1. 
+ MaxStepSize float64 + + dirPrevNorm float64 + projGradPrev float64 + xPrev []float64 +} + +func (fo *FirstOrderStepSize) Init(loc *Location, dir []float64) (stepSize float64) { + if fo.InitialStepFactor == 0 { + fo.InitialStepFactor = initialStepFactor + } + if fo.MinStepSize == 0 { + fo.MinStepSize = firstOrderMinimumStepSize + } + if fo.MaxStepSize == 0 { + fo.MaxStepSize = firstOrderMaximumStepSize + } + if fo.MaxStepSize <= fo.MinStepSize { + panic("optimize: MinStepSize not smaller than MaxStepSize") + } + + gNorm := floats.Norm(loc.Gradient, math.Inf(1)) + stepSize = math.Max(fo.MinStepSize, math.Min(fo.InitialStepFactor/gNorm, fo.MaxStepSize)) + + fo.dirPrevNorm = floats.Norm(dir, 2) + fo.projGradPrev = floats.Dot(loc.Gradient, dir) + fo.xPrev = resize(fo.xPrev, len(loc.X)) + copy(fo.xPrev, loc.X) + return stepSize +} + +func (fo *FirstOrderStepSize) StepSize(loc *Location, dir []float64) (stepSize float64) { + stepSizePrev := floats.Distance(loc.X, fo.xPrev, 2) / fo.dirPrevNorm + projGrad := floats.Dot(loc.Gradient, dir) + + stepSize = stepSizePrev * fo.projGradPrev / projGrad + stepSize = math.Max(fo.MinStepSize, math.Min(stepSize, fo.MaxStepSize)) + + fo.dirPrevNorm = floats.Norm(dir, 2) + fo.projGradPrev = floats.Dot(loc.Gradient, dir) + copy(fo.xPrev, loc.X) + return stepSize +} diff --git a/vendor/gonum.org/v1/gonum/optimize/termination.go b/vendor/gonum.org/v1/gonum/optimize/termination.go new file mode 100644 index 0000000000..df4bdb7e49 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/optimize/termination.go @@ -0,0 +1,123 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package optimize + +import "errors" + +// Status represents the status of the optimization. Programs +// should not rely on the underlying numeric value of the Status being constant. +type Status int + +const ( + NotTerminated Status = iota + Success + FunctionThreshold + FunctionConvergence + GradientThreshold + StepConvergence + FunctionNegativeInfinity + MethodConverge + Failure + IterationLimit + RuntimeLimit + FunctionEvaluationLimit + GradientEvaluationLimit + HessianEvaluationLimit +) + +func (s Status) String() string { + return statuses[s].name +} + +// Early returns true if the status indicates the optimization ended before a +// minimum was found. As an example, if the maximum iterations was reached, a +// minimum was not found, but if the gradient norm was reached then a minimum +// was found. +func (s Status) Early() bool { + return statuses[s].early +} + +// Err returns the error associated with an early ending to the minimization. If +// Early returns false, Err will return nil. 
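+//
+// A typical use (an illustrative sketch; p and x0 are assumed to be defined
+// by the caller) is
+//
+//	res, err := Minimize(p, x0, nil, nil)
+//	if err == nil && res.Status.Early() {
+//		err = res.Status.Err()
+//	}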
+func (s Status) Err() error { + return statuses[s].err +} + +var statuses = []struct { + name string + early bool + err error +}{ + { + name: "NotTerminated", + }, + { + name: "Success", + }, + { + name: "FunctionThreshold", + }, + { + name: "FunctionConvergence", + }, + { + name: "GradientThreshold", + }, + { + name: "StepConvergence", + }, + { + name: "FunctionNegativeInfinity", + }, + { + name: "MethodConverge", + }, + { + name: "Failure", + early: true, + err: errors.New("optimize: termination ended in failure"), + }, + { + name: "IterationLimit", + early: true, + err: errors.New("optimize: maximum number of major iterations reached"), + }, + { + name: "RuntimeLimit", + early: true, + err: errors.New("optimize: maximum runtime reached"), + }, + { + name: "FunctionEvaluationLimit", + early: true, + err: errors.New("optimize: maximum number of function evaluations reached"), + }, + { + name: "GradientEvaluationLimit", + early: true, + err: errors.New("optimize: maximum number of gradient evaluations reached"), + }, + { + name: "HessianEvaluationLimit", + early: true, + err: errors.New("optimize: maximum number of Hessian evaluations reached"), + }, +} + +// NewStatus returns a unique Status variable to represent a custom status. +// NewStatus is intended to be called only during package initialization, and +// calls to NewStatus are not thread safe. +// +// NewStatus takes in three arguments, the string that should be output from +// Status.String, a boolean if the status indicates early optimization conclusion, +// and the error to return from Err (if any). +func NewStatus(name string, early bool, err error) Status { + statuses = append(statuses, struct { + name string + early bool + err error + }{name, early, err}) + return Status(len(statuses) - 1) +} diff --git a/vendor/gonum.org/v1/gonum/optimize/types.go b/vendor/gonum.org/v1/gonum/optimize/types.go new file mode 100644 index 0000000000..e3172c1d70 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/optimize/types.go @@ -0,0 +1,273 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package optimize + +import ( + "fmt" + "time" + + "gonum.org/v1/gonum/mat" +) + +const defaultGradientAbsTol = 1e-12 + +// Operation represents the set of operations commanded by Method at each +// iteration. It is a bitmap of various Iteration and Evaluation constants. +// Individual constants must NOT be combined together by the binary OR operator +// except for the Evaluation operations. +type Operation uint64 + +// Supported Operations. +const ( + // NoOperation specifies that no evaluation or convergence check should + // take place. + NoOperation Operation = 0 + // InitIteration is sent to Recorder to indicate the initial location. + // All fields of the location to record must be valid. + // Method must not return it. + InitIteration Operation = 1 << (iota - 1) + // PostIteration is sent to Recorder to indicate the final location + // reached during an optimization run. + // All fields of the location to record must be valid. + // Method must not return it. + PostIteration + // MajorIteration indicates that the next candidate location for + // an optimum has been found and convergence should be checked. + MajorIteration + // MethodDone declares that the method is done running. A method must + // be a Statuser in order to use this iteration, and after returning + // MethodDone, the Status must return other than NotTerminated. 
+ MethodDone + // FuncEvaluation specifies that the objective function + // should be evaluated. + FuncEvaluation + // GradEvaluation specifies that the gradient + // of the objective function should be evaluated. + GradEvaluation + // HessEvaluation specifies that the Hessian + // of the objective function should be evaluated. + HessEvaluation + // signalDone is used internally to signal completion. + signalDone + + // Mask for the evaluating operations. + evalMask = FuncEvaluation | GradEvaluation | HessEvaluation +) + +func (op Operation) isEvaluation() bool { + return op&evalMask != 0 && op&^evalMask == 0 +} + +func (op Operation) String() string { + if op&evalMask != 0 { + return fmt.Sprintf("Evaluation(Func: %t, Grad: %t, Hess: %t, Extra: 0b%b)", + op&FuncEvaluation != 0, + op&GradEvaluation != 0, + op&HessEvaluation != 0, + op&^(evalMask)) + } + s, ok := operationNames[op] + if ok { + return s + } + return fmt.Sprintf("Operation(%d)", op) +} + +var operationNames = map[Operation]string{ + NoOperation: "NoOperation", + InitIteration: "InitIteration", + MajorIteration: "MajorIteration", + PostIteration: "PostIteration", + MethodDone: "MethodDone", + signalDone: "signalDone", +} + +// Result represents the answer of an optimization run. It contains the optimum +// function value, X location, and gradient as well as the Status at convergence +// and Statistics taken during the run. +type Result struct { + Location + Stats + Status Status +} + +// Stats contains the statistics of the run. +type Stats struct { + MajorIterations int // Total number of major iterations + FuncEvaluations int // Number of evaluations of Func + GradEvaluations int // Number of evaluations of Grad + HessEvaluations int // Number of evaluations of Hess + Runtime time.Duration // Total runtime of the optimization +} + +// complementEval returns an evaluating operation that evaluates fields of loc +// not evaluated by eval. +func complementEval(loc *Location, eval Operation) (complEval Operation) { + if eval&FuncEvaluation == 0 { + complEval = FuncEvaluation + } + if loc.Gradient != nil && eval&GradEvaluation == 0 { + complEval |= GradEvaluation + } + if loc.Hessian != nil && eval&HessEvaluation == 0 { + complEval |= HessEvaluation + } + return complEval +} + +// Problem describes the optimization problem to be solved. +type Problem struct { + // Func evaluates the objective function at the given location. Func + // must not modify x. + Func func(x []float64) float64 + + // Grad evaluates the gradient at x and stores the result in grad which will + // be the same length as x. Grad must not modify x. + Grad func(grad, x []float64) + + // Hess evaluates the Hessian at x and stores the result in-place in hess which + // will have dimensions matching the length of x. Hess must not modify x. + Hess func(hess *mat.SymDense, x []float64) + + // Status reports the status of the objective function being optimized and any + // error. This can be used to terminate early, for example when the function is + // not able to evaluate itself. The user can use one of the pre-provided Status + // constants, or may call NewStatus to create a custom Status value. + Status func() (Status, error) +} + +// Available describes the functions available to call in Problem. 
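+//
+// For example (editor's sketch), a Problem with Func and Grad set yields
+// Available{Grad: true} via availFromProblem:
+//
+//	p := Problem{
+//		Func: func(x []float64) float64 { return x[0] * x[0] },
+//		Grad: func(grad, x []float64) { grad[0] = 2 * x[0] },
+//	}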
+type Available struct { + Grad bool + Hess bool +} + +func availFromProblem(prob Problem) Available { + return Available{Grad: prob.Grad != nil, Hess: prob.Hess != nil} +} + +// function tests if the Problem described by the receiver is suitable for an +// unconstrained Method that only calls the function, and returns the result. +func (has Available) function() (uses Available, err error) { + // TODO(btracey): This needs to be modified when optimize supports + // constrained optimization. + return Available{}, nil +} + +// gradient tests if the Problem described by the receiver is suitable for an +// unconstrained gradient-based Method, and returns the result. +func (has Available) gradient() (uses Available, err error) { + // TODO(btracey): This needs to be modified when optimize supports + // constrained optimization. + if !has.Grad { + return Available{}, ErrMissingGrad + } + return Available{Grad: true}, nil +} + +// hessian tests if the Problem described by the receiver is suitable for an +// unconstrained Hessian-based Method, and returns the result. +func (has Available) hessian() (uses Available, err error) { + // TODO(btracey): This needs to be modified when optimize supports + // constrained optimization. + if !has.Grad { + return Available{}, ErrMissingGrad + } + if !has.Hess { + return Available{}, ErrMissingHess + } + return Available{Grad: true, Hess: true}, nil +} + +// Settings represents settings of the optimization run. It contains initial +// settings, convergence information, and Recorder information. Convergence +// settings are only checked at MajorIterations, while Evaluation thresholds +// are checked at every Operation. See the field comments for default values. +type Settings struct { + // InitValues specifies properties (function value, gradient, etc.) known + // at the initial location passed to Minimize. If InitValues is non-nil, then + // the function value F must be provided, the location X must not be specified + // and other fields may be specified. The values in Location may be modified + // during the call to Minimize. + InitValues *Location + + // GradientThreshold stops optimization with GradientThreshold status if the + // infinity norm of the gradient is less than this value. This defaults to + // a value of 0 (and so gradient convergence is not checked), however note + // that many Methods (LBFGS, CG, etc.) will converge with a small value of + // the gradient, and so to fully disable this setting the Method may need to + // be modified. + // This setting has no effect if the gradient is not used by the Method. + GradientThreshold float64 + + // Converger checks if the optimization has converged based on the (history + // of) locations found during the optimization. Minimize will pass the + // Location at every MajorIteration to the Converger. + // + // If the Converger is nil, a default value of + // FunctionConverge { + // Absolute: 1e-10, + // Iterations: 100, + // } + // will be used. NeverTerminated can be used to always return a + // NotTerminated status. + Converger Converger + + // MajorIterations is the maximum number of iterations allowed. + // IterationLimit status is returned if the number of major iterations + // equals or exceeds this value. + // If it equals zero, this setting has no effect. + // The default value is 0. + MajorIterations int + + // Runtime is the maximum runtime allowed. RuntimeLimit status is returned + // if the duration of the run is longer than this value. 
Runtime is only + // checked at MajorIterations of the Method. + // If it equals zero, this setting has no effect. + // The default value is 0. + Runtime time.Duration + + // FuncEvaluations is the maximum allowed number of function evaluations. + // FunctionEvaluationLimit status is returned if the total number of calls + // to Func equals or exceeds this number. + // If it equals zero, this setting has no effect. + // The default value is 0. + FuncEvaluations int + + // GradEvaluations is the maximum allowed number of gradient evaluations. + // GradientEvaluationLimit status is returned if the total number of calls + // to Grad equals or exceeds this number. + // If it equals zero, this setting has no effect. + // The default value is 0. + GradEvaluations int + + // HessEvaluations is the maximum allowed number of Hessian evaluations. + // HessianEvaluationLimit status is returned if the total number of calls + // to Hess equals or exceeds this number. + // If it equals zero, this setting has no effect. + // The default value is 0. + HessEvaluations int + + Recorder Recorder + + // Concurrent represents how many concurrent evaluations are possible. + Concurrent int +} + +// resize takes x and returns a slice of length dim. It returns a resliced x +// if cap(x) >= dim, and a new slice otherwise. +func resize(x []float64, dim int) []float64 { + if dim > cap(x) { + return make([]float64, dim) + } + return x[:dim] +} + +func resizeSymDense(m *mat.SymDense, dim int) *mat.SymDense { + if m == nil || cap(m.RawSymmetric().Data) < dim*dim { + return mat.NewSymDense(dim, nil) + } + return mat.NewSymDense(dim, m.RawSymmetric().Data[:dim*dim]) +} diff --git a/vendor/gonum.org/v1/gonum/spatial/r1/doc.go b/vendor/gonum.org/v1/gonum/spatial/r1/doc.go new file mode 100644 index 0000000000..0215961190 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/spatial/r1/doc.go @@ -0,0 +1,6 @@ +// Copyright ©2019 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package r1 provides 1D vectors and intervals and operations on them. +package r1 // import "gonum.org/v1/gonum/spatial/r1" diff --git a/vendor/gonum.org/v1/gonum/spatial/r1/interval.go b/vendor/gonum.org/v1/gonum/spatial/r1/interval.go new file mode 100644 index 0000000000..71f42fd3f7 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/spatial/r1/interval.go @@ -0,0 +1,10 @@ +// Copyright ©2019 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package r1 + +// Interval represents an interval. +type Interval struct { + Min, Max float64 +} diff --git a/vendor/gonum.org/v1/gonum/stat/README.md b/vendor/gonum.org/v1/gonum/stat/README.md new file mode 100644 index 0000000000..7156dc5095 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/README.md @@ -0,0 +1,6 @@ +# Gonum stat + +[![go.dev reference](https://pkg.go.dev/badge/gonum.org/v1/gonum/stat)](https://pkg.go.dev/gonum.org/v1/gonum/stat) +[![GoDoc](https://godocs.io/gonum.org/v1/gonum/stat?status.svg)](https://godocs.io/gonum.org/v1/gonum/stat) + +Package stat is a statistics package for the Go language. diff --git a/vendor/gonum.org/v1/gonum/stat/combin/combin.go b/vendor/gonum.org/v1/gonum/stat/combin/combin.go new file mode 100644 index 0000000000..b5814171cc --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/combin/combin.go @@ -0,0 +1,683 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package combin + +import ( + "math" + "sort" +) + +const ( + errNegInput = "combin: negative input" + badSetSize = "combin: n < k" + badInput = "combin: wrong input slice length" + errNonpositiveDimension = "combin: non-positive dimension" +) + +// Binomial returns the binomial coefficient of (n,k), also commonly referred to +// as "n choose k". +// +// The binomial coefficient, C(n,k), is the number of unordered combinations of +// k elements in a set that is n elements big, and is defined as +// +// C(n,k) = n!/((n-k)!k!) +// +// n and k must be non-negative with n >= k, otherwise Binomial will panic. +// No check is made for overflow. +func Binomial(n, k int) int { + if n < 0 || k < 0 { + panic(errNegInput) + } + if n < k { + panic(badSetSize) + } + // (n,k) = (n, n-k) + if k > n/2 { + k = n - k + } + b := 1 + for i := 1; i <= k; i++ { + b = (n - k + i) * b / i + } + return b +} + +// GeneralizedBinomial returns the generalized binomial coefficient of (n, k), +// defined as +// +// Γ(n+1) / (Γ(k+1) Γ(n-k+1)) +// +// where Γ is the Gamma function. GeneralizedBinomial is useful for continuous +// relaxations of the binomial coefficient, or when the binomial coefficient value +// may overflow int. In the latter case, one may use math/big for an exact +// computation. +// +// n and k must be non-negative with n >= k, otherwise GeneralizedBinomial will panic. +func GeneralizedBinomial(n, k float64) float64 { + return math.Exp(LogGeneralizedBinomial(n, k)) +} + +// LogGeneralizedBinomial returns the log of the generalized binomial coefficient. +// See GeneralizedBinomial for more information. +func LogGeneralizedBinomial(n, k float64) float64 { + if n < 0 || k < 0 { + panic(errNegInput) + } + if n < k { + panic(badSetSize) + } + a, _ := math.Lgamma(n + 1) + b, _ := math.Lgamma(k + 1) + c, _ := math.Lgamma(n - k + 1) + return a - b - c +} + +// CombinationGenerator generates combinations iteratively. The Combinations +// function may be called to generate all combinations collectively. +type CombinationGenerator struct { + n int + k int + previous []int + remaining int +} + +// NewCombinationGenerator returns a CombinationGenerator for generating the +// combinations of k elements from a set of size n. +// +// n and k must be non-negative with n >= k, otherwise NewCombinationGenerator +// will panic. +func NewCombinationGenerator(n, k int) *CombinationGenerator { + return &CombinationGenerator{ + n: n, + k: k, + remaining: Binomial(n, k), + } +} + +// Next advances the iterator if there are combinations remaining to be generated, +// and returns false if all combinations have been generated. Next must be called +// to initialize the first value before calling Combination or Combination will +// panic. The value returned by Combination is only changed during calls to Next. +func (c *CombinationGenerator) Next() bool { + if c.remaining <= 0 { + // Next is called before combination, so c.remaining is set to zero before + // Combination is called. Thus, Combination cannot panic on zero, and a + // second sentinel value is needed. + c.remaining = -1 + return false + } + if c.previous == nil { + c.previous = make([]int, c.k) + for i := range c.previous { + c.previous[i] = i + } + } else { + nextCombination(c.previous, c.n, c.k) + } + c.remaining-- + return true +} + +// Combination returns the current combination. 
If dst is non-nil, it must have
+// length k and the result will be stored in-place into dst. If dst
+// is nil a new slice will be allocated and returned. If all of the combinations
+// have already been constructed (Next() returns false), Combination will panic.
+//
+// Next must be called to initialize the first value before calling Combination
+// or Combination will panic. The value returned by Combination is only changed
+// during calls to Next.
+func (c *CombinationGenerator) Combination(dst []int) []int {
+	if c.remaining == -1 {
+		panic("combin: all combinations have been generated")
+	}
+	if c.previous == nil {
+		panic("combin: Combination called before Next")
+	}
+	if dst == nil {
+		dst = make([]int, c.k)
+	} else if len(dst) != c.k {
+		panic(badInput)
+	}
+	copy(dst, c.previous)
+	return dst
+}
+
+// Combinations generates all of the combinations of k elements from a
+// set of size n. The returned slice has length Binomial(n,k) and each inner slice
+// has length k.
+//
+// n and k must be non-negative with n >= k, otherwise Combinations will panic.
+//
+// CombinationGenerator may alternatively be used to generate the combinations
+// iteratively instead of collectively, or IndexToCombination for random access.
+func Combinations(n, k int) [][]int {
+	combins := Binomial(n, k)
+	data := make([][]int, combins)
+	if len(data) == 0 {
+		return data
+	}
+	data[0] = make([]int, k)
+	for i := range data[0] {
+		data[0][i] = i
+	}
+	for i := 1; i < combins; i++ {
+		next := make([]int, k)
+		copy(next, data[i-1])
+		nextCombination(next, n, k)
+		data[i] = next
+	}
+	return data
+}
+
+// nextCombination generates the combination after s, overwriting the input value.
+func nextCombination(s []int, n, k int) {
+	for j := k - 1; j >= 0; j-- {
+		if s[j] == n+j-k {
+			continue
+		}
+		s[j]++
+		for l := j + 1; l < k; l++ {
+			s[l] = s[j] + l - j
+		}
+		break
+	}
+}
+
+// CombinationIndex returns the index of the given combination.
+//
+// The functions CombinationIndex and IndexToCombination define a bijection
+// between the integers and the Binomial(n, k) number of possible combinations.
+// CombinationIndex returns the inverse of IndexToCombination.
+//
+// CombinationIndex panics if comb is not a sorted combination of the first
+// [0,n) integers, if n or k are negative, or if k is greater than n.
+func CombinationIndex(comb []int, n, k int) int {
+	if n < 0 || k < 0 {
+		panic(errNegInput)
+	}
+	if n < k {
+		panic(badSetSize)
+	}
+	if len(comb) != k {
+		panic("combin: bad length combination")
+	}
+	if !sort.IntsAreSorted(comb) {
+		panic("combin: input combination is not sorted")
+	}
+	contains := make(map[int]struct{}, k)
+	for _, v := range comb {
+		contains[v] = struct{}{}
+	}
+	if len(contains) != k {
+		panic("combin: comb contains non-unique elements")
+	}
+	// This algorithm iterates in reverse lexicographic order.
+	// Flip the index and values to swap the order.
+	rev := make([]int, k)
+	for i, v := range comb {
+		rev[len(comb)-i-1] = n - v - 1
+	}
+	idx := 0
+	for i, v := range rev {
+		if v >= i+1 {
+			idx += Binomial(v, i+1)
+		}
+	}
+	return Binomial(n, k) - 1 - idx
+}
+
+// IndexToCombination returns the combination corresponding to the given index.
+//
+// The functions CombinationIndex and IndexToCombination define a bijection
+// between the integers and the Binomial(n, k) number of possible combinations.
+// IndexToCombination returns the inverse of CombinationIndex (up to the order
+// of the elements).
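+//
+// For example (editor's illustration), with n = 4 and k = 2 the bijection
+// round-trips as
+//
+//	comb := IndexToCombination(nil, 3, 4, 2) // comb == []int{1, 2}
+//	idx := CombinationIndex(comb, 4, 2)      // idx == 3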
+//
+// The combination is stored in-place into dst if dst is non-nil, otherwise
+// a new slice is allocated and returned.
+//
+// IndexToCombination panics if n or k are negative, if k is greater than n,
+// or if idx is not in [0, Binomial(n,k)-1]. IndexToCombination will also panic
+// if dst is non-nil and len(dst) is not k.
+func IndexToCombination(dst []int, idx, n, k int) []int {
+	if idx < 0 || idx >= Binomial(n, k) {
+		panic("combin: invalid index")
+	}
+	if dst == nil {
+		dst = make([]int, k)
+	} else if len(dst) != k {
+		panic(badInput)
+	}
+	// The base algorithm indexes in reverse lexicographic order;
+	// flip the values and the index.
+	idx = Binomial(n, k) - 1 - idx
+	for i := range dst {
+		// Find the largest number m such that Binomial(m, k-i) <= idx.
+		// This is one less than the first number such that it is larger.
+		m := sort.Search(n, func(m int) bool {
+			if m < k-i {
+				return false
+			}
+			return Binomial(m, k-i) > idx
+		})
+		m--
+		// Normally this would put m into the last free spot, but we
+		// reverse the index and the value.
+		dst[i] = n - m - 1
+		if m >= k-i {
+			idx -= Binomial(m, k-i)
+		}
+	}
+	return dst
+}
+
+// Cartesian returns the Cartesian product of the slices in data. The Cartesian
+// product of two sets is the set of all combinations of the items. For example,
+// given the input
+//
+//	[]int{2, 3, 1}
+//
+// the returned matrix will be
+//
+//	[ 0 0 0 ]
+//	[ 0 1 0 ]
+//	[ 0 2 0 ]
+//	[ 1 0 0 ]
+//	[ 1 1 0 ]
+//	[ 1 2 0 ]
+//
+// Cartesian panics if lens is empty or if any of the provided lengths are less than 1.
+func Cartesian(lens []int) [][]int {
+	rows := Card(lens)
+	if rows == 0 {
+		panic("combin: empty lengths")
+	}
+	out := make([][]int, rows)
+	for i := 0; i < rows; i++ {
+		out[i] = SubFor(nil, i, lens)
+	}
+	return out
+}
+
+// Card computes the cardinality of the multi-dimensional space whose
+// dimensions have size specified by dims. All lengths must be non-negative,
+// otherwise Card will panic.
+func Card(dims []int) int {
+	if len(dims) == 0 {
+		return 0
+	}
+	card := 1
+	for _, v := range dims {
+		if v < 0 {
+			panic("combin: length less than zero")
+		}
+		card *= v
+	}
+	return card
+}
+
+// NewCartesianGenerator returns a CartesianGenerator for iterating over Cartesian products which are generated on the fly.
+// All values in lens must be non-negative, otherwise this will panic.
+func NewCartesianGenerator(lens []int) *CartesianGenerator {
+	return &CartesianGenerator{
+		lens: lens,
+		rows: Card(lens),
+		idx:  -1,
+	}
+}
+
+// CartesianGenerator iterates over a Cartesian product set.
+type CartesianGenerator struct {
+	lens []int
+	rows int
+	idx  int
+}
+
+// Next moves to the next product of the Cartesian set.
+// It returns false once the generator has reached the end of the Cartesian set.
+func (g *CartesianGenerator) Next() bool {
+	if g.idx+1 < g.rows {
+		g.idx++
+		return true
+	}
+	g.idx = g.rows
+	return false
+}
+
+// Product generates one product of the Cartesian set according to the current
+// index, which is advanced by Next. Next must be called at least once before
+// Product, otherwise Product will panic.
+func (g *CartesianGenerator) Product(dst []int) []int {
+	return SubFor(dst, g.idx, g.lens)
+}
+
+// IdxFor converts a multi-dimensional index into a linear index for a
+// multi-dimensional space. sub specifies the index for each dimension, and dims
+// specifies the size of each dimension. IdxFor is the inverse of SubFor.
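+//
+// For example (editor's illustration), with dims = []int{2, 3, 1},
+//
+//	IdxFor([]int{1, 2, 0}, []int{2, 3, 1}) // == 1*(3*1) + 2*1 + 0 == 5
+//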
+// IdxFor panics if any of the entries of sub are negative, any of the entries
+// of dim are non-positive, or if sub[i] >= dims[i] for any i.
+func IdxFor(sub, dims []int) int {
+	// The index returned is "row-major", that is the last index of sub is
+	// continuous.
+	var idx int
+	stride := 1
+	for i := len(dims) - 1; i >= 0; i-- {
+		v := sub[i]
+		d := dims[i]
+		if d <= 0 {
+			panic(errNonpositiveDimension)
+		}
+		if v < 0 || v >= d {
+			panic("combin: invalid subscript")
+		}
+		idx += v * stride
+		stride *= d
+	}
+	return idx
+}
+
+// SubFor returns the multi-dimensional subscript for the input linear index to
+// the multi-dimensional space. dims specifies the size of each dimension, and
+// idx specifies the linear index. SubFor is the inverse of IdxFor.
+//
+// If sub is non-nil the result is stored in-place into sub, and SubFor will panic
+// if len(sub) != len(dims). If sub is nil a new slice of the appropriate length
+// is allocated. SubFor panics if idx < 0 or if idx is greater than or equal to
+// the product of the dimensions.
+func SubFor(sub []int, idx int, dims []int) []int {
+	if sub == nil {
+		sub = make([]int, len(dims))
+	}
+	if len(sub) != len(dims) {
+		panic(badInput)
+	}
+	if idx < 0 {
+		panic(errNegInput)
+	}
+	stride := 1
+	for i := len(dims) - 1; i >= 1; i-- {
+		stride *= dims[i]
+	}
+	for i := 0; i < len(dims)-1; i++ {
+		v := idx / stride
+		d := dims[i]
+		if d <= 0 {
+			panic(errNonpositiveDimension)
+		}
+		if v >= dims[i] {
+			panic("combin: index too large")
+		}
+		sub[i] = v
+		idx -= v * stride
+		stride /= dims[i+1]
+	}
+	if idx >= dims[len(sub)-1] {
+		panic("combin: index too large")
+	}
+	sub[len(sub)-1] = idx
+	return sub
+}
+
+// NumPermutations returns the number of permutations when selecting k
+// objects from a set of n objects when the selection order matters.
+// No check is made for overflow.
+//
+// NumPermutations panics if either n or k is negative, or if k is
+// greater than n.
+func NumPermutations(n, k int) int {
+	if n < 0 {
+		panic("combin: n is negative")
+	}
+	if k < 0 {
+		panic("combin: k is negative")
+	}
+	if k > n {
+		panic("combin: k is greater than n")
+	}
+	p := 1
+	for i := n - k + 1; i <= n; i++ {
+		p *= i
+	}
+	return p
+}
+
+// Permutations generates all of the permutations of k elements from a
+// set of size n. The returned slice has length NumPermutations(n, k)
+// and each inner slice has length k.
+//
+// n and k must be non-negative with n >= k, otherwise Permutations will panic.
+//
+// PermutationGenerator may alternatively be used to generate the permutations
+// iteratively instead of collectively, or IndexToPermutation for random access.
+func Permutations(n, k int) [][]int {
+	nPerms := NumPermutations(n, k)
+	data := make([][]int, nPerms)
+	if len(data) == 0 {
+		return data
+	}
+	for i := 0; i < nPerms; i++ {
+		data[i] = IndexToPermutation(nil, i, n, k)
+	}
+	return data
+}
+
+// PermutationGenerator generates permutations iteratively. The Permutations
+// function may be called to generate all permutations collectively.
+type PermutationGenerator struct {
+	n           int
+	k           int
+	nPerm       int
+	idx         int
+	permutation []int
+}
+
+// NewPermutationGenerator returns a PermutationGenerator for generating the
+// permutations of k elements from a set of size n.
+//
+// n and k must be non-negative with n >= k, otherwise NewPermutationGenerator
+// will panic.
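+//
+// Typical usage (editor's sketch):
+//
+//	gen := NewPermutationGenerator(3, 2)
+//	for gen.Next() {
+//		_ = gen.Permutation(nil) // visits all 6 ordered pairs from {0, 1, 2}
+//	}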
+func NewPermutationGenerator(n, k int) *PermutationGenerator { + return &PermutationGenerator{ + n: n, + k: k, + nPerm: NumPermutations(n, k), + idx: -1, + permutation: make([]int, k), + } +} + +// Next advances the iterator if there are permutations remaining to be generated, +// and returns false if all permutations have been generated. Next must be called +// to initialize the first value before calling Permutation or Permutation will +// panic. The value returned by Permutation is only changed during calls to Next. +func (p *PermutationGenerator) Next() bool { + if p.idx >= p.nPerm-1 { + p.idx = p.nPerm // so Permutation can panic. + return false + } + p.idx++ + IndexToPermutation(p.permutation, p.idx, p.n, p.k) + return true +} + +// Permutation returns the current permutation. If dst is non-nil, it must have +// length k and the result will be stored in-place into dst. If dst +// is nil a new slice will be allocated and returned. If all of the permutations +// have already been constructed (Next() returns false), Permutation will panic. +// +// Next must be called to initialize the first value before calling Permutation +// or Permutation will panic. The value returned by Permutation is only changed +// during calls to Next. +func (p *PermutationGenerator) Permutation(dst []int) []int { + if p.idx == p.nPerm { + panic("combin: all permutations have been generated") + } + if p.idx == -1 { + panic("combin: Permutation called before Next") + } + if dst == nil { + dst = make([]int, p.k) + } else if len(dst) != p.k { + panic(badInput) + } + copy(dst, p.permutation) + return dst +} + +// PermutationIndex returns the index of the given permutation. +// +// The functions PermutationIndex and IndexToPermutation define a bijection +// between the integers and the NumPermutations(n, k) number of possible permutations. +// PermutationIndex returns the inverse of IndexToPermutation. +// +// PermutationIndex panics if perm is not a permutation of k of the first +// [0,n) integers, if n or k are negative, or if k is greater than n. +func PermutationIndex(perm []int, n, k int) int { + if n < 0 || k < 0 { + panic(errNegInput) + } + if n < k { + panic(badSetSize) + } + if len(perm) != k { + panic("combin: bad length permutation") + } + contains := make(map[int]struct{}, k) + for _, v := range perm { + if v < 0 || v >= n { + panic("combin: bad element") + } + contains[v] = struct{}{} + } + if len(contains) != k { + panic("combin: perm contains non-unique elements") + } + if n == k { + // The permutation is the ordering of the elements. + return equalPermutationIndex(perm) + } + + // The permutation index is found by finding the combination index and the + // equalPermutation index. The combination index is found by just sorting + // the elements, and the permutation index is the ordering of the size + // of the elements. 
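+	// For example (editor's note), perm = [3 1 2] sorts to tmp = [1 2 3]
+	// with order = [2 0 1]; the final index combines CombinationIndex of
+	// tmp with the rank of order among the k! same-length orderings.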
+ tmp := make([]int, len(perm)) + copy(tmp, perm) + idx := make([]int, len(perm)) + for i := range idx { + idx[i] = i + } + s := sortInts{tmp, idx} + sort.Sort(s) + order := make([]int, len(perm)) + for i, v := range idx { + order[v] = i + } + combIdx := CombinationIndex(tmp, n, k) + permIdx := equalPermutationIndex(order) + return combIdx*NumPermutations(k, k) + permIdx +} + +type sortInts struct { + data []int + idx []int +} + +func (s sortInts) Len() int { + return len(s.data) +} + +func (s sortInts) Less(i, j int) bool { + return s.data[i] < s.data[j] +} + +func (s sortInts) Swap(i, j int) { + s.data[i], s.data[j] = s.data[j], s.data[i] + s.idx[i], s.idx[j] = s.idx[j], s.idx[i] +} + +// IndexToPermutation returns the permutation corresponding to the given index. +// +// The functions PermutationIndex and IndexToPermutation define a bijection +// between the integers and the NumPermutations(n, k) number of possible permutations. +// IndexToPermutation returns the inverse of PermutationIndex. +// +// The permutation is stored in-place into dst if dst is non-nil, otherwise +// a new slice is allocated and returned. +// +// IndexToPermutation panics if n or k are negative, if k is greater than n, +// or if idx is not in [0, NumPermutations(n,k)-1]. IndexToPermutation will also panic +// if dst is non-nil and len(dst) is not k. +func IndexToPermutation(dst []int, idx, n, k int) []int { + nPerm := NumPermutations(n, k) + if idx < 0 || idx >= nPerm { + panic("combin: invalid index") + } + if dst == nil { + dst = make([]int, k) + } else if len(dst) != k { + panic(badInput) + } + if n == k { + indexToEqualPermutation(dst, idx) + return dst + } + + // First, we index into the combination (which of the k items to choose) + // and then we index into the n == k permutation of those k items. The + // indexing acts like a matrix with nComb rows and factorial(k) columns. + kPerm := NumPermutations(k, k) + combIdx := idx / kPerm + permIdx := idx % kPerm + comb := IndexToCombination(nil, combIdx, n, k) // Gives us the set of integers. + perm := make([]int, len(dst)) + indexToEqualPermutation(perm, permIdx) // Gives their order. + for i, v := range perm { + dst[i] = comb[v] + } + return dst +} + +// equalPermutationIndex returns the index of the given permutation of the +// first k integers. +func equalPermutationIndex(perm []int) int { + // Note(btracey): This is an n^2 algorithm, but factorial increases + // very quickly (25! overflows int64) so this is not a problem in + // practice. + idx := 0 + for i, u := range perm { + less := 0 + for _, v := range perm[i:] { + if v < u { + less++ + } + } + idx += less * factorial(len(perm)-i-1) + } + return idx +} + +// indexToEqualPermutation returns the permutation for the first len(dst) +// integers for the given index. +func indexToEqualPermutation(dst []int, idx int) { + for i := range dst { + dst[i] = i + } + for i := range dst { + f := factorial(len(dst) - i - 1) + r := idx / f + v := dst[i+r] + copy(dst[i+1:i+r+1], dst[i:i+r]) + dst[i] = v + idx %= f + } +} + +// factorial returns a!. +func factorial(a int) int { + f := 1 + for i := 2; i <= a; i++ { + f *= i + } + return f +} diff --git a/vendor/gonum.org/v1/gonum/stat/combin/doc.go b/vendor/gonum.org/v1/gonum/stat/combin/doc.go new file mode 100644 index 0000000000..496045cdd3 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/combin/doc.go @@ -0,0 +1,7 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package combin implements routines involving combinatorics (permutations,
+// combinations, etc.).
+package combin // import "gonum.org/v1/gonum/stat/combin"
diff --git a/vendor/gonum.org/v1/gonum/stat/distmv/dirichlet.go b/vendor/gonum.org/v1/gonum/stat/distmv/dirichlet.go
new file mode 100644
index 0000000000..61d799884c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/stat/distmv/dirichlet.go
@@ -0,0 +1,149 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distmv
+
+import (
+	"math"
+	"math/rand/v2"
+
+	"gonum.org/v1/gonum/floats"
+	"gonum.org/v1/gonum/mat"
+	"gonum.org/v1/gonum/stat/distuv"
+)
+
+// Dirichlet implements the Dirichlet probability distribution.
+//
+// The Dirichlet distribution is a continuous probability distribution that
+// generates elements over the probability simplex, i.e. ||x||_1 = 1. The Dirichlet
+// distribution is the conjugate prior to the categorical distribution and the
+// multivariate version of the beta distribution. The probability of a point x is
+//
+//	1/Beta(α) \prod_i x_i^(α_i - 1)
+//
+// where Beta(α) is the multivariate Beta function (see the mathext package).
+//
+// For more information see https://en.wikipedia.org/wiki/Dirichlet_distribution
+type Dirichlet struct {
+	alpha []float64
+	dim   int
+	src   rand.Source
+
+	lbeta    float64
+	sumAlpha float64
+}
+
+// NewDirichlet creates a new Dirichlet distribution with the given parameters alpha.
+// NewDirichlet will panic if len(alpha) == 0, or if any alpha is <= 0.
+func NewDirichlet(alpha []float64, src rand.Source) *Dirichlet {
+	dim := len(alpha)
+	if dim == 0 {
+		panic(badZeroDimension)
+	}
+	for _, v := range alpha {
+		if v <= 0 {
+			panic("dirichlet: non-positive alpha")
+		}
+	}
+	a := make([]float64, len(alpha))
+	copy(a, alpha)
+	d := &Dirichlet{
+		alpha: a,
+		dim:   dim,
+		src:   src,
+	}
+	d.lbeta, d.sumAlpha = d.genLBeta(a)
+	return d
+}
+
+// CovarianceMatrix calculates the covariance matrix of the distribution,
+// storing the result in dst. Upon return, the value at element {i, j} of the
+// covariance matrix is equal to the covariance of the i^th and j^th variables.
+//
+//	covariance(i, j) = E[(x_i - E[x_i])(x_j - E[x_j])]
+//
+// If the dst matrix is empty it will be resized to the correct dimensions,
+// otherwise dst must match the dimension of the receiver or CovarianceMatrix
+// will panic.
+func (d *Dirichlet) CovarianceMatrix(dst *mat.SymDense) {
+	if dst.IsEmpty() {
+		*dst = *(dst.GrowSym(d.dim).(*mat.SymDense))
+	} else if dst.SymmetricDim() != d.dim {
+		panic("dirichlet: input matrix size mismatch")
+	}
+	scale := 1 / (d.sumAlpha * d.sumAlpha * (d.sumAlpha + 1))
+	for i := 0; i < d.dim; i++ {
+		ai := d.alpha[i]
+		v := ai * (d.sumAlpha - ai) * scale
+		dst.SetSym(i, i, v)
+		for j := i + 1; j < d.dim; j++ {
+			aj := d.alpha[j]
+			v := -ai * aj * scale
+			dst.SetSym(i, j, v)
+		}
+	}
+}
+
+// genLBeta computes the generalized LBeta function.
+func (d *Dirichlet) genLBeta(alpha []float64) (lbeta, sumAlpha float64) {
+	for _, alpha := range alpha {
+		lg, _ := math.Lgamma(alpha)
+		lbeta += lg
+		sumAlpha += alpha
+	}
+	lg, _ := math.Lgamma(sumAlpha)
+	return lbeta - lg, sumAlpha
+}
+
+// Dim returns the dimension of the distribution.
+func (d *Dirichlet) Dim() int {
+	return d.dim
+}
+
+// LogProb computes the log of the pdf of the point x.
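+// For parameters α this is
+//
+//	log p(x) = Σ_i (α_i - 1) log(x_i) - log Beta(α)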
+// +// It does not check that ||x||_1 = 1. +func (d *Dirichlet) LogProb(x []float64) float64 { + dim := d.dim + if len(x) != dim { + panic(badSizeMismatch) + } + var lprob float64 + for i, x := range x { + lprob += (d.alpha[i] - 1) * math.Log(x) + } + lprob -= d.lbeta + return lprob +} + +// Mean returns the mean of the probability distribution. +// +// If dst is not nil, the mean will be stored in-place into dst and returned, +// otherwise a new slice will be allocated first. If dst is not nil, it must +// have length equal to the dimension of the distribution. +func (d *Dirichlet) Mean(dst []float64) []float64 { + dst = reuseAs(dst, d.dim) + floats.ScaleTo(dst, 1/d.sumAlpha, d.alpha) + return dst +} + +// Prob computes the value of the probability density function at x. +func (d *Dirichlet) Prob(x []float64) float64 { + return math.Exp(d.LogProb(x)) +} + +// Rand generates a random number according to the distribution. +// +// If dst is not nil, the sample will be stored in-place into dst and returned, +// otherwise a new slice will be allocated first. If dst is not nil, it must +// have length equal to the dimension of the distribution. +func (d *Dirichlet) Rand(dst []float64) []float64 { + dst = reuseAs(dst, d.dim) + for i, alpha := range d.alpha { + dst[i] = distuv.Gamma{Alpha: alpha, Beta: 1, Src: d.src}.Rand() + } + sum := floats.Sum(dst) + floats.Scale(1/sum, dst) + return dst +} diff --git a/vendor/gonum.org/v1/gonum/stat/distmv/distmv.go b/vendor/gonum.org/v1/gonum/stat/distmv/distmv.go new file mode 100644 index 0000000000..49b67291a4 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distmv/distmv.go @@ -0,0 +1,28 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distmv + +const ( + badQuantile = "distmv: quantile not between 0 and 1" + badOutputLen = "distmv: output slice is not nil or the correct length" + badInputLength = "distmv: input slice length mismatch" + badSizeMismatch = "distmv: size mismatch" + badZeroDimension = "distmv: zero dimensional input" + nonPosDimension = "distmv: non-positive dimension input" +) + +const logTwoPi = 1.8378770664093454835606594728112352797227949472755668 + +// reuseAs returns a slice of length n. If len(dst) is n, dst is returned, +// otherwise dst must be nil or reuseAs will panic. +func reuseAs(dst []float64, n int) []float64 { + if dst == nil { + dst = make([]float64, n) + } + if len(dst) != n { + panic(badOutputLen) + } + return dst +} diff --git a/vendor/gonum.org/v1/gonum/stat/distmv/doc.go b/vendor/gonum.org/v1/gonum/stat/distmv/doc.go new file mode 100644 index 0000000000..142e056816 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distmv/doc.go @@ -0,0 +1,6 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package distmv provides multivariate random distribution types. +package distmv // import "gonum.org/v1/gonum/stat/distmv" diff --git a/vendor/gonum.org/v1/gonum/stat/distmv/interfaces.go b/vendor/gonum.org/v1/gonum/stat/distmv/interfaces.go new file mode 100644 index 0000000000..04f56aa29b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distmv/interfaces.go @@ -0,0 +1,35 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+
+package distmv
+
+// Quantiler returns the multi-dimensional inverse cumulative distribution function.
+//
+// If x is non-nil, len(x) must equal len(p) and the quantile will be stored
+// in-place into x. If x is nil, a new slice will be allocated and returned.
+// All of the values of p must be between 0 and 1, or Quantile will panic.
+type Quantiler interface {
+	Quantile(x, p []float64) []float64
+}
+
+// LogProber computes the log of the probability of the point x.
+type LogProber interface {
+	LogProb(x []float64) float64
+}
+
+// Rander generates a random number according to the distribution.
+//
+// If the input is non-nil, len(x) must equal the dimension of the distribution,
+// otherwise Rand will panic.
+//
+// If the input is nil, a new slice will be allocated and returned.
+type Rander interface {
+	Rand(x []float64) []float64
+}
+
+// RandLogProber is both a Rander and a LogProber.
+type RandLogProber interface {
+	Rander
+	LogProber
+}
diff --git a/vendor/gonum.org/v1/gonum/stat/distmv/normal.go b/vendor/gonum.org/v1/gonum/stat/distmv/normal.go
new file mode 100644
index 0000000000..a52f37c240
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/stat/distmv/normal.go
@@ -0,0 +1,524 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distmv
+
+import (
+	"math"
+	"math/rand/v2"
+
+	"gonum.org/v1/gonum/floats"
+	"gonum.org/v1/gonum/mat"
+	"gonum.org/v1/gonum/stat"
+	"gonum.org/v1/gonum/stat/distuv"
+)
+
+// Normal is a multivariate normal distribution (also known as the multivariate
+// Gaussian distribution). Its pdf in k dimensions is given by
+//
+//	(2 π)^(-k/2) |Σ|^(-1/2) exp(-1/2 (x-μ)'Σ^-1(x-μ))
+//
+// where μ is the mean vector and Σ the covariance matrix. Σ must be symmetric
+// and positive definite. Use NewNormal to construct.
+type Normal struct {
+	mu []float64
+
+	sigma mat.SymDense
+
+	chol       mat.Cholesky
+	logSqrtDet float64
+	dim        int
+
+	// If src is altered, rnd must be updated.
+	src rand.Source
+	rnd *rand.Rand
+}
+
+// NewNormal creates a new Normal with the given mean and covariance matrix.
+// NewNormal panics if len(mu) == 0, or if len(mu) != sigma.SymmetricDim(). If
+// the covariance matrix is not positive-definite, the returned boolean is false.
+func NewNormal(mu []float64, sigma mat.Symmetric, src rand.Source) (*Normal, bool) {
+	if len(mu) == 0 {
+		panic(badZeroDimension)
+	}
+	dim := sigma.SymmetricDim()
+	if dim != len(mu) {
+		panic(badSizeMismatch)
+	}
+	n := &Normal{
+		src: src,
+		rnd: rand.New(src),
+		dim: dim,
+		mu:  make([]float64, dim),
+	}
+	copy(n.mu, mu)
+	ok := n.chol.Factorize(sigma)
+	if !ok {
+		return nil, false
+	}
+	n.sigma = *mat.NewSymDense(dim, nil)
+	n.sigma.CopySym(sigma)
+	n.logSqrtDet = 0.5 * n.chol.LogDet()
+	return n, true
+}
+
+// NewNormalChol creates a new Normal distribution with the given mean and
+// covariance matrix represented by its Cholesky decomposition. NewNormalChol
+// panics if len(mu) is not equal to chol.SymmetricDim().
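+//
+// Editor's sketch of typical use, reusing an existing factorization of Σ:
+//
+//	var chol mat.Cholesky
+//	if chol.Factorize(sigma) {
+//		dist := NewNormalChol(mu, &chol, src)
+//		_ = dist
+//	}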
+func NewNormalChol(mu []float64, chol *mat.Cholesky, src rand.Source) *Normal { + dim := len(mu) + if dim != chol.SymmetricDim() { + panic(badSizeMismatch) + } + n := &Normal{ + src: src, + rnd: rand.New(src), + dim: dim, + mu: make([]float64, dim), + } + n.chol.Clone(chol) + copy(n.mu, mu) + n.logSqrtDet = 0.5 * n.chol.LogDet() + return n +} + +// NewNormalPrecision creates a new Normal distribution with the given mean and +// precision matrix (inverse of the covariance matrix). NewNormalPrecision +// panics if len(mu) is not equal to prec.SymmetricDim(). If the precision matrix +// is not positive-definite, NewNormalPrecision returns nil for norm and false +// for ok. +func NewNormalPrecision(mu []float64, prec *mat.SymDense, src rand.Source) (norm *Normal, ok bool) { + if len(mu) == 0 { + panic(badZeroDimension) + } + dim := prec.SymmetricDim() + if dim != len(mu) { + panic(badSizeMismatch) + } + // TODO(btracey): Computing a matrix inverse is generally numerically unstable. + // This only has to compute the inverse of a positive definite matrix, which + // is much better, but this still loses precision. It is worth considering if + // instead the precision matrix should be stored explicitly and used instead + // of the Cholesky decomposition of the covariance matrix where appropriate. + var chol mat.Cholesky + ok = chol.Factorize(prec) + if !ok { + return nil, false + } + var sigma mat.SymDense + err := chol.InverseTo(&sigma) + if err != nil { + return nil, false + } + return NewNormal(mu, &sigma, src) +} + +// ConditionNormal returns the Normal distribution that is the receiver conditioned +// on the input evidence. The returned multivariate normal has dimension +// n - len(observed), where n is the dimension of the original receiver. The updated +// mean and covariance are +// +// mu = mu_un + sigma_{ob,un}ᵀ * sigma_{ob,ob}^-1 (v - mu_ob) +// sigma = sigma_{un,un} - sigma_{ob,un}ᵀ * sigma_{ob,ob}^-1 * sigma_{ob,un} +// +// where mu_un and mu_ob are the original means of the unobserved and observed +// variables respectively, sigma_{un,un} is the unobserved subset of the covariance +// matrix, sigma_{ob,ob} is the observed subset of the covariance matrix, and +// sigma_{un,ob} are the cross terms. The elements of x_2 have been observed with +// values v. The dimension order is preserved during conditioning, so if the value +// of dimension 1 is observed, the returned normal represents dimensions {0, 2, ...} +// of the original Normal distribution. +// +// ConditionNormal returns {nil, false} if there is a failure during the update. +// Mathematically this is impossible, but can occur with finite precision arithmetic. +func (n *Normal) ConditionNormal(observed []int, values []float64, src rand.Source) (*Normal, bool) { + if len(observed) == 0 { + panic("normal: no observed value") + } + if len(observed) != len(values) { + panic(badInputLength) + } + for _, v := range observed { + if v < 0 || v >= n.Dim() { + panic("normal: observed value out of bounds") + } + } + + _, mu1, sigma11 := studentsTConditional(observed, values, math.Inf(1), n.mu, &n.sigma) + if mu1 == nil { + return nil, false + } + return NewNormal(mu1, sigma11, src) +} + +// CovarianceMatrix stores the covariance matrix of the distribution in dst. +// Upon return, the value at element {i, j} of the covariance matrix is equal +// to the covariance of the i^th and j^th variables. 
+//
+//	covariance(i, j) = E[(x_i - E[x_i])(x_j - E[x_j])]
+//
+// If the dst matrix is empty it will be resized to the correct dimensions,
+// otherwise dst must match the dimension of the receiver or CovarianceMatrix
+// will panic.
+func (n *Normal) CovarianceMatrix(dst *mat.SymDense) {
+	if dst.IsEmpty() {
+		*dst = *(dst.GrowSym(n.dim).(*mat.SymDense))
+	} else if dst.SymmetricDim() != n.dim {
+		panic("normal: input matrix size mismatch")
+	}
+	dst.CopySym(&n.sigma)
+}
+
+// Dim returns the dimension of the distribution.
+func (n *Normal) Dim() int {
+	return n.dim
+}
+
+// Entropy returns the differential entropy of the distribution.
+func (n *Normal) Entropy() float64 {
+	return float64(n.dim)/2*(1+logTwoPi) + n.logSqrtDet
+}
+
+// LogProb computes the log of the pdf of the point x.
+func (n *Normal) LogProb(x []float64) float64 {
+	dim := n.dim
+	if len(x) != dim {
+		panic(badSizeMismatch)
+	}
+	return normalLogProb(x, n.mu, &n.chol, n.logSqrtDet)
+}
+
+// NormalLogProb computes the log probability of the location x for a Normal
+// distribution with the given mean and Cholesky decomposition of the covariance
+// matrix. NormalLogProb panics if len(x) is not equal to len(mu), or if
+// len(mu) != chol.SymmetricDim().
+//
+// This function saves time and memory if the Cholesky decomposition is already
+// available. Otherwise, the NewNormal function should be used.
+func NormalLogProb(x, mu []float64, chol *mat.Cholesky) float64 {
+	dim := len(mu)
+	if len(x) != dim {
+		panic(badSizeMismatch)
+	}
+	if chol.SymmetricDim() != dim {
+		panic(badSizeMismatch)
+	}
+	logSqrtDet := 0.5 * chol.LogDet()
+	return normalLogProb(x, mu, chol, logSqrtDet)
+}
+
+// normalLogProb is the same as NormalLogProb, but does not make size checks and
+// additionally requires the precomputed value log(|Σ|^0.5).
+func normalLogProb(x, mu []float64, chol *mat.Cholesky, logSqrtDet float64) float64 {
+	dim := len(mu)
+	c := -0.5*float64(dim)*logTwoPi - logSqrtDet
+	dst := stat.Mahalanobis(mat.NewVecDense(dim, x), mat.NewVecDense(dim, mu), chol)
+	return c - 0.5*dst*dst
+}
+
+// MarginalNormal returns the marginal distribution of the given input variables.
+// That is, MarginalNormal returns
+//
+//	p(x_i) = \int_{x_o} p(x_i | x_o) p(x_o) dx_o
+//
+// where x_i are the dimensions in the input, and x_o are the remaining dimensions.
+// See https://en.wikipedia.org/wiki/Marginal_distribution for more information.
+//
+// The input src is passed to the call to NewNormal.
+func (n *Normal) MarginalNormal(vars []int, src rand.Source) (*Normal, bool) {
+	newMean := make([]float64, len(vars))
+	for i, v := range vars {
+		newMean[i] = n.mu[v]
+	}
+	var s mat.SymDense
+	s.SubsetSym(&n.sigma, vars)
+	return NewNormal(newMean, &s, src)
+}
+
+// MarginalNormalSingle returns the marginal of the given input variable.
+// That is, MarginalNormalSingle returns
+//
+//	p(x_i) = \int_{x_¬i} p(x_i | x_¬i) p(x_¬i) dx_¬i
+//
+// where i is the input index.
+// See https://en.wikipedia.org/wiki/Marginal_distribution for more information.
+//
+// The input src is passed to the constructed distuv.Normal.
+func (n *Normal) MarginalNormalSingle(i int, src rand.Source) distuv.Normal {
+	return distuv.Normal{
+		Mu:    n.mu[i],
+		Sigma: math.Sqrt(n.sigma.At(i, i)),
+		Src:   src,
+	}
+}
+
+// Mean returns the mean of the probability distribution.
+//
+// If dst is not nil, the mean will be stored in-place into dst and returned,
+// otherwise a new slice will be allocated first. If dst is not nil, it must
+// have length equal to the dimension of the distribution.
+func (n *Normal) Mean(dst []float64) []float64 { + dst = reuseAs(dst, n.dim) + copy(dst, n.mu) + return dst +} + +// Prob computes the value of the probability density function at x. +func (n *Normal) Prob(x []float64) float64 { + return math.Exp(n.LogProb(x)) +} + +// Quantile returns the value of the multi-dimensional inverse cumulative +// distribution function at p. +// +// If dst is not nil, the quantile will be stored in-place into dst and +// returned, otherwise a new slice will be allocated first. If dst is not nil, +// it must have length equal to the dimension of the distribution. Quantile will +// also panic if the length of p is not equal to the dimension of the +// distribution. +// +// All of the values of p must be between 0 and 1, inclusive, or Quantile will +// panic. +func (n *Normal) Quantile(dst, p []float64) []float64 { + if len(p) != n.dim { + panic(badInputLength) + } + dst = reuseAs(dst, n.dim) + + // Transform to a standard normal and then transform to a multivariate Gaussian. + for i, v := range p { + dst[i] = distuv.UnitNormal.Quantile(v) + } + n.TransformNormal(dst, dst) + return dst +} + +// Rand generates a random sample according to the distribution. +// +// If dst is not nil, the sample will be stored in-place into dst and returned, +// otherwise a new slice will be allocated first. If dst is not nil, it must +// have length equal to the dimension of the distribution. +func (n *Normal) Rand(dst []float64) []float64 { + return NormalRand(dst, n.mu, &n.chol, n.src) +} + +// NormalRand generates a random sample from a multivariate normal distribution +// given by the mean and the Cholesky factorization of the covariance matrix. +// +// If dst is not nil, the sample will be stored in-place into dst and returned, +// otherwise a new slice will be allocated first. If dst is not nil, it must +// have length equal to the dimension of the distribution. +// +// This function saves time and memory if the Cholesky factorization is already +// available. Otherwise, the NewNormal function should be used. +func NormalRand(dst, mean []float64, chol *mat.Cholesky, src rand.Source) []float64 { + if len(mean) != chol.SymmetricDim() { + panic(badInputLength) + } + dst = reuseAs(dst, len(mean)) + + if src == nil { + for i := range dst { + dst[i] = rand.NormFloat64() + } + } else { + rnd := rand.New(src) + for i := range dst { + dst[i] = rnd.NormFloat64() + } + } + transformNormal(dst, dst, mean, chol) + return dst +} + +// EigenSym is an eigendecomposition of a symmetric matrix. +type EigenSym interface { + mat.Symmetric + // RawValues returns all eigenvalues in ascending order. The returned slice + // must not be modified. + RawValues() []float64 + // RawQ returns an orthogonal matrix whose columns contain the eigenvectors. + // The returned matrix must not be modified. + RawQ() mat.Matrix +} + +// PositivePartEigenSym is an EigenSym that sets any negative eigenvalues from +// the given eigendecomposition to zero but otherwise returns the values +// unchanged. +// +// This is useful for filtering eigenvalues of positive semi-definite matrices +// that are almost zero but negative due to rounding errors. +type PositivePartEigenSym struct { + ed *mat.EigenSym + vals []float64 +} + +var _ EigenSym = (*PositivePartEigenSym)(nil) +var _ EigenSym = (*mat.EigenSym)(nil) + +// NewPositivePartEigenSym returns a new PositivePartEigenSym, wrapping the +// given eigendecomposition. 
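+//
+// Editor's sketch of intended use together with NormalRandCov:
+//
+//	var ed mat.EigenSym
+//	if ed.Factorize(cov, true) {
+//		x := NormalRandCov(nil, mu, NewPositivePartEigenSym(&ed), src)
+//		_ = x
+//	}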
+func NewPositivePartEigenSym(ed *mat.EigenSym) *PositivePartEigenSym { + n := ed.SymmetricDim() + vals := make([]float64, n) + for i, lamda := range ed.RawValues() { + if lamda > 0 { + vals[i] = lamda + } + } + return &PositivePartEigenSym{ + ed: ed, + vals: vals, + } +} + +// SymmetricDim returns the value from the wrapped eigendecomposition. +func (ed *PositivePartEigenSym) SymmetricDim() int { return ed.ed.SymmetricDim() } + +// Dims returns the dimensions from the wrapped eigendecomposition. +func (ed *PositivePartEigenSym) Dims() (r, c int) { return ed.ed.Dims() } + +// At returns the value from the wrapped eigendecomposition. +func (ed *PositivePartEigenSym) At(i, j int) float64 { return ed.ed.At(i, j) } + +// T returns the transpose from the wrapped eigendecomposition. +func (ed *PositivePartEigenSym) T() mat.Matrix { return ed.ed.T() } + +// RawQ returns the orthogonal matrix Q from the wrapped eigendecomposition. The +// returned matrix must not be modified. +func (ed *PositivePartEigenSym) RawQ() mat.Matrix { return ed.ed.RawQ() } + +// RawValues returns the eigenvalues from the wrapped eigendecomposition in +// ascending order with any negative value replaced by zero. The returned slice +// must not be modified. +func (ed *PositivePartEigenSym) RawValues() []float64 { return ed.vals } + +// NormalRandCov generates a random sample from a multivariate normal +// distribution given by the mean and the covariance matrix. +// +// If dst is not nil, the sample will be stored in-place into dst and returned, +// otherwise a new slice will be allocated first. If dst is not nil, it must +// have length equal to the dimension of the distribution. +// +// cov should be *mat.Cholesky, *mat.PivotedCholesky or EigenSym, otherwise +// NormalRandCov will be very inefficient because a pivoted Cholesky +// factorization of cov will be computed for every sample. +// +// If cov is an EigenSym, all eigenvalues returned by RawValues must be +// non-negative, otherwise NormalRandCov will panic. +func NormalRandCov(dst, mean []float64, cov mat.Symmetric, src rand.Source) []float64 { + n := len(mean) + if cov.SymmetricDim() != n { + panic(badInputLength) + } + dst = reuseAs(dst, n) + if src == nil { + for i := range dst { + dst[i] = rand.NormFloat64() + } + } else { + rnd := rand.New(src) + for i := range dst { + dst[i] = rnd.NormFloat64() + } + } + + switch cov := cov.(type) { + case *mat.Cholesky: + dstVec := mat.NewVecDense(n, dst) + dstVec.MulVec(cov.RawU().T(), dstVec) + case *mat.PivotedCholesky: + dstVec := mat.NewVecDense(n, dst) + dstVec.MulVec(cov.RawU().T(), dstVec) + dstVec.Permute(cov.ColumnPivots(nil), true) + case EigenSym: + vals := cov.RawValues() + if vals[0] < 0 { + panic("distmv: covariance matrix is not positive semi-definite") + } + for i, val := range vals { + dst[i] *= math.Sqrt(val) + } + dstVec := mat.NewVecDense(n, dst) + dstVec.MulVec(cov.RawQ(), dstVec) + default: + var chol mat.PivotedCholesky + chol.Factorize(cov, -1) + dstVec := mat.NewVecDense(n, dst) + dstVec.MulVec(chol.RawU().T(), dstVec) + dstVec.Permute(chol.ColumnPivots(nil), true) + } + floats.Add(dst, mean) + + return dst +} + +// ScoreInput returns the gradient of the log-probability with respect to the +// input x. That is, ScoreInput computes +// +// ∇_x log(p(x)) +// +// If dst is not nil, the score will be stored in-place into dst and returned, +// otherwise a new slice will be allocated first. If dst is not nil, it must +// have length equal to the dimension of the distribution. 
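+//
+// For the Normal distribution this gradient is -Σ^-1 (x - μ).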
+func (n *Normal) ScoreInput(dst, x []float64) []float64 { + // Normal log probability is + // c - 0.5*(x-μ)' Σ^-1 (x-μ). + // So the derivative is just + // -Σ^-1 (x-μ). + if len(x) != n.Dim() { + panic(badInputLength) + } + dst = reuseAs(dst, n.dim) + + floats.SubTo(dst, x, n.mu) + dstVec := mat.NewVecDense(len(dst), dst) + err := n.chol.SolveVecTo(dstVec, dstVec) + if err != nil { + panic(err) + } + floats.Scale(-1, dst) + return dst +} + +// SetMean changes the mean of the normal distribution. SetMean panics if len(mu) +// does not equal the dimension of the normal distribution. +func (n *Normal) SetMean(mu []float64) { + if len(mu) != n.Dim() { + panic(badSizeMismatch) + } + copy(n.mu, mu) +} + +// TransformNormal transforms x generated from a standard multivariate normal +// into a vector that has been generated under the normal distribution of the +// receiver. +// +// If dst is not nil, the result will be stored in-place into dst and returned, +// otherwise a new slice will be allocated first. If dst is not nil, it must +// have length equal to the dimension of the distribution. TransformNormal will +// also panic if the length of x is not equal to the dimension of the receiver. +func (n *Normal) TransformNormal(dst, x []float64) []float64 { + if len(x) != n.dim { + panic(badInputLength) + } + dst = reuseAs(dst, n.dim) + transformNormal(dst, x, n.mu, &n.chol) + return dst +} + +// transformNormal performs the same operation as Normal.TransformNormal except +// no safety checks are performed and all memory must be provided. +func transformNormal(dst, normal, mu []float64, chol *mat.Cholesky) []float64 { + dim := len(mu) + dstVec := mat.NewVecDense(dim, dst) + srcVec := mat.NewVecDense(dim, normal) + // If dst and normal are the same slice, make them the same Vector otherwise + // mat complains about being tricky. + if &normal[0] == &dst[0] { + srcVec = dstVec + } + dstVec.MulVec(chol.RawU().T(), srcVec) + floats.Add(dst, mu) + return dst +} diff --git a/vendor/gonum.org/v1/gonum/stat/distmv/statdist.go b/vendor/gonum.org/v1/gonum/stat/distmv/statdist.go new file mode 100644 index 0000000000..c835924c6e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distmv/statdist.go @@ -0,0 +1,390 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distmv + +import ( + "math" + + "gonum.org/v1/gonum/floats" + "gonum.org/v1/gonum/mat" + "gonum.org/v1/gonum/mathext" + "gonum.org/v1/gonum/spatial/r1" + "gonum.org/v1/gonum/stat" +) + +// Bhattacharyya is a type for computing the Bhattacharyya distance between +// probability distributions. +// +// The Bhattacharyya distance is defined as +// +// D_B = -ln(BC(l,r)) +// BC = \int_-∞^∞ (p(x)q(x))^(1/2) dx +// +// Where BC is known as the Bhattacharyya coefficient. +// The Bhattacharyya distance is related to the Hellinger distance by +// +// H(l,r) = sqrt(1-BC(l,r)) +// +// For more information, see +// +// https://en.wikipedia.org/wiki/Bhattacharyya_distance +type Bhattacharyya struct{} + +// DistNormal computes the Bhattacharyya distance between normal distributions l and r. +// The dimensions of the input distributions must match or DistNormal will panic. 
+// +// For Normal distributions, the Bhattacharyya distance is +// +// Σ = (Σ_l + Σ_r)/2 +// D_B = (1/8)*(μ_l - μ_r)ᵀ*Σ^-1*(μ_l - μ_r) + (1/2)*ln(det(Σ)/(det(Σ_l)*det(Σ_r))^(1/2)) +func (Bhattacharyya) DistNormal(l, r *Normal) float64 { + dim := l.Dim() + if dim != r.Dim() { + panic(badSizeMismatch) + } + + var sigma mat.SymDense + sigma.AddSym(&l.sigma, &r.sigma) + sigma.ScaleSym(0.5, &sigma) + + var chol mat.Cholesky + chol.Factorize(&sigma) + + mahalanobis := stat.Mahalanobis(mat.NewVecDense(dim, l.mu), mat.NewVecDense(dim, r.mu), &chol) + mahalanobisSq := mahalanobis * mahalanobis + + dl := l.chol.LogDet() + dr := r.chol.LogDet() + ds := chol.LogDet() + + return 0.125*mahalanobisSq + 0.5*ds - 0.25*dl - 0.25*dr +} + +// DistUniform computes the Bhattacharyya distance between uniform distributions l and r. +// The dimensions of the input distributions must match or DistUniform will panic. +func (Bhattacharyya) DistUniform(l, r *Uniform) float64 { + if len(l.bounds) != len(r.bounds) { + panic(badSizeMismatch) + } + // BC = \int \sqrt(p(x)q(x)), which for uniform distributions is a constant + // over the volume where both distributions have positive probability. + // Compute the overlap and the value of sqrt(p(x)q(x)). The entropy is the + // negative log probability of the distribution (use instead of LogProb so + // it is not necessary to construct an x value). + // + // BC = volume * sqrt(p(x)q(x)) + // logBC = log(volume) + 0.5*(logP + logQ) + // D_B = -logBC + return -unifLogVolOverlap(l.bounds, r.bounds) + 0.5*(l.Entropy()+r.Entropy()) +} + +// unifLogVolOverlap computes the log of the volume of the hyper-rectangle where +// both uniform distributions have positive probability. +func unifLogVolOverlap(b1, b2 []r1.Interval) float64 { + var logVolOverlap float64 + for dim, v1 := range b1 { + v2 := b2[dim] + // If the surfaces don't overlap, then the volume is 0 + if v1.Max <= v2.Min || v2.Max <= v1.Min { + return math.Inf(-1) + } + vol := math.Min(v1.Max, v2.Max) - math.Max(v1.Min, v2.Min) + logVolOverlap += math.Log(vol) + } + return logVolOverlap +} + +// CrossEntropy is a type for computing the cross-entropy between probability +// distributions. +// +// The cross-entropy is defined as +// - \int_x l(x) log(r(x)) dx = KL(l || r) + H(l) +// +// where KL is the Kullback-Leibler divergence and H is the entropy. +// For more information, see +// +// https://en.wikipedia.org/wiki/Cross_entropy +type CrossEntropy struct{} + +// DistNormal returns the cross-entropy between normal distributions l and r. +// The dimensions of the input distributions must match or DistNormal will panic. +func (CrossEntropy) DistNormal(l, r *Normal) float64 { + if l.Dim() != r.Dim() { + panic(badSizeMismatch) + } + kl := KullbackLeibler{}.DistNormal(l, r) + return kl + l.Entropy() +} + +// Hellinger is a type for computing the Hellinger distance between probability +// distributions. +// +// The Hellinger distance is defined as +// +// H^2(l,r) = 1/2 * int_x (\sqrt(l(x)) - \sqrt(r(x)))^2 dx +// +// and is bounded between 0 and 1. Note the above formula defines the squared +// Hellinger distance, while this returns the Hellinger distance itself. +// The Hellinger distance is related to the Bhattacharyya distance by +// +// H^2 = 1 - exp(-D_B) +// +// For more information, see +// +// https://en.wikipedia.org/wiki/Hellinger_distance +type Hellinger struct{} + +// DistNormal returns the Hellinger distance between normal distributions l and r. 
+// The dimensions of the input distributions must match or DistNormal will panic. +// +// See the documentation of Bhattacharyya.DistNormal for the formula for Normal +// distributions. +func (Hellinger) DistNormal(l, r *Normal) float64 { + if l.Dim() != r.Dim() { + panic(badSizeMismatch) + } + db := Bhattacharyya{}.DistNormal(l, r) + bc := math.Exp(-db) + return math.Sqrt(1 - bc) +} + +// KullbackLeibler is a type for computing the Kullback-Leibler divergence from l to r. +// +// The Kullback-Leibler divergence is defined as +// +// D_KL(l || r ) = \int_x p(x) log(p(x)/q(x)) dx +// +// Note that the Kullback-Leibler divergence is not symmetric with respect to +// the order of the input arguments. +type KullbackLeibler struct{} + +// DistDirichlet returns the Kullback-Leibler divergence between Dirichlet +// distributions l and r. The dimensions of the input distributions must match +// or DistDirichlet will panic. +// +// For two Dirichlet distributions, the KL divergence is computed as +// +// D_KL(l || r) = log Γ(α_0_l) - \sum_i log Γ(α_i_l) - log Γ(α_0_r) + \sum_i log Γ(α_i_r) +// + \sum_i (α_i_l - α_i_r)(ψ(α_i_l)- ψ(α_0_l)) +// +// Where Γ is the gamma function, ψ is the digamma function, and α_0 is the +// sum of the Dirichlet parameters. +func (KullbackLeibler) DistDirichlet(l, r *Dirichlet) float64 { + // http://bariskurt.com/kullback-leibler-divergence-between-two-dirichlet-and-beta-distributions/ + if l.Dim() != r.Dim() { + panic(badSizeMismatch) + } + l0, _ := math.Lgamma(l.sumAlpha) + r0, _ := math.Lgamma(r.sumAlpha) + dl := mathext.Digamma(l.sumAlpha) + + var l1, r1, c float64 + for i, al := range l.alpha { + ar := r.alpha[i] + vl, _ := math.Lgamma(al) + l1 += vl + vr, _ := math.Lgamma(ar) + r1 += vr + c += (al - ar) * (mathext.Digamma(al) - dl) + } + return l0 - l1 - r0 + r1 + c +} + +// DistNormal returns the KullbackLeibler divergence between normal distributions l and r. +// The dimensions of the input distributions must match or DistNormal will panic. +// +// For two normal distributions, the KL divergence is computed as +// +// D_KL(l || r) = 0.5*[ln(|Σ_r|) - ln(|Σ_l|) + (μ_l - μ_r)ᵀ*Σ_r^-1*(μ_l - μ_r) + tr(Σ_r^-1*Σ_l)-d] +func (KullbackLeibler) DistNormal(l, r *Normal) float64 { + dim := l.Dim() + if dim != r.Dim() { + panic(badSizeMismatch) + } + + mahalanobis := stat.Mahalanobis(mat.NewVecDense(dim, l.mu), mat.NewVecDense(dim, r.mu), &r.chol) + mahalanobisSq := mahalanobis * mahalanobis + + // TODO(btracey): Optimize where there is a SolveCholeskySym + // TODO(btracey): There may be a more efficient way to just compute the trace + // Compute tr(Σ_r^-1*Σ_l) using the fact that Σ_l = Uᵀ * U + var u mat.TriDense + l.chol.UTo(&u) + var m mat.Dense + err := r.chol.SolveTo(&m, u.T()) + if err != nil { + return math.NaN() + } + m.Mul(&m, &u) + tr := mat.Trace(&m) + + return r.logSqrtDet - l.logSqrtDet + 0.5*(mahalanobisSq+tr-float64(l.dim)) +} + +// DistUniform returns the KullbackLeibler divergence between uniform distributions +// l and r. The dimensions of the input distributions must match or DistUniform +// will panic. +func (KullbackLeibler) DistUniform(l, r *Uniform) float64 { + bl := l.Bounds(nil) + br := r.Bounds(nil) + if len(bl) != len(br) { + panic(badSizeMismatch) + } + + // The KL is ∞ if l is not completely contained within r, because then + // r(x) is zero when l(x) is non-zero for some x. 
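+ // For example, if l is uniform on [0, 1] and r is uniform on [0, 2], the
+ // divergence is ln(2); with the arguments swapped it is +Inf, since l
+ // would then not be contained in r.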
+ contained := true
+ for i, v := range bl {
+ if v.Min < br[i].Min || br[i].Max < v.Max {
+ contained = false
+ break
+ }
+ }
+ if !contained {
+ return math.Inf(1)
+ }
+
+ // The KL divergence is finite.
+ //
+ // KL defines 0*ln(0) = 0, so there is no contribution to KL where l(x) = 0.
+ // Inside the region, l(x) and r(x) are constant (uniform distribution), and
+ // this constant is integrated over l(x), which integrates out to one.
+ // The entropy is -log(p(x)).
+ logPx := -l.Entropy()
+ logQx := -r.Entropy()
+ return logPx - logQx
+}
+
+// Renyi is a type for computing the Rényi divergence of order α from l to r.
+//
+// The Rényi divergence with α > 0, α ≠ 1 is defined as
+//
+// D_α(l || r) = 1/(α-1) log(\int_-∞^∞ l(x)^α r(x)^(1-α)dx)
+//
+// The Rényi divergence has special forms for α = 0 and α = 1. This type does
+// not implement α = ∞. For α = 0,
+//
+// D_0(l || r) = -log \int_-∞^∞ r(x) 1{l(x)>0} dx
+//
+// that is, the negative log probability under r(x) that l(x) > 0.
+// When α = 1, the Rényi divergence is equal to the Kullback-Leibler divergence.
+// The Rényi divergence is also equal to twice the Bhattacharyya distance when α = 0.5.
+//
+// The parameter α must be in 0 ≤ α < ∞ or the distance functions will panic.
+type Renyi struct {
+ Alpha float64
+}
+
+// DistNormal returns the Rényi divergence between normal distributions l and r.
+// The dimensions of the input distributions must match or DistNormal will panic.
+//
+// For two normal distributions, the Rényi divergence is computed as
+//
+// Σ_α = (1-α) Σ_l + αΣ_r
+// D_α(l||r) = α/2 * (μ_l - μ_r)'*Σ_α^-1*(μ_l - μ_r) + 1/(2(1-α))*ln(|Σ_α|/(|Σ_l|^(1-α)*|Σ_r|^α))
+//
+// For a more nicely formatted version of the formula, see Eq. 15 of
+//
+// Kolchinsky, Artemy, and Brendan D. Tracey. "Estimating Mixture Entropy
+// with Pairwise Distances." arXiv preprint arXiv:1706.02419 (2017).
+//
+// Note that this formula is for the Chernoff divergence, which differs from
+// the Rényi divergence by a factor of 1-α. Also be aware that most sources in
+// the literature report this formula incorrectly.
+func (renyi Renyi) DistNormal(l, r *Normal) float64 {
+ if renyi.Alpha < 0 {
+ panic("renyi: alpha < 0")
+ }
+ dim := l.Dim()
+ if dim != r.Dim() {
+ panic(badSizeMismatch)
+ }
+ if renyi.Alpha == 0 {
+ return 0
+ }
+ if renyi.Alpha == 1 {
+ return KullbackLeibler{}.DistNormal(l, r)
+ }
+
+ logDetL := l.chol.LogDet()
+ logDetR := r.chol.LogDet()
+
+ // Σ_α = (1-α)Σ_l + αΣ_r.
+ sigA := mat.NewSymDense(dim, nil)
+ for i := 0; i < dim; i++ {
+ for j := i; j < dim; j++ {
+ v := (1-renyi.Alpha)*l.sigma.At(i, j) + renyi.Alpha*r.sigma.At(i, j)
+ sigA.SetSym(i, j, v)
+ }
+ }
+
+ var chol mat.Cholesky
+ ok := chol.Factorize(sigA)
+ if !ok {
+ return math.NaN()
+ }
+ logDetA := chol.LogDet()
+
+ mahalanobis := stat.Mahalanobis(mat.NewVecDense(dim, l.mu), mat.NewVecDense(dim, r.mu), &chol)
+ mahalanobisSq := mahalanobis * mahalanobis
+
+ return (renyi.Alpha/2)*mahalanobisSq + 1/(2*(1-renyi.Alpha))*(logDetA-(1-renyi.Alpha)*logDetL-renyi.Alpha*logDetR)
+}
+
+// Wasserstein is a type for computing the Wasserstein distance between two
+// probability distributions.
+//
+// The Wasserstein distance is defined as
+//
+// W(l,r) := inf 𝔼(||X-Y||_2^2)^1/2
+//
+// where the infimum is taken over all joint distributions of (X, Y) with
+// marginals l and r, respectively.
+//
+// For more information, see
+//
+// https://en.wikipedia.org/wiki/Wasserstein_metric
+type Wasserstein struct{}
+
+// DistNormal returns the Wasserstein distance between normal distributions l and r.
+// The dimensions of the input distributions must match or DistNormal will panic.
+// +// The Wasserstein distance for Normal distributions is +// +// d^2 = ||m_l - m_r||_2^2 + Tr(Σ_l + Σ_r - 2(Σ_l^(1/2)*Σ_r*Σ_l^(1/2))^(1/2)) +// +// For more information, see +// +// http://djalil.chafai.net/blog/2010/04/30/wasserstein-distance-between-two-gaussians/ +func (Wasserstein) DistNormal(l, r *Normal) float64 { + dim := l.Dim() + if dim != r.Dim() { + panic(badSizeMismatch) + } + + d := floats.Distance(l.mu, r.mu, 2) + d = d * d + + // Compute Σ_l^(1/2) + var ssl mat.SymDense + err := ssl.PowPSD(&l.sigma, 0.5) + if err != nil { + panic(err) + } + // Compute Σ_l^(1/2)*Σ_r*Σ_l^(1/2) + var mean mat.Dense + mean.Mul(&ssl, &r.sigma) + mean.Mul(&mean, &ssl) + + // Reinterpret as symdense, and take Σ^(1/2) + meanSym := mat.NewSymDense(dim, mean.RawMatrix().Data) + err = ssl.PowPSD(meanSym, 0.5) + if err != nil { + panic(err) + } + + tr := mat.Trace(&r.sigma) + tl := mat.Trace(&l.sigma) + tm := mat.Trace(&ssl) + + return d + tl + tr - 2*tm +} diff --git a/vendor/gonum.org/v1/gonum/stat/distmv/studentst.go b/vendor/gonum.org/v1/gonum/stat/distmv/studentst.go new file mode 100644 index 0000000000..7dee85b6ae --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distmv/studentst.go @@ -0,0 +1,362 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distmv + +import ( + "math" + "math/rand/v2" + "sort" + + "golang.org/x/tools/container/intsets" + + "gonum.org/v1/gonum/floats" + "gonum.org/v1/gonum/mat" + "gonum.org/v1/gonum/stat" + "gonum.org/v1/gonum/stat/distuv" +) + +// StudentsT is a multivariate Student's T distribution. It is a distribution over +// ℝ^n with the probability density +// +// p(y) = (Γ((ν+n)/2) / Γ(ν/2)) * (νπ)^(-n/2) * |Ʃ|^(-1/2) * +// (1 + 1/ν * (y-μ)ᵀ * Ʃ^-1 * (y-μ))^(-(ν+n)/2) +// +// where ν is a scalar greater than 2, μ is a vector in ℝ^n, and Ʃ is an n×n +// symmetric positive definite matrix. +// +// In this distribution, ν sets the spread of the distribution, similar to +// the degrees of freedom in a univariate Student's T distribution. As ν → ∞, +// the distribution approaches a multi-variate normal distribution. +// μ is the mean of the distribution, and the covariance is ν/(ν-2)*Ʃ. +// +// See https://en.wikipedia.org/wiki/Student%27s_t-distribution and +// http://users.isy.liu.se/en/rt/roth/student.pdf for more information. +type StudentsT struct { + nu float64 + mu []float64 + // If src is altered, rnd must be updated. + src rand.Source + rnd *rand.Rand + + sigma mat.SymDense // only stored if needed + + chol mat.Cholesky + lower mat.TriDense + logSqrtDet float64 + dim int +} + +// NewStudentsT creates a new StudentsT with the given nu, mu, and sigma +// parameters. +// +// NewStudentsT panics if len(mu) == 0, or if len(mu) != sigma.SymmetricDim(). If +// the covariance matrix is not positive-definite, nil is returned and ok is false. 
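+//
+// A minimal construction sketch (parameter values illustrative only):
+//
+//	sigma := mat.NewSymDense(2, []float64{1, 0.5, 0.5, 2})
+//	dist, ok := NewStudentsT([]float64{0, 0}, sigma, 5, nil)
+//	if !ok {
+//		panic("covariance not positive definite")
+//	}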
+func NewStudentsT(mu []float64, sigma mat.Symmetric, nu float64, src rand.Source) (dist *StudentsT, ok bool) { + if len(mu) == 0 { + panic(badZeroDimension) + } + dim := sigma.SymmetricDim() + if dim != len(mu) { + panic(badSizeMismatch) + } + + s := &StudentsT{ + nu: nu, + mu: make([]float64, dim), + dim: dim, + src: src, + } + if src != nil { + s.rnd = rand.New(src) + } + copy(s.mu, mu) + + ok = s.chol.Factorize(sigma) + if !ok { + return nil, false + } + s.sigma = *mat.NewSymDense(dim, nil) + s.sigma.CopySym(sigma) + s.chol.LTo(&s.lower) + s.logSqrtDet = 0.5 * s.chol.LogDet() + return s, true +} + +// ConditionStudentsT returns the Student's T distribution that is the receiver +// conditioned on the input evidence, and the success of the operation. +// The returned Student's T has dimension +// n - len(observed), where n is the dimension of the original receiver. +// The dimension order is preserved during conditioning, so if the value +// of dimension 1 is observed, the returned normal represents dimensions {0, 2, ...} +// of the original Student's T distribution. +// +// ok indicates whether there was a failure during the update. If ok is false +// the operation failed and dist is not usable. +// Mathematically this is impossible, but can occur with finite precision arithmetic. +func (s *StudentsT) ConditionStudentsT(observed []int, values []float64, src rand.Source) (dist *StudentsT, ok bool) { + if len(observed) == 0 { + panic("studentst: no observed value") + } + if len(observed) != len(values) { + panic(badInputLength) + } + + for _, v := range observed { + if v < 0 || v >= s.dim { + panic("studentst: observed value out of bounds") + } + } + + newNu, newMean, newSigma := studentsTConditional(observed, values, s.nu, s.mu, &s.sigma) + if newMean == nil { + return nil, false + } + + return NewStudentsT(newMean, newSigma, newNu, src) + +} + +// studentsTConditional updates a Student's T distribution based on the observed samples +// (see documentation for the public function). The Gaussian conditional update +// is treated as a special case when nu == math.Inf(1). +func studentsTConditional(observed []int, values []float64, nu float64, mu []float64, sigma mat.Symmetric) (newNu float64, newMean []float64, newSigma *mat.SymDense) { + dim := len(mu) + ob := len(observed) + + unobserved := findUnob(observed, dim) + + unob := len(unobserved) + if unob == 0 { + panic("stat: all dimensions observed") + } + + mu1 := make([]float64, unob) + for i, v := range unobserved { + mu1[i] = mu[v] + } + mu2 := make([]float64, ob) // really v - mu2 + for i, v := range observed { + mu2[i] = values[i] - mu[v] + } + + var sigma11, sigma22 mat.SymDense + sigma11.SubsetSym(sigma, unobserved) + sigma22.SubsetSym(sigma, observed) + + sigma21 := mat.NewDense(ob, unob, nil) + for i, r := range observed { + for j, c := range unobserved { + v := sigma.At(r, c) + sigma21.Set(i, j, v) + } + } + + var chol mat.Cholesky + ok := chol.Factorize(&sigma22) + if !ok { + return math.NaN(), nil, nil + } + + // Compute mu_1 + sigma_{2,1}ᵀ * sigma_{2,2}^-1 (v - mu_2). + v := mat.NewVecDense(ob, mu2) + var tmp, tmp2 mat.VecDense + err := chol.SolveVecTo(&tmp, v) + if err != nil { + return math.NaN(), nil, nil + } + tmp2.MulVec(sigma21.T(), &tmp) + + for i := range mu1 { + mu1[i] += tmp2.At(i, 0) + } + + // Compute tmp4 = sigma_{2,1}ᵀ * sigma_{2,2}^-1 * sigma_{2,1}. + // TODO(btracey): Should this be a method of SymDense? 
+ var tmp3, tmp4 mat.Dense
+ err = chol.SolveTo(&tmp3, sigma21)
+ if err != nil {
+ return math.NaN(), nil, nil
+ }
+ tmp4.Mul(sigma21.T(), &tmp3)
+
+ // Compute sigma_{1,1} - tmp4
+ // TODO(btracey): If tmp4 can be constructed with a method, then this can be
+ // replaced with SubSym.
+ for i := 0; i < len(unobserved); i++ {
+ for j := i; j < len(unobserved); j++ {
+ v := sigma11.At(i, j)
+ sigma11.SetSym(i, j, v-tmp4.At(i, j))
+ }
+ }
+
+ // The computed variables are accurate for a Normal.
+ if math.IsInf(nu, 1) {
+ return nu, mu1, &sigma11
+ }
+
+ // Compute beta = (v - mu_2)ᵀ * sigma_{2,2}^-1 * (v - mu_2)
+ beta := mat.Dot(v, &tmp)
+
+ // Scale the covariance matrix
+ sigma11.ScaleSym((nu+beta)/(nu+float64(ob)), &sigma11)
+
+ return nu + float64(ob), mu1, &sigma11
+}
+
+// findUnob returns the unobserved variables (the complementary set to observed).
+// findUnob panics if any value is repeated in observed.
+func findUnob(observed []int, dim int) (unobserved []int) {
+ var setOb intsets.Sparse
+ for _, v := range observed {
+ setOb.Insert(v)
+ }
+ var setAll intsets.Sparse
+ for i := 0; i < dim; i++ {
+ setAll.Insert(i)
+ }
+ var setUnob intsets.Sparse
+ setUnob.Difference(&setAll, &setOb)
+ unobserved = setUnob.AppendTo(nil)
+ sort.Ints(unobserved)
+ return unobserved
+}
+
+// CovarianceMatrix calculates the covariance matrix of the distribution,
+// storing the result in dst. Upon return, the value at element {i, j} of the
+// covariance matrix is equal to the covariance of the i^th and j^th variables.
+//
+// covariance(i, j) = E[(x_i - E[x_i])(x_j - E[x_j])]
+//
+// If the dst matrix is empty it will be resized to the correct dimensions,
+// otherwise dst must match the dimension of the receiver or CovarianceMatrix
+// will panic.
+func (st *StudentsT) CovarianceMatrix(dst *mat.SymDense) {
+ if dst.IsEmpty() {
+ *dst = *(dst.GrowSym(st.dim).(*mat.SymDense))
+ } else if dst.SymmetricDim() != st.dim {
+ panic("studentst: input matrix size mismatch")
+ }
+ dst.CopySym(&st.sigma)
+ dst.ScaleSym(st.nu/(st.nu-2), dst)
+}
+
+// Dim returns the dimension of the distribution.
+func (s *StudentsT) Dim() int {
+ return s.dim
+}
+
+// LogProb computes the log of the pdf of the point y.
+func (s *StudentsT) LogProb(y []float64) float64 {
+ if len(y) != s.dim {
+ panic(badInputLength)
+ }
+
+ nu := s.nu
+ n := float64(s.dim)
+ lg1, _ := math.Lgamma((nu + n) / 2)
+ lg2, _ := math.Lgamma(nu / 2)
+
+ t1 := lg1 - lg2 - n/2*math.Log(nu*math.Pi) - s.logSqrtDet
+
+ mahal := stat.Mahalanobis(mat.NewVecDense(len(y), y), mat.NewVecDense(len(s.mu), s.mu), &s.chol)
+ mahal *= mahal
+ return t1 - ((nu+n)/2)*math.Log(1+mahal/nu)
+}
+
+// MarginalStudentsT returns the marginal distribution of the given input variables,
+// and the success of the operation.
+// That is, MarginalStudentsT returns
+//
+// p(x_i) = \int_{x_o} p(x_i | x_o) p(x_o) dx_o
+//
+// where x_i are the dimensions in the input, and x_o are the remaining dimensions.
+// See https://en.wikipedia.org/wiki/Marginal_distribution for more information.
+//
+// The input src is passed to the created StudentsT.
+//
+// ok indicates whether there was a failure during the marginalization. If ok is false
+// the operation failed and dist is not usable.
+// Mathematically this is impossible, but can occur with finite precision arithmetic.
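+//
+// For example, to marginalize onto the first two dimensions (a sketch, with s
+// an existing *StudentsT of higher dimension):
+//
+//	marg, ok := s.MarginalStudentsT([]int{0, 1}, nil)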
+func (s *StudentsT) MarginalStudentsT(vars []int, src rand.Source) (dist *StudentsT, ok bool) { + newMean := make([]float64, len(vars)) + for i, v := range vars { + newMean[i] = s.mu[v] + } + var newSigma mat.SymDense + newSigma.SubsetSym(&s.sigma, vars) + return NewStudentsT(newMean, &newSigma, s.nu, src) +} + +// MarginalStudentsTSingle returns the marginal distribution of the given input variable. +// That is, MarginalStudentsTSingle returns +// +// p(x_i) = \int_{x_o} p(x_i | x_o) p(x_o) dx_o +// +// where i is the input index, and x_o are the remaining dimensions. +// See https://en.wikipedia.org/wiki/Marginal_distribution for more information. +// +// The input src is passed to the call to NewStudentsT. +func (s *StudentsT) MarginalStudentsTSingle(i int, src rand.Source) distuv.StudentsT { + return distuv.StudentsT{ + Mu: s.mu[i], + Sigma: math.Sqrt(s.sigma.At(i, i)), + Nu: s.nu, + Src: src, + } +} + +// TODO(btracey): Implement marginal single. Need to modify univariate StudentsT +// to be three-parameter. + +// Mean returns the mean of the probability distribution. +// +// If dst is not nil, the mean will be stored in-place into dst and returned, +// otherwise a new slice will be allocated first. If dst is not nil, it must +// have length equal to the dimension of the distribution. +func (s *StudentsT) Mean(dst []float64) []float64 { + dst = reuseAs(dst, s.dim) + copy(dst, s.mu) + return dst +} + +// Nu returns the degrees of freedom parameter of the distribution. +func (s *StudentsT) Nu() float64 { + return s.nu +} + +// Prob computes the value of the probability density function at x. +func (s *StudentsT) Prob(y []float64) float64 { + return math.Exp(s.LogProb(y)) +} + +// Rand generates a random sample according to the distribution. +// +// If dst is not nil, the sample will be stored in-place into dst and returned, +// otherwise a new slice will be allocated first. If dst is not nil, it must +// have length equal to the dimension of the distribution. +func (s *StudentsT) Rand(dst []float64) []float64 { + // If Y is distributed according to N(0,Sigma), and U is chi^2 with + // parameter ν, then + // X = mu + Y * sqrt(nu / U) + // X is distributed according to this distribution. + + // Generate Y. + dst = reuseAs(dst, s.dim) + if s.rnd == nil { + for i := range dst { + dst[i] = rand.NormFloat64() + } + } else { + for i := range dst { + dst[i] = s.rnd.NormFloat64() + } + } + y := mat.NewVecDense(s.dim, dst) + y.MulVec(&s.lower, y) + // Compute mu + Y*sqrt(nu/U) + u := distuv.ChiSquared{K: s.nu, Src: s.src}.Rand() + floats.AddScaledTo(dst, s.mu, math.Sqrt(s.nu/u), dst) + return dst +} diff --git a/vendor/gonum.org/v1/gonum/stat/distmv/uniform.go b/vendor/gonum.org/v1/gonum/stat/distmv/uniform.go new file mode 100644 index 0000000000..81d8cba6d7 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distmv/uniform.go @@ -0,0 +1,200 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distmv + +import ( + "math" + "math/rand/v2" + + "gonum.org/v1/gonum/spatial/r1" +) + +// Uniform represents a multivariate uniform distribution. +type Uniform struct { + bounds []r1.Interval + dim int + rnd *rand.Rand +} + +// NewUniform creates a new uniform distribution with the given bounds. 
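+//
+// A minimal sketch (bounds illustrative only):
+//
+//	u := NewUniform([]r1.Interval{{Min: 0, Max: 1}, {Min: -1, Max: 1}}, nil)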
+func NewUniform(bnds []r1.Interval, src rand.Source) *Uniform {
+ dim := len(bnds)
+ if dim == 0 {
+ panic(badZeroDimension)
+ }
+ for _, b := range bnds {
+ if b.Max < b.Min {
+ panic("uniform: maximum less than minimum")
+ }
+ }
+ u := &Uniform{
+ bounds: make([]r1.Interval, dim),
+ dim: dim,
+ }
+ if src != nil {
+ u.rnd = rand.New(src)
+ }
+ for i, b := range bnds {
+ u.bounds[i].Min = b.Min
+ u.bounds[i].Max = b.Max
+ }
+ return u
+}
+
+// NewUnitUniform creates a new Uniform distribution over the dim-dimensional
+// unit hypercube. That is, a uniform distribution where each dimension has
+// Min = 0 and Max = 1.
+func NewUnitUniform(dim int, src rand.Source) *Uniform {
+ if dim <= 0 {
+ panic(nonPosDimension)
+ }
+ bounds := make([]r1.Interval, dim)
+ for i := range bounds {
+ bounds[i].Min = 0
+ bounds[i].Max = 1
+ }
+ u := Uniform{
+ bounds: bounds,
+ dim: dim,
+ }
+ if src != nil {
+ u.rnd = rand.New(src)
+ }
+ return &u
+}
+
+// Bounds returns the bounds on the variables of the distribution.
+//
+// If dst is not nil, the bounds will be stored in-place into dst and returned,
+// otherwise a new slice will be allocated first. If dst is not nil, it must
+// have length equal to the dimension of the distribution.
+func (u *Uniform) Bounds(dst []r1.Interval) []r1.Interval {
+ if dst == nil {
+ dst = make([]r1.Interval, u.Dim())
+ }
+ if len(dst) != u.Dim() {
+ panic(badInputLength)
+ }
+ copy(dst, u.bounds)
+ return dst
+}
+
+// CDF returns the value of the multidimensional cumulative distribution
+// function of the probability distribution at the point x.
+//
+// If dst is not nil, the value will be stored in-place into dst and returned,
+// otherwise a new slice will be allocated first. If dst is not nil, it must
+// have length equal to the dimension of the distribution. CDF will also panic
+// if the length of x is not equal to the dimension of the distribution.
+func (u *Uniform) CDF(dst, x []float64) []float64 {
+ if len(x) != u.dim {
+ panic(badSizeMismatch)
+ }
+ dst = reuseAs(dst, u.dim)
+
+ for i, v := range x {
+ if v < u.bounds[i].Min {
+ dst[i] = 0
+ } else if v > u.bounds[i].Max {
+ dst[i] = 1
+ } else {
+ dst[i] = (v - u.bounds[i].Min) / (u.bounds[i].Max - u.bounds[i].Min)
+ }
+ }
+ return dst
+}
+
+// Dim returns the dimension of the distribution.
+func (u *Uniform) Dim() int {
+ return u.dim
+}
+
+// Entropy returns the differential entropy of the distribution.
+func (u *Uniform) Entropy() float64 {
+ // Entropy is the log of the volume.
+ var logVol float64
+ for _, b := range u.bounds {
+ logVol += math.Log(b.Max - b.Min)
+ }
+ return logVol
+}
+
+// LogProb computes the log of the pdf of the point x.
+func (u *Uniform) LogProb(x []float64) float64 {
+ dim := u.dim
+ if len(x) != dim {
+ panic(badSizeMismatch)
+ }
+ var logprob float64
+ for i, b := range u.bounds {
+ if x[i] < b.Min || x[i] > b.Max {
+ return math.Inf(-1)
+ }
+ logprob -= math.Log(b.Max - b.Min)
+ }
+ return logprob
+}
+
+// Mean returns the mean of the probability distribution.
+//
+// If dst is not nil, the mean will be stored in-place into dst and returned,
+// otherwise a new slice will be allocated first. If dst is not nil, it must
+// have length equal to the dimension of the distribution.
+func (u *Uniform) Mean(dst []float64) []float64 {
+ dst = reuseAs(dst, u.dim)
+ for i, b := range u.bounds {
+ dst[i] = (b.Max + b.Min) / 2
+ }
+ return dst
+}
+
+// Prob computes the value of the probability density function at x.
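+// For a box of volume V the density is 1/V at any point inside the bounds and
+// 0 outside; for example, on [0, 1] × [-1, 1] Prob returns 0.5 inside and 0
+// outside.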
+func (u *Uniform) Prob(x []float64) float64 {
+ return math.Exp(u.LogProb(x))
+}
+
+// Rand generates a random sample according to the distribution.
+//
+// If dst is not nil, the sample will be stored in-place into dst and returned,
+// otherwise a new slice will be allocated first. If dst is not nil, it must
+// have length equal to the dimension of the distribution.
+func (u *Uniform) Rand(dst []float64) []float64 {
+ dst = reuseAs(dst, u.dim)
+ if u.rnd == nil {
+ for i, b := range u.bounds {
+ dst[i] = rand.Float64()*(b.Max-b.Min) + b.Min
+ }
+ return dst
+ }
+ for i, b := range u.bounds {
+ dst[i] = u.rnd.Float64()*(b.Max-b.Min) + b.Min
+ }
+ return dst
+}
+
+// Quantile returns the value of the multi-dimensional inverse cumulative
+// distribution function at p.
+//
+// If dst is not nil, the quantile will be stored in-place into dst and
+// returned, otherwise a new slice will be allocated first. If dst is not nil,
+// it must have length equal to the dimension of the distribution. Quantile will
+// also panic if the length of p is not equal to the dimension of the
+// distribution.
+//
+// All of the values of p must be between 0 and 1, inclusive, or Quantile will
+// panic.
+func (u *Uniform) Quantile(dst, p []float64) []float64 {
+ if len(p) != u.dim {
+ panic(badSizeMismatch)
+ }
+ dst = reuseAs(dst, u.dim)
+ for i, v := range p {
+ if v < 0 || v > 1 {
+ panic(badQuantile)
+ }
+ dst[i] = v*(u.bounds[i].Max-u.bounds[i].Min) + u.bounds[i].Min
+ }
+ return dst
+}
diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/alphastable.go b/vendor/gonum.org/v1/gonum/stat/distuv/alphastable.go
new file mode 100644
index 0000000000..20ffeac620
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/stat/distuv/alphastable.go
@@ -0,0 +1,112 @@
+// Copyright ©2020 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+import (
+ "math"
+ "math/rand/v2"
+)
+
+// AlphaStable represents an α-stable distribution with four parameters.
+// See https://en.wikipedia.org/wiki/Stable_distribution for more information.
+type AlphaStable struct {
+ // Alpha is the stability parameter.
+ // It is valid within the range 0 < α ≤ 2.
+ Alpha float64
+ // Beta is the skewness parameter.
+ // It is valid within the range -1 ≤ β ≤ 1.
+ Beta float64
+ // C is the scale parameter.
+ // It is valid when positive.
+ C float64
+ // Mu is the location parameter.
+ Mu float64
+ Src rand.Source
+}
+
+// ExKurtosis returns the excess kurtosis of the distribution.
+// ExKurtosis returns NaN when Alpha != 2.
+func (a AlphaStable) ExKurtosis() float64 {
+ if a.Alpha == 2 {
+ return 0
+ }
+ return math.NaN()
+}
+
+// Mean returns the mean of the probability distribution.
+// Mean returns NaN when Alpha <= 1.
+func (a AlphaStable) Mean() float64 {
+ if a.Alpha > 1 {
+ return a.Mu
+ }
+ return math.NaN()
+}
+
+// Median returns the median of the distribution.
+// Median panics when Beta != 0, because then the median is not analytically
+// expressible.
+func (a AlphaStable) Median() float64 {
+ if a.Beta == 0 {
+ return a.Mu
+ }
+ panic("distuv: cannot compute Median for Beta != 0")
+}
+
+// Mode returns the mode of the distribution.
+// Mode panics when Beta != 0, because then the mode is not analytically
+// expressible.
+func (a AlphaStable) Mode() float64 {
+ if a.Beta == 0 {
+ return a.Mu
+ }
+ panic("distuv: cannot compute Mode for Beta != 0")
+}
+
+// NumParameters returns the number of parameters in the distribution.
+func (a AlphaStable) NumParameters() int { + return 4 +} + +// Rand returns a random sample drawn from the distribution. +func (a AlphaStable) Rand() float64 { + // From https://en.wikipedia.org/wiki/Stable_distribution#Simulation_of_stable_variables + const halfPi = math.Pi / 2 + u := Uniform{-halfPi, halfPi, a.Src}.Rand() + w := Exponential{1, a.Src}.Rand() + if a.Alpha == 1 { + f := halfPi + a.Beta*u + x := (f*math.Tan(u) - a.Beta*math.Log(halfPi*w*math.Cos(u)/f)) / halfPi + return a.C*(x+a.Beta*math.Log(a.C)/halfPi) + a.Mu + } + zeta := -a.Beta * math.Tan(halfPi*a.Alpha) + xi := math.Atan(-zeta) / a.Alpha + f := a.Alpha * (u + xi) + g := math.Sqrt(1+zeta*zeta) * math.Pow(math.Cos(u-f)/w, 1-a.Alpha) / math.Cos(u) + x := math.Pow(g, 1/a.Alpha) * math.Sin(f) + return a.C*x + a.Mu +} + +// Skewness returns the skewness of the distribution. +// Skewness returns NaN when Alpha != 2. +func (a AlphaStable) Skewness() float64 { + if a.Alpha == 2 { + return 0 + } + return math.NaN() +} + +// StdDev returns the standard deviation of the probability distribution. +func (a AlphaStable) StdDev() float64 { + return math.Sqrt(a.Variance()) +} + +// Variance returns the variance of the probability distribution. +// Variance returns +Inf when Alpha != 2. +func (a AlphaStable) Variance() float64 { + if a.Alpha == 2 { + return 2 * a.C * a.C + } + return math.Inf(1) +} diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/bernoulli.go b/vendor/gonum.org/v1/gonum/stat/distuv/bernoulli.go new file mode 100644 index 0000000000..b220afae28 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/bernoulli.go @@ -0,0 +1,140 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand/v2" +) + +// Bernoulli represents a random variable whose value is 1 with probability p and +// value of zero with probability 1-P. The value of P must be between 0 and 1. +// More information at https://en.wikipedia.org/wiki/Bernoulli_distribution. +type Bernoulli struct { + P float64 + Src rand.Source +} + +// CDF computes the value of the cumulative density function at x. +func (b Bernoulli) CDF(x float64) float64 { + if x < 0 { + return 0 + } + if x < 1 { + return 1 - b.P + } + return 1 +} + +// Entropy returns the entropy of the distribution. +func (b Bernoulli) Entropy() float64 { + if b.P == 0 || b.P == 1 { + return 0 + } + q := 1 - b.P + return -b.P*math.Log(b.P) - q*math.Log(q) +} + +// ExKurtosis returns the excess kurtosis of the distribution. +func (b Bernoulli) ExKurtosis() float64 { + pq := b.P * (1 - b.P) + return (1 - 6*pq) / pq +} + +// LogProb computes the natural logarithm of the value of the probability density function at x. +func (b Bernoulli) LogProb(x float64) float64 { + if x == 0 { + return math.Log(1 - b.P) + } + if x == 1 { + return math.Log(b.P) + } + return math.Inf(-1) +} + +// Mean returns the mean of the probability distribution. +func (b Bernoulli) Mean() float64 { + return b.P +} + +// Median returns the median of the probability distribution. +func (b Bernoulli) Median() float64 { + p := b.P + switch { + case p < 0.5: + return 0 + case p > 0.5: + return 1 + default: + return 0.5 + } +} + +// NumParameters returns the number of parameters in the distribution. +func (Bernoulli) NumParameters() int { + return 1 +} + +// Prob computes the value of the probability distribution at x. 
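+// For example, with P = 0.3, Prob(1) = 0.3, Prob(0) = 0.7, and Prob is 0 at
+// any other value.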
+func (b Bernoulli) Prob(x float64) float64 { + if x == 0 { + return 1 - b.P + } + if x == 1 { + return b.P + } + return 0 +} + +// Quantile returns the minimum value of x from amongst all those values whose CDF value exceeds or equals p. +func (b Bernoulli) Quantile(p float64) float64 { + if p < 0 || 1 < p { + panic(badPercentile) + } + if p <= 1-b.P { + return 0 + } + return 1 +} + +// Rand returns a random sample drawn from the distribution. +func (b Bernoulli) Rand() float64 { + var rnd float64 + if b.Src == nil { + rnd = rand.Float64() + } else { + rnd = rand.New(b.Src).Float64() + } + if rnd < b.P { + return 1 + } + return 0 +} + +// Skewness returns the skewness of the distribution. +func (b Bernoulli) Skewness() float64 { + return (1 - 2*b.P) / math.Sqrt(b.P*(1-b.P)) +} + +// StdDev returns the standard deviation of the probability distribution. +func (b Bernoulli) StdDev() float64 { + return math.Sqrt(b.Variance()) +} + +// Survival returns the survival function (complementary CDF) at x. +func (b Bernoulli) Survival(x float64) float64 { + if x < 0 { + return 1 + } + if x < 1 { + return b.P + } + return 0 +} + +// Variance returns the variance of the probability distribution. +func (b Bernoulli) Variance() float64 { + return b.P * (1 - b.P) +} diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/beta.go b/vendor/gonum.org/v1/gonum/stat/distuv/beta.go new file mode 100644 index 0000000000..93e9343b6b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/beta.go @@ -0,0 +1,151 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand/v2" + + "gonum.org/v1/gonum/mathext" +) + +// Beta implements the Beta distribution, a two-parameter continuous distribution +// with support between 0 and 1. +// +// The beta distribution has density function +// +// x^(α-1) * (1-x)^(β-1) * Γ(α+β) / (Γ(α)*Γ(β)) +// +// For more information, see https://en.wikipedia.org/wiki/Beta_distribution +type Beta struct { + // Alpha is the left shape parameter of the distribution. Alpha must be greater + // than 0. + Alpha float64 + // Beta is the right shape parameter of the distribution. Beta must be greater + // than 0. + Beta float64 + + Src rand.Source +} + +// CDF computes the value of the cumulative distribution function at x. +func (b Beta) CDF(x float64) float64 { + if x <= 0 { + return 0 + } + if x >= 1 { + return 1 + } + return mathext.RegIncBeta(b.Alpha, b.Beta, x) +} + +// Entropy returns the differential entropy of the distribution. +func (b Beta) Entropy() float64 { + if b.Alpha <= 0 || b.Beta <= 0 { + panic("beta: negative parameters") + } + return mathext.Lbeta(b.Alpha, b.Beta) - (b.Alpha-1)*mathext.Digamma(b.Alpha) - + (b.Beta-1)*mathext.Digamma(b.Beta) + (b.Alpha+b.Beta-2)*mathext.Digamma(b.Alpha+b.Beta) +} + +// ExKurtosis returns the excess kurtosis of the distribution. +func (b Beta) ExKurtosis() float64 { + num := 6 * ((b.Alpha-b.Beta)*(b.Alpha-b.Beta)*(b.Alpha+b.Beta+1) - b.Alpha*b.Beta*(b.Alpha+b.Beta+2)) + den := b.Alpha * b.Beta * (b.Alpha + b.Beta + 2) * (b.Alpha + b.Beta + 3) + return num / den +} + +// LogProb computes the natural logarithm of the value of the probability +// density function at x. 
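+//
+// As a spot check, Alpha = Beta = 1 recovers the uniform distribution on
+// [0, 1], for which LogProb is 0 at every x in [0, 1].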
+func (b Beta) LogProb(x float64) float64 { + if x < 0 || x > 1 { + return math.Inf(-1) + } + + if b.Alpha <= 0 || b.Beta <= 0 { + panic("beta: negative parameters") + } + + lab, _ := math.Lgamma(b.Alpha + b.Beta) + la, _ := math.Lgamma(b.Alpha) + lb, _ := math.Lgamma(b.Beta) + var lx float64 + if b.Alpha != 1 { + lx = (b.Alpha - 1) * math.Log(x) + } + var l1mx float64 + if b.Beta != 1 { + l1mx = (b.Beta - 1) * math.Log(1-x) + } + return lab - la - lb + lx + l1mx +} + +// Mean returns the mean of the probability distribution. +func (b Beta) Mean() float64 { + return b.Alpha / (b.Alpha + b.Beta) +} + +// Mode returns the mode of the distribution. +// +// Mode returns NaN if both parameters are less than or equal to 1 as a special case, +// 0 if only Alpha <= 1 and 1 if only Beta <= 1. +func (b Beta) Mode() float64 { + if b.Alpha <= 1 { + if b.Beta <= 1 { + return math.NaN() + } + return 0 + } + if b.Beta <= 1 { + return 1 + } + return (b.Alpha - 1) / (b.Alpha + b.Beta - 2) +} + +// NumParameters returns the number of parameters in the distribution. +func (b Beta) NumParameters() int { + return 2 +} + +// Prob computes the value of the probability density function at x. +func (b Beta) Prob(x float64) float64 { + return math.Exp(b.LogProb(x)) +} + +// Quantile returns the inverse of the cumulative distribution function. +func (b Beta) Quantile(p float64) float64 { + if p < 0 || p > 1 { + panic(badPercentile) + } + return mathext.InvRegIncBeta(b.Alpha, b.Beta, p) +} + +// Rand returns a random sample drawn from the distribution. +func (b Beta) Rand() float64 { + ga := Gamma{Alpha: b.Alpha, Beta: 1, Src: b.Src}.Rand() + gb := Gamma{Alpha: b.Beta, Beta: 1, Src: b.Src}.Rand() + return ga / (ga + gb) +} + +// StdDev returns the standard deviation of the probability distribution. +func (b Beta) StdDev() float64 { + return math.Sqrt(b.Variance()) +} + +// Survival returns the survival function (complementary CDF) at x. +func (b Beta) Survival(x float64) float64 { + switch { + case x <= 0: + return 1 + case x >= 1: + return 0 + } + return mathext.RegIncBeta(b.Beta, b.Alpha, 1-x) +} + +// Variance returns the variance of the probability distribution. +func (b Beta) Variance() float64 { + return b.Alpha * b.Beta / ((b.Alpha + b.Beta) * (b.Alpha + b.Beta) * (b.Alpha + b.Beta + 1)) +} diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/binomial.go b/vendor/gonum.org/v1/gonum/stat/distuv/binomial.go new file mode 100644 index 0000000000..4f5f6c7e1f --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/binomial.go @@ -0,0 +1,189 @@ +// Copyright ©2018 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand/v2" + + "gonum.org/v1/gonum/mathext" + "gonum.org/v1/gonum/stat/combin" +) + +// Binomial implements the binomial distribution, a discrete probability distribution +// that expresses the probability of a given number of successful Bernoulli trials +// out of a total of n, each with success probability p. +// The binomial distribution has the density function: +// +// f(k) = (n choose k) p^k (1-p)^(n-k) +// +// For more information, see https://en.wikipedia.org/wiki/Binomial_distribution. +type Binomial struct { + // N is the total number of Bernoulli trials. N must be greater than 0. + N float64 + // P is the probability of success in any given trial. P must be in [0, 1]. 
+ P float64 + + Src rand.Source +} + +// CDF computes the value of the cumulative distribution function at x. +func (b Binomial) CDF(x float64) float64 { + if x < 0 { + return 0 + } + if x >= b.N { + return 1 + } + x = math.Floor(x) + return mathext.RegIncBeta(b.N-x, x+1, 1-b.P) +} + +// ExKurtosis returns the excess kurtosis of the distribution. +func (b Binomial) ExKurtosis() float64 { + v := b.P * (1 - b.P) + return (1 - 6*v) / (b.N * v) +} + +// LogProb computes the natural logarithm of the value of the probability +// density function at x. +func (b Binomial) LogProb(x float64) float64 { + if x < 0 || x > b.N || math.Floor(x) != x { + return math.Inf(-1) + } + lb := combin.LogGeneralizedBinomial(b.N, x) + return lb + x*math.Log(b.P) + (b.N-x)*math.Log(1-b.P) +} + +// Mean returns the mean of the probability distribution. +func (b Binomial) Mean() float64 { + return b.N * b.P +} + +// NumParameters returns the number of parameters in the distribution. +func (Binomial) NumParameters() int { + return 2 +} + +// Prob computes the value of the probability density function at x. +func (b Binomial) Prob(x float64) float64 { + return math.Exp(b.LogProb(x)) +} + +// Rand returns a random sample drawn from the distribution. +func (b Binomial) Rand() float64 { + // NUMERICAL RECIPES IN C: THE ART OF SCIENTIFIC COMPUTING (ISBN 0-521-43108-5) + // p. 295-6 + // http://www.aip.de/groups/soe/local/numres/bookcpdf/c7-3.pdf + + runif := rand.Float64 + rexp := rand.ExpFloat64 + if b.Src != nil { + rnd := rand.New(b.Src) + runif = rnd.Float64 + rexp = rnd.ExpFloat64 + } + + p := b.P + if p > 0.5 { + p = 1 - p + } + am := b.N * p + + if b.N < 25 { + // Use direct method. + bnl := 0.0 + for i := 0; i < int(b.N); i++ { + if runif() < p { + bnl++ + } + } + if p != b.P { + return b.N - bnl + } + return bnl + } + + if am < 1 { + // Use rejection method with Poisson proposal. + const logM = 2.6e-2 // constant for rejection sampling (https://en.wikipedia.org/wiki/Rejection_sampling) + var bnl float64 + z := -p + pclog := (1 + 0.5*z) * z / (1 + (1+1.0/6*z)*z) // Padé approximant of log(1 + x) + for { + bnl = 0.0 + t := 0.0 + for i := 0; i < int(b.N); i++ { + t += rexp() + if t >= am { + break + } + bnl++ + } + bnlc := b.N - bnl + z = -bnl / b.N + log1p := (1 + 0.5*z) * z / (1 + (1+1.0/6*z)*z) + t = (bnlc+0.5)*log1p + bnl - bnlc*pclog + 1/(12*bnlc) - am + logM // Uses Stirling's expansion of log(n!) + if rexp() >= t { + break + } + } + if p != b.P { + return b.N - bnl + } + return bnl + } + // Original algorithm samples from a Poisson distribution with the + // appropriate expected value. However, the Poisson approximation is + // asymptotic such that the absolute deviation in probability is O(1/n). + // Rejection sampling produces exact variates with at worst less than 3% + // rejection with minimal additional computation. + + // Use rejection method with Cauchy proposal. + g, _ := math.Lgamma(b.N + 1) + plog := math.Log(p) + pclog := math.Log1p(-p) + sq := math.Sqrt(2 * am * (1 - p)) + for { + var em, y float64 + for { + y = math.Tan(math.Pi * runif()) + em = sq*y + am + if em >= 0 && em < b.N+1 { + break + } + } + em = math.Floor(em) + lg1, _ := math.Lgamma(em + 1) + lg2, _ := math.Lgamma(b.N - em + 1) + t := 1.2 * sq * (1 + y*y) * math.Exp(g-lg1-lg2+em*plog+(b.N-em)*pclog) + if runif() <= t { + if p != b.P { + return b.N - em + } + return em + } + } +} + +// Skewness returns the skewness of the distribution. 
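+// For N trials with success probability P, the skewness is
+// (1-2P)/sqrt(NP(1-P)).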
+func (b Binomial) Skewness() float64 { + return (1 - 2*b.P) / b.StdDev() +} + +// StdDev returns the standard deviation of the probability distribution. +func (b Binomial) StdDev() float64 { + return math.Sqrt(b.Variance()) +} + +// Survival returns the survival function (complementary CDF) at x. +func (b Binomial) Survival(x float64) float64 { + return 1 - b.CDF(x) +} + +// Variance returns the variance of the probability distribution. +func (b Binomial) Variance() float64 { + return b.N * b.P * (1 - b.P) +} diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/categorical.go b/vendor/gonum.org/v1/gonum/stat/distuv/categorical.go new file mode 100644 index 0000000000..f4b77ee49f --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/categorical.go @@ -0,0 +1,184 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand/v2" +) + +// Categorical is an extension of the Bernoulli distribution where x takes +// values {0, 1, ..., len(w)-1} where w is the weight vector. Categorical must +// be initialized with NewCategorical. +type Categorical struct { + weights []float64 + + // heap is a weight heap. + // + // It keeps a heap-organised sum of remaining + // index weights that are available to be taken + // from. + // + // Each element holds the sum of weights for + // the corresponding index, plus the sum of + // its children's weights; the children of + // an element i can be found at positions + // 2*(i+1)-1 and 2*(i+1). The root of the + // weight heap is at element 0. + // + // See comments in container/heap for an + // explanation of the layout of a heap. + heap []float64 + + src rand.Source +} + +// NewCategorical constructs a new categorical distribution where the probability +// that x equals i is proportional to w[i]. All of the weights must be +// nonnegative, and at least one of the weights must be positive. +func NewCategorical(w []float64, src rand.Source) Categorical { + c := Categorical{ + weights: make([]float64, len(w)), + heap: make([]float64, len(w)), + src: src, + } + c.ReweightAll(w) + return c +} + +// CDF computes the value of the cumulative density function at x. +func (c Categorical) CDF(x float64) float64 { + var cdf float64 + for i, w := range c.weights { + if x < float64(i) { + break + } + cdf += w + } + return cdf / c.heap[0] +} + +// Entropy returns the entropy of the distribution. +func (c Categorical) Entropy() float64 { + var ent float64 + for _, w := range c.weights { + if w == 0 { + continue + } + p := w / c.heap[0] + ent += p * math.Log(p) + } + return -ent +} + +// Len returns the number of values x could possibly take (the length of the +// initial supplied weight vector). +func (c Categorical) Len() int { + return len(c.weights) +} + +// Mean returns the mean of the probability distribution. +func (c Categorical) Mean() float64 { + var mean float64 + for i, v := range c.weights { + mean += float64(i) * v + } + return mean / c.heap[0] +} + +// Prob computes the value of the probability density function at x. +func (c Categorical) Prob(x float64) float64 { + xi := int(x) + if float64(xi) != x { + return 0 + } + if xi < 0 || xi > len(c.weights)-1 { + return 0 + } + return c.weights[xi] / c.heap[0] +} + +// LogProb computes the natural logarithm of the value of the probability density function at x. 
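+// For example, with weights {1, 3}, LogProb(1) = log(3/4), and LogProb is
+// -Inf at any value outside {0, 1}.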
+func (c Categorical) LogProb(x float64) float64 { + return math.Log(c.Prob(x)) +} + +// Rand returns a random draw from the categorical distribution. +func (c Categorical) Rand() float64 { + var r float64 + if c.src == nil { + r = c.heap[0] * rand.Float64() + } else { + r = c.heap[0] * rand.New(c.src).Float64() + } + i := 1 + last := -1 + left := len(c.weights) + for { + if r -= c.weights[i-1]; r <= 0 { + break // Fall within item i-1. + } + i <<= 1 // Move to left child. + if d := c.heap[i-1]; r > d { + r -= d + // If enough r to pass left child, + // move to right child state will + // be caught at break above. + i++ + } + if i == last || left < 0 { + panic("categorical: bad sample") + } + last = i + left-- + } + return float64(i - 1) +} + +// Reweight sets the weight of item idx to w. The input weight must be +// non-negative, and after reweighting at least one of the weights must be +// positive. +func (c Categorical) Reweight(idx int, w float64) { + if w < 0 { + panic("categorical: negative weight") + } + w, c.weights[idx] = c.weights[idx]-w, w + idx++ + for idx > 0 { + c.heap[idx-1] -= w + idx >>= 1 + } + if c.heap[0] <= 0 { + panic("categorical: sum of the weights non-positive") + } +} + +// ReweightAll resets the weights of the distribution. ReweightAll panics if +// len(w) != c.Len. All of the weights must be nonnegative, and at least one of +// the weights must be positive. +func (c Categorical) ReweightAll(w []float64) { + if len(w) != c.Len() { + panic("categorical: length of the slices do not match") + } + for _, v := range w { + if v < 0 { + panic("categorical: negative weight") + } + } + copy(c.weights, w) + c.reset() +} + +func (c Categorical) reset() { + copy(c.heap, c.weights) + for i := len(c.heap) - 1; i > 0; i-- { + // Sometimes 1-based counting makes sense. + c.heap[((i+1)>>1)-1] += c.heap[i] + } + // TODO(btracey): Renormalization for weird weights? + if c.heap[0] <= 0 { + panic("categorical: sum of the weights non-positive") + } +} diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/chi.go b/vendor/gonum.org/v1/gonum/stat/distuv/chi.go new file mode 100644 index 0000000000..105f529653 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/chi.go @@ -0,0 +1,124 @@ +// Copyright ©2021 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand/v2" + + "gonum.org/v1/gonum/mathext" +) + +// Chi implements the χ distribution, a one parameter distribution +// with support on the positive numbers. +// +// The density function is given by +// +// 1/(2^{k/2-1} * Γ(k/2)) * x^{k - 1} * e^{-x^2/2} +// +// For more information, see https://en.wikipedia.org/wiki/Chi_distribution. +type Chi struct { + // K is the shape parameter, corresponding to the degrees of freedom. Must + // be greater than 0. + K float64 + + Src rand.Source +} + +// CDF computes the value of the cumulative density function at x. +func (c Chi) CDF(x float64) float64 { + return mathext.GammaIncReg(c.K/2, (x*x)/2) +} + +// Entropy returns the differential entropy of the distribution. +func (c Chi) Entropy() float64 { + lg, _ := math.Lgamma(c.K / 2) + return lg + 0.5*(c.K-math.Ln2-(c.K-1)*mathext.Digamma(c.K/2)) +} + +// ExKurtosis returns the excess kurtosis of the distribution. 
+func (c Chi) ExKurtosis() float64 { + v := c.Variance() + s := math.Sqrt(v) + return 2 / v * (1 - c.Mean()*s*c.Skewness() - v) +} + +// LogProb computes the natural logarithm of the value of the probability +// density function at x. +func (c Chi) LogProb(x float64) float64 { + if x < 0 { + return math.Inf(-1) + } + lg, _ := math.Lgamma(c.K / 2) + return (c.K-1)*math.Log(x) - (x*x)/2 - (c.K/2-1)*math.Ln2 - lg +} + +// Mean returns the mean of the probability distribution. +func (c Chi) Mean() float64 { + lg1, _ := math.Lgamma((c.K + 1) / 2) + lg, _ := math.Lgamma(c.K / 2) + return math.Sqrt2 * math.Exp(lg1-lg) +} + +// Median returns the median of the distribution. +func (c Chi) Median() float64 { + return c.Quantile(0.5) +} + +// Mode returns the mode of the distribution. +// +// Mode returns NaN if K is less than one. +func (c Chi) Mode() float64 { + return math.Sqrt(c.K - 1) +} + +// NumParameters returns the number of parameters in the distribution. +func (c Chi) NumParameters() int { + return 1 +} + +// Prob computes the value of the probability density function at x. +func (c Chi) Prob(x float64) float64 { + return math.Exp(c.LogProb(x)) +} + +// Rand returns a random sample drawn from the distribution. +func (c Chi) Rand() float64 { + return math.Sqrt(Gamma{c.K / 2, 0.5, c.Src}.Rand()) +} + +// Quantile returns the inverse of the cumulative distribution function. +func (c Chi) Quantile(p float64) float64 { + if p < 0 || 1 < p { + panic(badPercentile) + } + return math.Sqrt(2 * mathext.GammaIncRegInv(0.5*c.K, p)) +} + +// Skewness returns the skewness of the distribution. +func (c Chi) Skewness() float64 { + v := c.Variance() + s := math.Sqrt(v) + return c.Mean() / (s * v) * (1 - 2*v) +} + +// StdDev returns the standard deviation of the probability distribution. +func (c Chi) StdDev() float64 { + return math.Sqrt(c.Variance()) +} + +// Survival returns the survival function (complementary CDF) at x. +func (c Chi) Survival(x float64) float64 { + if x < 0 { + return 1 + } + return mathext.GammaIncRegComp(0.5*c.K, 0.5*(x*x)) +} + +// Variance returns the variance of the probability distribution. +func (c Chi) Variance() float64 { + m := c.Mean() + return math.Max(0, c.K-m*m) +} diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/chisquared.go b/vendor/gonum.org/v1/gonum/stat/distuv/chisquared.go new file mode 100644 index 0000000000..1190803ae7 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/chisquared.go @@ -0,0 +1,101 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand/v2" + + "gonum.org/v1/gonum/mathext" +) + +// ChiSquared implements the χ² distribution, a one parameter distribution +// with support on the positive numbers. +// +// The density function is given by +// +// 1/(2^{k/2} * Γ(k/2)) * x^{k/2 - 1} * e^{-x/2} +// +// It is a special case of the Gamma distribution, Γ(k/2, 1/2). +// +// For more information, see https://en.wikipedia.org/wiki/Chi-squared_distribution. +type ChiSquared struct { + // K is the shape parameter, corresponding to the degrees of freedom. Must + // be greater than 0. + K float64 + + Src rand.Source +} + +// CDF computes the value of the cumulative density function at x. +func (c ChiSquared) CDF(x float64) float64 { + return mathext.GammaIncReg(c.K/2, x/2) +} + +// ExKurtosis returns the excess kurtosis of the distribution. 
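+// For K degrees of freedom the excess kurtosis is 12/K.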
+func (c ChiSquared) ExKurtosis() float64 { + return 12 / c.K +} + +// LogProb computes the natural logarithm of the value of the probability +// density function at x. +func (c ChiSquared) LogProb(x float64) float64 { + if x < 0 { + return math.Inf(-1) + } + lg, _ := math.Lgamma(c.K / 2) + return (c.K/2-1)*math.Log(x) - x/2 - (c.K/2)*math.Ln2 - lg +} + +// Mean returns the mean of the probability distribution. +func (c ChiSquared) Mean() float64 { + return c.K +} + +// Mode returns the mode of the distribution. +func (c ChiSquared) Mode() float64 { + return math.Max(c.K-2, 0) +} + +// NumParameters returns the number of parameters in the distribution. +func (c ChiSquared) NumParameters() int { + return 1 +} + +// Prob computes the value of the probability density function at x. +func (c ChiSquared) Prob(x float64) float64 { + return math.Exp(c.LogProb(x)) +} + +// Rand returns a random sample drawn from the distribution. +func (c ChiSquared) Rand() float64 { + return Gamma{c.K / 2, 0.5, c.Src}.Rand() +} + +// Quantile returns the inverse of the cumulative distribution function. +func (c ChiSquared) Quantile(p float64) float64 { + if p < 0 || p > 1 { + panic(badPercentile) + } + return mathext.GammaIncRegInv(0.5*c.K, p) * 2 +} + +// StdDev returns the standard deviation of the probability distribution. +func (c ChiSquared) StdDev() float64 { + return math.Sqrt(c.Variance()) +} + +// Survival returns the survival function (complementary CDF) at x. +func (c ChiSquared) Survival(x float64) float64 { + if x < 0 { + return 1 + } + return mathext.GammaIncRegComp(0.5*c.K, 0.5*x) +} + +// Variance returns the variance of the probability distribution. +func (c ChiSquared) Variance() float64 { + return 2 * c.K +} diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/constants.go b/vendor/gonum.org/v1/gonum/stat/distuv/constants.go new file mode 100644 index 0000000000..3ebe635047 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/constants.go @@ -0,0 +1,28 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +const ( + // oneOverRoot2Pi is the value of 1/(2Pi)^(1/2) + // http://www.wolframalpha.com/input/?i=1%2F%282+*+pi%29%5E%281%2F2%29 + oneOverRoot2Pi = 0.39894228040143267793994605993438186847585863116493465766592582967065792589930183850125233390730693643030255886263518268 + + //LogRoot2Pi is the value of log(sqrt(2*Pi)) + logRoot2Pi = 0.91893853320467274178032973640561763986139747363778341281715154048276569592726039769474329863595419762200564662463433744 + negLogRoot2Pi = -logRoot2Pi + log2Pi = 1.8378770664093454835606594728112352797227949472755668 + ln2 = 0.69314718055994530941723212145817656807550013436025525412068000949339362196969471560586332699641868754200148102057068573368552023 + + // Euler–Mascheroni constant. 
+ eulerGamma = 0.5772156649015328606065120900824024310421593359399235988057672348848677267776646709369470632917467495146314472498070824809605 + + // sqrt3 is the value of sqrt(3) + // https://www.wolframalpha.com/input/?i=sqrt%283%29 + sqrt3 = 1.7320508075688772935274463415058723669428052538103806280558069794519330169088000370811461867572485756756261414154067030299699450 +) + +const ( + panicNameMismatch = "parameter name mismatch" +) diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/doc.go b/vendor/gonum.org/v1/gonum/stat/distuv/doc.go new file mode 100644 index 0000000000..68aba2d064 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/doc.go @@ -0,0 +1,6 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package distuv provides univariate random distribution types. +package distuv // import "gonum.org/v1/gonum/stat/distuv" diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/exponential.go b/vendor/gonum.org/v1/gonum/stat/distuv/exponential.go new file mode 100644 index 0000000000..3acadb437c --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/exponential.go @@ -0,0 +1,266 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand/v2" + + "gonum.org/v1/gonum/floats" + "gonum.org/v1/gonum/stat" +) + +// Exponential represents the exponential distribution (https://en.wikipedia.org/wiki/Exponential_distribution). +type Exponential struct { + Rate float64 + Src rand.Source +} + +// CDF computes the value of the cumulative density function at x. +func (e Exponential) CDF(x float64) float64 { + if x < 0 { + return 0 + } + return -math.Expm1(-e.Rate * x) +} + +// ConjugateUpdate updates the parameters of the distribution from the sufficient +// statistics of a set of samples. The sufficient statistics, suffStat, have been +// observed with nSamples observations. The prior values of the distribution are those +// currently in the distribution, and have been observed with priorStrength samples. +// +// For the exponential distribution, the sufficient statistic is the inverse of +// the mean of the samples. +// The prior is having seen priorStrength[0] samples with inverse mean Exponential.Rate +// As a result of this function, Exponential.Rate is updated based on the weighted +// samples, and priorStrength is modified to include the new number of samples observed. +// +// This function panics if len(suffStat) != e.NumSuffStat() or +// len(priorStrength) != e.NumSuffStat(). +func (e *Exponential) ConjugateUpdate(suffStat []float64, nSamples float64, priorStrength []float64) { + if len(suffStat) != e.NumSuffStat() { + panic("exponential: incorrect suffStat length") + } + if len(priorStrength) != e.NumSuffStat() { + panic("exponential: incorrect priorStrength length") + } + + totalSamples := nSamples + priorStrength[0] + + totalSum := nSamples / suffStat[0] + if !(priorStrength[0] == 0) { + totalSum += priorStrength[0] / e.Rate + } + e.Rate = totalSamples / totalSum + priorStrength[0] = totalSamples +} + +// Entropy returns the entropy of the distribution. +func (e Exponential) Entropy() float64 { + return 1 - math.Log(e.Rate) +} + +// ExKurtosis returns the excess kurtosis of the distribution. 
+func (Exponential) ExKurtosis() float64 { + return 6 +} + +// Fit sets the parameters of the probability distribution from the +// data samples x with relative weights w. +// If weights is nil, then all the weights are 1. +// If weights is not nil, then the len(weights) must equal len(samples). +func (e *Exponential) Fit(samples, weights []float64) { + suffStat := make([]float64, e.NumSuffStat()) + nSamples := e.SuffStat(suffStat, samples, weights) + e.ConjugateUpdate(suffStat, nSamples, make([]float64, e.NumSuffStat())) +} + +// LogProb computes the natural logarithm of the value of the probability density function at x. +func (e Exponential) LogProb(x float64) float64 { + if x < 0 { + return math.Inf(-1) + } + return math.Log(e.Rate) - e.Rate*x +} + +// Mean returns the mean of the probability distribution. +func (e Exponential) Mean() float64 { + return 1 / e.Rate +} + +// Median returns the median of the probability distribution. +func (e Exponential) Median() float64 { + return math.Ln2 / e.Rate +} + +// Mode returns the mode of the probability distribution. +func (Exponential) Mode() float64 { + return 0 +} + +// NumParameters returns the number of parameters in the distribution. +func (Exponential) NumParameters() int { + return 1 +} + +// NumSuffStat returns the number of sufficient statistics for the distribution. +func (Exponential) NumSuffStat() int { + return 1 +} + +// Prob computes the value of the probability density function at x. +func (e Exponential) Prob(x float64) float64 { + return math.Exp(e.LogProb(x)) +} + +// Quantile returns the inverse of the cumulative probability distribution. +func (e Exponential) Quantile(p float64) float64 { + if p < 0 || p > 1 { + panic(badPercentile) + } + return -math.Log(1-p) / e.Rate +} + +// Rand returns a random sample drawn from the distribution. +func (e Exponential) Rand() float64 { + var rnd float64 + if e.Src == nil { + rnd = rand.ExpFloat64() + } else { + rnd = rand.New(e.Src).ExpFloat64() + } + return rnd / e.Rate +} + +// Score returns the score function with respect to the parameters of the +// distribution at the input location x. The score function is the derivative +// of the log-likelihood at x with respect to the parameters +// +// (∂/∂θ) log(p(x;θ)) +// +// If deriv is non-nil, len(deriv) must equal the number of parameters otherwise +// Score will panic, and the derivative is stored in-place into deriv. If deriv +// is nil a new slice will be allocated and returned. +// +// The order is [∂LogProb / ∂Rate]. +// +// For more information, see https://en.wikipedia.org/wiki/Score_%28statistics%29. +// +// Special cases: +// +// Score(0) = [NaN] +func (e Exponential) Score(deriv []float64, x float64) []float64 { + if deriv == nil { + deriv = make([]float64, e.NumParameters()) + } + if len(deriv) != e.NumParameters() { + panic(badLength) + } + if x > 0 { + deriv[0] = 1/e.Rate - x + return deriv + } + if x < 0 { + deriv[0] = 0 + return deriv + } + deriv[0] = math.NaN() + return deriv +} + +// ScoreInput returns the score function with respect to the input of the +// distribution at the input location specified by x. The score function is the +// derivative of the log-likelihood +// +// (d/dx) log(p(x)) . +// +// Special cases: +// +// ScoreInput(0) = NaN +func (e Exponential) ScoreInput(x float64) float64 { + if x > 0 { + return -e.Rate + } + if x < 0 { + return 0 + } + return math.NaN() +} + +// Skewness returns the skewness of the distribution. 
+func (Exponential) Skewness() float64 {
+	return 2
+}
+
+// StdDev returns the standard deviation of the probability distribution.
+func (e Exponential) StdDev() float64 {
+	return 1 / e.Rate
+}
+
+// SuffStat computes the sufficient statistics of a set of samples to update
+// the distribution. The sufficient statistics are stored in place, and the
+// effective number of samples are returned.
+//
+// The exponential distribution has one sufficient statistic, the average rate
+// of the samples.
+//
+// If weights is nil, the weights are assumed to be 1, otherwise panics if
+// len(samples) != len(weights). Panics if len(suffStat) != NumSuffStat().
+func (Exponential) SuffStat(suffStat, samples, weights []float64) (nSamples float64) {
+	if len(weights) != 0 && len(samples) != len(weights) {
+		panic(badLength)
+	}
+
+	if len(suffStat) != (Exponential{}).NumSuffStat() {
+		panic(badSuffStat)
+	}
+
+	if len(weights) == 0 {
+		nSamples = float64(len(samples))
+	} else {
+		nSamples = floats.Sum(weights)
+	}
+
+	mean := stat.Mean(samples, weights)
+	suffStat[0] = 1 / mean
+	return nSamples
+}
+
+// Survival returns the survival function (complementary CDF) at x.
+func (e Exponential) Survival(x float64) float64 {
+	if x < 0 {
+		return 1
+	}
+	return math.Exp(-e.Rate * x)
+}
+
+// setParameters modifies the parameters of the distribution.
+func (e *Exponential) setParameters(p []Parameter) {
+	if len(p) != e.NumParameters() {
+		panic("exponential: incorrect number of parameters to set")
+	}
+	if p[0].Name != "Rate" {
+		panic("exponential: " + panicNameMismatch)
+	}
+	e.Rate = p[0].Value
+}
+
+// Variance returns the variance of the probability distribution.
+func (e Exponential) Variance() float64 {
+	return 1 / (e.Rate * e.Rate)
+}
+
+// parameters returns the parameters of the distribution.
+func (e Exponential) parameters(p []Parameter) []Parameter {
+	nParam := e.NumParameters()
+	if p == nil {
+		p = make([]Parameter, nParam)
+	} else if len(p) != nParam {
+		panic("exponential: improper parameter length")
+	}
+	p[0].Name = "Rate"
+	p[0].Value = e.Rate
+	return p
+}
diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/f.go b/vendor/gonum.org/v1/gonum/stat/distuv/f.go
new file mode 100644
index 0000000000..299fce4fc3
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/stat/distuv/f.go
@@ -0,0 +1,134 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+import (
+	"math"
+	"math/rand/v2"
+
+	"gonum.org/v1/gonum/mathext"
+)
+
+// F implements the F-distribution, a two-parameter continuous distribution
+// with support over the positive real numbers.
+//
+// The F-distribution has density function
+//
+//	sqrt(((d1*x)^d1) * d2^d2 / ((d1*x+d2)^(d1+d2))) / (x * B(d1/2,d2/2))
+//
+// where B is the beta function.
+//
+// For more information, see https://en.wikipedia.org/wiki/F-distribution.
+type F struct {
+	D1  float64 // Degrees of freedom for the numerator
+	D2  float64 // Degrees of freedom for the denominator
+	Src rand.Source
+}
+
+// CDF computes the value of the cumulative density function at x.
+func (f F) CDF(x float64) float64 {
+	return mathext.RegIncBeta(f.D1/2, f.D2/2, f.D1*x/(f.D1*x+f.D2))
+}
+
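A small sketch of the maximum-likelihood path through the Exponential's SuffStat and ConjugateUpdate above: Fit with an all-zero prior reduces to setting the rate to the inverse sample mean.

	package main

	import (
		"fmt"

		"gonum.org/v1/gonum/stat/distuv"
	)

	func main() {
		// Fit computes suffStat[0] = 1/mean(samples) via SuffStat and
		// applies it with an all-zero prior through ConjugateUpdate.
		e := distuv.Exponential{Rate: 1}
		e.Fit([]float64{0.1, 0.4, 0.2, 0.8, 0.3}, nil)
		fmt.Println(e.Rate) // the mean is 0.36, so this prints ≈ 2.78
	}

+// ExKurtosis returns the excess kurtosis of the distribution.
+//
+// ExKurtosis returns NaN if the D2 parameter is less than or equal to 8.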
+func (f F) ExKurtosis() float64 { + if f.D2 <= 8 { + return math.NaN() + } + return (12 / (f.D2 - 6)) * ((5*f.D2-22)/(f.D2-8) + ((f.D2-4)/f.D1)*((f.D2-2)/(f.D2-8))*((f.D2-2)/(f.D1+f.D2-2))) +} + +// LogProb computes the natural logarithm of the value of the probability +// density function at x. +func (f F) LogProb(x float64) float64 { + return 0.5*(f.D1*math.Log(f.D1*x)+f.D2*math.Log(f.D2)-(f.D1+f.D2)*math.Log(f.D1*x+f.D2)) - math.Log(x) - mathext.Lbeta(f.D1/2, f.D2/2) +} + +// Mean returns the mean of the probability distribution. +// +// Mean returns NaN if the D2 parameter is less than or equal to 2. +func (f F) Mean() float64 { + if f.D2 <= 2 { + return math.NaN() + } + return f.D2 / (f.D2 - 2) +} + +// Mode returns the mode of the distribution. +// +// Mode returns NaN if the D1 parameter is less than or equal to 2. +func (f F) Mode() float64 { + if f.D1 <= 2 { + return math.NaN() + } + return ((f.D1 - 2) / f.D1) * (f.D2 / (f.D2 + 2)) +} + +// NumParameters returns the number of parameters in the distribution. +func (f F) NumParameters() int { + return 2 +} + +// Prob computes the value of the probability density function at x. +func (f F) Prob(x float64) float64 { + return math.Exp(f.LogProb(x)) +} + +// Quantile returns the inverse of the cumulative distribution function. +func (f F) Quantile(p float64) float64 { + if p < 0 || p > 1 { + panic(badPercentile) + } + y := mathext.InvRegIncBeta(0.5*f.D1, 0.5*f.D2, p) + return f.D2 * y / (f.D1 * (1 - y)) +} + +// Rand returns a random sample drawn from the distribution. +func (f F) Rand() float64 { + u1 := ChiSquared{f.D1, f.Src}.Rand() + u2 := ChiSquared{f.D2, f.Src}.Rand() + return (u1 / f.D1) / (u2 / f.D2) +} + +// Skewness returns the skewness of the distribution. +// +// Skewness returns NaN if the D2 parameter is less than or equal to 6. +func (f F) Skewness() float64 { + if f.D2 <= 6 { + return math.NaN() + } + num := (2*f.D1 + f.D2 - 2) * math.Sqrt(8*(f.D2-4)) + den := (f.D2 - 6) * math.Sqrt(f.D1*(f.D1+f.D2-2)) + return num / den +} + +// StdDev returns the standard deviation of the probability distribution. +// +// StdDev returns NaN if the D2 parameter is less than or equal to 4. +func (f F) StdDev() float64 { + if f.D2 <= 4 { + return math.NaN() + } + return math.Sqrt(f.Variance()) +} + +// Survival returns the survival function (complementary CDF) at x. +func (f F) Survival(x float64) float64 { + return 1 - f.CDF(x) +} + +// Variance returns the variance of the probability distribution. +// +// Variance returns NaN if the D2 parameter is less than or equal to 4. +func (f F) Variance() float64 { + if f.D2 <= 4 { + return math.NaN() + } + num := 2 * f.D2 * f.D2 * (f.D1 + f.D2 - 2) + den := f.D1 * (f.D2 - 2) * (f.D2 - 2) * (f.D2 - 4) + return num / den +} diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/gamma.go b/vendor/gonum.org/v1/gonum/stat/distuv/gamma.go new file mode 100644 index 0000000000..739574f5c6 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/gamma.go @@ -0,0 +1,203 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand/v2" + + "gonum.org/v1/gonum/mathext" +) + +// Gamma implements the Gamma distribution, a two-parameter continuous distribution +// with support over the positive real numbers. 
+//
+// The gamma distribution has density function
+//
+//	β^α / Γ(α) x^(α-1)e^(-βx)
+//
+// For more information, see https://en.wikipedia.org/wiki/Gamma_distribution.
+type Gamma struct {
+	// Alpha is the shape parameter of the distribution. Alpha must be greater
+	// than 0. If Alpha == 1, this is equivalent to an exponential distribution.
+	Alpha float64
+	// Beta is the rate parameter of the distribution. Beta must be greater than 0.
+	// If Beta == 0.5, this is equivalent to a Chi-Squared distribution with
+	// 2*Alpha degrees of freedom.
+	Beta float64
+
+	Src rand.Source
+}
+
+// CDF computes the value of the cumulative distribution function at x.
+func (g Gamma) CDF(x float64) float64 {
+	if x < 0 {
+		return 0
+	}
+	return mathext.GammaIncReg(g.Alpha, g.Beta*x)
+}
+
+// ExKurtosis returns the excess kurtosis of the distribution.
+func (g Gamma) ExKurtosis() float64 {
+	return 6 / g.Alpha
+}
+
+// LogProb computes the natural logarithm of the value of the probability
+// density function at x.
+func (g Gamma) LogProb(x float64) float64 {
+	if x < 0 {
+		return math.Inf(-1)
+	}
+	a := g.Alpha
+	b := g.Beta
+	lg, _ := math.Lgamma(a)
+	if a == 1 {
+		return math.Log(b) - lg - b*x
+	}
+	return a*math.Log(b) - lg + (a-1)*math.Log(x) - b*x
+}
+
+// Mean returns the mean of the probability distribution.
+func (g Gamma) Mean() float64 {
+	return g.Alpha / g.Beta
+}
+
+// Mode returns the mode of the gamma distribution.
+//
+// The mode is 0 in the special case where the Alpha (shape) parameter
+// is less than 1.
+func (g Gamma) Mode() float64 {
+	if g.Alpha < 1 {
+		return 0
+	}
+	return (g.Alpha - 1) / g.Beta
+}
+
+// NumParameters returns the number of parameters in the distribution.
+func (Gamma) NumParameters() int {
+	return 2
+}
+
+// Prob computes the value of the probability density function at x.
+func (g Gamma) Prob(x float64) float64 {
+	return math.Exp(g.LogProb(x))
+}
+
+// Quantile returns the inverse of the cumulative distribution function.
+func (g Gamma) Quantile(p float64) float64 {
+	if p < 0 || p > 1 {
+		panic(badPercentile)
+	}
+	return mathext.GammaIncRegInv(g.Alpha, p) / g.Beta
+}
+
+// Rand returns a random sample drawn from the distribution.
+//
+// Rand panics if either alpha or beta is <= 0.
+func (g Gamma) Rand() float64 {
+	const (
+		// The 0.2 threshold is from https://www4.stat.ncsu.edu/~rmartin/Codes/rgamss.R
+		// described in detail in https://arxiv.org/abs/1302.1884.
+		smallAlphaThresh = 0.2
+	)
+	if g.Beta <= 0 {
+		panic("gamma: beta <= 0")
+	}
+
+	unifrnd := rand.Float64
+	exprnd := rand.ExpFloat64
+	normrnd := rand.NormFloat64
+	if g.Src != nil {
+		rnd := rand.New(g.Src)
+		unifrnd = rnd.Float64
+		exprnd = rnd.ExpFloat64
+		normrnd = rnd.NormFloat64
+	}
+
+	a := g.Alpha
+	b := g.Beta
+	switch {
+	case a <= 0:
+		panic("gamma: alpha <= 0")
+	case a == 1:
+		// Generate from exponential
+		return exprnd() / b
+	case a < smallAlphaThresh:
+		// Generate using
+		// Liu, Chuanhai, Martin, Ryan and Syring, Nick. "Simulating from a
+		// gamma distribution with small shape parameter"
+		// https://arxiv.org/abs/1302.1884
+		// published version: http://link.springer.com/article/10.1007/s00180-016-0692-0
+
+		// Algorithm adjusted to work in log space as much as possible.
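+		// A rough reading of the loop below, per the references above:
+		// z is drawn from a two-piece proposal (a shifted Exp(1) draw
+		// for z >= 0, a negative exponential with rate lambda for z < 0);
+		// lh is the log of the target density of z up to a constant and
+		// lEta the log of the proposal piece, so the draw is accepted
+		// when lh-lEta beats -Exp(1), i.e. with probability exp(lh-lEta).
+		// The accepted z maps back to the gamma variate via exp(-z/a)/b.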
+ lambda := 1/a - 1 + lr := -math.Log1p(1 / lambda / math.E) + for { + e := exprnd() + var z float64 + if e >= -lr { + z = e + lr + } else { + z = -exprnd() / lambda + } + eza := math.Exp(-z / a) + lh := -z - eza + var lEta float64 + if z >= 0 { + lEta = -z + } else { + lEta = -1 + lambda*z + } + if lh-lEta > -exprnd() { + return eza / b + } + } + case a >= smallAlphaThresh: + // Generate using: + // Marsaglia, George, and Wai Wan Tsang. "A simple method for generating + // gamma variables." ACM Transactions on Mathematical Software (TOMS) + // 26.3 (2000): 363-372. + d := a - 1.0/3 + m := 1.0 + if a < 1 { + d += 1.0 + m = math.Pow(unifrnd(), 1/a) + } + c := 1 / (3 * math.Sqrt(d)) + for { + x := normrnd() + v := 1 + x*c + if v <= 0.0 { + continue + } + v = v * v * v + u := unifrnd() + if u < 1.0-0.0331*(x*x)*(x*x) { + return m * d * v / b + } + if math.Log(u) < 0.5*x*x+d*(1-v+math.Log(v)) { + return m * d * v / b + } + } + } + panic("unreachable") +} + +// Survival returns the survival function (complementary CDF) at x. +func (g Gamma) Survival(x float64) float64 { + if x < 0 { + return 1 + } + return mathext.GammaIncRegComp(g.Alpha, g.Beta*x) +} + +// StdDev returns the standard deviation of the probability distribution. +func (g Gamma) StdDev() float64 { + return math.Sqrt(g.Alpha) / g.Beta +} + +// Variance returns the variance of the probability distribution. +func (g Gamma) Variance() float64 { + return g.Alpha / g.Beta / g.Beta +} diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/general.go b/vendor/gonum.org/v1/gonum/stat/distuv/general.go new file mode 100644 index 0000000000..5b78991943 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/general.go @@ -0,0 +1,24 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +// Parameter represents a parameter of a probability distribution +type Parameter struct { + Name string + Value float64 +} + +const ( + badPercentile = "distuv: percentile out of bounds" + badLength = "distuv: slice length mismatch" + badSuffStat = "distuv: wrong suffStat length" + errNoSamples = "distuv: must have at least one sample" +) + +const ( + expNegOneHalf = 0.6065306597126334236037995349911804534419 // https://oeis.org/A092605 + eulerMascheroni = 0.5772156649015328606065120900824024310421 // https://oeis.org/A001620 + apery = 1.2020569031595942853997381615114499907649 // https://oeis.org/A002117 +) diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/gumbel.go b/vendor/gonum.org/v1/gonum/stat/distuv/gumbel.go new file mode 100644 index 0000000000..7017dd4e1b --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/gumbel.go @@ -0,0 +1,118 @@ +// Copyright ©2018 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand/v2" +) + +// GumbelRight implements the right-skewed Gumbel distribution, a two-parameter +// continuous distribution with support over the real numbers. The right-skewed +// Gumbel distribution is also sometimes known as the Extreme Value distribution. +// +// The right-skewed Gumbel distribution has density function +// +// 1/beta * exp(-(z + exp(-z))) +// z = (x - mu)/beta +// +// Beta must be greater than 0. +// +// For more information, see https://en.wikipedia.org/wiki/Gumbel_distribution. 
+type GumbelRight struct {
+	Mu   float64
+	Beta float64
+	Src  rand.Source
+}
+
+func (g GumbelRight) z(x float64) float64 {
+	return (x - g.Mu) / g.Beta
+}
+
+// CDF computes the value of the cumulative density function at x.
+func (g GumbelRight) CDF(x float64) float64 {
+	z := g.z(x)
+	return math.Exp(-math.Exp(-z))
+}
+
+// Entropy returns the differential entropy of the distribution.
+func (g GumbelRight) Entropy() float64 {
+	return math.Log(g.Beta) + eulerMascheroni + 1
+}
+
+// ExKurtosis returns the excess kurtosis of the distribution.
+func (g GumbelRight) ExKurtosis() float64 {
+	return 12.0 / 5
+}
+
+// LogProb computes the natural logarithm of the value of the probability density function at x.
+func (g GumbelRight) LogProb(x float64) float64 {
+	z := g.z(x)
+	return -math.Log(g.Beta) - z - math.Exp(-z)
+}
+
+// Mean returns the mean of the probability distribution.
+func (g GumbelRight) Mean() float64 {
+	return g.Mu + g.Beta*eulerMascheroni
+}
+
+// Median returns the median of the Gumbel distribution.
+func (g GumbelRight) Median() float64 {
+	return g.Mu - g.Beta*math.Log(math.Ln2)
+}
+
+// Mode returns the mode of the Gumbel distribution.
+func (g GumbelRight) Mode() float64 {
+	return g.Mu
+}
+
+// NumParameters returns the number of parameters in the distribution.
+func (GumbelRight) NumParameters() int {
+	return 2
+}
+
+// Prob computes the value of the probability density function at x.
+func (g GumbelRight) Prob(x float64) float64 {
+	return math.Exp(g.LogProb(x))
+}
+
+// Quantile returns the inverse of the cumulative probability distribution.
+func (g GumbelRight) Quantile(p float64) float64 {
+	if p < 0 || 1 < p {
+		panic(badPercentile)
+	}
+	return g.Mu - g.Beta*math.Log(-math.Log(p))
+}
+
+// Rand returns a random sample drawn from the distribution.
+func (g GumbelRight) Rand() float64 {
+	var rnd float64
+	if g.Src == nil {
+		rnd = rand.ExpFloat64()
+	} else {
+		rnd = rand.New(g.Src).ExpFloat64()
+	}
+	return g.Mu - g.Beta*math.Log(rnd)
+}
+
+// Skewness returns the skewness of the distribution.
+func (GumbelRight) Skewness() float64 {
+	return 12 * math.Sqrt(6) * apery / (math.Pi * math.Pi * math.Pi)
+}
+
+// StdDev returns the standard deviation of the probability distribution.
+func (g GumbelRight) StdDev() float64 {
+	return (math.Pi / math.Sqrt(6)) * g.Beta
+}
+
+// Survival returns the survival function (complementary CDF) at x.
+func (g GumbelRight) Survival(x float64) float64 {
+	return 1 - g.CDF(x)
+}
+
+// Variance returns the variance of the probability distribution.
+func (g GumbelRight) Variance() float64 {
+	return math.Pi * math.Pi * g.Beta * g.Beta / 6
+}
diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/interfaces.go b/vendor/gonum.org/v1/gonum/stat/distuv/interfaces.go
new file mode 100644
index 0000000000..a3cc94bb27
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/stat/distuv/interfaces.go
@@ -0,0 +1,32 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+// LogProber wraps the LogProb method.
+type LogProber interface {
+	// LogProb returns the natural logarithm of the
+	// value of the probability density or probability
+	// mass function at x.
+	LogProb(x float64) float64
+}
+
+// Rander wraps the Rand method.
+type Rander interface {
+	// Rand returns a random sample drawn from the distribution.
+	Rand() float64
+}
+
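A quick sketch of how these small interfaces decouple callers from concrete distributions, using the GumbelRight type above; the quantile/CDF round trip doubles as a sanity check:

	package main

	import (
		"fmt"

		"gonum.org/v1/gonum/stat/distuv"
	)

	func main() {
		g := distuv.GumbelRight{Mu: 2, Beta: 1.5}

		// Any distribution in the package can be passed around behind
		// the Rander/LogProber interfaces.
		var r distuv.Rander = g
		fmt.Println(r.Rand())

		// The closed-form quantile inverts the CDF exactly.
		fmt.Println(g.Quantile(g.CDF(3.7))) // ≈ 3.7
	}

+// RandLogProber is the interface that groups the Rander and LogProber methods.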
+type RandLogProber interface { + Rander + LogProber +} + +// Quantiler wraps the Quantile method. +type Quantiler interface { + // Quantile returns the minimum value of x from amongst + // all those values whose CDF value exceeds or equals p. + Quantile(p float64) float64 +} diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/inversegamma.go b/vendor/gonum.org/v1/gonum/stat/distuv/inversegamma.go new file mode 100644 index 0000000000..44fe5e6ce9 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/inversegamma.go @@ -0,0 +1,123 @@ +// Copyright ©2018 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand/v2" + + "gonum.org/v1/gonum/mathext" +) + +// InverseGamma implements the inverse gamma distribution, a two-parameter +// continuous distribution with support over the positive real numbers. The +// inverse gamma distribution is the same as the distribution of the reciprocal +// of a gamma distributed random variable. +// +// The inverse gamma distribution has density function +// +// β^α / Γ(α) x^(-α-1)e^(-β/x) +// +// For more information, see https://en.wikipedia.org/wiki/Inverse-gamma_distribution +type InverseGamma struct { + // Alpha is the shape parameter of the distribution. Alpha must be greater than 0. + Alpha float64 + // Beta is the scale parameter of the distribution. Beta must be greater than 0. + Beta float64 + + Src rand.Source +} + +// CDF computes the value of the cumulative distribution function at x. +func (g InverseGamma) CDF(x float64) float64 { + if x < 0 { + return 0 + } + // TODO(btracey): Replace this with a direct call to the upper regularized + // gamma function if mathext gets it. + //return 1 - mathext.GammaInc(g.Alpha, g.Beta/x) + return mathext.GammaIncRegComp(g.Alpha, g.Beta/x) +} + +// ExKurtosis returns the excess kurtosis of the distribution. +func (g InverseGamma) ExKurtosis() float64 { + if g.Alpha <= 4 { + return math.Inf(1) + } + return (30*g.Alpha - 66) / (g.Alpha - 3) / (g.Alpha - 4) +} + +// LogProb computes the natural logarithm of the value of the probability +// density function at x. +func (g InverseGamma) LogProb(x float64) float64 { + if x <= 0 { + return math.Inf(-1) + } + a := g.Alpha + b := g.Beta + lg, _ := math.Lgamma(a) + return a*math.Log(b) - lg + (-a-1)*math.Log(x) - b/x +} + +// Mean returns the mean of the probability distribution. +func (g InverseGamma) Mean() float64 { + if g.Alpha <= 1 { + return math.Inf(1) + } + return g.Beta / (g.Alpha - 1) +} + +// Mode returns the mode of the distribution. +func (g InverseGamma) Mode() float64 { + return g.Beta / (g.Alpha + 1) +} + +// NumParameters returns the number of parameters in the distribution. +func (InverseGamma) NumParameters() int { + return 2 +} + +// Prob computes the value of the probability density function at x. +func (g InverseGamma) Prob(x float64) float64 { + return math.Exp(g.LogProb(x)) +} + +// Quantile returns the inverse of the cumulative distribution function. +func (g InverseGamma) Quantile(p float64) float64 { + if p < 0 || 1 < p { + panic(badPercentile) + } + return (1 / (mathext.GammaIncRegCompInv(g.Alpha, p))) * g.Beta +} + +// Rand returns a random sample drawn from the distribution. +// +// Rand panics if either alpha or beta is <= 0. +func (g InverseGamma) Rand() float64 { + // TODO(btracey): See if there is a more direct way to sample. 
+ return 1 / Gamma(g).Rand() +} + +// Survival returns the survival function (complementary CDF) at x. +func (g InverseGamma) Survival(x float64) float64 { + if x < 0 { + return 1 + } + return mathext.GammaIncReg(g.Alpha, g.Beta/x) +} + +// StdDev returns the standard deviation of the probability distribution. +func (g InverseGamma) StdDev() float64 { + return math.Sqrt(g.Variance()) +} + +// Variance returns the variance of the probability distribution. +func (g InverseGamma) Variance() float64 { + if g.Alpha <= 2 { + return math.Inf(1) + } + v := g.Beta / (g.Alpha - 1) + return v * v / (g.Alpha - 2) +} diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/laplace.go b/vendor/gonum.org/v1/gonum/stat/distuv/laplace.go new file mode 100644 index 0000000000..36d965512e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/laplace.go @@ -0,0 +1,267 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand/v2" + "sort" + + "gonum.org/v1/gonum/stat" +) + +// Laplace represents the Laplace distribution (https://en.wikipedia.org/wiki/Laplace_distribution). +type Laplace struct { + Mu float64 // Mean of the Laplace distribution + Scale float64 // Scale of the Laplace distribution + Src rand.Source +} + +// CDF computes the value of the cumulative density function at x. +func (l Laplace) CDF(x float64) float64 { + if x < l.Mu { + return 0.5 * math.Exp((x-l.Mu)/l.Scale) + } + return 1 - 0.5*math.Exp(-(x-l.Mu)/l.Scale) +} + +// Entropy returns the entropy of the distribution. +func (l Laplace) Entropy() float64 { + return 1 + math.Log(2*l.Scale) +} + +// ExKurtosis returns the excess kurtosis of the distribution. +func (l Laplace) ExKurtosis() float64 { + return 3 +} + +// Fit sets the parameters of the probability distribution from the +// data samples x with relative weights w. +// If weights is nil, then all the weights are 1. +// If weights is not nil, then the len(weights) must equal len(samples). +// +// Note: Laplace distribution has no FitPrior because it has no sufficient +// statistics. 
+func (l *Laplace) Fit(samples, weights []float64) {
+	if weights != nil && len(samples) != len(weights) {
+		panic(badLength)
+	}
+
+	if len(samples) == 0 {
+		panic(errNoSamples)
+	}
+	if len(samples) == 1 {
+		l.Mu = samples[0]
+		l.Scale = 0
+		return
+	}
+
+	var (
+		sortedSamples []float64
+		sortedWeights []float64
+	)
+	if sort.Float64sAreSorted(samples) {
+		sortedSamples = samples
+		sortedWeights = weights
+	} else {
+		// Need to copy the inputs so they aren't affected by the sorting.
+		sortedSamples = make([]float64, len(samples))
+		copy(sortedSamples, samples)
+		if weights == nil {
+			sort.Float64s(sortedSamples)
+		} else {
+			sortedWeights = make([]float64, len(samples))
+			copy(sortedWeights, weights)
+			stat.SortWeighted(sortedSamples, sortedWeights)
+		}
+	}
+
+	// The (weighted) median of the samples is the maximum likelihood estimate
+	// of the mean parameter.
+	// TODO: Rethink quantile type when stat has more options
+	l.Mu = stat.Quantile(0.5, stat.Empirical, sortedSamples, sortedWeights)
+
+	// The scale parameter is the average absolute distance
+	// between the samples and the mean.
+	var absError float64
+	var sumWeights float64
+	if weights != nil {
+		for i, v := range samples {
+			absError += weights[i] * math.Abs(l.Mu-v)
+			sumWeights += weights[i]
+		}
+		l.Scale = absError / sumWeights
+	} else {
+		for _, v := range samples {
+			absError += math.Abs(l.Mu - v)
+		}
+		l.Scale = absError / float64(len(samples))
+	}
+}
+
+// LogProb computes the natural logarithm of the value of the probability density
+// function at x.
+func (l Laplace) LogProb(x float64) float64 {
+	return -math.Ln2 - math.Log(l.Scale) - math.Abs(x-l.Mu)/l.Scale
+}
+
+// parameters returns the parameters of the distribution.
+func (l Laplace) parameters(p []Parameter) []Parameter {
+	nParam := l.NumParameters()
+	if p == nil {
+		p = make([]Parameter, nParam)
+	} else if len(p) != nParam {
+		panic(badLength)
+	}
+	p[0].Name = "Mu"
+	p[0].Value = l.Mu
+	p[1].Name = "Scale"
+	p[1].Value = l.Scale
+	return p
+}
+
+// Mean returns the mean of the probability distribution.
+func (l Laplace) Mean() float64 {
+	return l.Mu
+}
+
+// Median returns the median of the Laplace distribution.
+func (l Laplace) Median() float64 {
+	return l.Mu
+}
+
+// Mode returns the mode of the Laplace distribution.
+func (l Laplace) Mode() float64 {
+	return l.Mu
+}
+
+// NumParameters returns the number of parameters in the distribution.
+func (l Laplace) NumParameters() int {
+	return 2
+}
+
+// Quantile returns the inverse of the cumulative probability distribution.
+func (l Laplace) Quantile(p float64) float64 {
+	if p < 0 || p > 1 {
+		panic(badPercentile)
+	}
+	if p < 0.5 {
+		return l.Mu + l.Scale*math.Log(1+2*(p-0.5))
+	}
+	return l.Mu - l.Scale*math.Log(1-2*(p-0.5))
+}
+
+// Prob computes the value of the probability density function at x.
+func (l Laplace) Prob(x float64) float64 {
+	return math.Exp(l.LogProb(x))
+}
+
+// Rand returns a random sample drawn from the distribution.
+func (l Laplace) Rand() float64 {
+	var rnd float64
+	if l.Src == nil {
+		rnd = rand.Float64()
+	} else {
+		rnd = rand.New(l.Src).Float64()
+	}
+	u := rnd - 0.5
+	if u < 0 {
+		return l.Mu + l.Scale*math.Log(1+2*u)
+	}
+	return l.Mu - l.Scale*math.Log(1-2*u)
+}
+
+// Score returns the score function with respect to the parameters of the
+// distribution at the input location x.
The score function is the derivative +// of the log-likelihood at x with respect to the parameters +// +// (∂/∂θ) log(p(x;θ)) +// +// If deriv is non-nil, len(deriv) must equal the number of parameters otherwise +// Score will panic, and the derivative is stored in-place into deriv. If deriv +// is nil a new slice will be allocated and returned. +// +// The order is [∂LogProb / ∂Mu, ∂LogProb / ∂Scale]. +// +// For more information, see https://en.wikipedia.org/wiki/Score_%28statistics%29. +// +// Special cases: +// +// Score(l.Mu) = [NaN, -1/l.Scale] +func (l Laplace) Score(deriv []float64, x float64) []float64 { + if deriv == nil { + deriv = make([]float64, l.NumParameters()) + } + if len(deriv) != l.NumParameters() { + panic(badLength) + } + diff := x - l.Mu + if diff > 0 { + deriv[0] = 1 / l.Scale + } else if diff < 0 { + deriv[0] = -1 / l.Scale + } else { + // must be NaN + deriv[0] = math.NaN() + } + + deriv[1] = math.Abs(diff)/(l.Scale*l.Scale) - 1/l.Scale + return deriv +} + +// ScoreInput returns the score function with respect to the input of the +// distribution at the input location specified by x. The score function is the +// derivative of the log-likelihood +// +// (d/dx) log(p(x)) . +// +// Special cases: +// +// ScoreInput(l.Mu) = NaN +func (l Laplace) ScoreInput(x float64) float64 { + diff := x - l.Mu + if diff == 0 { + return math.NaN() + } + if diff > 0 { + return -1 / l.Scale + } + return 1 / l.Scale +} + +// Skewness returns the skewness of the distribution. +func (Laplace) Skewness() float64 { + return 0 +} + +// StdDev returns the standard deviation of the distribution. +func (l Laplace) StdDev() float64 { + return math.Sqrt2 * l.Scale +} + +// Survival returns the survival function (complementary CDF) at x. +func (l Laplace) Survival(x float64) float64 { + if x < l.Mu { + return 1 - 0.5*math.Exp((x-l.Mu)/l.Scale) + } + return 0.5 * math.Exp(-(x-l.Mu)/l.Scale) +} + +// setParameters modifies the parameters of the distribution. +func (l *Laplace) setParameters(p []Parameter) { + if len(p) != l.NumParameters() { + panic(badLength) + } + if p[0].Name != "Mu" { + panic("laplace: " + panicNameMismatch) + } + if p[1].Name != "Scale" { + panic("laplace: " + panicNameMismatch) + } + l.Mu = p[0].Value + l.Scale = p[1].Value +} + +// Variance returns the variance of the probability distribution. +func (l Laplace) Variance() float64 { + return 2 * l.Scale * l.Scale +} diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/logistic.go b/vendor/gonum.org/v1/gonum/stat/distuv/logistic.go new file mode 100644 index 0000000000..0392d6ccc1 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/logistic.go @@ -0,0 +1,98 @@ +// Copyright ©2021 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" +) + +// Logistic implements the Logistic distribution, a two-parameter distribution with support on the real axis. +// Its cumulative distribution function is the logistic function. +// +// General form of probability density function for Logistic distribution is +// +// E(x) / (s * (1 + E(x))^2) +// where E(x) = exp(-(x-μ)/s) +// +// For more information, see https://en.wikipedia.org/wiki/Logistic_distribution. +type Logistic struct { + Mu float64 // Mean value + S float64 // Scale parameter proportional to standard deviation +} + +// CDF computes the value of the cumulative density function at x. 
+func (l Logistic) CDF(x float64) float64 {
+	return 1 / (1 + math.Exp(-(x-l.Mu)/l.S))
+}
+
+// ExKurtosis returns the excess kurtosis of the distribution.
+func (l Logistic) ExKurtosis() float64 {
+	return 6.0 / 5.0
+}
+
+// LogProb computes the natural logarithm of the value of the probability
+// density function at x.
+func (l Logistic) LogProb(x float64) float64 {
+	z := (x - l.Mu) / l.S
+	return -z - math.Log(l.S) - 2*math.Log(1+math.Exp(-z))
+}
+
+// Mean returns the mean of the probability distribution.
+func (l Logistic) Mean() float64 {
+	return l.Mu
+}
+
+// Mode returns the mode of the distribution.
+//
+// It is the same as the Mean for the Logistic distribution.
+func (l Logistic) Mode() float64 {
+	return l.Mu
+}
+
+// Median returns the median of the distribution.
+//
+// It is the same as the Mean for the Logistic distribution.
+func (l Logistic) Median() float64 {
+	return l.Mu
+}
+
+// NumParameters returns the number of parameters in the distribution.
+//
+// Always returns 2.
+func (l Logistic) NumParameters() int {
+	return 2
+}
+
+// Prob computes the value of the probability density function at x.
+func (l Logistic) Prob(x float64) float64 {
+	E := math.Exp(-(x - l.Mu) / l.S)
+	return E / (l.S * math.Pow(1+E, 2))
+}
+
+// Quantile returns the inverse of the cumulative distribution function.
+func (l Logistic) Quantile(p float64) float64 {
+	return l.Mu + l.S*math.Log(p/(1-p))
+}
+
+// Skewness returns the skewness of the distribution.
+//
+// Always 0 for Logistic distribution.
+func (l Logistic) Skewness() float64 {
+	return 0
+}
+
+// StdDev returns the standard deviation of the probability distribution.
+func (l Logistic) StdDev() float64 {
+	return l.S * math.Pi / sqrt3
+}
+
+// Survival returns the survival function (complementary CDF) at x.
+func (l Logistic) Survival(x float64) float64 {
+	return 1 - l.CDF(x)
+}
+
+// Variance returns the variance of the probability distribution.
+func (l Logistic) Variance() float64 {
+	return l.S * l.S * math.Pi * math.Pi / 3
+}
diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/lognormal.go b/vendor/gonum.org/v1/gonum/stat/distuv/lognormal.go
new file mode 100644
index 0000000000..321c1bd30e
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/stat/distuv/lognormal.go
@@ -0,0 +1,113 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+import (
+	"math"
+	"math/rand/v2"
+)
+
+// LogNormal represents a random variable whose log is normally distributed.
+// The probability density function is given by
+//
+//	1/(x σ √2π) exp(-(ln(x)-μ)^2/(2σ^2))
+type LogNormal struct {
+	Mu    float64
+	Sigma float64
+	Src   rand.Source
+}
+
+// CDF computes the value of the cumulative density function at x.
+func (l LogNormal) CDF(x float64) float64 {
+	return 0.5 * math.Erfc(-(math.Log(x)-l.Mu)/(math.Sqrt2*l.Sigma))
+}
+
+// Entropy returns the differential entropy of the distribution.
+func (l LogNormal) Entropy() float64 {
+	return 0.5 + 0.5*math.Log(2*math.Pi*l.Sigma*l.Sigma) + l.Mu
+}
+
+// ExKurtosis returns the excess kurtosis of the distribution.
+func (l LogNormal) ExKurtosis() float64 {
+	s2 := l.Sigma * l.Sigma
+	return math.Exp(4*s2) + 2*math.Exp(3*s2) + 3*math.Exp(2*s2) - 6
+}
+
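A consistency sketch for the Logistic density above: LogProb and Prob must agree for any parameterization, not only the standard one (Mu = 0, S = 1).

	package main

	import (
		"fmt"
		"math"

		"gonum.org/v1/gonum/stat/distuv"
	)

	func main() {
		l := distuv.Logistic{Mu: -1, S: 2}
		for _, x := range []float64{-3, 0.5, 4} {
			// Both printed values should match to within rounding.
			fmt.Println(l.LogProb(x), math.Log(l.Prob(x)))
		}
	}

+// LogProb computes the natural logarithm of the value of the probability density function at x.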
+func (l LogNormal) LogProb(x float64) float64 { + if x < 0 { + return math.Inf(-1) + } + logx := math.Log(x) + normdiff := (logx - l.Mu) / l.Sigma + return -0.5*normdiff*normdiff - logx - math.Log(l.Sigma) - logRoot2Pi +} + +// Mean returns the mean of the probability distribution. +func (l LogNormal) Mean() float64 { + return math.Exp(l.Mu + 0.5*l.Sigma*l.Sigma) +} + +// Median returns the median of the probability distribution. +func (l LogNormal) Median() float64 { + return math.Exp(l.Mu) +} + +// Mode returns the mode of the probability distribution. +func (l LogNormal) Mode() float64 { + return math.Exp(l.Mu - l.Sigma*l.Sigma) +} + +// NumParameters returns the number of parameters in the distribution. +func (LogNormal) NumParameters() int { + return 2 +} + +// Prob computes the value of the probability density function at x. +func (l LogNormal) Prob(x float64) float64 { + return math.Exp(l.LogProb(x)) +} + +// Quantile returns the inverse of the cumulative probability distribution. +func (l LogNormal) Quantile(p float64) float64 { + if p < 0 || p > 1 { + panic(badPercentile) + } + // Formula from http://www.math.uah.edu/stat/special/LogNormal.html. + return math.Exp(l.Mu + l.Sigma*UnitNormal.Quantile(p)) +} + +// Rand returns a random sample drawn from the distribution. +func (l LogNormal) Rand() float64 { + var rnd float64 + if l.Src == nil { + rnd = rand.NormFloat64() + } else { + rnd = rand.New(l.Src).NormFloat64() + } + return math.Exp(rnd*l.Sigma + l.Mu) +} + +// Skewness returns the skewness of the distribution. +func (l LogNormal) Skewness() float64 { + s2 := l.Sigma * l.Sigma + return (math.Exp(s2) + 2) * math.Sqrt(math.Exp(s2)-1) +} + +// StdDev returns the standard deviation of the probability distribution. +func (l LogNormal) StdDev() float64 { + return math.Sqrt(l.Variance()) +} + +// Survival returns the survival function (complementary CDF) at x. +func (l LogNormal) Survival(x float64) float64 { + return 0.5 * (1 - math.Erf((math.Log(x)-l.Mu)/(math.Sqrt2*l.Sigma))) +} + +// Variance returns the variance of the probability distribution. +func (l LogNormal) Variance() float64 { + s2 := l.Sigma * l.Sigma + return (math.Exp(s2) - 1) * math.Exp(2*l.Mu+s2) +} diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/norm.go b/vendor/gonum.org/v1/gonum/stat/distuv/norm.go new file mode 100644 index 0000000000..324b70eccd --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/norm.go @@ -0,0 +1,263 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand/v2" + + "gonum.org/v1/gonum/floats" + "gonum.org/v1/gonum/mathext" + "gonum.org/v1/gonum/stat" +) + +// UnitNormal is an instantiation of the normal distribution with Mu = 0 and Sigma = 1. +var UnitNormal = Normal{Mu: 0, Sigma: 1} + +// Normal represents a normal (Gaussian) distribution (https://en.wikipedia.org/wiki/Normal_distribution). +type Normal struct { + Mu float64 // Mean of the normal distribution + Sigma float64 // Standard deviation of the normal distribution + Src rand.Source + + // Needs to be Mu and Sigma and not Mean and StdDev because Normal has functions + // Mean and StdDev +} + +// CDF computes the value of the cumulative density function at x. 
+func (n Normal) CDF(x float64) float64 { + return 0.5 * math.Erfc(-(x-n.Mu)/(n.Sigma*math.Sqrt2)) +} + +// ConjugateUpdate updates the parameters of the distribution from the sufficient +// statistics of a set of samples. The sufficient statistics, suffStat, have been +// observed with nSamples observations. The prior values of the distribution are those +// currently in the distribution, and have been observed with priorStrength samples. +// +// For the normal distribution, the sufficient statistics are the mean and +// uncorrected standard deviation of the samples. +// The prior is having seen strength[0] samples with mean Normal.Mu +// and strength[1] samples with standard deviation Normal.Sigma. As a result of +// this function, Normal.Mu and Normal.Sigma are updated based on the weighted +// samples, and strength is modified to include the new number of samples observed. +// +// This function panics if len(suffStat) != n.NumSuffStat() or +// len(priorStrength) != n.NumSuffStat(). +func (n *Normal) ConjugateUpdate(suffStat []float64, nSamples float64, priorStrength []float64) { + // TODO: Support prior strength with math.Inf(1) to allow updating with + // a known mean/standard deviation + if len(suffStat) != n.NumSuffStat() { + panic("norm: incorrect suffStat length") + } + if len(priorStrength) != n.NumSuffStat() { + panic("norm: incorrect priorStrength length") + } + + totalMeanSamples := nSamples + priorStrength[0] + totalSum := suffStat[0]*nSamples + n.Mu*priorStrength[0] + + totalVarianceSamples := nSamples + priorStrength[1] + // sample variance + totalVariance := nSamples * suffStat[1] * suffStat[1] + // add prior variance + totalVariance += priorStrength[1] * n.Sigma * n.Sigma + // add cross variance from the difference of the means + meanDiff := (suffStat[0] - n.Mu) + totalVariance += priorStrength[0] * nSamples * meanDiff * meanDiff / totalMeanSamples + + n.Mu = totalSum / totalMeanSamples + n.Sigma = math.Sqrt(totalVariance / totalVarianceSamples) + floats.AddConst(nSamples, priorStrength) +} + +// Entropy returns the differential entropy of the distribution. +func (n Normal) Entropy() float64 { + return 0.5 * (log2Pi + 1 + 2*math.Log(n.Sigma)) +} + +// ExKurtosis returns the excess kurtosis of the distribution. +func (Normal) ExKurtosis() float64 { + return 0 +} + +// Fit sets the parameters of the probability distribution from the +// data samples x with relative weights w. If weights is nil, then all the weights +// are 1. If weights is not nil, then the len(weights) must equal len(samples). +func (n *Normal) Fit(samples, weights []float64) { + suffStat := make([]float64, n.NumSuffStat()) + nSamples := n.SuffStat(suffStat, samples, weights) + n.ConjugateUpdate(suffStat, nSamples, make([]float64, n.NumSuffStat())) +} + +// LogProb computes the natural logarithm of the value of the probability density function at x. +func (n Normal) LogProb(x float64) float64 { + return negLogRoot2Pi - math.Log(n.Sigma) - (x-n.Mu)*(x-n.Mu)/(2*n.Sigma*n.Sigma) +} + +// Mean returns the mean of the probability distribution. +func (n Normal) Mean() float64 { + return n.Mu +} + +// Median returns the median of the normal distribution. +func (n Normal) Median() float64 { + return n.Mu +} + +// Mode returns the mode of the normal distribution. +func (n Normal) Mode() float64 { + return n.Mu +} + +// NumParameters returns the number of parameters in the distribution. 
+func (Normal) NumParameters() int { + return 2 +} + +// NumSuffStat returns the number of sufficient statistics for the distribution. +func (Normal) NumSuffStat() int { + return 2 +} + +// Prob computes the value of the probability density function at x. +func (n Normal) Prob(x float64) float64 { + return math.Exp(n.LogProb(x)) +} + +// Quantile returns the inverse of the cumulative probability distribution. +func (n Normal) Quantile(p float64) float64 { + if p < 0 || p > 1 { + panic(badPercentile) + } + return n.Mu + n.Sigma*mathext.NormalQuantile(p) +} + +// Rand returns a random sample drawn from the distribution. +func (n Normal) Rand() float64 { + var rnd float64 + if n.Src == nil { + rnd = rand.NormFloat64() + } else { + rnd = rand.New(n.Src).NormFloat64() + } + return rnd*n.Sigma + n.Mu +} + +// Score returns the score function with respect to the parameters of the +// distribution at the input location x. The score function is the derivative +// of the log-likelihood at x with respect to the parameters +// +// (∂/∂θ) log(p(x;θ)) +// +// If deriv is non-nil, len(deriv) must equal the number of parameters otherwise +// Score will panic, and the derivative is stored in-place into deriv. If deriv +// is nil a new slice will be allocated and returned. +// +// The order is [∂LogProb / ∂Mu, ∂LogProb / ∂Sigma]. +// +// For more information, see https://en.wikipedia.org/wiki/Score_%28statistics%29. +func (n Normal) Score(deriv []float64, x float64) []float64 { + if deriv == nil { + deriv = make([]float64, n.NumParameters()) + } + if len(deriv) != n.NumParameters() { + panic(badLength) + } + deriv[0] = (x - n.Mu) / (n.Sigma * n.Sigma) + deriv[1] = 1 / n.Sigma * (-1 + ((x-n.Mu)/n.Sigma)*((x-n.Mu)/n.Sigma)) + return deriv +} + +// ScoreInput returns the score function with respect to the input of the +// distribution at the input location specified by x. The score function is the +// derivative of the log-likelihood +// +// (d/dx) log(p(x)) . +func (n Normal) ScoreInput(x float64) float64 { + return -(1 / (2 * n.Sigma * n.Sigma)) * 2 * (x - n.Mu) +} + +// Skewness returns the skewness of the distribution. +func (Normal) Skewness() float64 { + return 0 +} + +// StdDev returns the standard deviation of the probability distribution. +func (n Normal) StdDev() float64 { + return n.Sigma +} + +// SuffStat computes the sufficient statistics of a set of samples to update +// the distribution. The sufficient statistics are stored in place, and the +// effective number of samples are returned. +// +// The normal distribution has two sufficient statistics, the mean of the samples +// and the standard deviation of the samples. +// +// If weights is nil, the weights are assumed to be 1, otherwise panics if +// len(samples) != len(weights). Panics if len(suffStat) != NumSuffStat(). +func (Normal) SuffStat(suffStat, samples, weights []float64) (nSamples float64) { + lenSamp := len(samples) + if len(weights) != 0 && len(samples) != len(weights) { + panic(badLength) + } + if len(suffStat) != (Normal{}).NumSuffStat() { + panic(badSuffStat) + } + + if len(weights) == 0 { + nSamples = float64(lenSamp) + } else { + nSamples = floats.Sum(weights) + } + + mean := stat.Mean(samples, weights) + suffStat[0] = mean + + // Use Moment and not StdDev because we want it to be uncorrected + variance := stat.MomentAbout(2, samples, mean, weights) + suffStat[1] = math.Sqrt(variance) + return nSamples +} + +// Survival returns the survival function (complementary CDF) at x. 
+func (n Normal) Survival(x float64) float64 {
+	return 0.5 * (1 - math.Erf((x-n.Mu)/(n.Sigma*math.Sqrt2)))
+}
+
+// setParameters modifies the parameters of the distribution.
+func (n *Normal) setParameters(p []Parameter) {
+	if len(p) != n.NumParameters() {
+		panic("normal: incorrect number of parameters to set")
+	}
+	if p[0].Name != "Mu" {
+		panic("normal: " + panicNameMismatch)
+	}
+	if p[1].Name != "Sigma" {
+		panic("normal: " + panicNameMismatch)
+	}
+	n.Mu = p[0].Value
+	n.Sigma = p[1].Value
+}
+
+// Variance returns the variance of the probability distribution.
+func (n Normal) Variance() float64 {
+	return n.Sigma * n.Sigma
+}
+
+// parameters returns the parameters of the distribution.
+func (n Normal) parameters(p []Parameter) []Parameter {
+	nParam := n.NumParameters()
+	if p == nil {
+		p = make([]Parameter, nParam)
+	} else if len(p) != nParam {
+		panic("normal: improper parameter length")
+	}
+	p[0].Name = "Mu"
+	p[0].Value = n.Mu
+	p[1].Name = "Sigma"
+	p[1].Value = n.Sigma
+	return p
+}
diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/pareto.go b/vendor/gonum.org/v1/gonum/stat/distuv/pareto.go
new file mode 100644
index 0000000000..6ec751eaf2
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/stat/distuv/pareto.go
@@ -0,0 +1,130 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+import (
+	"math"
+	"math/rand/v2"
+)
+
+// Pareto implements the Pareto (Type I) distribution, a two-parameter distribution
+// with support above the scale parameter.
+//
+// The density function is given by
+//
+//	(α x_m^{α})/(x^{α+1}) for x >= x_m.
+//
+// For more information, see https://en.wikipedia.org/wiki/Pareto_distribution.
+type Pareto struct {
+	// Xm is the scale parameter.
+	// Xm must be greater than 0.
+	Xm float64
+
+	// Alpha is the shape parameter.
+	// Alpha must be greater than 0.
+	Alpha float64
+
+	Src rand.Source
+}
+
+// CDF computes the value of the cumulative density function at x.
+func (p Pareto) CDF(x float64) float64 {
+	if x < p.Xm {
+		return 0
+	}
+	return -math.Expm1(p.Alpha * math.Log(p.Xm/x))
+}
+
+// Entropy returns the differential entropy of the distribution.
+func (p Pareto) Entropy() float64 {
+	return math.Log(p.Xm) - math.Log(p.Alpha) + (1 + 1/p.Alpha)
+}
+
+// ExKurtosis returns the excess kurtosis of the distribution.
+func (p Pareto) ExKurtosis() float64 {
+	if p.Alpha <= 4 {
+		return math.NaN()
+	}
+	return 6 * (p.Alpha*p.Alpha*p.Alpha + p.Alpha*p.Alpha - 6*p.Alpha - 2) / (p.Alpha * (p.Alpha - 3) * (p.Alpha - 4))
+}
+
+// LogProb computes the natural logarithm of the value of the probability
+// density function at x.
+func (p Pareto) LogProb(x float64) float64 {
+	if x < p.Xm {
+		return math.Inf(-1)
+	}
+	return math.Log(p.Alpha) + p.Alpha*math.Log(p.Xm) - (p.Alpha+1)*math.Log(x)
+}
+
+// Mean returns the mean of the probability distribution.
+func (p Pareto) Mean() float64 {
+	if p.Alpha <= 1 {
+		return math.Inf(1)
+	}
+	return p.Alpha * p.Xm / (p.Alpha - 1)
+}
+
+// Median returns the median of the Pareto distribution.
+func (p Pareto) Median() float64 {
+	return p.Quantile(0.5)
+}
+
+// Mode returns the mode of the distribution.
+func (p Pareto) Mode() float64 {
+	return p.Xm
+}
+
+// NumParameters returns the number of parameters in the distribution.
+func (p Pareto) NumParameters() int {
+	return 2
+}
+
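Pareto.Rand below is inverse-transform sampling in disguise: with E ~ Exp(1), substituting E = -log(1-U) turns Xm*exp(E/Alpha) into the quantile function Xm/(1-U)^(1/Alpha). A short sketch:

	package main

	import (
		"fmt"
		"math"

		"gonum.org/v1/gonum/stat/distuv"
	)

	func main() {
		p := distuv.Pareto{Xm: 1, Alpha: 3}
		u := 0.9
		fmt.Println(p.Quantile(u))                            // 10^(1/3) ≈ 2.154
		fmt.Println(p.Xm * math.Exp(-math.Log1p(-u)/p.Alpha)) // same value
	}

+// Prob computes the value of the probability density function at x.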
+func (p Pareto) Prob(x float64) float64 {
+	return math.Exp(p.LogProb(x))
+}
+
+// Quantile returns the inverse of the cumulative probability distribution.
+func (p Pareto) Quantile(prob float64) float64 {
+	if prob < 0 || 1 < prob {
+		panic(badPercentile)
+	}
+	return p.Xm / math.Pow(1-prob, 1/p.Alpha)
+}
+
+// Rand returns a random sample drawn from the distribution.
+func (p Pareto) Rand() float64 {
+	var rnd float64
+	if p.Src == nil {
+		rnd = rand.ExpFloat64()
+	} else {
+		rnd = rand.New(p.Src).ExpFloat64()
+	}
+	return p.Xm * math.Exp(rnd/p.Alpha)
+}
+
+// StdDev returns the standard deviation of the probability distribution.
+func (p Pareto) StdDev() float64 {
+	return math.Sqrt(p.Variance())
+}
+
+// Survival returns the survival function (complementary CDF) at x.
+func (p Pareto) Survival(x float64) float64 {
+	if x < p.Xm {
+		return 1
+	}
+	return math.Pow(p.Xm/x, p.Alpha)
+}
+
+// Variance returns the variance of the probability distribution.
+func (p Pareto) Variance() float64 {
+	if p.Alpha <= 2 {
+		return math.Inf(1)
+	}
+	am1 := p.Alpha - 1
+	return p.Xm * p.Xm * p.Alpha / (am1 * am1 * (p.Alpha - 2))
+}
diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/poisson.go b/vendor/gonum.org/v1/gonum/stat/distuv/poisson.go
new file mode 100644
index 0000000000..fcc8968ca7
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/stat/distuv/poisson.go
@@ -0,0 +1,144 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+import (
+	"math"
+	"math/rand/v2"
+
+	"gonum.org/v1/gonum/mathext"
+)
+
+// Poisson implements the Poisson distribution, a discrete probability distribution
+// that expresses the probability of a given number of events occurring in a fixed
+// interval.
+// The Poisson distribution has density function:
+//
+//	f(k) = λ^k * e^(-λ) / k!
+//
+// For more information, see https://en.wikipedia.org/wiki/Poisson_distribution.
+type Poisson struct {
+	// Lambda is the average number of events in an interval.
+	// Lambda must be greater than 0.
+	Lambda float64
+
+	Src rand.Source
+}
+
+// CDF computes the value of the cumulative distribution function at x.
+func (p Poisson) CDF(x float64) float64 {
+	if x < 0 {
+		return 0
+	}
+	return mathext.GammaIncRegComp(math.Floor(x+1), p.Lambda)
+}
+
+// ExKurtosis returns the excess kurtosis of the distribution.
+func (p Poisson) ExKurtosis() float64 {
+	return 1 / p.Lambda
+}
+
+// LogProb computes the natural logarithm of the value of the probability
+// density function at x.
+func (p Poisson) LogProb(x float64) float64 {
+	if x < 0 || math.Floor(x) != x {
+		return math.Inf(-1)
+	}
+	lg, _ := math.Lgamma(math.Floor(x) + 1)
+	return x*math.Log(p.Lambda) - p.Lambda - lg
+}
+
+// Mean returns the mean of the probability distribution.
+func (p Poisson) Mean() float64 {
+	return p.Lambda
+}
+
+// NumParameters returns the number of parameters in the distribution.
+func (Poisson) NumParameters() int {
+	return 1
+}
+
+// Prob computes the value of the probability density function at x.
+func (p Poisson) Prob(x float64) float64 {
+	return math.Exp(p.LogProb(x))
+}
+
+// Rand returns a random sample drawn from the distribution.
+func (p Poisson) Rand() float64 {
+	// NUMERICAL RECIPES IN C: THE ART OF SCIENTIFIC COMPUTING (ISBN 0-521-43108-5)
+	// p. 294
294 + // + + rnd := rand.ExpFloat64 + var rng *rand.Rand + if p.Src != nil { + rng = rand.New(p.Src) + rnd = rng.ExpFloat64 + } + + if p.Lambda < 10.0 { + // Use direct method. + var em float64 + t := 0.0 + for { + t += rnd() + if t >= p.Lambda { + break + } + em++ + } + return em + } + // Generate using: + // W. Hörmann. "The transformed rejection method for generating Poisson + // random variables." Insurance: Mathematics and Economics + // 12.1 (1993): 39-45. + + // Algorithm PTRS + rnd = rand.Float64 + if rng != nil { + rnd = rng.Float64 + } + b := 0.931 + 2.53*math.Sqrt(p.Lambda) + a := -0.059 + 0.02483*b + invalpha := 1.1239 + 1.1328/(b-3.4) + vr := 0.9277 - 3.6224/(b-2) + for { + U := rnd() - 0.5 + V := rnd() + us := 0.5 - math.Abs(U) + k := math.Floor((2*a/us+b)*U + p.Lambda + 0.43) + if us >= 0.07 && V <= vr { + return k + } + if k <= 0 || (us < 0.013 && V > us) { + continue + } + lg, _ := math.Lgamma(k + 1) + if math.Log(V*invalpha/(a/(us*us)+b)) <= k*math.Log(p.Lambda)-p.Lambda-lg { + return k + } + } +} + +// Skewness returns the skewness of the distribution. +func (p Poisson) Skewness() float64 { + return 1 / math.Sqrt(p.Lambda) +} + +// StdDev returns the standard deviation of the probability distribution. +func (p Poisson) StdDev() float64 { + return math.Sqrt(p.Variance()) +} + +// Survival returns the survival function (complementary CDF) at x. +func (p Poisson) Survival(x float64) float64 { + return 1 - p.CDF(x) +} + +// Variance returns the variance of the probability distribution. +func (p Poisson) Variance() float64 { + return p.Lambda +} diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/statdist.go b/vendor/gonum.org/v1/gonum/stat/distuv/statdist.go new file mode 100644 index 0000000000..bf333db1be --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/statdist.go @@ -0,0 +1,142 @@ +// Copyright ©2018 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + + "gonum.org/v1/gonum/mathext" +) + +// Bhattacharyya is a type for computing the Bhattacharyya distance between +// probability distributions. +// +// The Bhattacharyya distance is defined as +// +// D_B = -ln(BC(l,r)) +// BC = \int_-∞^∞ (p(x)q(x))^(1/2) dx +// +// Where BC is known as the Bhattacharyya coefficient. +// The Bhattacharyya distance is related to the Hellinger distance by +// +// H(l,r) = sqrt(1-BC(l,r)) +// +// For more information, see +// +// https://en.wikipedia.org/wiki/Bhattacharyya_distance +type Bhattacharyya struct{} + +// DistBeta returns the Bhattacharyya distance between Beta distributions l and r. +// For Beta distributions, the Bhattacharyya distance is given by +// +// -ln(B((α_l + α_r)/2, (β_l + β_r)/2) / (B(α_l,β_l), B(α_r,β_r))) +// +// Where B is the Beta function. +func (Bhattacharyya) DistBeta(l, r Beta) float64 { + // Reference: https://en.wikipedia.org/wiki/Hellinger_distance#Examples + return -mathext.Lbeta((l.Alpha+r.Alpha)/2, (l.Beta+r.Beta)/2) + + 0.5*mathext.Lbeta(l.Alpha, l.Beta) + 0.5*mathext.Lbeta(r.Alpha, r.Beta) +} + +// DistNormal returns the Bhattacharyya distance Normal distributions l and r. 
+// For Normal distributions, the Bhattacharyya distance is given by
+//
+//	s = (σ_l^2 + σ_r^2)/2
+//	D_B = 1/8 (μ_l-μ_r)^2/s + 1/2 ln(s/(σ_l*σ_r))
+func (Bhattacharyya) DistNormal(l, r Normal) float64 {
+	// Reference: https://en.wikipedia.org/wiki/Bhattacharyya_distance
+	m := l.Mu - r.Mu
+	s := (l.Sigma*l.Sigma + r.Sigma*r.Sigma) / 2
+	return 0.125*m*m/s + 0.5*math.Log(s) - 0.5*math.Log(l.Sigma) - 0.5*math.Log(r.Sigma)
+}
+
+// Hellinger is a type for computing the Hellinger distance between probability
+// distributions.
+//
+// The Hellinger distance is defined as
+//
+//	H^2(l,r) = 1/2 * \int_x (\sqrt(l(x)) - \sqrt(r(x)))^2 dx
+//
+// and is bounded between 0 and 1. Note the above formula defines the squared
+// Hellinger distance, while this returns the Hellinger distance itself.
+// The Hellinger distance is related to the Bhattacharyya distance by
+//
+//	H^2 = 1 - exp(-D_B)
+//
+// For more information, see
+//
+//	https://en.wikipedia.org/wiki/Hellinger_distance
+type Hellinger struct{}
+
+// DistBeta computes the Hellinger distance between Beta distributions l and r.
+// See the documentation of Bhattacharyya.DistBeta for the distance formula.
+func (Hellinger) DistBeta(l, r Beta) float64 {
+	db := Bhattacharyya{}.DistBeta(l, r)
+	return math.Sqrt(-math.Expm1(-db))
+}
+
+// DistNormal computes the Hellinger distance between Normal distributions l and r.
+// See the documentation of Bhattacharyya.DistNormal for the distance formula.
+func (Hellinger) DistNormal(l, r Normal) float64 {
+	db := Bhattacharyya{}.DistNormal(l, r)
+	return math.Sqrt(-math.Expm1(-db))
+}
+
+// KullbackLeibler is a type for computing the Kullback-Leibler divergence from l to r.
+//
+// The Kullback-Leibler divergence is defined as
+//
+//	D_KL(l || r) = \int_x l(x) log(l(x)/r(x)) dx
+//
+// Note that the Kullback-Leibler divergence is not symmetric with respect to
+// the order of the input arguments.
+type KullbackLeibler struct{}
+
+// DistBeta returns the Kullback-Leibler divergence between Beta distributions
+// l and r.
+//
+// For two Beta distributions, the KL divergence is computed as
+//
+//	D_KL(l || r) = log Γ(α_l+β_l) - log Γ(α_l) - log Γ(β_l)
+//	   - log Γ(α_r+β_r) + log Γ(α_r) + log Γ(β_r)
+//	   + (α_l-α_r)(ψ(α_l)-ψ(α_l+β_l)) + (β_l-β_r)(ψ(β_l)-ψ(α_l+β_l))
+//
+// Where Γ is the gamma function and ψ is the digamma function.
+func (KullbackLeibler) DistBeta(l, r Beta) float64 {
+	// http://bariskurt.com/kullback-leibler-divergence-between-two-dirichlet-and-beta-distributions/
+	if l.Alpha <= 0 || l.Beta <= 0 {
+		panic("distuv: bad parameters for left distribution")
+	}
+	if r.Alpha <= 0 || r.Beta <= 0 {
+		panic("distuv: bad parameters for right distribution")
+	}
+	lab := l.Alpha + l.Beta
+	l1, _ := math.Lgamma(lab)
+	l2, _ := math.Lgamma(l.Alpha)
+	l3, _ := math.Lgamma(l.Beta)
+	lt := l1 - l2 - l3
+
+	r1, _ := math.Lgamma(r.Alpha + r.Beta)
+	r2, _ := math.Lgamma(r.Alpha)
+	r3, _ := math.Lgamma(r.Beta)
+	rt := r1 - r2 - r3
+
+	d0 := mathext.Digamma(l.Alpha + l.Beta)
+	ct := (l.Alpha-r.Alpha)*(mathext.Digamma(l.Alpha)-d0) + (l.Beta-r.Beta)*(mathext.Digamma(l.Beta)-d0)
+
+	return lt - rt + ct
+}
+
+// DistNormal returns the Kullback-Leibler divergence between Normal distributions
+// l and r.
+//
+// For two Normal distributions, the KL divergence is computed as
+//
+//	D_KL(l || r) = log(σ_r / σ_l) + (σ_l^2 + (μ_l-μ_r)^2)/(2 * σ_r^2) - 0.5
+func (KullbackLeibler) DistNormal(l, r Normal) float64 {
+	d := l.Mu - r.Mu
+	v := (l.Sigma*l.Sigma + d*d) / (2 * r.Sigma * r.Sigma)
+	return math.Log(r.Sigma) - math.Log(l.Sigma) + v - 0.5
+}
diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/studentst.go b/vendor/gonum.org/v1/gonum/stat/distuv/studentst.go
new file mode 100644
index 0000000000..3bbb1526be
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/stat/distuv/studentst.go
@@ -0,0 +1,161 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+import (
+	"math"
+	"math/rand/v2"
+
+	"gonum.org/v1/gonum/mathext"
+)
+
+const logPi = 1.1447298858494001741 // http://oeis.org/A053510
+
+// StudentsT implements the three-parameter Student's T distribution, a distribution
+// over the real numbers.
+//
+// The Student's T distribution has density function
+//
+//	Γ((ν+1)/2) / (sqrt(νπ) Γ(ν/2) σ) (1 + 1/ν * ((x-μ)/σ)^2)^(-(ν+1)/2)
+//
+// The Student's T distribution approaches the normal distribution as ν → ∞.
+//
+// For more information, see https://en.wikipedia.org/wiki/Student%27s_t-distribution,
+// specifically https://en.wikipedia.org/wiki/Student%27s_t-distribution#Non-standardized_Student.27s_t-distribution .
+//
+// The standard Student's T distribution has Mu = 0 and Sigma = 1.
+type StudentsT struct {
+	// Mu is the location parameter of the distribution, and the mean of the
+	// distribution.
+	Mu float64
+
+	// Sigma is the scale parameter of the distribution. It is related to the
+	// standard deviation by std = Sigma * sqrt(Nu/(Nu-2)).
+	Sigma float64
+
+	// Nu is the shape parameter of the distribution, representing the degrees
+	// of freedom, and is one less than the number of observations from a
+	// Normal distribution.
+	Nu float64
+
+	Src rand.Source
+}
+
+// CDF computes the value of the cumulative distribution function at x.
+func (s StudentsT) CDF(x float64) float64 {
+	// Transform to the standardized variable.
+	y := (x - s.Mu) / s.Sigma
+	if y == 0 {
+		return 0.5
+	}
+	// For y > 0
+	//	F(y) = 1 - 0.5 * I_t(y)(nu/2, 1/2)
+	//	t(y) = nu/(y^2 + nu)
+	// and 1 - F(y) for y < 0.
+	t := s.Nu / (y*y + s.Nu)
+	if y > 0 {
+		return 1 - 0.5*mathext.RegIncBeta(0.5*s.Nu, 0.5, t)
+	}
+	return 0.5 * mathext.RegIncBeta(s.Nu/2, 0.5, t)
+}
+
+// LogProb computes the natural logarithm of the value of the probability
+// density function at x.
+func (s StudentsT) LogProb(x float64) float64 {
+	g1, _ := math.Lgamma((s.Nu + 1) / 2)
+	g2, _ := math.Lgamma(s.Nu / 2)
+	z := (x - s.Mu) / s.Sigma
+	return g1 - g2 - 0.5*math.Log(s.Nu) - 0.5*logPi - math.Log(s.Sigma) - ((s.Nu+1)/2)*math.Log(1+z*z/s.Nu)
+}
+
+// Mean returns the mean of the probability distribution.
+func (s StudentsT) Mean() float64 {
+	return s.Mu
+}
+
+// Mode returns the mode of the distribution.
+func (s StudentsT) Mode() float64 {
+	return s.Mu
+}
+
+// NumParameters returns the number of parameters in the distribution.
+func (StudentsT) NumParameters() int {
+	return 3
+}
+
+// Prob computes the value of the probability density function at x.
+func (s StudentsT) Prob(x float64) float64 {
+	return math.Exp(s.LogProb(x))
+}
+
+// Quantile returns the inverse of the cumulative distribution function.
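+// Quantile panics if p is not in [0, 1]. Quantile(0.5) returns Mu exactly.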
+func (s StudentsT) Quantile(p float64) float64 {
+	if p < 0 || p > 1 {
+		panic(badPercentile)
+	}
+	// F(x) = 1 - 0.5 * I_t(x)(nu/2, 1/2)
+	// t(x) = nu/(x^2 + nu)
+	if p == 0.5 {
+		return s.Mu
+	}
+	var y float64
+	if p > 0.5 {
+		// Know t > 0
+		t := mathext.InvRegIncBeta(s.Nu/2, 0.5, 2*(1-p))
+		y = math.Sqrt(s.Nu * (1 - t) / t)
+	} else {
+		t := mathext.InvRegIncBeta(s.Nu/2, 0.5, 2*p)
+		y = -math.Sqrt(s.Nu * (1 - t) / t)
+	}
+	// Transform back from the standardized variable.
+	return y*s.Sigma + s.Mu
+}
+
+// Rand returns a random sample drawn from the distribution.
+func (s StudentsT) Rand() float64 {
+	// http://www.math.uah.edu/stat/special/Student.html
+	n := Normal{0, 1, s.Src}.Rand()
+	c := Gamma{s.Nu / 2, 0.5, s.Src}.Rand()
+	z := n / math.Sqrt(c/s.Nu)
+	return z*s.Sigma + s.Mu
+}
+
+// StdDev returns the standard deviation of the probability distribution.
+//
+// The standard deviation is undefined for ν <= 1 (math.NaN() is returned)
+// and infinite for 1 < ν <= 2 (math.Inf(1) is returned).
+func (s StudentsT) StdDev() float64 {
+	return math.Sqrt(s.Variance())
+}
+
+// Survival returns the survival function (complementary CDF) at x.
+func (s StudentsT) Survival(x float64) float64 {
+	// Transform to the standardized variable.
+	y := (x - s.Mu) / s.Sigma
+	if y == 0 {
+		return 0.5
+	}
+	// For y > 0
+	//	F(y) = 1 - 0.5 * I_t(y)(nu/2, 1/2)
+	//	t(y) = nu/(y^2 + nu)
+	// and 1 - F(y) for y < 0.
+	t := s.Nu / (y*y + s.Nu)
+	if y > 0 {
+		return 0.5 * mathext.RegIncBeta(s.Nu/2, 0.5, t)
+	}
+	return 1 - 0.5*mathext.RegIncBeta(s.Nu/2, 0.5, t)
+}
+
+// Variance returns the variance of the probability distribution.
+//
+// The variance is undefined for ν <= 1 (math.NaN() is returned) and infinite
+// for 1 < ν <= 2 (math.Inf(1) is returned).
+func (s StudentsT) Variance() float64 {
+	if s.Nu <= 1 {
+		return math.NaN()
+	}
+	if s.Nu <= 2 {
+		return math.Inf(1)
+	}
+	return s.Sigma * s.Sigma * s.Nu / (s.Nu - 2)
+}
diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/triangle.go b/vendor/gonum.org/v1/gonum/stat/distuv/triangle.go
new file mode 100644
index 0000000000..20240bbe4a
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/stat/distuv/triangle.go
@@ -0,0 +1,278 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+import (
+	"math"
+	"math/rand/v2"
+)
+
+// Triangle represents a triangle distribution (https://en.wikipedia.org/wiki/Triangular_distribution).
+type Triangle struct {
+	a, b, c float64
+	src     rand.Source
+}
+
+// NewTriangle constructs a new triangle distribution with lower limit a, upper limit b, and mode c.
+// Constraints are a < b and a ≤ c ≤ b.
+// This distribution is uncommon in nature, but may be useful for simulation.
+func NewTriangle(a, b, c float64, src rand.Source) Triangle {
+	checkTriangleParameters(a, b, c)
+	return Triangle{a: a, b: b, c: c, src: src}
+}
+
+func checkTriangleParameters(a, b, c float64) {
+	if a >= b {
+		panic("triangle: constraint of a < b violated")
+	}
+	if a > c {
+		panic("triangle: constraint of a <= c violated")
+	}
+	if c > b {
+		panic("triangle: constraint of c <= b violated")
+	}
+}
+
+// CDF computes the value of the cumulative density function at x.
+func (t Triangle) CDF(x float64) float64 {
+	switch {
+	case x <= t.a:
+		return 0
+	case x <= t.c:
+		d := x - t.a
+		return (d * d) / ((t.b - t.a) * (t.c - t.a))
+	case x < t.b:
+		d := t.b - x
+		return 1 - (d*d)/((t.b-t.a)*(t.b-t.c))
+	default:
+		return 1
+	}
+}
+
+// Entropy returns the entropy of the distribution.
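+// For the triangle distribution the entropy evaluates to 1/2 + ln((b-a)/2).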
+func (t Triangle) Entropy() float64 {
+	return 0.5 + math.Log(t.b-t.a) - math.Ln2
+}
+
+// ExKurtosis returns the excess kurtosis of the distribution.
+func (Triangle) ExKurtosis() float64 {
+	return -3.0 / 5.0
+}
+
+// Fit is not appropriate for Triangle, because the distribution is generally used when there is little data.
+
+// LogProb computes the natural logarithm of the value of the probability density function at x.
+func (t Triangle) LogProb(x float64) float64 {
+	return math.Log(t.Prob(x))
+}
+
+// Mean returns the mean of the probability distribution.
+func (t Triangle) Mean() float64 {
+	return (t.a + t.b + t.c) / 3
+}
+
+// Median returns the median of the probability distribution.
+func (t Triangle) Median() float64 {
+	if t.c >= (t.a+t.b)/2 {
+		return t.a + math.Sqrt((t.b-t.a)*(t.c-t.a)/2)
+	}
+	return t.b - math.Sqrt((t.b-t.a)*(t.b-t.c)/2)
+}
+
+// Mode returns the mode of the probability distribution.
+func (t Triangle) Mode() float64 {
+	return t.c
+}
+
+// NumParameters returns the number of parameters in the distribution.
+func (Triangle) NumParameters() int {
+	return 3
+}
+
+// Prob computes the value of the probability density function at x.
+func (t Triangle) Prob(x float64) float64 {
+	switch {
+	case x < t.a:
+		return 0
+	case x < t.c:
+		return 2 * (x - t.a) / ((t.b - t.a) * (t.c - t.a))
+	case x == t.c:
+		return 2 / (t.b - t.a)
+	case x <= t.b:
+		return 2 * (t.b - x) / ((t.b - t.a) * (t.b - t.c))
+	default:
+		return 0
+	}
+}
+
+// Quantile returns the inverse of the cumulative probability distribution.
+func (t Triangle) Quantile(p float64) float64 {
+	if p < 0 || p > 1 {
+		panic(badPercentile)
+	}
+
+	f := (t.c - t.a) / (t.b - t.a)
+
+	if p < f {
+		return t.a + math.Sqrt(p*(t.b-t.a)*(t.c-t.a))
+	}
+	return t.b - math.Sqrt((1-p)*(t.b-t.a)*(t.b-t.c))
+}
+
+// Rand returns a random sample drawn from the distribution.
+func (t Triangle) Rand() float64 {
+	var rnd float64
+	if t.src == nil {
+		rnd = rand.Float64()
+	} else {
+		rnd = rand.New(t.src).Float64()
+	}
+
+	return t.Quantile(rnd)
+}
+
+// Score returns the score function with respect to the parameters of the
+// distribution at the input location x. The score function is the derivative
+// of the log-likelihood at x with respect to the parameters
+//
+//	(∂/∂θ) log(p(x;θ))
+//
+// If deriv is non-nil, len(deriv) must equal the number of parameters otherwise
+// Score will panic, and the derivative is stored in-place into deriv. If deriv
+// is nil a new slice will be allocated and returned.
+//
+// The order is [∂LogProb / ∂a, ∂LogProb / ∂b, ∂LogProb / ∂c].
+//
+// For more information, see https://en.wikipedia.org/wiki/Score_%28statistics%29.
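+//
+// Score returns NaN for all three derivatives if x is outside [a, b]. The
+// individual derivatives are NaN at x == a, x == b, and x == c, and when the
+// mode c coincides with either endpoint.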
+func (t Triangle) Score(deriv []float64, x float64) []float64 { + if deriv == nil { + deriv = make([]float64, t.NumParameters()) + } + if len(deriv) != t.NumParameters() { + panic(badLength) + } + if (x < t.a) || (x > t.b) { + deriv[0] = math.NaN() + deriv[1] = math.NaN() + deriv[2] = math.NaN() + } else { + invBA := 1 / (t.b - t.a) + invCA := 1 / (t.c - t.a) + invBC := 1 / (t.b - t.c) + switch { + case x < t.c: + deriv[0] = -1/(x-t.a) + invBA + invCA + deriv[1] = -invBA + deriv[2] = -invCA + case x > t.c: + deriv[0] = invBA + deriv[1] = 1/(t.b-x) - invBA - invBC + deriv[2] = invBC + default: + deriv[0] = invBA + deriv[1] = -invBA + deriv[2] = 0 + } + switch { + case x == t.a: + deriv[0] = math.NaN() + case x == t.b: + deriv[1] = math.NaN() + case x == t.c: + deriv[2] = math.NaN() + } + switch { + case t.a == t.c: + deriv[0] = math.NaN() + deriv[2] = math.NaN() + case t.b == t.c: + deriv[1] = math.NaN() + deriv[2] = math.NaN() + } + } + return deriv +} + +// ScoreInput returns the score function with respect to the input of the +// distribution at the input location specified by x. The score function is the +// derivative of the log-likelihood +// +// (d/dx) log(p(x)) . +// +// Special cases (c is the mode of the distribution): +// +// ScoreInput(c) = NaN +// ScoreInput(x) = NaN for x not in (a, b) +func (t Triangle) ScoreInput(x float64) float64 { + if (x <= t.a) || (x >= t.b) || (x == t.c) { + return math.NaN() + } + if x < t.c { + return 1 / (x - t.a) + } + return 1 / (x - t.b) +} + +// Skewness returns the skewness of the distribution. +func (t Triangle) Skewness() float64 { + n := math.Sqrt2 * (t.a + t.b - 2*t.c) * (2*t.a - t.b - t.c) * (t.a - 2*t.b + t.c) + d := 5 * math.Pow(t.a*t.a+t.b*t.b+t.c*t.c-t.a*t.b-t.a*t.c-t.b*t.c, 3.0/2.0) + + return n / d +} + +// StdDev returns the standard deviation of the probability distribution. +func (t Triangle) StdDev() float64 { + return math.Sqrt(t.Variance()) +} + +// Survival returns the survival function (complementary CDF) at x. +func (t Triangle) Survival(x float64) float64 { + return 1 - t.CDF(x) +} + +// parameters returns the parameters of the distribution. +func (t Triangle) parameters(p []Parameter) []Parameter { + nParam := t.NumParameters() + if p == nil { + p = make([]Parameter, nParam) + } else if len(p) != nParam { + panic("triangle: improper parameter length") + } + p[0].Name = "A" + p[0].Value = t.a + p[1].Name = "B" + p[1].Value = t.b + p[2].Name = "C" + p[2].Value = t.c + return p +} + +// setParameters modifies the parameters of the distribution. +func (t *Triangle) setParameters(p []Parameter) { + if len(p) != t.NumParameters() { + panic("triangle: incorrect number of parameters to set") + } + if p[0].Name != "A" { + panic("triangle: " + panicNameMismatch) + } + if p[1].Name != "B" { + panic("triangle: " + panicNameMismatch) + } + if p[2].Name != "C" { + panic("triangle: " + panicNameMismatch) + } + + checkTriangleParameters(p[0].Value, p[1].Value, p[2].Value) + + t.a = p[0].Value + t.b = p[1].Value + t.c = p[2].Value +} + +// Variance returns the variance of the probability distribution. +func (t Triangle) Variance() float64 { + return (t.a*t.a + t.b*t.b + t.c*t.c - t.a*t.b - t.a*t.c - t.b*t.c) / 18 +} diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/uniform.go b/vendor/gonum.org/v1/gonum/stat/distuv/uniform.go new file mode 100644 index 0000000000..3f555e335a --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/uniform.go @@ -0,0 +1,210 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+import (
+	"math"
+	"math/rand/v2"
+)
+
+// UnitUniform is an instantiation of the uniform distribution with Min = 0
+// and Max = 1.
+var UnitUniform = Uniform{Min: 0, Max: 1}
+
+// Uniform represents a continuous uniform distribution (https://en.wikipedia.org/wiki/Uniform_distribution_%28continuous%29).
+type Uniform struct {
+	Min float64
+	Max float64
+	Src rand.Source
+}
+
+// CDF computes the value of the cumulative density function at x.
+func (u Uniform) CDF(x float64) float64 {
+	if x < u.Min {
+		return 0
+	}
+	if x > u.Max {
+		return 1
+	}
+	return (x - u.Min) / (u.Max - u.Min)
+}
+
+// Uniform doesn't have any of the DLogProbD? methods because the derivative
+// is 0 everywhere except where it's undefined.
+
+// Entropy returns the entropy of the distribution.
+func (u Uniform) Entropy() float64 {
+	return math.Log(u.Max - u.Min)
+}
+
+// ExKurtosis returns the excess kurtosis of the distribution.
+func (Uniform) ExKurtosis() float64 {
+	return -6.0 / 5.0
+}
+
+// Uniform doesn't have Fit because it's a bad idea to fit a uniform from data.
+
+// LogProb computes the natural logarithm of the value of the probability density function at x.
+func (u Uniform) LogProb(x float64) float64 {
+	if x < u.Min {
+		return math.Inf(-1)
+	}
+	if x > u.Max {
+		return math.Inf(-1)
+	}
+	return -math.Log(u.Max - u.Min)
+}
+
+// parameters returns the parameters of the distribution.
+func (u Uniform) parameters(p []Parameter) []Parameter {
+	nParam := u.NumParameters()
+	if p == nil {
+		p = make([]Parameter, nParam)
+	} else if len(p) != nParam {
+		panic("uniform: improper parameter length")
+	}
+	p[0].Name = "Min"
+	p[0].Value = u.Min
+	p[1].Name = "Max"
+	p[1].Value = u.Max
+	return p
+}
+
+// Mean returns the mean of the probability distribution.
+func (u Uniform) Mean() float64 {
+	return (u.Max + u.Min) / 2
+}
+
+// Median returns the median of the probability distribution.
+func (u Uniform) Median() float64 {
+	return (u.Max + u.Min) / 2
+}
+
+// Uniform doesn't have a mode because every value in [Min, Max] is equally likely.
+
+// NumParameters returns the number of parameters in the distribution.
+func (Uniform) NumParameters() int {
+	return 2
+}
+
+// Prob computes the value of the probability density function at x.
+func (u Uniform) Prob(x float64) float64 {
+	if x < u.Min {
+		return 0
+	}
+	if x > u.Max {
+		return 0
+	}
+	return 1 / (u.Max - u.Min)
+}
+
+// Quantile returns the inverse of the cumulative probability distribution.
+func (u Uniform) Quantile(p float64) float64 {
+	if p < 0 || p > 1 {
+		panic(badPercentile)
+	}
+	return p*(u.Max-u.Min) + u.Min
+}
+
+// Rand returns a random sample drawn from the distribution.
+func (u Uniform) Rand() float64 {
+	var rnd float64
+	if u.Src == nil {
+		rnd = rand.Float64()
+	} else {
+		rnd = rand.New(u.Src).Float64()
+	}
+	return rnd*(u.Max-u.Min) + u.Min
+}
+
+// Score returns the score function with respect to the parameters of the
+// distribution at the input location x. The score function is the derivative
+// of the log-likelihood at x with respect to the parameters
+//
+//	(∂/∂θ) log(p(x;θ))
+//
+// If deriv is non-nil, len(deriv) must equal the number of parameters otherwise
+// Score will panic, and the derivative is stored in-place into deriv. If deriv
+// is nil a new slice will be allocated and returned.
+//
+// The order is [∂LogProb / ∂Min, ∂LogProb / ∂Max].
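+//
+// Score returns NaN for both derivatives if x is outside [Min, Max], for the
+// first derivative at x == Min, and for the second derivative at x == Max.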
+// +// For more information, see https://en.wikipedia.org/wiki/Score_%28statistics%29. +func (u Uniform) Score(deriv []float64, x float64) []float64 { + if deriv == nil { + deriv = make([]float64, u.NumParameters()) + } + if len(deriv) != u.NumParameters() { + panic(badLength) + } + if (x < u.Min) || (x > u.Max) { + deriv[0] = math.NaN() + deriv[1] = math.NaN() + } else { + deriv[0] = 1 / (u.Max - u.Min) + deriv[1] = -deriv[0] + if x == u.Min { + deriv[0] = math.NaN() + } + if x == u.Max { + deriv[1] = math.NaN() + } + } + return deriv +} + +// ScoreInput returns the score function with respect to the input of the +// distribution at the input location specified by x. The score function is the +// derivative of the log-likelihood +// +// (d/dx) log(p(x)) . +func (u Uniform) ScoreInput(x float64) float64 { + if (x <= u.Min) || (x >= u.Max) { + return math.NaN() + } + return 0 +} + +// Skewness returns the skewness of the distribution. +func (Uniform) Skewness() float64 { + return 0 +} + +// StdDev returns the standard deviation of the probability distribution. +func (u Uniform) StdDev() float64 { + return math.Sqrt(u.Variance()) +} + +// Survival returns the survival function (complementary CDF) at x. +func (u Uniform) Survival(x float64) float64 { + if x < u.Min { + return 1 + } + if x > u.Max { + return 0 + } + return (u.Max - x) / (u.Max - u.Min) +} + +// setParameters modifies the parameters of the distribution. +func (u *Uniform) setParameters(p []Parameter) { + if len(p) != u.NumParameters() { + panic("uniform: incorrect number of parameters to set") + } + if p[0].Name != "Min" { + panic("uniform: " + panicNameMismatch) + } + if p[1].Name != "Max" { + panic("uniform: " + panicNameMismatch) + } + + u.Min = p[0].Value + u.Max = p[1].Value +} + +// Variance returns the variance of the probability distribution. +func (u Uniform) Variance() float64 { + return 1.0 / 12.0 * (u.Max - u.Min) * (u.Max - u.Min) +} diff --git a/vendor/gonum.org/v1/gonum/stat/distuv/weibull.go b/vendor/gonum.org/v1/gonum/stat/distuv/weibull.go new file mode 100644 index 0000000000..4042a8a901 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/distuv/weibull.go @@ -0,0 +1,231 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand/v2" +) + +// Weibull distribution. Valid range for x is [0,+∞). +type Weibull struct { + // Shape parameter of the distribution. A value of 1 represents + // the exponential distribution. A value of 2 represents the + // Rayleigh distribution. Valid range is (0,+∞). + K float64 + // Scale parameter of the distribution. Valid range is (0,+∞). + Lambda float64 + // Source of random numbers + Src rand.Source +} + +// CDF computes the value of the cumulative density function at x. +func (w Weibull) CDF(x float64) float64 { + if x < 0 { + return 0 + } + return -math.Expm1(-math.Pow(x/w.Lambda, w.K)) +} + +// Entropy returns the entropy of the distribution. +func (w Weibull) Entropy() float64 { + return eulerGamma*(1-1/w.K) + math.Log(w.Lambda/w.K) + 1 +} + +// ExKurtosis returns the excess kurtosis of the distribution. 
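+// It is assembled from powers of the raw moment terms Γ(1 + i/K) for
+// i = 1, ..., 4, computed via gammaIPow below.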
+func (w Weibull) ExKurtosis() float64 {
+	return (-6*w.gammaIPow(1, 4) + 12*w.gammaIPow(1, 2)*math.Gamma(1+2/w.K) - 3*w.gammaIPow(2, 2) - 4*math.Gamma(1+1/w.K)*math.Gamma(1+3/w.K) + math.Gamma(1+4/w.K)) / math.Pow(math.Gamma(1+2/w.K)-w.gammaIPow(1, 2), 2)
+}
+
+// gammaIPow is a shortcut for computing the gamma function to a power.
+func (w Weibull) gammaIPow(i, pow float64) float64 {
+	return math.Pow(math.Gamma(1+i/w.K), pow)
+}
+
+// LogProb computes the natural logarithm of the value of the probability
+// density function at x. -Inf is returned if x is less than zero.
+//
+// Special cases occur when x == 0, and the result depends on the shape
+// parameter as follows:
+//
+//	If 0 < K < 1, LogProb returns +Inf.
+//	If K == 1, LogProb returns 0.
+//	If K > 1, LogProb returns -Inf.
+func (w Weibull) LogProb(x float64) float64 {
+	if x < 0 {
+		return math.Inf(-1)
+	}
+	if x == 0 && w.K == 1 {
+		return 0
+	}
+	return math.Log(w.K) - math.Log(w.Lambda) + (w.K-1)*(math.Log(x)-math.Log(w.Lambda)) - math.Pow(x/w.Lambda, w.K)
+}
+
+// LogSurvival returns the log of the survival function (complementary CDF) at x.
+func (w Weibull) LogSurvival(x float64) float64 {
+	if x < 0 {
+		return 0
+	}
+	return -math.Pow(x/w.Lambda, w.K)
+}
+
+// Mean returns the mean of the probability distribution.
+func (w Weibull) Mean() float64 {
+	return w.Lambda * math.Gamma(1+1/w.K)
+}
+
+// Median returns the median of the Weibull distribution.
+func (w Weibull) Median() float64 {
+	return w.Lambda * math.Pow(ln2, 1/w.K)
+}
+
+// Mode returns the mode of the Weibull distribution.
+//
+// The mode is 0 in the special case where the K (shape) parameter
+// is less than or equal to 1.
+func (w Weibull) Mode() float64 {
+	if w.K > 1 {
+		return w.Lambda * math.Pow((w.K-1)/w.K, 1/w.K)
+	}
+	return 0
+}
+
+// NumParameters returns the number of parameters in the distribution.
+func (Weibull) NumParameters() int {
+	return 2
+}
+
+// Prob computes the value of the probability density function at x.
+func (w Weibull) Prob(x float64) float64 {
+	if x < 0 {
+		return 0
+	}
+	return math.Exp(w.LogProb(x))
+}
+
+// Quantile returns the inverse of the cumulative probability distribution.
+func (w Weibull) Quantile(p float64) float64 {
+	if p < 0 || p > 1 {
+		panic(badPercentile)
+	}
+	return w.Lambda * math.Pow(-math.Log(1-p), 1/w.K)
+}
+
+// Rand returns a random sample drawn from the distribution.
+func (w Weibull) Rand() float64 {
+	var rnd float64
+	if w.Src == nil {
+		rnd = rand.Float64()
+	} else {
+		rnd = rand.New(w.Src).Float64()
+	}
+	return w.Quantile(rnd)
+}
+
+// Score returns the score function with respect to the parameters of the
+// distribution at the input location x. The score function is the derivative
+// of the log-likelihood at x with respect to the parameters
+//
+//	(∂/∂θ) log(p(x;θ))
+//
+// If deriv is non-nil, len(deriv) must equal the number of parameters otherwise
+// Score will panic, and the derivative is stored in-place into deriv. If deriv
+// is nil a new slice will be allocated and returned.
+//
+// The order is [∂LogProb / ∂K, ∂LogProb / ∂λ].
+//
+// For more information, see https://en.wikipedia.org/wiki/Score_%28statistics%29.
+// +// Special cases: +// +// Score(x) = [NaN, NaN] for x <= 0 +func (w Weibull) Score(deriv []float64, x float64) []float64 { + if deriv == nil { + deriv = make([]float64, w.NumParameters()) + } + if len(deriv) != w.NumParameters() { + panic(badLength) + } + if x > 0 { + deriv[0] = 1/w.K + math.Log(x) - math.Log(w.Lambda) - (math.Log(x)-math.Log(w.Lambda))*math.Pow(x/w.Lambda, w.K) + deriv[1] = (w.K * (math.Pow(x/w.Lambda, w.K) - 1)) / w.Lambda + return deriv + } + deriv[0] = math.NaN() + deriv[1] = math.NaN() + return deriv +} + +// ScoreInput returns the score function with respect to the input of the +// distribution at the input location specified by x. The score function is the +// derivative of the log-likelihood +// +// (d/dx) log(p(x)) . +// +// Special cases: +// +// ScoreInput(x) = NaN for x <= 0 +func (w Weibull) ScoreInput(x float64) float64 { + if x > 0 { + return (-w.K*math.Pow(x/w.Lambda, w.K) + w.K - 1) / x + } + return math.NaN() +} + +// Skewness returns the skewness of the distribution. +func (w Weibull) Skewness() float64 { + stdDev := w.StdDev() + firstGamma, firstGammaSign := math.Lgamma(1 + 3/w.K) + logFirst := firstGamma + 3*(math.Log(w.Lambda)-math.Log(stdDev)) + logSecond := math.Log(3) + math.Log(w.Mean()) + 2*math.Log(stdDev) - 3*math.Log(stdDev) + logThird := 3 * (math.Log(w.Mean()) - math.Log(stdDev)) + return float64(firstGammaSign)*math.Exp(logFirst) - math.Exp(logSecond) - math.Exp(logThird) +} + +// StdDev returns the standard deviation of the probability distribution. +func (w Weibull) StdDev() float64 { + return math.Sqrt(w.Variance()) +} + +// Survival returns the survival function (complementary CDF) at x. +func (w Weibull) Survival(x float64) float64 { + return math.Exp(w.LogSurvival(x)) +} + +// setParameters modifies the parameters of the distribution. +func (w *Weibull) setParameters(p []Parameter) { + if len(p) != w.NumParameters() { + panic("weibull: incorrect number of parameters to set") + } + if p[0].Name != "K" { + panic("weibull: " + panicNameMismatch) + } + if p[1].Name != "λ" { + panic("weibull: " + panicNameMismatch) + } + w.K = p[0].Value + w.Lambda = p[1].Value +} + +// Variance returns the variance of the probability distribution. +func (w Weibull) Variance() float64 { + return math.Pow(w.Lambda, 2) * (math.Gamma(1+2/w.K) - w.gammaIPow(1, 2)) +} + +// parameters returns the parameters of the distribution. +func (w Weibull) parameters(p []Parameter) []Parameter { + nParam := w.NumParameters() + if p == nil { + p = make([]Parameter, nParam) + } else if len(p) != nParam { + panic("weibull: improper parameter length") + } + p[0].Name = "K" + p[0].Value = w.K + p[1].Name = "λ" + p[1].Value = w.Lambda + return p + +} diff --git a/vendor/gonum.org/v1/gonum/stat/doc.go b/vendor/gonum.org/v1/gonum/stat/doc.go new file mode 100644 index 0000000000..d6916cb252 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/doc.go @@ -0,0 +1,6 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package stat provides generalized statistical functions. +package stat // import "gonum.org/v1/gonum/stat" diff --git a/vendor/gonum.org/v1/gonum/stat/pca_cca.go b/vendor/gonum.org/v1/gonum/stat/pca_cca.go new file mode 100644 index 0000000000..1cc92ebf11 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/pca_cca.go @@ -0,0 +1,317 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package stat + +import ( + "errors" + "math" + + "gonum.org/v1/gonum/floats" + "gonum.org/v1/gonum/mat" +) + +// PC is a type for computing and extracting the principal components of a +// matrix. The results of the principal components analysis are only valid +// if the call to PrincipalComponents was successful. +type PC struct { + n, d int + weights []float64 + svd *mat.SVD + ok bool +} + +// PrincipalComponents performs a weighted principal components analysis on the +// matrix of the input data which is represented as an n×d matrix a where each +// row is an observation and each column is a variable. +// +// PrincipalComponents centers the variables but does not scale the variance. +// +// The weights slice is used to weight the observations. If weights is nil, each +// weight is considered to have a value of one, otherwise the length of weights +// must match the number of observations or PrincipalComponents will panic. +// +// PrincipalComponents returns whether the analysis was successful. +func (c *PC) PrincipalComponents(a mat.Matrix, weights []float64) (ok bool) { + c.n, c.d = a.Dims() + if weights != nil && len(weights) != c.n { + panic("stat: len(weights) != observations") + } + + c.svd, c.ok = svdFactorizeCentered(c.svd, a, weights) + if c.ok { + c.weights = append(c.weights[:0], weights...) + } + return c.ok +} + +// VectorsTo returns the component direction vectors of a principal components +// analysis. The vectors are returned in the columns of a d×min(n, d) matrix. +// +// If dst is empty, VectorsTo will resize dst to be d×min(n, d). When dst is +// non-empty, VectorsTo will panic if dst is not d×min(n, d). VectorsTo will also +// panic if the receiver does not contain a successful PC. +func (c *PC) VectorsTo(dst *mat.Dense) { + if !c.ok { + panic("stat: use of unsuccessful principal components analysis") + } + + if dst.IsEmpty() { + dst.ReuseAs(c.d, min(c.n, c.d)) + } else { + if d, n := dst.Dims(); d != c.d || n != min(c.n, c.d) { + panic(mat.ErrShape) + } + } + c.svd.VTo(dst) +} + +// VarsTo returns the column variances of the principal component scores, +// b * vecs, where b is a matrix with centered columns. Variances are returned +// in descending order. +// If dst is not nil it is used to store the variances and returned. +// Vars will panic if the receiver has not successfully performed a principal +// components analysis or dst is not nil and the length of dst is not min(n, d). +func (c *PC) VarsTo(dst []float64) []float64 { + if !c.ok { + panic("stat: use of unsuccessful principal components analysis") + } + if dst != nil && len(dst) != min(c.n, c.d) { + panic("stat: length of slice does not match analysis") + } + + dst = c.svd.Values(dst) + var f float64 + if c.weights == nil { + f = 1 / float64(c.n-1) + } else { + f = 1 / (floats.Sum(c.weights) - 1) + } + for i, v := range dst { + dst[i] = f * v * v + } + return dst +} + +// CC is a type for computing the canonical correlations of a pair of matrices. +// The results of the canonical correlation analysis are only valid +// if the call to CanonicalCorrelations was successful. +type CC struct { + // n is the number of observations used to + // construct the canonical correlations. + n int + + // xd and yd are used for size checks. 
+ xd, yd int + + x, y, c *mat.SVD + ok bool +} + +// CanonicalCorrelations performs a canonical correlation analysis of the +// input data x and y, columns of which should be interpretable as two sets +// of measurements on the same observations (rows). These observations are +// optionally weighted by weights. The result of the analysis is stored in +// the receiver if the analysis is successful. +// +// Canonical correlation analysis finds associations between two sets of +// variables on the same observations by finding linear combinations of the two +// sphered datasets that maximize the correlation between them. +// +// Some notation: let Xc and Yc denote the centered input data matrices x +// and y (column means subtracted from each column), let Sx and Sy denote the +// sample covariance matrices within x and y respectively, and let Sxy denote +// the covariance matrix between x and y. The sphered data can then be expressed +// as Xc * Sx^{-1/2} and Yc * Sy^{-1/2} respectively, and the correlation matrix +// between the sphered data is called the canonical correlation matrix, +// Sx^{-1/2} * Sxy * Sy^{-1/2}. In cases where S^{-1/2} is ambiguous for some +// covariance matrix S, S^{-1/2} is taken to be E * D^{-1/2} * Eᵀ where S can +// be eigendecomposed as S = E * D * Eᵀ. +// +// The canonical correlations are the correlations between the corresponding +// pairs of canonical variables and can be obtained with c.Corrs(). Canonical +// variables can be obtained by projecting the sphered data into the left and +// right eigenvectors of the canonical correlation matrix, and these +// eigenvectors can be obtained with c.Left(m, true) and c.Right(m, true) +// respectively. The canonical variables can also be obtained directly from the +// centered raw data by using the back-transformed eigenvectors which can be +// obtained with c.Left(m, false) and c.Right(m, false) respectively. +// +// The first pair of left and right eigenvectors of the canonical correlation +// matrix can be interpreted as directions into which the respective sphered +// data can be projected such that the correlation between the two projections +// is maximized. The second pair and onwards solve the same optimization but +// under the constraint that they are uncorrelated (orthogonal in sphered space) +// to previous projections. +// +// CanonicalCorrelations will panic if the inputs x and y do not have the same +// number of rows. +// +// The slice weights is used to weight the observations. If weights is nil, each +// weight is considered to have a value of one, otherwise the length of weights +// must match the number of observations (rows of both x and y) or +// CanonicalCorrelations will panic. +// +// More details can be found at +// https://en.wikipedia.org/wiki/Canonical_correlation +// or in Chapter 3 of +// Koch, Inge. Analysis of multivariate and high-dimensional data. +// Vol. 32. Cambridge University Press, 2013. ISBN: 9780521887939 +func (c *CC) CanonicalCorrelations(x, y mat.Matrix, weights []float64) error { + var yn int + c.n, c.xd = x.Dims() + yn, c.yd = y.Dims() + if c.n != yn { + panic("stat: unequal number of observations") + } + if weights != nil && len(weights) != c.n { + panic("stat: len(weights) != observations") + } + + // Center and factorize x and y. 
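+	// If either factorization fails, the receiver is marked as not ok and
+	// an error is returned.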
+ c.x, c.ok = svdFactorizeCentered(c.x, x, weights) + if !c.ok { + return errors.New("stat: failed to factorize x") + } + c.y, c.ok = svdFactorizeCentered(c.y, y, weights) + if !c.ok { + return errors.New("stat: failed to factorize y") + } + var xu, xv, yu, yv mat.Dense + c.x.UTo(&xu) + c.x.VTo(&xv) + c.y.UTo(&yu) + c.y.VTo(&yv) + + // Calculate and factorise the canonical correlation matrix. + var ccor mat.Dense + ccor.Product(&xv, xu.T(), &yu, yv.T()) + if c.c == nil { + c.c = &mat.SVD{} + } + c.ok = c.c.Factorize(&ccor, mat.SVDThin) + if !c.ok { + return errors.New("stat: failed to factorize ccor") + } + return nil +} + +// CorrsTo returns the canonical correlations, using dst if it is not nil. +// If dst is not nil and len(dst) does not match the number of columns in +// the y input matrix, Corrs will panic. +func (c *CC) CorrsTo(dst []float64) []float64 { + if !c.ok { + panic("stat: canonical correlations missing or invalid") + } + + if dst != nil && len(dst) != c.yd { + panic("stat: length of destination does not match input dimension") + } + return c.c.Values(dst) +} + +// LeftTo returns the left eigenvectors of the canonical correlation matrix if +// spheredSpace is true. If spheredSpace is false it returns these eigenvectors +// back-transformed to the original data space. +// +// If dst is empty, LeftTo will resize dst to be xd×yd. When dst is +// non-empty, LeftTo will panic if dst is not xd×yd. LeftTo will also +// panic if the receiver does not contain a successful CC. +func (c *CC) LeftTo(dst *mat.Dense, spheredSpace bool) { + if !c.ok || c.n < 2 { + panic("stat: canonical correlations missing or invalid") + } + + if dst.IsEmpty() { + dst.ReuseAs(c.xd, c.yd) + } else { + if d, n := dst.Dims(); d != c.xd || n != c.yd { + panic(mat.ErrShape) + } + } + c.c.UTo(dst) + if spheredSpace { + return + } + + xs := c.x.Values(nil) + xv := &mat.Dense{} + c.x.VTo(xv) + + scaleColsReciSqrt(xv, xs) + + dst.Product(xv, xv.T(), dst) + dst.Scale(math.Sqrt(float64(c.n-1)), dst) +} + +// RightTo returns the right eigenvectors of the canonical correlation matrix if +// spheredSpace is true. If spheredSpace is false it returns these eigenvectors +// back-transformed to the original data space. +// +// If dst is empty, RightTo will resize dst to be yd×yd. When dst is +// non-empty, RightTo will panic if dst is not yd×yd. RightTo will also +// panic if the receiver does not contain a successful CC. +func (c *CC) RightTo(dst *mat.Dense, spheredSpace bool) { + if !c.ok || c.n < 2 { + panic("stat: canonical correlations missing or invalid") + } + + if dst.IsEmpty() { + dst.ReuseAs(c.yd, c.yd) + } else { + if d, n := dst.Dims(); d != c.yd || n != c.yd { + panic(mat.ErrShape) + } + } + c.c.VTo(dst) + if spheredSpace { + return + } + + ys := c.y.Values(nil) + yv := &mat.Dense{} + c.y.VTo(yv) + + scaleColsReciSqrt(yv, ys) + + dst.Product(yv, yv.T(), dst) + dst.Scale(math.Sqrt(float64(c.n-1)), dst) +} + +func svdFactorizeCentered(work *mat.SVD, m mat.Matrix, weights []float64) (svd *mat.SVD, ok bool) { + n, d := m.Dims() + centered := mat.NewDense(n, d, nil) + col := make([]float64, n) + for j := 0; j < d; j++ { + mat.Col(col, j, m) + floats.AddConst(-Mean(col, weights), col) + centered.SetCol(j, col) + } + for i, w := range weights { + floats.Scale(math.Sqrt(w), centered.RawRowView(i)) + } + if work == nil { + work = &mat.SVD{} + } + ok = work.Factorize(centered, mat.SVDThin) + return work, ok +} + +// scaleColsReciSqrt scales the columns of cols +// by the reciprocal square-root of vals. 
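+// It panics if cols is nil or if len(vals) does not equal the number of
+// columns in cols.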
+func scaleColsReciSqrt(cols *mat.Dense, vals []float64) {
+	if cols == nil {
+		panic("stat: input nil")
+	}
+	n, d := cols.Dims()
+	if len(vals) != d {
+		panic("stat: input length mismatch")
+	}
+	col := make([]float64, n)
+	for j := 0; j < d; j++ {
+		mat.Col(col, j, cols)
+		floats.Scale(math.Sqrt(1/vals[j]), col)
+		cols.SetCol(j, col)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/stat/roc.go b/vendor/gonum.org/v1/gonum/stat/roc.go
new file mode 100644
index 0000000000..19add6fac9
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/stat/roc.go
@@ -0,0 +1,198 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package stat
+
+import (
+	"math"
+	"slices"
+	"sort"
+)
+
+// ROC returns paired false positive rate (FPR) and true positive rate
+// (TPR) values corresponding to cutoff points on the receiver operating
+// characteristic (ROC) curve obtained when y is treated as a binary
+// classifier for classes with weights. The cutoff thresholds used to
+// calculate the ROC are returned in thresh such that tpr[i] and fpr[i]
+// are the true and false positive rates for y >= thresh[i].
+//
+// The input y and cutoffs must be sorted, and values in y must correspond
+// to values in classes and weights. SortWeightedLabeled can be used to
+// sort y together with classes and weights.
+//
+// For a given cutoff value, observations corresponding to entries in y
+// greater than the cutoff value are classified as true, while those
+// less than or equal to the cutoff value are classified as false. These
+// assigned class labels are compared with the true values in the classes
+// slice and used to calculate the FPR and TPR.
+//
+// If weights is nil, all weights are treated as 1. If weights is not nil
+// it must have the same length as y and classes, otherwise ROC will panic.
+//
+// If cutoffs is nil or empty, all possible cutoffs are calculated,
+// resulting in fpr and tpr having length one greater than the number of
+// unique values in y. Otherwise fpr and tpr will be returned with the
+// same length as cutoffs. floats.Span can be used to generate equally
+// spaced cutoffs.
+//
+// More details about ROC curves are available at
+// https://en.wikipedia.org/wiki/Receiver_operating_characteristic
+func ROC(cutoffs, y []float64, classes []bool, weights []float64) (tpr, fpr, thresh []float64) {
+	if len(y) != len(classes) {
+		panic("stat: slice length mismatch")
+	}
+	if weights != nil && len(y) != len(weights) {
+		panic("stat: slice length mismatch")
+	}
+	if !sort.Float64sAreSorted(y) {
+		panic("stat: input must be sorted ascending")
+	}
+	if !sort.Float64sAreSorted(cutoffs) {
+		panic("stat: cutoff values must be sorted ascending")
+	}
+	if len(y) == 0 {
+		return nil, nil, nil
+	}
+	if len(cutoffs) == 0 {
+		if cutoffs == nil || cap(cutoffs) < len(y)+1 {
+			cutoffs = make([]float64, len(y)+1)
+		} else {
+			cutoffs = cutoffs[:len(y)+1]
+		}
+		// Choose all possible cutoffs for unique values in y.
+		bin := 0
+		cutoffs[bin] = y[0]
+		for i, u := range y[1:] {
+			if u == y[i] {
+				continue
+			}
+			bin++
+			cutoffs[bin] = u
+		}
+		cutoffs[bin+1] = math.Inf(1)
+		cutoffs = cutoffs[:bin+2]
+	} else {
+		// Don't mutate the provided cutoffs.
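+		// The copy is needed because cutoffs is reversed in place below and
+		// returned to the caller as thresh.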
+ tmp := cutoffs + cutoffs = make([]float64, len(cutoffs)) + copy(cutoffs, tmp) + } + + tpr = make([]float64, len(cutoffs)) + fpr = make([]float64, len(cutoffs)) + var bin int + var nPos, nNeg float64 + for i, u := range classes { + // Update the bin until it matches the next y value + // skipping empty bins. + for bin < len(cutoffs)-1 && y[i] >= cutoffs[bin] { + bin++ + tpr[bin] = tpr[bin-1] + fpr[bin] = fpr[bin-1] + } + posWeight, negWeight := 1.0, 0.0 + if weights != nil { + posWeight = weights[i] + } + if !u { + posWeight, negWeight = negWeight, posWeight + } + nPos += posWeight + nNeg += negWeight + // Count false negatives (in tpr) and true negatives (in fpr). + if y[i] < cutoffs[bin] { + tpr[bin] += posWeight + fpr[bin] += negWeight + } + } + + invNeg := 1 / nNeg + invPos := 1 / nPos + // Convert negative counts to TPR and FPR. + // Bins beyond the maximum value in y are skipped + // leaving these fpr and tpr elements as zero. + for i := range tpr[:bin+1] { + // Prevent fused float operations by + // making explicit float64 conversions. + tpr[i] = 1 - float64(tpr[i]*invPos) + fpr[i] = 1 - float64(fpr[i]*invNeg) + } + slices.Reverse(tpr) + slices.Reverse(fpr) + slices.Reverse(cutoffs) + + return tpr, fpr, cutoffs +} + +// TOC returns the Total Operating Characteristic for the classes provided +// and the minimum and maximum bounds for the TOC. +// +// The input y values that correspond to classes and weights must be sorted +// in ascending order. classes[i] is the class of value y[i] and weights[i] +// is the weight of y[i]. SortWeightedLabeled can be used to sort classes +// together with weights by the rank variable, i+1. +// +// The returned ntp values can be interpreted as the number of true positives +// where values above the given rank are assigned class true for each given +// rank from 1 to len(classes). +// +// ntp_i = sum_{j ≥ len(ntp)-1 - i} [ classes_j ] * weights_j, where [x] = 1 if x else 0. +// +// The values of min and max provide the minimum and maximum possible number +// of false values for the set of classes. The first element of ntp, min and +// max are always zero as this corresponds to assigning all data class false +// and the last elements are always weighted sum of classes as this corresponds +// to assigning every data class true. For len(classes) != 0, the lengths of +// min, ntp and max are len(classes)+1. +// +// If weights is nil, all weights are treated as 1. When weights are not nil, +// the calculation of min and max allows for partial assignment of single data +// points. If weights is not nil it must have the same length as classes, +// otherwise TOC will panic. +// +// More details about TOC curves are available at +// https://en.wikipedia.org/wiki/Total_operating_characteristic +func TOC(classes []bool, weights []float64) (min, ntp, max []float64) { + if weights != nil && len(classes) != len(weights) { + panic("stat: slice length mismatch") + } + if len(classes) == 0 { + return nil, nil, nil + } + + ntp = make([]float64, len(classes)+1) + min = make([]float64, len(ntp)) + max = make([]float64, len(ntp)) + if weights == nil { + for i := range ntp[1:] { + ntp[i+1] = ntp[i] + if classes[len(classes)-i-1] { + ntp[i+1]++ + } + } + totalPositive := ntp[len(ntp)-1] + for i := range ntp { + min[i] = math.Max(0, totalPositive-float64(len(classes)-i)) + max[i] = math.Min(totalPositive, float64(i)) + } + return min, ntp, max + } + + cumw := max // Reuse max for cumulative weight. Update its elements last. 
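+	// Aliasing max is safe: in the final loop each cumw[i] is read before
+	// the same element is overwritten through max[i].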
+	for i := range ntp[1:] {
+		ntp[i+1] = ntp[i]
+		w := weights[len(weights)-i-1]
+		cumw[i+1] = cumw[i] + w
+		if classes[len(classes)-i-1] {
+			ntp[i+1] += w
+		}
+	}
+	totw := cumw[len(cumw)-1]
+	totalPositive := ntp[len(ntp)-1]
+	for i := range ntp {
+		min[i] = math.Max(0, totalPositive-(totw-cumw[i]))
+		max[i] = math.Min(totalPositive, cumw[i])
+	}
+	return min, ntp, max
+}
diff --git a/vendor/gonum.org/v1/gonum/stat/stat.go b/vendor/gonum.org/v1/gonum/stat/stat.go
new file mode 100644
index 0000000000..f7d43726f7
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/stat/stat.go
@@ -0,0 +1,1400 @@
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package stat
+
+import (
+	"math"
+	"sort"
+
+	"gonum.org/v1/gonum/floats"
+)
+
+// CumulantKind specifies the behavior for calculating the empirical CDF or Quantile.
+type CumulantKind int
+
+// List of supported CumulantKind values for the Quantile function.
+// Constant values should match the R nomenclature. See
+// https://en.wikipedia.org/wiki/Quantile#Estimating_the_quantiles_of_a_population
+const (
+	// Empirical treats the distribution as the actual empirical distribution.
+	Empirical CumulantKind = 1
+	// LinInterp linearly interpolates the empirical distribution between sample values, with a flat extrapolation.
+	LinInterp CumulantKind = 4
+)
+
+// bhattacharyyaCoeff computes the Bhattacharyya Coefficient for probability distributions given by:
+//
+//	\sum_i \sqrt{p_i q_i}
+//
+// It is assumed that p and q have equal length.
+func bhattacharyyaCoeff(p, q []float64) float64 {
+	var bc float64
+	for i, a := range p {
+		bc += math.Sqrt(a * q[i])
+	}
+	return bc
+}
+
+// Bhattacharyya computes the distance between the probability distributions p and q given by:
+//
+//	-\ln ( \sum_i \sqrt{p_i q_i} )
+//
+// The lengths of p and q must be equal. It is assumed that p and q sum to 1.
+func Bhattacharyya(p, q []float64) float64 {
+	if len(p) != len(q) {
+		panic("stat: slice length mismatch")
+	}
+	bc := bhattacharyyaCoeff(p, q)
+	return -math.Log(bc)
+}
+
+// CDF returns the empirical cumulative distribution function value of q, that is
+// the fraction of the samples less than or equal to q. The
+// exact behavior is determined by the CumulantKind. CDF is theoretically
+// the inverse of the Quantile function, though it may not be the actual inverse
+// for all values q and CumulantKinds.
+//
+// The x data must be sorted in increasing order. If weights is nil then all
+// of the weights are 1. If weights is not nil, then len(x) must equal len(weights).
+// CDF will panic if the length of x is zero.
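+// CDF will also panic if the x data are not sorted or the CumulantKind is
+// not supported.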
+// +// CumulantKind behaviors: +// - Empirical: Returns the lowest fraction for which q is greater than or equal +// to that fraction of samples +func CDF(q float64, c CumulantKind, x, weights []float64) float64 { + if weights != nil && len(x) != len(weights) { + panic("stat: slice length mismatch") + } + if floats.HasNaN(x) { + return math.NaN() + } + if len(x) == 0 { + panic("stat: zero length slice") + } + if !sort.Float64sAreSorted(x) { + panic("x data are not sorted") + } + + if q < x[0] { + return 0 + } + if q >= x[len(x)-1] { + return 1 + } + + var sumWeights float64 + if weights == nil { + sumWeights = float64(len(x)) + } else { + sumWeights = floats.Sum(weights) + } + + // Calculate the index + switch c { + case Empirical: + // Find the smallest value that is greater than that percent of the samples + var w float64 + for i, v := range x { + if v > q { + return w / sumWeights + } + if weights == nil { + w++ + } else { + w += weights[i] + } + } + panic("impossible") + default: + panic("stat: bad cumulant kind") + } +} + +// ChiSquare computes the chi-square distance between the observed frequencies 'obs' and +// expected frequencies 'exp' given by: +// +// \sum_i (obs_i-exp_i)^2 / exp_i +// +// The lengths of obs and exp must be equal. +func ChiSquare(obs, exp []float64) float64 { + if len(obs) != len(exp) { + panic("stat: slice length mismatch") + } + var result float64 + for i, a := range obs { + b := exp[i] + if a == 0 && b == 0 { + continue + } + result += (a - b) * (a - b) / b + } + return result +} + +// CircularMean returns the circular mean of the dataset. +// +// atan2(\sum_i w_i * sin(alpha_i), \sum_i w_i * cos(alpha_i)) +// +// If weights is nil then all of the weights are 1. If weights is not nil, then +// len(x) must equal len(weights). +func CircularMean(x, weights []float64) float64 { + if weights != nil && len(x) != len(weights) { + panic("stat: slice length mismatch") + } + + var aX, aY float64 + if weights != nil { + for i, v := range x { + aX += weights[i] * math.Cos(v) + aY += weights[i] * math.Sin(v) + } + } else { + for _, v := range x { + aX += math.Cos(v) + aY += math.Sin(v) + } + } + + return math.Atan2(aY, aX) +} + +// Correlation returns the weighted correlation between the samples of x and y +// with the given means. +// +// sum_i {w_i (x_i - meanX) * (y_i - meanY)} / (stdX * stdY) +// +// The lengths of x and y must be equal. If weights is nil then all of the +// weights are 1. If weights is not nil, then len(x) must equal len(weights). +func Correlation(x, y, weights []float64) float64 { + // This is a two-pass corrected implementation. It is an adaptation of the + // algorithm used in the MeanVariance function, which applies a correction + // to the typical two pass approach. + + if len(x) != len(y) { + panic("stat: slice length mismatch") + } + xu := Mean(x, weights) + yu := Mean(y, weights) + var ( + sxx float64 + syy float64 + sxy float64 + xcompensation float64 + ycompensation float64 + ) + if weights == nil { + for i, xv := range x { + yv := y[i] + xd := xv - xu + yd := yv - yu + sxx += xd * xd + syy += yd * yd + sxy += xd * yd + xcompensation += xd + ycompensation += yd + } + // xcompensation and ycompensation are from Chan, et. al. + // referenced in the MeanVariance function. They are analogous + // to the second term in (1.7) in that paper. 
+ sxx -= xcompensation * xcompensation / float64(len(x)) + syy -= ycompensation * ycompensation / float64(len(x)) + + return (sxy - xcompensation*ycompensation/float64(len(x))) / math.Sqrt(sxx*syy) + + } + + var sumWeights float64 + for i, xv := range x { + w := weights[i] + yv := y[i] + xd := xv - xu + wxd := w * xd + yd := yv - yu + wyd := w * yd + sxx += wxd * xd + syy += wyd * yd + sxy += wxd * yd + xcompensation += wxd + ycompensation += wyd + sumWeights += w + } + // xcompensation and ycompensation are from Chan, et. al. + // referenced in the MeanVariance function. They are analogous + // to the second term in (1.7) in that paper, except they use + // the sumWeights instead of the sample count. + sxx -= xcompensation * xcompensation / sumWeights + syy -= ycompensation * ycompensation / sumWeights + + return (sxy - xcompensation*ycompensation/sumWeights) / math.Sqrt(sxx*syy) +} + +// Kendall returns the weighted Tau-a Kendall correlation between the +// samples of x and y. The Kendall correlation measures the quantity of +// concordant and discordant pairs of numbers. If weights are specified then +// each pair is weighted by weights[i] * weights[j] and the final sum is +// normalized to stay between -1 and 1. +// The lengths of x and y must be equal. If weights is nil then all of the +// weights are 1. If weights is not nil, then len(x) must equal len(weights). +func Kendall(x, y, weights []float64) float64 { + if len(x) != len(y) { + panic("stat: slice length mismatch") + } + + var ( + cc float64 // number of concordant pairs + dc float64 // number of discordant pairs + n = len(x) + ) + + if weights == nil { + for i := 0; i < n; i++ { + for j := i; j < n; j++ { + if i == j { + continue + } + if math.Signbit(x[j]-x[i]) == math.Signbit(y[j]-y[i]) { + cc++ + } else { + dc++ + } + } + } + return (cc - dc) / float64(n*(n-1)/2) + } + + var sumWeights float64 + + for i := 0; i < n; i++ { + for j := i; j < n; j++ { + if i == j { + continue + } + weight := weights[i] * weights[j] + if math.Signbit(x[j]-x[i]) == math.Signbit(y[j]-y[i]) { + cc += weight + } else { + dc += weight + } + sumWeights += weight + } + } + return float64(cc-dc) / sumWeights +} + +// Covariance returns the weighted covariance between the samples of x and y. +// +// sum_i {w_i (x_i - meanX) * (y_i - meanY)} / (sum_j {w_j} - 1) +// +// The lengths of x and y must be equal. If weights is nil then all of the +// weights are 1. If weights is not nil, then len(x) must equal len(weights). +func Covariance(x, y, weights []float64) float64 { + // This is a two-pass corrected implementation. It is an adaptation of the + // algorithm used in the MeanVariance function, which applies a correction + // to the typical two pass approach. + + if len(x) != len(y) { + panic("stat: slice length mismatch") + } + xu := Mean(x, weights) + yu := Mean(y, weights) + return covarianceMeans(x, y, weights, xu, yu) +} + +// covarianceMeans returns the weighted covariance between x and y with the mean +// of x and y already specified. See the documentation of Covariance for more +// information. +func covarianceMeans(x, y, weights []float64, xu, yu float64) float64 { + var ( + ss float64 + xcompensation float64 + ycompensation float64 + ) + if weights == nil { + for i, xv := range x { + yv := y[i] + xd := xv - xu + yd := yv - yu + ss += xd * yd + xcompensation += xd + ycompensation += yd + } + // xcompensation and ycompensation are from Chan, et. al. + // referenced in the MeanVariance function. 
They are analogous + // to the second term in (1.7) in that paper. + return (ss - xcompensation*ycompensation/float64(len(x))) / float64(len(x)-1) + } + + var sumWeights float64 + + for i, xv := range x { + w := weights[i] + yv := y[i] + wxd := w * (xv - xu) + yd := (yv - yu) + ss += wxd * yd + xcompensation += wxd + ycompensation += w * yd + sumWeights += w + } + // xcompensation and ycompensation are from Chan, et. al. + // referenced in the MeanVariance function. They are analogous + // to the second term in (1.7) in that paper, except they use + // the sumWeights instead of the sample count. + return (ss - xcompensation*ycompensation/sumWeights) / (sumWeights - 1) +} + +// CrossEntropy computes the cross-entropy between the two distributions specified +// in p and q. +func CrossEntropy(p, q []float64) float64 { + if len(p) != len(q) { + panic("stat: slice length mismatch") + } + var ce float64 + for i, v := range p { + if v != 0 { + ce -= v * math.Log(q[i]) + } + } + return ce +} + +// Entropy computes the Shannon entropy of a distribution or the distance between +// two distributions. The natural logarithm is used. +// - sum_i (p_i * log_e(p_i)) +func Entropy(p []float64) float64 { + var e float64 + for _, v := range p { + if v != 0 { // Entropy needs 0 * log(0) == 0. + e -= v * math.Log(v) + } + } + return e +} + +// ExKurtosis returns the population excess kurtosis of the sample. +// The kurtosis is defined by the 4th moment of the mean divided by the squared +// variance. The excess kurtosis subtracts 3.0 so that the excess kurtosis of +// the normal distribution is zero. +// If weights is nil then all of the weights are 1. If weights is not nil, then +// len(x) must equal len(weights). +func ExKurtosis(x, weights []float64) float64 { + mean, std := MeanStdDev(x, weights) + if weights == nil { + var e float64 + for _, v := range x { + z := (v - mean) / std + e += z * z * z * z + } + mul, offset := kurtosisCorrection(float64(len(x))) + return e*mul - offset + } + + var ( + e float64 + sumWeights float64 + ) + for i, v := range x { + z := (v - mean) / std + e += weights[i] * z * z * z * z + sumWeights += weights[i] + } + mul, offset := kurtosisCorrection(sumWeights) + return e*mul - offset +} + +// n is the number of samples +// see https://en.wikipedia.org/wiki/Kurtosis +func kurtosisCorrection(n float64) (mul, offset float64) { + return ((n + 1) / (n - 1)) * (n / (n - 2)) * (1 / (n - 3)), 3 * ((n - 1) / (n - 2)) * ((n - 1) / (n - 3)) +} + +// GeometricMean returns the weighted geometric mean of the dataset +// +// \prod_i {x_i ^ w_i} +// +// This only applies with positive x and positive weights. If weights is nil +// then all of the weights are 1. If weights is not nil, then len(x) must equal +// len(weights). +func GeometricMean(x, weights []float64) float64 { + if weights == nil { + var s float64 + for _, v := range x { + s += math.Log(v) + } + s /= float64(len(x)) + return math.Exp(s) + } + if len(x) != len(weights) { + panic("stat: slice length mismatch") + } + var ( + s float64 + sumWeights float64 + ) + for i, v := range x { + s += weights[i] * math.Log(v) + sumWeights += weights[i] + } + s /= sumWeights + return math.Exp(s) +} + +// HarmonicMean returns the weighted harmonic mean of the dataset +// +// \sum_i {w_i} / ( sum_i {w_i / x_i} ) +// +// This only applies with positive x and positive weights. +// If weights is nil then all of the weights are 1. If weights is not nil, then +// len(x) must equal len(weights). 
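+//
+// For example, HarmonicMean([]float64{1, 2, 4}, nil) is
+// 3/(1/1+1/2+1/4) ≈ 1.714.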
+func HarmonicMean(x, weights []float64) float64 { + if weights != nil && len(x) != len(weights) { + panic("stat: slice length mismatch") + } + // TODO(btracey): Fix this to make it more efficient and avoid allocation. + + // This can be numerically unstable (for example if x is very small). + // W = \sum_i {w_i} + // hm = exp(log(W) - log(\sum_i w_i / x_i)) + + logs := make([]float64, len(x)) + var W float64 + for i := range x { + if weights == nil { + logs[i] = -math.Log(x[i]) + W++ + continue + } + logs[i] = math.Log(weights[i]) - math.Log(x[i]) + W += weights[i] + } + + // Sum all of the logs + v := floats.LogSumExp(logs) // This computes log(\sum_i { w_i / x_i}). + return math.Exp(math.Log(W) - v) +} + +// Hellinger computes the distance between the probability distributions p and q given by: +// +// \sqrt{ 1 - \sum_i \sqrt{p_i q_i} } +// +// The lengths of p and q must be equal. It is assumed that p and q sum to 1. +func Hellinger(p, q []float64) float64 { + if len(p) != len(q) { + panic("stat: slice length mismatch") + } + bc := bhattacharyyaCoeff(p, q) + return math.Sqrt(1 - bc) +} + +// Histogram sums up the weighted number of data points in each bin. +// The weight of data point x[i] will be placed into count[j] if +// dividers[j] <= x < dividers[j+1]. The "span" function in the floats package can assist +// with bin creation. +// +// The following conditions on the inputs apply: +// - The count variable must either be nil or have length of one less than dividers. +// - The values in dividers must be sorted (use the sort package). +// - The x values must be sorted. +// - If weights is nil then all of the weights are 1. +// - If weights is not nil, then len(x) must equal len(weights). +func Histogram(count, dividers, x, weights []float64) []float64 { + if weights != nil && len(x) != len(weights) { + panic("stat: slice length mismatch") + } + if count == nil { + count = make([]float64, len(dividers)-1) + } + if len(dividers) < 2 { + panic("histogram: fewer than two dividers") + } + if len(count) != len(dividers)-1 { + panic("histogram: bin count mismatch") + } + if !sort.Float64sAreSorted(dividers) { + panic("histogram: dividers are not sorted") + } + if !sort.Float64sAreSorted(x) { + panic("histogram: x data are not sorted") + } + for i := range count { + count[i] = 0 + } + if len(x) == 0 { + return count + } + if x[0] < dividers[0] { + panic("histogram: minimum x value is less than lowest divider") + } + if dividers[len(dividers)-1] <= x[len(x)-1] { + panic("histogram: maximum x value is greater than or equal to highest divider") + } + + idx := 0 + comp := dividers[idx+1] + if weights == nil { + for _, v := range x { + if v < comp { + // Still in the current bucket. + count[idx]++ + continue + } + // Find the next divider where v is less than the divider. + for j := idx + 1; j < len(dividers); j++ { + if v < dividers[j+1] { + idx = j + comp = dividers[j+1] + break + } + } + count[idx]++ + } + return count + } + + for i, v := range x { + if v < comp { + // Still in the current bucket. + count[idx] += weights[i] + continue + } + // Need to find the next divider where v is less than the divider. + for j := idx + 1; j < len(count); j++ { + if v < dividers[j+1] { + idx = j + comp = dividers[j+1] + break + } + } + count[idx] += weights[i] + } + return count +} + +// JensenShannon computes the JensenShannon divergence between the distributions +// p and q. 
The Jensen-Shannon divergence is defined as +// +// m = 0.5 * (p + q) +// JS(p, q) = 0.5 ( KL(p, m) + KL(q, m) ) +// +// Unlike Kullback-Leibler, the Jensen-Shannon distance is symmetric. The value +// is between 0 and ln(2). +func JensenShannon(p, q []float64) float64 { + if len(p) != len(q) { + panic("stat: slice length mismatch") + } + var js float64 + for i, v := range p { + qi := q[i] + m := 0.5 * (v + qi) + if v != 0 { + // add kl from p to m + js += 0.5 * v * (math.Log(v) - math.Log(m)) + } + if qi != 0 { + // add kl from q to m + js += 0.5 * qi * (math.Log(qi) - math.Log(m)) + } + } + return js +} + +// KolmogorovSmirnov computes the largest distance between two empirical CDFs. +// Each dataset x and y consists of sample locations and counts, xWeights and +// yWeights, respectively. +// +// x and y may have different lengths, though len(x) must equal len(xWeights), and +// len(y) must equal len(yWeights). Both x and y must be sorted. +// +// Special cases are: +// +// = 0 if len(x) == len(y) == 0 +// = 1 if len(x) == 0, len(y) != 0 or len(x) != 0 and len(y) == 0 +func KolmogorovSmirnov(x, xWeights, y, yWeights []float64) float64 { + if xWeights != nil && len(x) != len(xWeights) { + panic("stat: slice length mismatch") + } + if yWeights != nil && len(y) != len(yWeights) { + panic("stat: slice length mismatch") + } + if len(x) == 0 || len(y) == 0 { + if len(x) == 0 && len(y) == 0 { + return 0 + } + return 1 + } + + if floats.HasNaN(x) { + return math.NaN() + } + if floats.HasNaN(y) { + return math.NaN() + } + + if !sort.Float64sAreSorted(x) { + panic("x data are not sorted") + } + if !sort.Float64sAreSorted(y) { + panic("y data are not sorted") + } + + xWeightsNil := xWeights == nil + yWeightsNil := yWeights == nil + + var ( + maxDist float64 + xSum, ySum float64 + xCdf, yCdf float64 + xIdx, yIdx int + ) + + if xWeightsNil { + xSum = float64(len(x)) + } else { + xSum = floats.Sum(xWeights) + } + + if yWeightsNil { + ySum = float64(len(y)) + } else { + ySum = floats.Sum(yWeights) + } + + xVal := x[0] + yVal := y[0] + + // Algorithm description: + // The goal is to find the maximum difference in the empirical CDFs for the + // two datasets. The CDFs are piecewise-constant, and thus the distance + // between the CDFs will only change at the values themselves. + // + // To find the maximum distance, step through the data in ascending order + // of value between the two datasets. At each step, compute the empirical CDF + // and compare the local distance with the maximum distance. + // Due to some corner cases, equal data entries must be tallied simultaneously. + for { + switch { + case xVal < yVal: + xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil) + case yVal < xVal: + yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil) + case xVal == yVal: + newX := x[xIdx] + newY := y[yIdx] + if newX < newY { + xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil) + } else if newY < newX { + yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil) + } else { + // Update them both, they'll be equal next time and the right + // thing will happen. 
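+				// Advancing both indices still makes progress, so the
+				// enclosing loop is guaranteed to terminate.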
+ xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil) + yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil) + } + default: + panic("unreachable") + } + + dist := math.Abs(xCdf - yCdf) + if dist > maxDist { + maxDist = dist + } + + // Both xCdf and yCdf will equal 1 at the end, so if we have reached the + // end of either sample list, the distance is as large as it can be. + if xIdx == len(x) || yIdx == len(y) { + return maxDist + } + } +} + +// updateKS gets the next data point from one of the set. In doing so, it combines +// the weight of all the data points of equal value. Upon return, val is the new +// value of the data set, newCdf is the total combined CDF up until this point, +// and newIdx is the index of the next location in that sample to examine. +func updateKS(idx int, cdf, sum float64, values, weights []float64, isNil bool) (val, newCdf float64, newIdx int) { + // Sum up all the weights of consecutive values that are equal. + if isNil { + newCdf = cdf + 1/sum + } else { + newCdf = cdf + weights[idx]/sum + } + newIdx = idx + 1 + for { + if newIdx == len(values) { + return values[newIdx-1], newCdf, newIdx + } + if values[newIdx-1] != values[newIdx] { + return values[newIdx], newCdf, newIdx + } + if isNil { + newCdf += 1 / sum + } else { + newCdf += weights[newIdx] / sum + } + newIdx++ + } +} + +// KullbackLeibler computes the Kullback-Leibler distance between the +// distributions p and q. The natural logarithm is used. +// +// sum_i(p_i * log(p_i / q_i)) +// +// Note that the Kullback-Leibler distance is not symmetric; +// KullbackLeibler(p,q) != KullbackLeibler(q,p) +func KullbackLeibler(p, q []float64) float64 { + if len(p) != len(q) { + panic("stat: slice length mismatch") + } + var kl float64 + for i, v := range p { + if v != 0 { // Entropy needs 0 * log(0) == 0. + kl += v * (math.Log(v) - math.Log(q[i])) + } + } + return kl +} + +// LinearRegression computes the best-fit line +// +// y = alpha + beta*x +// +// to the data in x and y with the given weights. If origin is true, the +// regression is forced to pass through the origin. +// +// Specifically, LinearRegression computes the values of alpha and +// beta such that the total residual +// +// \sum_i w[i]*(y[i] - alpha - beta*x[i])^2 +// +// is minimized. If origin is true, then alpha is forced to be zero. +// +// The lengths of x and y must be equal. If weights is nil then all of the +// weights are 1. If weights is not nil, then len(x) must equal len(weights). +func LinearRegression(x, y, weights []float64, origin bool) (alpha, beta float64) { + if len(x) != len(y) { + panic("stat: slice length mismatch") + } + if weights != nil && len(weights) != len(x) { + panic("stat: slice length mismatch") + } + + w := 1.0 + if origin { + var x2Sum, xySum float64 + for i, xi := range x { + if weights != nil { + w = weights[i] + } + yi := y[i] + xySum += w * xi * yi + x2Sum += w * xi * xi + } + beta = xySum / x2Sum + + return 0, beta + } + + xu, xv := MeanVariance(x, weights) + yu := Mean(y, weights) + cov := covarianceMeans(x, y, weights, xu, yu) + beta = cov / xv + alpha = yu - beta*xu + return alpha, beta +} + +// RSquared returns the coefficient of determination defined as +// +// R^2 = 1 - \sum_i w[i]*(y[i] - alpha - beta*x[i])^2 / \sum_i w[i]*(y[i] - mean(y))^2 +// +// for the line +// +// y = alpha + beta*x +// +// and the data in x and y with the given weights. +// +// The lengths of x and y must be equal. If weights is nil then all of the +// weights are 1. 
If weights is not nil, then len(x) must equal len(weights). +func RSquared(x, y, weights []float64, alpha, beta float64) float64 { + if len(x) != len(y) { + panic("stat: slice length mismatch") + } + if weights != nil && len(weights) != len(x) { + panic("stat: slice length mismatch") + } + + w := 1.0 + yMean := Mean(y, weights) + var res, tot, d float64 + for i, xi := range x { + if weights != nil { + w = weights[i] + } + yi := y[i] + fi := alpha + beta*xi + d = yi - fi + res += w * d * d + d = yi - yMean + tot += w * d * d + } + return 1 - res/tot +} + +// RSquaredFrom returns the coefficient of determination defined as +// +// R^2 = 1 - \sum_i w[i]*(estimate[i] - value[i])^2 / \sum_i w[i]*(value[i] - mean(values))^2 +// +// and the data in estimates and values with the given weights. +// +// The lengths of estimates and values must be equal. If weights is nil then +// all of the weights are 1. If weights is not nil, then len(values) must +// equal len(weights). +func RSquaredFrom(estimates, values, weights []float64) float64 { + if len(estimates) != len(values) { + panic("stat: slice length mismatch") + } + if weights != nil && len(weights) != len(values) { + panic("stat: slice length mismatch") + } + + w := 1.0 + mean := Mean(values, weights) + var res, tot, d float64 + for i, val := range values { + if weights != nil { + w = weights[i] + } + d = val - estimates[i] + res += w * d * d + d = val - mean + tot += w * d * d + } + return 1 - res/tot +} + +// RNoughtSquared returns the coefficient of determination defined as +// +// R₀^2 = \sum_i w[i]*(beta*x[i])^2 / \sum_i w[i]*y[i]^2 +// +// for the line +// +// y = beta*x +// +// and the data in x and y with the given weights. RNoughtSquared should +// only be used for best-fit lines regressed through the origin. +// +// The lengths of x and y must be equal. If weights is nil then all of the +// weights are 1. If weights is not nil, then len(x) must equal len(weights). +func RNoughtSquared(x, y, weights []float64, beta float64) float64 { + if len(x) != len(y) { + panic("stat: slice length mismatch") + } + if weights != nil && len(weights) != len(x) { + panic("stat: slice length mismatch") + } + + w := 1.0 + var ssr, tot float64 + for i, xi := range x { + if weights != nil { + w = weights[i] + } + fi := beta * xi + ssr += w * fi * fi + yi := y[i] + tot += w * yi * yi + } + return ssr / tot +} + +// Mean computes the weighted mean of the data set. +// +// sum_i {w_i * x_i} / sum_i {w_i} +// +// If weights is nil then all of the weights are 1. If weights is not nil, then +// len(x) must equal len(weights). +func Mean(x, weights []float64) float64 { + if weights == nil { + return floats.Sum(x) / float64(len(x)) + } + if len(x) != len(weights) { + panic("stat: slice length mismatch") + } + var ( + sumValues float64 + sumWeights float64 + ) + for i, w := range weights { + sumValues += w * x[i] + sumWeights += w + } + return sumValues / sumWeights +} + +// Mode returns the most common value in the dataset specified by x and the +// given weights. Strict float64 equality is used when comparing values, so users +// should take caution. If several values are the mode, any of them may be returned. 
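+//
+// For example, Mode([]float64{1, 2, 2, 3}, nil) returns (2, 2).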
+func Mode(x, weights []float64) (val float64, count float64) { + if weights != nil && len(x) != len(weights) { + panic("stat: slice length mismatch") + } + if len(x) == 0 { + return 0, 0 + } + m := make(map[float64]float64) + if weights == nil { + for _, v := range x { + m[v]++ + } + } else { + for i, v := range x { + m[v] += weights[i] + } + } + var ( + maxCount float64 + max float64 + ) + for val, count := range m { + if count > maxCount { + maxCount = count + max = val + } + } + return max, maxCount +} + +// BivariateMoment computes the weighted mixed moment between the samples x and y. +// +// E[(x - μ_x)^r*(y - μ_y)^s] +// +// No degrees of freedom correction is done. +// The lengths of x and y must be equal. If weights is nil then all of the +// weights are 1. If weights is not nil, then len(x) must equal len(weights). +func BivariateMoment(r, s float64, x, y, weights []float64) float64 { + meanX := Mean(x, weights) + meanY := Mean(y, weights) + if len(x) != len(y) { + panic("stat: slice length mismatch") + } + if weights == nil { + var m float64 + for i, vx := range x { + vy := y[i] + m += math.Pow(vx-meanX, r) * math.Pow(vy-meanY, s) + } + return m / float64(len(x)) + } + if len(weights) != len(x) { + panic("stat: slice length mismatch") + } + var ( + m float64 + sumWeights float64 + ) + for i, vx := range x { + vy := y[i] + w := weights[i] + m += w * math.Pow(vx-meanX, r) * math.Pow(vy-meanY, s) + sumWeights += w + } + return m / sumWeights +} + +// Moment computes the weighted n^th moment of the samples, +// +// E[(x - μ)^N] +// +// No degrees of freedom correction is done. +// If weights is nil then all of the weights are 1. If weights is not nil, then +// len(x) must equal len(weights). +func Moment(moment float64, x, weights []float64) float64 { + // This also checks that x and weights have the same length. + mean := Mean(x, weights) + if weights == nil { + var m float64 + for _, v := range x { + m += math.Pow(v-mean, moment) + } + return m / float64(len(x)) + } + var ( + m float64 + sumWeights float64 + ) + for i, v := range x { + w := weights[i] + m += w * math.Pow(v-mean, moment) + sumWeights += w + } + return m / sumWeights +} + +// MomentAbout computes the weighted n^th weighted moment of the samples about +// the given mean \mu, +// +// E[(x - μ)^N] +// +// No degrees of freedom correction is done. +// If weights is nil then all of the weights are 1. If weights is not nil, then +// len(x) must equal len(weights). +func MomentAbout(moment float64, x []float64, mean float64, weights []float64) float64 { + if weights == nil { + var m float64 + for _, v := range x { + m += math.Pow(v-mean, moment) + } + m /= float64(len(x)) + return m + } + if len(weights) != len(x) { + panic("stat: slice length mismatch") + } + var ( + m float64 + sumWeights float64 + ) + for i, v := range x { + m += weights[i] * math.Pow(v-mean, moment) + sumWeights += weights[i] + } + return m / sumWeights +} + +// Quantile returns the sample of x such that x is greater than or +// equal to the fraction p of samples. The exact behavior is determined by the +// CumulantKind, and p should be a number between 0 and 1. Quantile is theoretically +// the inverse of the CDF function, though it may not be the actual inverse +// for all values p and CumulantKinds. +// +// The x data must be sorted in increasing order. If weights is nil then all +// of the weights are 1. If weights is not nil, then len(x) must equal len(weights). +// Quantile will panic if the length of x is zero. 
+// +// CumulantKind behaviors: +// - Empirical: Returns the lowest value q for which q is greater than or equal +// to the fraction p of samples +// - LinInterp: Returns the linearly interpolated value +func Quantile(p float64, c CumulantKind, x, weights []float64) float64 { + if !(p >= 0 && p <= 1) { + panic("stat: percentile out of bounds") + } + + if weights != nil && len(x) != len(weights) { + panic("stat: slice length mismatch") + } + if len(x) == 0 { + panic("stat: zero length slice") + } + if floats.HasNaN(x) { + return math.NaN() // This is needed because the algorithm breaks otherwise. + } + if !sort.Float64sAreSorted(x) { + panic("x data are not sorted") + } + + var sumWeights float64 + if weights == nil { + sumWeights = float64(len(x)) + } else { + sumWeights = floats.Sum(weights) + } + switch c { + case Empirical: + return empiricalQuantile(p, x, weights, sumWeights) + case LinInterp: + return linInterpQuantile(p, x, weights, sumWeights) + default: + panic("stat: bad cumulant kind") + } +} + +func empiricalQuantile(p float64, x, weights []float64, sumWeights float64) float64 { + var cumsum float64 + fidx := p * sumWeights + for i := range x { + if weights == nil { + cumsum++ + } else { + cumsum += weights[i] + } + if cumsum >= fidx { + return x[i] + } + } + panic("impossible") +} + +func linInterpQuantile(p float64, x, weights []float64, sumWeights float64) float64 { + var cumsum float64 + fidx := p * sumWeights + for i := range x { + if weights == nil { + cumsum++ + } else { + cumsum += weights[i] + } + if cumsum >= fidx { + if i == 0 { + return x[0] + } + t := cumsum - fidx + if weights != nil { + t /= weights[i] + } + return t*x[i-1] + (1-t)*x[i] + } + } + panic("impossible") +} + +// Skew computes the skewness of the sample data. +// If weights is nil then all of the weights are 1. If weights is not nil, then +// len(x) must equal len(weights). +// When weights sum to 1 or less, a biased variance estimator should be used. +func Skew(x, weights []float64) float64 { + + mean, std := MeanStdDev(x, weights) + if weights == nil { + var s float64 + for _, v := range x { + z := (v - mean) / std + s += z * z * z + } + return s * skewCorrection(float64(len(x))) + } + var ( + s float64 + sumWeights float64 + ) + for i, v := range x { + z := (v - mean) / std + s += weights[i] * z * z * z + sumWeights += weights[i] + } + return s * skewCorrection(sumWeights) +} + +// From: http://www.amstat.org/publications/jse/v19n2/doane.pdf page 7 +func skewCorrection(n float64) float64 { + return (n / (n - 1)) * (1 / (n - 2)) +} + +// SortWeighted rearranges the data in x along with their corresponding +// weights so that the x data are sorted. The data is sorted in place. +// Weights may be nil, but if weights is non-nil then it must have the same +// length as x. +func SortWeighted(x, weights []float64) { + if weights == nil { + sort.Float64s(x) + return + } + if len(x) != len(weights) { + panic("stat: slice length mismatch") + } + sort.Sort(weightSorter{ + x: x, + w: weights, + }) +} + +type weightSorter struct { + x []float64 + w []float64 +} + +func (w weightSorter) Len() int { return len(w.x) } +func (w weightSorter) Less(i, j int) bool { return w.x[i] < w.x[j] } +func (w weightSorter) Swap(i, j int) { + w.x[i], w.x[j] = w.x[j], w.x[i] + w.w[i], w.w[j] = w.w[j], w.w[i] +} + +// SortWeightedLabeled rearranges the data in x along with their +// corresponding weights and boolean labels so that the x data are sorted. +// The data is sorted in place. 
Weights and labels may be nil, if either +// is non-nil it must have the same length as x. +func SortWeightedLabeled(x []float64, labels []bool, weights []float64) { + if labels == nil { + SortWeighted(x, weights) + return + } + if weights == nil { + if len(x) != len(labels) { + panic("stat: slice length mismatch") + } + sort.Sort(labelSorter{ + x: x, + l: labels, + }) + return + } + if len(x) != len(labels) || len(x) != len(weights) { + panic("stat: slice length mismatch") + } + sort.Sort(weightLabelSorter{ + x: x, + l: labels, + w: weights, + }) +} + +type labelSorter struct { + x []float64 + l []bool +} + +func (a labelSorter) Len() int { return len(a.x) } +func (a labelSorter) Less(i, j int) bool { return a.x[i] < a.x[j] } +func (a labelSorter) Swap(i, j int) { + a.x[i], a.x[j] = a.x[j], a.x[i] + a.l[i], a.l[j] = a.l[j], a.l[i] +} + +type weightLabelSorter struct { + x []float64 + l []bool + w []float64 +} + +func (a weightLabelSorter) Len() int { return len(a.x) } +func (a weightLabelSorter) Less(i, j int) bool { return a.x[i] < a.x[j] } +func (a weightLabelSorter) Swap(i, j int) { + a.x[i], a.x[j] = a.x[j], a.x[i] + a.l[i], a.l[j] = a.l[j], a.l[i] + a.w[i], a.w[j] = a.w[j], a.w[i] +} + +// StdDev returns the sample standard deviation. +func StdDev(x, weights []float64) float64 { + _, std := MeanStdDev(x, weights) + return std +} + +// MeanStdDev returns the sample mean and unbiased standard deviation +// When weights sum to 1 or less, a biased variance estimator should be used. +func MeanStdDev(x, weights []float64) (mean, std float64) { + mean, variance := MeanVariance(x, weights) + return mean, math.Sqrt(variance) +} + +// StdErr returns the standard error in the mean with the given values. +func StdErr(std, sampleSize float64) float64 { + return std / math.Sqrt(sampleSize) +} + +// StdScore returns the standard score (a.k.a. z-score, z-value) for the value x +// with the given mean and standard deviation, i.e. +// +// (x - mean) / std +func StdScore(x, mean, std float64) float64 { + return (x - mean) / std +} + +// Variance computes the unbiased weighted sample variance: +// +// \sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1) +// +// If weights is nil then all of the weights are 1. If weights is not nil, then +// len(x) must equal len(weights). +// When weights sum to 1 or less, a biased variance estimator should be used. +func Variance(x, weights []float64) float64 { + _, variance := MeanVariance(x, weights) + return variance +} + +// MeanVariance computes the sample mean and unbiased variance, where the mean and variance are +// +// \sum_i w_i * x_i / (sum_i w_i) +// \sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1) +// +// respectively. +// If weights is nil then all of the weights are 1. If weights is not nil, then +// len(x) must equal len(weights). +// When weights sum to 1 or less, a biased variance estimator should be used. +func MeanVariance(x, weights []float64) (mean, variance float64) { + var ( + unnormalisedVariance float64 + sumWeights float64 + ) + mean, unnormalisedVariance, sumWeights = meanUnnormalisedVarianceSumWeights(x, weights) + return mean, unnormalisedVariance / (sumWeights - 1) +} + +// PopMeanVariance computes the sample mean and biased variance (also known as +// "population variance"), where the mean and variance are +// +// \sum_i w_i * x_i / (sum_i w_i) +// \sum_i w_i (x_i - mean)^2 / (sum_i w_i) +// +// respectively. +// If weights is nil then all of the weights are 1. If weights is not nil, then +// len(x) must equal len(weights). 
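+//
+// For example, PopMeanVariance([]float64{1, 2, 3}, nil) returns a mean of 2
+// and a variance of 2/3, where the unbiased MeanVariance would return a
+// variance of 1.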
+func PopMeanVariance(x, weights []float64) (mean, variance float64) { + var ( + unnormalisedVariance float64 + sumWeights float64 + ) + mean, unnormalisedVariance, sumWeights = meanUnnormalisedVarianceSumWeights(x, weights) + return mean, unnormalisedVariance / sumWeights +} + +// PopMeanStdDev returns the sample mean and biased standard deviation +// (also known as "population standard deviation"). +func PopMeanStdDev(x, weights []float64) (mean, std float64) { + mean, variance := PopMeanVariance(x, weights) + return mean, math.Sqrt(variance) +} + +// PopStdDev returns the population standard deviation, i.e., a square root +// of the biased variance estimate. +func PopStdDev(x, weights []float64) float64 { + _, stDev := PopMeanStdDev(x, weights) + return stDev +} + +// PopVariance computes the unbiased weighted sample variance: +// +// \sum_i w_i (x_i - mean)^2 / (sum_i w_i) +// +// If weights is nil then all of the weights are 1. If weights is not nil, then +// len(x) must equal len(weights). +func PopVariance(x, weights []float64) float64 { + _, variance := PopMeanVariance(x, weights) + return variance +} + +func meanUnnormalisedVarianceSumWeights(x, weights []float64) (mean, unnormalisedVariance, sumWeights float64) { + // This uses the corrected two-pass algorithm (1.7), from "Algorithms for computing + // the sample variance: Analysis and recommendations" by Chan, Tony F., Gene H. Golub, + // and Randall J. LeVeque. + + // Note that this will panic if the slice lengths do not match. + mean = Mean(x, weights) + var ( + ss float64 + compensation float64 + ) + if weights == nil { + for _, v := range x { + d := v - mean + ss += d * d + compensation += d + } + unnormalisedVariance = (ss - compensation*compensation/float64(len(x))) + return mean, unnormalisedVariance, float64(len(x)) + } + + for i, v := range x { + w := weights[i] + d := v - mean + wd := w * d + ss += wd * d + compensation += wd + sumWeights += w + } + unnormalisedVariance = (ss - compensation*compensation/sumWeights) + return mean, unnormalisedVariance, sumWeights +} diff --git a/vendor/gonum.org/v1/gonum/stat/statmat.go b/vendor/gonum.org/v1/gonum/stat/statmat.go new file mode 100644 index 0000000000..4f05f30645 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/stat/statmat.go @@ -0,0 +1,142 @@ +// Copyright ©2014 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package stat + +import ( + "math" + + "gonum.org/v1/gonum/floats" + "gonum.org/v1/gonum/mat" +) + +// CovarianceMatrix calculates the covariance matrix (also known as the +// variance-covariance matrix) calculated from a matrix of data, x, using +// a two-pass algorithm. The result is stored in dst. +// +// If weights is not nil the weighted covariance of x is calculated. weights +// must have length equal to the number of rows in input data matrix and +// must not contain negative elements. +// The dst matrix must either be empty or have the same number of +// columns as the input data matrix. +func CovarianceMatrix(dst *mat.SymDense, x mat.Matrix, weights []float64) { + // This is the matrix version of the two-pass algorithm. It doesn't use the + // additional floating point error correction that the Covariance function uses + // to reduce the impact of rounding during centering. 
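+	// As a consequence, an entry of the result can differ in its last few
+	// bits from the value Covariance would compute for the same two columns.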
+ + r, c := x.Dims() + + if dst.IsEmpty() { + *dst = *(dst.GrowSym(c).(*mat.SymDense)) + } else if n := dst.SymmetricDim(); n != c { + panic(mat.ErrShape) + } + + var xt mat.Dense + xt.CloneFrom(x.T()) + // Subtract the mean of each of the columns. + for i := 0; i < c; i++ { + v := xt.RawRowView(i) + // This will panic with ErrShape if len(weights) != len(v), so + // we don't have to check the size later. + mean := Mean(v, weights) + floats.AddConst(-mean, v) + } + + if weights == nil { + // Calculate the normalization factor + // scaled by the sample size. + dst.SymOuterK(1/(float64(r)-1), &xt) + return + } + + // Multiply by the sqrt of the weights, so that multiplication is symmetric. + sqrtwts := make([]float64, r) + for i, w := range weights { + if w < 0 { + panic("stat: negative covariance matrix weights") + } + sqrtwts[i] = math.Sqrt(w) + } + // Weight the rows. + for i := 0; i < c; i++ { + v := xt.RawRowView(i) + floats.Mul(v, sqrtwts) + } + + // Calculate the normalization factor + // scaled by the weighted sample size. + dst.SymOuterK(1/(floats.Sum(weights)-1), &xt) +} + +// CorrelationMatrix returns the correlation matrix calculated from a matrix +// of data, x, using a two-pass algorithm. The result is stored in dst. +// +// If weights is not nil the weighted correlation of x is calculated. weights +// must have length equal to the number of rows in input data matrix and +// must not contain negative elements. +// The dst matrix must either be empty or have the same number of +// columns as the input data matrix. +func CorrelationMatrix(dst *mat.SymDense, x mat.Matrix, weights []float64) { + // This will panic if the sizes don't match, or if weights is the wrong size. + CovarianceMatrix(dst, x, weights) + covToCorr(dst) +} + +// covToCorr converts a covariance matrix to a correlation matrix. +func covToCorr(c *mat.SymDense) { + r := c.SymmetricDim() + + s := make([]float64, r) + for i := 0; i < r; i++ { + s[i] = 1 / math.Sqrt(c.At(i, i)) + } + for i, sx := range s { + // Ensure that the diagonal has exactly ones. + c.SetSym(i, i, 1) + for j := i + 1; j < r; j++ { + v := c.At(i, j) + c.SetSym(i, j, v*sx*s[j]) + } + } +} + +// corrToCov converts a correlation matrix to a covariance matrix. +// The input sigma should be vector of standard deviations corresponding +// to the covariance. It will panic if len(sigma) is not equal to the +// number of rows in the correlation matrix. +func corrToCov(c *mat.SymDense, sigma []float64) { + r, _ := c.Dims() + + if r != len(sigma) { + panic(mat.ErrShape) + } + for i, sx := range sigma { + // Ensure that the diagonal has exactly sigma squared. + c.SetSym(i, i, sx*sx) + for j := i + 1; j < r; j++ { + v := c.At(i, j) + c.SetSym(i, j, v*sx*sigma[j]) + } + } +} + +// Mahalanobis computes the Mahalanobis distance +// +// D = sqrt((x-y)ᵀ * Σ^-1 * (x-y)) +// +// between the column vectors x and y given the cholesky decomposition of Σ. +// Mahalanobis returns NaN if the linear solve fails. +// +// See https://en.wikipedia.org/wiki/Mahalanobis_distance for more information. 
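+//
+// A minimal sketch of a call site, assuming sigma is a *mat.SymDense holding Σ:
+//
+//	var chol mat.Cholesky
+//	if !chol.Factorize(sigma) {
+//		// Σ is not positive definite; fall back or bail out.
+//	}
+//	d := Mahalanobis(x, y, &chol)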
+func Mahalanobis(x, y mat.Vector, chol *mat.Cholesky) float64 {
+	var diff mat.VecDense
+	diff.SubVec(x, y)
+	var tmp mat.VecDense
+	err := chol.SolveVecTo(&tmp, &diff)
+	if err != nil {
+		return math.NaN()
+	}
+	return math.Sqrt(mat.Dot(&tmp, &diff))
+}
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 017c1a3b7d..ae92659462 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -616,6 +616,7 @@ golang.org/x/text/unicode/norm
 golang.org/x/time/rate
 # golang.org/x/tools v0.30.0
 ## explicit; go 1.22.0
+golang.org/x/tools/container/intsets
 golang.org/x/tools/cover
 golang.org/x/tools/go/ast/astutil
 golang.org/x/tools/go/ast/inspector
@@ -644,6 +645,34 @@ golang.org/x/tools/internal/versions
 # gomodules.xyz/jsonpatch/v2 v2.4.0
 ## explicit; go 1.20
 gomodules.xyz/jsonpatch/v2
+# gonum.org/v1/gonum v0.16.0
+## explicit; go 1.23.0
+gonum.org/v1/gonum/blas
+gonum.org/v1/gonum/blas/blas64
+gonum.org/v1/gonum/blas/cblas128
+gonum.org/v1/gonum/blas/gonum
+gonum.org/v1/gonum/floats
+gonum.org/v1/gonum/floats/scalar
+gonum.org/v1/gonum/internal/asm/c128
+gonum.org/v1/gonum/internal/asm/c64
+gonum.org/v1/gonum/internal/asm/f32
+gonum.org/v1/gonum/internal/asm/f64
+gonum.org/v1/gonum/internal/cmplx64
+gonum.org/v1/gonum/internal/math32
+gonum.org/v1/gonum/lapack
+gonum.org/v1/gonum/lapack/gonum
+gonum.org/v1/gonum/lapack/lapack64
+gonum.org/v1/gonum/mat
+gonum.org/v1/gonum/mathext
+gonum.org/v1/gonum/mathext/internal/amos
+gonum.org/v1/gonum/mathext/internal/cephes
+gonum.org/v1/gonum/mathext/internal/gonum
+gonum.org/v1/gonum/optimize
+gonum.org/v1/gonum/spatial/r1
+gonum.org/v1/gonum/stat
+gonum.org/v1/gonum/stat/combin
+gonum.org/v1/gonum/stat/distmv
+gonum.org/v1/gonum/stat/distuv
 # google.golang.org/genproto/googleapis/rpc v0.0.0-20250227231956-55c901821b1e
 ## explicit; go 1.23.0
 google.golang.org/genproto/googleapis/rpc/status

From 04da524254e357c80e2feffdfc6354712979639f Mon Sep 17 00:00:00 2001
From: Francesco Romani
Date: Wed, 16 Jul 2025 19:25:08 +0200
Subject: [PATCH 2/6] WIP: SMT postprocessing

Round the reserved CPU count up to a multiple of the SMT level as an
explicit postprocessing step, applied only after the optimizer result
has been validated, instead of folding the SMT constraint into the
optimization itself.

Signed-off-by: Francesco Romani
---
 .../profilecreator/autosize/autosize.go | 26 ++++++++++++++-----
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/pkg/performanceprofile/profilecreator/autosize/autosize.go b/pkg/performanceprofile/profilecreator/autosize/autosize.go
index 5be9800ec6..5857c7ad39 100644
--- a/pkg/performanceprofile/profilecreator/autosize/autosize.go
+++ b/pkg/performanceprofile/profilecreator/autosize/autosize.go
@@ -215,28 +215,40 @@ func Compute(env Env, params Params) (Values, Score, error) {
 		return params.DefaultAllocation(), Score{}, err
 	}
 
-	smtLevel := params.SMTLevel()
 	totCPUs := params.TotalCPUs()
 
 	score := Score{Cost: result.F}
-	x_cr := int(math.Round(result.Location.X[0]))
-	x_c := asMultipleOf(x_cr, smtLevel)
-	env.Log.Printf("Optimization value: Xc=%v -> Xc=%v (SMTLevel=%v)", x_cr, x_c, smtLevel)
+	x_c := int(math.Round(result.Location.X[0]))
 
-	vals := Values{
+	opt := Values{
 		ReservedCPUCount: x_c,
 		IsolatedCPUCount: totCPUs - x_c, // we can use x_w, but we just leverage invariants
 	}
-	env.Log.Printf("Optimization result: %s", vals.String())
+	env.Log.Printf("Optimization result: %s", opt.String())
 
-	if err := Validate(params, vals); err != nil {
+	if err := Validate(params, opt); err != nil {
 		env.Log.Printf("Optimization invalid: %v", err)
 		return params.DefaultAllocation(), Score{}, err
 	}
 
+	// postprocessing must be done after successful validation
+	vals := postProcess(params, opt)
+	env.Log.Printf("Optimization postprocess. %s => %s", opt.String(), vals.String())
+
 	env.Log.Printf("Optimization done. Score: %v %s totalCPUs=%d", score.String(), vals.String(), totCPUs)
 	return vals, score, nil
 }
 
+func postProcess(params Params, vals Values) Values {
+	Tc := params.TotalCPUs()
+	sl := params.SMTLevel()
+	x_c := asMultipleOf(vals.ReservedCPUCount, sl)
+	ret := Values{
+		ReservedCPUCount: x_c,
+		IsolatedCPUCount: Tc - x_c,
+	}
+	return ret
+}
+
 func asMultipleOf(v, x int) int {
 	r := v % x
 	if r == 0 {

From 6d449223d8fac58b9c49ec32af682da9c9fbf368 Mon Sep 17 00:00:00 2001
From: Francesco Romani
Date: Wed, 16 Jul 2025 16:41:52 +0200
Subject: [PATCH 3/6] autosize: handle SMT in autosizing

Consider the real SMT level when doing autosize computations.

Signed-off-by: Francesco Romani
---
 pkg/performanceprofile/profilecreator/autosize/autosize.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/performanceprofile/profilecreator/autosize/autosize.go b/pkg/performanceprofile/profilecreator/autosize/autosize.go
index 5857c7ad39..916c6f24eb 100644
--- a/pkg/performanceprofile/profilecreator/autosize/autosize.go
+++ b/pkg/performanceprofile/profilecreator/autosize/autosize.go
@@ -91,7 +91,7 @@ func (p Params) SMTLevel() int {
 func (p Params) DefaultControlPlaneCores() int {
 	// intentionally overallocate to have a safe baseline
 	Tc := p.totalCPUs
-	return int(math.Round(float64(Tc) * defaultReservedRatioInitial)) // TODO handle SMT
+	return int(math.Round(float64(Tc) * defaultReservedRatioInitial))
 }
 
 // Get x_c, x_w as initial hardcoded value. Subject to optimization
@@ -147,7 +147,7 @@ func (vals Values) String() string {
 // https://github.com/gonum/gonum/issues/1725
 func Validate(params Params, vals Values) error {
 	Tc := params.TotalCPUs()
-	if vals.ReservedCPUCount < 1 { // TODO handle SMT
+	if vals.ReservedCPUCount < params.SMTLevel() {
 		return ErrUnderallocatedControlPlane
 	}
 	if vals.ReservedCPUCount > int(math.Round((float64(Tc) * defaultReservedRatioMax))) { // works, but likely unacceptable

From cc52c5f1bfd0f60b84fe332ca92de76becca2dab Mon Sep 17 00:00:00 2001
From: Francesco Romani
Date: Wed, 16 Jul 2025 18:30:06 +0200
Subject: [PATCH 4/6] WIP: drop the SMT multiple penalty from the objective

The SMT alignment of the reserved CPU count is now enforced in the
postprocessing step, so the objective no longer needs a hard penalty
for values that are not SMT-aligned; also take the isolated CPU count
directly from the optimizer result instead of deriving it from the
total.

Signed-off-by: Francesco Romani
---
 .../profilecreator/autosize/autosize.go | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/pkg/performanceprofile/profilecreator/autosize/autosize.go b/pkg/performanceprofile/profilecreator/autosize/autosize.go
index 916c6f24eb..1798b8cb3b 100644
--- a/pkg/performanceprofile/profilecreator/autosize/autosize.go
+++ b/pkg/performanceprofile/profilecreator/autosize/autosize.go
@@ -183,9 +183,6 @@ func objective(p Params, x []float64) float64 {
 	// Must use positive CPU values (since gonum/optimize doesn't have simple bounds for all solvers)
 	hardPenalty += defaultPenaltyWeight*math.Pow(math.Max(0, -x_c), 2) + math.Pow(math.Max(0, -x_w), 2)
 
-	// Allocate in multiples of SMT level (usually 2) -- TODO: should be soft?
- hardPenalty += defaultPenaltyWeight * math.Pow(math.Max(0, -float64(int(math.Round(x_c))%p.SMTLevel())), 2) - return target + hardPenalty } @@ -217,11 +214,9 @@ func Compute(env Env, params Params) (Values, Score, error) { totCPUs := params.TotalCPUs() score := Score{Cost: result.F} - x_c := int(math.Round(result.Location.X[0])) - opt := Values{ - ReservedCPUCount: x_c, - IsolatedCPUCount: totCPUs - x_c, // we can use x_w, but we just leverage invariants + ReservedCPUCount: int(math.Round(result.Location.X[0])), + IsolatedCPUCount: int(math.Round(result.Location.X[1])), } env.Log.Printf("Optimization result: %s", opt.String()) From 877818c797bebb3432e7fbd84cd93bd081c61ba4 Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Wed, 16 Jul 2025 17:16:25 +0200 Subject: [PATCH 5/6] WIP: prevalidate and postvalidate Signed-off-by: Francesco Romani --- .../profilecreator/autosize/autosize.go | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/pkg/performanceprofile/profilecreator/autosize/autosize.go b/pkg/performanceprofile/profilecreator/autosize/autosize.go index 1798b8cb3b..e22dff6961 100644 --- a/pkg/performanceprofile/profilecreator/autosize/autosize.go +++ b/pkg/performanceprofile/profilecreator/autosize/autosize.go @@ -38,6 +38,7 @@ const ( ) var ( + ErrInvalidParameters = errors.New("invalid parameters") ErrUnderallocatedControlPlane = errors.New("not enough CPUs for control plane") ErrOverallocatedControlPlane = errors.New("too many CPUs for control plane") ErrInconsistentAllocation = errors.New("inconsistent CPus allocation") @@ -54,6 +55,7 @@ func DefaultEnv() Env { } type Params struct { + DeviceCount int OfflinedCPUCount int UserLevelNetworking bool MachineData *profilecreator.GHWHandler @@ -63,7 +65,7 @@ type Params struct { } func (p Params) String() string { - return fmt.Sprintf("cpus=%d offline=%v SMTLevel=%v", p.totalCPUs, p.OfflinedCPUCount, p.smtLevel) + return fmt.Sprintf("cpus=%d offline=%v SMTLevel=%v devices=%d (userNetworking=%v)", p.totalCPUs, p.OfflinedCPUCount, p.smtLevel, p.DeviceCount, p.UserLevelNetworking) } func setupMachineData(p *Params) error { @@ -143,9 +145,23 @@ func (vals Values) String() string { return fmt.Sprintf("reserved=%v/isolated=%v", vals.ReservedCPUCount, vals.IsolatedCPUCount) } +func CheckParameters(params Params) error { + if params.DeviceCount < 0 { + return ErrInvalidParameters + } + if params.OfflinedCPUCount < 0 { + return ErrInvalidParameters + } + // are we offlining everything? 
we need at least 1 physical core to do any work, including staying alive + if params.OfflinedCPUCount > (params.totalCPUs - params.smtLevel) { + return ErrInvalidParameters + } + return nil +} + // gonum doesn't support bounds yet so we have to make this an explicit step // https://github.com/gonum/gonum/issues/1725 -func Validate(params Params, vals Values) error { +func CheckValues(params Params, vals Values) error { Tc := params.TotalCPUs() if vals.ReservedCPUCount < params.SMTLevel() { return ErrUnderallocatedControlPlane @@ -187,7 +203,11 @@ func objective(p Params, x []float64) float64 { } func Compute(env Env, params Params) (Values, Score, error) { - err := setupMachineData(¶ms) + err := CheckParameters(params) + if err != nil { + return params.DefaultAllocation(), Score{}, err + } + err = setupMachineData(¶ms) if err != nil { env.Log.Printf("Optimization failed: %v", err) return params.DefaultAllocation(), Score{}, err @@ -220,7 +240,7 @@ func Compute(env Env, params Params) (Values, Score, error) { } env.Log.Printf("Optimization result: %s", opt.String()) - if err := Validate(params, opt); err != nil { + if err := CheckValues(params, opt); err != nil { env.Log.Printf("Optimization invalid: %v", err) return params.DefaultAllocation(), Score{}, err } From 6005e98bd3dba329d3c6b27e3cbe6871f17204c2 Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Wed, 16 Jul 2025 17:16:43 +0200 Subject: [PATCH 6/6] WIP: minimal CPU to fit IRQ count Signed-off-by: Francesco Romani --- .../profilecreator/autosize/autosize.go | 18 +++++++++++++++++- .../profilecreator/cmd/root.go | 3 +++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/pkg/performanceprofile/profilecreator/autosize/autosize.go b/pkg/performanceprofile/profilecreator/autosize/autosize.go index e22dff6961..856a339b68 100644 --- a/pkg/performanceprofile/profilecreator/autosize/autosize.go +++ b/pkg/performanceprofile/profilecreator/autosize/autosize.go @@ -30,6 +30,12 @@ import ( // Objective: // We want to maximize x_w, or, equivalently, minimize x_c +const ( + // x86 limit. 256 hardware entries, of those 32 reserved. 256-32 = 224. + // see: https://en.wikipedia.org/wiki/Interrupt_request + maxIRQsPerPhysicalCore int = 224 +) + const ( defaultPenaltyWeight float64 = 100.0 defaultReservedRatioInitial float64 = 0.0625 // 1/16. determined empirically. Use only as initial value. 
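(A quick sanity check of the IRQ-driven floor introduced by this patch: the
MinCPUs helper added in the next hunk is an integer ceiling division by
maxIRQsPerPhysicalCore, so with 224 IRQs per physical core a DeviceCount of
1..224 needs one reserved CPU, 225..448 needs two, and so on; for instance
(500 + 223) / 224 = 3 in integer arithmetic, i.e. ceil(500/224).)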
@@ -65,7 +71,7 @@ type Params struct { } func (p Params) String() string { - return fmt.Sprintf("cpus=%d offline=%v SMTLevel=%v devices=%d (userNetworking=%v)", p.totalCPUs, p.OfflinedCPUCount, p.smtLevel, p.DeviceCount, p.UserLevelNetworking) + return fmt.Sprintf("cpus=%d offline=%v SMTLevel=%v devices=%d (req=%v userNetworking=%v)", p.totalCPUs, p.OfflinedCPUCount, p.smtLevel, p.DeviceCount, p.MinCPUs(), p.UserLevelNetworking) } func setupMachineData(p *Params) error { @@ -82,6 +88,13 @@ func setupMachineData(p *Params) error { return nil } +func (p Params) MinCPUs() int { + if !p.UserLevelNetworking { // TODO explain why + return 0 + } + return (p.DeviceCount + (maxIRQsPerPhysicalCore - 1)) / maxIRQsPerPhysicalCore +} + func (p Params) TotalCPUs() int { return p.totalCPUs } @@ -193,6 +206,9 @@ func objective(p Params, x []float64) float64 { // Don't exceed total CPUs hardPenalty += defaultPenaltyWeight * math.Pow(math.Max(0, x_c+x_w-float64(p.TotalCPUs())), 2) + // Allocate as minimum what is needed to fit the desired amount of devices, thus IRQs + hardPenalty += defaultPenaltyWeight * math.Pow(math.Max(0, float64(p.MinCPUs())-x_c), 2) + // Meet the control plane/infra requirement to avoid the workload to starve hardPenalty += defaultPenaltyWeight * math.Pow(math.Max(0, p.controlPlaneRequirement(x_w)-x_c), 2) diff --git a/pkg/performanceprofile/profilecreator/cmd/root.go b/pkg/performanceprofile/profilecreator/cmd/root.go index 7b70f291ae..a703757678 100644 --- a/pkg/performanceprofile/profilecreator/cmd/root.go +++ b/pkg/performanceprofile/profilecreator/cmd/root.go @@ -171,6 +171,7 @@ func NewRootCommand() *cobra.Command { } if isAutosizeEnabled(pcArgs) { params := autosize.Params{ + DeviceCount: pcArgs.DeviceCount, OfflinedCPUCount: pcArgs.OfflinedCPUCount, UserLevelNetworking: (pcArgs.UserLevelNetworking != nil && *pcArgs.UserLevelNetworking), MachineData: nodesHandlers[0], // assume all nodes equal, pick the easiest @@ -433,6 +434,7 @@ type ProfileCreatorArgs struct { PerPodPowerManagement *bool `json:"per-pod-power-management,omitempty"` EnableHardwareTuning bool `json:"enable-hardware-tuning,omitempty"` Autosize *bool `json:"autosize,omitempty"` + DeviceCount int `json:"device-count,omitempty"` // internal only this argument not passed by the user // but detected automatically createForHypershift bool @@ -454,6 +456,7 @@ func (pca *ProfileCreatorArgs) AddFlags(flags *pflag.FlagSet) { flags.BoolVar(&pca.EnableHardwareTuning, "enable-hardware-tuning", false, "Enable setting maximum cpu frequencies") flags.StringVar(&pca.NodePoolName, "node-pool-name", "", "Node pool name corresponding to the target machines (HyperShift only)") flags.BoolVar(pca.Autosize, "autosize", false, "autosize the control plane") + flags.IntVar(&pca.DeviceCount, "device-count", 0, "Number of expected devices (TODO)") } func makePerformanceProfileFrom(profileData ProfileData) (runtime.Object, error) {
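
Taken together, the series sizes the reserved pool by minimizing a scalar cost
in which each constraint appears as a quadratic penalty term, since
gonum/optimize has no native bound support. The following self-contained toy
sketch shows that pattern; the total, the floor and the weight are
illustrative stand-ins rather than the values used by autosize.go, and
NelderMead is just a convenient derivative-free choice for the sketch, not
necessarily the method Compute uses:

	package main

	import (
		"fmt"
		"math"

		"gonum.org/v1/gonum/optimize"
	)

	func main() {
		const (
			total       = 64.0  // CPUs to split between reserved (xc) and workload (xw)
			minReserved = 4.0   // illustrative floor for the reserved pool
			weight      = 100.0 // penalty weight, in the spirit of defaultPenaltyWeight
		)
		problem := optimize.Problem{
			Func: func(x []float64) float64 {
				xc, xw := x[0], x[1]
				cost := xc                                                // objective: keep the reserved pool small
				cost += weight * math.Pow(xc+xw-total, 2)                 // the partition must add up to the total
				cost += weight * math.Pow(math.Max(0, minReserved-xc), 2) // floor on the reserved pool
				cost += weight * math.Pow(math.Max(0, -xw), 2)            // workload CPUs must stay non-negative
				return cost
			},
		}
		x0 := []float64{total / 16, total - total/16} // mirrors the 1/16 initial ratio
		result, err := optimize.Minimize(problem, x0, nil, &optimize.NelderMead{})
		if err != nil {
			panic(err)
		}
		fmt.Printf("xc=%.2f xw=%.2f cost=%.4f\n", result.Location.X[0], result.Location.X[1], result.F)
	}

The real objective adds more terms of the same max(0, violation)^2 shape
(total-CPU budget, control-plane requirement, IRQ-driven minimum), but the
structure of the minimization is the same.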