diff --git a/requirements.txt b/requirements.txt
index cce66eee5..38a77b4bc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,4 +19,5 @@ rich
 scikit-image
 scipy
 shapely
+numba
 utm
diff --git a/src/mintpy/ifgram_inversion.py b/src/mintpy/ifgram_inversion.py
index 87aef8409..258718652 100644
--- a/src/mintpy/ifgram_inversion.py
+++ b/src/mintpy/ifgram_inversion.py
@@ -15,6 +15,61 @@
 import numpy as np
 from scipy import linalg  # more effieint than numpy.linalg
 
+# optional dependency for JIT acceleration
+try:
+    from numba import njit, prange
+    NUMBA_AVAILABLE = True
+except Exception:  # pragma: no cover - numba is optional
+    NUMBA_AVAILABLE = False
+
+if NUMBA_AVAILABLE:
+
+    @njit(parallel=True)
+    def _calc_inv_quality_tc_numba(G, X, y):
+        num_pair, num_pixel = y.shape
+        num_date = G.shape[1]
+        out = np.zeros(num_pixel, np.float32)
+        for j in prange(num_pixel):
+            real_s = 0.0
+            imag_s = 0.0
+            for i in range(num_pair):
+                dot = 0.0
+                for k in range(num_date):
+                    dot += G[i, k] * X[k, j]
+                e = y[i, j] - dot
+                val = np.exp(1j * e)
+                real_s += val.real
+                imag_s += val.imag
+            out[j] = np.sqrt(real_s ** 2 + imag_s ** 2) / num_pair
+        return out
+
+    @njit(parallel=True)
+    def _calc_inv_quality_residual_numba(G, X, y, e2, weighted):
+        num_pair, num_pixel = y.shape
+        num_date = G.shape[1]
+        out = np.zeros(num_pixel, np.float32)
+        if weighted:
+            for j in prange(num_pixel):
+                sum_e2 = 0.0
+                for i in range(num_pair):
+                    dot = 0.0
+                    for k in range(num_date):
+                        dot += G[i, k] * X[k, j]
+                    e = y[i, j] - dot
+                    sum_e2 += (e.real * e.real + e.imag * e.imag)
+                out[j] = np.sqrt(sum_e2)
+        else:
+            for j in prange(num_pixel):
+                out[j] = np.sqrt(e2[j]) if e2.size > 0 else np.nan
+        return out
+else:  # pragma: no cover - numba not installed
+
+    def _calc_inv_quality_tc_numba(*args, **kwargs):
+        raise ImportError('numba is required for numba acceleration.')
+
+    def _calc_inv_quality_residual_numba(*args, **kwargs):
+        raise ImportError('numba is required for numba acceleration.')
+
 from mintpy.objects import cluster, ifgramStack
 from mintpy.simulation import decorrelation as decor
 from mintpy.utils import ptime, readfile, utils as ut, writefile
@@ -90,7 +145,7 @@ def run_or_skip(inps):
 ################################# Time-series Estimator ###################################
 def estimate_timeseries(A, B, y, tbase_diff, weight_sqrt=None, min_norm_velocity=True,
                         rcond=1e-5, min_redundancy=1., inv_quality_name='temporalCoherence',
-                        print_msg=True):
+                        print_msg=True, use_gpu=False, use_numba=False):
     """Estimate time-series from a stack/network of interferograms with
     Least Square minimization on deformation phase / velocity.
 
@@ -135,20 +190,39 @@ def estimate_timeseries(A, B, y, tbase_diff, weight_sqrt=None, min_norm_velocity
                                     temporalCoherence for phase
                                     residual          for offset
                                     no to turn OFF the calculation
+                use_gpu          - bool, use cupy for GPU acceleration if available
+                use_numba        - bool, use numba for CPU JIT acceleration
     Returns:    ts                - 2D np.ndarray in size of (num_date, num_pixel), phase time-series
                 inv_quality       - 1D np.ndarray in size of (num_pixel), temporal coherence (for phase) or residual (for offset)
                 num_inv_obs       - 1D np.ndarray in size of (num_pixel), number of observations (ifgrams / offsets)
                                     used during the inversion
     """
 
-    y = y.reshape(A.shape[0], -1)
+    xp = np
+    linmod = linalg
+    if use_gpu and use_numba:
+        raise ValueError('use_gpu and use_numba cannot be both True')
+    if use_gpu:
+        try:
+            import cupy as cp
+            xp = cp
+            linmod = cp.linalg
+    elif use_numba and not NUMBA_AVAILABLE:
+        raise ImportError('numba is required for numba acceleration.')
+        except (ImportError, ModuleNotFoundError) as exc:
+            raise ImportError('cupy is required for GPU acceleration.') from exc
+
+    y = xp.asarray(y).reshape(A.shape[0], -1)
     if weight_sqrt is not None:
-        weight_sqrt = weight_sqrt.reshape(A.shape[0], -1)
+        weight_sqrt = xp.asarray(weight_sqrt).reshape(A.shape[0], -1)
+    A = xp.asarray(A)
+    B = xp.asarray(B)
+    tbase_diff = xp.asarray(tbase_diff)
     num_date = A.shape[1] + 1
     num_pixel = y.shape[1]
 
     # initial output value
-    ts = np.zeros((num_date, num_pixel), dtype=np.float32)
+    ts = xp.zeros((num_date, num_pixel), dtype=xp.float32)
     if inv_quality_name == 'residual':
         inv_quality = np.nan
     else:
@@ -178,41 +252,45 @@ def estimate_timeseries(A, B, y, tbase_diff, weight_sqrt=None, min_norm_velocity
         if min_norm_velocity:
             ##### min-norm velocity
             if weight_sqrt is not None:
-                X, e2 = linalg.lstsq(np.multiply(B, weight_sqrt),
-                                     np.multiply(y, weight_sqrt),
-                                     cond=rcond)[:2]
+                X, e2 = linmod.lstsq(xp.multiply(B, weight_sqrt),
+                                     xp.multiply(y, weight_sqrt),
+                                     rcond=rcond)[:2]
             else:
-                X, e2 = linalg.lstsq(B, y, cond=rcond)[:2]
+                X, e2 = linmod.lstsq(B, y, rcond=rcond)[:2]
 
             # calc inversion quality
             if inv_quality_name != 'no':
                 inv_quality = calc_inv_quality(B, X, y, e2,
                                                inv_quality_name=inv_quality_name,
                                                weight_sqrt=weight_sqrt,
-                                               print_msg=print_msg)
+                                               print_msg=print_msg,
+                                               use_gpu=use_gpu,
+                                               use_numba=use_numba)
 
             # assemble time-series
-            ts_diff = X * np.tile(tbase_diff, (1, num_pixel))
-            ts[1:, :] = np.cumsum(ts_diff, axis=0)
+            ts_diff = X * xp.tile(tbase_diff, (1, num_pixel))
+            ts[1:, :] = xp.cumsum(ts_diff, axis=0)
 
         else:
             ##### min-norm displacement
             if weight_sqrt is not None:
-                X, e2 = linalg.lstsq(np.multiply(A, weight_sqrt),
-                                     np.multiply(y, weight_sqrt),
-                                     cond=rcond)[:2]
+                X, e2 = linmod.lstsq(xp.multiply(A, weight_sqrt),
+                                     xp.multiply(y, weight_sqrt),
+                                     rcond=rcond)[:2]
             else:
-                X, e2 = linalg.lstsq(A, y, cond=rcond)[:2]
+                X, e2 = linmod.lstsq(A, y, rcond=rcond)[:2]
 
             # calc inversion quality
             if inv_quality_name != 'no':
                 inv_quality = calc_inv_quality(A, X, y, e2,
                                                inv_quality_name=inv_quality_name,
                                                weight_sqrt=weight_sqrt,
-                                               print_msg=print_msg)
+                                               print_msg=print_msg,
+                                               use_gpu=use_gpu,
+                                               use_numba=use_numba)
 
             # assemble time-series
-            ts[1: ,:] = X
+            ts[1:, :] = X
 
     except linalg.LinAlgError:
         pass
@@ -220,6 +298,11 @@ def estimate_timeseries(A, B, y, tbase_diff, weight_sqrt=None, min_norm_velocity
     # number of observations used for inversion
     num_inv_obs = A.shape[0]
 
+    if use_gpu:
+        ts = xp.asnumpy(ts)
+        if isinstance(inv_quality, xp.ndarray):
+            inv_quality = xp.asnumpy(inv_quality)
+
     return ts, inv_quality, num_inv_obs
 
 
@@ -284,7 +367,9 @@ def skip_invalid_obs(obs, mat_list):
     return obs, mat_list
 
 
-def calc_inv_quality(G, X, y, e2, inv_quality_name='temporalCoherence', weight_sqrt=None, print_msg=True):
+def calc_inv_quality(G, X, y, e2, inv_quality_name='temporalCoherence',
+                     weight_sqrt=None, print_msg=True, use_gpu=False,
+                     use_numba=False):
     """Calculate the inversion quality of the time series estimation.
 
     Parameters: G                - 2D np.ndarray in size of (num_pair, num_date-1), design matrix A or B
@@ -294,13 +379,50 @@ def calc_inv_quality(G, X, y, e2, inv_quality_name='temporalCoherence', weight_s
                 inv_quality_name - str, name of the inversion quality parameter
                                    temporalCoherence for phase
                                    residual          for offset
-                weight_sqrt      - 2D np.ndarray in size of (num_pair, num_pixel),
+               weight_sqrt      - 2D np.ndarray in size of (num_pair, num_pixel),
                                    weight square root, None for un-weighted estimation.
+               use_gpu          - bool, use cupy for GPU acceleration if available
+               use_numba        - bool, use numba for CPU JIT acceleration
     Returns:    inv_quality      - 1D np.ndarray in size of (num_pixel), temporalCoherence / residual
     """
 
+    xp = np
+    if use_gpu:
+        try:
+            import cupy as cp
+            xp = cp
+        except Exception as exc:
+            raise ImportError('cupy is required for GPU acceleration.') from exc
+    elif use_numba and not NUMBA_AVAILABLE:
+        raise ImportError('numba is required for numba acceleration.')
+
+    G = xp.asarray(G)
+    X = xp.asarray(X)
+    y = xp.asarray(y)
+    e2 = xp.asarray(e2)
+    if weight_sqrt is not None:
+        weight_sqrt = xp.asarray(weight_sqrt)
+
+    if use_numba:
+        # ensure numpy arrays for numba
+        Gn = np.asarray(G)
+        Xn = np.asarray(X)
+        yn = np.asarray(y)
+        e2n = np.asarray(e2)
+        weight_sqrtn = np.asarray(weight_sqrt) if weight_sqrt is not None else None
+
     num_pair, num_pixel = y.shape
-    inv_quality = np.zeros(num_pixel, dtype=np.float32)
+    inv_quality = xp.zeros(num_pixel, dtype=xp.float32)
+
+    if use_numba:
+        if inv_quality_name == 'temporalCoherence':
+            inv_quality = _calc_inv_quality_tc_numba(Gn, Xn, yn)
+        elif inv_quality_name == 'residual':
+            inv_quality = _calc_inv_quality_residual_numba(Gn, Xn, yn, e2n,
+                                                          weight_sqrtn is not None)
+        else:
+            raise ValueError(f'un-recognized inversion quality name: {inv_quality_name}')
+        return inv_quality
 
     # chunk_size as the number of pixels
     chunk_size = int(ut.round_to_1(2e5 / num_pair))
@@ -317,19 +439,19 @@ def calc_inv_quality(G, X, y, e2, inv_quality_name='temporalCoherence', weight_s
 
             if inv_quality_name == 'temporalCoherence':
                 #for phase
-                e = y[:, c0:c1] - np.dot(G, X[:, c0:c1])
-                inv_quality[c0:c1] = np.abs(np.sum(np.exp(1j*e), axis=0)) / num_pair
+                e = y[:, c0:c1] - xp.dot(G, X[:, c0:c1])
+                inv_quality[c0:c1] = xp.abs(xp.sum(xp.exp(1j*e), axis=0)) / num_pair
 
             elif inv_quality_name == 'residual':
                 #for offset
                 if weight_sqrt is not None:
                     # calculate the un-weighted residual for the weighted inversion
-                    e = y[:, c0:c1] - np.dot(G, X[:, c0:c1])
-                    inv_quality[c0:c1] = np.sqrt(np.sum(np.abs(e) ** 2, axis=0))
+                    e = y[:, c0:c1] - xp.dot(G, X[:, c0:c1])
+                    inv_quality[c0:c1] = xp.sqrt(xp.sum(xp.abs(e) ** 2, axis=0))
 
                 else:
                     # use the un-weighted residual directly
-                    inv_quality[c0:c1] = np.sqrt(e2[c0:c1]) if e2[c0:c1].size > 0 else np.nan
+                    inv_quality[c0:c1] = xp.sqrt(e2[c0:c1]) if e2[c0:c1].size > 0 else xp.nan
 
             # print out message
             chunk_step = max(1, int(ut.round_to_1(num_chunk / 5)))
@@ -339,26 +461,105 @@ def calc_inv_quality(G, X, y, e2, inv_quality_name='temporalCoherence', weight_s
     else:
         if inv_quality_name == 'temporalCoherence':
             #for phase
-            e = y - np.dot(G, X)
-            inv_quality = np.abs(np.sum(np.exp(1j*e), axis=0)) / num_pair
+            e = y - xp.dot(G, X)
+            inv_quality = xp.abs(xp.sum(xp.exp(1j*e), axis=0)) / num_pair
 
         elif inv_quality_name == 'residual':
             #for offset
             if weight_sqrt is not None:
                 # calculate the un-weighted residual for the weighted inversion
-                e = y - G.dot(X)
-                inv_quality = np.sqrt(np.sum(np.abs(e) ** 2, axis=0))
+                e = y - xp.dot(G, X)
+                inv_quality = xp.sqrt(xp.sum(xp.abs(e) ** 2, axis=0))
 
             else:
                 # use the un-weighted residual directly
-                inv_quality = np.sqrt(e2) if e2.size > 0 else np.nan
+                inv_quality = xp.sqrt(e2) if e2.size > 0 else xp.nan
 
         else:
             raise ValueError(f'un-recognized inversion quality name: {inv_quality_name}')
 
+    if use_gpu:
+        inv_quality = xp.asnumpy(inv_quality)
     return inv_quality
 
 
+def benchmark_gpu_speedup(A, B, y, tbase_diff, weight_sqrt=None,
+                          min_norm_velocity=True, rcond=1e-5,
+                          min_redundancy=1., inv_quality_name='temporalCoherence'):
+    """Benchmark the speed difference between CPU and GPU modes.
+
+    This function runs :func:`estimate_timeseries` twice: once on CPU and once
+    on GPU (using CuPy).  It reports the runtime of each run and returns the
+    speedup factor (``CPU_time / GPU_time``).  If CuPy is not installed an
+    ``ImportError`` will be raised from ``estimate_timeseries``.
+
+    Returns
+    -------
+    float
+        Speedup factor comparing CPU and GPU execution time.
+    """
+
+    # CPU run
+    t0 = time.perf_counter()
+    estimate_timeseries(A, B, y, tbase_diff,
+                        weight_sqrt=weight_sqrt,
+                        min_norm_velocity=min_norm_velocity,
+                        rcond=rcond, min_redundancy=min_redundancy,
+                        inv_quality_name=inv_quality_name,
+                        print_msg=False, use_gpu=False)
+    cpu_time = time.perf_counter() - t0
+
+    # GPU run
+    t0 = time.perf_counter()
+    estimate_timeseries(A, B, y, tbase_diff,
+                        weight_sqrt=weight_sqrt,
+                        min_norm_velocity=min_norm_velocity,
+                        rcond=rcond, min_redundancy=min_redundancy,
+                        inv_quality_name=inv_quality_name,
+                        print_msg=False, use_gpu=True)
+    gpu_time = time.perf_counter() - t0
+
+    speedup = cpu_time / gpu_time if gpu_time > 0 else np.nan
+    print(f'CPU time: {cpu_time:.3f}s, GPU time: {gpu_time:.3f}s, speedup: {speedup:.2f}x')
+    return speedup
+
+
+def benchmark_numba_speedup(A, B, y, tbase_diff, weight_sqrt=None,
+                            min_norm_velocity=True, rcond=1e-5,
+                            min_redundancy=1., inv_quality_name='temporalCoherence'):
+    """Benchmark the speed difference between standard and numba-accelerated modes.
+
+    This function runs :func:`estimate_timeseries` twice: once without numba and
+    once using numba JIT acceleration. It reports the runtime of each run and
+    returns the speedup factor (``CPU_time / Numba_time``).  If Numba is not
+    installed an ``ImportError`` will be raised from ``estimate_timeseries``.
+    """
+
+    # CPU run
+    t0 = time.perf_counter()
+    estimate_timeseries(A, B, y, tbase_diff,
+                        weight_sqrt=weight_sqrt,
+                        min_norm_velocity=min_norm_velocity,
+                        rcond=rcond, min_redundancy=min_redundancy,
+                        inv_quality_name=inv_quality_name,
+                        print_msg=False, use_gpu=False, use_numba=False)
+    cpu_time = time.perf_counter() - t0
+
+    # numba run
+    t0 = time.perf_counter()
+    estimate_timeseries(A, B, y, tbase_diff,
+                        weight_sqrt=weight_sqrt,
+                        min_norm_velocity=min_norm_velocity,
+                        rcond=rcond, min_redundancy=min_redundancy,
+                        inv_quality_name=inv_quality_name,
+                        print_msg=False, use_gpu=False, use_numba=True)
+    nb_time = time.perf_counter() - t0
+
+    speedup = cpu_time / nb_time if nb_time > 0 else np.nan
+    print(f'CPU time: {cpu_time:.3f}s, Numba time: {nb_time:.3f}s, speedup: {speedup:.2f}x')
+    return speedup
+
+
 
 ###################################### File IO ############################################
 def check_design_matrix(ifgram_file, weight_func='var'):