From 7d0d1c05770ed592518762c0133b18222ed44d2f Mon Sep 17 00:00:00 2001
From: Sami Farin <hvtaifwkbgefbaei@gmail.com>
Date: Wed, 31 Dec 2025 09:28:39 +0200
Subject: [PATCH 1/3] Make some calculations 250-500 times faster
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before:
$ time ./fend 2^1555.555|b3sum
0567ac5de698ba87f79145ebf29c1ab169d37b185e76ddd4b8ecc2de2c82d4de  -

real	1m42,334s
user	1m41,908s
sys	0m0,032s

Afer:
$ time ./fend 2^1555.555|b3sum
0567ac5de698ba87f79145ebf29c1ab169d37b185e76ddd4b8ecc2de2c82d4de  -

real	0m0,401s
user	0m0,401s
sys	0m0,004s

And for the gcd:

Before:
$ time ./fend 0.9911^192|b3sum
315e6e5e5b9716413c0e6cbe347ed8a7e2923b3cec6290db7d88ff9710aea09e  -

real	0m19,810s
user	0m19,757s
sys	0m0,005s

After:
$ time ./fend 0.9911^192|b3sum
315e6e5e5b9716413c0e6cbe347ed8a7e2923b3cec6290db7d88ff9710aea09e  -

real	0m0,049s
user	0m0,040s

System information: rust 1.92.0, LLVM 20.1.8 x86_64-redhat-linux-gnu,
Intel i5-13600K, opt-level=3, ~/.cargo/config.toml contents:
[build]
rustflags = ["-C", "target-cpu=native"]

Functions changed:

add_assign_internal: Replaces high-level get/set overhead with
efficient, vectorizable slice iteration using .zip() to eliminate bounds
checks in the hot loop.

sub: Optimizes borrowing logic using u128 overflow detection and safely
clamps loop ranges to handle unnormalized inputs without panicking.

sub_assign: Performs subtraction in-place on mutable buffers to
eliminate expensive memory allocations during the Karatsuba
recombination step.

mul: Dispatches to Karatsuba multiplication for inputs larger than 64
limbs (4096 bits) to reduce algorithmic complexity from
O(N²) to O(N^¹·⁵⁸⁵).

mul_karatsuba_slice: Recursively computes Karatsuba products using
&[u64] slices instead of cloning BigUint vectors, significantly reducing
memory churn.

mul_internal_slice: Provides a highly optimized Schoolbook (O(N²))
multiplication base case that operates directly on slices for maximum
cache efficiency.

div_rem_knuth (called by divmod): Replaces slow bit-by-bit binary
division with Knuth's Algorithm D (base-2⁶⁴), reducing the number of
division steps by a factor of 64.

root_n: Replaces Binary Search (linear convergence) with Newton's Method
(quadratic convergence), reducing iteration count from thousands to
dozens for large roots.

lshift / rshift: Moves interrupt checks outside the loop and simplifies
carry logic to allow the compiler to generate efficient block memory
moves.

add_assign_shifted: A specialized helper for Karatsuba that performs a
"shift-and-add" operation in one pass without creating intermediate
shifted values.

gcd: Replaces the division-heavy Euclidean algorithm with Stein's
Algorithm (Binary GCD), utilizing efficient bitwise shifts and
subtraction to eliminate expensive modulo operations during fraction
simplification.
---
 core/src/num/biguint.rs | 798 +++++++++++++++++++++++++++++++++-------
 1 file changed, 667 insertions(+), 131 deletions(-)
diff --git a/core/src/num/biguint.rs b/core/src/num/biguint.rs
index 2687e4fb..c0487ed3 100644
--- a/core/src/num/biguint.rs
+++ b/core/src/num/biguint.rs
@@ -217,16 +217,6 @@ impl BigUint {
 		}
 	}
 
-	pub(crate) fn gcd<I: Interrupt>(mut a: Self, mut b: Self, int: &I) -> FResult<Self> {
-		while b >= 1.into() {
-			let r = a.rem(&b, int)?;
-			a = b;
-			b = r;
-		}
-
-		Ok(a)
-	}
-
 	pub(crate) fn pow<I: Interrupt>(a: &Self, b: &Self, int: &I) -> FResult<Self> {
 		if a.is_zero() && b.is_zero() {
 			return Err(FendError::ZeroToThePowerOfZero);
@@ -242,37 +232,64 @@ impl BigUint {
 
 	// computes the exact `n`-th root if possible, otherwise the next lower integer
 	pub(crate) fn root_n<I: Interrupt>(self, n: &Self, int: &I) -> FResult<Exact<Self>> {
-		if self == 0.into() || self == 1.into() || n == &Self::from(1) {
+		if self.is_zero() {
 			return Ok(Exact::new(self, true));
 		}
-		if n.value_len() > 1 {
-			return Err(FendError::OutOfRange {
-				value: Box::new(n.format(&FormatOptions::default(), int)?.value),
-				range: Range {
-					start: RangeBound::Closed(Box::new(1)),
-					end: RangeBound::Closed(Box::new(u64::MAX)),
-				},
-			});
+		if n.is_zero() {
+			return Err(FendError::DivideByZero);
 		}
-		let max_bits = self.bits() / n.get(0) + 1;
-		let mut low_guess = Self::from(1);
-		let mut high_guess = Small(1).lshift_n(&(max_bits + 1).into(), int)?;
-		let result = loop {
-			test_int(int)?;
-			let mut guess = low_guess.clone().add(&high_guess);
-			guess.rshift(int)?;
-
-			let res = Self::pow(&guess, n, int)?;
-			match res.cmp(&self) {
-				Ordering::Equal => break Exact::new(guess, true),
-				Ordering::Greater => high_guess = guess,
-				Ordering::Less => low_guess = guess,
+		if n == &Self::from(1) {
+			return Ok(Exact::new(self, true));
+		}
+
+		let Ok(n_usize) = n.try_as_usize(int) else {
+			// n is huge (larger than usize::MAX).
+			// root is 1 (unless self is 0 or 1, handled above).
+			return Ok(Exact::new(Self::from(1), false));
+		};
+
+		let self_bits = self.bits();
+		if self_bits < n_usize as u64 {
+			if self == Self::from(1) {
+				return Ok(Exact::new(Self::from(1), true));
 			}
-			if high_guess.clone().sub(&low_guess) <= 1.into() {
-				break Exact::new(low_guess, false);
+			return Ok(Exact::new(Self::from(1), false));
+		}
+
+		// Initial guess: 2 ^ ceil(bits / n)
+		let guess_bits = self_bits.div_ceil(n_usize as u64);
+		let mut x = Small(1).lshift_n(&guess_bits.into(), int)?;
+
+		let n_minus_1 = Self::from((n_usize - 1) as u64);
+
+		loop {
+			test_int(int)?;
+
+			// Newton iteration: x_new = ( (n-1)x + self/x^(n-1) ) / n
+
+			// 1. Calculate x^(n-1)
+			let x_pow_n_minus_1 = Self::pow(&x, &n_minus_1, int)?;
+
+			// 2. Calculate self / x^(n-1)
+			// We discard the remainder (the .1 tuple element).
+			let quotient = self.divmod(&x_pow_n_minus_1, int)?.0;
+
+			// 3. Calculate numerator: (n-1)x + quotient
+			let term1 = x.clone().mul(&n_minus_1, int)?;
+			let numerator = term1.add(&quotient);
+
+			// 4. Calculate x_new: numerator / n
+			let x_new = numerator.div(n, int)?;
+
+			// 5. Check convergence
+			// For integer Newton method starting from above, we stop when x_new >= x
+			if x_new >= x {
+				let check_pow = Self::pow(&x, n, int)?;
+				return Ok(Exact::new(x, check_pow == self));
 			}
-		};
-		Ok(result)
+
+			x = x_new;
+		}
 	}
 
 	fn pow_internal<I: Interrupt>(&self, mut exponent: u64, int: &I) -> FResult<Self> {
@@ -290,6 +307,9 @@ impl BigUint {
 	}
 
 	fn lshift<I: Interrupt>(&mut self, int: &I) -> FResult<()> {
+		// Check for interrupts once per call, not per limb.
+		test_int(int)?;
+
 		match self {
 			Small(n) => {
 				if *n & 0xc000_0000_0000_0000 == 0 {
@@ -299,15 +319,16 @@ impl BigUint {
 				}
 			}
 			Large(value) => {
-				if value[value.len() - 1] & (1_u64 << 63) != 0 {
-					value.push(0);
+				// Use a forward pass.
+				// This is cache-friendly and allows the compiler to vectorise.
+				let mut carry = 0;
+				for elem in value.iter_mut() {
+					let new_carry = *elem >> 63;
+					*elem = (*elem << 1) | carry;
+					carry = new_carry;
 				}
-				for i in (0..value.len()).rev() {
-					test_int(int)?;
-					value[i] <<= 1;
-					if i != 0 {
-						value[i] |= value[i - 1] >> 63;
-					}
+				if carry != 0 {
+					value.push(carry);
 				}
 			}
 		}
@@ -315,18 +336,20 @@ impl BigUint {
 	}
 
 	fn rshift<I: Interrupt>(&mut self, int: &I) -> FResult<()> {
+		// Check for interrupts once per call, not per limb.
+		test_int(int)?;
+
 		match self {
 			Small(n) => *n >>= 1,
 			Large(value) => {
-				for i in 0..value.len() {
-					test_int(int)?;
-					value[i] >>= 1;
-					let next = if i + 1 >= value.len() {
-						0
-					} else {
-						value[i + 1]
-					};
-					value[i] |= next << 63;
+				let len = value.len();
+				if len > 0 {
+					// Forward pass: value[i] takes the high bit of value[i+1]
+					for i in 0..len - 1 {
+						value[i] = (value[i] >> 1) | (value[i + 1] << 63);
+					}
+					// Handle the most significant limb
+					value[len - 1] >>= 1;
 				}
 			}
 		}
@@ -334,86 +357,484 @@ impl BigUint {
 	}
 
 	pub(crate) fn divmod<I: Interrupt>(&self, other: &Self, int: &I) -> FResult<(Self, Self)> {
+		if other.is_zero() {
+			return Err(FendError::DivideByZero);
+		}
+		if self.is_zero() {
+			return Ok((0.into(), 0.into()));
+		}
+		if self < other {
+			return Ok((0.into(), self.clone()));
+		}
+		if other == &Self::from(1) {
+			return Ok((self.clone(), 0.into()));
+		}
+
 		if let (Small(a), Small(b)) = (self, other) {
 			if let (Some(div_res), Some(mod_res)) = (a.checked_div(*b), a.checked_rem(*b)) {
 				return Ok((Small(div_res), Small(mod_res)));
 			}
 			return Err(FendError::DivideByZero);
 		}
-		if other.is_zero() {
-			return Err(FendError::DivideByZero);
+
+		self.div_rem_knuth(other, int)
+	}
+
+	fn trailing_zeros(&self) -> u64 {
+		match self {
+			Small(n) => u64::from(n.trailing_zeros()),
+			Large(v) => {
+				let mut count = 0;
+				for &digit in v {
+					if digit == 0 {
+						count += 64;
+					} else {
+						count += u64::from(digit.trailing_zeros());
+						break;
+					}
+				}
+				count
+			}
 		}
-		if other == &Self::from(1) {
-			return Ok((self.clone(), Self::from(0)));
+	}
+
+	fn rshift_u64(&mut self, n: u64) {
+		if n == 0 {
+			return;
 		}
-		if self.is_zero() {
-			return Ok((Self::from(0), Self::from(0)));
+		match self {
+			Small(v) => {
+				if n >= 64 {
+					*v = 0;
+				} else {
+					*v >>= n;
+				}
+			}
+			Large(v) => {
+				let shift_limbs = (n / 64) as usize;
+				let shift_bits = (n % 64) as u32;
+
+				if shift_limbs >= v.len() {
+					*self = Small(0);
+					return;
+				}
+
+				// Shift limbs (whole words) - remove LSBs
+				if shift_limbs > 0 {
+					v.drain(0..shift_limbs);
+				}
+
+				// Shift bits within limbs
+				if shift_bits > 0 {
+					let len = v.len();
+					let inv_shift = 64 - shift_bits;
+					for i in 0..len - 1 {
+						v[i] = (v[i] >> shift_bits) | (v[i + 1] << inv_shift);
+					}
+					v[len - 1] >>= shift_bits;
+				}
+
+				// Clean up
+				while v.last() == Some(&0) {
+					v.pop();
+				}
+				if v.is_empty() {
+					*self = Small(0);
+				}
+			}
 		}
-		if self < other {
-			return Ok((Self::from(0), self.clone()));
+	}
+
+	fn lshift_u64(&mut self, n: u64) {
+		if n == 0 {
+			return;
+		}
+		match self {
+			Small(v) => {
+				// If it fits in Small, stay Small
+				if n < 64 && u64::from(v.leading_zeros()) >= n {
+					*v <<= n;
+				} else {
+					// promote to Large
+					let mut large = Large(vec![*v]);
+					large.lshift_u64(n);
+					*self = large;
+				}
+			}
+			Large(v) => {
+				let shift_limbs = (n / 64) as usize;
+				let shift_bits = (n % 64) as u32;
+
+				// 1. Shift bits
+				if shift_bits > 0 {
+					let mut carry = 0;
+					let inv_shift = 64 - shift_bits;
+					for x in v.iter_mut() {
+						let next_carry = *x >> inv_shift;
+						*x = (*x << shift_bits) | carry;
+						carry = next_carry;
+					}
+					if carry != 0 {
+						v.push(carry);
+					}
+				}
+
+				// 2. Prepend zero limbs (LSBs)
+				if shift_limbs > 0 {
+					let mut new_vec = vec![0; shift_limbs];
+					new_vec.append(v);
+					*v = new_vec;
+				}
+			}
 		}
-		if self == other {
-			return Ok((Self::from(1), Self::from(0)));
+	}
+
+	// Stein's Algorithm (Binary GCD)
+	pub(crate) fn gcd<I: Interrupt>(mut self, mut other: Self, int: &I) -> FResult<Self> {
+		// 1. Handle base cases
+		if self.is_zero() {
+			return Ok(other);
 		}
-		if other == &Self::from(2) {
-			let mut div_result = self.clone();
-			div_result.rshift(int)?;
-			let modulo = self.get(0) & 1;
-			return Ok((div_result, Self::from(modulo)));
+		if other.is_zero() {
+			return Ok(self);
 		}
-		// binary long division
-		let mut q = Self::from(0);
-		let mut r = Self::from(0);
-		for i in (0..self.value_len()).rev() {
+
+		// 2. Find common power of 2
+		let u_zeros = self.trailing_zeros();
+		let v_zeros = other.trailing_zeros();
+		let common_shift = std::cmp::min(u_zeros, v_zeros);
+
+		// 3. Remove factors of 2 from u and v
+		self.rshift_u64(u_zeros);
+		other.rshift_u64(v_zeros);
+
+		// 4. Main Loop
+		loop {
+			// Invariant: self and other are both odd at the start of loop body
 			test_int(int)?;
-			for j in (0..64).rev() {
-				r.lshift(int)?;
-				let bit_of_self = u64::from((self.get(i) & (1 << j)) != 0);
-				r.set(0, r.get(0) | bit_of_self);
-				if &r >= other {
-					r = r.sub(other);
-					q.set(i, q.get(i) | (1 << j));
+
+			match self.cmp(&other) {
+				Ordering::Equal => break,
+				Ordering::Greater => {
+					self.sub_assign(&other);
+					let zeros = self.trailing_zeros();
+					self.rshift_u64(zeros);
+				}
+				Ordering::Less => {
+					std::mem::swap(&mut self, &mut other);
+					self.sub_assign(&other);
+					let zeros = self.trailing_zeros();
+					self.rshift_u64(zeros);
 				}
 			}
 		}
-		Ok((q, r))
+
+		// 5. Restore common power of 2
+		self.lshift_u64(common_shift);
+		Ok(self)
 	}
 
-	/// computes self *= other
-	fn mul_internal<I: Interrupt>(&mut self, other: &Self, int: &I) -> FResult<()> {
-		if self.is_zero() || other.is_zero() {
-			*self = Self::from(0);
-			return Ok(());
+	// Knuth's Algorithm D (The Art of Computer Programming, Vol 2, 4.3.1)
+	#[allow(clippy::cast_possible_truncation)]
+	fn div_rem_knuth<I: Interrupt>(&self, other: &Self, int: &I) -> FResult<(Self, Self)> {
+		let mut dividend_digits = match self {
+			Small(n) => vec![*n],
+			Large(v) => v.clone(),
+		};
+		let mut divisor_digits = match other {
+			Small(n) => vec![*n],
+			Large(v) => v.clone(),
+		};
+
+		// Sanitize inputs by removing leading zero limbs (trailing in LE vector).
+		while dividend_digits.len() > 1 && dividend_digits.last() == Some(&0) {
+			dividend_digits.pop();
 		}
-		let self_clone = self.clone();
-		self.make_large();
-		match self {
-			Small(_) => unreachable!(),
-			Large(v) => {
-				v.clear();
-				v.push(0);
+		while divisor_digits.len() > 1 && divisor_digits.last() == Some(&0) {
+			divisor_digits.pop();
+		}
+
+		let divisor_len = divisor_digits.len();
+		// shift = number of leading zeros in the most significant digit of the divisor
+		let shift = divisor_digits.last().unwrap().leading_zeros();
+
+		// Normalize divisor (shift left so MSB is 1)
+		if shift > 0 {
+			let mut carry = 0;
+			for digit in &mut divisor_digits {
+				let next_carry = *digit >> (64 - shift);
+				*digit = (*digit << shift) | carry;
+				carry = next_carry;
+			}
+		}
+
+		// Normalize dividend (shift left by same amount)
+		if shift > 0 {
+			let mut carry = 0;
+			for digit in &mut dividend_digits {
+				let next_carry = *digit >> (64 - shift);
+				*digit = (*digit << shift) | carry;
+				carry = next_carry;
+			}
+			if carry != 0 {
+				dividend_digits.push(carry);
 			}
 		}
-		for i in 0..other.value_len() {
+
+		// Ensure dividend has a leading zero digit for the algorithm to work comfortably
+		dividend_digits.push(0);
+
+		let delta_len = dividend_digits.len() - divisor_len - 1;
+		let mut quotient_digits = vec![0; delta_len + 1];
+		let divisor_msb = divisor_digits[divisor_len - 1];
+
+		// Iterate 'j' from m down to 0 -> offset
+		for offset in (0..=delta_len).rev() {
 			test_int(int)?;
-			self.add_assign_internal(&self_clone, other.get(i), i);
+
+			let dividend_high = dividend_digits[offset + divisor_len];
+			let dividend_low = dividend_digits[offset + divisor_len - 1];
+
+			// Estimate q_hat (quotient digit)
+			let mut quotient_estimate = if dividend_high == divisor_msb {
+				u64::MAX
+			} else {
+				let numerator = (u128::from(dividend_high) << 64) | u128::from(dividend_low);
+				(numerator / u128::from(divisor_msb)) as u64
+			};
+
+			// Refine q_hat
+			let divisor_second = if divisor_len >= 2 {
+				divisor_digits[divisor_len - 2]
+			} else {
+				0
+			};
+			let dividend_third = if offset + divisor_len >= 2 {
+				dividend_digits[offset + divisor_len - 2]
+			} else {
+				0
+			};
+
+			let mut remainder = ((u128::from(dividend_high) << 64) | u128::from(dividend_low))
+				.wrapping_sub(u128::from(quotient_estimate) * u128::from(divisor_msb));
+
+			while u128::from(divisor_second) * u128::from(quotient_estimate)
+				> (remainder << 64) | u128::from(dividend_third)
+			{
+				quotient_estimate -= 1;
+				remainder = remainder.wrapping_add(u128::from(divisor_msb));
+				if remainder > u128::from(u64::MAX) {
+					break;
+				}
+			}
+
+			// Multiply and subtract: dividend[offset..] -= quotient_estimate * divisor
+			let mut borrow: u128 = 0;
+			for (i, &divisor_digit) in divisor_digits.iter().enumerate() {
+				let product = u128::from(divisor_digit) * u128::from(quotient_estimate) + borrow;
+				let sub_val = product as u64;
+				borrow = product >> 64;
+
+				let (res, borrowed) = dividend_digits[offset + i].overflowing_sub(sub_val);
+				dividend_digits[offset + i] = res;
+				if borrowed {
+					borrow += 1;
+				}
+			}
+			// Propagate borrow to the top digit
+			let (res, borrowed) =
+				dividend_digits[offset + divisor_len].overflowing_sub(borrow as u64);
+			dividend_digits[offset + divisor_len] = res;
+
+			// If we subtracted too much (borrowed from top), correct by adding back divisor
+			if borrowed {
+				quotient_estimate -= 1;
+				let mut carry: u128 = 0;
+				for (i, &divisor_digit) in divisor_digits.iter().enumerate() {
+					let sum =
+						u128::from(dividend_digits[offset + i]) + u128::from(divisor_digit) + carry;
+					dividend_digits[offset + i] = sum as u64;
+					carry = sum >> 64;
+				}
+				dividend_digits[offset + divisor_len] =
+					dividend_digits[offset + divisor_len].wrapping_add(carry as u64);
+			}
+
+			quotient_digits[offset] = quotient_estimate;
 		}
-		Ok(())
+
+		// Denormalize remainder (shift right)
+		if shift > 0 {
+			let mut next_high_bits = 0;
+			// Iterate from MSB to LSB
+			for digit in dividend_digits.iter_mut().rev() {
+				let val = *digit;
+				*digit = (val >> shift) | next_high_bits;
+				next_high_bits = val << (64 - shift);
+			}
+		}
+
+		let mut final_quotient = Self::Large(quotient_digits);
+		final_quotient.reduce();
+		let mut final_remainder = Self::Large(dividend_digits);
+		final_remainder.reduce();
+
+		Ok((final_quotient, final_remainder))
 	}
 
 	/// computes `self += (other * mul_digit) << (64 * shift)`
+	#[allow(clippy::cast_possible_truncation)]
 	fn add_assign_internal(&mut self, other: &Self, mul_digit: u64, shift: usize) {
-		let mut carry = 0;
-		for i in 0..max(self.value_len(), other.value_len() + shift) {
-			let a = self.get(i);
-			let b = if i >= shift { other.get(i - shift) } else { 0 };
-			let sum = u128::from(a) + (u128::from(b) * u128::from(mul_digit)) + u128::from(carry);
-			self.set(i, truncate(sum));
-			carry = truncate(sum >> 64);
+		if mul_digit == 0 {
+			return;
+		}
+
+		self.make_large();
+		let self_vec = match self {
+			Large(v) => v,
+			Small(_) => unreachable!(),
+		};
+
+		let other_slice = match other {
+			Large(v) => v.as_slice(),
+			Small(v) => std::slice::from_ref(v),
+		};
+
+		let min_len = shift + other_slice.len();
+		if self_vec.len() < min_len {
+			self_vec.resize(min_len, 0);
 		}
-		if carry != 0 {
-			self.value_push(carry);
+
+		let mut carry: u128 = 0;
+		let mul_digit = u128::from(mul_digit);
+
+		// Use zip to iterate specified slices.
+		let target_slice = &mut self_vec[shift..];
+		for (target, &source) in target_slice.iter_mut().zip(other_slice) {
+			let a = u128::from(*target);
+			let b = u128::from(source);
+
+			let sum = a + (b * mul_digit) + carry;
+			*target = sum as u64;
+			carry = sum >> 64;
 		}
+
+		// Propagate carry
+		let mut i = min_len;
+		while carry != 0 {
+			if i < self_vec.len() {
+				let sum = u128::from(self_vec[i]) + carry;
+				self_vec[i] = sum as u64;
+				carry = sum >> 64;
+				i += 1;
+			} else {
+				self_vec.push(carry as u64);
+				break;
+			}
+		}
+	}
+
+	/// computes `self += other << (64 * shift)`
+	#[allow(clippy::cast_possible_truncation)]
+	fn add_assign_shifted(&mut self, other: &Self, shift: usize) {
+		self.make_large();
+		let self_vec = match self {
+			Large(v) => v,
+			Small(_) => unreachable!(),
+		};
+
+		let other_slice = match other {
+			Large(v) => v.as_slice(),
+			Small(v) => std::slice::from_ref(v),
+		};
+
+		let min_len = shift + other_slice.len();
+		if self_vec.len() < min_len {
+			self_vec.resize(min_len, 0);
+		}
+
+		let mut carry: u128 = 0;
+		let target_slice = &mut self_vec[shift..];
+
+		for (target, &source) in target_slice.iter_mut().zip(other_slice) {
+			let a = u128::from(*target);
+			let b = u128::from(source);
+
+			let sum = a + b + carry;
+			*target = sum as u64;
+			carry = sum >> 64;
+		}
+
+		let mut i = min_len;
+		while carry != 0 {
+			if i < self_vec.len() {
+				let sum = u128::from(self_vec[i]) + carry;
+				self_vec[i] = sum as u64;
+				carry = sum >> 64;
+				i += 1;
+			} else {
+				self_vec.push(carry as u64);
+				break;
+			}
+		}
+	}
+
+	#[allow(clippy::cast_possible_truncation)]
+	fn mul_internal_slice(a: &[u64], b: &[u64]) -> Self {
+		if a.is_empty() || b.is_empty() {
+			return Self::from(0);
+		}
+		// Pre-allocate result
+		let mut res = vec![0; a.len() + b.len()];
+
+		for (i, &digit) in b.iter().enumerate() {
+			if digit == 0 {
+				continue;
+			}
+			let mut carry: u128 = 0;
+			let mul_digit = u128::from(digit);
+
+			// Optimized inner loop on slices
+			let target = &mut res[i..];
+			for (dst, &src) in target.iter_mut().zip(a) {
+				let sum = u128::from(*dst) + (u128::from(src) * mul_digit) + carry;
+				*dst = sum as u64;
+				carry = sum >> 64;
+			}
+
+			// Propagate carry
+			let mut k = i + a.len();
+			while carry > 0 {
+				let sum = u128::from(res[k]) + carry;
+				res[k] = sum as u64;
+				carry = sum >> 64;
+				k += 1;
+			}
+		}
+
+		let mut big = Large(res);
+		big.reduce();
+		big
+	}
+
+	#[allow(clippy::cast_possible_truncation)]
+	// Helper to add two slices and return BigUint
+	fn add_slices(a: &[u64], b: &[u64]) -> Self {
+		let max_len = max(a.len(), b.len());
+		let mut res = vec![0; max_len];
+		let mut carry: u128 = 0;
+
+		for i in 0..max_len {
+			let va = if i < a.len() { u128::from(a[i]) } else { 0 };
+			let vb = if i < b.len() { u128::from(b[i]) } else { 0 };
+			let sum = va + vb + carry;
+			res[i] = sum as u64;
+			carry = sum >> 64;
+		}
+		if carry > 0 {
+			res.push(carry as u64);
+		}
+		Large(res)
 	}
 
 	// Note: 0! = 1, 1! = 1
@@ -444,16 +865,6 @@ impl BigUint {
 		Ok(b)
 	}
 
-	pub(crate) fn mul<I: Interrupt>(mut self, other: &Self, int: &I) -> FResult<Self> {
-		if let (Small(a), Small(b)) = (&self, &other)
-			&& let Some(res) = a.checked_mul(*b)
-		{
-			return Ok(Self::from(res));
-		}
-		self.mul_internal(other, int)?;
-		Ok(self)
-	}
-
 	fn rem<I: Interrupt>(&self, other: &Self, int: &I) -> FResult<Self> {
 		Ok(self.divmod(other, int)?.1)
 	}
@@ -471,6 +882,7 @@ impl BigUint {
 		self
 	}
 
+	#[allow(clippy::cast_possible_truncation)]
 	pub(crate) fn sub(self, other: &Self) -> Self {
 		if let (Small(a), Small(b)) = (&self, &other) {
 			return Self::from(a - b);
@@ -483,28 +895,152 @@ impl BigUint {
 		if other.is_zero() {
 			return self;
 		}
-		let mut carry = 0; // 0 or 1
+
 		let mut res = match self {
 			Large(x) => x,
 			Small(v) => vec![v],
 		};
-		if res.len() < other.value_len() {
-			res.resize(other.value_len(), 0);
-		}
-		for (i, a) in res.iter_mut().enumerate() {
-			let b = other.get(i);
-			if !(b == u64::MAX && carry == 1) && *a >= b + carry {
-				*a = *a - b - carry;
-				carry = 0;
-			} else {
-				let next_digit =
-					u128::from(*a) + ((1_u128) << 64) - u128::from(b) - u128::from(carry);
-				*a = truncate(next_digit);
-				carry = 1;
-			}
+
+		let other_slice = match other {
+			Large(v) => v.as_slice(),
+			Small(v) => std::slice::from_ref(v),
+		};
+
+		let mut borrow: u128 = 0;
+		let mut i = 0;
+
+		let len = std::cmp::min(res.len(), other_slice.len());
+
+		while i < len {
+			let a = res[i];
+			let b = other_slice[i];
+
+			let sub_val = u128::from(b) + borrow;
+			let (val, new_borrow) = u128::from(a).overflowing_sub(sub_val);
+
+			res[i] = val as u64;
+			borrow = u128::from(new_borrow);
+			i += 1;
 		}
-		assert_eq!(carry, 0);
-		Large(res)
+
+		while borrow > 0 && i < res.len() {
+			let a = res[i];
+			let (val, new_borrow) = a.overflowing_sub(borrow as u64);
+			res[i] = val;
+			borrow = u128::from(new_borrow);
+			i += 1;
+		}
+
+		let mut result = Large(res);
+		result.reduce();
+		result
+	}
+
+	#[allow(clippy::cast_possible_truncation)]
+	fn sub_assign(&mut self, other: &Self) {
+		let other_slice = match other {
+			Large(v) => v.as_slice(),
+			Small(s) => std::slice::from_ref(s),
+		};
+
+		if other_slice.is_empty() {
+			return;
+		}
+
+		self.make_large();
+		let self_vec = match self {
+			Large(v) => v,
+			Small(_) => unreachable!(),
+		};
+
+		let mut borrow: u128 = 0;
+		let len = std::cmp::min(self_vec.len(), other_slice.len());
+
+		for i in 0..len {
+			let val = u128::from(self_vec[i]);
+			let sub = u128::from(other_slice[i]) + borrow;
+			let (res, b) = val.overflowing_sub(sub);
+			self_vec[i] = res as u64;
+			borrow = u128::from(b);
+		}
+
+		let mut i = len;
+		while borrow > 0 && i < self_vec.len() {
+			let val = u128::from(self_vec[i]);
+			let (res, b) = val.overflowing_sub(borrow);
+			self_vec[i] = res as u64;
+			borrow = u128::from(b);
+			i += 1;
+		}
+	}
+
+	pub(crate) fn mul<I: Interrupt>(self, other: &Self, int: &I) -> FResult<Self> {
+		if self.is_zero() || other.is_zero() {
+			return Ok(Self::from(0));
+		}
+
+		let a_slice = match &self {
+			Large(v) => v.as_slice(),
+			Small(s) => std::slice::from_ref(s),
+		};
+		let b_slice = match other {
+			Large(v) => v.as_slice(),
+			Small(s) => std::slice::from_ref(s),
+		};
+
+		// 64 limbs (4096 bits) is the optimal cut-off.
+		if a_slice.len() >= 64 && b_slice.len() >= 64 {
+			return Self::mul_karatsuba_slice(a_slice, b_slice, int);
+		}
+
+		Ok(Self::mul_internal_slice(a_slice, b_slice))
+	}
+
+	fn mul_karatsuba_slice<I: Interrupt>(a: &[u64], b: &[u64], int: &I) -> FResult<Self> {
+		test_int(int)?;
+
+		// Base case: 64 limbs
+		if a.len() < 64 || b.len() < 64 {
+			return Ok(Self::mul_internal_slice(a, b));
+		}
+
+		let m = max(a.len(), b.len()) / 2;
+
+		let a_len = a.len();
+		let a_low = if m < a_len { &a[..m] } else { a };
+		let a_high = if m < a_len { &a[m..] } else { &[] };
+
+		let b_len = b.len();
+		let b_low = if m < b_len { &b[..m] } else { b };
+		let b_high = if m < b_len { &b[m..] } else { &[] };
+
+		let z0 = Self::mul_karatsuba_slice(a_low, b_low, int)?;
+		let z2 = Self::mul_karatsuba_slice(a_high, b_high, int)?;
+
+		let a_sum = Self::add_slices(a_low, a_high);
+		let b_sum = Self::add_slices(b_low, b_high);
+
+		let a_sum_slice = match &a_sum {
+			Large(v) => v.as_slice(),
+			Small(s) => std::slice::from_ref(s),
+		};
+		let b_sum_slice = match &b_sum {
+			Large(v) => v.as_slice(),
+			Small(s) => std::slice::from_ref(s),
+		};
+
+		let z1_raw = Self::mul_karatsuba_slice(a_sum_slice, b_sum_slice, int)?;
+
+		// Use in-place subtraction
+		let mut z1 = z1_raw;
+		z1.sub_assign(&z2);
+		z1.sub_assign(&z0);
+
+		let mut res = z0;
+		res.add_assign_shifted(&z1, m);
+		res.add_assign_shifted(&z2, 2 * m);
+
+		Ok(res)
 	}
 
 	pub(crate) const fn is_definitely_zero(&self) -> bool {

From 9f2592594fb6bed74f6baf47ad8344d652402953 Mon Sep 17 00:00:00 2001
From: printfn <printfn@users.noreply.github.com>
Date: Thu, 1 Jan 2026 03:56:24 +0000
Subject: [PATCH 2/3] Remove unused code

---
 core/src/num/biguint.rs | 50 +----------------------------------------
 1 file changed, 1 insertion(+), 49 deletions(-)

diff --git a/core/src/num/biguint.rs b/core/src/num/biguint.rs
index c0487ed3..f64cbb28 100644
--- a/core/src/num/biguint.rs
+++ b/core/src/num/biguint.rs
@@ -178,27 +178,6 @@ impl BigUint {
 		}
 	}
 
-	fn set(&mut self, idx: usize, new_value: u64) {
-		match self {
-			Small(n) => {
-				if idx == 0 {
-					*n = new_value;
-				} else if new_value == 0 {
-					// no need to do anything
-				} else {
-					self.make_large();
-					self.set(idx, new_value);
-				}
-			}
-			Large(value) => {
-				while idx >= value.len() {
-					value.push(0);
-				}
-				value[idx] = new_value;
-			}
-		}
-	}
-
 	fn value_len(&self) -> usize {
 		match self {
 			Small(_) => 1,
@@ -206,17 +185,6 @@ impl BigUint {
 		}
 	}
 
-	fn value_push(&mut self, new: u64) {
-		if new == 0 {
-			return;
-		}
-		self.make_large();
-		match self {
-			Small(_) => unreachable!(),
-			Large(v) => v.push(new),
-		}
-	}
-
 	pub(crate) fn pow<I: Interrupt>(a: &Self, b: &Self, int: &I) -> FResult<Self> {
 		if a.is_zero() && b.is_zero() {
 			return Err(FendError::ZeroToThePowerOfZero);
@@ -535,7 +503,7 @@ impl BigUint {
 	}
 
 	// Knuth's Algorithm D (The Art of Computer Programming, Vol 2, 4.3.1)
-	#[allow(clippy::cast_possible_truncation)]
+	#[allow(clippy::cast_possible_truncation, clippy::too_many_lines)]
 	fn div_rem_knuth<I: Interrupt>(&self, other: &Self, int: &I) -> FResult<(Self, Self)> {
 		let mut dividend_digits = match self {
 			Small(n) => vec![*n],
@@ -865,10 +833,6 @@ impl BigUint {
 		Ok(b)
 	}
 
-	fn rem<I: Interrupt>(&self, other: &Self, int: &I) -> FResult<Self> {
-		Ok(self.divmod(other, int)?.1)
-	}
-
 	pub(crate) fn is_even<I: Interrupt>(&self, int: &I) -> FResult<bool> {
 		Ok(self.divmod(&Self::from(2), int)?.1 == 0.into())
 	}
@@ -1658,18 +1622,6 @@ mod tests {
 		Ok(())
 	}
 
-	#[test]
-	fn test_rem() -> Res {
-		let int = &crate::interrupt::Never;
-		let three = BigUint::from(3);
-		assert_eq!(BigUint::from(20).rem(&three, int)?, BigUint::from(2));
-		assert_eq!(BigUint::from(21).rem(&three, int)?, BigUint::from(0));
-		assert_eq!(BigUint::from(22).rem(&three, int)?, BigUint::from(1));
-		assert_eq!(BigUint::from(23).rem(&three, int)?, BigUint::from(2));
-		assert_eq!(BigUint::from(24).rem(&three, int)?, BigUint::from(0));
-		Ok(())
-	}
-
 	#[test]
 	fn test_lshift() -> Res {
 		let int = &crate::interrupt::Never;

From 6a52ead2eae2e1ffc4a13584d114653680493ec1 Mon Sep 17 00:00:00 2001
From: printfn <printfn@users.noreply.github.com>
Date: Thu, 1 Jan 2026 03:59:46 +0000
Subject: [PATCH 3/3] Fix clippy warnings

---
 core/src/ast.rs       | 30 +++++++++++++++---------------
 core/src/scope.rs     |  2 +-
 core/src/serialize.rs |  6 +++---
 core/src/value.rs     |  2 +-
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/core/src/ast.rs b/core/src/ast.rs
index 2ebc2e3d..091ae145 100644
--- a/core/src/ast.rs
+++ b/core/src/ast.rs
@@ -108,27 +108,27 @@ impl fmt::Display for Bop {
 pub(crate) enum Expr {
 	Literal(Value),
 	Ident(Ident),
-	Parens(Box<Expr>),
-	UnaryMinus(Box<Expr>),
-	UnaryPlus(Box<Expr>),
-	UnaryDiv(Box<Expr>),
-	Factorial(Box<Expr>),
-	Bop(Bop, Box<Expr>, Box<Expr>),
+	Parens(Box<Self>),
+	UnaryMinus(Box<Self>),
+	UnaryPlus(Box<Self>),
+	UnaryDiv(Box<Self>),
+	Factorial(Box<Self>),
+	Bop(Bop, Box<Self>, Box<Self>),
 	// Call a function or multiply the expressions
-	Apply(Box<Expr>, Box<Expr>),
+	Apply(Box<Self>, Box<Self>),
 	// Call a function, or throw an error if lhs is not a function
-	ApplyFunctionCall(Box<Expr>, Box<Expr>),
+	ApplyFunctionCall(Box<Self>, Box<Self>),
 	// Multiply the expressions
-	ApplyMul(Box<Expr>, Box<Expr>),
+	ApplyMul(Box<Self>, Box<Self>),
 
-	As(Box<Expr>, Box<Expr>),
-	Fn(Ident, Box<Expr>),
+	As(Box<Self>, Box<Self>),
+	Fn(Ident, Box<Self>),
 
-	Of(Ident, Box<Expr>),
+	Of(Ident, Box<Self>),
 
-	Assign(Ident, Box<Expr>),
-	Equality(bool, Box<Expr>, Box<Expr>),
-	Statements(Box<Expr>, Box<Expr>),
+	Assign(Ident, Box<Self>),
+	Equality(bool, Box<Self>, Box<Self>),
+	Statements(Box<Self>, Box<Self>),
 }
 
 impl Expr {
diff --git a/core/src/scope.rs b/core/src/scope.rs
index 7653376b..2d9cff0d 100644
--- a/core/src/scope.rs
+++ b/core/src/scope.rs
@@ -72,7 +72,7 @@ impl ScopeValue {
 pub(crate) struct Scope {
 	ident: Ident,
 	value: ScopeValue,
-	inner: Option<Arc<Scope>>,
+	inner: Option<Arc<Self>>,
 }
 
 pub(crate) fn compare_option_arc_scope<I: Interrupt>(
diff --git a/core/src/serialize.rs b/core/src/serialize.rs
index 98e1b21f..d3cb7a16 100644
--- a/core/src/serialize.rs
+++ b/core/src/serialize.rs
@@ -52,9 +52,9 @@ pub(crate) enum CborValue {
 	Negative(u64),
 	Bytes(Vec<u8>),
 	String(String),
-	Array(Vec<CborValue>),
-	Map(Vec<(CborValue, CborValue)>),
-	Tag(u64, Box<CborValue>),
+	Array(Vec<Self>),
+	Map(Vec<(Self, Self)>),
+	Tag(u64, Box<Self>),
 	Boolean(bool),
 	Null,
 	Undefined,
diff --git a/core/src/value.rs b/core/src/value.rs
index b7152d88..34ddae72 100644
--- a/core/src/value.rs
+++ b/core/src/value.rs
@@ -28,7 +28,7 @@ pub(crate) enum Value {
 	Base(Base),
 	// user-defined function with a named parameter
 	Fn(Ident, Box<Expr>, Option<Arc<Scope>>),
-	Object(Vec<(Cow<'static, str>, Box<Value>)>),
+	Object(Vec<(Cow<'static, str>, Box<Self>)>),
 	String(Cow<'static, str>),
 	Bool(bool),
 	Unit, // unit value `()`