diff --git a/opencog/atoms/aten/TensorEquation.cc b/opencog/atoms/aten/TensorEquation.cc index 6222011..6125db5 100644 --- a/opencog/atoms/aten/TensorEquation.cc +++ b/opencog/atoms/aten/TensorEquation.cc @@ -605,6 +605,15 @@ ATenValuePtr TensorEquation::execute( { result = apply_nonlinearity(result, Nonlinearity::THRESHOLD); } + // For Hybrid mode: apply sigmoid to squash to [0,1] if no nonlinearity + // is set, then threshold to produce binary output. The straight-through + // estimator in backward() allows gradients to flow through the threshold. + else if (_mode == ReasoningMode::HYBRID) + { + if (_nonlinearity == Nonlinearity::NONE) + result = apply_nonlinearity(result, Nonlinearity::SIGMOID); + result = apply_nonlinearity(result, Nonlinearity::THRESHOLD); + } return result; } @@ -863,6 +872,7 @@ TensorProgram::TensorProgram(const std::string& name, ReasoningMode mode) _convergence_threshold(1e-6), _learning_rate(0.01), _track_gradients(false), + _grad_clip(0.0), _forward_count(0), _backward_count(0) { @@ -1181,6 +1191,21 @@ void TensorProgram::backward(const std::string& output_name, } } +// Clip gradient vector in-place so its L2-norm does not exceed max_norm. +// No-op when max_norm <= 0. 
+static void clip_gradient_vector(std::vector<double>& g_vec, double max_norm) +{ + if (max_norm <= 0.0) return; + double norm = 0.0; + for (double v : g_vec) norm += v * v; + norm = std::sqrt(norm); + if (norm > max_norm) + { + double scale = max_norm / norm; + for (double& v : g_vec) v *= scale; + } +} + void TensorProgram::update_parameters() { for (auto& eq : _equations) @@ -1194,6 +1219,8 @@ void TensorProgram::update_parameters() auto g_vec = eq->weight_grad()->to_vector(); auto w_shape = eq->weight()->shape(); + clip_gradient_vector(g_vec, _grad_clip); + for (size_t i = 0; i < w_vec.size() && i < g_vec.size(); i++) w_vec[i] -= _learning_rate * g_vec[i]; @@ -1207,6 +1234,8 @@ void TensorProgram::update_parameters() auto g_vec = eq->bias_grad()->to_vector(); auto b_shape = eq->bias()->shape(); + clip_gradient_vector(g_vec, _grad_clip); + for (size_t i = 0; i < b_vec.size() && i < g_vec.size(); i++) b_vec[i] -= _learning_rate * g_vec[i]; @@ -1227,6 +1256,7 @@ double TensorProgram::train( size_t epochs) { double final_loss = 0.0; + _loss_history.clear(); for (size_t epoch = 0; epoch < epochs; epoch++) { @@ -1237,13 +1267,18 @@ double TensorProgram::train( // Forward pass forward_to_fixpoint(); - // Compute loss + // Compute loss with current (pre-update) parameters and record it. + // Recording before update follows standard ML convention: the logged + // value is the loss the model had entering this epoch. 
final_loss = 0.0; for (const auto& [name, target] : targets) { final_loss += compute_loss(name, target); } + // Record loss for this epoch + _loss_history.push_back(final_loss); + // Backward pass (simplified) for (const auto& [name, target] : targets) { diff --git a/opencog/atoms/aten/TensorEquation.h b/opencog/atoms/aten/TensorEquation.h index 163b0bd..47c4cf4 100644 --- a/opencog/atoms/aten/TensorEquation.h +++ b/opencog/atoms/aten/TensorEquation.h @@ -302,10 +302,14 @@ class TensorProgram // For learning double _learning_rate; bool _track_gradients; + double _grad_clip; // Gradient clipping threshold (0 = disabled) // Accumulated tensor gradients (keyed by tensor name) std::map<std::string, ATenValuePtr> _tensor_grads; + // Training history + std::vector<double> _loss_history; + // Statistics size_t _forward_count; size_t _backward_count; @@ -325,6 +329,20 @@ class TensorProgram ReasoningMode mode() const { return _mode; } void set_mode(ReasoningMode m) { _mode = m; } + /** + * Set maximum number of iterations for forward_to_fixpoint(). + */ + void set_max_iterations(size_t n) { _max_iterations = n; } + size_t max_iterations() const { return _max_iterations; } + + /** + * Set convergence threshold for forward_to_fixpoint(). + * Iteration stops when the maximum absolute change in any + * derived tensor value falls below this threshold. + */ + void set_convergence_threshold(double t) { _convergence_threshold = t; } + double convergence_threshold() const { return _convergence_threshold; } + // ======================================== // Fact Management @@ -455,6 +473,19 @@ class TensorProgram void set_track_gradients(bool t) { _track_gradients = t; } bool track_gradients() const { return _track_gradients; } + /** + * Set gradient clipping threshold. + * Gradients with L2-norm exceeding this value are scaled down. + * Set to 0 (default) to disable clipping. 
+ */ + void set_grad_clip(double clip) { _grad_clip = clip; } + double grad_clip() const { return _grad_clip; } + + /** + * Return per-epoch loss values recorded by train(). + */ + const std::vector<double>& loss_history() const { return _loss_history; } + /** * Compute loss between derived and target tensors. */ diff --git a/tests/atoms/aten/TensorLogicUTest.cxxtest b/tests/atoms/aten/TensorLogicUTest.cxxtest index 4e00832..f790741 100644 --- a/tests/atoms/aten/TensorLogicUTest.cxxtest +++ b/tests/atoms/aten/TensorLogicUTest.cxxtest @@ -1872,4 +1872,230 @@ public: // Loss should have decreased TS_ASSERT(final_loss < loss_before); } + + // ======================================== + // TensorProgram configuration accessors + // ======================================== + + void test_tensor_program_max_iterations() + { + // Verify set_max_iterations / max_iterations round-trip + TensorProgram prog("iter_test"); + TS_ASSERT_EQUALS(prog.max_iterations(), 100); // default + + prog.set_max_iterations(42); + TS_ASSERT_EQUALS(prog.max_iterations(), 42); + + // A program that doesn't converge should stop at max_iterations + ATenValuePtr v = createATenFromVector({0.0, 1.0}, {2}); + prog.add_fact("in", v); + // Two equations that alternate: each overwrites the other's output + prog.add_equation("eq1", "out1", {"in"}, "i->i"); + prog.add_equation("eq2", "out2", {"out1"}, "i->i"); + + prog.forward_to_fixpoint(); + + // Should have stopped: forward_count <= max_iterations + 1 + TS_ASSERT(prog.forward_count() <= 43); + } + + void test_tensor_program_convergence_threshold() + { + // Verify set_convergence_threshold / convergence_threshold + TensorProgram prog("conv_test"); + TS_ASSERT_DELTA(prog.convergence_threshold(), 1e-6, 1e-15); + + prog.set_convergence_threshold(0.1); + TS_ASSERT_DELTA(prog.convergence_threshold(), 0.1, 1e-15); + + // A quickly-converging program should stop early + ATenValuePtr stable = createATenFromVector({1.0, 1.0}, {2}); + prog.add_fact("in", stable); + 
prog.add_equation("eq1", "out", {"in"}, "i->i"); + + prog.forward_to_fixpoint(); + + // With a loose threshold the fixpoint should be reached quickly + TS_ASSERT(prog.forward_count() <= 3); + } + + void test_tensor_program_grad_clip_accessors() + { + // Verify set_grad_clip / grad_clip round-trip + TensorProgram prog("clip_test"); + TS_ASSERT_DELTA(prog.grad_clip(), 0.0, 1e-15); // disabled by default + + prog.set_grad_clip(1.0); + TS_ASSERT_DELTA(prog.grad_clip(), 1.0, 1e-15); + + prog.set_grad_clip(0.0); + TS_ASSERT_DELTA(prog.grad_clip(), 0.0, 1e-15); + } + + void test_tensor_program_grad_clip_limits_update() + { + // Verify that gradient clipping prevents weight from diverging when + // gradients would otherwise be very large. + TensorProgram prog("clip_limits"); + prog.set_learning_rate(1.0); // large lr → large updates without clipping + prog.set_grad_clip(0.01); // restrict gradient norm to 0.01 + + // Large input: gradient will be large without clipping + ATenValuePtr in = createATenFromVector({100.0, 100.0, 100.0}, {3}); + ATenValuePtr target = createATenFromVector({0.0, 0.0, 0.0}, {3}); + prog.add_fact("in", in); + prog.add_equation("eq1", "out", {"in"}, "i->i"); + + TensorEquationPtr eq = prog.get_equation("eq1"); + eq->set_learnable(true); + ATenValuePtr init_w = createATenFromVector({1.0, 1.0, 1.0}, {3}); + eq->set_weight(init_w); + + prog.forward(); + ATenValuePtr grad_out = ATenValueCast(prog.get_derived("out")->sub(*target)); + prog.backward("out", grad_out); + + auto w_before = eq->weight()->to_vector(); + prog.update_parameters(); + auto w_after = eq->weight()->to_vector(); + + // Each weight change should be <= lr * clip = 1.0 * 0.01 = 0.01. + // A small margin of 1e-9 covers floating-point rounding in the + // L2-norm computation and the subsequent rescaling. 
+ const double max_expected_delta = 0.01 + 1e-9; + for (size_t i = 0; i < w_before.size(); i++) + { + double delta = std::abs(w_before[i] - w_after[i]); + TS_ASSERT(delta <= max_expected_delta); + } + } + + void test_tensor_program_loss_history() + { + // Verify that train() records per-epoch losses + TensorProgram prog("hist_test"); + prog.set_learning_rate(0.01); + + ATenValuePtr in = createATenFromVector({1.0, 2.0, 3.0}, {3}); + ATenValuePtr target = createATenFromVector({2.0, 4.0, 6.0}, {3}); + prog.add_fact("in", in); + prog.add_equation("eq1", "out", {"in"}, "i->i"); + + TensorEquationPtr eq = prog.get_equation("eq1"); + eq->set_learnable(true); + eq->set_weight(createATenFromVector({0.5, 0.5, 0.5}, {3})); + + TS_ASSERT_EQUALS(prog.loss_history().size(), 0); + + std::map<std::string, ATenValuePtr> targets_map = {{"out", target}}; + prog.train({}, targets_map, 10); + + // Should have exactly one loss entry per epoch + TS_ASSERT_EQUALS(prog.loss_history().size(), 10); + + // Loss entries should all be non-negative + for (double loss : prog.loss_history()) + TS_ASSERT(loss >= 0.0); + } + + void test_tensor_program_loss_history_decreasing() + { + // Loss should generally decrease over training epochs + TensorProgram prog("hist_decr"); + prog.set_learning_rate(0.01); + + ATenValuePtr in = createATenFromVector({1.0, 2.0, 3.0}, {3}); + ATenValuePtr target = createATenFromVector({2.0, 4.0, 6.0}, {3}); + prog.add_fact("in", in); + prog.add_equation("eq1", "out", {"in"}, "i->i"); + + TensorEquationPtr eq = prog.get_equation("eq1"); + eq->set_learnable(true); + eq->set_weight(createATenFromVector({0.5, 0.5, 0.5}, {3})); + + std::map<std::string, ATenValuePtr> targets_map = {{"out", target}}; + prog.train({}, targets_map, 30); + + const auto& hist = prog.loss_history(); + TS_ASSERT(!hist.empty()); + // Last loss should be less than the first + TS_ASSERT(hist.back() < hist.front()); + } + + // ======================================== + // HYBRID reasoning mode + // ======================================== + + void 
test_hybrid_mode_output_binary() + { + // HYBRID mode should produce binary {0,1} outputs like BOOLEAN mode + TensorEquation eq("hyb_eq", "out", {"in"}, + "i->i", Nonlinearity::NONE, ReasoningMode::HYBRID); + + // Inputs with values spread around 0 (before sigmoid) + ATenValuePtr in = createATenFromVector({2.0, -2.0, 0.5, -0.5}, {4}); + ATenValuePtr out = eq.execute({in}); + + auto data = out->to_vector(); + for (double v : data) + { + // Each value must be exactly 0 or 1 + TS_ASSERT(v == 0.0 || v == 1.0); + } + } + + void test_hybrid_mode_sigmoid_then_threshold() + { + // For HYBRID + NONE nonlinearity: sigmoid is applied before threshold. + // sigmoid(2.0) ≈ 0.88 → threshold → 1 + // sigmoid(-2.0) ≈ 0.12 → threshold → 0 + TensorEquation eq("hyb_sig", "out", {"in"}, + "i->i", Nonlinearity::NONE, ReasoningMode::HYBRID); + + ATenValuePtr in = createATenFromVector({2.0, -2.0}, {2}); + ATenValuePtr out = eq.execute({in}); + + auto data = out->to_vector(); + TS_ASSERT_EQUALS(data.size(), 2); + TS_ASSERT_DELTA(data[0], 1.0, 1e-10); // sigmoid(2) > 0.5 → 1 + TS_ASSERT_DELTA(data[1], 0.0, 1e-10); // sigmoid(-2) < 0.5 → 0 + } + + void test_hybrid_mode_explicit_nonlinearity() + { + // HYBRID with an explicit nonlinearity (e.g. RELU) skips the extra + // sigmoid and applies threshold directly to the RELU output. 
+ // relu(1.0)=1.0 → threshold → 1; relu(-1.0)=0.0 → threshold → 0 + TensorEquation eq("hyb_relu", "out", {"in"}, + "i->i", Nonlinearity::RELU, ReasoningMode::HYBRID); + + ATenValuePtr in = createATenFromVector({1.0, -1.0}, {2}); + ATenValuePtr out = eq.execute({in}); + + auto data = out->to_vector(); + TS_ASSERT_EQUALS(data.size(), 2); + TS_ASSERT_DELTA(data[0], 1.0, 1e-10); // relu(1) = 1 > 0.5 → 1 + TS_ASSERT_DELTA(data[1], 0.0, 1e-10); // relu(-1) = 0 ≤ 0.5 → 0 + } + + void test_hybrid_mode_vs_boolean() + { + // For inputs that produce the same post-sigmoid values as direct + // threshold in BOOLEAN mode, both modes should give the same result + // when the pre-activation is already in [0,1]. + ATenValuePtr in = createATenFromVector({0.9, 0.1}, {2}); + + TensorEquation bool_eq("bool_eq", "out", {"in"}, + "i->i", Nonlinearity::NONE, ReasoningMode::BOOLEAN); + TensorEquation hyb_eq("hyb_eq", "out", {"in"}, + "i->i", Nonlinearity::NONE, ReasoningMode::HYBRID); + + // Both should give binary output; the actual values may differ because + // HYBRID routes through sigmoid first. Just verify both are binary. + auto bool_out = bool_eq.execute({in})->to_vector(); + auto hyb_out = hyb_eq.execute({in})->to_vector(); + + for (double v : bool_out) TS_ASSERT(v == 0.0 || v == 1.0); + for (double v : hyb_out) TS_ASSERT(v == 0.0 || v == 1.0); + } };