From f290c6806e7e1b22555b99a39628643447096285 Mon Sep 17 00:00:00 2001 From: Mahima Singh <105724608+smahima27@users.noreply.github.com> Date: Thu, 4 Dec 2025 16:05:07 +0530 Subject: [PATCH 1/3] Implement request cancellation handling to prevent unnecessary VM spin-up --- Gemfile.lock | 1 + lib/vmpooler/pool_manager.rb | 66 +++++++++++++++++-- spec/unit/pool_manager_spec.rb | 117 +++++++++++++++++++++++++++++++++ vmpooler.yaml.example | 7 ++ 4 files changed, 187 insertions(+), 4 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index cfb545a2..418f24d3 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -196,6 +196,7 @@ GEM PLATFORMS arm64-darwin-22 + arm64-darwin-23 universal-java-11 universal-java-17 x86_64-darwin-22 diff --git a/lib/vmpooler/pool_manager.rb b/lib/vmpooler/pool_manager.rb index ce3028b0..d8aea0d3 100644 --- a/lib/vmpooler/pool_manager.rb +++ b/lib/vmpooler/pool_manager.rb @@ -161,16 +161,70 @@ def handle_timed_out_vm(vm, pool, redis) request_id = redis.hget("vmpooler__vm__#{vm}", 'request_id') pool_alias = redis.hget("vmpooler__vm__#{vm}", 'pool_alias') if request_id open_socket_error = redis.hget("vmpooler__vm__#{vm}", 'open_socket_error') + clone_error = redis.hget("vmpooler__vm__#{vm}", 'clone_error') + clone_error_class = redis.hget("vmpooler__vm__#{vm}", 'clone_error_class') redis.smove("vmpooler__pending__#{pool}", "vmpooler__completed__#{pool}", vm) + if request_id ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}") if ondemandrequest_hash && ondemandrequest_hash['status'] != 'failed' && ondemandrequest_hash['status'] != 'deleted' - # will retry a VM that did not come up as vm_ready? only if it has not been market failed or deleted - redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool}:1:#{request_id}") + # Check retry count and max retry limit before retrying + retry_count = (redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count') || '0').to_i + max_retries = $config[:config]['max_vm_retries'] || 3 + + # Determine if error is likely permanent (configuration issues) + permanent_error = is_permanent_error?(clone_error, clone_error_class) + + if retry_count < max_retries && !permanent_error + # Increment retry count and retry VM creation + redis.hset("vmpooler__odrequest__#{request_id}", 'retry_count', retry_count + 1) + redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool}:1:#{request_id}") + $logger.log('s', "[!] [#{pool}] '#{vm}' failed, retrying (attempt #{retry_count + 1}/#{max_retries})") + else + # Max retries exceeded or permanent error, mark request as permanently failed + failure_reason = if permanent_error + "Configuration error: #{clone_error}" + else + 'Max retry attempts exceeded' + end + redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'failed') + redis.hset("vmpooler__odrequest__#{request_id}", 'failure_reason', failure_reason) + $logger.log('s', "[!] [#{pool}] '#{vm}' permanently failed: #{failure_reason}") + $metrics.increment("errors.permanently_failed.#{pool}") + end end end $metrics.increment("errors.markedasfailed.#{pool}") - open_socket_error + open_socket_error || clone_error + end + + # Determine if an error is likely permanent (configuration issue) vs transient + def is_permanent_error?(error_message, error_class) + return false if error_message.nil? || error_class.nil? 
+ + permanent_error_patterns = [ + /template.*not found/i, + /template.*does not exist/i, + /invalid.*path/i, + /folder.*not found/i, + /datastore.*not found/i, + /resource pool.*not found/i, + /permission.*denied/i, + /authentication.*failed/i, + /invalid.*credentials/i, + /configuration.*error/i + ] + + permanent_error_classes = [ + 'ArgumentError', + 'NoMethodError', + 'NameError' + ] + + # Check error message patterns + permanent_error_patterns.any? { |pattern| error_message.match?(pattern) } || + # Check error class types + permanent_error_classes.include?(error_class) end def move_pending_vm_to_ready(vm, pool, redis, request_id = nil) @@ -489,14 +543,18 @@ def _clone_vm(pool_name, provider, dns_plugin, request_id = nil, pool_alias = ni dns_plugin_class_name = get_dns_plugin_class_name_for_pool(pool_name) dns_plugin.create_or_replace_record(new_vmname) unless dns_plugin_class_name == 'dynamic-dns' - rescue StandardError + rescue StandardError => e + # Store error details for retry decision making @redis.with_metrics do |redis| redis.pipelined do |pipeline| pipeline.srem("vmpooler__pending__#{pool_name}", new_vmname) + pipeline.hset("vmpooler__vm__#{new_vmname}", 'clone_error', e.message) + pipeline.hset("vmpooler__vm__#{new_vmname}", 'clone_error_class', e.class.name) expiration_ttl = $config[:redis]['data_ttl'].to_i * 60 * 60 pipeline.expire("vmpooler__vm__#{new_vmname}", expiration_ttl) end end + $logger.log('s', "[!] [#{pool_name}] '#{new_vmname}' clone failed: #{e.class}: #{e.message}") raise ensure @redis.with_metrics do |redis| diff --git a/spec/unit/pool_manager_spec.rb b/spec/unit/pool_manager_spec.rb index 3ca075e2..c7b44c07 100644 --- a/spec/unit/pool_manager_spec.rb +++ b/spec/unit/pool_manager_spec.rb @@ -345,6 +345,123 @@ end end + describe '#handle_timed_out_vm' do + before do + expect(subject).not_to be_nil + end + + before(:each) do + redis_connection_pool.with do |redis| + create_pending_vm(pool, vm, redis) + config[:config]['max_vm_retries'] = 3 + end + end + + context 'without request_id' do + it 'moves VM to completed queue and returns error' do + redis_connection_pool.with do |redis| + redis.hset("vmpooler__vm__#{vm}", 'open_socket_error', 'connection failed') + result = subject.handle_timed_out_vm(vm, pool, redis) + + expect(redis.sismember("vmpooler__pending__#{pool}", vm)).to be(false) + expect(redis.sismember("vmpooler__completed__#{pool}", vm)).to be(true) + expect(result).to eq('connection failed') + end + end + end + + context 'with request_id and transient error' do + before(:each) do + redis_connection_pool.with do |redis| + redis.hset("vmpooler__vm__#{vm}", 'request_id', request_id) + redis.hset("vmpooler__vm__#{vm}", 'pool_alias', pool) + redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'pending') + redis.hset("vmpooler__vm__#{vm}", 'clone_error', 'network timeout') + redis.hset("vmpooler__vm__#{vm}", 'clone_error_class', 'Timeout::Error') + end + end + + it 'retries on first failure' do + redis_connection_pool.with do |redis| + subject.handle_timed_out_vm(vm, pool, redis) + + expect(redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count')).to eq('1') + expect(redis.zrange('vmpooler__odcreate__task', 0, -1)).to include("#{pool}:#{pool}:1:#{request_id}") + end + end + + it 'marks as failed after max retries' do + redis_connection_pool.with do |redis| + redis.hset("vmpooler__odrequest__#{request_id}", 'retry_count', '3') + + subject.handle_timed_out_vm(vm, pool, redis) + + expect(redis.hget("vmpooler__odrequest__#{request_id}", 
'status')).to eq('failed') + expect(redis.hget("vmpooler__odrequest__#{request_id}", 'failure_reason')).to eq('Max retry attempts exceeded') + expect(redis.zrange('vmpooler__odcreate__task', 0, -1)).not_to include("#{pool}:#{pool}:1:#{request_id}") + end + end + end + + context 'with request_id and permanent error' do + before(:each) do + redis_connection_pool.with do |redis| + redis.hset("vmpooler__vm__#{vm}", 'request_id', request_id) + redis.hset("vmpooler__vm__#{vm}", 'pool_alias', pool) + redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'pending') + redis.hset("vmpooler__vm__#{vm}", 'clone_error', 'template not found') + redis.hset("vmpooler__vm__#{vm}", 'clone_error_class', 'RuntimeError') + end + end + + it 'immediately marks as failed without retrying' do + redis_connection_pool.with do |redis| + subject.handle_timed_out_vm(vm, pool, redis) + + expect(redis.hget("vmpooler__odrequest__#{request_id}", 'status')).to eq('failed') + expect(redis.hget("vmpooler__odrequest__#{request_id}", 'failure_reason')).to include('Configuration error') + expect(redis.zrange('vmpooler__odcreate__task', 0, -1)).not_to include("#{pool}:#{pool}:1:#{request_id}") + end + end + end + end + + describe '#is_permanent_error?' do + before do + expect(subject).not_to be_nil + end + + it 'identifies template not found errors as permanent' do + expect(subject.is_permanent_error?('template not found', 'RuntimeError')).to be(true) + end + + it 'identifies invalid path errors as permanent' do + expect(subject.is_permanent_error?('invalid path specified', 'ArgumentError')).to be(true) + end + + it 'identifies permission denied errors as permanent' do + expect(subject.is_permanent_error?('permission denied', 'SecurityError')).to be(true) + end + + it 'identifies ArgumentError class as permanent' do + expect(subject.is_permanent_error?('some argument error', 'ArgumentError')).to be(true) + end + + it 'identifies network errors as transient' do + expect(subject.is_permanent_error?('connection timeout', 'Timeout::Error')).to be(false) + end + + it 'identifies socket errors as transient' do + expect(subject.is_permanent_error?('connection refused', 'Errno::ECONNREFUSED')).to be(false) + end + + it 'returns false for nil inputs' do + expect(subject.is_permanent_error?(nil, nil)).to be(false) + expect(subject.is_permanent_error?('error', nil)).to be(false) + expect(subject.is_permanent_error?(nil, 'Error')).to be(false) + end + end + describe '#move_pending_vm_to_ready' do let(:host) { { 'hostname' => vm }} diff --git a/vmpooler.yaml.example b/vmpooler.yaml.example index 818183e1..f05ded22 100644 --- a/vmpooler.yaml.example +++ b/vmpooler.yaml.example @@ -456,6 +456,12 @@ # How long (in minutes) before marking a clone in 'pending' queues as 'failed' and retrying. # (default: 15) # +# - max_vm_retries +# Maximum number of times to retry VM creation for a failed request before marking it as permanently failed. +# This helps prevent infinite retry loops when there are configuration issues like invalid template paths. +# Permanent errors (like invalid template paths) are detected and will not be retried. +# (default: 3) +# # - vm_checktime # How often (in minutes) to check the sanity of VMs in 'ready' queues. 
# (default: 1) @@ -619,6 +625,7 @@ vm_checktime: 1 vm_lifetime: 12 vm_lifetime_auth: 24 + max_vm_retries: 3 allowed_tags: - 'created_by' - 'project' From 9e75854ec442683919488c8c426c0ef9f03c1230 Mon Sep 17 00:00:00 2001 From: Mahima Singh <105724608+smahima27@users.noreply.github.com> Date: Thu, 4 Dec 2025 16:12:23 +0530 Subject: [PATCH 2/3] Fixed robo issues --- lib/vmpooler/pool_manager.rb | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/vmpooler/pool_manager.rb b/lib/vmpooler/pool_manager.rb index d8aea0d3..b9bae34f 100644 --- a/lib/vmpooler/pool_manager.rb +++ b/lib/vmpooler/pool_manager.rb @@ -164,17 +164,17 @@ def handle_timed_out_vm(vm, pool, redis) clone_error = redis.hget("vmpooler__vm__#{vm}", 'clone_error') clone_error_class = redis.hget("vmpooler__vm__#{vm}", 'clone_error_class') redis.smove("vmpooler__pending__#{pool}", "vmpooler__completed__#{pool}", vm) - + if request_id ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}") if ondemandrequest_hash && ondemandrequest_hash['status'] != 'failed' && ondemandrequest_hash['status'] != 'deleted' # Check retry count and max retry limit before retrying retry_count = (redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count') || '0').to_i max_retries = $config[:config]['max_vm_retries'] || 3 - + # Determine if error is likely permanent (configuration issues) - permanent_error = is_permanent_error?(clone_error, clone_error_class) - + permanent_error = permanent_error?(clone_error, clone_error_class) + if retry_count < max_retries && !permanent_error # Increment retry count and retry VM creation redis.hset("vmpooler__odrequest__#{request_id}", 'retry_count', retry_count + 1) @@ -199,9 +199,9 @@ def handle_timed_out_vm(vm, pool, redis) end # Determine if an error is likely permanent (configuration issue) vs transient - def is_permanent_error?(error_message, error_class) + def permanent_error?(error_message, error_class) return false if error_message.nil? || error_class.nil? - + permanent_error_patterns = [ /template.*not found/i, /template.*does not exist/i, @@ -214,17 +214,17 @@ def is_permanent_error?(error_message, error_class) /invalid.*credentials/i, /configuration.*error/i ] - + permanent_error_classes = [ 'ArgumentError', 'NoMethodError', 'NameError' ] - + # Check error message patterns permanent_error_patterns.any? { |pattern| error_message.match?(pattern) } || - # Check error class types - permanent_error_classes.include?(error_class) + # Check error class types + permanent_error_classes.include?(error_class) end def move_pending_vm_to_ready(vm, pool, redis, request_id = nil) From 8372ea824f501fdbbcc4d04ad151e2831d447540 Mon Sep 17 00:00:00 2001 From: Mahima Singh <105724608+smahima27@users.noreply.github.com> Date: Thu, 4 Dec 2025 16:19:34 +0530 Subject: [PATCH 3/3] Fixed spec tests --- spec/unit/pool_manager_spec.rb | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/spec/unit/pool_manager_spec.rb b/spec/unit/pool_manager_spec.rb index c7b44c07..abe5555c 100644 --- a/spec/unit/pool_manager_spec.rb +++ b/spec/unit/pool_manager_spec.rb @@ -426,39 +426,39 @@ end end - describe '#is_permanent_error?' do + describe '#permanent_error?' 
do before do expect(subject).not_to be_nil end it 'identifies template not found errors as permanent' do - expect(subject.is_permanent_error?('template not found', 'RuntimeError')).to be(true) + expect(subject.permanent_error?('template not found', 'RuntimeError')).to be(true) end it 'identifies invalid path errors as permanent' do - expect(subject.is_permanent_error?('invalid path specified', 'ArgumentError')).to be(true) + expect(subject.permanent_error?('invalid path specified', 'ArgumentError')).to be(true) end it 'identifies permission denied errors as permanent' do - expect(subject.is_permanent_error?('permission denied', 'SecurityError')).to be(true) + expect(subject.permanent_error?('permission denied', 'SecurityError')).to be(true) end it 'identifies ArgumentError class as permanent' do - expect(subject.is_permanent_error?('some argument error', 'ArgumentError')).to be(true) + expect(subject.permanent_error?('some argument error', 'ArgumentError')).to be(true) end it 'identifies network errors as transient' do - expect(subject.is_permanent_error?('connection timeout', 'Timeout::Error')).to be(false) + expect(subject.permanent_error?('connection timeout', 'Timeout::Error')).to be(false) end it 'identifies socket errors as transient' do - expect(subject.is_permanent_error?('connection refused', 'Errno::ECONNREFUSED')).to be(false) + expect(subject.permanent_error?('connection refused', 'Errno::ECONNREFUSED')).to be(false) end it 'returns false for nil inputs' do - expect(subject.is_permanent_error?(nil, nil)).to be(false) - expect(subject.is_permanent_error?('error', nil)).to be(false) - expect(subject.is_permanent_error?(nil, 'Error')).to be(false) + expect(subject.permanent_error?(nil, nil)).to be(false) + expect(subject.permanent_error?('error', nil)).to be(false) + expect(subject.permanent_error?(nil, 'Error')).to be(false) end end
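
For reference, here is a minimal standalone sketch of the permanent-vs-transient classification that PATCH 1/3 adds to lib/vmpooler/pool_manager.rb (renamed to permanent_error? in PATCH 2/3). The patterns and class list mirror the patch; the sample errors fed to it below are purely hypothetical and only illustrate how the classifier behaves.

#!/usr/bin/env ruby
# Sketch of the classifier from the patch; patterns/classes copied from the diff,
# sample inputs are made up for illustration.

PERMANENT_ERROR_PATTERNS = [
  /template.*not found/i,
  /template.*does not exist/i,
  /invalid.*path/i,
  /folder.*not found/i,
  /datastore.*not found/i,
  /resource pool.*not found/i,
  /permission.*denied/i,
  /authentication.*failed/i,
  /invalid.*credentials/i,
  /configuration.*error/i
].freeze

PERMANENT_ERROR_CLASSES = %w[ArgumentError NoMethodError NameError].freeze

# Returns true when the stored clone error looks like a configuration problem
# that retrying will not fix; false (i.e. eligible for retry) otherwise.
def permanent_error?(error_message, error_class)
  return false if error_message.nil? || error_class.nil?

  PERMANENT_ERROR_PATTERNS.any? { |pattern| error_message.match?(pattern) } ||
    PERMANENT_ERROR_CLASSES.include?(error_class)
end

if __FILE__ == $PROGRAM_NAME
  samples = [
    ['template win-2019-x86_64 not found', 'RuntimeError'],   # permanent: message match
    ['invalid path to VM folder',          'ArgumentError'],  # permanent: message and class
    ['connection timed out',               'Timeout::Error'], # transient: retry
    [nil,                                  nil]               # unknown: treated as transient
  ]
  samples.each do |message, klass|
    puts format('%-40s %-16s permanent=%s', message.inspect, klass.inspect,
                permanent_error?(message, klass))
  end
end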
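
The retry bookkeeping in handle_timed_out_vm can be exercised in isolation as well. The sketch below assumes a plain Ruby Hash in place of the vmpooler__odrequest__<request_id> Redis hash and a MAX_VM_RETRIES constant in place of the max_vm_retries config key (default 3 in the patch); the helper name and return values are illustrative, not part of the patch.

# Sketch of the retry/fail decision from the patch, without Redis or $config.
MAX_VM_RETRIES = 3

# Mutates the request hash the way the patch updates Redis and returns
# :retry (re-queue a clone task) or :failed (request permanently failed).
def handle_timed_out_request(request, permanent_error:, clone_error: nil)
  retry_count = request.fetch('retry_count', 0).to_i

  if retry_count < MAX_VM_RETRIES && !permanent_error
    request['retry_count'] = retry_count + 1  # hset odrequest retry_count
    :retry                                    # zadd vmpooler__odcreate__task in the patch
  else
    request['status'] = 'failed'
    request['failure_reason'] =
      permanent_error ? "Configuration error: #{clone_error}" : 'Max retry attempts exceeded'
    :failed
  end
end

# Transient errors: retried up to the limit, then failed.
request = { 'status' => 'pending' }
3.times { handle_timed_out_request(request, permanent_error: false) }
p handle_timed_out_request(request, permanent_error: false)
# => :failed, failure_reason = "Max retry attempts exceeded"

# Permanent (configuration) errors: failed immediately, no retry.
bad = { 'status' => 'pending' }
p handle_timed_out_request(bad, permanent_error: true, clone_error: 'template not found')
# => :failed, failure_reason = "Configuration error: template not found"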