Skip to content

Commit eb362af

Browse files
committed
feat: [#24] replace custom SSH cloud-init monitoring with Ansible-based approach
- Remove reboot and custom completion marker from cloud-init configuration
- Replace custom SSH monitoring with native Ansible cloud-init waiting
- Add new Ansible playbooks for cloud-init wait and VM restart orchestration
- Simplify Provision.pm by removing complex SSH monitoring logic
- Preserve SSH infrastructure classes for future use and testing
- Improve error handling and logging with structured Ansible output
- Enhance post-provision workflow with proper VM restart management

Breaking changes:
- Cloud-init no longer auto-reboots (Ansible handles restart)
- Removed custom completion marker approach
- Updated dependencies (no longer requires sshpass)

Benefits:
- Native Ansible integration for better maintainability
- Improved reboot handling with proper reconnection logic
- Unified toolchain for all post-infrastructure operations
- Better structured error reporting and status monitoring
1 parent 37cb207 commit eb362af

File tree

6 files changed

+191
-215
lines changed

6 files changed

+191
-215
lines changed

lib/TorrustDeploy/App/Command/Provision.pm

Lines changed: 21 additions & 194 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ use v5.38;
55
use TorrustDeploy::App -command;
66
use TorrustDeploy::Provision::OpenTofu;
77
use TorrustDeploy::Provision::Ansible;
8-
use TorrustDeploy::Infrastructure::SSH::Connection;
98
use Path::Tiny qw(path);
109
use File::Spec;
1110
use Time::HiRes qw(sleep);
@@ -21,10 +20,12 @@ This command will:
2120
1. Initialize OpenTofu if needed
2221
2. Copy configuration templates to the working directory
2322
3. Create a VM with hardcoded minimal configuration
24-
4. Wait for IP assignment and cloud-init completion
25-
5. Monitor cloud-init progress via SSH
23+
4. Wait for cloud-init completion using Ansible
24+
5. Run post-provision verification via Ansible
25+
6. Restart the VM after successful verification
2626
2727
The VM will be created locally using libvirt/KVM.
28+
Cloud-init handles initial setup, Ansible manages the orchestration.
2829
END_DESCRIPTION
2930
}
3031

@@ -56,21 +57,27 @@ sub execute {
5657
# Get VM IP address
5758
my $vm_ip = $tofu->get_vm_ip($tofu_dir);
5859
STDOUT->flush();
59-
60-
# Create SSH connection
61-
my $ssh_connection = TorrustDeploy::Infrastructure::SSH::Connection->new(host => $vm_ip);
62-
63-
# Wait for cloud-init completion
64-
$self->_wait_for_cloud_init($ssh_connection);
65-
60+
61+
# Set up Ansible working directory and copy templates
62+
my $ansible_dir = $work_dir->child('ansible');
63+
my $ansible = TorrustDeploy::Provision::Ansible->new();
64+
$ansible->copy_templates_and_generate_inventory($vm_ip, $ansible_dir);
65+
66+
# Wait for cloud-init completion using Ansible
67+
$ansible->wait_for_cloud_init($ansible_dir);
68+
6669
# Run Ansible post-provision verification
67-
$self->_run_ansible_verification($vm_ip, $work_dir);
70+
$ansible->run_verification($ansible_dir);
71+
72+
# Restart VM after verification
73+
$ansible->restart_vm($ansible_dir);
6874

6975
# Final completion message
7076
say "";
7177
say "✅ Provisioning completed successfully!";
7278
say "VM is ready at IP: " . $vm_ip;
7379
say "You can connect using: ssh -i ~/.ssh/testing_rsa torrust@" . $vm_ip;
80+
say "VM has been restarted and is ready for production use!";
7481
STDOUT->flush();
7582
}
7683

@@ -109,185 +116,6 @@ sub _copy_templates {
109116
say "Templates copied successfully.";
110117
}
111118

112-
# Block until cloud-init has finished on the VM behind $ssh_connection.
#
# Works in two phases, each limited to $max_attempts polls spaced
# $poll_interval seconds apart:
#   1. wait until a password-based SSH login succeeds;
#   2. poll over SSH for the completion marker file.
#
# While polling for the marker, twelve consecutive SSH-level failures
# (exit code 255) are treated as a probable reboot: the sub pauses, then
# retries the password login before resuming the poll.
#
# Parameters:
#   $ssh_connection - object providing host(), test_password_connection()
#                     and execute_command(); execute_command() must return
#                     an object with exit_code(), success() and output().
#
# Dies with "SSH connection failed" when phase 1 times out, and with
# "Cloud-init timeout" when the marker never appears or the connection
# cannot be re-established. The cloud-init logs are printed before dying.
sub _wait_for_cloud_init {
    my ($self, $ssh_connection) = @_;

    say "Waiting for cloud-init to complete...";
    say "This may take several minutes while packages are installed and configured.";
    STDOUT->flush();

    my $completion_file = "/var/lib/cloud/torrust-setup-complete";
    my $poll_interval = 5;      # seconds between polls
    my $max_attempts = 360;     # 30 minutes with 5-second intervals
    my $timeout_minutes = $max_attempts * $poll_interval / 60;
    my $attempt = 0;
    my $ssh_connected = 0;
    my $cloud_init_success = 0;

    # Step 1: Wait until SSH connection is available (for password auth to check cloud-init)
    say "⏳ Waiting for SSH service to become available...";
    STDOUT->flush();

    while ($attempt < $max_attempts && !$ssh_connected) {
        $attempt++;

        if ($ssh_connection->test_password_connection()) {
            $ssh_connected = 1;
            say "✅ SSH password connection established to " . $ssh_connection->host;
            STDOUT->flush();
        } else {
            if ($attempt % 6 == 0) { # Every 30 seconds
                # Report real elapsed time: attempts times the poll interval.
                my $elapsed_seconds = $attempt * $poll_interval;
                say " [Waiting for SSH connection... ${elapsed_seconds}s elapsed]";
                STDOUT->flush();
            }
            sleep($poll_interval);
        }
    }

    if (!$ssh_connected) {
        say "❌ Failed to establish SSH connection to " . $ssh_connection->host . " after " . $timeout_minutes . " minutes";
        STDOUT->flush();
        $self->_print_cloud_init_logs($ssh_connection);
        die "SSH connection failed";
    }

    # Step 2: Wait until cloud-init completion marker is created
    say "⏳ Waiting for cloud-init to complete...";
    STDOUT->flush();

    $attempt = 0;
    my $consecutive_ssh_failures = 0;
    while ($attempt < $max_attempts) {
        $attempt++;

        my $result = $ssh_connection->execute_command("test -f $completion_file");

        # Debug: Always show result details when exit code is 0
        if ($result->exit_code == 0) {
            say " [DEBUG] File exists! Exit code: " . $result->exit_code .
                ", Success method: " . ($result->success ? 'true' : 'false') .
                ", Output: '" . ($result->output // 'EMPTY') . "'";
            STDOUT->flush();
        }

        if ($result->success) {
            say "✅ Cloud-init setup completed successfully!";
            STDOUT->flush();

            # Show completion message
            my $completion_result = $ssh_connection->execute_command("cat $completion_file");
            if ($completion_result->success && $completion_result->output) {
                chomp(my $output = $completion_result->output);
                say "📅 Completion marker: " . $output;
                STDOUT->flush();
            }
            $cloud_init_success = 1;
            last;
        } else {
            # Track consecutive SSH failures (exit code 255)
            if ($result->exit_code == 255) {
                $consecutive_ssh_failures++;
                # If we have too many consecutive SSH failures, try to re-establish password connection
                if ($consecutive_ssh_failures >= 12) { # 1 minute of consecutive failures
                    say "⚠️ SSH connection lost, attempting to re-establish (VM may be rebooting)...";
                    say " [Waiting 30s for VM to complete reboot...]";
                    STDOUT->flush();
                    sleep(30); # Give VM time to fully reboot

                    # Try to re-establish password connection (VM might have rebooted).
                    # The probe result is kept so a successful login is not
                    # immediately tested a second time.
                    my $reconnect_attempts = 0;
                    my $reconnected = $ssh_connection->test_password_connection();
                    while (!$reconnected && $reconnect_attempts < 12) {
                        $reconnect_attempts++;
                        say " [Reconnection attempt $reconnect_attempts/12...]";
                        STDOUT->flush();
                        sleep(15); # Wait longer between attempts
                        $reconnected = $ssh_connection->test_password_connection();
                    }

                    if ($reconnected) {
                        say "✅ SSH connection re-established!";
                        STDOUT->flush();
                        $consecutive_ssh_failures = 0; # Reset counter after successful reconnection
                    } else {
                        say "❌ Failed to re-establish SSH connection after VM reboot.";
                        say " [DEBUG] Last error: " . $result->output;
                        STDOUT->flush();
                        last;
                    }
                }
            } else {
                # Reset counter for non-SSH failures (normal file-not-found errors)
                $consecutive_ssh_failures = 0;
            }

            # Debug: Show why the command failed
            if ($attempt % 6 == 0) { # Every 30 seconds
                my $elapsed_seconds = $attempt * $poll_interval;
                say " [DEBUG ${elapsed_seconds}s] File check failed - Exit code: " . $result->exit_code .
                    " (this is normal until cloud-init completes)";
                if ($consecutive_ssh_failures > 0) {
                    say " [SSH failures: $consecutive_ssh_failures consecutive]";
                }
                STDOUT->flush();
            }
        }

        # Show progress indicator every 2 minutes
        if ($attempt % 24 == 0) {
            my $elapsed_minutes = int($attempt * $poll_interval / 60);
            say " [Cloud-init still running... ${elapsed_minutes} minutes elapsed]";
            STDOUT->flush();
        }

        sleep($poll_interval);
    }

    if (!$cloud_init_success) {
        say "❌ Timeout waiting for cloud-init to complete on " . $ssh_connection->host . " after " . $timeout_minutes . " minutes";
        STDOUT->flush();
        $self->_print_cloud_init_logs($ssh_connection);
        die "Cloud-init timeout";
    }
}
249-
250-
sub _print_cloud_init_logs {
251-
my ($self, $ssh_connection) = @_;
252-
253-
say "📄 Cloud-init logs (for debugging):";
254-
255-
# Print cloud-init-output.log
256-
say "=== /var/log/cloud-init-output.log ===";
257-
my $output_result = $ssh_connection->execute_command_with_sudo('cat /var/log/cloud-init-output.log');
258-
if ($output_result->success) {
259-
print $output_result->output;
260-
} else {
261-
say "Cloud-init output log not available";
262-
}
263-
264-
say "=== /var/log/cloud-init.log ===";
265-
my $main_result = $ssh_connection->execute_command_with_sudo('cat /var/log/cloud-init.log');
266-
if ($main_result->success) {
267-
print $main_result->output;
268-
} else {
269-
say "Cloud-init main log not available";
270-
}
271-
}
272-
273-
sub _run_ansible_verification {
274-
my ($self, $vm_ip, $work_dir) = @_;
275-
276-
say "";
277-
say "🎭 Starting Ansible post-provision verification...";
278-
STDOUT->flush();
279-
280-
# Set up Ansible working directory
281-
my $ansible_dir = $work_dir->child('ansible');
282-
283-
# Create Ansible instance and set up configuration
284-
my $ansible = TorrustDeploy::Provision::Ansible->new();
285-
$ansible->copy_templates_and_generate_inventory($vm_ip, $ansible_dir);
286-
287-
# Run verification playbook
288-
$ansible->run_verification($ansible_dir);
289-
}
290-
291119
1;
292120

293121
__END__
@@ -299,8 +127,8 @@ TorrustDeploy::App::Command::Provision - Provision Torrust Tracker VM
299127
=head1 DESCRIPTION
300128
301129
Provisions a Torrust Tracker virtual machine using OpenTofu with the libvirt provider.
302-
Creates a minimal Ubuntu 24.04 LTS VM, waits for IP assignment, and monitors cloud-init
303-
completion via SSH.
130+
Creates a minimal Ubuntu 24.04 LTS VM, waits for cloud-init completion using Ansible,
131+
runs post-provision verification, and performs a clean VM restart.
304132
305133
=head1 USAGE
306134
@@ -309,10 +137,9 @@ completion via SSH.
309137
=head1 REQUIREMENTS
310138
311139
- OpenTofu installed
312-
- Ansible installed
140+
- Ansible installed (with community.general collection for cloud_init module)
313141
- libvirt/KVM installed and running
314142
- qemu-system-x86_64
315-
- sshpass installed (for password authentication during cloud-init monitoring)
316143
- Testing SSH key pair (~/.ssh/testing_rsa)
317144
- Default libvirt storage pool configured
318145
- Template files in templates/ directory (main.tf, cloud-init.yml, ansible/)

lib/TorrustDeploy/Provision/Ansible.pm

Lines changed: 67 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -63,17 +63,25 @@ sub copy_templates_and_generate_inventory {
6363
$ansible_cfg_template->copy($ansible_cfg_dest);
6464
say "Copied: $ansible_cfg_template -> $ansible_cfg_dest";
6565

66-
# Copy playbooks
67-
my $verification_template = $templates_dir->child('post-provision-verification.yml');
68-
my $verification_dest = $ansible_dir->child('post-provision-verification.yml');
69-
70-
unless ($verification_template->exists) {
71-
die "Verification playbook template not found: $verification_template";
66+
# Copy all playbooks
67+
my @playbooks = (
68+
'wait-for-cloud-init.yml',
69+
'post-provision-verification.yml',
70+
'restart-vm.yml'
71+
);
72+
73+
for my $playbook (@playbooks) {
74+
my $template = $templates_dir->child($playbook);
75+
my $dest = $ansible_dir->child($playbook);
76+
77+
unless ($template->exists) {
78+
die "Playbook template not found: $template";
79+
}
80+
81+
$template->copy($dest);
82+
say "Copied: $template -> $dest";
7283
}
7384

74-
$verification_template->copy($verification_dest);
75-
say "Copied: $verification_template -> $verification_dest";
76-
7785
# Generate inventory with VM IP
7886
my $inventory_template = $templates_dir->child('inventory.ini.template');
7987
my $inventory_dest = $ansible_dir->child('inventory.ini');
@@ -93,6 +101,31 @@ sub copy_templates_and_generate_inventory {
93101
say "Ansible setup completed successfully.";
94102
}
95103

104+
=head2 wait_for_cloud_init

Wait for cloud-init completion using Ansible playbook.

    $ansible->wait_for_cloud_init($ansible_dir);

Runs C<wait-for-cloud-init.yml> against C<inventory.ini> from inside
C<$ansible_dir>; both files are referenced by relative name, so they must
already be present in that directory.

Dies if C<ansible-playbook> cannot be launched, is killed by a signal, or
exits with a non-zero status. The error message reports the decoded exit
code, not the raw wait status.

=cut

sub wait_for_cloud_init {
    my ($self, $ansible_dir) = @_;

    say "⏳ Waiting for cloud-init completion using Ansible...";
    STDOUT->flush();

    # The directory is placed inside a single-quoted shell word, so any
    # embedded single quote must be escaped as '\'' to keep the command intact.
    my $quoted_dir = "$ansible_dir" =~ s/'/'\\''/gr;

    # Change to ansible directory and run cloud-init wait playbook
    system("cd '$quoted_dir' && ansible-playbook -i inventory.ini wait-for-cloud-init.yml");
    my $wait_status = $?;

    # system() yields -1 when the command could not be started at all.
    if ($wait_status == -1) {
        die "Ansible cloud-init wait could not be started: $!";
    }

    # The low 7 bits of the wait status hold the terminating signal, if any.
    if (my $signal = $wait_status & 127) {
        die "Ansible cloud-init wait terminated by signal: $signal";
    }

    # The real exit code lives in the high byte of the wait status.
    my $exit_code = $wait_status >> 8;
    if ($exit_code != 0) {
        die "Ansible cloud-init wait failed with exit code: $exit_code";
    }

    say "✅ Cloud-init completion verified via Ansible!";
    STDOUT->flush();
}
128+
96129
=head2 run_verification
97130
98131
Run the post-provision verification playbook.
@@ -118,6 +151,31 @@ sub run_verification {
118151
STDOUT->flush();
119152
}
120153

154+
=head2 restart_vm

Restart the VM after post-provision verification.

    $ansible->restart_vm($ansible_dir);

Runs C<restart-vm.yml> against C<inventory.ini> from inside
C<$ansible_dir>; both files are referenced by relative name, so they must
already be present in that directory.

Dies if C<ansible-playbook> cannot be launched, is killed by a signal, or
exits with a non-zero status. The error message reports the decoded exit
code, not the raw wait status.

=cut

sub restart_vm {
    my ($self, $ansible_dir) = @_;

    say "🔄 Restarting VM using Ansible...";
    STDOUT->flush();

    # The directory is placed inside a single-quoted shell word, so any
    # embedded single quote must be escaped as '\'' to keep the command intact.
    my $quoted_dir = "$ansible_dir" =~ s/'/'\\''/gr;

    # Change to ansible directory and run restart playbook
    system("cd '$quoted_dir' && ansible-playbook -i inventory.ini restart-vm.yml");
    my $wait_status = $?;

    # system() yields -1 when the command could not be started at all.
    if ($wait_status == -1) {
        die "Ansible VM restart could not be started: $!";
    }

    # The low 7 bits of the wait status hold the terminating signal, if any.
    if (my $signal = $wait_status & 127) {
        die "Ansible VM restart terminated by signal: $signal";
    }

    # The real exit code lives in the high byte of the wait status.
    my $exit_code = $wait_status >> 8;
    if ($exit_code != 0) {
        die "Ansible VM restart failed with exit code: $exit_code";
    }

    say "✅ VM restart completed successfully!";
    STDOUT->flush();
}
178+
121179
1;
122180

123181
__END__

project-words.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ keygen
2323
keyrings
2424
libssh
2525
libvirtd
26+
loadavg
2627
LOGLEVEL
2728
memtotal
2829
mkpath

0 commit comments

Comments
 (0)