memcp/run_sql_tests.py at master · launix-de/memcp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
#!/usr/bin/env python3
"""
MemCP SQL Test Runner (Optimized)

Runs structured SQL/SPARQL tests from YAML files.
- Declarative tests
- Unified execution path
- Compact success logging, verbose failure logging

Requirements (pip): requests, PyYAML
If missing, create a venv and install, e.g.:
  python3 -m venv .venv && . .venv/bin/activate && pip install -U requests PyYAML
"""

import sys
import os

# Dependency checks with clear install hints
try:
    import yaml  # PyYAML
except Exception as e:
    print("Missing dependency: PyYAML (module 'yaml').")
    print("Install with: pip install -U PyYAML")
    sys.exit(2)

try:
    import requests
except Exception as e:
    print("Missing dependency: requests.")
    print("Install with: pip install -U requests")
    sys.exit(2)

import json
import subprocess
import time
import threading
import multiprocessing
from pathlib import Path
from base64 import b64encode
from typing import Dict, List, Any, Optional, Tuple
from urllib.parse import quote

# CPU measurement helpers
NUM_CPUS = multiprocessing.cpu_count()

def find_memcp_pid() -> Optional[int]:
    """Find the PID of the memcp process."""
    try:
        result = subprocess.run(['pgrep', '-f', 'memcp'], capture_output=True, text=True, timeout=2)
        pids = result.stdout.strip().split('\n')
        for pid_str in pids:
            if pid_str.strip():
                return int(pid_str.strip())
    except:
        pass
    return None

def get_process_cpu_times(pid: int) -> Optional[Tuple[float, float]]:
    """Get user and system CPU times for a process from /proc/[pid]/stat.
    Returns (utime + cutime, stime + cstime) in seconds, or None if unavailable."""
    try:
        with open(f'/proc/{pid}/stat', 'r') as f:
            parts = f.read().split()
            # Fields: utime(14), stime(15), cutime(16), cstime(17) - 1-indexed in docs, 0-indexed here
            utime = int(parts[13])  # user time
            stime = int(parts[14])  # system time
            cutime = int(parts[15])  # children user time
            cstime = int(parts[16])  # children system time
            # Convert from clock ticks to seconds (typically 100 Hz = 100 ticks/sec)
            hz = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
            total_time = (utime + stime + cutime + cstime) / hz
            return total_time
    except:
        return None

def measure_cpu_load(pid: int, start_cpu: float, end_cpu: float, elapsed_sec: float) -> Optional[float]:
    """Calculate CPU load as percentage of total CPU capacity.
    Returns percentage where 100% = one core fully utilized, NUM_CPUS*100% = all cores."""
    if start_cpu is None or end_cpu is None or elapsed_sec <= 0:
        return None
    cpu_used = end_cpu - start_cpu
    # CPU load as percentage of wall time (100% = 1 core, 200% = 2 cores, etc.)
    return (cpu_used / elapsed_sec) * 100

# Global flag for connect-only mode
is_connect_only_mode = False

# Performance test configuration
PERF_TEST_ENABLED = os.environ.get("PERF_TEST", "0") == "1"
PERF_CALIBRATE = os.environ.get("PERF_CALIBRATE", "0") == "1"  # reset baselines to current times
PERF_NORECALIBRATE = os.environ.get("PERF_NORECALIBRATE", "0") == "1"  # freeze row counts for bisecting
PERF_EXPLAIN = os.environ.get("PERF_EXPLAIN", "0") == "1"  # show query plans
PERF_BASELINE_FILE = ".perf_baseline.json"
PERF_THRESHOLD_FACTOR = 1.1  # 10% tolerance over baseline
PERF_TARGET_MIN_MS = 10000  # target minimum query time (10s)
PERF_TARGET_MAX_MS = 20000  # target maximum query time (20s)
PERF_SCALE_FACTOR = 1.3  # scale up/down by 30%
PERF_DEFAULT_ROWS = 10000  # default starting row count
PERF_MAX_ROWS = 10_000_000  # allow large datasets for proper calibration
PERF_MAX_RAM_FRACTION = 0.3  # max 30% of RAM for table data

def get_max_rows_for_ram(bytes_per_row: int = 100) -> int:
    """Calculate max rows based on available RAM (30% limit)."""
    try:
        with open('/proc/meminfo', 'r') as f:
            for line in f:
                if line.startswith('MemTotal:'):
                    total_kb = int(line.split()[1])
                    max_bytes = int(total_kb * 1024 * PERF_MAX_RAM_FRACTION)
                    return max_bytes // bytes_per_row
    except:
        pass
    return 10_000_000  # fallback: 10M rows

class SQLTestRunner:
    def __init__(self, base_url="http://localhost:4321", username="root", password="admin", default_database="memcp-tests", suite_db_cleanup=True):
        self.base_url = base_url
        self.username = username
        self.password = password
        self.default_database = default_database
        self.suite_db_cleanup = suite_db_cleanup
        self.auth_header = self._create_auth_header()
        self.test_count = 0
        self.test_passed = 0
        self.failed_tests = []  # list of (name, is_noncritical)
        self.failed_critical = 0
        self.failed_noncritical = 0
        self.noncritical_count = 0
        self.noncritical_passed = 0
        self.setup_operations = []
        self.current_database = None
        self._ensured_dbs = set()
        self.suite_metadata = {}
        self._restart_handler = None  # callable to restart memcp between tests
        self.suite_syntax = None
        self.perf_baselines = {}  # test_name -> {"time_ms": float, "rows": int}
        self.perf_results = {}  # test_name -> {"time_ms": float, "rows": int}

    def set_restart_handler(self, fn):
        """Install a restart handler callable that restarts MemCP (returns True on success)."""
        self._restart_handler = fn

    def load_perf_baselines(self):
        """Load performance baselines from config file."""
        try:
            with open(PERF_BASELINE_FILE, 'r') as f:
                self.perf_baselines = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            self.perf_baselines = {}

    def save_perf_baselines(self):
        """Save updated performance baselines to config file.

        All tests in a suite share the same row count (based on slowest test)
        to avoid issues with DELETE operations.
        """
        if not self.perf_results:
            return

        max_rows = get_max_rows_for_ram()

        if PERF_NORECALIBRATE:
            # Just update times, keep existing rows
            for name, result in self.perf_results.items():
                current_rows = result["rows"]
                if name in self.perf_baselines:
                    self.perf_baselines[name]["time_ms"] = round(result["time_ms"], 1)
                else:
                    self.perf_baselines[name] = {"time_ms": round(result["time_ms"], 1), "rows": current_rows}
        else:
            # Scale each test independently to target 10-20s
            for name, result in self.perf_results.items():
                current_rows = result["rows"]
                test_time = result["time_ms"]

                if test_time < PERF_TARGET_MIN_MS:
                    new_rows = int(current_rows * PERF_SCALE_FACTOR)
                elif test_time > PERF_TARGET_MAX_MS:
                    new_rows = max(1000, int(current_rows / PERF_SCALE_FACTOR))
                else:
                    new_rows = current_rows

                # Apply RAM limit and hard cap
                new_rows = min(new_rows, max_rows, PERF_MAX_ROWS)

                self.perf_baselines[name] = {
                    "time_ms": round(test_time, 1),
                    "rows": new_rows
                }

        with open(PERF_BASELINE_FILE, 'w') as f:
            json.dump(self.perf_baselines, f, indent=2)
        print(f"📏 Updated performance baselines in {PERF_BASELINE_FILE}")

    # ----------------------
    # SQL identifier quoting
    # ----------------------
    def _quote_ident(self, name: str) -> str:
        if name is None:
            return "``"
        # Escape backticks by doubling them
        return f"`{str(name).replace('`', '``')}`"

    # ----------------------
    # Helpers
    # ----------------------
    def _create_auth_header(self):
        credentials = f"{self.username}:{self.password}"
        encoded = b64encode(credentials.encode()).decode()
        return {"Authorization": f"Basic {encoded}"}

    def _record_success(self, name: str, is_noncritical: bool = False, elapsed_ms: float = None, threshold_ms: float = None, rows: int = None, heap_mb: float = None, cpu_pct: float = None):
        self.test_passed += 1
        if elapsed_ms is not None and threshold_ms is not None:
            rows_info = f", {rows:,} rows" if rows else ""
            # Calculate time per row in microseconds
            if rows and rows > 0:
                us_per_row = (elapsed_ms * 1000) / rows
                rate_info = f", {us_per_row:.2f}µs/row"
            else:
                rate_info = ""
            # Show heap memory if available
            mem_info = f", {heap_mb:.1f}MB heap" if heap_mb else ""
            # Show CPU load as percentage of total capacity (100%/Ncores = one core)
            cpu_info = f", {cpu_pct:.0f}%/{NUM_CPUS*100}% CPU" if cpu_pct is not None else ""
            print(f"✅ {name} ({elapsed_ms:.1f}ms / {threshold_ms:.0f}ms{rows_info}{rate_info}{mem_info}{cpu_info})")
        else:
            print(f"✅ {name}")
        if is_noncritical:
            self.noncritical_passed += 1
            print(f"   ⚠️  Passed but flagged noncritical — enable soon")

    def _record_fail(self, name: str, reason: str, query: str, response: Optional[requests.Response], expect, is_noncritical: bool = False, elapsed_ms: float = None, threshold_ms: float = None):
        self.failed_tests.append((name, is_noncritical))
        if is_noncritical:
            self.failed_noncritical += 1
        else:
            self.failed_critical += 1
        time_info = f" ({elapsed_ms:.1f}ms / {threshold_ms:.0f}ms)" if elapsed_ms is not None else ""
        print(f"❌ {name}{' (noncritical)' if is_noncritical else ''}{time_info}")
        print(f"    Reason: {reason}")
        if query:
            print(f"    Query: {query[:200]}{'...' if len(query) > 200 else ''}")
        if response is not None:
            print(f"    HTTP {response.status_code}: {response.text[:500]}{'...' if len(response.text) > 500 else ''}")
        if expect is not None:
            print(f"    Expected: {expect}\n")

    # ----------------------
    # Core execution
    # ----------------------
    def ensure_database(self, database: str) -> None:
        if not database or database == "system" or database in self._ensured_dbs:
            return
        try:
            url = f"{self.base_url}/sql/system"
            create_db_sql = f"CREATE DATABASE IF NOT EXISTS {self._quote_ident(database)}"
            requests.post(url, data=create_db_sql, headers=self.auth_header, timeout=10)
            # verify availability with a lightweight call
            check_url = f"{self.base_url}/sql/{quote(database, safe='')}"
            for _ in range(3):
                resp = requests.post(check_url, data="SHOW TABLES", headers=self.auth_header, timeout=10)
                if resp is not None and "database" not in resp.text.lower():
                    self._ensured_dbs.add(database)
                    break
                time.sleep(0.1)
        except Exception:
            pass

    def execute_sql(self, database: str, query: str, auth_header: Optional[Dict[str, str]] = None, syntax: Optional[str] = None, session_id: Optional[str] = None, timeout: int = 10) -> Optional[requests.Response]:
        # proactively ensure database exists (works for connect-only too)
        self.ensure_database(database)
        encoded_db = quote(database, safe='')
        normalized = self._normalize_syntax(syntax)
        route = "psql" if normalized == "postgresql" else "sql"
        url = f"{self.base_url}/{route}/{encoded_db}"
        headers = dict(auth_header if auth_header is not None else self.auth_header)
        if session_id:
            headers["X-Session-Id"] = session_id
        # Try request; if connection fails, wait for memcp to be ready and retry a few times
        for attempt in range(5):
            try:
                return requests.post(url, data=query, headers=headers, timeout=timeout)
            except Exception:
                # parse port from base_url
                try:
                    port = int(self.base_url.rsplit(':', 1)[1])
                except Exception:
                    port = 4321
                wait_for_memcp(port, timeout=5)
        return None

    def _normalize_syntax(self, syntax: Optional[str]) -> Optional[str]:
        if not syntax:
            return None
        syntax_lower = str(syntax).strip().lower()
        if syntax_lower in ["mysql", "default"]:
            return None
        if syntax_lower in ["postgres", "postgresql", "psql"]:
            return "postgresql"
        return syntax_lower

    def execute_sparql(self, database: str, query: str, auth_header: Optional[Dict[str, str]] = None, timeout: int = 10) -> Optional[requests.Response]:
        try:
            encoded_db = quote(database, safe='')
            url = f"{self.base_url}/rdf/{encoded_db}"
            headers = auth_header if auth_header is not None else self.auth_header
            return requests.post(url, data=query, headers=headers, timeout=timeout)
        except Exception as e:
            print(f"Error executing SPARQL: {e}")
            return None

    def load_ttl(self, database: str, ttl_data: str) -> bool:
        try:
            self.ensure_database(database)
            url = f"{self.base_url}/rdf/{quote(database, safe='')}/load_ttl"
            response = requests.post(url, data=ttl_data, headers=self.auth_header, timeout=10)
            return response is not None and response.status_code == 200
        except Exception as e:
            print(f"Error loading TTL data: {e}")
            return False

    def parse_jsonl_response(self, response: requests.Response) -> Optional[List[Dict]]:
        if response is None:
            return None
        text = response.text.strip()
        if not text:
            return []
        results = []
        for line in text.splitlines():
            try:
                results.append(json.loads(line))
            except:
                continue
        return results

    # ----------------------
    # Test execution
    # ----------------------
    def run_test_case(self, test_case: Dict, database: str) -> bool:
        self.test_count += 1
        name = test_case.get("name", f"Test {self.test_count}")
        is_noncritical = bool(test_case.get("noncritical"))
        if is_noncritical:
            self.noncritical_count += 1

        # Performance test handling
        yaml_threshold_ms = test_case.get("threshold_ms")
        is_perf_test = yaml_threshold_ms is not None
        perf_rows = PERF_DEFAULT_ROWS  # default row count
        if is_perf_test:
            # Get baseline data if available
            baseline = self.perf_baselines.get(name, {})
            if isinstance(baseline, dict):
                baseline_time = baseline.get("time_ms")
                baseline_rows = baseline.get("rows", PERF_DEFAULT_ROWS)
            else:
                # Legacy format: just a number
                baseline_time = baseline
                baseline_rows = PERF_DEFAULT_ROWS

            # Use baseline time × 1.3 as threshold ONLY if in target range
            # During scaling, use YAML threshold (generous)
            if baseline_time and not PERF_CALIBRATE and baseline_time >= PERF_TARGET_MIN_MS:
                threshold_ms = baseline_time * PERF_THRESHOLD_FACTOR
            else:
                threshold_ms = yaml_threshold_ms

            # Use baseline rows (which may have been scaled)
            perf_rows = baseline_rows

            if not PERF_TEST_ENABLED:
                print(f"⏭️  {name} (skipped - set PERF_TEST=1 to run)")
                self.test_count -= 1  # don't count skipped perf tests
                return True

        # Per-test setup steps (SQL and/or Scheme, e.g. perf data generation)
        # Supports {rows} and {database} template placeholders for perf tests
        test_setup_steps = test_case.get("setup")
        heap_bytes = 0
        if test_setup_steps and isinstance(test_setup_steps, list):
            for step in test_setup_steps:
                if "sql" in step:
                    sql_code = step["sql"]
                    if is_perf_test:
                        sql_code = sql_code.replace("{rows}", str(perf_rows)).replace("{database}", database)
                    resp = self.execute_sql(database, sql_code, syntax=self.suite_syntax)
                elif "scm" in step:
                    scm = step["scm"]
                    if is_perf_test:
                        scm = scm.replace("{rows}", str(perf_rows)).replace("{database}", database)
                    url = f"{self.base_url}/scm"
                    try:
                        resp = requests.post(url, data=scm, headers=self.auth_header, timeout=600)
                    except Exception as e:
                        return self._record_fail(name, f"Setup SCM error: {e}", scm, None, None, is_noncritical)
                    if resp is None or resp.status_code != 200:
                        return self._record_fail(name, "Setup SCM failed", scm, resp, None, is_noncritical)
                    # Extract heap stats from response if available
                    try:
                        result = json.loads(resp.text.strip())
                        if isinstance(result, list) and len(result) >= 2:
                            heap_bytes = result[1]
                    except:
                        pass

        # Scheme code execution via /scm endpoint
        scm_code = test_case.get("scm")
        if scm_code:
            if is_perf_test:
                scm_code = scm_code.replace("{rows}", str(perf_rows)).replace("{database}", database)
            expect = test_case.get("expect", {})
            expect_error = expect.get("error", False)
            try:
                url = f"{self.base_url}/scm"
                resp = requests.post(url, data=scm_code, headers=self.auth_header, timeout=600)
            except Exception as e:
                if expect_error:
                    self._record_success(name, is_noncritical)
                    return True
                return self._record_fail(name, f"SCM exception: {e}", scm_code, None, None, is_noncritical)
            if resp is None:
                if expect_error:
                    self._record_success(name, is_noncritical)
                    return True
                return self._record_fail(name, "No response from /scm", scm_code, None, None, is_noncritical)
            if expect_error:
                if resp.status_code != 200:
                    self._record_success(name, is_noncritical)
                    return True
                return self._record_fail(name, f"Expected error but got 200: {resp.text[:200]}", scm_code, None, None, is_noncritical)
            if resp.status_code != 200:
                return self._record_fail(name, f"SCM error ({resp.status_code}): {resp.text[:200]}", scm_code, None, None, is_noncritical)
            self._record_success(name, is_noncritical)
            return True

        query = test_case.get("sql") or test_case.get("sparql")
        if query and is_perf_test:
            query = query.replace("{rows}", str(perf_rows)).replace("{database}", database)
        is_sparql = "sparql" in test_case
        # auth: allow per-test overrides, fallback to suite metadata, then runner defaults
        tc_user = test_case.get("username") or self.suite_metadata.get("username") or self.username
        tc_pass = test_case.get("password") or self.suite_metadata.get("password") or self.password
        creds = f"{tc_user}:{tc_pass}".encode()
        auth_header = {"Authorization": f"Basic {b64encode(creds).decode()}"}
        test_syntax = test_case.get("syntax")
        active_syntax = self._normalize_syntax(test_syntax) if test_syntax is not None else self.suite_syntax
        session_id = test_case.get("session_id")
        sql_timeout = int(test_case.get("timeout", 10))

        # TTL preload if SPARQL
        if is_sparql and "ttl_data" in test_case:
            if not self.load_ttl(database, test_case["ttl_data"]):
                return self._record_fail(name, "TTL load failed", query, None, None)

        # Special handling: SHUTDOWN command triggers graceful restart flow
        response: Optional[requests.Response]
        cpu_pct = None  # CPU load percentage, measured during query execution
        if query and query.strip().upper() == "SHUTDOWN":
            # Issue shutdown
            resp = self.execute_sql(database, query, auth_header, active_syntax)
            if resp is not None and resp.status_code >= 500:
                response = resp
            else:
                # Treat SHUTDOWN as successful regardless of response body, even if the connection closed.
                if self._restart_handler is not None:
                    self._restart_handler()
                else:
                    # No restart handler (--connect-only): wait for external supervisor to restart
                    import time as _time
                    for _i in range(60):
                        _time.sleep(1)
                        try:
                            if requests.get(self.base_url, timeout=2).status_code < 500:
                                break
                        except Exception:
                            pass
                self._record_success(name, is_noncritical)
                return True

            response = resp
        else:
            # Show query plan if PERF_EXPLAIN is enabled
            if is_perf_test and PERF_EXPLAIN and not is_sparql:
                explain_resp = self.execute_sql(database, f"DESCRIBE {query}", auth_header, active_syntax)
                if explain_resp and explain_resp.status_code == 200:
                    print(f"    📋 Query plan for {name}:")
                    for line in explain_resp.text.strip().split('\n')[:10]:
                        print(f"       {line[:120]}")

            # Warmup runs for performance tests (2 unmeasured runs before the measured one)
            if is_perf_test and test_case.get("warmup", True):
                for _ in range(2):
                    self.execute_sparql(database, query, auth_header, timeout=sql_timeout) if is_sparql else self.execute_sql(database, query, auth_header, active_syntax, timeout=sql_timeout)

            # Get memcp PID and start CPU measurement for perf tests
            memcp_pid = find_memcp_pid() if is_perf_test else None
            start_cpu = get_process_cpu_times(memcp_pid) if memcp_pid else None

            # Execute query (with timing for perf tests)
            start_time = time.monotonic()
            response = self.execute_sparql(database, query, auth_header, timeout=sql_timeout) if is_sparql else self.execute_sql(database, query, auth_header, active_syntax, session_id=session_id, timeout=sql_timeout)
            elapsed_ms = (time.monotonic() - start_time) * 1000
            elapsed_sec = elapsed_ms / 1000

            # End CPU measurement
            end_cpu = get_process_cpu_times(memcp_pid) if memcp_pid else None
            cpu_pct = measure_cpu_load(memcp_pid, start_cpu, end_cpu, elapsed_sec) if memcp_pid else None

        if response is None:
            return self._record_fail(name, "No response", query, None, None, is_noncritical)

        results = self.parse_jsonl_response(response)

        # Check performance threshold
        if is_perf_test and elapsed_ms > threshold_ms:
            return self._record_fail(name, f"Too slow: {elapsed_ms:.1f}ms > {threshold_ms:.0f}ms", query, response,
                                     test_case.get("expect"), is_noncritical, elapsed_ms, threshold_ms)

        if self.validate_expectation(test_case, response, results):
            if is_perf_test:
                heap_mb = heap_bytes / (1024 * 1024) if heap_bytes else None
                self._record_success(name, is_noncritical, elapsed_ms, threshold_ms, perf_rows, heap_mb, cpu_pct)
                # Store result for baseline update (time and row count)
                self.perf_results[name] = {"time_ms": elapsed_ms, "rows": perf_rows}
            else:
                self._record_success(name, is_noncritical)
            return True
        else:
            return self._record_fail(name, "Expectation mismatch", query, response, test_case.get("expect"), is_noncritical)

    def validate_expectation(self, test_case: Dict, response: requests.Response, results: Optional[List[Dict]]) -> bool:
        expect = test_case.get("expect", {})

        if expect.get("error"):
            return response.status_code != 200 or "Error" in response.text

        if "Error" in response.text or response.status_code != 200:
            return False

        if "affected_rows" in expect:
            expected = expect["affected_rows"]
            if results and results and "affected_rows" in results[0]:
                return results[0]["affected_rows"] == expected
            return True

        if results is None:
            return False

        if expect.get("rows") is not None:
            if len(results) != expect["rows"]:
                return False

        if expect.get("data"):
            for i, row in enumerate(expect["data"]):
                if i >= len(results):
                    return False
                for k, v in row.items():
                    if isinstance(v, float) and isinstance(results[i].get(k), (int, float)):
                        if abs(results[i][k] - v) > 0.01:
                            return False
                    elif results[i].get(k) != v:
                        return False
        return True

    # ----------------------
    # Setup & Cleanup
    # ----------------------
    def run_setup(self, setup_steps: List[Dict], database: str) -> bool:
        self.setup_operations = []
        self.current_database = database
        for step in setup_steps:
            self.setup_operations.append(step)
            resp = self.execute_sql(database, step['sql'], syntax=self.suite_syntax)
            if resp is None or resp.status_code not in [200, 500]:
                return False
        return True

    def run_cleanup(self, cleanup_steps: List[Dict], database: str) -> None:
        for step in cleanup_steps:
            self.execute_sql(database, step['sql'], syntax=self.suite_syntax)

    def cleanup_test_database(self, database: str) -> None:
        try:
            global is_connect_only_mode
            if is_connect_only_mode:
                resp = self.execute_sql(database, "SHOW TABLES")
                if resp is not None and resp.status_code == 200:
                    try:
                        tables = resp.json().get('data', [])
                        for row in tables:
                            tbl = list(row.values())[0]
                            self.execute_sql(database, f"DROP TABLE IF EXISTS {self._quote_ident(tbl)}")
                    except:
                        pass
                return
            self.execute_sql("system", f"DROP DATABASE IF EXISTS {self._quote_ident(database)}")
            # drop ensures next ensure_database will recreate
            if database in self._ensured_dbs:
                try:
                    self._ensured_dbs.remove(database)
                except KeyError:
                    pass
        except:
            pass

    # ----------------------
    # Parallel test groups
    # ----------------------
    def _run_parallel_group(self, tests: List[Dict], database: str) -> None:
        """Run a list of test cases concurrently using threads."""
        results = [None] * len(tests)  # True/False per test
        lock = threading.Lock()

        def run_one(idx, tc):
            ok = self.run_test_case(tc, database)
            with lock:
                results[idx] = ok

        threads = []
        for idx, tc in enumerate(tests):
            t = threading.Thread(target=run_one, args=(idx, tc), daemon=True)
            threads.append(t)
            t.start()
        for t in threads:
            t.join(timeout=120)

    # ----------------------
    # Spec Runner
    # ----------------------
    def run_test_spec(self, spec_file: str) -> bool:
        with open(spec_file, 'r') as f:
            spec = yaml.safe_load(f)

        metadata = spec.get('metadata', {})
        self.suite_metadata = metadata or {}
        self.suite_syntax = self._normalize_syntax(self.suite_metadata.get("syntax"))
        database = self.default_database

        if metadata.get('disabled'):
            print(f"⏭️  Suite disabled: {metadata.get('description', spec_file)}")
            return True

        # Load performance baselines for this machine
        if PERF_TEST_ENABLED:
            self.load_perf_baselines()
            if PERF_CALIBRATE:
                self.perf_baselines = {}  # reset rows to PERF_DEFAULT_ROWS
                print("🔧 Calibration mode: resetting baselines to current times")

        print(f"🎯 Running suite: {metadata.get('description', spec_file)}")
        print(f"💾 Database: {database}")

        # Optional suite-level DB isolation. In shared-server parallel runs we
        # disable this to avoid cross-suite DROP/CREATE races.
        if self.suite_db_cleanup:
            self.cleanup_test_database(database)
        self.ensure_database(database)

        if spec.get('setup') and not self.run_setup(spec['setup'], database):
            print("❌ Setup failed")
            return False

        # Group consecutive test cases by 'parallel' key and run groups concurrently
        test_cases = spec.get('test_cases', [])
        i = 0
        while i < len(test_cases):
            tc = test_cases[i]
            group = tc.get('parallel')
            if group:
                # Collect all consecutive tests with the same parallel group
                group_tests = [tc]
                j = i + 1
                while j < len(test_cases) and test_cases[j].get('parallel') == group:
                    group_tests.append(test_cases[j])
                    j += 1
                print(f"⚡ Running {len(group_tests)} tests in parallel group '{group}'")
                self._run_parallel_group(group_tests, database)
                i = j
            else:
                self.run_test_case(tc, database)
                i += 1

        if spec.get('cleanup'):
            self.run_cleanup(spec['cleanup'], database)
        if self.suite_db_cleanup:
            self.cleanup_test_database(database)

        print("="*60)
        total = self.test_count
        passed = self.test_passed
        failed_total = len(self.failed_tests)
        failed_noncrit = self.failed_noncritical
        failed_crit = self.failed_critical
        print(f"📊 Results: {passed}/{total} passed | Failures: {failed_total} | Noncritical failures: {failed_noncrit}")
        if self.failed_tests:
            print("❌ Failed:")
            for name, is_noncrit in self.failed_tests:
                suffix = " (noncritical)" if is_noncrit else ""
                print(f"   - {name}{suffix}")
        else:
            print("🎉 All tests passed!")
        print("="*60)

        # Update performance baselines on success (or in calibration mode)
        if PERF_TEST_ENABLED and self.perf_results and (failed_crit == 0 or PERF_CALIBRATE):
            self.save_perf_baselines()

        # Suite success is determined solely by critical tests
        return failed_crit == 0

def wait_for_memcp(port=4321, timeout=30) -> bool:
    for _ in range(timeout):
        try:
            requests.get(f"http://localhost:{port}", timeout=2)
            return True
        except:
            time.sleep(1)
    return False

def start_memcp_process(port: int) -> subprocess.Popen | None:
    try:
        datadir = os.environ.get("MEMCP_TEST_DATADIR", f"/tmp/memcp-sql-tests-{port}")
        env = os.environ.copy()
        devnull = open(os.devnull, 'w')
        proc = subprocess.Popen([
            "./memcp", "-data", datadir,
            f"--api-port={port}", f"--mysql-port={port+1000}",
            "--disable-mysql", "lib/main.scm"
        ], cwd=os.path.dirname(os.path.abspath(__file__)),
           env=env, stdin=subprocess.PIPE, stdout=devnull, stderr=devnull, text=True)
        if not wait_for_memcp(port):
            return None
        return proc
    except Exception:
        return None

def stop_memcp_process(proc: subprocess.Popen) -> None:
    try:
        proc.stdin.close()
        proc.wait(timeout=10)
    except Exception:
        try:
            proc.kill()
        except Exception:
            pass

def kill_memcp_by_port(port: int) -> None:
    pattern = f"memcp.*--api-port={port}"
    try:
        subprocess.run(["pkill", "-f", pattern], check=False)
    except Exception:
        pass

def main():
    if len(sys.argv) < 2:
        print("Usage: python3 run_sql_tests.py <test_spec.yaml> [port] [--connect-only] [--no-db-cleanup]")
        sys.exit(1)

    spec_file = sys.argv[1]
    port = 4321
    connect_only = False
    suite_db_cleanup = True

    for arg in sys.argv[2:]:
        if arg == "--connect-only":
            connect_only = True
        elif arg == "--no-db-cleanup":
            suite_db_cleanup = False
        elif arg.isdigit():
            port = int(arg)

    base_url = f"http://localhost:{port}"
    global is_connect_only_mode
    is_connect_only_mode = connect_only

    memcp_process = None
    if connect_only:
        try:
            requests.get(base_url, timeout=2)
        except:
            print(f"❌ Cannot connect to MemCP on port {port}")
            sys.exit(1)
    else:
        try:
            requests.get(base_url, timeout=2)
        except:
            memcp_process = start_memcp_process(port)
            if not memcp_process:
                print("❌ Failed to start MemCP")
                sys.exit(1)

    runner = SQLTestRunner(base_url, suite_db_cleanup=suite_db_cleanup)
    if not connect_only:
        def restart_handler() -> bool:
            nonlocal memcp_process
            if memcp_process:
                stop_memcp_process(memcp_process)
                memcp_process = None
            memcp_process = start_memcp_process(port)
            return memcp_process is not None
        runner.set_restart_handler(restart_handler)
    success = runner.run_test_spec(spec_file)

    if not connect_only and memcp_process:
        stop_memcp_process(memcp_process)

    sys.exit(0 if success else 1)

if __name__ == "__main__":
    main()