Skip to content

Commit 9348675

Browse files
committed
(GH-4013) Handle finalizing partition detach in GC
The process of detaching a partition is a two transaction process. When the first transaction succeeds, the partition is now "pending". The second transaction needs an ACCESS EXCLUSIVE lock on the partition and can therefore sometimes fail. When this happens, subsequent GCs will fail because only one pending partition detachment is allowed. To handle this, catch the SQLException and finalize the pending detach operation. If that was a different partition from the partition we are trying to remove, retry the detach operation that failed.
1 parent b7d2772 commit 9348675

File tree

4 files changed

+96
-6
lines changed

4 files changed

+96
-6
lines changed

documentation/release_notes_7.markdown

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,17 @@ canonical: "/puppetdb/latest/release_notes.html"
1515

1616
# PuppetDB: Release notes
1717

18+
## PuppetDB 7.20.1
19+
20+
Released TBD.
21+
22+
### Bug fixes
23+
24+
* Fixed an issue with report garbage collection where a partition would become
25+
partially detached and block future garbage collection progress. Garbage
26+
collection will now finalize the partition detach operation and remove the
27+
table. ([GitHub #4013](https://github.com/puppetlabs/puppetdb/issues/4013))
28+
1829
## PuppetDB 7.20.0
1930

2031
Released October 22 2024

src/puppetlabs/puppetdb/jdbc.clj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
(or ({:admin-shutdown "57P01"
2525
:invalid-regular-expression "2201B"
2626
:program-limit-exceeded "54000"
27+
:not-in-prerequisite-state "55000"
2728
:lock-not-available "55P03"
2829
:query-canceled "57014"}
2930
kw-name)

src/puppetlabs/puppetdb/scf/storage.clj

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1673,6 +1673,22 @@
16731673
(let [{current-db-version :version} (sutils/db-metadata)]
16741674
(not (neg? (compare current-db-version pg14-db)))))
16751675

1676+
(defn finalize-pending-detach
1677+
"Finalize a previously failed detach operation. A partitioned table can
1678+
only have one partition pending detachment at any time."
1679+
[parent]
1680+
(let [pending (->> ["SELECT inhrelid::regclass AS child
1681+
FROM pg_catalog.pg_inherits
1682+
WHERE inhparent = ?::regclass AND inhdetachpending = true"
1683+
parent]
1684+
jdbc/query-to-vec
1685+
first
1686+
:child)]
1687+
(when pending
1688+
(log/info (trs "Finalizing detach for partition {0}" pending))
1689+
(jdbc/do-commands (format "ALTER TABLE %s DETACH PARTITION %s FINALIZE" parent pending))
1690+
(str pending))))
1691+
16761692
(defn prune-daily-partitions
16771693
"Either detaches or drops obsolete day-oriented partitions
16781694
older than the date. Deletes or detaches only the oldest such candidate if
@@ -1698,14 +1714,27 @@
16981714
candidates (->> (partitioning/get-partition-names table-prefix)
16991715
(filter expired?)
17001716
sort)
1701-
drop-one (fn [table]
1717+
detach (fn detach [parent child]
1718+
(jdbc/do-commands-outside-txn
1719+
(format "alter table %s detach partition %s concurrently" parent child)))
1720+
drop-one (fn drop-one [table]
17021721
(update-lock-status status-key inc)
17031722
(try!
17041723
(if just-detach?
1705-
(jdbc/do-commands-outside-txn
1706-
(format "alter table %s detach partition %s concurrently" table-prefix table))
1724+
(let [ex (try
1725+
(detach table-prefix table)
1726+
(catch SQLException ex
1727+
(if (= (jdbc/sql-state :not-in-prerequisite-state) (.getSQLState ex))
1728+
ex
1729+
(throw ex))))]
1730+
(when (instance? SQLException ex)
1731+
(let [finalized-table (finalize-pending-detach table-prefix)]
1732+
(when-not (= finalized-table table)
1733+
;; Retry, unless the finalized partition detach was
1734+
;; for the same table
1735+
(detach table-prefix table)))))
17071736
(jdbc/do-commands
1708-
(format "drop table if exists %s cascade" table)))
1737+
(format "drop table if exists %s cascade" table)))
17091738
(finally
17101739
(update-lock-status status-key dec))))
17111740
drop #(if incremental?

test/puppetlabs/puppetdb/cli/services_test.clj

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
[puppetlabs.puppetdb.command.constants :as cmd-consts]
66
[puppetlabs.puppetdb.lint :refer [ignore-value]]
77
[puppetlabs.puppetdb.scf.partitioning
8-
:refer [get-temporal-partitions]]
8+
:refer [get-temporal-partitions]
9+
:as part]
910
[puppetlabs.trapperkeeper.testutils.logging
1011
:refer [with-log-output
1112
logs-matching
@@ -607,4 +608,52 @@
607608
:report-ttl (time/parse-period "1d")
608609
:resource-events-ttl (time/parse-period "1d")
609610
:db-lock-status db-lock-status})
610-
(is (empty? (jdbc/query ["SELECT * FROM reports"])))))))))
611+
(is (empty? (jdbc/query ["SELECT * FROM reports"]))))
612+
613+
;; This test is not applicable unless our Postgres version is new enough
614+
;; to support the concurrent partition detach feature.
615+
(when (scf-store/detach-partitions-concurrently?)
616+
(testing "a partition stuck in the pending state is finalized and removed"
617+
(let [old-ts (-> 2 time/days time/ago)
618+
partition-table (format "reports_%s"
619+
(part/date-suffix (part/to-zoned-date-time (time/to-timestamp old-ts))))
620+
lock-acquired (promise)
621+
partition-pending-detach (promise)]
622+
(store-report (time/to-string old-ts))
623+
(store-report (to-string (now)))
624+
625+
(future
626+
;; Create a query that will block the ACCESS EXCLUSIVE lock needed
627+
;; by the second transaction of the concurrent detach below
628+
(jdbc/with-transacted-connection *read-db*
629+
(jdbc/with-db-transaction []
630+
(jdbc/query [(format "select * from %s" partition-table)])
631+
(deliver lock-acquired partition-table)
632+
633+
;; wait for partition detach to fail
634+
@partition-pending-detach)))
635+
636+
;; Wait until we are sure that the detach partition operation will be blocked
637+
@lock-acquired
638+
639+
(try
640+
(jdbc/do-commands-outside-txn
641+
"SET statement_timeout = 100"
642+
(format "ALTER TABLE reports DETACH PARTITION %s CONCURRENTLY" partition-table))
643+
(catch java.sql.SQLException _)
644+
(finally
645+
(deliver partition-pending-detach partition-table)
646+
(jdbc/do-commands-outside-txn "SET statement_timeout = 0")))
647+
648+
(is (= [{:inhdetachpending true}]
649+
(jdbc/query ["select inhdetachpending from pg_catalog.pg_inherits where inhparent = 'reports'::regclass and inhrelid = ?::regclass" partition-table])))
650+
651+
(svcs/sweep-reports! *db* {:incremental? false
652+
:report-ttl (time/parse-period "1d")
653+
:resource-events-ttl (time/parse-period "1d")
654+
:db-lock-status db-lock-status})
655+
656+
(is (empty?
657+
(jdbc/query ["SELECT tablename FROM pg_tables WHERE tablename = ?" partition-table])))
658+
659+
(jdbc/do-commands "DELETE FROM reports")))))))))

0 commit comments

Comments
 (0)