Skip to content

Commit 18fa7cc

Browse files
authored
Merge pull request #4014 from austb/gh-4013/main/handle-report-gc-failures
Handle failure cases of report gc
2 parents 25e8e0f + 7d86914 commit 18fa7cc

File tree

5 files changed

+156
-20
lines changed

5 files changed

+156
-20
lines changed

documentation/release_notes_7.markdown

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,21 @@ canonical: "/puppetdb/latest/release_notes.html"
1515

1616
# PuppetDB: Release notes
1717

18+
## PuppetDB 7.20.1
19+
20+
Released TBD.
21+
22+
### Bug fixes
23+
24+
* Fixed an issue with report garbage collection where a partition would become
25+
partially detached and block future garbage collection progress. Garbage
26+
collection will now finalize the partition detach operation and remove the
27+
table. ([GitHub #4013](https://github.com/puppetlabs/puppetdb/issues/4013))
28+
* Fixed an issue with report garbage collection where a partition would be
29+
detached, but the table was never deleted. Garbage collection will now
30+
identify and clean-up these tables.
31+
([GitHub #4013](https://github.com/puppetlabs/puppetdb/issues/4013))
32+
1833
## PuppetDB 7.20.0
1934

2035
Released October 22 2024

src/puppetlabs/puppetdb/jdbc.clj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
(or ({:admin-shutdown "57P01"
2525
:invalid-regular-expression "2201B"
2626
:program-limit-exceeded "54000"
27+
:not-in-prerequisite-state "55000"
2728
:lock-not-available "55P03"
2829
:query-canceled "57014"}
2930
kw-name)

src/puppetlabs/puppetdb/scf/partitioning.clj

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
(ns puppetlabs.puppetdb.scf.partitioning
22
"Handles all work related to database table partitioning"
33
(:require
4+
[clojure.string :refer [lower-case]]
45
[puppetlabs.i18n.core :refer [trs]]
56
[puppetlabs.puppetdb.jdbc :as jdbc]
67
[schema.core :as s])
@@ -32,8 +33,8 @@
3233

3334
(defn date-suffix
3435
[date]
35-
(let [formatter (.withZone (DateTimeFormatter/BASIC_ISO_DATE) (ZoneId/of "UTC"))]
36-
(.format date formatter)))
36+
(let [formatter (.withZone DateTimeFormatter/BASIC_ISO_DATE (ZoneId/of "UTC"))]
37+
(lower-case (.format date formatter))))
3738

3839
(defn to-zoned-date-time
3940
[date]

src/puppetlabs/puppetdb/scf/storage.clj

Lines changed: 64 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1673,6 +1673,39 @@
16731673
(let [{current-db-version :version} (sutils/db-metadata)]
16741674
(not (neg? (compare current-db-version pg14-db)))))
16751675

1676+
(defn finalize-pending-detach
1677+
"Finalize a previously failed detach operation. A partitioned table can
1678+
only have one partition pending detachment at any time."
1679+
[parent]
1680+
(let [pending (->> ["SELECT inhrelid::regclass AS child
1681+
FROM pg_catalog.pg_inherits
1682+
WHERE inhparent = ?::regclass AND inhdetachpending = true"
1683+
parent]
1684+
jdbc/query-to-vec
1685+
first
1686+
:child)]
1687+
(when pending
1688+
(log/info (trs "Finalizing detach for partition {0}" pending))
1689+
(jdbc/do-commands (format "ALTER TABLE %s DETACH PARTITION %s FINALIZE" parent pending))
1690+
(str pending))))
1691+
1692+
(defn find-stranded-partitions
1693+
"Identify tables that match the child format of a partitioned table (like reports_historical)
1694+
that are not present in the pg_inherits table. These partitions have been detached, but failed
1695+
to be deleted.
1696+
1697+
Tables that are not partitioned will also not be in the pg_inherits table, so you MUST
1698+
write a child-format that does not match any non-partitioned tables.
1699+
1700+
Returns a list of strings. Each string is a stranded partition that should be removed."
1701+
[child-format]
1702+
(->> [(str "SELECT tablename"
1703+
" FROM pg_tables WHERE tablename ~ ?"
1704+
" AND tablename NOT IN (SELECT inhrelid::regclass::text FROM pg_catalog.pg_inherits)")
1705+
child-format]
1706+
jdbc/query-to-vec
1707+
(map (comp str :tablename))))
1708+
16761709
(defn prune-daily-partitions
16771710
"Either detaches or drops obsolete day-oriented partitions
16781711
older than the date. Deletes or detaches only the oldest such candidate if
@@ -1698,14 +1731,27 @@
16981731
candidates (->> (partitioning/get-partition-names table-prefix)
16991732
(filter expired?)
17001733
sort)
1701-
drop-one (fn [table]
1734+
detach (fn detach [parent child]
1735+
(jdbc/do-commands-outside-txn
1736+
(format "alter table %s detach partition %s concurrently" parent child)))
1737+
drop-one (fn drop-one [table]
17021738
(update-lock-status status-key inc)
17031739
(try!
17041740
(if just-detach?
1705-
(jdbc/do-commands-outside-txn
1706-
(format "alter table %s detach partition %s concurrently" table-prefix table))
1741+
(let [ex (try
1742+
(detach table-prefix table)
1743+
(catch SQLException ex
1744+
(if (= (jdbc/sql-state :not-in-prerequisite-state) (.getSQLState ex))
1745+
ex
1746+
(throw ex))))]
1747+
(when (instance? SQLException ex)
1748+
(let [finalized-table (finalize-pending-detach table-prefix)]
1749+
(when-not (= finalized-table table)
1750+
;; Retry, unless the finalized partition detach was
1751+
;; for the same table
1752+
(detach table-prefix table)))))
17071753
(jdbc/do-commands
1708-
(format "drop table if exists %s cascade" table)))
1754+
(format "drop table if exists %s cascade" table)))
17091755
(finally
17101756
(update-lock-status status-key dec))))
17111757
drop #(if incremental?
@@ -1745,7 +1791,7 @@
17451791
"Drops the given set of tables. Will throw an SQLException termination if the
17461792
operation takes much longer than PDB_GC_DAILY_PARTITION_DROP_LOCK_TIMEOUT_MS."
17471793
[old-partition-tables update-lock-status status-key]
1748-
(let [drop #(doseq [table old-partition-tables]
1794+
(let [drop #(doseq [table (distinct old-partition-tables)]
17491795
(try
17501796
(update-lock-status status-key inc)
17511797
(jdbc/do-commands
@@ -1769,9 +1815,10 @@
17691815
;; PG14+
17701816
(let [detached-tables
17711817
(detach-daily-partitions "resource_events" date incremental?
1772-
update-lock-status :write-locking-resource-events)]
1818+
update-lock-status :write-locking-resource-events)
1819+
stranded-tables (find-stranded-partitions "^resource_events_\\d\\d\\d\\d\\d\\d\\d\\dz$")]
17731820
(jdbc/with-db-transaction []
1774-
(drop-partition-tables! detached-tables
1821+
(drop-partition-tables! (concat detached-tables stranded-tables)
17751822
update-lock-status :write-locking-resource-events)))))
17761823

17771824
(defn cleanup-dropped-report-certnames
@@ -1824,13 +1871,15 @@
18241871
;; PG14+
18251872
;; Detach partition concurrently must take place outside of a transaction.
18261873
(let [detached-resource-event-tables
1827-
(detach-daily-partitions "resource_events" effective-resource-events-ttl
1828-
incremental? update-lock-status
1829-
:write-locking-resource-events)
1874+
(detach-daily-partitions "resource_events" effective-resource-events-ttl
1875+
incremental? update-lock-status
1876+
:write-locking-resource-events)
1877+
stranded-events-tables (find-stranded-partitions "^resource_events_\\d\\d\\d\\d\\d\\d\\d\\dz$")
18301878
detached-report-tables
1831-
(detach-daily-partitions "reports" report-ttl
1832-
incremental? update-lock-status
1833-
:write-locking-reports)]
1879+
(detach-daily-partitions "reports" report-ttl
1880+
incremental? update-lock-status
1881+
:write-locking-reports)
1882+
stranded-reports-tables (find-stranded-partitions "^reports_\\d\\d\\d\\d\\d\\d\\d\\dz$")]
18341883
;; Now we can delete the partitions with less intrusive locking.
18351884
(jdbc/with-db-transaction []
18361885
;; Nothing should acquire locks on the detached tables, but to be safe, acquire
@@ -1840,9 +1889,9 @@
18401889
;; force a resource-events GC. prior to partitioning, this would have happened
18411890
;; via a cascade when the report was deleted, but now we just drop whole tables
18421891
;; of resource events.
1843-
(drop-partition-tables! detached-resource-event-tables
1892+
(drop-partition-tables! (concat detached-resource-event-tables stranded-events-tables)
18441893
update-lock-status :write-locking-resource-events)
1845-
(drop-partition-tables! detached-report-tables
1894+
(drop-partition-tables! (concat detached-report-tables stranded-reports-tables)
18461895
update-lock-status :write-locking-reports)
18471896
;; since we cannot cascade back to the certnames table anymore, go clean up
18481897
;; the latest_report_id column after a GC.

test/puppetlabs/puppetdb/cli/services_test.clj

Lines changed: 73 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
[puppetlabs.puppetdb.command.constants :as cmd-consts]
66
[puppetlabs.puppetdb.lint :refer [ignore-value]]
77
[puppetlabs.puppetdb.scf.partitioning
8-
:refer [get-temporal-partitions]]
8+
:refer [get-temporal-partitions]
9+
:as part]
910
[puppetlabs.trapperkeeper.testutils.logging
1011
:refer [with-log-output
1112
logs-matching
@@ -583,7 +584,7 @@
583584
(with-test-db
584585
(let [config (-> (create-temp-config)
585586
(assoc :database *db* :read-database *read-db*)
586-
(assoc-in [:database :gc-interval] "0.01"))
587+
(assoc-in [:database :gc-interval] "60"))
587588
store-report #(sync-command-post (svc-utils/pdb-cmd-url)
588589
example-certname
589590
"store report"
@@ -607,4 +608,73 @@
607608
:report-ttl (time/parse-period "1d")
608609
:resource-events-ttl (time/parse-period "1d")
609610
:db-lock-status db-lock-status})
610-
(is (empty? (jdbc/query ["SELECT * FROM reports"])))))))))
611+
(is (empty? (jdbc/query ["SELECT * FROM reports"]))))
612+
613+
;; These tests are not applicable unless our Postgres version is new enough
614+
;; to support the concurrent partition detach feature.
615+
(when (scf-store/detach-partitions-concurrently?)
616+
(testing "a partition stuck in the pending state is finalized and removed"
617+
(let [old-ts (-> 2 time/days time/ago)
618+
partition-table (format "reports_%s"
619+
(part/date-suffix (part/to-zoned-date-time (time/to-timestamp old-ts))))
620+
lock-acquired (promise)
621+
partition-pending-detach (promise)]
622+
(store-report (time/to-string old-ts))
623+
(store-report (to-string (now)))
624+
625+
(future
626+
;; Create a query that will block the ACCESS EXCLUSIVE lock needed
627+
;; by the second transaction of the concurrent detach below
628+
(jdbc/with-transacted-connection *read-db*
629+
(jdbc/with-db-transaction []
630+
(jdbc/query [(format "select * from %s" partition-table)])
631+
(deliver lock-acquired partition-table)
632+
633+
;; wait for partition detach to fail
634+
@partition-pending-detach)))
635+
636+
;; Wait until we are sure that the detach partition operation will be blocked
637+
@lock-acquired
638+
639+
(try
640+
(jdbc/do-commands-outside-txn
641+
"SET statement_timeout = 100"
642+
(format "ALTER TABLE reports DETACH PARTITION %s CONCURRENTLY" partition-table))
643+
(catch java.sql.SQLException _)
644+
(finally
645+
(deliver partition-pending-detach partition-table)
646+
(jdbc/do-commands-outside-txn "SET statement_timeout = 0")))
647+
648+
(is (= [{:inhdetachpending true}]
649+
(jdbc/query ["select inhdetachpending from pg_catalog.pg_inherits where inhparent = 'reports'::regclass and inhrelid = ?::regclass" partition-table])))
650+
651+
(svcs/sweep-reports! *db* {:incremental? false
652+
:report-ttl (time/parse-period "1d")
653+
:resource-events-ttl (time/parse-period "1d")
654+
:db-lock-status db-lock-status})
655+
656+
(is (empty?
657+
(jdbc/query ["SELECT tablename FROM pg_tables WHERE tablename = ?" partition-table])))
658+
659+
(jdbc/do-commands "DELETE FROM reports")))
660+
661+
(testing "a detached partition that was not removed is cleaned up by gc"
662+
(let [old-ts (-> 2 time/days time/ago)
663+
partition-table (format "reports_%s"
664+
(part/date-suffix (part/to-zoned-date-time (time/to-timestamp old-ts))))]
665+
(store-report (time/to-string old-ts))
666+
(store-report (to-string (now)))
667+
668+
;; Strand the partition before calling GC
669+
(jdbc/do-commands-outside-txn
670+
(format "ALTER TABLE reports DETACH PARTITION %s CONCURRENTLY" partition-table))
671+
672+
(svcs/sweep-reports! *db* {:incremental? false
673+
:report-ttl (time/parse-period "1d")
674+
:resource-events-ttl (time/parse-period "1d")
675+
:db-lock-status db-lock-status})
676+
677+
(is (empty?
678+
(jdbc/query ["SELECT tablename FROM pg_tables WHERE tablename = ?" partition-table])))
679+
680+
(jdbc/do-commands "DELETE FROM reports")))))))))

0 commit comments

Comments
 (0)