Skip to content

Commit 436f4be

Browse files
Merge pull request #3524 from oanatmaria/PDB-5130
(PDB-5130) Rewrite pdb-dataset to handle partition
2 parents d864e0d + 7e8480d commit 436f4be

File tree

4 files changed

+273
-143
lines changed

4 files changed

+273
-143
lines changed

ext/bin/pdb-dataset

Lines changed: 0 additions & 142 deletions
This file was deleted.

project.clj

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -366,4 +366,8 @@
366366
"time-shift-export" ^{:doc (clojure.string/join "" ["Shifts all timestamps from a PuppetDB archive with"
367367
" the period between the most recent one in the archive and the one "
368368
"you provide, or the current date."])}
369-
["trampoline" "run" "-m" "puppetlabs.puppetdb.cli.time-shift-export"]})
369+
["trampoline" "run" "-m" "puppetlabs.puppetdb.cli.time-shift-export"]
370+
"pdb-dataset" ^{:doc (clojure.string/join "" ["Restores an empty database from a pg_dump resulted backup"
371+
" file and shifts all timestamps with the period between the most recent one in"
372+
" the databse and the one you provide, or the current date."])}
373+
["trampoline" "run" "-m" "puppetlabs.puppetdb.cli.pdb-dataset"]})
Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
(ns puppetlabs.puppetdb.cli.pdb-dataset
2+
"Pg_restore and timeshift entries utility
3+
This command-line tool restores an empty database from a backup file (pg_dump generated file), then updates all the
4+
timestamps inside the database.
5+
It does this by calculating the period between the newest timestamp inside the file and the provided date.
6+
Then, every timestamp is shifted with that period.
7+
It accepts two parameters:
8+
- [Mandatory] -d / --dumpfile
9+
Path to the dumpfile that will be used to restore the database.
10+
- [Optional] -t / --timeshift-to
11+
Timestamp to which all timestamps from the dumpfile will be shifted after the restore.
12+
If it's not provided, the system's current timestamp will be used.
13+
!!! All timestamps are converted to a Zero timezone format. e.g timestamps like: 2015-03-26T10:58:51+10:00
14+
will become 2015-03-26T00:58:51Z !!!
15+
!!! If the time difference between the latest entry in the dumpfile and the time provided to timeshift-to is less
16+
than 24 hours this tool will fail !!!"
17+
18+
(:require
19+
[clojure.java.shell :as shell]
20+
[puppetlabs.puppetdb.cli.util :refer [exit run-cli-cmd]]
21+
[puppetlabs.kitchensink.core :as kitchensink]
22+
[puppetlabs.puppetdb.utils :as utils :refer [println-err]]
23+
[puppetlabs.puppetdb.jdbc :as jdbc]
24+
[puppetlabs.puppetdb.scf.partitioning :as partitioning]
25+
[puppetlabs.puppetdb.time :refer [now to-timestamp]])
26+
(:import (java.lang Math)))
27+
28+
;; Argument parsing
29+
30+
(defn parse-timeshift-to
  "Converts `time-string` with `to-timestamp` and returns the result.
  When the conversion yields a falsey value, a CLI error is reported through
  `utils/throw-sink-cli-error`."
  [time-string]
  (let [shift-target (to-timestamp time-string)]
    (when-not shift-target
      (utils/throw-sink-cli-error "Error: time shift date must be in UTC format!"))
    shift-target))
37+
38+
(defn validate-options
  "Builds the option map used by the rest of the tool.
  `:timeshift-to` is the parsed user-supplied date when one was given,
  otherwise the value of `(now)`; `:dumpfile` is passed through unchanged."
  [options]
  (let [target-time (if-let [requested (:timeshift-to options)]
                      (parse-timeshift-to requested)
                      (now))]
    {:timeshift-to target-time
     :dumpfile (:dumpfile options)}))
45+
46+
(defn validate-cli!
  "Parses the raw command-line `args` with `kitchensink/cli!` (requiring
  `:dumpfile`), then hands the first element of the result to
  `validate-options`. The whole step runs inside `utils/try-process-cli`."
  [args]
  (let [option-specs [["-t" "--timeshift-to DATE" "Date in UTC format"]
                      ["-d" "--dumpfile DUMPFILE" "Dumpfile"]]
        mandatory [:dumpfile]
        parse-and-validate (fn []
                             (validate-options
                              (first (kitchensink/cli! args option-specs mandatory))))]
    (utils/try-process-cli parse-and-validate)))
57+
58+
(defn collect-pdbbox-config
  "Adds the `:database` section of `$PDBBOX/conf.d/pdb.ini` to `args` under
  `:config`. Reports a CLI error when the PDBBOX environment variable is
  missing or blank."
  [args]
  (let [box-dir (System/getenv "PDBBOX")]
    ;; `empty?` is true for both nil and "", covering the unset and blank cases.
    (when (empty? box-dir)
      (utils/throw-sink-cli-error "Error: PDBBOX env variable not set!"))
    (->> (str box-dir "/conf.d/pdb.ini")
         kitchensink/ini-to-map
         :database
         (assoc args :config))))
66+
67+
;; Time manipulation
68+
69+
;; Conversion factors used by the time helpers below.
;; 24 * 60 * 60 * 1000
(def miliseconds-in-day 86400000)
;; 24 * 60
(def minutes-in-day 1440)
;; 60 * 1000
(def miliseconds-in-minute 60000)
72+
73+
(defn make-minutes-time-diff
  "Returns the whole number of minutes (truncated by `quot`) between
  `substract-time` and `max-time`, i.e. max-time minus substract-time.
  Both arguments must respond to `.getTime` with epoch milliseconds."
  [max-time substract-time]
  (-> (.getTime max-time)
      (- (.getTime substract-time))
      (quot miliseconds-in-minute)))
78+
79+
(defn to-days
  "Converts `timestamp` (epoch milliseconds via `.getTime`) into a day count
  since the epoch, rounded to the NEAREST day with `Math/round`.
  NOTE(review): because of the rounding, instants in the second half of a
  day map to the following day - confirm this is what callers expect."
  [timestamp]
  (-> (.getTime timestamp)
      (/ miliseconds-in-day)
      float
      (Math/round)))
82+
83+
(defn from-days-to-timestamp
  "Turns a day count since the epoch (`instant`) back into a timestamp by
  multiplying it into milliseconds and passing the result to `to-timestamp`."
  [instant]
  (-> instant
      (* miliseconds-in-day)
      to-timestamp))
86+
87+
(defn days-from-inst-vec
  "Extracts `column-name` from every row of `timestamp-vec`, converts each
  value with `to-days` and returns the distinct day numbers."
  [timestamp-vec column-name]
  (->> timestamp-vec
       (mapv (fn [row] (to-days (column-name row))))
       distinct))
90+
91+
;; Table updates
92+
93+
(defn create-copy-table
  "Creates an empty table named `<table>_copy` with the same structure as
  `table` (LIKE ... INCLUDING ALL)."
  [table]
  (let [ddl (str "CREATE TABLE " table "_copy (LIKE " table " INCLUDING ALL)")]
    (jdbc/do-commands ddl)))
96+
97+
(defn copy-table
  "Copies every row of `table` into `<table>_copy`."
  [table]
  (let [insert-sql (str "INSERT INTO " table "_copy\nSELECT * FROM " table)]
    (jdbc/do-commands insert-sql)))
101+
102+
(defn- shifted-partition-days
  "Returns the distinct UTC day numbers (whole days since the epoch) on which
  the `column` values of `rows` fall once they are moved by `diff-minutes`."
  [rows column diff-minutes]
  (let [shift-millis (* diff-minutes miliseconds-in-minute)]
    (distinct
     (map (fn [row]
            ;; floorDiv gives the calendar day containing the shifted instant;
            ;; rounding could select the neighbouring day instead.
            (Math/floorDiv (long (+ (.getTime (column row)) shift-millis))
                           (long miliseconds-in-day)))
          rows))))

(defn create-partitions
  "Creates the partitions that the shifted reports and resource-events rows
  will be written to.

  `time-diff-reports` and `time-diff-resource-events` are the shifts, in
  minutes, that will be applied to reports.producer_timestamp and
  resource_events.timestamp respectively. For every existing row the UTC day
  of the shifted timestamp is computed, and one partition is requested per
  distinct day so that a partition is not requested once per row.

  The two tables are handled independently: an empty resource_events table
  does not prevent the reports partitions from being created, and vice versa."
  [time-diff-reports time-diff-resource-events]
  (let [report-rows (jdbc/query-to-vec "SELECT producer_timestamp FROM reports")
        event-rows (jdbc/query-to-vec "SELECT timestamp FROM resource_events")]
    (doseq [day (shifted-partition-days report-rows :producer_timestamp time-diff-reports)]
      (partitioning/create-reports-partition (from-days-to-timestamp day)))
    (doseq [day (shifted-partition-days event-rows :timestamp time-diff-resource-events)]
      (partitioning/create-resource-events-partition (from-days-to-timestamp day)))))
120+
121+
(defn database-empty?
  "True when information_schema.tables has no table named
  'schema_migrations', which this tool treats as an empty database."
  []
  (-> "SELECT 1 FROM information_schema.tables WHERE table_name = 'schema_migrations'"
      jdbc/query
      empty?))
125+
126+
(defn restore-database
  "Runs pg_restore on the file at `(:dumpfile args)` against the puppetdb
  database and returns `args` unchanged.

  A non-zero pg_restore exit status is reported on stderr together with the
  tool's own error output, but is not fatal by itself. The restore is
  considered failed, and a CLI error is raised, only when the database is
  still empty afterwards."
  [args]
  (println-err "Restoring database from backup")
  (let [{:keys [exit err]} (shell/sh "pg_restore" "--role=postgres" "-U" "puppetdb"
                                     "--no-owner" "--no-acl" "-d" "puppetdb"
                                     (:dumpfile args))]
    ;; Surface the diagnostics instead of silently discarding them.
    (when-not (zero? exit)
      (println-err (str "pg_restore exited with status " exit ": " err))))
  (when (database-empty?)
    (utils/throw-sink-cli-error "Error: Restore failed!"))
  args)
134+
135+
(defn ensure-database-empty
  "Raises a CLI error unless `database-empty?` reports an empty database.
  The single argument is ignored."
  [_]
  (when-not (database-empty?)
    (utils/throw-sink-cli-error "Error: puppetdb database already exists and it isn't empty!")))
139+
140+
(defn update-simple-tables
  "Shifts the producer_timestamp and timestamp columns of `table` by
  `time-diff` minutes with a single UPDATE statement."
  [table time-diff]
  (let [update-sql (str "UPDATE " table
                        " SET producer_timestamp = producer_timestamp + (" time-diff " * INTERVAL\n'1 minute'),"
                        " timestamp = timestamp + (" time-diff " * INTERVAL '1 minute')")]
    (jdbc/do-commands update-sql)))
144+
145+
(defn add-reports-trigger
  "Installs a BEFORE INSERT, row-level trigger on the reports table.

  The trigger function reports_insert1_trigger() derives a table name of the
  form reports_<YYYYMMDD>Z from the new row's producer_timestamp (taken in
  UTC), inserts the row into that table and returns NULL. Ownership of the
  function is given to the puppetdb role."
  []
  (jdbc/do-prepared "create function reports_insert1_trigger() returns trigger
language plpgsql
as
$$
DECLARE
tablename varchar;
BEGIN
SELECT FORMAT('reports_%sZ',
TO_CHAR(NEW.\"producer_timestamp\" AT TIME ZONE 'UTC', 'YYYYMMDD')) INTO tablename;
EXECUTE 'INSERT INTO ' || tablename || ' SELECT ($1).*'
USING NEW;
RETURN NULL;
END;
$$;
alter function reports_insert1_trigger() owner to puppetdb;
CREATE TRIGGER reports_insert1_trigger
BEFORE INSERT ON reports
FOR EACH ROW EXECUTE PROCEDURE reports_insert1_trigger()"))
165+
166+
(defn update-reports
  "Shifts every report by `time-diff` minutes.

  The rows are copied to reports_copy, their producer_timestamp, start_time,
  end_time and receive_time columns are shifted there, reports is emptied,
  and the shifted rows are inserted back while the trigger installed by
  `add-reports-trigger` is in place. The trigger function (with CASCADE) and
  the copy table are dropped afterwards."
  [time-diff]
  (let [shift (str "(" time-diff " * INTERVAL '1 minute')")
        shifted-columns ["producer_timestamp" "start_time" "end_time" "receive_time"]
        assignments (map #(str % " = " % " + " shift) shifted-columns)
        shift-sql (str "UPDATE reports_copy\nSET "
                       (apply str (interpose ",\n" assignments)))]
    (create-copy-table "reports")
    (copy-table "reports")
    (jdbc/do-commands shift-sql)
    (jdbc/do-commands "DELETE FROM reports")
    (add-reports-trigger)
    (jdbc/do-commands "INSERT INTO reports SELECT * FROM reports_copy")
    (jdbc/do-commands "DROP FUNCTION reports_insert1_trigger() CASCADE")
    (jdbc/do-commands "DROP TABLE IF EXISTS reports_copy")))
180+
181+
(defn update-resource-events
  "Shifts every resource event by `time-diff` minutes: the rows are copied
  to resource_events_copy, shifted there, removed from resource_events,
  inserted back, and the copy table is dropped."
  [time-diff]
  (let [source "resource_events"
        statements [(str "UPDATE resource_events_copy SET timestamp = timestamp + (" time-diff " * INTERVAL '1 minute')")
                    "DELETE FROM resource_events"
                    "INSERT INTO resource_events SELECT * FROM resource_events_copy"
                    "DROP TABLE IF EXISTS resource_events_copy"]]
    (create-copy-table source)
    (copy-table source)
    ;; Run the statements in order; the value of the last one is returned.
    (reduce (fn [_ statement] (jdbc/do-commands statement)) nil statements)))
189+
190+
(defn update-tables
  "Shifts the timestamps of catalogs, factsets, reports and resource_events
  so that the newest entry lands on `(:timeshift-to args)`.

  The shift for catalogs, factsets and reports is the number of minutes
  between the newest reports.producer_timestamp and the target time; the
  shift for resource_events is computed the same way from its newest
  timestamp.

  max() is NULL for an empty table, so:
  - an empty reports table is reported as a CLI error, because no shift can
    be derived from it;
  - an empty resource_events table uses a shift of 0, which is harmless as
    there are no rows to move."
  [args]
  (let [time-to-shift-to (to-timestamp (:timeshift-to args))
        max-time (:max (first (jdbc/query "SELECT max(producer_timestamp) FROM reports")))
        max-time-re (:max (first (jdbc/query "SELECT max(timestamp) FROM resource_events")))]
    (when-not max-time
      (utils/throw-sink-cli-error "Error: the restored database contains no reports to compute the time shift from!"))
    (let [time-diff (make-minutes-time-diff time-to-shift-to max-time)
          time-diff-re (if max-time-re
                         (make-minutes-time-diff time-to-shift-to max-time-re)
                         0)]
      (println-err "Updating data timestamps")
      (update-simple-tables "catalogs" time-diff)
      (update-simple-tables "factsets" time-diff)
      ;; Partitions must exist before the shifted rows are re-inserted.
      (create-partitions time-diff time-diff-re)
      (update-reports time-diff)
      (update-resource-events time-diff-re))))
203+
204+
(defn vacuum-db
  "Runs `vacuumdb -f` on the puppetdb database as the postgres user and
  returns the result of `shell/sh`. The single argument is ignored."
  [_]
  (println-err "Running vacuum full on puppetdb database")
  (let [command ["vacuumdb" "-f" "puppetdb" "-U" "postgres"]]
    (apply shell/sh command)))
208+
209+
(defn connect-to-db
  "Opens a connection pool from `(:config args)` (with the tool's fixed user,
  subprotocol, pool name, timeout and batching settings), binds it to
  `jdbc/*db*`, and calls every function in `methods-array` with `args`, in
  order. Returns a vector of their results.

  The pool is closed when the steps finish or throw, so no connections are
  left open behind the tool."
  [args methods-array]
  (let [config (assoc (:config args)
                      :user "puppetdb"
                      :subprotocol "postgresql"
                      :pool-name "PDBDataSetPool"
                      :connection-timeout 3000
                      :rewrite-batched-inserts "true")]
    (with-open [^java.lang.AutoCloseable pool (jdbc/make-connection-pool config)]
      (binding [jdbc/*db* {:datasource pool}]
        ;; mapv is eager, so every step runs while the binding and pool are live.
        (mapv #(% args) methods-array)))))
219+
220+
(defn -main
  "Command-line entry point. Validates the arguments, loads the PDBBOX
  database configuration, then - on a single database connection - checks
  that the database is empty, restores the dump, shifts the timestamps and
  vacuums. Exits with the status produced by `run-cli-cmd` (0 on success)."
  [& args]
  (let [steps [ensure-database-empty
               restore-database
               update-tables
               vacuum-db]
        run (fn []
              (-> args
                  validate-cli!
                  collect-pdbbox-config
                  (connect-to-db steps))
              0)]
    (exit (run-cli-cmd run))))

0 commit comments

Comments
 (0)