From 1ff22186275d4fe7d6b7af65911597e0d88f4e58 Mon Sep 17 00:00:00 2001 From: Zeph Grunschlag Date: Sat, 19 Aug 2023 00:13:08 -0500 Subject: [PATCH 01/15] pg_stat queries --- examples/Makefile | 10 +++ performance/Makefile | 107 +++++++++++++++++++++++++++ performance/conduit_data/conduit.yml | 28 +++++++ performance/docker-compose.yml | 20 +++++ 4 files changed, 165 insertions(+) create mode 100644 examples/Makefile create mode 100644 performance/Makefile create mode 100644 performance/conduit_data/conduit.yml create mode 100644 performance/docker-compose.yml diff --git a/examples/Makefile b/examples/Makefile new file mode 100644 index 0000000..30ac040 --- /dev/null +++ b/examples/Makefile @@ -0,0 +1,10 @@ +CDATA = pypolars_data + +conduit: clean build + ../cmd/conduit/conduit -d $(CDATA) + +build: + cd .. && make + +clean: + rm -f $(CDATA)/metadata.json \ No newline at end of file diff --git a/performance/Makefile b/performance/Makefile new file mode 100644 index 0000000..9d2220d --- /dev/null +++ b/performance/Makefile @@ -0,0 +1,107 @@ +random := $(shell echo $$RANDOM) +PGCONT = performance_pg +PGUSER = algorand +PGDB = performance_db # postgres +PGCONN = postgresql://algorand:algorand@localhost:65432/performance_db +PGLOGS = pg.log +CONDUIT = ../cmd/conduit/conduit + +define QUERY_TOTAL_TIME +-c "SELECT substring(trim(regexp_replace(regexp_replace(query, '--.*?$$', '', 'gn'), '\\s+', ' ', 'g')), 1, 50) as query, \ +round(total_exec_time::numeric, 2) AS total_time, \ +calls, \ +round(mean_exec_time::numeric, 2) AS mean, \ +round((100 * total_exec_time / sum(total_exec_time::numeric) OVER ())::numeric, 2) AS percentage_cpu \ +FROM pg_stat_statements \ +ORDER BY total_exec_time DESC \ +LIMIT 10;" +endef + +define QUERY_SLOWEST +-c "SELECT userid::regrole, dbid, \ + substring(trim(regexp_replace(regexp_replace(query, '--.*?$$', '', 'gn'), '\\s+', ' ', 'g')), 1, 50) as query, \ + calls, total_exec_time/1000 as total_time_seconds, min_exec_time/1000 as min_time_seconds, \ + max_exec_time/1000 as max_time_seconds, mean_exec_time/1000 as mean_time_seconds \ +FROM pg_stat_statements \ +ORDER BY mean_exec_time DESC \ +LIMIT 10;" +endef + +define QUERY_MEMHOG +-c "SELECT userid::regrole, dbid, \ + substring(trim(regexp_replace(regexp_replace(query, '--.*?$$', '', 'gn'), '\\s+', ' ', 'g')), 1, 50) as query, \ + (shared_blks_hit+shared_blks_dirtied) as memory_usage \ +FROM pg_stat_statements \ +ORDER BY (shared_blks_hit+shared_blks_dirtied) DESC \ +LIMIT 10;" +endef + + +perf-run: clean pg-up run-conduit + +clean: # pg-down should be called manually + rm conduit_data/metadata.json || true + +pg-up: save-logs + docker-compose up -d + sleep 5 + make pg-logs-tail + docker exec -it $(PGCONT) psql -U algorand -d postgres -c "create database $(PGDB);" + make pg-query QUERY="-c \"CREATE EXTENSION pg_stat_statements;\"" + +pg-logs-tail: + docker-compose logs -f > $(PGLOGS) & + +save-logs: + mv $(PGLOGS) "$(PGLOGS)_$(random)" || true + mv conduit.log "conduit_$(random).log" || true + +QUERY := -c "SELECT * FROM pg_stat_statements LIMIT 0;" +pg-query: + psql $(PGCONN) $(QUERY) + +pg-conn: QUERY= +pg-conn: pg-query + +pg-stats: + make pg-stats-1 + make pg-stats-2 + make pg-stats-3 + +pg-stats-1: QUERY=$(QUERY_TOTAL_TIME) +pg-stats-1: pg-query + +pg-stats-2: QUERY=$(QUERY_SLOWEST) +pg-stats-2: pg-query + +pg-stats-3: QUERY=$(QUERY_MEMHOG) +pg-stats-3: pg-query + + +pg-enter: + docker exec -it $(PGCONT) psql -U algorand -d postgres + +pg-down: + docker-compose down + +build: + cd .. && make + +run-conduit: build + $(CONDUIT) -d conduit_data + +run-file-exporter: + make run-runner TEMPLATE="--template file-exporter" TIMES=1 RESETDB= + +clean-reports: + rm -rf $(REPORTS) + +pre-git-push: + mv _go.mod go.mod + mv _go.sum go.sum + cd ../../ && make tidy + +post-git-push: + mv go.mod _go.mod + mv go.sum _go.sum + cd ../../ && make tidy && go get github.com/lib/pq diff --git a/performance/conduit_data/conduit.yml b/performance/conduit_data/conduit.yml new file mode 100644 index 0000000..8bb9daf --- /dev/null +++ b/performance/conduit_data/conduit.yml @@ -0,0 +1,28 @@ +log-level: info +log-file: conduit.log +retry-count: 1 +retry-delay: "1s" +#pid-filepath: /path/to/pidfile +hide-banner: true +metrics: + mode: ON + addr: ":9999" + prefix: "conduit" +importer: + name: file_reader + config: + # assumes we run conduit inside of the `performance/` directory: + block-dir: "filereader_blocks" + filename-pattern: "%[1]d_block.msgp.gz" +processors: +exporter: + name: postgresql + config: + connection-string: "host=localhost user=algorand password=algorand dbname=performance_db port=65432 sslmode=disable" + max-conn: 20 +telemetry: + enabled: false + # uri: "" + # index: "" + # username: "" + # password: "" diff --git a/performance/docker-compose.yml b/performance/docker-compose.yml new file mode 100644 index 0000000..620c852 --- /dev/null +++ b/performance/docker-compose.yml @@ -0,0 +1,20 @@ +version: '3' + +services: + postgres: + image: postgres + container_name: performance_pg + ports: + - "65432:5432" + environment: + POSTGRES_PASSWORD: algorand + POSTGRES_USER: algorand + POSTGRES_DB: postgres + POSTGRES_HOST_AUTH_METHOD: trust + command: + -c shared_preload_libraries='pg_stat_statements' + # pg logs for the stress test with 10 blocks was 3 GB which + # interfered with performance measurements + # -c log_statement='all' + # -c log_duration=on + # -c log_min_duration_statement=0 From 3de2e9c014f57f7c321662223f4bfc3a272e5244 Mon Sep 17 00:00:00 2001 From: Zeph Grunschlag Date: Mon, 21 Aug 2023 10:49:24 -0500 Subject: [PATCH 02/15] wip --- .gitignore | 3 + performance/.gitignore | 3 + performance/Makefile | 116 +- performance/filereader_blocks/.gitignore | 1 + performance/pg_stats.ipynb | 1571 ++++++++++++++++++++++ performance/requirements.txt | 3 + 6 files changed, 1638 insertions(+), 59 deletions(-) create mode 100644 performance/.gitignore create mode 100644 performance/filereader_blocks/.gitignore create mode 100644 performance/pg_stats.ipynb create mode 100644 performance/requirements.txt diff --git a/.gitignore b/.gitignore index e138c70..5ed84a9 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,6 @@ coverage.txt # conduit example cmd/conduit/data + +# python virtualenv +.venv diff --git a/performance/.gitignore b/performance/.gitignore new file mode 100644 index 0000000..becaa2a --- /dev/null +++ b/performance/.gitignore @@ -0,0 +1,3 @@ +conduit*.log +pg.log* +metadata.json \ No newline at end of file diff --git a/performance/Makefile b/performance/Makefile index 9d2220d..b467757 100644 --- a/performance/Makefile +++ b/performance/Makefile @@ -2,46 +2,25 @@ random := $(shell echo $$RANDOM) PGCONT = performance_pg PGUSER = algorand PGDB = performance_db # postgres -PGCONN = postgresql://algorand:algorand@localhost:65432/performance_db PGLOGS = pg.log CONDUIT = ../cmd/conduit/conduit -define QUERY_TOTAL_TIME --c "SELECT substring(trim(regexp_replace(regexp_replace(query, '--.*?$$', '', 'gn'), '\\s+', ' ', 'g')), 1, 50) as query, \ -round(total_exec_time::numeric, 2) AS total_time, \ -calls, \ -round(mean_exec_time::numeric, 2) AS mean, \ -round((100 * total_exec_time / sum(total_exec_time::numeric) OVER ())::numeric, 2) AS percentage_cpu \ -FROM pg_stat_statements \ -ORDER BY total_exec_time DESC \ -LIMIT 10;" -endef +nuke: pg-down clean -define QUERY_SLOWEST --c "SELECT userid::regrole, dbid, \ - substring(trim(regexp_replace(regexp_replace(query, '--.*?$$', '', 'gn'), '\\s+', ' ', 'g')), 1, 50) as query, \ - calls, total_exec_time/1000 as total_time_seconds, min_exec_time/1000 as min_time_seconds, \ - max_exec_time/1000 as max_time_seconds, mean_exec_time/1000 as mean_time_seconds \ -FROM pg_stat_statements \ -ORDER BY mean_exec_time DESC \ -LIMIT 10;" -endef +nuke-and-run: nuke perf-run -define QUERY_MEMHOG --c "SELECT userid::regrole, dbid, \ - substring(trim(regexp_replace(regexp_replace(query, '--.*?$$', '', 'gn'), '\\s+', ' ', 'g')), 1, 50) as query, \ - (shared_blks_hit+shared_blks_dirtied) as memory_usage \ -FROM pg_stat_statements \ -ORDER BY (shared_blks_hit+shared_blks_dirtied) DESC \ -LIMIT 10;" -endef +nuke-and-debug: nuke pg-up + mv ../cmd/conduit/conduit.log "../cmd/conduit/conduit_$(random).log" || true - -perf-run: clean pg-up run-conduit +perf-run: pg-up run-conduit clean: # pg-down should be called manually rm conduit_data/metadata.json || true +clean-go-cache: + cd .. && go clean -cache -testcache -modcache + + pg-up: save-logs docker-compose up -d sleep 5 @@ -49,6 +28,9 @@ pg-up: save-logs docker exec -it $(PGCONT) psql -U algorand -d postgres -c "create database $(PGDB);" make pg-query QUERY="-c \"CREATE EXTENSION pg_stat_statements;\"" +pg-down: + docker-compose down + pg-logs-tail: docker-compose logs -f > $(PGLOGS) & @@ -56,10 +38,55 @@ save-logs: mv $(PGLOGS) "$(PGLOGS)_$(random)" || true mv conduit.log "conduit_$(random).log" || true +build: + cd .. && make + +pg-enter: + docker exec -it $(PGCONT) psql -U algorand -d postgres + +run-conduit: build + $(CONDUIT) -d conduit_data + +# - for query hackery... prefer pg_stats.ipynb +PGCONN = postgresql://algorand:algorand@localhost:65432/performance_db + +QUERY_COL = substring(trim(regexp_replace(regexp_replace(query, '--.*?$$', '', 'gn'), '\\s+', ' ', 'g')), 1, 100) AS query +TOTAL_SECS_COL = round((total_exec_time/1000)::numeric, 3) AS tot_s +MEAN_SECS_COL = round((mean_exec_time/1000)::numeric, 3) AS mean_s +MIN_SECS_COL = round((min_exec_time/1000)::numeric, 3) AS min_s +MAX_SECS_COL = round((max_exec_time/1000)::numeric, 3) AS max_s +CPU_COL = round((100 * total_exec_time / sum(total_exec_time::numeric) OVER ())::numeric, 2) AS \"cpu%\" +LIMIT = 15 + +define QUERY_TOTAL_TIME +-c "SELECT dbid, $(QUERY_COL), $(TOTAL_SECS_COL), calls, $(MEAN_SECS_COL), $(CPU_COL) \ +FROM pg_stat_statements \ +ORDER BY total_exec_time DESC \ +LIMIT $(LIMIT);" +endef + +define QUERY_SLOWEST +-c "SELECT dbid, $(QUERY_COL), calls, $(TOTAL_SECS_COL), $(MIN_SECS_COL), $(MAX_SECS_COL), $(MEAN_SECS_COL) \ +FROM pg_stat_statements \ +ORDER BY mean_exec_time DESC \ +LIMIT $(LIMIT);" +endef + +define QUERY_MEMHOG +-c "SELECT dbid, $(QUERY_COL), (shared_blks_hit+shared_blks_dirtied) as mem \ +FROM pg_stat_statements \ +ORDER BY (shared_blks_hit+shared_blks_dirtied) DESC \ +LIMIT $(LIMIT);" +endef + QUERY := -c "SELECT * FROM pg_stat_statements LIMIT 0;" pg-query: psql $(PGCONN) $(QUERY) + +pg-txn-stats: QUERY=-c "SELECT max(round) AS max_round, count(*) AS txn_count from txn;" +pg-txn-stats: pg-query + pg-conn: QUERY= pg-conn: pg-query @@ -76,32 +103,3 @@ pg-stats-2: pg-query pg-stats-3: QUERY=$(QUERY_MEMHOG) pg-stats-3: pg-query - - -pg-enter: - docker exec -it $(PGCONT) psql -U algorand -d postgres - -pg-down: - docker-compose down - -build: - cd .. && make - -run-conduit: build - $(CONDUIT) -d conduit_data - -run-file-exporter: - make run-runner TEMPLATE="--template file-exporter" TIMES=1 RESETDB= - -clean-reports: - rm -rf $(REPORTS) - -pre-git-push: - mv _go.mod go.mod - mv _go.sum go.sum - cd ../../ && make tidy - -post-git-push: - mv go.mod _go.mod - mv go.sum _go.sum - cd ../../ && make tidy && go get github.com/lib/pq diff --git a/performance/filereader_blocks/.gitignore b/performance/filereader_blocks/.gitignore new file mode 100644 index 0000000..694556c --- /dev/null +++ b/performance/filereader_blocks/.gitignore @@ -0,0 +1 @@ +*.msgp.gz diff --git a/performance/pg_stats.ipynb b/performance/pg_stats.ipynb new file mode 100644 index 0000000..ccf2f31 --- /dev/null +++ b/performance/pg_stats.ipynb @@ -0,0 +1,1571 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Constants" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CONDUIT_LOG=PosixPath('/Users/zeph/github/algorand/conduit/performance/conduit.log')\n" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "CONDUIT_LOG = Path.cwd() / \"conduit.log\"\n", + "\n", + "PGCONN = \"postgresql://algorand:algorand@localhost:65432/performance_db\"\n", + "LIMIT = 15\n", + "\n", + "# Query columns\n", + "QUERY_COL = \"substring(trim(regexp_replace(regexp_replace(query, '--.*?$', '', 'gn'), '\\\\s+', ' ', 'g')), 1, 100) AS query\"\n", + "TOTAL_SECS_COL = \"round((total_exec_time/1000)::numeric, 3) AS tot_s\"\n", + "MEAN_SECS_COL = \"round((mean_exec_time/1000)::numeric, 3) AS mean_s\"\n", + "MIN_SECS_COL = \"round((min_exec_time/1000)::numeric, 3) AS min_s\"\n", + "MAX_SECS_COL = \"round((max_exec_time/1000)::numeric, 3) AS max_s\"\n", + "CPU_COL = \"round((100 * total_exec_time / sum(total_exec_time::numeric) OVER ())::numeric, 2) AS cpu_pct\"\n", + "\n", + "# Queries\n", + "QUERY_TOTAL_TIME = f\"\"\"SELECT dbid, {QUERY_COL}, {TOTAL_SECS_COL}, calls, {MEAN_SECS_COL}, {CPU_COL}\n", + "FROM pg_stat_statements\n", + "ORDER BY total_exec_time DESC\n", + "LIMIT {LIMIT}\"\"\"\n", + "\n", + "QUERY_SLOWEST = f\"\"\"SELECT dbid, {QUERY_COL}, calls, {TOTAL_SECS_COL}, {MIN_SECS_COL}, {MAX_SECS_COL}, {MEAN_SECS_COL}\n", + "FROM pg_stat_statements\n", + "ORDER BY mean_exec_time DESC\n", + "LIMIT {LIMIT}\"\"\"\n", + "\n", + "QUERY_MEMHOG = f\"\"\"SELECT dbid, {QUERY_COL}, (shared_blks_hit+shared_blks_dirtied) as mem\n", + "FROM pg_stat_statements\n", + "ORDER BY (shared_blks_hit+shared_blks_dirtied) DESC\n", + "LIMIT {LIMIT}\"\"\"\n", + "\n", + "print(f\"{CONDUIT_LOG=}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parse the log" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Start Time: 2023-08-20 22:59:43.436898-05:00\n", + "Finish Time: 2023-08-20 23:01:32.968865-05:00\n", + "Log Rounds: 10\n", + "Total Export Time: 0:01:49.531967\n", + "Mean Export Time: 10.9531967 seconds\n" + ] + } + ], + "source": [ + "from datetime import datetime\n", + "import json\n", + "import re\n", + "\n", + "with open(CONDUIT_LOG) as f:\n", + " log_content = f.read()\n", + "\n", + "lines = log_content.strip().split(\"\\n\")\n", + "\n", + "\n", + "# Regular expressions for extracting required data\n", + "start_time_pattern = re.compile(r'Block 1 read time')\n", + "finish_time_pattern = re.compile(r'round r=(\\d+) .* exported in')\n", + "time_pattern = re.compile(\n", + " r'(?P