From 236b7038530e41746605feeaee566b8f85a8434c Mon Sep 17 00:00:00 2001 From: "daniil.khasanov" Date: Wed, 23 Jul 2025 19:22:03 +0300 Subject: [PATCH 01/11] Changed string template --- CHANGELOG.md | 4 + doc/ru/usage.md | 12 +- go.mod | 3 + go.sum | 85 +++++++++ internal/generator/models/common.go | 45 +++++ internal/generator/models/common_test.go | 163 ++++++++++++++++++ .../usecase/general/generator/generator.go | 10 +- .../general/generator/value/datetime.go | 2 +- .../usecase/general/generator/value/enum.go | 2 +- .../usecase/general/generator/value/float.go | 2 +- .../general/generator/value/integer.go | 2 +- .../general/generator/value/interfaces.go | 2 +- .../usecase/general/generator/value/string.go | 33 +++- .../usecase/general/generator/value/uuid.go | 2 +- internal/generator/usecase/general/task.go | 29 +++- .../usecase/general/test/unit_test.go | 10 +- 16 files changed, 375 insertions(+), 31 deletions(-) create mode 100644 internal/generator/models/common_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 450f3eb..0d93338 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,3 +38,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Data partitioning - Ability to continue generation - Availability to ignore some models for generation + +### Changed + +- String templates replaced with Jinja-like templates diff --git a/doc/ru/usage.md b/doc/ru/usage.md index 0a44b75..d9452f3 100644 --- a/doc/ru/usage.md +++ b/doc/ru/usage.md @@ -160,8 +160,10 @@ open_ai: - `min_length`: Минимальная длина строки. По умолчанию `1`. - `max_length`: Максимальная длина строки. По умолчанию `32`. - `logical_type`: Логический тип строки. Поддерживаемые значения: `first_name`, `last_name`, `phone`, `text`. -- `template`: Шаблон для генерации строки. Символ `A` - любая большая буква, символ `a` - любая маленькая буква, - символ `0` - любая цифра, символ `#` - любой символ. Остальные символы остаются как есть.
+- `template`: Jinja-подобный шаблон для генерации строки. Позволяет использовать любые поля генерируемой модели и + задавать паттерн строки с помощью функции `pattern`, где символ `A` - любая большая буква, символ `a` - любая маленькая буква, + символ `0` - любая цифра, символ `#` - любой символ, а остальные символы остаются как есть. + Также поддерживается использование фильтров, таких как `upper` и `lower`. - `locale`: Локаль для генерации строк. Поддерживаемые значения: `ru`, `en`. По умолчанию `en`. - `without_large_letters`: Флаг, указывающий, исключать ли большие буквы из строки. - `without_small_letters`: Флаг, указывающий, исключать ли маленькие буквы из строки. @@ -307,9 +309,13 @@ models: - name: passport type: string type_params: - template: AA 00 000 000 + template: "{{ pattern('AA 00 000 000') }}" distinct_percentage: 1 ordered: true + - name: email + type: string + type_params: + template: "{{ first_name_en | lower }}.{{ id }}@example.com" - name: rating type: float type_params: diff --git a/go.mod b/go.mod index ae4ded1..df8996a 100644 --- a/go.mod +++ b/go.mod @@ -36,7 +36,9 @@ require ( github.com/charmbracelet/x/term v0.2.1 // indirect github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/emirpasic/gods v1.18.1 // indirect github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect + github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3 // indirect github.com/goccy/go-json v0.10.5 // indirect github.com/golang/snappy v0.0.4 // indirect github.com/google/flatbuffers v25.2.10+incompatible // indirect @@ -58,6 +60,7 @@ require ( github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect github.com/muesli/cancelreader v0.2.2 // indirect github.com/muesli/termenv v0.15.3-0.20240618155329-98d742f6907a // indirect + github.com/otaviokr/topological-sort v1.1.0 // indirect github.com/pierrec/lz4/v4 v4.1.22 // 
indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/rivo/uniseg v0.4.7 // indirect diff --git a/go.sum b/go.sum index a590230..70d4c68 100644 --- a/go.sum +++ b/go.sum @@ -37,26 +37,49 @@ github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46t github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc= +github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ= github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4= github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM= github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= +github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3 h1:fmFk0Wt3bBxxwZnu48jqMdaOR/IZ4vdtJFuaFV8MpIE= +github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3/go.mod h1:bJWSKrZyQvfTnb2OudyUjurSG4/edverV7n82+K3JiM= +github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= +github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/go-logr/logr v1.4.2 
h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE= +github.com/go-test/deep v1.1.0 h1:WOcxcdHcvdgThNXjw0t76K42FXTU7HpNQWHpA2HHNlg= +github.com/go-test/deep v1.1.0/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE= github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4= github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= +github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= +github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= +github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= +github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/flatbuffers v25.2.10+incompatible h1:F3vclr7C3HpB1k9mxCGRMXq6FdUalZ6H/pNX4FP1v0Q= 
github.com/google/flatbuffers v25.2.10+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= @@ -65,6 +88,9 @@ github.com/hashicorp/go-hclog v1.6.3 h1:Qr2kF+eVWjTiYmU7Y31tYlP1h0q/X3Nl3tPGdaB1 github.com/hashicorp/go-hclog v1.6.3/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M= github.com/hashicorp/go-retryablehttp v0.7.7 h1:C8hUCYzor8PIfXHa4UrZkU4VvK8o9ISHxT2Q8+VepXU= github.com/hashicorp/go-retryablehttp v0.7.7/go.mod h1:pkQpWZeYWskR+D1tR2O5OcBFOxfA7DoAO6xtkuQnHTk= +github.com/heimdalr/dag v1.5.0 h1:hqVtijvY776P5OKP3QbdVBRt3Xxq6BYopz3XgklsGvo= +github.com/heimdalr/dag v1.5.0/go.mod h1:lthekrHl01dddmzqyBQ1YZbi7XcVGGzjFo0jIky5knc= +github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/ilyakaznacheev/cleanenv v1.5.0 h1:0VNZXggJE2OYdXE87bfSSwGxeiGt9moSR2lOrsHHvr4= github.com/ilyakaznacheev/cleanenv v1.5.0/go.mod h1:a5aDzaJrLCQZsazHol1w8InnDcOX0OColm64SlIi6gk= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= @@ -113,11 +139,23 @@ github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELU github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo= 
github.com/muesli/termenv v0.15.3-0.20240618155329-98d742f6907a h1:2MaM6YC3mGu54x+RKAA6JiFFHlHDY1UbkxqppT7wYOg= github.com/muesli/termenv v0.15.3-0.20240618155329-98d742f6907a/go.mod h1:hxSnBBYLK21Vtq/PHd0S2FYCxBXzBua8ov5s1RobyRQ= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= +github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= +github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= +github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= +github.com/onsi/ginkgo v1.16.1/go.mod h1:CObGmKUOKaSC0RjmoAK7tKyn4Azo5P2IWuoMnvwxz1E= +github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= +github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= +github.com/onsi/gomega v1.11.0/go.mod h1:azGKhqFUon9Vuj0YmTfLSmx0FUwqXYSTl5re8lQLTUg= +github.com/otaviokr/topological-sort v1.1.0 h1:BrWj/bLOo9aZFUi0YN2/s4P/GRe2PSmb8cyX4w1ysNg= +github.com/otaviokr/topological-sort v1.1.0/go.mod h1:77ZaKUg7Ir1nL6DPwEIQFm9iH2OS5xxVWvzZ8xPTCFg= github.com/pierrec/lz4/v4 v4.1.22 h1:cKFw6uJDK+/gfw5BcDL0JL5aBsAFdsIT18eRtLj7VIU= github.com/pierrec/lz4/v4 v4.1.22/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod 
h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= @@ -134,8 +172,10 @@ github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= @@ -146,6 +186,7 @@ github.com/vbauerster/mpb/v8 v8.8.3 h1:dTOByGoqwaTJYPubhVz3lO5O6MK553XVgUo33LdnN github.com/vbauerster/mpb/v8 v8.8.3/go.mod h1:JfCCrtcMsJwP6ZwMn9e5LMnNyp3TVNpUWWkN+nd4EWk= github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU= github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= @@ -162,29 +203,60 @@ go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= 
go.opentelemetry.io/otel/trace v1.34.0 h1:+ouXS2V8Rd4hp4580a8q23bg0azF2nI8cqLYnC8mh/k= go.opentelemetry.io/otel/trace v1.34.0/go.mod h1:Svm7lSjQD7kG7KJ/MUHPVXSDGz2OX4h0M2jHBhmSfRE= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk= golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.25.0 h1:n7a+ZbQKQA/Ysbyb0/6IbB1H/X41mKgbhfv7AfG/44w= golang.org/x/mod v0.25.0/go.mod h1:IXM97Txy2VM4PJ3gI61r1YEk/gAj6zAHN3AdZt6S9Ww= +golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod 
h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181122145206-62eef0e2fa9b/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191120155948-bd437916bb0e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210112080510-489259a85091/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= golang.org/x/sys 
v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= golang.org/x/time v0.8.0 h1:9i3RxcPv3PZnitoVGMPDKZSq1xW1gK1Xy3ArNOGZfEg= golang.org/x/time v0.8.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.33.0 h1:4qz2S3zmRxbGIhDIAgjxvFutSvH5EfnsYrRBj0UI0bc= golang.org/x/tools v0.33.0/go.mod h1:CIJMaWEY88juyUfo7UbgPqbC8rU2OqfAV1h2Qp0oMYI= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da h1:noIWHXmPHxILtqtCOPIhSt0ABwskkZKjD3bXGnZGpNY= golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= gonum.org/v1/gonum v0.15.1 h1:FNy7N6OUZVUaWG9pTiD+jlhdQ3lMP+/LcTpJ6+a8sQ0= @@ -193,11 +265,24 @@ google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f h1: google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f/go.mod h1:+2Yz8+CLJbIfL9z73EW45avw8Lmge3xVElCP9zEKi50= 
google.golang.org/grpc v1.71.0 h1:kF77BGdPTQ4/JZWMlb9VpJ5pa25aqvVqogsxNHHdeBg= google.golang.org/grpc v1.71.0/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= +google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= +google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= +google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= +google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= +google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= +google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM= google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.1 
h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= olympos.io/encoding/edn v0.0.0-20201019073823-d3554ca0b0a3 h1:slmdOY3vp8a7KQbHkL+FLbvbkgMqmXojpFUO/jENuqQ= diff --git a/internal/generator/models/common.go b/internal/generator/models/common.go index 635e84b..5814a2d 100644 --- a/internal/generator/models/common.go +++ b/internal/generator/models/common.go @@ -2,10 +2,12 @@ package models import ( "encoding/json" + "github.com/otaviokr/topological-sort/toposort" "io" "os" "path/filepath" "reflect" + "regexp" "strings" "github.com/ilyakaznacheev/cleanenv" @@ -119,3 +121,46 @@ func parseErrsToString(errs []error) string { return sb.String() } + +func TopologicalSort(columns []*Column) ([]string, error) { + graph := make(map[string][]string) + for _, c := range columns { + graph[c.Name] = make([]string, 0) + + if c.Type != "string" { + continue + } + + for _, r := range c.Ranges { + if r.StringParams.Template == "" { + continue + } + + graph[c.Name] = extractValuesFromTemplate(r.StringParams.Template) + } + } + + sortedVertexes, err := toposort.ReverseTarjan(graph) + if err != nil { + return nil, err + } + + return sortedVertexes, nil +} + +func extractValuesFromTemplate(template string) []string { + re := regexp.MustCompile(`{{\s*([^}]+)\s*}}`) + matches := re.FindAllStringSubmatch(template, -1) + + var values []string + for _, match := range matches { + expr := match[1] + + parts := regexp.MustCompile(`\s*\|\s*|\s+`).Split(expr, -1) + if len(parts) > 0 && parts[0] != "" && !strings.Contains(parts[0], "(") { + values = append(values, parts[0]) + } + } + + return values +} diff --git a/internal/generator/models/common_test.go b/internal/generator/models/common_test.go new file mode 100644 index 0000000..eb12dea --- /dev/null +++ b/internal/generator/models/common_test.go @@ -0,0 +1,163 @@ +package models + +import ( + "github.com/stretchr/testify/require" + "testing" +) + +func 
TestExtractValuesFromTemplate(t *testing.T) { + type testCase struct { + name string + template string + expected []string + } + + testCases := []testCase{ + { + name: "Empty template", + template: "", + expected: nil, + }, + { + name: "Valid template", + template: "{{ foo }}.{{boo}}", + expected: []string{"foo", "boo"}, + }, + { + name: "Template with filters", + template: "{{ foo | upper | lower }}", + expected: []string{"foo"}, + }, + { + name: "Template with functions", + template: "{{ upper('foo') | lower }}@{{ boo }}", + expected: []string{"boo"}, + }, + { + name: "Invalid template", + template: "{_{ foo }}", + expected: nil, + }, + } + + testFunc := func(t *testing.T, tc testCase) { + t.Helper() + + actual := extractValuesFromTemplate(tc.template) + require.Equal(t, tc.expected, actual) + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { testFunc(t, tc) }) + } +} + +func TestTopologicalSort(t *testing.T) { + type testCase struct { + name string + columns []*Column + wantErr bool + expected []string + } + + testCases := []testCase{ + { + name: "Empty columns", + columns: []*Column{}, + wantErr: false, + expected: []string{}, + }, + { + name: "Columns with dependencies", + columns: []*Column{ + { + Name: "1", + Type: "string", + Ranges: []*Params{ + { + StringParams: &ColumnStringParams{ + Template: "{{ 3 }}", + }, + }, + }, + }, + { + Name: "2", + Type: "string", + Ranges: []*Params{ + { + StringParams: &ColumnStringParams{ + Template: "{{ 4 }}", + }, + }, + }, + }, + { + Name: "3", + Type: "string", + Ranges: []*Params{ + { + StringParams: &ColumnStringParams{ + Template: "{{ 2 }}", + }, + }, + }, + }, + { + Name: "4", + Type: "string", + Ranges: []*Params{ + { + StringParams: &ColumnStringParams{ + Template: "", + }, + }, + }, + }, + }, + wantErr: false, + expected: []string{"4", "2", "3", "1"}, + }, + { + name: "Columns with cycle dependencies", + columns: []*Column{ + { + Name: "1", + Type: "string", + Ranges: []*Params{ + { + 
StringParams: &ColumnStringParams{ + Template: "{{ 2 }}", + }, + }, + }, + }, + { + Name: "2", + Type: "string", + Ranges: []*Params{ + { + StringParams: &ColumnStringParams{ + Template: "{{ 1 }}", + }, + }, + }, + }, + }, + wantErr: true, + expected: nil, + }, + } + + testFunc := func(t *testing.T, tc testCase) { + t.Helper() + + actual, err := TopologicalSort(tc.columns) + require.Equal(t, tc.wantErr, err != nil) + require.Equal(t, tc.expected, actual) + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { testFunc(t, tc) }) + } +} diff --git a/internal/generator/usecase/general/generator/generator.go b/internal/generator/usecase/general/generator/generator.go index 7e548fc..254db2e 100644 --- a/internal/generator/usecase/general/generator/generator.go +++ b/internal/generator/usecase/general/generator/generator.go @@ -200,7 +200,7 @@ type valueID struct { type BatchGenerator struct { numbers []valueID nextNumber int - valuer func(number valueID) (any, error) + valuer func(number valueID, generatedValues map[string]any) (any, error) } func (cg *ColumnGenerator) NewBatchGenerator(batchSize uint64) *BatchGenerator { @@ -226,14 +226,14 @@ func (cg *ColumnGenerator) NewBatchGenerator(batchSize uint64) *BatchGenerator { } } - valuer := func(id valueID) (any, error) { + valuer := func(id valueID, generatedValues map[string]any) (any, error) { vg := cg.rangeGenerators[id.generatorIndex] if vg.nullPercentage > 0 && fastRandomFloat(cg.dataColumnSeed+uint64(id.number)) < vg.nullPercentage { return nil, nil //nolint:nilnil } - return vg.generator.Value(id.number) + return vg.generator.Value(id.number, generatedValues) } return &BatchGenerator{ @@ -243,8 +243,8 @@ func (cg *ColumnGenerator) NewBatchGenerator(batchSize uint64) *BatchGenerator { } // Value returns random value for described column. 
-func (g *BatchGenerator) Value() (any, error) { - res, err := g.valuer(g.numbers[g.nextNumber]) +func (g *BatchGenerator) Value(generatedValues map[string]any) (any, error) { + res, err := g.valuer(g.numbers[g.nextNumber], generatedValues) g.nextNumber++ g.nextNumber %= len(g.numbers) diff --git a/internal/generator/usecase/general/generator/value/datetime.go b/internal/generator/usecase/general/generator/value/datetime.go index 84b1cab..600e89b 100644 --- a/internal/generator/usecase/general/generator/value/datetime.go +++ b/internal/generator/usecase/general/generator/value/datetime.go @@ -26,7 +26,7 @@ func (g *DateTimeGenerator) SetTotalCount(totalValuesCount uint64) error { } // Value returns n-th date from range. -func (g *DateTimeGenerator) Value(number float64) (any, error) { +func (g *DateTimeGenerator) Value(number float64, _ map[string]any) (any, error) { fromSec := g.From.Unix() toSec := g.To.Unix() diff --git a/internal/generator/usecase/general/generator/value/enum.go b/internal/generator/usecase/general/generator/value/enum.go index 8c8e4f5..18d4413 100644 --- a/internal/generator/usecase/general/generator/value/enum.go +++ b/internal/generator/usecase/general/generator/value/enum.go @@ -31,7 +31,7 @@ func (g *EnumGenerator) SetTotalCount(totalValuesCount uint64) error { return nil } -func (g *EnumGenerator) Value(number float64) (any, error) { +func (g *EnumGenerator) Value(number float64, _ map[string]any) (any, error) { idx := int(math.Floor(number)) / g.rowsPerValue return g.Values[idx], nil diff --git a/internal/generator/usecase/general/generator/value/float.go b/internal/generator/usecase/general/generator/value/float.go index 0b68655..c1903a4 100644 --- a/internal/generator/usecase/general/generator/value/float.go +++ b/internal/generator/usecase/general/generator/value/float.go @@ -26,7 +26,7 @@ func (g *FloatGenerator) SetTotalCount(totalValuesCount uint64) error { } // Value returns n-th float number from range. 
-func (g *FloatGenerator) Value(number float64) (any, error) { +func (g *FloatGenerator) Value(number float64, _ map[string]any) (any, error) { value := orderedFloat64(g.From, g.To, number, g.totalValuesCount) if g.BitWidth == 32 { //nolint:mnd diff --git a/internal/generator/usecase/general/generator/value/integer.go b/internal/generator/usecase/general/generator/value/integer.go index 6956e3d..c83b1ba 100644 --- a/internal/generator/usecase/general/generator/value/integer.go +++ b/internal/generator/usecase/general/generator/value/integer.go @@ -22,7 +22,7 @@ func (g *IntegerGenerator) SetTotalCount(totalValuesCount uint64) error { } // Value returns n-th integer number from range. -func (g *IntegerGenerator) Value(number float64) (any, error) { +func (g *IntegerGenerator) Value(number float64, _ map[string]any) (any, error) { value := orderedInt64(g.From, g.To, number, g.totalValuesCount) switch g.BitWidth { diff --git a/internal/generator/usecase/general/generator/value/interfaces.go b/internal/generator/usecase/general/generator/value/interfaces.go index c67e5da..0c4c196 100644 --- a/internal/generator/usecase/general/generator/value/interfaces.go +++ b/internal/generator/usecase/general/generator/value/interfaces.go @@ -7,7 +7,7 @@ type Generator interface { // SetTotalCount method should remember count of rows to generate SetTotalCount(totalValuesCount uint64) error // Value method should return ordered unique value by number - Value(number float64) (any, error) + Value(number float64, generatedValues map[string]any) (any, error) // ValuesCount method should return the number of possible values to generate ValuesCount() float64 } diff --git a/internal/generator/usecase/general/generator/value/string.go b/internal/generator/usecase/general/generator/value/string.go index fc1d7c5..a48a848 100644 --- a/internal/generator/usecase/general/generator/value/string.go +++ b/internal/generator/usecase/general/generator/value/string.go @@ -1,6 +1,7 @@ package value 
import ( + "github.com/flosch/pongo2" "math" "math/big" "slices" @@ -21,6 +22,7 @@ type StringGenerator struct { *models.ColumnStringParams totalValuesCount uint64 localeModule locale.LocalModule + template *pongo2.Template charset []rune countByPrefix []float64 sumByPrefix []float64 @@ -29,6 +31,15 @@ type StringGenerator struct { //nolint:cyclop func (g *StringGenerator) Prepare() error { + if g.Template != "" { + template, err := pongo2.FromString(g.Template) + if err != nil { + return err + } + + g.template = template + } + switch g.Locale { case "ru": g.localeModule = ru.NewLocaleModule(g.LogicalType, g.MinLength, g.MaxLength) @@ -171,8 +182,22 @@ func (g *StringGenerator) calculateCompletions(length int) []int64 { } // templateString returns n-th string by template. -func (g *StringGenerator) templateString(number float64) string { - val := []rune(g.Template) +func (g *StringGenerator) templateString(number float64, generatedValues map[string]any) (string, error) { + generatedValues["pattern"] = func(pattern string) string { + return g.patternString(number, pattern) + } + + val, err := g.template.Execute(generatedValues) + if err != nil { + return "", err + } + + return val, nil +} + +// patternString returns n-th string by pattern. +func (g *StringGenerator) patternString(number float64, pattern string) string { + val := []rune(pattern) index := number / float64(g.totalValuesCount) for i := range val { @@ -410,9 +435,9 @@ func (g *StringGenerator) simpleString(number float64) string { } // Value returns n-th string from range. 
-func (g *StringGenerator) Value(number float64) (any, error) { +func (g *StringGenerator) Value(number float64, row map[string]any) (any, error) { if g.Template != "" { - return g.templateString(number), nil + return g.templateString(number, row) } switch g.LogicalType { diff --git a/internal/generator/usecase/general/generator/value/uuid.go b/internal/generator/usecase/general/generator/value/uuid.go index 7bf922b..ca32580 100644 --- a/internal/generator/usecase/general/generator/value/uuid.go +++ b/internal/generator/usecase/general/generator/value/uuid.go @@ -25,7 +25,7 @@ func (g *UUIDGenerator) SetTotalCount(totalValuesCount uint64) error { } // Value returns n-th UUID from range. -func (g *UUIDGenerator) Value(number float64) (any, error) { +func (g *UUIDGenerator) Value(number float64, _ map[string]any) (any, error) { res := uuid.UUID{} index := number / float64(g.totalValuesCount) diff --git a/internal/generator/usecase/general/task.go b/internal/generator/usecase/general/task.go index 2bf435e..6067e08 100644 --- a/internal/generator/usecase/general/task.go +++ b/internal/generator/usecase/general/task.go @@ -221,7 +221,7 @@ func (t *Task) generateAndSaveValues(ctx context.Context) error { generators = append(generators, t.generators[columnKey].NewBatchGenerator(rowsCount)) } - pool.Submit(ctx, outputSyncer.WorkerSyncer(), modelName, generators, rowsCount) + pool.Submit(ctx, outputSyncer.WorkerSyncer(), model, generators, rowsCount) } }() } @@ -255,7 +255,7 @@ func (t *Task) skipRows() { // generateAndSaveBatch function generate batch of values for selected column and send it to output. 
func (t *Task) generateAndSaveBatch( ctx context.Context, outputSync *common.WorkerSyncer, - modelName string, generators []*generator.BatchGenerator, count uint64, + model *models.Model, generators []*generator.BatchGenerator, count uint64, ) error { defer outputSync.Done(ctx) @@ -266,29 +266,42 @@ func (t *Task) generateAndSaveBatch( } } - for g, gen := range generators { - for i := range count { + sortedColumn, err := models.TopologicalSort(model.Columns) + if err != nil { + return err + } + + originIndexes := make(map[string]int, len(model.Columns)) + for index, column := range model.Columns { + originIndexes[column.Name] = index + } + + for i := range count { + generatedValues := make(map[string]any) + + for _, columnName := range sortedColumn { if common.CtxClosed(ctx) { return &common.ContextCancelError{} } - value, err := gen.Value() + value, err := generators[originIndexes[columnName]].Value(generatedValues) if err != nil { return errors.WithMessage(err, "failed to get or generate value") } - batch[i].Values[g] = value + generatedValues[columnName] = value + batch[i].Values[originIndexes[columnName]] = value } } outputSync.WaitPrevious(ctx) - err := t.output.HandleRowsBatch(ctx, modelName, batch) + err = t.output.HandleRowsBatch(ctx, model.Name, batch) if err != nil { return errors.WithMessage(err, "failed to save batch to output") } - t.progress.Add(modelName, count) + t.progress.Add(model.Name, count) return nil } diff --git a/internal/generator/usecase/general/test/unit_test.go b/internal/generator/usecase/general/test/unit_test.go index 6efad38..0a2a7ad 100644 --- a/internal/generator/usecase/general/test/unit_test.go +++ b/internal/generator/usecase/general/test/unit_test.go @@ -437,9 +437,9 @@ func TestString(t *testing.T) { {&models.ColumnStringParams{LogicalType: models.LastNameType, MinLength: 4, MaxLength: 7}, 4, 7}, {&models.ColumnStringParams{LogicalType: models.PhoneType, MinLength: 10, MaxLength: 10}, 10, 10}, 
{&models.ColumnStringParams{MinLength: 100, MaxLength: 100}, 100, 100}, - {&models.ColumnStringParams{Template: "AAaa00##", Locale: "en"}, 8, 8}, - {&models.ColumnStringParams{Template: "AAaa00##", Locale: "ru"}, 8, 8}, - {&models.ColumnStringParams{Template: "0123456789012345678901234567890123456789"}, 40, 40}, + {&models.ColumnStringParams{Template: "{{ pattern('AAaa00##') }}", Locale: "en"}, 8, 8}, + {&models.ColumnStringParams{Template: "{{ pattern('AAaa00##') }}", Locale: "ru"}, 8, 8}, + {&models.ColumnStringParams{Template: "{{ pattern('0123456789012345678901234567890123456789') }}"}, 40, 40}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 3, MaxLength: 5}, 3, 5}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 254, MaxLength: 256}, 254, 256}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 510, MaxLength: 512}, 510, 512}, @@ -449,7 +449,7 @@ func TestString(t *testing.T) { } for _, testCase := range testCases { - column := &models.Column{Type: "string", Ranges: []*models.Params{{TypeParams: testCase.typeParams}}} + column := &models.Column{Name: "test", Type: "string", Ranges: []*models.Params{{TypeParams: testCase.typeParams}}} handled := checkType(t, column, "") strValue, ok := handled[0].Values[0].(string) @@ -599,7 +599,7 @@ func TestIdempotence(t *testing.T) { Name: "passport", Type: "string", Ranges: []*models.Params{{TypeParams: &models.ColumnStringParams{ - Template: "AA 00 000 000", + Template: "{{ pattern('AA 00 000 000') }}", }, NullPercentage: 0.5}}, }, From 7a9d5bf016adeb4eb2adb41205d40026549d718c Mon Sep 17 00:00:00 2001 From: "daniil.khasanov" Date: Thu, 24 Jul 2025 18:48:01 +0300 Subject: [PATCH 02/11] Updated the calculation of the number of possible values to generation --- go.mod | 5 +- go.sum | 13 ++-- internal/generator/common/utils.go | 21 +++++ internal/generator/common/utils_test.go | 47 +++++++++++ internal/generator/models/common.go | 33 ++------ 
internal/generator/models/common_test.go | 52 +------------ internal/generator/models/generator_model.go | 12 ++- internal/generator/models/models_test.go | 3 + .../usecase/general/generator/generator.go | 37 +++++---- .../general/generator/value/datetime.go | 2 +- .../usecase/general/generator/value/enum.go | 2 +- .../usecase/general/generator/value/float.go | 2 +- .../general/generator/value/integer.go | 2 +- .../general/generator/value/interfaces.go | 2 +- .../usecase/general/generator/value/string.go | 78 +++++++++++++++---- .../usecase/general/generator/value/uuid.go | 2 +- internal/generator/usecase/general/task.go | 19 +++-- .../usecase/general/test/unit_test.go | 58 +++++++++++--- 18 files changed, 244 insertions(+), 146 deletions(-) diff --git a/go.mod b/go.mod index df8996a..ad9d088 100644 --- a/go.mod +++ b/go.mod @@ -5,12 +5,14 @@ go 1.23.8 require ( github.com/apache/arrow-go/v18 v18.2.0 github.com/charmbracelet/huh/spinner v0.0.0-20250203114958-f07ae1af69ae + github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3 github.com/google/uuid v1.6.0 github.com/hashicorp/go-retryablehttp v0.7.7 github.com/ilyakaznacheev/cleanenv v1.5.0 github.com/labstack/echo/v4 v4.13.3 github.com/manifoldco/promptui v0.9.0 github.com/moby/term v0.5.2 + github.com/otaviokr/topological-sort v1.1.0 github.com/pkg/errors v0.9.1 github.com/sashabaranov/go-openai v1.36.1 github.com/spf13/afero v1.12.0 @@ -36,9 +38,7 @@ require ( github.com/charmbracelet/x/term v0.2.1 // indirect github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/emirpasic/gods v1.18.1 // indirect github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect - github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3 // indirect github.com/goccy/go-json v0.10.5 // indirect github.com/golang/snappy v0.0.4 // indirect github.com/google/flatbuffers v25.2.10+incompatible // indirect @@ 
-60,7 +60,6 @@ require ( github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect github.com/muesli/cancelreader v0.2.2 // indirect github.com/muesli/termenv v0.15.3-0.20240618155329-98d742f6907a // indirect - github.com/otaviokr/topological-sort v1.1.0 // indirect github.com/pierrec/lz4/v4 v4.1.22 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/rivo/uniseg v0.4.7 // indirect diff --git a/go.sum b/go.sum index 70d4c68..8488c13 100644 --- a/go.sum +++ b/go.sum @@ -41,8 +41,6 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc= -github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ= github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4= github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM= github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= @@ -50,14 +48,13 @@ github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4Nij github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3 h1:fmFk0Wt3bBxxwZnu48jqMdaOR/IZ4vdtJFuaFV8MpIE= github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3/go.mod h1:bJWSKrZyQvfTnb2OudyUjurSG4/edverV7n82+K3JiM= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= +github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4= github.com/fsnotify/fsnotify v1.4.9/go.mod 
h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE= -github.com/go-test/deep v1.1.0 h1:WOcxcdHcvdgThNXjw0t76K42FXTU7HpNQWHpA2HHNlg= -github.com/go-test/deep v1.1.0/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE= github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4= github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -79,7 +76,6 @@ github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= @@ -88,8 +84,6 @@ github.com/hashicorp/go-hclog v1.6.3 h1:Qr2kF+eVWjTiYmU7Y31tYlP1h0q/X3Nl3tPGdaB1 github.com/hashicorp/go-hclog v1.6.3/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M= github.com/hashicorp/go-retryablehttp v0.7.7 h1:C8hUCYzor8PIfXHa4UrZkU4VvK8o9ISHxT2Q8+VepXU= github.com/hashicorp/go-retryablehttp v0.7.7/go.mod h1:pkQpWZeYWskR+D1tR2O5OcBFOxfA7DoAO6xtkuQnHTk= 
-github.com/heimdalr/dag v1.5.0 h1:hqVtijvY776P5OKP3QbdVBRt3Xxq6BYopz3XgklsGvo= -github.com/heimdalr/dag v1.5.0/go.mod h1:lthekrHl01dddmzqyBQ1YZbi7XcVGGzjFo0jIky5knc= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/ilyakaznacheev/cleanenv v1.5.0 h1:0VNZXggJE2OYdXE87bfSSwGxeiGt9moSR2lOrsHHvr4= github.com/ilyakaznacheev/cleanenv v1.5.0/go.mod h1:a5aDzaJrLCQZsazHol1w8InnDcOX0OColm64SlIi6gk= @@ -141,12 +135,15 @@ github.com/muesli/termenv v0.15.3-0.20240618155329-98d742f6907a h1:2MaM6YC3mGu54 github.com/muesli/termenv v0.15.3-0.20240618155329-98d742f6907a/go.mod h1:hxSnBBYLK21Vtq/PHd0S2FYCxBXzBua8ov5s1RobyRQ= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= +github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= +github.com/onsi/ginkgo v1.16.1 h1:foqVmeWDD6yYpK+Yz3fHyNIxFYNxswxqNFjSKe+vI54= github.com/onsi/ginkgo v1.16.1/go.mod h1:CObGmKUOKaSC0RjmoAK7tKyn4Azo5P2IWuoMnvwxz1E= github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= +github.com/onsi/gomega v1.11.0 h1:+CqWgvj0OZycCaqclBD1pxKHAU+tOkHmQIWvDHq2aug= github.com/onsi/gomega v1.11.0/go.mod h1:azGKhqFUon9Vuj0YmTfLSmx0FUwqXYSTl5re8lQLTUg= github.com/otaviokr/topological-sort v1.1.0 h1:BrWj/bLOo9aZFUi0YN2/s4P/GRe2PSmb8cyX4w1ysNg= github.com/otaviokr/topological-sort v1.1.0/go.mod h1:77ZaKUg7Ir1nL6DPwEIQFm9iH2OS5xxVWvzZ8xPTCFg= @@ -278,10 +275,12 @@ gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b/go.mod h1:Co6ibVJAznAaIkqp8 
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/generator/common/utils.go b/internal/generator/common/utils.go index 38f95d0..a58f0c4 100644 --- a/internal/generator/common/utils.go +++ b/internal/generator/common/utils.go @@ -7,6 +7,7 @@ import ( "fmt" "os" "reflect" + "regexp" "slices" "strconv" "strings" @@ -357,3 +358,23 @@ func CtxClosed(ctx context.Context) bool { return false } } + +func ExtractValuesFromTemplate(template string) []string { + re := regexp.MustCompile(`{{\s*([^\s|(){}]+)[^}]*}}`) + matches := re.FindAllStringSubmatch(template, -1) + + values := make([]string, 0, len(matches)) + + for _, match := range matches { + expr := match[0] + val := match[1] + + if strings.Contains(expr, "(") && strings.Contains(expr, ")") { + continue + } + + values = append(values, val) + } + + return values +} diff --git a/internal/generator/common/utils_test.go b/internal/generator/common/utils_test.go index 96b5529..34dabf2 100644 --- 
a/internal/generator/common/utils_test.go +++ b/internal/generator/common/utils_test.go @@ -697,3 +697,50 @@ func TestWalkWithFilter(t *testing.T) { t.Run(tc.name, func(t *testing.T) { testFunc(t, tc) }) } } + +func TestExtractValuesFromTemplate(t *testing.T) { + type testCase struct { + name string + template string + expected []string + } + + testCases := []testCase{ + { + name: "Empty template", + template: "", + expected: []string{}, + }, + { + name: "Valid template", + template: "{{ foo }}.{{boo}}", + expected: []string{"foo", "boo"}, + }, + { + name: "Template with filters", + template: "{{ foo | upper | lower }}", + expected: []string{"foo"}, + }, + { + name: "Template with functions", + template: "{{ upper('foo') | lower }}@{{ boo }}", + expected: []string{"boo"}, + }, + { + name: "Invalid template", + template: "{_{ foo }}", + expected: []string{}, + }, + } + + testFunc := func(t *testing.T, tc testCase) { + t.Helper() + + actual := ExtractValuesFromTemplate(tc.template) + require.Equal(t, tc.expected, actual) + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { testFunc(t, tc) }) + } +} diff --git a/internal/generator/models/common.go b/internal/generator/models/common.go index 5814a2d..5f82f8e 100644 --- a/internal/generator/models/common.go +++ b/internal/generator/models/common.go @@ -2,15 +2,15 @@ package models import ( "encoding/json" - "github.com/otaviokr/topological-sort/toposort" + "github.com/tarantool/sdvg/internal/generator/common" "io" "os" "path/filepath" "reflect" - "regexp" "strings" "github.com/ilyakaznacheev/cleanenv" + "github.com/otaviokr/topological-sort/toposort" "github.com/pkg/errors" "gopkg.in/yaml.v3" ) @@ -122,45 +122,24 @@ func parseErrsToString(errs []error) string { return sb.String() } -func TopologicalSort(columns []*Column) ([]string, error) { +func topologicalSort(columns []*Column) ([]string, error) { graph := make(map[string][]string) for _, c := range columns { graph[c.Name] = make([]string, 0) 
- if c.Type != "string" { - continue - } - for _, r := range c.Ranges { - if r.StringParams.Template == "" { + if r.StringParams == nil || r.StringParams.Template == "" { continue } - graph[c.Name] = extractValuesFromTemplate(r.StringParams.Template) + graph[c.Name] = common.ExtractValuesFromTemplate(r.StringParams.Template) } } sortedVertexes, err := toposort.ReverseTarjan(graph) if err != nil { - return nil, err + return nil, errors.New(err.Error()) } return sortedVertexes, nil } - -func extractValuesFromTemplate(template string) []string { - re := regexp.MustCompile(`{{\s*([^}]+)\s*}}`) - matches := re.FindAllStringSubmatch(template, -1) - - var values []string - for _, match := range matches { - expr := match[1] - - parts := regexp.MustCompile(`\s*\|\s*|\s+`).Split(expr, -1) - if len(parts) > 0 && parts[0] != "" && !strings.Contains(parts[0], "(") { - values = append(values, parts[0]) - } - } - - return values -} diff --git a/internal/generator/models/common_test.go b/internal/generator/models/common_test.go index eb12dea..314bb87 100644 --- a/internal/generator/models/common_test.go +++ b/internal/generator/models/common_test.go @@ -1,56 +1,10 @@ package models import ( - "github.com/stretchr/testify/require" "testing" -) - -func TestExtractValuesFromTemplate(t *testing.T) { - type testCase struct { - name string - template string - expected []string - } - - testCases := []testCase{ - { - name: "Empty template", - template: "", - expected: nil, - }, - { - name: "Valid template", - template: "{{ foo }}.{{boo}}", - expected: []string{"foo", "boo"}, - }, - { - name: "Template with filters", - template: "{{ foo | upper | lower }}", - expected: []string{"foo"}, - }, - { - name: "Template with functions", - template: "{{ upper('foo') | lower }}@{{ boo }}", - expected: []string{"boo"}, - }, - { - name: "Invalid template", - template: "{_{ foo }}", - expected: nil, - }, - } - testFunc := func(t *testing.T, tc testCase) { - t.Helper() - - actual := 
extractValuesFromTemplate(tc.template) - require.Equal(t, tc.expected, actual) - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { testFunc(t, tc) }) - } -} + "github.com/stretchr/testify/require" +) func TestTopologicalSort(t *testing.T) { type testCase struct { @@ -152,7 +106,7 @@ func TestTopologicalSort(t *testing.T) { testFunc := func(t *testing.T, tc testCase) { t.Helper() - actual, err := TopologicalSort(tc.columns) + actual, err := topologicalSort(tc.columns) require.Equal(t, tc.wantErr, err != nil) require.Equal(t, tc.expected, actual) } diff --git a/internal/generator/models/generator_model.go b/internal/generator/models/generator_model.go index e3c451f..88aaf16 100644 --- a/internal/generator/models/generator_model.go +++ b/internal/generator/models/generator_model.go @@ -30,8 +30,9 @@ type Model struct { RowsPerFile uint64 `backup:"true" json:"rows_per_file" yaml:"rows_per_file"` ModelDir string `backup:"true" json:"model_dir" yaml:"model_dir"` // The columns from the partitioning key with PartitionColumn.WriteToOutput == false, must be at the end of slice. - Columns []*Column `backup:"true" json:"columns" yaml:"columns"` - PartitionColumns []*PartitionColumn `backup:"true" json:"partition_columns" yaml:"partition_columns"` + Columns []*Column `backup:"true" json:"columns" yaml:"columns"` + ColumnsTopologicalOrder []string + PartitionColumns []*PartitionColumn `backup:"true" json:"partition_columns" yaml:"partition_columns"` } // PartitionColumn type is used to describe partition parameters for column. 
@@ -80,6 +81,13 @@ func (m *Model) Parse() error { m.shiftColumnsToEnd(nonWriteableColumns) + sortedColumns, err := topologicalSort(m.Columns) + if err != nil { + return errors.WithMessage(err, "failed to sorting columns by dependencies") + } + + m.ColumnsTopologicalOrder = sortedColumns + return nil } diff --git a/internal/generator/models/models_test.go b/internal/generator/models/models_test.go index ffb908a..5df79da 100644 --- a/internal/generator/models/models_test.go +++ b/internal/generator/models/models_test.go @@ -1197,6 +1197,9 @@ parquet params: expectedModel := tc.expected.Models[modelName] gotModel := cfg.Models[modelName] + // skip ColumnsTopologicalOrder check + expectedModel.ColumnsTopologicalOrder = gotModel.ColumnsTopologicalOrder + for columnName := range expectedModel.Columns { expectedColumn := expectedModel.Columns[columnName] gotColumn := gotModel.Columns[columnName] diff --git a/internal/generator/usecase/general/generator/generator.go b/internal/generator/usecase/general/generator/generator.go index 254db2e..2dc8c27 100644 --- a/internal/generator/usecase/general/generator/generator.go +++ b/internal/generator/usecase/general/generator/generator.go @@ -11,12 +11,13 @@ import ( ) type rangeGenerator struct { - numFrom uint64 - numTo uint64 - sequencer sequencer - dataRandomFactor float64 - generator value.Generator - nullPercentage float64 + numFrom uint64 + numTo uint64 + distinctValuesCount uint64 + sequencer sequencer + dataRandomFactor float64 + generator value.Generator + nullPercentage float64 } type ColumnGenerator struct { @@ -28,7 +29,7 @@ type ColumnGenerator struct { } func NewColumnGenerator( - baseSeed uint64, + baseSeed uint64, distinctValuesCountByColumn map[string]uint64, modelName string, model *models.Model, column *models.Column, dataModelName string, dataModel *models.Model, dataColumn *models.Column, ) (*ColumnGenerator, error) { @@ -54,7 +55,7 @@ func NewColumnGenerator( rangeRowsCount := 
uint64(math.Ceil(float64(rowsCount) * dataRange.RangePercentage)) gen, err := newRangeGenerator( - column, columnSeed, + column, columnSeed, distinctValuesCountByColumn, dataModel, dataColumn, dataColumnSeed, dataRange, rangeRowsOffset, rangeRowsCount, ) @@ -67,7 +68,6 @@ func NewColumnGenerator( } rangeGenerators = append(rangeGenerators, gen) - rangeRowsOffset += rangeRowsCount } @@ -94,7 +94,7 @@ func (cg *ColumnGenerator) SkipRows(count uint64) { //nolint:cyclop func newRangeGenerator( - column *models.Column, columnSeed uint64, + column *models.Column, columnSeed uint64, distinctValuesCountByColumn map[string]uint64, dataModel *models.Model, dataColumn *models.Column, dataColumnSeed uint64, dataRange *models.Params, rangeRowsOffset, rangeRowsCount uint64, ) (*rangeGenerator, error) { @@ -140,7 +140,7 @@ func newRangeGenerator( distinctValuesCount = dataRange.DistinctCount } - generatorValuesCount := valueGenerator.ValuesCount() + generatorValuesCount := valueGenerator.ValuesCount(distinctValuesCountByColumn) if float64(distinctValuesCount) > generatorValuesCount { if dataRange.DistinctPercentage != 0 || dataRange.DistinctCount != 0 { @@ -150,6 +150,8 @@ func newRangeGenerator( distinctValuesCount = uint64(generatorValuesCount) } + distinctValuesCountByColumn[column.Name] += distinctValuesCount + rangeOrdered := dataRange.Ordered orderSeed := dataColumnSeed @@ -173,12 +175,13 @@ func newRangeGenerator( dataRandomFactor := 1 - float64(distinctValuesCount)/generatorValuesCount return &rangeGenerator{ - numFrom: rangeRowsOffset, - numTo: rangeRowsOffset + rangeRowsCount, - dataRandomFactor: dataRandomFactor, - generator: valueGenerator, - sequencer: rangeSequencer, - nullPercentage: dataRange.NullPercentage, + numFrom: rangeRowsOffset, + numTo: rangeRowsOffset + rangeRowsCount, + distinctValuesCount: distinctValuesCount, + dataRandomFactor: dataRandomFactor, + generator: valueGenerator, + sequencer: rangeSequencer, + nullPercentage: dataRange.NullPercentage, }, 
nil } diff --git a/internal/generator/usecase/general/generator/value/datetime.go b/internal/generator/usecase/general/generator/value/datetime.go index 600e89b..8970354 100644 --- a/internal/generator/usecase/general/generator/value/datetime.go +++ b/internal/generator/usecase/general/generator/value/datetime.go @@ -49,7 +49,7 @@ func (g *DateTimeGenerator) Value(number float64, _ map[string]any) (any, error) return value, nil } -func (g *DateTimeGenerator) ValuesCount() float64 { +func (g *DateTimeGenerator) ValuesCount(_ map[string]uint64) float64 { fromSec := g.From.Unix() toSec := g.To.Unix() diff --git a/internal/generator/usecase/general/generator/value/enum.go b/internal/generator/usecase/general/generator/value/enum.go index 18d4413..e989b68 100644 --- a/internal/generator/usecase/general/generator/value/enum.go +++ b/internal/generator/usecase/general/generator/value/enum.go @@ -37,6 +37,6 @@ func (g *EnumGenerator) Value(number float64, _ map[string]any) (any, error) { return g.Values[idx], nil } -func (g *EnumGenerator) ValuesCount() float64 { +func (g *EnumGenerator) ValuesCount(_ map[string]uint64) float64 { return float64(len(g.Values)) } diff --git a/internal/generator/usecase/general/generator/value/float.go b/internal/generator/usecase/general/generator/value/float.go index c1903a4..79ba0c5 100644 --- a/internal/generator/usecase/general/generator/value/float.go +++ b/internal/generator/usecase/general/generator/value/float.go @@ -36,6 +36,6 @@ func (g *FloatGenerator) Value(number float64, _ map[string]any) (any, error) { return value, nil } -func (g *FloatGenerator) ValuesCount() float64 { +func (g *FloatGenerator) ValuesCount(_ map[string]uint64) float64 { return math.Inf(1) } diff --git a/internal/generator/usecase/general/generator/value/integer.go b/internal/generator/usecase/general/generator/value/integer.go index c83b1ba..5aefbc1 100644 --- a/internal/generator/usecase/general/generator/value/integer.go +++ 
b/internal/generator/usecase/general/generator/value/integer.go @@ -37,6 +37,6 @@ func (g *IntegerGenerator) Value(number float64, _ map[string]any) (any, error) } } -func (g *IntegerGenerator) ValuesCount() float64 { +func (g *IntegerGenerator) ValuesCount(_ map[string]uint64) float64 { return float64(uint64(g.To-g.From)) + 1 } diff --git a/internal/generator/usecase/general/generator/value/interfaces.go b/internal/generator/usecase/general/generator/value/interfaces.go index 0c4c196..5094611 100644 --- a/internal/generator/usecase/general/generator/value/interfaces.go +++ b/internal/generator/usecase/general/generator/value/interfaces.go @@ -9,5 +9,5 @@ type Generator interface { // Value method should return ordered unique value by number Value(number float64, generatedValues map[string]any) (any, error) // ValuesCount method should return the number of possible values to generate - ValuesCount() float64 + ValuesCount(distinctValuesCountByColumn map[string]uint64) float64 } diff --git a/internal/generator/usecase/general/generator/value/string.go b/internal/generator/usecase/general/generator/value/string.go index a48a848..69eb01d 100644 --- a/internal/generator/usecase/general/generator/value/string.go +++ b/internal/generator/usecase/general/generator/value/string.go @@ -1,19 +1,25 @@ package value import ( - "github.com/flosch/pongo2" "math" "math/big" + "regexp" "slices" "strings" + "github.com/flosch/pongo2" "github.com/pkg/errors" + "github.com/tarantool/sdvg/internal/generator/common" "github.com/tarantool/sdvg/internal/generator/models" "github.com/tarantool/sdvg/internal/generator/usecase/general/locale" "github.com/tarantool/sdvg/internal/generator/usecase/general/locale/en" "github.com/tarantool/sdvg/internal/generator/usecase/general/locale/ru" ) +var ( + rePatternVal = regexp.MustCompile(`pattern\((?:'([^']*)'|"([^"]*)")\)`) +) + // Verify interface compliance in compile time. 
var _ Generator = (*StringGenerator)(nil) @@ -34,7 +40,7 @@ func (g *StringGenerator) Prepare() error { if g.Template != "" { template, err := pongo2.FromString(g.Template) if err != nil { - return err + return errors.Errorf("failed to parse template: %s", err.Error()) } g.template = template @@ -183,13 +189,13 @@ func (g *StringGenerator) calculateCompletions(length int) []int64 { // templateString returns n-th string by template. func (g *StringGenerator) templateString(number float64, generatedValues map[string]any) (string, error) { - generatedValues["pattern"] = func(pattern string) string { - return g.patternString(number, pattern) + generatedValues["pattern"] = func(pattern string) *pongo2.Value { + return pongo2.AsSafeValue(g.patternString(number, pattern)) } val, err := g.template.Execute(generatedValues) if err != nil { - return "", err + return "", errors.New(err.Error()) } return val, nil @@ -437,7 +443,12 @@ func (g *StringGenerator) simpleString(number float64) string { // Value returns n-th string from range. 
func (g *StringGenerator) Value(number float64, row map[string]any) (any, error) { if g.Template != "" { - return g.templateString(number, row) + val, err := g.templateString(number, row) + if err != nil { + return nil, errors.WithMessage(err, "failed to template string") + } + + return val, nil } switch g.LogicalType { @@ -455,15 +466,9 @@ func (g *StringGenerator) Value(number float64, row map[string]any) (any, error) } //nolint:cyclop -func (g *StringGenerator) ValuesCount() float64 { +func (g *StringGenerator) ValuesCount(distinctValuesCountByColumn map[string]uint64) float64 { if g.Template != "" { - totalCount := float64(0) - totalCount += math.Pow(float64(len(g.localeModule.LargeLetters())), float64(strings.Count(g.Template, "A"))) - totalCount += math.Pow(float64(len(g.localeModule.SmallLetters())), float64(strings.Count(g.Template, "a"))) - totalCount += math.Pow(float64(len(locale.Numbers)), float64(strings.Count(g.Template, "0"))) - totalCount += math.Pow(float64(len(locale.SpecialChars)), float64(strings.Count(g.Template, "#"))) - - return totalCount + return g.templateCardinality(distinctValuesCountByColumn) } switch g.LogicalType { @@ -501,3 +506,48 @@ func (g *StringGenerator) ValuesCount() float64 { return totalCount } + +func (g *StringGenerator) templateCardinality(distinctValuesCountByColumn map[string]uint64) float64 { + total := 1.0 + + patternValMatches := rePatternVal.FindAllStringSubmatch(g.Template, -1) + for _, match := range patternValMatches { + pattern := match[1] + if pattern == "" { + pattern = match[2] + } + + total *= g.patternCardinality(pattern) + } + + columns := common.ExtractValuesFromTemplate(g.Template) + for _, column := range columns { + if count, ok := distinctValuesCountByColumn[column]; ok && count > 0 { + total *= float64(count) + } + } + + return total +} + +func (g *StringGenerator) patternCardinality(pattern string) float64 { + total := 1.0 + + if count := strings.Count(pattern, "A"); count > 0 { + total *= 
math.Pow(float64(len(g.localeModule.LargeLetters())), float64(count)) + } + + if count := strings.Count(pattern, "a"); count > 0 { + total *= math.Pow(float64(len(g.localeModule.SmallLetters())), float64(count)) + } + + if count := strings.Count(pattern, "0"); count > 0 { + total *= math.Pow(float64(len(locale.Numbers)), float64(count)) + } + + if count := strings.Count(pattern, "#"); count > 0 { + total *= math.Pow(float64(len(locale.SpecialChars)), float64(count)) + } + + return total +} diff --git a/internal/generator/usecase/general/generator/value/uuid.go b/internal/generator/usecase/general/generator/value/uuid.go index ca32580..914e503 100644 --- a/internal/generator/usecase/general/generator/value/uuid.go +++ b/internal/generator/usecase/general/generator/value/uuid.go @@ -43,6 +43,6 @@ func (g *UUIDGenerator) Value(number float64, _ map[string]any) (any, error) { return res, nil } -func (g *UUIDGenerator) ValuesCount() float64 { +func (g *UUIDGenerator) ValuesCount(_ map[string]uint64) float64 { return float64(1<<(128-10) - 1) //nolint:mnd } diff --git a/internal/generator/usecase/general/task.go b/internal/generator/usecase/general/task.go index 6067e08..cae4282 100644 --- a/internal/generator/usecase/general/task.go +++ b/internal/generator/usecase/general/task.go @@ -84,6 +84,8 @@ func newGenerators(cfg *models.GenerationConfig) (map[string]*generator.ColumnGe generators := make(map[string]*generator.ColumnGenerator) for modelName, model := range cfg.Models { + distinctValuesCountByColumn := make(map[string]uint64) + for _, column := range model.Columns { dataModelName := modelName dataModel := model @@ -98,7 +100,7 @@ func newGenerators(cfg *models.GenerationConfig) (map[string]*generator.ColumnGe columnKey := common.GetKey(modelName, column.Name) gen, err := generator.NewColumnGenerator( - cfg.RandomSeed, + cfg.RandomSeed, distinctValuesCountByColumn, modelName, model, column, dataModelName, dataModel, dataColumn, ) @@ -266,11 +268,6 @@ func (t *Task) 
generateAndSaveBatch( } } - sortedColumn, err := models.TopologicalSort(model.Columns) - if err != nil { - return err - } - originIndexes := make(map[string]int, len(model.Columns)) for index, column := range model.Columns { originIndexes[column.Name] = index @@ -279,24 +276,26 @@ func (t *Task) generateAndSaveBatch( for i := range count { generatedValues := make(map[string]any) - for _, columnName := range sortedColumn { + for _, columnName := range model.ColumnsTopologicalOrder { if common.CtxClosed(ctx) { return &common.ContextCancelError{} } - value, err := generators[originIndexes[columnName]].Value(generatedValues) + idx := originIndexes[columnName] + + value, err := generators[idx].Value(generatedValues) if err != nil { return errors.WithMessage(err, "failed to get or generate value") } generatedValues[columnName] = value - batch[i].Values[originIndexes[columnName]] = value + batch[i].Values[idx] = value } } outputSync.WaitPrevious(ctx) - err = t.output.HandleRowsBatch(ctx, model.Name, batch) + err := t.output.HandleRowsBatch(ctx, model.Name, batch) if err != nil { return errors.WithMessage(err, "failed to save batch to output") } diff --git a/internal/generator/usecase/general/test/unit_test.go b/internal/generator/usecase/general/test/unit_test.go index 0a2a7ad..91a1bcd 100644 --- a/internal/generator/usecase/general/test/unit_test.go +++ b/internal/generator/usecase/general/test/unit_test.go @@ -310,7 +310,11 @@ func TestInteger(t *testing.T) { } for _, testCase := range checkTypeCases { - column := &models.Column{Type: "integer", Ranges: []*models.Params{{TypeParams: testCase.typeParams}}} + column := &models.Column{ + Name: "integers", + Type: "integer", + Ranges: []*models.Params{{TypeParams: testCase.typeParams}}, + } checkType(t, column, testCase.expected) checkOrdered(t, column) @@ -357,7 +361,11 @@ func TestInteger(t *testing.T) { } for _, testCase := range checkValueCases { - column := &models.Column{Type: "integer", Ranges: 
[]*models.Params{{TypeParams: testCase.typeParams}}} + column := &models.Column{ + Name: "integers", + Type: "integer", + Ranges: []*models.Params{{TypeParams: testCase.typeParams}}, + } checkValue(t, column, testCase.expected) } @@ -382,7 +390,11 @@ func TestFloat(t *testing.T) { } for _, testCase := range checkTypeCases { - column := &models.Column{Type: "float", Ranges: []*models.Params{{TypeParams: testCase.typeParams}}} + column := &models.Column{ + Name: "floats", + Type: "float", + Ranges: []*models.Params{{TypeParams: testCase.typeParams}}, + } checkType(t, column, testCase.expected) checkOrdered(t, column) @@ -413,7 +425,11 @@ func TestFloat(t *testing.T) { } for _, testCase := range checkValueCases { - column := &models.Column{Type: "float", Ranges: []*models.Params{{TypeParams: testCase.typeParams}}} + column := &models.Column{ + Name: "floats", + Type: "float", + Ranges: []*models.Params{{TypeParams: testCase.typeParams}}, + } checkValue(t, column, testCase.expected) } @@ -449,7 +465,11 @@ func TestString(t *testing.T) { } for _, testCase := range testCases { - column := &models.Column{Name: "test", Type: "string", Ranges: []*models.Params{{TypeParams: testCase.typeParams}}} + column := &models.Column{ + Name: "strings", + Type: "string", + Ranges: []*models.Params{{TypeParams: testCase.typeParams}}, + } handled := checkType(t, column, "") strValue, ok := handled[0].Values[0].(string) @@ -466,7 +486,7 @@ func TestString(t *testing.T) { } func TestUUID(t *testing.T) { - column := &models.Column{Type: "uuid"} + column := &models.Column{Name: "uuids", Type: "uuid"} checkType(t, column, uuid.UUID{}) checkDistinct(t, column) checkForeignKeyCases(t, column) @@ -486,7 +506,11 @@ func TestDateTime(t *testing.T) { } for _, testCase := range checkTypeCases { - column := &models.Column{Type: "datetime", Ranges: []*models.Params{{TypeParams: testCase.typeParams}}} + column := &models.Column{ + Name: "datetimes", + Type: "datetime", + Ranges: 
[]*models.Params{{TypeParams: testCase.typeParams}}, + } checkType(t, column, testCase.expected) checkOrdered(t, column) @@ -503,7 +527,11 @@ func TestDateTime(t *testing.T) { } for _, testCase := range checkValueCases { - column := &models.Column{Type: "datetime", Ranges: []*models.Params{{TypeParams: testCase.typeParams}}} + column := &models.Column{ + Name: "datetimes", + Type: "datetime", + Ranges: []*models.Params{{TypeParams: testCase.typeParams}}, + } checkValue(t, column, testCase.expected) } @@ -736,7 +764,11 @@ func TestEnum(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - column := &models.Column{Type: tc.dataType, Ranges: []*models.Params{{Values: tc.values}}} + column := &models.Column{ + Name: "enums", + Type: tc.dataType, + Ranges: []*models.Params{{Values: tc.values}}, + } cfg := oneColumnCfg(t, column) cfg.Models[UnitDefaultColumnName].RowsCount = tc.rowsCount @@ -745,7 +777,11 @@ func TestEnum(t *testing.T) { handledDataRows := generateFunc(t, cfg)[UnitDefaultColumnName] require.Len(t, handledDataRows, len(tc.expected)) - columnOrdered := &models.Column{Type: tc.dataType, Ranges: []*models.Params{{Values: tc.values, Ordered: true}}} + columnOrdered := &models.Column{ + Name: "enums", + Type: tc.dataType, + Ranges: []*models.Params{{Values: tc.values, Ordered: true}}, + } cfg = oneColumnCfg(t, columnOrdered) cfg.Models[UnitDefaultColumnName].RowsCount = tc.rowsCount @@ -914,7 +950,7 @@ func TestRanges(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - column := &models.Column{Type: tc.dataType, Ranges: tc.ranges} + column := &models.Column{Name: "ranges", Type: tc.dataType, Ranges: tc.ranges} cfg := oneColumnCfg(t, column) cfg.Models[UnitDefaultColumnName].RowsCount = UnitDefaultRowsCount From 29a6012d1e6f169b3f6b9fdee6f8e94e9d94c883 Mon Sep 17 00:00:00 2001 From: reversetm Date: Sat, 26 Jul 2025 16:48:36 +0300 Subject: [PATCH 03/11] Fixed error of calculation of possible values 
for generation of string template, updated usage.md, updated CHANGELOG, and improve and columns are sorted at the point of use. --- CHANGELOG.md | 13 +- config/models.yml | 2 +- doc/ru/usage.md | 16 +- internal/generator/models/common.go | 6 +- internal/generator/models/common_test.go | 2 +- internal/generator/models/generator_model.go | 12 +- internal/generator/models/models_test.go | 3 - .../usecase/general/generator/generator.go | 26 +- internal/generator/usecase/general/task.go | 52 ++- .../usecase/general/test/unit_test.go | 349 ++++++++++++++++-- 10 files changed, 405 insertions(+), 76 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d93338..6fd4b0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [latest](https://github.com/tarantool/sdvg/compare/0.0.1..master) +### Changed + +- String templates replaced with jinja like + +### Breaking changes + +- The old version of string template in `type_params` of `string` type is no longer supported, + instead you should use `{{ pattern('pattern_expression') }}` + ## [0.0.1](https://github.com/tarantool/sdvg/compare/36d0930..0.0.1) - 2025-07-21 ### Added @@ -38,7 +47,3 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Data partitioning - Ability to continue generation - Availability to ignore some models for generation - -### Changed - -- String templates replaced with jinja like diff --git a/config/models.yml b/config/models.yml index b3b22d9..7bc502c 100644 --- a/config/models.yml +++ b/config/models.yml @@ -62,7 +62,7 @@ models: - name: passport type: string type_params: - template: AA 00 000 000 + template: "{{ pattern('AA 00 000 000') }}" distinct_percentage: 1 ordered: true - name: created diff --git a/doc/ru/usage.md b/doc/ru/usage.md index d9452f3..946edc5 100644 --- a/doc/ru/usage.md +++ b/doc/ru/usage.md @@ -161,8 +161,8 @@ open_ai: - `max_length`: Максимальная длина 
строки. По умолчанию `32`. - `logical_type`: Логический тип строки. Поддерживаемые значения: `first_name`, `last_name`, `phone`, `text`. - `template`: Jinja-подобный шаблон для генерации строки. Позволяет использовать любые поля генерируемой модели и - задавать паттерн строки с помощью функции `pattern`, где символ `A` - любая большая буква, символ `a` - любая маленькая буква, - символ `0` - любая цифра, символ `#` - любой символ, а остальные символы остаются как есть. + задавать паттерн строки с помощью функции `pattern`. Информация о фильтрах и функциях, доступных в шаблонных + строках описана [здесь](#фильтры-и-функции-используемые-в-шаблонных-строках). Также поддерживается использование фильтров, таких как `upper` и `lower`. - `locale`: Локаль для генерации строк. Поддерживаемые значения: `ru`, `en`. По умолчанию `en`. - `without_large_letters`: Флаг, указывающий, исключать ли большие буквы из строки. @@ -244,6 +244,18 @@ open_ai: Подобна структуре для формата `http`, за исключением того, что поле `format_template` неизменяемое и всегда равняется значению по умолчанию. +#### Фильтры и функции, используемые в шаблонных строках + +Шаблонные строки реализованы с использованием библиотеки `pongo2`, ознакомиться +со всеми доступными фильтрами и функциями можно в репозитории [pongo2](https://github.com/flosch/pongo2). + +Вдобавок к ним была добавлена 1 функция: + +- pattern: позволяет создать паттерн строки при помощи специальных символов. + Символ `A` - любая большая буква, символ `a` - любая маленькая буква, + символ `0` - любая цифра, символ `#` - любой символ, а остальные символы остаются как есть. + Функция доступна только в поле `template` типа данных `string`. 
+ #### Примеры конфигурации генерации данных Пример конфигурации модели данных: diff --git a/internal/generator/models/common.go b/internal/generator/models/common.go index 5f82f8e..75142cc 100644 --- a/internal/generator/models/common.go +++ b/internal/generator/models/common.go @@ -2,7 +2,6 @@ package models import ( "encoding/json" - "github.com/tarantool/sdvg/internal/generator/common" "io" "os" "path/filepath" @@ -12,6 +11,7 @@ import ( "github.com/ilyakaznacheev/cleanenv" "github.com/otaviokr/topological-sort/toposort" "github.com/pkg/errors" + "github.com/tarantool/sdvg/internal/generator/common" "gopkg.in/yaml.v3" ) @@ -122,8 +122,8 @@ func parseErrsToString(errs []error) string { return sb.String() } -func topologicalSort(columns []*Column) ([]string, error) { - graph := make(map[string][]string) +func TopologicalSort(columns []*Column) ([]string, error) { + graph := make(map[string][]string, len(columns)) for _, c := range columns { graph[c.Name] = make([]string, 0) diff --git a/internal/generator/models/common_test.go b/internal/generator/models/common_test.go index 314bb87..b6f2403 100644 --- a/internal/generator/models/common_test.go +++ b/internal/generator/models/common_test.go @@ -106,7 +106,7 @@ func TestTopologicalSort(t *testing.T) { testFunc := func(t *testing.T, tc testCase) { t.Helper() - actual, err := topologicalSort(tc.columns) + actual, err := TopologicalSort(tc.columns) require.Equal(t, tc.wantErr, err != nil) require.Equal(t, tc.expected, actual) } diff --git a/internal/generator/models/generator_model.go b/internal/generator/models/generator_model.go index 88aaf16..e3c451f 100644 --- a/internal/generator/models/generator_model.go +++ b/internal/generator/models/generator_model.go @@ -30,9 +30,8 @@ type Model struct { RowsPerFile uint64 `backup:"true" json:"rows_per_file" yaml:"rows_per_file"` ModelDir string `backup:"true" json:"model_dir" yaml:"model_dir"` // The columns from the partitioning key with PartitionColumn.WriteToOutput == 
false, must be at the end of slice. - Columns []*Column `backup:"true" json:"columns" yaml:"columns"` - ColumnsTopologicalOrder []string - PartitionColumns []*PartitionColumn `backup:"true" json:"partition_columns" yaml:"partition_columns"` + Columns []*Column `backup:"true" json:"columns" yaml:"columns"` + PartitionColumns []*PartitionColumn `backup:"true" json:"partition_columns" yaml:"partition_columns"` } // PartitionColumn type is used to describe partition parameters for column. @@ -81,13 +80,6 @@ func (m *Model) Parse() error { m.shiftColumnsToEnd(nonWriteableColumns) - sortedColumns, err := topologicalSort(m.Columns) - if err != nil { - return errors.WithMessage(err, "failed to sorting columns by dependencies") - } - - m.ColumnsTopologicalOrder = sortedColumns - return nil } diff --git a/internal/generator/models/models_test.go b/internal/generator/models/models_test.go index 5df79da..ffb908a 100644 --- a/internal/generator/models/models_test.go +++ b/internal/generator/models/models_test.go @@ -1197,9 +1197,6 @@ parquet params: expectedModel := tc.expected.Models[modelName] gotModel := cfg.Models[modelName] - // skip ColumnsTopologicalOrder check - expectedModel.ColumnsTopologicalOrder = gotModel.ColumnsTopologicalOrder - for columnName := range expectedModel.Columns { expectedColumn := expectedModel.Columns[columnName] gotColumn := gotModel.Columns[columnName] diff --git a/internal/generator/usecase/general/generator/generator.go b/internal/generator/usecase/general/generator/generator.go index 2dc8c27..e78da6f 100644 --- a/internal/generator/usecase/general/generator/generator.go +++ b/internal/generator/usecase/general/generator/generator.go @@ -11,13 +11,12 @@ import ( ) type rangeGenerator struct { - numFrom uint64 - numTo uint64 - distinctValuesCount uint64 - sequencer sequencer - dataRandomFactor float64 - generator value.Generator - nullPercentage float64 + numFrom uint64 + numTo uint64 + sequencer sequencer + dataRandomFactor float64 + generator 
value.Generator + nullPercentage float64 } type ColumnGenerator struct { @@ -175,13 +174,12 @@ func newRangeGenerator( dataRandomFactor := 1 - float64(distinctValuesCount)/generatorValuesCount return &rangeGenerator{ - numFrom: rangeRowsOffset, - numTo: rangeRowsOffset + rangeRowsCount, - distinctValuesCount: distinctValuesCount, - dataRandomFactor: dataRandomFactor, - generator: valueGenerator, - sequencer: rangeSequencer, - nullPercentage: dataRange.NullPercentage, + numFrom: rangeRowsOffset, + numTo: rangeRowsOffset + rangeRowsCount, + dataRandomFactor: dataRandomFactor, + generator: valueGenerator, + sequencer: rangeSequencer, + nullPercentage: dataRange.NullPercentage, }, nil } diff --git a/internal/generator/usecase/general/task.go b/internal/generator/usecase/general/task.go index cae4282..6b71f1e 100644 --- a/internal/generator/usecase/general/task.go +++ b/internal/generator/usecase/general/task.go @@ -84,9 +84,21 @@ func newGenerators(cfg *models.GenerationConfig) (map[string]*generator.ColumnGe generators := make(map[string]*generator.ColumnGenerator) for modelName, model := range cfg.Models { - distinctValuesCountByColumn := make(map[string]uint64) + distinctValuesCountByColumn := make(map[string]uint64, len(model.Columns)) + + sortedColumns, err := models.TopologicalSort(model.Columns) + if err != nil { + return nil, errors.WithMessagef(err, "failed to sorting columns by dependencies for model %q", modelName) + } + + originIndexes := make(map[string]int, len(model.Columns)) + for index, column := range model.Columns { + originIndexes[column.Name] = index + } + + for _, columnName := range sortedColumns { + column := model.Columns[originIndexes[columnName]] - for _, column := range model.Columns { dataModelName := modelName dataModel := model dataColumn := column @@ -173,6 +185,8 @@ func (t *Task) WaitError() error { } // generateAndSaveValues function generates values for all model. 
+// +//nolint:cyclop func (t *Task) generateAndSaveValues(ctx context.Context) error { var err error @@ -203,6 +217,16 @@ func (t *Task) generateAndSaveValues(ctx context.Context) error { continue } + columnsTopologicalOrder, err := models.TopologicalSort(model.Columns) + if err != nil { + return errors.WithMessagef(err, "failed to sorting columns by dependencies for model %q", modelName) + } + + originColumnsIndexes := make(map[string]int, len(model.Columns)) + for index, column := range model.Columns { + originColumnsIndexes[column.Name] = index + } + pool.Add(1) go func() { @@ -223,7 +247,11 @@ func (t *Task) generateAndSaveValues(ctx context.Context) error { generators = append(generators, t.generators[columnKey].NewBatchGenerator(rowsCount)) } - pool.Submit(ctx, outputSyncer.WorkerSyncer(), model, generators, rowsCount) + pool.Submit( + ctx, outputSyncer.WorkerSyncer(), + modelName, columnsTopologicalOrder, originColumnsIndexes, + generators, rowsCount, + ) } }() } @@ -257,7 +285,8 @@ func (t *Task) skipRows() { // generateAndSaveBatch function generate batch of values for selected column and send it to output. 
func (t *Task) generateAndSaveBatch( ctx context.Context, outputSync *common.WorkerSyncer, - model *models.Model, generators []*generator.BatchGenerator, count uint64, + modelName string, columnsTopologicalOrder []string, originColumnsIndexes map[string]int, + generators []*generator.BatchGenerator, count uint64, ) error { defer outputSync.Done(ctx) @@ -268,20 +297,15 @@ func (t *Task) generateAndSaveBatch( } } - originIndexes := make(map[string]int, len(model.Columns)) - for index, column := range model.Columns { - originIndexes[column.Name] = index - } - for i := range count { - generatedValues := make(map[string]any) + generatedValues := make(map[string]any, len(originColumnsIndexes)) - for _, columnName := range model.ColumnsTopologicalOrder { + for _, columnName := range columnsTopologicalOrder { if common.CtxClosed(ctx) { return &common.ContextCancelError{} } - idx := originIndexes[columnName] + idx := originColumnsIndexes[columnName] value, err := generators[idx].Value(generatedValues) if err != nil { @@ -295,12 +319,12 @@ func (t *Task) generateAndSaveBatch( outputSync.WaitPrevious(ctx) - err := t.output.HandleRowsBatch(ctx, model.Name, batch) + err := t.output.HandleRowsBatch(ctx, modelName, batch) if err != nil { return errors.WithMessage(err, "failed to save batch to output") } - t.progress.Add(model.Name, count) + t.progress.Add(modelName, count) return nil } diff --git a/internal/generator/usecase/general/test/unit_test.go b/internal/generator/usecase/general/test/unit_test.go index 91a1bcd..cae5433 100644 --- a/internal/generator/usecase/general/test/unit_test.go +++ b/internal/generator/usecase/general/test/unit_test.go @@ -16,6 +16,7 @@ import ( outputMock "github.com/tarantool/sdvg/internal/generator/output/mock" "github.com/tarantool/sdvg/internal/generator/usecase" usecaseGeneral "github.com/tarantool/sdvg/internal/generator/usecase/general" + "github.com/tarantool/sdvg/internal/generator/usecase/general/generator/value" ) const ( @@ -69,12 
+70,12 @@ func deepColumnCopy(c *models.Column) *models.Column { func toString(t *testing.T, anyValue any) string { t.Helper() - value, err := json.Marshal(anyValue) + val, err := json.Marshal(anyValue) if err != nil { - t.Fatalf("Failed to json marshal of %v: %s", value, err) + t.Fatalf("Failed to json marshal of %v: %s", val, err) } - return string(value) + return string(val) } func getCfg(t *testing.T, model map[string]*models.Model) models.GenerationConfig { @@ -192,13 +193,59 @@ func checkDistinct(t *testing.T, column *models.Column) { for i := range UnitDefaultRowsCount { require.Len(t, handled[i].Values, 1, "column: %+v\n handled: %+v", column, handled) - value := toString(t, handled[i].Values[0]) - _, alreadyHas := uniqueMap[value] - require.False(t, alreadyHas, "value: %+v\nmap: %+v", value, uniqueMap) - uniqueMap[value] = true + val := toString(t, handled[i].Values[0]) + _, alreadyHas := uniqueMap[val] + require.False(t, alreadyHas, "value: %+v\nmap: %+v", val, uniqueMap) + uniqueMap[val] = true } } +func checkValuesCount( + t *testing.T, + gen value.Generator, + valuesCountByColumn map[string]uint64, expectedValueCount float64, +) { + t.Helper() + + require.NoError(t, gen.Prepare()) + + valuesCount := gen.ValuesCount(valuesCountByColumn) + require.Equal(t, uint64(expectedValueCount), uint64(valuesCount)) +} + +func checkPossibleToGenerate(t *testing.T, columns []*models.Column, rowsCount uint64, wantErr bool) { + t.Helper() + + copyColumns := make([]*models.Column, 0, len(columns)) + for _, column := range columns { + copyColumns = append(copyColumns, deepColumnCopy(column)) + } + + cfg := getCfg(t, map[string]*models.Model{ + "test": { + RowsCount: rowsCount, + Columns: copyColumns, + }, + }) + + outputHandler := func(_ context.Context, _ string, _ []*models.DataRow) error { return nil } + + out := outputMock.NewOutput(outputHandler) + uc := usecaseGeneral.NewUseCase(usecaseGeneral.UseCaseConfig{}) + + taskID, err := uc.CreateTask( + 
context.Background(), + usecase.TaskConfig{ + GenerationConfig: &cfg, + Output: out, + }, + ) + + require.Equal(t, wantErr, err != nil) + err = uc.WaitResult(taskID) + require.Equal(t, wantErr, err != nil) +} + func checkForeignKey(t *testing.T, column *models.Column, nullPercentage float64, foreignOrdered bool) { t.Helper() @@ -214,6 +261,7 @@ func checkForeignKey(t *testing.T, column *models.Column, nullPercentage float64 "foreign": { RowsCount: UnitDefaultRowsCount * 2, Columns: []*models.Column{{ + Name: "foreign_key", ForeignKey: "orig.test", Params: &models.Params{Ordered: foreignOrdered}, }}, @@ -237,10 +285,10 @@ func checkForeignKey(t *testing.T, column *models.Column, nullPercentage float64 continue } - value := toString(t, origHandled[i].Values[0]) - _, alreadyHas := origMap[value] - require.False(t, alreadyHas, "value: %+v\nmap: %+v", value, origMap) - origMap[value] = true + val := toString(t, origHandled[i].Values[0]) + _, alreadyHas := origMap[val] + require.False(t, alreadyHas, "value: %+v\nmap: %+v", val, origMap) + origMap[val] = true } for i := range UnitDefaultRowsCount * 2 { @@ -266,9 +314,9 @@ func checkForeignKey(t *testing.T, column *models.Column, nullPercentage float64 } } - value := toString(t, foreignHandled[i].Values[0]) - _, alreadyHas := origMap[value] - require.True(t, alreadyHas, "value: %+v (#%d)\nmap: %+v", value, i, origMap) + val := toString(t, foreignHandled[i].Values[0]) + _, alreadyHas := origMap[val] + require.True(t, alreadyHas, "value: %+v (#%d)\nmap: %+v", val, i, origMap) } } @@ -369,6 +417,21 @@ func TestInteger(t *testing.T) { checkValue(t, column, testCase.expected) } + + checkValuesCountCases := []struct { + typeParams *models.ColumnIntegerParams + expected float64 + }{ + {&models.ColumnIntegerParams{From: 1, To: 5}, 5}, + {&models.ColumnIntegerParams{From: 100, To: 1000}, 901}, + {&models.ColumnIntegerParams{From: 1, To: 1}, 1}, + {&models.ColumnIntegerParams{From: 123, To: 654}, 532}, + } + + for _, testCase := 
range checkValuesCountCases { + generator := &value.IntegerGenerator{ColumnIntegerParams: testCase.typeParams} + checkValuesCount(t, generator, nil, testCase.expected) + } } func TestFloat(t *testing.T) { @@ -433,6 +496,21 @@ func TestFloat(t *testing.T) { checkValue(t, column, testCase.expected) } + + checkValuesCountCases := []struct { + typeParams *models.ColumnFloatParams + expected float64 + }{ + {&models.ColumnFloatParams{From: 1.021, To: 5.554433}, math.Inf(1)}, + {&models.ColumnFloatParams{From: 195.2345, To: 1000}, math.Inf(1)}, + {&models.ColumnFloatParams{From: 0.12345, To: 1}, math.Inf(1)}, + {&models.ColumnFloatParams{From: 123, To: 654}, math.Inf(1)}, + } + + for _, testCase := range checkValuesCountCases { + generator := &value.FloatGenerator{ColumnFloatParams: testCase.typeParams} + checkValuesCount(t, generator, nil, testCase.expected) + } } func TestString(t *testing.T) { @@ -483,6 +561,189 @@ func TestString(t *testing.T) { checkDistinct(t, column) checkForeignKeyCases(t, column) } + + checkValuesCountCases := []struct { + typeParams *models.ColumnStringParams + distinctValuesCountByColumn map[string]uint64 + expected float64 + }{ + { + &models.ColumnStringParams{ + MinLength: 1, + MaxLength: 1, + Locale: "en", + WithoutNumbers: true, + WithoutSpecialChars: true, + }, + nil, + 52, + }, + { + &models.ColumnStringParams{ + MinLength: 1, + MaxLength: 1, + Locale: "ru", + WithoutNumbers: true, + WithoutSpecialChars: true, + }, + nil, + 66.0, + }, + { + &models.ColumnStringParams{ + MinLength: 3, + MaxLength: 7, + Locale: "en", + WithoutNumbers: true, + WithoutSpecialChars: true, + }, + nil, + 1048229968448, + }, + { + &models.ColumnStringParams{ + MinLength: 2, + MaxLength: 9, + Locale: "ru", + WithoutNumbers: true, + WithoutSpecialChars: true, + }, + nil, + 24128259706319868, + }, + { + &models.ColumnStringParams{ + MinLength: 10, + MaxLength: 24, + Locale: "en", + WithoutLargeLetters: true, + WithoutSmallLetters: true, + WithoutSpecialChars: true, 
+ }, + nil, + 1111111111111110000000000, + }, + { + &models.ColumnStringParams{ + MinLength: 1, + MaxLength: 8, + Locale: "en", + WithoutLargeLetters: true, + WithoutSmallLetters: true, + WithoutNumbers: true, + }, + nil, + 81870575520, + }, + { + &models.ColumnStringParams{ + MinLength: 10, + MaxLength: 15, + Locale: "en", + }, + nil, + 88394150280794134360488281250, + }, + { + &models.ColumnStringParams{ + MinLength: 10, + MaxLength: 15, + Locale: "ru", + }, + nil, + 868834460299970670989801640300, + }, + { + &models.ColumnStringParams{ + Locale: "en", + Template: "{{ field }}", + }, + map[string]uint64{ + "field": 11, + }, + 11, + }, + { + &models.ColumnStringParams{ + Locale: "en", + Template: "{{ pattern('A00') }}", + }, + nil, + 2600, + }, + { + &models.ColumnStringParams{ + Locale: "ru", + Template: "{{ field }}{{ pattern('a0#') }}", + }, + map[string]uint64{ + "field": 10, + }, + 75900, + }, + } + + for _, testCase := range checkValuesCountCases { + generator := &value.StringGenerator{ColumnStringParams: testCase.typeParams} + checkValuesCount(t, generator, testCase.distinctValuesCountByColumn, testCase.expected) + } + + idColumn := &models.Column{ + Name: "id", + Type: "integer", + Ranges: []*models.Params{ + { + TypeParams: &models.ColumnIntegerParams{ + FromPtr: int64Ptr(1), + ToPtr: int64Ptr(5), + }, + }, + }, + } + + emailColumn := &models.Column{ + Name: "email", + Type: "string", + Ranges: []*models.Params{ + { + TypeParams: &models.ColumnStringParams{ + Template: "{{ id }}.{{ pattern('00') }}@example.com", + }, + DistinctPercentage: 1, + }, + }, + } + + checkPossibleToGenerateCases := []struct { + columns []*models.Column + rowsCount uint64 + wantErr bool + }{ + { + columns: []*models.Column{idColumn, emailColumn}, + rowsCount: 500, + wantErr: false, + }, + { + columns: []*models.Column{emailColumn, idColumn}, + rowsCount: 500, + wantErr: false, + }, + { + columns: []*models.Column{idColumn, emailColumn}, + rowsCount: 501, + wantErr: true, + }, + { 
+ columns: []*models.Column{emailColumn, idColumn}, + rowsCount: 501, + wantErr: true, + }, + } + + for _, testCase := range checkPossibleToGenerateCases { + checkPossibleToGenerate(t, testCase.columns, testCase.rowsCount, testCase.wantErr) + } } func TestUUID(t *testing.T) { @@ -490,6 +751,7 @@ func TestUUID(t *testing.T) { checkType(t, column, uuid.UUID{}) checkDistinct(t, column) checkForeignKeyCases(t, column) + checkValuesCount(t, &value.UUIDGenerator{}, nil, float64(1<<(128-10)-1)) } func TestDateTime(t *testing.T) { @@ -535,6 +797,45 @@ func TestDateTime(t *testing.T) { checkValue(t, column, testCase.expected) } + + checkValuesCountCases := []struct { + typeParams *models.ColumnDateTimeParams + expected float64 + }{ + { + &models.ColumnDateTimeParams{ + From: time.Date(2025, 7, 25, 10, 0, 0, 0, time.UTC), + To: time.Date(2025, 7, 25, 10, 0, 0, 0, time.UTC), + }, + 1, + }, + { + &models.ColumnDateTimeParams{ + From: time.Date(2025, 7, 25, 10, 0, 0, 500_000_000, time.UTC), + To: time.Date(2025, 7, 25, 10, 0, 5, 500_000_000, time.UTC), + }, + 6, + }, + { + &models.ColumnDateTimeParams{ + From: time.Date(2025, 7, 25, 10, 0, 0, 900_000_000, time.UTC), + To: time.Date(2025, 7, 25, 10, 0, 1, 100_000_000, time.UTC), + }, + 400_000_002, + }, + { + &models.ColumnDateTimeParams{ + From: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + To: time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC), + }, + 31_536_001, + }, + } + + for _, testCase := range checkValuesCountCases { + generator := &value.DateTimeGenerator{ColumnDateTimeParams: testCase.typeParams} + checkValuesCount(t, generator, nil, testCase.expected) + } } func TestIdempotence(t *testing.T) { @@ -791,8 +1092,8 @@ func TestEnum(t *testing.T) { require.Len(t, handledDataRows, len(tc.expected)) for i := range handledDataRows { - value := handledDataRows[i].Values[0] - require.Equal(t, tc.expected[i], value) + val := handledDataRows[i].Values[0] + require.Equal(t, tc.expected[i], val) } }) } @@ -965,9 +1266,9 @@ func 
TestRanges(t *testing.T) { } for i := range handledDataRows { - value := handledDataRows[i].Values[0] + val := handledDataRows[i].Values[0] - rangeIdx, err := mapValueToRange(tc.dataType, value, tc.ranges) + rangeIdx, err := mapValueToRange(tc.dataType, val, tc.ranges) require.NoError(t, err) expectedValuesAmountPerRange[rangeIdx]-- @@ -991,13 +1292,13 @@ func mapValueToRange(columnType string, value any, ranges []*models.Params) (int switch columnType { case "integer": - switch value := value.(type) { + switch val := value.(type) { case int32: - if int32(r.IntegerParams.From) <= value && value <= int32(r.IntegerParams.To) { + if int32(r.IntegerParams.From) <= val && val <= int32(r.IntegerParams.To) { return idx, nil } case int64: - if r.IntegerParams.From <= value && value <= r.IntegerParams.To { + if r.IntegerParams.From <= val && val <= r.IntegerParams.To { return idx, nil } } @@ -1020,13 +1321,13 @@ func mapValueToRange(columnType string, value any, ranges []*models.Params) (int return idx, nil } case "float": - switch value := value.(type) { + switch val := value.(type) { case float32: - if float32(r.FloatParams.From) <= value && value <= float32(r.FloatParams.To) { + if float32(r.FloatParams.From) <= val && val <= float32(r.FloatParams.To) { return idx, nil } case float64: - if r.FloatParams.From <= value && value <= r.FloatParams.To { + if r.FloatParams.From <= val && val <= r.FloatParams.To { return idx, nil } } From 24415d20623e3c51d57124610b7f58af82ef1add Mon Sep 17 00:00:00 2001 From: reversetm Date: Mon, 28 Jul 2025 18:11:30 +0300 Subject: [PATCH 04/11] Improved performance --- CHANGELOG.md | 2 +- doc/ru/usage.md | 7 +- internal/generator/common/utils.go | 35 ++++++ internal/generator/common/utils_test.go | 74 +++++++++++ internal/generator/models/common.go | 24 ---- internal/generator/models/common_test.go | 117 ------------------ .../usecase/general/generator/generator.go | 14 ++- .../general/generator/value/interfaces.go | 2 +- 
.../usecase/general/generator/value/string.go | 14 ++- internal/generator/usecase/general/task.go | 48 +++++-- 10 files changed, 168 insertions(+), 169 deletions(-) delete mode 100644 internal/generator/models/common_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 6fd4b0f..eb57aaf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Breaking changes - The old version of string template in `type_params` of `string` type is no longer supported, - instead you should use `{{ pattern('pattern_expression') }}` + `{{ pattern('pattern_expression') }}` should be used instead. ## [0.0.1](https://github.com/tarantool/sdvg/compare/36d0930..0.0.1) - 2025-07-21 diff --git a/doc/ru/usage.md b/doc/ru/usage.md index 946edc5..73dfa50 100644 --- a/doc/ru/usage.md +++ b/doc/ru/usage.md @@ -162,8 +162,7 @@ open_ai: - `logical_type`: Логический тип строки. Поддерживаемые значения: `first_name`, `last_name`, `phone`, `text`. - `template`: Jinja-подобный шаблон для генерации строки. Позволяет использовать любые поля генерируемой модели и задавать паттерн строки с помощью функции `pattern`. Информация о фильтрах и функциях, доступных в шаблонных - строках описана [здесь](#фильтры-и-функции-используемые-в-шаблонных-строках). - Также поддерживается использование фильтров, таких как `upper` и `lower`. + строках описана в конце данного раздела. - `locale`: Локаль для генерации строк. Поддерживаемые значения: `ru`, `en`. По умолчанию `en`. - `without_large_letters`: Флаг, указывающий, исключать ли большие буквы из строки. - `without_small_letters`: Флаг, указывающий, исключать ли маленькие буквы из строки. @@ -244,7 +243,7 @@ open_ai: Подобна структуре для формата `http`, за исключением того, что поле `format_template` неизменяемое и всегда равняется значению по умолчанию. 
-#### Фильтры и функции, используемые в шаблонных строках +Фильтры и функции, используемые в шаблонных строках Шаблонные строки реализованы с использованием библиотеки `pongo2`, ознакомиться со всеми доступными фильтрами и функциями можно в репозитории [pongo2](https://github.com/flosch/pongo2). @@ -254,7 +253,7 @@ open_ai: - pattern: позволяет создать паттерн строки при помощи специальных символов. Символ `A` - любая большая буква, символ `a` - любая маленькая буква, символ `0` - любая цифра, символ `#` - любой символ, а остальные символы остаются как есть. - Функция доступна только в поле `template` типа данных `string`. + Функция доступна только в поле `template` типа данных `string`. #### Примеры конфигурации генерации данных diff --git a/internal/generator/common/utils.go b/internal/generator/common/utils.go index a58f0c4..49f2087 100644 --- a/internal/generator/common/utils.go +++ b/internal/generator/common/utils.go @@ -14,6 +14,7 @@ import ( "time" "github.com/google/uuid" + "github.com/otaviokr/topological-sort/toposort" "github.com/pkg/errors" "gopkg.in/yaml.v3" ) @@ -378,3 +379,37 @@ func ExtractValuesFromTemplate(template string) []string { return values } + +// TopologicalSort sorts the given items in topological order using the provided +// function to extract node name and dependencies. +// Returns the sorted node names, a flag indicating if any dependencies exist, +// and an error if a cycle is detected. 
+func TopologicalSort[T any](items []T, nodeFunc func(T) (string, []string)) ([]string, bool, error) { + var ( + graph = make(map[string][]string, len(items)) + sortedVertexes = make([]string, 0, len(items)) + hasDependencies bool + err error + ) + + for _, item := range items { + name, dependencies := nodeFunc(item) + if len(dependencies) > 0 { + hasDependencies = true + } + + sortedVertexes = append(sortedVertexes, name) + graph[name] = dependencies + } + + if !hasDependencies { + return sortedVertexes, false, nil + } + + sortedVertexes, err = toposort.ReverseTarjan(graph) + if err != nil { + return nil, false, errors.New(err.Error()) + } + + return sortedVertexes, hasDependencies, nil +} diff --git a/internal/generator/common/utils_test.go b/internal/generator/common/utils_test.go index 34dabf2..5c4a406 100644 --- a/internal/generator/common/utils_test.go +++ b/internal/generator/common/utils_test.go @@ -744,3 +744,77 @@ func TestExtractValuesFromTemplate(t *testing.T) { t.Run(tc.name, func(t *testing.T) { testFunc(t, tc) }) } } + +func TestTopologicalSort(t *testing.T) { + type node struct { + name string + deps []string + } + + type testCase struct { + name string + items []node + wantErr bool + wantDependencies bool + expected []string + } + + testCases := []testCase{ + { + name: "Empty items", + items: []node{}, + wantErr: false, + wantDependencies: false, + expected: []string{}, + }, + { + name: "Items with dependencies", + items: []node{ + {name: "1", deps: []string{"3"}}, + {name: "2", deps: []string{"4"}}, + {name: "3", deps: []string{"2"}}, + {name: "4", deps: []string{}}, + }, + wantErr: false, + wantDependencies: true, + expected: []string{"4", "2", "3", "1"}, + }, + { + name: "Items without dependencies", + items: []node{ + {name: "1", deps: []string{}}, + {name: "2", deps: []string{}}, + {name: "3", deps: []string{}}, + }, + wantErr: false, + wantDependencies: false, + expected: []string{"1", "2", "3"}, + }, + { + name: "Items with cycle 
dependencies", + items: []node{ + {name: "1", deps: []string{"2"}}, + {name: "2", deps: []string{"1"}}, + }, + wantErr: true, + wantDependencies: false, + expected: nil, + }, + } + + testFunc := func(t *testing.T, tc testCase) { + t.Helper() + + actual, hasDependencies, err := TopologicalSort(tc.items, func(node node) (string, []string) { + return node.name, node.deps + }) + + require.Equal(t, tc.wantErr, err != nil) + require.Equal(t, tc.wantDependencies, hasDependencies) + require.Equal(t, tc.expected, actual) + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { testFunc(t, tc) }) + } +} diff --git a/internal/generator/models/common.go b/internal/generator/models/common.go index 75142cc..635e84b 100644 --- a/internal/generator/models/common.go +++ b/internal/generator/models/common.go @@ -9,9 +9,7 @@ import ( "strings" "github.com/ilyakaznacheev/cleanenv" - "github.com/otaviokr/topological-sort/toposort" "github.com/pkg/errors" - "github.com/tarantool/sdvg/internal/generator/common" "gopkg.in/yaml.v3" ) @@ -121,25 +119,3 @@ func parseErrsToString(errs []error) string { return sb.String() } - -func TopologicalSort(columns []*Column) ([]string, error) { - graph := make(map[string][]string, len(columns)) - for _, c := range columns { - graph[c.Name] = make([]string, 0) - - for _, r := range c.Ranges { - if r.StringParams == nil || r.StringParams.Template == "" { - continue - } - - graph[c.Name] = common.ExtractValuesFromTemplate(r.StringParams.Template) - } - } - - sortedVertexes, err := toposort.ReverseTarjan(graph) - if err != nil { - return nil, errors.New(err.Error()) - } - - return sortedVertexes, nil -} diff --git a/internal/generator/models/common_test.go b/internal/generator/models/common_test.go deleted file mode 100644 index b6f2403..0000000 --- a/internal/generator/models/common_test.go +++ /dev/null @@ -1,117 +0,0 @@ -package models - -import ( - "testing" - - "github.com/stretchr/testify/require" -) - -func TestTopologicalSort(t 
*testing.T) { - type testCase struct { - name string - columns []*Column - wantErr bool - expected []string - } - - testCases := []testCase{ - { - name: "Empty columns", - columns: []*Column{}, - wantErr: false, - expected: []string{}, - }, - { - name: "Columns with dependencies", - columns: []*Column{ - { - Name: "1", - Type: "string", - Ranges: []*Params{ - { - StringParams: &ColumnStringParams{ - Template: "{{ 3 }}", - }, - }, - }, - }, - { - Name: "2", - Type: "string", - Ranges: []*Params{ - { - StringParams: &ColumnStringParams{ - Template: "{{ 4 }}", - }, - }, - }, - }, - { - Name: "3", - Type: "string", - Ranges: []*Params{ - { - StringParams: &ColumnStringParams{ - Template: "{{ 2 }}", - }, - }, - }, - }, - { - Name: "4", - Type: "string", - Ranges: []*Params{ - { - StringParams: &ColumnStringParams{ - Template: "", - }, - }, - }, - }, - }, - wantErr: false, - expected: []string{"4", "2", "3", "1"}, - }, - { - name: "Columns with cycle dependencies", - columns: []*Column{ - { - Name: "1", - Type: "string", - Ranges: []*Params{ - { - StringParams: &ColumnStringParams{ - Template: "{{ 2 }}", - }, - }, - }, - }, - { - Name: "2", - Type: "string", - Ranges: []*Params{ - { - StringParams: &ColumnStringParams{ - Template: "{{ 1 }}", - }, - }, - }, - }, - }, - wantErr: true, - expected: nil, - }, - } - - testFunc := func(t *testing.T, tc testCase) { - t.Helper() - - actual, err := TopologicalSort(tc.columns) - require.Equal(t, tc.wantErr, err != nil) - require.Equal(t, tc.expected, actual) - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { testFunc(t, tc) }) - } -} diff --git a/internal/generator/usecase/general/generator/generator.go b/internal/generator/usecase/general/generator/generator.go index e78da6f..ea52da4 100644 --- a/internal/generator/usecase/general/generator/generator.go +++ b/internal/generator/usecase/general/generator/generator.go @@ -149,7 +149,9 @@ func newRangeGenerator( distinctValuesCount = 
uint64(generatorValuesCount) } - distinctValuesCountByColumn[column.Name] += distinctValuesCount + if distinctValuesCountByColumn != nil { + distinctValuesCountByColumn[column.Name] += distinctValuesCount + } rangeOrdered := dataRange.Ordered orderSeed := dataColumnSeed @@ -201,7 +203,7 @@ type valueID struct { type BatchGenerator struct { numbers []valueID nextNumber int - valuer func(number valueID, generatedValues map[string]any) (any, error) + valuer func(number valueID, rowValues map[string]any) (any, error) } func (cg *ColumnGenerator) NewBatchGenerator(batchSize uint64) *BatchGenerator { @@ -227,14 +229,14 @@ func (cg *ColumnGenerator) NewBatchGenerator(batchSize uint64) *BatchGenerator { } } - valuer := func(id valueID, generatedValues map[string]any) (any, error) { + valuer := func(id valueID, rowValues map[string]any) (any, error) { vg := cg.rangeGenerators[id.generatorIndex] if vg.nullPercentage > 0 && fastRandomFloat(cg.dataColumnSeed+uint64(id.number)) < vg.nullPercentage { return nil, nil //nolint:nilnil } - return vg.generator.Value(id.number, generatedValues) + return vg.generator.Value(id.number, rowValues) } return &BatchGenerator{ @@ -244,8 +246,8 @@ func (cg *ColumnGenerator) NewBatchGenerator(batchSize uint64) *BatchGenerator { } // Value returns random value for described column. 
-func (g *BatchGenerator) Value(generatedValues map[string]any) (any, error) { - res, err := g.valuer(g.numbers[g.nextNumber], generatedValues) +func (g *BatchGenerator) Value(rowValues map[string]any) (any, error) { + res, err := g.valuer(g.numbers[g.nextNumber], rowValues) g.nextNumber++ g.nextNumber %= len(g.numbers) diff --git a/internal/generator/usecase/general/generator/value/interfaces.go b/internal/generator/usecase/general/generator/value/interfaces.go index 5094611..7200317 100644 --- a/internal/generator/usecase/general/generator/value/interfaces.go +++ b/internal/generator/usecase/general/generator/value/interfaces.go @@ -7,7 +7,7 @@ type Generator interface { // SetTotalCount method should remember count of rows to generate SetTotalCount(totalValuesCount uint64) error // Value method should return ordered unique value by number - Value(number float64, generatedValues map[string]any) (any, error) + Value(number float64, rowValues map[string]any) (any, error) // ValuesCount method should return the number of possible values to generate ValuesCount(distinctValuesCountByColumn map[string]uint64) float64 } diff --git a/internal/generator/usecase/general/generator/value/string.go b/internal/generator/usecase/general/generator/value/string.go index 69eb01d..0012b09 100644 --- a/internal/generator/usecase/general/generator/value/string.go +++ b/internal/generator/usecase/general/generator/value/string.go @@ -188,12 +188,16 @@ func (g *StringGenerator) calculateCompletions(length int) []int64 { } // templateString returns n-th string by template. 
-func (g *StringGenerator) templateString(number float64, generatedValues map[string]any) (string, error) { - generatedValues["pattern"] = func(pattern string) *pongo2.Value { +func (g *StringGenerator) templateString(number float64, rowValues map[string]any) (string, error) { + if rowValues == nil { + rowValues = make(map[string]any) + } + + rowValues["pattern"] = func(pattern string) *pongo2.Value { return pongo2.AsSafeValue(g.patternString(number, pattern)) } - val, err := g.template.Execute(generatedValues) + val, err := g.template.Execute(rowValues) if err != nil { return "", errors.New(err.Error()) } @@ -441,9 +445,9 @@ func (g *StringGenerator) simpleString(number float64) string { } // Value returns n-th string from range. -func (g *StringGenerator) Value(number float64, row map[string]any) (any, error) { +func (g *StringGenerator) Value(number float64, rowValues map[string]any) (any, error) { if g.Template != "" { - val, err := g.templateString(number, row) + val, err := g.templateString(number, rowValues) if err != nil { return nil, errors.WithMessage(err, "failed to template string") } diff --git a/internal/generator/usecase/general/task.go b/internal/generator/usecase/general/task.go index 6b71f1e..b3bf3a4 100644 --- a/internal/generator/usecase/general/task.go +++ b/internal/generator/usecase/general/task.go @@ -84,9 +84,7 @@ func newGenerators(cfg *models.GenerationConfig) (map[string]*generator.ColumnGe generators := make(map[string]*generator.ColumnGenerator) for modelName, model := range cfg.Models { - distinctValuesCountByColumn := make(map[string]uint64, len(model.Columns)) - - sortedColumns, err := models.TopologicalSort(model.Columns) + columnsTopologicalOrder, hasDependencies, err := columnsTopologicalSort(model.Columns) if err != nil { return nil, errors.WithMessagef(err, "failed to sorting columns by dependencies for model %q", modelName) } @@ -96,7 +94,12 @@ func newGenerators(cfg *models.GenerationConfig) (map[string]*generator.ColumnGe 
originIndexes[column.Name] = index } - for _, columnName := range sortedColumns { + var distinctValuesCountByColumn map[string]uint64 + if hasDependencies { + distinctValuesCountByColumn = make(map[string]uint64, len(model.Columns)) + } + + for _, columnName := range columnsTopologicalOrder { column := model.Columns[originIndexes[columnName]] dataModelName := modelName @@ -127,6 +130,23 @@ func newGenerators(cfg *models.GenerationConfig) (map[string]*generator.ColumnGe return generators, nil } +func columnsTopologicalSort(columns []*models.Column) ([]string, bool, error) { + return common.TopologicalSort( + columns, + func(c *models.Column) (string, []string) { + var deps []string + + for _, r := range c.Ranges { + if r.StringParams != nil && r.StringParams.Template != "" { + deps = append(deps, common.ExtractValuesFromTemplate(r.StringParams.Template)...) + } + } + + return c.Name, deps + }, + ) +} + // RunTask function generates unique values and then all values for selected model. func (t *Task) RunTask(ctx context.Context, callback func()) { started := make(chan struct{}) @@ -217,7 +237,7 @@ func (t *Task) generateAndSaveValues(ctx context.Context) error { continue } - columnsTopologicalOrder, err := models.TopologicalSort(model.Columns) + columnsTopologicalOrder, hasDependencies, err := columnsTopologicalSort(model.Columns) if err != nil { return errors.WithMessagef(err, "failed to sorting columns by dependencies for model %q", modelName) } @@ -249,7 +269,7 @@ func (t *Task) generateAndSaveValues(ctx context.Context) error { pool.Submit( ctx, outputSyncer.WorkerSyncer(), - modelName, columnsTopologicalOrder, originColumnsIndexes, + modelName, columnsTopologicalOrder, originColumnsIndexes, hasDependencies, generators, rowsCount, ) } @@ -285,7 +305,7 @@ func (t *Task) skipRows() { // generateAndSaveBatch function generate batch of values for selected column and send it to output. 
func (t *Task) generateAndSaveBatch( ctx context.Context, outputSync *common.WorkerSyncer, - modelName string, columnsTopologicalOrder []string, originColumnsIndexes map[string]int, + modelName string, columnsTopologicalOrder []string, originColumnsIndexes map[string]int, hasDependencies bool, generators []*generator.BatchGenerator, count uint64, ) error { defer outputSync.Done(ctx) @@ -297,9 +317,12 @@ func (t *Task) generateAndSaveBatch( } } - for i := range count { - generatedValues := make(map[string]any, len(originColumnsIndexes)) + var rowValues map[string]any + if hasDependencies { + rowValues = make(map[string]any, len(originColumnsIndexes)) + } + for i := range count { for _, columnName := range columnsTopologicalOrder { if common.CtxClosed(ctx) { return &common.ContextCancelError{} @@ -307,13 +330,16 @@ func (t *Task) generateAndSaveBatch( idx := originColumnsIndexes[columnName] - value, err := generators[idx].Value(generatedValues) + value, err := generators[idx].Value(rowValues) if err != nil { return errors.WithMessage(err, "failed to get or generate value") } - generatedValues[columnName] = value batch[i].Values[idx] = value + + if rowValues != nil { + rowValues[columnName] = value + } } } From 227c6fb6251a9623213da2e8c05df8940b013dad Mon Sep 17 00:00:00 2001 From: reversetm Date: Mon, 28 Jul 2025 18:29:17 +0300 Subject: [PATCH 05/11] Rebased and updated usage --- doc/en/usage.md | 23 ++++++++++++++++++++--- doc/ru/usage.md | 2 +- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/doc/en/usage.md b/doc/en/usage.md index 73c7d20..5af3bfc 100644 --- a/doc/en/usage.md +++ b/doc/en/usage.md @@ -154,8 +154,9 @@ Structure `models[*].columns[*].type_params` for data type `string`: - `min_length`: Minimum string length. Default is `1`. - `max_length`: Maximum string length. Default is `32`. - `logical_type`: Logical type of string. Supported values: `first_name`, `last_name`, `phone`, `text`. -- `template`: Template for string generation. 
Symbol `A` - any uppercase letter, symbol `a` - any lowercase letter, - symbol `0` - any digit, symbol `#` - any character. Other characters remain as-is. +- `template`: Jinja-like template for string generation. Allows you to use any fields of the generated model and + specify the pattern of the string using the `pattern` function. Information about the filters and functions + available in template strings is described at the end of this section. - `locale`: Locale for generated strings. Supported values: `ru`, `en`. Default is `en`. - `without_large_letters`: Flag indicating if uppercase letters should be excluded from the string. - `without_small_letters`: Flag indicating if lowercase letters should be excluded from the string. @@ -236,6 +237,18 @@ Structure of `output.params` for `tcs` format: Similar to the structure for the `http` format, except that the `format_template` field is immutable and always set to its default value. +Filters and functions used in template strings: + +Template strings are implemented using the `pongo2` library, you can read +all available filters and functions in the [pongo2](https://github.com/flosch/pongo2) repository. + +In addition, `1` function has been added: + +- pattern: allows you to create a string pattern using special characters. + The `A` symbol is any capital letter, the `a` symbol is any small letter, + symbol `0` is any digit, the `#` symbol is any character, and the other characters remain as they are. + The function is available only in the `template` field of the `string` data type. 
+ #### Examples of data generation configuration Example data model configuration: @@ -301,9 +314,13 @@ models: - name: passport type: string type_params: - template: AA 00 000 000 + template: "{{ pattern('AA 00 000 000') }}" distinct_percentage: 1 ordered: true + - name: email + type: string + type_params: + template: "{{ first_name_en | lower }}.{{ id }}@example.com" - name: rating type: float type_params: diff --git a/doc/ru/usage.md b/doc/ru/usage.md index 73dfa50..38e8a72 100644 --- a/doc/ru/usage.md +++ b/doc/ru/usage.md @@ -243,7 +243,7 @@ open_ai: Подобна структуре для формата `http`, за исключением того, что поле `format_template` неизменяемое и всегда равняется значению по умолчанию. -Фильтры и функции, используемые в шаблонных строках +Фильтры и функции, используемые в шаблонных строках: Шаблонные строки реализованы с использованием библиотеки `pongo2`, ознакомиться со всеми доступными фильтрами и функциями можно в репозитории [pongo2](https://github.com/flosch/pongo2). From 557a695879b57af5f6861bf2c71ff106880d32ba Mon Sep 17 00:00:00 2001 From: reversetm Date: Wed, 30 Jul 2025 00:20:07 +0300 Subject: [PATCH 06/11] Replaced pongo2 with templates from standard library. 
--- CHANGELOG.md | 5 +- doc/en/usage.md | 46 ++++++++++------ doc/ru/usage.md | 49 +++++++++++------ go.mod | 1 - go.sum | 4 -- internal/generator/common/utils.go | 8 +-- internal/generator/common/utils_test.go | 11 ++-- .../usecase/general/generator/value/string.go | 54 ++++++++++++++----- internal/generator/usecase/general/task.go | 6 ++- .../usecase/general/test/unit_test.go | 16 +++--- 10 files changed, 125 insertions(+), 75 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eb57aaf..a19ccf5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,12 +9,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- String templates replaced with jinja like +- Field `template` in the `string` data type now not only specifies the pattern, + but also allows you to use the values of any columns of the generated model. ### Breaking changes - The old version of string template in `type_params` of `string` type is no longer supported, - `{{ pattern('pattern_expression') }}` should be used instead. + `{{ "pattern_expression" | pattern }}` should be used instead. ## [0.0.1](https://github.com/tarantool/sdvg/compare/36d0930..0.0.1) - 2025-07-21 diff --git a/doc/en/usage.md b/doc/en/usage.md index 5af3bfc..51af42d 100644 --- a/doc/en/usage.md +++ b/doc/en/usage.md @@ -154,8 +154,8 @@ Structure `models[*].columns[*].type_params` for data type `string`: - `min_length`: Minimum string length. Default is `1`. - `max_length`: Maximum string length. Default is `32`. - `logical_type`: Logical type of string. Supported values: `first_name`, `last_name`, `phone`, `text`. -- `template`: Jinja-like template for string generation. Allows you to use any fields of the generated model and - specify the pattern of the string using the `pattern` function. Information about the filters and functions +- `template`: Template for string generation. 
Allows you to use the values of any columns of the generated model and + specify the pattern of the string using the `pattern` function. Information about the functions available in template strings is described at the end of this section. - `locale`: Locale for generated strings. Supported values: `ru`, `en`. Default is `en`. - `without_large_letters`: Flag indicating if uppercase letters should be excluded from the string. @@ -191,15 +191,12 @@ Structure `output.params` for format `http`: - `workers_count`: Number of threads for writing data. Default is `1`. *Experimental field.* - `headers`: HTTP request headers specified as a dictionary. Default is none. - `format_template`: Template-based format for sending data, configured using Golang templates. - Available for use in `format_template`: - - - fields: + There are 2 fields available for use in `format_template`: * `ModelName` - name of the model. * `Rows` - array of records, where each element is a dictionary representing a data row. Dictionary keys correspond to column names, and values correspond to data in those columns. - - functions: - * `len` - returns the length of the given element. - * `json` - converts the given element to a JSON string. + + You can read about the available functions and the use of template strings at the end of this section. Example value for the `format_template` field: @@ -237,17 +234,36 @@ Structure of `output.params` for `tcs` format: Similar to the structure for the `http` format, except that the `format_template` field is immutable and always set to its default value. -Filters and functions used in template strings: +Using Template Strings:: + +Template strings are implemented using the standard golang library, you can read about +all its features and available functions in this [documentation](https://pkg.go.dev/text/template). 
+ +Accessing Data: -Template strings are implemented using the `pongo2` library, you can read -all available filters and functions in the [pongo2](https://github.com/flosch/pongo2) repository. +In a template, data is accessed using `.`(the object or value passed to the template) +and the field name, for example: `{{ .var }}`. -In addition, `1` function has been added: +Function calls: -- pattern: allows you to create a string pattern using special characters. +- direct call: `{{ upper .name }}`. +- using pipe: `{{ .name | upper }}`. + +In addition to standard functions, the project provides `5` custom functions: + +- `pattern`: allows you to create a string pattern using special characters. The `A` symbol is any capital letter, the `a` symbol is any small letter, symbol `0` is any digit, the `#` symbol is any character, and the other characters remain as they are. The function is available only in the `template` field of the `string` data type. +- `upper`: converts the string to upper case. +- `lower`: converts the string to lower case. +- `len`: returns the length of the element. +- `json`: converts the element to a JSON string. + +Usage restrictions: + +The `pattern`, `lower`, and `upper` functions are available only in the `template` field of the `string` data type. +The `len` and `json` functions are available only in the `format_template` field of the output parameters. 
#### Examples of data generation configuration @@ -314,13 +330,13 @@ models: - name: passport type: string type_params: - template: "{{ pattern('AA 00 000 000') }}" + template: '{{ "AA 00 000 000" | pattern }}' distinct_percentage: 1 ordered: true - name: email type: string type_params: - template: "{{ first_name_en | lower }}.{{ id }}@example.com" + template: "{{ .first_name_en | lower }}.{{ .id }}@example.com" - name: rating type: float type_params: diff --git a/doc/ru/usage.md b/doc/ru/usage.md index 38e8a72..bc0735a 100644 --- a/doc/ru/usage.md +++ b/doc/ru/usage.md @@ -160,9 +160,9 @@ open_ai: - `min_length`: Минимальная длина строки. По умолчанию `1`. - `max_length`: Максимальная длина строки. По умолчанию `32`. - `logical_type`: Логический тип строки. Поддерживаемые значения: `first_name`, `last_name`, `phone`, `text`. -- `template`: Jinja-подобный шаблон для генерации строки. Позволяет использовать любые поля генерируемой модели и - задавать паттерн строки с помощью функции `pattern`. Информация о фильтрах и функциях, доступных в шаблонных - строках описана в конце данного раздела. +- `template`: Шаблон для генерации строки. Позволяет использовать значения любых столбов генерируемой модели и + задавать паттерн строки с помощью функции `pattern`. Информация о том, как использовать шаблонные строки, + описана в конце данного раздела. - `locale`: Локаль для генерации строк. Поддерживаемые значения: `ru`, `en`. По умолчанию `en`. - `without_large_letters`: Флаг, указывающий, исключать ли большие буквы из строки. - `without_small_letters`: Флаг, указывающий, исключать ли маленькие буквы из строки. @@ -197,15 +197,12 @@ open_ai: - `workers_count`: Количество потоков для записи данных. По умолчанию `1`. *Является экспериментальным полем.* - `headers`: Заголовки http запроса, указываются в формате словаря. По умолчанию отсутствуют. - `format_template`: Формат отправляемых данных, конфигурируемый с помощью шаблонов Golang. 
- Для использования в поле `format_template` доступны: - - - поля: + Для использования в `format_template` доступно 2 поля: * `ModelName` - имя модели. * `Rows` - массив записей, где каждый элемент является словарем, который представляет собой строку данных. Ключи словаря соответствуют названиям столбцов, а значения — данным в этих столбцах. - - функции: - * `len` - возвращает длину переданного элемента. - * `json` - преобразует переданный элемент в JSON строку. + + О доступных функциях и использовании шаблонных строк можно прочитать в конце данного раздела. Пример значения поля `format_template`: @@ -243,17 +240,35 @@ open_ai: Подобна структуре для формата `http`, за исключением того, что поле `format_template` неизменяемое и всегда равняется значению по умолчанию. -Фильтры и функции, используемые в шаблонных строках: +Использование шаблонных строк: + +Шаблонные строки реализованы с использованием стандартной библиотеки golang, ознакомиться +со всеми ее возможностями и доступными функциями можно данной [документации](https://pkg.go.dev/text/template). + +Доступ к данным: -Шаблонные строки реализованы с использованием библиотеки `pongo2`, ознакомиться -со всеми доступными фильтрами и функциями можно в репозитории [pongo2](https://github.com/flosch/pongo2). +Обращение к данным в шаблоне выполняется с помощью `.`(объект или значение, переданное шаблону) +и имени переменной, например, `{{ .var }}`. -Вдобавок к ним была добавлена 1 функция: +Вызовы функций: -- pattern: позволяет создать паттерн строки при помощи специальных символов. +- прямой вызов: `{{ upper .name }}`. +- с помощью pipe: `{{ .name | upper }}`. + +В проекте помимо стандартных функций доступны `5` пользовательских: + +- `pattern`: позволяет создать паттерн строки при помощи специальных символов. Символ `A` - любая большая буква, символ `a` - любая маленькая буква, символ `0` - любая цифра, символ `#` - любой символ, а остальные символы остаются как есть. 
- Функция доступна только в поле `template` типа данных `string`. +- `upper`: преобразует строку в верхний регистр. +- `lower`: преобразует строку в нижний регистр. +- `len`: возвращает длину элемента. +- `json`: преобразует элемент в JSON строку. + +Ограничения по использованию: + +Функции `pattern`, `lower`, и `upper` доступны только в поле `template` типа данных `string`. +Функции `len` и `json` доступны только в поле `format_template` параметров вывода. #### Примеры конфигурации генерации данных @@ -320,13 +335,13 @@ models: - name: passport type: string type_params: - template: "{{ pattern('AA 00 000 000') }}" + template: '{{ "AA 00 000 000" | pattern }}' distinct_percentage: 1 ordered: true - name: email type: string type_params: - template: "{{ first_name_en | lower }}.{{ id }}@example.com" + template: "{{ .first_name_en | lower }}.{{ .id }}@example.com" - name: rating type: float type_params: diff --git a/go.mod b/go.mod index ad9d088..1a3689c 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,6 @@ go 1.23.8 require ( github.com/apache/arrow-go/v18 v18.2.0 github.com/charmbracelet/huh/spinner v0.0.0-20250203114958-f07ae1af69ae - github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3 github.com/google/uuid v1.6.0 github.com/hashicorp/go-retryablehttp v0.7.7 github.com/ilyakaznacheev/cleanenv v1.5.0 diff --git a/go.sum b/go.sum index 8488c13..11b141e 100644 --- a/go.sum +++ b/go.sum @@ -45,8 +45,6 @@ github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6 github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM= github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= -github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3 h1:fmFk0Wt3bBxxwZnu48jqMdaOR/IZ4vdtJFuaFV8MpIE= -github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3/go.mod 
h1:bJWSKrZyQvfTnb2OudyUjurSG4/edverV7n82+K3JiM= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= @@ -133,7 +131,6 @@ github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELU github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo= github.com/muesli/termenv v0.15.3-0.20240618155329-98d742f6907a h1:2MaM6YC3mGu54x+RKAA6JiFFHlHDY1UbkxqppT7wYOg= github.com/muesli/termenv v0.15.3-0.20240618155329-98d742f6907a/go.mod h1:hxSnBBYLK21Vtq/PHd0S2FYCxBXzBua8ov5s1RobyRQ= -github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= @@ -271,7 +268,6 @@ google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2 google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM= google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= diff --git a/internal/generator/common/utils.go b/internal/generator/common/utils.go index 49f2087..6331038 100644 
--- a/internal/generator/common/utils.go +++ b/internal/generator/common/utils.go @@ -361,7 +361,7 @@ func CtxClosed(ctx context.Context) bool { } func ExtractValuesFromTemplate(template string) []string { - re := regexp.MustCompile(`{{\s*([^\s|(){}]+)[^}]*}}`) + re := regexp.MustCompile(`{{.*?\.([^\s|}]+).*?}}`) matches := re.FindAllStringSubmatch(template, -1) values := make([]string, 0, len(matches)) @@ -387,18 +387,18 @@ func ExtractValuesFromTemplate(template string) []string { func TopologicalSort[T any](items []T, nodeFunc func(T) (string, []string)) ([]string, bool, error) { var ( graph = make(map[string][]string, len(items)) - sortedVertexes = make([]string, 0, len(items)) + sortedVertexes = make([]string, len(items)) hasDependencies bool err error ) - for _, item := range items { + for i, item := range items { name, dependencies := nodeFunc(item) if len(dependencies) > 0 { hasDependencies = true } - sortedVertexes = append(sortedVertexes, name) + sortedVertexes[i] = name graph[name] = dependencies } diff --git a/internal/generator/common/utils_test.go b/internal/generator/common/utils_test.go index 5c4a406..eb8e608 100644 --- a/internal/generator/common/utils_test.go +++ b/internal/generator/common/utils_test.go @@ -713,18 +713,13 @@ func TestExtractValuesFromTemplate(t *testing.T) { }, { name: "Valid template", - template: "{{ foo }}.{{boo}}", + template: "{{ .foo }}.{{.boo}}", expected: []string{"foo", "boo"}, }, - { - name: "Template with filters", - template: "{{ foo | upper | lower }}", - expected: []string{"foo"}, - }, { name: "Template with functions", - template: "{{ upper('foo') | lower }}@{{ boo }}", - expected: []string{"boo"}, + template: "{{ upper .foo | lower }}@{{ .boo }}", + expected: []string{"foo", "boo"}, }, { name: "Invalid template", diff --git a/internal/generator/usecase/general/generator/value/string.go b/internal/generator/usecase/general/generator/value/string.go index 0012b09..713627f 100644 --- 
a/internal/generator/usecase/general/generator/value/string.go +++ b/internal/generator/usecase/general/generator/value/string.go @@ -1,13 +1,16 @@ package value import ( + "bytes" + "fmt" "math" "math/big" "regexp" "slices" "strings" + "sync" + "text/template" - "github.com/flosch/pongo2" "github.com/pkg/errors" "github.com/tarantool/sdvg/internal/generator/common" "github.com/tarantool/sdvg/internal/generator/models" @@ -17,7 +20,8 @@ import ( ) var ( - rePatternVal = regexp.MustCompile(`pattern\((?:'([^']*)'|"([^"]*)")\)`) + rePatternFunc = regexp.MustCompile(`{{\s*pattern\(\s*(?:'([^']*)'|"([^"]*)")\s*\)\s*}}`) + rePatternFilter = regexp.MustCompile(`{{\s*(?:pattern\s+"([^"]+)"|"([^"]+)"\s*\|\s*pattern)\s*}}`) ) // Verify interface compliance in compile time. @@ -27,8 +31,9 @@ var _ Generator = (*StringGenerator)(nil) type StringGenerator struct { *models.ColumnStringParams totalValuesCount uint64 + template *template.Template + bufPool *sync.Pool localeModule locale.LocalModule - template *pongo2.Template charset []rune countByPrefix []float64 sumByPrefix []float64 @@ -38,12 +43,25 @@ type StringGenerator struct { //nolint:cyclop func (g *StringGenerator) Prepare() error { if g.Template != "" { - template, err := pongo2.FromString(g.Template) + tmpl, err := template.New("template"). + Funcs(template.FuncMap{ + "upper": strings.ToUpper, + "lower": strings.ToLower, + "pattern": func(s string) string { + return fmt.Sprintf("{{pattern('%s')}}", s) + }, + }). + Parse(g.Template) if err != nil { return errors.Errorf("failed to parse template: %s", err.Error()) } - g.template = template + g.template = tmpl + g.bufPool = &sync.Pool{ + New: func() any { + return new(bytes.Buffer) + }, + } } switch g.Locale { @@ -188,20 +206,28 @@ func (g *StringGenerator) calculateCompletions(length int) []int64 { } // templateString returns n-th string by template. 
+// +//nolint:forcetypeassert func (g *StringGenerator) templateString(number float64, rowValues map[string]any) (string, error) { - if rowValues == nil { - rowValues = make(map[string]any) - } - - rowValues["pattern"] = func(pattern string) *pongo2.Value { - return pongo2.AsSafeValue(g.patternString(number, pattern)) - } + buf := g.bufPool.Get().(*bytes.Buffer) + buf.Reset() - val, err := g.template.Execute(rowValues) + err := g.template.Execute(buf, rowValues) if err != nil { + g.bufPool.Put(buf) + return "", errors.New(err.Error()) } + val := buf.String() + g.bufPool.Put(buf) + + val = rePatternFunc.ReplaceAllStringFunc(val, func(m string) string { + sub := rePatternFunc.FindStringSubmatch(m) + + return g.patternString(number, sub[1]) + }) + return val, nil } @@ -514,7 +540,7 @@ func (g *StringGenerator) ValuesCount(distinctValuesCountByColumn map[string]uin func (g *StringGenerator) templateCardinality(distinctValuesCountByColumn map[string]uint64) float64 { total := 1.0 - patternValMatches := rePatternVal.FindAllStringSubmatch(g.Template, -1) + patternValMatches := rePatternFilter.FindAllStringSubmatch(g.Template, -1) for _, match := range patternValMatches { pattern := match[1] if pattern == "" { diff --git a/internal/generator/usecase/general/task.go b/internal/generator/usecase/general/task.go index b3bf3a4..026dfc6 100644 --- a/internal/generator/usecase/general/task.go +++ b/internal/generator/usecase/general/task.go @@ -269,7 +269,8 @@ func (t *Task) generateAndSaveValues(ctx context.Context) error { pool.Submit( ctx, outputSyncer.WorkerSyncer(), - modelName, columnsTopologicalOrder, originColumnsIndexes, hasDependencies, + modelName, hasDependencies, + columnsTopologicalOrder, originColumnsIndexes, generators, rowsCount, ) } @@ -305,7 +306,8 @@ func (t *Task) skipRows() { // generateAndSaveBatch function generate batch of values for selected column and send it to output. 
func (t *Task) generateAndSaveBatch( ctx context.Context, outputSync *common.WorkerSyncer, - modelName string, columnsTopologicalOrder []string, originColumnsIndexes map[string]int, hasDependencies bool, + modelName string, hasDependencies bool, + columnsTopologicalOrder []string, originColumnsIndexes map[string]int, generators []*generator.BatchGenerator, count uint64, ) error { defer outputSync.Done(ctx) diff --git a/internal/generator/usecase/general/test/unit_test.go b/internal/generator/usecase/general/test/unit_test.go index cae5433..836f614 100644 --- a/internal/generator/usecase/general/test/unit_test.go +++ b/internal/generator/usecase/general/test/unit_test.go @@ -531,9 +531,9 @@ func TestString(t *testing.T) { {&models.ColumnStringParams{LogicalType: models.LastNameType, MinLength: 4, MaxLength: 7}, 4, 7}, {&models.ColumnStringParams{LogicalType: models.PhoneType, MinLength: 10, MaxLength: 10}, 10, 10}, {&models.ColumnStringParams{MinLength: 100, MaxLength: 100}, 100, 100}, - {&models.ColumnStringParams{Template: "{{ pattern('AAaa00##') }}", Locale: "en"}, 8, 8}, - {&models.ColumnStringParams{Template: "{{ pattern('AAaa00##') }}", Locale: "ru"}, 8, 8}, - {&models.ColumnStringParams{Template: "{{ pattern('0123456789012345678901234567890123456789') }}"}, 40, 40}, + {&models.ColumnStringParams{Template: `{{ pattern "AAaa00##" }}`, Locale: "en"}, 8, 8}, + {&models.ColumnStringParams{Template: `{{ pattern "AAaa00##" }}`, Locale: "ru"}, 8, 8}, + {&models.ColumnStringParams{Template: `{{ pattern "0123456789012345678901234567890123456789" }}`}, 40, 40}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 3, MaxLength: 5}, 3, 5}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 254, MaxLength: 256}, 254, 256}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 510, MaxLength: 512}, 510, 512}, @@ -656,7 +656,7 @@ func TestString(t *testing.T) { { &models.ColumnStringParams{ Locale: "en", - Template: "{{ field 
}}", + Template: "{{ .field }}", }, map[string]uint64{ "field": 11, @@ -666,7 +666,7 @@ func TestString(t *testing.T) { { &models.ColumnStringParams{ Locale: "en", - Template: "{{ pattern('A00') }}", + Template: `{{ pattern "A00" }}`, }, nil, 2600, @@ -674,7 +674,7 @@ func TestString(t *testing.T) { { &models.ColumnStringParams{ Locale: "ru", - Template: "{{ field }}{{ pattern('a0#') }}", + Template: `{{ .field }}{{ pattern "a0#" }}`, }, map[string]uint64{ "field": 10, @@ -707,7 +707,7 @@ func TestString(t *testing.T) { Ranges: []*models.Params{ { TypeParams: &models.ColumnStringParams{ - Template: "{{ id }}.{{ pattern('00') }}@example.com", + Template: `{{ .id }}.{{ pattern "00" }}@example.com`, }, DistinctPercentage: 1, }, @@ -928,7 +928,7 @@ func TestIdempotence(t *testing.T) { Name: "passport", Type: "string", Ranges: []*models.Params{{TypeParams: &models.ColumnStringParams{ - Template: "{{ pattern('AA 00 000 000') }}", + Template: `{{ pattern "AA 00 000 000" }}`, }, NullPercentage: 0.5}}, }, From 2d56641a72ce2837cbc9a5e2895a8f6465e7fc58 Mon Sep 17 00:00:00 2001 From: reversetm Date: Wed, 30 Jul 2025 17:06:58 +0300 Subject: [PATCH 07/11] Separated the pattern and template logic, improved performance --- CHANGELOG.md | 8 +- doc/en/usage.md | 18 +-- doc/ru/usage.md | 17 +- internal/generator/common/utils.go | 9 +- internal/generator/models/generator_model.go | 15 ++ .../usecase/general/generator/generator.go | 12 +- .../general/generator/value/datetime.go | 2 +- .../usecase/general/generator/value/enum.go | 2 +- .../usecase/general/generator/value/float.go | 2 +- .../general/generator/value/integer.go | 2 +- .../general/generator/value/interfaces.go | 2 +- .../usecase/general/generator/value/string.go | 103 ++++-------- .../usecase/general/generator/value/uuid.go | 2 +- internal/generator/usecase/general/task.go | 73 ++++----- .../usecase/general/test/unit_test.go | 149 ++---------------- 15 files changed, 128 insertions(+), 288 deletions(-) diff --git 
a/CHANGELOG.md b/CHANGELOG.md index a19ccf5..0aaa1c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,13 +9,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- Field `template` in the `string` data type now not only specifies the pattern, - but also allows you to use the values of any columns of the generated model. +- The `template` field in the `string` data type is now used to generate template strings + with the ability to use the values of any columns of the generated model. ### Breaking changes -- The old version of string template in `type_params` of `string` type is no longer supported, - `{{ "pattern_expression" | pattern }}` should be used instead. +- Using `template` field to specify a string pattern like `Aa0#` is no longer supported, + `pattern` should be used instead. ## [0.0.1](https://github.com/tarantool/sdvg/compare/36d0930..0.0.1) - 2025-07-21 diff --git a/doc/en/usage.md b/doc/en/usage.md index 51af42d..2f7cbf3 100644 --- a/doc/en/usage.md +++ b/doc/en/usage.md @@ -154,9 +154,11 @@ Structure `models[*].columns[*].type_params` for data type `string`: - `min_length`: Minimum string length. Default is `1`. - `max_length`: Maximum string length. Default is `32`. - `logical_type`: Logical type of string. Supported values: `first_name`, `last_name`, `phone`, `text`. -- `template`: Template for string generation. Allows you to use the values of any columns of the generated model and - specify the pattern of the string using the `pattern` function. Information about the functions - available in template strings is described at the end of this section. +- `template`: Template for string generation. Allows you to use the values of any columns of the generated model. + Information about the functions available in template strings is described at the end of this section. + Cannot coexist with `ordered`, `distinct_percentage` or `distinct_count`. +- `pattern`: Pattern for string generation. 
The `A` symbol is any capital letter, the `a` symbol is any small letter, + the `0` symbol is any digit, the `#` symbol is any character, and the other characters remain as they are. - `locale`: Locale for generated strings. Supported values: `ru`, `en`. Default is `en`. - `without_large_letters`: Flag indicating if uppercase letters should be excluded from the string. - `without_small_letters`: Flag indicating if lowercase letters should be excluded from the string. @@ -249,12 +251,8 @@ Function calls: - direct call: `{{ upper .name }}`. - using pipe: `{{ .name | upper }}`. -In addition to standard functions, the project provides `5` custom functions: +In addition to standard functions, the project provides `4` custom functions: -- `pattern`: allows you to create a string pattern using special characters. - The `A` symbol is any capital letter, the `a` symbol is any small letter, - symbol `0` is any digit, the `#` symbol is any character, and the other characters remain as they are. - The function is available only in the `template` field of the `string` data type. - `upper`: converts the string to upper case. - `lower`: converts the string to lower case. - `len`: returns the length of the element. @@ -262,7 +260,7 @@ In addition to standard functions, the project provides `5` custom functions: Usage restrictions: -The `pattern`, `lower`, and `upper` functions are available only in the `template` field of the `string` data type. +The `lower` and `upper` functions are available only in the `template` field of the `string` data type. The `len` and `json` functions are available only in the `format_template` field of the output parameters. 
#### Examples of data generation configuration @@ -330,7 +328,7 @@ models: - name: passport type: string type_params: - template: '{{ "AA 00 000 000" | pattern }}' + pattern: AA 00 000 000 distinct_percentage: 1 ordered: true - name: email diff --git a/doc/ru/usage.md b/doc/ru/usage.md index bc0735a..706603f 100644 --- a/doc/ru/usage.md +++ b/doc/ru/usage.md @@ -160,9 +160,11 @@ open_ai: - `min_length`: Минимальная длина строки. По умолчанию `1`. - `max_length`: Максимальная длина строки. По умолчанию `32`. - `logical_type`: Логический тип строки. Поддерживаемые значения: `first_name`, `last_name`, `phone`, `text`. -- `template`: Шаблон для генерации строки. Позволяет использовать значения любых столбов генерируемой модели и - задавать паттерн строки с помощью функции `pattern`. Информация о том, как использовать шаблонные строки, - описана в конце данного раздела. +- `template`: Шаблон для генерации строки. Позволяет использовать значения любых столбцов генерируемой модели. + Информация о том, как использовать шаблонные строки, описана в конце данного раздела. + Не работает совместно с `ordered`, `distinct_percentage` или `distinct_count`. +- `pattern`: Паттерн для генерации строки. Символ `A` - любая большая буква, символ `a` - любая маленькая буква, + символ `0` - любая цифра, символ `#` - любой символ, а остальные символы остаются как есть. - `locale`: Локаль для генерации строк. Поддерживаемые значения: `ru`, `en`. По умолчанию `en`. - `without_large_letters`: Флаг, указывающий, исключать ли большие буквы из строки. - `without_small_letters`: Флаг, указывающий, исключать ли маленькие буквы из строки. @@ -255,11 +257,8 @@ open_ai: - прямой вызов: `{{ upper .name }}`. - с помощью pipe: `{{ .name | upper }}`. -В проекте помимо стандартных функций доступны `5` пользовательских: +В проекте помимо стандартных функций доступны `4` пользовательских: -- `pattern`: позволяет создать паттерн строки при помощи специальных символов. 
- Символ `A` - любая большая буква, символ `a` - любая маленькая буква, - символ `0` - любая цифра, символ `#` - любой символ, а остальные символы остаются как есть. - `upper`: преобразует строку в верхний регистр. - `lower`: преобразует строку в нижний регистр. - `len`: возвращает длину элемента. @@ -267,7 +266,7 @@ open_ai: Ограничения по использованию: -Функции `pattern`, `lower`, и `upper` доступны только в поле `template` типа данных `string`. +Функции `lower`, и `upper` доступны только в поле `template` типа данных `string`. Функции `len` и `json` доступны только в поле `format_template` параметров вывода. #### Примеры конфигурации генерации данных @@ -335,7 +334,7 @@ models: - name: passport type: string type_params: - template: '{{ "AA 00 000 000" | pattern }}' + pattern: AA 00 000 000 distinct_percentage: 1 ordered: true - name: email diff --git a/internal/generator/common/utils.go b/internal/generator/common/utils.go index 6331038..523b6d4 100644 --- a/internal/generator/common/utils.go +++ b/internal/generator/common/utils.go @@ -367,14 +367,7 @@ func ExtractValuesFromTemplate(template string) []string { values := make([]string, 0, len(matches)) for _, match := range matches { - expr := match[0] - val := match[1] - - if strings.Contains(expr, "(") && strings.Contains(expr, ")") { - continue - } - - values = append(values, val) + values = append(values, match[1]) } return values diff --git a/internal/generator/models/generator_model.go b/internal/generator/models/generator_model.go index e3c451f..5f88de9 100644 --- a/internal/generator/models/generator_model.go +++ b/internal/generator/models/generator_model.go @@ -426,6 +426,16 @@ func (p *Params) Validate() []error { errs = append(errs, datetimeParamsErrs...) 
} + if p.StringParams != nil && p.StringParams.Template != "" { + if common.Any( + p.Ordered, + p.DistinctPercentage != 0, + p.DistinctCount != 0, + ) { + errs = append(errs, errors.New("forbidden to use string template with distinct params or ordered")) + } + } + // must be called only after parsing, filling defaults and validation of TypeParams. if p.Values != nil { if err := p.PostProcess(); err != nil { @@ -674,6 +684,7 @@ type ColumnStringParams struct { Locale string `backup:"true" json:"locale" yaml:"locale"` LogicalType string `backup:"true" json:"logical_type" yaml:"logical_type"` Template string `backup:"true" json:"template" yaml:"template"` + Pattern string `backup:"true" json:"pattern" yaml:"pattern"` WithoutLargeLetters bool `backup:"true" json:"without_large_letters" yaml:"without_large_letters"` WithoutSmallLetters bool `backup:"true" json:"without_small_letters" yaml:"without_small_letters"` WithoutNumbers bool `backup:"true" json:"without_numbers" yaml:"without_numbers"` @@ -703,6 +714,10 @@ func (p *ColumnStringParams) FillDefaults() { func (p *ColumnStringParams) Validate() []error { var errs []error + if p.Template != "" && p.Pattern != "" { + errs = append(errs, errors.Errorf("forbidden to use template and pattern at the same time")) + } + if p.MinLength > p.MaxLength { errs = append(errs, errors.Errorf( "min length (%v) should be less than or equal to max length (%v)", diff --git a/internal/generator/usecase/general/generator/generator.go b/internal/generator/usecase/general/generator/generator.go index ea52da4..286a259 100644 --- a/internal/generator/usecase/general/generator/generator.go +++ b/internal/generator/usecase/general/generator/generator.go @@ -28,7 +28,7 @@ type ColumnGenerator struct { } func NewColumnGenerator( - baseSeed uint64, distinctValuesCountByColumn map[string]uint64, + baseSeed uint64, modelName string, model *models.Model, column *models.Column, dataModelName string, dataModel *models.Model, dataColumn *models.Column, 
) (*ColumnGenerator, error) { @@ -54,7 +54,7 @@ func NewColumnGenerator( rangeRowsCount := uint64(math.Ceil(float64(rowsCount) * dataRange.RangePercentage)) gen, err := newRangeGenerator( - column, columnSeed, distinctValuesCountByColumn, + column, columnSeed, dataModel, dataColumn, dataColumnSeed, dataRange, rangeRowsOffset, rangeRowsCount, ) @@ -93,7 +93,7 @@ func (cg *ColumnGenerator) SkipRows(count uint64) { //nolint:cyclop func newRangeGenerator( - column *models.Column, columnSeed uint64, distinctValuesCountByColumn map[string]uint64, + column *models.Column, columnSeed uint64, dataModel *models.Model, dataColumn *models.Column, dataColumnSeed uint64, dataRange *models.Params, rangeRowsOffset, rangeRowsCount uint64, ) (*rangeGenerator, error) { @@ -139,7 +139,7 @@ func newRangeGenerator( distinctValuesCount = dataRange.DistinctCount } - generatorValuesCount := valueGenerator.ValuesCount(distinctValuesCountByColumn) + generatorValuesCount := valueGenerator.ValuesCount() if float64(distinctValuesCount) > generatorValuesCount { if dataRange.DistinctPercentage != 0 || dataRange.DistinctCount != 0 { @@ -149,10 +149,6 @@ func newRangeGenerator( distinctValuesCount = uint64(generatorValuesCount) } - if distinctValuesCountByColumn != nil { - distinctValuesCountByColumn[column.Name] += distinctValuesCount - } - rangeOrdered := dataRange.Ordered orderSeed := dataColumnSeed diff --git a/internal/generator/usecase/general/generator/value/datetime.go b/internal/generator/usecase/general/generator/value/datetime.go index 8970354..600e89b 100644 --- a/internal/generator/usecase/general/generator/value/datetime.go +++ b/internal/generator/usecase/general/generator/value/datetime.go @@ -49,7 +49,7 @@ func (g *DateTimeGenerator) Value(number float64, _ map[string]any) (any, error) return value, nil } -func (g *DateTimeGenerator) ValuesCount(_ map[string]uint64) float64 { +func (g *DateTimeGenerator) ValuesCount() float64 { fromSec := g.From.Unix() toSec := g.To.Unix() diff 
--git a/internal/generator/usecase/general/generator/value/enum.go b/internal/generator/usecase/general/generator/value/enum.go index e989b68..18d4413 100644 --- a/internal/generator/usecase/general/generator/value/enum.go +++ b/internal/generator/usecase/general/generator/value/enum.go @@ -37,6 +37,6 @@ func (g *EnumGenerator) Value(number float64, _ map[string]any) (any, error) { return g.Values[idx], nil } -func (g *EnumGenerator) ValuesCount(_ map[string]uint64) float64 { +func (g *EnumGenerator) ValuesCount() float64 { return float64(len(g.Values)) } diff --git a/internal/generator/usecase/general/generator/value/float.go b/internal/generator/usecase/general/generator/value/float.go index 79ba0c5..c1903a4 100644 --- a/internal/generator/usecase/general/generator/value/float.go +++ b/internal/generator/usecase/general/generator/value/float.go @@ -36,6 +36,6 @@ func (g *FloatGenerator) Value(number float64, _ map[string]any) (any, error) { return value, nil } -func (g *FloatGenerator) ValuesCount(_ map[string]uint64) float64 { +func (g *FloatGenerator) ValuesCount() float64 { return math.Inf(1) } diff --git a/internal/generator/usecase/general/generator/value/integer.go b/internal/generator/usecase/general/generator/value/integer.go index 5aefbc1..c83b1ba 100644 --- a/internal/generator/usecase/general/generator/value/integer.go +++ b/internal/generator/usecase/general/generator/value/integer.go @@ -37,6 +37,6 @@ func (g *IntegerGenerator) Value(number float64, _ map[string]any) (any, error) } } -func (g *IntegerGenerator) ValuesCount(_ map[string]uint64) float64 { +func (g *IntegerGenerator) ValuesCount() float64 { return float64(uint64(g.To-g.From)) + 1 } diff --git a/internal/generator/usecase/general/generator/value/interfaces.go b/internal/generator/usecase/general/generator/value/interfaces.go index 7200317..b669412 100644 --- a/internal/generator/usecase/general/generator/value/interfaces.go +++ 
b/internal/generator/usecase/general/generator/value/interfaces.go @@ -9,5 +9,5 @@ type Generator interface { // Value method should return ordered unique value by number Value(number float64, rowValues map[string]any) (any, error) // ValuesCount method should return the number of possible values to generate - ValuesCount(distinctValuesCountByColumn map[string]uint64) float64 + ValuesCount() float64 } diff --git a/internal/generator/usecase/general/generator/value/string.go b/internal/generator/usecase/general/generator/value/string.go index 713627f..280b6b0 100644 --- a/internal/generator/usecase/general/generator/value/string.go +++ b/internal/generator/usecase/general/generator/value/string.go @@ -2,28 +2,20 @@ package value import ( "bytes" - "fmt" "math" "math/big" - "regexp" "slices" "strings" "sync" "text/template" "github.com/pkg/errors" - "github.com/tarantool/sdvg/internal/generator/common" "github.com/tarantool/sdvg/internal/generator/models" "github.com/tarantool/sdvg/internal/generator/usecase/general/locale" "github.com/tarantool/sdvg/internal/generator/usecase/general/locale/en" "github.com/tarantool/sdvg/internal/generator/usecase/general/locale/ru" ) -var ( - rePatternFunc = regexp.MustCompile(`{{\s*pattern\(\s*(?:'([^']*)'|"([^"]*)")\s*\)\s*}}`) - rePatternFilter = regexp.MustCompile(`{{\s*(?:pattern\s+"([^"]+)"|"([^"]+)"\s*\|\s*pattern)\s*}}`) -) - // Verify interface compliance in compile time. var _ Generator = (*StringGenerator)(nil) @@ -44,12 +36,10 @@ type StringGenerator struct { func (g *StringGenerator) Prepare() error { if g.Template != "" { tmpl, err := template.New("template"). + Option("missingkey=error"). Funcs(template.FuncMap{ "upper": strings.ToUpper, "lower": strings.ToLower, - "pattern": func(s string) string { - return fmt.Sprintf("{{pattern('%s')}}", s) - }, }). 
Parse(g.Template) if err != nil { @@ -208,7 +198,7 @@ func (g *StringGenerator) calculateCompletions(length int) []int64 { // templateString returns n-th string by template. // //nolint:forcetypeassert -func (g *StringGenerator) templateString(number float64, rowValues map[string]any) (string, error) { +func (g *StringGenerator) templateString(rowValues map[string]any) (string, error) { buf := g.bufPool.Get().(*bytes.Buffer) buf.Reset() @@ -222,18 +212,12 @@ func (g *StringGenerator) templateString(number float64, rowValues map[string]an val := buf.String() g.bufPool.Put(buf) - val = rePatternFunc.ReplaceAllStringFunc(val, func(m string) string { - sub := rePatternFunc.FindStringSubmatch(m) - - return g.patternString(number, sub[1]) - }) - return val, nil } // patternString returns n-th string by pattern. -func (g *StringGenerator) patternString(number float64, pattern string) string { - val := []rune(pattern) +func (g *StringGenerator) patternString(number float64) string { + val := []rune(g.Pattern) index := number / float64(g.totalValuesCount) for i := range val { @@ -473,14 +457,18 @@ func (g *StringGenerator) simpleString(number float64) string { // Value returns n-th string from range. 
func (g *StringGenerator) Value(number float64, rowValues map[string]any) (any, error) { if g.Template != "" { - val, err := g.templateString(number, rowValues) + val, err := g.templateString(rowValues) if err != nil { - return nil, errors.WithMessage(err, "failed to template string") + return nil, errors.WithMessage(err, "failed to render template string") } return val, nil } + if g.Pattern != "" { + return g.patternString(number), nil + } + switch g.LogicalType { case models.FirstNameType: return g.firstName(number), nil @@ -496,9 +484,31 @@ func (g *StringGenerator) Value(number float64, rowValues map[string]any) (any, } //nolint:cyclop -func (g *StringGenerator) ValuesCount(distinctValuesCountByColumn map[string]uint64) float64 { +func (g *StringGenerator) ValuesCount() float64 { if g.Template != "" { - return g.templateCardinality(distinctValuesCountByColumn) + return 1.0 + } + + if g.Pattern != "" { + total := 1.0 + + if count := strings.Count(g.Pattern, "A"); count > 0 { + total *= math.Pow(float64(len(g.localeModule.LargeLetters())), float64(count)) + } + + if count := strings.Count(g.Pattern, "a"); count > 0 { + total *= math.Pow(float64(len(g.localeModule.SmallLetters())), float64(count)) + } + + if count := strings.Count(g.Pattern, "0"); count > 0 { + total *= math.Pow(float64(len(locale.Numbers)), float64(count)) + } + + if count := strings.Count(g.Pattern, "#"); count > 0 { + total *= math.Pow(float64(len(locale.SpecialChars)), float64(count)) + } + + return total } switch g.LogicalType { @@ -536,48 +546,3 @@ func (g *StringGenerator) ValuesCount(distinctValuesCountByColumn map[string]uin return totalCount } - -func (g *StringGenerator) templateCardinality(distinctValuesCountByColumn map[string]uint64) float64 { - total := 1.0 - - patternValMatches := rePatternFilter.FindAllStringSubmatch(g.Template, -1) - for _, match := range patternValMatches { - pattern := match[1] - if pattern == "" { - pattern = match[2] - } - - total *= 
g.patternCardinality(pattern) - } - - columns := common.ExtractValuesFromTemplate(g.Template) - for _, column := range columns { - if count, ok := distinctValuesCountByColumn[column]; ok && count > 0 { - total *= float64(count) - } - } - - return total -} - -func (g *StringGenerator) patternCardinality(pattern string) float64 { - total := 1.0 - - if count := strings.Count(pattern, "A"); count > 0 { - total *= math.Pow(float64(len(g.localeModule.LargeLetters())), float64(count)) - } - - if count := strings.Count(pattern, "a"); count > 0 { - total *= math.Pow(float64(len(g.localeModule.SmallLetters())), float64(count)) - } - - if count := strings.Count(pattern, "0"); count > 0 { - total *= math.Pow(float64(len(locale.Numbers)), float64(count)) - } - - if count := strings.Count(pattern, "#"); count > 0 { - total *= math.Pow(float64(len(locale.SpecialChars)), float64(count)) - } - - return total -} diff --git a/internal/generator/usecase/general/generator/value/uuid.go b/internal/generator/usecase/general/generator/value/uuid.go index 914e503..ca32580 100644 --- a/internal/generator/usecase/general/generator/value/uuid.go +++ b/internal/generator/usecase/general/generator/value/uuid.go @@ -43,6 +43,6 @@ func (g *UUIDGenerator) Value(number float64, _ map[string]any) (any, error) { return res, nil } -func (g *UUIDGenerator) ValuesCount(_ map[string]uint64) float64 { +func (g *UUIDGenerator) ValuesCount() float64 { return float64(1<<(128-10) - 1) //nolint:mnd } diff --git a/internal/generator/usecase/general/task.go b/internal/generator/usecase/general/task.go index 026dfc6..39f88cb 100644 --- a/internal/generator/usecase/general/task.go +++ b/internal/generator/usecase/general/task.go @@ -84,24 +84,7 @@ func newGenerators(cfg *models.GenerationConfig) (map[string]*generator.ColumnGe generators := make(map[string]*generator.ColumnGenerator) for modelName, model := range cfg.Models { - columnsTopologicalOrder, hasDependencies, err := columnsTopologicalSort(model.Columns) 
- if err != nil { - return nil, errors.WithMessagef(err, "failed to sorting columns by dependencies for model %q", modelName) - } - - originIndexes := make(map[string]int, len(model.Columns)) - for index, column := range model.Columns { - originIndexes[column.Name] = index - } - - var distinctValuesCountByColumn map[string]uint64 - if hasDependencies { - distinctValuesCountByColumn = make(map[string]uint64, len(model.Columns)) - } - - for _, columnName := range columnsTopologicalOrder { - column := model.Columns[originIndexes[columnName]] - + for _, column := range model.Columns { dataModelName := modelName dataModel := model dataColumn := column @@ -115,7 +98,7 @@ func newGenerators(cfg *models.GenerationConfig) (map[string]*generator.ColumnGe columnKey := common.GetKey(modelName, column.Name) gen, err := generator.NewColumnGenerator( - cfg.RandomSeed, distinctValuesCountByColumn, + cfg.RandomSeed, modelName, model, column, dataModelName, dataModel, dataColumn, ) @@ -130,8 +113,8 @@ func newGenerators(cfg *models.GenerationConfig) (map[string]*generator.ColumnGe return generators, nil } -func columnsTopologicalSort(columns []*models.Column) ([]string, bool, error) { - return common.TopologicalSort( +func columnsIdxTopologicalSort(columns []*models.Column) ([]int, bool, error) { + sortedNames, hasDeps, err := common.TopologicalSort( columns, func(c *models.Column) (string, []string) { var deps []string @@ -145,6 +128,21 @@ func columnsTopologicalSort(columns []*models.Column) ([]string, bool, error) { return c.Name, deps }, ) + if err != nil { + return nil, false, err + } + + originColumnsIndexes := make(map[string]int, len(columns)) + for index, column := range columns { + originColumnsIndexes[column.Name] = index + } + + sortedIndexes := make([]int, len(sortedNames)) + for i, columnName := range sortedNames { + sortedIndexes[i] = originColumnsIndexes[columnName] + } + + return sortedIndexes, hasDeps, nil } // RunTask function generates unique values and then all 
values for selected model. @@ -206,7 +204,7 @@ func (t *Task) WaitError() error { // generateAndSaveValues function generates values for all model. // -//nolint:cyclop + func (t *Task) generateAndSaveValues(ctx context.Context) error { var err error @@ -237,16 +235,11 @@ func (t *Task) generateAndSaveValues(ctx context.Context) error { continue } - columnsTopologicalOrder, hasDependencies, err := columnsTopologicalSort(model.Columns) + sortedColumnsIndexes, hasDependencies, err := columnsIdxTopologicalSort(model.Columns) if err != nil { return errors.WithMessagef(err, "failed to sorting columns by dependencies for model %q", modelName) } - originColumnsIndexes := make(map[string]int, len(model.Columns)) - for index, column := range model.Columns { - originColumnsIndexes[column.Name] = index - } - pool.Add(1) go func() { @@ -269,8 +262,8 @@ func (t *Task) generateAndSaveValues(ctx context.Context) error { pool.Submit( ctx, outputSyncer.WorkerSyncer(), - modelName, hasDependencies, - columnsTopologicalOrder, originColumnsIndexes, + model, hasDependencies, + sortedColumnsIndexes, generators, rowsCount, ) } @@ -306,8 +299,8 @@ func (t *Task) skipRows() { // generateAndSaveBatch function generate batch of values for selected column and send it to output. 
func (t *Task) generateAndSaveBatch( ctx context.Context, outputSync *common.WorkerSyncer, - modelName string, hasDependencies bool, - columnsTopologicalOrder []string, originColumnsIndexes map[string]int, + model *models.Model, hasDependencies bool, + columnsIndexesTopologicalOrder []int, generators []*generator.BatchGenerator, count uint64, ) error { defer outputSync.Done(ctx) @@ -321,38 +314,36 @@ func (t *Task) generateAndSaveBatch( var rowValues map[string]any if hasDependencies { - rowValues = make(map[string]any, len(originColumnsIndexes)) + rowValues = make(map[string]any, len(generators)) } for i := range count { - for _, columnName := range columnsTopologicalOrder { + for _, columnIdx := range columnsIndexesTopologicalOrder { if common.CtxClosed(ctx) { return &common.ContextCancelError{} } - idx := originColumnsIndexes[columnName] - - value, err := generators[idx].Value(rowValues) + value, err := generators[columnIdx].Value(rowValues) if err != nil { return errors.WithMessage(err, "failed to get or generate value") } - batch[i].Values[idx] = value + batch[i].Values[columnIdx] = value if rowValues != nil { - rowValues[columnName] = value + rowValues[model.Columns[columnIdx].Name] = value } } } outputSync.WaitPrevious(ctx) - err := t.output.HandleRowsBatch(ctx, modelName, batch) + err := t.output.HandleRowsBatch(ctx, model.Name, batch) if err != nil { return errors.WithMessage(err, "failed to save batch to output") } - t.progress.Add(modelName, count) + t.progress.Add(model.Name, count) return nil } diff --git a/internal/generator/usecase/general/test/unit_test.go b/internal/generator/usecase/general/test/unit_test.go index 836f614..0243f2e 100644 --- a/internal/generator/usecase/general/test/unit_test.go +++ b/internal/generator/usecase/general/test/unit_test.go @@ -200,52 +200,15 @@ func checkDistinct(t *testing.T, column *models.Column) { } } -func checkValuesCount( - t *testing.T, - gen value.Generator, - valuesCountByColumn map[string]uint64, 
expectedValueCount float64, -) { +func checkValuesCount(t *testing.T, gen value.Generator, expectedValueCount float64) { t.Helper() require.NoError(t, gen.Prepare()) - valuesCount := gen.ValuesCount(valuesCountByColumn) + valuesCount := gen.ValuesCount() require.Equal(t, uint64(expectedValueCount), uint64(valuesCount)) } -func checkPossibleToGenerate(t *testing.T, columns []*models.Column, rowsCount uint64, wantErr bool) { - t.Helper() - - copyColumns := make([]*models.Column, 0, len(columns)) - for _, column := range columns { - copyColumns = append(copyColumns, deepColumnCopy(column)) - } - - cfg := getCfg(t, map[string]*models.Model{ - "test": { - RowsCount: rowsCount, - Columns: copyColumns, - }, - }) - - outputHandler := func(_ context.Context, _ string, _ []*models.DataRow) error { return nil } - - out := outputMock.NewOutput(outputHandler) - uc := usecaseGeneral.NewUseCase(usecaseGeneral.UseCaseConfig{}) - - taskID, err := uc.CreateTask( - context.Background(), - usecase.TaskConfig{ - GenerationConfig: &cfg, - Output: out, - }, - ) - - require.Equal(t, wantErr, err != nil) - err = uc.WaitResult(taskID) - require.Equal(t, wantErr, err != nil) -} - func checkForeignKey(t *testing.T, column *models.Column, nullPercentage float64, foreignOrdered bool) { t.Helper() @@ -430,7 +393,7 @@ func TestInteger(t *testing.T) { for _, testCase := range checkValuesCountCases { generator := &value.IntegerGenerator{ColumnIntegerParams: testCase.typeParams} - checkValuesCount(t, generator, nil, testCase.expected) + checkValuesCount(t, generator, testCase.expected) } } @@ -509,7 +472,7 @@ func TestFloat(t *testing.T) { for _, testCase := range checkValuesCountCases { generator := &value.FloatGenerator{ColumnFloatParams: testCase.typeParams} - checkValuesCount(t, generator, nil, testCase.expected) + checkValuesCount(t, generator, testCase.expected) } } @@ -531,9 +494,9 @@ func TestString(t *testing.T) { {&models.ColumnStringParams{LogicalType: models.LastNameType, MinLength: 4, 
MaxLength: 7}, 4, 7}, {&models.ColumnStringParams{LogicalType: models.PhoneType, MinLength: 10, MaxLength: 10}, 10, 10}, {&models.ColumnStringParams{MinLength: 100, MaxLength: 100}, 100, 100}, - {&models.ColumnStringParams{Template: `{{ pattern "AAaa00##" }}`, Locale: "en"}, 8, 8}, - {&models.ColumnStringParams{Template: `{{ pattern "AAaa00##" }}`, Locale: "ru"}, 8, 8}, - {&models.ColumnStringParams{Template: `{{ pattern "0123456789012345678901234567890123456789" }}`}, 40, 40}, + {&models.ColumnStringParams{Pattern: "AAaa00##", Locale: "en"}, 8, 8}, + {&models.ColumnStringParams{Pattern: "AAaa00##", Locale: "ru"}, 8, 8}, + {&models.ColumnStringParams{Pattern: "0123456789012345678901234567890123456789"}, 40, 40}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 3, MaxLength: 5}, 3, 5}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 254, MaxLength: 256}, 254, 256}, {&models.ColumnStringParams{LogicalType: models.TextType, MinLength: 510, MaxLength: 512}, 510, 512}, @@ -563,9 +526,8 @@ func TestString(t *testing.T) { } checkValuesCountCases := []struct { - typeParams *models.ColumnStringParams - distinctValuesCountByColumn map[string]uint64 - expected float64 + typeParams *models.ColumnStringParams + expected float64 }{ { &models.ColumnStringParams{ @@ -575,7 +537,6 @@ func TestString(t *testing.T) { WithoutNumbers: true, WithoutSpecialChars: true, }, - nil, 52, }, { @@ -586,7 +547,6 @@ func TestString(t *testing.T) { WithoutNumbers: true, WithoutSpecialChars: true, }, - nil, 66.0, }, { @@ -597,7 +557,6 @@ func TestString(t *testing.T) { WithoutNumbers: true, WithoutSpecialChars: true, }, - nil, 1048229968448, }, { @@ -608,7 +567,6 @@ func TestString(t *testing.T) { WithoutNumbers: true, WithoutSpecialChars: true, }, - nil, 24128259706319868, }, { @@ -620,7 +578,6 @@ func TestString(t *testing.T) { WithoutSmallLetters: true, WithoutSpecialChars: true, }, - nil, 1111111111111110000000000, }, { @@ -632,7 +589,6 @@ func 
TestString(t *testing.T) { WithoutSmallLetters: true, WithoutNumbers: true, }, - nil, 81870575520, }, { @@ -641,7 +597,6 @@ func TestString(t *testing.T) { MaxLength: 15, Locale: "en", }, - nil, 88394150280794134360488281250, }, { @@ -650,7 +605,6 @@ func TestString(t *testing.T) { MaxLength: 15, Locale: "ru", }, - nil, 868834460299970670989801640300, }, { @@ -658,91 +612,20 @@ func TestString(t *testing.T) { Locale: "en", Template: "{{ .field }}", }, - map[string]uint64{ - "field": 11, - }, - 11, + 1, }, { &models.ColumnStringParams{ - Locale: "en", - Template: `{{ pattern "A00" }}`, + Locale: "en", + Pattern: "A00", }, - nil, 2600, }, - { - &models.ColumnStringParams{ - Locale: "ru", - Template: `{{ .field }}{{ pattern "a0#" }}`, - }, - map[string]uint64{ - "field": 10, - }, - 75900, - }, } for _, testCase := range checkValuesCountCases { generator := &value.StringGenerator{ColumnStringParams: testCase.typeParams} - checkValuesCount(t, generator, testCase.distinctValuesCountByColumn, testCase.expected) - } - - idColumn := &models.Column{ - Name: "id", - Type: "integer", - Ranges: []*models.Params{ - { - TypeParams: &models.ColumnIntegerParams{ - FromPtr: int64Ptr(1), - ToPtr: int64Ptr(5), - }, - }, - }, - } - - emailColumn := &models.Column{ - Name: "email", - Type: "string", - Ranges: []*models.Params{ - { - TypeParams: &models.ColumnStringParams{ - Template: `{{ .id }}.{{ pattern "00" }}@example.com`, - }, - DistinctPercentage: 1, - }, - }, - } - - checkPossibleToGenerateCases := []struct { - columns []*models.Column - rowsCount uint64 - wantErr bool - }{ - { - columns: []*models.Column{idColumn, emailColumn}, - rowsCount: 500, - wantErr: false, - }, - { - columns: []*models.Column{emailColumn, idColumn}, - rowsCount: 500, - wantErr: false, - }, - { - columns: []*models.Column{idColumn, emailColumn}, - rowsCount: 501, - wantErr: true, - }, - { - columns: []*models.Column{emailColumn, idColumn}, - rowsCount: 501, - wantErr: true, - }, - } - - for _, testCase := 
range checkPossibleToGenerateCases { - checkPossibleToGenerate(t, testCase.columns, testCase.rowsCount, testCase.wantErr) + checkValuesCount(t, generator, testCase.expected) } } @@ -751,7 +634,7 @@ func TestUUID(t *testing.T) { checkType(t, column, uuid.UUID{}) checkDistinct(t, column) checkForeignKeyCases(t, column) - checkValuesCount(t, &value.UUIDGenerator{}, nil, float64(1<<(128-10)-1)) + checkValuesCount(t, &value.UUIDGenerator{}, float64(1<<(128-10)-1)) } func TestDateTime(t *testing.T) { @@ -834,7 +717,7 @@ func TestDateTime(t *testing.T) { for _, testCase := range checkValuesCountCases { generator := &value.DateTimeGenerator{ColumnDateTimeParams: testCase.typeParams} - checkValuesCount(t, generator, nil, testCase.expected) + checkValuesCount(t, generator, testCase.expected) } } @@ -928,7 +811,7 @@ func TestIdempotence(t *testing.T) { Name: "passport", Type: "string", Ranges: []*models.Params{{TypeParams: &models.ColumnStringParams{ - Template: `{{ pattern "AA 00 000 000" }}`, + Pattern: "AA 00 000 000", }, NullPercentage: 0.5}}, }, From 980f0b38548215f5c11e63d2f0fe9986107e0422 Mon Sep 17 00:00:00 2001 From: reversetm Date: Wed, 30 Jul 2025 17:23:12 +0300 Subject: [PATCH 08/11] Updated default generation config --- config/models.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/config/models.yml b/config/models.yml index 7bc502c..e382e34 100644 --- a/config/models.yml +++ b/config/models.yml @@ -62,9 +62,13 @@ models: - name: passport type: string type_params: - template: "{{ pattern('AA 00 000 000') }}" + pattern: AA 00 000 000 distinct_percentage: 1 ordered: true + - name: email + type: string + type_params: + template: "{{ .first_name_en | lower }}.{{ .id }}@email.com" - name: created type: datetime type_params: From 9d2cccf6d5873778bd51cbdb3dcf55ee78c21599 Mon Sep 17 00:00:00 2001 From: reversetm Date: Wed, 30 Jul 2025 17:30:46 +0300 Subject: [PATCH 09/11] Fixed usage --- doc/en/usage.md | 4 ++-- doc/ru/usage.md | 2 +- 
internal/generator/models/generator_model.go | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/en/usage.md b/doc/en/usage.md index 2f7cbf3..3943a7d 100644 --- a/doc/en/usage.md +++ b/doc/en/usage.md @@ -156,7 +156,7 @@ Structure `models[*].columns[*].type_params` for data type `string`: - `logical_type`: Logical type of string. Supported values: `first_name`, `last_name`, `phone`, `text`. - `template`: Template for string generation. Allows you to use the values of any columns of the generated model. Information about the functions available in template strings is described at the end of this section. - Cannot coexist with `ordered`, `distinct_percentage` or `distinct_count`. + Cannot coexist with `ordered`, `distinct_percentage` and `distinct_count`. - `pattern`: Pattern for string generation. The `A` symbol is any capital letter, the `a` symbol is any small letter, symbol `0` is any digit, the `#` symbol is any character, and the other characters remain as they are. - `locale`: Locale for generated strings. Supported values: `ru`, `en`. Default is `en`. @@ -236,7 +236,7 @@ Structure of `output.params` for `tcs` format: Similar to the structure for the `http` format, except that the `format_template` field is immutable and always set to its default value. -Using Template Strings:: +Using Template Strings: Template strings are implemented using the standard golang library, you can read about all its features and available functions in this [documentation](https://pkg.go.dev/text/template). diff --git a/doc/ru/usage.md b/doc/ru/usage.md index 706603f..536f7af 100644 --- a/doc/ru/usage.md +++ b/doc/ru/usage.md @@ -162,7 +162,7 @@ open_ai: - `logical_type`: Логический тип строки. Поддерживаемые значения: `first_name`, `last_name`, `phone`, `text`. - `template`: Шаблон для генерации строки. Позволяет использовать значения любых столбов генерируемой модели. 
Информация о том, как использовать шаблонные строки, описана в конце данного раздела. - Не работает совместно с `ordered`, `distinct_percentage` или `distinct_count`. + Не работает совместно с `ordered`, `distinct_percentage` и `distinct_count`. - `pattern`: Паттерн для генерации строки. Символ `A` - любая большая буква, символ `a` - любая маленькая буква, символ `0` - любая цифра, символ `#` - любой символ, а остальные символы остаются как есть. - `locale`: Локаль для генерации строк. Поддерживаемые значения: `ru`, `en`. По умолчанию `en`. diff --git a/internal/generator/models/generator_model.go b/internal/generator/models/generator_model.go index 5f88de9..e32f6ff 100644 --- a/internal/generator/models/generator_model.go +++ b/internal/generator/models/generator_model.go @@ -427,12 +427,12 @@ func (p *Params) Validate() []error { } if p.StringParams != nil && p.StringParams.Template != "" { - if common.Any( - p.Ordered, - p.DistinctPercentage != 0, - p.DistinctCount != 0, - ) { - errs = append(errs, errors.New("forbidden to use string template with distinct params or ordered")) + if p.Ordered { + errs = append(errs, errors.New("forbidden to use string template with ordered")) + } + + if common.Any(p.DistinctPercentage != 0, p.DistinctCount != 0) { + errs = append(errs, errors.New("forbidden to use string template with distinct params")) } } From 1e60b6354d8265e8aad17eee1d2bf7ef6d13d361 Mon Sep 17 00:00:00 2001 From: reversetm Date: Thu, 31 Jul 2025 14:03:18 +0300 Subject: [PATCH 10/11] Reworked format_template field, improved http writer performance --- CHANGELOG.md | 5 ++ doc/en/usage.md | 61 +++++++-------- doc/ru/usage.md | 63 +++++++--------- internal/generator/models/generator_output.go | 2 +- .../output/general/writer/http/helpers.go | 51 +++++++++++++ .../output/general/writer/http/http.go | 74 ++++++++++--------- .../output/general/writer/http/http_test.go | 2 +- internal/generator/usecase/general/task.go | 6 +- 8 files changed, 150 insertions(+), 114 
deletions(-) create mode 100644 internal/generator/output/general/writer/http/helpers.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 0aaa1c4..f346d5f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,11 +12,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - The `template` field in the `string` data type is now used to generate template strings with the ability to use the values of any columns of the generated model. +- In the `format_template` field of the output parameters, the variable `ColumnNames` is now available. + ### Breaking changes - Using `template` field to specify a string pattern like `Aa0#` is no longer supported, `pattern` should be used instead. +- The `Rows` variable in the `format_template` field of the output parameters is now a two-dimensional array, + not a map. + ## [0.0.1](https://github.com/tarantool/sdvg/compare/36d0930..0.0.1) - 2025-07-21 ### Added diff --git a/doc/en/usage.md b/doc/en/usage.md index 3943a7d..6683903 100644 --- a/doc/en/usage.md +++ b/doc/en/usage.md @@ -192,44 +192,23 @@ Structure `output.params` for format `http`: - `batch_size`: Number of data records sent in one request. Default is `1000`. - `workers_count`: Number of threads for writing data. Default is `1`. *Experimental field.* - `headers`: HTTP request headers specified as a dictionary. Default is none. -- `format_template`: Template-based format for sending data, configured using Golang templates. - There are 2 fields available for use in `format_template`: +- `format_template`: Template-based format for sending data, configured using templates. + There are 3 fields available for use in `format_template`: * `ModelName` - name of the model. - * `Rows` - array of records, where each element is a dictionary representing a data row. - Dictionary keys correspond to column names, and values correspond to data in those columns. 
- - You can read about the available functions and the use of template strings at the end of this section. - - Example value for the `format_template` field: - - ```yaml - format_template: | - { - "table_name": "{{ .ModelName }}", - "meta": { - "rows_count": {{ len .Rows }} - }, - "rows": [ - {{- range $i, $row := .Rows }} - {{- if $i}},{{ end }} - { - "id": {{ index $row "id" }}, - "username": "{{ index $row "name" }}" - } - {{- end }} - ] - } - ``` + * `ColumnNames` - array of column names. + * `Rows` - a two-dimensional array, where each outer element represents a table row, + and the inner element contains values of this row in the same order as `ColumnNames`. Default value for the `format_template` field: - ```yaml format_template: | { "table_name": {{ .ModelName }}, - "rows": {{ json .Rows }} + "rows": {{ rowsJson .ColumnNames .Rows }} } ``` + + You can read about the available functions and the use of template strings at the end of this section. Structure of `output.params` for `tcs` format: @@ -251,17 +230,27 @@ Function calls: - direct call: `{{ upper .name }}`. - using pipe: `{{ .name | upper }}`. -In addition to standard functions, the project provides `4` custom functions: +The following is a list of additional functions available in certain template fields: + +In the `template` field of `string` data type: - `upper`: converts the string to upper case. - `lower`: converts the string to lower case. -- `len`: returns the length of the element. -- `json`: converts the element to a JSON string. -Usage restrictions: +In the `format_template` field of the output parameters: -The `lower`, and `upper` functions are available only in the `template` field of the `string` data type. -The `len` and `json` functions are available only in the `format_template` field of the output parameters. +- `len`: returns the length of the element. +- `json`: converts the element to a JSON string. 
+- `rowsJson`: converts an array of column names (`ColumnNames`) and a two-dimensional array of rows (`Rows`) + into a JSON array whose elements are objects of the form: + ``` + { + "columnName1": value1, + "columnName2": value2, + ... + } + ``` + where each object corresponds to one row of the table. #### Examples of data generation configuration @@ -413,7 +402,7 @@ output: "meta": { "rows_count": {{ len .Rows }} }, - "rows": {{ json .Rows }} + "rows": {{ rowsJson .ColumnNames .Rows }} } models: diff --git a/doc/ru/usage.md b/doc/ru/usage.md index 536f7af..9d151c6 100644 --- a/doc/ru/usage.md +++ b/doc/ru/usage.md @@ -198,44 +198,23 @@ open_ai: - `batch_size`: Размер отправляемого в одном запросе массива данных. По умолчанию `1000`. - `workers_count`: Количество потоков для записи данных. По умолчанию `1`. *Является экспериментальным полем.* - `headers`: Заголовки http запроса, указываются в формате словаря. По умолчанию отсутствуют. -- `format_template`: Формат отправляемых данных, конфигурируемый с помощью шаблонов Golang. - Для использования в `format_template` доступно 2 поля: +- `format_template`: Формат отправляемых данных, конфигурируемый с помощью шаблонов. + Для использования в `format_template` доступно 3 поля: * `ModelName` - имя модели. - * `Rows` - массив записей, где каждый элемент является словарем, который представляет собой строку данных. - Ключи словаря соответствуют названиям столбцов, а значения — данным в этих столбцах. - - О доступных функциях и использовании шаблонных строк можно прочитать в конце данного раздела. - - Пример значения поля `format_template`: - - ```yaml - format_template: | - { - "table_name": "{{ .ModelName }}", - "meta": { - "rows_count": {{ len .Rows }} - }, - "rows": [ - {{- range $i, $row := .Rows }} - {{- if $i}},{{ end }} - { - "id": {{ index $row "id" }}, - "username": "{{ index $row "name" }}" - } - {{- end }} - ] - } - ``` + * `ColumnNames` - массив имён колонок. 
+ * `Rows` - двумерный массив, где каждый внешний элемент представляет строку таблицы, + а внутренний содержит значения этой строки в том же порядке, что и `ColumnNames`. Значение поля `format_template` по умолчанию: - ```yaml format_template: | { - "table_name": {{ .ModelName }}, - "rows": {{ json .Rows }} + "table_name": "{{ .ModelName }}", + "rows": {{ rowsJson .ColumnNames .Rows }} } ``` + + О доступных функциях и использовании шаблонных строк можно прочитать в конце данного раздела. Структура `output.params` для формата `tcs`: @@ -257,17 +236,27 @@ open_ai: - прямой вызов: `{{ upper .name }}`. - с помощью pipe: `{{ .name | upper }}`. -В проекте помимо стандартных функций доступны `4` пользовательских: +Ниже приведён список дополнительных функций, доступных в определённых полях шаблонов: + +В поле `template` типа данных `string`: - `upper`: преобразует строку в верхний регистр. - `lower`: преобразует строку в нижний регистр. -- `len`: возвращает длину элемента. -- `json`: преобразует элемент в JSON строку. -Ограничения по использованию: +В поле `format_template` параметров вывода: -Функции `lower`, и `upper` доступны только в поле `template` типа данных `string`. -Функции `len` и `json` доступны только в поле `format_template` параметров вывода. +- `len`: возвращает длину элемента. +- `json`: преобразует элемент в JSON строку. +- `rowsJson`: преобразует массив имён колонок (`ColumnNames`) и двумерный массив строк (`Rows`) + в JSON-массив, элементами которого являются объекты вида: + ``` + { + "columnName1": value1, + "columnName2": value2, + ... + } + ``` + где каждый объект соответствует одной строке таблицы. 
#### Примеры конфигурации генерации данных @@ -419,7 +408,7 @@ output: "meta": { "rows_count": {{ len .Rows }} }, - "rows": {{ json .Rows }} + "rows": {{ rowsJson .ColumnNames .Rows }} } models: diff --git a/internal/generator/models/generator_output.go b/internal/generator/models/generator_output.go index 9f7e729..7964b0a 100644 --- a/internal/generator/models/generator_output.go +++ b/internal/generator/models/generator_output.go @@ -14,7 +14,7 @@ import ( const ( DefaultOutputDir = "output" DefaultOutputType = "csv" - defaultFormatTemplate = `{ "table_name": "{{ .ModelName }}", "rows": {{ json .Rows }} }` + defaultFormatTemplate = `{ "table_name": "{{ .ModelName }}", "rows": {{ rowsJson .ColumnNames .Rows }} }` tcsTimeoutHeader = "x-tcs-timeout_ms" ParquetDateTimeMillisFormat = "millis" ParquetDateTimeMicrosFormat = "micros" diff --git a/internal/generator/output/general/writer/http/helpers.go b/internal/generator/output/general/writer/http/helpers.go new file mode 100644 index 0000000..3b03501 --- /dev/null +++ b/internal/generator/output/general/writer/http/helpers.go @@ -0,0 +1,51 @@ +package http + +import ( + "encoding/json" + "fmt" + "reflect" + "strings" +) + +func toJSON(v any) (string, error) { + data, err := json.Marshal(v) + + return string(data), err +} + +func length(v any) int { + return reflect.ValueOf(v).Len() +} + +func rowsJSON(columnNames []string, rows [][]any) (string, error) { + var sb strings.Builder + + sb.WriteByte('[') + + for i, row := range rows { + if i > 0 { + sb.WriteByte(',') + } + + sb.WriteByte('{') + + for j, columnName := range columnNames { + if j > 0 { + sb.WriteByte(',') + } + + value, err := toJSON(row[j]) + if err != nil { + return "", err + } + + fmt.Fprintf(&sb, `"%s":%s`, columnName, value) + } + + sb.WriteByte('}') + } + + sb.WriteByte(']') + + return sb.String(), nil +} diff --git a/internal/generator/output/general/writer/http/http.go b/internal/generator/output/general/writer/http/http.go index 1fdde13..19eb287 
100644 --- a/internal/generator/output/general/writer/http/http.go +++ b/internal/generator/output/general/writer/http/http.go @@ -3,10 +3,8 @@ package http import ( "bytes" "context" - "encoding/json" "io" "net/http" - "reflect" "strings" "sync" "text/template" @@ -25,8 +23,9 @@ const ( ) type bodyPayload struct { - ModelName string - Rows []map[string]any + ModelName string + ColumnNames []string + Rows [][]any } // Verify interface compliance in compile time. @@ -41,6 +40,8 @@ type Writer struct { retryableClient *retryablehttp.Client lastErr error + payloadPool *sync.Pool + buffer []*models.DataRow bodyTemplate *template.Template @@ -60,11 +61,26 @@ func NewWriter( config *models.HTTPParams, writtenRowsChan chan<- uint64, ) *Writer { + columnNames := make([]string, len(model.Columns)) + for i, columns := range model.Columns { + columnNames[i] = columns.Name + } + + payloadPool := &sync.Pool{ + New: func() any { + return &bodyPayload{ + ModelName: model.Name, + ColumnNames: columnNames, + } + }, + } + httpWriter := &Writer{ ctx: ctx, model: model, config: config, writtenRowsChan: writtenRowsChan, + payloadPool: payloadPool, buffer: make([]*models.DataRow, 0, config.BatchSize), writerChan: make(chan []*models.DataRow), errorsChan: make(chan error, 1), @@ -131,15 +147,10 @@ func (w *Writer) Init() error { return errors.New("the writer has already been initialized") } - tmpl := template.New("body").Funcs(template.FuncMap{ - "json": func(v any) (string, error) { - data, err := json.Marshal(v) - - return string(data), err - }, - "len": func(v any) int { - return reflect.ValueOf(v).Len() - }, + tmpl := template.New("format_template").Funcs(template.FuncMap{ + "json": toJSON, + "len": length, + "rowsJson": rowsJSON, }) tmpl, err := tmpl.Parse(w.config.FormatTemplate) @@ -211,44 +222,37 @@ func (w *Writer) handleBatch(batch []*models.DataRow) error { } func (w *Writer) buildRequest(dataRows []*models.DataRow) (*retryablehttp.Request, error) { - // Build a slice of row 
objects by mapping column names to their corresponding values. - // Each row is represented as a map[string]any, with column names as keys and values from dataRows. - rows := make([]map[string]any, 0, len(dataRows)) - - for _, dataRow := range dataRows { - if len(dataRow.Values) != len(w.model.Columns) { - return nil, errors.New("values count does not match columns count") - } - - rowObj := make(map[string]any, len(dataRow.Values)) - for i, value := range dataRow.Values { - rowObj[w.model.Columns[i].Name] = value - } - - rows = append(rows, rowObj) + // Build a 2D slice of values extracted from dataRows. + // Each inner slice contains the values of a single row in the same order as the columns. + rows := make([][]any, len(dataRows)) + for i, dataRow := range dataRows { + rows[i] = dataRow.Values } // Prepare the data payload for the request template rendering. // The payload includes the model name and structured row data. - body := bodyPayload{ - ModelName: w.model.Name, - Rows: rows, - } + buffer := new(bytes.Buffer) - var buf bytes.Buffer + //nolint:forcetypeassert + payload := w.payloadPool.Get().(*bodyPayload) + payload.Rows = rows - err := w.bodyTemplate.Execute(&buf, body) + err := w.bodyTemplate.Execute(buffer, payload) if err != nil { + w.payloadPool.Put(payload) + return nil, errors.New(err.Error()) } + w.payloadPool.Put(payload) + // Construct the HTTP POST request with the generated JSON body and apply configured headers. 
req, err := retryablehttp.NewRequest( http.MethodPost, w.config.Endpoint, - &buf, + buffer, ) if err != nil { return nil, errors.New(err.Error()) diff --git a/internal/generator/output/general/writer/http/http_test.go b/internal/generator/output/general/writer/http/http_test.go index 289e252..7bc185e 100644 --- a/internal/generator/output/general/writer/http/http_test.go +++ b/internal/generator/output/general/writer/http/http_test.go @@ -55,7 +55,7 @@ func TestHandleRowsBatch(t *testing.T) { "meta": { "rows_count": {{ len .Rows }} }, - "rows": {{ json .Rows }} + "rows": {{ rowsJson .ColumnNames .Rows }} }`, model: &models.Model{ Name: "expectedModel", diff --git a/internal/generator/usecase/general/task.go b/internal/generator/usecase/general/task.go index 39f88cb..7560e78 100644 --- a/internal/generator/usecase/general/task.go +++ b/internal/generator/usecase/general/task.go @@ -205,9 +205,7 @@ func (t *Task) WaitError() error { // generateAndSaveValues function generates values for all model. 
// -func (t *Task) generateAndSaveValues(ctx context.Context) error { - var err error - +func (t *Task) generateAndSaveValues(ctx context.Context) (err error) { ctx, cancelCtx := context.WithCancelCause(ctx) defer cancelCtx(err) @@ -286,7 +284,7 @@ func (t *Task) generateAndSaveValues(ctx context.Context) error { slog.Debug("generating values for all models finished") - return nil + return err } func (t *Task) skipRows() { From aa2ece0d8d3448bdfce5f7e4d64a25745ac11017 Mon Sep 17 00:00:00 2001 From: reversetm Date: Tue, 5 Aug 2025 20:40:41 +0300 Subject: [PATCH 11/11] Reduced the number of allocations --- .../output/general/writer/http/http.go | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/internal/generator/output/general/writer/http/http.go b/internal/generator/output/general/writer/http/http.go index 19eb287..2d841e1 100644 --- a/internal/generator/output/general/writer/http/http.go +++ b/internal/generator/output/general/writer/http/http.go @@ -18,6 +18,7 @@ import ( ) const ( + maxBodySize = 1 << 20 // 1 MiB retryWaitMin = 1 * time.Second retryWaitMax = 10 * time.Minute ) @@ -71,6 +72,7 @@ func NewWriter( return &bodyPayload{ ModelName: model.Name, ColumnNames: columnNames, + Rows: make([][]any, 0, config.BatchSize), } }, } @@ -222,11 +224,14 @@ func (w *Writer) handleBatch(batch []*models.DataRow) error { } func (w *Writer) buildRequest(dataRows []*models.DataRow) (*retryablehttp.Request, error) { - // Build a 2D slice of values extracted from dataRows. - // Each inner slice contains the values of a single row in the same order as the columns. - rows := make([][]any, len(dataRows)) - for i, dataRow := range dataRows { - rows[i] = dataRow.Values + // Grab a payload with a ready slice and reset length to zero, keep capacity.
+ // + //nolint:forcetypeassert + payload := w.payloadPool.Get().(*bodyPayload) + payload.Rows = payload.Rows[:0] + + for _, dataRow := range dataRows { + payload.Rows = append(payload.Rows, dataRow.Values) } // Prepare the data payload for the request template rendering. @@ -234,10 +239,6 @@ func (w *Writer) buildRequest(dataRows []*models.DataRow) (*retryablehttp.Reques buffer := new(bytes.Buffer) - //nolint:forcetypeassert - payload := w.payloadPool.Get().(*bodyPayload) - payload.Rows = rows - err := w.bodyTemplate.Execute(buffer, payload) if err != nil { w.payloadPool.Put(payload) @@ -279,13 +280,9 @@ func (w *Writer) sendRequest(req *retryablehttp.Request) error { return errors.New(err.Error()) } - - if resp == nil { - return errors.New("received nil response") - } defer resp.Body.Close() - body, err := io.ReadAll(resp.Body) + body, err := io.ReadAll(io.LimitReader(resp.Body, maxBodySize)) if err != nil { return errors.New(err.Error()) }