Skip to content

Commit 61f676d

Browse files
committed
fixing tests, removing the solved todos, targets to a diff module
1 parent 0a8fa2a commit 61f676d

File tree

9 files changed

+1202
-102
lines changed

9 files changed

+1202
-102
lines changed

dbldatagen/spec/__init__.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""Pydantic-based specification API for dbldatagen.
2+
3+
This module provides Pydantic models and specifications for defining data generation
4+
in a type-safe, declarative way.
5+
"""
6+
7+
# Import only the compat layer by default to avoid triggering Spark/heavy dependencies
8+
from .compat import BaseModel, Field, constr, root_validator, validator
9+
10+
# Heavy modules are not imported here; the names listed below are resolved lazily by the module-level __getattr__
11+
# from .column_spec import ColumnSpec
12+
# from .generator_spec import GeneratorSpec
13+
# from .generator_spec_impl import GeneratorSpecImpl
14+
15+
__all__ = [
16+
"BaseModel",
17+
"Field",
18+
"constr",
19+
"root_validator",
20+
"validator",
21+
"ColumnSpec",
22+
"GeneratorSpec",
23+
"GeneratorSpecImpl",
24+
]
25+
26+
27+
def __getattr__(name):
    """Resolve the package's heavy public classes on first access (PEP 562).

    ``ColumnSpec``, ``GeneratorSpec`` and ``GeneratorSpecImpl`` live in submodules
    that are deliberately not imported at package import time, to avoid triggering
    Spark initialization. Python calls this hook only when normal attribute lookup
    on the module fails; the matching submodule is imported at that point.

    The resolved object is stored in the module namespace, so later lookups of the
    same name are ordinary attribute accesses and no longer reach this hook.

    :param name: Attribute name that was not found in the module namespace
    :returns: The lazily imported class bound to ``name``
    :raises AttributeError: If ``name`` is not one of the lazily provided classes
    """
    if name == "ColumnSpec":
        from .column_spec import ColumnSpec as resolved
    elif name == "GeneratorSpec":
        from .generator_spec import GeneratorSpec as resolved
    elif name == "GeneratorSpecImpl":
        from .generator_spec_impl import GeneratorSpecImpl as resolved
    else:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Cache the result so this hook runs at most once per name.
    globals()[name] = resolved
    return resolved
39+

dbldatagen/spec/compat.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,6 @@
3232
3333
Benefits:
3434
- **No Installation Required**: Works with whatever Pydantic version is available
35-
- **Single Codebase**: One set of code works across both Pydantic versions
36-
- **Environment Agnostic**: Application code doesn't need to know which version is installed
37-
- **Future-Ready**: Easy migration path to Pydantic V2 API when ready
3835
- **Databricks Compatible**: Avoids conflicts with pre-installed libraries
3936
4037
Future Migration:

dbldatagen/spec/generator_spec.py

Lines changed: 1 addition & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -9,105 +9,11 @@
99
from dbldatagen.spec.column_spec import ColumnDefinition
1010

1111
from .compat import BaseModel, validator
12-
12+
from .output_targets import UCSchemaTarget, FilePathTarget
1313

1414
logger = logging.getLogger(__name__)
1515

1616

17-
class UCSchemaTarget(BaseModel):
18-
"""Defines a Unity Catalog schema as the output destination for generated data.
19-
20-
This class represents a Unity Catalog location (catalog.schema) where generated tables
21-
will be written. Unity Catalog is Databricks' unified governance solution for data and AI.
22-
23-
:param catalog: Unity Catalog catalog name where tables will be written
24-
:param schema_: Unity Catalog schema (database) name within the catalog
25-
:param output_format: Data format for table storage. Defaults to "delta" which is the
26-
recommended format for Unity Catalog tables
27-
28-
.. note::
29-
The schema parameter is named `schema_` (with underscore) to avoid conflict with
30-
Python's built-in schema keyword and Pydantic functionality
31-
32-
.. note::
33-
Tables will be written to the location: `{catalog}.{schema_}.{table_name}`
34-
"""
35-
catalog: str
36-
schema_: str
37-
output_format: str = "delta" # Default to delta for UC Schema
38-
39-
@validator("catalog", "schema_")
40-
def validate_identifiers(cls, v: str) -> str:
41-
"""Validates that catalog and schema names are valid identifiers.
42-
43-
Ensures the identifier is non-empty and follows Python identifier conventions.
44-
Issues a warning if the identifier is not a basic Python identifier, as this may
45-
cause issues with Unity Catalog.
46-
47-
:param v: The identifier string to validate (catalog or schema name)
48-
:returns: The validated and stripped identifier string
49-
:raises ValueError: If the identifier is empty or contains only whitespace
50-
51-
.. note::
52-
This is a Pydantic field validator that runs automatically during model instantiation
53-
"""
54-
if not v.strip():
55-
raise ValueError("Identifier must be non-empty.")
56-
if not v.isidentifier():
57-
logger.warning(
58-
f"'{v}' is not a basic Python identifier. Ensure validity for Unity Catalog.")
59-
return v.strip()
60-
61-
def __str__(self) -> str:
62-
"""Returns a human-readable string representation of the Unity Catalog target.
63-
64-
:returns: Formatted string showing catalog, schema, format and type
65-
"""
66-
return f"{self.catalog}.{self.schema_} (Format: {self.output_format}, Type: UC Table)"
67-
68-
69-
class FilePathTarget(BaseModel):
70-
"""Defines a file system path as the output destination for generated data.
71-
72-
This class represents a file system location where generated tables will be written
73-
as files. Each table will be written to a subdirectory within the base path.
74-
75-
:param base_path: Base file system path where table data files will be written.
76-
Each table will be written to {base_path}/{table_name}/
77-
:param output_format: File format for data storage. Must be either "csv" or "parquet".
78-
No default value - must be explicitly specified
79-
80-
.. note::
81-
Unlike UCSchemaTarget, this requires an explicit output_format with no default
82-
83-
.. note::
84-
The base_path can be a local file system path, DBFS path, or cloud storage path
85-
(e.g., s3://, gs://, abfs://) depending on your environment
86-
"""
87-
base_path: str
88-
output_format: Literal["csv", "parquet"] # No default, must be specified
89-
90-
@validator("base_path")
91-
def validate_base_path(cls, v: str) -> str:
92-
"""Validates that the base path is non-empty.
93-
94-
:param v: The base path string to validate
95-
:returns: The validated and stripped base path string
96-
:raises ValueError: If the base path is empty or contains only whitespace
97-
98-
.. note::
99-
This is a Pydantic field validator that runs automatically during model instantiation
100-
"""
101-
if not v.strip():
102-
raise ValueError("base_path must be non-empty.")
103-
return v.strip()
104-
105-
def __str__(self) -> str:
106-
"""Returns a human-readable string representation of the file path target.
107-
108-
:returns: Formatted string showing base path, format and type
109-
"""
110-
return f"{self.base_path} (Format: {self.output_format}, Type: File Path)"
11117

11218

11319
class TableDefinition(BaseModel):
@@ -342,7 +248,6 @@ def validate(self, strict: bool = True) -> ValidationResult: # type: ignore[ove
342248
)
343249

344250
# Check partitions if specified
345-
#TODO: though this can be a model field check, we are checking here so that one can correct
346251
# Could this rely on a model-level field check instead of a manual check here?
347252
if table_def.partitions is not None and table_def.partitions <= 0:
348253
result.add_error(
@@ -351,7 +256,6 @@ def validate(self, strict: bool = True) -> ValidationResult: # type: ignore[ove
351256
)
352257

353258
# Check for duplicate column names
354-
# TODO: Not something possible if we right model, recheck
355259
column_names = [col.name for col in table_def.columns]
356260
duplicates = [name for name in set(column_names) if column_names.count(name) > 1]
357261
if duplicates:
@@ -361,8 +265,6 @@ def validate(self, strict: bool = True) -> ValidationResult: # type: ignore[ove
361265

362266
# Build column map for reference checking
363267
column_map = {col.name: col for col in table_def.columns}
364-
365-
# TODO: Check baseColumn references, this is tricky? check the dbldefaults
366268
for col in table_def.columns:
367269
if col.baseColumn and col.baseColumn != "id":
368270
if col.baseColumn not in column_map:

dbldatagen/spec/output_targets.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
from .compat import BaseModel, validator
2+
from typing import Literal
3+
import logging
4+
5+
logger = logging.getLogger(__name__)
6+
7+
8+
class UCSchemaTarget(BaseModel):
    """Defines a Unity Catalog schema as the output destination for generated data.

    This class represents a Unity Catalog location (catalog.schema) where generated tables
    will be written. Unity Catalog is Databricks' unified governance solution for data and AI.

    :param catalog: Unity Catalog catalog name where tables will be written
    :param schema_: Unity Catalog schema (database) name within the catalog
    :param output_format: Data format for table storage. Defaults to "delta" which is the
                          recommended format for Unity Catalog tables

    .. note::
        The schema parameter is named `schema_` (with a trailing underscore) so that it
        does not shadow the `schema` attribute already defined on Pydantic's BaseModel.
        `schema` is not a reserved word in Python itself.

    .. note::
        Tables are intended to be addressed as: `{catalog}.{schema_}.{table_name}`
    """
    catalog: str
    schema_: str
    output_format: str = "delta"  # Default to delta for UC Schema

    @validator("catalog", "schema_")
    def validate_identifiers(cls, v: str) -> str:
        """Normalizes and validates a catalog or schema name.

        Surrounding whitespace is removed first, and every subsequent check is made on
        the stripped value, i.e. on exactly what gets stored on the model. An empty
        result is rejected. A value that is not a basic Python identifier is still
        accepted, but a warning is logged because it may cause issues with
        Unity Catalog.

        :param v: The identifier string to validate (catalog or schema name)
        :returns: The stripped identifier string
        :raises ValueError: If the identifier is empty or contains only whitespace

        .. note::
            This is a Pydantic field validator that runs automatically during model instantiation
        """
        identifier = v.strip()
        if not identifier:
            raise ValueError("Identifier must be non-empty.")
        # Check the stripped value: padding alone must not trigger the warning,
        # since the padding never reaches the stored field.
        if not identifier.isidentifier():
            logger.warning(
                "'%s' is not a basic Python identifier. Ensure validity for Unity Catalog.",
                identifier,
            )
        return identifier

    def __str__(self) -> str:
        """Returns a human-readable string representation of the Unity Catalog target.

        :returns: Formatted string showing catalog, schema, format and type
        """
        return f"{self.catalog}.{self.schema_} (Format: {self.output_format}, Type: UC Table)"
58+
59+
60+
class FilePathTarget(BaseModel):
    """Describes a file system location used as the destination for generated data.

    Generated tables are meant to be written as files below a common base path,
    one subdirectory per table.

    :param base_path: Root path for the output. Each table is expected under
                      {base_path}/{table_name}/
    :param output_format: Storage format of the files; only "csv" and "parquet" are
                          accepted. There is no default, so it must always be given

    .. note::
        In contrast to UCSchemaTarget, output_format has no default here

    .. note::
        The base_path can be a local file system path, DBFS path, or cloud storage path
        (e.g., s3://, gs://, abfs://) depending on your environment
    """
    base_path: str
    output_format: Literal["csv", "parquet"]  # Required: no default on purpose

    @validator("base_path")
    def validate_base_path(cls, v: str) -> str:
        """Rejects blank base paths and trims surrounding whitespace.

        :param v: The base path string to validate
        :returns: The base path with leading and trailing whitespace removed
        :raises ValueError: If the base path is empty or contains only whitespace

        .. note::
            Pydantic invokes this field validator automatically when the model is created
        """
        trimmed = v.strip()
        if trimmed:
            return trimmed
        raise ValueError("base_path must be non-empty.")

    def __str__(self) -> str:
        """Builds a human-readable description of this file path target.

        :returns: Formatted string showing base path, format and type
        """
        description = f"{self.base_path} (Format: {self.output_format}, Type: File Path)"
        return description

0 commit comments

Comments
 (0)