Fix for describe column types #796

Merged: 4 commits, Sep 17, 2024
Changes from all commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -8,6 +8,7 @@
- Update default scope/redirect URL for OAuth U2M so that users can run Python models with the default OAuth app ([776](https://github.com/databricks/dbt-databricks/pull/776))
- Fix foreign key constraints by switching from `parent` to `to` and `parent_columns` to `to_columns` ([789](https://github.com/databricks/dbt-databricks/pull/789))
- Now handles external shallow clones without blowing up ([795](https://github.com/databricks/dbt-databricks/pull/795))
- Use information_schema to get column types when possible, since describe extended truncates complex types ([796](https://github.com/databricks/dbt-databricks/pull/796))

## dbt-databricks 1.8.5 (August 6, 2024)

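The new changelog entry is the heart of this PR: `describe table extended` can truncate the reported type of wide or deeply nested complex columns, while `system.information_schema.columns` exposes the complete type in `full_data_type`. A minimal sketch of how to observe the difference yourself, assuming a reachable SQL warehouse and using the `databricks-sql-connector` API pinned in requirements.txt (the host, path, token, and three-part table name are placeholders):

```python
from databricks import sql  # databricks-sql-connector, pinned in requirements.txt

# Placeholders: fill in your own workspace host, warehouse HTTP path, and token.
with sql.connect(
    server_hostname="<workspace-host>",
    http_path="<warehouse-http-path>",
    access_token="<personal-access-token>",
) as conn:
    with conn.cursor() as cursor:
        # describe extended may truncate the data_type of wide/nested structs
        cursor.execute("describe table extended my_catalog.my_schema.base_model")
        for row in cursor.fetchall():
            print(row)

        # information_schema reports the complete type in full_data_type
        cursor.execute(
            "select column_name, full_data_type "
            "from system.information_schema.columns "
            "where table_catalog = 'my_catalog' "
            "and table_schema = 'my_schema' "
            "and table_name = 'base_model'"
        )
        for row in cursor.fetchall():
            print(row)
```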
38 changes: 38 additions & 0 deletions dbt/adapters/databricks/impl.py
@@ -73,6 +73,7 @@
from dbt.adapters.spark.impl import LIST_SCHEMAS_MACRO_NAME
from dbt.adapters.spark.impl import SparkAdapter
from dbt.adapters.spark.impl import TABLE_OR_VIEW_NOT_FOUND_MESSAGES
from dbt_common.behavior_flags import BehaviorFlag
from dbt_common.exceptions import DbtRuntimeError
from dbt_common.utils import executor
from dbt_common.utils.dict import AttrDict
@@ -88,6 +89,7 @@
SHOW_TABLES_MACRO_NAME = "show_tables"
SHOW_VIEWS_MACRO_NAME = "show_views"
GET_COLUMNS_COMMENTS_MACRO_NAME = "get_columns_comments"
GET_COLUMNS_BY_INFO_MACRO_NAME = "get_columns_comments_via_information_schema"


@dataclass
@@ -164,6 +166,12 @@ class DatabricksAdapter(SparkAdapter):
        }
    )

    # This will begin working once we are on dbt-core 1.9.
    # For now it does nothing.
    @property
    def _behavior_flags(self) -> List[BehaviorFlag]:
        return [{"name": "column_types_from_information_schema", "default": False}]  # type: ignore

    # override/overload
    def acquire_connection(
        self, name: Optional[str] = None, query_header_context: Any = None
@@ -376,6 +384,36 @@ def parse_describe_extended(  # type: ignore[override]

    def get_columns_in_relation(  # type: ignore[override]
        self, relation: DatabricksRelation
    ) -> List[DatabricksColumn]:
        if (
            # We can uncomment this once behavior flags are available to adapters
            # self.behavior.column_types_from_information_schema and  # type: ignore
            not relation.is_hive_metastore()
        ):
            return self._get_columns_in_relation_by_information_schema(relation)
        else:
            return self._get_columns_in_relation_by_describe(relation)

    def _get_columns_in_relation_by_information_schema(
        self, relation: DatabricksRelation
    ) -> List[DatabricksColumn]:
        rows = list(
            handle_missing_objects(
                lambda: self.execute_macro(
                    GET_COLUMNS_BY_INFO_MACRO_NAME, kwargs={"relation": relation}
                ),
                AttrDict(),
            )
        )

        columns = []
        for row in rows:
            columns.append(DatabricksColumn(column=row[0], dtype=row[1], comment=row[2]))

        return columns

    def _get_columns_in_relation_by_describe(
        self, relation: DatabricksRelation
    ) -> List[DatabricksColumn]:
        rows = list(
            handle_missing_objects(
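Both lookup paths above wrap their macro call in `handle_missing_objects`, and the describe-based path is cut off at that call in this view. A hedged sketch of what such a helper plausibly looks like, reconstructed from the imports at the top of this diff rather than from the actual dbt-databricks source: run the callable, and fall back to the supplied default only when the failure matches a known not-found message.

```python
from typing import Callable, TypeVar

from dbt.adapters.spark.impl import TABLE_OR_VIEW_NOT_FOUND_MESSAGES
from dbt_common.exceptions import DbtRuntimeError

T = TypeVar("T")


def handle_missing_objects(fetch: Callable[[], T], default: T) -> T:
    """Assumed behavior: swallow only 'relation not found' errors."""
    try:
        return fetch()
    except DbtRuntimeError as e:
        if any(msg in str(e) for msg in TABLE_OR_VIEW_NOT_FOUND_MESSAGES):
            return default
        raise
```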
16 changes: 16 additions & 0 deletions dbt/include/databricks/macros/adapters/persist_docs.sql
@@ -28,6 +28,22 @@
  {% do return(load_result('get_columns_comments').table) %}
{% endmacro %}

{% macro get_columns_comments_via_information_schema(relation) -%}
  {% call statement('get_columns_comments_via_information_schema', fetch_result=True) -%}
    select
      column_name,
      full_data_type,
      comment
    from `system`.`information_schema`.`columns`
    where
      table_catalog = '{{ relation.database|lower }}' and
      table_schema = '{{ relation.schema|lower }}' and
      table_name = '{{ relation.identifier|lower }}'
  {% endcall %}

  {% do return(load_result('get_columns_comments_via_information_schema').table) %}
{% endmacro %}

{% macro databricks__persist_docs(relation, model, for_relation, for_columns) -%}
  {%- if for_relation and config.persist_relation_docs() and model.description %}
    {% do alter_table_comment(relation, model) %}
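The macro lowercases all three identifier filters, which suggests information_schema reports object names in lowercase. For a hypothetical relation `Main.My_Schema.Base_Model` (names invented purely for illustration), the statement renders roughly as below; the sketch just mimics the Jinja `|lower` filters in plain Python:

```python
# Hypothetical relation, used only to show the rendered shape of the query.
database, schema, identifier = "Main", "My_Schema", "Base_Model"

rendered = f"""
select
  column_name,
  full_data_type,
  comment
from `system`.`information_schema`.`columns`
where
  table_catalog = '{database.lower()}' and
  table_schema = '{schema.lower()}' and
  table_name = '{identifier.lower()}'
""".strip()
print(rendered)
```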
2 changes: 1 addition & 1 deletion dev-requirements.txt
@@ -15,4 +15,4 @@ types-requests
types-mock
pre-commit

-dbt-tests-adapter~=1.8.0
+dbt-tests-adapter~=1.9.0
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,7 +1,7 @@
databricks-sql-connector>=3.1.0, <3.2.0
dbt-spark~=1.8.0
dbt-core>=1.8.0, <2.0
-dbt-adapters>=1.3.0, <2.0
+dbt-adapters>=1.6.0, <2.0
databricks-sdk==0.17.0
keyring>=23.13.0
pandas<2.2.0
13 changes: 13 additions & 0 deletions tests/functional/adapter/columns/fixtures.py
@@ -0,0 +1,13 @@
base_model = """
select struct('a', 1, 'b', 'b', 'c', ARRAY(1,2,3)) as struct_col, 'hello' as str_col
"""

schema = """
version: 2

models:
- name: base_model
columns:
- name: struct_col
- name: str_col
"""
55 changes: 55 additions & 0 deletions tests/functional/adapter/columns/test_get_columns.py
@@ -0,0 +1,55 @@
import pytest

from dbt.adapters.databricks.column import DatabricksColumn
from dbt.adapters.databricks.relation import DatabricksRelation
from tests.functional.adapter.columns import fixtures
from dbt.tests import util


class ColumnsInRelation:
    @pytest.fixture(scope="class")
    def models(self):
        return {"base_model.sql": fixtures.base_model, "schema.yml": fixtures.schema}

    @pytest.fixture(scope="class", autouse=True)
    def setup(self, project):
        util.run_dbt(["run"])

    @pytest.fixture(scope="class")
    def expected_columns(self):
        return [
            DatabricksColumn(
                column="struct_col",
                dtype=(
                    "struct<col1:string,col2:int,col3:string,"
                    "col4:string,col5:string,col6:array<int>>"
                ),
            ),
            DatabricksColumn(column="str_col", dtype="string"),
        ]

    def test_columns_in_relation(self, project, expected_columns):
        my_relation = DatabricksRelation.create(
            database=project.database,
            schema=project.test_schema,
            identifier="base_model",
            type=DatabricksRelation.Table,
        )

        with project.adapter.connection_named("_test"):
            actual_columns = project.adapter.get_columns_in_relation(my_relation)
        assert actual_columns == expected_columns


class TestColumnsInRelationBehaviorFlagOff(ColumnsInRelation):
    @pytest.fixture(scope="class")
    def project_config_update(self):
        return {"flags": {}}


class TestColumnsInRelationBehaviorFlagOn(ColumnsInRelation):
    @pytest.fixture(scope="class")
    def project_config_update(self):
        return {"flags": {"column_types_from_information_schema": True}}