diff --git a/.github/workflows/test_doc_snippets.yml b/.github/workflows/test_doc_snippets.yml index 6094f2c0ac..2bff0df899 100644 --- a/.github/workflows/test_doc_snippets.yml +++ b/.github/workflows/test_doc_snippets.yml @@ -67,6 +67,11 @@ jobs: with: python-version: "3.10.x" + - name: Setup node 20 + uses: actions/setup-node@v4 + with: + node-version: 20 + - name: Install Poetry uses: snok/install-poetry@v1 with: @@ -81,6 +86,9 @@ jobs: path: .venv key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} + - name: run docs preprocessor + run: make preprocess-docs + - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant -E bigquery -E postgres -E lancedb --with docs,sentry-sdk --without airflow diff --git a/Makefile b/Makefile index 3878dddd15..3a99d96e5e 100644 --- a/Makefile +++ b/Makefile @@ -44,7 +44,7 @@ has-poetry: poetry --version dev: has-poetry - poetry install --all-extras --with docs,providers,pipeline,sources,sentry-sdk + poetry install --all-extras --with docs,providers,pipeline,sources,sentry-sdk,airflow lint: ./tools/check-package.sh @@ -107,4 +107,6 @@ test-build-images: build-library docker build -f deploy/dlt/Dockerfile.airflow --build-arg=COMMIT_SHA="$(shell git log -1 --pretty=%h)" --build-arg=IMAGE_VERSION="$(shell poetry version -s)" . # docker build -f deploy/dlt/Dockerfile --build-arg=COMMIT_SHA="$(shell git log -1 --pretty=%h)" --build-arg=IMAGE_VERSION="$(shell poetry version -s)" . 
- +preprocess-docs: + # run docs preprocessing to run a few checks and ensure examples can be parsed + cd docs/website && npm i && npm run preprocess-docs \ No newline at end of file diff --git a/dlt/common/configuration/specs/connection_string_credentials.py b/dlt/common/configuration/specs/connection_string_credentials.py index 5d3ec689c4..1da7961ef8 100644 --- a/dlt/common/configuration/specs/connection_string_credentials.py +++ b/dlt/common/configuration/specs/connection_string_credentials.py @@ -10,14 +10,20 @@ @configspec class ConnectionStringCredentials(CredentialsConfiguration): drivername: str = dataclasses.field(default=None, init=False, repr=False, compare=False) - database: str = None + database: Optional[str] = None password: Optional[TSecretValue] = None - username: str = None + username: Optional[str] = None host: Optional[str] = None port: Optional[int] = None query: Optional[Dict[str, Any]] = None - __config_gen_annotations__: ClassVar[List[str]] = ["port", "password", "host"] + __config_gen_annotations__: ClassVar[List[str]] = [ + "database", + "port", + "username", + "password", + "host", + ] def __init__(self, connection_string: Union[str, Dict[str, Any]] = None) -> None: """Initializes the credentials from SQLAlchemy like connection string or from dict holding connection string elements""" diff --git a/dlt/common/destination/capabilities.py b/dlt/common/destination/capabilities.py index 8f0dce79ce..9ef8fad96e 100644 --- a/dlt/common/destination/capabilities.py +++ b/dlt/common/destination/capabilities.py @@ -33,6 +33,7 @@ TTableSchema, TLoaderMergeStrategy, TTableFormat, + TLoaderReplaceStrategy, ) from dlt.common.wei import EVM_DECIMAL_PRECISION @@ -169,7 +170,7 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): supported_merge_strategies: Sequence[TLoaderMergeStrategy] = None merge_strategies_selector: MergeStrategySelector = None - # TODO: also add `supported_replace_strategies` capability + supported_replace_strategies: 
Sequence[TLoaderReplaceStrategy] = None max_parallel_load_jobs: Optional[int] = None """The destination can set the maximum amount of parallel load jobs being executed""" diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 9e27b66335..527b9419e8 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -27,11 +27,14 @@ from dlt.common.configuration.specs.base_configuration import extract_inner_hint from dlt.common.destination.typing import PreparedTableSchema from dlt.common.destination.utils import verify_schema_capabilities, verify_supported_data_types -from dlt.common.exceptions import TerminalValueError +from dlt.common.exceptions import TerminalException from dlt.common.metrics import LoadJobMetrics from dlt.common.normalizers.naming import NamingConvention from dlt.common.schema import Schema, TSchemaTables -from dlt.common.schema.typing import C_DLT_LOAD_ID, _TTableSchemaBase, TWriteDisposition +from dlt.common.schema.typing import ( + C_DLT_LOAD_ID, + TLoaderReplaceStrategy, +) from dlt.common.schema.utils import fill_hints_from_parent_and_clone_table from dlt.common.configuration import configspec, resolve_configuration, known_sections, NotResolved from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration @@ -41,14 +44,12 @@ UnknownDestinationModule, DestinationSchemaTampered, DestinationTransientException, - DestinationTerminalException, ) from dlt.common.schema.exceptions import UnknownTableException from dlt.common.storages import FileStorage from dlt.common.storages.load_storage import ParsedLoadJobFileName from dlt.common.storages.load_package import LoadJobInfo, TPipelineStateDoc -TLoaderReplaceStrategy = Literal["truncate-and-insert", "insert-from-staging", "staging-optimized"] TDestinationConfig = TypeVar("TDestinationConfig", bound="DestinationClientConfiguration") TDestinationClient = TypeVar("TDestinationClient", bound="JobClientBase") 
TDestinationDwhClient = TypeVar("TDestinationDwhClient", bound="DestinationClientDwhConfiguration") @@ -353,7 +354,7 @@ def __init__(self, file_path: str) -> None: # ensure file name super().__init__(file_path) self._state: TLoadJobState = "ready" - self._exception: Exception = None + self._exception: BaseException = None # variables needed by most jobs, set by the loader in set_run_vars self._schema: Schema = None @@ -392,7 +393,7 @@ def run_managed( self._job_client.prepare_load_job_execution(self) self.run() self._state = "completed" - except (DestinationTerminalException, TerminalValueError) as e: + except (TerminalException, AssertionError) as e: self._state = "failed" self._exception = e logger.exception(f"Terminal exception in job {self.job_id()} in file {self._file_path}") diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index 9221cca7ff..2247358331 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -218,6 +218,7 @@ class NormalizerInfo(TypedDict, total=True): TWriteDisposition = Literal["skip", "append", "replace", "merge"] TLoaderMergeStrategy = Literal["delete-insert", "scd2", "upsert"] +TLoaderReplaceStrategy = Literal["truncate-and-insert", "insert-from-staging", "staging-optimized"] WRITE_DISPOSITIONS: Set[TWriteDisposition] = set(get_args(TWriteDisposition)) diff --git a/dlt/common/storages/configuration.py b/dlt/common/storages/configuration.py index 4da44bceee..777b51a488 100644 --- a/dlt/common/storages/configuration.py +++ b/dlt/common/storages/configuration.py @@ -145,6 +145,8 @@ class FilesystemConfiguration(BaseConfiguration): kwargs: Optional[DictStrAny] = None client_kwargs: Optional[DictStrAny] = None deltalake_storage_options: Optional[DictStrAny] = None + max_state_files: int = 100 + """Maximum number of pipeline state files to keep; 0 or negative value disables cleanup.""" @property def protocol(self) -> str: diff --git a/dlt/destinations/impl/athena/factory.py 
b/dlt/destinations/impl/athena/factory.py index 5a7ae1ba8c..1749c135cc 100644 --- a/dlt/destinations/impl/athena/factory.py +++ b/dlt/destinations/impl/athena/factory.py @@ -138,6 +138,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.timestamp_precision = 3 caps.supports_truncate_command = False caps.supported_merge_strategies = ["delete-insert", "upsert", "scd2"] + caps.supported_replace_strategies = ["truncate-and-insert", "insert-from-staging"] caps.merge_strategies_selector = athena_merge_strategies_selector return caps diff --git a/dlt/destinations/impl/bigquery/factory.py b/dlt/destinations/impl/bigquery/factory.py index 7f4fd74825..32a6eb6f82 100644 --- a/dlt/destinations/impl/bigquery/factory.py +++ b/dlt/destinations/impl/bigquery/factory.py @@ -118,6 +118,11 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.supports_clone_table = True caps.schema_supports_numeric_precision = False # no precision information in BigQuery caps.supported_merge_strategies = ["delete-insert", "upsert", "scd2"] + caps.supported_replace_strategies = [ + "truncate-and-insert", + "insert-from-staging", + "staging-optimized", + ] return caps diff --git a/dlt/destinations/impl/clickhouse/factory.py b/dlt/destinations/impl/clickhouse/factory.py index 696c2783ca..7a9e16464f 100644 --- a/dlt/destinations/impl/clickhouse/factory.py +++ b/dlt/destinations/impl/clickhouse/factory.py @@ -139,6 +139,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.supports_truncate_command = True caps.supported_merge_strategies = ["delete-insert", "scd2"] + caps.supported_replace_strategies = ["truncate-and-insert", "insert-from-staging"] return caps diff --git a/dlt/destinations/impl/databricks/databricks.py b/dlt/destinations/impl/databricks/databricks.py index 2cdff8a82c..54d37f8c08 100644 --- a/dlt/destinations/impl/databricks/databricks.py +++ b/dlt/destinations/impl/databricks/databricks.py @@ -2,6 +2,9 @@ from urllib.parse import 
urlparse, urlunparse from dlt import config +from dlt.common.configuration.specs.azure_credentials import ( + AzureServicePrincipalCredentialsWithoutDefaults, +) from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( HasFollowupJobs, @@ -95,7 +98,9 @@ def run(self) -> None: )) """ elif bucket_scheme in AZURE_BLOB_STORAGE_PROTOCOLS: - assert isinstance(staging_credentials, AzureCredentialsWithoutDefaults) + assert isinstance( + staging_credentials, AzureCredentialsWithoutDefaults + ), "AzureCredentialsWithoutDefaults required to pass explicit credential" # Explicit azure credentials are needed to load from bucket without a named stage credentials_clause = f"""WITH(CREDENTIAL(AZURE_SAS_TOKEN='{staging_credentials.azure_storage_sas_token}'))""" bucket_path = self.ensure_databricks_abfss_url( @@ -103,7 +108,13 @@ def run(self) -> None: ) if bucket_scheme in AZURE_BLOB_STORAGE_PROTOCOLS: - assert isinstance(staging_credentials, AzureCredentialsWithoutDefaults) + assert isinstance( + staging_credentials, + ( + AzureCredentialsWithoutDefaults, + AzureServicePrincipalCredentialsWithoutDefaults, + ), + ) bucket_path = self.ensure_databricks_abfss_url( bucket_path, staging_credentials.azure_storage_account_name ) diff --git a/dlt/destinations/impl/databricks/factory.py b/dlt/destinations/impl/databricks/factory.py index b02f191423..a73a575901 100644 --- a/dlt/destinations/impl/databricks/factory.py +++ b/dlt/destinations/impl/databricks/factory.py @@ -134,6 +134,11 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.supports_multiple_statements = False caps.supports_clone_table = True caps.supported_merge_strategies = ["delete-insert", "upsert", "scd2"] + caps.supported_replace_strategies = [ + "truncate-and-insert", + "insert-from-staging", + "staging-optimized", + ] return caps @property diff --git a/dlt/destinations/impl/dremio/factory.py b/dlt/destinations/impl/dremio/factory.py index 
29ec6257e6..997ab419c1 100644 --- a/dlt/destinations/impl/dremio/factory.py +++ b/dlt/destinations/impl/dremio/factory.py @@ -109,6 +109,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.supports_multiple_statements = False caps.timestamp_precision = 3 caps.supported_merge_strategies = ["delete-insert", "scd2"] + caps.supported_replace_strategies = ["truncate-and-insert", "insert-from-staging"] return caps @property diff --git a/dlt/destinations/impl/duckdb/configuration.py b/dlt/destinations/impl/duckdb/configuration.py index bc04552078..ec58d66c8b 100644 --- a/dlt/destinations/impl/duckdb/configuration.py +++ b/dlt/destinations/impl/duckdb/configuration.py @@ -25,11 +25,6 @@ @configspec(init=False) class DuckDbBaseCredentials(ConnectionStringCredentials): - password: Optional[TSecretValue] = None - host: Optional[str] = None - port: Optional[int] = None - database: Optional[str] = None - read_only: bool = False # open database read/write def borrow_conn(self, read_only: bool) -> Any: diff --git a/dlt/destinations/impl/duckdb/factory.py b/dlt/destinations/impl/duckdb/factory.py index 6c2011c549..e3d261d9d6 100644 --- a/dlt/destinations/impl/duckdb/factory.py +++ b/dlt/destinations/impl/duckdb/factory.py @@ -148,6 +148,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.alter_add_multi_column = False caps.supports_truncate_command = False caps.supported_merge_strategies = ["delete-insert", "scd2"] + caps.supported_replace_strategies = ["truncate-and-insert", "insert-from-staging"] return caps diff --git a/dlt/destinations/impl/filesystem/factory.py b/dlt/destinations/impl/filesystem/factory.py index c5218f14a3..2463da58fa 100644 --- a/dlt/destinations/impl/filesystem/factory.py +++ b/dlt/destinations/impl/filesystem/factory.py @@ -51,6 +51,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: "reference", ] caps.has_case_sensitive_identifiers = True + caps.supported_replace_strategies = 
["truncate-and-insert", "insert-from-staging"] return caps @property diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index c9f9797785..3f2f793559 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -3,7 +3,7 @@ import base64 from types import TracebackType -from typing import Dict, List, Type, Iterable, Iterator, Optional, Tuple, Sequence, cast +from typing import Dict, List, Type, Iterable, Iterator, Optional, Tuple, Sequence, cast, Any from fsspec import AbstractFileSystem from contextlib import contextmanager @@ -164,9 +164,12 @@ def _storage_options(self) -> Dict[str, str]: return _deltalake_storage_options(self._job_client.config) def _delta_table(self) -> Optional["DeltaTable"]: # type: ignore[name-defined] # noqa: F821 - from dlt.common.libs.deltalake import try_get_deltatable + from dlt.common.libs.deltalake import DeltaTable - return try_get_deltatable(self.make_remote_url(), storage_options=self._storage_options) + if DeltaTable.is_deltatable(self.make_remote_url(), storage_options=self._storage_options): + return DeltaTable(self.make_remote_url(), storage_options=self._storage_options) + else: + return None @property def _partition_columns(self) -> List[str]: @@ -476,7 +479,9 @@ def _to_path_safe_string(self, s: str) -> str: """for base64 strings""" return base64.b64decode(s).hex() if s else None - def _list_dlt_table_files(self, table_name: str) -> Iterator[Tuple[str, List[str]]]: + def _list_dlt_table_files( + self, table_name: str, pipeline_name: str = None + ) -> Iterator[Tuple[str, List[str]]]: dirname = self.get_table_dir(table_name) if not self.fs_client.exists(self.pathlib.join(dirname, INIT_FILE_NAME)): raise DestinationUndefinedEntity({"dir": dirname}) @@ -485,7 +490,9 @@ def _list_dlt_table_files(self, table_name: str) -> Iterator[Tuple[str, List[str fileparts = filename.split(FILENAME_SEPARATOR) if len(fileparts) != 3: 
continue - yield filepath, fileparts + # Filters only if pipeline_name provided + if pipeline_name is None or fileparts[0] == pipeline_name: + yield filepath, fileparts def _store_load(self, load_id: str) -> None: # write entry to load "table" @@ -520,6 +527,31 @@ def _get_state_file_name(self, pipeline_name: str, version_hash: str, load_id: s f"{pipeline_name}{FILENAME_SEPARATOR}{load_id}{FILENAME_SEPARATOR}{self._to_path_safe_string(version_hash)}.jsonl", ) + def _cleanup_pipeline_states(self, pipeline_name: str) -> None: + state_table_files = list( + self._list_dlt_table_files(self.schema.state_table_name, pipeline_name) + ) + + if len(state_table_files) > self.config.max_state_files: + # filter and collect a list of state files + state_file_info: List[Dict[str, Any]] = [ + { + "load_id": float(fileparts[1]), # convert load_id to float for comparison + "filepath": filepath, + } + for filepath, fileparts in state_table_files + ] + + # sort state file info by load_id in descending order + state_file_info.sort(key=lambda x: x["load_id"], reverse=True) + + # keeping only the most recent MAX_STATE_HISTORY files + files_to_delete = state_file_info[self.config.max_state_files :] + + # delete the old files + for file_info in files_to_delete: + self._delete_file(file_info["filepath"]) + def _store_current_state(self, load_id: str) -> None: # don't save the state this way when used as staging if self.config.as_staging_destination: @@ -539,6 +571,10 @@ def _store_current_state(self, load_id: str) -> None: # write self._write_to_json_file(hash_path, cast(DictStrAny, pipeline_state_doc)) + # perform state cleanup only if max_state_files is set to a positive value + if self.config.max_state_files >= 1: + self._cleanup_pipeline_states(pipeline_name) + def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: # search newest state selected_path = None diff --git a/dlt/destinations/impl/lancedb/factory.py b/dlt/destinations/impl/lancedb/factory.py index 
339453133f..8ce2217007 100644 --- a/dlt/destinations/impl/lancedb/factory.py +++ b/dlt/destinations/impl/lancedb/factory.py @@ -40,6 +40,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.decimal_precision = (38, 18) caps.timestamp_precision = 6 + caps.supported_replace_strategies = ["truncate-and-insert"] return caps diff --git a/dlt/destinations/impl/motherduck/factory.py b/dlt/destinations/impl/motherduck/factory.py index ac5dc70b57..fec1049584 100644 --- a/dlt/destinations/impl/motherduck/factory.py +++ b/dlt/destinations/impl/motherduck/factory.py @@ -40,6 +40,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.supports_truncate_command = False caps.supported_merge_strategies = ["delete-insert", "scd2"] caps.max_parallel_load_jobs = 8 + caps.supported_replace_strategies = ["truncate-and-insert", "insert-from-staging"] return caps diff --git a/dlt/destinations/impl/mssql/configuration.py b/dlt/destinations/impl/mssql/configuration.py index 5b08546f73..a30b300343 100644 --- a/dlt/destinations/impl/mssql/configuration.py +++ b/dlt/destinations/impl/mssql/configuration.py @@ -14,6 +14,8 @@ @configspec(init=False) class MsSqlCredentials(ConnectionStringCredentials): drivername: Final[str] = dataclasses.field(default="mssql", init=False, repr=False, compare=False) # type: ignore + database: str = None + username: str = None password: TSecretValue = None host: str = None port: int = 1433 diff --git a/dlt/destinations/impl/mssql/factory.py b/dlt/destinations/impl/mssql/factory.py index 2fd668bdb6..1dbac8e8f5 100644 --- a/dlt/destinations/impl/mssql/factory.py +++ b/dlt/destinations/impl/mssql/factory.py @@ -109,6 +109,11 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.max_rows_per_insert = 1000 caps.timestamp_precision = 7 caps.supported_merge_strategies = ["delete-insert", "upsert", "scd2"] + caps.supported_replace_strategies = [ + "truncate-and-insert", + "insert-from-staging", + "staging-optimized", 
+ ] return caps diff --git a/dlt/destinations/impl/postgres/configuration.py b/dlt/destinations/impl/postgres/configuration.py index fab398fc21..656d1b3ac1 100644 --- a/dlt/destinations/impl/postgres/configuration.py +++ b/dlt/destinations/impl/postgres/configuration.py @@ -14,6 +14,8 @@ @configspec(init=False) class PostgresCredentials(ConnectionStringCredentials): drivername: Final[str] = dataclasses.field(default="postgresql", init=False, repr=False, compare=False) # type: ignore + database: str = None + username: str = None password: TSecretValue = None host: str = None port: int = 5432 diff --git a/dlt/destinations/impl/postgres/factory.py b/dlt/destinations/impl/postgres/factory.py index 1a33d44577..bde0e35f3d 100644 --- a/dlt/destinations/impl/postgres/factory.py +++ b/dlt/destinations/impl/postgres/factory.py @@ -142,6 +142,11 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.is_max_text_data_type_length_in_bytes = True caps.supports_ddl_transactions = True caps.supported_merge_strategies = ["delete-insert", "upsert", "scd2"] + caps.supported_replace_strategies = [ + "truncate-and-insert", + "insert-from-staging", + "staging-optimized", + ] return caps diff --git a/dlt/destinations/impl/qdrant/factory.py b/dlt/destinations/impl/qdrant/factory.py index f994948d91..49c4511c8d 100644 --- a/dlt/destinations/impl/qdrant/factory.py +++ b/dlt/destinations/impl/qdrant/factory.py @@ -25,6 +25,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.max_text_data_type_length = 8 * 1024 * 1024 caps.is_max_text_data_type_length_in_bytes = False caps.supports_ddl_transactions = False + caps.supported_replace_strategies = ["truncate-and-insert"] return caps diff --git a/dlt/destinations/impl/redshift/factory.py b/dlt/destinations/impl/redshift/factory.py index 20b7df859f..cab30f8e33 100644 --- a/dlt/destinations/impl/redshift/factory.py +++ b/dlt/destinations/impl/redshift/factory.py @@ -135,6 +135,7 @@ def _raw_capabilities(self) -> 
DestinationCapabilitiesContext: caps.supports_ddl_transactions = True caps.alter_add_multi_column = False caps.supported_merge_strategies = ["delete-insert", "scd2"] + caps.supported_replace_strategies = ["truncate-and-insert", "insert-from-staging"] return caps diff --git a/dlt/destinations/impl/snowflake/configuration.py b/dlt/destinations/impl/snowflake/configuration.py index 3fc479f237..de8faa91a6 100644 --- a/dlt/destinations/impl/snowflake/configuration.py +++ b/dlt/destinations/impl/snowflake/configuration.py @@ -56,9 +56,9 @@ def _decode_private_key(private_key: str, password: Optional[str] = None) -> byt @configspec(init=False) class SnowflakeCredentials(ConnectionStringCredentials): drivername: Final[str] = dataclasses.field(default="snowflake", init=False, repr=False, compare=False) # type: ignore[misc] - password: Optional[TSecretStrValue] = None host: str = None database: str = None + username: str = None warehouse: Optional[str] = None role: Optional[str] = None authenticator: Optional[str] = None diff --git a/dlt/destinations/impl/snowflake/factory.py b/dlt/destinations/impl/snowflake/factory.py index 6c2369a5aa..3ed4b39276 100644 --- a/dlt/destinations/impl/snowflake/factory.py +++ b/dlt/destinations/impl/snowflake/factory.py @@ -121,6 +121,11 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.alter_add_multi_column = True caps.supports_clone_table = True caps.supported_merge_strategies = ["delete-insert", "upsert", "scd2"] + caps.supported_replace_strategies = [ + "truncate-and-insert", + "insert-from-staging", + "staging-optimized", + ] return caps @property diff --git a/dlt/destinations/impl/sqlalchemy/configuration.py b/dlt/destinations/impl/sqlalchemy/configuration.py index f99b06a27b..b26c87dfac 100644 --- a/dlt/destinations/impl/sqlalchemy/configuration.py +++ b/dlt/destinations/impl/sqlalchemy/configuration.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Optional, Any, Final, Type, Dict, Union +from typing import 
TYPE_CHECKING, ClassVar, List, Optional, Any, Final, Type, Dict, Union import dataclasses from dlt.common.configuration import configspec @@ -14,8 +14,6 @@ class SqlalchemyCredentials(ConnectionStringCredentials): if TYPE_CHECKING: _engine: Optional["Engine"] = None - username: Optional[str] = None # e.g. sqlite doesn't need username - def __init__( self, connection_string: Optional[Union[str, Dict[str, Any], "Engine"]] = None ) -> None: @@ -49,6 +47,14 @@ def get_dialect(self) -> Optional[Type["Dialect"]]: return type(engine.dialect) return self.to_url().get_dialect() # type: ignore[attr-defined,no-any-return] + __config_gen_annotations__: ClassVar[List[str]] = [ + "database", + "port", + "username", + "password", + "host", + ] + @configspec class SqlalchemyClientConfiguration(DestinationClientDwhConfiguration): diff --git a/dlt/destinations/impl/sqlalchemy/db_api_client.py b/dlt/destinations/impl/sqlalchemy/db_api_client.py index c6c8ba53d6..829fe8db82 100644 --- a/dlt/destinations/impl/sqlalchemy/db_api_client.py +++ b/dlt/destinations/impl/sqlalchemy/db_api_client.py @@ -3,12 +3,12 @@ Iterator, Any, Sequence, - ContextManager, AnyStr, Union, Tuple, List, Dict, + Set, ) from contextlib import contextmanager from functools import wraps @@ -19,6 +19,7 @@ from sqlalchemy.engine import Connection from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.destination.reference import PreparedTableSchema from dlt.destinations.exceptions import ( DatabaseUndefinedRelation, DatabaseTerminalException, @@ -122,6 +123,8 @@ def __init__( self._current_connection: Optional[Connection] = None self._current_transaction: Optional[SqlaTransactionWrapper] = None self.metadata = sa.MetaData() + # Keep a list of datasets already attached on the current connection + self._sqlite_attached_datasets: Set[str] = set() @property def engine(self) -> sa.engine.Engine: @@ -155,6 +158,7 @@ def close_connection(self) -> None: self._current_connection.close() 
self.engine.dispose() finally: + self._sqlite_attached_datasets.clear() self._current_connection = None self._current_transaction = None @@ -234,6 +238,9 @@ def _sqlite_create_dataset(self, dataset_name: str) -> None: """Mimic multiple schemas in sqlite using ATTACH DATABASE to attach a new database file to the current connection. """ + if dataset_name == "main": + # main always exists + return if self._sqlite_is_memory_db(): new_db_fn = ":memory:" else: @@ -241,6 +248,7 @@ def _sqlite_create_dataset(self, dataset_name: str) -> None: statement = "ATTACH DATABASE :fn AS :name" self.execute_sql(statement, fn=new_db_fn, name=dataset_name) + self._sqlite_attached_datasets.add(dataset_name) def _sqlite_drop_dataset(self, dataset_name: str) -> None: """Drop a dataset in sqlite by detaching the database file @@ -252,6 +260,7 @@ def _sqlite_drop_dataset(self, dataset_name: str) -> None: if dataset_name != "main": # main is the default database, it cannot be detached statement = "DETACH DATABASE :name" self.execute_sql(statement, name=dataset_name) + self._sqlite_attached_datasets.discard(dataset_name) fn = dbs[dataset_name] if not fn: # It's a memory database, nothing to do @@ -259,6 +268,15 @@ def _sqlite_drop_dataset(self, dataset_name: str) -> None: # Delete the database file Path(fn).unlink() + @contextmanager + def with_alternative_dataset_name( + self, dataset_name: str + ) -> Iterator[SqlClientBase[Connection]]: + with super().with_alternative_dataset_name(dataset_name): + if self.dialect_name == "sqlite" and dataset_name not in self._sqlite_attached_datasets: + self._sqlite_reattach_dataset_if_exists(dataset_name) + yield self + def create_dataset(self) -> None: if self.dialect_name == "sqlite": return self._sqlite_create_dataset(self.dataset_name) @@ -332,8 +350,10 @@ def make_qualified_table_name(self, table_name: str, escape: bool = True) -> str def fully_qualified_dataset_name(self, escape: bool = True, staging: bool = False) -> str: if staging: - raise 
NotImplementedError("Staging not supported") - return self.dialect.identifier_preparer.format_schema(self.dataset_name) # type: ignore[attr-defined, no-any-return] + dataset_name = self.staging_dataset_name + else: + dataset_name = self.dataset_name + return self.dialect.identifier_preparer.format_schema(dataset_name) # type: ignore[attr-defined, no-any-return] def alter_table_add_columns(self, columns: Sequence[sa.Column]) -> None: if not columns: diff --git a/dlt/destinations/impl/sqlalchemy/factory.py b/dlt/destinations/impl/sqlalchemy/factory.py index 10372cda34..360dd89192 100644 --- a/dlt/destinations/impl/sqlalchemy/factory.py +++ b/dlt/destinations/impl/sqlalchemy/factory.py @@ -21,6 +21,7 @@ if t.TYPE_CHECKING: # from dlt.destinations.impl.sqlalchemy.sqlalchemy_client import SqlalchemyJobClient from dlt.destinations.impl.sqlalchemy.sqlalchemy_job_client import SqlalchemyJobClient + from sqlalchemy.engine import Engine class sqlalchemy(Destination[SqlalchemyClientConfiguration, "SqlalchemyJobClient"]): @@ -45,7 +46,10 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.supports_ddl_transactions = True caps.max_query_parameters = 20_0000 caps.max_rows_per_insert = 10_000 # Set a default to avoid OOM on large datasets + # Multiple concatenated statements are not supported by all engines, so leave them off by default + caps.supports_multiple_statements = False caps.type_mapper = SqlalchemyTypeMapper + caps.supported_replace_strategies = ["truncate-and-insert", "insert-from-staging"] return caps @@ -74,7 +78,7 @@ def client_class(self) -> t.Type["SqlalchemyJobClient"]: def __init__( self, - credentials: t.Union[SqlalchemyCredentials, t.Dict[str, t.Any], str] = None, + credentials: t.Union[SqlalchemyCredentials, t.Dict[str, t.Any], str, "Engine"] = None, destination_name: t.Optional[str] = None, environment: t.Optional[str] = None, engine_args: t.Optional[t.Dict[str, t.Any]] = None, diff --git a/dlt/destinations/impl/sqlalchemy/load_jobs.py 
b/dlt/destinations/impl/sqlalchemy/load_jobs.py new file mode 100644 index 0000000000..c8486dc0f0 --- /dev/null +++ b/dlt/destinations/impl/sqlalchemy/load_jobs.py @@ -0,0 +1,136 @@ +from typing import IO, Any, Dict, Iterator, List, Sequence, TYPE_CHECKING, Optional +import math + +import sqlalchemy as sa + +from dlt.common.destination.reference import ( + RunnableLoadJob, + HasFollowupJobs, + PreparedTableSchema, +) +from dlt.common.storages import FileStorage +from dlt.common.json import json, PY_DATETIME_DECODERS +from dlt.destinations.sql_jobs import SqlFollowupJob, SqlJobParams + +from dlt.destinations.impl.sqlalchemy.db_api_client import SqlalchemyClient + +if TYPE_CHECKING: + from dlt.destinations.impl.sqlalchemy.sqlalchemy_job_client import SqlalchemyJobClient + + +class SqlalchemyJsonLInsertJob(RunnableLoadJob, HasFollowupJobs): + def __init__(self, file_path: str, table: sa.Table) -> None: + super().__init__(file_path) + self._job_client: "SqlalchemyJobClient" = None + self.table = table + + def _open_load_file(self) -> IO[bytes]: + return FileStorage.open_zipsafe_ro(self._file_path, "rb") + + def _iter_data_items(self) -> Iterator[Dict[str, Any]]: + all_cols = {col.name: None for col in self.table.columns} + with FileStorage.open_zipsafe_ro(self._file_path, "rb") as f: + for line in f: + # Decode date/time to py datetime objects. Some drivers have issues with pendulum objects + for item in json.typed_loadb(line, decoders=PY_DATETIME_DECODERS): + # Fill any missing columns in item with None. 
Bulk insert fails when items have different keys + if item.keys() != all_cols.keys(): + yield {**all_cols, **item} + else: + yield item + + def _iter_data_item_chunks(self) -> Iterator[Sequence[Dict[str, Any]]]: + max_rows = self._job_client.capabilities.max_rows_per_insert or math.inf + # Limit by max query length should not be needed, + # bulk insert generates an INSERT template with a single VALUES tuple of placeholders + # If any dialects don't do that we need to check the str length of the query + # TODO: Max params may not be needed. Limits only apply to placeholders in sql string (mysql/sqlite) + max_params = self._job_client.capabilities.max_query_parameters or math.inf + chunk: List[Dict[str, Any]] = [] + params_count = 0 + for item in self._iter_data_items(): + if len(chunk) + 1 == max_rows or params_count + len(item) > max_params: + # Rotate chunk + yield chunk + chunk = [] + params_count = 0 + params_count += len(item) + chunk.append(item) + + if chunk: + yield chunk + + def run(self) -> None: + _sql_client = self._job_client.sql_client + # Copy the table to the current dataset (i.e. 
staging) if needed + # This is a no-op if the table is already in the correct schema + table = self.table.to_metadata( + self.table.metadata, schema=_sql_client.dataset_name # type: ignore[attr-defined] + ) + + with _sql_client.begin_transaction(): + for chunk in self._iter_data_item_chunks(): + _sql_client.execute_sql(table.insert(), chunk) + + +class SqlalchemyParquetInsertJob(SqlalchemyJsonLInsertJob): + def _iter_data_item_chunks(self) -> Iterator[Sequence[Dict[str, Any]]]: + from dlt.common.libs.pyarrow import ParquetFile + + num_cols = len(self.table.columns) + max_rows = self._job_client.capabilities.max_rows_per_insert or None + max_params = self._job_client.capabilities.max_query_parameters or None + read_limit = None + + with ParquetFile(self._file_path) as reader: + if max_params is not None: + read_limit = max(1, math.floor(max_params / num_cols)) + + if max_rows is not None: + if read_limit is None: + read_limit = max_rows + else: + read_limit = min(read_limit, max_rows) + + if read_limit is None: + yield reader.read().to_pylist() + return + + for chunk in reader.iter_batches(batch_size=read_limit): + yield chunk.to_pylist() + + +class SqlalchemyStagingCopyJob(SqlFollowupJob): + @classmethod + def generate_sql( + cls, + table_chain: Sequence[PreparedTableSchema], + sql_client: SqlalchemyClient, # type: ignore[override] + params: Optional[SqlJobParams] = None, + ) -> List[str]: + statements: List[str] = [] + for table in table_chain: + # Tables must have already been created in metadata + table_obj = sql_client.get_existing_table(table["name"]) + staging_table_obj = table_obj.to_metadata( + sql_client.metadata, schema=sql_client.staging_dataset_name + ) + if params and params.get("replace"): + stmt = str(table_obj.delete().compile(dialect=sql_client.dialect)) + if not stmt.endswith(";"): + stmt += ";" + statements.append(stmt) + + stmt = str( + table_obj.insert() + .from_select( + [col.name for col in staging_table_obj.columns], staging_table_obj.select() + ) +
.compile(dialect=sql_client.dialect) + ) + if not stmt.endswith(";"): + stmt += ";" + + statements.append(stmt) + + return statements diff --git a/dlt/destinations/impl/sqlalchemy/sqlalchemy_job_client.py b/dlt/destinations/impl/sqlalchemy/sqlalchemy_job_client.py index c51d3cbe3a..a2514a43e0 100644 --- a/dlt/destinations/impl/sqlalchemy/sqlalchemy_job_client.py +++ b/dlt/destinations/impl/sqlalchemy/sqlalchemy_job_client.py @@ -1,112 +1,35 @@ -from typing import Iterable, Optional, Dict, Any, Iterator, Sequence, List, Tuple, IO +from typing import Iterable, Optional, Sequence, List, Tuple from contextlib import suppress -import math import sqlalchemy as sa +from dlt.common.json import json from dlt.common import logger from dlt.common import pendulum from dlt.common.destination.reference import ( JobClientBase, LoadJob, - RunnableLoadJob, StorageSchemaInfo, StateInfo, PreparedTableSchema, + FollowupJobRequest, ) -from dlt.destinations.job_client_impl import SqlJobClientBase +from dlt.destinations.job_client_impl import SqlJobClientWithStagingDataset, SqlLoadJob from dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.common.schema import Schema, TTableSchema, TColumnSchema, TSchemaTables from dlt.common.schema.typing import TColumnType, TTableSchemaColumns from dlt.common.schema.utils import pipeline_state_table, normalize_table_identifiers -from dlt.common.storages import FileStorage -from dlt.common.json import json, PY_DATETIME_DECODERS from dlt.destinations.exceptions import DatabaseUndefinedRelation - - -# from dlt.destinations.impl.sqlalchemy.sql_client import SqlalchemyClient from dlt.destinations.impl.sqlalchemy.db_api_client import SqlalchemyClient from dlt.destinations.impl.sqlalchemy.configuration import SqlalchemyClientConfiguration +from dlt.destinations.impl.sqlalchemy.load_jobs import ( + SqlalchemyJsonLInsertJob, + SqlalchemyParquetInsertJob, + SqlalchemyStagingCopyJob, +) -class 
SqlalchemyJsonLInsertJob(RunnableLoadJob): - def __init__(self, file_path: str, table: sa.Table) -> None: - super().__init__(file_path) - self._job_client: "SqlalchemyJobClient" = None - self.table = table - - def _open_load_file(self) -> IO[bytes]: - return FileStorage.open_zipsafe_ro(self._file_path, "rb") - - def _iter_data_items(self) -> Iterator[Dict[str, Any]]: - all_cols = {col.name: None for col in self.table.columns} - with FileStorage.open_zipsafe_ro(self._file_path, "rb") as f: - for line in f: - # Decode date/time to py datetime objects. Some drivers have issues with pendulum objects - for item in json.typed_loadb(line, decoders=PY_DATETIME_DECODERS): - # Fill any missing columns in item with None. Bulk insert fails when items have different keys - if item.keys() != all_cols.keys(): - yield {**all_cols, **item} - else: - yield item - - def _iter_data_item_chunks(self) -> Iterator[Sequence[Dict[str, Any]]]: - max_rows = self._job_client.capabilities.max_rows_per_insert or math.inf - # Limit by max query length should not be needed, - # bulk insert generates an INSERT template with a single VALUES tuple of placeholders - # If any dialects don't do that we need to check the str length of the query - # TODO: Max params may not be needed. 
Limits only apply to placeholders in sql string (mysql/sqlite) - max_params = self._job_client.capabilities.max_query_parameters or math.inf - chunk: List[Dict[str, Any]] = [] - params_count = 0 - for item in self._iter_data_items(): - if len(chunk) + 1 == max_rows or params_count + len(item) > max_params: - # Rotate chunk - yield chunk - chunk = [] - params_count = 0 - params_count += len(item) - chunk.append(item) - - if chunk: - yield chunk - - def run(self) -> None: - _sql_client = self._job_client.sql_client - - with _sql_client.begin_transaction(): - for chunk in self._iter_data_item_chunks(): - _sql_client.execute_sql(self.table.insert(), chunk) - - -class SqlalchemyParquetInsertJob(SqlalchemyJsonLInsertJob): - def _iter_data_item_chunks(self) -> Iterator[Sequence[Dict[str, Any]]]: - from dlt.common.libs.pyarrow import ParquetFile - - num_cols = len(self.table.columns) - max_rows = self._job_client.capabilities.max_rows_per_insert or None - max_params = self._job_client.capabilities.max_query_parameters or None - read_limit = None - - with ParquetFile(self._file_path) as reader: - if max_params is not None: - read_limit = math.floor(max_params / num_cols) - - if max_rows is not None: - if read_limit is None: - read_limit = max_rows - else: - read_limit = min(read_limit, max_rows) - - if read_limit is None: - yield reader.read().to_pylist() - return - - for chunk in reader.iter_batches(batch_size=read_limit): - yield chunk.to_pylist() - - -class SqlalchemyJobClient(SqlJobClientBase): +class SqlalchemyJobClient(SqlJobClientWithStagingDataset): sql_client: SqlalchemyClient # type: ignore[assignment] def __init__( @@ -117,7 +40,7 @@ def __init__( ) -> None: self.sql_client = SqlalchemyClient( config.normalize_dataset_name(schema), - None, + config.normalize_staging_dataset_name(schema), config.credentials, capabilities, engine_args=config.engine_args, @@ -157,9 +80,37 @@ def _to_column_object( unique=schema_column.get("unique", False), ) + def 
_create_replace_followup_jobs( + self, table_chain: Sequence[PreparedTableSchema] + ) -> List[FollowupJobRequest]: + if self.config.replace_strategy in ["insert-from-staging", "staging-optimized"]: + # Make sure all tables are generated in metadata before creating the job + for table in table_chain: + self._to_table_object(table) + return [ + SqlalchemyStagingCopyJob.from_table_chain( + table_chain, self.sql_client, {"replace": True} + ) + ] + return [] + + def _create_merge_followup_jobs( + self, table_chain: Sequence[PreparedTableSchema] + ) -> List[FollowupJobRequest]: + for table in table_chain: + self._to_table_object(table) + return [ + SqlalchemyStagingCopyJob.from_table_chain( + table_chain, self.sql_client, {"replace": False} + ) + ] + def create_load_job( self, table: PreparedTableSchema, file_path: str, load_id: str, restore: bool = False ) -> LoadJob: + job = super().create_load_job(table, file_path, load_id, restore) + if job is not None: + return job if file_path.endswith(".typed-jsonl"): table_obj = self._to_table_object(table) return SqlalchemyJsonLInsertJob(file_path, table_obj) diff --git a/dlt/destinations/impl/synapse/factory.py b/dlt/destinations/impl/synapse/factory.py index f035f2f713..14ce622f8b 100644 --- a/dlt/destinations/impl/synapse/factory.py +++ b/dlt/destinations/impl/synapse/factory.py @@ -100,6 +100,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.timestamp_precision = 7 caps.supported_merge_strategies = ["delete-insert", "scd2"] + caps.supported_replace_strategies = ["truncate-and-insert", "insert-from-staging"] return caps diff --git a/dlt/destinations/impl/weaviate/factory.py b/dlt/destinations/impl/weaviate/factory.py index a5c1e9f2a1..7cb71d4944 100644 --- a/dlt/destinations/impl/weaviate/factory.py +++ b/dlt/destinations/impl/weaviate/factory.py @@ -61,6 +61,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.is_max_text_data_type_length_in_bytes = False 
caps.supports_ddl_transactions = False caps.naming_convention = "dlt.destinations.impl.weaviate.naming" + caps.supported_replace_strategies = ["truncate-and-insert"] return caps diff --git a/dlt/load/load.py b/dlt/load/load.py index 3b231f8fa9..73117e5499 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -5,6 +5,7 @@ import os from dlt.common import logger +from dlt.common.exceptions import TerminalException from dlt.common.metrics import LoadJobMetrics from dlt.common.runtime.signals import sleep from dlt.common.configuration import with_config, known_sections @@ -197,7 +198,7 @@ def submit_job( " extension could not be associated with job type and that indicates an error" " in the code." ) - except DestinationTerminalException: + except (TerminalException, AssertionError): job = FinalizedLoadJobWithFollowupJobs.from_file_path( file_path, "failed", pretty_format_exception() ) diff --git a/dlt/normalize/validate.py b/dlt/normalize/validate.py index d680b5bddd..648deb5da9 100644 --- a/dlt/normalize/validate.py +++ b/dlt/normalize/validate.py @@ -1,7 +1,11 @@ from dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.common.schema import Schema from dlt.common.schema.typing import TTableSchema -from dlt.common.schema.utils import find_incomplete_columns +from dlt.common.schema.utils import ( + find_incomplete_columns, + get_first_column_name_with_prop, + is_nested_table, +) from dlt.common.schema.exceptions import UnboundColumnException from dlt.common import logger @@ -41,3 +45,13 @@ def verify_normalized_table( f"`{table['table_format']}` for table `{table['name']}`. " "The setting will probably be ignored." 
) + + parent_key = get_first_column_name_with_prop(table, "parent_key") + if parent_key and not is_nested_table(table): + logger.warning( + f"Table {table['name']} has parent_key on column {parent_key} but no corresponding" + " `parent` table hint to refer to parent table. Such table is not considered a nested" + " table and relational normalizer will not generate linking data. The most probable" + " cause is manual modification of the dlt schema for the table. The most probable" + f" outcome will be NULL violation during the load process on {parent_key}." + ) diff --git a/dlt/sources/rest_api/__init__.py b/dlt/sources/rest_api/__init__.py index fa6b691933..b92ed6301c 100644 --- a/dlt/sources/rest_api/__init__.py +++ b/dlt/sources/rest_api/__init__.py @@ -211,7 +211,7 @@ def rest_api_resources(config: RESTAPIConfig) -> List[DltResource]: def create_resources( client_config: ClientConfig, dependency_graph: graphlib.TopologicalSorter, - endpoint_resource_map: Dict[str, EndpointResource], + endpoint_resource_map: Dict[str, Union[EndpointResource, DltResource]], resolved_param_map: Dict[str, Optional[ResolvedParam]], ) -> Dict[str, DltResource]: resources = {} @@ -219,6 +219,10 @@ def create_resources( for resource_name in dependency_graph.static_order(): resource_name = cast(str, resource_name) endpoint_resource = endpoint_resource_map[resource_name] + if isinstance(endpoint_resource, DltResource): + resources[resource_name] = endpoint_resource + continue + endpoint_config = cast(Endpoint, endpoint_resource["endpoint"]) request_params = endpoint_config.get("params", {}) request_json = endpoint_config.get("json", None) @@ -245,6 +249,7 @@ def create_resources( headers=client_config.get("headers"), auth=create_auth(client_config.get("auth")), paginator=create_paginator(client_config.get("paginator")), + session=client_config.get("session"), ) hooks = create_response_hooks(endpoint_config.get("response_actions")) diff --git a/dlt/sources/rest_api/config_setup.py
b/dlt/sources/rest_api/config_setup.py index 7bf6c81634..916715b214 100644 --- a/dlt/sources/rest_api/config_setup.py +++ b/dlt/sources/rest_api/config_setup.py @@ -52,6 +52,8 @@ OAuth2ClientCredentials, ) +from dlt.extract.resource import DltResource + from .typing import ( EndpointResourceBase, AuthConfig, @@ -269,35 +271,20 @@ def make_parent_key_name(resource_name: str, field_name: str) -> str: def build_resource_dependency_graph( resource_defaults: EndpointResourceBase, - resource_list: List[Union[str, EndpointResource]], -) -> Tuple[Any, Dict[str, EndpointResource], Dict[str, Optional[ResolvedParam]]]: + resource_list: List[Union[str, EndpointResource, DltResource]], +) -> Tuple[ + Any, Dict[str, Union[EndpointResource, DltResource]], Dict[str, Optional[ResolvedParam]] +]: dependency_graph = graphlib.TopologicalSorter() - endpoint_resource_map: Dict[str, EndpointResource] = {} resolved_param_map: Dict[str, ResolvedParam] = {} - - # expand all resources and index them - for resource_kwargs in resource_list: - if isinstance(resource_kwargs, dict): - # clone resource here, otherwise it needs to be cloned in several other places - # note that this clones only dict structure, keeping all instances without deepcopy - resource_kwargs = update_dict_nested({}, resource_kwargs) # type: ignore - - endpoint_resource = _make_endpoint_resource(resource_kwargs, resource_defaults) - assert isinstance(endpoint_resource["endpoint"], dict) - _setup_single_entity_endpoint(endpoint_resource["endpoint"]) - _bind_path_params(endpoint_resource) - - resource_name = endpoint_resource["name"] - assert isinstance( - resource_name, str - ), f"Resource name must be a string, got {type(resource_name)}" - - if resource_name in endpoint_resource_map: - raise ValueError(f"Resource {resource_name} has already been defined") - endpoint_resource_map[resource_name] = endpoint_resource + endpoint_resource_map = expand_and_index_resources(resource_list, resource_defaults) # create dependency graph 
for resource_name, endpoint_resource in endpoint_resource_map.items(): + if isinstance(endpoint_resource, DltResource): + dependency_graph.add(resource_name) + resolved_param_map[resource_name] = None + continue assert isinstance(endpoint_resource["endpoint"], dict) # connect transformers to resources via resolved params resolved_params = _find_resolved_params(endpoint_resource["endpoint"]) @@ -322,6 +309,37 @@ def build_resource_dependency_graph( return dependency_graph, endpoint_resource_map, resolved_param_map +def expand_and_index_resources( + resource_list: List[Union[str, EndpointResource, DltResource]], + resource_defaults: EndpointResourceBase, +) -> Dict[str, Union[EndpointResource, DltResource]]: + endpoint_resource_map: Dict[str, Union[EndpointResource, DltResource]] = {} + for resource in resource_list: + if isinstance(resource, DltResource): + endpoint_resource_map[resource.name] = resource + continue + elif isinstance(resource, dict): + # clone resource here, otherwise it needs to be cloned in several other places + # note that this clones only dict structure, keeping all instances without deepcopy + resource = update_dict_nested({}, resource) # type: ignore + + endpoint_resource = _make_endpoint_resource(resource, resource_defaults) + assert isinstance(endpoint_resource["endpoint"], dict) + _setup_single_entity_endpoint(endpoint_resource["endpoint"]) + _bind_path_params(endpoint_resource) + + resource_name = endpoint_resource["name"] + assert isinstance( + resource_name, str + ), f"Resource name must be a string, got {type(resource_name)}" + + if resource_name in endpoint_resource_map: + raise ValueError(f"Resource {resource_name} has already been defined") + endpoint_resource_map[resource_name] = endpoint_resource + + return endpoint_resource_map + + def _make_endpoint_resource( resource: Union[str, EndpointResource], default_config: EndpointResourceBase ) -> EndpointResource: @@ -383,7 +401,7 @@ def _bind_path_params(resource: EndpointResource) -> None:
if param_type != "resolve": raise ValueError( f"The path {path} defined in resource {resource['name']} tries to bind" - f" param {name} with type {param_type}. Paths can only bind 'resource'" + f" param {name} with type {param_type}. Paths can only bind 'resolve'" " type params." ) # resolved params are bound later @@ -505,8 +523,7 @@ def response_action_hook(response: Response, *args: Any, **kwargs: Any) -> None: hook(response) elif action_type == "ignore": logger.info( - f"Ignoring response with code {response.status_code} " - f"and content '{response.json()}'." + f"Ignoring response with code {response.status_code} and content '{response.text}'." ) raise IgnoreResponseException diff --git a/dlt/sources/rest_api/typing.py b/dlt/sources/rest_api/typing.py index 22a9560433..81c53887f1 100644 --- a/dlt/sources/rest_api/typing.py +++ b/dlt/sources/rest_api/typing.py @@ -34,6 +34,9 @@ from dlt.extract.items import TTableHintTemplate from dlt.extract.incremental.typing import LastValueFunc +from dlt.extract.resource import DltResource + +from requests import Session from dlt.sources.helpers.rest_client.typing import HTTPMethodBasic @@ -187,6 +190,7 @@ class ClientConfig(TypedDict, total=False): headers: Optional[Dict[str, str]] auth: Optional[AuthConfig] paginator: Optional[PaginatorConfig] + session: Optional[Session] class IncrementalRESTArgs(IncrementalArgs, total=False): @@ -273,7 +277,7 @@ class EndpointResource(EndpointResourceBase, total=False): class RESTAPIConfigBase(TypedDict): client: ClientConfig - resources: List[Union[str, EndpointResource]] + resources: List[Union[str, EndpointResource, DltResource]] class RESTAPIConfig(RESTAPIConfigBase, total=False): diff --git a/docs/tools/fix_grammar_gpt.py b/docs/tools/fix_grammar_gpt.py index 9979a92b41..5e602f3cee 100644 --- a/docs/tools/fix_grammar_gpt.py +++ b/docs/tools/fix_grammar_gpt.py @@ -17,13 +17,15 @@ # constants BASE_DIR = "../website/docs" -GPT_MODEL = "gpt-3.5-turbo-0125" -MAX_CHUNK_SIZE = 14000 # 
make sure that this is below the context window size of the model to not have cut off files +GPT_MODEL = "gpt-4-turbo" +MAX_CHUNK_SIZE = 4000 # make sure that this is below the context window size of the model to not have cut off files SYSTEM_PROMPT = """\ You are a grammar checker. Every message you get will be a document that is to be grammarchecked and returned as such. You will not change the markdown syntax. You will only fix the grammar. You will not change the code snippets except for the comments therein. You will not modify the header section which is enclosed by two occurences of "---". +Make sure all headings use the Sentence case. +Never insert any codeblock start or end statements such as "```" Do not change the spelling or casing of these words: dlt, sdf, dbt """ @@ -51,6 +53,22 @@ type=str, ) + parser.add_argument( + "-o", + "--offset", + help="File count offset from where to start fixing", + default=0, + type=int, + ) + + parser.add_argument( + "-l", + "--limit", + help="File count limit, how many files to process", + default=100000, + type=int, + ) + # get args args = parser.parse_args() @@ -63,14 +81,26 @@ # run grammar check count = 0 + processed = 0 for file_path in markdown_files: count += 1 + if count <= args.offset: + continue + + if processed >= args.limit: + break + + processed += 1 + fmt.note(f"Fixing grammar for file {file_path} ({count} of {len(markdown_files)})") with open(file_path, "r", encoding="utf-8") as f: doc = f.readlines() + with open(file_path, "r", encoding="utf-8") as f: + doc_length = len(f.read()) + def get_chunk_length(chunk: List[str]) -> int: count = 0 for line in chunk: @@ -80,8 +110,11 @@ def get_chunk_length(chunk: List[str]) -> int: # cut file into sections sections: List[List[str]] = [] current_section: List[str] = [] + is_in_code_block: bool = False for line in doc: - if line.startswith("#"): + if "```" in line: + is_in_code_block = not is_in_code_block + if line.startswith("#") and not is_in_code_block: if 
current_section: sections.append(current_section) current_section = [line] @@ -106,27 +139,42 @@ def get_chunk_length(chunk: List[str]) -> int: # sanity test, make sure we still have the full doc assert doc == functools.reduce(lambda a, b: a + b, chunks) - fmt.note(f"Created {len(chunks)} chunks") + # count chars in doc + fmt.note(f"Created {len(chunks)} chunks for {doc_length} chars") - fixed_chunks: List[List[str]] = [] + fixed_chunks: List[str] = [] for chunk in chunks: client = OpenAI() + in_string = "".join(chunk) response = client.chat.completions.create( + seed=123981298, model=GPT_MODEL, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": "".join(chunk)}, + {"role": "user", "content": in_string}, ], temperature=0, ) - - fixed_chunks.append(response.choices[0].message.content) # type: ignore + fixed_chunks.append(response.choices[0].message.content) + + # here we check that no part of the doc was swallowed by gpt + fixed_doc_length = functools.reduce( + lambda count, chunk: count + len(chunk), fixed_chunks, 0 + ) + if fixed_doc_length / doc_length < 0.9: + fmt.error( + "Doc length reduced too much during processing, skipping saving, please check" + " manually" + ) + continue with open(file_path, "w", encoding="utf-8") as f: for c in fixed_chunks: - f.writelines(c) + f.write(c) + f.write("\n") + f.write("\n") if count == 0: fmt.warning("No files selected for grammar check.") else: - fmt.note(f"Fixed grammar for {count} files.") + fmt.note(f"Fixed grammar for {processed} files.") diff --git a/docs/website/docs/_book-onboarding-call.md b/docs/website/docs/_book-onboarding-call.md index 4725128bf0..561a479299 100644 --- a/docs/website/docs/_book-onboarding-call.md +++ b/docs/website/docs/_book-onboarding-call.md @@ -1 +1,2 @@ -book a call with a dltHub Solutions Engineer +Book a call with a dltHub Solutions Engineer + diff --git a/docs/website/docs/build-a-pipeline-tutorial.md b/docs/website/docs/build-a-pipeline-tutorial.md 
index de1a4d647f..0fe483c944 100644 --- a/docs/website/docs/build-a-pipeline-tutorial.md +++ b/docs/website/docs/build-a-pipeline-tutorial.md @@ -6,21 +6,19 @@ keywords: [getting started, quick start, basics] # Building data pipelines with `dlt`, from basic to advanced -This in-depth overview will take you through the main areas of pipelining with `dlt`. Go to the -related pages you are instead looking for the [quickstart](./intro.md). +This in-depth overview will take you through the main areas of pipelining with `dlt`. If you are looking for the [quickstart](./intro.md), go to the related pages. ## Why build pipelines with `dlt`? -`dlt` offers functionality to support the entire extract and load process. Let's look at the high level diagram: +`dlt` offers functionality to support the entire extract and load process. Let's look at the high-level diagram: ![dlt source resource pipe diagram](/img/dlt-high-level.png) +First, we have a `pipeline` function that can infer a schema from data and load the data to the destination. +We can use this pipeline with JSON data, dataframes, or other iterable objects such as generator functions. -First, we have a `pipeline` function, that can infer a schema from data and load the data to the destination. -We can use this pipeline with json data, dataframes, or other iterable objects such as generator functions. - -This pipeline provides effortless loading via a schema discovery, versioning and evolution -engine that ensures you can "just load" any data with row and column level lineage. +This pipeline provides effortless loading via a schema discovery, versioning, and evolution +engine that ensures you can "just load" any data with row and column-level lineage. By utilizing a `dlt pipeline`, we can easily adapt and structure data as it evolves, reducing the time spent on maintenance and development. @@ -28,11 +26,10 @@ maintenance and development. 
This allows our data team to focus on leveraging the data and driving value, while ensuring effective governance through timely notifications of any changes. -For extract, `dlt` also provides `source` and `resource` decorators that enable defining +For extraction, `dlt` also provides `source` and `resource` decorators that enable defining how extracted data should be loaded, while supporting graceful, scalable extraction via micro-batching and parallelism. - ## The simplest pipeline: 1 liner to load data with schema evolution ```py @@ -41,12 +38,9 @@ import dlt dlt.pipeline(destination='duckdb', dataset_name='mydata').run([{'id': 1, 'name': 'John'}], table_name="users") ``` -A pipeline in the `dlt` library is a powerful tool that allows you to move data from your Python code -to a destination with a single function call. By defining a pipeline, you can easily load, -normalize, and evolve your data schemas, enabling seamless data integration and analysis. +A pipeline in the `dlt` library is a powerful tool that allows you to move data from your Python code to a destination with a single function call. By defining a pipeline, you can easily load, normalize, and evolve your data schemas, enabling seamless data integration and analysis. -For example, let's consider a scenario where you want to load a list of objects into a DuckDB table -named "three". With `dlt`, you can create a pipeline and run it with just a few lines of code: +For example, let's consider a scenario where you want to load a list of objects into a DuckDB table named "three". With `dlt`, you can create a pipeline and run it with just a few lines of code: 1. [Create a pipeline](./walkthroughs/create-a-pipeline.md) to the [destination](dlt-ecosystem/destinations). 1. Give this pipeline data and [run it](./walkthroughs/run-a-pipeline.md). 
@@ -67,20 +61,15 @@ info = pipeline.run(data, table_name="countries") print(info) ``` -In this example, the `pipeline` function is used to create a pipeline with the specified -destination (DuckDB) and dataset name ("country_data"). The `run` method is then called to load -the data from a list of objects into the table named "countries". The `info` variable stores -information about the loaded data, such as package IDs and job metadata. +In this example, the `pipeline` function is used to create a pipeline with the specified destination (DuckDB) and dataset name ("country_data"). The `run` method is then called to load the data from a list of objects into the table named "countries". The `info` variable stores information about the loaded data, such as package IDs and job metadata. -The data you can pass to it should be iterable: lists of rows, generators, or `dlt` sources will do -just fine. +The data you can pass to it should be iterable: lists of rows, generators, or `dlt` sources will do just fine. -If you want to configure how the data is loaded, you can choose between `write_disposition`s -such as `replace`, `append` and `merge` in the pipeline function. +If you want to configure how the data is loaded, you can choose between `write_disposition`s such as `replace`, `append`, and `merge` in the pipeline function. Here is an example where we load some data to duckdb by `upserting` or `merging` on the id column found in the data. In this example, we also run a dbt package and then load the outcomes of the load jobs into their respective tables. -This will enable us to log when schema changes occurred and match them to the loaded data for lineage, granting us both column and row level lineage. +This will enable us to log when schema changes occurred and match them to the loaded data for lineage, granting us both column and row-level lineage. We also alert the schema change to a Slack channel where hopefully the producer and consumer are subscribed. 
```py @@ -137,55 +126,34 @@ for package in load_info.load_packages: ## Extracting data with `dlt` -Extracting data with `dlt` is simple - you simply decorate your data-producing functions with loading -or incremental extraction metadata, which enables `dlt` to extract and load by your custom logic. +Extracting data with `dlt` is simple - you simply decorate your data-producing functions with loading or incremental extraction metadata, which enables `dlt` to extract and load by your custom logic. Technically, two key aspects contribute to `dlt`'s effectiveness: -- Scalability through iterators, chunking, parallelization. -- The utilization of implicit extraction DAGs that allow efficient API calls for data - enrichments or transformations. +- Scalability through iterators, chunking, and parallelization. +- The utilization of implicit extraction DAGs that allow efficient API calls for data enrichments or transformations. ### Scalability via iterators, chunking, and parallelization -`dlt` offers scalable data extraction by leveraging iterators, chunking, and parallelization -techniques. This approach allows for efficient processing of large datasets by breaking them down -into manageable chunks. +`dlt` offers scalable data extraction by leveraging iterators, chunking, and parallelization techniques. This approach allows for efficient processing of large datasets by breaking them down into manageable chunks. -For example, consider a scenario where you need to extract data from a massive database with -millions of records. Instead of loading the entire dataset at once, `dlt` allows you to use -iterators to fetch data in smaller, more manageable portions. This technique enables incremental -processing and loading, which is particularly useful when dealing with limited memory resources. +For example, consider a scenario where you need to extract data from a massive database with millions of records. 
Instead of loading the entire dataset at once, `dlt` allows you to use iterators to fetch data in smaller, more manageable portions. This technique enables incremental processing and loading, which is particularly useful when dealing with limited memory resources. -Furthermore, `dlt` facilitates parallelization during the extraction process. By processing -multiple data chunks simultaneously, `dlt` takes advantage of parallel processing capabilities, -resulting in significantly reduced extraction times. This parallelization enhances performance, -especially when dealing with high-volume data sources. +Furthermore, `dlt` facilitates parallelization during the extraction process. By processing multiple data chunks simultaneously, `dlt` takes advantage of parallel processing capabilities, resulting in significantly reduced extraction times. This parallelization enhances performance, especially when dealing with high-volume data sources. ### Implicit extraction DAGs -`dlt` incorporates the concept of implicit extraction DAGs to handle the dependencies between -data sources and their transformations automatically. A DAG represents a directed graph without -cycles, where each node represents a data source or transformation step. +`dlt` incorporates the concept of implicit extraction DAGs to handle the dependencies between data sources and their transformations automatically. A DAG represents a directed graph without cycles, where each node represents a data source or transformation step. -When using `dlt`, the tool automatically generates an extraction DAG based on the dependencies -identified between the data sources and their transformations. This extraction DAG determines the -optimal order for extracting the resources to ensure data consistency and integrity. +When using `dlt`, the tool automatically generates an extraction DAG based on the dependencies identified between the data sources and their transformations. 
This extraction DAG determines the optimal order for extracting the resources to ensure data consistency and integrity. -For instance, imagine a pipeline where data needs to be extracted from multiple API endpoints and -undergo certain transformations or enrichments via additional calls before loading it into a -database. `dlt` analyzes the dependencies between the API endpoints and transformations and -generates an extraction DAG accordingly. The extraction DAG ensures that the data is extracted in -the correct order, accounting for any dependencies and transformations. +For instance, imagine a pipeline where data needs to be extracted from multiple API endpoints and undergo certain transformations or enrichments via additional calls before loading it into a database. `dlt` analyzes the dependencies between the API endpoints and transformations and generates an extraction DAG accordingly. The extraction DAG ensures that the data is extracted in the correct order, accounting for any dependencies and transformations. -When deploying to Airflow, the internal DAG is unpacked into Airflow tasks in such a way to ensure -consistency and allow granular loading. +When deploying to Airflow, the internal DAG is unpacked into Airflow tasks in a way that ensures consistency and allows granular loading. -## Defining Incremental Loading +## Defining incremental loading -[Incremental loading](general-usage/incremental-loading.md) is a crucial concept in data pipelines that involves loading only new or changed -data instead of reloading the entire dataset. This approach provides several benefits, including -low-latency data transfer and cost savings. +[Incremental loading](general-usage/incremental-loading.md) is a crucial concept in data pipelines that involves loading only new or changed data instead of reloading the entire dataset. This approach provides several benefits, including low-latency data transfer and cost savings. 
### Declarative loading @@ -197,10 +165,10 @@ behavior using the `write_disposition` parameter. There are three options availa source on the current run. You can achieve this by setting `write_disposition='replace'` in your resources. It is suitable for stateless data that doesn't change, such as recorded events like page views. -1. Append: The append option adds new data to the existing destination dataset. By using +2. Append: The append option adds new data to the existing destination dataset. By using `write_disposition='append'`, you can ensure that only new records are loaded. This is suitable for stateless data that can be easily appended without any conflicts. -1. Merge: The merge option is used when you want to merge new data with the existing destination +3. Merge: The merge option is used when you want to merge new data with the existing destination dataset while also handling deduplication or upserts. It requires the use of `merge_key` and/or `primary_key` to identify and update specific records. By setting `write_disposition='merge'`, you can perform merge-based incremental loading. @@ -226,15 +194,15 @@ incrementally, deduplicating it, and performing the necessary merge operations. Advanced state management in `dlt` allows you to store and retrieve values across pipeline runs by persisting them at the destination but accessing them in a dictionary in code. This enables you to track and manage incremental loading effectively. By leveraging the pipeline state, you can -preserve information, such as last values, checkpoints or column renames, and utilize them later in +preserve information, such as last values, checkpoints, or column renames, and utilize them later in the pipeline. -## Transforming the Data +## Transforming the data Data transformation plays a crucial role in the data loading process. You can perform transformations both before and after loading the data. 
Here's how you can achieve it: -### Before Loading +### Before loading Before loading the data, you have the flexibility to perform transformations using Python. You can leverage Python's extensive libraries and functions to manipulate and preprocess the data as needed. @@ -248,16 +216,13 @@ consistent mapping. The `dummy_source` generates dummy data with an `id` and `na column, and the `add_map` function applies the `pseudonymize_name` transformation to each record. -### After Loading +### After loading For transformations after loading the data, you have several options available: #### [Using dbt](dlt-ecosystem/transformations/dbt/dbt.md) -dbt is a powerful framework for transforming data. It enables you to structure your transformations -into DAGs, providing cross-database compatibility and various features such as templating, -backfills, testing, and troubleshooting. You can use the dbt runner in `dlt` to seamlessly -integrate dbt into your pipeline. Here's an example of running a dbt package after loading the data: +dbt is a powerful framework for transforming data. It enables you to structure your transformations into DAGs, providing cross-database compatibility and various features such as templating, backfills, testing, and troubleshooting. You can use the dbt runner in `dlt` to seamlessly integrate dbt into your pipeline. Here's an example of running a dbt package after loading the data: ```py import dlt @@ -284,7 +249,7 @@ pipeline = dlt.pipeline( # make venv and install dbt in it. venv = dlt.dbt.get_venv(pipeline) -# get package from local or github link and run +# get package from local or GitHub link and run dbt = dlt.dbt.package(pipeline, "pipedrive/dbt_pipedrive/pipedrive", venv=venv) models = dbt.run_all() @@ -293,17 +258,11 @@ for m in models: print(f"Model {m.model_name} materialized in {m.time} with status {m.status} and message {m.message}") ``` -In this example, the first pipeline loads the data using `pipedrive_source()`. 
The second -pipeline performs transformations using a dbt package called `pipedrive` after loading the data. -The `dbt.package` function sets up the dbt runner, and `dbt.run_all()` executes the dbt -models defined in the package. +In this example, the first pipeline loads the data using `pipedrive_source()`. The second pipeline performs transformations using a dbt package called `pipedrive` after loading the data. The `dbt.package` function sets up the dbt runner, and `dbt.run_all()` executes the dbt models defined in the package. #### [Using the `dlt` SQL client](dlt-ecosystem/transformations/sql.md) -Another option is to leverage the `dlt` SQL client to query the loaded data and perform -transformations using SQL statements. You can execute SQL statements that change the database schema -or manipulate data within tables. Here's an example of inserting a row into the `customers` -table using the `dlt` SQL client: +Another option is to leverage the `dlt` SQL client to query the loaded data and perform transformations using SQL statements. You can execute SQL statements that change the database schema or manipulate data within tables. Here's an example of inserting a row into the `customers` table using the `dlt` SQL client: ```py pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm") @@ -314,14 +273,11 @@ with pipeline.sql_client() as client: ) ``` -In this example, the `execute_sql` method of the SQL client allows you to execute SQL -statements. The statement inserts a row with values into the `customers` table. +In this example, the `execute_sql` method of the SQL client allows you to execute SQL statements. The statement inserts a row with values into the `customers` table. #### [Using Pandas](dlt-ecosystem/transformations/pandas.md) -You can fetch query results as Pandas data frames and perform transformations using Pandas -functionalities. 
Here's an example of reading data from the `issues` table in DuckDB and -counting reaction types using Pandas: +You can fetch query results as Pandas data frames and perform transformations using Pandas functionalities. Here's an example of reading data from the `issues` table in DuckDB and counting reaction types using Pandas: ```py pipeline = dlt.pipeline( @@ -340,10 +296,9 @@ with pipeline.sql_client() as client: counts = reactions.sum(0).sort_values(0, ascending=False) ``` -By leveraging these transformation options, you can shape and manipulate the data before or after -loading it, allowing you to meet specific requirements and ensure data quality and consistency. +By leveraging these transformation options, you can shape and manipulate the data before or after loading it, allowing you to meet specific requirements and ensure data quality and consistency. -## Adjusting the automated normalisation +## Adjusting the automated normalization To streamline the process, `dlt` recommends attaching schemas to sources implicitly instead of creating them explicitly. You can provide a few global schema settings and let the table and column @@ -356,12 +311,12 @@ By adjusting the automated normalization process in `dlt`, you can ensure that t schema meets your specific requirements and aligns with your preferred naming conventions, data types, and other customization needs. -### Customizing the Normalization Process +### Customizing the normalization process Customizing the normalization process in `dlt` allows you to adapt it to your specific requirements. You can adjust table and column names, configure column properties, define data type autodetectors, -apply performance hints, specify preferred data types, or change how ids are propagated in the +apply performance hints, specify preferred data types, or change how IDs are propagated in the unpacking process. 
These customization options enable you to create a schema that aligns with your desired naming @@ -370,7 +325,7 @@ the normalization process to meet your unique needs and achieve optimal results. Read more about how to configure [schema generation.](general-usage/schema.md) -### Exporting and Importing Schema Files +### Exporting and importing schema files `dlt` allows you to export and import schema files, which contain the structure and instructions for processing and loading the data. Exporting schema files enables you to modify them directly, making @@ -379,12 +334,12 @@ use them in your pipeline. Read more: [Adjust a schema docs.](./walkthroughs/adjust-a-schema.md) -## Governance Support in `dlt` Pipelines +## Governance support in `dlt` pipelines `dlt` pipelines offer robust governance support through three key mechanisms: pipeline metadata utilization, schema enforcement and curation, and schema change alerts. -### Pipeline Metadata +### Pipeline metadata `dlt` pipelines leverage metadata to provide governance capabilities. This metadata includes load IDs, which consist of a timestamp and pipeline name. Load IDs enable incremental transformations and data @@ -392,7 +347,7 @@ vaulting by tracking data loads and facilitating data lineage and traceability. Read more about [lineage](general-usage/destination-tables.md#data-lineage). -### Schema Enforcement and Curation +### Schema enforcement and curation `dlt` empowers users to enforce and curate schemas, ensuring data consistency and quality. Schemas define the structure of normalized data and guide the processing and loading of data. By adhering to @@ -414,16 +369,15 @@ control throughout the data processing lifecycle. ### Scaling and finetuning -`dlt` offers several mechanism and configuration options to scale up and finetune pipelines: +`dlt` offers several mechanisms and configuration options to scale up and finetune pipelines: -- Running extraction, normalization and load in parallel. 
+- Running extraction, normalization, and load in parallel. - Writing sources and resources that are run in parallel via thread pools and async execution. -- Finetune the memory buffers, intermediary file sizes and compression options. +- Finetuning the memory buffers, intermediary file sizes, and compression options. Read more about [performance.](reference/performance.md) ### Other advanced topics -`dlt` is a constantly growing library that supports many features and use cases needed by the -community. [Join our Slack](https://dlthub.com/community) -to find recent releases or discuss what you can build with `dlt`. +`dlt` is a constantly growing library that supports many features and use cases needed by the community. [Join our Slack](https://dlthub.com/community) to find recent releases or discuss what you can build with `dlt`. + diff --git a/docs/website/docs/dlt-ecosystem/destinations/athena.md b/docs/website/docs/dlt-ecosystem/destinations/athena.md index e6f99adc48..f541b6ad43 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/athena.md +++ b/docs/website/docs/dlt-ecosystem/destinations/athena.md @@ -14,7 +14,7 @@ The Athena destination stores data as Parquet files in S3 buckets and creates [e pip install "dlt[athena]" ``` -## Setup Guide +## Setup guide ### 1. Initialize the dlt project Let's start by initializing a new `dlt` project as follows: @@ -65,7 +65,7 @@ aws_secret_access_key="please set me up!" # same as credentials for filesystem region_name="please set me up!" # set your AWS region, for example "eu-central-1" for Frankfurt ``` -If you have your credentials stored in `~/.aws/credentials`, just remove the **[destination.filesystem.credentials]** and **[destination.athena.credentials]** section above and `dlt` will fall back to your **default** profile in local credentials. 
If you want to switch the profile, pass the profile name as follows (here: `dlt-ci-user`): +If you have your credentials stored in `~/.aws/credentials`, just remove the **[destination.filesystem.credentials]** and **[destination.athena.credentials]** sections above and `dlt` will fall back to your **default** profile in local credentials. If you want to switch the profile, pass the profile name as follows (here: `dlt-ci-user`): ```toml [destination.filesystem.credentials] profile_name="dlt-ci-user" @@ -74,7 +74,7 @@ profile_name="dlt-ci-user" profile_name="dlt-ci-user" ``` -## Additional Destination Configuration +## Additional destination configuration You can provide an Athena workgroup like so: ```toml @@ -91,7 +91,7 @@ The `athena` destination handles the write dispositions as follows: ## Data loading -Data loading happens by storing parquet files in an S3 bucket and defining a schema on Athena. If you query data via SQL queries on Athena, the returned data is read by scanning your bucket and reading all relevant parquet files in there. +Data loading occurs by storing parquet files in an S3 bucket and defining a schema on Athena. If you query data via SQL queries on Athena, the returned data is read by scanning your bucket and reading all relevant parquet files in there. `dlt` internal tables are saved as Iceberg tables. @@ -103,10 +103,10 @@ Athena does not support JSON fields, so JSON is stored as a string. > ❗**Athena does not support TIME columns in parquet files**. `dlt` will fail such jobs permanently. Convert `datetime.time` objects to `str` or `datetime.datetime` to load them. ### Table and column identifiers -Athena uses case insensitive identifiers and **will lower case all the identifiers** that are stored in the INFORMATION SCHEMA. Do not use -[case sensitive naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations). 
Letter casing will be removed anyway and you risk to generate identifier collisions, which are detected by `dlt` and will fail the load process. -Under the hood Athena uses different SQL engines for DDL (catalog) and DML/Queries: +Athena uses case-insensitive identifiers and **will lowercase all the identifiers** that are stored in the INFORMATION SCHEMA. Do not use [case-sensitive naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations). Letter casing will be removed anyway, and you risk generating identifier collisions, which are detected by `dlt` and will fail the load process. + +Under the hood, Athena uses different SQL engines for DDL (catalog) and DML/Queries: * DDL uses HIVE escaping with `````` * Other queries use PRESTO and regular SQL escaping. @@ -119,11 +119,11 @@ If you decide to change the [filename layout](./filesystem#data-loading) from th - You need to provide the `{file_id}` placeholder, and it needs to be somewhere after the `{table_name}` placeholder. - `{table_name}` must be the first placeholder in the layout. - ## Additional destination options ### Iceberg data tables -You can save your tables as Iceberg tables to Athena. This will enable you, for example, to delete data from them later if you need to. To switch a resource to the iceberg table format, supply the table_format argument like this: + +You can save your tables as Iceberg tables to Athena. This will enable you, for example, to delete data from them later if you need to. To switch a resource to the Iceberg table format, supply the table_format argument like this: ```py @dlt.resource(table_format="iceberg") @@ -131,33 +131,34 @@ def data() -> Iterable[TDataItem]: ... 
``` -For every table created as an iceberg table, the Athena destination will create a regular Athena table in the staging dataset of both the filesystem and the Athena glue catalog, and then copy all data into the final iceberg table that lives with the non-iceberg tables in the same dataset on both the filesystem and the glue catalog. Switching from iceberg to regular table or vice versa is not supported. +For every table created as an Iceberg table, the Athena destination will create a regular Athena table in the staging dataset of both the filesystem and the Athena glue catalog, and then copy all data into the final Iceberg table that lives with the non-Iceberg tables in the same dataset on both the filesystem and the glue catalog. Switching from Iceberg to regular table or vice versa is not supported. #### `merge` support -The `merge` write disposition is supported for Athena when using iceberg tables. + +The `merge` write disposition is supported for Athena when using Iceberg tables. > Note that: -> 1. there is a risk of tables ending up in inconsistent state in case a pipeline run fails mid flight, because Athena doesn't support transactions, and `dlt` uses multiple DELETE/UPDATE/INSERT statements to implement `merge`, +> 1. There is a risk of tables ending up in an inconsistent state in case a pipeline run fails mid-flight because Athena doesn't support transactions, and `dlt` uses multiple DELETE/UPDATE/INSERT statements to implement `merge`. > 2. `dlt` creates additional helper tables called `insert_` and `delete_
` in the staging schema to work around Athena's lack of temporary tables. ### dbt support -Athena is supported via `dbt-athena-community`. Credentials are passed into `aws_access_key_id` and `aws_secret_access_key` of the generated dbt profile. Iceberg tables are supported, but you need to make sure that you materialize your models as iceberg tables if your source table is iceberg. We encountered problems with materializing date time columns due to different precision on iceberg (nanosecond) and regular Athena tables (millisecond). +Athena is supported via `dbt-athena-community`. Credentials are passed into `aws_access_key_id` and `aws_secret_access_key` of the generated dbt profile. Iceberg tables are supported, but you need to make sure that you materialize your models as Iceberg tables if your source table is Iceberg. We encountered problems with materializing date-time columns due to different precision on Iceberg (nanosecond) and regular Athena tables (millisecond). The Athena adapter requires that you set up **region_name** in the Athena configuration below. You can also set up the table catalog name to change the default: **awsdatacatalog** ```toml [destination.athena] aws_data_catalog="awsdatacatalog" ``` -### Syncing of `dlt` state -- This destination fully supports [dlt state sync.](../../general-usage/state#syncing-state-with-destination). The state is saved in Athena iceberg tables in your S3 bucket. +### Syncing of `dlt` state +- This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). The state is saved in Athena Iceberg tables in your S3 bucket. ## Supported file formats + You can choose the following file formats: * [parquet](../file-formats/parquet.md) is used by default - ## Athena adapter You can use the `athena_adapter` to add partitioning to Athena tables. This is currently only supported for Iceberg tables. 
@@ -198,7 +199,6 @@ data_items = [ def partitioned_data(): yield [{"id": i, "category": c, "created_at": d} for i, c, d in data_items] - # Add partitioning hints to the table athena_adapter( partitioned_table, @@ -213,6 +213,5 @@ athena_adapter( pipeline = dlt.pipeline("athena_example") pipeline.run(partitioned_data) ``` - diff --git a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md index 324c712dfc..9dc983bc33 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md +++ b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md @@ -14,7 +14,7 @@ keywords: [bigquery, destination, data warehouse] pip install "dlt[bigquery]" ``` -## Setup Guide +## Setup guide **1. Initialize a project with a pipeline that loads to BigQuery by running:** @@ -28,7 +28,7 @@ dlt init chess bigquery pip install -r requirements.txt ``` -This will install dlt with the `bigquery` extra, which contains all the dependencies required by the bigquery client. +This will install dlt with the `bigquery` extra, which contains all the dependencies required by the BigQuery client. **3. Log in to or create a Google Cloud account** @@ -36,14 +36,11 @@ Sign up for or log in to the [Google Cloud Platform](https://console.cloud.googl **4. Create a new Google Cloud project** -After arriving at the [Google Cloud console welcome page](https://console.cloud.google.com/welcome), click the -project selector in the top left, then click the `New Project` button, and finally click the `Create` button -after naming the project whatever you would like. +After arriving at the [Google Cloud console welcome page](https://console.cloud.google.com/welcome), click the project selector in the top left, then click the `New Project` button, and finally click the `Create` button after naming the project whatever you would like. **5. 
Create a service account and grant BigQuery permissions** -You will then need to [create a service account](https://cloud.google.com/iam/docs/creating-managing-service-accounts#creating). -After clicking the `Go to Create service account` button on the linked docs page, select the project you created and name the service account whatever you would like. +You will then need to [create a service account](https://cloud.google.com/iam/docs/creating-managing-service-accounts#creating). After clicking the `Go to Create service account` button on the linked docs page, select the project you created and name the service account whatever you would like. Click the `Continue` button and grant the following roles, so that `dlt` can create schemas and load data: @@ -55,11 +52,9 @@ You don't need to grant users access to this service account now, so click the ` **6. Download the service account JSON** -In the service accounts table page that you're redirected to after clicking `Done` as instructed above, -select the three dots under the `Actions` column for the service account you created and select `Manage keys`. +In the service accounts table page that you're redirected to after clicking `Done` as instructed above, select the three dots under the `Actions` column for the service account you created and select `Manage keys`. -This will take you to a page where you can click the `Add key` button, then the `Create new key` button, -and finally the `Create` button, keeping the preselected `JSON` option. +This will take you to a page where you can click the `Add key` button, then the `Create new key` button, and finally the `Create` button, keeping the preselected `JSON` option. A `JSON` file that includes your service account private key will then be downloaded. @@ -83,12 +78,11 @@ private_key = "private_key" # please set me up! client_email = "client_email" # please set me up! ``` -You can specify the location of the data i.e. `EU` instead of `US` which is the default. 
+You can specify the location of the data, e.g., `EU` instead of the default `US`. -### OAuth 2.0 Authentication +### OAuth 2.0 authentication -You can use OAuth 2.0 authentication. You'll need to generate a **refresh token** with the right scopes (we suggest asking our GPT-4 assistant for details). -Then you can fill the following information in `secrets.toml` +You can use OAuth 2.0 authentication. You'll need to generate a **refresh token** with the right scopes (we suggest asking our GPT-4 assistant for details). Then you can fill the following information in `secrets.toml`: ```toml [destination.bigquery] @@ -101,18 +95,16 @@ client_secret = "client_secret" # please set me up! refresh_token = "refresh_token" # please set me up! ``` -### Using Default Credentials +### Using default credentials -Google provides several ways to get default credentials i.e. from the `GOOGLE_APPLICATION_CREDENTIALS` environment variable or metadata services. -VMs available on GCP (cloud functions, Composer runners, Colab notebooks) have associated service accounts or authenticated users. -`dlt` will try to use default credentials if nothing is explicitly specified in the secrets. +Google provides several ways to get default credentials, e.g., from the `GOOGLE_APPLICATION_CREDENTIALS` environment variable or metadata services. VMs available on GCP (cloud functions, Composer runners, Colab notebooks) have associated service accounts or authenticated users. `dlt` will try to use default credentials if nothing is explicitly specified in the secrets. 
```toml [destination.bigquery] location = "US" ``` -### Using Different `project_id` +### Using different `project_id` You can set the `project_id` in your configuration to be different from the one in your credentials, provided your account has access to it: ```toml @@ -124,14 +116,13 @@ project_id = "project_id_credentials" ``` In this scenario, `project_id_credentials` will be used for authentication, while `project_id_destination` will be used as the data destination. -## Write Disposition +## Write disposition All write dispositions are supported. -If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized`, the destination tables will be dropped and -recreated with a [clone command](https://cloud.google.com/bigquery/docs/table-clones-create) from the staging tables. +If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized`, the destination tables will be dropped and recreated with a [clone command](https://cloud.google.com/bigquery/docs/table-clones-create) from the staging tables. -## Data Loading +## Data loading `dlt` uses `BigQuery` load jobs that send files from the local filesystem or GCS buckets. The loader follows [Google recommendations](https://cloud.google.com/bigquery/docs/error-messages) when retrying and terminating jobs. @@ -149,13 +140,13 @@ streamed_resource.apply_hints(additional_table_hints={"x-insert-api": "streaming ``` ### Use BigQuery schema autodetect for nested fields -You can let BigQuery to infer schemas and create destination tables instead of `dlt`. As a consequence, nested fields (ie. `RECORD`), which `dlt` does not support at -this moment (they are stored as JSON), may be created. You select certain resources with [BigQuery Adapter](#bigquery-adapter) or all of them with the following config option: +You can let BigQuery infer schemas and create destination tables instead of `dlt`. 
As a consequence, nested fields (i.e., `RECORD`), which `dlt` does not support at +this moment (they are stored as JSON), may be created. You can select certain resources with the [BigQuery Adapter](#bigquery-adapter) or all of them with the following config option: ```toml [destination.bigquery] autodetect_schema=true ``` -We recommend to yield [arrow tables](../verified-sources/arrow-pandas.md) from your resources and `parquet` file format to load the data. In that case the schemas generated by `dlt` and BigQuery +We recommend yielding [arrow tables](../verified-sources/arrow-pandas.md) from your resources and using the `parquet` file format to load the data. In that case, the schemas generated by `dlt` and BigQuery will be identical. BigQuery will also preserve the column order from the generated parquet files. You can convert `json` data into arrow tables with [pyarrow or duckdb](../verified-sources/arrow-pandas.md#loading-json-documents). ```py @@ -167,7 +158,7 @@ from dlt.destinations.adapters import bigquery_adapter @dlt.resource(name="cve") def load_cve(): with open("cve.json", 'rb') as f: - # autodetect arrow schema and yields arrow table + # autodetect arrow schema and yield arrow table yield paj.read_json(f) pipeline = dlt.pipeline("load_json_struct", destination="bigquery") @@ -175,9 +166,9 @@ pipeline.run( bigquery_adapter(load_cve(), autodetect_schema=True) ) ``` -Above, we use `pyarrow` library to convert `json` document into `arrow` table and use `biguery_adapter` to enable schema autodetect for **cve** resource. +Above, we use the `pyarrow` library to convert a JSON document into an Arrow table and use `bigquery_adapter` to enable schema autodetect for the **cve** resource. -Yielding Python dicts/lists and loading them as `jsonl` works as well. In many cases, the resulting nested structure is simpler than those obtained via pyarrow/duckdb and parquet. 
However there are slight differences in inferred types from `dlt` (BigQuery coerces types more aggressively). BigQuery also does not try to preserve the column order in relation to the order of fields in JSON. +Yielding Python dicts/lists and loading them as JSONL works as well. In many cases, the resulting nested structure is simpler than that obtained via pyarrow/duckdb and parquet. However, there are slight differences in inferred types from `dlt` (BigQuery coerces types more aggressively). BigQuery also does not try to preserve the column order in relation to the order of fields in JSON. ```py import dlt @@ -193,14 +184,14 @@ pipeline.run( bigquery_adapter(load_cve(), autodetect_schema=True) ) ``` -In the example below we represent `json` data as tables up until nesting level 1. Above this nesting level, we let BigQuery to create nested fields. +In the example above, we represent JSON data as tables up to nesting level 1. Beyond this nesting level, we let BigQuery create nested fields. :::caution If you yield data as Python objects (dicts) and load this data as `parquet`, the nested fields will be converted into strings. This is one of the consequences of `dlt` not being able to infer nested fields. ::: -## Supported File Formats +## Supported file formats You can configure the following file formats to load data to BigQuery: @@ -213,17 +204,17 @@ When staging is enabled: * [parquet](../file-formats/parquet.md) is supported. :::caution -**Bigquery cannot load JSON columns from `parquet` files**. `dlt` will fail such jobs permanently. Instead: +**BigQuery cannot load JSON columns from Parquet files**. `dlt` will fail such jobs permanently. Instead: * Switch to `jsonl` to load and parse JSON properly.
* Use schema [autodetect and nested fields](#use-bigquery-schema-autodetect-for-nested-fields) ::: -## Supported Column Hints +## Supported column hints -BigQuery supports the following [column hints](https://dlthub.com/docs/general-usage/schema#tables-and-columns): +BigQuery supports the following [column hints](../../general-usage/schema#tables-and-columns): * `partition` - creates a partition with a day granularity on the decorated column (`PARTITION BY DATE`). - May be used with `datetime`, `date`, and `bigint` data types. + It may be used with `datetime`, `date`, and `bigint` data types. Only one column per table is supported and only when a new table is created. For more information on BigQuery partitioning, read the [official docs](https://cloud.google.com/bigquery/docs/partitioned-tables). @@ -233,25 +224,25 @@ BigQuery supports the following [column hints](https://dlthub.com/docs/general-u > Instead, we set 86,400-second boundaries to enable daily partitioning. > This captures typical values, but extremely large/small outliers go to an `__UNPARTITIONED__` catch-all partition. -* `cluster` - creates a cluster column(s). Many columns per table are supported and only when a new table is created. +* `cluster` - creates cluster column(s). Many columns per table are supported and only when a new table is created. ### Table and column identifiers -BigQuery uses case sensitive identifiers by default and this is what `dlt` assumes. If the dataset you use has case insensitive identifiers (you have such option -when you create it) make sure that you use case insensitive [naming convention](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations) or you tell `dlt` about it so identifier collisions are properly detected. +BigQuery uses case-sensitive identifiers by default, and this is what `dlt` assumes. 
If the dataset you use has case-insensitive identifiers (you have such an option +when you create it), make sure that you use a case-insensitive [naming convention](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations) or you tell `dlt` about it so identifier collisions are properly detected. ```toml [destination.bigquery] has_case_sensitive_identifiers=false ``` -You have an option to allow `dlt` to set the case sensitivity for newly created datasets. In that case it will follow the case sensitivity of current -naming convention (ie. the default **snake_case** will create dataset with case insensitive identifiers). +You have an option to allow `dlt` to set the case sensitivity for newly created datasets. In that case, it will follow the case sensitivity of the current +naming convention (i.e., the default **snake_case** will create a dataset with case-insensitive identifiers). ```toml [destination.bigquery] should_set_case_sensitivity_on_new_dataset=true ``` The option above is off by default. -## Staging Support +## Staging support BigQuery supports GCS as a file staging destination. `dlt` will upload files in the parquet format to GCS and ask BigQuery to copy their data directly into the database. Please refer to the [Google Storage filesystem documentation](./filesystem.md#google-storage) to learn how to set up your GCS bucket with the bucket_url and credentials. @@ -259,7 +250,7 @@ If you use the same service account for GCS and your Redshift deployment, you do Alternatively to parquet files, you can specify jsonl as the staging file format. For this, set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. 
-### BigQuery/GCS Staging Example +### BigQuery/GCS staging example ```py # Create a dlt pipeline that will load @@ -273,7 +264,7 @@ pipeline = dlt.pipeline( ) ``` -## Additional Destination Options +## Additional destination options You can configure the data location and various timeouts as shown below. This information is not a secret so it can be placed in `config.toml` as well: @@ -290,17 +281,17 @@ retry_deadline=60.0 * `file_upload_timeout` is a timeout for file upload when loading local files: the total time of the upload may not exceed this value (default: **30 minutes**, set in seconds) * `retry_deadline` is a deadline for a [DEFAULT_RETRY used by Google](https://cloud.google.com/python/docs/reference/storage/1.39.0/retry_timeout) -### dbt Support +### dbt support This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-bigquery](https://github.com/dbt-labs/dbt-bigquery). -Credentials, if explicitly defined, are shared with `dbt` along with other settings like **location** and retries and timeouts. -In the case of implicit credentials (i.e. available in a cloud function), `dlt` shares the `project_id` and delegates obtaining credentials to the `dbt` adapter. +Credentials, if explicitly defined, are shared with `dbt` along with other settings like **location**, retries, and timeouts. +In the case of implicit credentials (i.e., available in a cloud function), `dlt` shares the `project_id` and delegates obtaining credentials to the `dbt` adapter. -### Syncing of `dlt` State +### Syncing of dlt state -This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination) +This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). -## Bigquery Adapter +## BigQuery adapter You can use the `bigquery_adapter` to add BigQuery-specific hints to a resource. 
These hints influence how data is loaded into BigQuery tables, such as specifying partitioning, clustering, and numeric column rounding modes. @@ -308,7 +299,7 @@ Hints can be defined at both the column level and table level. The adapter updates the DltResource with metadata about the destination column and table DDL options. -### Use an Adapter to Apply Hints to a Resource +### Use an adapter to apply hints to a resource Here is an example of how to use the `bigquery_adapter` method to apply hints to a resource on both the column level and table level: @@ -355,7 +346,7 @@ Some things to note with the adapter's behavior: > ❗ At the time of writing, table level options aren't supported for `ALTER` operations. -Note that `bigquery_adapter` updates the resource *inplace*, but returns the resource for convenience, i.e. both the following are valid: +Note that `bigquery_adapter` updates the resource *in place*, but returns the resource for convenience, i.e., both the following are valid: ```py bigquery_adapter(my_resource, partition="partition_column_name") diff --git a/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md b/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md index 8752c571b1..0a0259a6c5 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md +++ b/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md @@ -14,7 +14,7 @@ keywords: [ clickhouse, destination, data warehouse ] pip install "dlt[clickhouse]" ``` -## Setup Guide +## Setup guide ### 1. Initialize the dlt project @@ -26,8 +26,7 @@ dlt init chess clickhouse > 💡 This command will initialize your pipeline with chess as the source and ClickHouse as the destination. -The above command generates several files and directories, including `.dlt/secrets.toml` and a requirements file for ClickHouse. 
You can install the necessary dependencies specified in the -requirements file by executing it as follows: +The above command generates several files and directories, including `.dlt/secrets.toml` and a requirements file for ClickHouse. You can install the necessary dependencies specified in the requirements file by executing it as follows: ```sh pip install -r requirements.txt @@ -43,7 +42,7 @@ To load data into ClickHouse, you need to create a ClickHouse database. While we 2. To create a new database, connect to your ClickHouse server using the `clickhouse-client` command line tool or a SQL client of your choice. -3. Run the following SQL commands to create a new database, user and grant the necessary permissions: +3. Run the following SQL commands to create a new database, user, and grant the necessary permissions: ```sql CREATE DATABASE IF NOT EXISTS dlt; @@ -61,7 +60,7 @@ To load data into ClickHouse, you need to create a ClickHouse database. While we [destination.clickhouse.credentials] database = "dlt" # The database name you created. username = "dlt" # ClickHouse username, default is usually "default". - password = "Dlt*12345789234567" # ClickHouse password if any. + password = "Dlt*12345789234567" # ClickHouse password, if any. host = "localhost" # ClickHouse server host. port = 9000 # ClickHouse native TCP protocol port, default is 9000. http_port = 8443 # ClickHouse HTTP port, default is 9000. @@ -73,7 +72,7 @@ To load data into ClickHouse, you need to create a ClickHouse database. While we The default non-secure HTTP port for ClickHouse is `8123`. This is different from the default port `9000`, which is used for the native TCP protocol. - You must set `http_port` if you are not using external staging (i.e. you don't set the `staging` parameter in your pipeline). 
This is because dlt's built-in ClickHouse local storage staging uses the [clickhouse-connect](https://github.com/ClickHouse/clickhouse-connect) library, which communicates with ClickHouse over HTTP. + You must set `http_port` if you are not using external staging (i.e., you don't set the `staging` parameter in your pipeline). This is because dlt's built-in ClickHouse local storage staging uses the [clickhouse-connect](https://github.com/ClickHouse/clickhouse-connect) library, which communicates with ClickHouse over HTTP. Make sure your ClickHouse server is configured to accept HTTP connections on the port specified by `http_port`. For example: @@ -90,7 +89,7 @@ To load data into ClickHouse, you need to create a ClickHouse database. While we 2. You can pass a database connection string similar to the one used by the `clickhouse-driver` library. The credentials above will look like this: ```toml - # keep it at the top of your toml file before any section starts. + # Keep it at the top of your toml file before any section starts. destination.clickhouse.credentials="clickhouse://dlt:Dlt*12345789234567@localhost:9000/dlt?secure=1" ``` @@ -100,7 +99,7 @@ You can set the following configuration options in the `.dlt/secrets.toml` file: ```toml [destination.clickhouse] -dataset_table_separator = "___" # The default separator for dataset table names from dataset. +dataset_table_separator = "___" # The default separator for dataset table names from the dataset. table_engine_type = "merge_tree" # The default table engine to use. dataset_sentinel_table_name = "dlt_sentinel_table" # The default name for sentinel tables. ``` @@ -114,13 +113,12 @@ All [write dispositions](../../general-usage/incremental-loading#choosing-a-writ Data is loaded into ClickHouse using the most efficient method depending on the data source: - For local files, the `clickhouse-connect` library is used to directly load files into ClickHouse tables using the `INSERT` command. 
-- For files in remote storage like S3, Google Cloud Storage, or Azure Blob Storage, ClickHouse table functions like `s3`, `gcs` and `azureBlobStorage` are used to read the files and insert the data - into tables. +- For files in remote storage like S3, Google Cloud Storage, or Azure Blob Storage, ClickHouse table functions like `s3`, `gcs`, and `azureBlobStorage` are used to read the files and insert the data into tables. ## Datasets -`Clickhouse` does not support multiple datasets in one database, dlt relies on datasets to exist for multiple reasons. -To make `clickhouse` work with `dlt`, tables generated by `dlt` in your `clickhouse` database will have their name prefixed with the dataset name separated by +`Clickhouse` does not support multiple datasets in one database; dlt relies on datasets to exist for multiple reasons. +To make `clickhouse` work with `dlt`, tables generated by `dlt` in your `clickhouse` database will have their names prefixed with the dataset name, separated by the configurable `dataset_table_separator`. Additionally, a special sentinel table that doesn't contain any data will be created, so dlt knows which virtual datasets already exist in a clickhouse @@ -131,16 +129,16 @@ destination. - [jsonl](../file-formats/jsonl.md) is the preferred format for both direct loading and staging. - [parquet](../file-formats/parquet.md) is supported for both direct loading and staging. -The `clickhouse` destination has a few specific deviations from the default sql destinations: +The `clickhouse` destination has a few specific deviations from the default SQL destinations: 1. `Clickhouse` has an experimental `object` datatype, but we've found it to be a bit unpredictable, so the dlt clickhouse destination will load the `json` datatype to a `text` column. If you need this feature, get in touch with our Slack community, and we will consider adding it. 2. `Clickhouse` does not support the `time` datatype. Time will be loaded to a `text` column. -3. 
`Clickhouse` does not support the `binary` datatype. Binary will be loaded to a `text` column. When loading from `jsonl`, this will be a base64 string, when loading from parquet this will be +3. `Clickhouse` does not support the `binary` datatype. Binary will be loaded to a `text` column. When loading from `jsonl`, this will be a base64 string; when loading from parquet, this will be the `binary` object converted to `text`. 4. `Clickhouse` accepts adding columns to a populated table that aren’t null. -5. `Clickhouse` can produce rounding errors under certain conditions when using the float / double datatype. Make sure to use decimal if you can’t afford to have rounding errors. Loading the value +5. `Clickhouse` can produce rounding errors under certain conditions when using the float/double datatype. Make sure to use decimal if you can’t afford to have rounding errors. Loading the value 12.7001 to a double column with the loader file format jsonl set will predictably produce a rounding error, for example. ## Supported column hints @@ -149,9 +147,9 @@ ClickHouse supports the following [column hints](../../general-usage/schema#tabl - `primary_key` - marks the column as part of the primary key. Multiple columns can have this hint to create a composite primary key. -## Choosing a Table Engine +## Choosing a table engine -dlt defaults to `MergeTree` table engine. You can specify an alternate table engine in two ways: +dlt defaults to the `MergeTree` table engine. 
You can specify an alternate table engine in two ways: ### Setting a default table engine in the configuration @@ -165,7 +163,7 @@ table_engine_type = "merge_tree" # The default table engi ### Setting the table engine for specific resources -You can also set the table engine for specific resources using the clickhouse_adapter, which will override the default engine set in `.dlt/secrets.toml`, for that resource: +You can also set the table engine for specific resources using the `clickhouse_adapter`, which will override the default engine set in `.dlt/secrets.toml` for that resource: ```py from dlt.destinations.adapters import clickhouse_adapter @@ -180,7 +178,7 @@ clickhouse_adapter(my_resource, table_engine_type="merge_tree") Supported values for `table_engine_type` are: - `merge_tree` (default) - creates tables using the `MergeTree` engine, suitable for most use cases. [Learn more about MergeTree](https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/mergetree). -- `shared_merge_tree` - creates tables using the `SharedMergeTree` engine, optimized for cloud-native environments with shared storage. This table is **only** available on ClickHouse Cloud, and it the default selection if `merge_tree` is selected. [Learn more about SharedMergeTree](https://clickhouse.com/docs/en/cloud/reference/shared-merge-tree). +- `shared_merge_tree` - creates tables using the `SharedMergeTree` engine, optimized for cloud-native environments with shared storage. This table engine is **only** available on ClickHouse Cloud, and it is the default selection if `merge_tree` is selected. [Learn more about SharedMergeTree](https://clickhouse.com/docs/en/cloud/reference/shared-merge-tree). - `replicated_merge_tree` - creates tables using the `ReplicatedMergeTree` engine, which supports data replication across multiple nodes for high availability. [Learn more about ReplicatedMergeTree](https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/replication).
This defaults to `shared_merge_tree` on ClickHouse Cloud. - Experimental support for the `Log` engine family with `stripe_log` and `tiny_log`. @@ -209,7 +207,7 @@ pipeline = dlt.pipeline( ) ``` -### Using Google Cloud or S3-Compatible Storage as a Staging Area +### Using Google Cloud or S3-compatible storage as a staging area dlt supports using S3-compatible storage services, including Google Cloud Storage (GCS), as a staging area when loading data into ClickHouse. This is handled automatically by @@ -220,7 +218,7 @@ To enable this, GCS provides an S3 compatibility mode that emulates the S3 API, allowing ClickHouse to access GCS buckets via its S3 integration. For detailed instructions on setting up S3-compatible storage with dlt, including AWS S3, MinIO, and Cloudflare R2, refer to -the [dlt documentation on filesystem destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/filesystem#using-s3-compatible-storage). +the [dlt documentation on filesystem destinations](../../dlt-ecosystem/destinations/filesystem#using-s3-compatible-storage). To set up GCS staging with HMAC authentication in dlt: @@ -254,4 +252,5 @@ Integration with [dbt](../transformations/dbt/dbt.md) is generally supported via This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). - \ No newline at end of file + + diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md index 12b267c9d6..ddbf930306 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md +++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md @@ -22,7 +22,7 @@ To use the Databricks destination, you need: * A Databricks workspace with a Unity Catalog metastore connected * A Gen 2 Azure storage account and container -If you already have your Databricks workspace set up, you can skip to the [Loader setup Guide](#loader-setup-guide). 
+If you already have your Databricks workspace set up, you can skip to the [Loader setup guide](#loader-setup-guide). ### 1. Create a Databricks workspace in Azure @@ -33,7 +33,7 @@ If you already have your Databricks workspace set up, you can skip to the [Loade 2. Create an ADLS Gen 2 storage account Search for "Storage accounts" in the Azure Portal and create a new storage account. - Make sure it's a Data Lake Storage Gen 2 account, you do this by enabling "hierarchical namespace" when creating the account. Refer to the [Azure documentation](https://learn.microsoft.com/en-us/azure/storage/blobs/create-data-lake-storage-account) for further info. + Make sure it's a Data Lake Storage Gen 2 account by enabling "hierarchical namespace" when creating the account. Refer to the [Azure documentation](https://learn.microsoft.com/en-us/azure/storage/blobs/create-data-lake-storage-account) for further information. 3. Create a container in the storage account @@ -46,7 +46,7 @@ If you already have your Databricks workspace set up, you can skip to the [Loade 5. Grant access to your storage container - Navigate to the storage container you created before and select "Access control (IAM)" in the left-hand menu. + Navigate to the storage container you created earlier and select "Access control (IAM)" in the left-hand menu. Add a new role assignment and select "Storage Blob Data Contributor" as the role. Under "Members" select "Managed Identity" and add the Databricks Access Connector you created in the previous step. @@ -75,7 +75,7 @@ If you already have your Databricks workspace set up, you can skip to the [Loade 6. Click "+ Add" again and select "Add external location" - Set the URL of our storage container. This should be in the form: `abfss://<container_name>@<storage_account_name>.dfs.core.windows.net/<path>` + Set the URL of your storage container. This should be in the form: `abfss://<container_name>@<storage_account_name>.dfs.core.windows.net/<path>` Once created, you can test the connection to make sure the container is accessible from Databricks.
@@ -88,7 +88,7 @@ If you already have your Databricks workspace set up, you can skip to the [Loade Click your email in the top right corner and go to "User Settings". Go to "Developer" -> "Access Tokens". Generate a new token and save it. You will use it in your `dlt` configuration. -## Loader setup Guide +## Loader setup guide **1. Initialize a project with a pipeline that loads to Databricks by running** ```sh @@ -99,13 +99,13 @@ dlt init chess databricks ```sh pip install -r requirements.txt ``` -This will install dlt with **databricks** extra which contains Databricks Python dbapi client. +This will install dlt with the `databricks` extra, which contains the Databricks Python dbapi client. **4. Enter your credentials into `.dlt/secrets.toml`.** -This should have your connection parameters and your personal access token. +This should include your connection parameters and your personal access token. -You will find your server hostname and HTTP path in the Databricks workspace dashboard. Go to "SQL Warehouses", select your warehouse (default is called "Starter Warehouse") and go to "Connection details". +You can find your server hostname and HTTP path in the Databricks workspace dashboard. Go to "SQL Warehouses", select your warehouse (default is called "Starter Warehouse"), and go to "Connection details". Example: @@ -120,7 +120,7 @@ catalog = "my_catalog" See [staging support](#staging-support) for authentication options when `dlt` copies files from buckets. ## Write disposition -All write dispositions are supported +All write dispositions are supported. ## Data loading Data is loaded using `INSERT VALUES` statements by default. @@ -129,16 +129,15 @@ Efficient loading from a staging filesystem is also supported by configuring an For more information on staging, see the [staging support](#staging-support) section below. 
## Supported file formats -* [insert-values](../file-formats/insert-format.md) is used by default -* [jsonl](../file-formats/jsonl.md) supported when staging is enabled (see limitations below) -* [parquet](../file-formats/parquet.md) supported when staging is enabled +* [insert-values](../file-formats/insert-format.md) is used by default. +* [jsonl](../file-formats/jsonl.md) supported when staging is enabled (see limitations below). +* [parquet](../file-formats/parquet.md) supported when staging is enabled. The `jsonl` format has some limitations when used with Databricks: -1. Compression must be disabled to load jsonl files in Databricks. Set `data_writer.disable_compression` to `true` in dlt config when using this format. -2. The following data types are not supported when using `jsonl` format with `databricks`: `decimal`, `json`, `date`, `binary`. Use `parquet` if your data contains these types. -3. `bigint` data type with precision is not supported with `jsonl` format - +1. Compression must be disabled to load jsonl files in Databricks. Set `data_writer.disable_compression` to `true` in the dlt config when using this format. +2. The following data types are not supported when using the JSONL format with `databricks`: `decimal`, `json`, `date`, `binary`. Use `parquet` if your data contains these types. +3. The `bigint` data type with precision is not supported with the `jsonl` format. ## Staging support @@ -168,10 +167,10 @@ pipeline = dlt.pipeline( Refer to the [Azure Blob Storage filesystem documentation](./filesystem.md#azure-blob-storage) for details on connecting your Azure Blob Storage container with the bucket_url and credentials. 
-Databricks requires that you use ABFS urls in following format: +Databricks requires that you use ABFS URLs in the following format: **abfss://container_name@storage_account_name.dfs.core.windows.net/path** -`dlt` is able to adapt the other representation (ie **az://container-name/path**') still we recommend that you use the correct form. +`dlt` is able to adapt the other representation (i.e., **az://container-name/path**), but we recommend that you use the correct form. Example to set up Databricks with Azure as a staging destination: @@ -189,15 +188,15 @@ pipeline = dlt.pipeline( ``` ### Use external locations and stored credentials -`dlt` forwards bucket credentials to `COPY INTO` SQL command by default. You may prefer to use [external locations or stored credentials instead](https://docs.databricks.com/en/sql/language-manual/sql-ref-external-locations.html#external-location) that are stored on the Databricks side. +`dlt` forwards bucket credentials to the `COPY INTO` SQL command by default. You may prefer to use [external locations or stored credentials instead](https://docs.databricks.com/en/sql/language-manual/sql-ref-external-locations.html#external-location) that are stored on the Databricks side. -If you set up external location for your staging path, you can tell `dlt` to use it: +If you set up an external location for your staging path, you can tell `dlt` to use it: ```toml [destination.databricks] is_staging_external_location=true ``` -If you set up Databricks credential named ie. 
**credential_x**, you can tell `dlt` to use it: +If you set up a Databricks credential named, for example, **credential_x**, you can tell `dlt` to use it: ```toml [destination.databricks] staging_credentials_name="credential_x" @@ -211,7 +210,7 @@ bricks = dlt.destinations.databricks(staging_credentials_name="credential_x") ``` ### dbt support -This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-databricks](https://github.com/databricks/dbt-databricks) +This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-databricks](https://github.com/databricks/dbt-databricks). ### Syncing of `dlt` state This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). diff --git a/docs/website/docs/dlt-ecosystem/destinations/destination.md b/docs/website/docs/dlt-ecosystem/destinations/destination.md index bd26aa366b..7b1e1b23a4 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/destination.md +++ b/docs/website/docs/dlt-ecosystem/destinations/destination.md @@ -21,7 +21,7 @@ pip install dlt ## Set up a destination function for your pipeline -The custom destination decorator differs from other destinations in that you do not need to provide connection credentials, but rather you provide a function which gets called for all items loaded during a pipeline run or load operation. With the `@dlt.destination`, you can convert any function that takes two arguments into a `dlt` destination. +The custom destination decorator differs from other destinations in that you do not need to provide connection credentials, but rather you provide a function that gets called for all items loaded during a pipeline run or load operation. With the `@dlt.destination` decorator, you can convert any function that takes two arguments into a `dlt` destination.
A very simple dlt pipeline that pushes a list of items into a destination function might look like this: @@ -41,7 +41,7 @@ pipeline.run([1, 2, 3], table_name="items") :::tip 1. You can also remove the typing information (`TDataItems` and `TTableSchema`) from this example. Typing is generally useful to know the shape of the incoming objects, though. -2. There are a few other ways for declaring custom destination functions for your pipeline described below. +2. There are a few other ways to declare custom destination functions for your pipeline described below. ::: ### `@dlt.destination`, custom destination function, and signature @@ -64,17 +64,17 @@ def my_destination(items: TDataItems, table: TTableSchema) -> None: ``` ### Decorator arguments -* The `batch_size` parameter on the destination decorator defines how many items per function call are batched together and sent as an array. If you set a batch-size of `0`, instead of passing in actual data items, you will receive one call per load job with the path of the file as the items argument. You can then open and process that file in any way you like. -* The `loader_file_format` parameter on the destination decorator defines in which format files are stored in the load package before being sent to the destination function. This can be `jsonl` or `parquet`. +* The `batch_size` parameter on the destination decorator defines how many items per function call are batched together and sent as an array. If you set a batch size of `0`, instead of passing in actual data items, you will receive one call per load job with the path of the file as the items argument. You can then open and process that file in any way you like. +* The `loader_file_format` parameter on the destination decorator defines the format in which files are stored in the load package before being sent to the destination function. This can be `jsonl` or `parquet`. 
* The `name` parameter on the destination decorator defines the name of the destination that gets created by the destination decorator. * The `naming_convention` parameter on the destination decorator defines the name of the destination that gets created by the destination decorator. This controls how table and column names are normalized. The default is `direct`, which will keep all names the same. -* The `max_nesting_level` parameter on the destination decorator defines how deep the normalizer will go to normalize nested fields on your data to create subtables. This overwrites any settings on your `source` and is set to zero to not create any nested tables by default. +* The `max_nesting_level` parameter on the destination decorator defines how deep the normalizer will go to normalize nested fields in your data to create subtables. This overwrites any settings on your `source` and is set to zero to not create any nested tables by default. * The `skip_dlt_columns_and_tables` parameter on the destination decorator defines whether internal tables and columns will be fed into the custom destination function. This is set to `True` by default. -* The `max_parallel_load_jobs` parameter will define how many load jobs will run in parallel in threads, if you have a destination that only allows five connections at a time, you can set this value to 5 for example +* The `max_parallel_load_jobs` parameter will define how many load jobs will run in parallel in threads. If you have a destination that only allows five connections at a time, you can set this value to 5, for example. * The `loader_parallelism_strategy` parameter will control how load jobs are parallelized. Set to `parallel`, the default, jobs will be parallelized no matter which table is being loaded to. `table-sequential` will parallelize loading but only ever have one load job per table at a time, `sequential` will run all load jobs sequentially on the main thread. 
:::note -Settings above make sure that shape of the data you receive in the destination function is as close as possible to what you see in the data source. +Settings above ensure that the shape of the data you receive in the destination function is as close as possible to what you see in the data source. * The custom destination sets the `max_nesting_level` to 0 by default, which means no sub-tables will be generated during the normalization phase. * The custom destination also skips all internal tables and columns by default. If you need these, set `skip_dlt_columns_and_tables` to False. @@ -85,7 +85,7 @@ Settings above make sure that shape of the data you receive in the destination f * The `table` parameter contains the schema table the current call belongs to, including all table hints and columns. For example, the table name can be accessed with `table["name"]`. * You can also add config values and secrets to the function arguments, see below! -## Add configuration, credentials and other secret to the destination function +## Add configuration, credentials, and other secrets to the destination function The destination decorator supports settings and secrets variables. If you, for example, plan to connect to a service that requires an API secret or a login, you can do the following: ```py @@ -94,7 +94,7 @@ def my_destination(items: TDataItems, table: TTableSchema, api_key: dlt.secrets. ... 
``` -You can then set a config variable in your `.dlt/secrets.toml`: like so: +You can then set a config variable in your `.dlt/secrets.toml` like so: ```toml [destination.my_destination] @@ -105,7 +105,7 @@ Custom destinations follow the same configuration rules as [regular named destin ## Use the custom destination in `dlt` pipeline -There are multiple ways to pass the custom destination function to `dlt` pipeline: +There are multiple ways to pass the custom destination function to the `dlt` pipeline: - Directly reference the destination function ```py @@ -113,12 +113,12 @@ There are multiple ways to pass the custom destination function to `dlt` pipelin def local_destination_func(items: TDataItems, table: TTableSchema) -> None: ... - # reference function directly + # Reference function directly p = dlt.pipeline("my_pipe", destination=local_destination_func) ``` Like for [regular destinations](../../general-usage/destination.md#pass-explicit-credentials), you are allowed to pass configuration and credentials - explicitly to destination function. + explicitly to the destination function. ```py @dlt.destination(batch_size=10, loader_file_format="jsonl", name="my_destination") def my_destination(items: TDataItems, table: TTableSchema, api_key: dlt.secrets.value) -> None: @@ -129,15 +129,15 @@ There are multiple ways to pass the custom destination function to `dlt` pipelin - Directly via destination reference. In this case, don't use the decorator for the destination function. ```py - # file my_destination.py + # File my_destination.py from dlt.common.destination import Destination - # don't use the decorator + # Don't use the decorator def local_destination_func(items: TDataItems, table: TTableSchema) -> None: ... 
- # via destination reference + # Via destination reference p = dlt.pipeline( "my_pipe", destination=Destination.from_reference( @@ -147,11 +147,11 @@ There are multiple ways to pass the custom destination function to `dlt` pipelin ``` - Via a fully qualified string to function location (can be used from `config.toml` or ENV vars). The destination function should be located in another file. ```py - # file my_pipeline.py + # File my_pipeline.py from dlt.common.destination import Destination - # fully qualified string to function location + # Fully qualified string to function location p = dlt.pipeline( "my_pipe", destination=Destination.from_reference( @@ -162,43 +162,44 @@ There are multiple ways to pass the custom destination function to `dlt` pipelin ## Adjust batch size and retry policy for atomic loads The destination keeps a local record of how many `DataItems` were processed, so if you, for example, use the custom destination to push `DataItems` to a remote API, and this -API becomes unavailable during the load resulting in a failed `dlt` pipeline run, you can repeat the run of your pipeline at a later moment and the custom destination will **restart from the whole batch that failed**. We are preventing any data from being lost, but you can still get duplicated data if you committed half of the batch ie. to a database and then failed. -**Keeping the batch atomicity is on you**. For this reason it makes sense to choose a batch size that you can process in one transaction (say one api request or one database transaction) so that if this request or transaction fail repeatedly you can repeat it at the next run without pushing duplicate data to your remote location. For systems that -are not transactional and do not tolerate duplicated data, you can use batch of size 1. 
+API becomes unavailable during the load resulting in a failed `dlt` pipeline run, you can repeat the run of your pipeline at a later moment and the custom destination will **restart from the whole batch that failed**. We are preventing any data from being lost, but you can still get duplicated data if you committed half of the batch, for example, to a database and then failed. +**Keeping the batch atomicity is on you**. For this reason, it makes sense to choose a batch size that you can process in one transaction (say one API request or one database transaction) so that if this request or transaction fails repeatedly, you can repeat it at the next run without pushing duplicate data to your remote location. For systems that +are not transactional and do not tolerate duplicated data, you can use a batch of size 1. Destination functions that raise exceptions are retried 5 times before giving up (`load.raise_on_max_retries` config option). If you run the pipeline again, it will resume loading before extracting new data. If your exception derives from `DestinationTerminalException`, the whole load job will be marked as failed and not retried again. :::caution -If you wipe out the pipeline folder (where job files and destination state are saved) you will not be able to restart from the last failed batch. -However, it is fairly easy to backup and restore the pipeline directory, [see details below](#manage-pipeline-state-for-incremental-loading). +If you wipe out the pipeline folder (where job files and destination state are saved), you will not be able to restart from the last failed batch. +However, it is fairly easy to back up and restore the pipeline directory, [see details below](#manage-pipeline-state-for-incremental-loading). 
::: ## Increase or decrease loading parallelism -Calls to the destination function by default will be executed on multiple threads, so you need to make sure you are not using any non-thread-safe nonlocal or global variables from outside your destination function. If you need to have all calls be executed from the same thread, you can set the `workers` [config variable of the load step](../../reference/performance.md#load) to 1. +Calls to the destination function by default will be executed on multiple threads, so you need to make sure you are not using any non-thread-safe nonlocal or global variables from outside your destination function. If you need to have all calls executed from the same thread, you can set the `workers` [config variable of the load step](../../reference/performance.md#load) to 1. :::tip -For performance reasons, we recommend keeping the multithreaded approach and making sure that you, for example, are using threadsafe connection pools to a remote database or queue. +For performance reasons, we recommend keeping the multithreaded approach and making sure that you, for example, are using thread-safe connection pools to a remote database or queue. ::: ## Write disposition -`@dlt.destination` will forward all normalized `DataItems` encountered during a pipeline run to the custom destination function, so there is no notion of "write dispositions". +`@dlt.destination` will forward all normalized `DataItems` encountered during a pipeline run to the custom destination function, so there is no notion of "write dispositions." ## Staging support `@dlt.destination` does not support staging files in remote locations before being called at this time. If you need this feature, please let us know. ## Manage pipeline state for incremental loading -Custom destinations do not have a general mechanism to restore pipeline state. This will impact data sources that rely on the state being kept ie. all incremental resources. -If you wipe the pipeline directory (ie. 
by deleting a folder or running on AWS lambda / Github Actions where you get a clean runner) the progress of the incremental loading is lost. On the next run you will re-acquire the data from the beginning. +Custom destinations do not have a general mechanism to restore pipeline state. This will impact data sources that rely on the state being kept, i.e., all incremental resources. +If you wipe the pipeline directory (e.g., by deleting a folder or running on AWS Lambda or GitHub Actions where you get a clean runner), the progress of the incremental loading is lost. On the next run, you will re-acquire the data from the beginning. -While we are working on a pluggable state storage you can fix the problem above by: -1. Not wiping the pipeline directory. For example if you run your pipeline on an EC instance periodically, the state will be preserved. -2. By doing a restore/backup of the pipeline directory before/after it runs. This is way easier than it sounds and [here's a script you can reuse](https://gist.github.com/rudolfix/ee6e16d8671f26ac4b9ffc915ad24b6e). +While we are working on a pluggable state storage, you can fix the problem above by: +1. Not wiping the pipeline directory. For example, if you run your pipeline on an EC2 instance periodically, the state will be preserved. +2. Restoring the pipeline directory before each run and backing it up afterwards. This is way easier than it sounds, and [here's a script you can reuse](https://gist.github.com/rudolfix/ee6e16d8671f26ac4b9ffc915ad24b6e). ## What's next * Check out our [Custom BigQuery Destination](../../examples/custom_destination_bigquery/) example. * Need help with building a custom destination? Ask your questions in our [Slack Community](https://dlthub.com/community) technical help channel. 
+ diff --git a/docs/website/docs/dlt-ecosystem/destinations/dremio.md b/docs/website/docs/dlt-ecosystem/destinations/dremio.md index c087d5dc0a..82665febf1 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/dremio.md +++ b/docs/website/docs/dlt-ecosystem/destinations/dremio.md @@ -12,19 +12,19 @@ keywords: [dremio, iceberg, aws, glue catalog] pip install "dlt[dremio,s3]" ``` -## Setup Guide +## Setup guide ### 1. Initialize the dlt project Let's start by initializing a new dlt project as follows: ```sh dlt init chess dremio ``` - > 💡 This command will initialise your pipeline with chess as the source and aws dremio as the destination using the filesystem staging destination + > 💡 This command will initialize your pipeline with chess as the source and aws dremio as the destination using the filesystem staging destination. -### 2. Setup bucket storage and dremio credentials +### 2. Setup bucket storage and Dremio credentials -First install dependencies by running: +First, install dependencies by running: ```sh pip install -r requirements.txt ``` @@ -46,22 +46,22 @@ aws_secret_access_key = "please set me up!" 
# copy the secret access key here staging_data_source = "" # the name of the "Object Storage" data source in Dremio containing the s3 bucket [destination.dremio.credentials] -username = "" # the dremio username -password = "" # dremio password or PAT token +username = "" # the Dremio username +password = "" # Dremio password or PAT token database = "" # the name of the "data source" set up in Dremio where you want to load your data host = "localhost" # the Dremio hostname port = 32010 # the Dremio Arrow Flight grpc port drivername="grpc" # either 'grpc' or 'grpc+tls' ``` -You can also pass SqlAlchemy-like connection like below +You can also pass a SQLAlchemy-like connection string, as shown below: ```toml [destination.dremio] staging_data_source="s3_staging" credentials="grpc://:@:/" ``` -if you have your credentials stored in `~/.aws/credentials` just remove the **[destination.filesystem.credentials]** and **[destination.dremio.credentials]** section above and `dlt` will fall back to your **default** profile in local credentials. If you want to switch the profile, pass the profile name as follows (here: `dlt-ci-user`): +If you have your credentials stored in `~/.aws/credentials`, just remove the **[destination.filesystem.credentials]** and **[destination.dremio.credentials]** sections above and `dlt` will fall back to your **default** profile in local credentials. If you want to switch the profile, pass the profile name as follows (here: `dlt-ci-user`): ```toml [destination.filesystem.credentials] profile_name="dlt-ci-user" @@ -74,15 +74,15 @@ profile_name="dlt-ci-user" - `replace` - `merge` -> The `merge` write disposition uses the default DELETE/UPDATE/INSERT strategy to merge data into the destination. Be aware that Dremio does not support transactions so a partial pipeline failure can result in the destination table being in an inconsistent state. 
The `merge` write disposition will eventually be implemented using [MERGE INTO](https://docs.dremio.com/current/reference/sql/commands/apache-iceberg-tables/apache-iceberg-merge/) to resolve this issue. +> The `merge` write disposition uses the default DELETE/UPDATE/INSERT strategy to merge data into the destination. Be aware that Dremio does not support transactions, so a partial pipeline failure can result in the destination table being in an inconsistent state. The `merge` write disposition will eventually be implemented using [MERGE INTO](https://docs.dremio.com/current/reference/sql/commands/apache-iceberg-tables/apache-iceberg-merge/) to resolve this issue. ## Data loading -Data loading happens by copying a staged parquet files from an object storage bucket to the destination table in Dremio using [COPY INTO](https://docs.dremio.com/cloud/reference/sql/commands/copy-into-table/) statements. The destination table format is specified by the storage format for the data source in Dremio. Typically, this will be Apache Iceberg. +Data loading happens by copying staged parquet files from an object storage bucket to the destination table in Dremio using [COPY INTO](https://docs.dremio.com/cloud/reference/sql/commands/copy-into-table/) statements. The destination table format is specified by the storage format for the data source in Dremio. Typically, this will be Apache Iceberg. > ❗ **Dremio cannot load `fixed_len_byte_array` columns from `parquet` files**. -## Dataset Creation +## Dataset creation Dremio does not support `CREATE SCHEMA` DDL statements. @@ -92,9 +92,9 @@ Therefore, "Metastore" data sources, such as Hive or Glue, require that the data ## Staging support -Using a staging destination is mandatory when using the dremio destination. If you do not set staging to `filesystem`, dlt will automatically do this for you. +Using a staging destination is mandatory when using the Dremio destination. 
If you do not set staging to `filesystem`, dlt will automatically do this for you. -## Table Partitioning and Local Sort +## Table partitioning and local sort Apache Iceberg table partitions and local sort properties can be configured as shown below: ```py import dlt @@ -118,4 +118,5 @@ This will result in `PARTITION BY ("foo","bar")` and `LOCALSORT BY ("baz")` clau ### Syncing of `dlt` state - This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). - \ No newline at end of file + + diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md index 4b8ecec4ca..46290f928e 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md @@ -12,7 +12,7 @@ keywords: [duckdb, destination, data warehouse] pip install "dlt[duckdb]" ``` -## Setup Guide +## Setup guide **1. Initialize a project with a pipeline that loads to DuckDB by running:** ```sh @@ -38,7 +38,7 @@ All write dispositions are supported. ### Data types `duckdb` supports various [timestamp types](https://duckdb.org/docs/sql/data_types/timestamp.html). These can be configured using the column flags `timezone` and `precision` in the `dlt.resource` decorator or the `pipeline.run` method. -- **Precision**: supported precision values are 0, 3, 6, and 9 for fractional seconds. Note that `timezone` and `precision` cannot be used together; attempting to combine them will result in an error. +- **Precision**: Supported precision values are 0, 3, 6, and 9 for fractional seconds. Note that `timezone` and `precision` cannot be used together; attempting to combine them will result in an error. - **Timezone**: - Setting `timezone=False` maps to `TIMESTAMP`. - Setting `timezone=True` (or omitting the flag, which defaults to `True`) maps to `TIMESTAMP WITH TIME ZONE` (`TIMESTAMPTZ`). 
@@ -73,8 +73,8 @@ pipeline.run(events()) ### Names normalization `dlt` uses the standard **snake_case** naming convention to keep identical table and column identifiers across all destinations. If you want to use the **duckdb** wide range of characters (i.e., emojis) for table and column names, you can switch to the **duck_case** naming convention, which accepts almost any string as an identifier: -* `\n` `\r` and `"` are translated to `_` -* multiple `_` are translated to a single `_` +* New line (`\n`), carriage return (`\r`), and double quotes (`"`) are translated to an underscore (`_`). +* Consecutive underscores (`_`) are translated to a single underscore (`_`). Switch the naming convention using `config.toml`: ```toml @@ -93,19 +93,19 @@ dlt.config["schema.naming"] = "duck_case" ## Supported file formats -You can configure the following file formats to load data to duckdb: -* [insert-values](../file-formats/insert-format.md) is used by default -* [parquet](../file-formats/parquet.md) is supported +You can configure the following file formats to load data into duckdb: +* [insert-values](../file-formats/insert-format.md) is used by default. +* [parquet](../file-formats/parquet.md) is supported. :::note `duckdb` cannot COPY many parquet files to a single table from multiple threads. In this situation, `dlt` serializes the loads. Still, that may be faster than INSERT. ::: * [jsonl](../file-formats/jsonl.md) :::tip -`duckdb` has [timestamp types](https://duckdb.org/docs/sql/data_types/timestamp.html) with resolutions from milliseconds to nanoseconds. However -only microseconds resolution (the most common used) is time zone aware. `dlt` generates timestamps with timezones by default so loading parquet files +`duckdb` has [timestamp types](https://duckdb.org/docs/sql/data_types/timestamp.html) with resolutions from milliseconds to nanoseconds. However, +only the microseconds resolution (the most commonly used) is time zone aware. 
`dlt` generates timestamps with timezones by default, so loading parquet files with default settings will fail (`duckdb` does not coerce tz-aware timestamps to naive timestamps). -Disable the timezones by changing `dlt` [parquet writer settings](../file-formats/parquet.md#writer-settings) as follows: +Disable the timezones by changing the `dlt` [Parquet writer settings](../file-formats/parquet.md#writer-settings) as follows: ```sh DATA_WRITER__TIMESTAMP_TIMEZONE="" ``` @@ -116,7 +116,7 @@ to disable tz adjustments. `duckdb` can create unique indexes for columns with `unique` hints. However, **this feature is disabled by default** as it can significantly slow down data loading. -## Destination Configuration +## Destination configuration By default, a DuckDB database will be created in the current working directory with a name `.duckdb` (`chess.duckdb` in the example above). After loading, it is available in `read/write` mode via `with pipeline.sql_client() as con:`, which is a wrapper over `DuckDBPyConnection`. See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details. @@ -152,7 +152,7 @@ p = dlt.pipeline( dev_mode=False, ) -# Or if you would like to use in-memory duckdb instance +# Or if you would like to use an in-memory duckdb instance db = duckdb.connect(":memory:") p = pipeline_one = dlt.pipeline( pipeline_name="in_memory_pipeline", @@ -175,7 +175,7 @@ print(db.sql("DESCRIBE;")) ``` :::note -Be careful! The in-memory instance of the database will be destroyed, once your Python script exits. +Be careful! The in-memory instance of the database will be destroyed once your Python script exits. ::: This destination accepts database connection strings in the format used by [duckdb-engine](https://github.com/Mause/duckdb_engine#configuration). @@ -187,10 +187,10 @@ destination.duckdb.credentials="duckdb:///_storage/test_quack.duckdb" The **duckdb://** URL above creates a **relative** path to `_storage/test_quack.duckdb`. 
To define an **absolute** path, you need to specify four slashes, i.e., `duckdb:////_storage/test_quack.duckdb`. -Dlt supports a unique connection string that triggers specific behavior for duckdb destination: +`dlt` supports a special connection string that triggers specific behavior for the `duckdb` destination: * **:pipeline:** creates the database in the working directory of the pipeline, naming it `quack.duckdb`. -Please see the code snippets below showing how to use it +Please see the code snippets below showing how to use it: 1. Via `config.toml` ```toml @@ -213,9 +213,10 @@ create_indexes=true ``` ### dbt support -This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-duckdb](https://github.com/jwills/dbt-duckdb), which is a community-supported package. The `duckdb` database is shared with `dbt`. In rare cases, you may see information that the binary database format does not match the database format expected by `dbt-duckdb`. You can avoid that by updating the `duckdb` package in your `dlt` project with `pip install -U`. +This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-duckdb](https://github.com/jwills/dbt-duckdb), which is a community-supported package. The `duckdb` database is shared with `dbt`. In rare cases, you may see a message that the binary database format does not match the database format expected by `dbt-duckdb`. You can avoid this by updating the `duckdb` package in your `dlt` project with `pip install -U duckdb`. ### Syncing of `dlt` state This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). 
+ diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index cfeb03655c..a456fa6e7d 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -1,8 +1,8 @@ # Cloud storage and filesystem -The filesystem destination stores data in remote file systems and cloud storage services like **AWS S3**, **Google Cloud Storage**, or **Azure Blob Storage**. Underneath, it uses [fsspec](https://github.com/fsspec/filesystem_spec) to abstract file operations. Its primary role is to be used as a staging for other destinations, but you can also quickly build a data lake with it. +The filesystem destination stores data in remote file systems and cloud storage services like **AWS S3**, **Google Cloud Storage**, or **Azure Blob Storage**. Underneath, it uses [fsspec](https://github.com/fsspec/filesystem_spec) to abstract file operations. Its primary role is to be used as a staging area for other destinations, but you can also quickly build a data lake with it. :::tip -Please read the notes on the layout of the data files. Currently, we are getting feedback on it. Please join our Slack (icon at the top of the page) and help us find the optimal layout. +Please read the notes on the layout of the data files. Currently, we are receiving feedback on it. Please join our Slack (icon at the top of the page) and help us find the optimal layout. ::: ## Install dlt with filesystem @@ -13,7 +13,7 @@ Install the dlt library with filesystem dependencies: pip install "dlt[filesystem]" ``` -This installs `s3fs` and `botocore` packages. +This installs the `s3fs` and `botocore` packages. :::caution @@ -25,7 +25,7 @@ pip install s3fs so pip does not fail on backtracking. 
::: -## Initialise the dlt project +## Initialize the dlt project Let's start by initializing a new dlt project as follows: ```sh @@ -33,13 +33,13 @@ dlt init chess filesystem ``` :::note -This command will initialize your pipeline with chess as the source and the AWS S3 as the destination. +This command will initialize your pipeline with chess as the source and AWS S3 as the destination. ::: ## Set up the destination and credentials ### AWS S3 -The command above creates a sample `secrets.toml` and requirements file for AWS S3 bucket. You can install those dependencies by running: +The command above creates a sample `secrets.toml` and requirements file for an AWS S3 bucket. You can install those dependencies by running: ```sh pip install -r requirements.txt ``` @@ -72,14 +72,14 @@ region_name="eu-central-1" You need to create an S3 bucket and a user who can access that bucket. dlt does not create buckets automatically. 1. You can create the S3 bucket in the AWS console by clicking on "Create Bucket" in S3 and assigning the appropriate name and permissions to the bucket. -2. Once the bucket is created, you'll have the bucket URL. For example, If the bucket name is `dlt-ci-test-bucket`, then the bucket URL will be: +2. Once the bucket is created, you'll have the bucket URL. For example, if the bucket name is `dlt-ci-test-bucket`, then the bucket URL will be: ```text s3://dlt-ci-test-bucket ``` -3. To grant permissions to the user being used to access the S3 bucket, go to the IAM > Users, and click on “Add Permissions”. -4. Below you can find a sample policy that gives a minimum permission required by dlt to a bucket we created above. The policy contains permissions to list files in a bucket, get, put, and delete objects. **Remember to place your bucket name in the Resource section of the policy!** +3. To grant permissions to the user being used to access the S3 bucket, go to IAM > Users, and click on “Add Permissions”. +4. 
Below you can find a sample policy that gives the minimum permission required by dlt to a bucket we created above. The policy contains permissions to list files in a bucket, get, put, and delete objects. **Remember to place your bucket name in the Resource section of the policy!** ```json { @@ -103,12 +103,12 @@ You need to create an S3 bucket and a user who can access that bucket. dlt does ] } ``` -5. To grab the access and secret key for the user. Go to IAM > Users and in the “Security Credentials”, click on “Create Access Key”, and preferably select “Command Line Interface” and create the access key. -6. Grab the “Access Key” and “Secret Access Key” created that are to be used in "secrets.toml". +5. To obtain the access and secret key for the user, go to IAM > Users and in the “Security Credentials”, click on “Create Access Key”, and preferably select “Command Line Interface” and create the access key. +6. Obtain the “Access Key” and “Secret Access Key” created that are to be used in "secrets.toml". #### Using S3 compatible storage -To use an S3 compatible storage other than AWS S3 like [MinIO](https://min.io/) or [Cloudflare R2](https://www.cloudflare.com/en-ca/developer-platform/r2/), you may supply an `endpoint_url` in the config. This should be set along with AWS credentials: +To use an S3 compatible storage other than AWS S3, such as [MinIO](https://min.io/) or [Cloudflare R2](https://www.cloudflare.com/en-ca/developer-platform/r2/), you may supply an `endpoint_url` in the config. This should be set along with AWS credentials: ```toml [destination.filesystem] @@ -120,7 +120,7 @@ aws_secret_access_key = "please set me up!" 
# copy the secret access key here endpoint_url = "https://.r2.cloudflarestorage.com" # copy your endpoint URL here ``` -#### Adding Additional Configuration +#### Adding additional configuration To pass any additional arguments to `fsspec`, you may supply `kwargs` and `client_kwargs` in the config as a **stringified dictionary**: @@ -130,7 +130,7 @@ kwargs = '{"use_ssl": true, "auto_mkdir": true}' client_kwargs = '{"verify": "public.crt"}' ``` -### Google Storage +### Google storage Run `pip install "dlt[gs]"` which will install the `gcfs` package. To edit the `dlt` credentials file with your secret info, open `.dlt/secrets.toml`. @@ -146,10 +146,10 @@ private_key = "private_key" # please set me up! client_email = "client_email" # please set me up! ``` :::note -Note that you can share the same credentials with BigQuery, replace the `[destination.filesystem.credentials]` section with a less specific one: `[destination.credentials]` which applies to both destinations +Note that you can share the same credentials with BigQuery, replace the `[destination.filesystem.credentials]` section with a less specific one: `[destination.credentials]` which applies to both destinations. ::: -if you have default google cloud credentials in your environment (i.e. on cloud function) remove the credentials sections above and `dlt` will fall back to the available default. +If you have default Google Cloud credentials in your environment (i.e., on cloud function), remove the credentials sections above and `dlt` will fall back to the available default. Use **Cloud Storage** admin to create a new bucket. Then assign the **Storage Object Admin** role to your service account. @@ -157,13 +157,13 @@ Use **Cloud Storage** admin to create a new bucket. Then assign the **Storage Ob Run `pip install "dlt[az]"` which will install the `adlfs` package to interface with Azure Blob Storage. 
-Edit the credentials in `.dlt/secrets.toml`, you'll see AWS credentials by default replace them with your Azure credentials. +Edit the credentials in `.dlt/secrets.toml`. You'll see AWS credentials by default; replace them with your Azure credentials. Two forms of Azure credentials are supported: #### SAS token credentials -Supply storage account name and either sas token or storage account key +Supply the storage account name and either a SAS token or a storage account key. ```toml [destination.filesystem] @@ -177,13 +177,13 @@ azure_storage_account_key = "account_key" # please set me up! azure_storage_sas_token = "sas_token" # please set me up! ``` -If you have the correct Azure credentials set up on your machine (e.g. via azure cli), +If you have the correct Azure credentials set up on your machine (e.g., via Azure CLI), you can omit both `azure_storage_account_key` and `azure_storage_sas_token` and `dlt` will fall back to the available default. Note that `azure_storage_account_name` is still required as it can't be inferred from the environment. #### Service principal credentials -Supply a client ID, client secret and a tenant ID for a service principal authorized to access your container +Supply a client ID, client secret, and a tenant ID for a service principal authorized to access your container. ```toml [destination.filesystem] @@ -197,7 +197,7 @@ azure_tenant_id = "tenant_id" # please set me up! :::caution **Concurrent blob uploads** -`dlt` limits the number of concurrent connections for a single uploaded blob to 1. By default `adlfs` that we use, splits blobs into 4 MB chunks and uploads them concurrently which leads to gigabytes of used memory and thousands of connections for a larger load packages. You can increase the maximum concurrency as follows: +`dlt` limits the number of concurrent connections for a single uploaded blob to 1. 
By default, `adlfs`, which we use, splits blobs into 4 MB chunks and uploads them concurrently, which leads to gigabytes of used memory and thousands of connections for larger load packages. You can increase the maximum concurrency as follows: ```toml [destination.filesystem.kwargs] max_concurrency=3 @@ -206,7 +206,7 @@ max_concurrency=3 ### Local file system -If for any reason you want to have those files in a local folder, set up the `bucket_url` as follows (you are free to use `config.toml` for that as there are no secrets required) +If for any reason you want to have those files in a local folder, set up the `bucket_url` as follows (you are free to use `config.toml` for that as there are no secrets required): ```toml [destination.filesystem] @@ -221,20 +221,20 @@ For handling deeply nested layouts, consider enabling automatic directory creati kwargs = '{"auto_mkdir": true}' ``` -Or by setting environment variable: +Or by setting an environment variable: ```sh export DESTINATION__FILESYSTEM__KWARGS = '{"auto_mkdir": true/false}' ``` ::: -`dlt` correctly handles the native local file paths. Indeed, using the `file://` schema may be not intuitive especially for Windows users. +`dlt` correctly handles the native local file paths. Indeed, using the `file://` scheme may not be intuitive, especially for Windows users. ```toml [destination.unc_destination] bucket_url = 'C:\a\b\c' ``` -In the example above we specify `bucket_url` using **toml's literal strings** that do not require [escaping of backslashes](https://github.com/toml-lang/toml/blob/main/toml.md#string). +In the example above, we specify `bucket_url` using **TOML's literal strings** that do not require [escaping of backslashes](https://github.com/toml-lang/toml/blob/main/toml.md#string). 
```toml [destination.unc_destination] @@ -247,14 +247,12 @@ bucket_url = '/var/local/data' # absolute POSIX style path bucket_url = '_storage/data' # relative POSIX style path ``` -In the examples above we define a few named filesystem destinations: -* **unc_destination** demonstrates Windows UNC path in native form -* **posix_destination** demonstrates native POSIX (Linux/Mac) absolute path -* **relative_destination** demonstrates native POSIX (Linux/Mac) relative path. In this case `filesystem` destination will store files in `$cwd/_storage/data` path -where **$cwd** is your current working directory. +In the examples above, we define a few named filesystem destinations: +* **unc_destination** demonstrates a Windows UNC path in native form. +* **posix_destination** demonstrates a native POSIX (Linux/Mac) absolute path. +* **relative_destination** demonstrates a native POSIX (Linux/Mac) relative path. In this case, the `filesystem` destination will store files in the `$cwd/_storage/data` path, where **$cwd** is your current working directory. -`dlt` supports Windows [UNC paths with file:// scheme](https://en.wikipedia.org/wiki/File_URI_scheme). They can be specified using **host** or purely as **path** -component. +`dlt` supports Windows [UNC paths with the file:// scheme](https://en.wikipedia.org/wiki/File_URI_scheme). They can be specified using **host** or purely as a **path** component. ```toml [destination.unc_with_host] @@ -265,9 +263,9 @@ bucket_url="file:////localhost/c$/a/b/c" ``` :::caution -Windows supports paths up to 255 characters. When you access a path longer than 255 characters you'll see `FileNotFound` exception. +Windows supports paths up to 255 characters. When you access a path longer than 255 characters, you'll see a `FileNotFound` exception. - To go over this limit you can use [extended paths](https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry). 
`dlt` recognizes both regular and UNC extended paths +To overcome this limit, you can use [extended paths](https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry). `dlt` recognizes both regular and UNC extended paths. ```toml [destination.regular_extended] @@ -279,7 +277,7 @@ bucket_url='\\?\UNC\localhost\c$\a\b\c' ::: ### SFTP -Run `pip install "dlt[sftp]` which will install the `paramiko` package alongside `dlt`, enabling secure SFTP transfers. +Run `pip install "dlt[sftp]"` which will install the `paramiko` package alongside `dlt`, enabling secure SFTP transfers. Configure your SFTP credentials by editing the `.dlt/secrets.toml` file. By default, the file contains placeholders for AWS credentials. You should replace these with your SFTP credentials. @@ -306,7 +304,7 @@ sftp_gss_trust_dns # Trust DNS for GSS-API, defaults to True ``` > For more information about credentials parameters: https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect -### Authentication Methods +### Authentication methods SFTP authentication is attempted in the following order of priority: @@ -316,10 +314,10 @@ SFTP authentication is attempted in the following order of priority: 3. **Username/Password authentication**: If a password is provided (`sftp_password`), plain username/password authentication will be attempted. -4. **GSS-API authentication**: If GSS-API (Kerberos) is enabled (sftp_gss_auth=True), authentication will use the Kerberos protocol. GSS-API may also be used for key exchange (sftp_gss_kex=True) and credential delegation (sftp_gss_deleg_creds=True). This method is useful in environments where Kerberos is set up, often in enterprise networks. +4. **GSS-API authentication**: If GSS-API (Kerberos) is enabled (`sftp_gss_auth=True`), authentication will use the Kerberos protocol. GSS-API may also be used for key exchange (`sftp_gss_kex=True`) and credential delegation (`sftp_gss_deleg_creds=True`). 
This method is useful in environments where Kerberos is set up, often in enterprise networks. -#### 1. **Key-based Authentication** +#### 1. Key-based authentication If you use an SSH key instead of a password, you can specify the path to your private key in the configuration. @@ -334,7 +332,7 @@ sftp_key_filename = "/path/to/id_rsa" # Replace with the path to your privat sftp_key_passphrase = "your_passphrase" # Optional: passphrase for your private key ``` -#### 2. **SSH Agent-based Authentication** +#### 2. SSH agent-based authentication If you have an SSH agent running with loaded keys, you can allow Paramiko to use these keys automatically. You can omit the password and key fields if you're relying on the SSH agent. @@ -349,7 +347,7 @@ sftp_key_passphrase = "your_passphrase" # Optional: passphrase for your privat ``` The loaded key must be one of the following types stored in ~/.ssh/: id_rsa, id_dsa, or id_ecdsa. -#### 3. **Username/Password Authentication** +#### 3. Username and password authentication This is the simplest form of authentication, where you supply a username and password directly. @@ -365,7 +363,7 @@ sftp_password = "pass" # Replace "pass" with your SFTP passwor ### Notes: -- **Key-based Authentication**: Make sure your private key has the correct permissions (`chmod 600`), or SSH will refuse to use it. +- **Key-based authentication**: Make sure your private key has the correct permissions (`chmod 600`), or SSH will refuse to use it. - **Timeouts**: It's important to adjust timeout values based on your network conditions to avoid connection issues. This configuration allows flexible SFTP authentication, whether you're using passwords, keys, or agents, and ensures secure communication between your local environment and the SFTP server. 
@@ -376,7 +374,7 @@ The filesystem destination handles the write dispositions as follows: - `replace` - all files that belong to such tables are deleted from the dataset folder, and then the current set of files is added. - `merge` - falls back to `append` -### 🧪 `merge` with `delta` table format +### 🧪 Merge with delta table format The [`upsert`](../../general-usage/incremental-loading.md#upsert-strategy) merge strategy is supported when using the [`delta`](#delta-table-format) table format. :::caution @@ -396,10 +394,10 @@ def my_upsert_resource(): #### Known limitations - `hard_delete` hint not supported -- deleting records from nested tables not supported - - This means updates to json columns that involve element removals are not propagated. For example, if you first load `{"key": 1, "nested": [1, 2]}` and then load `{"key": 1, "nested": [1]}`, then the record for element `2` will not be deleted from the nested table. +- Deleting records from nested tables not supported + - This means updates to JSON columns that involve element removals are not propagated. For example, if you first load `{"key": 1, "nested": [1, 2]}` and then load `{"key": 1, "nested": [1]}`, then the record for element `2` will not be deleted from the nested table. -## File Compression +## File compression The filesystem destination in the dlt library uses `gzip` compression by default for efficiency, which may result in the files being stored in a compressed format. This format may not be easily readable as plain text or JSON Lines (`jsonl`) files. If you encounter files that seem unreadable, they may be compressed. @@ -414,16 +412,16 @@ disable_compression=true - To decompress a `gzip` file, you can use tools like `gunzip`. This will convert the compressed file back to its original format, making it readable. 
-For more details on managing file compression, please visit our documentation on performance optimization: [Disabling and Enabling File Compression](https://dlthub.com/docs/reference/performance#disabling-and-enabling-file-compression). +For more details on managing file compression, please visit our documentation on performance optimization: [Disabling and enabling file compression](../../reference/performance#disabling-and-enabling-file-compression). ## Files layout All the files are stored in a single folder with the name of the dataset that you passed to the `run` or `load` methods of the `pipeline`. In our example chess pipeline, it is **chess_players_games_data**. :::note -Object storages are, in fact, key-blob storage so the folder structure is emulated by splitting file names into components by separator (`/`). +Object storages are, in fact, key-blob storage, so the folder structure is emulated by splitting file names into components by a separator (`/`). ::: -You can control files layout by specifying the desired configuration. There are several ways to do this. +You can control the files layout by specifying the desired configuration. There are several ways to do this. ### Default layout @@ -438,10 +436,10 @@ The default layout format has changed from `{schema_name}.{table_name}.{load_id} #### Standard placeholders * `schema_name` - the name of the [schema](../../general-usage/schema.md) -* `table_name` - table name -* `load_id` - the id of the [load package](../../general-usage/destination-tables.md#load-packages-and-load-ids) from which the file comes from -* `file_id` - the id of the file, is there are many files with data for a single table, they are copied with different file ids -* `ext` - a format of the file i.e. 
`jsonl` or `parquet` +* `table_name` - the table name +* `load_id` - the ID of the [load package](../../general-usage/destination-tables.md#load-packages-and-load-ids) from which the file comes +* `file_id` - the ID of the file; if there are many files with data for a single table, they are copied with different file IDs +* `ext` - the format of the file, i.e., `jsonl` or `parquet` #### Date and time placeholders :::tip @@ -454,7 +452,7 @@ Keep in mind all values are lowercased. * `load_package_timestamp_ms` - timestamp from [load package](../../general-usage/destination-tables.md#load-packages-and-load-ids) in Unix Timestamp format in milliseconds :::note -Both `timestamp_ms` and `load_package_timestamp_ms` are in milliseconds (e.g., 12334455233), not fractional seconds to make sure millisecond precision without decimals. +Both `timestamp_ms` and `load_package_timestamp_ms` are in milliseconds (e.g., 12334455233), not fractional seconds to ensure millisecond precision without decimals. ::: * Years @@ -487,7 +485,7 @@ Both `timestamp_ms` and `load_package_timestamp_ms` are in milliseconds (e.g., 1 * `ddd` - Mon, Tue, Wed * `dd` - Mo, Tu, We * `d` - 0-6 -* `Q` - quarters 1, 2, 3, 4, +* `Q` - quarters 1, 2, 3, 4 You can change the file name format by providing the layout setting for the filesystem destination like so: ```toml @@ -512,14 +510,14 @@ layout="{table_name}/{load_id}.{file_id}.{ext}" # current preconfigured naming s A few things to know when specifying your filename layout: - If you want a different base path that is common to all filenames, you can suffix your `bucket_url` rather than prefix your `layout` setting. - If you do not provide the `{ext}` placeholder, it will automatically be added to your layout at the end with a dot as a separator. -- It is the best practice to have a separator between each placeholder. Separators can be any character allowed as a filename character, but dots, dashes, and forward slashes are most common. 
-- When you are using the `replace` disposition, `dlt` will have to be able to figure out the correct files to delete before loading the new data. For this to work, you have to +- It is best practice to have a separator between each placeholder. Separators can be any character allowed as a filename character, but dots, dashes, and forward slashes are most common. +- When you are using the `replace` disposition, `dlt` will have to be able to figure out the correct files to delete before loading the new data. For this to work, you have to: - include the `{table_name}` placeholder in your layout - not have any other placeholders except for the `{schema_name}` placeholder before the table_name placeholder and - have a separator after the table_name placeholder Please note: -- `dlt` will mark complete loads by creating a json file in the `./_dlt_loads` folders that corresponds to the`_dlt_loads` table. For example, if `chess__1685299832.jsonl` file is present in the loads folder, you can be sure that all files for the load package `1685299832` are completely loaded +- `dlt` will mark complete loads by creating a JSON file in the `./_dlt_loads` folder that corresponds to the `_dlt_loads` table. For example, if the `chess__1685299832.jsonl` file is present in the loads folder, you can be sure that all files for the load package `1685299832` are completely loaded. ### Advanced layout configuration @@ -564,10 +562,10 @@ pipeline = dlt.pipeline( ) ``` -Furthermore, it is possible to +Furthermore, it is possible to: 1. Customize the behavior with callbacks for extra placeholder functionality. Each callback must accept the following positional arguments and return a string. -2. Customize the `current_datetime`, which can also be a callback function and expected to return a `pendulum.DateTime` instance. +2. Customize the `current_datetime`, which can also be a callback function and is expected to return a `pendulum.DateTime` instance. 
```py import pendulum @@ -603,22 +601,25 @@ layout="{table_name}/{load_id}.{file_id}.{ext}" ``` Adopting this layout offers several advantages: -1. **Efficiency:** it's fast and simple to process. -2. **Compatibility:** supports `replace` as the write disposition method. -3. **Flexibility:** compatible with various destinations, including Athena. -4. **Performance:** a deeply nested structure can slow down file navigation, whereas a simpler layout mitigates this issue. +1. **Efficiency:** It's fast and simple to process. +2. **Compatibility:** Supports `replace` as the write disposition method. +3. **Flexibility:** Compatible with various destinations, including Athena. +4. **Performance:** A deeply nested structure can slow down file navigation, whereas a simpler layout mitigates this issue. ## Supported file formats + You can choose the following file formats: * [jsonl](../file-formats/jsonl.md) is used by default * [parquet](../file-formats/parquet.md) is supported * [csv](../file-formats/csv.md) is supported ## Supported table formats + You can choose the following table formats: * [Delta](../table-formats/delta.md) is supported ### Delta table format + You need the `deltalake` package to use this format: ```sh @@ -663,12 +664,12 @@ You can pass storage options by configuring `destination.filesystem.deltalake_st ```toml [destination.filesystem] -deltalake_storage_options = '{"AWS_S3_LOCKING_PROVIDER": "dynamodb", DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}' +deltalake_storage_options = '{"AWS_S3_LOCKING_PROVIDER": "dynamodb", "DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}' ``` `dlt` passes these options to the `storage_options` argument of the `write_deltalake` method in the `deltalake` library. Look at their [documentation](https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake) to see which options can be used. -You don't need to specify credentials here. 
`dlt` merges the required credentials with the options you provided, before passing it as `storage_options`. +You don't need to specify credentials here. `dlt` merges the required credentials with the options you provided before passing them as `storage_options`. >❗When using `s3`, you need to specify storage options to [configure](https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/) locking behavior. @@ -692,11 +693,10 @@ delta_tables["another_delta_table"].optimize.z_order(["col_a", "col_b"]) ``` ## Syncing of `dlt` state -This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). To this end, special folders and files that will be created at your destination which hold information about your pipeline state, schemas and completed loads. These folders DO NOT respect your -settings in the layout section. When using filesystem as a staging destination, not all of these folders are created, as the state and schemas are -managed in the regular way by the final destination you have configured. +This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). To this end, special folders and files that hold information about your pipeline state, schemas, and completed loads will be created at your destination. These folders DO NOT respect your settings in the layout section. When using filesystem as a staging destination, not all of these folders are created, as the state and schemas are managed in the regular way by the final destination you have configured. + +You will also notice `init` files being present in the root folder and the special `dlt` folders. In the absence of the concepts of schemas and tables in blob storages and directories, `dlt` uses these special files to harmonize the behavior of the `filesystem` destination with the other implemented destinations. 
-You will also notice `init` files being present in the root folder and the special `dlt` folders. In the absence of the concepts of schemas and tables -in blob storages and directories, `dlt` uses these special files to harmonize the behavior of the `filesystem` destination with the other implemented destinations. +**Note:** When a load generates a new state, for example when using incremental loads, a new state file appears in the `_dlt_pipeline_state` folder at the destination. To prevent data accumulation, state cleanup mechanisms automatically remove old state files, retaining only the latest 100 by default. This cleanup process can be customized or disabled using the filesystem configuration `max_state_files`, which determines the maximum number of pipeline state files to retain (default is 100). Setting this value to 0 or a negative number disables the cleanup of old states. - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/destinations/index.md b/docs/website/docs/dlt-ecosystem/destinations/index.md index fef79d4364..e1bc6bfd92 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/index.md +++ b/docs/website/docs/dlt-ecosystem/destinations/index.md @@ -14,3 +14,4 @@ Pick one of our high-quality destinations and load your data into a local databa Otherwise, pick a destination below: + diff --git a/docs/website/docs/dlt-ecosystem/destinations/lancedb.md b/docs/website/docs/dlt-ecosystem/destinations/lancedb.md index 0d726508e6..083d196aea 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/lancedb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/lancedb.md @@ -9,13 +9,12 @@ keywords: [ lancedb, vector database, destination, dlt ] [LanceDB](https://lancedb.com/) is an open-source, high-performance vector database. It allows you to store data objects and perform similarity searches over them. This destination helps you load data into LanceDB from [dlt resources](../../general-usage/resource.md). 
-## Setup Guide +## Setup guide -### Choosing a Model Provider +### Choosing a model provider First, you need to decide which embedding model provider to use. You can find all supported providers by visiting the official [LanceDB docs](https://lancedb.github.io/lancedb/embeddings/default_embedding_functions/). - ### Install dlt with LanceDB To use LanceDB as a destination, make sure `dlt` is installed with the `lancedb` extra: @@ -24,9 +23,9 @@ To use LanceDB as a destination, make sure `dlt` is installed with the `lancedb` pip install "dlt[lancedb]" ``` -the lancedb extra only installs `dlt` and `lancedb`. You will need to install your model provider's SDK. +The `lancedb` extra only installs `dlt` and `lancedb`. You will need to install your model provider's SDK. -You can find which libraries you need to also referring to the [LanceDB docs](https://lancedb.github.io/lancedb/embeddings/default_embedding_functions/). +You can also find which libraries you need by referring to the [LanceDB docs](https://lancedb.github.io/lancedb/embeddings/default_embedding_functions/). ### Configure the destination @@ -43,14 +42,14 @@ embedding_model_provider_api_key = "embedding_model_provider_api_key" # Not need ``` - The `uri` specifies the location of your LanceDB instance. It defaults to a local, on-disk instance if not provided. -- The `api_key` is your api key for LanceDB Cloud connections. If you're using LanceDB OSS, you don't need to supply this key. +- The `api_key` is your API key for LanceDB Cloud connections. If you're using LanceDB OSS, you don't need to supply this key. - The `embedding_model_provider` specifies the embedding provider used for generating embeddings. The default is `cohere`. - The `embedding_model` specifies the model used by the embedding provider for generating embeddings. Check with the embedding provider which options are available. Reference https://lancedb.github.io/lancedb/embeddings/default_embedding_functions/. 
-- The `embedding_model_provider_api_key` is the API key for the embedding model provider used to generate embeddings. If you're using a provider that doesn't need authentication, say ollama, you don't need to supply this key. +- The `embedding_model_provider_api_key` is the API key for the embedding model provider used to generate embeddings. If you're using a provider that doesn't need authentication, such as Ollama, you don't need to supply this key. -:::info Available Model Providers +:::info Available model providers - "gemini-text" - "bedrock-text" - "cohere" @@ -115,10 +114,9 @@ info = pipeline.run( The data is now loaded into LanceDB. -To use **vector search** after loading, you **must specify which fields LanceDB should generate embeddings for**. Do this by wrapping the data (or dlt resource) with the **`lancedb_adapter`** -function. +To use **vector search** after loading, you **must specify which fields LanceDB should generate embeddings for**. Do this by wrapping the data (or dlt resource) with the **`lancedb_adapter`** function. -## Using an Adapter to Specify Columns to Vectorise +## Using an adapter to specify columns to vectorize Out of the box, LanceDB will act as a normal database. To use LanceDB's embedding facilities, you'll need to specify which fields you'd like to embed in your dlt resource. @@ -130,7 +128,7 @@ lancedb_adapter(data, embed) It accepts the following arguments: -- `data`: a dlt resource object, or a Python data structure (e.g. a list of dictionaries). +- `data`: a dlt resource object, or a Python data structure (e.g., a list of dictionaries). - `embed`: a name of the field or a list of names to generate embeddings for. Returns: [dlt resource](../../general-usage/resource.md) object that you can pass to the `pipeline.run()`. 
@@ -154,7 +152,7 @@ pipeline = dlt.pipeline( destination="lancedb", ) -# apply adapter to the needed resources +# Apply adapter to the needed resources lancedb_adapter(products_tables.products, embed="description") lancedb_adapter(products_tables.customers, embed="bio") @@ -198,14 +196,13 @@ pipeline.run( This is the default disposition. It will append the data to the existing data in the destination. -## Additional Destination Options +## Additional destination options - `dataset_separator`: The character used to separate the dataset name from table names. Defaults to "___". - `vector_field_name`: The name of the special field to store vector embeddings. Defaults to "vector". - `id_field_name`: The name of the special field used for deduplication and merging. Defaults to "id__". - `max_retries`: The maximum number of retries for embedding operations. Set to 0 to disable retries. Defaults to 3. - ## dbt support The LanceDB destination doesn't support dbt integration. @@ -214,9 +211,9 @@ The LanceDB destination doesn't support dbt integration. The LanceDB destination supports syncing of the `dlt` state. -## Current Limitations +## Current limitations -### In-Memory Tables +### In-memory tables Adding new fields to an existing LanceDB table requires loading the entire table data into memory as a PyArrow table. This is because PyArrow tables are immutable, so adding fields requires creating a new table with the updated schema. @@ -228,7 +225,7 @@ Keep these considerations in mind when working with large datasets and monitor m OpenAI embedding service doesn't accept empty string bodies. We deal with this by replacing empty strings with a placeholder that should be very semantically dissimilar to 99.9% of queries. -If your source column (column which is embedded) has empty values, it is important to consider the impact of this. There might be a _slight_ change that semantic queries can hit these empty strings. 
+If your source column (column which is embedded) has empty values, it is important to consider the impact of this. There might be a _slight_ chance that semantic queries can hit these empty strings. We reported this issue to LanceDB: https://github.com/lancedb/lancedb/issues/1577. diff --git a/docs/website/docs/dlt-ecosystem/destinations/motherduck.md b/docs/website/docs/dlt-ecosystem/destinations/motherduck.md index f75314bb44..d914fab02e 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/motherduck.md +++ b/docs/website/docs/dlt-ecosystem/destinations/motherduck.md @@ -21,7 +21,7 @@ workers=3 or export the **LOAD__WORKERS=3** env variable. See more in [performance](../../reference/performance.md) ::: -## Setup Guide +## Setup guide **1. Initialize a project with a pipeline that loads to MotherDuck by running** ```sh @@ -71,7 +71,7 @@ By default, Parquet files and the `COPY` command are used to move files to the r The **INSERT** format is also supported and will execute large INSERT queries directly into the remote database. This method is significantly slower and may exceed the maximum query size, so it is not advised. ## dbt support -This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-duckdb](https://github.com/jwills/dbt-duckdb), which is a community-supported package. `dbt` version >= 1.7 is required +This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-duckdb](https://github.com/jwills/dbt-duckdb), which is a community-supported package. `dbt` version >= 1.7 is required. ## Multi-statement transaction support Motherduck supports multi-statement transactions. This change happened with `duckdb 0.10.2`. @@ -81,9 +81,9 @@ This destination fully supports [dlt state sync](../../general-usage/state#synci ## Troubleshooting -### My database is attached in read only mode -ie. 
`Error: Invalid Input Error: Cannot execute statement of type "CREATE" on database "dlt_data" which is attached in read-only mode!` -We encountered this problem for databases created with `duckdb 0.9.x` and then migrated to `0.10.x`. After switch to `1.0.x` on Motherduck, all our databases had permission "read-only" visible in UI. We could not figure out how to change it so we dropped and recreated our databases. +### My database is attached in read-only mode +i.e., `Error: Invalid Input Error: Cannot execute statement of type "CREATE" on database "dlt_data" which is attached in read-only mode!` +We encountered this problem for databases created with `duckdb 0.9.x` and then migrated to `0.10.x`. After switching to `1.0.x` on MotherDuck, all our databases had the "read-only" permission visible in the UI. We could not figure out how to change it, so we dropped and recreated our databases. ### I see some exception with home_dir missing when opening `md:` connection. Some internal component (HTTPS) requires the **HOME** env variable to be present. Export such a variable to the command line. Here is what we do in our tests: @@ -94,3 +94,4 @@ before opening the connection. + diff --git a/docs/website/docs/dlt-ecosystem/destinations/mssql.md b/docs/website/docs/dlt-ecosystem/destinations/mssql.md index 0512fd5fca..5589f18d7c 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/mssql.md +++ b/docs/website/docs/dlt-ecosystem/destinations/mssql.md @@ -17,7 +17,7 @@ pip install "dlt[mssql]" ### Prerequisites The _Microsoft ODBC Driver for SQL Server_ must be installed to use this destination. -This cannot be included with `dlt`'s python dependencies, so you must install it separately on your system. You can find the official installation instructions [here](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16). +This cannot be included with `dlt`'s Python dependencies, so you must install it separately on your system.
You can find the official installation instructions [here](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16). Supported driver versions: * `ODBC Driver 18 for SQL Server` @@ -54,7 +54,7 @@ host = "loader.database.windows.net" port = 1433 connect_timeout = 15 [destination.mssql.credentials.query] -# trust self signed SSL certificates +# trust self-signed SSL certificates TrustServerCertificate="yes" # require SSL connection Encrypt="yes" @@ -76,17 +76,17 @@ You can place any ODBC-specific settings into the query string or **destination. destination.mssql.credentials="mssql://loader.database.windows.net/dlt_data?trusted_connection=yes" ``` -**To connect to a local sql server instance running without SSL** pass `encrypt=no` parameter: +**To connect to a local SQL server instance running without SSL**, pass the `encrypt=no` parameter: ```toml destination.mssql.credentials="mssql://loader:loader@localhost/dlt_data?encrypt=no" ``` -**To allow self signed SSL certificate** when you are getting `certificate verify failed:unable to get local issuer certificate`: +**To allow a self-signed SSL certificate** when you are getting `certificate verify failed: unable to get local issuer certificate`: ```toml destination.mssql.credentials="mssql://loader:loader@localhost/dlt_data?TrustServerCertificate=yes" ``` -***To use long strings (>8k) and avoid collation errors**: +**To use long strings (>8k) and avoid collation errors**: ```toml destination.mssql.credentials="mssql://loader:loader@localhost/dlt_data?LongAsMax=yes" ``` @@ -103,7 +103,7 @@ pipeline = dlt.pipeline( All write dispositions are supported. If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized`, the destination tables will be dropped and -recreated with an `ALTER SCHEMA ... TRANSFER`. The operation is atomic: mssql supports DDL transactions. +recreated with an `ALTER SCHEMA ... TRANSFER`. 
The operation is atomic: MSSQL supports DDL transactions. ## Data loading Data is loaded via INSERT statements by default. MSSQL has a limit of 1000 rows per INSERT, and this is what we use. @@ -115,9 +115,9 @@ Data is loaded via INSERT statements by default. MSSQL has a limit of 1000 rows **mssql** will create unique indexes for all columns with `unique` hints. This behavior **may be disabled**. ### Table and column identifiers -SQL Server **with the default collation** uses case insensitive identifiers but will preserve the casing of identifiers that are stored in the INFORMATION SCHEMA. You can use [case sensitive naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations) to keep the identifier casing. Note that you risk to generate identifier collisions, which are detected by `dlt` and will fail the load process. +SQL Server **with the default collation** uses case-insensitive identifiers but will preserve the casing of identifiers that are stored in the INFORMATION SCHEMA. You can use [case-sensitive naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations) to keep the identifier casing. Note that you risk generating identifier collisions, which are detected by `dlt` and will fail the load process. -If you change SQL Server server/database collation to case sensitive, this will also affect the identifiers. Configure your destination as below in order to use case sensitive naming conventions without collisions: +If you change the SQL Server server/database collation to case-sensitive, this will also affect the identifiers. 
Configure your destination as below in order to use case-sensitive naming conventions without collisions: ```toml [destination.mssql] has_case_sensitive_identifiers=true diff --git a/docs/website/docs/dlt-ecosystem/destinations/postgres.md b/docs/website/docs/dlt-ecosystem/destinations/postgres.md index e506eb79fe..eb886c6674 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/postgres.md +++ b/docs/website/docs/dlt-ecosystem/destinations/postgres.md @@ -12,7 +12,7 @@ keywords: [postgres, destination, data warehouse] pip install "dlt[postgres]" ``` -## Setup Guide +## Setup guide **1. Initialize a project with a pipeline that loads to Postgres by running:** ```sh @@ -103,25 +103,22 @@ pipeline = dlt.pipeline(destination="postgres") pipeline.run(events()) ``` -### Fast loading with arrow tables and csv -You can use [arrow tables](../verified-sources/arrow-pandas.md) and [csv](../file-formats/csv.md) to quickly load tabular data. Pick the `csv` loader file format -like below +### Fast loading with arrow tables and CSV +You can use [Arrow tables](../verified-sources/arrow-pandas.md) and [CSV](../file-formats/csv.md) to quickly load tabular data. Pick the CSV loader file format like below: ```py info = pipeline.run(arrow_table, loader_file_format="csv") ``` -In the example above `arrow_table` will be converted to csv with **pyarrow** and then streamed into **postgres** with COPY command. This method skips the regular -`dlt` normalizer used for Python objects and is several times faster. +In the example above, `arrow_table` will be converted to CSV with **pyarrow** and then streamed into **postgres** with the COPY command. This method skips the regular `dlt` normalizer used for Python objects and is several times faster. ## Supported file formats * [insert-values](../file-formats/insert-format.md) is used by default. -* [csv](../file-formats/csv.md) is supported +* [CSV](../file-formats/csv.md) is supported. 
## Supported column hints `postgres` will create unique indexes for all columns with `unique` hints. This behavior **may be disabled**. ### Table and column identifiers -Postgres supports both case sensitive and case insensitive identifiers. All unquoted and lowercase identifiers resolve case-insensitively in SQL statements. Case insensitive [naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations) like the default **snake_case** will generate case insensitive identifiers. Case sensitive (like **sql_cs_v1**) will generate -case sensitive identifiers that must be quoted in SQL statements. +Postgres supports both case-sensitive and case-insensitive identifiers. All unquoted and lowercase identifiers resolve case-insensitively in SQL statements. Case-insensitive [naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations) like the default **snake_case** will generate case-insensitive identifiers. Case-sensitive conventions (like **sql_cs_v1**) will generate case-sensitive identifiers that must be quoted in SQL statements. ## Additional destination options The Postgres destination creates UNIQUE indexes by default on columns with the `unique` hint (i.e., `_dlt_id`). To disable this behavior: @@ -130,8 +127,8 @@ The Postgres destination creates UNIQUE indexes by default on columns with the ` create_indexes=false ``` -### Setting up `csv` format -You can provide [non-default](../file-formats/csv.md#default-settings) csv settings via configuration file or explicitly. +### Setting up `CSV` format +You can provide [non-default](../file-formats/csv.md#default-settings) CSV settings via a configuration file or explicitly. ```toml [destination.postgres.csv_format] delimiter="|" @@ -146,10 +143,10 @@ csv_format = CsvFormatConfiguration(delimiter="|", include_header=False) dest_ = postgres(csv_format=csv_format) ``` -Above we set `csv` file without header, with **|** as a separator.
+Above, we set the `CSV` file without a header, with **|** as a separator. :::tip -You'll need those setting when [importing external files](../../general-usage/resource.md#import-external-files) +You'll need those settings when [importing external files](../../general-usage/resource.md#import-external-files). ::: ### dbt support diff --git a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md index 5fc8097440..c59c4e8bb2 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md +++ b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md @@ -9,7 +9,7 @@ keywords: [qdrant, vector database, destination, dlt] [Qdrant](https://qdrant.tech/) is an open-source, high-performance vector search engine/database. It deploys as an API service, providing a search for the nearest high-dimensional vectors. This destination helps you load data into Qdrant from [dlt resources](../../general-usage/resource.md). -## Setup Guide +## Setup guide 1. To use Qdrant as a destination, make sure `dlt` is installed with the `qdrant` extra: @@ -163,7 +163,7 @@ info = pipeline.run( ) ``` -Internally, dlt will use `primary_key` (`document_id` in the example above) to generate a unique identifier (UUID) for each point in Qdrant. If the object with the same UUID already exists in Qdrant, it will be updated with the new data. Otherwise, a new point will be created. +Internally, dlt will use the `primary_key` (`document_id` in the example above) to generate a unique identifier (UUID) for each point in Qdrant. If the object with the same UUID already exists in Qdrant, it will be updated with the new data. Otherwise, a new point will be created. :::caution @@ -208,7 +208,7 @@ pipeline = dlt.pipeline( - `model`: (str) The name of the FlagEmbedding model to use. See the list of supported models at [Supported Models](https://qdrant.github.io/fastembed/examples/Supported_Models/). The default value is "BAAI/bge-small-en". 
-### [Qdrant Client Options](#qdrant-client-options) +### Qdrant client options The `QdrantClientOptions` class provides options for configuring the Qdrant client. @@ -218,7 +218,7 @@ The `QdrantClientOptions` class provides options for configuring the Qdrant clie - `prefer_grpc`: (bool) If `true`, the client will prefer to use the gRPC interface whenever possible in custom methods. The default value is `false`. -- `https`: (bool) If `true`, the client will use the HTTPS (SSL) protocol. The default value is `true` if an API Key is provided, else `false`. +- `https`: (bool) If `true`, the client will use the HTTPS (SSL) protocol. The default value is `true` if an API key is provided, otherwise `false`. - `prefix`: (str) If set, it adds the specified `prefix` to the REST URL path. For example, setting it to "service/v1" will result in the REST API URL as `http://localhost:6333/service/v1/{qdrant-endpoint}`. Not set by default. @@ -230,7 +230,7 @@ The `QdrantClientOptions` class provides options for configuring the Qdrant clie ### Run Qdrant locally -You can find the setup instructions to run Qdrant [here](https://qdrant.tech/documentation/quick-start/#download-and-run) +You can find the setup instructions to run Qdrant [here](https://qdrant.tech/documentation/quick-start/#download-and-run). ### Syncing of `dlt` state diff --git a/docs/website/docs/dlt-ecosystem/destinations/redshift.md b/docs/website/docs/dlt-ecosystem/destinations/redshift.md index 529424a198..c503ad93e7 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/redshift.md +++ b/docs/website/docs/dlt-ecosystem/destinations/redshift.md @@ -12,7 +12,7 @@ keywords: [redshift, destination, data warehouse] pip install "dlt[redshift]" ``` -## Setup Guide +## Setup guide ### 1. 
Initialize the dlt project Let's start by initializing a new dlt project as follows: @@ -42,12 +42,12 @@ To load data into Redshift, you need to create a Redshift cluster and enable acc ```toml [destination.redshift.credentials] - database = "please set me up!" # copy your database name here - password = "please set me up!" # keep your redshift db instance password here - username = "please set me up!" # keep your redshift db instance username here - host = "please set me up!" # copy your redshift host from cluster endpoint here + database = "please set me up!" # Copy your database name here + password = "please set me up!" # Keep your Redshift db instance password here + username = "please set me up!" # Keep your Redshift db instance username here + host = "please set me up!" # Copy your Redshift host from cluster endpoint here port = 5439 - connect_timeout = 15 # enter the timeout value + connect_timeout = 15 # Enter the timeout value ``` 2. The "host" is derived from the cluster endpoint specified in the “General Configuration.” For example: @@ -63,7 +63,7 @@ To load data into Redshift, you need to create a Redshift cluster and enable acc You can also pass a database connection string similar to the one used by the `psycopg2` library or [SQLAlchemy](https://docs.sqlalchemy.org/en/20/core/engines.html#postgresql). The credentials above will look like this: ```toml -# keep it at the top of your toml file! before any section starts +# Keep it at the top of your toml file! Before any section starts destination.redshift.credentials="redshift://loader:@localhost/dlt_data?connect_timeout=15" ``` @@ -75,8 +75,8 @@ All [write dispositions](../../general-usage/incremental-loading#choosing-a-writ [SQL Insert](../file-formats/insert-format) is used by default. When staging is enabled: -* [jsonl](../file-formats/jsonl.md) is used by default -* [parquet](../file-formats/parquet.md) is supported +* [jsonl](../file-formats/jsonl.md) is used by default. 
+* [parquet](../file-formats/parquet.md) is supported. > ❗ **Redshift cannot load `VARBYTE` columns from `json` files**. `dlt` will fail such jobs permanently. Switch to `parquet` to load binaries. @@ -94,10 +94,10 @@ Amazon Redshift supports the following column hints: - `sort` - This hint creates a SORTKEY to order rows on disk physically. It is used to improve query and join speed in Redshift. Please read the [sort key docs](https://docs.aws.amazon.com/redshift/latest/dg/c_best-practices-sort-key.html) to learn more. ### Table and column identifiers -Redshift **by default** uses case insensitive identifiers and **will lower case all the identifiers** that are stored in the INFORMATION SCHEMA. Do not use -[case sensitive naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations). Letter casing will be removed anyway and you risk to generate identifier collisions, which are detected by `dlt` and will fail the load process. +Redshift **by default** uses case-insensitive identifiers and **will lower case all the identifiers** that are stored in the INFORMATION SCHEMA. Do not use +[case-sensitive naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations). Letter casing will be removed anyway, and you risk generating identifier collisions, which are detected by `dlt` and will fail the load process. -You can [put Redshift in case sensitive mode](https://docs.aws.amazon.com/redshift/latest/dg/r_enable_case_sensitive_identifier.html). Configure your destination as below in order to use case sensitive naming conventions: +You can [put Redshift in case-sensitive mode](https://docs.aws.amazon.com/redshift/latest/dg/r_enable_case_sensitive_identifier.html). 
Configure your destination as below in order to use case-sensitive naming conventions: ```toml [destination.redshift] has_case_sensitive_identifiers=true @@ -106,13 +106,13 @@ has_case_sensitive_identifiers=true ## Staging support -Redshift supports s3 as a file staging destination. dlt will upload files in the parquet format to s3 and ask Redshift to copy their data directly into the db. Please refer to the [S3 documentation](./filesystem.md#aws-s3) to learn how to set up your s3 bucket with the bucket_url and credentials. The `dlt` Redshift loader will use the AWS credentials provided for s3 to access the s3 bucket if not specified otherwise (see config options below). Alternatively to parquet files, you can also specify jsonl as the staging file format. For this, set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. +Redshift supports s3 as a file staging destination. `dlt` will upload files in the parquet format to s3 and ask Redshift to copy their data directly into the db. Please refer to the [S3 documentation](./filesystem.md#aws-s3) to learn how to set up your s3 bucket with the bucket_url and credentials. The `dlt` Redshift loader will use the AWS credentials provided for s3 to access the s3 bucket if not specified otherwise (see config options below). Alternatively to parquet files, you can also specify jsonl as the staging file format. For this, set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. 
## Identifier names and case sensitivity * Up to 127 characters * Case insensitive * Stores identifiers in lower case -* Has case sensitive mode, if enabled you must [enable case sensitivity in destination factory](../../general-usage/destination.md#control-how-dlt-creates-table-column-and-other-identifiers) +* Has a case-sensitive mode; if enabled, you must [enable case sensitivity in destination factory](../../general-usage/destination.md#control-how-dlt-creates-table-column-and-other-identifiers) ### Authentication IAM Role @@ -127,8 +127,8 @@ staging_iam_role="arn:aws:iam::..." ```py # Create a dlt pipeline that will load -# chess player data to the redshift destination -# via staging on s3 +# chess player data to the Redshift destination +# via staging on S3 pipeline = dlt.pipeline( pipeline_name='chess_pipeline', destination='redshift', @@ -140,10 +140,10 @@ pipeline = dlt.pipeline( ## Additional destination options ### dbt support -- This destination [integrates with dbt](../transformations/dbt) via [dbt-redshift](https://github.com/dbt-labs/dbt-redshift). Credentials and timeout settings are shared automatically with `dbt`. +- This destination [integrates with dbt](../transformations/dbt) via [dbt-redshift](https://github.com/dbt-labs/dbt-redshift). Credentials and timeout settings are shared automatically with `dbt`.
-### Syncing of `dlt` state -- This destination fully supports [dlt state sync.](../../general-usage/state#syncing-state-with-destination) +### Syncing of `dlt` state +- This destination fully supports [dlt state sync.](../../general-usage/state#syncing-state-with-destination) ## Supported loader file formats diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index 74688ba7fa..2ea08778c6 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -12,7 +12,7 @@ keywords: [Snowflake, destination, data warehouse] pip install "dlt[snowflake]" ``` -## Setup Guide +## Setup guide **1. Initialize a project with a pipeline that loads to Snowflake by running:** ```sh @@ -44,14 +44,14 @@ In the case of Snowflake, the **host** is your [Account Identifier](https://docs The **warehouse** and **role** are optional if you assign defaults to your user. In the example below, we do not do that, so we set them explicitly. -### Setup the database user and permissions +### Set up the database user and permissions The instructions below assume that you use the default account setup that you get after creating a Snowflake account. You should have a default warehouse named **COMPUTE_WH** and a Snowflake account. Below, we create a new database, user, and assign permissions. The permissions are very generous. A more experienced user can easily reduce `dlt` permissions to just one schema in the database. 
```sql ---create database with standard settings +-- create database with standard settings CREATE DATABASE dlt_data; -- create new user - set your password here CREATE USER loader WITH PASSWORD=''; --- we assign all permission to a role +-- we assign all permissions to a role CREATE ROLE DLT_LOADER_ROLE; GRANT ROLE DLT_LOADER_ROLE TO USER loader; -- give database access to new role @@ -67,14 +67,14 @@ GRANT ALL PRIVILEGES ON FUTURE TABLES IN DATABASE dlt_data TO DLT_LOADER_ROLE; Now you can use the user named `LOADER` to access the database `DLT_DATA` and log in with the specified password. -You can also decrease the suspend time for your warehouse to 1 minute (**Admin**/**Warehouses** in Snowflake UI) +You can also decrease the suspend time for your warehouse to 1 minute (**Admin**/**Warehouses** in Snowflake UI). ### Authentication types + Snowflake destination accepts three authentication types: -Snowflake destination accepts three authentication types: -- password authentication -- [key pair authentication](https://docs.snowflake.com/en/user-guide/key-pair-auth) -- oauth authentication +- Password authentication +- [Key pair authentication](https://docs.snowflake.com/en/user-guide/key-pair-auth) +- OAuth authentication The **password authentication** is not any different from other databases like Postgres or Redshift. `dlt` follows the same syntax as the [SQLAlchemy dialect](https://docs.snowflake.com/en/developer-guide/python-connector/sqlalchemy#required-parameters). @@ -85,7 +85,7 @@ destination.snowflake.credentials="snowflake://loader:@kgiotue-wn98412 ``` -In **key pair authentication**, you replace the password with a private key string that should be in Base64-encoded DER format ([DBT also recommends](https://docs.getdbt.com/docs/core/connect-data-platform/snowflake-setup#key-pair-authentication) base64-encoded private keys for Snowflake connections). The private key may also be encrypted. 
In that case, you must provide a passphrase alongside the private key. +In **key pair authentication**, you replace the password with a private key string that should be in Base64-encoded DER format ([dbt also recommends](https://docs.getdbt.com/docs/core/connect-data-platform/snowflake-setup#key-pair-authentication) base64-encoded private keys for Snowflake connections). The private key may also be encrypted. In that case, you must provide a passphrase alongside the private key. ```toml [destination.snowflake.credentials] database = "dlt_data" @@ -102,7 +102,7 @@ If you pass a passphrase in the connection string, please URL encode it. destination.snowflake.credentials="snowflake://loader:@kgiotue-wn98412/dlt_data?private_key=&private_key_passphrase=" ``` -In **oauth authentication**, you can use an OAuth provider like Snowflake, Okta or an external browser to authenticate. In case of Snowflake oauth, you pass your `authenticator` and refresh `token` as below: +In **OAuth authentication**, you can use an OAuth provider like Snowflake, Okta, or an external browser to authenticate. In the case of Snowflake OAuth, you pass your `authenticator` and refresh `token` as below: ```toml [destination.snowflake.credentials] database = "dlt_data" @@ -112,10 +112,11 @@ token="..." ``` or in the connection string as query parameters. -In case of external authentication, you need to find documentation for your OAuth provider. Refer to Snowflake [OAuth](https://docs.snowflake.com/en/user-guide/oauth-intro) for more details. +In the case of external authentication, you need to find documentation for your OAuth provider. Refer to Snowflake [OAuth](https://docs.snowflake.com/en/user-guide/oauth-intro) for more details. ### Additional connection options -We pass all query parameters to `connect` function of Snowflake Python Connector. For example: + +We pass all query parameters to the `connect` function of the Snowflake Python Connector. 
For example: ```toml [destination.snowflake.credentials] database = "dlt_data" @@ -125,17 +126,18 @@ timezone="UTC" # keep session alive beyond 4 hours client_session_keep_alive=true ``` -Will set the timezone and session keep alive. Mind that if you use `toml` your configuration is typed. The alternative: +This will set the timezone and session keep alive. Mind that if you use `toml`, your configuration is typed. The alternative: `"snowflake://loader/dlt_data?authenticator=oauth&timezone=UTC&client_session_keep_alive=true"` -will pass `client_session_keep_alive` as string to the connect method (which we didn't verify if it works). +will pass `client_session_keep_alive` as a string to the connect method (which we didn't verify if it works). + +### Write disposition -## Write disposition All write dispositions are supported. -If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized`, the destination tables will be dropped and -recreated with a [clone command](https://docs.snowflake.com/en/sql-reference/sql/create-clone) from the staging tables. +If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized`, the destination tables will be dropped and recreated with a [clone command](https://docs.snowflake.com/en/sql-reference/sql/create-clone) from the staging tables. + +### Data loading -## Data loading The data is loaded using an internal Snowflake stage. We use the `PUT` command and per-table built-in stages by default. Stage files are kept by default, unless specified otherwise via the `keep_staged_files` parameter: ```toml @@ -146,7 +148,7 @@ keep_staged_files = false ### Data types `snowflake` supports various timestamp types, which can be configured using the column flags `timezone` and `precision` in the `dlt.resource` decorator or the `pipeline.run` method. -- **Precision**: allows you to specify the number of decimal places for fractional seconds, ranging from 0 to 9. 
It can be used in combination with the `timezone` flag. +- **Precision**: Allows you to specify the number of decimal places for fractional seconds, ranging from 0 to 9. It can be used in combination with the `timezone` flag. - **Timezone**: - Setting `timezone=False` maps to `TIMESTAMP_NTZ`. - Setting `timezone=True` (or omitting the flag, which defaults to `True`) maps to `TIMESTAMP_TZ`. @@ -165,44 +167,44 @@ pipeline.run(events()) ``` ## Supported file formats -* [insert-values](../file-formats/insert-format.md) is used by default -* [parquet](../file-formats/parquet.md) is supported -* [jsonl](../file-formats/jsonl.md) is supported -* [csv](../file-formats/csv.md) is supported +* [insert-values](../file-formats/insert-format.md) is used by default. +* [parquet](../file-formats/parquet.md) is supported. +* [jsonl](../file-formats/jsonl.md) is supported. +* [csv](../file-formats/csv.md) is supported. When staging is enabled: -* [jsonl](../file-formats/jsonl.md) is used by default -* [parquet](../file-formats/parquet.md) is supported -* [csv](../file-formats/csv.md) is supported +* [jsonl](../file-formats/jsonl.md) is used by default. +* [parquet](../file-formats/parquet.md) is supported. +* [csv](../file-formats/csv.md) is supported. :::caution When loading from `parquet`, Snowflake will store `json` types (JSON) in `VARIANT` as a string. Use the `jsonl` format instead or use `PARSE_JSON` to update the `VARIANT` field after loading. ::: -### Custom csv formats -By default we support csv format [produced by our writers](../file-formats/csv.md#default-settings) which is comma delimited, with header and optionally quoted. +### Custom CSV formats +By default, we support the CSV format [produced by our writers](../file-formats/csv.md#default-settings), which is comma-delimited, with a header, and optionally quoted. -You can configure your own formatting ie. when [importing](../../general-usage/resource.md#import-external-files) external `csv` files. 
+You can configure your own formatting, e.g., when [importing](../../general-usage/resource.md#import-external-files) external `csv` files. ```toml [destination.snowflake.csv_format] delimiter="|" include_header=false on_error_continue=true ``` -Which will read, `|` delimited file, without header and will continue on errors. +This will read a `|` delimited file, without a header, and will continue on errors. Note that we ignore missing columns `ERROR_ON_COLUMN_COUNT_MISMATCH = FALSE` and we will insert NULL into them. ## Supported column hints -Snowflake supports the following [column hints](https://dlthub.com/docs/general-usage/schema#tables-and-columns): -* `cluster` - creates a cluster column(s). Many columns per table are supported and only when a new table is created. +Snowflake supports the following [column hints](../../general-usage/schema#tables-and-columns): +* `cluster` - Creates cluster column(s). Multiple columns per table are supported, but the hint is applied only when a new table is created. ## Table and column identifiers -Snowflake supports both case sensitive and case insensitive identifiers. All unquoted and uppercase identifiers resolve case-insensitively in SQL statements. Case insensitive [naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations) like the default **snake_case** will generate case insensitive identifiers. Case sensitive (like **sql_cs_v1**) will generate -case sensitive identifiers that must be quoted in SQL statements. +Snowflake supports both case-sensitive and case-insensitive identifiers. All unquoted and uppercase identifiers resolve case-insensitively in SQL statements. Case-insensitive [naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations) like the default **snake_case** will generate case-insensitive identifiers. Case-sensitive (like **sql_cs_v1**) will generate +case-sensitive identifiers that must be quoted in SQL statements. 
:::note -Names of tables and columns in [schemas](../../general-usage/schema.md) are kept in lower case like for all other destinations. This is the pattern we observed in other tools, i.e., `dbt`. In the case of `dlt`, it is, however, trivial to define your own uppercase [naming convention](../../general-usage/schema.md#naming-convention) +Names of tables and columns in [schemas](../../general-usage/schema.md) are kept in lowercase like for all other destinations. This is the pattern we observed in other tools, e.g., `dbt`. In the case of `dlt`, it is, however, trivial to define your own uppercase [naming convention](../../general-usage/schema.md#naming-convention). ::: ## Staging support @@ -216,7 +218,7 @@ Alternatively to parquet files, you can also specify jsonl as the staging file f Please refer to the [S3 documentation](./filesystem.md#aws-s3) to learn how to set up your bucket with the bucket_url and credentials. For S3, the `dlt` Redshift loader will use the AWS credentials provided for S3 to access the S3 bucket if not specified otherwise (see config options below). Alternatively, you can create a stage for your S3 Bucket by following the instructions provided in the [Snowflake S3 documentation](https://docs.snowflake.com/en/user-guide/data-load-s3-config-storage-integration). The basic steps are as follows: -* Create a storage integration linked to GCS and the right bucket +* Create a storage integration linked to S3 and the right bucket. * Grant access to this storage integration to the Snowflake role you are using to load the data into Snowflake. * Create a stage from this storage integration in the PUBLIC namespace, or the namespace of the schema of your data. * Also grant access to this stage for the role you are using to load data into Snowflake. 
@@ -245,9 +247,9 @@ pipeline = dlt.pipeline( ### Snowflake and Google Cloud Storage -Please refer to the [Google Storage filesystem documentation](./filesystem.md#google-storage) to learn how to set up your bucket with the bucket_url and credentials. For GCS, you can define a stage in Snowflake and provide the stage identifier in the configuration (see config options below.) Please consult the Snowflake Documentation on [how to create a stage for your GCS Bucket](https://docs.snowflake.com/en/user-guide/data-load-gcs-config). The basic steps are as follows: +Please refer to the [Google Storage filesystem documentation](./filesystem.md#google-storage) to learn how to set up your bucket with the bucket_url and credentials. For GCS, you can define a stage in Snowflake and provide the stage identifier in the configuration (see config options below). Please consult the Snowflake Documentation on [how to create a stage for your GCS Bucket](https://docs.snowflake.com/en/user-guide/data-load-gcs-config). The basic steps are as follows: -* Create a storage integration linked to GCS and the right bucket +* Create a storage integration linked to GCS and the right bucket. * Grant access to this storage integration to the Snowflake role you are using to load the data into Snowflake. * Create a stage from this storage integration in the PUBLIC namespace, or the namespace of the schema of your data. * Also grant access to this stage for the role you are using to load data into Snowflake. @@ -274,14 +276,12 @@ pipeline = dlt.pipeline( ### Snowflake and Azure Blob Storage -Please refer to the [Azure Blob Storage filesystem documentation](./filesystem.md#azure-blob-storage) to learn how to set up your bucket with the bucket_url and credentials. For Azure, the Snowflake loader will use -the filesystem credentials for your Azure Blob Storage container if not specified otherwise (see config options below). 
Alternatively, you can define an external stage in Snowflake and provide the stage identifier. -Please consult the Snowflake Documentation on [how to create a stage for your Azure Blob Storage Container](https://docs.snowflake.com/en/user-guide/data-load-azure). The basic steps are as follows: +Please refer to the [Azure Blob Storage filesystem documentation](./filesystem.md#azure-blob-storage) to learn how to set up your bucket with the bucket_url and credentials. For Azure, the Snowflake loader will use the filesystem credentials for your Azure Blob Storage container if not specified otherwise (see config options below). Alternatively, you can define an external stage in Snowflake and provide the stage identifier. Please consult the Snowflake Documentation on [how to create a stage for your Azure Blob Storage Container](https://docs.snowflake.com/en/user-guide/data-load-azure). The basic steps are as follows: -* Create a storage integration linked to Azure Blob Storage and the right container +* Create a storage integration linked to Azure Blob Storage and the right container. * Grant access to this storage integration to the Snowflake role you are using to load the data into Snowflake. * Create a stage from this storage integration in the PUBLIC namespace, or the namespace of the schema of your data. -* Also grant access to this stage for the role you are using to load data into Snowflake. +* Also, grant access to this stage for the role you are using to load data into Snowflake. * Provide the name of your stage (including the namespace) to `dlt` like so: ```toml @@ -304,7 +304,9 @@ pipeline = dlt.pipeline( ``` ## Additional destination options + You can define your own stage to PUT files and disable the removal of the staged files after loading. + ```toml [destination.snowflake] # Use an existing named stage instead of the default. 
Default uses the implicit table stage per table @@ -313,8 +315,10 @@ stage_name="DLT_STAGE" keep_staged_files=true ``` -### Setting up `csv` format +### Setting up CSV format + You can provide [non-default](../file-formats/csv.md#default-settings) csv settings via configuration file or explicitly. + ```toml [destination.snowflake.csv_format] delimiter="|" @@ -330,42 +334,43 @@ csv_format = CsvFormatConfiguration(delimiter="|", include_header=False, on_erro dest_ = snowflake(csv_format=csv_format) ``` -Above we set `csv` file without header, with **|** as a separator and we request to ignore lines with errors. +Above, we set the CSV file format without a header, with **|** as a separator, and we request to ignore lines with errors. :::tip -You'll need those setting when [importing external files](../../general-usage/resource.md#import-external-files) +You'll need these settings when [importing external files](../../general-usage/resource.md#import-external-files). ::: -### Query Tagging -`dlt` [tags sessions](https://docs.snowflake.com/en/sql-reference/parameters#query-tag) that execute loading jobs with following job properties: -* **source** - name of the source (identical with the name of `dlt` schema) +### Query tagging + +`dlt` [tags sessions](https://docs.snowflake.com/en/sql-reference/parameters#query-tag) that execute loading jobs with the following job properties: +* **source** - name of the source (identical with the name of the `dlt` schema) * **resource** - name of the resource (if known, else empty string) * **table** - name of the table loaded by the job * **load_id** - load id of the job * **pipeline_name** - name of the active pipeline (or empty string if not found) -You can define query tag by defining a query tag placeholder in snowflake credentials: +You can define a query tag by defining a query tag placeholder in Snowflake credentials: + ```toml [destination.snowflake] query_tag='{{"source":"{source}", "resource":"{resource}", "table": 
"{table}", "load_id":"{load_id}", "pipeline_name":"{pipeline_name}"}}' ``` -which contains Python named formatters corresponding to tag names ie. `{source}` will assume the name of the dlt source. +which contains Python named formatters corresponding to tag names i.e., `{source}` will assume the name of the dlt source. :::note -1. query tagging is off by default. `query_tag` configuration field is `None` by default and must be set to enable tagging. -2. only sessions associated with a job are tagged. sessions that migrate schemas remain untagged -3. jobs processing table chains (ie. sql merge jobs) will use top level table as **table** +1. Query tagging is off by default. The `query_tag` configuration field is `None` by default and must be set to enable tagging. +2. Only sessions associated with a job are tagged. Sessions that migrate schemas remain untagged. +3. Jobs processing table chains (i.e., SQL merge jobs) will use the top-level table as **table**. ::: ### dbt support This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-snowflake](https://github.com/dbt-labs/dbt-snowflake). Both password and key pair authentication are supported and shared with dbt runners. ### Syncing of `dlt` state -This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination) +This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). ### Snowflake connection identifier -We enable Snowflake to identify that the connection is created by `dlt`. Snowflake will use this identifier to better understand the usage patterns -associated with `dlt` integration. The connection identifier is `dltHub_dlt`. +We enable Snowflake to identify that the connection is created by `dlt`. Snowflake will use this identifier to better understand the usage patterns associated with `dlt` integration. The connection identifier is `dltHub_dlt`. 
diff --git a/docs/website/docs/dlt-ecosystem/destinations/sqlalchemy.md b/docs/website/docs/dlt-ecosystem/destinations/sqlalchemy.md new file mode 100644 index 0000000000..a3b19377da --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/destinations/sqlalchemy.md @@ -0,0 +1,163 @@ +--- +title: SQL databases (powered by SQLAlchemy) +description: SQLAlchemy destination +keywords: [sql, sqlalchemy, database, destination] +--- + +# SQLAlchemy destination + +The SQLAlchemy destination allows you to use any database that has an [SQLAlchemy dialect](https://docs.sqlalchemy.org/en/20/dialects/) implemented as a destination. + +Currently, MySQL and SQLite are considered to have full support and are tested as part of the `dlt` CI suite. Other dialects are not tested but should generally work. + +## Install dlt with SQLAlchemy + +Install dlt with the `sqlalchemy` extra dependency: + +```sh +pip install "dlt[sqlalchemy]" +``` + +Note that database drivers are not included and need to be installed separately for the database you plan on using. For example, for MySQL: + +```sh +pip install mysqlclient +``` + +Refer to the [SQLAlchemy documentation on dialects](https://docs.sqlalchemy.org/en/20/dialects/) for information about client libraries required for supported databases. + +### Create a pipeline + +**1. Initialize a project with a pipeline that loads to an SQLAlchemy destination by running:** +```sh +dlt init chess sqlalchemy +``` + +**2. Install the necessary dependencies for SQLAlchemy by running:** +```sh +pip install -r requirements.txt +``` +or run: +```sh +pip install "dlt[sqlalchemy]" +``` + +**3. Install your database client library.** + +E.g., for MySQL: +```sh +pip install mysqlclient +``` + +**4. 
Enter your credentials into `.dlt/secrets.toml`.** + +For example, replace with your database connection info: +```toml +[destination.sqlalchemy.credentials] +database = "dlt_data" +username = "loader" +password = "" +host = "localhost" +port = 3306 +drivername = "mysql" +``` + +Alternatively, a valid SQLAlchemy database URL can be used, either in `secrets.toml` or as an environment variable. +E.g. + +```toml +[destination.sqlalchemy] +credentials = "mysql://loader:@localhost:3306/dlt_data" +``` + +or + +```sh +export DESTINATION__SQLALCHEMY__CREDENTIALS="mysql://loader:@localhost:3306/dlt_data" +``` + +An SQLAlchemy `Engine` can also be passed directly by creating an instance of the destination: + +```py +import sqlalchemy as sa +import dlt + +engine = sa.create_engine('sqlite:///chess_data.db') + +pipeline = dlt.pipeline( + pipeline_name='chess', + destination=dlt.destinations.sqlalchemy(engine), + dataset_name='main' +) +``` + +## Notes on SQLite + +### Dataset files +When using an SQLite database file, each dataset is stored in a separate file since SQLite does not support multiple schemas in a single database file. +Under the hood, this uses [`ATTACH DATABASE`](https://www.sqlite.org/lang_attach.html). + +The file is stored in the same directory as the main database file (provided by your database URL). + +E.g., if your SQLite URL is `sqlite:////home/me/data/chess_data.db` and your `dataset_name` is `games`, the data +is stored in `/home/me/data/chess_data__games.db`. + +**Note**: If the dataset name is `main`, no additional file is created as this is the default SQLite database. + +### In-memory databases +In-memory databases require a persistent connection as the database is destroyed when the connection is closed. +Normally, connections are opened and closed for each load job and in other stages during the pipeline run. +To ensure the database persists throughout the pipeline run, you need to pass in an SQLAlchemy `Engine` object instead of credentials. 
+This engine is not disposed of automatically by `dlt`. Example: + +```py +import dlt +import sqlalchemy as sa + +# Create the SQLite engine +engine = sa.create_engine('sqlite:///:memory:') + +# Configure the destination instance and create pipeline +pipeline = dlt.pipeline('my_pipeline', destination=dlt.destinations.sqlalchemy(engine), dataset_name='main') + +# Run the pipeline with some data +pipeline.run([1,2,3], table_name='my_table') + +# The engine is still open and you can query the database +with engine.connect() as conn: + result = conn.execute(sa.text('SELECT * FROM my_table')) + print(result.fetchall()) +``` + +## Write dispositions + +The following write dispositions are supported: + +- `append` +- `replace` with `truncate-and-insert` and `insert-from-staging` replace strategies. `staging-optimized` falls back to `insert-from-staging`. + +The `merge` disposition is not supported and falls back to `append`. + +## Data loading + +Data is loaded in a dialect-agnostic manner with an `insert` statement generated by SQLAlchemy's core API. +Rows are inserted in batches as long as the underlying database driver supports it. By default, the batch size is 10,000 rows. + +## Syncing of `dlt` state + +This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). + +### Data types + +All `dlt` data types are supported, but how they are stored in the database depends on the SQLAlchemy dialect. +For example, SQLite does not have `DATETIME` or `TIMESTAMP` types, so `timestamp` columns are stored as `TEXT` in ISO 8601 format. + +## Supported file formats + +* [typed-jsonl](../file-formats/jsonl.md) is used by default. JSON-encoded data with typing information included. +* [parquet](../file-formats/parquet.md) is supported. + +## Supported column hints + +* `unique` hints are translated to `UNIQUE` constraints via SQLAlchemy (granted the database supports it). 
+ diff --git a/docs/website/docs/dlt-ecosystem/destinations/synapse.md b/docs/website/docs/dlt-ecosystem/destinations/synapse.md index 0d50924cdf..51721ec298 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/synapse.md +++ b/docs/website/docs/dlt-ecosystem/destinations/synapse.md @@ -19,15 +19,15 @@ pip install "dlt[synapse]" * **Microsoft ODBC Driver for SQL Server** The _Microsoft ODBC Driver for SQL Server_ must be installed to use this destination. - This can't be included with `dlt`'s python dependencies, so you must install it separately on your system. You can find the official installation instructions [here](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16). + This cannot be included with `dlt`'s Python dependencies, so you must install it separately on your system. You can find the official installation instructions [here](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16). Supported driver versions: * `ODBC Driver 18 for SQL Server` - > 💡 Older driver versions don't work properly because they don't support the `LongAsMax` keyword that was [introduced](https://learn.microsoft.com/en-us/sql/connect/odbc/windows/features-of-the-microsoft-odbc-driver-for-sql-server-on-windows?view=sql-server-ver15#microsoft-odbc-driver-180-for-sql-server-on-windows) in `ODBC Driver 18 for SQL Server`. Synapse does not support the legacy ["long data types"](https://learn.microsoft.com/en-us/sql/t-sql/data-types/ntext-text-and-image-transact-sql), and requires "max data types" instead. `dlt` uses the `LongAsMax` keyword to automatically do the conversion. 
+ > 💡 Older driver versions do not work properly because they do not support the `LongAsMax` keyword that was [introduced](https://learn.microsoft.com/en-us/sql/connect/odbc/windows/features-of-the-microsoft-odbc-driver-for-sql-server-on-windows?view=sql-server-ver15#microsoft-odbc-driver-180-for-sql-server-on-windows) in `ODBC Driver 18 for SQL Server`. Synapse does not support the legacy ["long data types"](https://learn.microsoft.com/en-us/sql/t-sql/data-types/ntext-text-and-image-transact-sql), and requires "max data types" instead. `dlt` uses the `LongAsMax` keyword to automatically do the conversion. * **Azure Synapse Workspace and dedicated SQL pool** - You need an Azure Synapse workspace with a dedicated SQL pool to load data into. If you don't have one yet, you can use this [quickstart](https://learn.microsoft.com/en-us/azure/synapse-analytics/quickstart-create-sql-pool-studio). + You need an Azure Synapse workspace with a dedicated SQL pool to load data into. If you do not have one yet, you can use this [quickstart](https://learn.microsoft.com/en-us/azure/synapse-analytics/quickstart-create-sql-pool-studio). ### Steps @@ -95,7 +95,7 @@ pipeline = dlt.pipeline( dataset_name='chess_data' ) ``` -To use **Active Directory Principal**, you can use the `sqlalchemy.engine.URL.create` method to create the connection URL using your Active Directory Service Principal credentials. First create the connection string as: +To use **Active Directory Principal**, you can use the `sqlalchemy.engine.URL.create` method to create the connection URL using your Active Directory Service Principal credentials. First, create the connection string as: ```py conn_str = ( f"DRIVER={{ODBC Driver 18 for SQL Server}};" @@ -141,7 +141,7 @@ Data is loaded via `INSERT` statements by default. * [parquet](../file-formats/parquet.md) is used when [staging](#staging-support) is enabled ## Data type limitations -* **Synapse cannot load `TIME` columns from `parquet` files**. 
`dlt` will fail such jobs permanently. Use the `insert_values` file format instead, or convert `datetime.time` objects to `str` or `datetime.datetime`, to load `TIME` columns. +* **Synapse cannot load `TIME` columns from `parquet` files**. `dlt` will fail such jobs permanently. Use the `insert_values` file format instead, or convert `datetime.time` objects to `str` or `datetime.datetime` to load `TIME` columns. * **Synapse does not have a nested/JSON/struct data type**. The `dlt` `json` data type is mapped to the `nvarchar` type in Synapse. ## Table index type @@ -173,10 +173,10 @@ Possible values: ## Supported column hints -Synapse supports the following [column hints](https://dlthub.com/docs/general-usage/schema#tables-and-columns): +Synapse supports the following [column hints](../../general-usage/schema#tables-and-columns): -* `primary_key` - creates a `PRIMARY KEY NONCLUSTERED NOT ENFORCED` constraint on the column -* `unique` - creates a `UNIQUE NOT ENFORCED` constraint on the column +* `primary_key` - creates a `PRIMARY KEY NONCLUSTERED NOT ENFORCED` constraint on the column. +* `unique` - creates a `UNIQUE NOT ENFORCED` constraint on the column. > ❗ These hints are **disabled by default**. This is because the `PRIMARY KEY` and `UNIQUE` [constraints](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-table-constraints) are tricky in Synapse: they are **not enforced** and can lead to inaccurate results if the user does not ensure all column values are unique. For the column hints to take effect, the `create_indexes` configuration needs to be set to `True`, see [additional destination options](#additional-destination-options). 
@@ -189,7 +189,7 @@ To run Synapse with staging on Azure Blob Storage: ```py # Create a dlt pipeline that will load -# chess player data to the snowflake destination +# chess player data to the Synapse destination # via staging on Azure Blob Storage pipeline = dlt.pipeline( pipeline_name='chess_pipeline', @@ -223,7 +223,7 @@ Descriptions: - `default_table_index_type` sets the [table index type](#table-index-type) that is used if no table index type is specified on the resource. - `create_indexes` determines if `primary_key` and `unique` [column hints](#supported-column-hints) are applied. - `staging_use_msi` determines if the Managed Identity of the Synapse workspace is used to authorize access to the [staging](#staging-support) Storage Account. Ensure the Managed Identity has the [Storage Blob Data Reader](https://learn.microsoft.com/en-us/azure/role-based-access-control/built-in-roles#storage-blob-data-reader) role (or a higher-privileged role) assigned on the blob container if you set this option to `"true"`. -- `port` used for the ODBC connection. +- `port` is used for the ODBC connection. - `connect_timeout` sets the timeout for the `pyodbc` connection attempt, in seconds. ### dbt support diff --git a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md index 962239b7e6..cce54654b8 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md +++ b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md @@ -9,7 +9,7 @@ keywords: [weaviate, vector database, destination, dlt] [Weaviate](https://weaviate.io/) is an open-source vector database. It allows you to store data objects and perform similarity searches over them. This destination helps you load data into Weaviate from [dlt resources](../../general-usage/resource.md). -## Setup Guide +## Setup guide 1. 
To use Weaviate as a destination, make sure dlt is installed with the 'weaviate' extra: @@ -92,7 +92,7 @@ The data is now loaded into Weaviate. Weaviate destination is different from other [dlt destinations](../destinations/). To use vector search after the data has been loaded, you must specify which fields Weaviate needs to include in the vector index. You do that by wrapping the data (or dlt resource) with the `weaviate_adapter` function. -## weaviate_adapter +## Weaviate adapter The `weaviate_adapter` is a helper function that configures the resource for the Weaviate destination: @@ -126,7 +126,7 @@ pipeline = dlt.pipeline( destination="weaviate", ) -# apply adapter to the needed resources +# Apply adapter to the needed resources weaviate_adapter(products_tables.products, vectorize="description") weaviate_adapter(products_tables.customers, vectorize="bio") @@ -211,7 +211,7 @@ Data loaded into Weaviate from various sources might have different types. To en ### Dataset name -Weaviate uses classes to categorize and identify data. To avoid potential naming conflicts, especially when dealing with multiple datasets that might have overlapping table names, dlt includes the dataset name into the Weaviate class name. This ensures a unique identifier for every class. +Weaviate uses classes to categorize and identify data. To avoid potential naming conflicts, especially when dealing with multiple datasets that might have overlapping table names, dlt includes the dataset name in the Weaviate class name. This ensures a unique identifier for every class. For example, if you have a dataset named `movies_dataset` and a table named `actors`, the Weaviate class name would be `MoviesDataset_Actors` (the default separator is an underscore). @@ -245,7 +245,7 @@ Here's a summary of the naming normalization approach: - Snake case and camel case remain unchanged: `snake_case_name` and `camelCaseName`. 
- Names starting with a capital letter have it lowercased: `CamelCase` -> `camelCase` -- Names with multiple underscores, such as `Snake-______c__ase_``, are compacted to `snake_c_asex`. Except for the case when underscores are leading, in which case they are kept: `___snake_case_name` becomes `___snake_case_name`. +- Names with multiple underscores, such as `Snake-______c__ase_`, are compacted to `snake_c_asex`. Except for the case when underscores are leading, in which case they are kept: `___snake_case_name` becomes `___snake_case_name`. - Names starting with a number are prefixed with a "p_". For example, `123snake_case_name` becomes `p_123snake_case_name`. #### Reserved property names @@ -253,9 +253,7 @@ Here's a summary of the naming normalization approach: Reserved property names like `id` or `additional` are prefixed with underscores for differentiation. Therefore, `id` becomes `__id` and `_id` is rendered as `___id`. ### Case insensitive naming convention -The default naming convention described above will preserve the casing of the properties (besides the first letter which is lowercased). This generates nice classes -in Weaviate but also requires that your input data does not have clashing property names when comparing case insensitive ie. (`caseName` == `casename`). In such case -Weaviate destination will fail to create classes and report a conflict. +The default naming convention described above will preserve the casing of the properties (besides the first letter which is lowercased). This generates nice classes in Weaviate but also requires that your input data does not have clashing property names when comparing case insensitively (i.e., `caseName` == `casename`). In such cases, Weaviate destination will fail to create classes and report a conflict. You can configure an alternative naming convention which will lowercase all properties. The clashing properties will be merged and the classes created. 
Still, if you have a document where clashing properties like: ```json @@ -282,10 +280,10 @@ naming="dlt.destinations.impl.weaviate.ci_naming" The default is `ONE`. - `batch_retries`: (int) number of retries to create a batch that failed with ReadTimeout. The default is 5. - `dataset_separator`: (str) the separator to use when generating the class names in Weaviate. -- `conn_timeout` and `read_timeout`: (float) to set timeouts (in seconds) when connecting and reading from REST API. defaults to (10.0, 180.0) -- `startup_period` (int) - how long to wait for weaviate to start +- `conn_timeout` and `read_timeout`: (float) to set timeouts (in seconds) when connecting and reading from the REST API. Defaults to (10.0, 180.0). +- `startup_period` (int) - how long to wait for Weaviate to start. - `vectorizer`: (str) the name of [the vectorizer](https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules) to use. The default is `text2vec-openai`. -- `moduleConfig`: (dict) configurations of various Weaviate modules +- `moduleConfig`: (dict) configurations of various Weaviate modules. ### Configure Weaviate modules @@ -307,8 +305,7 @@ Below is an example that configures the **contextionary** vectorizer. You can pu vectorizer="text2vec-contextionary" module_config={text2vec-contextionary = { vectorizeClassName = false, vectorizePropertyName = true}} ``` -You can find Docker Compose with the instructions to run [here](https://github.com/dlt-hub/dlt/tree/devel/dlt/destinations/weaviate/README.md) - +You can find Docker Compose with the instructions to run [here](https://github.com/dlt-hub/dlt/tree/devel/dlt/destinations/weaviate/README.md). ### dbt support @@ -318,6 +315,5 @@ Currently, Weaviate destination does not support dbt. Weaviate destination supports syncing of the `dlt` state. 
- diff --git a/docs/website/docs/dlt-ecosystem/file-formats/csv.md b/docs/website/docs/dlt-ecosystem/file-formats/csv.md index 05a0c2e50d..6b9ff68269 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/csv.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/csv.md @@ -7,14 +7,14 @@ import SetTheFormat from './_set_the_format.mdx'; # CSV file format -**csv** is the most basic file format to store tabular data, where all the values are strings and are separated by a delimiter (typically comma). -`dlt` uses it for specific use cases - mostly for the performance and compatibility reasons. +**csv** is the most basic file format for storing tabular data, where all values are strings and are separated by a delimiter (typically a comma). +`dlt` uses it for specific use cases - mostly for performance and compatibility reasons. -Internally we use two implementations: -- **pyarrow** csv writer - very fast, multithreaded writer for the [arrow tables](../verified-sources/arrow-pandas.md) +Internally, we use two implementations: +- **pyarrow** csv writer - a very fast, multithreaded writer for [arrow tables](../verified-sources/arrow-pandas.md) - **python stdlib writer** - a csv writer included in the Python standard library for Python objects -## Supported Destinations +## Supported destinations The `csv` format is supported by the following destinations: **Postgres**, **Filesystem**, **Snowflake** @@ -22,11 +22,11 @@ The `csv` format is supported by the following destinations: **Postgres**, **Fil -## Default Settings -`dlt` attempts to make both writers to generate similarly looking files +## Default settings +`dlt` attempts to make both writers generate similar-looking files: * separators are commas * quotes are **"** and are escaped as **""** -* `NULL` values both are empty strings and empty tokens as in the example below +* `NULL` values are both empty strings and empty tokens as in the example below * UNIX new lines are used * dates are represented as ISO 8601 * 
quoting style is "when needed" @@ -38,21 +38,20 @@ A,B,C A,,"" ``` -In the last row both `text2` and `text3` values are NULL. Python `csv` writer -is not able to write unquoted `None` values so we had to settle for `""` +In the last row, both `text2` and `text3` values are NULL. The Python `csv` writer +is not able to write unquoted `None` values, so we had to settle for `""`. -Note: all destinations capable of writing csvs must support it. +Note: all destinations capable of writing CSVs must support it. ### Change settings -You can change basic **csv** settings, this may be handy when working with **filesystem** destination. Other destinations are tested +You can change basic **csv** settings; this may be handy when working with the **filesystem** destination. Other destinations are tested with standard settings: * delimiter: change the delimiting character (default: ',') * include_header: include the header row (default: True) * quoting: **quote_all** - all values are quoted, **quote_needed** - quote only values that need quoting (default: `quote_needed`) -When **quote_needed** is selected: in case of Python csv writer all non-numeric values are quoted. In case of pyarrow csv writer, the exact behavior is not described in the documentation. We observed that in some cases, strings are not quoted as well. - +When **quote_needed** is selected: in the case of the Python csv writer, all non-numeric values are quoted. In the case of the pyarrow csv writer, the exact behavior is not described in the documentation. We observed that in some cases, strings are not quoted as well. 
```toml [normalize.data_writer] @@ -75,7 +74,7 @@ A few additional settings are available when copying `csv` to destination tables * **encoding** - encoding of the `csv` file :::tip -You'll need those setting when [importing external files](../../general-usage/resource.md#import-external-files) +You'll need these settings when [importing external files](../../general-usage/resource.md#import-external-files). ::: ## Limitations @@ -87,4 +86,5 @@ You'll need those setting when [importing external files](../../general-usage/re **csv writer** * binary columns are supported only if they contain valid UTF-8 characters (easy to add more encodings) * json columns dumped with json.dumps -* **None** values are always quoted \ No newline at end of file +* **None** values are always quoted + diff --git a/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md b/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md index 3e58b5a25d..743936af72 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md @@ -5,7 +5,7 @@ keywords: [insert values, file formats] --- import SetTheFormat from './_set_the_format.mdx'; -# SQL INSERT File Format +# SQL INSERT file format This file format contains an INSERT...VALUES statement to be executed on the destination during the `load` stage. @@ -18,12 +18,13 @@ Additional data types are stored as follows: This file format is [compressed](../../reference/performance.md#disabling-and-enabling-file-compression) by default. -## Supported Destinations +## Supported destinations This format is used by default by: **DuckDB**, **Postgres**, **Redshift**, **Synapse**, **MSSQL**, **Motherduck** -It is also supported by: **Filesystem** if you'd like to store INSERT VALUES statements for some reason +It is also supported by: **Filesystem** if you'd like to store INSERT VALUES statements for some reason. 
## How to configure + diff --git a/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md b/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md index 5957ccc8ad..54e5b1cbd2 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md @@ -5,10 +5,9 @@ keywords: [jsonl, file formats] --- import SetTheFormat from './_set_the_format.mdx'; -# jsonl - JSON Delimited +# jsonl - JSON delimited -JSON Delimited is a file format that stores several JSON documents in one file. The JSON -documents are separated by a new line. +JSON delimited is a file format that stores several JSON documents in one file. The JSON documents are separated by a new line. Additional data types are stored as follows: @@ -18,13 +17,13 @@ Additional data types are stored as follows: - `HexBytes` is stored as a hex encoded string; - `json` is serialized as a string. -This file format is -[compressed](../../reference/performance.md#disabling-and-enabling-file-compression) by default. +This file format is [compressed](../../reference/performance.md#disabling-and-enabling-file-compression) by default. -## Supported Destinations +## Supported destinations This format is used by default by: **BigQuery**, **Snowflake**, **Filesystem**. ## How to configure + diff --git a/docs/website/docs/dlt-ecosystem/file-formats/parquet.md b/docs/website/docs/dlt-ecosystem/file-formats/parquet.md index 30f7051386..3830a45ff1 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/parquet.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/parquet.md @@ -9,13 +9,13 @@ import SetTheFormat from './_set_the_format.mdx'; [Apache Parquet](https://en.wikipedia.org/wiki/Apache_Parquet) is a free and open-source column-oriented data storage format in the Apache Hadoop ecosystem. `dlt` is capable of storing data in this format when configured to do so. -To use this format, you need a `pyarrow` package. 
You can get this package as a `dlt` extra as well: +To use this format, you need the `pyarrow` package. You can get this package as a `dlt` extra as well: ```sh pip install "dlt[parquet]" ``` -## Supported Destinations +## Supported destinations Supported by: **BigQuery**, **DuckDB**, **Snowflake**, **Filesystem**, **Athena**, **Databricks**, **Synapse** @@ -23,7 +23,7 @@ Supported by: **BigQuery**, **DuckDB**, **Snowflake**, **Filesystem**, **Athena* -## Destination AutoConfig +## Destination autoconfig `dlt` uses [destination capabilities](../../walkthroughs/create-new-destination.md#3-set-the-destination-capabilities) to configure the parquet writer: * It uses decimal and wei precision to pick the right **decimal type** and sets precision and scale. * It uses timestamp precision to pick the right **timestamp type** resolution (seconds, micro, or nano). @@ -32,17 +32,17 @@ Supported by: **BigQuery**, **DuckDB**, **Snowflake**, **Filesystem**, **Athena* Under the hood, `dlt` uses the [pyarrow parquet writer](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html) to create the files. The following options can be used to change the behavior of the writer: -- `flavor`: Sanitize schema or set other compatibility options to work with various target systems. Defaults to None which is **pyarrow** default. +- `flavor`: Sanitize schema or set other compatibility options to work with various target systems. Defaults to None, which is the **pyarrow** default. - `version`: Determine which Parquet logical types are available for use, whether the reduced set from the Parquet 1.x.x format or the expanded logical types added in later format versions. Defaults to "2.6". -- `data_page_size`: Set a target threshold for the approximate encoded size of data pages within a column chunk (in bytes). Defaults to None which is **pyarrow** default. 
+- `data_page_size`: Set a target threshold for the approximate encoded size of data pages within a column chunk (in bytes). Defaults to None, which is the **pyarrow** default. - `row_group_size`: Set the number of rows in a row group. [See here](#row-group-size) how this can optimize parallel processing of queries on your destination over the default setting of `pyarrow`. -- `timestamp_timezone`: A string specifying timezone, default is UTC. -- `coerce_timestamps`: resolution to which coerce timestamps, choose from **s**, **ms**, **us**, **ns** -- `allow_truncated_timestamps` - will raise if precision is lost on truncated timestamp. +- `timestamp_timezone`: A string specifying the timezone, default is UTC. +- `coerce_timestamps`: resolution to which to coerce timestamps, choose from **s**, **ms**, **us**, **ns** +- `allow_truncated_timestamps` - will raise if precision is lost on truncated timestamps. :::tip -Default parquet version used by `dlt` is 2.4. It coerces timestamps to microseconds and truncates nanoseconds silently. Such setting -provides best interoperability with database systems, including loading panda frames which have nanosecond resolution by default +The default parquet version used by `dlt` is 2.4. It coerces timestamps to microseconds and truncates nanoseconds silently. Such a setting +provides the best interoperability with database systems, including loading pandas data frames, which have nanosecond resolution by default. ::: Read the [pyarrow parquet docs](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html) to learn more about these settings. @@ -68,28 +68,27 @@ NORMALIZE__DATA_WRITER__TIMESTAMP_TIMEZONE ``` ### Timestamps and timezones -`dlt` adds timezone (UTC adjustment) to all timestamps regardless of a precision (from seconds to nanoseconds). `dlt` will also create TZ aware timestamp columns in -the destinations. 
[duckdb is an exception here](../destinations/duckdb.md#supported-file-formats) +`dlt` adds timezone (UTC adjustment) to all timestamps regardless of the precision (from seconds to nanoseconds). `dlt` will also create TZ-aware timestamp columns in +the destinations. [DuckDB is an exception here](../destinations/duckdb.md#supported-file-formats). -### Disable timezones / utc adjustment flags +### Disable timezones / UTC adjustment flags You can generate parquet files without timezone adjustment information in two ways: -1. Set the **flavor** to spark. All timestamps will be generated via deprecated `int96` physical data type, without the logical one -2. Set the **timestamp_timezone** to empty string (ie. `DATA_WRITER__TIMESTAMP_TIMEZONE=""`) to generate logical type without UTC adjustment. +1. Set the **flavor** to spark. All timestamps will be generated via the deprecated `int96` physical data type, without the logical one. +2. Set the **timestamp_timezone** to an empty string (i.e., `DATA_WRITER__TIMESTAMP_TIMEZONE=""`) to generate a logical type without UTC adjustment. -To our best knowledge, arrow will convert your timezone aware DateTime(s) to UTC and store them in parquet without timezone information. +To the best of our knowledge, Arrow will convert your timezone-aware DateTime(s) to UTC and store them in parquet without timezone information. ### Row group size -The `pyarrow` parquet writer writes each item, i.e. table or record batch, in a separate row group. -This may lead to many small row groups which may not be optimal for certain query engines. For example, `duckdb` parallelizes on a row group. -`dlt` allows controlling the size of the row group by -[buffering and concatenating tables](../../reference/performance.md#controlling-in-memory-buffers) and batches before they are written. The concatenation is done as a zero-copy to save memory. -You can control the size of the row group by setting the maximum number of rows kept in the buffer. 
+ +The `pyarrow` parquet writer writes each item, i.e., table or record batch, in a separate row group. This may lead to many small row groups, which may not be optimal for certain query engines. For example, `duckdb` parallelizes on a row group. `dlt` allows controlling the size of the row group by [buffering and concatenating tables](../../reference/performance.md#controlling-in-memory-buffers) and batches before they are written. The concatenation is done as a zero-copy to save memory. You can control the size of the row group by setting the maximum number of rows kept in the buffer. + ```toml [extract.data_writer] buffer_max_items=10e6 ``` -Mind that `dlt` holds the tables in memory. Thus, 1,000,000 rows in the example above may consume a significant amount of RAM. -`row_group_size` configuration setting has limited utility with `pyarrow` writer. It may be useful when you write single very large pyarrow tables -or when your in memory buffer is really large. \ No newline at end of file +Keep in mind that `dlt` holds the tables in memory. Thus, 10,000,000 rows in the example above may consume a significant amount of RAM. + +The `row_group_size` configuration setting has limited utility with the `pyarrow` writer. It may be useful when you write single very large pyarrow tables or when your in-memory buffer is really large. + diff --git a/docs/website/docs/dlt-ecosystem/staging.md b/docs/website/docs/dlt-ecosystem/staging.md index 789189b7dd..147c1f881d 100644 --- a/docs/website/docs/dlt-ecosystem/staging.md +++ b/docs/website/docs/dlt-ecosystem/staging.md @@ -16,9 +16,9 @@ Such a staging dataset has the same name as the dataset passed to `dlt.pipeline` [destination.postgres] staging_dataset_name_layout="staging_%s" ``` -The entry above switches the pattern to `staging_` prefix and for example, for a dataset with the name **github_data**, `dlt` will create **staging_github_data**. 
+The entry above switches the pattern to a `staging_` prefix and, for example, for a dataset with the name **github_data**, `dlt` will create **staging_github_data**. -To configure a static staging dataset name, you can do the following (we use the destination factory) +To configure a static staging dataset name, you can do the following (we use the destination factory): ```py import dlt @@ -41,21 +41,21 @@ truncate_staging_dataset=true Currently, only one destination, the [filesystem](destinations/filesystem.md), can be used as staging. The following destinations can copy remote files: 1. [Azure Synapse](destinations/synapse#staging-support) -1. [Athena](destinations/athena#staging-support) -1. [Bigquery](destinations/bigquery.md#staging-support) -1. [Dremio](destinations/dremio#staging-support) -1. [Redshift](destinations/redshift.md#staging-support) -1. [Snowflake](destinations/snowflake.md#staging-support) +2. [Athena](destinations/athena#staging-support) +3. [Bigquery](destinations/bigquery.md#staging-support) +4. [Dremio](destinations/dremio#staging-support) +5. [Redshift](destinations/redshift.md#staging-support) +6. [Snowflake](destinations/snowflake.md#staging-support) ### How to use -In essence, you need to set up two destinations and then pass them to `dlt.pipeline`. Below we'll use `filesystem` staging with `parquet` files to load into the `Redshift` destination. +In essence, you need to set up two destinations and then pass them to `dlt.pipeline`. Below, we'll use `filesystem` staging with `parquet` files to load into the `Redshift` destination. 1. **Set up the S3 bucket and filesystem staging.** Please follow our guide in the [filesystem destination documentation](destinations/filesystem.md). Test the staging as a standalone destination to make sure that files go where you want them. 
In your `secrets.toml`, you should now have a working `filesystem` configuration: ```toml [destination.filesystem] - bucket_url = "s3://[your_bucket_name]" # replace with your bucket name, + bucket_url = "s3://[your_bucket_name]" # replace with your bucket name [destination.filesystem.credentials] aws_access_key_id = "please set me up!" # copy the access key here @@ -88,7 +88,7 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel dataset_name='player_data' ) ``` - `dlt` will automatically select an appropriate loader file format for the staging files. Below we explicitly specify the `parquet` file format (just to demonstrate how to do it): + `dlt` will automatically select an appropriate loader file format for the staging files. Below, we explicitly specify the `parquet` file format (just to demonstrate how to do it): ```py info = pipeline.run(chess(), loader_file_format="parquet") ``` @@ -103,15 +103,15 @@ Please note that `dlt` does not delete loaded files from the staging storage aft ### How to prevent staging files truncation -Before `dlt` loads data to the staging storage, it truncates previously loaded files. To prevent it and keep the whole history -of loaded files, you can use the following parameter: +Before `dlt` loads data to the staging storage, it truncates previously loaded files. To prevent this and keep the whole history of loaded files, you can use the following parameter: ```toml [destination.redshift] -truncate_table_before_load_on_staging_destination=false +truncate_tables_on_staging_destination_before_load=false ``` :::caution -The [Athena](destinations/athena#staging-support) destination only truncates not iceberg tables with `replace` merge_disposition. -Therefore, the parameter `truncate_table_before_load_on_staging_destination` only controls the truncation of corresponding files for these tables. 
+The [Athena](destinations/athena#staging-support) destination only truncates non-iceberg tables with `replace` write_disposition. +Therefore, the parameter `truncate_tables_on_staging_destination_before_load` only controls the truncation of corresponding files for these tables. ::: + diff --git a/docs/website/docs/dlt-ecosystem/table-formats/delta.md b/docs/website/docs/dlt-ecosystem/table-formats/delta.md index 7840f40d11..d8dd87b750 100644 --- a/docs/website/docs/dlt-ecosystem/table-formats/delta.md +++ b/docs/website/docs/dlt-ecosystem/table-formats/delta.md @@ -6,8 +6,9 @@ keywords: [delta, table formats] # Delta table format -[Delta](https://delta.io/) is an open source table format. `dlt` can store data as Delta tables. +[Delta](https://delta.io/) is an open-source table format. `dlt` can store data as Delta tables. -## Supported Destinations +## Supported destinations Supported by: **Databricks**, **filesystem** + diff --git a/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md b/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md index a34bab9a0c..233ae0ce21 100644 --- a/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md +++ b/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md @@ -6,8 +6,9 @@ keywords: [iceberg, table formats] # Iceberg table format -[Iceberg](https://iceberg.apache.org/) is an open source table format. `dlt` can store data as Iceberg tables. +[Iceberg](https://iceberg.apache.org/) is an open-source table format. `dlt` can store data as Iceberg tables. 
-## Supported Destinations +## Supported destinations Supported by: **Athena** + diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md index 526e62e44b..449f8b8bde 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md +++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md @@ -6,8 +6,7 @@ keywords: [transform, dbt, runner] # Transform the data with dbt -[dbt](https://github.com/dbt-labs/dbt-core) is a framework that allows for the simple structuring of your transformations into DAGs. The benefits of -using dbt include: +[dbt](https://github.com/dbt-labs/dbt-core) is a framework that allows for the simple structuring of your transformations into DAGs. The benefits of using dbt include: - End-to-end cross-db compatibility for dlt→dbt pipelines. - Ease of use by SQL analysts, with a low learning curve. @@ -20,21 +19,19 @@ You can run dbt with `dlt` by using the dbt runner. The dbt runner: -- Can create a virtual env for dbt on the fly; +- Can create a virtual environment for dbt on the fly; - Can run a dbt package from online sources (e.g., GitHub) or from local files; -- Passes configuration and credentials to dbt, so you do not need to handle them separately from - `dlt`, enabling dbt to configure on the fly. +- Passes configuration and credentials to dbt, so you do not need to handle them separately from `dlt`, enabling dbt to configure on the fly. ## How to use the dbt runner -For an example of how to use the dbt runner, see the -[jaffle shop example](https://github.com/dlt-hub/dlt/blob/devel/docs/examples/archive/dbt_run_jaffle.py). +For an example of how to use the dbt runner, see the [jaffle shop example](https://github.com/dlt-hub/dlt/blob/devel/docs/examples/archive/dbt_run_jaffle.py). Included below is another example where we run a `dlt` pipeline and then a dbt package via `dlt`: > 💡 Docstrings are available to read in your IDE. 
```py -# load all pipedrive endpoints to pipedrive_raw dataset +# Load all Pipedrive endpoints to the pipedrive_raw dataset pipeline = dlt.pipeline( pipeline_name='pipedrive', destination='bigquery', @@ -45,38 +42,38 @@ load_info = pipeline.run(pipedrive_source()) print(load_info) # Create a transformation on a new dataset called 'pipedrive_dbt' -# we created a local dbt package +# We created a local dbt package # and added pipedrive_raw to its sources.yml -# the destination for the transformation is passed in the pipeline +# The destination for the transformation is passed in the pipeline pipeline = dlt.pipeline( pipeline_name='pipedrive', destination='bigquery', dataset_name='pipedrive_dbt' ) -# make or restore venv for dbt, using latest dbt version -# NOTE: if you have dbt installed in your current environment, just skip this line +# Make or restore venv for dbt, using the latest dbt version +# NOTE: If you have dbt installed in your current environment, just skip this line # and the `venv` argument to dlt.dbt.package() venv = dlt.dbt.get_venv(pipeline) -# get runner, optionally pass the venv +# Get runner, optionally pass the venv dbt = dlt.dbt.package( pipeline, "pipedrive/dbt_pipedrive/pipedrive", venv=venv ) -# run the models and collect any info -# If running fails, the error will be raised with full stack trace +# Run the models and collect any info +# If running fails, the error will be raised with a full stack trace models = dbt.run_all() -# on success print outcome +# On success, print the outcome for m in models: print( f"Model {m.model_name} materialized" + - f"in {m.time}" + - f"with status {m.status}" + - f"and message {m.message}" + f" in {m.time}" + + f" with status {m.status}" + + f" and message {m.message}" ) ``` @@ -86,10 +83,10 @@ It assumes that dbt is installed in the current Python environment and the `prof -Here's an example **duckdb** profile +Here's an example **duckdb** profile: ```yaml config: - # do not track usage, do not create 
.user.yml + # Do not track usage, do not create .user.yml send_anonymous_usage_stats: False duckdb_dlt_dbt_test: @@ -97,7 +94,7 @@ duckdb_dlt_dbt_test: outputs: analytics: type: duckdb - # schema: "{{ var('destination_dataset_name', var('source_dataset_name')) }}" + # schema: "{{ var('destination_dataset_name', var('source_dataset_name')) }}" path: "duckdb_dlt_dbt_test.duckdb" extensions: - httpfs @@ -108,8 +105,8 @@ You can run the example with dbt debug log: `RUNTIME__LOG_LEVEL=DEBUG python dbt ## Other transforming tools -If you want to transform the data before loading, you can use Python. If you want to transform the -data after loading, you can use dbt or one of the following: +If you want to transform the data before loading, you can use Python. If you want to transform the data after loading, you can use dbt or one of the following: 1. [`dlt` SQL client.](../sql.md) 2. [Pandas.](../pandas.md) + diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md index d15c4eb84c..58bc489459 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md +++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md @@ -4,11 +4,11 @@ description: Transforming the data loaded by a dlt pipeline with dbt Cloud keywords: [transform, sql] --- -# DBT Cloud Client and Helper Functions +# dbt Cloud client and helper functions -## API Client +## API client -The DBT Cloud Client is a Python class designed to interact with the dbt Cloud API (version 2). +The dbt Cloud Client is a Python class designed to interact with the dbt Cloud API (version 2). It provides methods to perform various operations on dbt Cloud, such as triggering job runs and retrieving job run statuses. 
```py @@ -26,7 +26,7 @@ run_status = client.get_run_status(run_id=job_run_id) print(f"Job run status: {run_status['status_humanized']}") ``` -## Helper Functions +## Helper functions These Python functions provide an interface to interact with the dbt Cloud API. They simplify the process of triggering and monitoring job runs in dbt Cloud. @@ -65,11 +65,11 @@ from dlt.helpers.dbt_cloud import get_dbt_cloud_run_status status = get_dbt_cloud_run_status(run_id=1234, wait_for_outcome=True) ``` -## Set Credentials +## Set credentials ### secrets.toml -When using a dlt locally, we recommend using the `.dlt/secrets.toml` method to set credentials. +When using dlt locally, we recommend using the `.dlt/secrets.toml` method to set credentials. If you used the `dlt init` command, then the `.dlt` folder has already been created. Otherwise, create a `.dlt` folder in your working directory and a `secrets.toml` file inside it. @@ -86,9 +86,9 @@ job_id = "set me up!" # optional only for the run_dbt_cloud_job function (you ca run_id = "set me up!" # optional for the get_dbt_cloud_run_status function (you can pass this explicitly as an argument to the function) ``` -### Environment Variables +### Environment variables -`dlt` supports reading credentials from the environment. +dlt supports reading credentials from the environment. If dlt tries to read this from environment variables, it will use a different naming convention. @@ -102,4 +102,5 @@ DBT_CLOUD__ACCOUNT_ID DBT_CLOUD__JOB_ID ``` -For more information, read the [Credentials](https://dlthub.com/docs/general-usage/credentials) documentation. +For more information, read the [Credentials](../../../general-usage/credentials) documentation. 
+ diff --git a/docs/website/docs/dlt-ecosystem/transformations/pandas.md b/docs/website/docs/dlt-ecosystem/transformations/pandas.md index 0e08666eaf..4125e4e114 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/pandas.md +++ b/docs/website/docs/dlt-ecosystem/transformations/pandas.md @@ -4,7 +4,7 @@ description: Transform the data loaded by a dlt pipeline with Pandas keywords: [transform, pandas] --- -# Transform the Data with Pandas +# Transform the data with Pandas You can fetch the results of any SQL query as a dataframe. If the destination supports that natively (i.e., BigQuery and DuckDB), `dlt` uses the native method. Thanks to this, reading @@ -22,7 +22,7 @@ with pipeline.sql_client() as client: with client.execute_query( 'SELECT "reactions__+1", "reactions__-1", reactions__laugh, reactions__hooray, reactions__rocket FROM issues' ) as table: - # calling `df` on a cursor, returns the data as a data frame + # calling `df` on a cursor returns the data as a data frame reactions = table.df() counts = reactions.sum(0).sort_values(0, ascending=False) ``` @@ -32,10 +32,11 @@ chunks by passing the `chunk_size` argument to the `df` method. Once your data is in a Pandas dataframe, you can transform it as needed. -## Other Transforming Tools +## Other transforming tools If you want to transform the data before loading, you can use Python. If you want to transform the data after loading, you can use Pandas or one of the following: 1. [dbt.](dbt/dbt.md) (recommended) 2. 
[`dlt` SQL client.](sql.md) + diff --git a/docs/website/docs/dlt-ecosystem/transformations/sql.md b/docs/website/docs/dlt-ecosystem/transformations/sql.md index b358e97b4c..ffd348d1a0 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/sql.md +++ b/docs/website/docs/dlt-ecosystem/transformations/sql.md @@ -36,7 +36,7 @@ try: "SELECT id, name, email FROM customers WHERE id = %s", 10 ) - # prints column values of the first row + # Prints column values of the first row print(res[0]) except Exception: ... @@ -48,4 +48,5 @@ If you want to transform the data before loading, you can use Python. If you wan data after loading, you can use SQL or one of the following: 1. [dbt](dbt/dbt.md) (recommended). -2. [Pandas.](pandas.md) +2. [Pandas](pandas.md). + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/_source-info-header.md b/docs/website/docs/dlt-ecosystem/verified-sources/_source-info-header.md index 112dcf06bf..2d41b6612c 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/_source-info-header.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/_source-info-header.md @@ -1,6 +1,7 @@ import Admonition from "@theme/Admonition"; import Link from '../../_book-onboarding-call.md'; - + Join our Slack community or . - \ No newline at end of file + + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md b/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md index 3e7dad9793..c4e4268647 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md @@ -9,13 +9,9 @@ import Header from './_source-info-header.md';
-[Amazon Kinesis](https://docs.aws.amazon.com/streams/latest/dev/key-concepts.html) is a cloud-based -service for real-time data streaming and analytics, enabling the processing and analysis of large -streams of data in real time. +[Amazon Kinesis](https://docs.aws.amazon.com/streams/latest/dev/key-concepts.html) is a cloud-based service for real-time data streaming and analytics, enabling the processing and analysis of large streams of data in real time. -Our AWS Kinesis [verified source](https://github.com/dlt-hub/verified-sources/tree/master/sources/kinesis) -loads messages from Kinesis streams to your preferred -[destination](https://dlthub.com/docs/dlt-ecosystem/destinations/). +Our AWS Kinesis [verified source](https://github.com/dlt-hub/verified-sources/tree/master/sources/kinesis) loads messages from Kinesis streams to your preferred [destination](../../dlt-ecosystem/destinations/). Resources that can be loaded using this verified source are: @@ -25,16 +21,14 @@ Resources that can be loaded using this verified source are: :::tip -You can check out our pipeline example -[here](https://github.com/dlt-hub/verified-sources/blob/master/sources/kinesis_pipeline.py). +You can check out our pipeline example [here](https://github.com/dlt-hub/verified-sources/blob/master/sources/kinesis_pipeline.py). ::: -## Setup Guide +## Setup guide ### Grab credentials -To use this verified source, you need an AWS `Access key` and `Secret access key`, which can be obtained -as follows: +To use this verified source, you need an AWS `Access key` and `Secret access key`, which can be obtained as follows: 1. Sign in to your AWS Management Console. 1. Navigate to the IAM (Identity and Access Management) dashboard. @@ -44,8 +38,7 @@ as follows: 1. Download or copy the Access Key ID and Secret Access Key for future use. :::info -The AWS UI, which is described here, might change. 
The full guide is available at this -[link](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html). +The AWS UI, which is described here, might change. The full guide is available at this [link](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html). ::: ### Initialize the verified source @@ -58,24 +51,17 @@ To get started with your data pipeline, follow these steps: dlt init kinesis duckdb ``` - [This command](../../reference/command-line-interface) will initialize - [the pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/kinesis_pipeline.py) - with Kinesis as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md) - as the [destination](../destinations). + [This command](../../reference/command-line-interface) will initialize [the pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/kinesis_pipeline.py) with Kinesis as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md) as the [destination](../destinations). -1. If you'd like to use a different destination, simply replace `duckdb` with the name of your - preferred [destination](../destinations). +1. If you'd like to use a different destination, simply replace `duckdb` with the name of your preferred [destination](../destinations). -1. After running this command, a new directory will be created with the necessary files and - configuration settings to get started. +1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. For more information, read [Add a verified source.](../../walkthroughs/add-a-verified-source) ### Add credentials -1. In the `.dlt` folder, there's a file called `secrets.toml`. It's where you store sensitive - information securely, like access tokens. Keep this file safe. Here's its format for service - account authentication: +1. 
In the `.dlt` folder, there's a file called `secrets.toml`. It's where you store sensitive information securely, like access tokens. Keep this file safe. Here's its format for service account authentication: ```toml # Put your secret values and credentials here. @@ -93,13 +79,9 @@ For more information, read [Add a verified source.](../../walkthroughs/add-a-ver stream_name = "please set me up!" # Stream name (Optional). ``` -1. Replace the value of `aws_access_key_id` and `aws_secret_access_key` with the one that - [you copied above](#grab-credentials). This will ensure that the verified source can access - your Kinesis resource securely. +1. Replace the value of `aws_access_key_id` and `aws_secret_access_key` with the one that [you copied above](#grab-credentials). This will ensure that the verified source can access your Kinesis resource securely. -1. Next, follow the instructions in [Destinations](../destinations/duckdb) to add credentials for - your chosen destination. This will ensure that your data is properly routed to its final - destination. +1. Next, follow the instructions in [Destinations](../destinations/duckdb) to add credentials for your chosen destination. This will ensure that your data is properly routed to its final destination. For more information, read [Credentials](../../general-usage/credentials). @@ -110,11 +92,11 @@ For more information, read [Credentials](../../general-usage/credentials). ```sh pip install -r requirements.txt ``` -1. You're now ready to run the pipeline! To get started, run the following command: +2. You're now ready to run the pipeline! To get started, run the following command: ```sh python kinesis_pipeline.py ``` -1. Once the pipeline has finished running, you can verify that everything loaded correctly by using +3. 
Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: ```sh dlt pipeline show @@ -132,7 +114,7 @@ For more information, read [Run a pipeline.](../../walkthroughs/run-a-pipeline) ### Resource `kinesis_stream` This resource reads a Kinesis stream and yields messages. It supports -[incremental loading](../../general-usage/incremental-loading) and parses messages as json by +[incremental loading](../../general-usage/incremental-loading) and parses messages as JSON by default. ```py @@ -180,14 +162,14 @@ resource will have the same name as the stream. When you iterate this resource ( shard, it will create an iterator to read messages: 1. If `initial_at_timestamp` is present, the resource will read all messages after this timestamp. -1. If `initial_at_timestamp` is 0, only the messages at the tip of the stream are read. -1. If no initial timestamp is provided, all messages will be retrieved (from the TRIM HORIZON). +2. If `initial_at_timestamp` is 0, only the messages at the tip of the stream are read. +3. If no initial timestamp is provided, all messages will be retrieved (from the TRIM HORIZON). The resource stores all message sequences per shard in the state. If you run the resource again, it will load messages incrementally: 1. For all shards that had messages, only messages after the last message are retrieved. -1. For shards that didn't have messages (or new shards), the last run time is used to get messages. +2. For shards that didn't have messages (or new shards), the last run time is used to get messages. Please check the `kinesis_stream` [docstring](https://github.com/dlt-hub/verified-sources/blob/master/sources/kinesis/__init__.py#L31-L46) for additional options, i.e., to limit the number of messages @@ -202,13 +184,13 @@ if False, `data` is returned as bytes. 
## Customization + + ### Create your own pipeline -If you wish to create your own pipelines, you can leverage source and resource methods from this -verified source. +If you wish to create your own pipelines, you can leverage source and resource methods from this verified source. -1. Configure the [pipeline](../../general-usage/pipeline) by specifying the pipeline name, - destination, and dataset as follows: +1. Configure the [pipeline](../../general-usage/pipeline) by specifying the pipeline name, destination, and dataset as follows: ```py pipeline = dlt.pipeline( @@ -221,9 +203,9 @@ verified source. 1. To load messages from a stream from the last one hour: ```py - # the resource below will take its name from the stream name, - # it can be used multiple times by default it assumes that Data is json and parses it, - # here we disable that to just get bytes in data elements of the message + # The resource below will take its name from the stream name, + # it can be used multiple times. By default, it assumes that data is JSON and parses it, + # here we disable that to just get bytes in data elements of the message. kinesis_stream_data = kinesis_stream( "kinesis_source_name", parse_json=False, @@ -236,7 +218,7 @@ verified source. 1. For incremental Kinesis streams, to fetch only new messages: ```py - #running pipeline will get only new messages + # Running pipeline will get only new messages. info = pipeline.run(kinesis_stream_data) message_counts = pipeline.last_trace.last_normalize_info.row_counts if "kinesis_source_name" not in message_counts: @@ -245,7 +227,7 @@ verified source. print(pipeline.last_trace.last_normalize_info) ``` -1. To parse json with a simple decoder: +1. To parse JSON with a simple decoder: ```py def _maybe_parse_json(item: TDataItem) -> TDataItem: @@ -267,23 +249,23 @@ verified source. STATE_FILE = "kinesis_source_name.state.json" - # load the state if it exists + # Load the state if it exists. 
if os.path.exists(STATE_FILE): with open(STATE_FILE, "rb") as f: state = json.typed_loadb(f.read()) else: - # provide new state + # Provide new state. state = {} with Container().injectable_context( StateInjectableContext(state=state) ) as managed_state: - # dlt resources/source is just an iterator + # dlt resources/source is just an iterator. for message in kinesis_stream_data: - # here you can send the message somewhere + # Here you can send the message somewhere. print(message) - # save state after each message to have full transaction load - # dynamodb is also OK + # Save state after each message to have full transaction load. + # DynamoDB is also OK. with open(STATE_FILE, "wb") as f: json.typed_dump(managed_state.state, f) print(managed_state.state) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md index 4a5cdd2f71..29b5e5618c 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md @@ -5,20 +5,20 @@ keywords: [arrow, pandas, parquet, source] --- import Header from './_source-info-header.md'; -# Arrow Table / Pandas +# Arrow table / Pandas
You can load data directly from an Arrow table or Pandas dataframe. -This is supported by all destinations, but recommended especially when using destinations that support the `parquet` file format natively (e.g. [Snowflake](../destinations/snowflake.md) and [Filesystem](../destinations/filesystem.md)). +This is supported by all destinations, but it is especially recommended when using destinations that support the `parquet` file format natively (e.g., [Snowflake](../destinations/snowflake.md) and [Filesystem](../destinations/filesystem.md)). See the [destination support](#destination-support-and-fallback) section for more information. -When used with a `parquet` supported destination this is a more performant way to load structured data since `dlt` bypasses many processing steps normally involved in passing JSON objects through the pipeline. -`dlt` automatically translates the Arrow table's schema to the destination table's schema and writes the table to a parquet file which gets uploaded to the destination without any further processing. +When used with a `parquet` supported destination, this is a more performant way to load structured data since `dlt` bypasses many processing steps normally involved in passing JSON objects through the pipeline. +`dlt` automatically translates the Arrow table's schema to the destination table's schema and writes the table to a parquet file, which gets uploaded to the destination without any further processing. ## Usage -To write an Arrow source, pass any `pyarrow.Table`, `pyarrow.RecordBatch` or `pandas.DataFrame` object (or list of thereof) to the pipeline's `run` or `extract` method, or yield table(s)/dataframe(s) from a `@dlt.resource` decorated function. +To write an Arrow source, pass any `pyarrow.Table`, `pyarrow.RecordBatch`, or `pandas.DataFrame` object (or list thereof) to the pipeline's `run` or `extract` method, or yield table(s)/dataframe(s) from a `@dlt.resource` decorated function. 
This example loads a Pandas dataframe to a Snowflake table: @@ -58,10 +58,10 @@ Note: The data in the table must be compatible with the destination database as Destinations that support the `parquet` format natively will have the data files uploaded directly as possible. Rewriting files can be avoided completely in many cases. -When the destination does not support `parquet`, the rows are extracted from the table and written in the destination's native format (usually `insert_values`) and this is generally much slower +When the destination does not support `parquet`, the rows are extracted from the table and written in the destination's native format (usually `insert_values`), and this is generally much slower as it requires processing the table row by row and rewriting data to disk. -The output file format is chosen automatically based on the destination's capabilities, so you can load arrow or pandas frames to any destination but performance will vary. +The output file format is chosen automatically based on the destination's capabilities, so you can load arrow or pandas frames to any destination, but performance will vary. ### Destinations that support parquet natively for direct loading * duckdb & motherduck @@ -89,13 +89,13 @@ add_dlt_id = true Keep in mind that enabling these incurs some performance overhead: -- `add_dlt_load_id` has minimal overhead since the column is added to arrow table in memory during `extract` stage, before parquet file is written to disk -- `add_dlt_id` adds the column during `normalize` stage after file has been extracted to disk. The file needs to be read back from disk in chunks, processed and rewritten with new columns +- `add_dlt_load_id` has minimal overhead since the column is added to the arrow table in memory during the `extract` stage, before the parquet file is written to disk +- `add_dlt_id` adds the column during the `normalize` stage after the file has been extracted to disk. 
The file needs to be read back from disk in chunks, processed, and rewritten with new columns ## Incremental loading with Arrow tables You can use incremental loading with Arrow tables as well. -Usage is the same as without other dlt resources. Refer to the [incremental loading](/general-usage/incremental-loading.md) guide for more information. +Usage is the same as with other dlt resources. Refer to the [incremental loading](../../general-usage/incremental-loading.md) guide for more information. Example: @@ -104,12 +104,12 @@ import dlt from dlt.common import pendulum import pandas as pd -# Create a resource using that yields a dataframe, using the `ordered_at` field as an incremental cursor +# Create a resource that yields a dataframe, using the `ordered_at` field as an incremental cursor @dlt.resource(primary_key="order_id") def orders(ordered_at = dlt.sources.incremental('ordered_at')): - # Get dataframe/arrow table from somewhere + # Get a dataframe/arrow table from somewhere # If your database supports it, you can use the last_value to filter data at the source. - # Otherwise it will be filtered automatically after loading the data. + # Otherwise, it will be filtered automatically after loading the data. df = get_orders(since=ordered_at.last_value) yield df @@ -124,9 +124,9 @@ Look at the [Connector X + Arrow Example](../../examples/connector_x_arrow/) to ::: ## Loading JSON documents -If you want to skip default `dlt` JSON normalizer, you can use any available method to convert JSON documents into tabular data. +If you want to skip the default `dlt` JSON normalizer, you can use any available method to convert JSON documents into tabular data. 
* **pandas** has `read_json` and `json_normalize` methods -* **pyarrow** can infer table schema and convert JSON files into tables with `read_json` +* **pyarrow** can infer the table schema and convert JSON files into tables with `read_json` * **duckdb** can do the same with `read_json_auto` ```py @@ -153,15 +153,15 @@ The Arrow data types are translated to dlt data types as follows: | `int` | `bigint` | Precision is determined by the bit width. | | `binary` | `binary` | | | `decimal` | `decimal` | Precision and scale are determined by the type properties. | -| `struct` | `json` | | +| `struct` | `json` | | | | | | ## Loading nested types -All struct types are represented as `json` and will be loaded as JSON (if destination permits) or a string. Currently we do not support **struct** types, +All struct types are represented as `json` and will be loaded as JSON (if the destination permits) or a string. Currently, we do not support **struct** types, even if they are present in the destination (except **BigQuery** which can be [configured to handle them](../destinations/bigquery.md#use-bigquery-schema-autodetect-for-nested-fields)) -If you want to represent nested data as separated tables, you must yield panda frames and arrow tables as records. In the examples above: +If you want to represent nested data as separate tables, you must yield pandas dataframes and arrow tables as records. In the examples above: ```py # yield panda frame as records pipeline.run(df.to_dict(orient='records'), table_name="orders") @@ -169,4 +169,5 @@ pipeline.run(df.to_dict(orient='records'), table_name="orders") # yield arrow table pipeline.run(table.to_pylist(), table_name="orders") ``` -Both Pandas and Arrow allow to stream records in batches. +Both Pandas and Arrow allow streaming records in batches.
+ diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/github.md b/docs/website/docs/dlt-ecosystem/verified-sources/github.md index 830f4035d8..221a2c3009 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/github.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/github.md @@ -9,21 +9,20 @@ import Header from './_source-info-header.md';
-This verified source can be used to load data on issues or pull requests from any GitHub repository -onto a [destination](../../dlt-ecosystem/destinations) of your choice using [GitHub API](https://docs.github.com/en/rest?apiVersion=2022-11-28). +This verified source can be used to load data on issues or pull requests from any GitHub repository onto a [destination](../../dlt-ecosystem/destinations) of your choice using the [GitHub API](https://docs.github.com/en/rest?apiVersion=2022-11-28). Resources that can be loaded using this verified source are: | Name | Description | | ---------------- |----------------------------------------------------------------------------------| -| github_reactions | Retrieves all issues, pull requests, comments and reactions associated with them | +| github_reactions | Retrieves all issues, pull requests, comments, and reactions associated with them | | github_repo_events | Gets all the repo events associated with the repository | -## Setup Guide +## Setup guide ### Grab credentials -To get the API token, sign-in to your GitHub account and follow these steps: +To get the API token, sign in to your GitHub account and follow these steps: 1. Click on your profile picture in the top right corner. @@ -31,8 +30,7 @@ To get the API token, sign-in to your GitHub account and follow these steps: 1. Select "Developer settings" on the left panel. -1. Under "Personal access tokens", click on "Generate a personal access token (preferably under - Tokens(classic))". +1. Under "Personal access tokens", click on "Generate a personal access token (preferably under Tokens(classic))". 1. Grant at least the following scopes to the token by checking them. 
@@ -42,7 +40,7 @@ To get the API token, sign-in to your GitHub account and follow these steps: | read:repo_hook | Grants read and ping access to hooks in public or private repositories | | read:org | Read-only access to organization membership, organization projects, and team membership | | read:user | Grants access to read a user's profile data | - | read:project | Grants read only access to user and organization projects | + | read:project | Grants read-only access to user and organization projects | | read:discussion | Allows read access for team discussions | 1. Finally, click "Generate token". @@ -52,11 +50,11 @@ To get the API token, sign-in to your GitHub account and follow these steps: > You can optionally add API access tokens to avoid making requests as an unauthorized user. > If you wish to load data using the github_reaction source, the access token is mandatory. -More information you can see in the +For more information, see the [GitHub authentication](https://docs.github.com/en/rest/overview/authenticating-to-the-rest-api?apiVersion=2022-11-28#basic-authentication) and [GitHub API token scopes](https://docs.github.com/en/apps/oauth-apps/building-oauth-apps/scopes-for-oauth-apps) -documentations. +documentation. ### Initialize the verified source @@ -83,30 +81,24 @@ For more information, read the guide on [how to add a verified source](../../wal ### Add credentials -1. In `.dlt/secrets.toml`, you can securely store your access tokens and other sensitive - information. It's important to handle this file with care and keep it safe. Here's what the file - looks like: +1. In `.dlt/secrets.toml`, you can securely store your access tokens and other sensitive information. It's important to handle this file with care and keep it safe. 
Here's what the file looks like: ```toml # Put your secret values and credentials here - # Github access token (must be classic for reactions source) + # GitHub access token (must be classic for reactions source) [sources.github] access_token="please set me up!" # use GitHub access token here ``` -1. Replace the API token value with the [previously copied one](#grab-credentials) to ensure secure - access to your GitHub resources. +1. Replace the API token value with the [previously copied one](#grab-credentials) to ensure secure access to your GitHub resources. -1. Next, follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to - add credentials for your chosen destination, ensuring proper routing of your data to the final - destination. +1. Next, follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to add credentials for your chosen destination, ensuring proper routing of your data to the final destination. For more information, read the [General Usage: Credentials.](../../general-usage/credentials) ## Run the pipeline -1. Before running the pipeline, ensure that you have installed all the necessary dependencies by - running the command: +1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: ```sh pip install -r requirements.txt ``` @@ -114,25 +106,21 @@ For more information, read the [General Usage: Credentials.](../../general-usage ```sh python github_pipeline.py ``` -1. Once the pipeline has finished running, you can verify that everything loaded correctly by using - the following command: +1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: ```sh dlt pipeline show ``` - For example, the `pipeline_name` for the above pipeline example is `github_reactions`, you may - also use any custom name instead. 
+ For example, the `pipeline_name` for the above pipeline example is `github_reactions`; you may also use any custom name instead. For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources -`dlt` works on the principle of [sources](../../general-usage/source) and -[resources](../../general-usage/resource). +`dlt` works on the principle of [sources](../../general-usage/source) and [resources](../../general-usage/resource). ### Source `github_reactions` -This `dlt.source` function uses GraphQL to fetch DltResource objects: issues and pull requests along -with associated reactions, comments, and reactions to comments. +This `dlt.source` function uses GraphQL to fetch DltResource objects: issues and pull requests along with associated reactions, comments, and reactions to comments. ```py @dlt.source @@ -151,21 +139,17 @@ def github_reactions( `name`: Refers to the name of the repository. -`access_token`: Classic access token should be utilized and is stored in the `.dlt/secrets.toml` -file. +`access_token`: A classic access token should be utilized and is stored in the `.dlt/secrets.toml` file. `items_per_page`: The number of issues/pull requests to retrieve in a single page. Defaults to 100. -`max_items`: The maximum number of issues/pull requests to retrieve in total. If set to None, it -means all items will be retrieved. Defaults to None. +`max_items`: The maximum number of issues/pull requests to retrieve in total. If set to None, it means all items will be retrieved. Defaults to None. -`max_item_age_seconds`: The feature to restrict retrieval of items older than a specific duration is -yet to be implemented. Defaults to None. +`max_item_age_seconds`: The feature to restrict retrieval of items older than a specific duration is yet to be implemented. Defaults to None. 
### Resource `_get_reactions_data` ("issues") -The `dlt.resource` function employs the `_get_reactions_data` method to retrieve data about issues, -their associated comments, and subsequent reactions. +The `dlt.resource` function employs the `_get_reactions_data` method to retrieve data about issues, their associated comments, and subsequent reactions. ```py dlt.resource( @@ -185,11 +169,9 @@ dlt.resource( ### Source `github_repo_events` -This `dlt.source` fetches repository events incrementally, dispatching them to separate tables based -on event type. It loads new events only and appends them to tables. +This `dlt.source` fetches repository events incrementally, dispatching them to separate tables based on event type. It loads new events only and appends them to tables. -> Note: Github allows retrieving up to 300 events for public repositories, so frequent updates are -> recommended for active repos. +> Note: GitHub allows retrieving up to 300 events for public repositories, so frequent updates are recommended for active repos. ```py @dlt.source(max_table_nesting=2) @@ -203,8 +185,7 @@ def github_repo_events( `name`: Denotes the name of the repository. -`access_token`: Optional classic or fine-grained access token. If not provided, calls are made -anonymously. +`access_token`: Optional classic or fine-grained access token. If not provided, calls are made anonymously. `max_table_nesting=2` sets the maximum nesting level to 2. @@ -212,8 +193,7 @@ Read more about [nesting levels](../../general-usage/source#reduce-the-nesting-l ### Resource `repo_events` -This `dlt.resource` function serves as the resource for the `github_repo_events` source. It yields -repository events as data items. +This `dlt.resource` function serves as the resource for the `github_repo_events` source. It yields repository events as data items. 
```py dlt.resource(primary_key="id", table_name=lambda i: i["type"]) # type: ignore @@ -229,9 +209,7 @@ def repo_events( `table_name`: Routes data to appropriate tables based on the data type. -`last_created_at`: This parameter determines the initial value for "last_created_at" in -dlt.sources.incremental. If no value is given, the default "initial_value" is used. The function -"last_value_func" determines the most recent 'created_at' value. +`last_created_at`: This parameter determines the initial value for "last_created_at" in dlt.sources.incremental. If no value is given, the default "initial_value" is used. The function "last_value_func" determines the most recent 'created_at' value. Read more about [incremental loading](../../general-usage/incremental-loading#incremental_loading-with-last-value). @@ -239,8 +217,7 @@ Read more about [incremental loading](../../general-usage/incremental-loading#in ### Create your own pipeline -If you wish to create your own pipelines, you can leverage source and resource methods from this -verified source. +If you wish to create your own pipelines, you can leverage source and resource methods from this verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: @@ -252,18 +229,16 @@ verified source. ) ``` - To read more about pipeline configuration, please refer to our - [documentation](../../general-usage/pipeline). + To read more about pipeline configuration, please refer to our [documentation](../../general-usage/pipeline). -1. To load all the data from repo on issues, pull requests, their comments and reactions, you can do - the following: +1. To load all the data from the repo on issues, pull requests, their comments, and reactions, you can do the following: ```py load_data = github_reactions("duckdb", "duckdb") load_info = pipeline.run(load_data) print(load_info) ``` - here, "duckdb" is the owner of the repository and the name of the repository. 
+ Here, "duckdb" is both the owner and the name of the repository. 1. To load only the first 100 issues, you can do the following: @@ -273,8 +248,7 @@ verified source. print(load_info) ``` -1. You can use fetch and process repo events data incrementally. It loads all data during the first - run and incrementally in subsequent runs. +1. You can fetch and process repo events data incrementally. It loads all data during the first run and incrementally in subsequent runs. ```py load_data = github_repo_events( @@ -287,3 +261,4 @@ verified source. It is optional to use `access_token` or make anonymous API calls. + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md index 7b4c1b0d5e..b94606a7e9 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md @@ -5,7 +5,7 @@ keywords: [google analytics api, google analytics verified source, google analyt --- import Header from './_source-info-header.md'; -# Google Analytics +# Google Analytics
@@ -25,7 +25,7 @@ Sources and resources that can be loaded using this verified source are: | metrics_table | Assembles and presents data relevant to the report's metrics | | dimensions_table | Compiles and displays data related to the report's dimensions | -## Setup Guide +## Setup guide ### Grab credentials @@ -103,7 +103,9 @@ python google_analytics/setup_script_gcp_oauth.py Once you have executed the script and completed the authentication, you will receive a "refresh token" that can be used to set up the "secrets.toml". -### Share the Google Analytics Property with the API: + + +### Share the Google Analytics property with the API > Note: For service account authentication, use the client_email. For OAuth authentication, use the > email associated with the app creation and refresh token generation. @@ -185,7 +187,7 @@ For more information, read the guide on [how to add a verified source](../../wal 1. `property_id` is a unique number that identifies a particular property. You will need to explicitly pass it to get data from the property that you're interested in. For example, if the - property that you want to get data from is “GA4-Google Merch Shop” then you will need to pass its + property that you want to get data from is “GA4-Google Merch Shop,” then you will need to pass its property id "213025502". 
![Property ID](./docs_images/GA4_Property_ID_size.png) @@ -198,7 +200,7 @@ For more information, read the guide on [how to add a verified source](../../wal ```toml [sources.google_analytics] - property_id = "213025502" # this is example property id, please use yours + property_id = "213025502" # this is an example property id, please use yours queries = [ {"resource_name"= "sample_analytics_data1", "dimensions"= ["browser", "city"], "metrics"= ["totalUsers", "transactions"]}, {"resource_name"= "sample_analytics_data2", "dimensions"= ["browser", "city", "dateHour"], "metrics"= ["totalUsers"]} @@ -230,7 +232,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is - `dlt_google_analytics_pipeline`, you may also use any custom name instead. + `dlt_google_analytics_pipeline`, but you may also use any custom name instead. For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). @@ -286,8 +288,7 @@ def get_metadata(client: Resource, property_id: int) -> Iterator[Metadata]: ### Transformer `metrics_table` -This transformer function extracts data using metadata and populates a table called "metrics" with -the data from each metric. +This transformer function extracts data using metadata and populates a table called "metrics" with the data from each metric. ```py @dlt.transformer(data_from=get_metadata, write_disposition="replace", name="metrics") @@ -298,14 +299,12 @@ def metrics_table(metadata: Metadata) -> Iterator[TDataItem]: `metadata`: GA4 metadata is stored in this "Metadata" class object. -Similarly, there is a transformer function called `dimensions_table` that populates a table called -"dimensions" with the data from each dimension. +Similarly, there is a transformer function called `dimensions_table` that populates a table called "dimensions" with the data from each dimension. 
## Customization ### Create your own pipeline -If you wish to create your own pipelines, you can leverage source and resource methods from this -verified source. +If you wish to create your own pipelines, you can leverage source and resource methods from this verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: @@ -317,8 +316,7 @@ verified source. ) ``` - To read more about pipeline configuration, please refer to our - [documentation](../../general-usage/pipeline). + To read more about pipeline configuration, please refer to our [documentation](../../general-usage/pipeline). 1. To load all the data from metrics and dimensions: @@ -328,8 +326,7 @@ verified source. print(load_info) ``` - > Loads all the data till date in the first run, and then - > [incrementally](https://dlthub.com/docs/general-usage/incremental-loading) in subsequent runs. + > Loads all the data to date in the first run, and then [incrementally](../../general-usage/incremental-loading) in subsequent runs. 1. To load data from a specific start date: @@ -339,8 +336,7 @@ verified source. print(load_info) ``` - > Loads data starting from the specified date during the first run, and then - > [incrementally](https://dlthub.com/docs/general-usage/incremental-loading) in subsequent runs. + > Loads data starting from the specified date during the first run, and then [incrementally](../../general-usage/incremental-loading) in subsequent runs. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md index 9cd6ad8079..fade2f73a6 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md @@ -14,7 +14,7 @@ offered by Google as part of its Google Workspace suite. 
This Google Sheets `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/google_sheets_pipeline.py) -loads data using “Google Sheets API” to the destination of your choice. +loads data using the “Google Sheets API” to the destination of your choice. Sources and resources that can be loaded using this verified source are: @@ -24,14 +24,14 @@ Sources and resources that can be loaded using this verified source are: | range_names | Processes the range and yields data from each range | | spreadsheet_info | Information about the spreadsheet and the ranges processed | -## Setup Guide +## Setup guide ### Grab credentials There are two methods to get authenticated for using this verified source: - OAuth credentials -- Service account credential +- Service account credentials Here we'll discuss how to set up both OAuth tokens and service account credentials. In general, OAuth tokens are preferred when user consent is required, while service account credentials are @@ -41,7 +41,7 @@ credentials. You can choose the method of authentication as per your requirement #### Google service account credentials You need to create a GCP service account to get API credentials if you don't have one. To create - one, follow these steps: +one, follow these steps: 1. Sign in to [console.cloud.google.com](http://console.cloud.google.com/). @@ -69,17 +69,17 @@ follow these steps: 1. Enable the Sheets API in the project. -1. Search credentials in the search bar and go to Credentials. +1. Search for credentials in the search bar and go to Credentials. 1. Go to Credentials -> OAuth client ID -> Select Desktop App from the Application type and give an appropriate name. -1. Download the credentials and fill "client_id", "client_secret" and "project_id" in - "secrets.toml". +1. Download the credentials and fill `client_id`, `client_secret`, and `project_id` in + `secrets.toml`. 1. 
Go back to credentials and select the OAuth consent screen on the left. -1. Fill in the App name, user support email(your email), authorized domain (localhost.com), and dev +1. Fill in the App name, user support email (your email), authorized domain (localhost.com), and dev contact info (your email again). 1. Add the following scope: @@ -92,7 +92,7 @@ follow these steps: 1. Generate `refresh_token`: - After configuring "client_id", "client_secret" and "project_id" in "secrets.toml". To generate + After configuring "client_id", "client_secret", and "project_id" in "secrets.toml". To generate the refresh token, run the following script from the root folder: ```sh @@ -104,6 +104,8 @@ follow these steps: ### Prepare your data + + #### Share Google Sheet with the email > Note: For service account authentication, use the client_email. For OAuth authentication, use the @@ -129,48 +131,46 @@ When setting up the pipeline, you can use either the browser-copied URL of your https://docs.google.com/spreadsheets/d/1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4/edit?usp=sharing ``` -or spreadsheet id (which is a part of the url) +or the spreadsheet ID (which is a part of the URL) ```sh 1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4 ``` -typically you pass it directly to the [google_spreadsheet function](#create-your-own-pipeline) or in [config.toml](#add-credentials) as defined here. +Typically, you pass it directly to the [google_spreadsheet function](#create-your-own-pipeline) or in [config.toml](#add-credentials) as defined here. - -You can provide specific ranges to `google_spreadsheet` pipeline, as detailed in following. +You can provide specific ranges to the `google_spreadsheet` pipeline, as detailed in the following. #### Guidelines about headers -Make sure your data has headers and is in the form of well-structured table. +Make sure your data has headers and is in the form of a well-structured table. The first row of any extracted range should contain headers. 
Please make sure: 1. The header names are strings and are unique. 1. All the columns that you intend to extract have a header. -1. The data starts exactly at the origin of the range - otherwise a source will remove padding, but it +1. The data starts exactly at the origin of the range - otherwise, a source will remove padding, but it is a waste of resources. > When a source detects any problems with headers or table layout, it will issue a WARNING in the > log. Hence, we advise running your pipeline script manually/locally and fixing all the problems. 1. Columns without headers will be removed and not extracted. 1. Columns with headers that do not contain any data will be removed. -1. If there are any problems with reading headers (i.e. header is not string or is empty or not +1. If there are any problems with reading headers (i.e., the header is not a string or is empty or not unique): the headers row will be extracted as data and automatic header names will be used. -1. Empty rows are ignored +1. Empty rows are ignored. 1. `dlt` will normalize range names and headers into table and column names - so they may be different in the database than in Google Sheets. Prefer small cap names without special characters. - #### Guidelines about named ranges -We recommend to use +We recommend using [Named Ranges](https://support.google.com/docs/answer/63175?hl=en&co=GENIE.Platform%3DDesktop) to indicate which data should be extracted from a particular spreadsheet, and this is how this source will work by default - when called without setting any other options. All the named ranges will be -converted into tables, named after them and stored in the destination. +converted into tables, named after them, and stored in the destination. -1. You can let the spreadsheet users add and remove tables by just adding/removing the ranges, +1. You can let the spreadsheet users add and remove tables by just adding or removing the ranges; you do not need to configure the pipeline again. 1. 
You can indicate exactly the fragments of interest, and only this data will be retrieved, so it is @@ -194,16 +194,16 @@ converted into tables, named after them and stored in the destination. If you are not happy with the workflow above, you can: -1. Disable it by setting `get_named_ranges` option to `False`. +1. Disable it by setting the `get_named_ranges` option to `False`. -1. Enable retrieving all sheets/tabs with get_sheets option set to `True`. +1. Enable retrieving all sheets/tabs with the get_sheets option set to `True`. 1. Pass a list of ranges as supported by Google Sheets in range_names. > Note: To retrieve all named ranges with "get_named_ranges" or all sheets with "get_sheets" > methods, pass an empty `range_names` list as `range_names = []`. Even when you use a set - > "get_named_ranges" to false pass the range_names as an empty list to get all the sheets with - > "get_sheets" method. + > "get_named_ranges" to false, pass the range_names as an empty list to get all the sheets with + > the "get_sheets" method. ### Initialize the verified source @@ -260,7 +260,7 @@ For more information, read the guide on [how to add a verified source](../../wal 1. Finally, enter credentials for your chosen destination as per the [docs](../destinations/). -1. Next you need to configure ".dlt/config.toml", which looks like: +1. 
Next, you need to configure ".dlt/config.toml", which looks like: ```toml [sources.google_sheets] @@ -277,13 +277,13 @@ For more information, read the guide on [how to add a verified source](../../wal spreadsheet_identifier = "https://docs.google.com/spreadsheets/d/1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4/edit?usp=sharing" ``` - or spreadsheet id (which is a part of the url) + or the spreadsheet ID (which is a part of the URL) ```toml spreadsheet_identifier="1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4" ``` -> Note: You have an option to pass "range_names" and "spreadsheet_identifier" directly to the +> Note: You have the option to pass "range_names" and "spreadsheet_identifier" directly to the > google_spreadsheet function or in ".dlt/config.toml" For more information, read the [General Usage: Credentials.](../../general-usage/credentials) @@ -320,7 +320,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug The `dlt` normalizer uses the first row of data to infer types and attempts to coerce subsequent rows, creating variant columns if unsuccessful. This is standard behavior. If `dlt` did not correctly determine the data type in the column, or you want to change the data type for other reasons, then you can provide a type hint for the affected column in the resource. -Also, since recently `dlt`'s no longer recognizing date and time types, so you have to designate it yourself as `timestamp`. +Also, `dlt` recently stopped recognizing date and time types, so you have to designate them yourself as `timestamp`. Use the `apply_hints` method on the resource to achieve this. Here's how you can do it: @@ -332,11 +332,11 @@ for resource in resources: "date": {"data_type": "timestamp"}, }) ``` -In this example, the `total_amount` column is enforced to be of type double and `date` is enforced to be of type timestamp.
+In this example, the `total_amount` column is enforced to be of type double, and `date` is enforced to be of type timestamp. This will ensure that all values in the `total_amount` column are treated as `double`, regardless of whether they are integers or decimals in the original Google Sheets data. -And `date` column will be represented as dates, not integers. +And the `date` column will be represented as dates, not integers. -For a single resource (e.g. `Sheet1`), you can simply use: +For a single resource (e.g., `Sheet1`), you can simply use: ```py source.Sheet1.apply_hints(columns={ "total_amount": {"data_type": "double"}, @@ -387,9 +387,9 @@ def google_spreadsheet( `credentials`: GCP credentials with Google Sheets API access. -`get_sheets`: If True, imports all spreadsheet sheets into the database. +`get_sheets`: If true, imports all spreadsheet sheets into the database. -`get_named_ranges`: If True, imports either all named ranges or those +`get_named_ranges`: If true, imports either all named ranges or those [specified](google_sheets.md#guidelines-about-named-ranges) into the database. ### Resource `range_names` @@ -412,7 +412,7 @@ headers, and data types as arguments. `write_disposition`: Dictates how data is loaded to the destination. -> Please Note: +> Please note: > > 1. Empty rows are ignored. > 1. Empty cells are converted to None (and then to NULL by dlt). @@ -420,7 +420,7 @@ headers, and data types as arguments. ### Resource `spreadsheet_info` -This resource loads the info about the sheets and range names into the destination as a table. +This resource loads the information about the sheets and range names into the destination as a table. This table refreshes after each load, storing information on loaded ranges: - Spreadsheet ID and title. @@ -440,18 +440,18 @@ dlt.resource( `name`: Denotes the table name, set here as "spreadsheet_info". -`write_disposition`: Dictates how data is loaded to the destination. 
-[Read more](https://dlthub.com/docs/general-usage/incremental-loading#the-3-write-dispositions). +`write_disposition`: Dictates how data is loaded into the destination. +[Read more](../../general-usage/incremental-loading#the-3-write-dispositions). -`merge_key`: Parameter is used to specify the column used to identify records for merging. In this -case,"spreadsheet_id", means that the records will be merged based on the values in this column. -[Read more](https://dlthub.com/docs/general-usage/incremental-loading#merge-incremental_loading). +`merge_key`: This parameter is used to specify the column used to identify records for merging. In this +case, "spreadsheet_id" means that the records will be merged based on the values in this column. +[Read more](../../general-usage/incremental-loading#merge-incremental_loading). ## Customization + ### Create your own pipeline -If you wish to create your own pipelines, you can leverage source and resource methods from this -verified source. +If you wish to create your own pipelines, you can leverage source and resource methods from this verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: @@ -467,7 +467,7 @@ verified source. ```py load_data = google_spreadsheet( - "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL + "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", # Spreadsheet URL range_names=["range_name1", "range_name2"], # Range names get_sheets=False, get_named_ranges=False, @@ -476,14 +476,13 @@ verified source. print(load_info) ``` - > Note: You can pass the URL or spreadsheet ID and range names explicitly or in - > ".dlt/config.toml". + > Note: You can pass the URL or spreadsheet ID and range names explicitly or in ".dlt/config.toml". -1. To load all the range_names from spreadsheet: +1. 
To load all the range_names from the spreadsheet: ```py load_data = google_spreadsheet( - "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL + "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", # Spreadsheet URL get_sheets=False, get_named_ranges=True, ) @@ -493,11 +492,11 @@ verified source. > Pass an empty list to range_names in ".dlt/config.toml" to retrieve all range names. -1. To load all the sheets from spreadsheet: +1. To load all the sheets from the spreadsheet: ```py load_data = google_spreadsheet( - "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL + "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", # Spreadsheet URL get_sheets=True, get_named_ranges=False, ) @@ -511,7 +510,7 @@ verified source. ```py load_data = google_spreadsheet( - "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL + "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", # Spreadsheet URL get_sheets=True, get_named_ranges=True, ) @@ -525,17 +524,17 @@ verified source. 
```py load_data1 = google_spreadsheet( - "https://docs.google.com/spreadsheets/d/43lkHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL + "https://docs.google.com/spreadsheets/d/43lkHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", # Spreadsheet URL range_names=["Sheet 1!A1:B10"], get_named_ranges=False, ) load_data2 = google_spreadsheet( - "https://docs.google.com/spreadsheets/d/3jo4HjqouQnnCIZAFa2rL6vT91YRN8aIhts22SKKO390/edit#gid=0", #Spreadsheet URL + "https://docs.google.com/spreadsheets/d/3jo4HjqouQnnCIZAFa2rL6vT91YRN8aIhts22SKKO390/edit#gid=0", # Spreadsheet URL range_names=["Sheet 1!B1:C10"], get_named_ranges=True, ) - load_info = pipeline.run([load_data1,load_data2]) + load_info = pipeline.run([load_data1, load_data2]) print(load_info) ``` @@ -543,7 +542,7 @@ verified source. ```py load_data = google_spreadsheet( - "https://docs.google.com/spreadsheets/d/43lkHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL + "https://docs.google.com/spreadsheets/d/43lkHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", # Spreadsheet URL range_names=["Sheet 1!A1:B10"], get_named_ranges=False, ) @@ -554,29 +553,29 @@ verified source. print(load_info) ``` -### Using Airflow with Google Spreadsheets: +### Using Airflow with Google spreadsheets -Consider the following when using Google Spreadsheets with Airflow: +Consider the following when using Google spreadsheets with Airflow: -`Efficient Data Retrieval` +`Efficient data retrieval` - Our source fetches all required data with just two API calls, regardless of the number of specified data ranges. This allows for swift data loading from google_spreadsheet before executing the pipeline. -`Airflow Specificity` +`Airflow specificity` - With Airflow, data source creation and execution are distinct processes. - If your execution environment (runner) is on a different machine, this might cause the data to be loaded twice, leading to inefficiencies. 
-`Airflow Helper Caution` +`Airflow helper caution` - Avoid using `scc decomposition` because it unnecessarily creates a new source instance for every specified data range. This is not efficient and can cause redundant tasks. -#### Recommended Airflow Deployment +#### Recommended Airflow deployment -Below is the correct way to set up an Airflow DAG for this purpose: +Below is the correct way to set up an Airflow DAG for this purpose: -- Define a DAG to run daily, starting from say February 1, 2023. It avoids catching up for missed runs and ensures only one instance runs at a time. +- Define a DAG to run daily, starting from February 1, 2023. It avoids catching up for missed runs and ensures only one instance runs at a time. -- Data is imported from Google Spreadsheets and directed BigQuery. +- Data is imported from Google spreadsheets and directed to BigQuery. - When adding the Google Spreadsheet task to the pipeline, avoid decomposing it; run it as a single task for efficiency. @@ -591,7 +590,7 @@ Below is the correct way to set up an Airflow DAG for this purpose: def get_named_ranges(): tasks = PipelineTasksGroup("get_named_ranges", use_data_folder=False, wipe_local_data=True) - # import your source from pipeline script + # Import your source from pipeline script from google_sheets import google_spreadsheet pipeline = dlt.pipeline( @@ -600,8 +599,9 @@ def get_named_ranges(): destination='bigquery', ) - # do not use decompose to run `google_spreadsheet` in single task + # Do not use decompose to run `google_spreadsheet` in single task tasks.add_run(pipeline, google_spreadsheet("1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580"), decompose="none", trigger_rule="all_done", retries=0, provide_context=True) ``` + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md b/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md index aac77b9b0a..3d7b577c0f 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md +++ 
b/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md @@ -24,29 +24,29 @@ Sources and resources that can be loaded using this verified source are: | get_messages | resource-transformer | Retrieves emails from the mailbox using given UIDs | | get_attachments | resource-transformer | Downloads attachments from emails using given UIDs | -## Setup Guide +## Setup guide ### Grab credentials 1. For verified source configuration, you need: - "host": IMAP server hostname (e.g., Gmail: imap.gmail.com, Outlook: imap-mail.outlook.com). - - "email_account": Associated email account name (e.g. dlthub@dlthub.com). + - "email_account": Associated email account name (e.g., dlthub@dlthub.com). - "password": APP password (for third-party clients) from the email provider. 2. Host addresses and APP password procedures vary by provider and can be found via a quick Google search. For Google Mail's app password, read [here](https://support.google.com/mail/answer/185833?hl=en#:~:text=An%20app%20password%20is%20a,2%2DStep%20Verification%20turned%20on). 3. However, this guide covers Gmail inbox configuration; similar steps apply to other providers. -### Accessing Gmail Inbox +### Accessing Gmail inbox 1. SMTP server DNS: 'imap.gmail.com' for Gmail. 2. Port: 993 (for internet messaging access protocol over TLS/SSL). -### Grab App password for Gmail +### Grab app password for Gmail 1. An app password is a 16-digit code allowing less secure apps/devices to access your Google Account, available only with 2-Step Verification activated. -#### Steps to Create and Use App Passwords: +#### Steps to create and use app passwords: 1. Visit your Google Account > Security. 2. Under "How you sign in to Google", enable 2-Step Verification. @@ -84,31 +84,25 @@ For more information, read the ### Add credential -1. Inside the `.dlt` folder, you'll find a file called `secrets.toml`, which is where you can - securely store your access tokens and other sensitive information. 
It's important to handle this - file with care and keep it safe. Here's what the file looks like: +1. Inside the `.dlt` folder, you'll find a file called `secrets.toml`, which is where you can securely store your access tokens and other sensitive information. It's important to handle this file with care and keep it safe. Here's what the file looks like: ```toml # put your secret values and credentials here - # do not share this file and do not push it to github + # do not share this file and do not push it to GitHub [sources.inbox] host = "Please set me up!" # The host address of the email service provider. email_account = "Please set me up!" # Email account associated with the service. - password = "Please set me up!" # # APP Password for the above email account. + password = "Please set me up!" # APP Password for the above email account. ``` -2. Replace the host, email, and password value with the [previously copied one](#grab-credentials) - to ensure secure access to your Inbox resources. +2. Replace the host, email, and password values with the [previously copied ones](#grab-credentials) to ensure secure access to your Inbox resources. > When adding the App Password, remove any spaces. For instance, "abcd efgh ijkl mnop" should be "abcdefghijklmnop". -3. Next, follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to - add credentials for your chosen destination, ensuring proper routing of your data to the final - destination. +3. Next, follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to add credentials for your chosen destination, ensuring proper routing of your data to the final destination. ## Run the pipeline -1. Before running the pipeline, ensure that you have installed all the necessary dependencies by - running the command: +1.
Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: ```sh pip install -r requirements.txt ``` @@ -123,20 +117,17 @@ For more information, read the For pdf parsing: - PyPDF2: `pip install PyPDF2` -2. Once the pipeline has finished running, you can verify that everything loaded correctly by using - the following command: +2. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: ```sh dlt pipeline show ``` - For example, the `pipeline_name` for the above pipeline example is `standard_inbox`, you may also - use any custom name instead. + For example, the `pipeline_name` for the above pipeline example is `standard_inbox`, but you may also use any custom name instead. For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline) ## Sources and resources -`dlt` works on the principle of [sources](../../general-usage/source) and -[resources](../../general-usage/resource). +`dlt` works on the principle of [sources](../../general-usage/source) and [resources](../../general-usage/resource). ### Source `inbox_source` @@ -170,7 +161,7 @@ def inbox_source( `start_date`: Start date to collect emails. Default: `/inbox/settings.py` 'DEFAULT_START_DATE'. -`filter_emails`:Email addresses for 'FROM' filtering. Default: `/inbox/settings.py` 'FILTER_EMAILS'. +`filter_emails`: Email addresses for 'FROM' filtering. Default: `/inbox/settings.py` 'FILTER_EMAILS'. `filter_by_mime_type`: MIME types for attachment filtering. Default: None. @@ -207,7 +198,7 @@ def get_messages( `items`: An iterable containing dictionaries with 'message_uid' representing the email message UIDs. -`include_body`: Includes email body if True. Default: True. +`include_body`: Includes the email body if True. Default: True. ### Resource `get_attachments_by_uid` @@ -261,7 +252,7 @@ verified source. # Print the loading details.
print(load_info) ``` - > Please refer to inbox_source() docstring for email filtering options by sender, date, or mime type. + > Please refer to the inbox_source() docstring for email filtering options by sender, date, or mime type. 3. To load messages from multiple emails, including "community@dlthub.com": ```py @@ -271,7 +262,7 @@ verified source. ``` 4. In `inbox_pipeline.py`, the `pdf_to_text` transformer extracts text from PDFs, treating each page as a separate data item. - Using the `pdf_to_text` function to load parsed pdfs from mail to the database: + Using the `pdf_to_text` function to load parsed PDFs from mail to the database: ```py filter_emails = ["mycreditcard@bank.com", "community@dlthub.com."] # Email senders diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/jira.md b/docs/website/docs/dlt-ecosystem/verified-sources/jira.md index b4e8bb76de..a5bdcee64e 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/jira.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/jira.md @@ -21,14 +21,14 @@ The endpoints that this verified source supports are: | Name | Description | | --------- | ---------------------------------------------------------------------------------------- | | issues | Individual pieces of work to be completed | -| users | Administrators of a given project | +| users | Administrators of a given project | | workflows | The key aspect of managing and tracking the progress of issues or tasks within a project | | projects | A collection of tasks that need to be completed to achieve a certain outcome | To get a complete list of sub-endpoints that can be loaded, see [jira/settings.py.](https://github.com/dlt-hub/verified-sources/blob/master/sources/jira/settings.py) -## Setup Guide +## Setup guide ### Grab credentials @@ -73,9 +73,7 @@ For more information, read the guide on [how to add a verified source](../../wal ### Add credentials -1. 
Inside the `.dlt` folder, you'll find a file called `secrets.toml`, where you can securely store - your access tokens and other sensitive information. It's important to handle this file with care - and keep it safe. +1. Inside the `.dlt` folder, you'll find a file called `secrets.toml`, where you can securely store your access tokens and other sensitive information. It's important to handle this file with care and keep it safe. Here's what the file looks like: @@ -87,24 +85,19 @@ For more information, read the guide on [how to add a verified source](../../wal api_token = "set me up!" # please set me up! ``` -1. A subdomain in a URL identifies your Jira account. For example, in - "https://example.atlassian.net", "example" is the subdomain. +1. A subdomain in a URL identifies your Jira account. For example, in "https://example.atlassian.net", "example" is the subdomain. 1. Use the email address associated with your Jira account. -1. Replace the "access_token" value with the [previously copied one](jira.md#grab-credentials) to - ensure secure access to your Jira account. +1. Replace the "api_token" value with the [previously copied one](jira.md#grab-credentials) to ensure secure access to your Jira account. -1. Next, follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to - add credentials for your chosen destination, ensuring proper routing of your data to the final - destination. +1. Next, follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to add credentials for your chosen destination, ensuring proper routing of your data to the final destination. For more information, read [General Usage: Credentials.](../../general-usage/credentials) ## Run the pipeline -1. Before running the pipeline, ensure that you have installed all the necessary dependencies by - running the command: +1. 
Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: ```sh pip install -r requirements.txt ``` @@ -112,26 +105,21 @@ For more information, read [General Usage: Credentials.](../../general-usage/cre ```sh python jira_pipeline.py ``` -1. Once the pipeline has finished running, you can verify that everything loaded correctly by using - the following command: +1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: ```sh dlt pipeline show ``` - For example, the `pipeline_name` for the above pipeline example is `jira_pipeline`. You may also - use any custom name instead. + For example, the `pipeline_name` for the above pipeline example is `jira_pipeline`. You may also use any custom name instead. For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources -`dlt` works on the principle of [sources](../../general-usage/source) and -[resources](../../general-usage/resource). +`dlt` works on the principle of [sources](../../general-usage/source) and [resources](../../general-usage/resource). ### Default endpoints -You can write your own pipelines to load data to a destination using this verified source. However, -it is important to note the complete list of the default endpoints given in -[jira/settings.py.](https://github.com/dlt-hub/verified-sources/blob/master/sources/jira/settings.py) +You can write your own pipelines to load data to a destination using this verified source. 
However, it is important to note the complete list of the default endpoints given in [jira/settings.py.](https://github.com/dlt-hub/verified-sources/blob/master/sources/jira/settings.py) ### Source `jira` @@ -153,8 +141,7 @@ def jira( ### Source `jira_search` -This function returns a resource for querying issues using JQL -[(Jira Query Language)](https://support.atlassian.com/jira-service-management-cloud/docs/use-advanced-search-with-jira-query-language-jql/). +This function returns a resource for querying issues using JQL [(Jira Query Language)](https://support.atlassian.com/jira-service-management-cloud/docs/use-advanced-search-with-jira-query-language-jql/). ```py @dlt.source @@ -166,8 +153,7 @@ def jira_search( ... ``` -The above function uses the same arguments `subdomain`, `email`, and `api_token` as described above -for the [jira source](jira.md#source-jira). +The above function uses the same arguments `subdomain`, `email`, and `api_token` as described above for the [jira source](jira.md#source-jira). ### Resource `issues` @@ -183,14 +169,12 @@ def issues(jql_queries: List[str]) -> Iterable[TDataItem]: `jql_queries`: Accepts a list of JQL queries. ## Customization + ### Create your own pipeline -If you wish to create your own pipelines, you can leverage source and resource methods as discussed -above. +If you wish to create your own pipelines, you can leverage source and resource methods as discussed above. -1. Configure the pipeline by specifying the pipeline name, destination, and dataset. To read more - about pipeline configuration, please refer to our documentation - [here](https://dlthub.com/docs/general-usage/pipeline): +1. Configure the pipeline by specifying the pipeline name, destination, and dataset. To read more about pipeline configuration, please refer to our documentation [here](../../general-usage/pipeline): ```py pipeline = dlt.pipeline( @@ -200,16 +184,15 @@ above. ) ``` -2. 
To load custom endpoints such as “issues” and “users” using the jira source function: +2. To load custom endpoints such as "issues" and "users" using the jira source function: ```py - #Run the pipeline - load_info = pipeline.run(jira().with_resources("issues","users")) + # Run the pipeline + load_info = pipeline.run(jira().with_resources("issues", "users")) print(f"Load Information: {load_info}") ``` -3. To load the custom issues using JQL queries, you can use custom queries. Here is an example - below: +3. To load custom issues using JQL queries, you can use custom queries. Here is an example below: ```py # Define the JQL queries as follows diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md b/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md index fe3c426819..a402e2c5f0 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md @@ -20,7 +20,7 @@ The resource that can be loaded: | ----------------- |--------------------------------------------| | kafka_consumer | Extracts messages from Kafka topics | -## Setup Guide +## Setup guide ### Grab Kafka cluster credentials @@ -96,7 +96,7 @@ sasl_password="example_secret" For more information, read the [Walkthrough: Run a pipeline](../../walkthroughs/run-a-pipeline). -:::info If you created a topic and start reading from it immedately, the brokers may be not yet synchronized and offset from which `dlt` reads messages may become invalid. In this case the resource will return no messages. Pending messages will be received on next run (or when brokers synchronize) +:::info If you created a topic and start reading from it immediately, the brokers may not yet be synchronized, and the offset from which `dlt` reads messages may become invalid. In this case, the resource will return no messages. Pending messages will be received on the next run (or when brokers synchronize). 
## Sources and resources @@ -126,7 +126,7 @@ def kafka_consumer( the `secrets.toml`. It may be used explicitly to pass an initialized Kafka Consumer object. -`msg_processor`: A function, which will be used to process every message +`msg_processor`: A function that will be used to process every message read from the given topics before saving them in the destination. It can be used explicitly to pass a custom processor. See the [default processor](https://github.com/dlt-hub/verified-sources/blob/fe8ed7abd965d9a0ca76d100551e7b64a0b95744/sources/kafka/helpers.py#L14-L50) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md b/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md index 296526b21a..4322c6806a 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md @@ -13,16 +13,16 @@ Matomo is a free and open-source web analytics platform that provides detailed i This Matomo `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/matomo_pipeline.py) -loads data using “Matomo API” to the destination of your choice. +loads data using the “Matomo API” to the destination of your choice. The endpoints that this verified source supports are: | Name | Description | | ----------------- |---------------------------------------------------------------------------------| -| matomo_reports | Detailed analytics summaries of website traffic, visitor behavior, and more | -| matomo_visits | Individual user sessions on your website, pages viewed, visit duration and more | +| matomo_reports | Detailed analytics summaries of website traffic, visitor behavior, and more | +| matomo_visits | Individual user sessions on your website, pages viewed, visit duration, and more | -## Setup Guide +## Setup guide ### Grab credentials @@ -35,8 +35,8 @@ The endpoints that this verified source supports are: 1. Click "Create New Token." 1. 
Your token is displayed. 1. Copy the access token and update it in the `.dlt/secrets.toml` file. -1. Your Matomo URL is the web address in your browser when logged into Matomo, typically "https://mycompany.matomo.cloud/". Update it in the `.dlt/config.toml`. -1. The site_id is a unique ID for each monitored site in Matomo, found in the URL or via Administration > Measureables > Manage under ID. +1. Your Matomo URL is the web address in your browser when logged into Matomo, typically "https://mycompany.matomo.cloud/". Update it in the `.dlt/config.toml`. +1. The site_id is a unique ID for each monitored site in Matomo, found in the URL or via Administration > Measurables > Manage under ID. > Note: The Matomo UI, which is described here, might change. The full guide is available at [this link.](https://developer.matomo.org/guides/authentication-in-depth) @@ -66,23 +66,18 @@ For more information, read the guide on [how to add a verified source](../../wal ### Add credential -1. Inside the `.dlt` folder, you'll find a file called `secrets.toml`, which is where you can - securely store your access tokens and other sensitive information. It's important to handle this - file with care and keep it safe. Here's what the file looks like: +1. Inside the `.dlt` folder, you'll find a file called `secrets.toml`, which is where you can securely store your access tokens and other sensitive information. It's important to handle this file with care and keep it safe. Here's what the file looks like: ```toml # put your secret values and credentials here - # do not share this file and do not push it to github + # do not share this file and do not push it to GitHub [sources.matomo] api_token= "access_token" # please set me up!" ``` -1. Replace the api_token value with the [previously copied one](matomo.md#grab-credentials) - to ensure secure access to your Matomo resources. +1. 
Replace the api_token value with the [previously copied one](matomo.md#grab-credentials) to ensure secure access to your Matomo resources. -1. Next, follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to - add credentials for your chosen destination, ensuring proper routing of your data to the final - destination. +1. Next, follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to add credentials for your chosen destination, ensuring proper routing of your data to the final destination. 1. Next, store your pipeline configuration details in the `.dlt/config.toml`. @@ -95,16 +90,15 @@ For more information, read the guide on [how to add a verified source](../../wal site_id = 0 # please set me up! live_events_site_id = 0 # please set me up! ``` -1. Replace the value of `url` and `site_id` with the one that [you copied above](matomo.md#grab-url-and-site_id). +1. Replace the value of `url` and `site_id` with the one that [you copied above](matomo.md#grab-url-and-site_id). -1. To monitor live events on a website, enter the `live_event_site_id` (usually it is same as `site_id`). +1. To monitor live events on a website, enter the `live_events_site_id` (usually it is the same as `site_id`). For more information, read the [General Usage: Credentials.](../../general-usage/credentials) ## Run the pipeline -1. Before running the pipeline, ensure that you have installed all the necessary dependencies by - running the command: +1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: ```sh pip install -r requirements.txt ``` @@ -112,20 +106,17 @@ For more information, read the [General Usage: Credentials.](../../general-usage ```sh python matomo_pipeline.py ``` -1. Once the pipeline has finished running, you can verify that everything loaded correctly by using - the following command: +1.
Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: ```sh dlt pipeline show ``` - For example, the `pipeline_name` for the above pipeline example is `matomo`, you may also - use any custom name instead. + For example, the `pipeline_name` for the above pipeline example is `matomo`. You may also use any custom name instead. For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources -`dlt` works on the principle of [sources](../../general-usage/source) and -[resources](../../general-usage/resource). +`dlt` works on the principle of [sources](../../general-usage/source) and [resources](../../general-usage/resource). ### Source `matomo_reports` @@ -144,17 +135,17 @@ def matomo_reports( `api_token`: API access token for Matomo server authentication, defaults to "./dlt/secrets.toml" -`url` : Matomo server URL, defaults to "./dlt/config.toml" +`url`: Matomo server URL, defaults to "./dlt/config.toml" `queries`: List of dictionaries containing info on what data to retrieve from Matomo API. `site_id`: Website's Site ID as per Matomo account. ->Note: This is an [incremental](https://dlthub.com/docs/general-usage/incremental-loading) source method and loads the "last_date" from the state of last pipeline run. +>Note: This is an [incremental](../../general-usage/incremental-loading) source method and loads the "last_date" from the state of the last pipeline run. -### Source `matomo_visits`: +### Source `matomo_visits` -The function loads visits from current day and the past `initial_load_past_days` in first run. In subsequent runs it continues from last load and skips active visits until closed. +The function loads visits from the current day and the past `initial_load_past_days` on the first run. In subsequent runs, it continues from the last load and skips active visits until they are closed.
```py def matomo_visits( @@ -183,7 +174,7 @@ def matomo_visits( `get_live_event_visitors`: Retrieve unique visitor data, defaulting to False. ->Note: This is an [incremental](https://dlthub.com/docs/general-usage/incremental-loading) source method and loads the "last_date" from the state of last pipeline run. +>Note: This is an [incremental](../../general-usage/incremental-loading) source method and loads the "last_date" from the state of the last pipeline run. ### Resource `get_last_visits` @@ -206,7 +197,7 @@ def get_last_visits( `site_id`: Unique ID for each Matomo site. -`last_date`: Last resource load date, if exists. +`last_date`: Last resource load date, if it exists. `visit_timeout_seconds`: Time (in seconds) until a session is inactive and deemed closed. Default: 1800. @@ -214,12 +205,13 @@ def get_last_visits( `rows_per_page`: Number of rows on each page. ->Note: This is an [incremental](https://dlthub.com/docs/general-usage/incremental-loading) resource method and loads the "last_date" from the state of last pipeline run. - +:::note +This is an [incremental](../../general-usage/incremental-loading) resource method and loads the "last_date" from the state of the last pipeline run. +::: ### Transformer `visitors` -This function, retrieves unique visit information from get_last_visits. +This function retrieves unique visit information from get_last_visits. ```py @dlt.transformer( @@ -244,8 +236,7 @@ def get_unique_visitors( ### Create your own pipeline -If you wish to create your own pipelines, you can leverage source and resource methods from this -verified source. +If you wish to create your own pipelines, you can leverage source and resource methods from this verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: @@ -257,8 +248,7 @@ verified source. ) ``` - To read more about pipeline configuration, please refer to our - [documentation](../../general-usage/pipeline). 
+ To read more about pipeline configuration, please refer to our [documentation](../../general-usage/pipeline). 1. To load the data from reports. @@ -267,7 +257,7 @@ verified source. load_info = pipeline_reports.run(data_reports) print(load_info) ``` - >"site_id" defined in ".dlt/config.toml" + > "site_id" defined in ".dlt/config.toml" 1. To load custom data from reports using queries. @@ -278,17 +268,17 @@ verified source. "methods": ["CustomReports.getCustomReport"], "date": "2023-01-01", "period": "day", - "extra_params": {"idCustomReport": 1}, #id of the report + "extra_params": {"idCustomReport": 1}, # ID of the report }, ] - site_id = 1 #id of the site for which reports are being loaded + site_id = 1 # ID of the site for which reports are being loaded load_data = matomo_reports(queries=queries, site_id=site_id) load_info = pipeline_reports.run(load_data) print(load_info) ``` - >You can pass queries and site_id in the ".dlt/config.toml" as well. + > You can pass queries and site_id in the ".dlt/config.toml" as well. 1. To load data from reports and visits. @@ -308,3 +298,4 @@ verified source. ``` + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/notion.md b/docs/website/docs/dlt-ecosystem/verified-sources/notion.md index 69e66ed2aa..061b2c565b 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/notion.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/notion.md @@ -14,7 +14,7 @@ professional tasks, offering customizable notes, documents, databases, and more. This Notion `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/notion_pipeline.py) -loads data using “Notion API” to the destination of your choice. +loads data using the “Notion API” to the destination of your choice. 
Sources that can be loaded using this verified source are: @@ -22,7 +22,7 @@ Sources that can be loaded using this verified source are: |------------------|---------------------------------------| | notion_databases | Retrieves data from Notion databases. | -## Setup Guide +## Setup guide ### Grab credentials @@ -90,7 +90,7 @@ For more information, read the guide on [how to add a verified source.](../../wa your chosen destination. This will ensure that your data is properly routed to its final destination. -For more information, read the [General Usage: Credentials.](../../general-usage/credentials) +For more information, read the [General usage: Credentials.](../../general-usage/credentials) ## Run the pipeline @@ -99,11 +99,11 @@ For more information, read the [General Usage: Credentials.](../../general-usage ```sh pip install -r requirements.txt ``` -1. You're now ready to run the pipeline! To get started, run the following command: +2. You're now ready to run the pipeline! To get started, run the following command: ```sh python notion_pipeline.py ``` -1. Once the pipeline has finished running, you can verify that everything loaded correctly by using +3. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: ```sh dlt pipeline show @@ -120,7 +120,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug ### Source `notion_databases` -This function loads notion databases from notion into the destination. +This function loads notion databases from Notion into the destination. ```py @dlt.source @@ -131,7 +131,7 @@ def notion_databases( ... ``` -`database_ids`: A list of dictionaries each containing a database id and a name. +`database_ids`: A list of dictionaries, each containing a database ID and a name. `api_key`: The Notion API secret key. @@ -161,7 +161,7 @@ verified source. 
To read more about pipeline configuration, please refer to our [documentation](../../general-usage/pipeline). -1. To load all the integrated databases: +2. To load all the integrated databases: ```py load_data = notion_databases() @@ -169,7 +169,7 @@ verified source. print(load_info) ``` -1. To load the custom databases: +3. To load the custom databases: ```py selected_database_ids = [{"id": "0517dae9409845cba7d","use_name":"db_one"}, {"id": "d8ee2d159ac34cfc"}] @@ -178,7 +178,7 @@ verified source. print(load_info) ``` - The Database ID can be retrieved from the URL. For example if the URL is: + The Database ID can be retrieved from the URL. For example, if the URL is: ```sh https://www.notion.so/d8ee2d159ac34cfc85827ba5a0a8ae71?v=c714dec3742440cc91a8c38914f83b6b @@ -193,3 +193,4 @@ The database name ("use_name") is optional; if skipped, the pipeline will fetch automatically. + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/openapi-generator.md b/docs/website/docs/dlt-ecosystem/verified-sources/openapi-generator.md index a987a55b15..a00a59a055 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/openapi-generator.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/openapi-generator.md @@ -9,7 +9,7 @@ import Header from './_source-info-header.md';
-Our OpenAPI source generator - `dlt-init-openapi` - generates [`dlt`](https://dlthub.com/docs) data pipelines from [OpenAPI 3.x specs](https://swagger.io/specification/) using the [rest_api verified source](./rest_api) to extract data from any REST API. If you are not familiar with the `rest_api` source, please read [rest_api](./rest_api) to learn how our `rest_api` source works. +Our OpenAPI source generator - `dlt-init-openapi` - generates [`dlt`](../../intro) data pipelines from [OpenAPI 3.x specs](https://swagger.io/specification/) using the [rest_api verified source](./rest_api) to extract data from any REST API. If you are not familiar with the `rest_api` source, please read [rest_api](./rest_api) to learn how our `rest_api` source works. :::tip We also have a cool [Google Colab example](https://colab.research.google.com/drive/1MRZvguOTZj1MlkEGzjiso8lQ_wr1MJRI?usp=sharing#scrollTo=LHGxzf1Ev_yr) that demonstrates this generator. 😎 @@ -66,7 +66,7 @@ We will create a simple example pipeline from a [PokeAPI spec](https://pokeapi.c dlt pipeline pokemon_pipeline show ``` -9. You can go to our docs at https://dlthub.com/docs to learn how to modify the generated pipeline to load to many destinations, place schema contracts on your pipeline, and many other things. +9. You can go to our docs at https://dlthub.com/docs to learn how to modify the generated pipeline to load to many destinations, place schema contracts on your pipeline, and many other things. :::note We used the `--global-limit 2` CLI flag to limit the requests to the PokeAPI @@ -94,12 +94,12 @@ pokemon_pipeline/ ``` :::warning -If you re-generate your pipeline, you will be prompted to continue if this folder exists. If you select yes, all generated files will be overwritten. All other files you may have created will remain in this folder. In non-interactive mode you will not be asked, and the generated files will be overwritten. 
+If you re-generate your pipeline, you will be prompted to continue if this folder exists. If you select yes, all generated files will be overwritten. All other files you may have created will remain in this folder. In non-interactive mode, you will not be asked, and the generated files will be overwritten. ::: ## A closer look at your `rest_api` dictionary in `pokemon/__init__.py` -This file contains the [configuration dictionary](./rest_api#source-configuration) for the rest_api source which is the main result of running this generator. For our Pokemon example, we have used an OpenAPI 3 spec that works out of the box. The result of this dictionary depends on the quality of the spec you are using, whether the API you are querying actually adheres to this spec, and whether our heuristics manage to find the right values. +This file contains the [configuration dictionary](./rest_api#source-configuration) for the rest_api source, which is the main result of running this generator. For our Pokemon example, we have used an OpenAPI 3 spec that works out of the box. The result of this dictionary depends on the quality of the spec you are using, whether the API you are querying actually adheres to this spec, and whether our heuristics manage to find the right values. The generated dictionary will look something like this: @@ -168,7 +168,7 @@ dlt-init-openapi pokemon --path ./path/to/my_spec.yml --no-interactive --output- **Options**: -_The only required options are either to supply a path or a URL to a spec_ +_The only required options are either to supply a path or a URL to a spec._ - `--url URL`: A URL to read the OpenAPI JSON or YAML file from. - `--path PATH`: A path to read the OpenAPI JSON or YAML file from locally. @@ -178,14 +178,14 @@ _The only required options are either to supply a path or a URL to a spec_ - `--log-level`: Set the logging level for stdout output, defaults to 20 (INFO). - `--global-limit`: Set a global limit on the generated source. 
- `--update-rest-api-source`: Update the locally cached rest_api verified source. -- `--allow-openapi-2`: Allows the use of OpenAPI v2. specs. Migration of the spec to 3.0 is recommended for better results though. +- `--allow-openapi-2`: Allows the use of OpenAPI v2 specs. Migration of the spec to 3.0 is recommended for better results, though. - `--version`: Show the installed version of the generator and exit. - `--help`: Show this message and exit. ## Config options You can pass a path to a config file with the `--config PATH` argument. To see available config values, go to https://github.com/dlt-hub/dlt-init-openapi/blob/devel/dlt_init_openapi/config.py and read the information below each field on the `Config` class. -The config file can be supplied as JSON or YAML dictionary. For example, to change the package name, you can create a YAML file: +The config file can be supplied as a JSON or YAML dictionary. For example, to change the package name, you can create a YAML file: ```yaml # config.yml @@ -199,7 +199,7 @@ $ dlt-init-openapi pokemon --url ... --config config.yml ``` ## Telemetry -We track your usage of this tool similar to how we track other commands in the dlt core library. Read more about this and how to disable it [here](../../reference/telemetry). +We track your usage of this tool similarly to how we track other commands in the dlt core library. Read more about this and how to disable it [here](../../reference/telemetry). ## Prior work This project started as a fork of [openapi-python-client](https://github.com/openapi-generators/openapi-python-client). Pretty much all parts are heavily changed or completely replaced, but some lines of code still exist, and we like to acknowledge the many good ideas we got from the original project :) @@ -207,4 +207,5 @@ This project started as a fork of [openapi-python-client](https://github.com/ope ## Implementation notes * OAuth Authentication currently is not natively supported. You can supply your own. 
* Per endpoint authentication currently is not supported by the generator. Only the first globally set securityScheme will be applied. You can add your own per endpoint if you need to. -* Basic OpenAPI 2.0 support is implemented. We recommend updating your specs at https://editor.swagger.io before using `dlt-init-openapi`. \ No newline at end of file +* Basic OpenAPI 2.0 support is implemented. We recommend updating your specs at https://editor.swagger.io before using `dlt-init-openapi`. + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/personio.md b/docs/website/docs/dlt-ecosystem/verified-sources/personio.md index 9829c94786..f638b5670c 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/personio.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/personio.md @@ -12,7 +12,7 @@ import Header from './_source-info-header.md'; Personio is a human resources management software that helps businesses streamline HR processes, including recruitment, employee data management, and payroll, in one platform. -Our [Personio verified](https://github.com/dlt-hub/verified-sources/blob/master/sources/personio) source loads data using Perosnio API to your preferred +Our [Personio verified](https://github.com/dlt-hub/verified-sources/blob/master/sources/personio) source loads data using the Personio API to your preferred [destination](../destinations). 
:::tip @@ -23,17 +23,17 @@ Resources that can be loaded using this verified source are: | Name | Description | Endpoint | |----------------------------|-----------------------------------------------------------------------------------|---------------------------------------------------| -| employees | Retrieves company employees details | /company/employees | +| employees | Retrieves company employees' details | /company/employees | | absences | Retrieves absence periods for absences tracked in days | /company/time-offs | -| absences_types | Retrieves list of various types of employee absences | /company/time-off-types | +| absences_types | Retrieves a list of various types of employee absences | /company/time-off-types | | attendances | Retrieves attendance records for each employee | /company/attendances | | projects | Retrieves a list of all company projects | /company/attendances/projects | | document_categories | Retrieves all document categories of the company | /company/document-categories | -| employees_absences_balance | The transformer, retrieves the absence balance for a specific employee | /company/employees/{employee_id}/absences/balance | +| employees_absences_balance | The transformer retrieves the absence balance for a specific employee | /company/employees/{employee_id}/absences/balance | | custom_reports_list | Retrieves metadata about existing custom reports (name, report type, report date) | /company/custom-reports/reports | | custom_reports | The transformer for custom reports | /company/custom-reports/reports/{report_id} | -## Setup Guide +## Setup guide ### Grab credentials @@ -42,7 +42,7 @@ To load data from Personio, you need to obtain API credentials, `client_id` and 1. Sign in to your Personio account, and ensure that your user account has API access rights. 1. Navigate to Settings > Integrations > API credentials. 1. Click on "Generate new credentials." -1. Assign necessary permissions to credentials, i.e. read access. +1. 
Assign necessary permissions to credentials, i.e., read access. :::info The Personio UI, which is described here, might change. The full guide is available at this [link.](https://developer.personio.de/docs#21-employee-attendance-and-absence-endpoints) @@ -173,8 +173,7 @@ def employees( `items_per_page`: Maximum number of items per page, defaults to 200. -`allow_external_schedulers`: A boolean that, if True, permits [external schedulers](../../general-usage/incremental-loading#using-airflow-schedule-for-backfill-and-incremental-loading) to manage incremental loading. - +`allow_external_schedulers`: A boolean that, if true, permits [external schedulers](../../general-usage/incremental-loading#using-airflow-schedule-for-backfill-and-incremental-loading) to manage incremental loading. Like the `employees` resource discussed above, other resources `absences` and `attendances` load data incrementally from the Personio API to your preferred destination. @@ -195,16 +194,16 @@ It is important to note that the data is loaded in `replace` mode where the exis completely replaced. In addition to the mentioned resource, -there are three more resources `projects`, `custom_reports_list` and `document_categories` -with similar behaviour. +there are three more resources `projects`, `custom_reports_list`, and `document_categories` +with similar behavior. ### Resource-transformer `employees_absences_balance` -Besides of these source and resource functions, there are two transformer functions +Besides these source and resource functions, there are two transformer functions for endpoints like `/company/employees/{employee_id}/absences/balance` and `/company/custom-reports/reports/{report_id}`. The transformer functions transform or process data from resources. -The transformer function `employees_absences_balance` process data from the `employees` resource. +The transformer function `employees_absences_balance` processes data from the `employees` resource. 
It fetches and returns a list of the absence balances for each employee. ```py @@ -219,7 +218,7 @@ def employees_absences_balance(employees_item: TDataItem) -> Iterable[TDataItem] ``` `employees_item`: The data item from the 'employees' resource. -It uses `@dlt.defer` decorator to enable parallel run in thread pool. +It uses the `@dlt.defer` decorator to enable parallel runs in a thread pool. ## Customization @@ -252,4 +251,3 @@ verified source. print(pipeline.run(load_data)) ``` - diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/pg_replication.md b/docs/website/docs/dlt-ecosystem/verified-sources/pg_replication.md index a12c831137..7934dd0067 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/pg_replication.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/pg_replication.md @@ -65,10 +65,10 @@ To get started with your data pipeline, follow these steps: dlt init pg_replication duckdb ``` - It will initialize [the pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/pg_replication_pipeline.py) with a Postgres replication as the [source](https://dlthub.com/docs/general-usage/source) and [DuckDB](https://dlthub.com/docs/dlt-ecosystem/destinations/duckdb) as the [destination](https://dlthub.com/docs/dlt-ecosystem/destinations). + It will initialize [the pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/pg_replication_pipeline.py) with a Postgres replication as the [source](../../general-usage/source) and [DuckDB](../../dlt-ecosystem/destinations/duckdb) as the [destination](../../dlt-ecosystem/destinations). -2. If you'd like to use a different destination, simply replace `duckdb` with the name of your preferred [destination](https://dlthub.com/docs/dlt-ecosystem/destinations). +2. If you'd like to use a different destination, simply replace `duckdb` with the name of your preferred [destination](../../dlt-ecosystem/destinations). 3.
This source uses `sql_database` source, you can init it as follows: @@ -81,7 +81,7 @@ To get started with your data pipeline, follow these steps: 4. After running these two commands, a new directory will be created with the necessary files and configuration settings to get started. - For more information, read the guide on [how to add a verified source](https://dlthub.com/docs/walkthroughs/add-a-verified-source). + For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source). :::note You can omit the `[sql.sources.credentials]` section in `secrets.toml` as it is not required. @@ -109,9 +109,9 @@ To get started with your data pipeline, follow these steps: sources.pg_replication.credentials="postgresql://username@password.host:port/database" ``` -3. Finally, follow the instructions in [Destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/) to add credentials for your chosen destination. This will ensure that your data is properly routed. +3. Finally, follow the instructions in [Destinations](../../dlt-ecosystem/destinations/) to add credentials for your chosen destination. This will ensure that your data is properly routed. -For more information, read the [Configuration section.](https://dlthub.com/docs/general-usage/credentials) +For more information, read the [Configuration section.](../../general-usage/credentials) ## Run the pipeline @@ -130,12 +130,12 @@ For more information, read the [Configuration section.](https://dlthub.com/docs/ For example, the `pipeline_name` for the above pipeline example is `pg_replication_pipeline`, you may also use any custom name instead. - For more information, read the guide on [how to run a pipeline](https://dlthub.com/docs/walkthroughs/run-a-pipeline). + For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). 
## Sources and resources -`dlt` works on the principle of [sources](https://dlthub.com/docs/general-usage/source) and [resources](https://dlthub.com/docs/general-usage/resource). +`dlt` works on the principle of [sources](../../general-usage/source) and [resources](../../general-usage/resource). ### Resource `replication_resource` diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md b/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md index d571e5d386..ae99bc3f18 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md @@ -30,7 +30,7 @@ Sources and resources that can be loaded using this verified source are: | stage | Specific step in a sales process where a deal resides based on its progress | | user | Individual with a unique login credential who can access and use the platform | -## Setup Guide +## Setup guide ### Grab API token @@ -77,7 +77,7 @@ For more information, read the guide on [how to add a verified source.](../../wa ```toml [sources.pipedrive.credentials] # Note: Do not share this file and do not push it to GitHub! - pipedrive_api_key = "PIPEDRIVE_API_TOKEN" # please set me up ! + pipedrive_api_key = "PIPEDRIVE_API_TOKEN" # please set me up! ``` 1. Replace `PIPEDRIVE_API_TOKEN` with the API token you [copied above](#grab-api-token). @@ -93,11 +93,11 @@ For more information, read the [General Usage: Credentials.](../../general-usage ```sh pip install -r requirements.txt ``` -1. You're now ready to run the pipeline! To get started, run the following command: +2. You're now ready to run the pipeline! To get started, run the following command: ```sh python pipedrive_pipeline.py ``` -1. Once the pipeline has finished running, you can verify that everything loaded correctly by using +3. 
Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: ```sh dlt pipeline show @@ -132,8 +132,8 @@ Pipedrive API. ### Source `pipedrive_source` -This function returns a list of resources including activities, deals, custom_fields_mapping and -other resources data from Pipedrive API. +This function returns a list of resources including activities, deals, custom_fields_mapping, and +other resources data from the Pipedrive API. ```py @dlt.source(name="pipedrive") @@ -146,8 +146,8 @@ def pipedrive_source( `pipedrive_api_key`: Authentication token for Pipedrive, configured in ".dlt/secrets.toml". -`since_timestamp`: Starting timestamp for incremental loading. By default, complete history is loaded - on the first run. And new data in subsequent runs. +`since_timestamp`: Starting timestamp for incremental loading. By default, the complete history is loaded + on the first run, and new data in subsequent runs. > Note: Incremental loading can be enabled or disabled depending on user preferences. @@ -167,7 +167,7 @@ for entity, resource_name in RECENTS_ENTITIES.items(): write_disposition="merge", )(entity, **resource_kwargs) - #yields endpoint_resources.values + # yields endpoint_resources.values ``` `entity and resource_name`: Key-value pairs from RECENTS_ENTITIES. @@ -198,8 +198,8 @@ def pipedrive_source(args): `write_disposition`: Sets the transformer to merge new data with existing data in the destination. -Similar to the transformer function "deals_participants" is another transformer function named -"deals_flow" that gets the flow of deals from the Pipedrive API, and then yields the result for +Similar to the transformer function "deals_participants," another transformer function named +"deals_flow" gets the flow of deals from the Pipedrive API and then yields the result for further processing or loading. ### Resource `create_state` @@ -225,7 +225,7 @@ entity exists. 
This updated state is then saved for future pipeline runs. Similar to the above functions, there are the following: `custom_fields_mapping`: Transformer function that parses and yields custom fields' mapping in order -to be stored in destination by dlt. +to be stored in the destination by dlt. `leads`: Resource function that incrementally loads Pipedrive leads by update_time. @@ -292,5 +292,3 @@ verified source. print(load_info) ``` - - diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/advanced.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/advanced.md index fa663b9ca5..27d2cc0b6e 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/advanced.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/advanced.md @@ -51,6 +51,9 @@ In this example, the source will ignore responses with a status code of 404, res #### Example B ```py +from requests.models import Response +from dlt.common import json + def set_encoding(response, *args, **kwargs): # sets the encoding in case it's not correctly detected response.encoding = 'windows-1252' diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md index e301128dc1..121769a11a 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md @@ -307,6 +307,7 @@ A resource configuration is used to define a [dlt resource](../../../general-usa - `write_disposition`: The write disposition for the resource. - `primary_key`: The primary key for the resource. - `include_from_parent`: A list of fields from the parent resource to be included in the resource output. See the [resource relationships](#include-fields-from-the-parent-resource) section for more details. +- `processing_steps`: A list of [processing steps](#processing-steps-filter-and-transform-data) to filter and transform the data. 
- `selected`: A flag to indicate if the resource is selected for loading. This could be useful when you want to load data only from child resources and not from the parent resource. You can also pass additional resource parameters that will be used to configure the dlt resource. See [dlt resource API reference](../../../api_reference/extract/decorators#resource) for more details. @@ -341,7 +342,7 @@ The fields in the endpoint configuration are: - `json`: The JSON payload to be sent with the request (for POST and PUT requests). - `paginator`: Pagination configuration for the endpoint. See the [pagination](#pagination) section for more details. - `data_selector`: A JSONPath to select the data from the response. See the [data selection](#data-selection) section for more details. -- `response_actions`: A list of actions that define how to process the response data. See the [response actions](#response-actions) section for more details. +- `response_actions`: A list of actions that define how to process the response data. See the [response actions](./advanced#response-actions) section for more details. - `incremental`: Configuration for [incremental loading](#incremental-loading). ### Pagination @@ -405,7 +406,7 @@ These are the available paginators: | `type` | Paginator class | Description | | ------------ | -------------- | ----------- | | `json_link` | [JSONLinkPaginator](../../../general-usage/http/rest-client.md#jsonresponsepaginator) | The link to the next page is in the body (JSON) of the response.
*Parameters:*
  • `next_url_path` (str) - the JSONPath to the next page URL
| -| `header_link` | [HeaderLinkPaginator](../../../general-usage/http/rest-client.md#headerlinkpaginator) | The links to the next page are in the response headers.
*Parameters:*
  • `link_header` (str) - the name of the header containing the links. Default is "next".
| +| `header_link` | [HeaderLinkPaginator](../../../general-usage/http/rest-client.md#headerlinkpaginator) | The links to the next page are in the response headers.
*Parameters:*
  • `links_next_key` (str) - the relation type (`rel`) that identifies the next page link in the `Link` response header. Default is "next".
| | `offset` | [OffsetPaginator](../../../general-usage/http/rest-client.md#offsetpaginator) | The pagination is based on an offset parameter. With total items count either in the response body or explicitly provided.
*Parameters:*
  • `limit` (int) - the maximum number of items to retrieve in each request
  • `offset` (int) - the initial offset for the first request. Defaults to `0`
  • `offset_param` (str) - the name of the query parameter used to specify the offset. Defaults to "offset"
  • `limit_param` (str) - the name of the query parameter used to specify the limit. Defaults to "limit"
  • `total_path` (str) - a JSONPath expression for the total number of items. If not provided, pagination is controlled by `maximum_offset` and `stop_after_empty_page`
  • `maximum_offset` (int) - optional maximum offset value. Limits pagination even without total count
  • `stop_after_empty_page` (bool) - Whether pagination should stop when a page contains no result items. Defaults to `True`
| | `page_number` | [PageNumberPaginator](../../../general-usage/http/rest-client.md#pagenumberpaginator) | The pagination is based on a page number parameter. With total pages count either in the response body or explicitly provided.
*Parameters:*
  • `base_page` (int) - the starting page number. Defaults to `0`
  • `page_param` (str) - the query parameter name for the page number. Defaults to "page"
  • `total_path` (str) - a JSONPath expression for the total number of pages. If not provided, pagination is controlled by `maximum_page` and `stop_after_empty_page`
  • `maximum_page` (int) - optional maximum page number. Stops pagination once this page is reached
  • `stop_after_empty_page` (bool) - Whether pagination should stop when a page contains no result items. Defaults to `True`
| | `cursor` | [JSONResponseCursorPaginator](../../../general-usage/http/rest-client.md#jsonresponsecursorpaginator) | The pagination is based on a cursor parameter. The value of the cursor is in the response body (JSON).
*Parameters:*
  • `cursor_path` (str) - the JSONPath to the cursor value. Defaults to "cursors.next"
  • `cursor_param` (str) - the query parameter name for the cursor. Defaults to "after"
| @@ -654,6 +655,151 @@ You can include data from the parent resource in the child resource by using the This will include the `id`, `title`, and `created_at` fields from the `issues` resource in the `issue_comments` resource data. The name of the included fields will be prefixed with the parent resource name and an underscore (`_`) like so: `_issues_id`, `_issues_title`, `_issues_created_at`. +### Define a resource which is not a REST endpoint + +Sometimes, we want to request endpoints with specific values that are not returned by another endpoint. +Thus, you can also include arbitrary dlt resources in your `RESTAPIConfig` instead of defining a resource for every path! + +In the following example, we want to load the issues belonging to three repositories. +Instead of defining three separate `issues` resources, one for each of the paths `dlt-hub/dlt/issues/`, `dlt-hub/verified-sources/issues/`, `dlt-hub/dlthub-education/issues/`, we have a resource `repositories` that yields a list of repository names, which are then used by the dependent resource `issues`. + +```py +from dlt.sources.rest_api import RESTAPIConfig + +@dlt.resource() +def repositories() -> Generator[List[Dict[str, Any]], Any, Any]: + """A seed list of repositories to fetch""" + yield [{"name": "dlt"}, {"name": "verified-sources"}, {"name": "dlthub-education"}] + + +config: RESTAPIConfig = { + "client": {"base_url": "https://github.com/api/v2"}, + "resources": [ + { + "name": "issues", + "endpoint": { + "path": "dlt-hub/{repository}/issues/", + "params": { + "repository": { + "type": "resolve", + "resource": "repositories", + "field": "name", + }, + }, + }, + }, + repositories(), + ], +} +``` + +Be careful: the parent resource must yield a list of dictionaries (`List[Dict[str, Any]]`), not individual dictionaries. 
Thus, the following will NOT work: + +```py +@dlt.resource +def repositories() -> Generator[Dict[str, Any]]: + """Not working seed list of repositories to fetch""" + yield from [{"name": "dlt"}, {"name": "verified-sources"}, {"name": "dlthub-education"}] +``` + +### Processing steps: filter and transform data + +The `processing_steps` field in the resource configuration allows you to apply transformations to the data fetched from the API before it is loaded into your destination. This is useful when you need to filter out certain records, modify the data structure, or anonymize sensitive information. + +Each processing step is a dictionary specifying the type of operation (`filter` or `map`) and the function to apply. Steps apply in the order they are listed. + +#### Quick example + +```py +def lower_title(record): + record["title"] = record["title"].lower() + return record + +config: RESTAPIConfig = { + "client": { + "base_url": "https://api.example.com", + }, + "resources": [ + { + "name": "posts", + "processing_steps": [ + {"filter": lambda x: x["id"] < 10}, + {"map": lower_title}, + ], + }, + ], +} +``` + +In the example above: + +- First, the `filter` step uses a lambda function to include only records where `id` is less than 10. +- Thereafter, the `map` step applies the `lower_title` function to each remaining record. + +#### Using `filter` + +The `filter` step allows you to exclude records that do not meet certain criteria. The provided function should return `True` to keep the record or `False` to exclude it: + +```py +{ + "name": "posts", + "endpoint": "posts", + "processing_steps": [ + {"filter": lambda x: x["id"] in [10, 20, 30]}, + ], +} +``` + +In this example, only records with `id` equal to 10, 20, or 30 will be included. + +#### Using `map` + +The `map` step allows you to modify the records fetched from the API. The provided function should take a record as an argument and return the modified record. 
For example, to anonymize the `email` field: + +```py +def anonymize_email(record): + record["email"] = "REDACTED" + return record + +config: RESTAPIConfig = { + "client": { + "base_url": "https://api.example.com", + }, + "resources": [ + { + "name": "users", + "processing_steps": [ + {"map": anonymize_email}, + ], + }, + ], +} +``` + +#### Combining `filter` and `map` + +You can combine multiple processing steps to achieve complex transformations: + +```py +{ + "name": "posts", + "endpoint": "posts", + "processing_steps": [ + {"filter": lambda x: x["id"] < 10}, + {"map": lower_title}, + {"filter": lambda x: "important" in x["title"]}, + ], +} +``` + +:::tip +#### Best practices +1. Order matters: Processing steps are applied in the order they are listed. Be mindful of the sequence, especially when combining `map` and `filter`. +2. Function definition: Define your filter and map functions separately for clarity and reuse. +3. Use `filter` to exclude records early in the process to reduce the amount of data that needs to be processed. +4. Combine consecutive `map` steps into a single function for faster execution. +::: + ## Incremental loading Some APIs provide a way to fetch only new or changed data (most often by using a timestamp field like `updated_at`, `created_at`, or incremental IDs). @@ -877,7 +1023,7 @@ See the [troubleshooting guide](../../../general-usage/incremental-loading.md#tr #### Getting HTTP 404 errors -Some API may return 404 errors for resources that do not exist or have no data. Manage these responses by configuring the `ignore` action in [response actions](#response-actions). +Some API may return 404 errors for resources that do not exist or have no data. Manage these responses by configuring the `ignore` action in [response actions](./advanced#response-actions). 
### Authentication issues diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/scrapy.md b/docs/website/docs/dlt-ecosystem/verified-sources/scrapy.md index 2e6b588c18..054193d77a 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/scrapy.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/scrapy.md @@ -9,7 +9,7 @@ keywords: [scraping, scraping verified source, scrapy] This verified source utilizes Scrapy, an open-source and collaborative framework for web scraping. Scrapy enables efficient extraction of required data from websites. -## Setup Guide +## Setup guide ### Initialize the verified source @@ -44,8 +44,8 @@ For more information, read the guide on start_urls = ["URL to be scraped"] # please set me up! start_urls_file = "/path/to/urls.txt" # please set me up! ``` - > When both `start_urls` and `start_urls_file` are provided they will be merged and deduplicated - > to ensure a Scrapy gets a unique set of start URLs. + > When both `start_urls` and `start_urls_file` are provided, they will be merged and deduplicated + > to ensure Scrapy gets a unique set of start URLs. 1. Inside the `.dlt` folder, you'll find a file called `secrets.toml`, which is where you can securely store your access tokens and other sensitive information. It's important to handle this @@ -61,7 +61,7 @@ For more information, read [Secrets and Configs.](../../general-usage/credential In this section, we demonstrate how to use the `MySpider` class defined in "scraping_pipeline.py" to scrape data from "https://quotes.toscrape.com/page/1/". -1. Start with configuring the `config.toml` as follows: +1. Start by configuring the `config.toml` as follows: ```toml [sources.scraping] @@ -85,12 +85,14 @@ scrape data from "https://quotes.toscrape.com/page/1/". ## Customization + + ### Create your own pipeline If you wish to create your data pipeline, follow these steps: 1. The first step requires creating a spider class that scrapes data - from the website. 
For example, class `Myspider` below scrapes data from + from the website. For example, the class `Myspider` below scrapes data from URL: "https://quotes.toscrape.com/page/1/". ```py @@ -153,7 +155,7 @@ If you wish to create your data pipeline, follow these steps: In the above example, scrapy settings are passed as a parameter. For more information about scrapy settings, please refer to the - [Scrapy documentation.](https://docs.scrapy.org/en/latest/topics/settings.html). + [Scrapy documentation](https://docs.scrapy.org/en/latest/topics/settings.html). 1. To limit the number of items processed, use the "on_before_start" function to set a limit on the resources the pipeline processes. For instance, setting the resource limit to two allows @@ -187,3 +189,4 @@ If you wish to create your data pipeline, follow these steps: scraping_host.pipeline_runner.scraping_resource.add_limit(2) scraping_host.run(dataset_name="quotes", write_disposition="append") ``` + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md index 38eda15c94..35b12bb64f 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md @@ -25,7 +25,7 @@ Sources and resources that can be loaded using this verified source are: | get_messages_resource | Retrieves all the messages for a given channel | | access_logs | Retrieves the access logs | -## Setup Guide +## Setup guide ### Grab user OAuth token @@ -204,7 +204,7 @@ def get_messages_resource( - `end_value`: Timestamp range end, defaulting to end_dt in slack_source. - - `allow_external_schedulers`: A boolean that, if True, permits [external schedulers](../../general-usage/incremental-loading#using-airflow-schedule-for-backfill-and-incremental-loading) to manage incremental loading. 
+ - `allow_external_schedulers`: A boolean that, if true, permits [external schedulers](../../general-usage/incremental-loading#using-airflow-schedule-for-backfill-and-incremental-loading) to manage incremental loading. ### Resource `access_logs` @@ -217,7 +217,7 @@ This method retrieves access logs from the Slack API. primary_key="user_id", write_disposition="append", ) -# it is not an incremental resource it just has a end_date filter +# It is not an incremental resource; it just has an end_date filter. def logs_resource() -> Iterable[TDataItem]: ... ``` @@ -232,8 +232,7 @@ def logs_resource() -> Iterable[TDataItem]: ## Customization ### Create your own pipeline -If you wish to create your own pipelines, you can leverage source and resource methods from this -verified source. +If you wish to create your own pipelines, you can leverage source and resource methods from this verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: @@ -252,7 +251,7 @@ verified source. # Enable below to load only 'access_logs', available for paid accounts only. # source.access_logs.selected = True - # It loads data starting from 1st September 2023 to 8th Sep 2023. + # It loads data starting from 1st September 2023 to 8th September 2023. load_info = pipeline.run(source) print(load_info) ``` @@ -270,7 +269,7 @@ verified source. start_date=datetime(2023, 9, 1), end_date=datetime(2023, 9, 8), ) - # It loads data starting from 1st September 2023 to 8th Sep 2023 from the channels: "general" and "random". + # It loads data starting from 1st September 2023 to 8th September 2023 from the channels: "general" and "random". 
load_info = pipeline.run(source) print(load_info) ``` diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md index 7ff08f8095..708b195456 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md @@ -14,7 +14,7 @@ import Header from '../_source-info-header.md'; Efficient data management often requires loading only new or updated data from your SQL databases, rather than reprocessing the entire dataset. This is where incremental loading comes into play. -Incremental loading uses a cursor column (e.g., timestamp or auto-incrementing ID) to load only data newer than a specified initial value, enhancing efficiency by reducing processing time and resource use. Read [here](https://dlthub.com/docs/walkthroughs/sql-incremental-configuration) for more details on incremental loading with `dlt`. +Incremental loading uses a cursor column (e.g., timestamp or auto-incrementing ID) to load only data newer than a specified initial value, enhancing efficiency by reducing processing time and resource use. Read [here](../../../walkthroughs/sql-incremental-configuration) for more details on incremental loading with `dlt`. #### How to configure @@ -27,48 +27,61 @@ certain range. #### Examples -**1. Incremental loading with the resource `sql_table`** - Consider a table "family" with a timestamp column `last_modified` that indicates when a row was last modified. 
To ensure that only rows modified after midnight (00:00:00) on January 1, 2024, are loaded, you would set `last_modified` timestamp as the cursor as follows: - ```py - from sql_database import sql_table - from datetime import datetime - - # Example: Incrementally loading a table based on a timestamp column - table = sql_table( - table='family', - incremental=dlt.sources.incremental( - 'last_modified', # Cursor column name - initial_value=pendulum.DateTime(2024, 1, 1, 0, 0, 0) # Initial cursor value - ) - ) - - info = pipeline.extract(table, write_disposition="merge") - print(info) - ``` - Behind the scene, the loader generates a SQL query filtering rows with `last_modified` values greater than the incremental value. In the first run, this is the initial value (midnight (00:00:00) January 1, 2024). - In subsequent runs, it is the latest value of `last_modified` that `dlt` stores in [state](https://dlthub.com/docs/general-usage/state). - -**2. Incremental loading with the source `sql_database`** - To achieve the same using the `sql_database` source, you would specify your cursor as follows: +1. **Incremental loading with the resource `sql_table`**. - ```py - source = sql_database().with_resources("family") - #using the "last_modified" field as an incremental field using initial value of midnight January 1, 2024 - source.family.apply_hints(incremental=dlt.sources.incremental("updated", initial_value=pendulum.DateTime(2022, 1, 1, 0, 0, 0))) - #running the pipeline - info = pipeline.run(source, write_disposition="merge") - print(info) - ``` - - :::info - * When using "merge" write disposition, the source table needs a primary key, which `dlt` automatically sets up. - * `apply_hints` is a powerful method that enables schema modifications after resource creation, like adjusting write disposition and primary keys. You can choose from various tables and use `apply_hints` multiple times to create pipelines with merged, appended, or replaced resources. 
- ::: + Consider a table "family" with a timestamp column `last_modified` that indicates when a row was last modified. To ensure that only rows modified after midnight (00:00:00) on January 1, 2024, are loaded, you would set `last_modified` timestamp as the cursor as follows: + + ```py + import dlt + from dlt.sources.sql_database import sql_table + from dlt.common.pendulum import pendulum + + # Example: Incrementally loading a table based on a timestamp column + table = sql_table( + table='family', + incremental=dlt.sources.incremental( + 'last_modified', # Cursor column name + initial_value=pendulum.DateTime(2024, 1, 1, 0, 0, 0) # Initial cursor value + ) + ) + + pipeline = dlt.pipeline(destination="duckdb") + info = pipeline.extract(table, write_disposition="merge") + print(info) + ``` + + Behind the scenes, the loader generates a SQL query filtering rows with `last_modified` values greater than the incremental value. In the first run, this is the initial value (midnight (00:00:00) January 1, 2024). + In subsequent runs, it is the latest value of `last_modified` that `dlt` stores in [state](../../../general-usage/state). + +2. **Incremental loading with the source `sql_database`**. + + To achieve the same using the `sql_database` source, you would specify your cursor as follows: + + ```py + import dlt + from dlt.sources.sql_database import sql_database + + source = sql_database().with_resources("family") + # using the "last_modified" field as the incremental cursor, with an initial value of midnight January 1, 2024 + source.family.apply_hints(incremental=dlt.sources.incremental("last_modified", initial_value=pendulum.DateTime(2024, 1, 1, 0, 0, 0))) + + # running the pipeline + pipeline = dlt.pipeline(destination="duckdb") + info = pipeline.run(source, write_disposition="merge") + print(info) + ``` + + :::info + * When using "merge" write disposition, the source table needs a primary key, which `dlt` automatically sets up. 
+ * `apply_hints` is a powerful method that enables schema modifications after resource creation, like adjusting write disposition and primary keys. You can choose from various tables and use `apply_hints` multiple times to create pipelines with merged, appended, or replaced resources. + ::: ## Parallelized extraction You can extract each table in a separate thread (no multiprocessing at this point). This will decrease loading time if your queries take time to execute or your network latency/speed is low. To enable this, declare your sources/resources as follows: ```py +from dlt.sources.sql_database import sql_database, sql_table + database = sql_database().parallelize() table = sql_table().parallelize() ``` @@ -83,7 +96,7 @@ The `reflection_level` argument controls how much information is reflected: - `reflection_level = "full"`: Column names, nullability, and data types are detected. For decimal types we always add precision and scale. **This is the default.** - `reflection_level = "full_with_precision"`: Column names, nullability, data types, and precision/scale are detected, also for types like text and binary. Integer sizes are set to bigint and to int for all other types. -If the SQL type is unknown or not supported by `dlt`, then, in the pyarrow backend, the column will be skipped, whereas in the other backends the type will be inferred directly from the data irrespective of the `reflection_level` specified. In the latter case, this often means that some types are coerced to strings and `dataclass` based values from sqlalchemy are inferred as `json` (JSON in most destinations). +If the SQL type is unknown or not supported by `dlt`, then, in the pyarrow backend, the column will be skipped, whereas in the other backends the type will be inferred directly from the data irrespective of the `reflection_level` specified. 
In the latter case, this often means that some types are coerced to strings and `dataclass` based values from sqlalchemy are inferred as `json` (JSON in most destinations). :::tip If you use reflection level **full** / **full_with_precision** you may encounter a situation where the data returned by sqlalchemy or pyarrow backend does not match the reflected data types. Most common symptoms are: 1. The destination complains that it cannot cast one type to another for a certain column. For example `connector-x` returns TIME in nanoseconds @@ -104,8 +117,9 @@ In the following example, when loading timestamps from Snowflake, you ensure tha ```py import dlt -from snowflake.sqlalchemy import TIMESTAMP_NTZ import sqlalchemy as sa +from dlt.sources.sql_database import sql_database, sql_table +from snowflake.sqlalchemy import TIMESTAMP_NTZ def type_adapter_callback(sql_type): if isinstance(sql_type, TIMESTAMP_NTZ): # Snowflake does not inherit from sa.DateTime @@ -142,9 +156,9 @@ The examples below show how you can set arguments in any of the `.toml` files (` [sources.sql_database.chat_message.incremental] cursor_path="updated_at" ``` - This is especially useful with `sql_table()` in a situation where you may want to run this resource for multiple tables. Setting parameters like this would then give you a clean way of maintaing separate configurations for each table. + This is especially useful with `sql_table()` in a situation where you may want to run this resource for multiple tables. Setting parameters like this would then give you a clean way of maintaining separate configurations for each table. -3. Handling separate configurations for database and individual tables +3. Handling separate configurations for database and individual tables When using the `sql_database()` source, you can separately configure the parameters for the database and for the individual tables. 
```toml [sources.sql_database] @@ -155,7 +169,7 @@ The examples below show how you can set arguments in any of the `.toml` files (` [sources.sql_database.chat_message.incremental] cursor_path="updated_at" - ``` + ``` The resulting source created below will extract data using **pandas** backend with **chunk_size** 1000. The table **chat_message** will load data incrementally using **updated_at** column. All the other tables will not use incremental loading, and will instead load the full data. @@ -163,9 +177,9 @@ The examples below show how you can set arguments in any of the `.toml` files (` database = sql_database() ``` -You'll be able to configure all the arguments this way (except adapter callback function). [Standard dlt rules apply](https://dlthub.com/docs/general-usage/credentials/configuration#configure-dlt-sources-and-resources). - -It is also possible to set these arguments as environment variables [using the proper naming convention](https://dlthub.com/docs/general-usage/credentials/config_providers#toml-vs-environment-variables): +You'll be able to configure all the arguments this way (except adapter callback function). [Standard dlt rules apply](../../../general-usage/credentials/setup). 
+ +It is also possible to set these arguments as environment variables [using the proper naming convention](../../../general-usage/credentials/setup#naming-convention): ```sh SOURCES__SQL_DATABASE__CREDENTIALS="mssql+pyodbc://loader.database.windows.net/dlt_data?trusted_connection=yes&driver=ODBC+Driver+17+for+SQL+Server" SOURCES__SQL_DATABASE__BACKEND=pandas diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/configuration.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/configuration.md index 88ea268378..6de2a02b31 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/configuration.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/configuration.md @@ -12,7 +12,7 @@ import Header from '../_source-info-header.md'; ## Configuring the SQL Database source -`dlt` sources are python scripts made up of source and resource functions that can be easily customized. The SQL Database verified source has the following built-in source and resource: +`dlt` sources are python scripts made up of source and resource functions that can be easily customized. The SQL Database verified source has the following built-in source and resource: 1. `sql_database`: a `dlt` source which can be used to load multiple tables and views from a SQL database 2. `sql_table`: a `dlt` resource that loads a single table from the SQL database @@ -20,16 +20,18 @@ Read more about sources and resources here: [General usage: source](../../../gen ### Example usage: -1. **Load all the tables from a database** - Calling `sql_database()` loads all tables from the database. +1. **Load all the tables from a database** + Calling `sql_database()` loads all tables from the database. 
```py - def load_entire_database() -> None: + import dlt + from dlt.sources.sql_database import sql_database + def load_entire_database() -> None: # Define the pipeline pipeline = dlt.pipeline( - pipeline_name="rfam", - destination='synapse', + pipeline_name="rfam", + destination='synapse', dataset_name="rfam_data" ) @@ -41,22 +43,24 @@ Read more about sources and resources here: [General usage: source](../../../gen # Print load info print(info) - ``` + ``` -2. **Load select tables from a database** - Calling `sql_database().with_resources("family", "clan")` loads only the tables `"family"` and `"clan"` from the database. +2. **Load select tables from a database** + Calling `sql_database().with_resources("family", "clan")` loads only the tables `"family"` and `"clan"` from the database. ```py - def load_select_tables_from_database() -> None: + import dlt + from dlt.sources.sql_database import sql_database + def load_select_tables_from_database() -> None: # Define the pipeline pipeline = dlt.pipeline( - pipeline_name="rfam", - destination="postgres", + pipeline_name="rfam", + destination="postgres", dataset_name="rfam_data" ) - # Fetch tables "family" and "clan" + # Fetch tables "family" and "clan" source = sql_database().with_resources("family", "clan") # Run the pipeline @@ -65,22 +69,24 @@ Read more about sources and resources here: [General usage: source](../../../gen # Print load info print(info) - ``` + ``` -3. **Load a standalone table** +3. 
**Load a standalone table** Calling `sql_table(table="family")` fetches only the table `"family"` ```py - def load_select_tables_from_database() -> None: + import dlt + from dlt.sources.sql_database import sql_table + def load_select_tables_from_database() -> None: # Define the pipeline pipeline = dlt.pipeline( - pipeline_name="rfam", - destination="duckdb", + pipeline_name="rfam", + destination="duckdb", dataset_name="rfam_data" ) - # Fetch the table "family" + # Fetch the table "family" table = sql_table(table="family") # Run the pipeline @@ -92,8 +98,8 @@ Read more about sources and resources here: [General usage: source](../../../gen ``` :::tip -We intend our sources to be fully hackable. Feel free to change the source code of the sources and resources to customize it to your needs. -::: +We intend our sources to be fully hackable. Feel free to change the source code of the sources and resources to customize it to your needs. +::: ## Configuring the connection @@ -106,12 +112,12 @@ We intend our sources to be fully hackable. Feel free to change the source code "dialect+database_type://username:password@server:port/database_name" ``` -For example, to connect to a MySQL database using the `pymysql` dialect you can use the following connection string: +For example, to connect to a MySQL database using the `pymysql` dialect you can use the following connection string: ```py "mysql+pymysql://rfamro:PWD@mysql-rfam-public.ebi.ac.uk:4497/Rfam" ``` -Database-specific drivers can be passed into the connection string using query parameters. For example, to connect to Microsoft SQL Server using the ODBC Driver, you would need to pass the driver as a query parameter as follows: +Database-specific drivers can be passed into the connection string using query parameters. 
For example, to connect to Microsoft SQL Server using the ODBC Driver, you would need to pass the driver as a query parameter as follows: ```py "mssql+pyodbc://username:password@server/database?driver=ODBC+Driver+17+for+SQL+Server" @@ -124,30 +130,32 @@ There are several options for adding your connection credentials into your `dlt` #### 1. Setting them in `secrets.toml` or as environment variables (Recommended) -You can set up credentials using [any method](https://dlthub.com/docs/devel/general-usage/credentials/setup#available-config-providers) supported by `dlt`. We recommend using `.dlt/secrets.toml` or the environment variables. See Step 2 of the [setup](./setup) for how to set credentials inside `secrets.toml`. For more information on passing credentials read [here](https://dlthub.com/docs/devel/general-usage/credentials/setup). +You can set up credentials using [any method](../../../general-usage/credentials/setup#available-config-providers) supported by `dlt`. We recommend using `.dlt/secrets.toml` or the environment variables. See Step 2 of the [setup](./setup) for how to set credentials inside `secrets.toml`. For more information on passing credentials read [here](../../../general-usage/credentials/setup). -#### 2. Passing them directly in the script +#### 2. Passing them directly in the script It is also possible to explicitly pass credentials inside the source. Example: + ```py from dlt.sources.credentials import ConnectionStringCredentials -from sql_database import sql_table +from dlt.sources.sql_database import sql_database credentials = ConnectionStringCredentials( "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam" ) -source = sql_table(credentials).with_resource("family") +source = sql_database(credentials).with_resources("family") ``` -:::note -It is recommended to configure credentials in `.dlt/secrets.toml` and to not include any sensitive information in the pipeline code. 
+:::note +It is recommended to configure credentials in `.dlt/secrets.toml` and to not include any sensitive information in the pipeline code. ::: ### Other connection options -#### Using SqlAlchemy Engine as credentials +#### Using SqlAlchemy Engine as credentials You are able to pass an instance of SqlAlchemy Engine instead of credentials: ```py +from dlt.sources.sql_database import sql_table from sqlalchemy import create_engine engine = create_engine("mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam") @@ -175,7 +183,10 @@ reflects the database table and preserves original types (i.e. **decimal** / **n Note that if `pandas` is installed, we'll use it to convert `SQLAlchemy` tuples into `ndarray` as it seems to be 20-30% faster than using `numpy` directly. ```py +import dlt import sqlalchemy as sa +from dlt.sources.sql_database import sql_database + pipeline = dlt.pipeline( pipeline_name="rfam_cx", destination="postgres", dataset_name="rfam_data_arrow" ) @@ -210,10 +221,13 @@ With the default settings, several data types will be coerced to dtypes in the y not to use the** `pandas` **backend if your source tables contain date, time, or decimal columns** ::: -Internally dlt uses `pandas.io.sql._wrap_result` to generate `pandas` frames. To adjust [pandas-specific settings,](https://pandas.pydata.org/docs/reference/api/pandas.read_sql_table.html) pass it in the `backend_kwargs` parameter. For example, below we set `coerce_float` to `False`: +Internally dlt uses `pandas.io.sql._wrap_result` to generate `pandas` frames. To adjust [pandas-specific settings,](https://pandas.pydata.org/docs/reference/api/pandas.read_sql_table.html) pass it in the `backend_kwargs` parameter. 
For example, below we set `coerce_float` to `False`: ```py +import dlt import sqlalchemy as sa +from dlt.sources.sql_database import sql_database + pipeline = dlt.pipeline( pipeline_name="rfam_cx", destination="postgres", dataset_name="rfam_data_pandas_2" ) @@ -249,7 +263,7 @@ There are certain limitations when using this backend: * JSON fields (at least those coming from postgres) are double wrapped in strings. To unwrap this, you can pass the in-built transformation function `unwrap_json_connector_x` (for example, with `add_map`): ```py - from sources.sql_database.helpers import unwrap_json_connector_x + from dlt.sources.sql_database.helpers import unwrap_json_connector_x ``` :::note @@ -259,7 +273,9 @@ There are certain limitations when using this backend: ```py """This example is taken from the benchmarking tests for ConnectorX performed on the UNSW_Flow dataset (~2mln rows, 25+ columns). Full code here: https://github.com/dlt-hub/sql_database_benchmarking""" import os +import dlt from dlt.destinations import filesystem +from dlt.sources.sql_database import sql_table unsw_table = sql_table( "postgresql://loader:loader@localhost:5432/dlt_data", diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/usage.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/usage.md index ee70e92ea0..bb2f39b007 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/usage.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/usage.md @@ -12,11 +12,13 @@ import Header from '../_source-info-header.md'; ## Applying column-wise filtering on the data being ingested -By default, the existing source and resource functions, `sql_database` and `sql_table`, ingest all of the records from the source table. But by using `query_adapter_callback`, it is possible to pass a `WHERE` clause inside the underlying `SELECT` statement using the [SQLAlchemy syntax](https://docs.sqlalchemy.org/en/14/core/selectable.html#). 
Thich enables filtering the data based on specific columns before extract. +By default, the existing source and resource functions, `sql_database` and `sql_table`, ingest all of the records from the source table. But by using `query_adapter_callback`, it is possible to pass a `WHERE` clause inside the underlying `SELECT` statement using the [SQLAlchemy syntax](https://docs.sqlalchemy.org/en/14/core/selectable.html#). This enables filtering the data based on specific columns before extraction. The example below uses `query_adapter_callback` to filter on the column `customer_id` for the table `orders`: ```py +from dlt.sources.sql_database import sql_database + def query_adapter_callback(query, table): if table.name == "orders": # Only select rows where the column customer_id has value 1 @@ -30,19 +32,21 @@ source = sql_database( ``` ## Transforming the data before load -You have direct access to the extracted data through the resource objects (`sql_table()` or `sql_database().with_resource())`), each of which represents a single SQL table. These objects are generators that yield -individual rows of the table which can be modified by using custom python functions. These functions can be applied to the resource using `add_map`. +You have direct access to the extracted data through the resource objects (`sql_table()` or `sql_database().with_resources()`), each of which represents a single SQL table. These objects are generators that yield +individual rows of the table which can be modified by using custom python functions. These functions can be applied to the resource using `add_map`. :::note The PyArrow backend does not yield individual rows rather loads chunks of data as `ndarray`. In this case, the transformation function that goes into `add_map` should be configured to expect an `ndarray` input. ::: - + Examples: -1. Pseudonymizing data to hide personally identifiable information (PII) before loading it to the destination. 
(See [here](https://dlthub.com/docs/general-usage/customising-pipelines/pseudonymizing_columns) for more information on pseudonymizing data with `dlt`) +1. Pseudonymizing data to hide personally identifiable information (PII) before loading it to the destination. (See [here](../../../general-usage/customising-pipelines/pseudonymizing_columns) for more information on pseudonymizing data with `dlt`) ```py + import dlt import hashlib + from dlt.sources.sql_database import sql_database def pseudonymize_name(doc): ''' @@ -65,7 +69,7 @@ Examples: # using sql_database source to load family table and pseudonymize the column "rfam_acc" source = sql_database().with_resources("family") # modify this source instance's resource - source = source.family.add_map(pseudonymize_name) + source.family.add_map(pseudonymize_name) # Run the pipeline. For a large db this may take a while info = pipeline.run(source, write_disposition="replace") print(info) @@ -74,6 +78,9 @@ Examples: 2. Excluding unnecessary columns before load ```py + import dlt + from dlt.sources.sql_database import sql_database + def remove_columns(doc): del doc["rfam_id"] return doc @@ -84,7 +91,7 @@ Examples: # using sql_database source to load family table and remove the column "rfam_id" source = sql_database().with_resources("family") # modify this source instance's resource - source = source.family.add_map(remove_columns) + source.family.add_map(remove_columns) # Run the pipeline. 
For a large db this may take a while info = pipeline.run(source, write_disposition="replace") print(info) @@ -92,11 +99,10 @@ Examples: ## Deploying the sql_database pipeline -You can deploy the `sql_database` pipeline with any of the `dlt` deployment methods, such as [GitHub Actions](https://dlthub.com/docs/walkthroughs/deploy-a-pipeline/deploy-with-github-actions), [Airflow](https://dlthub.com/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer), [Dagster](https://dlthub.com/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster) etc. See [here](https://dlthub.com/docs/walkthroughs/deploy-a-pipeline) for a full list of deployment methods. +You can deploy the `sql_database` pipeline with any of the `dlt` deployment methods, such as [GitHub Actions](../../../walkthroughs/deploy-a-pipeline/deploy-with-github-actions), [Airflow](../../../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer), [Dagster](../../../walkthroughs/deploy-a-pipeline/deploy-with-dagster) etc. See [here](../../../walkthroughs/deploy-a-pipeline) for a full list of deployment methods. ### Running on Airflow When running on Airflow: -1. Use the `dlt` [Airflow Helper](../../../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md#2-modify-dag-file) to create tasks from the `sql_database` source. (If you want to run table extraction in parallel, then you can do this by setting `decompose = "parallel-isolated"` when doing the source->DAG conversion. See [here](https://dlthub.com/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer#2-modify-dag-file) for code example.) +1. Use the `dlt` [Airflow Helper](../../../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md#2-modify-dag-file) to create tasks from the `sql_database` source. (If you want to run table extraction in parallel, then you can do this by setting `decompose = "parallel-isolated"` when doing the source->DAG conversion. 
See [here](../../../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer#2-modify-dag-file) for code example.) 2. Reflect tables at runtime with `defer_table_reflect` argument. 3. Set `allow_external_schedulers` to load data using [Airflow intervals](../../../general-usage/incremental-loading.md#using-airflow-schedule-for-backfill-and-incremental-loading). - diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md index fdbefeddf1..15f75ac313 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md @@ -13,25 +13,25 @@ import Header from './_source-info-header.md'; This Stripe `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/stripe_pipeline.py) -loads data using Stripe API to the destination of your choice. +loads data using the Stripe API to the destination of your choice. 
This verified source loads data from the following endpoints: -| Name | Description | +| Name | Description | |--------------------|--------------------------------------------| | Subscription | Recurring payment on Stripe | -| Account | User profile on Stripe | -| Coupon | Discount codes offered by businesses | -| Customer | Buyers using Stripe | -| Product | Items or services for sale | -| Price | Cost details for products or plans | -| Event | Significant activities in a Stripe account | -| Invoice | Payment request document | +| Account | User profile on Stripe | +| Coupon | Discount codes offered by businesses | +| Customer | Buyers using Stripe | +| Product | Items or services for sale | +| Price | Cost details for products or plans | +| Event | Significant activities in a Stripe account | +| Invoice | Payment request document | | BalanceTransaction | Funds movement record in Stripe | Please note that endpoints in the verified source can be customized as per the Stripe API [reference documentation.](https://stripe.com/docs/api) -## Setup Guide +## Setup guide ### Grab credentials @@ -89,8 +89,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage ## Run the pipeline -1. Before running the pipeline, ensure that you have installed all the necessary dependencies by - running the command: +1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: ```sh pip install -r requirements.txt @@ -102,26 +101,22 @@ For more information, read the [General Usage: Credentials.](../../general-usage python stripe_analytics_pipeline.py ``` -1. Once the pipeline has finished running, you can verify that everything loaded correctly by using - the following command: +1. 
Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: ```sh dlt pipeline show ``` - For example, the `pipeline_name` for the above pipeline example is `stripe_analytics`, you - may also use any custom name instead. + For example, the `pipeline_name` for the above pipeline example is `stripe_analytics`. You may also use any custom name instead. For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources -`dlt` works on the principle of [sources](../../general-usage/source) and -[resources](../../general-usage/resource). +`dlt` works on the principle of [sources](../../general-usage/source) and [resources](../../general-usage/resource). ### Default endpoints -You can write your own pipelines to load data to a destination using this verified source. -However, it is important to note is how the `ENDPOINTS` and `INCREMENTAL_ENDPOINTS` tuples are defined in `stripe_analytics/settings.py`. +You can write your own pipelines to load data to a destination using this verified source. However, it is important to note how the `ENDPOINTS` and `INCREMENTAL_ENDPOINTS` tuples are defined in `stripe_analytics/settings.py`. ```py # The most popular Stripe API's endpoints @@ -168,20 +163,19 @@ def incremental_stripe_source( ``` `endpoints`: Tuple containing incremental endpoint names. -`initial_start_date`: Parameter for incremental loading; data after initial_start_date is loaded on the first run (default: None). +`initial_start_date`: Parameter for incremental loading; data after the initial_start_date is loaded on the first run (default: None). `end_date`: End datetime for data loading (default: None). - After each run, 'initial_start_date' updates to the last loaded date. Subsequent runs then retrieve only new data using append mode, streamlining the process and preventing redundant data downloads. 
For more information, read the [Incremental loading](../../general-usage/incremental-loading). ## Customization + ### Create your own pipeline -If you wish to create your own pipelines, you can leverage source and resource methods from this -verified source. +If you wish to create your own pipelines, you can leverage source and resource methods from this verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: @@ -235,3 +229,4 @@ verified source. > To load data, maintain the pipeline name and destination dataset name. The pipeline name is vital for accessing the last run's [state](../../general-usage/state), which determines the incremental data load's end date. Altering these names can trigger a [“dev_mode”](../../general-usage/pipeline#do-experiments-with-dev-mode), disrupting the metadata (state) tracking for [incremental data loading](../../general-usage/incremental-loading). + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md index 9229ddca7e..73565f7e94 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md @@ -211,7 +211,7 @@ If you wish to create your own pipelines, you can leverage source and resource m verified source. To create your data pipeline using single loading and -[incremental data loading](https://dlthub.com/docs/general-usage/incremental-loading) (only for the +[incremental data loading](../../general-usage/incremental-loading) (only for the **Candidates** endpoint), follow these steps: 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: @@ -270,10 +270,10 @@ To create your data pipeline using single loading and 1. To use incremental loading for the candidates endpoint, maintain the same pipeline and destination dataset names. 
The pipeline name helps retrieve the - [state](https://dlthub.com/docs/general-usage/state) of the last run, essential for incremental + [state](../../general-usage/state) of the last run, essential for incremental data loading. Changing these names might trigger a - [“dev_mode”](https://dlthub.com/docs/general-usage/pipeline#do-experiments-with-dev-mode), + [“dev_mode”](../../general-usage/pipeline#do-experiments-with-dev-mode), disrupting metadata tracking for - [incremental data loading](https://dlthub.com/docs/general-usage/incremental-loading). + [incremental data loading](../../general-usage/incremental-loading). diff --git a/docs/website/docs/general-usage/credentials/advanced.md b/docs/website/docs/general-usage/credentials/advanced.md index 793f5c2a55..c25030a154 100644 --- a/docs/website/docs/general-usage/credentials/advanced.md +++ b/docs/website/docs/general-usage/credentials/advanced.md @@ -26,7 +26,7 @@ keywords: [credentials, secrets.toml, secrets, config, configuration, environmen ``` `dlt` allows the user to specify the argument `pipedrive_api_key` explicitly if, for some reason, they do not want to use [out-of-the-box options](setup) for credentials management. -1. Required arguments (without default values) **are never injected** and must be specified when calling. For example, for the source: +2. Required arguments (without default values) **are never injected** and must be specified when calling. For example, for the source: ```py @dlt.source @@ -35,7 +35,7 @@ keywords: [credentials, secrets.toml, secrets, config, configuration, environmen ``` The argument `channels_list` would not be injected and will output an error if it is not specified explicitly. -1. Arguments with default values are injected if present in config providers. Otherwise, defaults from the function signature are used. For example, for the source: +3. Arguments with default values are injected if present in config providers. 
Otherwise, defaults from the function signature are used. For example, for the source: ```py @dlt.source @@ -48,7 +48,7 @@ keywords: [credentials, secrets.toml, secrets, config, configuration, environmen ``` `dlt` firstly searches for all three arguments: `page_size`, `access_token`, and `start_date` in config providers in a [specific order](setup). If it cannot find them, it will use the default values. -1. Arguments with the special default value `dlt.secrets.value` and `dlt.config.value` **must be injected** +4. Arguments with the special default value `dlt.secrets.value` and `dlt.config.value` **must be injected** (or explicitly passed). If they are not found by the config providers, the code raises an exception. The code in the functions always receives those arguments. @@ -58,12 +58,12 @@ keywords: [credentials, secrets.toml, secrets, config, configuration, environmen We highly recommend adding types to your function signatures. The effort is very low, and it gives `dlt` much more -information on what source/resource expects. +information on what the source or resource expects. Doing so provides several benefits: -1. You'll never receive the invalid data types in your code. -1. `dlt` will automatically parse and coerce types for you, so you don't need to parse it yourself. +1. You'll never receive invalid data types in your code. +1. `dlt` will automatically parse and coerce types for you, so you don't need to parse them yourself. 1. `dlt` can generate sample config and secret files for your source automatically. 1. You can request [built-in and custom credentials](complex_types) (i.e., connection strings, AWS / GCP / Azure credentials). 1. You can specify a set of possible types via `Union`, i.e., OAuth or API Key authorization. @@ -94,7 +94,7 @@ Now, ## Toml files structure `dlt` arranges the sections of [toml files](setup/#secretstoml-and-configtoml) into a **default layout** that is expected by the [injection mechanism](#injection-mechanism). 
-This layout makes it easy to configure simple cases but also provides a room for more explicit sections and complex cases, i.e., having several sources with different credentials +This layout makes it easy to configure simple cases but also provides room for more explicit sections and complex cases, i.e., having several sources with different credentials or even hosting several pipelines in the same project sharing the same config and credentials. ```text @@ -158,7 +158,7 @@ dlt.config["sheet_id"] = "23029402349032049" dlt.secrets["destination.postgres.credentials"] = BaseHook.get_connection('postgres_dsn').extra ``` -Will mock the `toml` provider to desired values. +This will mock the `toml` provider to desired values. ## Example @@ -173,7 +173,7 @@ def google_sheets( credentials=dlt.secrets.value, only_strings=False ): - # Allow both a dictionary and a string passed as credentials + # Allow both a dictionary and a string to be passed as credentials if isinstance(credentials, str): credentials = json.loads(credentials) # Allow both a list and a comma-delimited string to be passed as tabs @@ -200,4 +200,5 @@ In the example above: :::tip `dlt.resource` behaves in the same way, so if you have a [standalone resource](../resource.md#declare-a-standalone-resource) (one that is not an inner function of a **source**) -::: \ No newline at end of file +::: + diff --git a/docs/website/docs/general-usage/credentials/complex_types.md b/docs/website/docs/general-usage/credentials/complex_types.md index 24915c1b2e..d14e031097 100644 --- a/docs/website/docs/general-usage/credentials/complex_types.md +++ b/docs/website/docs/general-usage/credentials/complex_types.md @@ -49,7 +49,7 @@ dsn="postgres://loader:loader@localhost:5432/dlt_data" ### Mixed form -If all credentials, but the password provided explicitly in the code, `dlt` will look for the password in `secrets.toml`. 
+If all credentials, except the password, are provided explicitly in the code, `dlt` will look for the password in `secrets.toml`. ```toml dsn.password="loader" @@ -125,10 +125,10 @@ credentials.add_scopes(["scope3", "scope4"]) `OAuth2Credentials` is a base class to implement actual OAuth; for example, it is a base class for [GcpOAuthCredentials](#gcpoauthcredentials). -### GCP Credentials +### GCP credentials #### Examples -* [Google Analytics verified source](https://github.com/dlt-hub/verified-sources/blob/master/sources/google_analytics/__init__.py): the example of how to use GCP Credentials. +* [Google Analytics verified source](https://github.com/dlt-hub/verified-sources/blob/master/sources/google_analytics/__init__.py): an example of how to use GCP Credentials. * [Google Analytics example](https://github.com/dlt-hub/verified-sources/blob/master/sources/google_analytics/setup_script_gcp_oauth.py): how you can get the refresh token using `dlt.secrets.value`. #### Types @@ -192,6 +192,7 @@ property_id = "213025502" The `GcpOAuthCredentials` class is responsible for handling OAuth2 credentials for desktop applications in Google Cloud Platform (GCP). It can parse native values either as `GoogleOAuth2Credentials` or as serialized OAuth client secrets JSON. This class provides methods for authentication and obtaining access tokens. ##### Usage + ```py oauth_credentials = GcpOAuthCredentials() @@ -201,7 +202,7 @@ oauth_credentials = GcpOAuthCredentials() native_value_oauth = {"client_secret": ...} oauth_credentials.parse_native_representation(native_value_oauth) ``` -or more preferred use: +Or more preferred use: ```py import dlt from dlt.sources.credentials import GcpOAuthCredentials @@ -215,7 +216,7 @@ def google_analytics( credentials.auth(scopes=["scope1", "scope2"]) # Retrieve native credentials for Google clients - # For example, build the service object for Google Analytics PI. + # For example, build the service object for Google Analytics API. 
client = BetaAnalyticsDataClient(credentials=credentials.to_native_credentials()) # Get a string representation of the credentials @@ -223,7 +224,7 @@ def google_analytics( credentials_str = str(credentials) ... ``` -while `secrets.toml` looks as follows: +While `secrets.toml` looks as follows: ```toml [sources.google_analytics.credentials] client_id = "client_id" # please set me up! @@ -231,7 +232,7 @@ client_secret = "client_secret" # please set me up! refresh_token = "refresh_token" # please set me up! project_id = "project_id" # please set me up! ``` -and `config.toml`: +And `config.toml`: ```toml [sources.google_analytics] property_id = "213025502" @@ -239,11 +240,9 @@ property_id = "213025502" In order for the `auth()` method to succeed: -- You must provide valid `client_id`, `client_secret`, `refresh_token`, and `project_id` to get a current **access token** and authenticate with OAuth. Keep in mind that the `refresh_token` must contain all the scopes that is required for your access. +- You must provide valid `client_id`, `client_secret`, `refresh_token`, and `project_id` to get a current **access token** and authenticate with OAuth. Keep in mind that the `refresh_token` must contain all the scopes that are required for your access. - If the `refresh_token` is not provided, and you run the pipeline from a console or a notebook, `dlt` will use InstalledAppFlow to run the desktop authentication flow. - - #### Defaults If configuration values are missing, `dlt` will use the default Google credentials (from `default()`) if available. Read more about [Google defaults.](https://googleapis.dev/python/google-auth/latest/user-guide.html#application-default-credentials) @@ -264,7 +263,7 @@ credentials.region_name = "us-east-1" ``` or ```py -# Imports an external boto3 session and sets the credentials properties accordingly. +# Imports an external botocore session and sets the credentials properties accordingly. 
import botocore.session credentials = AwsCredentials() @@ -306,7 +305,7 @@ bucket_url = "bucket_url" If configuration is not provided, `dlt` uses the default AWS credentials (from `.aws/credentials`) as present on the machine: -- It works by creating an instance of botocore Session. +- It works by creating an instance of a botocore Session. - If `profile_name` is specified, the credentials for that profile are used. If not, the default profile is used. ### AzureCredentials @@ -364,7 +363,7 @@ Example: ```py @dlt.source def zen_source(credentials: Union[ZenApiKeyCredentials, ZenEmailCredentials, str] = dlt.secrets.value, some_option: bool = False): - # Depending on what the user provides in config, ZenApiKeyCredentials or ZenEmailCredentials will be injected in the `credentials` argument. Both classes implement `auth` so you can always call it. + # Depending on what the user provides in config, ZenApiKeyCredentials or ZenEmailCredentials will be injected into the `credentials` argument. Both classes implement `auth` so you can always call it. credentials.auth() return dlt.resource([credentials], name="credentials") @@ -374,7 +373,7 @@ assert list(zen_source())[0].email == "mx" # Pass explicit native value assert list(zen_source("secret:🔑:secret"))[0].api_secret == "secret" -# pass explicit dict +# Pass explicit dict assert list(zen_source(credentials={"email": "emx", "password": "pass"}))[0].email == "emx" ``` @@ -383,26 +382,23 @@ This applies not only to credentials but to [all specs](#writing-custom-specs). ::: :::tip -Check out the [complete example](https://github.com/dlt-hub/dlt/blob/devel/tests/common/configuration/test_spec_union.py), to learn how to create unions -of credentials that derive from the common class, so you can handle it seamlessly in your code. 
+Check out the [complete example](https://github.com/dlt-hub/dlt/blob/devel/tests/common/configuration/test_spec_union.py), to learn how to create unions of credentials that derive from the common class, so you can handle it seamlessly in your code. ::: ## Writing custom specs -**Custom specifications** let you take full control over the function arguments. You can +**Custom specifications** let you take full control over the function arguments. You can: - Control which values should be injected, the types, default values. - Specify optional and final fields. - Form hierarchical configurations (specs in specs). -- Provide own handlers for `on_partial` (called before failing on missing config key) or `on_resolved`. -- Provide own native value parsers. -- Provide own default credentials logic. -- Utilise Python dataclass functionality. -- Utilise Python `dict` functionality (`specs` instances can be created from dicts and serialized - from dicts). +- Provide your own handlers for `on_partial` (called before failing on missing config key) or `on_resolved`. +- Provide your own native value parsers. +- Provide your own default credentials logic. +- Utilize Python dataclass functionality. +- Utilize Python `dict` functionality (`specs` instances can be created from dicts and serialized from dicts). -In fact, `dlt` synthesizes a unique spec for each decorated function. For example, in the case of `google_sheets`, the following -class is created: +In fact, `dlt` synthesizes a unique spec for each decorated function. 
For example, in the case of `google_sheets`, the following class is created: ```py from dlt.sources.config import configspec, with_config @@ -417,24 +413,19 @@ class GoogleSheetsConfiguration(BaseConfiguration): ### All specs derive from [BaseConfiguration](https://github.com/dlt-hub/dlt/blob/devel/dlt/common/configuration/specs/base_configuration.py#L170) This class serves as a foundation for creating configuration objects with specific characteristics: -- It provides methods to parse and represent the configuration - in native form (`parse_native_representation` and `to_native_representation`). +- It provides methods to parse and represent the configuration in native form (`parse_native_representation` and `to_native_representation`). - It defines methods for accessing and manipulating configuration fields. -- It implements a dictionary-compatible interface on top of the dataclass. -This allows instances of this class to be treated like dictionaries. +- It implements a dictionary-compatible interface on top of the dataclass. This allows instances of this class to be treated like dictionaries. -- It defines helper functions for checking if a certain attribute is present, -if a field is valid, and for calling methods in the method resolution order (MRO). +- It defines helper functions for checking if a certain attribute is present, if a field is valid, and for calling methods in the method resolution order (MRO). More information about this class can be found in the class docstrings. ### All credentials derive from [CredentialsConfiguration](https://github.com/dlt-hub/dlt/blob/devel/dlt/common/configuration/specs/base_configuration.py#L307) -This class is a subclass of `BaseConfiguration` -and is meant to serve as a base class for handling various types of credentials. -It defines methods for initializing credentials, converting them to native representations, -and generating string representations while ensuring sensitive information is appropriately handled. 
+This class is a subclass of `BaseConfiguration` and is meant to serve as a base class for handling various types of credentials. It defines methods for initializing credentials, converting them to native representations, and generating string representations while ensuring sensitive information is appropriately handled. + +More information about this class can be found in the class docstrings. -More information about this class can be found in the class docstrings. \ No newline at end of file diff --git a/docs/website/docs/general-usage/credentials/index.md b/docs/website/docs/general-usage/credentials/index.md index c9cbe6707c..95e0ec36ac 100644 --- a/docs/website/docs/general-usage/credentials/index.md +++ b/docs/website/docs/general-usage/credentials/index.md @@ -9,10 +9,11 @@ import DocCardList from '@theme/DocCardList'; 1. Environment variables 2. Configuration files (`secrets.toml` and `config.toml`) -3. Key managers and Vaults +3. Key managers and vaults `dlt` automatically extracts configuration settings and secrets based on flexible [naming conventions](setup/#naming-convention). It then [injects](advanced/#injection-mechanism) these values where needed in code. -# Learn Details About +# Learn details about + + - \ No newline at end of file diff --git a/docs/website/docs/general-usage/credentials/setup.md b/docs/website/docs/general-usage/credentials/setup.md index 7933bab183..5f05e68b6d 100644 --- a/docs/website/docs/general-usage/credentials/setup.md +++ b/docs/website/docs/general-usage/credentials/setup.md @@ -30,12 +30,12 @@ A custom config provider is helpful if you want to use your own configuration fi 1. [Default Argument Values](advanced#ingestion-mechanism): These are the values specified in the function's signature. :::tip -Please make sure your pipeline name contains no whitespace or any other punctuation characters except `"-"` and `"_"`. This way you will ensure your code is working with any configuration option. 
+Please make sure your pipeline name contains no whitespace or any other punctuation characters except `"-"` and `"_"`. This way, you will ensure your code is working with any configuration option. ::: ## Naming convention -`dlt` uses a specific naming hierarchy to search for the secrets and configs values. This makes configurations and secrets easy to manage. +`dlt` uses a specific naming hierarchy to search for the secrets and config values. This makes configurations and secrets easy to manage. To keep the naming convention flexible, `dlt` looks for a lot of possible combinations of key names, starting from the most specific possible path. Then, if the value is not found, it removes the right-most section and tries again. @@ -85,7 +85,7 @@ The most specific possible path for **destinations** looks like: ```sh -[.destination..credentials] +[.destination..credentials] ="some_value" ``` @@ -120,12 +120,12 @@ def deals(api_key: str = dlt.secrets.value): `dlt` will search for the following names in this order: 1. `sources.pipedrive.deals.api_key` -1. `sources.pipedrive.api_key` -1. `sources.api_key` -1. `api_key` +2. `sources.pipedrive.api_key` +3. `sources.api_key` +4. `api_key` :::tip -You can use your pipeline name to have separate configurations for each pipeline in your project. All config values will be looked with the pipeline name first and then again without it. +You can use your pipeline name to have separate configurations for each pipeline in your project. All config values will be looked up with the pipeline name first and then again without it.
```toml [pipeline_name_1.sources.google_sheets.credentials] @@ -156,10 +156,10 @@ or set up all parameters of connection separately: drivername="snowflake" username="user" password="password" -database = "database" -host = "service-account" -warehouse = "warehouse_name" -role = "role" +database="database" +host="service-account" +warehouse="warehouse_name" +role="role" ``` `dlt` can work with both ways and convert one to another. To learn more about which credential types are supported, visit the [complex credential types](./complex_types) page. @@ -177,7 +177,7 @@ export SOURCES__FACEBOOK_ADS__ACCESS_TOKEN="" Check out the [example](#examples) of setting up credentials through environment variables. :::tip -To organize development and securely manage environment variables for credentials storage, you can use the [python-dotenv](https://pypi.org/project/python-dotenv/) to automatically load variables from an `.env` file. +To organize development and securely manage environment variables for credentials storage, you can use [python-dotenv](https://pypi.org/project/python-dotenv/) to automatically load variables from an `.env` file. ::: ## Vaults @@ -187,7 +187,7 @@ For other vault integrations, you are welcome to [contact sales](https://dlthub. ## secrets.toml and config.toml -The TOML config provider in dlt utilizes two TOML files: +The TOML config provider in `dlt` utilizes two TOML files: `config.toml`: @@ -239,7 +239,7 @@ The TOML provider also has the capability to read files from `~/.dlt/` (located `dlt` organizes sections in TOML files in a specific structure required by the [injection mechanism](advanced/#injection-mechanism). Understanding this structure gives you more flexibility in setting credentials. For more details, see [Toml files structure](advanced/#toml-files-structure). 
-## Custom Providers +## Custom providers You can use the `CustomLoaderDocProvider` classes to supply a custom dictionary to `dlt` for use as a supplier of `config` and `secret` values. The code below demonstrates how to use a config stored in `config.json`. @@ -255,14 +255,14 @@ def load_config(): config_dict = json.load(f) # create the custom provider -provider = CustomLoaderDocProvider("my_json_provider",load_config) +provider = CustomLoaderDocProvider("my_json_provider", load_config) # register provider dlt.config.register_provider(provider) ``` :::tip -Check our an [example](../../examples/custom_config_provider) for a `yaml` based config provider that supports switchable profiles. +Check out an [example](../../examples/custom_config_provider) for a `yaml` based config provider that supports switchable profiles. ::: ## Examples @@ -324,8 +324,8 @@ export RUNTIME__LOG_LEVEL="INFO" export DESTINATION__FILESYSTEM__BUCKET_URL="s3://[your_bucket_name]" export NORMALIZE__DATA_WRITER__DISABLE_COMPRESSION="true" export SOURCE__NOTION__API_KEY="api_key" -export DESTINATION__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID="api_key" -export DESTINATION__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY="api_key" +export DESTINATION__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID="ABCDEFGHIJKLMNOPQRST" +export DESTINATION__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY="1234567890_access_key" ``` @@ -335,6 +335,8 @@ export DESTINATION__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY="api_key" ```py import os import dlt +import botocore.session +from dlt.common.credentials import AwsCredentials # you can freely set up configuration directly in the code @@ -345,7 +347,7 @@ os.environ["NORMALIZE__DATA_WRITER__DISABLE_COMPRESSION"] = "true" # or even directly to the dlt.config dlt.config["runtime.log_level"] = "INFO" -dlt.config["destination.filesystem.bucket_url"] = "INFO" +dlt.config["destination.filesystem.bucket_url"] = "s3://[your_bucket_name]" 
dlt.config["normalize.data_writer.disable_compression"] = "true" # but please, do not set up the secrets in the code! @@ -353,8 +355,6 @@ dlt.config["normalize.data_writer.disable_compression"] = "true" os.environ["SOURCE__NOTION__API_KEY"] = os.environ.get("NOTION_KEY") # or use a third-party credentials supplier -import botocore.session - credentials = AwsCredentials() session = botocore.session.get_session() credentials.parse_native_representation(session) @@ -365,6 +365,7 @@ dlt.secrets["destination.filesystem.credentials"] = credentials + ### Google credentials for both source and destination Let's assume we use the `bigquery` destination and the `google_sheets` source. They both use Google credentials and expect them to be configured under the `credentials` key. @@ -406,8 +407,8 @@ export CREDENTIALS__PROJECT_ID="" ```py import os -# do not set up the secrets directly in the code! -# what you can do is reassign env variables +# Do not set up the secrets directly in the code! +# What you can do is reassign env variables. 
os.environ["CREDENTIALS__CLIENT_EMAIL"] = os.environ.get("GOOGLE_CLIENT_EMAIL") os.environ["CREDENTIALS__PRIVATE_KEY"] = os.environ.get("GOOGLE_PRIVATE_KEY") os.environ["CREDENTIALS__PROJECT_ID"] = os.environ.get("GOOGLE_PROJECT_ID") @@ -431,13 +432,13 @@ os.environ["CREDENTIALS__PROJECT_ID"] = os.environ.get("GOOGLE_PROJECT_ID") ```toml -# google sheet credentials +# Google Sheet credentials [sources.credentials] client_email = "" private_key = "" project_id = "" -# bigquery credentials +# BigQuery credentials [destination.credentials] client_email = "" private_key = "" @@ -449,12 +450,12 @@ project_id = "" ```sh -# google sheet credentials +# Google Sheet credentials export SOURCES__CREDENTIALS__CLIENT_EMAIL="" export SOURCES__CREDENTIALS__PRIVATE_KEY="" export SOURCES__CREDENTIALS__PROJECT_ID="" -# bigquery credentials +# BigQuery credentials export DESTINATION__CREDENTIALS__CLIENT_EMAIL="" export DESTINATION__CREDENTIALS__PRIVATE_KEY="" export DESTINATION__CREDENTIALS__PROJECT_ID="" @@ -468,13 +469,13 @@ export DESTINATION__CREDENTIALS__PROJECT_ID="" import dlt import os -# do not set up the secrets directly in the code! -# what you can do is reassign env variables +# Do not set up the secrets directly in the code! +# What you can do is reassign env variables. os.environ["DESTINATION__CREDENTIALS__CLIENT_EMAIL"] = os.environ.get("BIGQUERY_CLIENT_EMAIL") os.environ["DESTINATION__CREDENTIALS__PRIVATE_KEY"] = os.environ.get("BIGQUERY_PRIVATE_KEY") os.environ["DESTINATION__CREDENTIALS__PROJECT_ID"] = os.environ.get("BIGQUERY_PROJECT_ID") -# or set them to the dlt.secrets +# Or set them to the dlt.secrets. dlt.secrets["sources.credentials.client_email"] = os.environ.get("SHEETS_CLIENT_EMAIL") dlt.secrets["sources.credentials.private_key"] = os.environ.get("SHEETS_PRIVATE_KEY") dlt.secrets["sources.credentials.project_id"] = os.environ.get("SHEETS_PROJECT_ID") @@ -513,23 +514,23 @@ Let's assume we have several different Google sources and destinations. 
We can u ```toml -# google sheet credentials +# Google Sheet credentials [sources.google_sheets.credentials] client_email = "" private_key = "" -project_id = "" +project_id = "" -# google analytics credentials +# Google Analytics credentials [sources.google_analytics.credentials] client_email = "" private_key = "" -project_id = "" +project_id = "" -# bigquery credentials +# BigQuery credentials [destination.bigquery.credentials] client_email = "" private_key = "" -project_id = "" +project_id = "" ``` @@ -537,17 +538,17 @@ project_id = "" ```sh -# google sheet credentials +# Google Sheet credentials export SOURCES__GOOGLE_SHEETS__CREDENTIALS__CLIENT_EMAIL="" export SOURCES__GOOGLE_SHEETS__CREDENTIALS__PRIVATE_KEY="" export SOURCES__GOOGLE_SHEETS__CREDENTIALS__PROJECT_ID="" -# google analytics credentials +# Google Analytics credentials export SOURCES__GOOGLE_ANALYTICS__CREDENTIALS__CLIENT_EMAIL="" export SOURCES__GOOGLE_ANALYTICS__CREDENTIALS__PRIVATE_KEY="" export SOURCES__GOOGLE_ANALYTICS__CREDENTIALS__PROJECT_ID="" -# bigquery credentials +# BigQuery credentials export DESTINATION__BIGQUERY__CREDENTIALS__CLIENT_EMAIL="" export DESTINATION__BIGQUERY__CREDENTIALS__PRIVATE_KEY="" export DESTINATION__BIGQUERY__CREDENTIALS__PROJECT_ID="" @@ -561,8 +562,8 @@ export DESTINATION__BIGQUERY__CREDENTIALS__PROJECT_ID="" import os import dlt -# do not set up the secrets directly in the code! -# what you can do is reassign env variables +# Do not set up the secrets directly in the code! 
+# What you can do is reassign env variables os.environ["SOURCES__GOOGLE_ANALYTICS__CREDENTIALS__CLIENT_EMAIL"] = os.environ.get("SHEETS_CLIENT_EMAIL") os.environ["SOURCES__GOOGLE_ANALYTICS__CREDENTIALS__PRIVATE_KEY"] = os.environ.get("ANALYTICS_PRIVATE_KEY") os.environ["SOURCES__GOOGLE_ANALYTICS__CREDENTIALS__PROJECT_ID"] = os.environ.get("ANALYTICS_PROJECT_ID") @@ -571,7 +572,7 @@ os.environ["DESTINATION__CREDENTIALS__CLIENT_EMAIL"] = os.environ.get("BIGQUERY_ os.environ["DESTINATION__CREDENTIALS__PRIVATE_KEY"] = os.environ.get("BIGQUERY_PRIVATE_KEY") os.environ["DESTINATION__CREDENTIALS__PROJECT_ID"] = os.environ.get("BIGQUERY_PROJECT_ID") -# or set them to the dlt.secrets +# Or set them to the dlt.secrets dlt.secrets["sources.credentials.client_email"] = os.environ.get("SHEETS_CLIENT_EMAIL") dlt.secrets["sources.credentials.private_key"] = os.environ.get("SHEETS_PRIVATE_KEY") dlt.secrets["sources.credentials.project_id"] = os.environ.get("SHEETS_PROJECT_ID") @@ -583,7 +584,7 @@ dlt.secrets["sources.credentials.project_id"] = os.environ.get("SHEETS_PROJECT_I ### Credentials for several sources of the same type -Let's assume we have several sources of the same type, how can we separate them in the `secrets.toml`? The recommended solution is to use different pipeline names for each source: +Let's assume we have several sources of the same type. How can we separate them in the `secrets.toml`? The recommended solution is to use different pipeline names for each source: __v_` where `original_name` is the existing column name (with data type clash) and `type` is the name of the data type stored in the variant. -## Load Packages and Load IDs +## Load packages and load IDs -Each execution of the pipeline generates one or more load packages. A load package typically contains data retrieved from -all the [resources](glossary.md#resource) of a particular [source](glossary.md#source). -These packages are uniquely identified by a `load_id`. 
The `load_id` of a particular package is added to the top data tables -(referenced as `_dlt_load_id` column in the example above) and to the special `_dlt_loads` table with a status of 0 (when the load process is fully completed). +Each execution of the pipeline generates one or more load packages. A load package typically contains data retrieved from all the [resources](glossary.md#resource) of a particular [source](glossary.md#source). These packages are uniquely identified by a `load_id`. The `load_id` of a particular package is added to the top data tables (referenced as `_dlt_load_id` column in the example above) and to the special `_dlt_loads` table with a status of 0 (when the load process is fully completed). To illustrate this, let's load more data into the same destination: @@ -173,8 +169,7 @@ data = [ ] ``` -The rest of the pipeline definition remains the same. Running this pipeline will create a new load -package with a new `load_id` and add the data to the existing tables. The `users` table will now look like this: +The rest of the pipeline definition remains the same. Running this pipeline will create a new load package with a new `load_id` and add the data to the existing tables. The `users` table will now look like this: **mydata.users** @@ -193,39 +188,21 @@ The `_dlt_loads` table will look like this: | 1234562350.98417 | quick_start | 0 | 2023-09-12 16:45:51.17865+00 | aOEb...Qekd/58= | | **1234563456.12345** | quick_start | 0 | 2023-09-12 16:46:03.10662+00 | aOEb...Qekd/58= | -The `_dlt_loads` table tracks complete loads and allows chaining transformations on top of them. -Many destinations do not support distributed and long-running transactions (e.g., Amazon Redshift). -In that case, the user may see the partially loaded data. It is possible to filter such data out: any -row with a `load_id` that does not exist in `_dlt_loads` is not yet completed. 
The same procedure may be used to identify -and delete data for packages that never got completed. +The `_dlt_loads` table tracks complete loads and allows chaining transformations on top of them. Many destinations do not support distributed and long-running transactions (e.g., Amazon Redshift). In that case, the user may see the partially loaded data. It is possible to filter such data out: any row with a `load_id` that does not exist in `_dlt_loads` is not yet completed. The same procedure may be used to identify and delete data for packages that never got completed. -For each load, you can test and [alert](../running-in-production/alerting.md) on anomalies (e.g., -no data, too much loaded to a table). There are also some useful load stats in the `Load info` tab -of the [Streamlit app](../dlt-ecosystem/visualizations/exploring-the-data.md#exploring-the-data) -mentioned above. +For each load, you can test and [alert](../running-in-production/alerting.md) on anomalies (e.g., no data, too much loaded to a table). There are also some useful load stats in the `Load info` tab of the [Streamlit app](../dlt-ecosystem/visualizations/exploring-the-data.md#exploring-the-data) mentioned above. -You can add [transformations](../dlt-ecosystem/transformations/) and chain them together -using the `status` column. You start the transformation for all the data with a particular -`load_id` with a status of 0 and then update it to 1. The next transformation starts with the status -of 1 and is then updated to 2. This can be repeated for every additional transformation. +You can add [transformations](../dlt-ecosystem/transformations/) and chain them together using the `status` column. You start the transformation for all the data with a particular `load_id` with a status of 0 and then update it to 1. The next transformation starts with the status of 1 and is then updated to 2. This can be repeated for every additional transformation. 
### Data lineage -Data lineage can be super relevant for architectures like the -[data vault architecture](https://www.data-vault.co.uk/what-is-data-vault/) or when troubleshooting. -The data vault architecture is a data warehouse that large organizations use when representing the -same process across multiple systems, which adds data lineage requirements. Using the pipeline name -and `load_id` provided out of the box by `dlt`, you are able to identify the source and time of data. +Data lineage can be super relevant for architectures like the [data vault architecture](https://www.data-vault.co.uk/what-is-data-vault/) or when troubleshooting. The data vault architecture is a data warehouse that large organizations use when representing the same process across multiple systems, which adds data lineage requirements. Using the pipeline name and `load_id` provided out of the box by `dlt`, you are able to identify the source and time of data. -You can [save](../running-in-production/running.md#inspect-and-save-the-load-info-and-trace) -complete lineage info for a particular `load_id` including a list of loaded files, error messages -(if any), elapsed times, schema changes. This can be helpful, for example, when troubleshooting -problems. +You can [save](../running-in-production/running.md#inspect-and-save-the-load-info-and-trace) complete lineage info for a particular `load_id` including a list of loaded files, error messages (if any), elapsed times, schema changes. This can be helpful, for example, when troubleshooting problems. ## Staging dataset -So far we've been using the `append` write disposition in our example pipeline. This means that -each time we run the pipeline, the data is appended to the existing tables. When you use the [merge write disposition](incremental-loading.md), dlt creates a staging database schema for staging data. 
This schema is named `_staging` [by default](https://dlthub.com/docs/devel/dlt-ecosystem/staging#staging-dataset) and contains the same tables as the destination schema. When you run the pipeline, the data from the staging tables is loaded into the destination tables in a single atomic transaction. +So far, we've been using the `append` write disposition in our example pipeline. This means that each time we run the pipeline, the data is appended to the existing tables. When you use the [merge write disposition](incremental-loading.md), dlt creates a staging database schema for staging data. This schema is named `_staging` [by default](../dlt-ecosystem/staging#staging-dataset) and contains the same tables as the destination schema. When you run the pipeline, the data from the staging tables is loaded into the destination tables in a single atomic transaction. Let's illustrate this with an example. We change our pipeline to use the `merge` write disposition: @@ -249,7 +226,7 @@ load_info = pipeline.run(users) ``` Running this pipeline will create a schema in the destination database with the name `mydata_staging`. -If you inspect the tables in this schema, you will find the `mydata_staging.users` table identical to the`mydata.users` table in the previous example. +If you inspect the tables in this schema, you will find the `mydata_staging.users` table identical to the `mydata.users` table in the previous example. Here is what the tables may look like after running the pipeline: @@ -272,10 +249,9 @@ Notice that the `mydata.users` table now contains the data from both the previou ## Dev mode (versioned) datasets -When you set the `dev_mode` argument to `True` in `dlt.pipeline` call, dlt creates a versioned dataset. +When you set the `dev_mode` argument to `True` in the `dlt.pipeline` call, dlt creates a versioned dataset. This means that each time you run the pipeline, the data is loaded into a new dataset (a new database schema). 
-The dataset name is the same as the `dataset_name` you provided in the pipeline definition with a -datetime-based suffix. +The dataset name is the same as the `dataset_name` you provided in the pipeline definition with a datetime-based suffix. We modify our pipeline to use the `dev_mode` option to see how this works: @@ -296,29 +272,27 @@ pipeline = dlt.pipeline( load_info = pipeline.run(data, table_name="users") ``` -Every time you run this pipeline, a new schema will be created in the destination database with a -datetime-based suffix. The data will be loaded into tables in this schema. -For example, the first time you run the pipeline, the schema will be named -`mydata_20230912064403`, the second time it will be named `mydata_20230912064407`, and so on. +Every time you run this pipeline, a new schema will be created in the destination database with a datetime-based suffix. The data will be loaded into tables in this schema. +For example, the first time you run the pipeline, the schema will be named `mydata_20230912064403`, the second time it will be named `mydata_20230912064407`, and so on. ## Loading data into existing tables not created by dlt You can also load data from `dlt` into tables that already exist in the destination dataset and were not created by `dlt`. -There are a few things to keep in mind when you are doing this: +There are a few things to keep in mind when doing this: -If you load data to a table that exists but does not contain any data, in most cases your load will succeed without problems. +If you load data into a table that exists but does not contain any data, in most cases, your load will succeed without problems. `dlt` will create the needed columns and insert the incoming data. `dlt` will only be aware of columns that exist on the -discovered or provided internal schema, so if you have columns in your destination, that are not anticipated by `dlt`, they -will remain in the destination but stay unknown to `dlt`. 
This will generally not be a problem. +discovered or provided internal schema, so if you have columns in your destination that are not anticipated by `dlt`, they +will remain in the destination but stay unknown to `dlt`. This generally will not be a problem. If your destination table already exists and contains columns that have the same name as columns discovered by `dlt` but -do not have matching datatypes, your load will fail and you will have to fix the column on the destination table first, +do not have matching datatypes, your load will fail, and you will have to fix the column on the destination table first, or change the column name in your incoming data to something else to avoid a collision. If your destination table exists and already contains data, your load might also initially fail, since `dlt` creates -special `non-nullable` columns that contains required mandatory metadata. Some databases will not allow you to create -`non-nullable` columns on tables that have data, since the initial value for these columns of the existing rows can -not be inferred. You will have to manually create these columns with the correct type on your existing tables and +special `non-nullable` columns that contain mandatory metadata. Some databases will not allow you to create +`non-nullable` columns on tables that have data, since the initial value for these columns of the existing rows cannot +be inferred. You will have to manually create these columns with the correct type on your existing tables and make them `nullable`, then fill in values for the existing rows. Some databases may allow you to create a new column that is `non-nullable` and take a default value for existing rows in the same command.
The columns you will need to create are: @@ -328,9 +302,10 @@ create are: | _dlt_load_id | text/string/varchar | | _dlt_id | text/string/varchar | -For nested tables you may also need to create: +For nested tables, you may also need to create: | name | type | | --- | --- | | _dlt_parent_id | text/string/varchar | -| _dlt_root_id | text/string/varchar | \ No newline at end of file +| _dlt_root_id | text/string/varchar | + diff --git a/docs/website/docs/general-usage/destination.md b/docs/website/docs/general-usage/destination.md index d88a0b53f2..fa133b6257 100644 --- a/docs/website/docs/general-usage/destination.md +++ b/docs/website/docs/general-usage/destination.md @@ -6,44 +6,44 @@ keywords: [destination, load data, configure destination, name destination] # Destination -[Destination](glossary.md#destination) is a location in which `dlt` creates and maintains the current version of the schema and loads your data. Destinations come in various forms: databases, datalakes, vector stores or files. `dlt` deals with this variety via modules which you declare when creating a pipeline. +[Destination](glossary.md#destination) is a location in which `dlt` creates and maintains the current version of the schema and loads your data. Destinations come in various forms: databases, datalakes, vector stores, or files. `dlt` deals with this variety via modules which you declare when creating a pipeline. We maintain a set of [built-in destinations](../dlt-ecosystem/destinations/) that you can use right away. ## Declare the destination type -We recommend that you declare the destination type when creating a pipeline instance with `dlt.pipeline`. This allows the `run` method to synchronize your local pipeline state with destination and `extract` and `normalize` to create compatible load packages and schemas. You can also pass the destination to `run` and `load` methods. +We recommend that you declare the destination type when creating a pipeline instance with `dlt.pipeline`. 
This allows the `run` method to synchronize your local pipeline state with the destination and `extract` and `normalize` to create compatible load packages and schemas. You can also pass the destination to the `run` and `load` methods. * Use destination **shorthand type** -Above we want to use **filesystem** built-in destination. You can use shorthand types only for built-ins. +Above, we want to use the **filesystem** built-in destination. You can use shorthand types only for built-ins. * Use full **destination factory type** -Above we use built in **filesystem** destination by providing a factory type `filesystem` from module `dlt.destinations`. You can pass [destinations from external modules](#declare-external-destination) as well. +Above, we use the built-in **filesystem** destination by providing a factory type `filesystem` from the module `dlt.destinations`. You can pass [destinations from external modules](#declare-external-destination) as well. * Import **destination factory** -Above we import destination factory for **filesystem** and pass it to the pipeline. +Above, we import the destination factory for **filesystem** and pass it to the pipeline. All examples above will create the same destination class with default parameters and pull required config and secret values from [configuration](credentials/index.md) - they are equivalent. ### Pass explicit parameters and a name to a destination -You can instantiate **destination factory** yourself to configure it explicitly. When doing this you work with destinations the same way you work with [sources](source.md) +You can instantiate the **destination factory** yourself to configure it explicitly. When doing this, you work with destinations the same way you work with [sources](source.md) -Above we import and instantiate the `filesystem` destination factory. We pass explicit url of the bucket and name the destination to `production_az_bucket`. 
+Above, we import and instantiate the `filesystem` destination factory. We pass the explicit URL of the bucket and name the destination `production_az_bucket`. -If destination is not named, its shorthand type (the Python factory name) serves as a destination name. Name your destination explicitly if you need several separate configurations of destinations of the same type (i.e. you wish to maintain credentials for development, staging and production storage buckets in the same config file). Destination name is also stored in the [load info](../running-in-production/running.md#inspect-and-save-the-load-info-and-trace) and pipeline traces so use them also when you need more descriptive names (other than, for example, `filesystem`). +If a destination is not named, its shorthand type (the Python factory name) serves as a destination name. Name your destination explicitly if you need several separate configurations of destinations of the same type (i.e., you wish to maintain credentials for development, staging, and production storage buckets in the same config file). The destination name is also stored in the [load info](../running-in-production/running.md#inspect-and-save-the-load-info-and-trace) and pipeline traces, so use them also when you need more descriptive names (other than, for example, `filesystem`). ## Configure a destination -We recommend to pass the credentials and other required parameters to configuration via TOML files, environment variables or other [config providers](credentials/setup). This allows you, for example, to easily switch to production destinations after deployment. +We recommend passing the credentials and other required parameters to configuration via TOML files, environment variables, or other [config providers](credentials/setup). This allows you, for example, to easily switch to production destinations after deployment. 
-We recommend to use the [default config section layout](credentials/setup#structure-of-secrets.toml-and-config.toml) as below: +We recommend using the [default config section layout](credentials/setup#structure-of-secrets.toml-and-config.toml) as below: or via environment variables: @@ -53,30 +53,32 @@ DESTINATION__FILESYSTEM__CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME=dltdata DESTINATION__FILESYSTEM__CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY="storage key" ``` -For named destinations you use their names in the config section +For named destinations, you use their names in the config section -Note that when you use [`dlt init` command](../walkthroughs/add-a-verified-source.md) to create or add a data source, `dlt` creates a sample configuration for selected destination. +Note that when you use the [`dlt init` command](../walkthroughs/add-a-verified-source.md) to create or add a data source, `dlt` creates a sample configuration for the selected destination. + + ### Pass explicit credentials -You can pass credentials explicitly when creating destination factory instance. This replaces the `credentials` argument in `dlt.pipeline` and `pipeline.load` methods - which is now deprecated. You can pass the required credentials object, its dictionary representation or the supported native form like below: +You can pass credentials explicitly when creating a destination factory instance. This replaces the `credentials` argument in `dlt.pipeline` and `pipeline.load` methods, which is now deprecated. You can pass the required credentials object, its dictionary representation, or the supported native form like below: :::tip -You can create and pass partial credentials and `dlt` will fill the missing data. Below we pass postgres connection string but without password and expect that it will be present in environment variables (or any other [config provider](credentials/setup)) +You can create and pass partial credentials, and `dlt` will fill in the missing data. 
Below, we pass a PostgreSQL connection string but without a password and expect that it will be present in environment variables (or any other [config provider](credentials/setup)). -Please read how to use [various built in credentials types](credentials/complex_types). +Please read how to use [various built-in credentials types](credentials/complex_types). ::: ### Inspect destination capabilities -[Destination capabilities](../walkthroughs/create-new-destination.md#3-set-the-destination-capabilities) tell `dlt` what given destination can and cannot do. For example it tells which file formats it can load, what is maximum query or identifier length. Inspect destination capabilities as follows: +[Destination capabilities](../walkthroughs/create-new-destination.md#3-set-the-destination-capabilities) tell `dlt` what a given destination can and cannot do. For example, they tell `dlt` which file formats the destination can load and what its maximum query or identifier length is. Inspect destination capabilities as follows: ```py import dlt pipeline = dlt.pipeline("snowflake_test", destination="snowflake") @@ -84,13 +86,13 @@ print(dict(pipeline.destination.capabilities())) ``` ### Pass additional parameters and change destination capabilities -Destination factory accepts additional parameters that will be used to pre-configure it and change destination capabilities. +The destination factory accepts additional parameters that will be used to pre-configure it and change destination capabilities. ```py import dlt duck_ = dlt.destinations.duckdb(naming_convention="duck_case", recommended_file_size=120000) print(dict(duck_.capabilities())) ``` -Example above is overriding `naming_convention` and `recommended_file_size` in the destination capabilities. +The example above is overriding the `naming_convention` and `recommended_file_size` in the destination capabilities. 
### Configure multiple destinations in a pipeline To configure multiple destinations within a pipeline, you need to provide the credentials for each destination in the "secrets.toml" file. This example demonstrates how to configure a BigQuery destination named `destination_one`: @@ -124,56 +126,56 @@ Similarly, you can assign multiple destinations to the same or different drivers ## Access a destination When loading data, `dlt` will access the destination in two cases: 1. At the beginning of the `run` method to sync the pipeline state with the destination (or if you call `pipeline.sync_destination` explicitly). -2. In the `pipeline.load` method - to migrate schema and load the load package. +2. In the `pipeline.load` method - to migrate the schema and load the load package. -Obviously, dlt will access the destination when you instantiate [sql_client](../dlt-ecosystem/transformations/sql.md). +Obviously, `dlt` will access the destination when you instantiate [sql_client](../dlt-ecosystem/transformations/sql.md). :::note -`dlt` will not import the destination dependencies or access destination configuration if access is not needed. You can build multi-stage pipelines where steps are executed in separate processes or containers - the `extract` and `normalize` step do not need destination dependencies, configuration and actual connection. +`dlt` will not import the destination dependencies or access destination configuration if access is not needed. You can build multi-stage pipelines where steps are executed in separate processes or containers - the `extract` and `normalize` steps do not need destination dependencies, configuration, or an actual connection. ::: -## Control how `dlt` creates table, column and other identifiers -`dlt` maps identifiers found in the source data into destination identifiers (ie. 
table and columns names) using [naming conventions](naming-convention.md) which ensure that -character set, identifier length and other properties fit into what given destination can handle. For example our [default naming convention (**snake case**)](naming-convention.md#default-naming-convention-snake_case) converts all names in the source (ie. JSON document fields) into snake case, case insensitive identifiers. +## Control how `dlt` creates table, column, and other identifiers +`dlt` maps identifiers found in the source data into destination identifiers (i.e., table and column names) using [naming conventions](naming-convention.md) which ensure that +character set, identifier length, and other properties fit into what the given destination can handle. For example, our [default naming convention (**snake case**)](naming-convention.md#default-naming-convention-snake_case) converts all names in the source (i.e., JSON document fields) into snake case, case-insensitive identifiers. -Each destination declares its preferred naming convention, support for case sensitive identifiers and case folding function that case insensitive identifiers follow. For example: -1. Redshift - by default does not support case sensitive identifiers and converts all of them to lower case. -2. Snowflake - supports case sensitive identifiers and considers upper cased identifiers as case insensitive (which is the default case folding) -3. DuckDb - does not support case sensitive identifiers but does not case fold them so it preserves the original casing in the information schema. -4. Athena - does not support case sensitive identifiers and converts all of them to lower case. -5. BigQuery - all identifiers are case sensitive, there's no case insensitive mode available via case folding (but it can be enabled in dataset level). +Each destination declares its preferred naming convention, support for case-sensitive identifiers, and case folding function that case-insensitive identifiers follow. 
For example: +1. Redshift - by default, does not support case-sensitive identifiers and converts all of them to lower case. +2. Snowflake - supports case-sensitive identifiers and considers upper-cased identifiers as case-insensitive (which is the default case folding). +3. DuckDB - does not support case-sensitive identifiers but does not case fold them, so it preserves the original casing in the information schema. +4. Athena - does not support case-sensitive identifiers and converts all of them to lower case. +5. BigQuery - all identifiers are case-sensitive; there's no case-insensitive mode available via case folding (but it can be enabled at the dataset level). -You can change the naming convention used in [many different ways](naming-convention.md#configure-naming-convention), below we set the preferred naming convention on the Snowflake destination to `sql_cs` to switch Snowflake to case sensitive mode: +You can change the naming convention used in [many different ways](naming-convention.md#configure-naming-convention). Below, we set the preferred naming convention on the Snowflake destination to `sql_cs_v1` to switch Snowflake to case-sensitive mode: ```py import dlt snow_ = dlt.destinations.snowflake(naming_convention="sql_cs_v1") ``` -Setting naming convention will impact all new schemas being created (ie. on first pipeline run) and will re-normalize all existing identifiers. +Setting the naming convention will impact all new schemas being created (i.e., on the first pipeline run) and will re-normalize all existing identifiers. :::caution -`dlt` prevents re-normalization of identifiers in tables that were already created at the destination. Use [refresh](pipeline.md#refresh-pipeline-data-and-state) mode to drop the data. You can also disable this behavior via [configuration](naming-convention.md#avoid-identifier-collisions) +`dlt` prevents re-normalization of identifiers in tables that were already created at the destination. 
Use [refresh](pipeline.md#refresh-pipeline-data-and-state) mode to drop the data. You can also disable this behavior via [configuration](naming-convention.md#avoid-identifier-collisions). ::: :::note -Destinations that support case sensitive identifiers but use case folding convention to enable case insensitive identifiers are configured in case insensitive mode by default. Examples: Postgres, Snowflake, Oracle. +Destinations that support case-sensitive identifiers but use a case folding convention to enable case-insensitive identifiers are configured in case-insensitive mode by default. Examples: Postgres, Snowflake, Oracle. ::: :::caution -If you use case sensitive naming convention with case insensitive destination, `dlt` will: -1. Fail the load if it detects identifier collision due to case folding +If you use a case-sensitive naming convention with a case-insensitive destination, `dlt` will: +1. Fail the load if it detects an identifier collision due to case folding. 2. Warn if any case folding is applied by the destination. ::: -### Enable case sensitive identifiers support -Selected destinations may be configured so they start accepting case sensitive identifiers. For example, it is possible to set case sensitive collation on **mssql** database and then tell `dlt` about it. +### Enable case-sensitive identifiers support +Selected destinations may be configured so they start accepting case-sensitive identifiers. For example, it is possible to set case-sensitive collation on an **mssql** database and then tell `dlt` about it. ```py from dlt.destinations import mssql dest_ = mssql(has_case_sensitive_identifiers=True, naming_convention="sql_cs_v1") ``` -Above we can safely use case sensitive naming convention without worrying of name collisions. +Above, we can safely use a case-sensitive naming convention without worrying about name collisions. You can configure the case sensitivity, **but configuring destination capabilities is not currently supported**. 
```toml @@ -182,10 +184,11 @@ has_case_sensitive_identifiers=true ``` :::note -In most cases setting the flag above just indicates to `dlt` that you switched the case sensitive option on a destination. `dlt` will not do that for you. Refer to destination documentation for details. +In most cases, setting the flag above just indicates to `dlt` that you switched the case-sensitive option on a destination. `dlt` will not do that for you. Refer to the destination documentation for details. ::: -## Create new destination +## Create a new destination You have two ways to implement a new destination: -1. You can use `@dlt.destination` decorator and [implement a sink function](../dlt-ecosystem/destinations/destination.md). This is perfect way to implement reverse ETL destinations that push data back to REST APIs. -2. You can implement [a full destination](../walkthroughs/create-new-destination.md) where you have a full control over load jobs and schema migration. +1. You can use the `@dlt.destination` decorator and [implement a sink function](../dlt-ecosystem/destinations/destination.md). This is a perfect way to implement reverse ETL destinations that push data back to REST APIs. +2. You can implement [a full destination](../walkthroughs/create-new-destination.md) where you have full control over load jobs and schema migration. + diff --git a/docs/website/docs/general-usage/full-loading.md b/docs/website/docs/general-usage/full-loading.md index 434615fecf..b252fbef92 100644 --- a/docs/website/docs/general-usage/full-loading.md +++ b/docs/website/docs/general-usage/full-loading.md @@ -5,9 +5,7 @@ keywords: [full loading, loading methods, replace] --- # Full loading -Full loading is the act of fully reloading the data of your tables. All existing data -will be removed and replaced by whatever the source produced on this run. Resources -that are not selected while performing a full load will not replace any data in the destination. 
+Full loading is the act of fully reloading the data of your tables. All existing data will be removed and replaced by whatever the source produced on this run. Resources that are not selected while performing a full load will not replace any data in the destination. ## Performing a full load @@ -20,51 +18,41 @@ reactions = ["%2B1", "-1", "smile", "tada", "thinking_face", "heart", "rocket", for reaction in reactions: for page_no in range(1, 3): page = requests.get(f"https://api.github.com/repos/{repo}/issues?state=all&sort=reactions-{reaction}&per_page=100&page={page_no}", headers=headers) - print(f"got page for {reaction} page {page_no}, requests left", page.headers["x-ratelimit-remaining"]) + print(f"Got page for {reaction} page {page_no}, requests left", page.headers["x-ratelimit-remaining"]) issues.extend(page.json()) p.run(issues, write_disposition="replace", primary_key="id", table_name="issues") ``` ## Choosing the correct replace strategy for your full load -`dlt` implements three different strategies for doing a full load on your table: `truncate-and-insert`, `insert-from-staging` and `staging-optimized`. The exact behaviour of these strategies can also vary between the available destinations. +dlt implements three different strategies for doing a full load on your table: `truncate-and-insert`, `insert-from-staging`, and `staging-optimized`. The exact behavior of these strategies can also vary between the available destinations. You can select a strategy with a setting in your `config.toml` file. If you do not select a strategy, dlt will default to `truncate-and-insert`. ```toml [destination] -# set the optimized replace strategy +# Set the optimized replace strategy replace_strategy = "staging-optimized" ``` ### The `truncate-and-insert` strategy -The `truncate-and-insert` replace strategy is the default and the fastest of all three strategies. 
If you load data with this setting, then the -destination tables will be truncated at the beginning of the load and the new data will be inserted consecutively but not within the same transaction. -The downside of this strategy is, that your tables will have no data for a while until the load is completed. You -may end up with new data in some tables and no data in other tables if the load fails during the run. Such incomplete load may be however detected by checking if the -[_dlt_loads table contains load id](destination-tables.md#load-packages-and-load-ids) from _dlt_load_id of the replaced tables. If you prefer to have no data downtime, please use one of the other strategies. +The `truncate-and-insert` replace strategy is the default and the fastest of all three strategies. If you load data with this setting, then the destination tables will be truncated at the beginning of the load, and the new data will be inserted consecutively but not within the same transaction. +The downside of this strategy is that your tables will have no data for a while until the load is completed. You may end up with new data in some tables and no data in other tables if the load fails during the run. Such an incomplete load may be detected by checking if the [_dlt_loads table contains a load id](destination-tables.md#load-packages-and-load-ids) from _dlt_load_id of the replaced tables. If you prefer to have no data downtime, please use one of the other strategies. ### The `insert-from-staging` strategy -The `insert-from-staging` is the slowest of all three strategies. It will load all new data into staging tables away from your final destination tables and will then truncate and insert the new data in one transaction. +The `insert-from-staging` strategy is the slowest of all three strategies. It will load all new data into staging tables away from your final destination tables and will then truncate and insert the new data in one transaction. 
It also maintains a consistent state between nested and root tables at all times. Use this strategy if you have the requirement for consistent destination datasets with zero downtime and the `optimized` strategy does not work for you. This strategy behaves the same way across all destinations. ### The `staging-optimized` strategy -The `staging-optimized` strategy has all the upsides of the `insert-from-staging` but implements certain optimizations for faster loading on some destinations. -This comes at the cost of destination tables being dropped and recreated in some cases, which will mean that any views or other constraints you have -placed on those tables will be dropped with the table. If you have a setup where you need to retain your destination tables, do not use the `staging-optimized` -strategy. If you do not care about tables being dropped but need the upsides of the `insert-from-staging` with some performance (and cost) saving -opportunities, you should use this strategy. The `staging-optimized` strategy behaves differently across destinations: +The `staging-optimized` strategy has all the upsides of the `insert-from-staging` but implements certain optimizations for faster loading on some destinations. This comes at the cost of destination tables being dropped and recreated in some cases, which means that any views or other constraints you have placed on those tables will be dropped with the table. If you have a setup where you need to retain your destination tables, do not use the `staging-optimized` strategy. If you do not care about tables being dropped but need the upsides of the `insert-from-staging` with some performance (and cost) saving opportunities, you should use this strategy. The `staging-optimized` strategy behaves differently across destinations: * Postgres: After loading the new data into the staging tables, the destination tables will be dropped and replaced by the staging tables. 
No data needs to be moved, so this strategy is almost as fast as `truncate-and-insert`. -* bigquery: After loading the new data into the staging tables, the destination tables will be dropped and - recreated with a [clone command](https://cloud.google.com/bigquery/docs/table-clones-create) from the staging tables. This is a low cost and fast way to create a second independent table from the data of another. Learn - more about [table cloning on bigquery](https://cloud.google.com/bigquery/docs/table-clones-intro). -* snowflake: After loading the new data into the staging tables, the destination tables will be dropped and - recreated with a [clone command](https://docs.snowflake.com/en/sql-reference/sql/create-clone) from the staging tables. This is a low cost and fast way to create a second independent table from the data of another. Learn - more about [table cloning on snowflake](https://docs.snowflake.com/en/user-guide/object-clone). +* BigQuery: After loading the new data into the staging tables, the destination tables will be dropped and recreated with a [clone command](https://cloud.google.com/bigquery/docs/table-clones-create) from the staging tables. This is a low-cost and fast way to create a second independent table from the data of another. Learn more about [table cloning on BigQuery](https://cloud.google.com/bigquery/docs/table-clones-intro). +* Snowflake: After loading the new data into the staging tables, the destination tables will be dropped and recreated with a [clone command](https://docs.snowflake.com/en/sql-reference/sql/create-clone) from the staging tables. This is a low-cost and fast way to create a second independent table from the data of another. Learn more about [table cloning on Snowflake](https://docs.snowflake.com/en/user-guide/object-clone). For all other [destinations](../dlt-ecosystem/destinations/index.md), please look at their respective documentation pages to see if and how the `staging-optimized` strategy is implemented. 
If it is not implemented, `dlt` will fall back to the `insert-from-staging` strategy. + diff --git a/docs/website/docs/general-usage/glossary.md b/docs/website/docs/general-usage/glossary.md index 5ae256b268..9cff97823e 100644 --- a/docs/website/docs/general-usage/glossary.md +++ b/docs/website/docs/general-usage/glossary.md @@ -8,13 +8,13 @@ keywords: [glossary, resource, source, pipeline] ## [Source](source) -Location that holds data with certain structure. Organized into one or more resources. +A location that holds data with a certain structure, organized into one or more resources. - If endpoints in an API are the resources, then the API is the source. -- If tabs in a spreadsheet are the resources, then the source is the spreadsheet. -- If tables in a database are the resources, then the source is the database. +- If tabs in a spreadsheet are the resources, then the spreadsheet is the source. +- If tables in a database are the resources, then the database is the source. -Within this documentation, **source** refers also to the software component (i.e. Python function) +Within this documentation, **source** also refers to the software component (i.e., a Python function) that **extracts** data from the source location using one or more resource components. ## [Resource](resource) @@ -26,38 +26,39 @@ origin. - If the source is a spreadsheet, then a resource is a tab in that spreadsheet. - If the source is a database, then a resource is a table in that database. -Within this documentation, **resource** refers also to the software component (i.e. Python function) -that **extracts** the data from source location. +Within this documentation, **resource** also refers to the software component (i.e., a Python function) +that **extracts** the data from the source location. ## [Destination](../dlt-ecosystem/destinations) -The data store where data from the source is loaded (e.g. Google BigQuery). 
+The data store where data from the source is loaded (e.g., Google BigQuery). ## [Pipeline](pipeline) Moves the data from the source to the destination, according to instructions provided in the schema -(i.e. extracting, normalizing, and loading the data). +(i.e., extracting, normalizing, and loading the data). ## [Verified source](../walkthroughs/add-a-verified-source) A Python module distributed with `dlt init` that allows creating pipelines that extract data from a -particular **Source**. Such module is intended to be published in order for others to use it to +particular **Source**. Such a module is intended to be published in order for others to use it to build pipelines. -A source must be published to become "verified": which means that it has tests, test data, -demonstration scripts, documentation and the dataset produces was reviewed by a data engineer. +A source must be published to become "verified," which means that it has tests, test data, +demonstration scripts, documentation, and the dataset produced was reviewed by a data engineer. ## [Schema](schema) -Describes the structure of normalized data (e.g. unpacked tables, column types, etc.) and provides -instructions on how the data should be processed and loaded (i.e. it tells `dlt` about the content +Describes the structure of normalized data (e.g., unpacked tables, column types, etc.) and provides +instructions on how the data should be processed and loaded (i.e., it tells `dlt` about the content of the data and how to load it into the destination). ## [Config](credentials/setup#secrets.toml-and-config.toml) -A set of values that are passed to the pipeline at run time (e.g. to change its behavior locally vs. +A set of values that are passed to the pipeline at runtime (e.g., to change its behavior locally vs. in production). ## [Credentials](credentials/complex_types) A subset of configuration whose elements are kept secret and never shared in plain text. 
+ diff --git a/docs/website/docs/general-usage/http/overview.md b/docs/website/docs/general-usage/http/overview.md index 7358e577f4..01f3c88026 100644 --- a/docs/website/docs/general-usage/http/overview.md +++ b/docs/website/docs/general-usage/http/overview.md @@ -14,7 +14,7 @@ Additionally, dlt provides tools to simplify working with APIs: ## Quick example -Here's a simple pipeline that reads issues from the [dlt GitHub repository](https://github.com/dlt-hub/dlt/issues). The API endpoint is https://api.github.com/repos/dlt-hub/dlt/issues. The result is "paginated", meaning that the API returns a limited number of issues per page. The `paginate()` method iterates over all pages and yields the results which are then processed by the pipeline. +Here's a simple pipeline that reads issues from the [dlt GitHub repository](https://github.com/dlt-hub/dlt/issues). The API endpoint is https://api.github.com/repos/dlt-hub/dlt/issues. The result is "paginated," meaning that the API returns a limited number of issues per page. The `paginate()` method iterates over all pages and yields the results which are then processed by the pipeline. ```py import dlt @@ -46,7 +46,7 @@ print(load_info) Here's what the code does: 1. We create a `RESTClient` instance with the base URL of the API: in this case, the GitHub API (https://api.github.com). -2. Issues endpoint returns a list of issues. Since there could be hundreds of issues, the API "paginates" the results: it returns a limited number of issues in each response along with a link to the next batch of issues (or "page"). The `paginate()` method iterates over all pages and yields the batches of issues. +2. The issues endpoint returns a list of issues. Since there could be hundreds of issues, the API "paginates" the results: it returns a limited number of issues in each response along with a link to the next batch of issues (or "page"). The `paginate()` method iterates over all pages and yields the batches of issues. 3. 
Here we specify the address of the endpoint we want to read from: `/repos/dlt-hub/dlt/issues`. 4. We pass the parameters to the actual API call to control the data we get back. In this case, we ask for 100 issues per page (`"per_page": 100`), sorted by the last update date (`"sort": "updated"`) in descending order (`"direction": "desc"`). 5. We yield the page from the resource function to the pipeline. The `page` is an instance of the [`PageData`](#pagedata) and contains the data from the current page of the API response and some metadata. @@ -87,5 +87,6 @@ print(load_info) In the example above: 1. We create a `RESTClient` instance with the base URL of the API: in this case, the [PokéAPI](https://pokeapi.co/). We also specify the paginator to use explicitly: `JSONLinkPaginator` with the `next_url_path` set to `"next"`. This tells the paginator to look for the next page URL in the `next` key of the JSON response. -2. In `data_selector` we specify the JSON path to extract the data from the response. This is used to extract the data from the response JSON. -3. By default the number of items per page is limited to 20. We override this by specifying the `limit` parameter in the API call. +2. In `data_selector`, we specify the JSON path to extract the data from the response. This is used to extract the data from the response JSON. +3. By default, the number of items per page is limited to 20. We override this by specifying the `limit` parameter in the API call. + diff --git a/docs/website/docs/general-usage/http/requests.md b/docs/website/docs/general-usage/http/requests.md index a6da3079af..cf7711cdd7 100644 --- a/docs/website/docs/general-usage/http/requests.md +++ b/docs/website/docs/general-usage/http/requests.md @@ -10,7 +10,7 @@ We recommend using this to make API calls in your sources as it makes your pipel The dlt requests client will additionally set the default user-agent header to `dlt/{DLT_VERSION_NAME}`. 
-For most use cases this is a drop in replacement for `requests`, so in places where you would normally do: +For most use cases, this is a drop-in replacement for `requests`, so in places where you would normally do: ```py import requests @@ -35,21 +35,21 @@ data = response.json() ## Retry rules -By default failing requests are retried up to 5 times with an exponentially increasing delay. That means the first retry will wait 1 second and the fifth retry will wait 16 seconds. +By default, failing requests are retried up to 5 times with an exponentially increasing delay. That means the first retry will wait 1 second, and the fifth retry will wait 16 seconds. -If all retry attempts fail the corresponding requests exception is raised. E.g. `requests.HTTPError` or `requests.ConnectionTimeout` +If all retry attempts fail, the corresponding requests exception is raised. E.g., `requests.HTTPError` or `requests.ConnectTimeout`. All standard HTTP server errors trigger a retry. This includes: * Error status codes: All status codes in the `500` range and `429` (too many requests). - Commonly servers include a `Retry-After` header with `429` and `503` responses. - When detected this value supersedes the standard retry delay. + Commonly, servers include a `Retry-After` header with `429` and `503` responses. + When detected, this value supersedes the standard retry delay. * Connection and timeout errors - When the remote server is unreachable, the connection is unexpectedly dropped or when the request takes longer than the configured `timeout`. + When the remote server is unreachable, the connection is unexpectedly dropped, or when the request takes longer than the configured `timeout`. ## Customizing retry settings @@ -63,7 +63,7 @@ request_timeout = 120 # Timeout in seconds request_max_retry_delay = 30 # Cap exponential delay to 30 seconds ``` -For more control you can create your own instance of `dlt.sources.requests.Client` and use that instead of the global client. 
+For more control, you can create your own instance of `dlt.sources.requests.Client` and use that instead of the global client. This lets you customize which status codes and exceptions to retry on: @@ -98,3 +98,4 @@ http_client = Client( retry_condition=retry_if_error_key ) ``` + diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index 40c83f8c5b..125604ab94 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -33,7 +33,7 @@ The `RESTClient` class is initialized with the following parameters: - `base_url`: The root URL of the API. All requests will be made relative to this URL. - `headers`: Default headers to include in every request. This can be used to set common headers like `User-Agent` or other custom headers. - `auth`: The authentication configuration. See the [Authentication](#authentication) section for more details. -- `paginator`: A paginator instance for handling paginated responses. See the [Paginators](#paginators) below. +- `paginator`: A paginator instance for handling paginated responses. See the [Paginators](#paginators) section below. - `data_selector`: A [JSONPath selector](https://github.com/h2non/jsonpath-ng?tab=readme-ov-file#jsonpath-syntax) for extracting data from the responses. This defines a way to extract the data from the response JSON. Only used when paginating. - `session`: An optional session for making requests. This should be a [Requests session](https://requests.readthedocs.io/en/latest/api/#requests.Session) instance that can be used to set up custom request behavior for the client. @@ -56,17 +56,15 @@ for page in client.paginate("/posts"): ``` :::tip -If `paginator` is not specified, the `paginate()` method will attempt to automatically detect the pagination mechanism used by the API. 
If the API uses a standard pagination mechanism like having a `next` link in the response's headers or JSON body, the `paginate()` method will handle this automatically. Otherwise, you can specify a paginator object explicitly or implement a custom paginator. +If a `paginator` is not specified, the `paginate()` method will attempt to automatically detect the pagination mechanism used by the API. If the API uses a standard pagination mechanism like having a `next` link in the response's headers or JSON body, the `paginate()` method will handle this automatically. Otherwise, you can specify a paginator object explicitly or implement a custom paginator. ::: ### Selecting data from the response -When paginating through API responses, the `RESTClient` tries to automatically extract the data from the response. Sometimes though you may need to explicitly -specify how to extract the data from the response JSON. +When paginating through API responses, the `RESTClient` tries to automatically extract the data from the response. Sometimes, however, you may need to explicitly specify how to extract the data from the response JSON. -Use `data_selector` parameter of the `RESTClient` class or the `paginate()` method to tell the client how to extract the data. -`data_selector` is a [JSONPath](https://github.com/h2non/jsonpath-ng?tab=readme-ov-file#jsonpath-syntax) expression that points to the key in -the JSON that contains the data to be extracted. +Use the `data_selector` parameter of the `RESTClient` class or the `paginate()` method to tell the client how to extract the data. +`data_selector` is a [JSONPath](https://github.com/h2non/jsonpath-ng?tab=readme-ov-file#jsonpath-syntax) expression that points to the key in the JSON that contains the data to be extracted. For example, if the API response looks like this: @@ -100,7 +98,7 @@ The `data_selector` needs to be set to `"results.posts"`. 
Read more about [JSONP ### PageData -Each `PageData` instance contains the data for a single page, along with context such as the original request and response objects, allowing for detailed inspection.. The `PageData` is a list-like object that contains the following attributes: +Each `PageData` instance contains the data for a single page, along with context such as the original request and response objects, allowing for detailed inspection. The `PageData` is a list-like object that contains the following attributes: - `request`: The original request object. - `response`: The response object. @@ -161,17 +159,15 @@ def get_data(): yield page ``` - #### HeaderLinkPaginator -This paginator handles pagination based on a link to the next page in the response headers (e.g., the `Link` header, as used by GitHub API). +This paginator handles pagination based on a link to the next page in the response headers (e.g., the `Link` header, as used by the GitHub API). **Parameters:** - `links_next_key`: The relation type (rel) to identify the next page link within the Link header. Defaults to "next". -Note: normally, you don't need to specify this paginator explicitly, as it is used automatically when the API returns a `Link` header. On rare occasions, you may -need to specify the paginator when the API uses a different relation type. +Note: Normally, you don't need to specify this paginator explicitly, as it is used automatically when the API returns a `Link` header. On rare occasions, you may need to specify the paginator when the API uses a different relation type. #### OffsetPaginator @@ -184,13 +180,13 @@ need to specify the paginator when the API uses a different relation type. - `offset_param`: The name of the query parameter used to specify the offset. Defaults to `"offset"`. - `limit_param`: The name of the query parameter used to specify the limit. Defaults to `"limit"`. - `total_path`: A JSONPath expression for the total number of items. 
If not provided, pagination is controlled by `maximum_offset` and `stop_after_empty_page`. -- `maximum_offset`: Optional maximum offset value. Limits pagination even without total count. +- `maximum_offset`: Optional maximum offset value. Limits pagination even without a total count. - `stop_after_empty_page`: Whether pagination should stop when a page contains no result items. Defaults to `True`. **Example:** Assuming an API endpoint `https://api.example.com/items` supports pagination with `offset` and `limit` parameters. -E.g. `https://api.example.com/items?offset=0&limit=100`, `https://api.example.com/items?offset=100&limit=100`, etc. And includes the total count in its responses, e.g.: +E.g., `https://api.example.com/items?offset=0&limit=100`, `https://api.example.com/items?offset=100&limit=100`, etc., and includes the total count in its responses, e.g.: ```json { @@ -224,7 +220,7 @@ client = RESTClient( ) ``` -Additionally, you can limit pagination with `maximum_offset`, for example during development. If `maximum_offset` is reached before the first empty page then pagination stops: +Additionally, you can limit pagination with `maximum_offset`, for example during development. If `maximum_offset` is reached before the first empty page, then pagination stops: ```py client = RESTClient( @@ -237,8 +233,7 @@ client = RESTClient( ) ``` -You can disable automatic stoppage of pagination by setting `stop_after_stop_after_empty_page = False`. In this case, you must provide either `total_path` or `maximum_offset` to guarantee that the paginator terminates. - +You can disable automatic stoppage of pagination by setting `stop_after_empty_page = False`. In this case, you must provide either `total_path` or `maximum_offset` to guarantee that the paginator terminates. #### PageNumberPaginator @@ -287,20 +282,19 @@ client = RESTClient( ) ``` -Additionally, you can limit pagination with `maximum_offset`, for example during development. 
If `maximum_page` is reached before the first empty page then pagination stops: +Additionally, you can limit pagination with `maximum_page`, for example during development. If `maximum_page` is reached before the first empty page, then pagination stops: ```py client = RESTClient( base_url="https://api.example.com", - paginator=OffsetPaginator( - maximum_page=2, # limits response to 2 pages - total_path=None, + paginator=PageNumberPaginator( + maximum_page=2, # Limits response to 2 pages + total_path=None ) ) ``` -You can disable automatic stoppage of pagination by setting `stop_after_stop_after_empty_page = False`. In this case, you must provide either `total_path` or `maximum_page` to guarantee that the paginator terminates. - +You can disable automatic stoppage of pagination by setting `stop_after_empty_page = False`. In this case, you must provide either `total_path` or `maximum_page` to guarantee that the paginator terminates. #### JSONResponseCursorPaginator @@ -335,17 +329,17 @@ client = RESTClient( ### Implementing a custom paginator -When working with APIs that use non-standard pagination schemes, or when you need more control over the pagination process, you can implement a custom paginator by subclassing the `BasePaginator` class and implementing the methods `init_request`, `update_state` and `update_request`. +When working with APIs that use non-standard pagination schemes, or when you need more control over the pagination process, you can implement a custom paginator by subclassing the `BasePaginator` class and implementing the methods `init_request`, `update_state`, and `update_request`. - `init_request(request: Request) -> None`: This method is called before making the first API call in the `RESTClient.paginate` method. You can use this method to set up the initial request query parameters, headers, etc. For example, you can set the initial page number or cursor value. 
- `update_state(response: Response, data: Optional[List[Any]]) -> None`: This method updates the paginator's state based on the response of the API call. Typically, you extract pagination details (like the next page reference) from the response and store them in the paginator instance. -- `update_request(request: Request) -> None`: Before making the next API call in `RESTClient.paginate` method, `update_request` is used to modify the request with the necessary parameters to fetch the next page (based on the current state of the paginator). For example, you can add query parameters to the request, or modify the URL. +- `update_request(request: Request) -> None`: Before making the next API call in the `RESTClient.paginate` method, `update_request` is used to modify the request with the necessary parameters to fetch the next page (based on the current state of the paginator). For example, you can add query parameters to the request or modify the URL. -#### Example 1: creating a query parameter paginator +#### Example 1: Creating a query parameter paginator -Suppose an API uses query parameters for pagination, incrementing an page parameter for each subsequent page, without providing direct links to next pages in its responses. E.g. `https://api.example.com/posts?page=1`, `https://api.example.com/posts?page=2`, etc. Here's how you could implement a paginator for this scheme: +Suppose an API uses query parameters for pagination, incrementing a page parameter for each subsequent page, without providing direct links to the next pages in its responses. E.g., `https://api.example.com/posts?page=1`, `https://api.example.com/posts?page=2`, etc. Here's how you could implement a paginator for this scheme: ```py from typing import Any, List, Optional @@ -359,7 +353,7 @@ class QueryParamPaginator(BasePaginator): self.page = initial_page def init_request(self, request: Request) -> None: - # This will set the initial page number (e.g. 
page=1) + # This will set the initial page number (e.g., page=1) self.update_request(request) def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: @@ -395,9 +389,9 @@ def get_data(): [`PageNumberPaginator`](#pagenumberpaginator) that ships with dlt does the same thing, but with more flexibility and error handling. This example is meant to demonstrate how to implement a custom paginator. For most use cases, you should use the [built-in paginators](#paginators). ::: -#### Example 2: creating a paginator for POST requests +#### Example 2: Creating a paginator for POST requests -Some APIs use POST requests for pagination, where the next page is fetched by sending a POST request with a cursor or other parameters in the request body. This is frequently used in "search" API endpoints or other endpoints with big payloads. Here's how you could implement a paginator for a case like this: +Some APIs use POST requests for pagination, where the next page is fetched by sending a POST request with a cursor or other parameters in the request body. This is frequently used in "search" API endpoints or other endpoints with large payloads. Here's how you could implement a paginator for a case like this: ```py from typing import Any, List, Optional @@ -447,12 +441,12 @@ The available authentication methods are defined in the `dlt.sources.helpers.res - [OAuth2ClientCredentials](#oauth20-authorization) For specific use cases, you can [implement custom authentication](#implementing-custom-authentication) by subclassing the `AuthBase` class from the Requests library. -For specific flavors of OAuth 2.0 you can [implement custom OAuth 2.0](#oauth2-authorization) +For specific flavors of OAuth 2.0, you can [implement custom OAuth 2.0](#oauth2-authorization) by subclassing `OAuth2ClientCredentials`. 
### Bearer token authentication -Bearer Token Authentication (`BearerTokenAuth`) is an auth method where the client sends a token in the request's Authorization header (e.g. `Authorization: Bearer <token>`). The server validates this token and grants access if the token is valid. +Bearer Token Authentication (`BearerTokenAuth`) is an auth method where the client sends a token in the request's Authorization header (e.g., `Authorization: Bearer <token>`). The server validates this token and grants access if the token is valid. **Parameters:** @@ -475,7 +469,7 @@ for page in client.paginate("/protected/resource"): ### API key authentication -API Key Authentication (`ApiKeyAuth`) is an auth method where the client sends an API key in a custom header (e.g. `X-API-Key: <key>`, or as a query parameter). +API Key Authentication (`ApiKeyAuth`) is an auth method where the client sends an API key in a custom header (e.g., `X-API-Key: <key>`) or as a query parameter. **Parameters:** @@ -521,15 +515,15 @@ response = client.get("/protected/resource") ### OAuth 2.0 authorization OAuth 2.0 is a common protocol for authorization. We have implemented two-legged authorization employed for server-to-server authorization because the end user (resource owner) does not need to grant approval. -The REST client acts as the OAuth client which obtains a temporary access token from the authorization server. This access token is then sent to the resource server to access protected content. If the access token is expired, the OAuth client automatically refreshes it. +The REST client acts as the OAuth client, which obtains a temporary access token from the authorization server. This access token is then sent to the resource server to access protected content. If the access token is expired, the OAuth client automatically refreshes it. 
-Unfortunately, most OAuth 2.0 implementations vary and thus you might need to subclass `OAuth2ClientCredentials` and implement `build_access_token_request()` to suite the requirements of the specific authorization server you want to interact with. +Unfortunately, most OAuth 2.0 implementations vary, and thus you might need to subclass `OAuth2ClientCredentials` and implement `build_access_token_request()` to suit the requirements of the specific authorization server you want to interact with. **Parameters:** -- `access_token_url`: The url to obtain the temporary access token. +- `access_token_url`: The URL to obtain the temporary access token. - `client_id`: Client credential to obtain authorization. Usually issued via a developer portal. - `client_secret`: Client credential to obtain authorization. Usually issued via a developer portal. -- `access_token_request_data`: A dictionary with data required by the autorization server apart from the `client_id`, `client_secret`, and `"grant_type": "client_credentials"`. Defaults to `None`. +- `access_token_request_data`: A dictionary with data required by the authorization server apart from the `client_id`, `client_secret`, and `"grant_type": "client_credentials"`. Defaults to `None`. - `default_token_expiration`: The time in seconds after which the temporary access token expires. Defaults to 3600. **Example:** @@ -540,7 +534,7 @@ from dlt.sources.helpers.rest_client import RESTClient from dlt.sources.helpers.rest_client.auth import OAuth2ClientCredentials class OAuth2ClientCredentialsHTTPBasic(OAuth2ClientCredentials): - """Used e.g. by Zoom Zoom Video Communications, Inc.""" + """Used e.g. 
by Zoom Video Communications, Inc.""" def build_access_token_request(self) -> Dict[str, Any]: authentication: str = b64encode( f"{self.client_id}:{self.client_secret}".encode() @@ -597,7 +591,7 @@ client = RESTClient( ## Advanced usage -`RESTClient.paginate()` allows to specify a [custom hook function](https://requests.readthedocs.io/en/latest/user/advanced/#event-hooks) that can be used to modify the response objects. For example, to handle specific HTTP status codes gracefully: +`RESTClient.paginate()` allows you to specify a [custom hook function](https://requests.readthedocs.io/en/latest/user/advanced/#event-hooks) that can be used to modify the response objects. For example, to handle specific HTTP status codes gracefully: ```py def custom_response_handler(response): @@ -608,7 +602,7 @@ def custom_response_handler(response): client.paginate("/posts", hooks={"response": [custom_response_handler]}) ``` -The handler function may raise `IgnoreResponseException` to exit the pagination loop early. This is useful for the enpoints that return a 404 status code when there are no items to paginate. +The handler function may raise `IgnoreResponseException` to exit the pagination loop early. This is useful for endpoints that return a 404 status code when there are no items to paginate. ## Shortcut for paginating API responses @@ -621,7 +615,6 @@ for page in paginate("https://api.example.com/posts"): print(page) ``` - ## Retry You can customize how the RESTClient retries failed requests by editing your `config.toml`. @@ -641,8 +634,7 @@ request_max_retry_delay = 30 # Cap exponential delay to 30 seconds ### `RESTClient.get()` and `RESTClient.post()` methods -These methods work similarly to the [get()](https://docs.python-requests.org/en/latest/api/#requests.get) and [post()](https://docs.python-requests.org/en/latest/api/#requests.post) functions -from the Requests library. 
They return a [Response](https://docs.python-requests.org/en/latest/api/#requests.Response) object that contains the response data. +These methods work similarly to the [get()](https://docs.python-requests.org/en/latest/api/#requests.get) and [post()](https://docs.python-requests.org/en/latest/api/#requests.post) functions from the Requests library. They return a [Response](https://docs.python-requests.org/en/latest/api/#requests.Response) object that contains the response data. You can inspect the `Response` object to get the `response.status_code`, `response.headers`, and `response.content`. For example: ```py @@ -659,7 +651,7 @@ print(response.content) ### `RESTClient.paginate()` -Debugging `paginate()` is trickier because it's a generator function that yields [`PageData`](#pagedata) objects. Here's several ways to debug the `paginate()` method: +Debugging `paginate()` is trickier because it's a generator function that yields [`PageData`](#pagedata) objects. Here are several ways to debug the `paginate()` method: 1. Enable [logging](../../running-in-production/running.md#set-the-log-level-and-format) to see detailed information about the HTTP requests: @@ -702,3 +694,4 @@ for page in client.paginate( ): print(page) ``` + diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md index 819ac2fb0c..88f009e3c2 100644 --- a/docs/website/docs/general-usage/incremental-loading.md +++ b/docs/website/docs/general-usage/incremental-loading.md @@ -6,24 +6,19 @@ keywords: [incremental loading, loading methods, append, merge] # Incremental loading -Incremental loading is the act of loading only new or changed data and not old records that we -already loaded. It enables low-latency and low cost data transfer. +Incremental loading is the act of loading only new or changed data and not old records that we have already loaded. It enables low-latency and low-cost data transfer. 
-The challenge of incremental pipelines is that if we do not keep track of the state of the load -(i.e. which increments were loaded and which are to be loaded). Read more about state -[here](state.md). +The challenge of incremental pipelines is that we must keep track of the state of the load (i.e., which increments were loaded and which are still to be loaded). Read more about state [here](state.md). ## Choosing a write disposition ### The 3 write dispositions: -- **Full load**: replaces the destination dataset with whatever the source produced on this run. To -achieve this, use `write_disposition='replace'` in your resources. Learn more in the [full loading docs](./full-loading.md) +- **Full load**: replaces the destination dataset with whatever the source produced on this run. To achieve this, use `write_disposition='replace'` in your resources. Learn more in the [full loading docs](./full-loading.md). - **Append**: appends the new data to the destination. Use `write_disposition='append'`. -- **Merge**: Merges new data to the destination using `merge_key` and/or deduplicates/upserts new data -using `primary_key`. Use `write_disposition='merge'`. +- **Merge**: Merges new data into the destination using `merge_key` and/or deduplicates/upserts new data using `primary_key`. Use `write_disposition='merge'`. ### Two simple questions determine the write disposition you use @@ -33,19 +28,15 @@ using `primary_key`. Use `write_disposition='merge'`. -The "write disposition" you choose depends on the data set and how you can extract it. +The "write disposition" you choose depends on the dataset and how you can extract it. -To find the "write disposition" you should use, the first question you should ask yourself is "Is my -data stateful or stateless"? Stateful data has a state that is subject to change - for example a -user's profile Stateless data cannot change - for example, a recorded event, such as a page view. 
+To find the "write disposition" you should use, the first question you should ask yourself is "Is my data stateful or stateless?" Stateful data has a state that is subject to change - for example, a user's profile. Stateless data cannot change - for example, a recorded event, such as a page view. Because stateless data does not need to be updated, we can just append it. For stateful data, comes a second question - Can I extract it incrementally from the source? If yes, you should use [slowly changing dimensions (Type-2)](#scd2-strategy), which allow you to maintain historical records of data changes over time. -If not, then we need to replace the entire data set. If however we can request the data incrementally such -as "all users added or modified since yesterday" then we can simply apply changes to our existing -dataset with the merge write disposition. +If not, then we need to replace the entire dataset. However, if we can request the data incrementally, such as "all users added or modified since yesterday," then we can simply apply changes to our existing dataset with the merge write disposition. ## Merge incremental loading @@ -59,19 +50,12 @@ The `merge` write disposition can be used with three different strategies: The default `delete-insert` strategy is used in two scenarios: -1. You want to keep only one instance of certain record i.e. you receive updates of the `user` state - from an API and want to keep just one record per `user_id`. -1. You receive data in daily batches, and you want to make sure that you always keep just a single - instance of a record for each batch even in case you load an old batch or load the current batch - several times a day (i.e. to receive "live" updates). +1. You want to keep only one instance of a certain record, e.g., you receive updates of the `user` state from an API and want to keep just one record per `user_id`. +2. 
You receive data in daily batches, and you want to make sure that you always keep just a single instance of a record for each batch, even in case you load an old batch or load the current batch several times a day (e.g., to receive "live" updates). -The `delete-insert` strategy loads data to a `staging` dataset, deduplicates the staging data if a -`primary_key` is provided, deletes the data from the destination using `merge_key` and `primary_key`, -and then inserts the new records. All of this happens in a single atomic transaction for a root and all -nested tables. +The `delete-insert` strategy loads data to a `staging` dataset, deduplicates the staging data if a `primary_key` is provided, deletes the data from the destination using `merge_key` and `primary_key`, and then inserts the new records. All of this happens in a single atomic transaction for a root and all nested tables. -Example below loads all the GitHub events and updates them in the destination using "id" as primary -key, making sure that only a single copy of event is present in `github_repo_events` table: +The example below loads all the GitHub events and updates them in the destination using "id" as the primary key, making sure that only a single copy of the event is present in the `github_repo_events` table: ```py @dlt.resource(primary_key="id", write_disposition="merge") @@ -99,7 +83,7 @@ def resource(): ... ``` -Example below merges on a column `batch_day` that holds the day for which given record is valid. +The example below merges on a column `batch_day` that holds the day for which the given record is valid. Merge keys also can be compound: ```py @@ -108,9 +92,7 @@ def get_daily_batch(day): yield _get_batch_from_bucket(day) ``` -As with any other write disposition you can use it to load data ad hoc. Below we load issues with -top reactions for `duckdb` repo. The lists have, obviously, many overlapping issues, but we want to -keep just one instance of each. 
+As with any other write disposition, you can use it to load data ad hoc. Below we load issues with top reactions for the `duckdb` repo. The lists have, obviously, many overlapping issues, but we want to keep just one instance of each. ```py p = dlt.pipeline(destination="bigquery", dataset_name="github") @@ -124,14 +106,12 @@ for reaction in reactions: p.run(issues, write_disposition="merge", primary_key="id", table_name="issues") ``` -Example below dispatches GitHub events to several tables by event type, keeps one copy of each event -by "id" and skips loading of past records using "last value" incremental. As you can see, all of -this we can just declare in our resource. +Example below dispatches GitHub events to several tables by event type, keeps one copy of each event by "id" and skips loading of past records using "last value" incremental. As you can see, all of this we can just declare in our resource. ```py @dlt.resource(primary_key="id", write_disposition="merge", table_name=lambda i: i['type']) def github_repo_events(last_created_at = dlt.sources.incremental("created_at", "1970-01-01T00:00:00Z")): - """A resource taking a stream of github events and dispatching them to tables named by event type. Deduplicates be 'id'. Loads incrementally by 'created_at' """ + """A resource taking a stream of github events and dispatching them to tables named by event type. Deduplicates by 'id'. Loads incrementally by 'created_at' """ yield from _get_rest_pages("events") ``` @@ -140,11 +120,10 @@ If you use the `merge` write disposition, but do not specify merge or primary ke The appended data will be inserted from a staging table in one transaction for most destinations in this case. ::: - #### Delete records The `hard_delete` column hint can be used to delete records from the destination dataset. 
The behavior of the delete mechanism depends on the data type of the column marked with the hint: -1) `bool` type: only `True` leads to a delete—`None` and `False` values are disregarded -2) other types: each `not None` value leads to a delete +1) `bool` type: only `True` leads to a delete—`None` and `False` values are disregarded. +2) Other types: each `not None` value leads to a delete. Each record in the destination table with the same `primary_key` or `merge_key` as a record in the source dataset that's marked as a delete will be deleted. @@ -158,17 +137,17 @@ Deletes are propagated to any nested table that might exist. For each record tha columns={"deleted_flag": {"hard_delete": True}} ) def resource(): - # this will insert a record (assuming a record with id = 1 does not yet exist) + # This will insert a record (assuming a record with id = 1 does not yet exist). yield {"id": 1, "val": "foo", "deleted_flag": False} - # this will update the record + # This will update the record. yield {"id": 1, "val": "bar", "deleted_flag": None} - # this will delete the record + # This will delete the record. yield {"id": 1, "val": "foo", "deleted_flag": True} - # similarly, this would have also deleted the record - # only the key and the column marked with the "hard_delete" hint suffice to delete records + # Similarly, this would have also deleted the record. + # Only the key and the column marked with the "hard_delete" hint suffice to delete records. yield {"id": 1, "deleted_flag": True} ... ``` @@ -180,13 +159,13 @@ def resource(): write_disposition="merge", columns={"deleted_at_ts": {"hard_delete": True}}) def resource(): - # this will insert two records + # This will insert two records. yield [ {"id": 1, "val": "foo", "deleted_at_ts": None}, {"id": 1, "val": "bar", "deleted_at_ts": None} ] - # this will delete two records + # This will delete two records. yield {"id": 1, "val": "foo", "deleted_at_ts": "2024-02-22T12:34:56Z"} ... 
``` @@ -198,14 +177,14 @@ def resource(): write_disposition="merge", columns={"deleted_flag": {"hard_delete": True}, "lsn": {"dedup_sort": "desc"}}) def resource(): - # this will insert one record (the one with lsn = 3) + # This will insert one record (the one with lsn = 3). yield [ {"id": 1, "val": "foo", "lsn": 1, "deleted_flag": None}, {"id": 1, "val": "baz", "lsn": 3, "deleted_flag": None}, {"id": 1, "val": "bar", "lsn": 2, "deleted_flag": True} ] - # this will insert nothing, because the "latest" record is a delete + # This will insert nothing, because the "latest" record is a delete. yield [ {"id": 2, "val": "foo", "lsn": 1, "deleted_flag": False}, {"id": 2, "lsn": 2, "deleted_flag": True} @@ -219,11 +198,7 @@ Indexing is important for doing lookups by column value, especially for merge wr #### Forcing root key propagation -Merge write disposition requires that the `_dlt_id` (`row_key`) of root table is propagated to nested -tables. This concept is similar to foreign key but it always references the root (top level) table, skipping any intermediate parents -We call it `root key`. Root key is automatically propagated for all tables that have `merge` write disposition -set. We do not enable it everywhere because it takes storage space. Nevertheless, is some cases you -may want to permanently enable root key propagation. +Merge write disposition requires that the `_dlt_id` (`row_key`) of the root table be propagated to nested tables. This concept is similar to a foreign key but always references the root (top level) table, skipping any intermediate parents. We call it `root key`. The root key is automatically propagated for all tables that have the `merge` write disposition set. We do not enable it everywhere because it takes up storage space. Nevertheless, in some cases, you may want to permanently enable root key propagation. 
```py pipeline = dlt.pipeline( @@ -245,24 +220,22 @@ fb_ads.ads.bind(states=("PAUSED", )) info = pipeline.run(fb_ads.with_resources("ads"), write_disposition="merge") ``` -In example above we enforce the root key propagation with `fb_ads.root_key = True`. This ensures -that correct data is propagated on initial `replace` load so the future `merge` load can be -executed. You can achieve the same in the decorator `@dlt.source(root_key=True)`. +In the example above, we enforce the root key propagation with `fb_ads.root_key = True`. This ensures that the correct data is propagated on the initial `replace` load so the future `merge` load can be executed. You can achieve the same in the decorator `@dlt.source(root_key=True)`. ### `scd2` strategy -`dlt` can create [Slowly Changing Dimension Type 2](https://en.wikipedia.org/wiki/Slowly_changing_dimension#Type_2:_add_new_row) (SCD2) destination tables for dimension tables that change in the source. The resource is expected to provide a full extract of the source table each run. A row hash is stored in `_dlt_id` and used as surrogate key to identify source records that have been inserted, updated, or deleted. A `NULL` value is used by default to indicate an active record, but it's possible to use a configurable high timestamp (e.g. 9999-12-31 00:00:00.000000) instead. +`dlt` can create [Slowly Changing Dimension Type 2](https://en.wikipedia.org/wiki/Slowly_changing_dimension#Type_2:_add_new_row) (SCD2) destination tables for dimension tables that change in the source. The resource is expected to provide a full extract of the source table each run. A row hash is stored in `_dlt_id` and used as a surrogate key to identify source records that have been inserted, updated, or deleted. A `NULL` value is used by default to indicate an active record, but it's possible to use a configurable high timestamp (e.g., 9999-12-31 00:00:00.000000) instead. 
:::note -The `unique` hint for `_dlt_id` in the root table is set to `false` when using `scd2`. This differs from [default behavior](./destination-tables.md#child-and-parent-tables). The reason is that the surrogate key stored in `_dlt_id` contains duplicates after an _insert-delete-reinsert_ pattern: -1. record with surrogate key X is inserted in a load at `t1` -2. record with surrogate key X is deleted in a later load at `t2` -3. record with surrogate key X is reinserted in an even later load at `t3` +The `unique` hint for `_dlt_id` in the root table is set to `false` when using `scd2`. This differs from [default behavior](./destination-tables.md#child-and-parent-tables). The reason is that the surrogate key stored in `_dlt_id` contains duplicates after an _insert-delete-reinsert_ pattern: +1. A record with surrogate key X is inserted in a load at `t1`. +2. The record with surrogate key X is deleted in a later load at `t2`. +3. The record with surrogate key X is reinserted in an even later load at `t3`. -After this pattern, the `scd2` table in the destination has two records for surrogate key X: one for validity window `[t1, t2]`, and one for `[t3, NULL]`. A duplicate value exists in `_dlt_id` because both records have the same surrogate key. +After this pattern, the `scd2` table in the destination has two records for surrogate key X: one for the validity window `[t1, t2]`, and one for `[t3, NULL]`. A duplicate value exists in `_dlt_id` because both records have the same surrogate key. Note that: -- the composite key `(_dlt_id, _dlt_valid_from)` is unique -- `_dlt_id` remains unique for nested tables—`scd2` does not affect this +- The composite key `(_dlt_id, _dlt_valid_from)` is unique. +- `_dlt_id` remains unique for nested tables—`scd2` does not affect this. ::: #### Example: `scd2` merge strategy @@ -372,8 +345,8 @@ def dim_customer(): ... 
``` -#### Example: use your own row hash -By default, `dlt` generates a row hash based on all columns provided by the resource and stores it in `_dlt_id`. You can use your own hash instead by specifying `row_version_column_name` in the `write_disposition` dictionary. You might already have a column present in your resource that can naturally serve as row hash, in which case it's more efficient to use those pre-existing hash values than to generate new artificial ones. This option also allows you to use hashes based on a subset of columns, in case you want to ignore changes in some of the columns. When using your own hash, values for `_dlt_id` are randomly generated. +#### Example: Use your own row hash +By default, `dlt` generates a row hash based on all columns provided by the resource and stores it in `_dlt_id`. You can use your own hash instead by specifying `row_version_column_name` in the `write_disposition` dictionary. You might already have a column present in your resource that can naturally serve as a row hash, in which case it's more efficient to use those pre-existing hash values than to generate new artificial ones. This option also allows you to use hashes based on a subset of columns, in case you want to ignore changes in some of the columns. When using your own hash, values for `_dlt_id` are randomly generated. ```py @dlt.resource( write_disposition={ @@ -387,9 +360,9 @@ def dim_customer(): ... ``` -#### 🧪 Use scd2 with Arrow Tables and Panda frames -`dlt` will not add **row hash** column to the tabular data automatically (we are working on it). -You need to do that yourself by adding a transform function to `scd2` resource that computes row hashes (using pandas.util, should be fairly fast). +#### 🧪 Use scd2 with Arrow tables and Pandas frames +`dlt` will not add a **row hash** column to the tabular data automatically (we are working on it).
+You need to do that yourself by adding a transform function to the `scd2` resource that computes row hashes (using pandas.util, should be fairly fast). ```py import dlt from dlt.sources.helpers.transform import add_row_hash_to_table @@ -404,10 +377,10 @@ scd2_r = dlt.resource( }, ).add_map(add_row_hash_to_table("row_hash")) ``` -`add_row_hash_to_table` is the name of the transform function that will compute and create `row_hash` column that is declared as holding the hash by `row_version_column_name`. +`add_row_hash_to_table` is the name of the transform function that will compute and create the `row_hash` column that is declared as holding the hash by `row_version_column_name`. :::tip -You can modify existing resources that yield data in tabular form by calling `apply_hints` and passing `scd2` config in `write_disposition` and then by +You can modify existing resources that yield data in tabular form by calling `apply_hints` and passing the `scd2` config in `write_disposition` and then by adding the transform with `add_map`. ::: @@ -416,9 +389,9 @@ Nested tables, if any, do not contain validity columns. Validity columns are onl #### Limitations -* You cannot use columns like `updated_at` or integer `version` of a record that are unique within a `primary_key` (even if it is defined). Hash column -must be unique for a root table. We are working to allow `updated_at` style tracking -* We do not detect changes in nested tables (except new records) if row hash of the corresponding parent row does not change. Use `updated_at` or similar +* You cannot use columns like `updated_at` or integer `version` of a record that are unique within a `primary_key` (even if it is defined). The hash column +must be unique for a root table. We are working to allow `updated_at` style tracking. +* We do not detect changes in nested tables (except new records) if the row hash of the corresponding parent row does not change. 
Use `updated_at` or a similar column in the root table to stamp changes in nested data. * `merge_key(s)` are (for now) ignored. @@ -432,12 +405,12 @@ The `upsert` merge strategy is currently supported for these destinations: - `mssql` - `postgres` - `snowflake` -- 🧪 `filesytem` with `delta` table format (see limitations [here](../dlt-ecosystem/destinations/filesystem.md#known-limitations)) +- `filesystem` with `delta` table format (see limitations [here](../dlt-ecosystem/destinations/filesystem.md#known-limitations)) ::: The `upsert` merge strategy does primary-key based *upserts*: -- *update* record if key exists in target table -- *insert* record if key does not exist in target table +- *update* a record if the key exists in the target table +- *insert* a record if the key does not exist in the target table You can [delete records](#delete-records) with the `hard_delete` hint. @@ -462,18 +435,14 @@ def my_upsert_resource(): ## Incremental loading with a cursor field -In most of the REST APIs (and other data sources i.e. database tables) you can request new or updated -data by passing a timestamp or id of the "last" record to a query. The API/database returns just the -new/updated records from which you take maximum/minimum timestamp/id for the next load. +In most REST APIs (and other data sources, i.e., database tables), you can request new or updated data by passing a timestamp or ID of the "last" record to a query. The API/database returns just the new/updated records from which you take the maximum/minimum timestamp/ID for the next load. -To do incremental loading this way, we need to +To do incremental loading this way, we need to: -- figure which field is used to track changes (the so called **cursor field**) (e.g. “inserted_at”, "updated_at”, etc.); -- how to past the "last" (maximum/minimum) value of cursor field to an API to get just new / modified data (how we do this depends on the source API). 
+- Figure out which field is used to track changes (the so-called **cursor field**) (e.g., "inserted_at", "updated_at", etc.); +- Determine how to pass the "last" (maximum/minimum) value of the cursor field to an API to get just new or modified data (how we do this depends on the source API). -Once you've figured that out, `dlt` takes care of finding maximum/minimum cursor field values, removing -duplicates and managing the state with last values of cursor. Take a look at GitHub example below, where we -request recently created issues. +Once you've figured that out, `dlt` takes care of finding maximum/minimum cursor field values, removing duplicates, and managing the state with the last values of the cursor. Take a look at the GitHub example below, where we request recently created issues. ```py @dlt.resource(primary_key="id") @@ -482,38 +451,28 @@ def repo_issues( repository, updated_at = dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") ): - # get issues since "updated_at" stored in state on previous run (or initial_value on first run) + # Get issues since "updated_at" stored in state on previous run (or initial_value on first run) for page in _get_issues_page(access_token, repository, since=updated_at.start_value): yield page - # last_value is updated after every page + # last_value is updated after every page print(updated_at.last_value) ``` -Here we add `updated_at` argument that will receive incremental state, initialized to -`1970-01-01T00:00:00Z`. It is configured to track `updated_at` field in issues yielded by -`repo_issues` resource. It will store the newest `updated_at` value in `dlt` -[state](state.md) and make it available in `updated_at.start_value` on next pipeline -run.
This value is inserted in `_get_issues_page` function into request query param **since** to [Github API](https://docs.github.com/en/rest/issues/issues?#list-repository-issues) - -In essence, `dlt.sources.incremental` instance above -* **updated_at.initial_value** which is always equal to "1970-01-01T00:00:00Z" passed in constructor -* **updated_at.start_value** a maximum `updated_at` value from the previous run or the **initial_value** on first run -* **updated_at.last_value** a "real time" `updated_at` value updated with each yielded item or page. before first yield it equals **start_value** -* **updated_at.end_value** (here not used) [marking end of backfill range](#using-end_value-for-backfill) +Here we add an `updated_at` argument that will receive incremental state, initialized to `1970-01-01T00:00:00Z`. It is configured to track the `updated_at` field in issues yielded by the `repo_issues` resource. It will store the newest `updated_at` value in `dlt` [state](state.md) and make it available in `updated_at.start_value` on the next pipeline run. This value is inserted in the `_get_issues_page` function into the request query param **since** to the [GitHub API](https://docs.github.com/en/rest/issues/issues?#list-repository-issues). -When paginating you probably need **start_value** which does not change during the execution of the resource, however -most paginators will return a **next page** link which you should use. +In essence, the `dlt.sources.incremental` instance above: +* **updated_at.initial_value** which is always equal to "1970-01-01T00:00:00Z" passed in the constructor +* **updated_at.start_value** a maximum `updated_at` value from the previous run or the **initial_value** on the first run +* **updated_at.last_value** a "real-time" `updated_at` value updated with each yielded item or page. 
Before the first yield, it equals **start_value** +* **updated_at.end_value** (here not used) [marking the end of the backfill range](#using-end_value-for-backfill) -Behind the scenes, `dlt` will deduplicate the results ie. in case the last issue is returned again -(`updated_at` filter is inclusive) and skip already loaded ones. +When paginating, you probably need the **start_value** which does not change during the execution of the resource, however, most paginators will return a **next page** link which you should use. +Behind the scenes, dlt will deduplicate the results, i.e., in case the last issue is returned again (`updated_at` filter is inclusive) and skip already loaded ones. -In the example below we -incrementally load the GitHub events, where API does not let us filter for the newest events - it -always returns all of them. Nevertheless, `dlt` will load only the new items, filtering out all the -duplicates and past issues. +In the example below, we incrementally load the GitHub events, where the API does not let us filter for the newest events - it always returns all of them. Nevertheless, `dlt` will load only the new items, filtering out all the duplicates and past issues. ```py -# use naming function in table name to generate separate tables for each event +# Use naming function in table name to generate separate tables for each event @dlt.resource(primary_key="id", table_name=lambda i: i['type']) # type: ignore def repo_events( last_created_at = dlt.sources.incremental("created_at", initial_value="1970-01-01T00:00:00Z", last_value_func=max), row_order="desc" @@ -523,32 +482,30 @@ def repo_events( yield page ``` -We just yield all the events and `dlt` does the filtering (using `id` column declared as -`primary_key`). +We just yield all the events and `dlt` does the filtering (using the `id` column declared as `primary_key`). -Github returns events ordered from newest to oldest. 
So we declare the `rows_order` as **descending** to [stop requesting more pages once the incremental value is out of range](#declare-row-order-to-not-request-unnecessary-data). We stop requesting more data from the API after finding the first event with `created_at` earlier than `initial_value`. +GitHub returns events ordered from newest to oldest. So we declare the `row_order` as **descending** to [stop requesting more pages once the incremental value is out of range](#declare-row-order-to-not-request-unnecessary-data). We stop requesting more data from the API after finding the first event with `created_at` earlier than `initial_value`. :::note -`dlt.sources.incremental` is implemented as a [filter function](resource.md#filter-transform-and-pivot-data) that is executed **after** all other transforms -you add with `add_map` / `add_filter`. This means that you can manipulate the data item before incremental filter sees it. For example: -* you can create surrogate primary key from other columns -* you can modify cursor value or create a new field composed from other fields -* dump Pydantic models to Python dicts to allow incremental to find custost values +`dlt.sources.incremental` is implemented as a [filter function](resource.md#filter-transform-and-pivot-data) that is executed **after** all other transforms you add with `add_map` or `add_filter`. This means that you can manipulate the data item before the incremental filter sees it. For example: +* You can create a surrogate primary key from other columns +* You can modify the cursor value or create a new field composed of other fields +* You can dump Pydantic models to Python dicts to allow incremental to find custom values [Data validation with Pydantic](schema-contracts.md#use-pydantic-models-for-data-validation) happens **before** incremental filtering.
::: -### max, min or custom `last_value_func` +### Max, min, or custom `last_value_func` -`dlt.sources.incremental` allows to choose a function that orders (compares) cursor values to current `last_value`. -* The default function is built-in `max` which returns bigger value of the two -* Another built-in `min` returns smaller value. +`dlt.sources.incremental` allows you to choose a function that orders (compares) cursor values to the current `last_value`. +* The default function is the built-in `max`, which returns the larger value of the two. +* Another built-in, `min`, returns the smaller value. -You can pass your custom function as well. This lets you define -`last_value` on nested types i.e. dictionaries and store indexes of last values, not just simple +You can also pass your custom function. This lets you define +`last_value` on nested types, i.e., dictionaries, and store indexes of last values, not just simple types. The `last_value` argument is a [JSON Path](https://github.com/json-path/JsonPath#operators) and lets you select nested data (including the whole data item when `$` is used). -Example below creates last value which is a dictionary holding a max `created_at` value for each +The example below creates a last value which is a dictionary holding a max `created_at` value for each created table name: ```py @@ -598,24 +555,25 @@ def get_events(last_created_at = dlt.sources.incremental("created_at", last_valu ``` ### Using `end_value` for backfill + You can specify both initial and end dates when defining incremental loading. 
Let's go back to our Github example: ```py @dlt.resource(primary_key="id") def repo_issues( access_token, repository, - created_at = dlt.sources.incremental("created_at", initial_value="1970-01-01T00:00:00Z", end_value="2022-07-01T00:00:00Z") + created_at=dlt.sources.incremental("created_at", initial_value="1970-01-01T00:00:00Z", end_value="2022-07-01T00:00:00Z") ): - # get issues from created from last "created_at" value + # get issues created from the last "created_at" value for page in _get_issues_page(access_token, repository, since=created_at.start_value, until=created_at.end_value): yield page ``` -Above we use `initial_value` and `end_value` arguments of the `incremental` to define the range of issues that we want to retrieve +Above, we use the `initial_value` and `end_value` arguments of the `incremental` to define the range of issues that we want to retrieve and pass this range to the Github API (`since` and `until`). As in the examples above, `dlt` will make sure that only the issues from -defined range are returned. +the defined range are returned. Please note that when `end_date` is specified, `dlt` **will not modify the existing incremental state**. The backfill is **stateless** and: -1. You can run backfill and incremental load in parallel (ie. in Airflow DAG) in a single pipeline. +1. You can run backfill and incremental load in parallel (i.e., in an Airflow DAG) in a single pipeline. 2. You can partition your backfill into several smaller chunks and run them in parallel as well. To define specific ranges to load, you can simply override the incremental argument in the resource, for example: @@ -634,36 +592,36 @@ august_issues = repo_issues( ... ``` -Note that `dlt`'s incremental filtering considers the ranges half closed. `initial_value` is inclusive, `end_value` is exclusive, so chaining ranges like above works without overlaps. - +Note that dlt's incremental filtering considers the ranges half-closed. 
`initial_value` is inclusive, `end_value` is exclusive, so chaining ranges like above works without overlaps. ### Declare row order to not request unnecessary data -With `row_order` argument set, `dlt` will stop getting data from the data source (ie. Github API) if it detect that values of cursor field are out of range of **start** and **end** values. + +With the `row_order` argument set, dlt will stop retrieving data from the data source (e.g., GitHub API) if it detects that the values of the cursor field are out of the range of **start** and **end** values. In particular: -* `dlt` stops processing when the resource yields any item with an _equal or greater_ cursor value than the `end_value` and `row_order` is set to **asc**. (`end_value` is not included) -* `dlt` stops processing when the resource yields any item with a _lower_ cursor value than the `last_value` and `row_order` is set to **desc**. (`last_value` is included) +* dlt stops processing when the resource yields any item with a cursor value _equal to or greater than_ the `end_value` and `row_order` is set to **asc**. (`end_value` is not included) +* dlt stops processing when the resource yields any item with a cursor value _lower_ than the `last_value` and `row_order` is set to **desc**. (`last_value` is included) :::note -"higher" and "lower" here refers to when the default `last_value_func` is used (`max()`), +"higher" and "lower" here refer to when the default `last_value_func` is used (`max()`), when using `min()` "higher" and "lower" are inverted. ::: :::caution If you use `row_order`, **make sure that the data source returns ordered records** (ascending / descending) on the cursor field, -e.g. if an API returns results both higher and lower +e.g., if an API returns results both higher and lower than the given `end_value` in no particular order, data reading stops and you'll miss the data items that were out of order. ::: -Row order is the most useful when: +Row order is most useful when: -1. 
The data source does **not** offer start/end filtering of results (e.g. there is no `start_time/end_time` query parameter or similar) -2. The source returns results **ordered by the cursor field** +1. The data source does **not** offer start/end filtering of results (e.g., there is no `start_time/end_time` query parameter or similar). +2. The source returns results **ordered by the cursor field**. -The github events example is exactly such case. The results are ordered on cursor value descending but there's no way to tell API to limit returned items to those created before certain date. Without the `row_order` setting, we'd be getting all events, each time we extract the `github_events` resource. +The GitHub events example is exactly such a case. The results are ordered on cursor value descending, but there's no way to tell the API to limit returned items to those created before a certain date. Without the `row_order` setting, we'd be getting all events, each time we extract the `github_events` resource. -In the same fashion the `row_order` can be used to **optimize backfill** so we don't continue -making unnecessary API requests after the end of range is reached. For example: +In the same fashion, the `row_order` can be used to **optimize backfill** so we don't continue +making unnecessary API requests after the end of the range is reached. For example: ```py @dlt.resource(primary_key="id") @@ -682,22 +640,22 @@ def tickets( yield page ``` -In this example we're loading tickets from Zendesk. The Zendesk API yields items paginated and ordered by oldest to newest, -but only offers a `start_time` parameter for filtering so we cannot tell it to -stop getting data at `end_value`. Instead we set `row_order` to `asc` and `dlt` wil stop -getting more pages from API after first page with cursor value `updated_at` is found older +In this example, we're loading tickets from Zendesk. 
The Zendesk API yields items paginated and ordered from oldest to newest, +but only offers a `start_time` parameter for filtering, so we cannot tell it to +stop retrieving data at `end_value`. Instead, we set `row_order` to `asc` and `dlt` will stop +getting more pages from the API after the first page with a cursor value `updated_at` is found equal to or newer than `end_value`. :::caution In rare cases when you use Incremental with a transformer, `dlt` will not be able to automatically close -generator associated with a row that is out of range. You can still call the `can_close()` method on -incremental and exit yield loop when true. +the generator associated with a row that is out of range. You can still call the `can_close()` method on +incremental and exit the yield loop when true. ::: :::tip The `dlt.sources.incremental` instance provides `start_out_of_range` and `end_out_of_range` attributes which are set when the resource yields an element with a higher/lower cursor value than the -initial or end values. If you do not want `dlt` to stop processing automatically and instead to handle such events yourself, do not specify `row_order`: +initial or end values. If you do not want `dlt` to stop processing automatically and instead want to handle such events yourself, do not specify `row_order`: ```py @dlt.transformer(primary_key="id") def tickets( @@ -722,21 +680,9 @@ def tickets( ### Deduplicate overlapping ranges with primary key -`Incremental` **does not** deduplicate datasets like **merge** write disposition does. It however -makes sure than when another portion of data is extracted, records that were previously loaded won't be -included again. `dlt` assumes that you load a range of data, where the lower bound is inclusive (ie. greater than equal). -This makes sure that you never lose any data but will also re-acquire some rows.
-For example: you have a database table with an cursor field on `updated_at` which has a day resolution, then there's a high -chance that after you extract data on a given day, still more records will be added. When you extract on the next day, you -should reacquire data from the last day to make sure all records are present, this will however create overlap with data -from previous extract. - -By default, content hash (a hash of `json` representation of a row) will be used to deduplicate. -This may be slow so`dlt.sources.incremental` will inherit the primary key that is set on the resource. -You can optionally set a `primary_key` that is used exclusively to -deduplicate and which does not become a table hint. The same setting lets you disable the -deduplication altogether when empty tuple is passed. Below we pass `primary_key` directly to -`incremental` to disable deduplication. That overrides `delta` primary_key set in the resource: +`Incremental` **does not** deduplicate datasets like the **merge** write disposition does. However, it ensures that when another portion of data is extracted, records that were previously loaded won't be included again. `dlt` assumes that you load a range of data, where the lower bound is inclusive (i.e., greater than or equal). This ensures that you never lose any data but will also re-acquire some rows. For example, if you have a database table with a cursor field on `updated_at` which has a day resolution, then there's a high chance that after you extract data on a given day, more records will still be added. When you extract on the next day, you should reacquire data from the last day to ensure all records are present; however, this will create overlap with data from the previous extract. + +By default, a content hash (a hash of the `json` representation of a row) will be used to deduplicate. This may be slow, so `dlt.sources.incremental` will inherit the primary key that is set on the resource. 
You can optionally set a `primary_key` that is used exclusively to deduplicate and which does not become a table hint. The same setting lets you disable the deduplication altogether when an empty tuple is passed. Below, we pass `primary_key` directly to `incremental` to disable deduplication. That overrides the `delta` primary_key set in the resource: ```py @dlt.resource(primary_key="delta") @@ -748,8 +694,7 @@ def some_data(last_timestamp=dlt.sources.incremental("item.ts", primary_key=())) ### Using `dlt.sources.incremental` with dynamically created resources -When resources are [created dynamically](source.md#create-resources-dynamically) it is possible to -use `dlt.sources.incremental` definition as well. +When resources are [created dynamically](source.md#create-resources-dynamically), it is possible to use the `dlt.sources.incremental` definition as well. ```py @dlt.source @@ -772,21 +717,20 @@ def stripe(): )(endpoint) ``` -Please note that in the example above, `get_resource` is passed as a function to `dlt.resource` to -which we bind the endpoint: **dlt.resource(...)(endpoint)**. +Please note that in the example above, `get_resource` is passed as a function to `dlt.resource` to which we bind the endpoint: **dlt.resource(...)(endpoint)**. :::caution The typical mistake is to pass a generator (not a function) as below: `yield dlt.resource(get_resource(endpoint), name=endpoint.value, write_disposition="merge", primary_key="id")`. -Here we call **get_resource(endpoint)** and that creates un-evaluated generator on which resource -is created. That prevents `dlt` from controlling the **created** argument during runtime and will -result in `IncrementalUnboundError` exception. +Here we call **get_resource(endpoint)** and that creates an un-evaluated generator on which the resource is created. That prevents `dlt` from controlling the **created** argument during runtime and will result in an `IncrementalUnboundError` exception. 
::: ### Using Airflow schedule for backfill and incremental loading -When [running in Airflow task](../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md#2-modify-dag-file), you can opt-in your resource to get the `initial_value`/`start_value` and `end_value` from Airflow schedule associated with your DAG. Let's assume that **Zendesk tickets** resource contains a year of data with thousands of tickets. We want to backfill the last year of data week by week and then continue incremental loading daily. + +When [running an Airflow task](../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md#2-modify-dag-file), you can opt-in your resource to get the `initial_value`/`start_value` and `end_value` from the Airflow schedule associated with your DAG. Let's assume that the **Zendesk tickets** resource contains a year of data with thousands of tickets. We want to backfill the last year of data week by week and then continue with incremental loading daily. + ```py @dlt.resource(primary_key="id") def tickets( @@ -801,11 +745,13 @@ def tickets( ): yield page ``` -We opt-in to Airflow scheduler by setting `allow_external_schedulers` to `True`: -1. When running on Airflow, the start and end values are controlled by Airflow and `dlt` [state](state.md) is not used. -2. In all other environments, the `incremental` behaves as usual, maintaining `dlt` state. -Let's generate a deployment with `dlt deploy zendesk_pipeline.py airflow-composer` and customize the dag: +We opt-in to the Airflow scheduler by setting `allow_external_schedulers` to `True`: +1. When running on Airflow, the start and end values are controlled by Airflow and the dlt [state](state.md) is not used. +2. In all other environments, the `incremental` behaves as usual, maintaining the dlt state. 
+ +Let's generate a deployment with `dlt deploy zendesk_pipeline.py airflow-composer` and customize the DAG: + ```py @dag( schedule_interval='@weekly', @@ -828,21 +774,22 @@ def zendesk_backfill_bigquery(): ) # select only incremental endpoints in support api data = zendesk_support().with_resources("tickets", "ticket_events", "ticket_metric_events") - # create the source, the "serialize" decompose option will converts dlt resources into Airflow tasks. use "none" to disable it + # create the source, the "serialize" decompose option will convert dlt resources into Airflow tasks. use "none" to disable it tasks.add_run(pipeline, data, decompose="serialize", trigger_rule="all_done", retries=0, provide_context=True) zendesk_backfill_bigquery() ``` + What got customized: -1. We use weekly schedule, and want to get the data from February 2023 (`start_date`) until end of July ('end_date'). -2. We make Airflow to generate all weekly runs (`catchup` is True). -2. We create `zendesk_support` resources where we select only the incremental resources we want to backfill. +1. We use a weekly schedule and want to get the data from February 2023 (`start_date`) until the end of July (`end_date`). +2. We make Airflow generate all weekly runs (`catchup` is True). +3. We create `zendesk_support` resources where we select only the incremental resources we want to backfill. -When you enable the DAG in Airflow, it will generate several runs and start executing them, starting in February and ending in August. Your resource will receive -subsequent weekly intervals starting with `2023-02-12, 00:00:00 UTC` to `2023-02-19, 00:00:00 UTC`. +When you enable the DAG in Airflow, it will generate several runs and start executing them, starting in February and ending in August. Your resource will receive subsequent weekly intervals starting with `2023-02-12, 00:00:00 UTC` to `2023-02-19, 00:00:00 UTC`. 
You can repurpose the DAG above to start loading new data incrementally after (or during) the backfill: + ```py @dag( schedule_interval='@daily', @@ -864,19 +811,21 @@ def zendesk_new_bigquery(): ) tasks.add_run(pipeline, zendesk_support(), decompose="serialize", trigger_rule="all_done", retries=0, provide_context=True) ``` -Above, we switch to daily schedule and disable catchup and end date. We also load all the support resources to the same dataset as backfill (`zendesk_support_data`). -If you want to run this DAG parallel with the backfill DAG, change the pipeline name ie. to `zendesk_support_new` as above. + +Above, we switch to a daily schedule and disable catchup and end date. We also load all the support resources to the same dataset as backfill (`zendesk_support_data`). +If you want to run this DAG in parallel with the backfill DAG, change the pipeline name, for example, to `zendesk_support_new` as above. **Under the hood** -Before `dlt` starts executing incremental resources, it looks for `data_interval_start` and `data_interval_end` Airflow task context variables. Those got mapped to `initial_value` and `end_value` of the -`Incremental` class: -1. `dlt` is smart enough to convert Airflow datetime to iso strings or unix timestamps if your resource is using them. In our example we instantiate `updated_at=dlt.sources.incremental[int]`, where we declare the last value type to be **int**. `dlt` can also infer type if you provide `initial_value` argument. + +Before `dlt` starts executing incremental resources, it looks for `data_interval_start` and `data_interval_end` Airflow task context variables. These are mapped to `initial_value` and `end_value` of the `Incremental` class: +1. `dlt` is smart enough to convert Airflow datetime to ISO strings or Unix timestamps if your resource is using them. In our example, we instantiate `updated_at=dlt.sources.incremental[int]`, where we declare the last value type to be **int**.
`dlt` can also infer the type if you provide the `initial_value` argument. 2. If `data_interval_end` is in the future or is None, `dlt` sets the `end_value` to **now**. -3. If `data_interval_start` == `data_interval_end` we have a manually triggered DAG run. In that case `data_interval_end` will also be set to **now**. +3. If `data_interval_start` == `data_interval_end`, we have a manually triggered DAG run. In that case, `data_interval_end` will also be set to **now**. **Manual runs** -You can run DAGs manually but you must remember to specify the Airflow logical date of the run in the past (use Run with config option). For such run `dlt` will load all data from that past date until now. -If you do not specify the past date, a run with a range (now, now) will happen yielding no data. + +You can run DAGs manually, but you must remember to specify the Airflow logical date of the run in the past (use the Run with config option). For such a run, `dlt` will load all data from that past date until now. +If you do not specify the past date, a run with a range (now, now) will happen, yielding no data. ### Reading incremental loading parameters from configuration @@ -892,7 +841,7 @@ Consider the example below for reading incremental loading parameters from "conf `cursor_path` is assigned the value "idAfter" with an initial value of 10. -1. Here's how the `generate_incremental_records` resource uses `cursor_path` defined in "config.toml": +1. Here's how the `generate_incremental_records` resource uses the `cursor_path` defined in "config.toml": ```py @dlt.resource(table_name="incremental_records") def generate_incremental_records(id_after: dlt.sources.incremental = dlt.config.value): @@ -913,8 +862,8 @@ Consider the example below for reading incremental loading parameters from "conf You can customize the incremental processing of dlt by setting the parameter `on_cursor_value_missing`. When loading incrementally with the default settings, there are two assumptions: -1. 
each row contains the cursor path -2. each row is expected to contain a value at the cursor path that is not `None`. +1. Each row contains the cursor path. +2. Each row is expected to contain a value at the cursor path that is not `None`. For example, the two following source data will raise an error: ```py @@ -938,7 +887,7 @@ list(some_data_without_cursor_value()) ``` -To process a data set where some records do not include the incremental cursor path or where the values at the cursor path are `None,` there are the following four options: +To process a data set where some records do not include the incremental cursor path or where the values at the cursor path are `None`, there are the following four options: 1. Configure the incremental load to raise an exception in case there is a row where the cursor path is missing or has the value `None` using `incremental(..., on_cursor_value_missing="raise")`. This is the default behavior. 2. Configure the incremental load to tolerate the missing cursor path and `None` values using `incremental(..., on_cursor_value_missing="include")`. @@ -961,7 +910,7 @@ assert result[1] == {"id": 2, "created_at": 2} assert result[2] == {"id": 3, "created_at": 4, "updated_at": None} ``` -If you do not want to import records without the cursor path or where the value at the cursor path is `None` use the following incremental configuration: +If you do not want to import records without the cursor path or where the value at the cursor path is `None`, use the following incremental configuration: ```py @dlt.resource @@ -977,11 +926,11 @@ assert len(result) == 1 ``` ### Transform records before incremental processing -If you want to load data that includes `None` values you can transform the records before the incremental processing. +If you want to load data that includes `None` values, you can transform the records before the incremental processing. 
You can add steps to the pipeline that [filter, transform, or pivot your data](../general-usage/resource.md#filter-transform-and-pivot-data). :::caution -It is important to set the `insert_at` parameter of the `add_map` function to control the order of the execution and ensure that your custom steps are executed before the incremental processing starts. +It is important to set the `insert_at` parameter of the `add_map` function to control the order of execution and ensure that your custom steps are executed before the incremental processing starts. In the following example, the step of data yielding is at `index = 0`, the custom transformation at `index = 1`, and the incremental processing at `index = 2`. ::: @@ -1001,13 +950,13 @@ def set_default_updated_at(record): record["updated_at"] = record.get("created_at") return record -# modifies records before the incremental processing +# Modifies records before the incremental processing with_default_values = some_data().add_map(set_default_updated_at, insert_at=1) result = list(with_default_values) assert len(result) == 3 assert result[2]["updated_at"] == 4 -# removes records before the incremental processing +# Removes records before the incremental processing without_none = some_data().add_filter(lambda r: r.get("updated_at") is not None, insert_at=1) result_filtered = list(without_none) assert len(result_filtered) == 2 @@ -1016,67 +965,62 @@ assert len(result_filtered) == 2 ## Doing a full refresh -You may force a full refresh of a `merge` and `append` pipelines: +You may force a full refresh of `merge` and `append` pipelines: -1. In case of a `merge` the data in the destination is deleted and loaded fresh. Currently we do not - deduplicate data during the full refresh. -1. In case of `dlt.sources.incremental` the data is deleted and loaded from scratch. The state of - the incremental is reset to the initial value. +1. In the case of a `merge`, the data in the destination is deleted and loaded fresh. 
Currently, we do not deduplicate data during the full refresh. +1. In the case of `dlt.sources.incremental`, the data is deleted and loaded from scratch. The state of the incremental is reset to the initial value. Example: ```py p = dlt.pipeline(destination="bigquery", dataset_name="dataset_name") -# do a full refresh +# Do a full refresh p.run(merge_source(), write_disposition="replace") -# do a full refresh of just one table +# Do a full refresh of just one table p.run(merge_source().with_resources("merge_table"), write_disposition="replace") -# run a normal merge +# Run a normal merge p.run(merge_source()) ``` -Passing write disposition to `replace` will change write disposition on all the resources in +Passing write disposition to `replace` will change the write disposition on all the resources in `repo_events` during the run of the pipeline. ## Custom incremental loading with pipeline state -The pipeline state is a Python dictionary which gets committed atomically with the data; you can set -values in it in your resources and on next pipeline run, request them back. +The pipeline state is a Python dictionary that gets committed atomically with the data; you can set +values in it in your resources and on the next pipeline run, request them back. -The pipeline state is in principle scoped to the resource - all values of the state set by resource -are private and isolated from any other resource. You can also access the source-scoped state which +The pipeline state is, in principle, scoped to the resource - all values of the state set by a resource +are private and isolated from any other resource. You can also access the source-scoped state, which can be shared across resources. [You can find more information on pipeline state here](state.md#pipeline-state). -### Preserving the last value in resource state. 
+### Preserving the last value in resource state -For the purpose of preserving the “last value” or similar loading checkpoints, we can open a dlt -state dictionary with a key and a default value as below. When the resource is executed and the data -is loaded, the yielded resource data will be loaded at the same time with the update to the state. +For the purpose of preserving the "last value" or similar loading checkpoints, we can open a dlt state dictionary with a key and a default value as below. When the resource is executed and the data is loaded, the yielded resource data will be loaded at the same time with the update to the state. -In the two examples below you see how the `dlt.sources.incremental` is working under the hood. +In the two examples below, you see how the `dlt.sources.incremental` is working under the hood. ```py @resource() def tweets(): - # Get a last value from loaded metadata. If not exist, get None + # Get the last value from loaded metadata. If it does not exist, get None last_val = dlt.current.resource_state().setdefault("last_updated", None) - # get data and yield it + # Get data and yield it data = get_data(start_from=last_val) yield data - # change the state to the new value + # Change the state to the new value dlt.current.resource_state()["last_updated"] = data["last_timestamp"] ``` -If we keep a list or a dictionary in the state, we can modify the underlying values in the objects, -and thus we do not need to set the state back explicitly. +If we keep a list or a dictionary in the state, we can modify the underlying values in the objects, and thus we do not need to set the state back explicitly. ```py @resource() def tweets(): - # Get a last value from loaded metadata. If not exist, get None + # Get the last value from loaded metadata. 
If it does not exist, get None loaded_dates = dlt.current.resource_state().setdefault("days_loaded", []) - # do stuff: get data and add new values to the list + # Do stuff: get data and add new values to the list # `loaded_date` is a reference to the `dlt.current.resource_state()["days_loaded"]` list # and thus modifying it modifies the state yield data @@ -1085,24 +1029,18 @@ def tweets(): Step by step explanation of how to get or set the state: -1. We can use the function `var = dlt.current.resource_state().setdefault("key", [])`. This allows - us to retrieve the values of `key`. If `key` was not set yet, we will get the default value `[]` - instead. -1. We now can treat `var` as a python list - We can append new values to it, or if applicable we can - read the values from previous loads. -1. On pipeline run, the data will load, and the new `var`'s value will get saved in the state. The - state is stored at the destination, so it will be available on subsequent runs. +1. We can use the function `var = dlt.current.resource_state().setdefault("key", [])`. This allows us to retrieve the values of `key`. If `key` was not set yet, we will get the default value `[]` instead. +2. We can now treat `var` as a Python list - we can append new values to it, or if applicable, we can read the values from previous loads. +3. On pipeline run, the data will load, and the new `var`'s value will get saved in the state. The state is stored at the destination, so it will be available on subsequent runs. ### Advanced state usage: storing a list of processed entities -Let’s look at the `player_games` resource from the chess pipeline. The chess API has a method to -request games archives for a given month. The task is to prevent the user to load the same month -data twice - even if the user makes a mistake and requests the same months range again: +Let's look at the `players_games` resource from the chess pipeline. The chess API has a method to request game archives for a given month.
The task is to prevent the user from loading the same month data twice - even if the user makes a mistake and requests the same months range again: - Our data is requested in 2 steps: - Get all available archives URLs. - Get the data from each URL. -- We will add the “chess archives” URLs to this list we created. +- We will add the "chess archives" URLs to this list we created. - This will allow us to track what data we have loaded. - When the data is loaded, the list of archives is loaded with it. - Later we can read this list and know what data has already been loaded. @@ -1114,23 +1052,23 @@ In the following example, we initialize a variable with an empty list as a defau def players_games(chess_url, players, start_month=None, end_month=None): loaded_archives_cache = dlt.current.resource_state().setdefault("archives", []) - # as far as python is concerned, this variable behaves like + # As far as Python is concerned, this variable behaves like # loaded_archives_cache = state['archives'] or [] - # afterwards we can modify list, and finally + # Afterwards, we can modify the list, and finally # when the data is loaded, the cache is updated with our loaded_archives_cache - # get archives for a given player + # Get archives for a given player archives = get_players_archives(chess_url, players) for url in archives: - # if not in cache, yield the data and cache the URL + # If not in cache, yield the data and cache the URL if url not in loaded_archives_cache: - # add URL to cache and yield the associated data + # Add URL to cache and yield the associated data loaded_archives_cache.append(url) r = requests.get(url) r.raise_for_status() yield r.json().get("games", []) else: - print(f"skipping archive {url}") + print(f"Skipping archive {url}") ``` ### Advanced state usage: tracking the last value for all search terms in Twitter API @@ -1140,7 +1078,7 @@ def players_games(chess_url, players, start_month=None, end_month=None): def 
search_tweets(twitter_bearer_token=dlt.secrets.value, search_terms=None, start_time=None, end_time=None, last_value=None): headers = _headers(twitter_bearer_token) for search_term in search_terms: - # make cache for each term + # Make cache for each term last_value_cache = dlt.current.resource_state().setdefault(f"last_value_{search_term}", None) print(f'last_value_cache: {last_value_cache}') params = {...} @@ -1149,9 +1087,9 @@ def search_tweets(twitter_bearer_token=dlt.secrets.value, search_terms=None, sta for page in response: page['search_term'] = search_term last_id = page.get('meta', {}).get('newest_id', 0) - #set it back - not needed if we + # Set it back - not needed if we dlt.current.resource_state()[f"last_value_{search_term}"] = max(last_value_cache or 0, int(last_id)) - # print the value for each search term + # Print the value for each search term print(f'new_last_value_cache for term {search_term}: {last_value_cache}') yield page @@ -1161,11 +1099,11 @@ def search_tweets(twitter_bearer_token=dlt.secrets.value, search_terms=None, sta If you see that the incremental loading is not working as expected and the incremental values are not modified between pipeline runs, check the following: -1. Make sure the `destination`, `pipeline_name` and `dataset_name` are the same between pipeline runs. +1. Make sure the `destination`, `pipeline_name`, and `dataset_name` are the same between pipeline runs. 2. Check if `dev_mode` is `False` in the pipeline configuration. Check if `refresh` for associated sources and resources is not enabled. -3. Check the logs for `Bind incremental on ...` message. This message indicates that the incremental value was bound to the resource and shows the state of the incremental value. +3. Check the logs for the `Bind incremental on ...` message. This message indicates that the incremental value was bound to the resource and shows the state of the incremental value. 4. After the pipeline run, check the state of the pipeline. 
You can do this by running the following command: @@ -1217,3 +1155,4 @@ sources: ``` Verify that the `last_value` is updated between pipeline runs. + diff --git a/docs/website/docs/general-usage/naming-convention.md b/docs/website/docs/general-usage/naming-convention.md index 16898cf8d1..f1766d1797 100644 --- a/docs/website/docs/general-usage/naming-convention.md +++ b/docs/website/docs/general-usage/naming-convention.md @@ -4,8 +4,8 @@ description: Control how dlt creates table, column and other identifiers keywords: [identifiers, snake case, case sensitive, case insensitive, naming] --- -# Naming Convention -`dlt` creates table and column identifiers from the data. The data source, i.e. a stream of JSON documents, may have identifiers (i.e. key names in a dictionary) with any Unicode characters, of any length, and naming style. On the other hand, destinations require that you follow strict rules when you name tables, columns, or collections. +# Naming convention +dlt creates table and column identifiers from the data. The data source, i.e., a stream of JSON documents, may have identifiers (i.e., key names in a dictionary) with any Unicode characters, of any length, and naming style. On the other hand, destinations require that you follow strict rules when you name tables, columns, or collections. A good example is [Redshift](../dlt-ecosystem/destinations/redshift.md#naming-convention) that accepts case-insensitive alphanumeric identifiers with a maximum of 127 characters. `dlt` groups tables from a single [source](source.md) in a [schema](schema.md). Each schema defines a **naming convention** that tells `dlt` how to translate identifiers to the @@ -20,19 +20,19 @@ The standard behavior of `dlt` is to **use the same naming convention for all de ### Use default naming convention (snake_case) **snake_case** is a case-insensitive naming convention, converting source identifiers into lower-case snake case identifiers with a reduced alphabet. 
-- Spaces around identifiers are trimmed -- Keeps ASCII alphanumerics and underscores, replaces all other characters with underscores (with the exceptions below) -- Replaces `+` and `*` with `x`, `-` with `_`, `@` with `a`, and `|` with `l` +- Spaces around identifiers are trimmed. +- Keeps ASCII alphanumerics and underscores, replaces all other characters with underscores (with the exceptions below). +- Replaces `+` and `*` with `x`, `-` with `_`, `@` with `a`, and `|` with `l`. - Prepends `_` if the name starts with a number. - Multiples of `_` are converted into a single `_`. -- Replaces all trailing `_` with `x` +- Replaces all trailing `_` with `x`. -Uses __ as a nesting separator for tables and flattened column names. +Uses `__` as a nesting separator for tables and flattened column names. :::tip If you do not like **snake_case**, your next safe option is **sql_ci**, which generates SQL-safe, lowercase, case-insensitive identifiers without any other transformations. To permanently change the default naming convention on a given machine: -1. set an environment variable `SCHEMA__NAMING` to `sql_ci_v1` OR -2. add the following line to your global `config.toml` (the one in your home dir, i.e. `~/.dlt/config.toml`) +1. Set an environment variable `SCHEMA__NAMING` to `sql_ci_v1` OR +2. Add the following line to your global `config.toml` (the one in your home dir, i.e., `~/.dlt/config.toml`): ```toml [schema] naming="sql_ci_v1" @@ -43,15 +43,15 @@ naming="sql_ci_v1" ### Pick the right identifier form when defining resources `dlt` keeps source (not normalized) identifiers during data [extraction](../reference/explainers/how-dlt-works.md#extract) and translates them during [normalization](../reference/explainers/how-dlt-works.md#normalize). For you, it means: 1. 
If you write a [transformer](resource.md#process-resources-with-dlttransformer) or a [mapping/filtering function](resource.md#filter-transform-and-pivot-data), you will see the original data, without any normalization. Use the source identifiers to access the dicts! -2. If you define a `primary_key` or `cursor` that participate in [cursor field incremental loading](incremental-loading.md#incremental-loading-with-a-cursor-field), use the source identifiers (`dlt` uses them to inspect source data, `Incremental` class is just a filtering function). -3. When defining any other hints, i.e. `columns` or `merge_key`, you can pick source or destination identifiers. `dlt` normalizes all hints together with your data. -4. The `Schema` object (i.e. obtained from the pipeline or from `dlt` source via `discover_schema`) **always contains destination (normalized) identifiers**. +2. If you define a `primary_key` or `cursor` that participates in [cursor field incremental loading](incremental-loading.md#incremental-loading-with-a-cursor-field), use the source identifiers (`dlt` uses them to inspect source data, `Incremental` class is just a filtering function). +3. When defining any other hints, i.e., `columns` or `merge_key`, you can pick source or destination identifiers. `dlt` normalizes all hints together with your data. +4. The `Schema` object (i.e., obtained from the pipeline or from `dlt` source via `discover_schema`) **always contains destination (normalized) identifiers**. ### Understand the identifier normalization Identifiers are translated from source to destination form in the **normalize** step. Here's how `dlt` picks the naming convention: * The default naming convention is **snake_case**. -* Each destination may define a preferred naming convention in [destination capabilities](destination.md#pass-additional-parameters-and-change-destination-capabilities). Some destinations (i.e. Weaviate) need a specialized naming convention and will override the default. 
+* Each destination may define a preferred naming convention in [destination capabilities](destination.md#pass-additional-parameters-and-change-destination-capabilities). Some destinations (e.g., Weaviate) need a specialized naming convention and will override the default. * You can [configure a naming convention explicitly](#set-and-adjust-naming-convention-explicitly). Such configuration overrides the destination settings. * This naming convention is used when new schemas are created. It happens when the pipeline is run for the first time. * Schemas preserve the naming convention when saved. Your running pipelines will maintain existing naming conventions if not requested otherwise. @@ -62,9 +62,9 @@ If you change the naming convention and `dlt` detects a change in the destinatio ::: ### Case-sensitive and insensitive destinations -Naming conventions declare if the destination identifiers they produce are case-sensitive or insensitive. This helps `dlt` to [generate case-sensitive / insensitive identifiers for the destinations that support both](destination.md#control-how-dlt-creates-table-column-and-other-identifiers). For example: if you pick a case-insensitive naming like **snake_case** or **sql_ci_v1**, with Snowflake, `dlt` will generate all uppercase identifiers that Snowflake sees as case-insensitive. If you pick a case-sensitive naming like **sql_cs_v1**, `dlt` will generate quoted case-sensitive identifiers that preserve identifier capitalization. +Naming conventions declare if the destination identifiers they produce are case-sensitive or insensitive. This helps `dlt` to [generate case-sensitive / insensitive identifiers for the destinations that support both](destination.md#control-how-dlt-creates-table-column-and-other-identifiers). For example, if you pick a case-insensitive naming like **snake_case** or **sql_ci_v1**, with Snowflake, `dlt` will generate all uppercase identifiers that Snowflake sees as case-insensitive.
If you pick a case-sensitive naming like **sql_cs_v1**, `dlt` will generate quoted case-sensitive identifiers that preserve identifier capitalization. -Note that many destinations are exclusively case-insensitive, of which some preserve the casing of identifiers (i.e. **duckdb**) and some will case-fold identifiers when creating tables (i.e. **Redshift**, **Athena** do lowercase on the names). `dlt` is able to detect resulting identifier [collisions](#avoid-identifier-collisions) and stop the load process before data is mangled. +Note that many destinations are exclusively case-insensitive, of which some preserve the casing of identifiers (e.g., **duckdb**) and some will case-fold identifiers when creating tables (e.g., **Redshift** and **Athena** lowercase the names). `dlt` is able to detect resulting identifier [collisions](#avoid-identifier-collisions) and stop the load process before data is mangled. ### Identifier shortening Identifier shortening happens during normalization. `dlt` takes the maximum length of the identifier from the destination capabilities and will trim the identifiers that are too long. The default shortening behavior generates short deterministic hashes of the source identifiers and places them in the middle of the destination identifier. This (with a high probability) avoids shortened identifier collisions. @@ -75,16 +75,16 @@ Identifier shortening happens during normalization. `dlt` takes the maximum leng ## Pick your own naming convention ### Configure naming convention -You can use `config.toml`, environment variables, or any other configuration provider to set the naming convention name. Configured naming convention **overrides all other settings** -- changes the naming convention stored in the already created schema -- overrides the destination capabilities preference. +You can use `config.toml`, environment variables, or any other configuration provider to set the naming convention name.
The configured naming convention **overrides all other settings**: +- Changes the naming convention stored in the already created schema. +- Overrides the destination capabilities preference. ```toml [schema] naming="sql_ci_v1" ``` The configuration above will request **sql_ci_v1** for all pipelines (schemas). An environment variable `SCHEMA__NAMING` set to `sql_ci_v1` has the same effect. -You have an option to set the naming convention per source: +You have the option to set the naming convention per source: ```toml [sources.zendesk] config="prop" @@ -93,7 +93,7 @@ naming="sql_cs_v1" [sources.zendesk.credentials] password="pass" ``` -The snippet above demonstrates how to apply certain naming for an example `zendesk` source. +The snippet above demonstrates how to apply a certain naming for an example `zendesk` source. You can use naming conventions that you created yourself or got from other users. In that case, you should pass a full Python import path to the [module that contains the naming convention](#write-your-own-naming-convention): ```toml @@ -106,11 +106,11 @@ naming="tests.common.cases.normalizers.sql_upper" ### Available naming conventions You can pick from a few built-in naming conventions. -* `snake_case` - the default -* `duck_case` - case-sensitive, allows all Unicode characters like emoji 💥 -* `direct` - case-sensitive, allows all Unicode characters, does not contract underscores -* `sql_cs_v1` - case-sensitive, generates SQL-safe identifiers -* `sql_ci_v1` - case-insensitive, generates SQL-safe lowercase identifiers +* `snake_case` - the default. +* `duck_case` - case-sensitive, allows all Unicode characters like emoji 💥. +* `direct` - case-sensitive, allows all Unicode characters, does not contract underscores. +* `sql_cs_v1` - case-sensitive, generates SQL-safe identifiers. +* `sql_ci_v1` - case-insensitive, generates SQL-safe lowercase identifiers. 
### Ignore naming convention for `dataset_name` @@ -139,27 +139,28 @@ Depending on the destination, certain names may not be allowed. To ensure your d ## Avoid identifier collisions `dlt` detects various types of identifier collisions and ignores the others. -1. `dlt` detects collisions if a case-sensitive naming convention is used on a case-insensitive destination -2. `dlt` detects collisions if a change of naming convention changes the identifiers of tables already created in the destination -3. `dlt` detects collisions when the naming convention is applied to column names of arrow tables +1. dlt detects collisions if a case-sensitive naming convention is used on a case-insensitive destination. +2. dlt detects collisions if a change of naming convention changes the identifiers of tables already created in the destination. +3. dlt detects collisions when the naming convention is applied to column names of arrow tables. `dlt` will not detect a collision when normalizing source data. If you have a dictionary, keys will be merged if they collide after being normalized. You can create a custom naming convention that does not generate collisions on data, see examples below. - ## Write your own naming convention -Custom naming conventions are classes that derive from `NamingConvention` that you can import from `dlt.common.normalizers.naming`. We recommend the following module layout: -1. Each naming convention resides in a separate Python module (file) -2. The class is always named `NamingConvention` + +Custom naming conventions are classes that derive from `NamingConvention`, which you can import from `dlt.common.normalizers.naming`. We recommend the following module layout: +1. Each naming convention resides in a separate Python module (file). +2. The class is always named `NamingConvention`. 
In that case, you can use a fully qualified module name in [schema configuration](#configure-naming-convention) or pass the module [explicitly](#set-and-adjust-naming-convention-explicitly). We include [two examples](../examples/custom_naming) of naming conventions that you may find useful: 1. A variant of `sql_ci` that generates identifier collisions with a low (user-defined) probability by appending a deterministic tag to each name. -2. A variant of `sql_cs` that allows for LATIN (i.e. umlaut) characters +2. A variant of `sql_cs` that allows for LATIN (i.e., umlaut) characters. :::note -Note that a fully qualified name of your custom naming convention will be stored in the `Schema` and `dlt` will attempt to import it when the schema is loaded from storage. +Note that the fully qualified name of your custom naming convention will be stored in the schema, and dlt will attempt to import it when the schema is loaded from storage. You should distribute your custom naming conventions with your pipeline code or via a pip package from which it can be imported. -::: \ No newline at end of file +::: + diff --git a/docs/website/docs/general-usage/pipeline.md b/docs/website/docs/general-usage/pipeline.md index 40f9419bc2..0a159af910 100644 --- a/docs/website/docs/general-usage/pipeline.md +++ b/docs/website/docs/general-usage/pipeline.md @@ -6,14 +6,14 @@ keywords: [pipeline, source, full refresh, dev mode] # Pipeline -A [pipeline](glossary.md#pipeline) is a connection that moves the data from your Python code to a +A [pipeline](glossary.md#pipeline) is a connection that moves data from your Python code to a [destination](glossary.md#destination). The pipeline accepts `dlt` [sources](source.md) or -[resources](resource.md) as well as generators, async generators, lists and any iterables. -Once the pipeline runs, all resources get evaluated and the data is loaded at destination. +[resources](resource.md), as well as generators, async generators, lists, and any iterables. 
+Once the pipeline runs, all resources are evaluated and the data is loaded at the destination. Example: -This pipeline will load a list of objects into `duckdb` table with a name "three": +This pipeline will load a list of objects into a DuckDB table named "three": ```py import dlt @@ -25,30 +25,30 @@ info = pipeline.run([{'id':1}, {'id':2}, {'id':3}], table_name="three") print(info) ``` -You instantiate a pipeline by calling `dlt.pipeline` function with following arguments: +You instantiate a pipeline by calling the `dlt.pipeline` function with the following arguments: -- `pipeline_name` a name of the pipeline that will be used to identify it in trace and monitoring +- `pipeline_name`: a name of the pipeline that will be used to identify it in trace and monitoring events and to restore its state and data schemas on subsequent runs. If not provided, `dlt` will - create pipeline name from the file name of currently executing Python module. -- `destination` a name of the [destination](../dlt-ecosystem/destinations) to which dlt - will load the data. May also be provided to `run` method of the `pipeline`. -- `dataset_name` a name of the dataset to which the data will be loaded. A dataset is a logical - group of tables i.e. `schema` in relational databases or folder grouping many files. May also be - provided later to the `run` or `load` methods of the pipeline. If not provided at all then - defaults to the `pipeline_name`. + create a pipeline name from the file name of the currently executing Python module. +- `destination`: a name of the [destination](../dlt-ecosystem/destinations) to which dlt + will load the data. It may also be provided to the `run` method of the `pipeline`. +- `dataset_name`: a name of the dataset to which the data will be loaded. A dataset is a logical + group of tables, i.e., `schema` in relational databases or a folder grouping many files. It may also be + provided later to the `run` or `load` methods of the pipeline. 
If not provided at all, then + it defaults to the `pipeline_name`. -To load the data you call the `run` method and pass your data in `data` argument. +To load the data, you call the `run` method and pass your data in the `data` argument. Arguments: -- `data` (the first argument) may be a dlt source, resource, generator function, or any Iterator / - Iterable (i.e. a list or the result of `map` function). +- `data` (the first argument) may be a dlt source, resource, generator function, or any Iterator or + Iterable (e.g., a list or the result of the `map` function). - `write_disposition` controls how to write data to a table. Defaults to "append". - `append` will always add new data at the end of the table. - `replace` will replace existing data with new data. - `skip` will prevent data from loading. - `merge` will deduplicate and merge data based on `primary_key` and `merge_key` hints. -- `table_name` - specified in case when table name cannot be inferred i.e. from the resources or name +- `table_name`: specified in cases when the table name cannot be inferred, e.g., from the resources or name of the generator function. Example: This pipeline will load the data the generator `generate_rows(10)` produces: @@ -70,24 +70,25 @@ print(info) ## Pipeline working directory Each pipeline that you create with `dlt` stores extracted files, load packages, inferred schemas, -execution traces and the [pipeline state](state.md) in a folder in the local filesystem. The default -location for such folders is in user home directory: `~/.dlt/pipelines/`. +execution traces, and the [pipeline state](state.md) in a folder in the local filesystem. The default +location for such folders is in the user's home directory: `~/.dlt/pipelines/`. You can inspect stored artifacts using the command [dlt pipeline info](../reference/command-line-interface.md#dlt-pipeline) and [programmatically](../walkthroughs/run-a-pipeline.md#4-inspect-a-load-process).
-> 💡 A pipeline with given name looks for its working directory in location above - so if you have two +> 💡 A pipeline with a given name looks for its working directory in the location above - so if you have two > pipeline scripts that create a pipeline with the same name, they will see the same working folder -> and share all the possible state. You may override the default location using `pipelines_dir` +> and share all the possible state. You may override the default location using the `pipelines_dir` > argument when creating the pipeline. -> 💡 You can attach `Pipeline` instance to an existing working folder, without creating a new +> 💡 You can attach a `Pipeline` instance to an existing working folder, without creating a new > pipeline with `dlt.attach`. -### Separate working environments with `pipelines_dir`. -You can run several pipelines with the same name but with different configuration ie. to target development / staging / production environments. -Set the `pipelines_dir` argument to store all the working folders in specific place. For example: +### Separate working environments with `pipelines_dir` + +You can run several pipelines with the same name but with different configurations, for example, to target development, staging, or production environments. +Set the `pipelines_dir` argument to store all the working folders in a specific place. For example: ```py import dlt from dlt.common.pipeline import get_dlt_pipelines_dir @@ -95,36 +96,36 @@ from dlt.common.pipeline import get_dlt_pipelines_dir dev_pipelines_dir = os.path.join(get_dlt_pipelines_dir(), "dev") pipeline = dlt.pipeline(destination="duckdb", dataset_name="sequence", pipelines_dir=dev_pipelines_dir) ``` -stores pipeline working folder in `~/.dlt/pipelines/dev/`. Mind that you need to pass this `~/.dlt/pipelines/dev/` -in to all cli commands to get info/trace for that pipeline. +This code stores the pipeline working folder in `~/.dlt/pipelines/dev/`. 
Note that you need to pass this `~/.dlt/pipelines/dev/` +into all CLI commands to get info/trace for that pipeline. ## Do experiments with dev mode -If you [create a new pipeline script](../walkthroughs/create-a-pipeline.md) you will be -experimenting a lot. If you want that each time the pipeline resets its state and loads data to a +If you [create a new pipeline script](../walkthroughs/create-a-pipeline.md), you will be +experimenting a lot. If you want the pipeline to reset its state each time and load data to a new dataset, set the `dev_mode` argument of the `dlt.pipeline` method to True. Each time the -pipeline is created, `dlt` adds datetime-based suffix to the dataset name. +pipeline is created, `dlt` adds a datetime-based suffix to the dataset name. ## Refresh pipeline data and state You can reset parts or all of your sources by using the `refresh` argument to `dlt.pipeline` or the pipeline's `run` or `extract` method. -That means when you run the pipeline the sources/resources being processed will have their state reset and their tables either dropped or truncated +That means when you run the pipeline, the sources/resources being processed will have their state reset and their tables either dropped or truncated, depending on which refresh mode is used. -`refresh` option works with all relational/sql destinations and file buckets (`filesystem`). it does not work with vector databases (we are working on that) and +The `refresh` option works with all relational or SQL destinations and cloud storages and files (`filesystem`). It does not work with vector databases (we are working on that) and with custom destinations. The `refresh` argument should have one of the following string values to decide the refresh mode: ### Drop tables and pipeline state for a source with `drop_sources` All sources being processed in `pipeline.run` or `pipeline.extract` are refreshed.
-That means all tables listed in their schemas are dropped and state belonging to those sources and all their resources is completely wiped. -The tables are deleted both from pipeline's schema and from the destination database. +That means all tables listed in their schemas are dropped and the state belonging to those sources and all their resources is completely wiped. +The tables are deleted both from the pipeline's schema and from the destination database. -If you only have one source or run with all your sources together, then this is practically like running the pipeline again for the first time +If you only have one source or run with all your sources together, then this is practically like running the pipeline again for the first time. :::caution -This erases schema history for the selected sources and only the latest version is stored +This erases schema history for the selected sources and only the latest version is stored. ::: ```py @@ -133,26 +134,27 @@ import dlt pipeline = dlt.pipeline("airtable_demo", destination="duckdb") pipeline.run(airtable_emojis(), refresh="drop_sources") ``` -In example above we instruct `dlt` to wipe pipeline state belonging to `airtable_emojis` source and drop all the database tables in `duckdb` to +In the example above, we instruct `dlt` to wipe the pipeline state belonging to the `airtable_emojis` source and drop all the database tables in `duckdb` to which data was loaded. The `airtable_emojis` source had two resources named "📆 Schedule" and "💰 Budget" loading to tables "_schedule" and "_budget". Here's what `dlt` does step by step: -1. collects a list of tables to drop by looking for all the tables in the schema that are created in the destination. -2. removes existing pipeline state associated with `airtable_emojis` source -3. resets the schema associated with `airtable_emojis` source -4. executes `extract` and `normalize` steps. those will create fresh pipeline state and a schema -5. 
before it executes `load` step, the collected tables are dropped from staging and regular dataset -6. schema `airtable_emojis` (associated with the source) is be removed from `_dlt_version` table -7. executes `load` step as usual so tables are re-created and fresh schema and pipeline state are stored. +1. Collects a list of tables to drop by looking for all the tables in the schema that are created in the destination. +2. Removes existing pipeline state associated with the `airtable_emojis` source. +3. Resets the schema associated with the `airtable_emojis` source. +4. Executes `extract` and `normalize` steps. These will create fresh pipeline state and a schema. +5. Before it executes the `load` step, the collected tables are dropped from staging and regular dataset. +6. Schema `airtable_emojis` (associated with the source) is removed from the `_dlt_version` table. +7. Executes the `load` step as usual so tables are re-created and fresh schema and pipeline state are stored. ### Selectively drop tables and resource state with `drop_resources` -Limits the refresh to the resources being processed in `pipeline.run` or `pipeline.extract` (.e.g by using `source.with_resources(...)`). -Tables belonging to those resources are dropped and their resource state is wiped (that includes incremental state). -The tables are deleted both from pipeline's schema and from the destination database. -Source level state keys are not deleted in this mode (i.e. `dlt.state()[<'my_key>'] = ''`) +Limits the refresh to the resources being processed in `pipeline.run` or `pipeline.extract` (e.g., by using `source.with_resources(...)`). +Tables belonging to those resources are dropped, and their resource state is wiped (that includes incremental state). +The tables are deleted both from the pipeline's schema and from the destination database. 
+ +Source-level state keys are not deleted in this mode (e.g., `dlt.state()['<my_key>'] = ''`). :::caution -This erases schema history for all affected sources and only the latest schema version is stored. +This erases schema history for all affected sources, and only the latest schema version is stored. ::: ```py @@ -161,11 +163,12 @@ import dlt pipeline = dlt.pipeline("airtable_demo", destination="duckdb") pipeline.run(airtable_emojis().with_resources("📆 Schedule"), refresh="drop_resources") ``` -Above we request that the state associated with "📆 Schedule" resource is reset and the table generated by it ("_schedule") is dropped. Other resources, -tables and state are not affected. Please check `drop_sources` for step by step description of what `dlt` does internally. +Above, we request that the state associated with the "📆 Schedule" resource is reset, and the table generated by it ("_schedule") is dropped. Other resources, +tables, and state are not affected. Please check `drop_sources` for a step-by-step description of what `dlt` does internally. ### Selectively truncate tables and reset resource state with `drop_data` -Same as `drop_resources` but instead of dropping tables from schema only the data is deleted from them (i.e. by `TRUNCATE ` in sql destinations). Resource state for selected resources is also wiped. In case of [incremental resources](incremental-loading.md#incremental-loading-with-a-cursor-field) this will + +Same as `drop_resources`, but instead of dropping tables from the schema, only the data is deleted from them (e.g., by `TRUNCATE` in SQL destinations). Resource state for selected resources is also wiped. In the case of [incremental resources](incremental-loading.md#incremental-loading-with-a-cursor-field), this will reset the cursor state and fully reload the data from the `initial_value`. The schema remains unmodified in this case.
@@ -175,30 +178,30 @@ import dlt pipeline = dlt.pipeline("airtable_demo", destination="duckdb") pipeline.run(airtable_emojis().with_resources("📆 Schedule"), refresh="drop_data") ``` -Above the incremental state of the "📆 Schedule" is reset before `extract` step so data is fully reacquired. Just before `load` step starts, - the "_schedule" is truncated and new (full) table data will be inserted/copied. +Above, the incremental state of the "📆 Schedule" resource is reset before the `extract` step so data is fully reacquired. Just before the `load` step starts, +the "_schedule" table is truncated, and new (full) table data will be inserted/copied. ## Display the loading progress -You can add a progress monitor to the pipeline. Typically, its role is to visually assure user that -pipeline run is progressing. `dlt` supports 4 progress monitors out of the box: +You can add a progress monitor to the pipeline. Typically, its role is to visually assure the user that +the pipeline run is progressing. dlt supports 4 progress monitors out of the box: - [enlighten](https://github.com/Rockhopper-Technologies/enlighten) - a status bar with progress bars that also allows for logging. -- [tqdm](https://github.com/tqdm/tqdm) - most popular Python progress bar lib, proven to work in +- [tqdm](https://github.com/tqdm/tqdm) - the most popular Python progress bar lib, proven to work in Notebooks. - [alive_progress](https://github.com/rsalmei/alive-progress) - with the most fancy animations. -- **log** - dumps the progress information to log, console or text stream. **the most useful on - production** optionally adds memory and cpu usage stats. +- **log** - dumps the progress information to log, console, or text stream. **The most useful in + production**; it optionally adds memory and CPU usage stats. > 💡 You must install the required progress bar library yourself. -You pass the progress monitor in `progress` argument of the pipeline.
You can use a name from the +You pass the progress monitor in the `progress` argument of the pipeline. You can use a name from the list above as in the following example: ```py # create a pipeline loading chess data that dumps -# progress to stdout each 10 seconds (the default) +# progress to stdout every 10 seconds (the default) pipeline = dlt.pipeline( pipeline_name="chess_pipeline", destination='duckdb', @@ -232,3 +235,4 @@ pipeline = dlt.pipeline( Note that the value of the `progress` argument is [configurable](../walkthroughs/run-a-pipeline.md#2-see-the-progress-during-loading). + diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md index d4dedd42bd..579452cc0c 100644 --- a/docs/website/docs/general-usage/resource.md +++ b/docs/website/docs/general-usage/resource.md @@ -8,13 +8,12 @@ keywords: [resource, api endpoint, dlt.resource] ## Declare a resource -A [resource](glossary.md#resource) is an ([optionally async](../reference/performance.md#parallelism)) function that yields data. To create a -resource, we add the `@dlt.resource` decorator to that function. +A [resource](glossary.md#resource) is an ([optionally async](../reference/performance.md#parallelism)) function that yields data. To create a resource, we add the `@dlt.resource` decorator to that function. Commonly used arguments: -- `name` The name of the table generated by this resource. Defaults to the decorated function name. -- `write_disposition` How should the data be loaded at the destination? Currently supported: `append`, +- `name`: The name of the table generated by this resource. Defaults to the decorated function name. +- `write_disposition`: How should the data be loaded at the destination? Currently supported: `append`, `replace`, and `merge`. 
Defaults to `append.` Example: @@ -23,7 +22,7 @@ Example: @dlt.resource(name='table_name', write_disposition='replace') def generate_rows(): for i in range(10): - yield {'id':i, 'example_string':'abc'} + yield {'id': i, 'example_string': 'abc'} @dlt.source def source_name(): @@ -40,22 +39,16 @@ for row in source_name().resources.get('table_name'): print(row) ``` -Typically, resources are declared and grouped with related resources within a [source](source.md) -function. +Typically, resources are declared and grouped with related resources within a [source](source.md) function. ### Define schema -`dlt` will infer [schema](schema.md) for tables associated with resources from the resource's data. -You can modify the generation process by using the table and column hints. Resource decorator -accepts the following arguments: +`dlt` will infer the [schema](schema.md) for tables associated with resources from the resource's data. +You can modify the generation process by using the table and column hints. The resource decorator accepts the following arguments: -1. `table_name` the name of the table, if different from the resource name. -1. `primary_key` and `merge_key` define the name of the columns (compound keys are allowed) that will - receive those hints. Used in [incremental loading](incremental-loading.md). -1. `columns` let's you define one or more columns, including the data types, nullability, and other - hints. The column definition is a `TypedDict`: `TTableSchemaColumns`. In the example below, we tell - `dlt` that column `tags` (containing a list of tags) in the `user` table should have type `json`, - which means that it will be loaded as JSON/struct and not as a separate nested table. +1. `table_name`: the name of the table, if different from the resource name. +1. `primary_key` and `merge_key`: define the name of the columns (compound keys are allowed) that will receive those hints. Used in [incremental loading](incremental-loading.md). +1. 
`columns`: lets you define one or more columns, including the data types, nullability, and other hints. The column definition is a `TypedDict`: `TTableSchemaColumns`. In the example below, we tell `dlt` that the column `tags` (containing a list of tags) in the `user` table should have type `json`, which means that it will be loaded as JSON/struct and not as a separate nested table. ```py @dlt.resource(name="user", columns={"tags": {"data_type": "json"}}) @@ -67,8 +60,7 @@ accepts the following arguments: ``` :::note -You can pass dynamic hints which are functions that take the data item as input and return a -hint value. This lets you create table and column schemas depending on the data. See an [example below](#adjust-schema-when-you-yield-data). +You can pass dynamic hints which are functions that take the data item as input and return a hint value. This lets you create table and column schemas depending on the data. See an [example below](#adjust-schema-when-you-yield-data). ::: :::tip @@ -76,7 +68,7 @@ You can mark some resource arguments as [configuration and credentials](credenti ::: ### Put a contract on tables, columns, and data -Use the `schema_contract` argument to tell dlt how to [deal with new tables, data types, and bad data types](schema-contracts.md). For example, if you set it to **freeze**, `dlt` will not allow for any new tables, columns, or data types to be introduced to the schema - it will raise an exception. Learn more on available contract modes [here](schema-contracts.md#setting-up-the-contract) +Use the `schema_contract` argument to tell dlt how to [deal with new tables, data types, and bad data types](schema-contracts.md). For example, if you set it to **freeze**, `dlt` will not allow for any new tables, columns, or data types to be introduced to the schema - it will raise an exception. Learn more about available contract modes [here](schema-contracts.md#setting-up-the-contract). 
### Define a schema with Pydantic @@ -85,14 +77,13 @@ For example: ```py from pydantic import BaseModel - +from typing import List, Optional, Union class Address(BaseModel): street: str city: str postal_code: str - class User(BaseModel): id: int name: str @@ -101,7 +92,6 @@ class User(BaseModel): address: Address status: Union[int, str] - @dlt.resource(name="user", columns=User) def get_users(): ... @@ -114,11 +104,11 @@ Pydantic models integrate well with [schema contracts](schema-contracts.md) as d Things to note: -- Fields with an `Optional` type are marked as `nullable` +- Fields with an `Optional` type are marked as `nullable`. - Fields with a `Union` type are converted to the first (not `None`) type listed in the union. For example, `status: Union[int, str]` results in a `bigint` column. -- `list`, `dict`, and nested Pydantic model fields will use the `json` type which means they'll be stored as a JSON object in the database instead of creating nested tables. +- `list`, `dict`, and nested Pydantic model fields will use the `json` type, which means they'll be stored as a JSON object in the database instead of creating nested tables. -You can override this by configuring the Pydantic model +You can override this by configuring the Pydantic model: ```py from typing import ClassVar @@ -140,7 +130,7 @@ We do not support `RootModel` that validate simple types. You can add such a val ### Dispatch data to many tables You can load data to many tables from a single resource. The most common case is a stream of events -of different types, each with different data schema. To deal with this, you can use the `table_name` +of different types, each with a different data schema. To deal with this, you can use the `table_name` argument on `dlt.resource`. You could pass the table name as a function with the data item as an argument and the `table_name` string as a return value. 
@@ -155,7 +145,7 @@ def repo_events() -> Iterator[TDataItems]: # the `table_schema` method gets the table schema generated by a resource and takes an optional # data item to evaluate dynamic hints -print(repo_events().compute_table_schema({"type": "WatchEvent", id:...})) +print(repo_events().compute_table_schema({"type": "WatchEvent", "id": ...})) ``` In more advanced cases, you can dispatch data to different tables directly in the code of the @@ -176,8 +166,8 @@ You can add arguments to your resource functions like to any other. Below we par ```py @dlt.resource(name='table_name', write_disposition='replace') def generate_rows(nr): - for i in range(nr): - yield {'id':i, 'example_string':'abc'} + for i in range(nr): + yield {'id': i, 'example_string': 'abc'} for row in generate_rows(10): print(row) @@ -191,10 +181,7 @@ so `dlt` can pass them automatically to your functions. ### Process resources with `dlt.transformer` -You can feed data from a resource into another one. The most common case is when you have an API -that returns a list of objects (i.e. users) in one endpoint and user details in another. You can deal -with this by declaring a resource that obtains a list of users and another resource that receives -items from the list and downloads the profiles. +You can feed data from one resource into another. The most common case is when you have an API that returns a list of objects (i.e., users) in one endpoint and user details in another. You can deal with this by declaring a resource that obtains a list of users and another resource that receives items from the list and downloads the profiles. 
```py @dlt.resource(write_disposition="replace") @@ -202,7 +189,7 @@ def users(limit=None): for u in _get_users(limit): yield u -# feed data from users as user_item below, +# Feed data from users as user_item below, # all transformers must have at least one # argument that will receive data from the parent resource @dlt.transformer(data_from=users) @@ -210,22 +197,21 @@ def users_details(user_item): for detail in _get_details(user_item["user_id"]): yield detail -# just load the user_details. +# Just load the user_details. # dlt figures out dependencies for you. pipeline.run(user_details) ``` -In the example above, `user_details` will receive data from the default instance of the `users` resource (with `limit` set to `None`). You can also use -**pipe |** operator to bind resources dynamically +In the example above, `user_details` will receive data from the default instance of the `users` resource (with `limit` set to `None`). You can also use the **pipe |** operator to bind resources dynamically. ```py -# you can be more explicit and use a pipe operator. -# with it you can create dynamic pipelines where the dependencies -# are set at run time and resources are parametrized i.e. -# below we want to load only 100 users from `users` endpoint +# You can be more explicit and use a pipe operator. +# With it, you can create dynamic pipelines where the dependencies +# are set at run time and resources are parametrized, i.e., +# below we want to load only 100 users from the `users` endpoint. pipeline.run(users(limit=100) | user_details) ``` :::tip -Transformers are allowed not only to **yield** but also to **return** values and can decorate **async** functions and [**async generators**](../reference/performance.md#extract). Below we decorate an async function and request details on two pokemons. Http calls are made in parallel via httpx library. 
+Transformers are allowed not only to **yield** but also to **return** values and can decorate **async** functions and [**async generators**](../reference/performance.md#extract). Below we decorate an async function and request details on two pokemons. HTTP calls are made in parallel via the httpx library. ```py import dlt import httpx @@ -237,22 +223,20 @@ async def pokemon(id): r = await client.get(f"https://pokeapi.co/api/v2/pokemon/{id}") return r.json() -# get bulbasaur and ivysaur (you need dlt 0.4.6 for pipe operator working with lists) +# Get Bulbasaur and Ivysaur (you need dlt 0.4.6 for the pipe operator working with lists). print(list([1,2] | pokemon())) ``` ::: ### Declare a standalone resource -A standalone resource is defined on a function that is top level in a module (not an inner function) that accepts config and secrets values. Additionally, -if the `standalone` flag is specified, the decorated function signature and docstring will be preserved. `dlt.resource` will just wrap the -decorated function, and the user must call the wrapper to get the actual resource. Below we declare a `filesystem` resource that must be called before use. +A standalone resource is defined on a function that is top-level in a module (not an inner function) that accepts config and secrets values. Additionally, if the `standalone` flag is specified, the decorated function signature and docstring will be preserved. `dlt.resource` will just wrap the decorated function, and the user must call the wrapper to get the actual resource. Below we declare a `filesystem` resource that must be called before use. ```py @dlt.resource(standalone=True) def filesystem(bucket_url=dlt.config.value): - """list and yield files in `bucket_url`""" + """List and yield files in `bucket_url`.""" ... -# `filesystem` must be called before it is extracted or used in any other way +# `filesystem` must be called before it is extracted or used in any other way. 
pipeline.run(filesystem("s3://my-bucket/reports"), table_name="reports") ``` @@ -264,13 +248,11 @@ def kinesis(stream_name: str): kinesis_stream = kinesis("telemetry_stream") ``` -`kinesis_stream` resource has a name **telemetry_stream** - +The `kinesis_stream` resource has the name **telemetry_stream**. ### Declare parallel and async resources You can extract multiple resources in parallel threads or with async IO. -To enable this for a sync resource you can set the `parallelized` flag to `True` in the resource decorator: - +To enable this for a sync resource, you can set the `parallelized` flag to `True` in the resource decorator: ```py @dlt.resource(parallelized=True) @@ -300,9 +282,9 @@ Please find more details in [extract performance](../reference/performance.md#ex ## Customize resources -### Filter, transform and pivot data +### Filter, transform, and pivot data -You can attach any number of transformations that are evaluated on an item per item basis to your +You can attach any number of transformations that are evaluated on an item-per-item basis to your resource. The available transformation types: - **map** - transform the data item (`resource.add_map`). @@ -314,7 +296,7 @@ Example: We have a resource that loads a list of users from an API endpoint. We so: 1. We remove users with `user_id == "me"`. -1. We anonymize user data. +2. We anonymize user data. Here's our resource: @@ -350,7 +332,7 @@ You can limit how deep `dlt` goes when generating nested tables and flattening d and generate nested tables for all nested lists, without limit. :::note -`max_table_nesting` is optional so you can skip it, in this case dlt will +`max_table_nesting` is optional, so you can skip it. In this case, dlt will use it from the source if it is specified there or fallback to the default value which has 1000 as the maximum nesting level.
::: @@ -396,15 +378,15 @@ resource = my_resource() resource.max_table_nesting = 0 ``` -Several data sources are prone to contain semi-structured documents with very deep nesting i.e. +Several data sources are prone to contain semi-structured documents with very deep nesting, e.g., MongoDB databases. Our practical experience is that setting the `max_nesting_level` to 2 or 3 produces the clearest and human-readable schemas. ### Sample from large data -If your resource loads thousands of pages of data from a REST API or millions of rows from a db table, you may want to just sample a fragment of it in order to quickly see the dataset with example data and test your transformations, etc. In order to do that, you limit how many items will be yielded by a resource (or source) by calling the `add_limit` method. This method will close the generator which produces the data after the limit is reached. +If your resource loads thousands of pages of data from a REST API or millions of rows from a database table, you may want to sample just a fragment of it in order to quickly see the dataset with example data and test your transformations, etc. To do this, you limit how many items will be yielded by a resource (or source) by calling the `add_limit` method. This method will close the generator that produces the data after the limit is reached. -In the example below, we load just 10 first items from an infinite counter - that would otherwise never end. +In the example below, we load just the first 10 items from an infinite counter - that would otherwise never end. ```py r = dlt.resource(itertools.count(), name="infinity").add_limit(10) @@ -428,8 +410,8 @@ The code above will extract `15*10=150` records. This is happening because in ea Some constraints of `add_limit` include: 1. `add_limit` does not skip any items. It closes the iterator/generator that produces data after the limit is reached. -1. You cannot limit transformers.
They should process all the data they receive fully to avoid inconsistencies in generated datasets. -1. Async resources with a limit added may occasionally produce one item more than the limit on some runs. This behavior is not deterministic. +2. You cannot limit transformers. They should process all the data they receive fully to avoid inconsistencies in generated datasets. +3. Async resources with a limit added may occasionally produce one item more than the limit on some runs. This behavior is not deterministic. :::tip If you are parameterizing the value of `add_limit` and sometimes need it to be disabled, you can set `None` or `-1` to disable the limiting. @@ -438,7 +420,7 @@ You can also set the limit to `0` for the resource to not yield any items. ### Set table name and adjust schema -You can change the schema of a resource, be it standalone or as a part of a source. Look for a method named `apply_hints` which takes the same arguments as the resource decorator. Obviously, you should call this method before data is extracted from the resource. The example below converts an `append` resource loading the `users` table into a [merge](incremental-loading.md#merge-incremental_loading) resource that will keep just one updated record per `user_id`. It also adds ["last value" incremental loading](incremental-loading.md#incremental_loading-with-last-value) on the `created_at` column to prevent requesting again the already loaded records: +You can change the schema of a resource, whether it is standalone or part of a source. Look for a method named `apply_hints` which takes the same arguments as the resource decorator. Obviously, you should call this method before data is extracted from the resource. The example below converts an `append` resource loading the `users` table into a [merge](incremental-loading.md#merge-incremental_loading) resource that will keep just one updated record per `user_id`. 
It also adds ["last value" incremental loading](incremental-loading.md#incremental_loading-with-last-value) on the `created_at` column to prevent re-requesting already loaded records: ```py tables = sql_database() @@ -450,7 +432,7 @@ tables.users.apply_hints( pipeline.run(tables) ``` -To just change the name of a table to which the resource will load data, do the following: +To change the name of a table to which the resource will load data, do the following: ```py tables = sql_database() tables.users.table_name = "other_users" @@ -458,40 +440,40 @@ tables.users.table_name = "other_users" ### Adjust schema when you yield data -You can set or update the table name, columns, and other schema elements when your resource is executed and you already yield data. Such changes will be merged with the existing schema in the same way the `apply_hints` method above works. There are many reasons to adjust the schema at runtime. For example, when using Airflow, you should avoid lengthy operations (i.e. reflecting database tables) during the creation of the DAG, so it is better to do it when the DAG executes. You may also emit partial hints (i.e. precision and scale for decimal types) for columns to help `dlt` type inference. +You can set or update the table name, columns, and other schema elements while your resource is executing and already yielding data. Such changes will be merged with the existing schema in the same way the `apply_hints` method above works. There are many reasons to adjust the schema at runtime. For example, when using Airflow, you should avoid lengthy operations (e.g., reflecting database tables) during the creation of the DAG, so it is better to do it when the DAG executes. You may also emit partial hints (e.g., precision and scale for decimal types) for columns to help `dlt` type inference.
```py @dlt.resource def sql_table(credentials, schema, table): - # create a SQL Alchemy engine + # Create a SQL Alchemy engine engine = engine_from_credentials(credentials) engine.execution_options(stream_results=True) metadata = MetaData(schema=schema) - # reflect the table schema + # Reflect the table schema table_obj = Table(table, metadata, autoload_with=engine) for idx, batch in enumerate(table_rows(engine, table_obj)): if idx == 0: - # emit the first row with hints, table_to_columns and get_primary_key are helpers that extract dlt schema from + # Emit the first row with hints, table_to_columns and get_primary_key are helpers that extract dlt schema from # SqlAlchemy model yield dlt.mark.with_hints( batch, dlt.mark.make_hints(columns=table_to_columns(table_obj), primary_key=get_primary_key(table_obj)), ) else: - # just yield all the other rows + # Just yield all the other rows yield batch ``` -In the example above, we use `dlt.mark.with_hints` and `dlt.mark.make_hints` to emit columns and primary key with the first extracted item. The table schema will be adjusted after the `batch` is processed in the extract pipeline but before any schema contracts are applied and data is persisted in the load package. +In the example above, we use `dlt.mark.with_hints` and `dlt.mark.make_hints` to emit columns and primary key with the first extracted item. The table schema will be adjusted after the `batch` is processed in the extract pipeline but before any schema contracts are applied, and data is persisted in the load package. :::tip -You can emit columns as a Pydantic model and use dynamic hints (i.e. lambda for table name) as well. You should avoid redefining `Incremental` this way. +You can emit columns as a Pydantic model and use dynamic hints (i.e., lambda for table name) as well. You should avoid redefining `Incremental` this way. ::: ### Import external files -You can import external files i.e. 
`csv`, `parquet`, and `jsonl` by yielding items marked with `with_file_import`, optionally passing a table schema corresponding to the imported file. `dlt` will not read, parse, and normalize any names (i.e. `csv` or `arrow` headers) and will attempt to copy the file into the destination as is. +You can import external files, e.g., CSV, Parquet, and JSONL, by yielding items marked with `with_file_import`, optionally passing a table schema corresponding to the imported file. dlt will not read, parse, or normalize any names (e.g., CSV or Arrow headers) and will attempt to copy the file into the destination as is. ```py import os import dlt @@ -536,7 +518,7 @@ include_header=false on_error_continue=true ``` -You can sniff the schema from the data i.e. using `duckdb` to infer the table schema from a `csv` file. `dlt.mark.with_file_import` accepts additional arguments that you can use to pass hints at runtime. +You can sniff the schema from the data, e.g., by using DuckDB to infer the table schema from a CSV file. `dlt.mark.with_file_import` accepts additional arguments that you can use to pass hints at runtime. :::note * If you do not define any columns, the table will not be created in the destination. `dlt` will still attempt to load data into it, so if you create a fitting table upfront, the load process will succeed. @@ -544,7 +526,7 @@ You can sniff the schema from the data i.e. using `duckdb` to infer the table sc ::: ### Duplicate and rename resources -There are cases when your resources are generic (i.e. bucket filesystem) and you want to load several instances of it (i.e. files from different folders) to separate tables. In the example below, we use the `filesystem` source to load csvs from two different folders into separate tables: +There are cases when your resources are generic (e.g., bucket filesystem) and you want to load several instances of it (e.g., files from different folders) into separate tables.
In the example below, we use the `filesystem` source to load csvs from two different folders into separate tables: ```py @dlt.resource(standalone=True) def filesystem(bucket_url): @@ -567,7 +549,7 @@ pipeline.run( ) ``` -The `with_name` method returns a deep copy of the original resource, its data pipe, and the data pipes of a parent resource. A renamed clone is fully separated from the original resource (and other clones) when loading: it maintains a separate [resource state](state.md#read-and-write-pipeline-state-in-a-resource) and will load to a table +The `with_name` method returns a deep copy of the original resource, its data pipe, and the data pipes of a parent resource. A renamed clone is fully separated from the original resource (and other clones) when loading: it maintains a separate [resource state](state.md#read-and-write-pipeline-state-in-a-resource) and will load to a table. ## Load resources @@ -576,8 +558,8 @@ You can pass individual resources or a list of resources to the `dlt.pipeline` o ```py @dlt.resource(name='table_name', write_disposition='replace') def generate_rows(nr): - for i in range(nr): - yield {'id':i, 'example_string':'abc'} + for i in range(nr): + yield {'id': i, 'example_string': 'abc'} pipeline = dlt.pipeline( pipeline_name="rows_pipeline", @@ -591,12 +573,14 @@ pipeline.run([generate_rows(10), generate_rows(20)]) ``` ### Pick loader file format for a particular resource + You can request a particular loader file format to be used for a resource. + ```py @dlt.resource(file_format="parquet") def generate_rows(nr): - for i in range(nr): - yield {'id':i, 'example_string':'abc'} + for i in range(nr): + yield {'id': i, 'example_string': 'abc'} ``` The resource above will be saved and loaded from a `parquet` file (if the destination supports it). 
@@ -613,6 +597,8 @@ p.run(merge_source(), refresh="drop_data") ``` You can also [fully drop the tables](pipeline.md#refresh-pipeline-data-and-state) in the `merge_source`: + ```py p.run(merge_source(), refresh="drop_sources") ``` + diff --git a/docs/website/docs/general-usage/schema-contracts.md b/docs/website/docs/general-usage/schema-contracts.md index e48fe979fd..6c557f2c45 100644 --- a/docs/website/docs/general-usage/schema-contracts.md +++ b/docs/website/docs/general-usage/schema-contracts.md @@ -20,49 +20,50 @@ This resource will allow new tables (both nested tables and [tables with dynamic ### Setting up the contract You can control the following **schema entities**: -* `tables` - contract is applied when a new table is created -* `columns` - contract is applied when a new column is created on an existing table -* `data_type` - contract is applied when data cannot be coerced into a data type associate with existing column. +* `tables` - the contract is applied when a new table is created +* `columns` - the contract is applied when a new column is created on an existing table +* `data_type` - the contract is applied when data cannot be coerced into a data type associated with an existing column. -You can use **contract modes** to tell `dlt` how to apply contract for a particular entity: +You can use **contract modes** to tell `dlt` how to apply the contract for a particular entity: * `evolve`: No constraints on schema changes. -* `freeze`: This will raise an exception if data is encountered that does not fit the existing schema, so no data will be loaded to the destination +* `freeze`: This will raise an exception if data is encountered that does not fit the existing schema, so no data will be loaded to the destination. * `discard_row`: This will discard any extracted row if it does not adhere to the existing schema, and this row will not be loaded to the destination. 
-* `discard_value`: This will discard data in an extracted row that does not adhere to the existing schema and the row will be loaded without this data. +* `discard_value`: This will discard data in an extracted row that does not adhere to the existing schema, and the row will be loaded without this data. :::note The default mode (**evolve**) works as follows: -1. New tables may be always created -2. New columns may be always appended to the existing table -3. Data that do not coerce to existing data type of a particular column will be sent to a [variant column](schema.md#variant-columns) created for this particular type. +1. New tables may always be created. +2. New columns may always be appended to the existing table. +3. Data that do not coerce to the existing data type of a particular column will be sent to a [variant column](schema.md#variant-columns) created for this particular type. ::: -#### Passing schema_contract argument +#### Passing the schema_contract argument The `schema_contract` exists on the [dlt.source](source.md) decorator as a default for all resources in that source and on the [dlt.resource](source.md) decorator as a directive for the individual resource - and as a consequence - on all tables created by this resource. -Additionally it exists on the `pipeline.run()` method, which will override all existing settings. +Additionally, it exists on the `pipeline.run()` method, which will override all existing settings. The `schema_contract` argument accepts two forms: 1. **full**: a mapping of schema entities to contract modes -2. **shorthand** a contract mode (string) that will be applied to all schema entities. +2. **shorthand**: a contract mode (string) that will be applied to all schema entities. 
-For example setting `schema_contract` to *freeze* will expand to the full form: +For example, setting `schema_contract` to *freeze* will expand to the full form: ```py {"tables": "freeze", "columns": "freeze", "data_type": "freeze"} ``` -You can change the contract on the **source** instance via `schema_contract` property. For **resource** you can use [apply_hints](resource#set-table-name-and-adjust-schema). +You can change the contract on the **source** instance via the `schema_contract` property. For **resource**, you can use [apply_hints](resource#set-table-name-and-adjust-schema). -#### Nuances of contract modes. +#### Nuances of contract modes 1. Contracts are applied **after names of tables and columns are normalized**. -2. Contract defined on a resource is applied to all root tables and nested tables created by that resource. -3. `discard_row` works on table level. So for example if you have two tables in nested relationship ie. *users* and *users__addresses* and contract is violated in *users__addresses* table, the row of that table is discarded while the parent row in *users* table will be loaded. +2. A contract defined on a resource is applied to all root tables and nested tables created by that resource. +3. `discard_row` works on the table level. For example, if you have two tables in a nested relationship, i.e., *users* and *users__addresses*, and the contract is violated in the *users__addresses* table, the row of that table is discarded while the parent row in the *users* table will be loaded. ### Use Pydantic models for data validation -Pydantic models can be used to [define table schemas and validate incoming data](resource.md#define-a-schema-with-pydantic). You can use any model you already have. `dlt` will internally synthesize (if necessary) new models that conform with the **schema contract** on the resource. 
-Just passing a model in `column` argument of the [dlt.resource](resource.md#define-a-schema-with-pydantic) sets a schema contract that conforms to default Pydantic behavior: +Pydantic models can be used to [define table schemas and validate incoming data](resource.md#define-a-schema-with-pydantic). You can use any model you already have. `dlt` will internally synthesize (if necessary) new models that conform to the **schema contract** on the resource. + +Just passing a model in the `columns` argument of the [dlt.resource](resource.md#define-a-schema-with-pydantic) sets a schema contract that conforms to the default Pydantic behavior: ```py { "tables": "evolve", @@ -70,18 +71,18 @@ Just passing a model in `column` argument of the [dlt.resource](resource.md#defi "data_type": "freeze" } ``` -New tables are allowed, extra fields are ignored and invalid data raises an exception. +New tables are allowed, extra fields are ignored, and invalid data raises an exception. -If you pass schema contract explicitly the following happens to schema entities: -1. **tables** do not impact the Pydantic models +If you pass a schema contract explicitly, the following happens to schema entities: +1. **tables** do not impact the Pydantic models. 2. **columns** modes are mapped into the **extra** modes of Pydantic (see below). `dlt` will apply this setting recursively if models contain other models. -3. **data_type** supports following modes for Pydantic: **evolve** will synthesize lenient model that allows for any data type. This may result with variant columns upstream. +3. **data_type** supports the following modes for Pydantic: **evolve** will synthesize a lenient model that allows for any data type. This may result in variant columns upstream. **freeze** will re-raise `ValidationException`. **discard_row** will remove the non-validating data items. -**discard_value** is not currently supported. We may eventually do that on Pydantic v2. +**discard_value** is not currently supported.
We may eventually do that in Pydantic v2. `dlt` maps column contract modes into the extra fields settings as follows. -Note that this works in two directions. If you use a model with such setting explicitly configured, `dlt` sets the column contract mode accordingly. This also avoids synthesizing modified models. +Note that this works in two directions. If you use a model with such a setting explicitly configured, `dlt` sets the column contract mode accordingly. This also avoids synthesizing modified models. | column mode | pydantic extra | | ------------- | -------------- | @@ -90,34 +91,34 @@ Note that this works in two directions. If you use a model with such setting exp | discard_value | ignore | | discard_row | forbid | -`discard_row` requires additional handling when ValidationError is raised. +`discard_row` requires additional handling when a ValidationError is raised. :::tip Model validation is added as a [transform step](resource.md#filter-transform-and-pivot-data) to the resource. This step will convert the incoming data items into instances of validating models. You could easily convert them back to dictionaries by using `add_map(lambda item: item.dict())` on a resource. ::: :::note -Pydantic models work on the **extracted** data **before names are normalized or nested tables are created**. Make sure to name model fields as in your input data and handle nested data with the nested models. +Pydantic models work on the **extracted** data **before names are normalized or nested tables are created**. Make sure to name model fields as in your input data and handle nested data with nested models. -As a consequence, `discard_row` will drop the whole data item - even if nested model was affected. +As a consequence, `discard_row` will drop the whole data item - even if a nested model was affected. 
::: -### Set contracts on Arrow Tables and Pandas +### Set contracts on Arrow tables and Pandas + All contract settings apply to [arrow tables and panda frames](../dlt-ecosystem/verified-sources/arrow-pandas.md) as well. -1. **tables** mode the same - no matter what is the data item type -2. **columns** will allow new columns, raise an exception or modify tables/frames still in extract step to avoid re-writing parquet files. -3. **data_type** changes to data types in tables/frames are not allowed and will result in data type schema clash. We could allow for more modes (evolving data types in Arrow tables sounds weird but ping us on Slack if you need it.) +1. **tables** mode is the same - no matter what the data item type is. +2. **columns** will allow new columns, raise an exception, or modify tables/frames still in the extract step to avoid rewriting Parquet files. +3. **data_type** changes to data types in tables/frames are not allowed and will result in a data type schema clash. We could allow for more modes (evolving data types in Arrow tables sounds weird, but ping us on Slack if you need it). Here's how `dlt` deals with column modes: -1. **evolve** new columns are allowed (table may be reordered to put them at the end) -2. **discard_value** column will be deleted -3. **discard_row** rows with the column present will be deleted and then column will be deleted -4. **freeze** exception on a new column - +1. **evolve** new columns are allowed (the table may be reordered to put them at the end). +2. **discard_value** the column will be deleted. +3. **discard_row** rows with the column present will be deleted and then the column will be deleted. +4. **freeze** an exception is raised on a new column. ### Get context from DataValidationError in freeze mode -When contract is violated in freeze mode, `dlt` raises `DataValidationError` exception. This exception gives access to the full context and passes the evidence to the caller.
-As with any other exception coming from pipeline run, it will be re-raised via `PipelineStepFailed` exception which you should catch in except: +When a contract is violated in freeze mode, `dlt` raises a `DataValidationError` exception. This exception provides access to the full context and passes the evidence to the caller. +As with any other exception coming from a pipeline run, it will be re-raised via a `PipelineStepFailed` exception, which you should catch in an except block: ```py try: @@ -129,27 +130,24 @@ except PipelineStepFailed as pip_ex: if pip_ex.step == "extract": if isinstance(pip_ex.__context__, DataValidationError): ... - - ``` `DataValidationError` provides the following context: -1. `schema_name`, `table_name` and `column_name` provide the logical "location" at which the contract was violated. -2. `schema_entity` and `contract_mode` tell which contract was violated -3. `table_schema` contains the schema against which the contract was validated. May be Pydantic model or `dlt` TTableSchema instance -4. `schema_contract` the full, expanded schema contract -5. `data_item` causing data item (Python dict, arrow table, pydantic model or list of there of) - +1. `schema_name`, `table_name`, and `column_name` provide the logical "location" at which the contract was violated. +2. `schema_entity` and `contract_mode` indicate which contract was violated. +3. `table_schema` contains the schema against which the contract was validated. It may be a Pydantic model or a dlt `TTableSchema` instance. +4. `schema_contract` is the full, expanded schema contract. +5. `data_item` is the causing data item (Python dict, arrow table, Pydantic model, or list thereof). ### Contracts on new tables -If a table is a **new table** that has not been created on the destination yet, dlt will allow the creation of new columns. For a single pipeline run, the column mode is changed (internally) to **evolve** and then reverted back to the original mode. 
This allows for initial schema inference to happen and then on subsequent run, the inferred contract will be applied to a new data. +If a table is a **new table** that has not been created on the destination yet, dlt will allow the creation of new columns. For a single pipeline run, the column mode is changed (internally) to **evolve** and then reverted back to the original mode. This allows for initial schema inference to happen, and then on subsequent runs, the inferred contract will be applied to the new data. -Following tables are considered new: -1. Child tables inferred from the nested data -2. Dynamic tables created from the data during extraction -3. Tables containing **incomplete** columns - columns without data type bound to them. +The following tables are considered new: +1. Child tables inferred from nested data. +2. Dynamic tables created from the data during extraction. +3. Tables containing **incomplete** columns - columns without a data type bound to them. -For example such table is considered new because column **number** is incomplete (define primary key and NOT null but no data type) +For example, such a table is considered new because the column **number** is incomplete (defined as primary key and NOT null but no data type): ```yaml blocks: description: Ethereum blocks @@ -161,18 +159,18 @@ blocks: name: number ``` -What tables are not considered new: -1. Those with columns defined by Pydantic modes +Tables that are not considered new: +1. Those with columns defined by Pydantic models. ### Working with datasets that have manually added tables and columns on the first load -In some cases you might be working with datasets that have tables or columns created outside of dlt. If you are loading to a table not created by `dlt` for the first time, `dlt` will not know about this table while enforcing schema contracts. This means that if you do a load where the `tables` are set to `evolve`, all will work as planned. 
If you have `tables` set to `freeze`, dlt will raise an exception because it thinks you are creating a new table (which you are from dlts perspective). You can allow `evolve` for one load and then switch back to `freeze`. +In some cases, you might be working with datasets that have tables or columns created outside of dlt. If you are loading to a table not created by dlt for the first time, dlt will not know about this table while enforcing schema contracts. This means that if you do a load where the `tables` are set to `evolve`, all will work as planned. If you have `tables` set to `freeze`, dlt will raise an exception because it thinks you are creating a new table (which you are from dlt's perspective). You can allow `evolve` for one load and then switch back to `freeze`. The same thing will happen if `dlt` knows your table, but you have manually added a column to your destination and you have `columns` set to `freeze`. -### Code Examples +### Code examples -The below code will silently ignore new subtables, allow new columns to be added to existing tables and raise an error if a variant of a column is discovered. +The below code will silently ignore new subtables, allow new columns to be added to existing tables, and raise an error if a variant of a column is discovered. ```py @dlt.resource(schema_contract={"tables": "discard_row", "columns": "evolve", "data_type": "freeze"}) @@ -180,14 +178,14 @@ def items(): ... ``` -The below Code will raise on any encountered schema change. Note: You can always set a string which will be interpreted as though all keys are set to these values. +The below code will raise an error on any encountered schema change. Note: You can always set a string which will be interpreted as though all keys are set to these values. 
```py pipeline.run(my_source(), schema_contract="freeze") ``` -The below code defines some settings on the source which can be overwritten on the resource which in turn can be overwritten by the global override on the `run` method. -Here for all resources variant columns are frozen and raise an error if encountered, on `items` new columns are allowed but `other_items` inherits the `freeze` setting from +The below code defines some settings on the source which can be overwritten on the resource, which in turn can be overwritten by the global override on the `run` method. +Here, for all resources, variant columns are frozen and raise an error if encountered. On `items`, new columns are allowed, but `other_items` inherits the `freeze` setting from the source, thus new columns are frozen there. New tables are allowed. ```py @@ -210,4 +208,5 @@ pipeline.run(source()) # this will freeze the whole schema, regardless of the decorator settings pipeline.run(source(), schema_contract="freeze") -``` \ No newline at end of file +``` + diff --git a/docs/website/docs/general-usage/schema-evolution.md b/docs/website/docs/general-usage/schema-evolution.md index b2b81cfdca..7b50ea139d 100644 --- a/docs/website/docs/general-usage/schema-evolution.md +++ b/docs/website/docs/general-usage/schema-evolution.md @@ -6,23 +6,23 @@ keywords: [schema evolution, schema, dlt schema] ## When to use schema evolution? -Schema evolution is a best practice when ingesting most data. It’s simply a way to get data across a format barrier. +Schema evolution is a best practice when ingesting most data. It's simply a way to get data across a format barrier. -It separates the technical challenge of “loading” data, from the business challenge of “curating” data. This enables us to have pipelines that are maintainable by different individuals at different stages. +It separates the technical challenge of "loading" data from the business challenge of "curating" data. 
This enables us to have pipelines that are maintainable by different individuals at different stages. -However, for cases where schema evolution might be triggered by malicious events, such as in web tracking, data contracts are advised. Read more about how to implement data contracts [here](https://dlthub.com/docs/general-usage/schema-contracts). +However, for cases where schema evolution might be triggered by malicious events, such as in web tracking, data contracts are advised. Read more about how to implement data contracts [here](./schema-contracts). ## Schema evolution with `dlt` `dlt` automatically infers the initial schema for your first pipeline run. However, in most cases, the schema tends to change over time, which makes it critical for downstream consumers to adapt to schema changes. -As the structure of data changes, such as the addition of new columns, changing data types, etc., `dlt` handles these schema changes, enabling you to adapt to changes without losing velocity. +As the structure of data changes, such as the addition of new columns or changing data types, `dlt` handles these schema changes, enabling you to adapt to changes without losing velocity. ## Inferring a schema from nested data -The first run of a pipeline will scan the data that goes through it and generate a schema. To convert nested data into relational format, `dlt` flattens dictionaries and unpacks nested lists into sub-tables. +The first run of a pipeline will scan the data that goes through it and generate a schema. To convert nested data into a relational format, `dlt` flattens dictionaries and unpacks nested lists into sub-tables. -We’ll review some examples here and figure out how `dlt` creates initial schema and how normalisation works. Consider a pipeline that loads the following schema: +We'll review some examples here and figure out how `dlt` creates the initial schema and how normalization works. 
Consider a pipeline that loads the following schema: ```py data = [{ @@ -47,18 +47,18 @@ The schema of data above is loaded to the destination as follows: ### What did the schema inference engine do? -As you can see above the `dlt's` inference engine generates the structure of the data based on the source and provided hints. It normalizes the data, creates tables and columns, and infers data types. +As you can see above, the dlt's inference engine generates the structure of the data based on the source and provided hints. It normalizes the data, creates tables and columns, and infers data types. -For more information, you can refer to the **[Schema](https://dlthub.com/docs/general-usage/schema)** and **[Adjust a Schema](https://dlthub.com/docs/walkthroughs/adjust-a-schema)** sections in the documentation. +For more information, you can refer to the [Schema](./schema) and [Adjust a Schema](../walkthroughs/adjust-a-schema) sections in the documentation. ## Evolving the schema -For a typical data source schema tends to change with time, and `dlt` handles this changing schema seamlessly. +For a typical data source, the schema tends to change over time, and dlt handles this changing schema seamlessly. Let’s add the following 4 cases: -- A column is added : a field named “CEO” was added. -- A column type is changed: Datatype of column named “inventory_nr” was changed from integer to string. +- A column is added: a field named “CEO” was added. +- A column type is changed: The datatype of the column named “inventory_nr” was changed from integer to string. - A column is removed: a field named “room” was commented out/removed. - A column is renamed: a field “building” was renamed to “main_block”. @@ -106,11 +106,11 @@ By separating the technical process of loading data from curation, you free the **Tracking column lineage** -The column lineage can be tracked by loading the 'load_info' to the destination. 
The 'load_info' contains information about columns ‘data types’, ‘add times’, and ‘load id’. To read more please see [the data lineage article](https://dlthub.com/docs/blog/dlt-data-lineage) we have on the blog. +The column lineage can be tracked by loading the 'load_info' to the destination. The 'load_info' contains information about columns’ data types, add times, and load id. To read more please see [the data lineage article](https://dlthub.com/blog/dlt-data-lineage) we have on the blog. **Getting notifications** -We can read the load outcome and send it to slack webhook with `dlt`. +We can read the load outcome and send it to a Slack webhook with dlt. ```py # Import the send_slack_message function from the dlt library from dlt.common.runtime.slack import send_slack_message @@ -139,16 +139,15 @@ This script sends Slack notifications for schema updates using the `send_slack_m ## How to control evolution -`dlt` allows schema evolution control via its schema and data contracts. Refer to our **[documentation](https://dlthub.com/docs/general-usage/schema-contracts)** for details. +`dlt` allows schema evolution control via its schema and data contracts. Refer to our **[documentation](./schema-contracts)** for details. -### How to test for removed columns - applying “not null” constraint +### How to test for removed columns - applying "not null" constraint -A column not existing, and a column being null, are two different things. However, when it comes to APIs and json, it’s usually all treated the same - the key-value pair will simply not exist. +A column not existing and a column being null are two different things. However, when it comes to APIs and JSON, it’s usually all treated the same - the key-value pair will simply not exist. To remove a column, exclude it from the output of the resource function. Subsequent data inserts will treat this column as null. Verify column removal by applying a not null constraint. 
For instance, after removing the "room" column, apply a not null constraint to confirm its exclusion. ```py - data = [{ "organization": "Tech Innovations Inc.", "address": { @@ -166,7 +165,7 @@ pipeline = dlt.pipeline("organizations_pipeline", destination="duckdb") # Adding not null constraint pipeline.run(data, table_name="org", columns={"room": {"data_type": "bigint", "nullable": False}}) ``` -During pipeline execution a data validation error indicates that a removed column is being passed as null. +During pipeline execution, a data validation error indicates that a removed column is being passed as null. ## Some schema changes in the data @@ -202,14 +201,15 @@ The schema of the data above is loaded to the destination as follows: ## What did the schema evolution engine do? -The schema evolution engine in the `dlt` library is designed to handle changes in the structure of your data over time. For example: +The schema evolution engine in the `dlt` library is designed to handle changes in the structure of your data over time. For example: -- As above in continuation of the inferred schema, the “specifications” are nested in "details”, which are nested in “Inventory”, all under table name “org”. So the table created for projects is `org__inventory__details__specifications`. +- As above in continuation of the inferred schema, the “specifications” are nested in "details", which are nested in “Inventory”, all under the table name “org”. So the table created for projects is `org__inventory__details__specifications`. -These is a simple examples of how schema evolution works. +This is a simple example of how schema evolution works. ## Schema evolution using schema and data contracts -Demonstrating schema evolution without talking about schema and data contracts is only one side of the coin. Schema and data contracts dictate the terms of how the schema being written to destination should evolve. 
+Demonstrating schema evolution without talking about schema and data contracts is only one side of the coin. Schema and data contracts dictate the terms of how the schema being written to the destination should evolve. + +Schema and data contracts can be applied to entities such as ‘tables’, ‘columns’, and ‘data_types’ using contract modes such as ‘evolve’, ‘freeze’, ‘discard_rows’, and ‘discard_columns’ to tell dlt how to apply contracts for a particular entity. To read more about **schema and data contracts**, read our [documentation](./schema-contracts). -Schema and data contracts can be applied to entities ‘tables’ , ‘columns’ and ‘data_types’ using contract modes ‘evolve’, freeze’, ‘discard_rows’ and ‘discard_columns’ to tell `dlt` how to apply contract for a particular entity. To read more about **schema and data contracts** read our [documentation](https://dlthub.com/docs/general-usage/schema-contracts). \ No newline at end of file diff --git a/docs/website/docs/general-usage/schema.md b/docs/website/docs/general-usage/schema.md index 534d3ca3bd..2903221a36 100644 --- a/docs/website/docs/general-usage/schema.md +++ b/docs/website/docs/general-usage/schema.md @@ -6,66 +6,66 @@ keywords: [schema, dlt schema, yaml] # Schema -The schema describes the structure of normalized data (e.g. tables, columns, data types, etc.) and -provides instructions on how the data should be processed and loaded. `dlt` generates schemas from -the data during the normalization process. User can affect this standard behavior by providing -**hints** that change how tables, columns and other metadata is generated and how the data is -loaded. Such hints can be passed in the code ie. to `dlt.resource` decorator or `pipeline.run` -method. Schemas can be also exported and imported as files, which can be directly modified. +The schema describes the structure of normalized data (e.g., tables, columns, data types, etc.) 
and +provides instructions on how the data should be processed and loaded. dlt generates schemas from +the data during the normalization process. Users can affect this standard behavior by providing +**hints** that change how tables, columns, and other metadata are generated and how the data is +loaded. Such hints can be passed in the code, i.e., to the `dlt.resource` decorator or `pipeline.run` +method. Schemas can also be exported and imported as files, which can be directly modified. > 💡 `dlt` associates a schema with a [source](source.md) and a table schema with a > [resource](resource.md). ## Schema content hash and version -Each schema file contains content based hash `version_hash` that is used to: +Each schema file contains a content-based hash `version_hash` that is used to: -1. Detect manual changes to schema (ie. user edits content). +1. Detect manual changes to the schema (i.e., user edits content). 1. Detect if the destination database schema is synchronized with the file schema. Each time the schema is saved, the version hash is updated. -Each schema contains a numeric version which increases automatically whenever schema is updated and -saved. Numeric version is meant to be human-readable. There are cases (parallel processing) where +Each schema contains a numeric version which increases automatically whenever the schema is updated and +saved. The numeric version is meant to be human-readable. There are cases (parallel processing) where the order is lost. -> 💡 Schema in the destination is migrated if its hash is not stored in `_dlt_versions` table. In -> principle many pipelines may send data to a single dataset. If table name clash then a single +> 💡 The schema in the destination is migrated if its hash is not stored in the `_dlt_versions` table. In +> principle, many pipelines may send data to a single dataset. If table names clash, then a single > table with the union of the columns will be created. 
If columns clash, and they have different -> types etc. then the load may fail if the data cannot be coerced. +> types, etc., then the load may fail if the data cannot be coerced. ## Naming convention -`dlt` creates tables, nested tables and column schemas from the data. The data being loaded, -typically JSON documents, contains identifiers (i.e. key names in a dictionary) with any Unicode -characters, any lengths and naming styles. On the other hand the destinations accept very strict -namespaces for their identifiers. Like Redshift that accepts case-insensitive alphanumeric -identifiers with maximum 127 characters. +`dlt` creates tables, nested tables, and column schemas from the data. The data being loaded, +typically JSON documents, contains identifiers (i.e., key names in a dictionary) with any Unicode +characters, any lengths, and naming styles. On the other hand, the destinations accept very strict +namespaces for their identifiers. For example, Redshift accepts case-insensitive alphanumeric +identifiers with a maximum of 127 characters. -Each schema contains [naming convention](naming-convention.md) that tells `dlt` how to translate identifiers to the -namespace that the destination understands. This convention can be configured, changed in code or enforced via +Each schema contains a [naming convention](naming-convention.md) that tells `dlt` how to translate identifiers to the +namespace that the destination understands. This convention can be configured, changed in code, or enforced via destination. The default naming convention: -1. Converts identifiers to snake_case, small caps. Removes all ascii characters except ascii +1. Converts identifiers to snake_case, small caps. Removes all ASCII characters except ASCII alphanumerics and underscores. -1. Adds `_` if name starts with number. -1. Multiples of `_` are converted into single `_`. +1. Adds `_` if the name starts with a number. +1. Multiples of `_` are converted into a single `_`. 1. 
Nesting is expressed as double `_` in names. -1. It shorts the identifier if it exceed the length at the destination. +1. It shortens the identifier if it exceeds the length at the destination. -> 💡 Standard behavior of `dlt` is to **use the same naming convention for all destinations** so -> users see always the same tables and columns in their databases. +> 💡 The standard behavior of `dlt` is to **use the same naming convention for all destinations** so +> users always see the same tables and columns in their databases. -> 💡 If you provide any schema elements that contain identifiers via decorators or arguments (i.e. -> `table_name` or `columns`) all the names used will be converted via the naming convention when -> adding to the schema. For example if you execute `dlt.run(... table_name="CamelCase")` the data +> 💡 If you provide any schema elements that contain identifiers via decorators or arguments (i.e., +> `table_name` or `columns`), all the names used will be converted via the naming convention when +> adding to the schema. For example, if you execute `dlt.run(... table_name="CamelCase")` the data > will be loaded into `camel_case`. -> 💡 Use simple, short small caps identifiers for everything! +> 💡 Use simple, short, small caps identifiers for everything! -To retain the original naming convention (like keeping `"createdAt"` as it is instead of converting it to `"created_at"`), you can use the direct naming convention, in "config.toml" as follows: +To retain the original naming convention (like keeping `"createdAt"` as it is instead of converting it to `"created_at"`), you can use the direct naming convention in "config.toml" as follows: ```toml [schema] naming="direct" @@ -74,82 +74,70 @@ naming="direct" Opting for `"direct"` naming bypasses most name normalization processes. This means any unusual characters present will be carried over unchanged to database tables and columns. Please be aware of this behavior to avoid potential issues. 
::: -The naming convention is configurable and users can easily create their own -conventions that i.e. pass all the identifiers unchanged if the destination accepts that (i.e. +The naming convention is configurable, and users can easily create their own +conventions that, for example, pass all the identifiers unchanged if the destination accepts that (e.g., DuckDB). ## Data normalizer -Data normalizer changes the structure of the input data, so it can be loaded into destination. The -standard `dlt` normalizer creates a relational structure from Python dictionaries and lists. -Elements of that structure: table and column definitions, are added to the schema. +The data normalizer changes the structure of the input data so it can be loaded into the destination. The standard `dlt` normalizer creates a relational structure from Python dictionaries and lists. Elements of that structure, such as table and column definitions, are added to the schema. -The data normalizer is configurable and users can plug their own normalizers i.e. to handle the -nested table linking differently or generate parquet-like data structs instead of nested -tables. +The data normalizer is configurable, and users can plug in their own normalizers, for example, to handle nested table linking differently or generate parquet-like data structures instead of nested tables. ## Tables and columns -The key components of a schema are tables and columns. You can find a dictionary of tables in -`tables` key or via `tables` property of Schema object. +The key components of a schema are tables and columns. You can find a dictionary of tables in the `tables` key or via the `tables` property of the Schema object. A table schema has the following properties: 1. `name` and `description`. -1. `columns` with dictionary of table schemas. -1. `write_disposition` hint telling `dlt` how new data coming to the table is loaded. -1. `schema_contract` - describes a [contract on the table](schema-contracts.md) -1. 
`parent` a part of the nested reference, defined on a nested table and points to the parent table. +2. `columns` with a dictionary of column schemas. +3. `write_disposition` hint telling `dlt` how new data coming to the table is loaded. +4. `schema_contract` - describes a [contract on the table](schema-contracts.md). +5. `parent` is a part of the nested reference, defined on a nested table and points to the parent table. -Table schema is extended by data normalizer. Standard data normalizer adds propagated columns to it. +The table schema is extended by the data normalizer. The standard data normalizer adds propagated columns to it. -A column schema contains following properties: +A column schema contains the following properties: 1. `name` and `description` of a column in a table. Data type information: 1. `data_type` with a column data type. -1. `precision` a precision for **text**, **timestamp**, **time**, **bigint**, **binary**, and **decimal** types -1. `scale` a scale for **decimal** type -1. `timezone` a flag indicating TZ aware or NTZ **timestamp** and **time**. Default value is **true** -1. `nullable` tells if column is nullable or not. -1. `is_variant` telling that column was generated as variant of another column. - -A column schema contains following basic hints: - -1. `primary_key` marks a column as a part of primary key. -1. `unique` tells that column is unique. on some destination that generates unique index. -1. `merge_key` marks a column as a part of merge key used by - [incremental load](./incremental-loading.md#merge-incremental_loading). - -Hints below are used to create [nested references](#root-and-nested-tables-nested-references) -1. `row_key` a special form of primary key created by `dlt` to uniquely identify rows of data -1. `parent_key` a special form of foreign key used by nested tables to refer to parent tables -1. `root_key` marks a column as a part of root key which is a type of foreign key always referring to the - root table. -1. 
`_dlt_list_idx` index on a nested list from which nested table is created. +2. `precision` is a precision for **text**, **timestamp**, **time**, **bigint**, **binary**, and **decimal** types. +3. `scale` is a scale for the **decimal** type. +4. `timezone` is a flag indicating TZ aware or NTZ **timestamp** and **time**. The default value is **true**. +5. `nullable` tells if the column is nullable or not. +6. `is_variant` indicates that the column was generated as a variant of another column. + +A column schema contains the following basic hints: + +1. `primary_key` marks a column as part of the primary key. +2. `unique` indicates that the column is unique. On some destinations, this generates a unique index. +3. `merge_key` marks a column as part of the merge key used by [incremental load](./incremental-loading.md#merge-incremental_loading). + +Hints below are used to create [nested references](#nested-references-root-and-nested-tables): +1. `row_key` is a special form of primary key created by `dlt` to uniquely identify rows of data. +2. `parent_key` is a special form of foreign key used by nested tables to refer to parent tables. +3. `root_key` marks a column as part of the root key, which is a type of foreign key always referring to the root table. +4. `_dlt_list_idx` is an index on a nested list from which a nested table is created. `dlt` lets you define additional performance hints: -1. `partition` marks column to be used to partition data. -1. `cluster` marks column to be part to be used to cluster data -1. `sort` marks column as sortable/having order. on some destinations that non-unique generates - index. +1. `partition` marks a column to be used to partition data. +2. `cluster` marks a column to be used to cluster data. +3. `sort` marks a column as sortable/having order. On some destinations, this generates a non-unique index. :::note -Each destination can interpret the hints in its own way. 
For example `cluster` hint is used by -Redshift to define table distribution and by BigQuery to specify cluster column. DuckDB and -Postgres ignore it when creating tables. +Each destination can interpret the hints in its own way. For example, the `cluster` hint is used by Redshift to define table distribution and by BigQuery to specify a cluster column. DuckDB and Postgres ignore it when creating tables. ::: ### Variant columns -Variant columns are generated by a normalizer when it encounters data item with type that cannot be -coerced in existing column. Please see our [`coerce_row`](https://github.com/dlt-hub/dlt/blob/7d9baf1b8fdf2813bcf7f1afe5bb3558993305ca/dlt/common/schema/schema.py#L205) if you are interested to see how internally it works. +Variant columns are generated by a normalizer when it encounters a data item with a type that cannot be coerced into an existing column. Please see our [`coerce_row`](https://github.com/dlt-hub/dlt/blob/7d9baf1b8fdf2813bcf7f1afe5bb3558993305ca/dlt/common/schema/schema.py#L205) if you are interested in seeing how it works internally. 
-Let's consider our [getting started](../intro) example with slightly different approach, -where `id` is an integer type at the beginning +Let's consider our [getting started](../intro) example with a slightly different approach, where `id` is an integer type at the beginning: ```py data = [ @@ -157,14 +145,14 @@ data = [ ] ``` -once pipeline runs we will have the following schema: +Once the pipeline runs, we will have the following schema: | name | data_type | nullable | | ------------- | ------------- | -------- | | id | bigint | true | | human_name | text | true | -Now imagine the data has changed and `id` field also contains strings +Now imagine the data has changed and the `id` field also contains strings: ```py data = [ @@ -173,8 +161,7 @@ data = [ ] ``` -So after you run the pipeline `dlt` will automatically infer type changes and will add a new field in the schema `id__v_text` -to reflect that new data type for `id` so for any type which is not compatible with integer it will create a new field. +So after you run the pipeline, `dlt` will automatically infer type changes and will add a new field in the schema `id__v_text` to reflect that new data type for `id`. For any type that is not compatible with integer, it will create a new field. | name | data_type | nullable | | ------------- | ------------- | -------- | @@ -182,10 +169,9 @@ to reflect that new data type for `id` so for any type which is not compatible w | human_name | text | true | | id__v_text | text | true | -On the other hand if `id` field was already a string then introducing new data with `id` containing other types -will not change schema because they can be coerced to string. +On the other hand, if the `id` field was already a string, then introducing new data with `id` containing other types will not change the schema because they can be coerced to string. -Now go ahead and try to add a new record where `id` is float number, you should see a new field `id__v_double` in the schema. 
+Now go ahead and try to add a new record where `id` is a float number; you should see a new field `id__v_double` in the schema. ### Data types @@ -203,63 +189,57 @@ Now go ahead and try to add a new record where `id` is float number, you should | decimal | `Decimal('4.56')` | Supports precision and scale | | wei | `2**56` | | -`wei` is a datatype tries to best represent native Ethereum 256bit integers and fixed point -decimals. It works correctly on Postgres and BigQuery. All the other destinations have insufficient -precision. +`wei` is a datatype that tries to best represent native Ethereum 256-bit integers and fixed-point decimals. It works correctly on Postgres and BigQuery. All other destinations have insufficient precision. -`json` data type tells `dlt` to load that element as JSON or string and do not attempt to flatten -or create a nested table out of it. Note that structured types like arrays or maps are not supported by `dlt` at this point. +`json` data type tells `dlt` to load that element as JSON or string and not attempt to flatten or create a nested table out of it. Note that structured types like arrays or maps are not supported by `dlt` at this point. -`time` data type is saved in destination without timezone info, if timezone is included it is stripped. E.g. `'14:01:02+02:00` -> `'14:01:02'`. +`time` data type is saved in the destination without timezone info; if timezone is included, it is stripped. E.g., `'14:01:02+02:00'` -> `'14:01:02'`. :::tip -The precision and scale are interpreted by particular destination and are validated when a column is created. Destinations that -do not support precision for a given data type will ignore it. +The precision and scale are interpreted by the particular destination and are validated when a column is created. Destinations that do not support precision for a given data type will ignore it. -The precision for **timestamp** is useful when creating **parquet** files. 
Use 3 - for milliseconds, 6 for microseconds, 9 for nanoseconds +The precision for **timestamp** is useful when creating **parquet** files. Use 3 for milliseconds, 6 for microseconds, and 9 for nanoseconds. -The precision for **bigint** is mapped to available integer types ie. TINYINT, INT, BIGINT. The default is 64 bits (8 bytes) precision (BIGINT) +The precision for **bigint** is mapped to available integer types, i.e., TINYINT, INT, BIGINT. The default is 64 bits (8 bytes) precision (BIGINT). ::: ## Table references -`dlt` tables to refer to other tables. It supports two types of such references. -1. **nested reference** created automatically when nested data (ie. `json` document containing nested list) is converted into relational form. Those -references use specialized column and table hints and are used ie. when [merging data](incremental-loading.md). -2. **table references** are optional, user-defined annotations that are not verified and enforced but may be used by downstream tools ie. -to generate automatic tests or models for the loaded data. +`dlt` tables refer to other tables. It supports two types of such references: +1. **Nested reference** created automatically when nested data (i.e., a `json` document containing a nested list) is converted into relational form. These references use specialized column and table hints and are used, for example, when [merging data](incremental-loading.md). +2. **Table references** are optional, user-defined annotations that are not verified and enforced but may be used by downstream tools, for example, to generate automatic tests or models for the loaded data. ### Nested references: root and nested tables -When `dlt` normalizes nested data into relational schema it will automatically create [**root** and **nested** tables](destination-tables.md) and link them using **nested references**. 
+When `dlt` normalizes nested data into a relational schema, it automatically creates [**root** and **nested** tables](destination-tables.md) and links them using **nested references**. -1. All tables get a column with `row_key` hint (named `_dlt_id` by default) to uniquely identify each row of data. -2. Nested tables get `parent` table hint with a name of the parent table. Root table does not have `parent` hint defined. -3. Nested tables get a column with `parent_key` hint (named `_dlt_parent_id` by default) that refers to `row_key` of the `parent` table. +1. All tables receive a column with the `row_key` hint (named `_dlt_id` by default) to uniquely identify each row of data. +2. Nested tables receive a `parent` table hint with the name of the parent table. The root table does not have a `parent` hint defined. +3. Nested tables receive a column with the `parent_key` hint (named `_dlt_parent_id` by default) that refers to the `row_key` of the `parent` table. -`parent` + `row_key` + `parent_key` form a **nested reference**: from nested table to `parent` table and are extensively used when loading data. Both `replace` and `merge` write dispositions +`parent` + `row_key` + `parent_key` form a **nested reference** from the nested table to the `parent` table. Nested references are extensively used when loading data by both the `replace` and `merge` write dispositions. `row_key` is created as follows: -1. Random string on **root** tables, except for [`upsert`](incremental-loading.md#upsert-strategy) and -[`scd2`](incremental-loading.md#scd2-strategy) merge strategies, where it is a deterministic hash of `primary_key` (or whole row, so called `content_hash`, if PK is not defined). -2. A deterministic hash of `parent_key`, `parent` table name and position in the list (`_dlt_list_idx`) +1. 
A random string on **root** tables, except for [`upsert`](incremental-loading.md#upsert-strategy) and +[`scd2`](incremental-loading.md#scd2-strategy) merge strategies, where it is a deterministic hash of the `primary_key` (or whole row, so-called `content_hash`, if PK is not defined). +2. A deterministic hash of `parent_key`, `parent` table name, and position in the list (`_dlt_list_idx`) for **nested** tables. -You are able to bring your own `row_key` by adding `_dlt_id` column/field to your data (both root and nested). All data types with equal operator are supported. +You are able to bring your own `row_key` by adding a `_dlt_id` column/field to your data (both root and nested). All data types with an equal operator are supported. -`merge` write disposition requires additional nested reference that goes from **nested** to **root** table, skipping all parent tables in between. This reference is created by [adding a column with hint](incremental-loading.md#forcing-root-key-propagation) `root_key` (named `_dlt_root_id` by default) to nested tables. +`merge` write disposition requires an additional nested reference that goes from **nested** to **root** table, skipping all parent tables in between. This reference is created by [adding a column with a hint](incremental-loading.md#forcing-root-key-propagation) `root_key` (named `_dlt_root_id` by default) to nested tables. ### Table references You can annotate tables with table references. This feature is coming soon. ## Schema settings -The `settings` section of schema file lets you define various global rules that impact how tables -and columns are inferred from data. For example you can assign **primary_key** hint to all columns with name `id` or force **timestamp** data type on all columns containing `timestamp` with an use of regex pattern. +The `settings` section of the schema file lets you define various global rules that impact how tables +and columns are inferred from data. 
For example, you can assign a **primary_key** hint to all columns named `id` or force a **timestamp** data type on all columns containing `timestamp` with the use of a regex pattern. ### Data type autodetectors -You can define a set of functions that will be used to infer the data type of the column from a +You can define a set of functions that will be used to infer the data type of a column from a value. The functions are run from top to bottom on the lists. Look in `detections.py` to see what is -available. **iso_timestamp** detector that looks for ISO 8601 strings and converts them to **timestamp** +available. The **iso_timestamp** detector that looks for ISO 8601 strings and converts them to **timestamp** is enabled by default. ```yaml @@ -273,24 +253,21 @@ settings: - wei_to_double ``` -Alternatively you can add and remove detections from code: +Alternatively, you can add and remove detections from code: ```py source = data_source() # remove iso time detector source.schema.remove_type_detection("iso_timestamp") - # convert UNIX timestamp (float, withing a year from NOW) into timestamp + # convert UNIX timestamp (float, within a year from NOW) into timestamp source.schema.add_type_detection("timestamp") ``` -Above we modify a schema that comes with a source to detect UNIX timestamps with **timestamp** detector. +Above, we modify a schema that comes with a source to detect UNIX timestamps with the **timestamp** detector. ### Column hint rules -You can define a global rules that will apply hints of a newly inferred columns. Those rules apply -to normalized column names. You can use column names directly or with regular expressions. `dlt` is matching -the column names **after they got normalized with naming convention**. +You can define global rules that will apply hints to newly inferred columns. These rules apply to normalized column names. You can use column names directly or with regular expressions. 
`dlt` matches the column names **after they have been normalized with naming conventions**. -By default, schema adopts hints rules from json(relational) normalizer to support correct hinting -of columns added by normalizer: +By default, the schema adopts hint rules from the json(relational) normalizer to support correct hinting of columns added by the normalizer: ```yaml settings: @@ -310,13 +287,13 @@ settings: root_key: - _dlt_root_id ``` -Above we require exact column name match for a hint to apply. You can also use regular expression (which we call `SimpleRegex`) as follows: +Above, we require an exact column name match for a hint to apply. You can also use a regular expression (which we call `SimpleRegex`) as follows: ```yaml settings: partition: - re:_timestamp$ ``` -Above we add `partition` hint to all columns ending with `_timestamp`. You can do same thing in the code +Above, we add a `partition` hint to all columns ending with `_timestamp`. You can do the same thing in the code: ```py source = data_source() # this will update existing hints with the hints passed @@ -325,10 +302,7 @@ Above we add `partition` hint to all columns ending with `_timestamp`. You can d ### Preferred data types -You can define rules that will set the data type for newly created columns. Put the rules under -`preferred_types` key of `settings`. On the left side there's a rule on a column name, on the right -side is the data type. You can use column names directly or with regular expressions. -`dlt` is matching the column names **after they got normalized with naming convention**. +You can define rules that will set the data type for newly created columns. Put the rules under the `preferred_types` key of `settings`. On the left side, there's a rule on a column name; on the right side is the data type. You can use column names directly or with regular expressions. `dlt` matches the column names **after they have been normalized with naming conventions**. 
Example: @@ -341,8 +315,8 @@ settings: updated_at: timestamp ``` -Above we prefer `timestamp` data type for all columns containing **timestamp** substring and define a few exact matches ie. **created_at**. -Here's same thing in code +Above, we prefer the `timestamp` data type for all columns containing the **timestamp** substring and define a few exact matches, i.e., **created_at**. +Here's the same thing in code: ```py source = data_source() source.schema.update_preferred_types( @@ -390,7 +364,7 @@ load_info = pipeline.run(source_data) This example iterates through MongoDB collections, applying the **json** [data type](schema#data-types) to a specified column, and then processes the data with `pipeline.run`. ## View and print the schema -To view and print the default schema in a clear YAML format use the command: +To view and print the default schema in a clear YAML format, use the command: ```py pipeline.default_schema.to_pretty_yaml() @@ -419,16 +393,16 @@ schema files in your pipeline. ## Attaching schemas to sources -We recommend to not create schemas explicitly. Instead, user should provide a few global schema -settings and then let the table and column schemas to be generated from the resource hints and the +We recommend not creating schemas explicitly. Instead, users should provide a few global schema +settings and then let the table and column schemas be generated from the resource hints and the data itself. The `dlt.source` decorator accepts a schema instance that you can create yourself and modify in -whatever way you wish. The decorator also support a few typical use cases: +whatever way you wish. The decorator also supports a few typical use cases: ### Schema created implicitly by decorator -If no schema instance is passed, the decorator creates a schema with the name set to source name and +If no schema instance is passed, the decorator creates a schema with the name set to the source name and all the settings to default. 
### Automatically load schema file stored with source python module @@ -437,16 +411,16 @@ If no schema instance is passed, and a file with a name `{source name}_schema.ym same folder as the module with the decorated function, it will be automatically loaded and used as the schema. -This should make easier to bundle a fully specified (or pre-configured) schema with a source. +This should make it easier to bundle a fully specified (or pre-configured) schema with a source. ### Schema is modified in the source function body -What if you can configure your schema or add some tables only inside your schema function, when i.e. -you have the source credentials and user settings available? You could for example add detailed -schemas of all the database tables when someone requests a table data to be loaded. This information -is available only at the moment source function is called. +What if you can configure your schema or add some tables only inside your source function, for instance, when +you have the source credentials and user settings available? You could, for example, add detailed +schemas of all the database tables when someone requests table data to be loaded. This information +is available only at the moment the source function is called. -Similarly to the `source_state()` and `resource_state()` , source and resource function has current +Similarly to the `source_state()` and `resource_state()`, the source and resource functions have the current schema available via `dlt.current.source_schema()`. 
Example: @@ -458,8 +432,9 @@ def textual(nesting_level: int): schema = dlt.current.source_schema() # remove date detector schema.remove_type_detection("iso_timestamp") - # convert UNIX timestamp (float, withing a year from NOW) into timestamp + # convert UNIX timestamp (float, within a year from NOW) into timestamp schema.add_type_detection("timestamp") return dlt.resource([]) ``` + diff --git a/docs/website/docs/general-usage/source.md b/docs/website/docs/general-usage/source.md index e94cc2bd30..f91eca58de 100644 --- a/docs/website/docs/general-usage/source.md +++ b/docs/website/docs/general-usage/source.md @@ -6,58 +6,59 @@ keywords: [source, api, dlt.source] # Source -A [source](glossary.md#source) is a logical grouping of resources i.e. endpoints of a +A [source](glossary.md#source) is a logical grouping of resources, i.e., endpoints of a single API. The most common approach is to define it in a separate Python module. - A source is a function decorated with `@dlt.source` that returns one or more resources. -- A source can optionally define a [schema](schema.md) with tables, columns, performance hints and +- A source can optionally define a [schema](schema.md) with tables, columns, performance hints, and more. - The source Python module typically contains optional customizations and data transformations. -- The source Python module typically contains the authentication and pagination code for particular +- The source Python module typically contains the authentication and pagination code for a particular API. ## Declare sources -You declare source by decorating an (optionally async) function that return or yields one or more resource with `dlt.source`. Our -[Create a pipeline](../walkthroughs/create-a-pipeline.md) how to guide teaches you how to do that. +You declare a source by decorating an (optionally async) function that returns or yields one or more resources with `@dlt.source`. 
Our +[Create a pipeline](../walkthroughs/create-a-pipeline.md) how-to guide teaches you how to do that. ### Create resources dynamically -You can create resources by using `dlt.resource` as a function. In an example below we reuse a +You can create resources by using `dlt.resource` as a function. In the example below, we reuse a single generator function to create a list of resources for several Hubspot endpoints. ```py @dlt.source def hubspot(api_key=dlt.secrets.value): - endpoints = ["companies", "deals", "product"] + endpoints = ["companies", "deals", "products"] def get_resource(endpoint): yield requests.get(url + "/" + endpoint).json() for endpoint in endpoints: - # calling get_resource creates generator, + # calling get_resource creates a generator, # the actual code of the function will be executed in pipeline.run yield dlt.resource(get_resource(endpoint), name=endpoint) ``` ### Attach and configure schemas -You can [create, attach and configure schema](schema.md#attaching-schemas-to-sources) that will be +You can [create, attach, and configure schemas](schema.md#attaching-schemas-to-sources) that will be used when loading the source. -### Avoid long lasting operations in source function -Do not extract data in source function. Leave that task to your resources if possible. Source function is executed immediately when called (contrary to resources which delay execution - like Python generators). There are several benefits (error handling, execution metrics, parallelization) you get when you extract data in `pipeline.run` or `pipeline.extract`. +### Avoid long-lasting operations in source function -If this is impractical (for example you want to reflect a database to create resources for tables) make sure you do not call source function too often. [See this note if you plan to deploy on Airflow](../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md#2-modify-dag-file) +Do not extract data in the source function. 
Leave that task to your resources if possible. The source function is executed immediately when called (contrary to resources, which delay execution like Python generators). There are several benefits (error handling, execution metrics, parallelization) you get when you extract data in `pipeline.run` or `pipeline.extract`. + +If this is impractical (for example, you want to reflect a database to create resources for tables), make sure you do not call the source function too often. [See this note if you plan to deploy on Airflow](../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md#2-modify-dag-file) ## Customize sources ### Access and select resources to load -You can access resources present in a source and select which of them you want to load. In case of -`hubspot` resource above we could select and load "companies", "deals" and "products" resources: +You can access resources present in a source and select which of them you want to load. In the case of +the `hubspot` source above, we could select and load the "companies", "deals", and "products" resources: ```py from hubspot import hubspot @@ -67,7 +68,7 @@ source = hubspot() print(source.resources.keys()) # print names of all resources # print resources that are selected to load print(source.resources.selected.keys()) -# load only "companies" and "deals" using "with_resources" convenience method +# load only "companies" and "deals" using the "with_resources" convenience method pipeline.run(source.with_resources("companies", "deals")) ``` @@ -75,7 +76,7 @@ Resources can be individually accessed and selected: ```py # resources are accessible as attributes of a source -for c in source.companies: # enumerate all data in companies resource +for c in source.companies: # enumerate all data in the companies resource print(c) # check if deals are selected to load @@ -84,9 +85,9 @@ print(source.deals.selected) source.deals.selected = False ``` -### Filter, transform and pivot data +### Filter, transform, 
and pivot data -You can modify and filter data in resources, for example if we want to keep only deals after certain +You can modify and filter data in resources, for example, if we want to keep only deals after a certain date: ```py @@ -97,11 +98,7 @@ Find more on transforms [here](resource.md#filter-transform-and-pivot-data). ### Load data partially -You can limit the number of items produced by each resource by calling a `add_limit` method on a -source. This is useful for testing, debugging and generating sample datasets for experimentation. -You can easily get your test dataset in a few minutes, when otherwise you'd need to wait hours for -the full loading to complete. Below we limit the `pipedrive` source to just get **10 pages** of data -from each endpoint. Mind that the transformers will be evaluated fully: +You can limit the number of items produced by each resource by calling the `add_limit` method on a source. This is useful for testing, debugging, and generating sample datasets for experimentation. You can easily get your test dataset in a few minutes, when otherwise you'd need to wait hours for the full loading to complete. Below, we limit the `pipedrive` source to just get **10 pages** of data from each endpoint. Mind that the transformers will be evaluated fully: ```py from pipedrive import pipedrive_source @@ -119,11 +116,7 @@ Find more on sampling data [here](resource.md#sample-from-large-data). ### Add more resources to existing source -You can add a custom resource to source after it was created. Imagine that you want to score all the -deals with a keras model that will tell you if the deal is a fraud or not. In order to do that you -declare a new -[transformer that takes the data from](resource.md#feeding-data-from-one-resource-into-another) `deals` -resource and add it to the source. +You can add a custom resource to a source after it was created. 
Imagine that you want to score all the deals with a keras model that will tell you if the deal is a fraud or not. In order to do that, you declare a new [transformer that takes the data from](resource.md#feeding-data-from-one-resource-into-another) `deals` resource and add it to the source. ```py import dlt @@ -143,7 +136,7 @@ source.resources.add(source.deals | deal_scores) # load the data: you'll see the new table `deal_scores` in your destination! pipeline.run(source) ``` -You can also set the resources in the source as follows +You can also set the resources in the source as follows: ```py source.deal_scores = source.deals | deal_scores ``` @@ -152,13 +145,12 @@ or source.resources["deal_scores"] = source.deals | deal_scores ``` :::note -When adding resource to the source, `dlt` clones the resource so your existing instance is not affected. +When adding a resource to the source, `dlt` clones the resource so your existing instance is not affected. ::: ### Reduce the nesting level of generated tables -You can limit how deep `dlt` goes when generating nested tables and flattening dicts into columns. By default, the library will descend -and generate nested tables for all nested lists and columns form dicts, without limit. +You can limit how deep `dlt` goes when generating nested tables and flattening dicts into columns. By default, the library will descend and generate nested tables for all nested lists and columns from dicts, without limit. ```py @dlt.source(max_table_nesting=1) @@ -166,13 +158,10 @@ def mongo_db(): ... ``` -In the example above, we want only 1 level of nested tables to be generated (so there are no nested -tables of a nested table). Typical settings: +In the example above, we want only 1 level of nested tables to be generated (so there are no nested tables of a nested table). Typical settings: -- `max_table_nesting=0` will not generate nested tables and will not flatten dicts into columns at all. All nested data will be - represented as JSON. 
-- `max_table_nesting=1` will generate nested tables of root tables and nothing more. All nested - data in nested tables will be represented as JSON. +- `max_table_nesting=0` will not generate nested tables and will not flatten dicts into columns at all. All nested data will be represented as JSON. +- `max_table_nesting=1` will generate nested tables of root tables and nothing more. All nested data in nested tables will be represented as JSON. You can achieve the same effect after the source instance is created: @@ -183,17 +172,12 @@ source = mongo_db() source.max_table_nesting = 0 ``` -Several data sources are prone to contain semi-structured documents with very deep nesting i.e. -MongoDB databases. Our practical experience is that setting the `max_nesting_level` to 2 or 3 -produces the clearest and human-readable schemas. +Several data sources are prone to contain semi-structured documents with very deep nesting, e.g., MongoDB databases. Our practical experience is that setting `max_table_nesting` to 2 or 3 produces the clearest and most human-readable schemas. :::tip -The `max_table_nesting` parameter at the source level doesn't automatically apply to individual -resources when accessed directly (e.g., using `source.resources["resource_1"])`. To make sure it -works, either use `source.with_resources("resource_1")` or set the parameter directly on the resource. +The `max_table_nesting` parameter at the source level doesn't automatically apply to individual resources when accessed directly (e.g., using `source.resources["resource_1"]`). To make sure it works, either use `source.with_resources("resource_1")` or set the parameter directly on the resource. ::: - You can directly configure the `max_table_nesting` parameter on the resource level as: ```py @@ -209,28 +193,28 @@ my_source.my_resource.max_table_nesting = 0 ### Modify schema -The schema is available via `schema` property of the source. -[You can manipulate this schema i.e. 
add tables, change column definitions etc. before the data is loaded.](schema.md#schema-is-modified-in-the-source-function-body) +The schema is available via the `schema` property of the source. +[You can manipulate this schema, i.e., add tables, change column definitions, etc., before the data is loaded.](schema.md#schema-is-modified-in-the-source-function-body) -Source provides two other convenience properties: +The source provides two other convenience properties: -1. `max_table_nesting` to set the maximum nesting level for nested tables and flattened columns -1. `root_key` to propagate the `_dlt_id` of from a root table to all nested tables. +1. `max_table_nesting` to set the maximum nesting level for nested tables and flattened columns. +1. `root_key` to propagate the `_dlt_id` from a root table to all nested tables. ## Load sources -You can pass individual sources or list of sources to the `dlt.pipeline` object. By default, all the -sources will be loaded to a single dataset. +You can pass individual sources or a list of sources to the `dlt.pipeline` object. By default, all the +sources will be loaded into a single dataset. You are also free to decompose a single source into several ones. For example, you may want to break -down a 50 table copy job into an airflow dag with high parallelism to load the data faster. To do +down a 50-table copy job into an Airflow DAG with high parallelism to load the data faster. 
To do so, you could get the list of resources as: ```py # get a list of resources' names resource_list = sql_source().resources.keys() -#now we are able to make a pipeline for each resource +# now we are able to make a pipeline for each resource for res in resource_list: pipeline.run(sql_source().with_resources(res)) ``` @@ -249,3 +233,4 @@ With selected resources: ```py p.run(tables.with_resources("users"), write_disposition="replace") ``` + diff --git a/docs/website/docs/general-usage/state.md b/docs/website/docs/general-usage/state.md index b34d37c8b1..db742c20b5 100644 --- a/docs/website/docs/general-usage/state.md +++ b/docs/website/docs/general-usage/state.md @@ -6,21 +6,21 @@ keywords: [state, metadata, dlt.current.resource_state, dlt.current.source_state # State -The pipeline state is a Python dictionary which lives alongside your data; you can store values in -it and, on next pipeline run, request them back. +The pipeline state is a Python dictionary that lives alongside your data; you can store values in +it and, on the next pipeline run, request them back. ## Read and write pipeline state in a resource -You read and write the state in your resources. Below we use the state to create a list of chess -game archives which we then use to +You read and write the state in your resources. Below, we use the state to create a list of chess +game archives, which we then use to [prevent requesting duplicates](incremental-loading.md#advanced-state-usage-storing-a-list-of-processed-entities). 
```py @dlt.resource(write_disposition="append") def players_games(chess_url, player, start_month=None, end_month=None): - # create or request a list of archives from resource scoped state + # create or request a list of archives from resource-scoped state checked_archives = dlt.current.resource_state().setdefault("archives", []) - # get list of archives for a particular player + # get a list of archives for a particular player archives = player_archives(chess_url, player) for url in archives: if url in checked_archives: @@ -35,55 +35,55 @@ def players_games(chess_url, player, start_month=None, end_month=None): yield r.json().get("games", []) ``` -Above, we request the resource-scoped state. The `checked_archives` list stored under `archives` +Above, we request the resource-scoped state. The `checked_archives` list stored under the `archives` dictionary key is private and visible only to the `players_games` resource. -The pipeline state is stored locally in -[pipeline working directory](pipeline.md#pipeline-working-directory) and as a consequence - it +The pipeline state is stored locally in the +[pipeline working directory](pipeline.md#pipeline-working-directory) and, as a consequence, it cannot be shared with pipelines with different names. You must also make sure that data written into -the state is JSON Serializable. Except standard Python types, `dlt` handles `DateTime`, `Decimal`, -`bytes` and `UUID`. +the state is JSON serializable. Except for standard Python types, `dlt` handles `DateTime`, `Decimal`, +`bytes`, and `UUID`. ## Share state across resources and read state in a source -You can also access the source-scoped state with `dlt.current.source_state()` which can be shared +You can also access the source-scoped state with `dlt.current.source_state()`, which can be shared across resources of a particular source and is also available read-only in the source-decorated -functions. 
The most common use case for the source-scoped state is to store mapping of custom fields +functions. The most common use case for the source-scoped state is to store a mapping of custom fields to their displayable names. You can take a look at our [pipedrive source](https://github.com/dlt-hub/verified-sources/blob/master/sources/pipedrive/__init__.py#L118) for an example of state passed across resources. :::tip -[decompose your source](../reference/performance.md#source-decomposition-for-serial-and-parallel-resource-execution) -in order to, for example run it on Airflow in parallel. If you cannot avoid that, designate one of -the resources as state writer and all the other as state readers. This is exactly what `pipedrive` -pipeline does. With such structure you will still be able to run some of your resources in +[Decompose your source](../reference/performance.md#source-decomposition-for-serial-and-parallel-resource-execution) +to, for example, run it on Airflow in parallel. If you cannot avoid that, designate one of +the resources as the state writer and all others as state readers. This is exactly what the `pipedrive` +pipeline does. With such a structure, you will still be able to run some of your resources in parallel. ::: :::caution -The `dlt.state()` is a deprecated alias to `dlt.current.source_state()` and will be soon +The `dlt.state()` is a deprecated alias to `dlt.current.source_state()` and will soon be removed. ::: ## Syncing state with destination -What if you run your pipeline on, for example, Airflow where every task gets a clean filesystem and -[pipeline working directory](pipeline.md#pipeline-working-directory) is always deleted? `dlt` loads -your state into the destination together with all other data and when faced with a clean start, it -will try to restore state from the destination. 
+What if you run your pipeline on, for example, Airflow, where every task gets a clean filesystem and +the [pipeline working directory](pipeline.md#pipeline-working-directory) is always deleted? `dlt` loads +your state into the destination along with all other data, and when faced with a clean start, it +will try to restore the state from the destination. -The remote state is identified by pipeline name, the destination location (as given by the -credentials) and destination dataset. To re-use the same state, use the same pipeline name and +The remote state is identified by the pipeline name, the destination location (as given by the +credentials), and the destination dataset. To reuse the same state, use the same pipeline name and destination. The state is stored in the `_dlt_pipeline_state` table at the destination and contains information -about the pipeline, pipeline run (that the state belongs to) and state blob. +about the pipeline, the pipeline run (to which the state belongs), and the state blob. -`dlt` has `dlt pipeline sync` command where you can +`dlt` has a `dlt pipeline sync` command where you can [request the state back from that table](../reference/command-line-interface.md#sync-pipeline-with-the-destination). > 💡 If you can keep the pipeline working directory across the runs, you can disable the state sync -> by setting `restore_from_destination=false` i.e. in your `config.toml`. +> by setting `restore_from_destination=false` in your `config.toml`. ## When to use pipeline state @@ -94,77 +94,72 @@ about the pipeline, pipeline run (that the state belongs to) and state blob. if the list is not much bigger than 100k elements. - [Store large dictionaries of last values](incremental-loading.md#advanced-state-usage-tracking-the-last-value-for-all-search-terms-in-twitter-api) if you are not able to implement it with the standard incremental construct. -- Store the custom fields dictionaries, dynamic configurations and other source-scoped state. 
+- Store custom fields dictionaries, dynamic configurations, and other source-scoped state. ## Do not use pipeline state if it can grow to millions of records -Do not use dlt state when it may grow to millions of elements. Do you plan to store modification -timestamps of all of your millions of user records? This is probably a bad idea! In that case you +Do not use `dlt` state when it may grow to millions of elements. Do you plan to store modification +timestamps of all your millions of user records? This is probably a bad idea! In that case, you could: -- Store the state in dynamo-db, redis etc. taking into the account that if the extract stage fails - you'll end with invalid state. +- Store the state in DynamoDB, Redis, etc., taking into account that if the extract stage fails, + you'll end up with an invalid state. - Use your loaded data as the state. `dlt` exposes the current pipeline via `dlt.current.pipeline()` from which you can obtain [sqlclient](../dlt-ecosystem/transformations/sql.md) - and load the data of interest. In that case try at least to process your user records in batches. + and load the data of interest. In that case, try at least to process your user records in batches. ### Access data in the destination instead of pipeline state -In the example below, we load recent comments made by given `user_id`. We access `user_comments` table to select -maximum comment id for a given user. + +In the example below, we load recent comments made by a given `user_id`. We access the `user_comments` table to select the maximum comment id for a given user. 
```py import dlt @dlt.resource(name="user_comments") def comments(user_id: str): current_pipeline = dlt.current.pipeline() - # find last comment id for given user_id by looking in destination + # find the last comment id for the given user_id by looking in the destination max_id: int = 0 - # on first pipeline run, user_comments table does not yet exist so do not check at all - # alternatively catch DatabaseUndefinedRelation which is raised when unknown table is selected + # on the first pipeline run, the user_comments table does not yet exist so do not check at all + # alternatively, catch DatabaseUndefinedRelation which is raised when an unknown table is selected if not current_pipeline.first_run: with current_pipeline.sql_client() as client: - # we may get last user comment or None which we replace with 0 + # we may get the last user comment or None which we replace with 0 max_id = ( client.execute_sql( "SELECT MAX(_id) FROM user_comments WHERE user_id=?", user_id )[0][0] or 0 ) - # use max_id to filter our results (we simulate API query) + # use max_id to filter our results (we simulate an API query) yield from [ {"_id": i, "value": letter, "user_id": user_id} for i, letter in zip([1, 2, 3], ["A", "B", "C"]) if i > max_id ] ``` -When pipeline is first run, the destination dataset and `user_comments` table do not yet exist. We skip the destination -query by using `first_run` property of the pipeline. We also handle a situation where there are no comments for a user_id -by replacing None with 0 as `max_id`. +When the pipeline is first run, the destination dataset and `user_comments` table do not yet exist. We skip the destination query by using the `first_run` property of the pipeline. We also handle a situation where there are no comments for a user_id by replacing None with 0 as `max_id`. 
## Inspect the pipeline state -You can inspect pipeline state with -[`dlt pipeline` command](../reference/command-line-interface.md#dlt-pipeline): +You can inspect the pipeline state with the [`dlt pipeline` command](../reference/command-line-interface.md#dlt-pipeline): ```sh dlt pipeline -v chess_pipeline info ``` -will display source and resource state slots for all known sources. +This will display the source and resource state slots for all known sources. ## Reset the pipeline state: full or partial **To fully reset the state:** - Drop the destination dataset to fully reset the pipeline. -- [Set the `dev_mode` flag when creating pipeline](pipeline.md#do-experiments-with-dev-mode). -- Use the `dlt pipeline drop --drop-all` command to - [drop state and tables for a given schema name](../reference/command-line-interface.md#selectively-drop-tables-and-reset-state). +- [Set the `dev_mode` flag when creating the pipeline](pipeline.md#do-experiments-with-dev-mode). +- Use the `dlt pipeline drop --drop-all` command to [drop the state and tables for a given schema name](../reference/command-line-interface.md#selectively-drop-tables-and-reset-state). **To partially reset the state:** -- Use the `dlt pipeline drop ` command to - [drop state and tables for a given resource](../reference/command-line-interface.md#selectively-drop-tables-and-reset-state). -- Use the `dlt pipeline drop --state-paths` command to - [reset the state at given path without touching the tables and data](../reference/command-line-interface.md#selectively-drop-tables-and-reset-state). +- Use the `dlt pipeline drop ` command to [drop the state and tables for a given resource](../reference/command-line-interface.md#selectively-drop-tables-and-reset-state). +- Use the `dlt pipeline drop --state-paths` command to [reset the state at a given path without touching the tables and data](../reference/command-line-interface.md#selectively-drop-tables-and-reset-state). 
+ diff --git a/docs/website/docs/intro.md b/docs/website/docs/intro.md index 6660696cfb..650c47920b 100644 --- a/docs/website/docs/intro.md +++ b/docs/website/docs/intro.md @@ -18,7 +18,7 @@ dlt is designed to be easy to use, flexible, and scalable: - dlt infers [schemas](./general-usage/schema) and [data types](./general-usage/schema/#data-types), [normalizes the data](./general-usage/schema/#data-normalizer), and handles nested data structures. - dlt supports a variety of [popular destinations](./dlt-ecosystem/destinations/) and has an interface to add [custom destinations](./dlt-ecosystem/destinations/destination) to create reverse ETL pipelines. -- dlt can be deployed anywhere Python runs, be it on [Airflow](./walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer), [serverless functions](./walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-functions) or any other cloud deployment of your choice. +- dlt can be deployed anywhere Python runs, be it on [Airflow](./walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer), [serverless functions](./walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-functions), or any other cloud deployment of your choice. - dlt automates pipeline maintenance with [schema evolution](./general-usage/schema-evolution) and [schema and data contracts](./general-usage/schema-contracts). To get started with dlt, install the library using pip: @@ -43,7 +43,7 @@ We recommend using a clean virtual environment for your experiments! Read the [d ]}> -Use dlt's [REST API source](./tutorial/rest-api) to extract data from any REST API. Define API endpoints you’d like to fetch data from, pagination method and authentication and dlt will handle the rest: +Use dlt's [REST API source](./tutorial/rest-api) to extract data from any REST API. 
Define the API endpoints you’d like to fetch data from, the pagination method, and authentication, and dlt will handle the rest: ```py import dlt @@ -76,7 +76,7 @@ Follow the [REST API source tutorial](./tutorial/rest-api) to learn more about t -Use the [SQL source](./tutorial/sql-database) to extract data from the database like PostgreSQL, MySQL, SQLite, Oracle and more. +Use the [SQL source](./tutorial/sql-database) to extract data from databases like PostgreSQL, MySQL, SQLite, Oracle, and more. ```py from dlt.sources.sql_database import sql_database @@ -99,7 +99,7 @@ Follow the [SQL source tutorial](./tutorial/sql-database) to learn more about th -[Filesystem](./tutorial/filesystem) source extracts data from AWS S3, Google Cloud Storage, Google Drive, Azure, or a local file system. +The [Filesystem](./tutorial/filesystem) source extracts data from AWS S3, Google Cloud Storage, Google Drive, Azure, or a local file system. ```py from dlt.sources.filesystem import filesystem @@ -155,4 +155,5 @@ If you'd like to try out dlt without installing it on your machine, check out th 1. Give the library a ⭐ and check out the code on [GitHub](https://github.com/dlt-hub/dlt). 1. Ask questions and share how you use the library on [Slack](https://dlthub.com/community). -1. Report problems and make feature requests [here](https://github.com/dlt-hub/dlt/issues/new/choose). \ No newline at end of file +1. Report problems and make feature requests [here](https://github.com/dlt-hub/dlt/issues/new/choose). + diff --git a/docs/website/docs/reference/telemetry.md b/docs/website/docs/reference/telemetry.md index ea5140bc96..43d537e670 100644 --- a/docs/website/docs/reference/telemetry.md +++ b/docs/website/docs/reference/telemetry.md @@ -135,7 +135,7 @@ The message context contains the following information: ## Send telemetry data to your own tracker You can setup your own tracker to receive telemetry events. 
You can create scalable, globally distributed -edge service [using `dlt` and Cloudflare](https://dlthub.com/docs/blog/dlt-segment-migration). +edge service [using `dlt` and Cloudflare](https://dlthub.com/blog/dlt-segment-migration). Once your tracker is running, point `dlt` to it. You can use global `config.toml` to redirect all pipelines on a given machine. diff --git a/docs/website/docs/tutorial/filesystem.md b/docs/website/docs/tutorial/filesystem.md index b748f794d5..6d30eed3e6 100644 --- a/docs/website/docs/tutorial/filesystem.md +++ b/docs/website/docs/tutorial/filesystem.md @@ -4,7 +4,7 @@ description: Learn how to load data files like JSON, JSONL, CSV, and Parquet fro keywords: [dlt, tutorial, filesystem, cloud storage, file system, python, data pipeline, incremental loading, json, jsonl, csv, parquet, duckdb] --- -This tutorial is for you if you need to load data files like JSONL, CSV, and Parquet from either Cloud Storage (ex. AWS S3, Google Cloud Storage, Google Drive, Azure Blob Storage) or a local file system. +This tutorial is for you if you need to load data files like JSONL, CSV, and Parquet from either Cloud Storage (e.g., AWS S3, Google Cloud Storage, Google Drive, Azure Blob Storage) or a local file system. ## What you will learn @@ -48,24 +48,22 @@ Here’s what each file does: - `config.toml`: This file contains the configuration settings for your dlt project. :::note -When deploying your pipeline in a production environment, managing all configurations with files might not be convenient. In this case, we recommend you to use the environment variables to store secrets and configs instead. Read more about [configuration providers](../general-usage/credentials/setup#available-config-providers) available in dlt. +When deploying your pipeline in a production environment, managing all configurations with files might not be convenient. In this case, we recommend you use environment variables to store secrets and configs instead. 
Read more about [configuration providers](../general-usage/credentials/setup#available-config-providers) available in dlt. ::: ## 2. Creating the pipeline The filesystem source provides users with building blocks for loading data from any type of files. You can break down the data extraction into two steps: -1. Listing the files in the bucket / directory. +1. Listing the files in the bucket/directory. 2. Reading the files and yielding records. dlt's filesystem source includes several resources: -- the `filesystem` resource lists files in the directory or bucket -- several readers resources (`read_csv`, `read_parquet`, `read_jsonl`) read files and yield the records. These resources have a -special type, they called [transformers](../general-usage/resource#process-resources-with-dlttransformer). Transformers expect items from another resource. -In this particular case transformers expect `FileItem` object and transform it into multiple records. +- The `filesystem` resource lists files in the directory or bucket. +- Several reader resources (`read_csv`, `read_parquet`, `read_jsonl`) read files and yield the records. These resources have a special type; they are called [transformers](../general-usage/resource#process-resources-with-dlttransformer). Transformers expect items from another resource. In this particular case, transformers expect a `FileItem` object and transform it into multiple records. -Let's initialize a source and create a pipeline for loading CSV files from Google Cloud Storage to DuckDB. You can replace code from `filesystem_pipeline.py` with the following: +Let's initialize a source and create a pipeline for loading CSV files from Google Cloud Storage to DuckDB. You can replace the code from `filesystem_pipeline.py` with the following: ```py import dlt @@ -81,26 +79,25 @@ print(info) What's happening in the snippet above? -1.
We import the `filesystem` resource and initialize it with a bucket URL (`gs://filesystem-tutorial`) and the `file_glob` parameter. dlt uses `file_glob` to filter files names in the bucket. `filesystem` returns a generator object. -2. We pipe the files names yielded by the filesystem resource to the transformer resource `read_csv` to read each file and iterate over records from the file. We name this transformer resource `"encounters"` using the `with_name()`. dlt will use the resource name `"encounters"` as a table name when loading the data. +1. We import the `filesystem` resource and initialize it with a bucket URL (`gs://filesystem-tutorial`) and the `file_glob` parameter. dlt uses `file_glob` to filter file names in the bucket. `filesystem` returns a generator object. +2. We pipe the file names yielded by the filesystem resource to the transformer resource `read_csv` to read each file and iterate over records from the file. We name this transformer resource `"encounters"` using the `with_name()` method. dlt will use the resource name `"encounters"` as a table name when loading the data. :::note A [transformer](../general-usage/resource#process-resources-with-dlttransformer) in dlt is a special type of resource that processes each record from another resource. This lets you chain multiple resources together. ::: -3. We create the dlt pipeline configuring with the name `hospital_data_pipeline` and DuckDB destination. +3. We create the dlt pipeline, configuring it with the name `hospital_data_pipeline` and DuckDB as the destination. 4. We call `pipeline.run()`. This is where the underlying generators are iterated: - dlt retrieves remote data, - normalizes data, - creates or updates the table in the destination, - loads the extracted data into the destination. - 5. `print(info)` outputs pipeline running stats we get from `pipeline.run()` +5. `print(info)` outputs the pipeline running stats we get from `pipeline.run()`. ## 3. 
Configuring the filesystem source :::note -In this tutorial we will work with publicly accessed dataset [Hospital Patient Records](https://mavenanalytics.io/data-playground?order=date_added%2Cdesc&search=Hospital%20Patient%20Records) -synthetic electronic health care records. You can use the exact credentials from this tutorial to load this dataset from GCP. +In this tutorial, we will work with the publicly accessible dataset [Hospital Patient Records](https://mavenanalytics.io/data-playground?order=date_added%2Cdesc&search=Hospital%20Patient%20Records), which contains synthetic electronic health care records. You can use the exact credentials from this tutorial to load this dataset from GCP.
Citation Jason Walonoski, Mark Kramer, Joseph Nichols, Andre Quina, Chris Moesel, Dylan Hall, Carlton Duffett, Kudakwashe Dube, Thomas Gallagher, Scott McLachlan, Synthea: An approach, method, and software mechanism for generating synthetic patients and the synthetic electronic health care record, Journal of the American Medical Informatics Association, Volume 25, Issue 3, March 2018, Pages 230–238, https://doi.org/10.1093/jamia/ocx079 @@ -170,7 +167,7 @@ files = filesystem( As you can see, all parameters of `filesystem` can be specified directly in the code or taken from the configuration. :::tip -dlt supports more ways of authorizing with the cloud storages, including identity-based and default credentials. To learn more about adding credentials to your pipeline, please refer to the [Configuration and secrets section](../general-usage/credentials/complex_types#aws-credentials). +dlt supports more ways of authorizing with cloud storages, including identity-based and default credentials. To learn more about adding credentials to your pipeline, please refer to the [Configuration and secrets section](../general-usage/credentials/complex_types#aws-credentials). ::: ## 4. Running the pipeline @@ -257,9 +254,9 @@ info = pipeline.run(reader, write_disposition="merge") print(info) ``` -Notice that we used `apply_hints` on the `files` resource, not on `reader`. Why did we do that? As mentioned before, the `filesystem` resource lists all files in the storage based on the `file_glob` parameter. So at this point, we can also specify additional conditions to filter out files. In this case, we only want to load files that have been modified since the last load. dlt will automatically keep the state of incremental load and manage the correct filtering. +Notice that we used `apply_hints` on the `files` resource, not on `reader`. As mentioned before, the `filesystem` resource lists all files in the storage based on the `file_glob` parameter. 
So at this point, we can also specify additional conditions to filter out files. In this case, we only want to load files that have been modified since the last load. dlt will automatically keep the state of the incremental load and manage the correct filtering. -But what if we not only want to process modified files, but we also want to load only new records? In the `encounters` table, we can see the column named `STOP` indicating the timestamp of the end of the encounter. Let's modify our code to load only those records whose `STOP` timestamp was updated since our last load. +But what if we not only want to process modified files but also want to load only new records? In the `encounters` table, we can see the column named `STOP` indicating the timestamp of the end of the encounter. Let's modify our code to load only those records whose `STOP` timestamp was updated since our last load. ```py import dlt @@ -302,7 +299,7 @@ from dlt.sources.filesystem import filesystem def read_csv_custom(items: Iterator[FileItemDict], chunksize: int = 10000, **pandas_kwargs: Any) -> Iterator[TDataItems]: import pandas as pd - # apply defaults to pandas kwargs + # Apply defaults to pandas kwargs kwargs = {**{"header": "infer", "chunksize": chunksize}, **pandas_kwargs} for file_obj in items: @@ -340,7 +337,7 @@ from dlt.common.storages.fsspec_filesystem import FileItemDict from dlt.common.typing import TDataItems from dlt.sources.filesystem import filesystem -# Define a standalone transformer to read data from a json file. +# Define a standalone transformer to read data from a JSON file. @dlt.transformer(standalone=True) def read_json(items: Iterator[FileItemDict]) -> Iterator[TDataItems]: for file_obj in items: @@ -367,3 +364,4 @@ Interested in learning more about dlt? 
Here are some suggestions: - Learn more about the filesystem source configuration in [filesystem source](../dlt-ecosystem/verified-sources/filesystem) - Learn more about different credential types in [Built-in credentials](../general-usage/credentials/complex_types#built-in-credentials) - Learn how to [create a custom source](./load-data-from-an-api.md) in the advanced tutorial + diff --git a/docs/website/docs/tutorial/load-data-from-an-api.md b/docs/website/docs/tutorial/load-data-from-an-api.md index 5b1d63373c..3640f0e8d7 100644 --- a/docs/website/docs/tutorial/load-data-from-an-api.md +++ b/docs/website/docs/tutorial/load-data-from-an-api.md @@ -9,7 +9,7 @@ This tutorial introduces you to foundational dlt concepts, demonstrating how to ## What you will learn - Loading data from a list of Python dictionaries into DuckDB. -- Low level API usage with built-in HTTP client. +- Low-level API usage with a built-in HTTP client. - Understand and manage data loading behaviors. - Incrementally load new data and deduplicate existing data. - Dynamic resource creation and reducing code redundancy. @@ -74,13 +74,13 @@ Load package 1692364844.460054 is LOADED and contains no failed jobs ### Explore the data -To allow sneak peek and basic discovery you can take advantage of [built-in integration with Strealmit](../reference/command-line-interface#show-tables-and-data-in-the-destination): +To allow a sneak peek and basic discovery, you can take advantage of [built-in integration with Streamlit](../reference/command-line-interface#show-tables-and-data-in-the-destination): ```sh dlt pipeline quick_start show ``` -**quick_start** is the name of the pipeline from the script above. If you do not have Streamlit installed yet do: +**quick_start** is the name of the pipeline from the script above. If you do not have Streamlit installed yet, do: ```sh pip install streamlit @@ -94,13 +94,13 @@ Streamlit Explore data. Schema and data for a test pipeline “quick_start”. 
:::tip `dlt` works in Jupyter Notebook and Google Colab! See our [Quickstart Colab Demo.](https://colab.research.google.com/drive/1NfSB1DpwbbHX9_t5vlalBTf13utwpMGx?usp=sharing) -Looking for source code of all the snippets? You can find and run them [from this repository](https://github.com/dlt-hub/dlt/blob/devel/docs/website/docs/getting-started-snippets.py). +Looking for the source code of all the snippets? You can find and run them [from this repository](https://github.com/dlt-hub/dlt/blob/devel/docs/website/docs/getting-started-snippets.py). ::: -Now that you have a basic understanding of how to get started with dlt, you might be eager to dive deeper. For that we need to switch to a more advanced data source - the GitHub API. We will load issues from our [dlt-hub/dlt](https://github.com/dlt-hub/dlt) repository. +Now that you have a basic understanding of how to get started with dlt, you might be eager to dive deeper. For that, we need to switch to a more advanced data source - the GitHub API. We will load issues from our [dlt-hub/dlt](https://github.com/dlt-hub/dlt) repository. :::note -This tutorial uses GitHub REST API for demonstration purposes only. If you need to read data from a REST API, consider using the dlt's REST API source. Check out the [REST API source tutorial](./rest-api) for quick start or [REST API source reference](../dlt-ecosystem/verified-sources/rest_api) for more details. +This tutorial uses the GitHub REST API for demonstration purposes only. If you need to read data from a REST API, consider using dlt's REST API source. Check out the [REST API source tutorial](./rest-api) for a quick start or the [REST API source reference](../dlt-ecosystem/verified-sources/rest_api) for more details. ::: ## Create a pipeline @@ -112,7 +112,7 @@ First, we need to create a [pipeline](../general-usage/pipeline). Pipelines are Here's what the code above does: 1. It makes a request to the GitHub API endpoint and checks if the response is successful. -2. 
Then it creates a dlt pipeline with the name `github_issues` and specifies that the data should be loaded to the `duckdb` destination and the `github_data` dataset. Nothing gets loaded yet. +2. Then, it creates a dlt pipeline with the name `github_issues` and specifies that the data should be loaded to the `duckdb` destination and the `github_data` dataset. Nothing gets loaded yet. 3. Finally, it runs the pipeline with the data from the API response (`response.json()`) and specifies that the data should be loaded to the `issues` table. The `run` method returns a `LoadInfo` object that contains information about the loaded data. ## Run the pipeline @@ -134,7 +134,7 @@ dlt pipeline github_issues show Try running the pipeline again with `python github_issues.py`. You will notice that the **issues** table contains two copies of the same data. This happens because the default load mode is `append`. It is very useful, for example, when you have daily data updates and you want to ingest them. To get the latest data, we'd need to run the script again. But how to do that without duplicating the data? -One option is to tell `dlt` to replace the data in existing tables in the destination by using `replace` write disposition. Change the `github_issues.py` script to the following: +One option is to tell `dlt` to replace the data in existing tables in the destination by using the `replace` write disposition. Change the `github_issues.py` script to the following: ```py import dlt @@ -161,7 +161,7 @@ load_info = pipeline.run( print(load_info) ``` -Run this script twice to see that **issues** table still contains only one copy of the data. +Run this script twice to see that the **issues** table still contains only one copy of the data. :::tip What if the API has changed and new fields get added to the response? 
@@ -172,18 +172,18 @@ See the `replace` mode and table schema migration in action in our [Schema evolu Learn more: - [Full load - how to replace your data](../general-usage/full-loading). -- [Append, replace and merge your tables](../general-usage/incremental-loading). +- [Append, replace, and merge your tables](../general-usage/incremental-loading). ## Declare loading behavior -So far we have been passing the data to the `run` method directly. This is a quick way to get started. However, frequently, you receive data in chunks, and you want to load it as it arrives. For example, you might want to load data from an API endpoint with pagination or a large file that does not fit in memory. In such cases, you can use Python generators as a data source. +So far, we have been passing the data to the `run` method directly. This is a quick way to get started. However, frequently, you receive data in chunks, and you want to load it as it arrives. For example, you might want to load data from an API endpoint with pagination or a large file that does not fit in memory. In such cases, you can use Python generators as a data source. You can pass a generator to the `run` method directly or use the `@dlt.resource` decorator to turn the generator into a [dlt resource](../general-usage/resource). The decorator allows you to specify the loading behavior and relevant resource parameters. ### Load only new data (incremental loading) -Let's improve our GitHub API example and get only issues that were created since last load. -Instead of using `replace` write disposition and downloading all issues each time the pipeline is run, we do the following: +Let's improve our GitHub API example and get only issues that were created since the last load. +Instead of using the `replace` write disposition and downloading all issues each time the pipeline is run, we do the following: @@ -192,17 +192,17 @@ Let's take a closer look at the code above. 
We use the `@dlt.resource` decorator to declare the table name into which data will be loaded and specify the `append` write disposition. -We request issues for dlt-hub/dlt repository ordered by **created_at** field (descending) and yield them page by page in `get_issues` generator function. +We request issues for the dlt-hub/dlt repository ordered by the **created_at** field (descending) and yield them page by page in the `get_issues` generator function. -We also use `dlt.sources.incremental` to track `created_at` field present in each issue to filter in the newly created. +We also use `dlt.sources.incremental` to track the `created_at` field present in each issue to filter in the newly created ones. Now run the script. It loads all the issues from our repo to `duckdb`. Run it again, and you can see that no issues got added (if no issues were created in the meantime). -Now you can run this script on a daily schedule and each day you’ll load only issues created after the time of the previous pipeline run. +Now you can run this script on a daily schedule, and each day you’ll load only issues created after the time of the previous pipeline run. :::tip -Between pipeline runs, `dlt` keeps the state in the same database it loaded data to. -Peek into that state, the tables loaded and get other information with: +Between pipeline runs, `dlt` keeps the state in the same database it loaded data into. +Peek into that state and the loaded tables, and get other information with: ```sh dlt pipeline -v github_issues_incremental info @@ -219,25 +219,25 @@ Learn more: ### Update and deduplicate your data The script above finds **new** issues and adds them to the database. -It will ignore any updates to **existing** issue text, emoji reactions etc. -To get always fresh content of all the issues you combine incremental load with `merge` write disposition, +It will ignore any updates to **existing** issue text, emoji reactions, etc.
+To always get fresh content of all the issues, combine incremental load with the `merge` write disposition, like in the script below. -Above we add `primary_key` argument to the `dlt.resource()` that tells `dlt` how to identify the issues in the database to find duplicates which content it will merge. +Above, we add the `primary_key` argument to the `dlt.resource()` that tells `dlt` how to identify the issues in the database to find duplicates whose content it will merge. Note that we now track the `updated_at` field — so we filter in all issues **updated** since the last pipeline run (which also includes those newly created). -Pay attention how we use **since** parameter from [GitHub API](https://docs.github.com/en/rest/issues/issues?apiVersion=2022-11-28#list-repository-issues) +Pay attention to how we use the **since** parameter from the [GitHub API](https://docs.github.com/en/rest/issues/issues?apiVersion=2022-11-28#list-repository-issues) and `updated_at.last_value` to tell GitHub to return issues updated only **after** the date we pass. `updated_at.last_value` holds the last `updated_at` value from the previous run. [Learn more about merge write disposition](../general-usage/incremental-loading#merge-incremental_loading). ## Using pagination helper -In the previous examples, we used the `requests` library to make HTTP requests to the GitHub API and handled pagination manually. `dlt` has the built-in [REST client](../general-usage/http/rest-client.md) that simplifies API requests. We'll pick the `paginate()` helper from it for the next example. The `paginate` function takes a URL and optional parameters (quite similar to `requests`) and returns a generator that yields pages of data. +In the previous examples, we used the `requests` library to make HTTP requests to the GitHub API and handled pagination manually. `dlt` has a built-in [REST client](../general-usage/http/rest-client.md) that simplifies API requests. 
We'll use the `paginate()` helper from it for the next example. The `paginate` function takes a URL and optional parameters (quite similar to `requests`) and returns a generator that yields pages of data. Here's how the updated script looks: @@ -282,10 +282,10 @@ Let's zoom in on the changes: 1. The `while` loop that handled pagination is replaced with reading pages from the `paginate()` generator. 2. `paginate()` takes the URL of the API endpoint and optional parameters. In this case, we pass the `since` parameter to get only issues updated after the last pipeline run. -3. We're not explicitly setting up pagination, `paginate()` handles it for us. Magic! Under the hood, `paginate()` analyzes the response and detects the pagination method used by the API. Read more about pagination in the [REST client documentation](../general-usage/http/rest-client.md#paginating-api-responses). +3. We're not explicitly setting up pagination; `paginate()` handles it for us. Magic! Under the hood, `paginate()` analyzes the response and detects the pagination method used by the API. Read more about pagination in the [REST client documentation](../general-usage/http/rest-client.md#paginating-api-responses). 
If you want to take full advantage of the `dlt` library, then we strongly suggest that you build your sources out of existing building blocks: -To make most of `dlt`, consider the following: +To make the most of `dlt`, consider the following: ## Use source decorator @@ -301,7 +301,7 @@ from dlt.sources.helpers.rest_client import paginate primary_key="id", ) def get_comments( - updated_at = dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") + updated_at=dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") ): for page in paginate( "https://api.github.com/repos/dlt-hub/dlt/comments", @@ -310,7 +310,7 @@ def get_comments( yield page ``` -We can load this resource separately from the issues resource, however loading both issues and comments in one go is more efficient. To do that, we'll use the `@dlt.source` decorator on a function that returns a list of resources: +We can load this resource separately from the issues resource; however, loading both issues and comments in one go is more efficient. 
To do that, we'll use the `@dlt.source` decorator on a function that returns a list of resources: ```py @dlt.source @@ -330,7 +330,7 @@ from dlt.sources.helpers.rest_client import paginate primary_key="id", ) def get_issues( - updated_at = dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") + updated_at=dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") ): for page in paginate( "https://api.github.com/repos/dlt-hub/dlt/issues", @@ -338,7 +338,7 @@ def get_issues( "since": updated_at.last_value, "per_page": 100, "sort": "updated", - "directions": "desc", + "direction": "desc", "state": "open", } ): @@ -351,7 +351,7 @@ def get_issues( primary_key="id", ) def get_comments( - updated_at = dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") + updated_at=dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") ): for page in paginate( "https://api.github.com/repos/dlt-hub/dlt/comments", @@ -380,7 +380,7 @@ print(load_info) ### Dynamic resources -You've noticed that there's a lot of code duplication in the `get_issues` and `get_comments` functions. We can reduce that by extracting the common fetching code into a separate function and use it in both resources. Even better, we can use `dlt.resource` as a function and pass it the `fetch_github_data()` generator function directly. Here's the refactored code: +You've noticed that there's a lot of code duplication in the `get_issues` and `get_comments` functions. We can reduce that by extracting the common fetching code into a separate function and using it in both resources. Even better, we can use `dlt.resource` as a function and pass it the `fetch_github_data()` generator function directly. 
Here's the refactored code: ```py import dlt @@ -414,9 +414,9 @@ row_counts = pipeline.last_trace.last_normalize_info ## Handle secrets -For the next step we'd want to get the [number of repository clones](https://docs.github.com/en/rest/metrics/traffic?apiVersion=2022-11-28#get-repository-clones) for our dlt repo from the GitHub API. However, the `traffic/clones` endpoint that returns the data requires [authentication](https://docs.github.com/en/rest/overview/authenticating-to-the-rest-api?apiVersion=2022-11-28). +For the next step, we'd want to get the [number of repository clones](https://docs.github.com/en/rest/metrics/traffic?apiVersion=2022-11-28#get-repository-clones) for our dlt repo from the GitHub API. However, the `traffic/clones` endpoint that returns the data requires [authentication](https://docs.github.com/en/rest/overview/authenticating-to-the-rest-api?apiVersion=2022-11-28). -Let's handle this by changing our `fetch_github_data()` first: +Let's handle this by changing our `fetch_github_data()` function first: ```py from dlt.sources.helpers.rest_client.auth import BearerTokenAuth @@ -444,13 +444,13 @@ def github_source(access_token): ... ``` -Here, we added `access_token` parameter and now we can use it to pass the access token to the request: +Here, we added an `access_token` parameter and now we can use it to pass the access token to the request: ```py load_info = pipeline.run(github_source(access_token="ghp_XXXXX")) ``` -It's a good start. But we'd want to follow the best practices and not hardcode the token in the script. One option is to set the token as an environment variable, load it with `os.getenv()` and pass it around as a parameter. dlt offers a more convenient way to handle secrets and credentials: it lets you inject the arguments using a special `dlt.secrets.value` argument value. +It's a good start. But we'd want to follow the best practices and not hardcode the token in the script. 
One option is to set the token as an environment variable, load it with `os.getenv()`, and pass it around as a parameter. dlt offers a more convenient way to handle secrets and credentials: it lets you inject the arguments using a special `dlt.secrets.value` argument value. To use it, change the `github_source()` function to: @@ -467,7 +467,7 @@ When you add `dlt.secrets.value` as a default value for an argument, `dlt` will 1. Special environment variables. 2. `secrets.toml` file. -The `secret.toml` file is located in the `~/.dlt` folder (for global configuration) or in the `.dlt` folder in the project folder (for project-specific configuration). +The `secrets.toml` file is located in the `~/.dlt` folder (for global configuration) or in the `.dlt` folder in the project folder (for project-specific configuration). Let's add the token to the `~/.dlt/secrets.toml` file: @@ -505,7 +505,7 @@ load_info = pipeline.run(github_source()) ## Configurable sources -The next step is to make our dlt GitHub source reusable so it can load data from any GitHub repo. We'll do that by changing both `github_source()` and `fetch_github_data()` functions to accept the repo name as a parameter: +The next step is to make our dlt GitHub source reusable so it can load data from any GitHub repo. We'll do that by changing both the `github_source()` and `fetch_github_data()` functions to accept the repo name as a parameter: ```py import dlt @@ -515,7 +515,7 @@ BASE_GITHUB_URL = "https://api.github.com/repos/{repo_name}" def fetch_github_data(repo_name, endpoint, params={}, access_token=None): - """Fetch data from GitHub API based on repo_name, endpoint, and params.""" + """Fetch data from the GitHub API based on repo_name, endpoint, and params.""" url = BASE_GITHUB_URL.format(repo_name=repo_name) + f"/{endpoint}" return paginate( url, @@ -564,18 +564,16 @@ Interested in learning more? Here are some suggestions: 1. You've been running your pipelines locally. 
Learn how to [deploy and run them in the cloud](../walkthroughs/deploy-a-pipeline/). 2. Dive deeper into how dlt works by reading the [Using dlt](../general-usage) section. Some highlights: - [Set up "last value" incremental loading](../general-usage/incremental-loading#incremental_loading-with-last-value). - - Learn about data loading strategies: [append, replace and merge](../general-usage/incremental-loading). + - Learn about data loading strategies: [append, replace, and merge](../general-usage/incremental-loading). - [Connect the transformers to the resources](../general-usage/resource#feeding-data-from-one-resource-into-another) to load additional data or enrich it. - [Customize your data schema—set primary and merge keys, define column nullability, and specify data types](../general-usage/resource#define-schema). - [Create your resources dynamically from data](../general-usage/source#create-resources-dynamically). - [Transform your data before loading](../general-usage/resource#customize-resources) and see some [examples of customizations like column renames and anonymization](../general-usage/customising-pipelines/renaming_columns). - Employ data transformations using [SQL](../dlt-ecosystem/transformations/sql) or [Pandas](../dlt-ecosystem/transformations/sql). - [Pass config and credentials into your sources and resources](../general-usage/credentials). - - [Run in production: inspecting, tracing, retry policies and cleaning up](../running-in-production/running). - - [Run resources in parallel, optimize buffers and local storage](../reference/performance.md) + - [Run in production: inspecting, tracing, retry policies, and cleaning up](../running-in-production/running). + - [Run resources in parallel and optimize buffers and local storage](../reference/performance.md). - [Use REST API client helpers](../general-usage/http/rest-client.md) to simplify working with REST APIs. -3.
Explore [destinations](../dlt-ecosystem/destinations/) and [sources](../dlt-ecosystem/verified-sources/) provided by us and community. -4. Explore the [Examples](../examples) section to see how dlt can be used in real-world scenarios - - +3. Explore [destinations](../dlt-ecosystem/destinations/) and [sources](../dlt-ecosystem/verified-sources/) provided by us and the community. +4. Explore the [Examples](../examples) section to see how dlt can be used in real-world scenarios. diff --git a/docs/website/docs/tutorial/rest-api.md b/docs/website/docs/tutorial/rest-api.md index 3e214e0b55..e1c4d63daa 100644 --- a/docs/website/docs/tutorial/rest-api.md +++ b/docs/website/docs/tutorial/rest-api.md @@ -36,7 +36,7 @@ If you see the version number (such as "dlt 0.5.3"), you're ready to proceed. ## Setting up a new project -Initialize a new dlt project with REST API source and DuckDB destination: +Initialize a new dlt project with a REST API source and DuckDB destination: ```sh dlt init rest_api duckdb @@ -76,7 +76,7 @@ Let's verify that the pipeline is working as expected. Run the following command python rest_api_pipeline.py ``` -You should see the output of the pipeline execution in the terminal. The output will also diplay the location of the DuckDB database file where the data is stored: +You should see the output of the pipeline execution in the terminal. The output will also display the location of the DuckDB database file where the data is stored: ```sh Pipeline rest_api_pokemon load step completed in 1.08 seconds @@ -100,7 +100,7 @@ dlt pipeline rest_api_pokemon show ``` The command opens a new browser window with the data browser application. `rest_api_pokemon` is the name of the pipeline defined in the `rest_api_pipeline.py` file. 
-You can explore the loaded data, run queries and see some pipeline execution details: +You can explore the loaded data, run queries, and see some pipeline execution details: ![Explore rest_api data in Streamlit App](https://dlt-static.s3.eu-central-1.amazonaws.com/images/docs-rest-api-tutorial-streamlit-screenshot.png) @@ -109,6 +109,9 @@ You can explore the loaded data, run queries and see some pipeline execution det Now that your environment and the project are set up, let's take a closer look at the configuration of the REST API source. Open the `rest_api_pipeline.py` file in your code editor and locate the following code snippet: ```py +import dlt +from dlt.sources.rest_api import rest_api_source + def load_pokemon() -> None: pipeline = dlt.pipeline( pipeline_name="rest_api_pokemon", @@ -142,9 +145,9 @@ def load_pokemon() -> None: print(load_info) ``` -Here what's happening in the code: +Here's what's happening in the code: -1. With `dlt.pipeline()` we define a new pipeline named `rest_api_pokemon` with DuckDB as the destination and `rest_api_data` as the dataset name. +1. With `dlt.pipeline()`, we define a new pipeline named `rest_api_pokemon` with DuckDB as the destination and `rest_api_data` as the dataset name. 2. The `rest_api_source()` function creates a new REST API source object. 3. We pass this source object to the `pipeline.run()` method to start the pipeline execution. Inside the `run()` method, dlt will fetch data from the API and load it into the DuckDB database. 4. The `print(load_info)` outputs the pipeline execution details to the console. @@ -166,7 +169,7 @@ config: RESTAPIConfig = { ``` - The `client` configuration is used to connect to the web server and authenticate if necessary. For our simple example, we only need to specify the `base_url` of the API: `https://pokeapi.co/api/v2/`. -- The `resource_defaults` configuration allows you to set default parameters for all resources. 
Normally you would set common parameters here, such as pagination limits. In our Pokemon API example, we set the `limit` parameter to 1000 for all resources to retrieve more data in a single request and reduce the number of HTTP API calls. +- The `resource_defaults` configuration allows you to set default parameters for all resources. Normally, you would set common parameters here, such as pagination limits. In our Pokemon API example, we set the `limit` parameter to 1000 for all resources to retrieve more data in a single request and reduce the number of HTTP API calls. - The `resources` list contains the names of the resources you want to load from the API. REST API will use some conventions to determine the endpoint URL based on the resource name. For example, the resource name `pokemon` will be translated to the endpoint URL `https://pokeapi.co/api/v2/pokemon`. :::note @@ -176,7 +179,7 @@ You may have noticed that we didn't specify any pagination configuration in the ## Appending, replacing, and merging loaded data -Try running the pipeline again with `python rest_api_pipeline.py`. You will notice that all the tables have data duplicated. This happens because by default, dlt appends the data to the destination table. In dlt you can control how the data is loaded into the destination table by setting the `write_disposition` parameter in the resource configuration. The possible values are: +Try running the pipeline again with `python rest_api_pipeline.py`. You will notice that all the tables have duplicated data. This happens because, by default, dlt appends the data to the destination table. In dlt, you can control how the data is loaded into the destination table by setting the `write_disposition` parameter in the resource configuration. The possible values are: - `append`: Appends the data to the destination table. This is the default. - `replace`: Replaces the data in the destination table with the new data. 
- `merge`: Merges the new data with the existing data in the destination table based on the primary key. @@ -234,7 +237,7 @@ pokemon_source = rest_api_source( }, }, # For the `berry` and `location` resources, we keep - # the`replace` write disposition + # the `replace` write disposition "write_disposition": "replace", }, "resources": [ @@ -263,6 +266,9 @@ When working with some APIs, you may need to load data incrementally to avoid fe To illustrate incremental loading, let's consider the GitHub API. In the `rest_api_pipeline.py` file, you can find an example of how to load data from the GitHub API incrementally. Let's take a look at the configuration: ```py +import dlt +from dlt.sources.rest_api import rest_api_source + pipeline = dlt.pipeline( pipeline_name="rest_api_github", destination="duckdb", @@ -302,11 +308,11 @@ github_source = rest_api_source({ ], }) -load_info = pipeline.run(github_source()) +load_info = pipeline.run(github_source) print(load_info) ``` -In this configuration, the `since` parameter is defined as a special incremental parameter. The `cursor_path` field specifies the JSON path to the field that will be used to fetch the updated data and we use the `initial_value` for the initial value for the incremental parameter. This value will be used in the first request to fetch the data. +In this configuration, the `since` parameter is defined as a special incremental parameter. The `cursor_path` field specifies the JSON path to the field that will be used to fetch the updated data, and we use the `initial_value` for the initial value for the incremental parameter. This value will be used in the first request to fetch the data. When the pipeline runs, dlt will automatically update the `since` parameter with the latest value from the response data. This way, you can fetch only the new or updated data from the API. @@ -318,5 +324,6 @@ Congratulations on completing the tutorial! You've learned how to set up a REST Interested in learning more about dlt? 
Here are some suggestions: -- Learn more about the REST API source configuration in [REST API source documentation](../dlt-ecosystem/verified-sources/rest_api/) -- Learn how to [create a custom source](./load-data-from-an-api.md) in the advanced tutorial \ No newline at end of file +- Learn more about the REST API source configuration in the [REST API source documentation](../dlt-ecosystem/verified-sources/rest_api/). +- Learn how to [create a custom source](./load-data-from-an-api.md) in the advanced tutorial. + diff --git a/docs/website/docs/tutorial/sql-database.md b/docs/website/docs/tutorial/sql-database.md index 1a7702b637..abaec53ce2 100644 --- a/docs/website/docs/tutorial/sql-database.md +++ b/docs/website/docs/tutorial/sql-database.md @@ -42,7 +42,7 @@ After running this command, your project will have the following structure: Here’s what each file does: -- `sql_database_pipeline.py`: This is the main script where you'll define your data pipeline. It contains several different examples for how you can configure your SQL Database pipeline. +- `sql_database_pipeline.py`: This is the main script where you'll define your data pipeline. It contains several different examples of how you can configure your SQL Database pipeline. - `requirements.txt`: This file lists all the Python dependencies required for your project. - `.dlt/`: This directory contains the [configuration files](../general-usage/credentials/) for your project: - `secrets.toml`: This file stores your credentials, API keys, tokens, and other sensitive information. @@ -64,16 +64,19 @@ Running the script as it is will execute the function `load_standalone_table_res The following function will load the tables `family` and `genome`.
```py +import dlt +from dlt.sources.sql_database import sql_database + def load_tables_family_and_genome(): - # create a dlt source that will load tables "family" and "genome" + # Create a dlt source that will load tables "family" and "genome" source = sql_database().with_resources("family", "genome") # Create a dlt pipeline object pipeline = dlt.pipeline( - pipeline_name="sql_to_duckdb_pipeline", # custom name for the pipeline + pipeline_name="sql_to_duckdb_pipeline", # Custom name for the pipeline destination="duckdb", # dlt destination to which the data will be loaded - dataset_name="sql_to_duckdb_pipeline_data" # custom name for the dataset created in the destination + dataset_name="sql_to_duckdb_pipeline_data" # Custom name for the dataset created in the destination ) # Run the pipeline @@ -96,7 +99,7 @@ Explanation: ## 3. Add credentials -To sucessfully connect to your SQL database, you will need to pass credentials into your pipeline. dlt automatically looks for this information inside the generated TOML files. +To successfully connect to your SQL database, you will need to pass credentials into your pipeline. dlt automatically looks for this information inside the generated TOML files. Simply paste the [connection details](https://docs.rfam.org/en/latest/database.html) inside `secrets.toml` as follows: ```toml @@ -114,7 +117,7 @@ Alternatively, you can also paste the credentials as a connection string: sources.sql_database.credentials="mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam" ``` -For more details on the credentials format and other connection methods read the section on [configuring connection to the SQL Database](https://dlthub.com/docs/dlt-ecosystem/verified-sources/sql_database#credentials-format). +For more details on the credentials format and other connection methods, read the section on [configuring connection to the SQL Database](../dlt-ecosystem/verified-sources/sql_database#credentials-format). ## 4. 
Install dependencies @@ -138,7 +141,7 @@ After performing steps 1-4, you should now be able to successfully run the pipel ```sh python sql_database_pipeline.py ``` -This will create the file `sql_to_duckdb_pipeline.duckdb` in your dlt project directory which contains the loaded data. +This will create the file `sql_to_duckdb_pipeline.duckdb` in your dlt project directory, which contains the loaded data. ## 6. Explore the data @@ -154,14 +157,13 @@ Next, run the following command to launch the data browser app: dlt pipeline sql_to_duckdb_pipeline show ``` -You can explore the loaded data, run queries and see some pipeline execution details. +You can explore the loaded data, run queries, and see some pipeline execution details. ![streamlit-screenshot](https://storage.googleapis.com/dlt-blog-images/docs-sql-database-tutorial-streamlit-screenshot.png) ## 7. Append, replace, or merge loaded data -Try running the pipeline again with `python sql_database_pipeline.py`. You will notice that -all the tables have the data duplicated. This happens as dlt, by default, appends data to the destination tables in every load. This behavior can be adjusted by setting the `write_disposition` parameter inside the `pipeline.run()` method. The possible settings are: +Try running the pipeline again with `python sql_database_pipeline.py`. You will notice that all the tables have the data duplicated. This happens as dlt, by default, appends data to the destination tables in every load. This behavior can be adjusted by setting the `write_disposition` parameter inside the `pipeline.run()` method. The possible settings are: - `append`: Appends the data to the destination table. This is the default. - `replace`: Replaces the data in the destination table with the new data. @@ -172,6 +174,9 @@ all the tables have the data duplicated. 
This happens as dlt, by default, append To prevent the data from being duplicated in each row, set `write_disposition` to `replace`: ```py +import dlt +from dlt.sources.sql_database import sql_database + def load_tables_family_and_genome(): source = sql_database().with_resources("family", "genome") @@ -197,9 +202,12 @@ Run the pipeline again with `sql_database_pipeline.py`. This time, the data will When you want to update the existing data as new data is loaded, you can use the `merge` write disposition. This requires specifying a primary key for the table. The primary key is used to match the new data with the existing data in the destination table. -In the previous example, we set `write_disposition="replace"` inside `pipeline.run()` which caused all the tables to be loaded with `replace`. However, it's also possible to define the `write_disposition` strategy separately for each tables using the `apply_hints` method. In the example below, we use `apply_hints` on each table to specify different primary keys for merge: +In the previous example, we set `write_disposition="replace"` inside `pipeline.run()` which caused all the tables to be loaded with `replace`. However, it's also possible to define the `write_disposition` strategy separately for each table using the `apply_hints` method. In the example below, we use `apply_hints` on each table to specify different primary keys for merge: ```py +import dlt +from dlt.sources.sql_database import sql_database + def load_tables_family_and_genome(): source = sql_database().with_resources("family", "genome") @@ -224,11 +232,14 @@ if __name__ == '__main__': ## 8. Load data incrementally -Often you don't want to load the whole data in each load, but rather only the new or modified data. dlt makes this easy with [incremental loading](../general-usage/incremental-loading). +Often, you don't want to load the entire dataset in each load, but rather only the new or modified data. 
dlt makes this easy with [incremental loading](../general-usage/incremental-loading). In the example below, we configure the table `"family"` to load incrementally based on the column `"updated"`: ```py +import dlt +from dlt.sources.sql_database import sql_database + def load_tables_family_and_genome(): source = sql_database().with_resources("family", "genome") @@ -262,3 +273,4 @@ Interested in learning more about dlt? Here are some suggestions: - Learn more about the SQL Database source configuration in [the SQL Database source reference](../dlt-ecosystem/verified-sources/sql_database) - Learn more about different credential types in [Built-in credentials](../general-usage/credentials/complex_types#built-in-credentials) - Learn how to [create a custom source](./load-data-from-an-api.md) in the advanced tutorial + diff --git a/docs/website/docs/walkthroughs/create-a-pipeline.md b/docs/website/docs/walkthroughs/create-a-pipeline.md index d463921319..0aa253dc10 100644 --- a/docs/website/docs/walkthroughs/create-a-pipeline.md +++ b/docs/website/docs/walkthroughs/create-a-pipeline.md @@ -9,7 +9,7 @@ keywords: [how to, create a pipeline, rest client] This guide walks you through creating a pipeline that uses our [REST API Client](../general-usage/http/rest-client) to connect to [DuckDB](../dlt-ecosystem/destinations/duckdb). :::tip -We're using DuckDB as a destination here, but you can adapt the steps to any [source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/) and [destination](https://dlthub.com/docs/dlt-ecosystem/destinations/) by +We're using DuckDB as a destination here, but you can adapt the steps to any [source](../dlt-ecosystem/verified-sources/) and [destination](../dlt-ecosystem/destinations/) by using the [command](../reference/command-line-interface#dlt-init) `dlt init ` and tweaking the pipeline accordingly. 
::: diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-prefect.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-prefect.md index f0cc29da87..8f58ded0e6 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-prefect.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-prefect.md @@ -31,7 +31,7 @@ Here's a concise guide to orchestrating a `dlt` pipeline with Prefect using "Mov ### Here’s a summary of the steps followed: -1. Create a `dlt` pipeline. For detailed instructions on creating a pipeline, please refer to the [documentation](https://dlthub.com/docs/walkthroughs/create-a-pipeline). +1. Create a `dlt` pipeline. For detailed instructions on creating a pipeline, please refer to the [documentation](../create-a-pipeline). 1. Add `@task` decorator to the individual functions. 1. Here we use `@task` decorator for `get_users` function: diff --git a/docs/website/docs/walkthroughs/zendesk-weaviate.md b/docs/website/docs/walkthroughs/zendesk-weaviate.md index cc88e59433..31e8aed73e 100644 --- a/docs/website/docs/walkthroughs/zendesk-weaviate.md +++ b/docs/website/docs/walkthroughs/zendesk-weaviate.md @@ -14,7 +14,7 @@ For our example we will use "subject" and "description" fields from a ticket as ## Prerequisites -We're going to use some ready-made components from the [dlt ecosystem](https://dlthub.com/docs/dlt-ecosystem) to make this process easier: +We're going to use some ready-made components from the dlt ecosystem's [sources](../dlt-ecosystem/verified-sources) and [destinations](../dlt-ecosystem/destinations) to make this process easier: 1. A [Zendesk verified source](../dlt-ecosystem/verified-sources/zendesk.md) to extract the tickets from the API. 2. A [Weaviate destination](../dlt-ecosystem/destinations/weaviate.md) to load the data into a Weaviate instance.
diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 710c3ac57c..23c8d192ba 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -88,6 +88,18 @@ const sidebars = { items: [ 'dlt-ecosystem/verified-sources/rest_api/basic', 'dlt-ecosystem/verified-sources/rest_api/advanced', + { + type: 'category', + label: 'REST API helpers', + link: { + type: 'doc', + id: 'general-usage/http/overview', + }, + items: [ + 'general-usage/http/rest-client', + 'general-usage/http/requests', + ] + }, ] }, { @@ -137,18 +149,6 @@ const sidebars = { 'dlt-ecosystem/verified-sources/stripe', 'dlt-ecosystem/verified-sources/workable', 'dlt-ecosystem/verified-sources/zendesk', - { - type: 'category', - label: 'REST API helpers', - link: { - type: 'doc', - id: 'general-usage/http/overview', - }, - items: [ - 'general-usage/http/rest-client', - 'general-usage/http/requests', - ] - }, 'walkthroughs/add-a-verified-source', ] }, diff --git a/docs/website/tools/preprocess_docs.js b/docs/website/tools/preprocess_docs.js index c5c0c33246..e0088a9768 100644 --- a/docs/website/tools/preprocess_docs.js +++ b/docs/website/tools/preprocess_docs.js @@ -358,9 +358,61 @@ function syncExamples() { console.log(`Synced ${count} examples`) } -fs.rmSync(MD_TARGET_DIR, {force: true, recursive: true}) -syncExamples(); -preprocess_docs(); +// strings to search for, this check could be better but it +// is a quick fix +const HTTP_LINK = "](https://dlthub.com/docs"; +const ABS_LINK = "](/" +const ABS_IMG_LINK = "](/img" + +/** + * Inspect all md files and run some checks + */ +function checkDocs() { + let foundError = false; + for (const fileName of walkSync(MD_SOURCE_DIR)) { + if (!DOCS_EXTENSIONS.includes(path.extname(fileName))) { + continue + } + + // here we simply check that there are no absolute or devel links in the markdown files + let lines = fs.readFileSync(fileName, 'utf8').split(/\r?\n/); + + for (let [index, line] of lines.entries()) { + + const lineNo = index + 1; +
line = line.toLocaleLowerCase(); + + if (line.includes(ABS_LINK) && !line.includes(ABS_IMG_LINK)) { + foundError = true; + console.error(`Found absolute md link in file ${fileName}, line ${lineNo}`) + } + + if (line.includes(HTTP_LINK)) { + foundError = true; + console.error(`Found http md link referencing these docs in file ${fileName}, line ${lineNo}`) + } + + } + + + + } + + if (foundError) { + throw Error("Found one or more errors while checking docs.") + } + console.info("Found no errors in md files") +} + + +function processDocs() { + fs.rmSync(MD_TARGET_DIR, {force: true, recursive: true}) + syncExamples(); + preprocess_docs(); + checkDocs(); +} + +processDocs() /** * Watch for changes and preprocess the docs if --watch cli command flag is present @@ -373,10 +425,8 @@ if (process.argv.includes("--watch")) { if (Date.now() - lastUpdate < 500) { return; } - fs.rmSync(MD_TARGET_DIR, {force: true, recursive: true}) console.log('%s changed...', name); - syncExamples(); - preprocess_docs(); + processDocs(); lastUpdate = Date.now(); }); } \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 8f2ff58094..12c0d75d1e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2636,58 +2636,57 @@ dates = ["pytz (>=2019.1)"] [[package]] name = "duckdb" -version = "0.10.3" +version = "1.1.0" description = "DuckDB in-process database" optional = false python-versions = ">=3.7.0" files = [ - {file = "duckdb-0.10.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd25cc8d001c09a19340739ba59d33e12a81ab285b7a6bed37169655e1cefb31"}, - {file = "duckdb-0.10.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2f9259c637b917ca0f4c63887e8d9b35ec248f5d987c886dfc4229d66a791009"}, - {file = "duckdb-0.10.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b48f5f1542f1e4b184e6b4fc188f497be8b9c48127867e7d9a5f4a3e334f88b0"}, - {file = "duckdb-0.10.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:e327f7a3951ea154bb56e3fef7da889e790bd9a67ca3c36afc1beb17d3feb6d6"}, - {file = "duckdb-0.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d8b20ed67da004b4481973f4254fd79a0e5af957d2382eac8624b5c527ec48c"}, - {file = "duckdb-0.10.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d37680b8d7be04e4709db3a66c8b3eb7ceba2a5276574903528632f2b2cc2e60"}, - {file = "duckdb-0.10.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3d34b86d6a2a6dfe8bb757f90bfe7101a3bd9e3022bf19dbddfa4b32680d26a9"}, - {file = "duckdb-0.10.3-cp310-cp310-win_amd64.whl", hash = "sha256:73b1cb283ca0f6576dc18183fd315b4e487a545667ffebbf50b08eb4e8cdc143"}, - {file = "duckdb-0.10.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d917dde19fcec8cadcbef1f23946e85dee626ddc133e1e3f6551f15a61a03c61"}, - {file = "duckdb-0.10.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:46757e0cf5f44b4cb820c48a34f339a9ccf83b43d525d44947273a585a4ed822"}, - {file = "duckdb-0.10.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:338c14d8ac53ac4aa9ec03b6f1325ecfe609ceeb72565124d489cb07f8a1e4eb"}, - {file = "duckdb-0.10.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:651fcb429602b79a3cf76b662a39e93e9c3e6650f7018258f4af344c816dab72"}, - {file = "duckdb-0.10.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3ae3c73b98b6215dab93cc9bc936b94aed55b53c34ba01dec863c5cab9f8e25"}, - {file = "duckdb-0.10.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56429b2cfe70e367fb818c2be19f59ce2f6b080c8382c4d10b4f90ba81f774e9"}, - {file = "duckdb-0.10.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b46c02c2e39e3676b1bb0dc7720b8aa953734de4fd1b762e6d7375fbeb1b63af"}, - {file = "duckdb-0.10.3-cp311-cp311-win_amd64.whl", hash = "sha256:bcd460feef56575af2c2443d7394d405a164c409e9794a4d94cb5fdaa24a0ba4"}, - {file = "duckdb-0.10.3-cp312-cp312-macosx_10_9_universal2.whl", 
hash = "sha256:e229a7c6361afbb0d0ab29b1b398c10921263c52957aefe3ace99b0426fdb91e"}, - {file = "duckdb-0.10.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:732b1d3b6b17bf2f32ea696b9afc9e033493c5a3b783c292ca4b0ee7cc7b0e66"}, - {file = "duckdb-0.10.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f5380d4db11fec5021389fb85d614680dc12757ef7c5881262742250e0b58c75"}, - {file = "duckdb-0.10.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:468a4e0c0b13c55f84972b1110060d1b0f854ffeb5900a178a775259ec1562db"}, - {file = "duckdb-0.10.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fa1e7ff8d18d71defa84e79f5c86aa25d3be80d7cb7bc259a322de6d7cc72da"}, - {file = "duckdb-0.10.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ed1063ed97c02e9cf2e7fd1d280de2d1e243d72268330f45344c69c7ce438a01"}, - {file = "duckdb-0.10.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:22f2aad5bb49c007f3bfcd3e81fdedbc16a2ae41f2915fc278724ca494128b0c"}, - {file = "duckdb-0.10.3-cp312-cp312-win_amd64.whl", hash = "sha256:8f9e2bb00a048eb70b73a494bdc868ce7549b342f7ffec88192a78e5a4e164bd"}, - {file = "duckdb-0.10.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a6c2fc49875b4b54e882d68703083ca6f84b27536d57d623fc872e2f502b1078"}, - {file = "duckdb-0.10.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a66c125d0c30af210f7ee599e7821c3d1a7e09208196dafbf997d4e0cfcb81ab"}, - {file = "duckdb-0.10.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d99dd7a1d901149c7a276440d6e737b2777e17d2046f5efb0c06ad3b8cb066a6"}, - {file = "duckdb-0.10.3-cp37-cp37m-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5ec3bbdb209e6095d202202893763e26c17c88293b88ef986b619e6c8b6715bd"}, - {file = "duckdb-0.10.3-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:2b3dec4ef8ed355d7b7230b40950b30d0def2c387a2e8cd7efc80b9d14134ecf"}, - {file = 
"duckdb-0.10.3-cp37-cp37m-win_amd64.whl", hash = "sha256:04129f94fb49bba5eea22f941f0fb30337f069a04993048b59e2811f52d564bc"}, - {file = "duckdb-0.10.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:d75d67024fc22c8edfd47747c8550fb3c34fb1cbcbfd567e94939ffd9c9e3ca7"}, - {file = "duckdb-0.10.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f3796e9507c02d0ddbba2e84c994fae131da567ce3d9cbb4cbcd32fadc5fbb26"}, - {file = "duckdb-0.10.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:78e539d85ebd84e3e87ec44d28ad912ca4ca444fe705794e0de9be3dd5550c11"}, - {file = "duckdb-0.10.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a99b67ac674b4de32073e9bc604b9c2273d399325181ff50b436c6da17bf00a"}, - {file = "duckdb-0.10.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1209a354a763758c4017a1f6a9f9b154a83bed4458287af9f71d84664ddb86b6"}, - {file = "duckdb-0.10.3-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3b735cea64aab39b67c136ab3a571dbf834067f8472ba2f8bf0341bc91bea820"}, - {file = "duckdb-0.10.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:816ffb9f758ed98eb02199d9321d592d7a32a6cb6aa31930f4337eb22cfc64e2"}, - {file = "duckdb-0.10.3-cp38-cp38-win_amd64.whl", hash = "sha256:1631184b94c3dc38b13bce4045bf3ae7e1b0ecbfbb8771eb8d751d8ffe1b59b3"}, - {file = "duckdb-0.10.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:fb98c35fc8dd65043bc08a2414dd9f59c680d7e8656295b8969f3f2061f26c52"}, - {file = "duckdb-0.10.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7e75c9f5b6a92b2a6816605c001d30790f6d67ce627a2b848d4d6040686efdf9"}, - {file = "duckdb-0.10.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae786eddf1c2fd003466e13393b9348a44b6061af6fe7bcb380a64cac24e7df7"}, - {file = "duckdb-0.10.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9387da7b7973707b0dea2588749660dd5dd724273222680e985a2dd36787668"}, - {file = 
"duckdb-0.10.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:538f943bf9fa8a3a7c4fafa05f21a69539d2c8a68e557233cbe9d989ae232899"}, - {file = "duckdb-0.10.3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6930608f35025a73eb94252964f9f19dd68cf2aaa471da3982cf6694866cfa63"}, - {file = "duckdb-0.10.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:03bc54a9cde5490918aad82d7d2a34290e3dfb78d5b889c6626625c0f141272a"}, - {file = "duckdb-0.10.3-cp39-cp39-win_amd64.whl", hash = "sha256:372b6e3901d85108cafe5df03c872dfb6f0dbff66165a0cf46c47246c1957aa0"}, - {file = "duckdb-0.10.3.tar.gz", hash = "sha256:c5bd84a92bc708d3a6adffe1f554b94c6e76c795826daaaf482afc3d9c636971"}, + {file = "duckdb-1.1.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:5e4cbc408e6e41146dea89b9044dae7356e353db0c96b183e5583ee02bc6ae5d"}, + {file = "duckdb-1.1.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:6370ae27ec8167ccfbefb94f58ad9fdc7bac142399960549d6d367f233189868"}, + {file = "duckdb-1.1.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:4e1c3414f7fd01f4810dc8b335deffc91933a159282d65fef11c1286bc0ded04"}, + {file = "duckdb-1.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6bc2a58689adf5520303c5f68b065b9f980bd31f1366c541b8c7490abaf55cd"}, + {file = "duckdb-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d02be208d2885ca085d4c852b911493b8cdac9d6eae893259da32bd72a437c25"}, + {file = "duckdb-1.1.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:655df442ceebfc6f3fd6c8766e04b60d44dddedfa90275d794f9fab2d3180879"}, + {file = "duckdb-1.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:6e183729bb64be7798ccbfda6283ebf423c869268c25af2b56929e48f763be2f"}, + {file = "duckdb-1.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:61fb838da51e07ceb0222c4406b059b90e10efcc453c19a3650b73c0112138c4"}, + {file = 
"duckdb-1.1.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:7807e2f0d3344668e433f0dc1f54bfaddd410589611393e9a7ed56f8dec9514f"}, + {file = "duckdb-1.1.0-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:3da30b7b466f710d52caa1fdc3ef0bf4176ad7f115953cd9f8b0fbf0f723778f"}, + {file = "duckdb-1.1.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:b9b6a77ef0183f561b1fc2945fcc762a71570ffd33fea4e3a855d413ed596fe4"}, + {file = "duckdb-1.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16243e66a9fd0e64ee265f2634d137adc6593f54ddf3ef55cb8a29e1decf6e54"}, + {file = "duckdb-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42b910a149e00f40a1766dc74fa309d4255b912a5d2fdcc387287658048650f6"}, + {file = "duckdb-1.1.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:47849d546dc4238c0f20e95fe53b621aa5b08684e68fff91fd84a7092be91a17"}, + {file = "duckdb-1.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:11ec967b67159361ceade34095796a8d19368ea5c30cad988f44896b082b0816"}, + {file = "duckdb-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:510b5885ed6c267b9c0e1e7c6138fdffc2dd6f934a5a95b76da85da127213338"}, + {file = "duckdb-1.1.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:657bc7ac64d5faf069a782ae73afac51ef30ae2e5d0e09ce6a09d03db84ab35e"}, + {file = "duckdb-1.1.0-cp312-cp312-macosx_12_0_universal2.whl", hash = "sha256:89f3de8cba57d19b41cd3c47dd06d979bd2a2ffead115480e37afbe72b02896d"}, + {file = "duckdb-1.1.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:f6486323ab20656d22ffa8f3c6e109dde30d0b327b7c831f22ebcfe747f97fb0"}, + {file = "duckdb-1.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78a4510f82431ee3f14db689fe8727a4a9062c8f2fbb3bcfe3bfad3c1a198004"}, + {file = "duckdb-1.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64bf2a6e23840d662bd2ac09206a9bd4fa657418884d69e5c352d4456dc70b3c"}, + {file = 
"duckdb-1.1.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:23fc9aa0af74e3803ed90c8d98280fd5bcac8c940592bf6288e8fd60fb051d00"}, + {file = "duckdb-1.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1f3aea31341ce400640dd522e4399b941f66df17e39884f446638fe958d6117c"}, + {file = "duckdb-1.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:3db4ab31c20de4edaef152930836b38e7662cd71370748fdf2c38ba9cf854dc4"}, + {file = "duckdb-1.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3b6b4fe1edfe35f64f403a9f0ab75258cee35abd964356893ee37424174b7e4"}, + {file = "duckdb-1.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aad02f50d5a2020822d1638fc1a9bcf082056f11d2e15ccfc1c1ed4d0f85a3be"}, + {file = "duckdb-1.1.0-cp37-cp37m-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb66e9e7391801928ea134dcab12d2e4c97f2ce0391c603a3e480bbb15830bc8"}, + {file = "duckdb-1.1.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:069fb7bca459e31edb32a61f0eea95d7a8a766bef7b8318072563abf8e939593"}, + {file = "duckdb-1.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e39f9b7b62e64e10d421ff04480290a70129c38067d1a4f600e9212b10542c5a"}, + {file = "duckdb-1.1.0-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:55ef98bcc7ba745752607f1b926e8d9b7ce32c42c423bbad10c44820aefe23a7"}, + {file = "duckdb-1.1.0-cp38-cp38-macosx_12_0_universal2.whl", hash = "sha256:e2a08175e43b865c1e9611efd18cacd29ddd69093de442b1ebdf312071df7719"}, + {file = "duckdb-1.1.0-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:0e3644b1f034012d82b9baa12a7ea306fe71dc6623731b28c753c4a617ff9499"}, + {file = "duckdb-1.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:211a33c1ddb5cc609f75eb43772b0b03b45d2fa89bec107e4715267ca907806a"}, + {file = "duckdb-1.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8e74b6f8a5145abbf7e6c1a2a61f0adbcd493c19b358f524ec9a3cebdf362abb"}, + {file = 
"duckdb-1.1.0-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:58f1633dd2c5af5088ae2d119418e200855d0699d84f2fae9d46d30f404bcead"}, + {file = "duckdb-1.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:d18caea926b1e301c29b140418fca697aad728129e269b4f82c2795a184549e1"}, + {file = "duckdb-1.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:cd9fb1408942411ad360f8414bc3fbf0091c396ca903d947a10f2e31324d5cbd"}, + {file = "duckdb-1.1.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:bd11bc899cebf5ff936d1276a2dfb7b7db08aba3bcc42924afeafc2163bddb43"}, + {file = "duckdb-1.1.0-cp39-cp39-macosx_12_0_universal2.whl", hash = "sha256:53825a63193c582a78c152ea53de8d145744ddbeea18f452625a82ebc33eb14a"}, + {file = "duckdb-1.1.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:29dc18087de47563b3859a6b98bbed96e1c96ce5db829646dc3b16a916997e7d"}, + {file = "duckdb-1.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecb19319883564237a7a03a104dbe7f445e73519bb67108fcab3d19b6b91fe30"}, + {file = "duckdb-1.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aac2fcabe2d5072c252d0b3087365f431de812d8199705089fb073e4d039d19c"}, + {file = "duckdb-1.1.0-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d89eaaa5df8a57e7d2bc1f4c46493bb1fee319a00155f2015810ad2ace6570ae"}, + {file = "duckdb-1.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d86a6926313913cd2cc7e08816d3e7f72ba340adf2959279b1a80058be6526d9"}, + {file = "duckdb-1.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:d8333f3e85fa2a0f1c222b752c2bd42ea875235ff88492f7bcbb6867d0f644eb"}, + {file = "duckdb-1.1.0.tar.gz", hash = "sha256:b4d4c12b1f98732151bd31377753e0da1a20f6423016d2d097d2e31953ec7c23"}, ] [[package]] @@ -4450,6 +4449,76 @@ files = [ [package.dependencies] ansicon = {version = "*", markers = "platform_system == \"Windows\""} +[[package]] +name = "jiter" +version = "0.5.0" +description = "Fast iterable JSON parser." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "jiter-0.5.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:b599f4e89b3def9a94091e6ee52e1d7ad7bc33e238ebb9c4c63f211d74822c3f"}, + {file = "jiter-0.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2a063f71c4b06225543dddadbe09d203dc0c95ba352d8b85f1221173480a71d5"}, + {file = "jiter-0.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:acc0d5b8b3dd12e91dd184b87273f864b363dfabc90ef29a1092d269f18c7e28"}, + {file = "jiter-0.5.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c22541f0b672f4d741382a97c65609332a783501551445ab2df137ada01e019e"}, + {file = "jiter-0.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:63314832e302cc10d8dfbda0333a384bf4bcfce80d65fe99b0f3c0da8945a91a"}, + {file = "jiter-0.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a25fbd8a5a58061e433d6fae6d5298777c0814a8bcefa1e5ecfff20c594bd749"}, + {file = "jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:503b2c27d87dfff5ab717a8200fbbcf4714516c9d85558048b1fc14d2de7d8dc"}, + {file = "jiter-0.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6d1f3d27cce923713933a844872d213d244e09b53ec99b7a7fdf73d543529d6d"}, + {file = "jiter-0.5.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:c95980207b3998f2c3b3098f357994d3fd7661121f30669ca7cb945f09510a87"}, + {file = "jiter-0.5.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:afa66939d834b0ce063f57d9895e8036ffc41c4bd90e4a99631e5f261d9b518e"}, + {file = "jiter-0.5.0-cp310-none-win32.whl", hash = "sha256:f16ca8f10e62f25fd81d5310e852df6649af17824146ca74647a018424ddeccf"}, + {file = "jiter-0.5.0-cp310-none-win_amd64.whl", hash = "sha256:b2950e4798e82dd9176935ef6a55cf6a448b5c71515a556da3f6b811a7844f1e"}, + {file = "jiter-0.5.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = 
"sha256:d4c8e1ed0ef31ad29cae5ea16b9e41529eb50a7fba70600008e9f8de6376d553"}, + {file = "jiter-0.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c6f16e21276074a12d8421692515b3fd6d2ea9c94fd0734c39a12960a20e85f3"}, + {file = "jiter-0.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5280e68e7740c8c128d3ae5ab63335ce6d1fb6603d3b809637b11713487af9e6"}, + {file = "jiter-0.5.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:583c57fc30cc1fec360e66323aadd7fc3edeec01289bfafc35d3b9dcb29495e4"}, + {file = "jiter-0.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:26351cc14507bdf466b5f99aba3df3143a59da75799bf64a53a3ad3155ecded9"}, + {file = "jiter-0.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4829df14d656b3fb87e50ae8b48253a8851c707da9f30d45aacab2aa2ba2d614"}, + {file = "jiter-0.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a42a4bdcf7307b86cb863b2fb9bb55029b422d8f86276a50487982d99eed7c6e"}, + {file = "jiter-0.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04d461ad0aebf696f8da13c99bc1b3e06f66ecf6cfd56254cc402f6385231c06"}, + {file = "jiter-0.5.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e6375923c5f19888c9226582a124b77b622f8fd0018b843c45eeb19d9701c403"}, + {file = "jiter-0.5.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2cec323a853c24fd0472517113768c92ae0be8f8c384ef4441d3632da8baa646"}, + {file = "jiter-0.5.0-cp311-none-win32.whl", hash = "sha256:aa1db0967130b5cab63dfe4d6ff547c88b2a394c3410db64744d491df7f069bb"}, + {file = "jiter-0.5.0-cp311-none-win_amd64.whl", hash = "sha256:aa9d2b85b2ed7dc7697597dcfaac66e63c1b3028652f751c81c65a9f220899ae"}, + {file = "jiter-0.5.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9f664e7351604f91dcdd557603c57fc0d551bc65cc0a732fdacbf73ad335049a"}, + {file = "jiter-0.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:044f2f1148b5248ad2c8c3afb43430dccf676c5a5834d2f5089a4e6c5bbd64df"}, + {file = "jiter-0.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:702e3520384c88b6e270c55c772d4bd6d7b150608dcc94dea87ceba1b6391248"}, + {file = "jiter-0.5.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:528d742dcde73fad9d63e8242c036ab4a84389a56e04efd854062b660f559544"}, + {file = "jiter-0.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8cf80e5fe6ab582c82f0c3331df27a7e1565e2dcf06265afd5173d809cdbf9ba"}, + {file = "jiter-0.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:44dfc9ddfb9b51a5626568ef4e55ada462b7328996294fe4d36de02fce42721f"}, + {file = "jiter-0.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c451f7922992751a936b96c5f5b9bb9312243d9b754c34b33d0cb72c84669f4e"}, + {file = "jiter-0.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:308fce789a2f093dca1ff91ac391f11a9f99c35369117ad5a5c6c4903e1b3e3a"}, + {file = "jiter-0.5.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7f5ad4a7c6b0d90776fdefa294f662e8a86871e601309643de30bf94bb93a64e"}, + {file = "jiter-0.5.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ea189db75f8eca08807d02ae27929e890c7d47599ce3d0a6a5d41f2419ecf338"}, + {file = "jiter-0.5.0-cp312-none-win32.whl", hash = "sha256:e3bbe3910c724b877846186c25fe3c802e105a2c1fc2b57d6688b9f8772026e4"}, + {file = "jiter-0.5.0-cp312-none-win_amd64.whl", hash = "sha256:a586832f70c3f1481732919215f36d41c59ca080fa27a65cf23d9490e75b2ef5"}, + {file = "jiter-0.5.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:f04bc2fc50dc77be9d10f73fcc4e39346402ffe21726ff41028f36e179b587e6"}, + {file = "jiter-0.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f433a4169ad22fcb550b11179bb2b4fd405de9b982601914ef448390b2954f3"}, + {file = "jiter-0.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:ad4a6398c85d3a20067e6c69890ca01f68659da94d74c800298581724e426c7e"}, + {file = "jiter-0.5.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6baa88334e7af3f4d7a5c66c3a63808e5efbc3698a1c57626541ddd22f8e4fbf"}, + {file = "jiter-0.5.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ece0a115c05efca597c6d938f88c9357c843f8c245dbbb53361a1c01afd7148"}, + {file = "jiter-0.5.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:335942557162ad372cc367ffaf93217117401bf930483b4b3ebdb1223dbddfa7"}, + {file = "jiter-0.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:649b0ee97a6e6da174bffcb3c8c051a5935d7d4f2f52ea1583b5b3e7822fbf14"}, + {file = "jiter-0.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f4be354c5de82157886ca7f5925dbda369b77344b4b4adf2723079715f823989"}, + {file = "jiter-0.5.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5206144578831a6de278a38896864ded4ed96af66e1e63ec5dd7f4a1fce38a3a"}, + {file = "jiter-0.5.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8120c60f8121ac3d6f072b97ef0e71770cc72b3c23084c72c4189428b1b1d3b6"}, + {file = "jiter-0.5.0-cp38-none-win32.whl", hash = "sha256:6f1223f88b6d76b519cb033a4d3687ca157c272ec5d6015c322fc5b3074d8a5e"}, + {file = "jiter-0.5.0-cp38-none-win_amd64.whl", hash = "sha256:c59614b225d9f434ea8fc0d0bec51ef5fa8c83679afedc0433905994fb36d631"}, + {file = "jiter-0.5.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:0af3838cfb7e6afee3f00dc66fa24695199e20ba87df26e942820345b0afc566"}, + {file = "jiter-0.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:550b11d669600dbc342364fd4adbe987f14d0bbedaf06feb1b983383dcc4b961"}, + {file = "jiter-0.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:489875bf1a0ffb3cb38a727b01e6673f0f2e395b2aad3c9387f94187cb214bbf"}, + {file = "jiter-0.5.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:b250ca2594f5599ca82ba7e68785a669b352156260c5362ea1b4e04a0f3e2389"}, + {file = "jiter-0.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8ea18e01f785c6667ca15407cd6dabbe029d77474d53595a189bdc813347218e"}, + {file = "jiter-0.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:462a52be85b53cd9bffd94e2d788a09984274fe6cebb893d6287e1c296d50653"}, + {file = "jiter-0.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:92cc68b48d50fa472c79c93965e19bd48f40f207cb557a8346daa020d6ba973b"}, + {file = "jiter-0.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1c834133e59a8521bc87ebcad773608c6fa6ab5c7a022df24a45030826cf10bc"}, + {file = "jiter-0.5.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ab3a71ff31cf2d45cb216dc37af522d335211f3a972d2fe14ea99073de6cb104"}, + {file = "jiter-0.5.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cccd3af9c48ac500c95e1bcbc498020c87e1781ff0345dd371462d67b76643eb"}, + {file = "jiter-0.5.0-cp39-none-win32.whl", hash = "sha256:368084d8d5c4fc40ff7c3cc513c4f73e02c85f6009217922d0823a48ee7adf61"}, + {file = "jiter-0.5.0-cp39-none-win_amd64.whl", hash = "sha256:ce03f7b4129eb72f1687fa11300fbf677b02990618428934662406d2a76742a1"}, + {file = "jiter-0.5.0.tar.gz", hash = "sha256:1d916ba875bcab5c5f7d927df998c4cb694d27dceddf3392e58beaf10563368a"}, +] + [[package]] name = "jmespath" version = "1.0.1" @@ -5846,23 +5915,24 @@ sympy = "*" [[package]] name = "openai" -version = "1.35.3" +version = "1.45.1" description = "The official Python library for the openai API" optional = false python-versions = ">=3.7.1" files = [ - {file = "openai-1.35.3-py3-none-any.whl", hash = "sha256:7b26544cef80f125431c073ffab3811d2421fbb9e30d3bd5c2436aba00b042d5"}, - {file = "openai-1.35.3.tar.gz", hash = "sha256:d6177087f150b381d49499be782d764213fdf638d391b29ca692b84dd675a389"}, + {file = "openai-1.45.1-py3-none-any.whl", hash = 
"sha256:4a6cce402aec803ae57ae7eff4b5b94bf6c0e1703a8d85541c27243c2adeadf8"}, + {file = "openai-1.45.1.tar.gz", hash = "sha256:f79e384916b219ab2f028bbf9c778e81291c61eb0645ccfa1828a4b18b55d534"}, ] [package.dependencies] anyio = ">=3.5.0,<5" distro = ">=1.7.0,<2" httpx = ">=0.23.0,<1" +jiter = ">=0.4.0,<1" pydantic = ">=1.9.0,<3" sniffio = "*" tqdm = ">4" -typing-extensions = ">=4.7,<5" +typing-extensions = ">=4.11,<5" [package.extras] datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] @@ -9195,13 +9265,13 @@ files = [ [[package]] name = "typing-extensions" -version = "4.7.1" -description = "Backported and Experimental Type Hints for Python 3.7+" +version = "4.12.2" +description = "Backported and Experimental Type Hints for Python 3.8+" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, - {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, + {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, + {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, ] [[package]] @@ -9759,4 +9829,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "cf2b7cd45b7127328b25128320607b25a2c3b869f2ee6f79412fa12dc56441eb" +content-hash = "985bb75a9579b44a5f9fd029ade1cc77455b544f2e18f9741b1d0d89bd188537" diff --git a/pyproject.toml b/pyproject.toml index 5f60108e3d..2c47eb43c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "1.0.0" +version = "1.1.0" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to 
run." authors = ["dltHub Inc. "] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ] @@ -60,7 +60,7 @@ psycopg2cffi = {version = ">=2.9.0", optional = true, markers="platform_python_i grpcio = {version = ">=1.50.0", optional = true} google-cloud-bigquery = {version = ">=2.26.0", optional = true} pyarrow = {version = ">=12.0.0", optional = true} -duckdb = {version = ">=0.6.1,<0.11", optional = true} +duckdb = {version = ">=0.9", optional = true} # keep per-python version dependency as a reference # duckdb = [ # {version = ">=0.6.1,<0.10.0", python = ">=3.8,<3.12", optional = true}, @@ -238,7 +238,7 @@ alive-progress = ">=3.0.1" pyarrow = ">=14.0.0" psycopg2-binary = ">=2.9" lancedb = { version = ">=0.8.2", markers = "python_version >= '3.9'", allow-prereleases = true } -openai = ">=1.35" +openai = ">=1.45" connectorx = { version = ">=0.3.2" } [tool.black] # https://black.readthedocs.io/en/stable/usage_and_configuration/the_basics.html#configuration-via-a-file diff --git a/tests/common/configuration/test_credentials.py b/tests/common/configuration/test_credentials.py index 1c6319b551..9c09ccacd0 100644 --- a/tests/common/configuration/test_credentials.py +++ b/tests/common/configuration/test_credentials.py @@ -24,7 +24,7 @@ from dlt.destinations.impl.snowflake.configuration import SnowflakeCredentials from tests.utils import TEST_DICT_CONFIG_PROVIDER, preserve_environ from tests.common.utils import json_case_path -from tests.common.configuration.utils import environment +from tests.common.configuration.utils import ConnectionStringCompatCredentials, environment SERVICE_JSON = """ @@ -125,7 +125,7 @@ def test_connection_string_letter_case(environment: Any) -> None: def test_connection_string_resolved_from_native_representation(environment: Any) -> None: destination_dsn = "mysql+pymsql://localhost:5432/dlt_data" - c = ConnectionStringCredentials() + c = ConnectionStringCompatCredentials() 
c.parse_native_representation(destination_dsn) assert c.is_partial() assert not c.is_resolved() @@ -141,7 +141,7 @@ def test_connection_string_resolved_from_native_representation(environment: Any) assert c.password is None # password must resolve - c = ConnectionStringCredentials() + c = ConnectionStringCompatCredentials() c.parse_native_representation("mysql+pymsql://USER@/dlt_data") # not partial! password is optional assert not c.is_partial() diff --git a/tests/common/configuration/test_toml_provider.py b/tests/common/configuration/test_toml_provider.py index 3b16a930e6..a19aea8796 100644 --- a/tests/common/configuration/test_toml_provider.py +++ b/tests/common/configuration/test_toml_provider.py @@ -32,6 +32,7 @@ from tests.utils import preserve_environ from tests.common.configuration.utils import ( + ConnectionStringCompatCredentials, SecretCredentials, WithCredentialsConfiguration, CoercionTestConfiguration, @@ -150,12 +151,12 @@ def test_secrets_toml_credentials(environment: Any, toml_providers: ConfigProvid with pytest.raises(ConfigFieldMissingException): print(dict(resolve.resolve_configuration(GcpServiceAccountCredentialsWithoutDefaults()))) # also try postgres credentials - c2 = ConnectionStringCredentials() + c2 = ConnectionStringCompatCredentials() c2.update({"drivername": "postgres"}) c2 = resolve.resolve_configuration(c2, sections=("destination", "redshift")) assert c2.database == "destination.redshift.credentials" # bigquery credentials do not match redshift credentials - c3 = ConnectionStringCredentials() + c3 = ConnectionStringCompatCredentials() c3.update({"drivername": "postgres"}) with pytest.raises(ConfigFieldMissingException): resolve.resolve_configuration(c3, sections=("destination", "bigquery")) diff --git a/tests/common/configuration/utils.py b/tests/common/configuration/utils.py index 677ec3d329..c28f93e32b 100644 --- a/tests/common/configuration/utils.py +++ b/tests/common/configuration/utils.py @@ -21,6 +21,7 @@ from 
dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration from dlt.common.configuration.container import Container from dlt.common.configuration.providers import ConfigProvider, EnvironProvider +from dlt.common.configuration.specs.connection_string_credentials import ConnectionStringCredentials from dlt.common.configuration.utils import get_resolved_traces from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext from dlt.common.typing import TSecretValue, StrAny @@ -78,6 +79,12 @@ class SectionedConfiguration(BaseConfiguration): password: str = None +@configspec +class ConnectionStringCompatCredentials(ConnectionStringCredentials): + database: str = None + username: str = None + + @pytest.fixture(scope="function") def environment() -> Any: saved_environ = environ.copy() diff --git a/tests/destinations/test_custom_destination.py b/tests/destinations/test_custom_destination.py index 476e2f1b03..030362f80e 100644 --- a/tests/destinations/test_custom_destination.py +++ b/tests/destinations/test_custom_destination.py @@ -12,7 +12,10 @@ from dlt.common.schema import TTableSchema from dlt.common.data_writers.writers import TLoaderFileFormat from dlt.common.destination.reference import Destination -from dlt.common.destination.exceptions import InvalidDestinationReference +from dlt.common.destination.exceptions import ( + DestinationTransientException, + InvalidDestinationReference, +) from dlt.common.configuration.exceptions import ConfigFieldMissingException, ConfigurationValueError from dlt.common.configuration.specs import ConnectionStringCredentials from dlt.common.configuration.inject import get_fun_spec @@ -278,7 +281,7 @@ def test_sink(items: TDataItems, table: TTableSchema) -> None: if table_name in provoke_error: for item in items: if provoke_error[table_name] == item["id"]: - raise AssertionError("Oh no!") + raise DestinationTransientException("Oh no!") calls.setdefault(table_name, []).append(items) diff 
--git a/tests/helpers/dbt_tests/test_runner_dbt_versions.py b/tests/helpers/dbt_tests/test_runner_dbt_versions.py index a82345d732..1b5874ede1 100644 --- a/tests/helpers/dbt_tests/test_runner_dbt_versions.py +++ b/tests/helpers/dbt_tests/test_runner_dbt_versions.py @@ -79,10 +79,10 @@ def test_infer_venv_deps() -> None: # provide version ranges requirements = _create_dbt_deps(["duckdb"], dbt_version=">3") # special duckdb dependency - assert requirements[:-1] == ["dbt-core>3", "dbt-duckdb", "duckdb==0.10.3"] + assert requirements[:-1] == ["dbt-core>3", "dbt-duckdb", "duckdb==1.1.0"] # we do not validate version ranges, pip will do it and fail when creating venv requirements = _create_dbt_deps(["motherduck"], dbt_version="y") - assert requirements[:-1] == ["dbt-corey", "dbt-duckdb", "duckdb==0.10.3"] + assert requirements[:-1] == ["dbt-corey", "dbt-duckdb", "duckdb==1.1.0"] def test_default_profile_name() -> None: diff --git a/tests/load/filesystem/test_filesystem_common.py b/tests/load/filesystem/test_filesystem_common.py index 29ca1a2b57..0db93410e5 100644 --- a/tests/load/filesystem/test_filesystem_common.py +++ b/tests/load/filesystem/test_filesystem_common.py @@ -50,6 +50,7 @@ def test_filesystem_configuration() -> None: "bucket_url": "az://root", "credentials": None, "client_kwargs": None, + "max_state_files": 100, "kwargs": None, "deltalake_storage_options": None, } @@ -173,6 +174,7 @@ def test_filesystem_configuration_with_additional_arguments() -> None: "read_only": False, "bucket_url": "az://root", "credentials": None, + "max_state_files": 100, "kwargs": {"use_ssl": True}, "client_kwargs": {"verify": "public.crt"}, "deltalake_storage_options": {"AWS_S3_LOCKING_PROVIDER": "dynamodb"}, diff --git a/tests/load/pipeline/test_databricks_pipeline.py b/tests/load/pipeline/test_databricks_pipeline.py index 9d152bb099..2225d0001c 100644 --- a/tests/load/pipeline/test_databricks_pipeline.py +++ b/tests/load/pipeline/test_databricks_pipeline.py @@ -62,7 +62,7 @@ def 
test_databricks_external_location(destination_config: DestinationTestConfigu in pipeline.list_failed_jobs_in_package(info.loads_ids[0])[0].failed_message ) - # should fail on non existing stored credentials + # # should fail on non existing stored credentials bricks = databricks(is_staging_external_location=False, staging_credentials_name="CREDENTIAL_X") pipeline = destination_config.setup_pipeline( "test_databricks_external_location", @@ -78,8 +78,12 @@ def test_databricks_external_location(destination_config: DestinationTestConfigu # should fail on non existing stored credentials # auto stage with regular az:// used + principal_az_stage = filesystem(destination_name="fsazureprincipal") pipeline = destination_config.setup_pipeline( - "test_databricks_external_location", dataset_name=dataset_name, destination=bricks + "test_databricks_external_location", + dataset_name=dataset_name, + destination=bricks, + staging=principal_az_stage, ) info = pipeline.run([1, 2, 3], table_name="digits", **destination_config.run_kwargs) assert info.has_failed_jobs is True diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 92e927f438..11e0c88451 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -16,6 +16,7 @@ from dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.utils import uniq_id from dlt.common.schema.typing import TWriteDisposition +from dlt.common.configuration.exceptions import ConfigurationValueError from dlt.destinations import filesystem from dlt.destinations.impl.filesystem.filesystem import FilesystemClient from dlt.destinations.impl.filesystem.typing import TExtraPlaceholders @@ -1334,3 +1335,186 @@ def table_3(): # test truncate multiple fs_client.truncate_tables(["table_1", "table_3"]) assert load_table_counts(p, "table_1", "table_2", "table_3") == {"table_2": 21} + + +@pytest.mark.parametrize( + 
"destination_config", + destinations_configs(all_buckets_filesystem_configs=True), + ids=lambda x: x.name, +) +def test_cleanup_states_by_load_id(destination_config: DestinationTestConfiguration) -> None: + """ + Test the pipeline state cleanup functionality by verifying that old state files are removed based on `load_id` when multiple loads are executed. + + Specifically, the oldest state file (corresponding to the first `load_id`) should be deleted. + + This test checks that when running a pipeline with a resource that produces incremental data, older state files are cleared according to the `max_state_files` setting. + + Steps: + 1. Set `max_state_files` to 2, allowing only two newest state files to be kept. + 2. Run the pipeline three times. + 3. Verify that the state file from the first load is no longer present in the state table. + """ + + dataset_name = f"{destination_config.destination_name}{uniq_id()}" + p = destination_config.setup_pipeline("p1", dataset_name=dataset_name) + + @dlt.resource(name="items", primary_key="id") + def r1(_=dlt.sources.incremental("id")): + yield from [{"id": 0}] + + @dlt.resource(name="items", primary_key="id") + def r2(_=dlt.sources.incremental("id")): + yield from [{"id": 0}, {"id": 1}] + + @dlt.resource(name="items", primary_key="id") + def r3(_=dlt.sources.incremental("id")): + yield from [{"id": 0}, {"id": 1}, {"id": 2}] + + os.environ["DESTINATION__FILESYSTEM__MAX_STATE_FILES"] = str(2) + + info = p.run(r1) + first_load_id = info.loads_ids[0] + + info = p.run(r2) + second_load_id = [load_id for load_id in info.loads_ids if load_id != first_load_id][0] + + info = p.run(r3) + third_load_id = [ + load_id + for load_id in info.loads_ids + if load_id != first_load_id and load_id != second_load_id + ][0] + + client: FilesystemClient = p.destination_client() # type: ignore + state_table_files = list(client._list_dlt_table_files(client.schema.state_table_name, "p1")) + + assert not any(fileparts[1] == first_load_id for _, 
fileparts in state_table_files) + assert any(fileparts[1] == second_load_id for _, fileparts in state_table_files) + assert any(fileparts[1] == third_load_id for _, fileparts in state_table_files) + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(all_buckets_filesystem_configs=True), + ids=lambda x: x.name, +) +@pytest.mark.parametrize("max_state_files", [-1, 0, 1, 3]) +def test_cleanup_states( + destination_config: DestinationTestConfiguration, max_state_files: int +) -> None: + """ + Test the behavior of pipeline state cleanup based on different max_state_files configurations. + + Steps: + 1. Run the pipeline five times with max_state_files set to -1, 0, 1, and 3. + 2. Verify that state files are cleaned or retained according to the max_state_files setting: + - Negative or zero values disable cleanup. + - Positive values trigger cleanup, keeping only the specified number of state files. + """ + os.environ["DESTINATION__FILESYSTEM__MAX_STATE_FILES"] = str(max_state_files) + + dataset_name = f"{destination_config.destination_name}{uniq_id()}" + p = destination_config.setup_pipeline("p1", dataset_name=dataset_name) + + @dlt.resource(name="items", primary_key="id") + def r1(_=dlt.sources.incremental("id")): + yield from [{"id": 0}] + + @dlt.resource(name="items", primary_key="id") + def r2(_=dlt.sources.incremental("id")): + yield from [{"id": 0}, {"id": 1}] + + @dlt.resource(name="items", primary_key="id") + def r3(_=dlt.sources.incremental("id")): + yield from [{"id": 0}, {"id": 1}, {"id": 2}] + + @dlt.resource(name="items", primary_key="id") + def r4(_=dlt.sources.incremental("id")): + yield from [{"id": 0}, {"id": 1}, {"id": 2}, {"id": 3}] + + @dlt.resource(name="items", primary_key="id") + def r5(_=dlt.sources.incremental("id")): + yield from [{"id": 0}, {"id": 1}, {"id": 2}, {"id": 3}, {"id": 4}] + + # run pipeline + run_count = 5 + + p.run(r1) + p.run(r2) + p.run(r3) + p.run(r4) + p.run(r5) + + client: FilesystemClient = 
p.destination_client() # type: ignore + state_table_files = list(client._list_dlt_table_files(client.schema.state_table_name, "p1")) + + if max_state_files == -1 or max_state_files == 0: + assert len(state_table_files) == run_count + else: + assert len(state_table_files) == max_state_files + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(all_buckets_filesystem_configs=True), + ids=lambda x: x.name, +) +def test_cleanup_states_shared_dataset(destination_config: DestinationTestConfiguration) -> None: + """ + Test that two pipelines sharing the same bucket_url and dataset_name can independently + clean their _dlt_pipeline_state files with different max_state_files configurations. + + Steps: + 1. Run pipeline p1 five times with max_state_files set to 5. + 2. Run pipeline p2 five times with max_state_files set to 2. + 3. Verify that each pipeline only deletes its own state files and does not affect the other. + """ + dataset_name = f"{destination_config.destination_name}{uniq_id()}" + + p1 = destination_config.setup_pipeline("p1", dataset_name=dataset_name) + p2 = destination_config.setup_pipeline("p2", dataset_name=dataset_name) + + @dlt.resource(name="items", primary_key="id") + def r1(_=dlt.sources.incremental("id")): + yield from [{"id": 0}] + + @dlt.resource(name="items", primary_key="id") + def r2(_=dlt.sources.incremental("id")): + yield from [{"id": 0}, {"id": 1}] + + @dlt.resource(name="items", primary_key="id") + def r3(_=dlt.sources.incremental("id")): + yield from [{"id": 0}, {"id": 1}, {"id": 2}] + + @dlt.resource(name="items", primary_key="id") + def r4(_=dlt.sources.incremental("id")): + yield from [{"id": 0}, {"id": 1}, {"id": 2}, {"id": 3}] + + @dlt.resource(name="items", primary_key="id") + def r5(_=dlt.sources.incremental("id")): + yield from [{"id": 0}, {"id": 1}, {"id": 2}, {"id": 3}, {"id": 4}] + + os.environ["DESTINATION__FILESYSTEM__MAX_STATE_FILES"] = str(5) + p1.run(r1) + p1.run(r2) + p1.run(r3) + p1.run(r4) + 
p1.run(r5) + + os.environ["DESTINATION__FILESYSTEM__MAX_STATE_FILES"] = str(2) + p2.run(r1) + p2.run(r2) + p2.run(r3) + p2.run(r4) + p2.run(r5) + + p1_client: FilesystemClient = p1.destination_client() # type: ignore + p1_state_files = list(p1_client._list_dlt_table_files(p1_client.schema.state_table_name, "p1")) + + p2_client: FilesystemClient = p2.destination_client() # type: ignore + p2_state_files = list(p2_client._list_dlt_table_files(p2_client.schema.state_table_name, "p2")) + + assert len(p1_state_files) == 5 + + assert len(p2_state_files) == 2 diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py index 659bca6cb9..d064456c0d 100644 --- a/tests/load/pipeline/test_pipelines.py +++ b/tests/load/pipeline/test_pipelines.py @@ -48,7 +48,7 @@ destinations_configs, DestinationTestConfiguration, ) -from tests.load.pipeline.utils import REPLACE_STRATEGIES +from tests.load.pipeline.utils import REPLACE_STRATEGIES, skip_if_unsupported_replace_strategy # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -641,11 +641,7 @@ def test_dataset_name_change(destination_config: DestinationTestConfiguration) - def test_pipeline_upfront_tables_two_loads( destination_config: DestinationTestConfiguration, replace_strategy: str ) -> None: - if not destination_config.supports_merge and replace_strategy != "truncate-and-insert": - pytest.skip( - f"Destination {destination_config.name} does not support merge and thus" - f" {replace_strategy}" - ) + skip_if_unsupported_replace_strategy(destination_config, replace_strategy) # use staging tables for replace os.environ["DESTINATION__REPLACE_STRATEGY"] = replace_strategy diff --git a/tests/load/pipeline/test_replace_disposition.py b/tests/load/pipeline/test_replace_disposition.py index 82cef83019..569bb8ce33 100644 --- a/tests/load/pipeline/test_replace_disposition.py +++ b/tests/load/pipeline/test_replace_disposition.py @@ -9,7 +9,7 @@ destinations_configs, 
DestinationTestConfiguration, ) -from tests.load.pipeline.utils import REPLACE_STRATEGIES +from tests.load.pipeline.utils import REPLACE_STRATEGIES, skip_if_unsupported_replace_strategy @pytest.mark.essential @@ -24,11 +24,7 @@ def test_replace_disposition( destination_config: DestinationTestConfiguration, replace_strategy: str ) -> None: - if not destination_config.supports_merge and replace_strategy != "truncate-and-insert": - pytest.skip( - f"Destination {destination_config.name} does not support merge and thus" - f" {replace_strategy}" - ) + skip_if_unsupported_replace_strategy(destination_config, replace_strategy) # only allow 40 items per file os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "40" @@ -242,11 +238,7 @@ def load_items_none(): def test_replace_table_clearing( destination_config: DestinationTestConfiguration, replace_strategy: str ) -> None: - if not destination_config.supports_merge and replace_strategy != "truncate-and-insert": - pytest.skip( - f"Destination {destination_config.name} does not support merge and thus" - f" {replace_strategy}" - ) + skip_if_unsupported_replace_strategy(destination_config, replace_strategy) # use staging tables for replace os.environ["DESTINATION__REPLACE_STRATEGY"] = replace_strategy diff --git a/tests/load/pipeline/utils.py b/tests/load/pipeline/utils.py index 679c2d6da9..1a1324e59a 100644 --- a/tests/load/pipeline/utils.py +++ b/tests/load/pipeline/utils.py @@ -1 +1,19 @@ +import pytest + +from tests.load.utils import DestinationTestConfiguration + REPLACE_STRATEGIES = ["truncate-and-insert", "insert-from-staging", "staging-optimized"] + + +def skip_if_unsupported_replace_strategy( + destination_config: DestinationTestConfiguration, replace_strategy: str +): + """Skip test if destination does not support the given replace strategy.""" + supported_replace_strategies = ( + destination_config.raw_capabilities().supported_replace_strategies + ) + if not supported_replace_strategies or replace_strategy not in 
supported_replace_strategies: + pytest.skip( + f"Destination {destination_config.name} does not support the replace strategy" + f" {replace_strategy}" + ) diff --git a/tests/load/utils.py b/tests/load/utils.py index f443748f8e..19601f2cf1 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -35,6 +35,7 @@ DestinationClientStagingConfiguration, TDestinationReferenceArg, WithStagingDataset, + DestinationCapabilitiesContext, ) from dlt.common.destination import TLoaderFileFormat, Destination from dlt.common.destination.reference import DEFAULT_FILE_LAYOUT @@ -171,6 +172,10 @@ def destination_factory(self, **kwargs) -> Destination[Any, Any]: self.setup() return Destination.from_reference(dest_type, destination_name=dest_name, **kwargs) + def raw_capabilities(self) -> DestinationCapabilitiesContext: + dest = Destination.from_reference(self.destination_type) + return dest._raw_capabilities() + @property def name(self) -> str: name: str = self.destination_name or self.destination_type diff --git a/tests/sources/rest_api/configurations/source_configs.py b/tests/sources/rest_api/configurations/source_configs.py index 334bfdd230..8e26a4183b 100644 --- a/tests/sources/rest_api/configurations/source_configs.py +++ b/tests/sources/rest_api/configurations/source_configs.py @@ -1,10 +1,15 @@ from collections import namedtuple from typing import cast, List +import requests import dlt +import dlt.common from dlt.common.typing import TSecretStrValue from dlt.common.exceptions import DictValidationException from dlt.common.configuration.specs import configspec + +import dlt.sources.helpers +import dlt.sources.helpers.requests from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator from dlt.sources.helpers.rest_client.auth import OAuth2AuthBase @@ -108,6 +113,12 @@ class CustomOAuthAuth(OAuth2AuthBase): pass +@dlt.resource(name="repositories", selected=False) +def repositories(): + """A seed list of repositories to fetch""" + yield [{"name": "dlt"}, 
{"name": "verified-sources"}, {"name": "dlthub-education"}] + + VALID_CONFIGS: List[RESTAPIConfig] = [ { "client": {"base_url": "https://api.example.com"}, @@ -304,6 +315,44 @@ class CustomOAuthAuth(OAuth2AuthBase): }, ], }, + { + "client": {"base_url": "https://github.com/api/v2"}, + "resources": [ + { + "name": "issues", + "endpoint": { + "path": "dlt-hub/{repository}/issues/", + "params": { + "repository": { + "type": "resolve", + "resource": "repositories", + "field": "name", + }, + }, + }, + }, + repositories(), + ], + }, + { + "client": {"base_url": "https://github.com/api/v2"}, + "resources": [ + { + "name": "issues", + "endpoint": { + "path": "dlt-hub/{repository}/issues/", + "params": { + "repository": { + "type": "resolve", + "resource": "repositories", + "field": "name", + }, + }, + }, + }, + repositories(), + ], + }, ] diff --git a/tests/sources/rest_api/configurations/test_configuration.py b/tests/sources/rest_api/configurations/test_configuration.py index 0167ea1eb8..6adbfc5175 100644 --- a/tests/sources/rest_api/configurations/test_configuration.py +++ b/tests/sources/rest_api/configurations/test_configuration.py @@ -401,3 +401,71 @@ def test_resource_defaults_no_params() -> None: "per_page": 50, "sort": "updated", } + + +def test_accepts_DltResource_in_resources() -> None: + @dlt.resource(selected=False) + def repositories(): + """A seed list of repositories to fetch""" + yield [{"name": "dlt"}, {"name": "verified-sources"}, {"name": "dlthub-education"}] + + config: RESTAPIConfig = { + "client": {"base_url": "https://github.com/api/v2"}, + "resources": [ + { + "name": "issues", + "endpoint": { + "path": "dlt-hub/{repository}/issues/", + "params": { + "repository": { + "type": "resolve", + "resource": "repositories", + "field": "name", + }, + }, + }, + }, + repositories(), + ], + } + + source = rest_api_source(config) + assert list(source.resources.keys()) == ["repositories", "issues"] + assert list(source.selected_resources.keys()) == ["issues"] + + 
+def test_resource_defaults_dont_apply_to_DltResource() -> None: + @dlt.resource() + def repositories(): + """A seed list of repositories to fetch""" + yield [{"name": "dlt"}, {"name": "verified-sources"}, {"name": "dlthub-education"}] + + config: RESTAPIConfig = { + "client": {"base_url": "https://github.com/api/v2"}, + "resource_defaults": { + "write_disposition": "replace", + }, + "resources": [ + { + "name": "issues", + "endpoint": { + "path": "dlt-hub/{repository}/issues/", + "params": { + "repository": { + "type": "resolve", + "resource": "repositories", + "field": "name", + }, + }, + }, + }, + repositories(), + ], + } + + source = rest_api_source(config) + assert source.resources["issues"].write_disposition == "replace" + assert source.resources["repositories"].write_disposition != "replace", ( + "DltResource defined outside of the RESTAPIConfig object is influenced by the content of" + " the RESTAPIConfig" + ) diff --git a/tests/sources/rest_api/conftest.py b/tests/sources/rest_api/conftest.py index 8ef4e41255..7f20dc2252 100644 --- a/tests/sources/rest_api/conftest.py +++ b/tests/sources/rest_api/conftest.py @@ -141,6 +141,16 @@ def post_detail_404(request, context): context.status_code = 404 return {"error": "Post not found"} + @router.get(r"/posts/\d+/some_details_204") + def post_detail_204(request, context): + """Return 204 No Content for post with id > 0. 
Used to test ignoring 204 responses.""" + post_id = int(request.url.split("/")[-2]) + if post_id < 1: + return {"id": post_id, "body": f"Post body {post_id}"} + else: + context.status_code = 204 + return None + @router.get(r"/posts_under_a_different_key$") def posts_with_results_key(request, context): return paginate_by_page_number(request, generate_posts(), records_key="many-results") diff --git a/tests/sources/rest_api/integration/test_offline.py b/tests/sources/rest_api/integration/test_offline.py index 2c1f48537b..57cffc99d0 100644 --- a/tests/sources/rest_api/integration/test_offline.py +++ b/tests/sources/rest_api/integration/test_offline.py @@ -2,7 +2,7 @@ from unittest import mock import pytest -from requests import Request, Response +from requests import Request, Response, Session import dlt from dlt.common import pendulum @@ -140,6 +140,46 @@ def test_ignoring_endpoint_returning_404(mock_api_server): ] +def test_ignoring_endpoint_returning_204(mock_api_server): + mock_source = rest_api_source( + { + "client": {"base_url": "https://api.example.com"}, + "resources": [ + "posts", + { + "name": "post_details", + "endpoint": { + "path": "posts/{post_id}/some_details_204", + "params": { + "post_id": { + "type": "resolve", + "resource": "posts", + "field": "id", + } + }, + "response_actions": [ + { + "status_code": 204, + "action": "ignore", + }, + ], + }, + }, + ], + } + ) + + res = list(mock_source.with_resources("posts", "post_details").add_limit(1)) + + assert res[:5] == [ + {"id": 0, "body": "Post body 0"}, + {"id": 0, "title": "Post 0"}, + {"id": 1, "title": "Post 1"}, + {"id": 2, "title": "Post 2"}, + {"id": 3, "title": "Post 3"}, + ] + + def test_source_with_post_request(mock_api_server): class JSONBodyPageCursorPaginator(BaseReferencePaginator): def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: @@ -327,3 +367,72 @@ def test_posts_with_inremental_date_conversion(mock_api_server) -> None: _, called_kwargs = 
mock_paginate.call_args_list[0] assert called_kwargs["params"] == {"since": "1970-01-01", "until": "1970-01-02"} assert called_kwargs["path"] == "posts" + + +def test_multiple_response_actions_on_every_response(mock_api_server, mocker): + class CustomSession(Session): + pass + + def send_spy(*args, **kwargs): + return original_send(*args, **kwargs) + + my_session = CustomSession() + original_send = my_session.send + mocked_send = mocker.patch.object(my_session, "send", side_effect=send_spy) + + source = rest_api_source( + { + "client": { + "base_url": "https://api.example.com", + "session": my_session, + }, + "resources": [ + { + "name": "posts", + }, + ], + } + ) + + list(source.with_resources("posts").add_limit(1)) + + mocked_send.assert_called_once() + assert mocked_send.call_args[0][0].url == "https://api.example.com/posts" + + +def test_DltResource_gets_called(mock_api_server, mocker) -> None: + @dlt.resource() + def post_list(): + yield [{"id": "0"}, {"id": "1"}, {"id": "2"}] + + config: RESTAPIConfig = { + "client": {"base_url": "http://api.example.com/"}, + "resource_defaults": { + "write_disposition": "replace", + }, + "resources": [ + { + "name": "posts", + "endpoint": { + "path": "posts/{post_id}/comments", + "params": { + "post_id": { + "type": "resolve", + "resource": "post_list", + "field": "id", + }, + }, + }, + }, + post_list(), + ], + } + + RESTClient = dlt.sources.helpers.rest_client.RESTClient + with mock.patch.object(RESTClient, "paginate") as mock_paginate: + source = rest_api_source(config) + _ = list(source) + assert mock_paginate.call_count == 3 + for i in range(3): + _, kwargs = mock_paginate.call_args_list[i] + assert kwargs["path"] == f"posts/{i}/comments"