Google Sheets minimal example
info
The source code for this example can be found in our repository at: https://github.com/dlt-hub/dlt/tree/devel/docs/examples/google_sheets
About this Example
In this example, you'll find a Python script that demonstrates how to load Google Sheets data using the dlt library.
We'll learn how to:
- use built-in credentials;
- use union of credentials;
- create dynamically generated resources.
tip
This example is for educational purposes. For best practices, we recommend using Google Sheets verified source.
Full source code
# NOTE: this line is only for dlt CI purposes, you may delete it if you are using this example
__source_name__ = "google_sheets"
from typing import Any, Iterator, Sequence, Union, cast
from googleapiclient.discovery import build
import dlt
from dlt.common.configuration.specs import (
    GcpOAuthCredentials,
    GcpServiceAccountCredentials,
)
from dlt.common.typing import DictStrAny, StrAny
def _initialize_sheets(
    credentials: Union[GcpOAuthCredentials, GcpServiceAccountCredentials]
) -> Any:
    # Build the service object.
    service = build("sheets", "v4", credentials=credentials.to_native_credentials())
    return service
@dlt.source
def google_spreadsheet(
    spreadsheet_id: str,
    sheet_names: Sequence[str],
    credentials: Union[
        GcpServiceAccountCredentials, GcpOAuthCredentials, str, StrAny
    ] = dlt.secrets.value,
) -> Any:
    sheets = _initialize_sheets(cast(GcpServiceAccountCredentials, credentials))
    def get_sheet(sheet_name: str) -> Iterator[DictStrAny]:
        # get list of list of typed values
        result = (
            sheets.spreadsheets()
            .values()
            .get(
                spreadsheetId=spreadsheet_id,
                range=sheet_name,
                # unformatted returns typed values
                valueRenderOption="UNFORMATTED_VALUE",
                # will return formatted dates
                dateTimeRenderOption="FORMATTED_STRING",
            )
            .execute()
        )
        # pprint.pprint(result)
        values = result.get("values")
        # yield dicts assuming row 0 contains headers and following rows values and all rows have identical length
        for v in values[1:]:
            yield {h: v for h, v in zip(values[0], v)}
    # create resources from supplied sheet names
    return [
        dlt.resource(get_sheet(name), name=name, write_disposition="replace")
        for name in sheet_names
    ]
if __name__ == "__main__":
    pipeline = dlt.pipeline(destination="duckdb")
    # see example.secrets.toml to where to put credentials
    sheet_id = "1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580"
    range_names = ["hidden_columns_merged_cells", "Blank Columns"]
    # "2022-05", "model_metadata"
    load_info = pipeline.run(
        google_spreadsheet(
            spreadsheet_id=sheet_id,
            sheet_names=range_names,
        )
    )
    print(load_info)
    row_counts = pipeline.last_trace.last_normalize_info.row_counts
    print(row_counts.keys())
    assert row_counts["hidden_columns_merged_cells"] == 7
    assert row_counts["blank_columns"] == 21