Move table import from init.py to tabular/import_.py
olsen232 committed Jun 15, 2022
1 parent a395097 commit c252f29
Showing 3 changed files with 342 additions and 332 deletions.
3 changes: 2 additions & 1 deletion kart/cli.py
@@ -29,7 +29,7 @@
"data": {"data"},
"diff": {"diff"},
"fsck": {"fsck"},
"init": {"import", "init"},
"init": {"init"},
"log": {"log"},
"merge": {"merge"},
"meta": {"commit-files", "meta"},
Expand All @@ -40,6 +40,7 @@
"status": {"status"},
"query": {"query"},
"upgrade": {"upgrade"},
"tabular.import_": {"import"},
"point_cloud.import_": {"point-cloud-import"},
}
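This mapping in kart/cli.py associates each lazily-imported module with the commands it provides, which is why moving the `import` implementation also moves its entry from "init" to "tabular.import_". Below is a minimal, hypothetical sketch (not Kart's actual code) of how such a module-to-commands mapping can drive lazy command loading in Click; the class and lookup helper are assumptions, only the mapping entries come from the diff above.

import importlib

import click

# Hypothetical standalone copy of the mapping above: module name -> commands it defines.
MODULE_COMMANDS = {
    "init": {"init"},
    "tabular.import_": {"import"},
    "point_cloud.import_": {"point-cloud-import"},
}


class LazyGroup(click.Group):
    # Defer importing a command's module until that command is actually invoked.
    def list_commands(self, ctx):
        return sorted(c for cmds in MODULE_COMMANDS.values() for c in cmds)

    def get_command(self, ctx, name):
        for module_name, commands in MODULE_COMMANDS.items():
            if name in commands:
                module = importlib.import_module(f"kart.{module_name}")
                # Assumes the module exposes its commands as click.Command attributes.
                for attr in vars(module).values():
                    if isinstance(attr, click.Command) and attr.name == name:
                        return attr
        return None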

333 changes: 2 additions & 331 deletions kart/init.py
@@ -2,348 +2,19 @@
from pathlib import Path

import click
from osgeo import gdal

from kart import is_windows

from .cli_util import JsonFromFile, MutexOption, StringFromFile, call_and_exit_flag
from .cli_util import StringFromFile
from .core import check_git_user
from .dataset_util import validate_dataset_paths
from .exceptions import InvalidOperation
from .fast_import import FastImportSettings, ReplaceExisting, fast_import_tables
from .key_filters import RepoKeyFilter
from .fast_import import FastImportSettings, fast_import_tables
from .repo import KartRepo, PotentialRepo
from .spatial_filter import SpatialFilterString, spatial_filter_help_text
from .tabular.import_source import TableImportSource
from .tabular.ogr_import_source import FORMAT_TO_OGR_MAP
from .tabular.pk_generation import PkGeneratingTableImportSource
from .working_copy import PartType


def list_import_formats(ctx):
"""
List the supported import formats
"""
click.echo("Geopackage: PATH.gpkg")
click.echo("PostgreSQL: postgresql://HOST/DBNAME[/DBSCHEMA]")
click.echo("SQL Server: mssql://HOST/DBNAME[/DBSCHEMA]")
click.echo("MySQL: mysql://HOST[/DBNAME]")

ogr_types = set()
for prefix, ogr_driver_name in FORMAT_TO_OGR_MAP.items():
d = gdal.GetDriverByName(ogr_driver_name)
if d:
m = d.GetMetadata()
# only vector formats which can read things.
if m.get("DCAP_VECTOR") == "YES" and m.get("DCAP_OPEN") == "YES":
ogr_types.add(prefix)

if "SHP" in ogr_types:
click.echo("Shapefile: PATH.shp")


class GenerateIDsFromFile(StringFromFile):
name = "ids"

def convert(self, value, param, ctx):
fp = super().convert(
value,
param,
ctx,
# Get the file object, so we don't have to read the whole thing
as_file=True,
)
return (line.rstrip("\n") for line in fp)


def any_at_all(iterable):
    # Returns True if anything exists in iterable - even falsey values like None or 0.
    # Advances the iterable by one if non-empty, so is best used like so:
    # >> if any_at_all(iterable): raise Error("Iterable should be empty.")
    return any(True for _ in iterable)
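A quick illustration of the helper's behaviour described in the comments above (illustrative only, not part of this diff):

assert any_at_all(iter([])) is False        # empty iterable
assert any_at_all(iter([0, None])) is True  # falsey items still count
# Side effect: one item of a non-empty iterator has now been consumed.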


@click.command("import")
@click.pass_context
@click.argument("source")
@click.argument(
"tables",
nargs=-1,
)
@click.option(
"--all-tables",
"-a",
help="Import all tables from the source.",
is_flag=True,
cls=MutexOption,
exclusive_with=["do_list", "tables"],
)
@click.option(
"--message",
"-m",
type=StringFromFile(encoding="utf-8"),
help="Commit message. By default this is auto-generated.",
)
@click.option(
"--table-info",
type=JsonFromFile(
encoding="utf-8",
schema={
"type": "object",
"$schema": "http://json-schema.org/draft-07/schema",
"patternProperties": {
".*": {
"type": "object",
"properties": {
"title": {"type": "string"},
"description": {"type": "string"},
"xmlMetadata": {"type": "string"},
},
}
},
},
),
default="{}",
help=(
"Specifies overrides for imported tables, in a nested JSON object. \n"
"Each key is a dataset name, and each value is an object. \n"
"Valid overrides are 'title', 'description' and 'xmlMetadata'.\n"
"""e.g.: --table-info='{"land_parcels": {"title": "Land Parcels 1:50k"}'\n"""
"To import from a file, prefix with `@`, e.g. `[email protected]`"
    ),
)
@click.option(
    "--list",
    "do_list",
    is_flag=True,
    help="List all tables present in the source path",
    cls=MutexOption,
    exclusive_with=["all_tables", "tables"],
)
@call_and_exit_flag(
    "--list-formats",
    callback=list_import_formats,
    help="List available import formats, and then exit",
)
@click.option(
    "--output-format",
    "-o",
    type=click.Choice(["text", "json"]),
    default="text",
)
@click.option(
    "--primary-key",
    help="Which field to use as the primary key. Must be unique. Auto-detected when possible.",
)
@click.option(
    "--replace-existing",
    is_flag=True,
    help="Replace existing dataset(s) of the same name.",
)
@click.option(
    "--replace-ids",
    type=GenerateIDsFromFile(encoding="utf-8"),
    help=(
        "Replace only features with the given IDs. IDs should be given one-per-line. "
"Use file arguments ([email protected]). Implies --replace-existing. "
"Requires the dataset to have a primary key, unless the value given is an empty "
"string (replaces no features)"
),
)
@click.option(
"--similarity-detection-limit",
hidden=True,
type=click.INT,
default=10000,
help=(
"When replacing an existing dataset where primary keys are auto-generated: the maximum number of unmatched "
"features to search through for similar features, so that primary keys can be reassigned for features that "
"are similar but have had minor edits. Zero means that no similarity detection is performed. (Advanced users only)"
),
)
@click.option(
"--allow-empty",
is_flag=True,
default=False,
help=(
"Usually recording a commit that has the exact same tree as its sole "
"parent commit is a mistake, and the command prevents you from making "
"such a commit. This option bypasses the safety"
),
)
@click.option(
"--max-delta-depth",
hidden=True,
default=0,
type=click.INT,
help="--depth option to git-fast-import (advanced users only)",
)
@click.option(
"--checkout/--no-checkout",
"do_checkout",
is_flag=True,
default=True,
help="Whether to create a working copy once the import is finished, if no working copy exists yet.",
)
@click.option(
"--num-processes",
type=click.INT,
help="How many git-fast-import processes to use. Defaults to the number of available CPU cores.",
)
def import_(
    ctx,
    all_tables,
    message,
    do_list,
    output_format,
    primary_key,
    source,
    tables,
    table_info,
    replace_existing,
    replace_ids,
    similarity_detection_limit,
    allow_empty,
    max_delta_depth,
    do_checkout,
    num_processes,
):
    """
    Import data into a repository.
    $ kart import SOURCE [TABLE_SPEC] [TABLE_SPEC]
    SOURCE: Import from dataset: "FORMAT:PATH" eg. "GPKG:my.gpkg"
    TABLE_SPEC: Import a particular table, optionally with a new name: "TABLE[:AS_NAME]"
    eg. "2019_08_06_median_waterlevel:waterlevel"
    $ kart import GPKG:my.gpkg [table1[:new_name1]] [table2[:new_name2]]
    To show available tables in the import data, use
    $ kart import --list GPKG:my.gpkg
    """

if output_format == "json" and not do_list:
raise click.UsageError(
"Illegal usage: '--output-format=json' only supports --list"
)
if do_list:
TableImportSource.open(source).print_table_list(do_json=output_format == "json")
return

repo = ctx.obj.repo
check_git_user(repo)

base_import_source = TableImportSource.open(source)
if all_tables:
tables = base_import_source.get_tables().keys()
elif not tables:
tables = [base_import_source.prompt_for_table("Select a table to import")]

import_sources = []
for table in tables:
if ":" in table:
table, dest_path = table.split(":", 1)
if not dest_path:
raise click.BadParameter("Invalid table name", param_hint="tables")
else:
dest_path = None

meta_overrides = table_info.get(dest_path or table, {})
if is_windows and dest_path:
dest_path = dest_path.replace("\\", "/") # git paths use / as a delimiter

if "xmlMetadata" in meta_overrides:
meta_overrides["metadata.xml"] = meta_overrides.pop("xmlMetadata")
import_source = base_import_source.clone_for_table(
table,
dest_path=dest_path,
primary_key=primary_key,
meta_overrides=meta_overrides,
)

        if replace_ids is not None:
            if repo.table_dataset_version < 2:
                raise InvalidOperation(
                    f"--replace-ids is not supported for V{repo.table_dataset_version} datasets"
                )
            if not import_source.schema.pk_columns:
                # non-PK datasets can use this if it's only ever an empty list.
                if any_at_all(replace_ids):
                    raise InvalidOperation(
                        "--replace-ids requires an import source with a primary key"
                    )

            replace_existing = True

        if replace_existing:
            if repo.table_dataset_version < 2:
                raise InvalidOperation(
                    f"--replace-existing is not supported for V{repo.table_dataset_version} datasets"
                )
            try:
                existing_ds = repo.datasets()[import_source.dest_path]
            except KeyError:
                # no such existing dataset. no problems
                pass
            else:
                # Align the column IDs to the existing schema.
                # This is important, otherwise importing the same data twice
                # will result in a new schema object, and thus a new blob for every feature.
                # Note that alignment works better if we add the generated-pk-column first (when needed),
                # if one schema has this and the other lacks it they will be harder to align.
                import_source = PkGeneratingTableImportSource.wrap_source_if_needed(
                    import_source,
                    repo,
                    similarity_detection_limit=similarity_detection_limit,
                )
                import_source.align_schema_to_existing_schema(existing_ds.schema)
                if (
                    import_source.schema.legend.pk_columns
                    != existing_ds.schema.legend.pk_columns
                    and replace_ids is not None
                ):
                    raise InvalidOperation(
                        "--replace-ids is not supported when the primary key column is being changed"
                    )
        import_sources.append(import_source)

    TableImportSource.check_valid(import_sources, param_hint="tables")

    new_ds_paths = [s.dest_path for s in import_sources]
    if replace_existing:
        validate_dataset_paths(new_ds_paths)
    else:
        old_ds_paths = [ds.path for ds in repo.datasets()]
        validate_dataset_paths(old_ds_paths + new_ds_paths)

    replace_existing_enum = (
        ReplaceExisting.GIVEN if replace_existing else ReplaceExisting.DONT_REPLACE
    )
    fast_import_tables(
        repo,
        import_sources,
        settings=FastImportSettings(
            num_processes=num_processes, max_delta_depth=max_delta_depth
        ),
        verbosity=ctx.obj.verbosity + 1,
        message=message,
        replace_existing=replace_existing_enum,
        from_commit=repo.head_commit,
        replace_ids=replace_ids,
        allow_empty=allow_empty,
    )

    # During imports we can keep old changes since they won't conflict with newly imported datasets.
    parts_to_create = [PartType.TABULAR] if do_checkout else []
    repo.working_copy.reset_to_head(
        repo_key_filter=RepoKeyFilter.datasets(new_ds_paths),
        create_parts_if_missing=parts_to_create,
    )
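To make the --table-info option above concrete, here is a sketch of building an overrides file and passing it to the command; the file name, dataset name and override values are placeholders, not taken from this diff:

import json

# Overrides for a hypothetical dataset named "land_parcels"; per the schema above,
# the valid keys are "title", "description" and "xmlMetadata".
table_info = {
    "land_parcels": {
        "title": "Land Parcels 1:50k",
        "description": "Cadastral parcels at 1:50,000 scale",
    }
}

with open("table-info.json", "w", encoding="utf-8") as f:
    json.dump(table_info, f, indent=2)

# Then, for example:
#   kart import GPKG:my.gpkg land_parcels --table-info=@table-info.json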


@click.command()
@click.pass_context
@click.argument(
