diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1e8f787..21a162e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -12,7 +12,7 @@ jobs: - name: update apt run: sudo apt update - name: install dependencies - run: sudo apt install -y cmake gcc g++ + run: sudo apt install -y cmake gcc g++ libbz2-dev - name: cmake run: mkdir build && cd build && cmake .. - name: make @@ -29,7 +29,7 @@ jobs: - name: update apt run: sudo apt update - name: install dependencies - run: sudo apt install -y cmake gcc g++ + run: sudo apt install -y cmake gcc g++ libbz2-dev - name: cmake run: mkdir build && cd build && cmake .. - name: make @@ -46,7 +46,7 @@ jobs: - name: update apt run: sudo apt update - name: install dependencies - run: sudo apt install -y cmake clang + run: sudo apt install -y cmake clang libbz2-dev - name: cmake run: mkdir build && cd build && cmake .. shell: bash @@ -67,7 +67,7 @@ jobs: - name: update apt run: sudo apt update - name: install dependencies - run: sudo apt install -y cmake clang + run: sudo apt install -y cmake clang libbz2-dev - name: cmake run: mkdir build && cd build && cmake .. shell: bash @@ -86,7 +86,7 @@ jobs: - name: Checkout submodules run: git submodule update --init --recursive - name: install dependencies - run: brew install cmake + run: brew install cmake lbzip2 - name: cmake run: mkdir build && cd build && cmake .. - name: make @@ -101,7 +101,7 @@ jobs: - name: Checkout submodules run: git submodule update --init --recursive - name: install dependencies - run: brew install cmake + run: brew install cmake lbzip2 - name: cmake run: mkdir build && cd build && cmake .. 
- name: make diff --git a/evaluation/Makefile b/evaluation/Makefile new file mode 100644 index 0000000..29ccf43 --- /dev/null +++ b/evaluation/Makefile @@ -0,0 +1,206 @@ +POSTGRES_USER = postgres +POSTGRES_DB = spatialjoin_db +SPATIALJOIN_EVAL_SCRIPT = spatialjoin-evaluation.py +SPATIALJOIN = spatialjoin +SPATIALJOIN_ARGS = --num-threads 2 --num-caches 2 --no-oriented-envelope # BCSDoi with 2 threads +POSTGRES_TIMEOUT = 10h + +DATA_DIR = . + +QUERY_1_POSTGRES = SELECT COUNT(*) FROM classes AS a, classes AS b WHERE a.class = 'highway' AND b.id = 'rel:2171347' AND ST_Contains(b.geom, a.geom) +QUERY_2_POSTGRES = SELECT COUNT(*) FROM classes AS a, classes AS b WHERE a.class = 'highway' AND b.id = 'rel:51477' AND ST_Contains(b.geom, a.geom) +QUERY_3_POSTGRES = SELECT COUNT(*) FROM classes AS a, classes AS b WHERE a.class = 'building' AND b.class = 'power' AND b.type = 'line' AND ST_Intersects(a.geom, b.geom) +QUERY_4_POSTGRES = SELECT COUNT(*) FROM classes AS a, classes AS b WHERE a.class = 'highway' AND b.class = 'highway' AND a.type = 'residential' AND b.type = 'residential' AND ST_Intersects(a.geom, b.geom) + +# TODO: QUERY 5, Number of postboxes by country + +.PHONY: eval help tables check classes-table eval-queries eval-self-joins eval-non-self-joins + +.PRECIOUS: %.tsv %.tsv.gz $(DATA_DIR)/%.tsv $(DATA_DIR)/%.tsv.gz + +.SECONDEXPANSION: + +help: + @echo "spatialjoin evaluation script\n" + @echo "Supported datasets" + @echo " region-freiburg, region-finland, region-germany, region-ohm-planet, region-osm-planet" + @echo "\nGeneral targets\n" + @echo " make check\n check PostgreSQL/PostGIS and spatialjoin installation" + @echo " make eval\n run entire evaluation" + @echo "\nIndividual targets\n" + @echo " make <DATASET>-table\n prepare PostGIS table for <DATASET>" + @echo " make eval-self-join-<DATASET>-postgres\n run self-join evaluation on <DATASET> for PostGIS" + @echo " make eval-self-join-<DATASET>-spatialjoin\n run self-join evaluation on <DATASET> for spatialjoin" + @echo " make classes-table\n create a table 'classes' containing all objects of predefined classes" + @echo " make 
eval-query-<QUERYID>\n evaluate query QUERYID (1, 2, 3, 4) against Postgres" + @echo " make eval-combinations-<DATASET>-spatialjoin\n run self-evaluation for spatialjoin on <DATASET>" + +check: + @echo -n "Data dir for exports: " + @realpath $(DATA_DIR) + @echo -n "PostgreSQL user: " + @echo $(POSTGRES_USER) + @echo -n "PostgreSQL database: " + @echo $(POSTGRES_DB) + @echo -n "PostgreSQL query timeout: " + @echo $(POSTGRES_TIMEOUT) + @echo -n "PostgreSQL data directory: " + @psql -U $(POSTGRES_USER) -tA -c "SHOW data_directory;" + @echo -n "PostgreSQL working memory: " + @psql -U $(POSTGRES_USER) -tA -c "SHOW work_mem;" + @echo -n "PostgreSQL max processes: " + @psql -U $(POSTGRES_USER) -tA -c "SHOW max_worker_processes;" + @echo -n "PostgreSQL max workers: " + @psql -U $(POSTGRES_USER) -tA -c "SHOW max_parallel_workers;" + @echo -n "PostgreSQL version: " + @psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -tA -c "SELECT version();" | cut -d ' ' -f2 + @echo -n "PostGIS version: " + @psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -tA -c "SELECT PostGIS_Version();" + @echo -n "spatialjoin binary: " + @echo $(SPATIALJOIN) + @echo -n "spatialjoin version: " + @$(SPATIALJOIN) --version + @echo -n "spatialjoin eval script: " + @[ -f $(SPATIALJOIN_EVAL_SCRIPT) ] && realpath $(SPATIALJOIN_EVAL_SCRIPT) || echo " NOT FOUND" + +$(DATA_DIR)/region-osm-planet.tsv.gz: + curl -s https://qlever.cs.uni-freiburg.de/api/osm-planet -H "Accept: text/csv" -H "Content-type: application/sparql-query" --data "PREFIX geo: <http://www.opengis.net/ont/geosparql#> PREFIX ogc: <http://www.opengis.net/rdf#> PREFIX osmrel: <https://www.openstreetmap.org/relation/> SELECT ?osm_id ?geometry WHERE { ?osm_id geo:hasGeometry/geo:asWKT ?geometry }" | sed 's/,/\t/;s|https://www.openstreetmap.org/|osm|;s|/|:|;s/"//g' | gzip -1 > $@ + +$(DATA_DIR)/region-freiburg.tsv.gz: + curl -s https://qlever.cs.uni-freiburg.de/api/osm-planet -H "Accept: text/csv" -H "Content-type: application/sparql-query" --data "PREFIX geo: <http://www.opengis.net/ont/geosparql#> PREFIX ogc: <http://www.opengis.net/rdf#> PREFIX osmrel: <https://www.openstreetmap.org/relation/> SELECT ?osm_id ?geometry WHERE { osmrel:62768 ogc:sfContains ?osm_id . 
?osm_id geo:hasGeometry/geo:asWKT ?geometry }" | sed 's/,/\t/;s|https://www.openstreetmap.org/|osm|;s|/|:|;s/"//g' | gzip -1 > $@ + +$(DATA_DIR)/region-finland.tsv.gz: + curl -s https://qlever.cs.uni-freiburg.de/api/osm-planet -H "Accept: text/csv" -H "Content-type: application/sparql-query" --data "PREFIX geo: <http://www.opengis.net/ont/geosparql#> PREFIX ogc: <http://www.opengis.net/rdf#> PREFIX osmrel: <https://www.openstreetmap.org/relation/> SELECT ?osm_id ?geometry WHERE { osmrel:54224 ogc:sfContains ?osm_id . ?osm_id geo:hasGeometry/geo:asWKT ?geometry }" | sed 's/,/\t/;s|https://www.openstreetmap.org/|osm|;s|/|:|;s/"//g' | gzip -1 > $@ + +$(DATA_DIR)/region-germany.tsv.gz: + curl -s https://qlever.cs.uni-freiburg.de/api/osm-planet -H "Accept: text/csv" -H "Content-type: application/sparql-query" --data "PREFIX geo: <http://www.opengis.net/ont/geosparql#> PREFIX ogc: <http://www.opengis.net/rdf#> PREFIX osmrel: <https://www.openstreetmap.org/relation/> SELECT ?osm_id ?geometry WHERE { osmrel:51477 ogc:sfContains ?osm_id . ?osm_id geo:hasGeometry/geo:asWKT ?geometry }" | sed 's/,/\t/;s|https://www.openstreetmap.org/|osm|;s|/|:|;s/"//g' | gzip -1 > $@ + +$(DATA_DIR)/region-ohm-planet.tsv.gz: + curl -s https://qlever.cs.uni-freiburg.de/api/ohm-planet -H "Accept: text/csv" -H "Content-type: application/sparql-query" --data "PREFIX geo: <http://www.opengis.net/ont/geosparql#> PREFIX ogc: <http://www.opengis.net/rdf#> PREFIX osmrel: <https://www.openstreetmap.org/relation/> SELECT ?osm_id ?geometry WHERE { ?osm_id geo:hasGeometry/geo:asWKT ?geometry }" | sed 's/,/\t/;s|https://www.openstreetmap.org/|osm|;s|/|:|;s/"//g' | gzip -1 > $@ + +region-%-table: $(DATA_DIR)/region-%.tsv.gz + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "CREATE TABLE IF NOT EXISTS \"region-$*\" (id VARCHAR PRIMARY KEY, geom GEOMETRY);" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "CREATE TABLE IF NOT EXISTS \"region-$*_loader\" (id VARCHAR, geom_text VARCHAR);" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "DELETE FROM \"region-$*\";" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "DELETE FROM \"region-$*_loader\";" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "\copy \"region-$*_loader\" FROM PROGRAM 'gzip -dc $(shell realpath $^)' WITH (FORMAT csv, DELIMITER E'\t', HEADER true);" + @# filter invalid 
single-point LINESTRINGs here, they are still present in the old OHM QLever instance + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "INSERT INTO \"region-$*\" (id, geom) SELECT id, ST_GeomFromText(geom_text, 4326) FROM \"region-$*_loader\" WHERE NOT starts_with(geom_text, 'LINESTRING') OR POSITION(',' IN geom_text) > 0;" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "DROP table \"region-$*_loader\";" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -tA -c "SELECT COUNT(*) FROM \"region-$*\";" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "CREATE INDEX IF NOT EXISTS \"region-$*_geom_idx\" ON \"region-$*\" USING GIST (geom);" + +$(DATA_DIR)/class-%.tsv.gz: + curl -s https://qlever.cs.uni-freiburg.de/api/osm-planet -H "Accept: text/csv" -H "Content-type: application/sparql-query" --data "PREFIX osm: <https://www.openstreetmap.org/> PREFIX geo: <http://www.opengis.net/ont/geosparql#> PREFIX ogc: <http://www.opengis.net/rdf#> PREFIX osmrel: <https://www.openstreetmap.org/relation/> PREFIX osmkey: <https://www.openstreetmap.org/wiki/Key:> SELECT (REPLACE(REPLACE(STR(?osm_id_), STR(osm:), \"osm\"), \"/\", \":\") AS ?osm_id) (REPLACE(STR(osmkey:$*), STR(osmkey:), \"\") AS ?predicate) ?type ?geometry WHERE { { SELECT ?osm_id_ (SAMPLE(?type_) AS ?type) WHERE { ?osm_id_ osmkey:$* ?type_ } GROUP BY ?osm_id_ } ?osm_id_ geo:hasGeometry/geo:asWKT ?geometry }" | sed 's/,/\t/;s/,/\t/;s/,/\t/;s|https://www.openstreetmap.org/|osm|;s|/|:|;s/"//g' | sed 's/"//g;s/\^\^$$//' | gzip -1 > $@ + +classes-table: $(DATA_DIR)/class-building.tsv.gz $(DATA_DIR)/class-highway.tsv.gz $(DATA_DIR)/class-amenity.tsv.gz $(DATA_DIR)/class-power.tsv.gz + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "DROP TABLE IF EXISTS classes; CREATE TABLE classes (id VARCHAR PRIMARY KEY, class VARCHAR, type VARCHAR, geom GEOMETRY);" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "DROP TABLE IF EXISTS classes_loader; CREATE TABLE classes_loader (id VARCHAR, class VARCHAR, type VARCHAR, geom_text VARCHAR);" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "COPY classes_loader FROM PROGRAM 'gzip -dc $(shell realpath $(DATA_DIR)/class-building.tsv.gz)' WITH (FORMAT csv, DELIMITER E'\t', HEADER true);" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "COPY classes_loader FROM PROGRAM 
'gzip -dc $(shell realpath $(DATA_DIR)/class-highway.tsv.gz)' WITH (FORMAT csv, DELIMITER E'\t', HEADER true);" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "COPY classes_loader FROM PROGRAM 'gzip -dc $(shell realpath $(DATA_DIR)/class-amenity.tsv.gz)' WITH (FORMAT csv, DELIMITER E'\t', HEADER true);" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "COPY classes_loader FROM PROGRAM 'gzip -dc $(shell realpath $(DATA_DIR)/class-power.tsv.gz)' WITH (FORMAT csv, DELIMITER E'\t', HEADER true);" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "INSERT INTO classes (id, class, type, geom) SELECT DISTINCT ON (id) id, class, type, ST_GeomFromText(geom_text, 4326) FROM classes_loader;" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "DROP table classes_loader;" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "SELECT COUNT(*) FROM classes;" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "CREATE INDEX IF NOT EXISTS classes_geom_idx ON classes USING GIST (geom);" + +$(DATA_DIR)/static-residential-streets.tsv.gz: + curl https://ad-publications.cs.uni-freiburg.de/SIGSPATIAL_spatialjoin_BBKL_2024.materials/residential-streets.tsv.bz2 | bunzip2 -c | gzip -1 > $@ + +$(DATA_DIR)/static-residential-streets.1.tsv.gz: + curl https://ad-publications.cs.uni-freiburg.de/SIGSPATIAL_spatialjoin_BBKL_2024.materials/residential-streets.tsv.bz2 | bunzip2 -c | sed 's/\t/\t1\t/' | gzip -1 > $@ + +$(DATA_DIR)/static-%.tsv.gz: + curl https://ad-publications.cs.uni-freiburg.de/SIGSPATIAL_spatialjoin_BBKL_2024.materials/$*.tsv | gzip -1 > $@ + +$(DATA_DIR)/static-%.1.tsv.gz: + curl https://ad-publications.cs.uni-freiburg.de/SIGSPATIAL_spatialjoin_BBKL_2024.materials/$*.tsv | sed 's/\t/\t1\t/' | gzip -1 > $@ + +static-%-table: $(DATA_DIR)/static-%.tsv.gz + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "CREATE TABLE IF NOT EXISTS \"static-$*\" (id VARCHAR PRIMARY KEY, geom GEOMETRY);" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "CREATE TABLE IF NOT EXISTS \"static-$*_loader\" (id VARCHAR, geom_text VARCHAR);" + psql -U 
$(POSTGRES_USER) -d $(POSTGRES_DB) -c "DELETE FROM \"static-$*\";" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "DELETE FROM \"static-$*_loader\";" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "\copy \"static-$*_loader\" FROM PROGRAM 'gzip -dc $(shell realpath $^)' WITH (FORMAT csv, DELIMITER E'\t', HEADER true);" + @# filter invalid single-point LINESTRINGs here, they are still present in the old OHM QLever instance + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "INSERT INTO \"static-$*\" (id, geom) SELECT id, ST_GeomFromText(geom_text, 4326) FROM \"static-$*_loader\" WHERE NOT starts_with(geom_text, 'LINESTRING') OR POSITION(',' IN geom_text) > 0;" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "DROP table \"static-$*_loader\";" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -tA -c "SELECT COUNT(*) FROM \"static-$*\";" + psql -U $(POSTGRES_USER) -d $(POSTGRES_DB) -c "CREATE INDEX IF NOT EXISTS \"static-$*_geom_idx\" ON \"static-$*\" USING GIST (geom);" + +eval-self-join-%-postgres: + @echo + @echo ++ Starting postgres full self-join evaluation for \'$*\': + @psql -q -U $(POSTGRES_USER) -d $(POSTGRES_DB) -tA -c "SELECT FROM \"$*\" LIMIT 1" > /dev/null 2>&1 || (echo "ERROR: Table $* does not yet exist, run 'make $*-table' first\\n";false) + @echo Postgres full self-join candidates for \'$*\': + @psql -q -U $(POSTGRES_USER) -d $(POSTGRES_DB) -tA -c "\timing" -c "SET statement_timeout = '$(POSTGRES_TIMEOUT)'; SELECT COUNT(*)::text || ' rows retrieved' FROM \"$*\" AS a, \"$*\" AS b WHERE a.geom && b.geom;" || true + @echo Postgres full self-join on ST_Intersects for \'$*\': + @psql -q -U $(POSTGRES_USER) -d $(POSTGRES_DB) -tA -c "\timing" -c "SET statement_timeout = '$(POSTGRES_TIMEOUT)'; SELECT COUNT(*)::text || ' rows retrieved' FROM \"$*\" AS a, \"$*\" AS b WHERE ST_Intersects(a.geom, b.geom);" || true + +eval-self-join-%-spatialjoin: $(DATA_DIR)/%.spatialjoin-input.tsv + @echo + @echo ++ Starting spatialjoin full self-join evaluation for \'$*\': + @echo 
spatialjoin full self-join candidates for \'$*\': + @$(SPATIALJOIN) $(SPATIALJOIN_ARGS) --no-geometry-checks < $< > /dev/null 2> .spatialjoin-$*.log + @grep "Total predicate generation time" .spatialjoin-$*.log | sed "s/.* Total predicate generation time (without parsing): \([0-9s\.]*\)/\1/g" + @echo spatialjoin full self-join for \'$*\': + @$(SPATIALJOIN) $(SPATIALJOIN_ARGS) < $< > /dev/null 2> .spatialjoin-$*.log + @grep "Total predicate generation time" .spatialjoin-$*.log | sed "s/.* Total predicate generation time (without parsing): \([0-9s\.]*\)/\1/g" + @rm .spatialjoin-$*.log + +%.spatialjoin-input.tsv: $(DATA_DIR)/%.tsv.gz + if gzip -dc $< | head -n1 | wc -w | grep -q 4; then gzip -dc $< | cut -f 1,4 | tail -n +2; else gzip -dc $< | tail -n +2; fi > $@ + +eval-combinations-%-spatialjoin: %.spatialjoin-input.tsv + @echo + @echo ++ Starting spatialjoin self-evaluation for \'$*\': + @./$(SPATIALJOIN_EVAL_SCRIPT) $* --spatialjoin $(SPATIALJOIN) --combinations bcsdoi,Bcsdoi,BCsdoi,BCSdoi,BCSDoi,BCSdOi,BCSdoI | tee $*.spatialjoin-evaluation.tsv + @echo + @echo ++ Analyzing spatialjoin self-evaluation for \'$*\': + @./$(SPATIALJOIN_EVAL_SCRIPT) $* --spatialjoin $(SPATIALJOIN) --combinations bcsdoi,Bcsdoi,BCsdoi,BCSdoi,BCSDoi,BCSdOi,BCSdoI --analyze total --minutes + +eval-self-join-%: eval-self-join-%-spatialjoin eval-self-join-%-postgres + @echo + +eval-query-%: + @echo + @echo ++ Starting postgres evaluation for query $*: + @echo "(Query is: '$(QUERY_$*_POSTGRES)' )" + @psql -q -U $(POSTGRES_USER) -d $(POSTGRES_DB) -tA -c "SELECT FROM classes LIMIT 1" > /dev/null 2>&1 || (echo "ERROR: Table classes does not yet exist, run 'make classes-table' first\\n";false) + @echo Postgres result size and time: + @psql -q -U $(POSTGRES_USER) -d $(POSTGRES_DB) -tA -c "\timing" -c "SET statement_timeout = '$(POSTGRES_TIMEOUT)'; $(QUERY_$*_POSTGRES);" || true + +eval-non-self-join-%-postgres: + @echo + @echo ++ Starting postgres evaluation for non-self join $(word 
1,$(subst _, ,$*)) vs $(word 2,$(subst _, ,$*)) + @psql -q -U $(POSTGRES_USER) -d $(POSTGRES_DB) -tA -c "SELECT FROM \"static-$(word 1,$(subst _, ,$*))\" LIMIT 1" > /dev/null 2>&1 || (echo "ERROR: Table static-$(word 1,$(subst _, ,$*)) does not yet exist, run 'make static-$(word 1,$(subst _, ,$*))-table' first\\n";false) + @psql -q -U $(POSTGRES_USER) -d $(POSTGRES_DB) -tA -c "SELECT FROM \"static-$(word 2,$(subst _, ,$*))\" LIMIT 1" > /dev/null 2>&1 || (echo "ERROR: Table static-$(word 2,$(subst _, ,$*)) does not yet exist, run 'make static-$(word 2,$(subst _, ,$*))-table' first\\n";false) + @echo Postgres candidates for non-self join $(word 1,$(subst _, ,$*)) vs $(word 2,$(subst _, ,$*)): + @psql -q -U $(POSTGRES_USER) -d $(POSTGRES_DB) -tA -c "\timing" -c "SET statement_timeout = '$(POSTGRES_TIMEOUT)'; SELECT COUNT(*)::text || ' rows retrieved' FROM \"static-$(word 1,$(subst _, ,$*))\" AS a, \"static-$(word 2,$(subst _, ,$*))\" AS b WHERE a.geom && b.geom;" || true + @echo Postgres full ST_Intersects for non-self join $(word 1,$(subst _, ,$*)) vs $(word 2,$(subst _, ,$*)): + @psql -q -U $(POSTGRES_USER) -d $(POSTGRES_DB) -tA -c "\timing" -c "SET statement_timeout = '$(POSTGRES_TIMEOUT)'; SELECT COUNT(*)::text || ' rows retrieved' FROM \"static-$(word 1,$(subst _, ,$*))\" AS a, \"static-$(word 2,$(subst _, ,$*))\" AS b WHERE ST_Intersects(a.geom, b.geom);" || true + +eval-non-self-join-%-spatialjoin: $(DATA_DIR)/static-$$(word 1,$$(subst _, , %)).tsv.gz $(DATA_DIR)/static-$$(word 2,$$(subst _, , %)).1.tsv.gz + @echo + @echo ++ Starting spatialjoin evaluation for non-self join $(word 1,$(subst _, ,$*)) vs $(word 2,$(subst _, ,$*)) + @echo spatialjoin candidates for non-self join $(word 1,$(subst _, ,$*)) vs $(word 2,$(subst _, ,$*)): + @zcat $^ | $(SPATIALJOIN) $(SPATIALJOIN_ARGS) --no-geometry-checks > /dev/null 2> .spatialjoin-$*.log + @grep "Total predicate generation time" .spatialjoin-$*.log | sed "s/.* Total predicate generation time (without parsing): 
\([0-9s\.]*\)/\1/g" + @echo spatialjoin full non-self join $(word 1,$(subst _, ,$*)) vs $(word 2,$(subst _, ,$*)): + @zcat $^ | $(SPATIALJOIN) $(SPATIALJOIN_ARGS) > /dev/null 2> .spatialjoin-$*.log + @grep "Total predicate generation time" .spatialjoin-$*.log | sed "s/.* Total predicate generation time (without parsing): \([0-9s\.]*\)/\1/g" + @rm .spatialjoin-$*.log + +eval-non-self-join-%: eval-non-self-join-%-spatialjoin eval-non-self-join-%-postgres + @# + +tables: region-freiburg-table region-germany-table region-finland-table region-ohm-planet-table region-osm-planet-table classes-table static-restaurants-table static-transit-stops-table static-residential-streets-table static-powerlines-table static-administrative-regions-table + +eval-queries: eval-query-1 eval-query-2 eval-query-3 eval-query-4 + +eval-self-joins: eval-self-join-region-ohm-planet eval-self-join-region-finland eval-self-join-region-germany eval-self-join-region-osm-planet + +eval-non-self-joins: eval-non-self-join-restaurants_transit-stops eval-non-self-join-residential-streets_administrative-regions eval-non-self-join-residential-streets_residential-streets eval-non-self-join-powerlines_residential-streets + +eval: eval-combinations-region-osm-planet eval-self-joins eval-non-self-joins eval-queries diff --git a/evaluation/README.md b/evaluation/README.md new file mode 100644 index 0000000..f35eac5 --- /dev/null +++ b/evaluation/README.md @@ -0,0 +1,87 @@ +# Evaluation instructions and results + +We evaluated the performance of our spatial join and compared it against +PostgreSQL+PostGIS. In the following sections, we provide instructions and +results for the evaluation. + +## Setup PostgreSQL and PostGIS on Ubuntu 24.04 + +Install the required packages: + + +``` +sudo apt update +sudo apt install postgresql postgresql-contrib postgis postgresql-16-postgis-3 gdal-bin +``` + +Next, create a new database storage in a directory of your choice. 
+``` +export POSTGIS_DIR=/local/data-ssd/postgis/spatialjoin +sudo mkdir -p ${POSTGIS_DIR} && sudo chown postgres:postgres ${POSTGIS_DIR} +sudo -u postgres /usr/lib/postgresql/16/bin/initdb -D ${POSTGIS_DIR} +sudo vim ${POSTGIS_DIR}/postgresql.conf +``` +In the file `${POSTGIS_DIR}/postgresql.conf`, set the following: +``` +work_mem = 4MB +max_worker_processes = 8 +max_parallel_workers_per_gather = 2 +max_parallel_workers = 8 +``` +Afterwards, restart Postgres with the selected database storage directory: + +``` +sudo su - postgres -c "/usr/lib/postgresql/16/bin/pg_ctl -D ${POSTGIS_DIR} -l logfile start" +``` + +## Create a database + +Create a database `spatialjoin_db` and enable PostGIS. +``` +sudo su - postgres -c "createdb spatialjoin_db" +psql -U postgres -d spatialjoin_db -c "CREATE EXTENSION postgis;" +``` + +## Install spatialjoin + +Build the `spatialjoin` executable in this repository and include it in the `PATH`: + +``` +mkdir build && cd build +cmake .. +make -j +cd .. +export PATH=$PATH:$(pwd)/build +``` + +## Run full evaluation using the provided Makefile + +First, check if the PostgreSQL, PostGIS and spatialjoin installation works as expected: + +``` +make check +``` + +Afterwards, create the tables required for the evaluation. This will take a while. Note that this will completely rebuild *all* tables every time. + +``` +make tables +``` + +Finally, run the complete evaluation: + +``` +make eval +``` + +You can change individual configuration parameters (listed in the top section of the Makefile) by setting them explicitly, e.g. `make POSTGRES_USER=patrick POSTGRES_DB=eval tables`. + +## Run individual evaluations using the provided Makefile + +Run + +``` +make help +``` + +to get a list of available targets. 
diff --git a/scripts/spatialjoin-evaluation.py b/evaluation/spatialjoin-evaluation.py similarity index 96% rename from scripts/spatialjoin-evaluation.py rename to evaluation/spatialjoin-evaluation.py index 444642b..d4e944c 100755 --- a/scripts/spatialjoin-evaluation.py +++ b/evaluation/spatialjoin-evaluation.py @@ -80,7 +80,7 @@ def compute(args: argparse.Namespace): # The command line for this combination. cmd = (f"cat {args.basename}.spatialjoin-input.tsv |" - f" spatialjoin{sweep_mode} {combination}") + f" {args.spatialjoin}{sweep_mode} {combination}") # Optionally, generate RDF output. if args.rdf_output: @@ -132,12 +132,13 @@ def compute(args: argparse.Namespace): parse_time = "[not found]" sweep_time = "[not found]" for line in result.stderr.decode().split("\n"): - match = re.match(".*INFO : done \\(([0-9.]+)s\\)\\.", line) + match = re.match(".*INFO : Done parsing \\(([0-9.]+)s\\)\\.", line) if match: - if parse_time == "[not found]": - parse_time = f"{float(match.group(1)):.3f}" - elif sweep_time == "[not found]": - sweep_time = f"{float(match.group(1)):.3f}" + parse_time = f"{float(match.group(1)):.3f}" + + match = re.match(".*INFO : Done sweeping \\(([0-9.]+)s\\)\\.", line) + if match: + sweep_time = f"{float(match.group(1)):.3f}" print(f"{name}\t{total_time}\t{parse_time}\t{sweep_time}", flush=True) @@ -340,6 +341,9 @@ def sort_key(pair): parser.add_argument("--minutes", action="store_true", default=False, help="Show times in minutes instead of seconds") + parser.add_argument("--spatialjoin", + default="spatialjoin", + help="spatialjoin executable") argcomplete.autocomplete(parser, always_complete_options="long") args = parser.parse_args() diff --git a/src/spatialjoin/CMakeLists.txt b/src/spatialjoin/CMakeLists.txt index f1a5826..813abb8 100755 --- a/src/spatialjoin/CMakeLists.txt +++ b/src/spatialjoin/CMakeLists.txt @@ -10,6 +10,13 @@ include_directories( ${ZLIB_INCLUDE_DIRS} ) + +configure_file ( + "_config.h.in" + "_config.h" +) + + 
add_executable(spatialjoin ${spatialjoin_main}) add_library(spatialjoin-dev ${SPATIALJOIN_SRC}) diff --git a/src/spatialjoin/SpatialJoinMain.cpp b/src/spatialjoin/SpatialJoinMain.cpp index 3f33d16..a3b28dd 100755 --- a/src/spatialjoin/SpatialJoinMain.cpp +++ b/src/spatialjoin/SpatialJoinMain.cpp @@ -4,9 +4,10 @@ #include -#include "BoxIds.h" -#include "Sweeper.h" -#include "WKTParse.h" +#include "spatialjoin/BoxIds.h" +#include "spatialjoin/Sweeper.h" +#include "spatialjoin/WKTParse.h" +#include "spatialjoin/_config.h" #include "util/Misc.h" #include "util/geo/Geo.h" #include "util/log/Log.h" @@ -32,6 +33,8 @@ void printHelp(int argc, char** argv) { UNUSED(argc); std::cout << "\n" + << VERSION_FULL << "\n(built " << __DATE__ << " " << __TIME__ + << ")\n\n" << "(C) 2023-" << YEAR << " " << COPY << "\n" << "Authors: " << AUTHORS << "\n\n" << "Usage: " << argv[0] << " [--help] [-h] \n\n" @@ -127,6 +130,7 @@ int main(int argc, char** argv) { bool noGeometryChecks = false; bool preSortCache = false; + bool printStats = false; size_t numThreads = NUM_THREADS; size_t numCaches = NUM_THREADS; @@ -183,6 +187,13 @@ int main(int argc, char** argv) { useInnerOuter = true; } else if (cur == "--pre-sort-cache") { preSortCache = true; + } else if (cur == "--print-stats") { + printStats = true; + } else if (cur == "--version") { + std::cout + << "spatialjoin " << VERSION_FULL << " (built " << __DATE__ << " " << __TIME__ + << ")\n"; + exit(0); } else { std::cerr << "Unknown option '" << cur << "', see -h" << std::endl; exit(1); @@ -249,6 +260,10 @@ int main(int argc, char** argv) { std::string dangling; size_t gid = 1; + std::function statsCb; + + if (printStats) statsCb = [](const std::string& s) { std::cerr << s; }; + Sweeper sweeper({numThreads, numCaches, prefix, @@ -270,7 +285,7 @@ int main(int argc, char** argv) { noGeometryChecks, {}, [](const std::string& s) { LOGTO(INFO, std::cerr) << s; }, - [](const std::string& s) { std::cerr << s; }, + statsCb, {}}, cache, output); @@ 
-288,25 +303,33 @@ int main(int argc, char** argv) { // end event jobs.add({}); + + LOGTO(INFO, std::cerr) << "Done parsing (" << TOOK(ts) / 1000000000.0 << "s)."; + // wait for all workers to finish for (auto& thr : thrds) thr.join(); + auto genTs = TIME(); + + LOGTO(INFO, std::cerr) << "Sorting sweep events..."; + sweeper.flush(); - LOGTO(INFO, std::cerr) << "done (" << TOOK(ts) / 1000000000.0 << "s)."; + LOGTO(INFO, std::cerr) << "Done sorting sweep events (" << TOOK(genTs) / 1000000000.0 << "s)."; if (preSortCache) { ts = TIME(); LOGTO(INFO, std::cerr) << "Pre-sorting cache..."; sweeper.sortCache(); sweeper.flush(); - LOGTO(INFO, std::cerr) << "done (" << TOOK(ts) / 1000000000.0 << "s)."; + LOGTO(INFO, std::cerr) << "Done pre-sorting cache (" << TOOK(ts) / 1000000000.0 << "s)."; } LOGTO(INFO, std::cerr) << "Sweeping..."; ts = TIME(); sweeper.sweep(); - LOGTO(INFO, std::cerr) << "done (" << TOOK(ts) / 1000000000.0 << "s)."; + LOGTO(INFO, std::cerr) << "Done sweeping (" << TOOK(ts) / 1000000000.0 << "s)."; + LOGTO(INFO, std::cerr) << "Total predicate generation time (without parsing): " << TOOK(genTs) / 1000000000.0 << "s"; delete[] buf; } diff --git a/src/spatialjoin/Sweeper.cpp b/src/spatialjoin/Sweeper.cpp index ddf119c..7c47451 100644 --- a/src/spatialjoin/Sweeper.cpp +++ b/src/spatialjoin/Sweeper.cpp @@ -12,10 +12,10 @@ #include #include -#include "BoxIds.h" -#include "InnerOuter.h" -#include "IntervalIdx.h" -#include "Sweeper.h" +#include "spatialjoin/BoxIds.h" +#include "spatialjoin/InnerOuter.h" +#include "spatialjoin/IntervalIdx.h" +#include "spatialjoin/Sweeper.h" #include "util/Misc.h" #include "util/log/Log.h" @@ -851,8 +851,6 @@ void Sweeper::flush() { _lineCache.flush(); _simpleLineCache.flush(); - log("Sorting events..."); - std::string newFName = util::getTmpFName(_cache, ".spatialjoin", "sorttmp"); int newFile = open(newFName.c_str(), O_RDWR | O_CREAT, 0666); unlink(newFName.c_str()); @@ -883,8 +881,6 @@ void Sweeper::flush() { #ifdef __unix__ 
posix_fadvise(_file, 0, 0, POSIX_FADV_SEQUENTIAL); #endif - - log("...done"); } // _____________________________________________________________________________ diff --git a/src/spatialjoin/Sweeper.h b/src/spatialjoin/Sweeper.h index 5b5ee0e..80218d2 100644 --- a/src/spatialjoin/Sweeper.h +++ b/src/spatialjoin/Sweeper.h @@ -21,9 +21,9 @@ #include #include -#include "GeometryCache.h" -#include "IntervalIdx.h" -#include "Stats.h" +#include "spatialjoin/GeometryCache.h" +#include "spatialjoin/IntervalIdx.h" +#include "spatialjoin/Stats.h" #include "util/JobQueue.h" #include "util/geo/Geo.h" diff --git a/src/spatialjoin/_config.h.in b/src/spatialjoin/_config.h.in new file mode 100644 index 0000000..e4a528e --- /dev/null +++ b/src/spatialjoin/_config.h.in @@ -0,0 +1,13 @@ +// Copyright 2025 +// Author: Patrick Brosi + +#ifndef SRC_SPATIALJOIN_CONFIG_H_ +#define SRC_SPATIALJOIN_CONFIG_H_ + +// version number from cmake version module +#define VERSION_FULL "@VERSION_GIT_FULL@" + +// installation prefix, substituted from CMAKE_INSTALL_PREFIX by configure_file +#define INSTALL_PREFIX "@CMAKE_INSTALL_PREFIX@" + +#endif // SRC_SPATIALJOIN_CONFIG_H_