diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..5e6c971
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,15 @@
+FROM python:3.10-bullseye
+
+# System packages: compiler toolchain plus GEOS/GDAL/PROJ/SpatiaLite/spatialindex libraries
+RUN apt-get update && apt-get install -y sqlite3 libgeos-dev lz4 wget g++ cmake cmake-curses-gui make libexpat1-dev zlib1g-dev libbz2-dev libsparsehash-dev \
+    libboost-program-options-dev libboost-dev libgdal-dev libproj-dev libsqlite3-mod-spatialite libspatialindex-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN mkdir -p /usr/src/app/
+
+WORKDIR /usr/src/app/
+
+COPY . ./
+
+RUN pip install --no-cache-dir -r requirements.txt
+
diff --git a/compare_stops.py b/compare_stops.py
index eb899dc..448c290 100644
--- a/compare_stops.py
+++ b/compare_stops.py
@@ -55,6 +55,7 @@ def main(osmfile, db_file, stops_file, gtfs_file, stopsprovider, logfile):
     elif stopsprovider == 'DELFI':
         zhv_importer = DelfiStopsImporter(db)
     else:
+        zhv_importer = None
         logger.error("No importer for stopsprovider %s", stopsprovider)
         #return 1
 
diff --git a/osm_stop_matcher/GtfsImporter.py b/osm_stop_matcher/GtfsImporter.py
index e13cade..b11ab75 100644
--- a/osm_stop_matcher/GtfsImporter.py
+++ b/osm_stop_matcher/GtfsImporter.py
@@ -1,7 +1,7 @@
 import argparse
 import csv
 import datetime
-from osm_stop_matcher.util import xstr, drop_table_if_exists
+from osm_stop_matcher.util import xstr, drop_table_if_exists, get_parent_station
 import sqlite3
 import spatialite
 import zipfile
@@ -44,7 +44,7 @@ def import_stops(self, stops_file):
 
         reader = csv.DictReader(io.TextIOWrapper(stops_file, 'utf-8'))
         to_db = [(i['stop_id'], i['stop_name'], i['stop_lat']
-            , i['stop_lon'], i['location_type'], i['parent_station'],
+            , i['stop_lon'], i['location_type'], get_parent_station(i['stop_id']),
             i['platform_code']) for i in reader]
 
         cur.executemany("INSERT INTO gtfs_stops (stop_id,stop_name,stop_lat,stop_lon,location_type,parent_station,platform_code) VALUES (?, ?, ?, ?, ?, ?, ?);", to_db)
@@ -61,7 +61,7 @@ def load_haltestellen_unified(self):
             SElECT '' Landkreis, '' Gemeinde, '' Ortsteil, substr(stop_name, instr(stop_name, ' ')+1) Haltestelle, stop_name Haltestelle_lang, '' HalteBeschreibung, stop_id globaleID, '' HalteTyp, NULL gueltigAb, NULL gueltigBis,
             cast(stop_lat as real) lat, cast(stop_lon as real) lon,
             CASE WHEN (LENGTH(stop_id)-LENGTH(REPLACE(stop_id, ':','')))=4 THEN 'Steig' ELSE 'Halt' END Art , platform_code Name_Steig,
-            NULL mode, NULL parent, NULL match_state, NULL linien, platform_code FROM gtfs_stops
+            NULL mode, parent_station parent, NULL match_state, NULL linien, platform_code FROM gtfs_stops
             where (location_type="0" or location_type="");
             """)
 
@@ -134,36 +134,60 @@ def update_name_steig(self):
 
     def update_mode(self):
         cur = self.db.cursor()
-        drop_table_if_exists(self.db, "tmp_stop_times")
-        cur.execute("CREATE table tmp_stop_times AS SELECT stop_id, min(trip_id) trip_id FROM gtfs_stop_times GROUP BY stop_id")
-        cur.execute("CREATE INDEX tst on tmp_stop_times(stop_id)")
+        drop_table_if_exists(self.db, "tmp_stop_modes")
+        # when one route_short_name has multiple different modes, use higher valued one (train), since this is usually due to SEV (e.g. RE3 with route_type 3)
+        # when still multiple modes at one stop, leave mode as NULL
+        # COUNT(DISTINCT mode) skips NULL modes, so the outer query selects MAX(mode) instead of a bare
+        # column: a bare column would be read from an arbitrary row of the group and could be NULL
+        cur.execute("""CREATE table tmp_stop_modes AS
+            SELECT stop_id, MAX(mode) AS mode
+            FROM (
+                SELECT stop_id, MAX(mode) AS mode, route_short_name
+                FROM (
+                    SELECT st.stop_id, r.route_short_name,
+                        CASE WHEN r.route_type in ('0', '1', '400','900') THEN 'light_rail'
+                        WHEN r.route_type in ('2', '100', '101', '102','103','106','109') THEN 'train'
+                        WHEN r.route_type in ('3', '700') THEN 'bus'
+                        WHEN r.route_type in ('4','1000') THEN 'ferry'
+                        WHEN r.route_type in ('5') THEN 'funicular'
+                        ELSE NULL END AS mode
+                    FROM gtfs_stop_times st
+                    JOIN gtfs_trips t ON t.trip_id=st.trip_id
+                    JOIN gtfs_routes r ON r.route_id=t.route_id
+                    WHERE r.route_short_name NOT LIKE 'SEV%'
+                    GROUP BY st.stop_id, r.route_type, r.route_short_name
+                ) AS modes_by_route
+                GROUP BY stop_id, route_short_name
+            ) AS modes_without_sev
+            GROUP BY stop_id
+            HAVING COUNT(DISTINCT mode) = 1
+            """)
+        cur.execute("CREATE INDEX tst on tmp_stop_modes(stop_id)")
         query = """UPDATE haltestellen_unified SET mode=
-            (SELECT CASE WHEN r.route_type in ('0', '1', '400','900') THEN 'light_rail'
-                WHEN r.route_type in ('2', '100', '101', '102','103','106','109') THEN 'train'
-                WHEN r.route_type in ('3', '700') THEN 'bus'
-                WHEN r.route_type in ('4','1000') THEN 'ferry'
-                WHEN r.route_type in ('5') THEN 'funicular'
-                ELSE NULL END
-            FROM tmp_stop_times st
-            JOIN gtfs_trips t ON t.trip_id=st.trip_id
-            JOIN gtfs_routes r ON r.route_id=t.route_id
-            WHERE st.stop_id = haltestellen_unified.globaleID)"""
+            (SELECT mode
+            FROM tmp_stop_modes st
+            WHERE st.stop_id = haltestellen_unified.globaleID)"""
         cur.execute(query)
-        drop_table_if_exists(self.db, "tmp_stop_times")
+        drop_table_if_exists(self.db, "tmp_stop_modes")
         self.db.commit()
 
     def update_linien(self):
         cur = self.db.cursor()
         drop_table_if_exists(self.db, "tmp_stop_routes")
+        # DISTINCT: several routes of one stop can map to the same label (e.g. every route_type 101 becomes 'ICE')
         cur.execute("""CREATE TABLE tmp_stop_routes AS
-            SELECT stop_id, group_concat(route_short_name) route_short_names FROM (
-            SELECT st.stop_id, r1.route_short_name
+            SELECT stop_id, group_concat(DISTINCT CASE
+                WHEN route_type = '101' THEN 'ICE'
+                WHEN route_type = '102' THEN 'ECIC'
+                ELSE route_short_name
+            END) route_short_names FROM (
+            SELECT st.stop_id, r1.route_short_name, r1.route_type
             FROM gtfs_stop_times st
             JOIN gtfs_trips t ON t.trip_id=st.trip_id
             JOIN gtfs_routes r1 ON r1.route_id=t.route_id
             JOIN gtfs_stops s ON s.stop_id=st.stop_id
             WHERE r1.route_short_name != ''
-            GROUP BY st.stop_id, r1.route_short_name)
+            GROUP BY st.stop_id, r1.route_short_name, r1.route_type)
             GROUP BY stop_id""")
         cur.execute("CREATE INDEX stop_routes_idx on tmp_stop_routes(stop_id)")
 
diff --git a/osm_stop_matcher/MatchPicker.py b/osm_stop_matcher/MatchPicker.py
index a74b9b1..47a2162 100644
--- a/osm_stop_matcher/MatchPicker.py
+++ b/osm_stop_matcher/MatchPicker.py
@@ -2,6 +2,7 @@
 import math
 
 from . import config
+from .util import drop_table_if_exists
 
 def get_rating(row):
     return row['rating']
@@ -52,6 +53,9 @@ def __init__(self, db):
 
 
     def pick_matches(self):
+        if config.SIMPLE_MATCH_PICKER:
+            return self.simple_pick_matches()
+
         cur = self.db.cursor()
         cur.execute("DELETE FROM matches")
         cur.execute("SELECT * FROM candidates WHERE rating >= ? ORDER BY ifopt_id", [config.RATING_BELOW_CANDIDATES_ARE_IGNORED])
@@ -109,6 +113,30 @@ def pick_matches(self):
         self.logger.info('Deleted matches with worse name_distance if multiple osm_stop match same agency stop')
         self.db.commit()
 
+    def simple_pick_matches(self):
+        """Rebuild the matches table directly from the candidates table.
+
+        Keeps every candidate rated above 0.99 and, in addition, the best
+        rated candidate of each ifopt_id and of each osm_id. The non-aggregated
+        columns next to MAX(rating) rely on SQLite reading them from the row
+        that holds the maximum. The matches table is dropped and recreated,
+        so it only has the columns selected here.
+        """
+        self.logger.info('Simple match picking...')
+        cur = self.db.cursor()
+        drop_table_if_exists(self.db, "matches")
+        cur.execute("""CREATE TABLE matches AS
+            SELECT ifopt_id, osm_id, rating, distance, name_distance, platform_matches, successor_rating, mode_rating FROM candidates
+            WHERE rating > 0.99
+            UNION
+            SELECT ifopt_id, osm_id, MAX(rating) rating, distance, name_distance, platform_matches, successor_rating, mode_rating FROM candidates
+            GROUP BY ifopt_id
+            UNION
+            SELECT ifopt_id, osm_id, MAX(rating) rating, distance, name_distance, platform_matches, successor_rating, mode_rating FROM candidates
+            GROUP BY osm_id""")
+
+        self.db.commit()
+
     def import_matches(self, matches):
         cur = self.db.cursor()
 
diff --git a/osm_stop_matcher/StopMatcher.py b/osm_stop_matcher/StopMatcher.py
index 0a3828b..686cb76 100644
--- a/osm_stop_matcher/StopMatcher.py
+++ b/osm_stop_matcher/StopMatcher.py
@@ -8,8 +8,9 @@
 from osm_stop_matcher.util import drop_table_if_exists, backup_table_if_exists
 
 
+from . import config
+
 class StopMatcher():
-    UNKNOWN_MODE_RATING = 0.3
     MINIMUM_SUCCESSOR_SIMILARITY = 0.6
     MINIMUM_SUCCESSOR_PREDECESSOR_DISTANCE = 0.11
 
@@ -124,7 +125,7 @@ def rank_mode(self, stop, candidate):
             candidate["mode"] == 'trainish' and stop["mode"] in ['train', 'light_rail']):
             return 1
         elif not candidate["mode"] or not stop["mode"]:
-            return 0.7
+            return config.UNKNOWN_MODE_RATING
         else:
             return 0
 
@@ -177,7 +178,7 @@ def rank_candidates(self, stop, stop_id, coords, candidates):
             self.logger.debug('rank %s', candidate)
             # estimate distance
             distance = haversine(coords, (candidate["lat"],candidate["lon"]), unit=Unit.METERS)
-            if distance > 400:
+            if distance > config.MAXIMUM_DISTANCE:
                 return matches
 
             # Ignore bahn candidates when looking for bus_stop
@@ -189,7 +190,7 @@ def rank_candidates(self, stop, stop_id, coords, candidates):
 
             (rating, name_distance, matched_name, osm_name, platform_matches, successor_rating, mode_rating) = self.rank_candidate(stop, candidate, distance)
             #if last_name_distance > name_distance:
-            if last_name_distance > name_distance and name_distance < 0.3:
+            if last_name_distance > name_distance and name_distance < config.MINIMUM_NAME_SIMILARITY:
                 self.logger.info("Ignore {} ({}) {} ({}) with distance {} and name similarity {}. Platform matches? {} as name distance low".format(matched_name,stop_id, osm_name, candidate["id"], distance, name_distance,platform_matches))
                 continue
             elif rating < 0.001:
diff --git a/osm_stop_matcher/config.py b/osm_stop_matcher/config.py
index f0b470b..1c79223 100644
--- a/osm_stop_matcher/config.py
+++ b/osm_stop_matcher/config.py
@@ -3,4 +3,16 @@
 
 # To avoid combinatorial explosition when calculating best match combination per stop,
 # we only consider the best candidate per osm_stop if a certain number of candidates is exceeded
-MAX_CANDIDATE_COUNT_PER_STOP_BEFORE_ONLY_BEST_PER_QUAY_ARE_CONSIDERED = 50
\ No newline at end of file
+MAX_CANDIDATE_COUNT_PER_STOP_BEFORE_ONLY_BEST_PER_QUAY_ARE_CONSIDERED = 50
+
+# rank_candidates ignores a candidate whose name_distance is below this value and below last_name_distance
+MINIMUM_NAME_SIMILARITY = 0.3
+
+# rank_candidates returns as soon as a candidate is farther away than this many meters
+MAXIMUM_DISTANCE = 400
+
+# Value rank_mode returns when the mode of the stop or of the candidate is not set
+UNKNOWN_MODE_RATING = 0.7
+
+# When True, pick_matches() in MatchPicker.py delegates to simple_pick_matches()
+SIMPLE_MATCH_PICKER = False
diff --git a/osm_stop_matcher/util.py b/osm_stop_matcher/util.py
index 9450151..333c41e 100644
--- a/osm_stop_matcher/util.py
+++ b/osm_stop_matcher/util.py
@@ -1,5 +1,6 @@
 import logging
 import sqlite3
+import re
 
 logger = logging.getLogger('osm_stop_matcher.util')
 
@@ -43,3 +44,19 @@ def backup_table_if_exists(db, table, backup_table):
 
 def xstr(str):
     return None if '' == str else str
+
+
+def get_parent_station(ifopt_id):
+    """Return the station-level prefix of an IFOPT-style stop id.
+
+    The first three colon-separated components are kept; an optional
+    ``_suffix`` after the third component and any further ``:``-separated
+    components are stripped. Ids that do not have this shape are returned
+    unchanged.
+
+    >>> get_parent_station('de:08111:6115:1:2')
+    'de:08111:6115'
+    >>> get_parent_station('de:08111:6115_G')
+    'de:08111:6115'
+    """
+    return re.sub(r'^([^:_]+:[^:_]+:[^:_]+)(_[^:]+)?(:.+)?$', r'\1', ifopt_id)