Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sanitize street and house numbers #468

Merged
merged 7 commits into from
Jan 31, 2022
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package app.ehrenamtskarte.backend.stores.importer

import app.ehrenamtskarte.backend.stores.importer.types.AcceptingStore
import org.slf4j.Logger

/**
 * Logs, at info level, the outcome of an attempt to improve [property] of the store described by [storeInfo].
 *
 * When [oldValue] equals [newValue] (two `null`s count as equal) the message states that the value is kept;
 * otherwise it names the new value followed by the old one.
 */
fun Logger.logChange(storeInfo: String, property: String, oldValue: String?, newValue: String?) {
    val unchanged = oldValue == newValue
    info(
        if (unchanged) {
            "$property of '$storeInfo' could not be improved, keeping '$oldValue'"
        } else {
            "$property of '$storeInfo' changed to '$newValue' from '$oldValue'"
        }
    )
}

/**
 * Convenience overload that describes [store] by its name, location, street and house number
 * (skipping `null` parts, joined with ", ") and delegates to the `storeInfo` based overload.
 */
fun Logger.logChange(store: AcceptingStore, property: String, oldValue: String?, newValue: String?) {
    val description = listOf(store.name, store.location, store.street, store.houseNumber)
        .filterNotNull()
        .joinToString()
    logChange(description, property, oldValue, newValue)
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,39 +2,94 @@ package app.ehrenamtskarte.backend.stores.importer.steps

import app.ehrenamtskarte.backend.common.COUNTRY_CODE
import app.ehrenamtskarte.backend.stores.importer.PipelineStep
import app.ehrenamtskarte.backend.stores.importer.logChange
import app.ehrenamtskarte.backend.stores.importer.replaceNa
import app.ehrenamtskarte.backend.stores.importer.types.AcceptingStore
import app.ehrenamtskarte.backend.stores.importer.types.LbeAcceptingStore
import org.intellij.lang.annotations.Language
import org.slf4j.Logger

const val MISCELLANEOUS_CATEGORY = 9
const val ALTERNATIVE_MISCELLANEOUS_CATEGORY = 99

class Map(private val logger: Logger) : PipelineStep<List<LbeAcceptingStore>, List<AcceptingStore>>() {
private val houseNumberRegex = houseNumberRegex()

/**
 * Maps every [LbeAcceptingStore] in [input] to an [AcceptingStore] and runs the result through
 * `sanitizeStreetHouseNumber()`.
 *
 * Name, location and category are dereferenced with `!!`, so a missing value throws. Any exception raised
 * while mapping an entry is logged and the entry is dropped, because the `catch` branch yields `null`
 * to `mapNotNull`.
 */
// NOTE(review): this body shows two postal code arguments and two closing lines for the constructor call;
// presumably the old and the new revision of the same lines are both rendered here - confirm against the file.
override fun execute(input: List<LbeAcceptingStore>) = input.mapNotNull {
try {
AcceptingStore(
it.name!!.trim(),
COUNTRY_CODE,
it.location!!.trim(),
cleanPostalCode(it.postalCode),
it.cleanPostalCode(),
it.street.clean(),
it.houseNumber.clean(),
// presumably the additionalAddressInformation slot, which is filled later by the sanitizing step - verify
null,
it.longitude.safeToDouble(),
it.latitude.safeToDouble(),
categoryId(it.category!!),
it.email.clean(),
it.telephone.clean(),
it.homepage.clean(),
it.discount.clean()
)
).sanitizeStreetHouseNumber()
} catch (e: Exception) {
// Best effort: a store that cannot be mapped is reported and skipped instead of aborting the whole step
logger.info("Exception occurred while mapping $it", e)
null
}
}

/**
 * Builds the pattern that recognises a house number: an optional upper-case prefix letter, one or more
 * digits and at most one suffix, which is either a range, a fraction or a single letter.
 * In the examples below the part in square brackets is the leading number the suffix attaches to.
 */
private fun houseNumberRegex(): Regex {
    // Leading capital letter, e.g. "B[200]", "H[7]" (mostly in industrial parks)
    @Language("RegExp")
    val prefix = """[A-Z]?"""

    // Second number attached with a separator, e.g. "[5] - 7", "[2]+3" or "[11] und 12"
    @Language("RegExp")
    val range = """\s?(-|\+|u\.|und|/)\s?[0-9]+"""

    // Single-digit fraction, e.g. "[13] 1/2" or "[1] 3/4"
    @Language("RegExp")
    val fraction = """\s?[0-9]/[0-9]"""

    // Single letter, e.g. "[12]a" or "[2] B"; it must be followed by a whitespace or the end of the string
    @Language("RegExp")
    val letter = """\s?[a-zA-Z]($|\s)"""

    // The alternatives are tried in this order, each wrapped in its own group
    val suffix = listOf(range, fraction, letter).joinToString(separator = "|") { "($it)" }
    return "$prefix[0-9]+($suffix)?".toRegex()
}

/**
 * Separates street, house number and additional address information when they were entered into the
 * wrong fields.
 *
 * The store is considered polluted when the street contains a digit or the house number does not fully match
 * the house number pattern. In that case street and house number are joined and split again at the first
 * house number match:
 * - no match: the whole joined address becomes the street and the house number is cleared,
 * - otherwise: the text before the match is the street, the lower-cased match is the house number and any
 *   remaining text is stored as additional address information.
 *
 * @return a sanitized copy, or the receiver itself if neither field is polluted
 */
private fun AcceptingStore.sanitizeStreetHouseNumber(): AcceptingStore {
    val currentHouseNumber = houseNumber
    val isStreetPolluted = street?.any { it.isDigit() } == true
    val isHouseNumberPolluted = currentHouseNumber != null && !houseNumberRegex.matches(currentHouseNumber)
    if (!isStreetPolluted && !isHouseNumberPolluted) return this

    val address = listOfNotNull(street, currentHouseNumber).joinToString(" ")
    val houseNumberMatch = houseNumberRegex.find(address)

    if (houseNumberMatch == null) {
        // No house number, the whole address is the street
        logger.logChange("$name, $location", "Address", "$street|$houseNumber", address)
        return copy(street = address, houseNumber = null)
    }

    val cleanStreet = address.substring(0, houseNumberMatch.range.first).trim()
    // Locale.ROOT keeps the result independent of the JVM default locale (a Turkish default would turn "I" into "ı")
    val cleanHouseNumber = houseNumberMatch.value.toLowerCase(java.util.Locale.ROOT).trim()

    // Residue that is neither the street nor the house number, e.g. "im Hauptbahnhof", "Ecke Theaterstraße".
    // It is dropped when nothing but separators is left or when it merely repeats the house number.
    val residue = if (houseNumberMatch.range.last < address.length - 1) {
        address.substring(houseNumberMatch.range.last + 1)
            .trim { !it.isLetterOrDigit() }
            .clean()
            ?.takeUnless { it.isEmpty() || it == cleanHouseNumber }
    } else {
        null
    }

    val newAddress = listOfNotNull(cleanStreet, cleanHouseNumber, residue).joinToString("|")
    logger.logChange("$name, $location", "Address", "$street|$houseNumber", newAddress)

    return copy(street = cleanStreet, houseNumber = cleanHouseNumber, additionalAddressInformation = residue)
}

/**
 * Parses the cleaned receiver as a [Double], accepting a decimal comma as well as a decimal point.
 *
 * @return `null` when the receiver or its cleaned form is `null`
 * @throws NumberFormatException when the remaining text is not a valid number
 */
private fun String?.safeToDouble(): Double? {
    val normalized = this?.clean()?.replace(",", ".") ?: return null
    return normalized.toDouble()
}
Expand All @@ -48,10 +103,15 @@ class Map(private val logger: Logger) : PipelineStep<List<LbeAcceptingStore>, Li
return this?.replaceNa()?.trim()
}

private fun cleanPostalCode(postalCode: String?): String? {
if (postalCode == null) return null
val fiveDigitRegex = """\d{5}""".toRegex()
return fiveDigitRegex.find(postalCode)?.value
/**
 * Extracts the first run of five digits from the store's postal code.
 *
 * A change is logged whenever the extracted value differs from the cleaned original, which includes
 * the case that no five digit run was found.
 *
 * @return the five digit postal code, or `null` if the postal code is missing or contains no such run
 */
private fun LbeAcceptingStore.cleanPostalCode(): String? {
    val rawPostalCode = postalCode ?: return null

    val extracted = Regex("""[0-9]{5}""").find(rawPostalCode)?.value
    val isChanged = extracted != rawPostalCode.clean()
    if (isChanged) {
        logger.logChange("$name, $location", "Postal code", rawPostalCode, extracted)
    }
    return extracted
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import app.ehrenamtskarte.backend.stores.geocoding.FeatureFetcher
import app.ehrenamtskarte.backend.stores.geocoding.isCloseToBoundingBox
import app.ehrenamtskarte.backend.stores.geocoding.isInBoundingBox
import app.ehrenamtskarte.backend.stores.importer.PipelineStep
import app.ehrenamtskarte.backend.stores.importer.logChange
import app.ehrenamtskarte.backend.stores.importer.types.AcceptingStore
import io.ktor.client.HttpClient
import kotlinx.coroutines.runBlocking
Expand Down Expand Up @@ -43,14 +44,14 @@ class Sanitize(private val logger: Logger, httpClient: HttpClient) : PipelineSte
val oldCoordinates = "$latitude, $longitude"
val newCoordinates = "${feature.latitude()}, ${feature.longitude()}"

logChange(this, "Coordinates", oldCoordinates, newCoordinates)
logger.logChange(this, "Coordinates", oldCoordinates, newCoordinates)

return copy(longitude = feature.longitude(), latitude = feature.latitude())
}

// Match by coordinates -> replace wrong postal code
val newPostalCode = feature?.postalCode() ?: postalCode
logChange(this, "Postal code", postalCode, newPostalCode)
logger.logChange(this, "Postal code", postalCode, newPostalCode)

return copy(postalCode = newPostalCode)
}
Expand All @@ -59,16 +60,6 @@ class Sanitize(private val logger: Logger, httpClient: HttpClient) : PipelineSte
return this
}

/**
 * Logs, at info level, whether [property] of [store] was changed. The store is described by its name,
 * location, street and house number (`null` parts are skipped, the rest is joined with ", ").
 */
private fun logChange(store: AcceptingStore, property: String, oldValue: String?, newValue: String?) {
    val storeInfo = listOf(store.name, store.location, store.street, store.houseNumber)
        .filterNotNull()
        .joinToString()

    val message = if (oldValue == newValue) {
        "$property of '$storeInfo' could not be improved, keeping '$oldValue'"
    } else {
        "$property of '$storeInfo' changed from '$oldValue' to '$newValue'"
    }
    logger.info(message)
}

// Latitude of the feature's geometry; the unchecked cast throws a ClassCastException if the geometry is not a Point.
private fun Feature.latitude(): Double = (geometry as Point).coordinates.latitude
// Longitude of the feature's geometry; same unchecked cast to Point as above.
private fun Feature.longitude(): Double = (geometry as Point).coordinates.longitude
// Value stored under the "postcode" key of the feature's address, or null if that key is absent.
private fun Feature.postalCode(): String? = address()["postcode"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ data class AcceptingStore(
val postalCode: String?,
val street: String?,
val houseNumber: String?,
val additionalAddressInformation: String?,
steffenkleinle marked this conversation as resolved.
Show resolved Hide resolved
val longitude: Double?,
val latitude: Double?,
val categoryId: Int,
Expand Down