Skip to content

Commit

Permalink
Add possibility to ignore filename pairs
Browse files Browse the repository at this point in the history
  • Loading branch information
sergey-misuk-valor committed Sep 27, 2024
1 parent aab199f commit 68b28b4
Show file tree
Hide file tree
Showing 18 changed files with 477 additions and 119 deletions.
7 changes: 5 additions & 2 deletions src/hope_dedup_engine/apps/api/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,8 @@
DUPLICATE = "duplicate"
DUPLICATE_LIST = f"{DUPLICATE}s"

IGNORED_KEYS = "ignored_key"
IGNORED_KEYS_LIST = f"{IGNORED_KEYS}s"
IGNORED = "ignored"
REFERENCE_PK = "reference_pk"
FILENAME = "filename"
IGNORED_REFERENCE_PK_LIST = f"{IGNORED}/{REFERENCE_PK}s"
IGNORED_FILENAME_LIST = f"{IGNORED}/{FILENAME}s"
42 changes: 31 additions & 11 deletions src/hope_dedup_engine/apps/api/deduplication/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,40 @@ def _sort_keys(pair: DuplicateKeyPair) -> DuplicateKeyPair:
def _save_duplicates(
finder: DuplicateFinder,
deduplication_set: DeduplicationSet,
ignored_key_pairs: frozenset[tuple[str, str]],
lock_enabled: bool,
lock: DeduplicationSetLock,
) -> None:
reference_pk_to_filename_mapping = dict(
deduplication_set.image_set.values_list("reference_pk", "filename")
)
ignored_filename_pairs = frozenset(
map(
tuple,
map(
sorted,
deduplication_set.ignoredfilenamepair_set.values_list(
"first", "second"
),
),
)
)

ignored_reference_pk_pairs = frozenset(
deduplication_set.ignoredreferencepkpair_set.values_list("first", "second")
)

for first, second, score in map(_sort_keys, finder.run()):
if (first, second) not in ignored_key_pairs:
first_filename, second_filename = sorted(
(
reference_pk_to_filename_mapping[first],
reference_pk_to_filename_mapping[second],
)
)
ignored = (first, second) in ignored_reference_pk_pairs or (
first_filename,
second_filename,
) in ignored_filename_pairs
if not ignored:
duplicate, _ = Duplicate.objects.get_or_create(
deduplication_set=deduplication_set,
first_reference_pk=first,
Expand Down Expand Up @@ -55,17 +83,9 @@ def find_duplicates(deduplication_set_id: str, serialized_lock: str) -> None:
# clean results
Duplicate.objects.filter(deduplication_set=deduplication_set).delete()

ignored_key_pairs = frozenset(
deduplication_set.ignoredkeypair_set.values_list(
"first_reference_pk", "second_reference_pk"
)
)

weight_total = 0
for finder in get_finders(deduplication_set):
_save_duplicates(
finder, deduplication_set, ignored_key_pairs, lock_enabled, lock
)
_save_duplicates(finder, deduplication_set, lock_enabled, lock)
weight_total += finder.weight

for duplicate in deduplication_set.duplicate_set.all():
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Generated by Django 5.0.7 on 2024-09-25 10:29

from django.db import migrations


class Migration(migrations.Migration):
    """Rename ``IgnoredKeyPair`` to ``IgnoredReferencePkPair`` and shorten its pair fields."""

    # Applied on top of migration 0005 of the ``api`` app.
    dependencies = [("api", "0005_config_deduplicationset_config")]

    # Order matters: the model is renamed first, so every later operation
    # addresses it by its new (lower-cased) name.
    operations = [
        migrations.RenameModel(
            old_name="IgnoredKeyPair", new_name="IgnoredReferencePkPair"
        ),
        # Drop the ``_reference_pk`` suffix from both members of the pair.
        migrations.RenameField(
            model_name="ignoredreferencepkpair",
            old_name="first_reference_pk",
            new_name="first",
        ),
        migrations.RenameField(
            model_name="ignoredreferencepkpair",
            old_name="second_reference_pk",
            new_name="second",
        ),
        # Re-declare the uniqueness constraint against the renamed columns.
        migrations.AlterUniqueTogether(
            name="ignoredreferencepkpair",
            unique_together={("deduplication_set", "first", "second")},
        ),
    ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Generated by Django 5.0.7 on 2024-09-25 11:24

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
    """Create the ``IgnoredFilenamePair`` model."""

    # Applied after the 0006 rename migration of the ``api`` app.
    dependencies = [
        ("api", "0006_rename_ignoredkeypair_ignoredreferencepkpair_and_more"),
    ]

    operations = [
        migrations.CreateModel(
            name="IgnoredFilenamePair",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # The two members of the pair, each up to 100 characters.
                ("first", models.CharField(max_length=100)),
                ("second", models.CharField(max_length=100)),
                (
                    "deduplication_set",
                    # Rows are removed together with their deduplication set.
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to="api.deduplicationset",
                    ),
                ),
            ],
            options={
                # A given (first, second) pair may appear once per set.
                "unique_together": {("deduplication_set", "first", "second")},
            },
        ),
    ]
37 changes: 26 additions & 11 deletions src/hope_dedup_engine/apps/api/models/deduplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,21 +92,36 @@ class Duplicate(models.Model):
score = models.FloatField(default=0)


class IgnoredKeyPair(models.Model):
class IgnoredPair(models.Model):
deduplication_set = models.ForeignKey(DeduplicationSet, on_delete=models.CASCADE)
first_reference_pk = models.CharField(max_length=REFERENCE_PK_LENGTH)
second_reference_pk = models.CharField(max_length=REFERENCE_PK_LENGTH)

class Meta:
unique_together = (
"deduplication_set",
"first_reference_pk",
"second_reference_pk",
)
abstract = True

@override
def save(self, **kwargs: Any) -> None:
self.first_reference_pk, self.second_reference_pk = sorted(
(self.first_reference_pk, self.second_reference_pk)
)
self.first, self.second = sorted((self.first, self.second))
super().save(**kwargs)


UNIQUE_FOR_IGNORED_PAIR = (
"deduplication_set",
"first",
"second",
)


# Concrete ignored pair whose two members are reference pks.
# Documented with comments rather than a docstring so the model's
# ``__doc__`` is left exactly as before.
class IgnoredReferencePkPair(IgnoredPair):
    # Both members are character columns capped at REFERENCE_PK_LENGTH.
    first = models.CharField(max_length=REFERENCE_PK_LENGTH)
    second = models.CharField(max_length=REFERENCE_PK_LENGTH)

    class Meta:
        # Uniqueness columns come from the shared module-level tuple.
        unique_together = UNIQUE_FOR_IGNORED_PAIR


# Concrete ignored pair whose two members are filenames.
# Documented with comments rather than a docstring so the model's
# ``__doc__`` is left exactly as before.
class IgnoredFilenamePair(IgnoredPair):
    # NOTE(review): the filename columns reuse REFERENCE_PK_LENGTH, while the
    # migration that creates this table declares max_length=100 -- confirm
    # the constant equals 100 and that it is the intended limit for filenames.
    first = models.CharField(max_length=REFERENCE_PK_LENGTH)
    second = models.CharField(max_length=REFERENCE_PK_LENGTH)

    class Meta:
        # Uniqueness columns come from the shared module-level tuple.
        unique_together = UNIQUE_FOR_IGNORED_PAIR
31 changes: 24 additions & 7 deletions src/hope_dedup_engine/apps/api/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from hope_dedup_engine.apps.api.models.deduplication import (
Config,
Duplicate,
IgnoredKeyPair,
IgnoredFilenamePair,
IgnoredReferencePkPair,
Image,
)

Expand Down Expand Up @@ -96,16 +97,32 @@ class Meta:
fields = "first", "second", "score"


class IgnoredKeyPairSerializer(serializers.ModelSerializer):
CREATE_PAIR_FIELDS = "first", "second"
PAIR_FIELDS = ("id", "deduplication_set") + CREATE_PAIR_FIELDS


# Serializer for IgnoredReferencePkPair exposing the fields listed in
# PAIR_FIELDS. Kept as a comment (not a docstring) so generated API schema
# descriptions are unchanged.
class IgnoredReferencePkPairSerializer(serializers.ModelSerializer):
    class Meta:
        model = IgnoredReferencePkPair
        fields = PAIR_FIELDS


# Serializer for IgnoredReferencePkPair restricted to the fields listed in
# CREATE_PAIR_FIELDS. Kept as a comment (not a docstring) so generated API
# schema descriptions are unchanged.
class CreateIgnoredReferencePkPairSerializer(serializers.ModelSerializer):
    class Meta:
        model = IgnoredReferencePkPair
        fields = CREATE_PAIR_FIELDS


class IgnoredFilenamePairSerializer(serializers.ModelSerializer):
class Meta:
model = IgnoredKeyPair
fields = "__all__"
model = IgnoredFilenamePair
fields = PAIR_FIELDS


class CreateIgnoredKeyPairSerializer(serializers.ModelSerializer):
class CreateIgnoredFilenamePairSerializer(serializers.ModelSerializer):
class Meta:
model = IgnoredKeyPair
fields = ("first_reference_pk", "second_reference_pk")
model = IgnoredFilenamePair
fields = CREATE_PAIR_FIELDS


class EmptySerializer(serializers.Serializer):
Expand Down
13 changes: 10 additions & 3 deletions src/hope_dedup_engine/apps/api/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,16 @@
DEDUPLICATION_SET,
DEDUPLICATION_SET_LIST,
DUPLICATE_LIST,
IGNORED_KEYS_LIST,
IGNORED_FILENAME_LIST,
IGNORED_REFERENCE_PK_LIST,
IMAGE_LIST,
)
from hope_dedup_engine.apps.api.views import (
BulkImageViewSet,
DeduplicationSetViewSet,
DuplicateViewSet,
IgnoredKeyPairViewSet,
IgnoredFilenamePairViewSet,
IgnoredReferencePkPairViewSet,
ImageViewSet,
)

Expand All @@ -40,7 +42,12 @@
DUPLICATE_LIST, DuplicateViewSet, basename=DUPLICATE_LIST
)
deduplication_sets_router.register(
IGNORED_KEYS_LIST, IgnoredKeyPairViewSet, basename=IGNORED_KEYS_LIST
IGNORED_FILENAME_LIST, IgnoredFilenamePairViewSet, basename=IGNORED_FILENAME_LIST
)
deduplication_sets_router.register(
IGNORED_REFERENCE_PK_LIST,
IgnoredReferencePkPairViewSet,
basename=IGNORED_REFERENCE_PK_LIST,
)

urlpatterns = [
Expand Down
46 changes: 36 additions & 10 deletions src/hope_dedup_engine/apps/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,20 @@
from hope_dedup_engine.apps.api.models import DeduplicationSet
from hope_dedup_engine.apps.api.models.deduplication import (
Duplicate,
IgnoredKeyPair,
IgnoredFilenamePair,
IgnoredReferencePkPair,
Image,
)
from hope_dedup_engine.apps.api.serializers import (
CreateDeduplicationSetSerializer,
CreateIgnoredKeyPairSerializer,
CreateIgnoredFilenamePairSerializer,
CreateIgnoredReferencePkPairSerializer,
CreateImageSerializer,
DeduplicationSetSerializer,
DuplicateSerializer,
EmptySerializer,
IgnoredKeyPairSerializer,
IgnoredFilenamePairSerializer,
IgnoredReferencePkPairSerializer,
ImageSerializer,
)
from hope_dedup_engine.apps.api.utils import delete_model_data, start_processing
Expand Down Expand Up @@ -272,8 +275,8 @@ def list(self, request: Request, *args: Any, **kwargs: Any) -> Response:
return super().list(request, *args, **kwargs)


class IgnoredKeyPairViewSet(
nested_viewsets.NestedViewSetMixin[IgnoredKeyPair],
class IgnoredPairViewSet[T](
nested_viewsets.NestedViewSetMixin[T],
mixins.ListModelMixin,
mixins.CreateModelMixin,
viewsets.GenericViewSet,
Expand All @@ -284,8 +287,6 @@ class IgnoredKeyPairViewSet(
AssignedToExternalSystem,
UserAndDeduplicationSetAreOfTheSameSystem,
)
serializer_class = IgnoredKeyPairSerializer
queryset = IgnoredKeyPair.objects.all()
parent_lookup_kwargs = {
DEDUPLICATION_SET_PARAM: DEDUPLICATION_SET_FILTER,
}
Expand All @@ -297,13 +298,38 @@ def perform_create(self, serializer: Serializer) -> None:
deduplication_set.updated_by = self.request.user
deduplication_set.save()

@extend_schema(description="List all ignored key pairs for the deduplication set")

# List/create endpoints for ignored filename pairs, built on the shared
# IgnoredPairViewSet base. Documented with comments rather than a class
# docstring so the view description is left exactly as before.
class IgnoredFilenamePairViewSet(IgnoredPairViewSet[IgnoredFilenamePair]):
    queryset = IgnoredFilenamePair.objects.all()
    serializer_class = IgnoredFilenamePairSerializer

    # ``list`` and ``create`` are overridden only to attach schema metadata;
    # the behaviour itself is delegated to the parent implementation.
    @extend_schema(
        description="List all ignored filename pairs for the deduplication set"
    )
    def list(self, request: Request, *args: Any, **kwargs: Any) -> Response:
        return super().list(request, *args, **kwargs)

    @extend_schema(
        request=CreateIgnoredFilenamePairSerializer,
        description="Add ignored filename pair for the deduplication set",
    )
    def create(self, request: Request, *args: Any, **kwargs: Any) -> Response:
        return super().create(request, *args, **kwargs)


class IgnoredReferencePkPairViewSet(IgnoredPairViewSet[IgnoredReferencePkPair]):
serializer_class = IgnoredReferencePkPairSerializer
queryset = IgnoredReferencePkPair.objects.all()

@extend_schema(
description="List all ignored reference pk pairs for the deduplication set"
)
def list(self, request: Request, *args: Any, **kwargs: Any) -> Response:
return super().list(request, *args, **kwargs)

@extend_schema(
request=CreateIgnoredKeyPairSerializer,
description="Add ignored key pair for the deduplication set",
request=CreateIgnoredReferencePkPairSerializer,
description="Add ignored reference pk pair for the deduplication set",
)
def create(self, request: Request, *args: Any, **kwargs: Any) -> Response:
return super().create(request, *args, **kwargs)
6 changes: 4 additions & 2 deletions tests/api/api_const.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
BULK_IMAGE_LIST,
DEDUPLICATION_SET_LIST,
DUPLICATE_LIST,
IGNORED_KEYS_LIST,
IGNORED_FILENAME_LIST,
IGNORED_REFERENCE_PK_LIST,
IMAGE_LIST,
)

Expand All @@ -17,4 +18,5 @@
BULK_IMAGE_LIST_VIEW = f"{BULK_IMAGE_LIST}-{LIST}"
BULK_IMAGE_CLEAR_VIEW = f"{BULK_IMAGE_LIST}-clear"
DUPLICATE_LIST_VIEW = f"{DUPLICATE_LIST}-{LIST}"
IGNORED_KEYS_LIST_VIEW = f"{IGNORED_KEYS_LIST}-{LIST}"
IGNORED_REFERENCE_PK_LIST_VIEW = f"{IGNORED_REFERENCE_PK_LIST}-{LIST}"
IGNORED_FILENAME_LIST_VIEW = f"{IGNORED_FILENAME_LIST}-{LIST}"
Loading

0 comments on commit 68b28b4

Please sign in to comment.