flairNLP · alanakbik · Feb 4, 2022 · Dec 28, 2021 · Dec 29, 2021 · Dec 29, 2021
diff --git a/flair/models/clustering/Clustering.py b/flair/models/clustering/Clustering.py
@@ -0,0 +1,10 @@
+from abc import ABC, abstractmethod
+
+
+class Clustering(ABC):
+    """Abstract base class shared by the clustering models."""
+
+    @abstractmethod
+    def cluster(self, vectors: list) -> list:
+        """Cluster ``vectors``; every concrete subclass must implement this."""
+        pass
+
+    def getLabelList(self, listSenctence) -> list:
+        """Return the first 'cluster' label of each item, converted to ``int``.
+
+        Each element of ``listSenctence`` must offer ``get_labels('cluster')``
+        returning a non-empty sequence whose first element has a ``value``
+        attribute that ``int()`` accepts.
+        """
+        return [
+            int(sentence.get_labels('cluster')[0].value)
+            for sentence in listSenctence
+        ]
diff --git a/flair/models/clustering/Evaluation.py b/flair/models/clustering/Evaluation.py
@@ -0,0 +1,41 @@
+from sklearn.datasets import fetch_20newsgroups
+from sklearn.metrics import accuracy_score, normalized_mutual_info_score
+
+
+def getStackOverFlowLabels():
+    """Return the raw lines of the StackOverflow label file.
+
+    The lines are returned exactly as read (trailing newlines included), in
+    file order, so they line up with the titles from ``getStackOverFlowData``.
+    """
+    # Read the label file; the earlier version opened the title file here (a
+    # copy of getStackOverFlowData) and therefore returned titles as labels.
+    with open("evaluation/StackOverflow/label_StackOverflow.txt", "r", encoding="utf8") as myfile:
+        return myfile.readlines()
+
+
+def getStackOverFlowData():
+    """Return every line of the StackOverflow title file, newlines included."""
+    titlePath = "evaluation/StackOverflow/title_StackOverflow.txt"
+    with open(titlePath, mode="r", encoding="utf8") as handle:
+        titles = handle.readlines()
+    return titles
+
+
+maxDocuments = 400  # exclusive upper slice bound used by get20NewsData / get20NewsLabel
+categories = [  # 20 Newsgroups categories passed to fetch_20newsgroups
+    'rec.motorcycles',
+    'rec.sport.baseball',
+    'comp.graphics',
+    'sci.space',
+    'talk.politics.mideast'
+]
+
+
+def get20NewsData():
+    """Return the documents at positions ``1 .. maxDocuments - 1``.
+
+    The corpus is fetched for the module-level ``categories``. The slice
+    starts at 1, so the first fetched document is not returned; this matches
+    the slice used by ``get20NewsLabel``.
+    NOTE(review): skipping index 0 looks unintentional -- confirm, and change
+    both functions together if so.
+    """
+    corpus = fetch_20newsgroups(categories=categories)
+    documents = corpus.data
+    return documents[1:maxDocuments]
+
+
+def get20NewsLabel():
+    """Return the targets at positions ``1 .. maxDocuments - 1``.
+
+    The corpus is fetched for the module-level ``categories``. The slice
+    starts at 1, so the first target is not returned; this matches the slice
+    used by ``get20NewsData``.
+    NOTE(review): skipping index 0 looks unintentional -- confirm, and change
+    both functions together if so.
+    """
+    corpus = fetch_20newsgroups(categories=categories)
+    targets = corpus.target
+    return targets[1:maxDocuments]
+
+
+def evaluate(labels: list, predict_labels: list):
+    """Print accuracy and normalized mutual information of a prediction.
+
+    Both scores are computed first and then printed, one per line, prefixed
+    with ``ACC: `` and ``NMI: ``. Nothing is returned.
+    NOTE(review): accuracy compares label ids directly, so it is only
+    meaningful if predicted cluster ids use the same numbering as ``labels``.
+    """
+    scores = (
+        ("ACC: ", accuracy_score(labels, predict_labels)),
+        ("NMI: ", normalized_mutual_info_score(labels, predict_labels)),
+    )
+    for prefix, score in scores:
+        print(prefix + str(score))
diff --git a/flair/models/clustering/birch/Birch.py b/flair/models/clustering/birch/Birch.py
@@ -0,0 +1,51 @@
+from flair.embeddings import DocumentEmbeddings
+
+from Clustering import Clustering
+from birch.model.CfTree import CfTree
+from birch.model.ClusteringFeature import ClusteringFeature
+from flair.datasets import DataLoader
+
+from kmeans.K_Means import KMeans
+
+branchingFactorNonLeaf = 0  # overwritten by Birch.__init__ with its B argument
+branchingFactorLeaf = 0  # overwritten by Birch.__init__ with L; LeafNode.canAddNewCf compares a leaf's CF count against it
+distanceMax = 1000000000  # starting "no candidate yet" distance in LeafNode.getClosestCF
+threshold = 0  # overwritten by Birch.__init__; ClusteringFeature.canAbsorbCf absorbs when distance <= threshold
+
+
+class Birch(Clustering):
+    """BIRCH clustering on top of document embeddings.
+
+    Items are embedded, inserted one at a time into a CF tree, and the centers
+    of the resulting leaf clustering features are grouped with k-means. The
+    k-means label of each leaf CF is then copied to every item it absorbed.
+
+    NOTE(review): threshold and branching factors are kept in module-level
+    globals because the tree model classes read them from this module, so
+    creating a second ``Birch`` overwrites the settings of the first.
+    """
+
+    def __init__(self, thresholds: float, embeddings: DocumentEmbeddings, B: int, L: int, k: int = 3):
+        """Configure the clustering.
+
+        :param thresholds: stored in the module global ``threshold`` (maximum
+            distance at which a CF absorbs a new entry).
+        :param embeddings: document embeddings used to embed the input items.
+        :param B: stored in the module global ``branchingFactorNonLeaf``.
+        :param L: stored in the module global ``branchingFactorLeaf``.
+        :param k: number of clusters requested from the final k-means step;
+            defaults to 3, the value that used to be hard-coded.
+        """
+        global threshold
+        threshold = thresholds
+        global branchingFactorLeaf
+        branchingFactorLeaf = L
+        global branchingFactorNonLeaf
+        branchingFactorNonLeaf = B
+
+        self.embeddings = embeddings
+        self.k = k
+        self.cfTree = CfTree()
+        self.predict = []
+
+    def cluster(self, vectors: list, batchSize: int = 64):
+        """Embed ``vectors``, build the CF tree and label every item.
+
+        :param vectors: items accepted by ``embeddings.embed`` that expose an
+            ``embedding`` attribute afterwards.
+        :param batchSize: batch size used while embedding.
+        :return: the CF tree; the per-item cluster ids are stored in
+            ``self.predict`` (same order as ``vectors``).
+        """
+        print("Starting BIRCH clustering with threshold: " + str(threshold))
+        self.predict = [0] * len(vectors)
+
+        # Nothing to insert: the tree would only hold its empty placeholder CF,
+        # which has no center to hand to k-means.
+        if not vectors:
+            return self.cfTree
+
+        for batch in DataLoader(vectors, batch_size=batchSize):
+            self.embeddings.embed(batch)
+
+        for idx, vector in enumerate(vectors):
+            self.cfTree.insertCf(ClusteringFeature(vector.embedding, idx=idx))
+            self.cfTree.validate()
+
+        cfs = self.cfTree.getLeafCfs()
+        cfVectors = self.cfTree.getVectorsFromCf(cfs)
+
+        kMeans = KMeans(self.k)
+        kMeans.clusterVectors(cfVectors)
+
+        # Every item inherits the k-means label of the leaf CF that absorbed it.
+        for idx, cf in enumerate(cfs):
+            for cfIndex in cf.indices:
+                self.predict[cfIndex] = kMeans.predict[idx]
+
+        return self.cfTree
diff --git a/flair/models/clustering/birch/__init__.py b/flair/models/clustering/birch/__init__.py
diff --git a/flair/models/clustering/birch/model/CfNode.py b/flair/models/clustering/birch/model/CfNode.py
@@ -0,0 +1,16 @@
+from birch.model.ClusteringFeature import ClusteringFeature
+
+
+class CfNode:
+    """Base node of the CF tree: a list of clustering features and a parent link."""
+
+    def __init__(self):
+        self.parent = None
+        self.isLeaf = False
+        self.cfs = []
+
+    def sumAllCfs(self) -> ClusteringFeature:
+        """Return a new ``ClusteringFeature`` that has absorbed every CF of this node."""
+        total = ClusteringFeature()
+        for entry in self.cfs:
+            total.absorbCf(entry)
+        return total
diff --git a/flair/models/clustering/birch/model/CfTree.py b/flair/models/clustering/birch/model/CfTree.py
@@ -0,0 +1,172 @@
+import numpy as np
+from birch.model import LeafNode
+from birch.model.CfNode import CfNode
+from birch.model.ClusteringFeature import ClusteringFeature
+from birch.model.NonLeafNode import NonLeafNode
+from distance import Distance
+
+
+class CfTree:
+    """CF tree used by BIRCH.
+
+    Non-leaf nodes route an incoming clustering feature (CF) to the closest
+    leaf; leaves hold the CFs. Leaves are also chained through ``prev`` /
+    ``next`` so that all leaf CFs can be collected without walking the tree.
+    """
+
+    def __init__(self):
+        self.root = NonLeafNode()
+        # Head of the leaf chain walked by getLeafList().
+        # NOTE(review): relies on NonLeafNode() creating an initial entry
+        # (presumably an empty leaf) -- confirm in NonLeafNode.
+        self.firstChild = self.root.entries[0]
+
+    def insertCf(self, cf: ClusteringFeature):
+        """Insert ``cf`` into the tree.
+
+        The closest CF of the closest leaf absorbs ``cf`` when it can;
+        otherwise ``cf`` becomes a new entry of that leaf, and the leaf is
+        split when it has no room left.
+        """
+        leaf = self.getClosestLeaf(cf, self.root)
+        cf_node = leaf.getClosestCF(cf)
+
+        if cf_node.canAbsorbCf(cf):
+            cf_node.absorbCf(cf)
+            self.updatePathSimple(leaf)
+            return
+        if leaf.canAddNewCf():
+            leaf.addCF(cf)
+            self.updatePathSimple(leaf)
+        else:
+            newLeaf = self.splitLeaf(leaf, cf)
+            self.updatePathWithNewLeaf(newLeaf)
+
+    def splitLeaf(self, leaf: LeafNode, cf: ClusteringFeature) -> LeafNode:
+        """Add ``cf`` to ``leaf`` and split the leaf around its two furthest CFs.
+
+        ``leaf`` keeps the CFs closer to the first seed and its summary in the
+        parent is refreshed. The remaining CFs move to a new leaf that is
+        linked into the leaf chain directly after ``leaf`` and returned. The
+        new leaf is NOT added to the parent's entries here.
+        """
+        leaf.cfs.append(cf)
+        indices = Distance.getFurthest2Points(leaf.cfs)
+        seedOld = leaf.cfs[indices[0]]
+        seedNew = leaf.cfs[indices[1]]
+        oldCf = [seedOld]
+        newCf = [seedNew]
+
+        for candidate in leaf.cfs:
+            if candidate is seedOld or candidate is seedNew:
+                continue
+            if candidate.calcualteDistance(seedOld) < candidate.calcualteDistance(seedNew):
+                oldCf.append(candidate)
+            else:
+                newCf.append(candidate)
+
+        index = leaf.parent.getChildIndex(leaf)
+        leaf.cfs = oldCf
+        leaf.parent.cfs[index] = leaf.sumAllCfs()
+
+        newLeaf = LeafNode.LeafNode(newCf, parent=leaf.parent)
+        # Splice the new leaf into the doubly linked chain right after `leaf`.
+        # The previous code pointed newLeaf.prev at itself and overwrote
+        # leaf.next, which dropped every leaf that used to follow `leaf`.
+        newLeaf.prev = leaf
+        newLeaf.next = leaf.next
+        if leaf.next is not None:
+            leaf.next.prev = newLeaf
+        leaf.next = newLeaf
+
+        return newLeaf
+
+    def updatePathSimple(self, child: LeafNode):
+        """Refresh the summary CF of ``child`` in every ancestor up to the root."""
+        parent = child.parent
+
+        while parent is not None:
+            idx = parent.getChildIndex(child)
+            parent.cfs[idx] = child.sumAllCfs()
+            child = parent
+            parent = parent.parent
+
+    def updatePathWithNewLeaf(self, newLeaf: LeafNode):
+        """Attach a freshly split leaf to its parent, splitting the parent if full."""
+        # TODO: update the whole path in a loop
+        if newLeaf.parent.canAddNode():
+            newLeaf.parent.addNode(newLeaf)
+        else:
+            self.splitNonLeafNode(newLeaf)
+
+    def splitNonLeafNode(self, node: CfNode):
+        """Split an over-full non-leaf node, creating a new root when needed.
+
+        When ``node`` has a parent, ``node`` is first appended to that parent
+        and the parent is the node that gets split; otherwise ``node`` itself
+        is split. The entries are partitioned around the two furthest summary
+        CFs; one half stays, the other moves to a new ``NonLeafNode``.
+
+        NOTE(review): in the "split again" branch the new node is not attached
+        anywhere before recursing, and the recursive call appends the parent to
+        the grandparent again -- verify this path against the intended design.
+        """
+        if node.parent is not None:
+            node.parent.addNode(node)
+            nonLeafNode = node.parent
+        else:
+            nonLeafNode = node
+
+        indices = Distance.getFurthest2Points(nonLeafNode.cfs)
+        # oldCf / newCf hold positions (not CFs) so cfs and entries stay aligned.
+        oldCf = [indices[0]]
+        newCf = [indices[1]]
+        nodeCfs = nonLeafNode.cfs
+        nodeEntries = nonLeafNode.entries
+        seedOld = nodeCfs[oldCf[0]]
+        seedNew = nodeCfs[newCf[0]]
+
+        for idx, cf in enumerate(nodeCfs):
+            if cf is seedOld or cf is seedNew:
+                continue
+            if cf.calcualteDistance(seedOld) < cf.calcualteDistance(seedNew):
+                oldCf.append(idx)
+            else:
+                newCf.append(idx)
+
+        newNode = NonLeafNode()
+        newNode.cfs = [nodeCfs[i] for i in newCf]
+        newNode.entries = [nodeEntries[i] for i in newCf]
+
+        for item in newNode.entries:
+            item.parent = newNode
+
+        nonLeafNode.cfs = [nodeCfs[i] for i in oldCf]
+        nonLeafNode.entries = [nodeEntries[i] for i in oldCf]
+
+        for item in nonLeafNode.entries:
+            item.parent = nonLeafNode
+
+        if nonLeafNode.parent is None:
+            # The root itself was split: grow the tree by one level.
+            self.root = NonLeafNode()
+            self.root.entries = []
+            self.root.cfs = []
+            self.root.addNode(nonLeafNode)
+            self.root.addNode(newNode)
+            print("new Height -> new root")
+        else:
+            if nonLeafNode.parent.canAddNode():
+                print("add Node")
+                nonLeafNode.parent.addNode(newNode)
+            else:
+                print("split again ")
+                self.splitNonLeafNode(nonLeafNode.parent)
+
+    def getClosestLeaf(self, cf: ClusteringFeature, nonLeafNode: NonLeafNode) -> LeafNode:
+        """Descend from ``nonLeafNode`` along closest children until a leaf is hit.
+
+        Returns ``None`` when a node on the way reports no closest child.
+        """
+        cfNode = nonLeafNode.getClosestChild(cf)
+        if cfNode is None:
+            return None
+
+        if cfNode.isLeaf:
+            return cfNode
+        return self.getClosestLeaf(cf, cfNode)
+
+    def validate(self):
+        """Run the (currently disabled) consistency check from the root."""
+        self.validateNode(self.root)
+
+    def validateNode(self, nonLeafNode: NonLeafNode) -> bool:
+        """Placeholder consistency check; always returns ``True`` for now."""
+        # TODO: fix
+        # for idx, node in enumerate(nonLeafNode.entries):
+        #     n = self.calculateCfs(node)
+        #     nNonLeaf = nonLeafNode.cfs[idx].N
+        #     if n != nNonLeaf:
+        #         print(False, idx)
+        #         return False
+
+        return True
+
+    def calculateCfs(self, nonLeafNode: NonLeafNode) -> int:
+        """Return the number of points stored below ``nonLeafNode``.
+
+        For a leaf this is the ``N`` of its summed CFs. For a non-leaf node the
+        counts of all children are added up, and every child whose count
+        differs from the ``N`` of its summary CF is reported via ``print``.
+        """
+        if nonLeafNode.isLeaf:
+            return nonLeafNode.sumAllCfs().N
+
+        total = 0
+        for idx, node in enumerate(nonLeafNode.entries):
+            # Recurse into the child; the previous code called validateNode
+            # here, which only ever yields True, and returned nothing.
+            n = self.calculateCfs(node)
+            nNonLeaf = nonLeafNode.cfs[idx].N
+            if n != nNonLeaf:
+                print(False, n, nNonLeaf)
+            total += n
+        return total
+
+    def getLeafList(self) -> list:
+        """Return all leaves by following ``next`` from ``self.firstChild``."""
+        current = self.firstChild
+        leafs = [current]
+        while current.next is not None:
+            print("next")
+            current = current.next
+            leafs.append(current)
+
+        return leafs
+
+    def getLeafCfs(self) -> list:
+        """Return the CFs of every leaf, in leaf-chain order."""
+        return [cf for leaf in self.getLeafList() for cf in leaf.cfs]
+
+    def getVectorsFromCf(self, cfs: list) -> list:
+        """Return the center of each CF in ``cfs``."""
+        return [cf.getCenter() for cf in cfs]
diff --git a/flair/models/clustering/birch/model/ClusteringFeature.py b/flair/models/clustering/birch/model/ClusteringFeature.py
@@ -0,0 +1,49 @@
+import torch
+from torch import Tensor
+
+from birch import Birch
+from distance import Distance
+
+
+class ClusteringFeature:
+    """BIRCH clustering feature (CF): a compact summary of a set of points.
+
+    A CF keeps the point count ``N``, the linear sum ``LS`` (element-wise sum
+    of the points), the square sum ``SS`` (element-wise sum of the squared
+    points) and ``indices``, the ids of the summarised points. An empty CF has
+    ``N == 0`` and ``LS`` / ``SS`` set to ``None``.
+    """
+
+    def __init__(self, tensor: Tensor = None, idx: int = None):
+        """Create an empty CF, or a CF holding the single point ``tensor``.
+
+        :param tensor: the point to summarise; ``None`` creates an empty CF.
+        :param idx: id recorded for that point; ``None`` records no id.
+        """
+        if tensor is None:
+            self.N = 0
+            self.LS = None
+            self.SS = None
+        else:
+            self.N = 1
+            # LS is the plain sum and SS the sum of squares; these two were
+            # swapped before, which made getCenter() return the mean of squares.
+            self.LS = tensor
+            self.SS = tensor * tensor
+        self.indices = [] if idx is None else [idx]
+
+    def absorbCf(self, cf):
+        """Merge ``cf`` into this CF (counts, sums and indices are added).
+
+        Sums are combined out of place, so tensors shared with ``cf`` or with
+        the caller (e.g. the original embedding) are never modified. Absorbing
+        an empty CF leaves the sums untouched.
+        """
+        self.N += cf.N
+        self.indices.extend(cf.indices)
+        if cf.LS is not None:
+            self.LS = cf.LS if self.LS is None else self.LS + cf.LS
+        if cf.SS is not None:
+            # Square sums add up; the previous in-place `*=` multiplied them.
+            self.SS = cf.SS if self.SS is None else self.SS + cf.SS
+
+    def getCenter(self) -> Tensor:
+        """Return the centroid ``LS / N``; only valid for a non-empty CF."""
+        return self.LS / self.N
+
+    def calcualteDistance(self, vector) -> Tensor:
+        """Return the cosine distance between this CF's center and ``vector``'s.
+
+        ``vector`` is another CF (its ``getCenter()`` is used). An empty CF
+        reports a distance just below ``Birch.distanceMax`` so that it can
+        still be selected as the closest CF when nothing else is available.
+        """
+        if self.LS is None:
+            return Tensor([Birch.distanceMax - 100])
+        else:
+            return Distance.getCosineDistance(self.getCenter(), vector.getCenter())
+
+    def canAbsorbCf(self, cf) -> bool:
+        """Return whether ``cf`` is within ``Birch.threshold`` of this CF.
+
+        An empty CF absorbs anything.
+        """
+        if self.LS is None:
+            return True
+
+        distance = Distance.getCosineDistance(self.getCenter(), cf.getCenter())
+        return distance <= Birch.threshold
diff --git a/flair/models/clustering/birch/model/LeafNode.py b/flair/models/clustering/birch/model/LeafNode.py
@@ -0,0 +1,39 @@
+from torch import Tensor
+
+import birch.Birch
+from birch.model.ClusteringFeature import ClusteringFeature
+from birch.model.CfNode import CfNode
+
+
+class LeafNode(CfNode):
+    """Leaf of the CF tree: holds clustering features and links to neighbour leaves."""
+
+    def __init__(self, initCfs: list = None, parent=None):
+        """Create a leaf holding ``initCfs``, or one empty CF when none are given."""
+        super().__init__()
+        self.cfs = [ClusteringFeature()] if initCfs is None else initCfs
+        self.isLeaf = True
+        self.parent = parent
+        # Neighbours in the leaf chain; unset until the tree links this leaf.
+        self.next = None
+        self.prev = None
+
+    def addCF(self, cf: ClusteringFeature):
+        """Append ``cf`` to this leaf's clustering features."""
+        self.cfs.append(cf)
+
+    def canAddNewCf(self):
+        """Return whether the leaf holds fewer CFs than ``branchingFactorLeaf``."""
+        return len(self.cfs) < birch.Birch.branchingFactorLeaf
+
+    def getClosestCF(self, vector: Tensor) -> ClusteringFeature:
+        """Return the CF with the smallest distance to ``vector``.
+
+        ``vector`` is handed to each CF's ``calcualteDistance``. Only CFs whose
+        distance is strictly below ``birch.Birch.distanceMax`` qualify; ties
+        keep the earlier CF, and ``None`` is returned when no CF qualifies.
+        """
+        closest = None
+        closestDistance = birch.Birch.distanceMax
+
+        for candidate in self.cfs:
+            candidateDistance = candidate.calcualteDistance(vector)
+            if candidateDistance < closestDistance:
+                closest, closestDistance = candidate, candidateDistance
+
+        return closest
+
+