From b1e1d08ccc5a42a48b1a15a339eb412aca2a44a2 Mon Sep 17 00:00:00 2001 From: Navin Singh Date: Fri, 1 Jul 2022 01:57:59 +0530 Subject: [PATCH 1/2] getUnmarkedRecords() - updated to the version with correct functionality and fixed its name --- client/src/main/java/zingg/client/Client.java | 4 ++-- client/src/main/java/zingg/client/IZingg.java | 2 +- core/src/main/java/zingg/Labeller.java | 24 ++++--------------- core/src/main/java/zingg/ZinggBase.java | 14 ++++++++--- python/phases/assessModel.py | 2 +- python/zingg/zingg.py | 4 ++-- 6 files changed, 22 insertions(+), 28 deletions(-) diff --git a/client/src/main/java/zingg/client/Client.java b/client/src/main/java/zingg/client/Client.java index 5d124a41c..c8c4fcd0d 100644 --- a/client/src/main/java/zingg/client/Client.java +++ b/client/src/main/java/zingg/client/Client.java @@ -253,8 +253,8 @@ public Dataset getMarkedRecords() { return zingg.getMarkedRecords(); } - public Dataset getUnMarkedRecords() { - return zingg.getUnMarkedRecords(); + public Dataset getUnmarkedRecords() { + return zingg.getUnmarkedRecords(); } } \ No newline at end of file diff --git a/client/src/main/java/zingg/client/IZingg.java b/client/src/main/java/zingg/client/IZingg.java index 5f4c30e3f..8931ac57c 100644 --- a/client/src/main/java/zingg/client/IZingg.java +++ b/client/src/main/java/zingg/client/IZingg.java @@ -22,7 +22,7 @@ public void init(Arguments args, String license) public Dataset getMarkedRecords(); - public Dataset getUnMarkedRecords(); + public Dataset getUnmarkedRecords(); public Long getMarkedRecordsStat(Dataset markedRecords, long value); diff --git a/core/src/main/java/zingg/Labeller.java b/core/src/main/java/zingg/Labeller.java index 3e9daee08..eff8b0751 100644 --- a/core/src/main/java/zingg/Labeller.java +++ b/core/src/main/java/zingg/Labeller.java @@ -33,6 +33,7 @@ public Labeller() { public void execute() throws ZinggClientException { try { LOG.info("Reading inputs for labelling phase ..."); + initLabellerStat(); Dataset unmarkedRecords = getUnmarkedRecords(); processRecordsCli(unmarkedRecords); LOG.info("Finished labelling phase"); @@ -42,28 +43,13 @@ public void execute() throws ZinggClientException { } } - public Dataset getUnmarkedRecords() throws ZinggClientException { - Dataset unmarkedRecords = null; - Dataset markedRecords = null; - try { - unmarkedRecords = PipeUtil.read(spark, false, false, PipeUtil.getTrainingDataUnmarkedPipe(args)); - markedRecords = getMarkedRecords(); - if (markedRecords != null ) { - unmarkedRecords = unmarkedRecords.join(markedRecords, - unmarkedRecords.col(ColName.CLUSTER_COLUMN).equalTo(markedRecords.col(ColName.CLUSTER_COLUMN)), - "left_anti"); - getMarkedRecordsStat(markedRecords); - } - } catch (ZinggClientException e) { - LOG.warn("No unmarked record for labelling"); + public void initLabellerStat() { + Dataset markedRecords = getMarkedRecords(); + if (markedRecords != null ) { + getMarkedRecordsStat(markedRecords); } - return unmarkedRecords; } - - - - protected void getMarkedRecordsStat(Dataset markedRecords) { positivePairsCount = getMatchedMarkedRecordsStat(markedRecords); negativePairsCount = getUnmatchedMarkedRecordsStat(markedRecords); diff --git a/core/src/main/java/zingg/ZinggBase.java b/core/src/main/java/zingg/ZinggBase.java index c5dc795d8..e3c871982 100644 --- a/core/src/main/java/zingg/ZinggBase.java +++ b/core/src/main/java/zingg/ZinggBase.java @@ -191,13 +191,21 @@ public Dataset getMarkedRecords() { return null; } - public Dataset getUnMarkedRecords() { + public Dataset getUnmarkedRecords() { + Dataset unmarkedRecords = null; + Dataset markedRecords = null; try { - return PipeUtil.read(spark, false, false, PipeUtil.getTrainingDataUnmarkedPipe(args)); + unmarkedRecords = PipeUtil.read(spark, false, false, PipeUtil.getTrainingDataUnmarkedPipe(args)); + markedRecords = getMarkedRecords(); + if (markedRecords != null ) { + unmarkedRecords = unmarkedRecords.join(markedRecords, + unmarkedRecords.col(ColName.CLUSTER_COLUMN).equalTo(markedRecords.col(ColName.CLUSTER_COLUMN)), + "left_anti"); + } } catch (ZinggClientException e) { LOG.warn("No unmarked record"); } - return null; + return unmarkedRecords; } public Long getMarkedRecordsStat(Dataset markedRecords, long value) { diff --git a/python/phases/assessModel.py b/python/phases/assessModel.py index 2a5c94e16..90f5191c8 100644 --- a/python/phases/assessModel.py +++ b/python/phases/assessModel.py @@ -19,7 +19,7 @@ def main(): client.initAndExecute() pMarkedDF = client.getPandasDfFromDs(client.getMarkedRecords()) - pUnMarkedDF = client.getPandasDfFromDs(client.getUnMarkedRecords()) + pUnMarkedDF = client.getPandasDfFromDs(client.getUnmarkedRecords()) total_marked = pMarkedDF.shape[0] total_unmarked = pUnMarkedDF.shape[0] diff --git a/python/zingg/zingg.py b/python/zingg/zingg.py index 47a2af097..e68a3e550 100644 --- a/python/zingg/zingg.py +++ b/python/zingg/zingg.py @@ -32,8 +32,8 @@ def initAndExecute(self): self.client.execute() def getMarkedRecords(self): return self.client.getMarkedRecords() - def getUnMarkedRecords(self): - return self.client.getUnMarkedRecords() + def getUnmarkedRecords(self): + return self.client.getUnmarkedRecords() def setArguments(self, args): self.client.setArguments() def getArguments(self): From 42cde4a46beabe5a9315b92335b183aa1dc14e53 Mon Sep 17 00:00:00 2001 From: Navin Singh Date: Tue, 5 Jul 2022 13:13:43 +0530 Subject: [PATCH 2/2] reorg code and removed initLabellerStat() --- core/src/main/java/zingg/Labeller.java | 17 ++++++----------- python/phases/assessModel.py | 6 +++--- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/core/src/main/java/zingg/Labeller.java b/core/src/main/java/zingg/Labeller.java index eff8b0751..b35ff66af 100644 --- a/core/src/main/java/zingg/Labeller.java +++ b/core/src/main/java/zingg/Labeller.java @@ -33,7 +33,7 @@ public Labeller() { public void execute() throws ZinggClientException { try { LOG.info("Reading inputs for labelling phase ..."); - initLabellerStat(); + getMarkedRecordsStat(getMarkedRecords()); Dataset unmarkedRecords = getUnmarkedRecords(); processRecordsCli(unmarkedRecords); LOG.info("Finished labelling phase"); @@ -43,20 +43,15 @@ public void execute() throws ZinggClientException { } } - public void initLabellerStat() { - Dataset markedRecords = getMarkedRecords(); + protected void getMarkedRecordsStat(Dataset markedRecords) { if (markedRecords != null ) { - getMarkedRecordsStat(markedRecords); + positivePairsCount = getMatchedMarkedRecordsStat(markedRecords); + negativePairsCount = getUnmatchedMarkedRecordsStat(markedRecords); + notSurePairsCount = getUnsureMarkedRecordsStat(markedRecords); + totalCount = markedRecords.count() / 2; } } - protected void getMarkedRecordsStat(Dataset markedRecords) { - positivePairsCount = getMatchedMarkedRecordsStat(markedRecords); - negativePairsCount = getUnmatchedMarkedRecordsStat(markedRecords); - notSurePairsCount = getUnsureMarkedRecordsStat(markedRecords); - totalCount = markedRecords.count() / 2; - } - public void processRecordsCli(Dataset lines) throws ZinggClientException { LOG.info("Processing Records for CLI Labelling"); if (lines != null && lines.count() > 0) { diff --git a/python/phases/assessModel.py b/python/phases/assessModel.py index 90f5191c8..a853ffb13 100644 --- a/python/phases/assessModel.py +++ b/python/phases/assessModel.py @@ -19,17 +19,17 @@ def main(): client.initAndExecute() pMarkedDF = client.getPandasDfFromDs(client.getMarkedRecords()) - pUnMarkedDF = client.getPandasDfFromDs(client.getUnmarkedRecords()) + pUnmarkedDF = client.getPandasDfFromDs(client.getUnmarkedRecords()) total_marked = pMarkedDF.shape[0] - total_unmarked = pUnMarkedDF.shape[0] + total_unmarked = pUnmarkedDF.shape[0] matched_marked = client.getMatchedMarkedRecordsStat() unmatched_marked = client.getUnmatchedMarkedRecordsStat() unsure_marked = client.getUnsureMarkedRecordsStat() LOG.info("") LOG.info("No. of Records Marked : %d", total_marked) - LOG.info("No. of Records UnMarked : %d", total_unmarked) + LOG.info("No. of Records Unmarked : %d", total_unmarked) LOG.info("No. of Matches : %d", matched_marked) LOG.info("No. of Non-Matches : %d", unmatched_marked) LOG.info("No. of Not Sure : %d", unsure_marked)