-
Notifications
You must be signed in to change notification settings - Fork 120
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Exception handling in PipeUtil::read() #229
Changes from all commits
fe052df
4619542
191f810
086a29f
9ea14e4
b4d292a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,5 @@ | ||
package zingg; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
import java.util.Scanner; | ||
|
||
|
@@ -51,7 +49,7 @@ public Dataset<Row> getUnmarkedRecords() throws ZinggClientException { | |
unmarkedRecords = PipeUtil.read(spark, false, false, PipeUtil.getTrainingDataUnmarkedPipe(args)); | ||
try { | ||
markedRecords = PipeUtil.read(spark, false, false, PipeUtil.getTrainingDataMarkedPipe(args)); | ||
} catch (Exception e) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Don't we still need to catch the other exceptions? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. read() throws only ZinggClientException. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, ZCE extends from Throwable. |
||
} catch (ZinggClientException e) { | ||
LOG.warn("No record has been marked yet"); | ||
} | ||
if (markedRecords != null ) { | ||
|
@@ -60,7 +58,7 @@ public Dataset<Row> getUnmarkedRecords() throws ZinggClientException { | |
"left_anti"); | ||
getMarkedRecordsStat(markedRecords); | ||
} | ||
} catch (Exception e) { | ||
} catch (ZinggClientException e) { | ||
LOG.warn("No unmarked record for labelling"); | ||
} | ||
return unmarkedRecords; | ||
|
@@ -75,61 +73,59 @@ protected void getMarkedRecordsStat(Dataset<Row> markedRecords) { | |
|
||
public void processRecordsCli(Dataset<Row> lines) throws ZinggClientException { | ||
LOG.info("Processing Records for CLI Labelling"); | ||
printMarkedRecordsStat(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. See the earlier comment about returns. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Removed the return statement. |
||
if (lines == null || lines.count() == 0) { | ||
LOG.info("It seems there are no unmarked records at this moment. Please run findTrainingData job to build some pairs to be labelled and then run this labeler."); | ||
return; | ||
} | ||
|
||
lines = lines.cache(); | ||
List<Column> displayCols = DSUtil.getFieldDefColumns(lines, args, false, args.getShowConcise()); | ||
if (lines != null && lines.count() > 0) { | ||
printMarkedRecordsStat(); | ||
|
||
List<Row> clusterIDs = lines.select(ColName.CLUSTER_COLUMN).distinct().collectAsList(); | ||
try { | ||
double score; | ||
double prediction; | ||
Dataset<Row> updatedRecords = null; | ||
int selected_option = -1; | ||
String msg1, msg2; | ||
int totalPairs = clusterIDs.size(); | ||
|
||
for (int index = 0; index < totalPairs; index++){ | ||
Dataset<Row> currentPair = lines.filter(lines.col(ColName.CLUSTER_COLUMN).equalTo( | ||
clusterIDs.get(index).getAs(ColName.CLUSTER_COLUMN))).cache(); | ||
|
||
score = currentPair.head().getAs(ColName.SCORE_COL); | ||
prediction = currentPair.head().getAs(ColName.PREDICTION_COL); | ||
|
||
msg1 = String.format("\tCurrent labelling round : %d/%d pairs labelled\n", index, totalPairs); | ||
String matchType = LabelMatchType.get(prediction).msg; | ||
if (prediction == ColValues.IS_NOT_KNOWN_PREDICTION) { | ||
msg2 = String.format( | ||
"\tZingg does not do any prediction for the above pairs as Zingg is still collecting training data to build the preliminary models."); | ||
} else { | ||
msg2 = String.format("\tZingg predicts the above records %s with a similarity score of %.2f", | ||
matchType, Math.floor(score * 100) * 0.01); | ||
lines = lines.cache(); | ||
List<Column> displayCols = DSUtil.getFieldDefColumns(lines, args, false, args.getShowConcise()); | ||
List<Row> clusterIDs = lines.select(ColName.CLUSTER_COLUMN).distinct().collectAsList(); | ||
try { | ||
double score; | ||
double prediction; | ||
Dataset<Row> updatedRecords = null; | ||
int selected_option = -1; | ||
String msg1, msg2; | ||
int totalPairs = clusterIDs.size(); | ||
|
||
for (int index = 0; index < totalPairs; index++) { | ||
Dataset<Row> currentPair = lines.filter(lines.col(ColName.CLUSTER_COLUMN).equalTo( | ||
clusterIDs.get(index).getAs(ColName.CLUSTER_COLUMN))).cache(); | ||
|
||
score = currentPair.head().getAs(ColName.SCORE_COL); | ||
prediction = currentPair.head().getAs(ColName.PREDICTION_COL); | ||
|
||
msg1 = String.format("\tCurrent labelling round : %d/%d pairs labelled\n", index, totalPairs); | ||
String matchType = LabelMatchType.get(prediction).msg; | ||
if (prediction == ColValues.IS_NOT_KNOWN_PREDICTION) { | ||
msg2 = String.format( | ||
"\tZingg does not do any prediction for the above pairs as Zingg is still collecting training data to build the preliminary models."); | ||
} else { | ||
msg2 = String.format("\tZingg predicts the above records %s with a similarity score of %.2f", | ||
matchType, Math.floor(score * 100) * 0.01); | ||
} | ||
//String msgHeader = msg1 + msg2; | ||
|
||
selected_option = displayRecordsAndGetUserInput(DSUtil.select(currentPair, displayCols), msg1, msg2); | ||
updateLabellerStat(selected_option, 1); | ||
printMarkedRecordsStat(); | ||
if (selected_option == 9) { | ||
LOG.info("User has quit in the middle. Updating the records."); | ||
break; | ||
} | ||
updatedRecords = updateRecords(selected_option, currentPair, updatedRecords); | ||
} | ||
//String msgHeader = msg1 + msg2; | ||
|
||
selected_option = displayRecordsAndGetUserInput(DSUtil.select(currentPair, displayCols), msg1, msg2); | ||
updateLabellerStat(selected_option, 1); | ||
printMarkedRecordsStat(); | ||
if (selected_option == 9) { | ||
LOG.info("User has quit in the middle. Updating the records."); | ||
break; | ||
writeLabelledOutput(updatedRecords); | ||
LOG.warn("Processing finished."); | ||
} catch (Exception e) { | ||
if (LOG.isDebugEnabled()) { | ||
e.printStackTrace(); | ||
} | ||
updatedRecords = updateRecords(selected_option, currentPair, updatedRecords); | ||
} | ||
writeLabelledOutput(updatedRecords); | ||
LOG.warn("Processing finished."); | ||
} catch (Exception e) { | ||
if (LOG.isDebugEnabled()) { | ||
e.printStackTrace(); | ||
LOG.warn("Labelling error has occured " + e.getMessage()); | ||
throw new ZinggClientException("An error has occured while Labelling.", e); | ||
} | ||
LOG.warn("Labelling error has occured " + e.getMessage()); | ||
throw new ZinggClientException(e.getMessage()); | ||
} else { | ||
LOG.info("It seems there are no unmarked records at this moment. Please run findTrainingData job to build some pairs to be labelled and then run this labeler."); | ||
} | ||
return; | ||
} | ||
|
||
|
||
|
@@ -203,7 +199,7 @@ protected void printMarkedRecordsStat() { | |
System.out.println(msg); | ||
} | ||
|
||
protected void writeLabelledOutput(Dataset<Row> records) { | ||
protected void writeLabelledOutput(Dataset<Row> records) throws ZinggClientException { | ||
if (records == null) { | ||
LOG.warn("No records to be labelled."); | ||
return; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,11 +41,12 @@ public Matcher() { | |
setZinggOptions(ZinggOptions.MATCH); | ||
} | ||
|
||
protected Dataset<Row> getTestData() { | ||
return PipeUtil.read(spark, true, args.getNumPartitions(), true, args.getData()); | ||
protected Dataset<Row> getTestData() throws ZinggClientException{ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why this change? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To satisfy the requirement of catching or throwing the ZinggClientException. All the intermediate functions in the call chain are required to either catch it or declare it. |
||
Dataset<Row> data = PipeUtil.read(spark, true, args.getNumPartitions(), true, args.getData()); | ||
return data; | ||
} | ||
|
||
protected Dataset<Row> getBlocked(Dataset<Row> testData) throws Exception{ | ||
protected Dataset<Row> getBlocked(Dataset<Row> testData) throws Exception, ZinggClientException{ | ||
LOG.debug("Blocking model file location is " + args.getBlockFile()); | ||
Tree<Canopy> tree = BlockingTreeUtil.readBlockingTree(spark, args); | ||
Dataset<Row> blocked = testData.map(new Block.BlockFunction(tree), RowEncoder.apply(Block.appendHashCol(testData.schema()))); | ||
|
@@ -159,7 +160,7 @@ public void execute() throws ZinggClientException { | |
} | ||
} | ||
|
||
public void writeOutput(Dataset<Row> blocked, Dataset<Row> dupesActual) { | ||
public void writeOutput(Dataset<Row> blocked, Dataset<Row> dupesActual) throws ZinggClientException { | ||
try{ | ||
//input dupes are pairs | ||
///pick ones according to the threshold by user | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We still need to print this out in the else branch, no?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes. Made changes in Labeller and UpdateLabeller.