Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue #692 obv dupe changes for ftd and matcher #694

Merged
merged 33 commits into from
Oct 22, 2023
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
b69bde8
1st draft, refactor to separate class
vikasgupta78 Oct 17, 2023
2eaceae
junit 1st cut
vikasgupta78 Oct 17, 2023
394398d
null check and better junits
vikasgupta78 Oct 17, 2023
232a09c
log level change issue #649
vikasgupta78 Oct 18, 2023
8578d0a
make constructor concise
vikasgupta78 Oct 18, 2023
1ada4d4
obv dupe filter
vikasgupta78 Oct 18, 2023
9a3738d
refactoring
vikasgupta78 Oct 18, 2023
00cb66c
refactor SparkFrame
vikasgupta78 Oct 18, 2023
57a7c90
refactoring obv dupe filter from ZFrame
vikasgupta78 Oct 18, 2023
273cfd2
cleanup
vikasgupta78 Oct 18, 2023
36f68c3
message if all sample is obv dupe
vikasgupta78 Oct 18, 2023
f7a9431
merging latest from 0.4.0
vikasgupta78 Oct 18, 2023
b53e876
merge from 0.4.0 , resolved conflict
vikasgupta78 Oct 18, 2023
70f1c33
review comments from PR #694
vikasgupta78 Oct 19, 2023
5936ace
junits improvement
vikasgupta78 Oct 19, 2023
c2ddce9
Use JSON instead of String for obv dupe condition
vikasgupta78 Oct 20, 2023
ca66e6e
test for obv dupe cond in args
vikasgupta78 Oct 20, 2023
8603114
refactor package
vikasgupta78 Oct 20, 2023
52ff3ce
refactor
vikasgupta78 Oct 20, 2023
f3480e1
refactor
vikasgupta78 Oct 20, 2023
a3d77a6
refactor
vikasgupta78 Oct 20, 2023
a8815db
refactor , make more readable/modular
vikasgupta78 Oct 20, 2023
38e4fb7
renamed
vikasgupta78 Oct 20, 2023
57a9fd7
renamed
vikasgupta78 Oct 20, 2023
0af67f4
refactor : separate out test cases for filter and util
vikasgupta78 Oct 20, 2023
2bf6297
null check
vikasgupta78 Oct 20, 2023
b29758c
changed from passing context to dsutil and changed number of generic …
vikasgupta78 Oct 21, 2023
45e9205
moved massageObvDupes method
vikasgupta78 Oct 21, 2023
d51d92a
renamed
vikasgupta78 Oct 21, 2023
f94b557
renamed junits
vikasgupta78 Oct 21, 2023
6db0092
renamed obv dupe classes
vikasgupta78 Oct 21, 2023
eb6b42f
removed redundant null check
vikasgupta78 Oct 21, 2023
7d55b2e
comment updated
vikasgupta78 Oct 22, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions common/client/src/main/java/zingg/common/client/Arguments.java
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ public class Arguments implements Serializable {
float stopWordsCutoff = 0.1f;
long blockSize = 100L;
String column;
String obviousDupeCondition;
ObviousDupes[] obviousDupes;


public void setThreshold(double threshold) {
Expand Down Expand Up @@ -478,12 +478,12 @@ public void setColumn(String column) {
this.column = column;
}

public String getObviousDupeCondition() {
return obviousDupeCondition;
/**
 * Returns the obvious-dupe definitions currently held by this object.
 *
 * @return the {@code obviousDupes} array exactly as stored; no copy is made
 */
public ObviousDupes[] getObviousDupes() {
return obviousDupes;
}

public void setObviousDupeCondition(String obviousDupeCondition) {
this.obviousDupeCondition = obviousDupeCondition;
/**
 * Replaces the obvious-dupe definitions held by this object.
 *
 * @param obviousDupes the array to store; the reference is kept as-is, no copy is made
 */
public void setObviousDupes(ObviousDupes[] obviousDupes) {
this.obviousDupes = obviousDupes;
}

public long getBlockSize() {
Expand Down
40 changes: 40 additions & 0 deletions common/client/src/main/java/zingg/common/client/ObviousDupes.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package zingg.common.client;

import java.io.Serializable;
import java.util.Arrays;
import java.util.HashMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Serializable holder for one "obvious dupe" definition.
 *
 * <p>A definition is an array of maps; each map carries string properties of
 * one condition entry, with the field name stored under the key
 * {@link #fieldName}.
 */
public class ObviousDupes implements Serializable {

    private static final long serialVersionUID = 1L;

    public static final Log LOG = LogFactory.getLog(ObviousDupes.class);

    /** Map key under which a condition entry stores the name of its field. */
    public static final String fieldName = "fieldName";

    /** Condition entries of this definition; stays {@code null} until assigned. */
    HashMap<String,String>[] matchCondition;

    /** Creates an empty definition whose match condition is not yet set. */
    public ObviousDupes() {
        // nothing to initialise
    }

    /**
     * Creates a definition backed by the given condition entries.
     *
     * @param matchCondition the entries to hold; stored by reference
     */
    public ObviousDupes(HashMap<String, String>[] matchCondition) {
        this.matchCondition = matchCondition;
    }

    /**
     * @return the condition entries exactly as stored (no copy is made)
     */
    public HashMap<String, String>[] getMatchCondition() {
        return this.matchCondition;
    }

    /**
     * @param matchCondition the condition entries to hold; stored by reference
     */
    public void setMatchCondition(HashMap<String, String>[] matchCondition) {
        this.matchCondition = matchCondition;
    }

    /**
     * Renders the condition entries via {@link Arrays#toString(Object[])}.
     */
    @Override
    public String toString() {
        return Arrays.toString(this.matchCondition);
    }

}
28 changes: 12 additions & 16 deletions common/client/src/main/java/zingg/common/client/ZFrame.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import java.util.List;


//Dataset, Row, column
public interface ZFrame<D, R, C> {

Expand All @@ -11,11 +10,7 @@ public interface ZFrame<D, R, C> {

public static final String COL_COUNT = "count";
public static final String COL_VALUE = "VALUE";

public static final String orSeperator = "\\|";
public static final String andSeperator = "\\&";



public ZFrame<D, R, C> cache();
public ZFrame<D, R, C> as(String s);
public String[] columns();
Expand Down Expand Up @@ -99,7 +94,9 @@ public interface ZFrame<D, R, C> {
public C gt(String c, double val);

public C equalTo(String c, String e);


public C equalTo(C column1, C column2);

public C notEqual(String c, String e);

public C notEqual(String e);
Expand All @@ -110,6 +107,13 @@ public interface ZFrame<D, R, C> {

public C notEqual(String c, int e);

public C not(C col);

public C isNotNull(C col);

public C and(C col1, C col2);

public C or(C col1, C col2);

public void show(int num);
public void show();
Expand Down Expand Up @@ -160,13 +164,5 @@ public interface ZFrame<D, R, C> {
public ZFrame<D, R, C> filterNotNullCond(String colName);

public ZFrame<D, R, C> filterNullCond(String colName);

public C getObviousDupesFilter(String obviousDupeString, C extraAndCond);

public C getObviousDupesFilter(ZFrame<D, R, C> dfToJoin, String obviousDupeString, C extraAndCond);

public C getReverseObviousDupesFilter(String obviousDupeString, C extraAndCond);

public C getReverseObviousDupesFilter(ZFrame<D, R, C> dfToJoin, String obviousDupeString, C extraAndCond);


}
22 changes: 22 additions & 0 deletions common/client/src/test/java/zingg/common/client/TestArguments.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -241,4 +242,25 @@ public void testMatchTypeWrong() {

}


/**
 * Checks that the {@code obviousDupes} section of
 * {@code testArguments/configObvDupe.json} is read into {@link ObviousDupes}
 * entries with the expected number of rules, number of conditions per rule,
 * and field names.
 */
@Test
public void testObvDupe() {
    try {
        Arguments args = argsUtil.createArgumentsFromJSON(
                getClass().getResource("../../../testArguments/configObvDupe.json").getFile(), "test");

        ObviousDupes[] obviousDupes = args.getObviousDupes();
        // The fixture declares two rules: (fname, stNo, add1) and (recId).
        assertEquals(2, obviousDupes.length);

        HashMap<String,String>[] firstRule = obviousDupes[0].getMatchCondition();
        assertEquals(3, firstRule.length);
        assertEquals("fname", firstRule[0].get(ObviousDupes.fieldName));
        assertEquals("stNo", firstRule[1].get(ObviousDupes.fieldName));
        assertEquals("add1", firstRule[2].get(ObviousDupes.fieldName));

        HashMap<String,String>[] secondRule = obviousDupes[1].getMatchCondition();
        assertEquals(1, secondRule.length);
        assertEquals("recId", secondRule[0].get(ObviousDupes.fieldName));

    } catch (Exception | ZinggClientException e) {
        // Keep the stack trace visible in the test output and surface the
        // cause in the failure message instead of a bare "could not read".
        e.printStackTrace();
        fail("Could not read config: " + e.getMessage());
    }

}


}
117 changes: 117 additions & 0 deletions common/client/src/test/resources/testArguments/configObvDupe.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
{
"fieldDefinition":[
{
"fieldName" : "recId",
"matchType" : "dont_use",
"fields" : "recId",
"dataType": "string"
},
{
"fieldName" : "fname",
"matchType" : "fuzzy",
"fields" : "fname",
"dataType": "string"
},
{
"fieldName" : "lname",
"matchType" : "fuzzy",
"fields" : "lname",
"dataType": "string"
},
{
"fieldName" : "stNo",
"matchType": "fuzzy",
"fields" : "stNo",
"dataType": "string"
},
{
"fieldName" : "add1",
"matchType": "fuzzy",
"fields" : "add1",
"dataType": "string"
},
{
"fieldName" : "add2",
"matchType": "fuzzy",
"fields" : "add2",
"dataType": "string"
},
{
"fieldName" : "city",
"matchType": "fuzzy",
"fields" : "city",
"dataType": "string"
},
{
"fieldName" : "areacode",
"matchType": "fuzzy",
"fields" : "areacode",
"dataType": "string"
},
{
"fieldName" : "state",
"matchType": "fuzzy",
"fields" : "state",
"dataType": "string"
},
{
"fieldName" : "dob",
"matchType": "fuzzy",
"fields" : "dob",
"dataType": "string"
},
{
"fieldName" : "ssn",
"matchType": "fuzzy",
"fields" : "ssn",
"dataType": "string"
}
],
"output" : [{
"name":"output",
"format":"csv",
"props": {
"location": "/tmp/zinggOutput",
"delimiter": ",",
"header":true
}
}],
"data" : [{
"name":"test",
"format":"csv",
"props": {
"location": "examples/febrl/test.csv",
"delimiter": ",",
"header":false
},
"schema": "recId string, fname string, lname string, stNo string, add1 string, add2 string, city string, state string, areacode string, dob string, ssn string"
}
],
"obviousDupes":[
{
"matchCondition":[
{
"fieldName":"fname"
},
{
"fieldName":"stNo"
},
{
"fieldName":"add1"
}
]
},
{
"matchCondition":[
{
"fieldName":"recId"
}
]
}
],
"labelDataSampleSize" : 0.5,
"numPartitions":4,
"modelId": 100,
"zinggDir": "models"

}
Loading