From 5037bec0b90dbb89d4f9f877b6a897d4dfbe2bdc Mon Sep 17 00:00:00 2001
From: Navin Singh
Date: Wed, 23 Mar 2022 00:02:10 +0530
Subject: [PATCH 1/2] generate stop words and save into csv files
---
core/src/main/java/zingg/Documenter.java | 155 ++++++++++++++--------
core/src/main/java/zingg/util/DSUtil.java | 2 +-
core/src/main/resources/stopWords.ftlh | 3 +
3 files changed, 106 insertions(+), 54 deletions(-)
create mode 100644 core/src/main/resources/stopWords.ftlh
diff --git a/core/src/main/java/zingg/Documenter.java b/core/src/main/java/zingg/Documenter.java
index f91ee6471..643bda948 100644
--- a/core/src/main/java/zingg/Documenter.java
+++ b/core/src/main/java/zingg/Documenter.java
@@ -1,39 +1,42 @@
package zingg;
-import java.util.ArrayList;
-import java.util.Arrays;
+import static org.apache.spark.sql.functions.desc;
+import static org.apache.spark.sql.functions.explode;
+import static org.apache.spark.sql.functions.split;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.Writer;
+import java.util.HashMap;
import java.util.List;
-import java.util.Scanner;
+import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.functions;
+import freemarker.template.Configuration;
+import freemarker.template.Template;
+import freemarker.template.TemplateExceptionHandler;
+import zingg.client.FieldDefinition;
+import zingg.client.MatchType;
import zingg.client.ZinggClientException;
import zingg.client.ZinggOptions;
-import zingg.client.pipe.Pipe;
import zingg.client.util.ColName;
-import zingg.client.util.ColValues;
import zingg.util.DSUtil;
import zingg.util.PipeUtil;
-import zingg.util.RowAdapter;
import zingg.util.RowWrapper;
-import freemarker.ext.rhino.RhinoWrapper;
-import freemarker.template.*;
-import java.util.*;
-import java.io.*;
public class Documenter extends ZinggBase {
protected static String name = "zingg.Documenter";
public static final Log LOG = LogFactory.getLog(Documenter.class);
+ public static Configuration config;
public Documenter() {
setZinggOptions(ZinggOptions.GENERATE_DOCS);
+ config = createConfigurationObject();
}
public void execute() throws ZinggClientException {
@@ -52,6 +55,7 @@ public void execute() throws ZinggClientException {
root.put("columns", markedRecords.columns());
root.put("fieldDefinitionCount", args.getFieldDefinition().size());
buildAndWriteHTML(root);
+ generateStopWords();
} catch (Exception e) {
e.printStackTrace();
throw new ZinggClientException(e.getMessage());
@@ -60,48 +64,93 @@ public void execute() throws ZinggClientException {
public void buildAndWriteHTML(Map root) throws Exception {
- /* ------------------------------------------------------------------------ */
- /* You should do this ONLY ONCE in the whole application life-cycle: */
+ Configuration cfg = getTemplateConfig();
- /* Create and adjust the configuration singleton */
- Configuration cfg = new Configuration(Configuration.VERSION_2_3_29);
+ /* Get the template (uses cache internally) */
+ Template temp = cfg.getTemplate("model.ftlh");
+
+ /* Merge data-model with template */
+ // Writer out = new OutputStreamWriter(System.out);
+ Writer file = new FileWriter(new File(args.getZinggDocFile()));
+ // StringWriter writer = new StringWriter();
+ temp.process(root, file);
+ // Note: Depending on what `out` is, you may need to call `out.close()`.
+ // This is usually the case for file output, but not for servlet output.
+ // file.flush();
+
+ // List textList = Collections.singletonList(writer.toString());
+
+ // Dataset data = spark.createDataset(textList, Encoders.STRING()).toDF();
+
+ // PipeUtil.write(data, args, ctx, PipeUtil.getModelDocumentationPipe(args));
+ file.close();
+ // LOG.warn("written documentation at " + args.getZinggDocFile());
+ }
+
+ public Configuration getTemplateConfig() {
+ if (config == null) {
+ config = createConfigurationObject();
+ }
+ return config;
+ }
+
+ private Configuration createConfigurationObject() {
+ /* ------------------------------------------------------------------------ */
+ /* You should do this ONLY ONCE in the whole application life-cycle: */
+
+ /* Create and adjust the configuration singleton */
+ Configuration cfg = new Configuration(Configuration.VERSION_2_3_29);
cfg.setClassForTemplateLoading(this.getClass(), "/");
- // cfg.setDirectoryForTemplateLoading(new File("/where/you/store/templates"));
- // Recommended settings for new projects:
- cfg.setDefaultEncoding("UTF-8");
- cfg.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER);
- cfg.setLogTemplateExceptions(false);
- cfg.setWrapUncheckedExceptions(true);
- cfg.setFallbackOnNullLoopVariable(false);
+ // cfg.setDirectoryForTemplateLoading(new File("/where/you/store/templates"));
+ // Recommended settings for new projects:
+ cfg.setDefaultEncoding("UTF-8");
+ cfg.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER);
+ cfg.setLogTemplateExceptions(false);
+ cfg.setWrapUncheckedExceptions(true);
+ cfg.setFallbackOnNullLoopVariable(false);
cfg.setObjectWrapper(new RowWrapper(cfg.getIncompatibleImprovements()));
- /* ------------------------------------------------------------------------ */
- /* You usually do these for MULTIPLE TIMES in the application life-cycle: */
-
-
-
- /* Get the template (uses cache internally) */
- Template temp = cfg.getTemplate("model.ftlh");
-
- /* Merge data-model with template */
- // Writer out = new OutputStreamWriter(System.out);
- Writer file = new FileWriter (new File(args.getZinggDocFile()));
- //StringWriter writer = new StringWriter();
- temp.process(root, file);
- // Note: Depending on what `out` is, you may need to call `out.close()`.
- // This is usually the case for file output, but not for servlet output.
- //file.flush();
-
- //List textList = Collections.singletonList(writer.toString());
-
- //Dataset data = spark.createDataset(textList, Encoders.STRING()).toDF();
-
- //PipeUtil.write(data, args, ctx, PipeUtil.getModelDocumentationPipe(args));
- file.close();
- //LOG.warn("written documentation at " + args.getZinggDocFile());
- }
-
-
-
-
+ /* ------------------------------------------------------------------------ */
+ /* You usually do these for MULTIPLE TIMES in the application life-cycle: */
+ return cfg;
+ }
+
+ private void generateStopWords() throws ZinggClientException {
+ LOG.info("Stop words generation starts");
+ Dataset data = PipeUtil.read(spark, false, false, args.getData());
+ LOG.warn("Read input data : " + data.count());
+
+ List fields = DSUtil.getFieldDefinitionFiltered(args, MatchType.DONT_USE);
+ for (FieldDefinition field : fields) {
+ generateStopWordsAndWriteCSV(data, field);
+ }
+ LOG.info("Stop words generation finishes");
+ }
+
+ private void generateStopWordsAndWriteCSV(Dataset data, FieldDefinition field) throws ZinggClientException {
+ LOG.debug("Field: " + field.fieldName);
+ data = data.select(split(data.col(field.fieldName), "\\s+").as("split"));
+ data = data.select(explode(data.col("split")).as("word"));
+ data = data.filter(data.col("word").notEqual(""));
+ data = data.groupBy("word").count().orderBy(desc("count"));
+ String filename = "/home/navin/workDir/zingg-1/" + field.fieldName + ".csv";
+ csvWriter(data, filename);
+ }
+
+ public void csvWriter(Dataset records, String fileName) throws ZinggClientException {
+ try {
+ Configuration cfg = getTemplateConfig();
+ cfg.setObjectWrapper(new RowWrapper(cfg.getIncompatibleImprovements()));
+
+ Template temp = cfg.getTemplate("stopWords.ftlh");
+ Writer file = new FileWriter(new File(fileName));
+ Map root = new HashMap();
+ root.put("stopWords", records.collectAsList());
+ temp.process(root, file);
+ file.close();
+ } catch (Exception e) {
+ e.printStackTrace();
+ throw new ZinggClientException(e.getMessage());
+ }
+ }
}
diff --git a/core/src/main/java/zingg/util/DSUtil.java b/core/src/main/java/zingg/util/DSUtil.java
index d66ba8d23..89c7dce6c 100644
--- a/core/src/main/java/zingg/util/DSUtil.java
+++ b/core/src/main/java/zingg/util/DSUtil.java
@@ -250,7 +250,7 @@ private static Dataset getTraining(SparkSession spark, Arguments args, Pipe
public static List getFieldDefinitionFiltered(Arguments args, MatchType type) {
return args.getFieldDefinition()
.stream()
- .filter(f -> !(f.getMatchType() == null || f.getMatchType().equals(MatchType.DONT_USE)))
+ .filter(f -> !(f.getMatchType() == null || f.getMatchType().equals(type)))
.collect(Collectors.toList());
}
}
diff --git a/core/src/main/resources/stopWords.ftlh b/core/src/main/resources/stopWords.ftlh
new file mode 100644
index 000000000..159837151
--- /dev/null
+++ b/core/src/main/resources/stopWords.ftlh
@@ -0,0 +1,3 @@
+<#list stopWords as word>
+${word[0]}
+</#list>
\ No newline at end of file
From 63ee14205b38ffa0471f74bf37676fcf368b0aba Mon Sep 17 00:00:00 2001
From: Navin Singh
Date: Wed, 23 Mar 2022 17:24:43 +0530
Subject: [PATCH 2/2] html docs for stop words; linking in base doc; cutoff
config
---
.../src/main/java/zingg/client/Arguments.java | 13 ++++-
core/src/main/java/zingg/Documenter.java | 46 +++++++++++++-----
core/src/main/resources/model.ftlh | 2 +-
.../{stopWords.ftlh => stopWordsCSV.ftlh} | 0
core/src/main/resources/stopWordsHTML.ftlh | 48 +++++++++++++++++++
5 files changed, 94 insertions(+), 15 deletions(-)
rename core/src/main/resources/{stopWords.ftlh => stopWordsCSV.ftlh} (100%)
create mode 100644 core/src/main/resources/stopWordsHTML.ftlh
diff --git a/client/src/main/java/zingg/client/Arguments.java b/client/src/main/java/zingg/client/Arguments.java
index 136b26fff..d787f5457 100644
--- a/client/src/main/java/zingg/client/Arguments.java
+++ b/client/src/main/java/zingg/client/Arguments.java
@@ -104,6 +104,7 @@ public class Arguments implements Serializable {
int jobId = 1;
boolean collectMetrics = true;
boolean showConcise = false;
+ float stopWordsCutoff = 0.1f;
private static final String ENV_VAR_MARKER_START = "$";
private static final String ENV_VAR_MARKER_END = "$";
@@ -507,7 +508,7 @@ public String getZinggBaseModelDir() {
@JsonIgnore
public String getZinggDocDir() {
- return getZinggBaseModelDir();
+ return zinggDir + "/" + modelId;
}
@JsonIgnore
@@ -598,6 +599,16 @@ public boolean getCollectMetrics() {
public void setCollectMetrics(boolean collectMetrics) {
this.collectMetrics = collectMetrics;
}
+
+ public float getStopWordsCutoff() {
+ return stopWordsCutoff;
+ }
+
+ public void setStopWordsCutoff(float stopWordsCutoff) throws ZinggClientException {
+ if (stopWordsCutoff > 1 || stopWordsCutoff < 0)
+ throw new ZinggClientException("Stop words cutoff should be between 0 and 1");
+ this.stopWordsCutoff = stopWordsCutoff;
+ }
public boolean getShowConcise() {
return showConcise;
diff --git a/core/src/main/java/zingg/Documenter.java b/core/src/main/java/zingg/Documenter.java
index 643bda948..768d56648 100644
--- a/core/src/main/java/zingg/Documenter.java
+++ b/core/src/main/java/zingg/Documenter.java
@@ -34,6 +34,9 @@ public class Documenter extends ZinggBase {
public static final Log LOG = LogFactory.getLog(Documenter.class);
public static Configuration config;
+ private final String CSV_TEMPLATE = "stopWordsCSV.ftlh";
+ private final String HTML_TEMPLATE = "stopWordsHTML.ftlh";
+
public Documenter() {
setZinggOptions(ZinggOptions.GENERATE_DOCS);
config = createConfigurationObject();
@@ -55,7 +58,7 @@ public void execute() throws ZinggClientException {
root.put("columns", markedRecords.columns());
root.put("fieldDefinitionCount", args.getFieldDefinition().size());
buildAndWriteHTML(root);
- generateStopWords();
+ extractStopWords();
} catch (Exception e) {
e.printStackTrace();
throw new ZinggClientException(e.getMessage());
@@ -115,37 +118,54 @@ private Configuration createConfigurationObject() {
return cfg;
}
- private void generateStopWords() throws ZinggClientException {
+ private void extractStopWords() throws ZinggClientException {
LOG.info("Stop words generation starts");
Dataset data = PipeUtil.read(spark, false, false, args.getData());
LOG.warn("Read input data : " + data.count());
List fields = DSUtil.getFieldDefinitionFiltered(args, MatchType.DONT_USE);
for (FieldDefinition field : fields) {
- generateStopWordsAndWriteCSV(data, field);
+ findAndWriteStopWords(data, field);
}
LOG.info("Stop words generation finishes");
}
- private void generateStopWordsAndWriteCSV(Dataset data, FieldDefinition field) throws ZinggClientException {
+ private void findAndWriteStopWords(Dataset data, FieldDefinition field) throws ZinggClientException {
+ String stopWordsDir = args.getZinggDocDir() + "/stopWords/";
+ String columnsDir = args.getZinggDocDir() + "/columns/";
+
+ checkAndCreateDir(stopWordsDir);
+ checkAndCreateDir(columnsDir);
+
LOG.debug("Field: " + field.fieldName);
data = data.select(split(data.col(field.fieldName), "\\s+").as("split"));
data = data.select(explode(data.col("split")).as("word"));
data = data.filter(data.col("word").notEqual(""));
data = data.groupBy("word").count().orderBy(desc("count"));
- String filename = "/home/navin/workDir/zingg-1/" + field.fieldName + ".csv";
- csvWriter(data, filename);
+ data = data.limit(Math.round(data.count()*args.getStopWordsCutoff()));
+ String filenameCSV = stopWordsDir + field.fieldName + ".csv";
+ String filenameHTML = columnsDir + field.fieldName + ".html";
+ Map root = new HashMap();
+ root.put("modelId", args.getModelId());
+ root.put("stopWords", data.collectAsList());
+
+ writeStopWords(CSV_TEMPLATE, root, filenameCSV);
+ writeStopWords(HTML_TEMPLATE, root, filenameHTML);
+ }
+
+ private void checkAndCreateDir(String dirName) {
+ File directory = new File(dirName);
+ if (!directory.exists()) {
+ directory.mkdirs();
+ }
}
- public void csvWriter(Dataset records, String fileName) throws ZinggClientException {
+ public void writeStopWords(String template, Map root, String fileName)
+ throws ZinggClientException {
try {
Configuration cfg = getTemplateConfig();
- cfg.setObjectWrapper(new RowWrapper(cfg.getIncompatibleImprovements()));
-
- Template temp = cfg.getTemplate("stopWords.ftlh");
+ Template temp = cfg.getTemplate(template);
Writer file = new FileWriter(new File(fileName));
- Map root = new HashMap();
- root.put("stopWords", records.collectAsList());
temp.process(root, file);
file.close();
} catch (Exception e) {
@@ -153,4 +173,4 @@ public void csvWriter(Dataset records, String fileName) throws ZinggClientE
throw new ZinggClientException(e.getMessage());
}
}
-}
+}
\ No newline at end of file
diff --git a/core/src/main/resources/model.ftlh b/core/src/main/resources/model.ftlh
index 03dc4f38b..d6307ba5b 100644
--- a/core/src/main/resources/model.ftlh
+++ b/core/src/main/resources/model.ftlh
@@ -18,7 +18,7 @@
Cluster |
<#list 3 ..< numColumns -1 as entityIndex>
- ${columns[entityIndex]!} |
+ ${columns[entityIndex]!} |
</#list>
diff --git a/core/src/main/resources/stopWords.ftlh b/core/src/main/resources/stopWordsCSV.ftlh
similarity index 100%
rename from core/src/main/resources/stopWords.ftlh
rename to core/src/main/resources/stopWordsCSV.ftlh
diff --git a/core/src/main/resources/stopWordsHTML.ftlh b/core/src/main/resources/stopWordsHTML.ftlh
new file mode 100644
index 000000000..4162f6043
--- /dev/null
+++ b/core/src/main/resources/stopWordsHTML.ftlh
@@ -0,0 +1,48 @@
+
+
+ Zingg model documentation
+
+
+
+
+
+
+
+
+
+
+ Word |
+ Count |
+
+
+ <#list stopWords as words>
+
+
+ ${words[0]!}
+ |
+
+ ${words[1]!}
+ |
+
+ </#list>
+
+
+
+
+
+
+
+
+
\ No newline at end of file