From 5037bec0b90dbb89d4f9f877b6a897d4dfbe2bdc Mon Sep 17 00:00:00 2001
From: Navin Singh
Date: Wed, 23 Mar 2022 00:02:10 +0530
Subject: [PATCH 1/2] generate stop words and save into csv files

---
 core/src/main/java/zingg/Documenter.java  | 155 ++++++++++++++--------
 core/src/main/java/zingg/util/DSUtil.java |   2 +-
 core/src/main/resources/stopWords.ftlh    |   3 +
 3 files changed, 106 insertions(+), 54 deletions(-)
 create mode 100644 core/src/main/resources/stopWords.ftlh

diff --git a/core/src/main/java/zingg/Documenter.java b/core/src/main/java/zingg/Documenter.java
index f91ee6471..643bda948 100644
--- a/core/src/main/java/zingg/Documenter.java
+++ b/core/src/main/java/zingg/Documenter.java
@@ -1,39 +1,42 @@
 package zingg;
 
-import java.util.ArrayList;
-import java.util.Arrays;
+import static org.apache.spark.sql.functions.desc;
+import static org.apache.spark.sql.functions.explode;
+import static org.apache.spark.sql.functions.split;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.Writer;
+import java.util.HashMap;
 import java.util.List;
-import java.util.Scanner;
+import java.util.Map;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.spark.sql.Column;
 import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.Row;
-import org.apache.spark.sql.functions;
 
+import freemarker.template.Configuration;
+import freemarker.template.Template;
+import freemarker.template.TemplateExceptionHandler;
+import zingg.client.FieldDefinition;
+import zingg.client.MatchType;
 import zingg.client.ZinggClientException;
 import zingg.client.ZinggOptions;
-import zingg.client.pipe.Pipe;
 import zingg.client.util.ColName;
-import zingg.client.util.ColValues;
 import zingg.util.DSUtil;
 import zingg.util.PipeUtil;
-import zingg.util.RowAdapter;
 import zingg.util.RowWrapper;
-import freemarker.ext.rhino.RhinoWrapper;
-import freemarker.template.*;
-import java.util.*;
-import java.io.*;
 
 public class Documenter extends ZinggBase {
 
 	protected static String name = "zingg.Documenter";
 	public static final Log LOG = LogFactory.getLog(Documenter.class);
+	public static Configuration config;
 
 	public Documenter() {
 		setZinggOptions(ZinggOptions.GENERATE_DOCS);
+		config = createConfigurationObject();
 	}
 
 	public void execute() throws ZinggClientException {
@@ -52,6 +55,7 @@ public void execute() throws ZinggClientException {
 			root.put("columns", markedRecords.columns());
 			root.put("fieldDefinitionCount", args.getFieldDefinition().size());
 			buildAndWriteHTML(root);
+			generateStopWords();
 		} catch (Exception e) {
 			e.printStackTrace();
 			throw new ZinggClientException(e.getMessage());
@@ -60,48 +64,93 @@ public void buildAndWriteHTML(Map<String, Object> root) throws Exception {
 
-		/* ------------------------------------------------------------------------ */
-		/* You should do this ONLY ONCE in the whole application life-cycle: */
+		Configuration cfg = getTemplateConfig();
 
-		/* Create and adjust the configuration singleton */
-		Configuration cfg = new Configuration(Configuration.VERSION_2_3_29);
+		/* Get the template (uses cache internally) */
+		Template temp = cfg.getTemplate("model.ftlh");
+
+		/* Merge data-model with template */
+		// Writer out = new OutputStreamWriter(System.out);
+		Writer file = new FileWriter(new File(args.getZinggDocFile()));
+		// StringWriter writer = new StringWriter();
+		temp.process(root, file);
+		// Note: Depending on what `out` is, you may need to call `out.close()`.
+		// This is usually the case for file output, but not for servlet output.
+		// file.flush();
+
+		// List<String> textList = Collections.singletonList(writer.toString());
+
+		// Dataset<Row> data = spark.createDataset(textList, Encoders.STRING()).toDF();
+
+		// PipeUtil.write(data, args, ctx, PipeUtil.getModelDocumentationPipe(args));
+		file.close();
+		// LOG.warn("written documentation at " + args.getZinggDocFile());
+	}
+
+	public Configuration getTemplateConfig() {
+		if (config == null) {
+			config = createConfigurationObject();
+		}
+		return config;
+	}
+
+	private Configuration createConfigurationObject() {
+		/* ------------------------------------------------------------------------ */
+		/* You should do this ONLY ONCE in the whole application life-cycle: */
+
+		/* Create and adjust the configuration singleton */
+		Configuration cfg = new Configuration(Configuration.VERSION_2_3_29);
 		cfg.setClassForTemplateLoading(this.getClass(), "/");
-		// cfg.setDirectoryForTemplateLoading(new File("/where/you/store/templates"));
-		// Recommended settings for new projects:
-		cfg.setDefaultEncoding("UTF-8");
-		cfg.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER);
-		cfg.setLogTemplateExceptions(false);
-		cfg.setWrapUncheckedExceptions(true);
-		cfg.setFallbackOnNullLoopVariable(false);
+		// cfg.setDirectoryForTemplateLoading(new File("/where/you/store/templates"));
+		// Recommended settings for new projects:
+		cfg.setDefaultEncoding("UTF-8");
+		cfg.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER);
+		cfg.setLogTemplateExceptions(false);
+		cfg.setWrapUncheckedExceptions(true);
+		cfg.setFallbackOnNullLoopVariable(false);
 		cfg.setObjectWrapper(new RowWrapper(cfg.getIncompatibleImprovements()));
-		/* ------------------------------------------------------------------------ */
-		/* You usually do these for MULTIPLE TIMES in the application life-cycle: */
-
-
-
-		/* Get the template (uses cache internally) */
-		Template temp = cfg.getTemplate("model.ftlh");
-
-		/* Merge data-model with template */
-		// Writer out = new OutputStreamWriter(System.out);
-		Writer file = new FileWriter (new File(args.getZinggDocFile()));
-		//StringWriter writer = new StringWriter();
-		temp.process(root, file);
-		// Note: Depending on what `out` is, you may need to call `out.close()`.
-		// This is usually the case for file output, but not for servlet output.
-		//file.flush();
-
-		//List<String> textList = Collections.singletonList(writer.toString());
-
-		//Dataset<Row> data = spark.createDataset(textList, Encoders.STRING()).toDF();
-
-		//PipeUtil.write(data, args, ctx, PipeUtil.getModelDocumentationPipe(args));
-		file.close();
-		//LOG.warn("written documentation at " + args.getZinggDocFile());
-	}
-
-
-
-
+		/* ------------------------------------------------------------------------ */
+		/* You usually do these for MULTIPLE TIMES in the application life-cycle: */
+		return cfg;
+	}
+
+	private void generateStopWords() throws ZinggClientException {
+		LOG.info("Stop words generation starts");
+		Dataset<Row> data = PipeUtil.read(spark, false, false, args.getData());
+		LOG.warn("Read input data : " + data.count());
+
+		List<FieldDefinition> fields = DSUtil.getFieldDefinitionFiltered(args, MatchType.DONT_USE);
+		for (FieldDefinition field : fields) {
+			generateStopWordsAndWriteCSV(data, field);
+		}
+		LOG.info("Stop words generation finishes");
+	}
+
+	private void generateStopWordsAndWriteCSV(Dataset<Row> data, FieldDefinition field) throws ZinggClientException {
+		LOG.debug("Field: " + field.fieldName);
+		data = data.select(split(data.col(field.fieldName), "\\s+").as("split"));
+		data = data.select(explode(data.col("split")).as("word"));
+		data = data.filter(data.col("word").notEqual(""));
+		data = data.groupBy("word").count().orderBy(desc("count"));
+		String filename = "/home/navin/workDir/zingg-1/" + field.fieldName + ".csv";
+		csvWriter(data, filename);
+	}
+
+	public void csvWriter(Dataset<Row> records, String fileName) throws ZinggClientException {
+		try {
+			Configuration cfg = getTemplateConfig();
+			cfg.setObjectWrapper(new RowWrapper(cfg.getIncompatibleImprovements()));
+
+			Template temp = cfg.getTemplate("stopWords.ftlh");
+			Writer file = new FileWriter(new File(fileName));
+			Map<String, Object> root = new HashMap<String, Object>();
+			root.put("stopWords", records.collectAsList());
+			temp.process(root, file);
+			file.close();
+		} catch (Exception e) {
+			e.printStackTrace();
+			throw new ZinggClientException(e.getMessage());
+		}
+	}
 }
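Note: generateStopWordsAndWriteCSV() boils down to tokenize on whitespace, explode to one row per token, drop empty tokens, count, and sort by frequency. A minimal self-contained sketch of the same chain, assuming a local SparkSession; the "fname" column, sample rows, and class name are illustrative, not part of the patch:

    import static org.apache.spark.sql.functions.col;
    import static org.apache.spark.sql.functions.desc;
    import static org.apache.spark.sql.functions.explode;
    import static org.apache.spark.sql.functions.split;

    import java.util.Arrays;

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class StopWordsSketch {
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();
            Dataset<Row> data = spark.createDataset(
                    Arrays.asList("mr john doe", "mr jane doe"), Encoders.STRING()).toDF("fname");
            Dataset<Row> wordCounts = data
                    .select(split(col("fname"), "\\s+").as("split"))  // tokenize on whitespace
                    .select(explode(col("split")).as("word"))         // one row per token
                    .filter(col("word").notEqual(""))                 // drop empty tokens
                    .groupBy("word").count()                          // frequency per token
                    .orderBy(desc("count"));                          // most frequent first
            wordCounts.show(); // "mr" and "doe" appear twice: stop-word candidates
        }
    }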
diff --git a/core/src/main/java/zingg/util/DSUtil.java b/core/src/main/java/zingg/util/DSUtil.java
index d66ba8d23..89c7dce6c 100644
--- a/core/src/main/java/zingg/util/DSUtil.java
+++ b/core/src/main/java/zingg/util/DSUtil.java
@@ -250,7 +250,7 @@ private static Dataset<Row> getTraining(SparkSession spark, Arguments args, Pipe
 	public static List<FieldDefinition> getFieldDefinitionFiltered(Arguments args, MatchType type) {
 		return args.getFieldDefinition()
 			.stream()
-			.filter(f -> !(f.getMatchType() == null || f.getMatchType().equals(MatchType.DONT_USE)))
+			.filter(f -> !(f.getMatchType() == null || f.getMatchType().equals(type)))
 			.collect(Collectors.toList());
 	}
 }
diff --git a/core/src/main/resources/stopWords.ftlh b/core/src/main/resources/stopWords.ftlh
new file mode 100644
index 000000000..159837151
--- /dev/null
+++ b/core/src/main/resources/stopWords.ftlh
@@ -0,0 +1,3 @@
+<#list stopWords as word>
+${word[0]}
+</#list>
\ No newline at end of file
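Note: the DSUtil change is a genuine bug fix, not a rename: getFieldDefinitionFiltered() previously ignored its type parameter and always excluded MatchType.DONT_USE. A sketch of the fixed behaviour (the wrapper class is illustrative; the filter body is taken verbatim from the patch):

    import java.util.List;
    import java.util.stream.Collectors;

    import zingg.client.Arguments;
    import zingg.client.FieldDefinition;
    import zingg.client.MatchType;

    class FilterSketch {
        // Keeps fields whose match type is set and differs from the excluded type.
        // getFieldDefinitionFiltered(args, MatchType.DONT_USE) therefore returns
        // the fields that ARE used for matching, which is what the stop-word pass wants.
        static List<FieldDefinition> filtered(Arguments args, MatchType type) {
            return args.getFieldDefinition().stream()
                    .filter(f -> !(f.getMatchType() == null || f.getMatchType().equals(type)))
                    .collect(Collectors.toList());
        }
    }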
From 63ee14205b38ffa0471f74bf37676fcf368b0aba Mon Sep 17 00:00:00 2001
From: Navin Singh
Date: Wed, 23 Mar 2022 17:24:43 +0530
Subject: [PATCH 2/2] html docs for stop words; linking in base doc; cutoff
 config

---
 .../src/main/java/zingg/client/Arguments.java | 13 ++++-
 core/src/main/java/zingg/Documenter.java      | 46 +++++++++++++-----
 core/src/main/resources/model.ftlh            |  2 +-
 .../{stopWords.ftlh => stopWordsCSV.ftlh}     |  0
 core/src/main/resources/stopWordsHTML.ftlh    | 48 +++++++++++++++++++
 5 files changed, 94 insertions(+), 15 deletions(-)
 rename core/src/main/resources/{stopWords.ftlh => stopWordsCSV.ftlh} (100%)
 create mode 100644 core/src/main/resources/stopWordsHTML.ftlh

diff --git a/client/src/main/java/zingg/client/Arguments.java b/client/src/main/java/zingg/client/Arguments.java
index 136b26fff..d787f5457 100644
--- a/client/src/main/java/zingg/client/Arguments.java
+++ b/client/src/main/java/zingg/client/Arguments.java
@@ -104,6 +104,7 @@ public class Arguments implements Serializable {
 	int jobId = 1;
 	boolean collectMetrics = true;
 	boolean showConcise = false;
+	float stopWordsCutoff = 0.1f;
 
 	private static final String ENV_VAR_MARKER_START = "$";
 	private static final String ENV_VAR_MARKER_END = "$";
@@ -507,7 +508,7 @@ public String getZinggBaseModelDir() {
 
 	@JsonIgnore
 	public String getZinggDocDir() {
-		return getZinggBaseModelDir();
+		return zinggDir + "/" + modelId;
 	}
 
 	@JsonIgnore
@@ -598,6 +599,16 @@ public boolean getCollectMetrics() {
 	public void setCollectMetrics(boolean collectMetrics) {
 		this.collectMetrics = collectMetrics;
 	}
+
+	public float getStopWordsCutoff() {
+		return stopWordsCutoff;
+	}
+
+	public void setStopWordsCutoff(float stopWordsCutoff) throws ZinggClientException {
+		if (stopWordsCutoff > 1 || stopWordsCutoff < 0)
+			throw new ZinggClientException("Stop words cutoff should be between 0 and 1");
+		this.stopWordsCutoff = stopWordsCutoff;
+	}
 
 	public boolean getShowConcise() {
 		return showConcise;
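Note: stopWordsCutoff is a fraction of the distinct tokens, not of total occurrences, and setStopWordsCutoff() rejects values outside [0, 1]. A sketch of how the Documenter change below applies it (the counts and class name are illustrative):

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;

    class CutoffSketch {
        // wordCounts: distinct tokens with counts, sorted most frequent first.
        static Dataset<Row> topFraction(Dataset<Row> wordCounts, float cutoff) {
            long distinct = wordCounts.count();                      // e.g. 1000 distinct tokens
            return wordCounts.limit(Math.round(distinct * cutoff));  // 0.1f keeps the top 100
        }
    }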
diff --git a/core/src/main/java/zingg/Documenter.java b/core/src/main/java/zingg/Documenter.java
index 643bda948..768d56648 100644
--- a/core/src/main/java/zingg/Documenter.java
+++ b/core/src/main/java/zingg/Documenter.java
@@ -34,6 +34,9 @@ public class Documenter extends ZinggBase {
 	public static final Log LOG = LogFactory.getLog(Documenter.class);
 	public static Configuration config;
 
+	private final String CSV_TEMPLATE = "stopWordsCSV.ftlh";
+	private final String HTML_TEMPLATE = "stopWordsHTML.ftlh";
+
 	public Documenter() {
 		setZinggOptions(ZinggOptions.GENERATE_DOCS);
 		config = createConfigurationObject();
@@ -55,7 +58,7 @@ public void execute() throws ZinggClientException {
 			root.put("columns", markedRecords.columns());
 			root.put("fieldDefinitionCount", args.getFieldDefinition().size());
 			buildAndWriteHTML(root);
-			generateStopWords();
+			extractStopWords();
 		} catch (Exception e) {
 			e.printStackTrace();
 			throw new ZinggClientException(e.getMessage());
@@ -115,37 +118,54 @@ private Configuration createConfigurationObject() {
 		return cfg;
 	}
 
-	private void generateStopWords() throws ZinggClientException {
+	private void extractStopWords() throws ZinggClientException {
 		LOG.info("Stop words generation starts");
 		Dataset<Row> data = PipeUtil.read(spark, false, false, args.getData());
 		LOG.warn("Read input data : " + data.count());
 
 		List<FieldDefinition> fields = DSUtil.getFieldDefinitionFiltered(args, MatchType.DONT_USE);
 		for (FieldDefinition field : fields) {
-			generateStopWordsAndWriteCSV(data, field);
+			findAndWriteStopWords(data, field);
 		}
 		LOG.info("Stop words generation finishes");
 	}
 
-	private void generateStopWordsAndWriteCSV(Dataset<Row> data, FieldDefinition field) throws ZinggClientException {
+	private void findAndWriteStopWords(Dataset<Row> data, FieldDefinition field) throws ZinggClientException {
+		String stopWordsDir = args.getZinggDocDir() + "/stopWords/";
+		String columnsDir = args.getZinggDocDir() + "/columns/";
+
+		checkAndCreateDir(stopWordsDir);
+		checkAndCreateDir(columnsDir);
+
 		LOG.debug("Field: " + field.fieldName);
 		data = data.select(split(data.col(field.fieldName), "\\s+").as("split"));
 		data = data.select(explode(data.col("split")).as("word"));
 		data = data.filter(data.col("word").notEqual(""));
 		data = data.groupBy("word").count().orderBy(desc("count"));
-		String filename = "/home/navin/workDir/zingg-1/" + field.fieldName + ".csv";
-		csvWriter(data, filename);
+		data = data.limit(Math.round(data.count()*args.getStopWordsCutoff()));
+		String filenameCSV = stopWordsDir + field.fieldName + ".csv";
+		String filenameHTML = columnsDir + field.fieldName + ".html";
+		Map<String, Object> root = new HashMap<String, Object>();
+		root.put("modelId", args.getModelId());
+		root.put("stopWords", data.collectAsList());
+
+		writeStopWords(CSV_TEMPLATE, root, filenameCSV);
+		writeStopWords(HTML_TEMPLATE, root, filenameHTML);
+	}
+
+	private void checkAndCreateDir(String dirName) {
+		File directory = new File(dirName);
+		if (!directory.exists()) {
+			directory.mkdirs();
+		}
 	}
 
-	public void csvWriter(Dataset<Row> records, String fileName) throws ZinggClientException {
+	public void writeStopWords(String template, Map<String, Object> root, String fileName)
+			throws ZinggClientException {
 		try {
 			Configuration cfg = getTemplateConfig();
-			cfg.setObjectWrapper(new RowWrapper(cfg.getIncompatibleImprovements()));
-
-			Template temp = cfg.getTemplate("stopWords.ftlh");
+			Template temp = cfg.getTemplate(template);
 			Writer file = new FileWriter(new File(fileName));
-			Map<String, Object> root = new HashMap<String, Object>();
-			root.put("stopWords", records.collectAsList());
 			temp.process(root, file);
 			file.close();
 		} catch (Exception e) {
@@ -153,4 +173,4 @@ public void csvWriter(Dataset<Row> records, String fileName) throws ZinggClientException {
 			throw new ZinggClientException(e.getMessage());
 		}
 	}
-}
+}
\ No newline at end of file
diff --git a/core/src/main/resources/model.ftlh b/core/src/main/resources/model.ftlh
index 03dc4f38b..d6307ba5b 100644
--- a/core/src/main/resources/model.ftlh
+++ b/core/src/main/resources/model.ftlh
@@ -18,7 +18,7 @@
 			<th>Cluster</th>
 			<#list 3 ..< numColumns -1 as entityIndex>
-			<th>${columns[entityIndex]!}</th>
+			<th><a href="columns/${columns[entityIndex]}.html">${columns[entityIndex]!}</a></th>
 			</#list>
diff --git a/core/src/main/resources/stopWords.ftlh b/core/src/main/resources/stopWordsCSV.ftlh
similarity index 100%
rename from core/src/main/resources/stopWords.ftlh
rename to core/src/main/resources/stopWordsCSV.ftlh
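Note: the templates address each Spark Row positionally (${words[0]!} is the token, ${words[1]!} its count in the HTML template below); that works because the Configuration installs zingg.util.RowWrapper as the FreeMarker object wrapper. A rendering sketch — rows would come from the word-count Dataset's collectAsList(); the class name and return value are illustrative:

    import java.io.StringWriter;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    import org.apache.spark.sql.Row;

    import freemarker.template.Configuration;
    import freemarker.template.Template;
    import zingg.util.RowWrapper;

    class RenderSketch {
        static String renderStopWords(List<Row> rows) throws Exception {
            Configuration cfg = new Configuration(Configuration.VERSION_2_3_29);
            cfg.setClassForTemplateLoading(RenderSketch.class, "/");
            // RowWrapper is what lets the template index a Row as words[0], words[1].
            cfg.setObjectWrapper(new RowWrapper(cfg.getIncompatibleImprovements()));
            Template temp = cfg.getTemplate("stopWordsCSV.ftlh");
            Map<String, Object> root = new HashMap<String, Object>();
            root.put("stopWords", rows);
            StringWriter out = new StringWriter();
            temp.process(root, out);
            return out.toString(); // one token per line, e.g. "mr\ndoe\n..."
        }
    }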
diff --git a/core/src/main/resources/stopWordsHTML.ftlh b/core/src/main/resources/stopWordsHTML.ftlh
new file mode 100644
index 000000000..4162f6043
--- /dev/null
+++ b/core/src/main/resources/stopWordsHTML.ftlh
@@ -0,0 +1,48 @@
+<html>
+
+<head>
+    <title>Zingg model documentation</title>
+    <!-- (head styling in the original file was lost in this capture) -->
+</head>
+
+<body>
+    <table>
+        <tr>
+            <th>Word</th>
+            <th>Count</th>
+        </tr>
+        <#list stopWords as words>
+        <tr>
+            <td>
+                ${words[0]!}
+            </td>
+            <td>
+                ${words[1]!}
+            </td>
+        </tr>
+        </#list>
+    </table>
+</body>
+
+</html>
\ No newline at end of file
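Note: taken together, with getZinggDocDir() now returning zinggDir + "/" + modelId, the documentation tree ends up roughly as below (assuming zinggDir "models" and modelId "100"; the field name and main doc file name are illustrative):

    models/100/model.html            (main docs; column headers link to columns/<field>.html)
    models/100/stopWords/fname.csv   (one token per line, via stopWordsCSV.ftlh)
    models/100/columns/fname.html    (token/count table, via stopWordsHTML.ftlh)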