From 5037bec0b90dbb89d4f9f877b6a897d4dfbe2bdc Mon Sep 17 00:00:00 2001
From: Navin Singh
Date: Wed, 23 Mar 2022 00:02:10 +0530
Subject: [PATCH 1/2] generate stop words and save into csv files
---
core/src/main/java/zingg/Documenter.java | 155 ++++++++++++++--------
core/src/main/java/zingg/util/DSUtil.java | 2 +-
core/src/main/resources/stopWords.ftlh | 3 +
3 files changed, 106 insertions(+), 54 deletions(-)
create mode 100644 core/src/main/resources/stopWords.ftlh
diff --git a/core/src/main/java/zingg/Documenter.java b/core/src/main/java/zingg/Documenter.java
index f91ee6471..643bda948 100644
--- a/core/src/main/java/zingg/Documenter.java
+++ b/core/src/main/java/zingg/Documenter.java
@@ -1,39 +1,42 @@
package zingg;
-import java.util.ArrayList;
-import java.util.Arrays;
+import static org.apache.spark.sql.functions.desc;
+import static org.apache.spark.sql.functions.explode;
+import static org.apache.spark.sql.functions.split;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.Writer;
+import java.util.HashMap;
import java.util.List;
-import java.util.Scanner;
+import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.functions;
+import freemarker.template.Configuration;
+import freemarker.template.Template;
+import freemarker.template.TemplateExceptionHandler;
+import zingg.client.FieldDefinition;
+import zingg.client.MatchType;
import zingg.client.ZinggClientException;
import zingg.client.ZinggOptions;
-import zingg.client.pipe.Pipe;
import zingg.client.util.ColName;
-import zingg.client.util.ColValues;
import zingg.util.DSUtil;
import zingg.util.PipeUtil;
-import zingg.util.RowAdapter;
import zingg.util.RowWrapper;
-import freemarker.ext.rhino.RhinoWrapper;
-import freemarker.template.*;
-import java.util.*;
-import java.io.*;
public class Documenter extends ZinggBase {
protected static String name = "zingg.Documenter";
public static final Log LOG = LogFactory.getLog(Documenter.class);
+ public static Configuration config;
public Documenter() {
setZinggOptions(ZinggOptions.GENERATE_DOCS);
+ config = createConfigurationObject();
}
public void execute() throws ZinggClientException {
@@ -52,6 +55,7 @@ public void execute() throws ZinggClientException {
root.put("columns", markedRecords.columns());
root.put("fieldDefinitionCount", args.getFieldDefinition().size());
buildAndWriteHTML(root);
+ generateStopWords();
} catch (Exception e) {
e.printStackTrace();
throw new ZinggClientException(e.getMessage());
@@ -60,48 +64,93 @@ public void execute() throws ZinggClientException {
public void buildAndWriteHTML(Map root) throws Exception {
- /* ------------------------------------------------------------------------ */
- /* You should do this ONLY ONCE in the whole application life-cycle: */
+ Configuration cfg = getTemplateConfig();
- /* Create and adjust the configuration singleton */
- Configuration cfg = new Configuration(Configuration.VERSION_2_3_29);
+ /* Get the template (uses cache internally) */
+ Template temp = cfg.getTemplate("model.ftlh");
+
+ /* Merge data-model with template */
+ // Writer out = new OutputStreamWriter(System.out);
+ Writer file = new FileWriter(new File(args.getZinggDocFile()));
+ // StringWriter writer = new StringWriter();
+ temp.process(root, file);
+ // Note: Depending on what `out` is, you may need to call `out.close()`.
+ // This is usually the case for file output, but not for servlet output.
+ // file.flush();
+
+ // List textList = Collections.singletonList(writer.toString());
+
+ // Dataset data = spark.createDataset(textList, Encoders.STRING()).toDF();
+
+ // PipeUtil.write(data, args, ctx, PipeUtil.getModelDocumentationPipe(args));
+ file.close();
+ // LOG.warn("written documentation at " + args.getZinggDocFile());
+ }
+
+ public Configuration getTemplateConfig() {
+ if (config == null) {
+ config = createConfigurationObject();
+ }
+ return config;
+ }
+
+ private Configuration createConfigurationObject() {
+ /* ------------------------------------------------------------------------ */
+ /* You should do this ONLY ONCE in the whole application life-cycle: */
+
+ /* Create and adjust the configuration singleton */
+ Configuration cfg = new Configuration(Configuration.VERSION_2_3_29);
cfg.setClassForTemplateLoading(this.getClass(), "/");
- // cfg.setDirectoryForTemplateLoading(new File("/where/you/store/templates"));
- // Recommended settings for new projects:
- cfg.setDefaultEncoding("UTF-8");
- cfg.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER);
- cfg.setLogTemplateExceptions(false);
- cfg.setWrapUncheckedExceptions(true);
- cfg.setFallbackOnNullLoopVariable(false);
+ // cfg.setDirectoryForTemplateLoading(new File("/where/you/store/templates"));
+ // Recommended settings for new projects:
+ cfg.setDefaultEncoding("UTF-8");
+ cfg.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER);
+ cfg.setLogTemplateExceptions(false);
+ cfg.setWrapUncheckedExceptions(true);
+ cfg.setFallbackOnNullLoopVariable(false);
cfg.setObjectWrapper(new RowWrapper(cfg.getIncompatibleImprovements()));
- /* ------------------------------------------------------------------------ */
- /* You usually do these for MULTIPLE TIMES in the application life-cycle: */
-
-
-
- /* Get the template (uses cache internally) */
- Template temp = cfg.getTemplate("model.ftlh");
-
- /* Merge data-model with template */
- // Writer out = new OutputStreamWriter(System.out);
- Writer file = new FileWriter (new File(args.getZinggDocFile()));
- //StringWriter writer = new StringWriter();
- temp.process(root, file);
- // Note: Depending on what `out` is, you may need to call `out.close()`.
- // This is usually the case for file output, but not for servlet output.
- //file.flush();
-
- //List textList = Collections.singletonList(writer.toString());
-
- //Dataset data = spark.createDataset(textList, Encoders.STRING()).toDF();
-
- //PipeUtil.write(data, args, ctx, PipeUtil.getModelDocumentationPipe(args));
- file.close();
- //LOG.warn("written documentation at " + args.getZinggDocFile());
- }
-
-
-
-
+ /* ------------------------------------------------------------------------ */
+ /* You usually do these for MULTIPLE TIMES in the application life-cycle: */
+ return cfg;
+ }
+
+ private void generateStopWords() throws ZinggClientException {
+ LOG.info("Stop words generation starts");
+ Dataset data = PipeUtil.read(spark, false, false, args.getData());
+ LOG.warn("Read input data : " + data.count());
+
+ List fields = DSUtil.getFieldDefinitionFiltered(args, MatchType.DONT_USE);
+ for (FieldDefinition field : fields) {
+ generateStopWordsAndWriteCSV(data, field);
+ }
+ LOG.info("Stop words generation finishes");
+ }
+
+ private void generateStopWordsAndWriteCSV(Dataset data, FieldDefinition field) throws ZinggClientException {
+ LOG.debug("Field: " + field.fieldName);
+ data = data.select(split(data.col(field.fieldName), "\\s+").as("split"));
+ data = data.select(explode(data.col("split")).as("word"));
+ data = data.filter(data.col("word").notEqual(""));
+ data = data.groupBy("word").count().orderBy(desc("count"));
+ String filename = "/home/navin/workDir/zingg-1/" + field.fieldName + ".csv";
+ csvWriter(data, filename);
+ }
+
+ public void csvWriter(Dataset records, String fileName) throws ZinggClientException {
+ try {
+ Configuration cfg = getTemplateConfig();
+ cfg.setObjectWrapper(new RowWrapper(cfg.getIncompatibleImprovements()));
+
+ Template temp = cfg.getTemplate("stopWords.ftlh");
+ Writer file = new FileWriter(new File(fileName));
+ Map root = new HashMap();
+ root.put("stopWords", records.collectAsList());
+ temp.process(root, file);
+ file.close();
+ } catch (Exception e) {
+ e.printStackTrace();
+ throw new ZinggClientException(e.getMessage());
+ }
+ }
}
diff --git a/core/src/main/java/zingg/util/DSUtil.java b/core/src/main/java/zingg/util/DSUtil.java
index d66ba8d23..89c7dce6c 100644
--- a/core/src/main/java/zingg/util/DSUtil.java
+++ b/core/src/main/java/zingg/util/DSUtil.java
@@ -250,7 +250,7 @@ private static Dataset getTraining(SparkSession spark, Arguments args, Pipe
public static List getFieldDefinitionFiltered(Arguments args, MatchType type) {
return args.getFieldDefinition()
.stream()
- .filter(f -> !(f.getMatchType() == null || f.getMatchType().equals(MatchType.DONT_USE)))
+ .filter(f -> !(f.getMatchType() == null || f.getMatchType().equals(type)))
.collect(Collectors.toList());
}
}
diff --git a/core/src/main/resources/stopWords.ftlh b/core/src/main/resources/stopWords.ftlh
new file mode 100644
index 000000000..159837151
--- /dev/null
+++ b/core/src/main/resources/stopWords.ftlh
@@ -0,0 +1,3 @@
+<#list stopWords as word>
+${word[0]}
+</#list>
\ No newline at end of file
From 63ee14205b38ffa0471f74bf37676fcf368b0aba Mon Sep 17 00:00:00 2001
From: Navin Singh
Date: Wed, 23 Mar 2022 17:24:43 +0530
Subject: [PATCH 2/2] html docs for stop words; linking in base doc; cutoff
config
---
.../src/main/java/zingg/client/Arguments.java | 13 ++++-
core/src/main/java/zingg/Documenter.java | 46 +++++++++++++-----
core/src/main/resources/model.ftlh | 2 +-
.../{stopWords.ftlh => stopWordsCSV.ftlh} | 0
core/src/main/resources/stopWordsHTML.ftlh | 48 +++++++++++++++++++
5 files changed, 94 insertions(+), 15 deletions(-)
rename core/src/main/resources/{stopWords.ftlh => stopWordsCSV.ftlh} (100%)
create mode 100644 core/src/main/resources/stopWordsHTML.ftlh
diff --git a/client/src/main/java/zingg/client/Arguments.java b/client/src/main/java/zingg/client/Arguments.java
index 136b26fff..d787f5457 100644
--- a/client/src/main/java/zingg/client/Arguments.java
+++ b/client/src/main/java/zingg/client/Arguments.java
@@ -104,6 +104,7 @@ public class Arguments implements Serializable {
int jobId = 1;
boolean collectMetrics = true;
boolean showConcise = false;
+ float stopWordsCutoff = 0.1f;
private static final String ENV_VAR_MARKER_START = "$";
private static final String ENV_VAR_MARKER_END = "$";
@@ -507,7 +508,7 @@ public String getZinggBaseModelDir() {
@JsonIgnore
public String getZinggDocDir() {
- return getZinggBaseModelDir();
+ return zinggDir + "/" + modelId;
}
@JsonIgnore
@@ -598,6 +599,16 @@ public boolean getCollectMetrics() {
public void setCollectMetrics(boolean collectMetrics) {
this.collectMetrics = collectMetrics;
}
+
+ public float getStopWordsCutoff() {
+ return stopWordsCutoff;
+ }
+
+ public void setStopWordsCutoff(float stopWordsCutoff) throws ZinggClientException {
+ if (stopWordsCutoff > 1 || stopWordsCutoff < 0)
+ throw new ZinggClientException("Stop words cutoff should be between 0 and 1");
+ this.stopWordsCutoff = stopWordsCutoff;
+ }
public boolean getShowConcise() {
return showConcise;
diff --git a/core/src/main/java/zingg/Documenter.java b/core/src/main/java/zingg/Documenter.java
index 643bda948..768d56648 100644
--- a/core/src/main/java/zingg/Documenter.java
+++ b/core/src/main/java/zingg/Documenter.java
@@ -34,6 +34,9 @@ public class Documenter extends ZinggBase {
public static final Log LOG = LogFactory.getLog(Documenter.class);
public static Configuration config;
+ private final String CSV_TEMPLATE = "stopWordsCSV.ftlh";
+ private final String HTML_TEMPLATE = "stopWordsHTML.ftlh";
+
public Documenter() {
setZinggOptions(ZinggOptions.GENERATE_DOCS);
config = createConfigurationObject();
@@ -55,7 +58,7 @@ public void execute() throws ZinggClientException {
root.put("columns", markedRecords.columns());
root.put("fieldDefinitionCount", args.getFieldDefinition().size());
buildAndWriteHTML(root);
- generateStopWords();
+ extractStopWords();
} catch (Exception e) {
e.printStackTrace();
throw new ZinggClientException(e.getMessage());
@@ -115,37 +118,54 @@ private Configuration createConfigurationObject() {
return cfg;
}
- private void generateStopWords() throws ZinggClientException {
+ private void extractStopWords() throws ZinggClientException {
LOG.info("Stop words generation starts");
Dataset data = PipeUtil.read(spark, false, false, args.getData());
LOG.warn("Read input data : " + data.count());
List fields = DSUtil.getFieldDefinitionFiltered(args, MatchType.DONT_USE);
for (FieldDefinition field : fields) {
- generateStopWordsAndWriteCSV(data, field);
+ findAndWriteStopWords(data, field);
}
LOG.info("Stop words generation finishes");
}
- private void generateStopWordsAndWriteCSV(Dataset data, FieldDefinition field) throws ZinggClientException {
+ private void findAndWriteStopWords(Dataset data, FieldDefinition field) throws ZinggClientException {
+ String stopWordsDir = args.getZinggDocDir() + "/stopWords/";
+ String columnsDir = args.getZinggDocDir() + "/columns/";
+
+ checkAndCreateDir(stopWordsDir);
+ checkAndCreateDir(columnsDir);
+
LOG.debug("Field: " + field.fieldName);
data = data.select(split(data.col(field.fieldName), "\\s+").as("split"));
data = data.select(explode(data.col("split")).as("word"));
data = data.filter(data.col("word").notEqual(""));
data = data.groupBy("word").count().orderBy(desc("count"));
- String filename = "/home/navin/workDir/zingg-1/" + field.fieldName + ".csv";
- csvWriter(data, filename);
+ data = data.limit(Math.round(data.count()*args.getStopWordsCutoff()));
+ String filenameCSV = stopWordsDir + field.fieldName + ".csv";
+ String filenameHTML = columnsDir + field.fieldName + ".html";
+ Map root = new HashMap();
+ root.put("modelId", args.getModelId());
+ root.put("stopWords", data.collectAsList());
+
+ writeStopWords(CSV_TEMPLATE, root, filenameCSV);
+ writeStopWords(HTML_TEMPLATE, root, filenameHTML);
+ }
+
+ private void checkAndCreateDir(String dirName) {
+ File directory = new File(dirName);
+ if (!directory.exists()) {
+ directory.mkdirs();
+ }
}
- public void csvWriter(Dataset records, String fileName) throws ZinggClientException {
+ public void writeStopWords(String template, Map root, String fileName)
+ throws ZinggClientException {
try {
Configuration cfg = getTemplateConfig();
- cfg.setObjectWrapper(new RowWrapper(cfg.getIncompatibleImprovements()));
-
- Template temp = cfg.getTemplate("stopWords.ftlh");
+ Template temp = cfg.getTemplate(template);
Writer file = new FileWriter(new File(fileName));
- Map root = new HashMap();
- root.put("stopWords", records.collectAsList());
temp.process(root, file);
file.close();
} catch (Exception e) {
@@ -153,4 +173,4 @@ public void csvWriter(Dataset records, String fileName) throws ZinggClientE
throw new ZinggClientException(e.getMessage());
}
}
-}
+}
\ No newline at end of file
diff --git a/core/src/main/resources/model.ftlh b/core/src/main/resources/model.ftlh
index 03dc4f38b..d6307ba5b 100644
--- a/core/src/main/resources/model.ftlh
+++ b/core/src/main/resources/model.ftlh
@@ -18,7 +18,7 @@
Cluster |
<#list 3 ..< numColumns -1 as entityIndex>
- ${columns[entityIndex]!} |
+ ${columns[entityIndex]!} |
</#list>
diff --git a/core/src/main/resources/stopWords.ftlh b/core/src/main/resources/stopWordsCSV.ftlh
similarity index 100%
rename from core/src/main/resources/stopWords.ftlh
rename to core/src/main/resources/stopWordsCSV.ftlh
diff --git a/core/src/main/resources/stopWordsHTML.ftlh b/core/src/main/resources/stopWordsHTML.ftlh
new file mode 100644
index 000000000..4162f6043
--- /dev/null
+++ b/core/src/main/resources/stopWordsHTML.ftlh
@@ -0,0 +1,48 @@
+
+
+ Zingg model documentation
+
+
+
+
+
+
+
+
+
+
+ Word |
+ Count |
+
+
+ <#list stopWords as words>
+
+
+ ${words[0]!}
+ |
+
+ ${words[1]!}
+ |
+
+ </#list>
+
+
+
+
+
+
+
+
+
\ No newline at end of file