Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extract Stop words #186

Merged
merged 2 commits into from
Mar 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion client/src/main/java/zingg/client/Arguments.java
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ public class Arguments implements Serializable {
int jobId = 1;
boolean collectMetrics = true;
boolean showConcise = false;
float stopWordsCutoff = 0.1f;

private static final String ENV_VAR_MARKER_START = "$";
private static final String ENV_VAR_MARKER_END = "$";
Expand Down Expand Up @@ -507,7 +508,7 @@ public String getZinggBaseModelDir() {

@JsonIgnore
public String getZinggDocDir() {
return getZinggBaseModelDir();
return zinggDir + "/" + modelId;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we need this change?

Copy link
Contributor Author

@navinrathore navinrathore Mar 23, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

getZinggDocDir() was unused.
getZinggBaseModelDir() is the models/<id>/model directory; it contains the model details.
getZinggDocDir() seems to correspond to doc/documentation, so it is used here after modification.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. So can't we then keep returning getZinggBaseModelDir(), which is what you are returning anyway?

}

@JsonIgnore
Expand Down Expand Up @@ -598,6 +599,16 @@ public boolean getCollectMetrics() {
/**
 * Stores the given flag in the {@code collectMetrics} field.
 *
 * @param collectMetrics the new value of the flag
 */
public void setCollectMetrics(boolean collectMetrics) {
this.collectMetrics = collectMetrics;
}

/**
 * Returns the stop words cutoff currently held by this object.
 *
 * @return the value of the {@code stopWordsCutoff} field
 */
public float getStopWordsCutoff() {
    return this.stopWordsCutoff;
}

/**
 * Validates and stores the stop words cutoff.
 *
 * @param stopWordsCutoff the cutoff; must lie in the closed range [0, 1]
 * @throws ZinggClientException if the value is below 0, above 1, or NaN
 */
public void setStopWordsCutoff(float stopWordsCutoff) throws ZinggClientException {
    // Written as "not inside the range" on purpose: every ordered comparison
    // with NaN is false, so a "< 0 || > 1" test would silently accept NaN.
    if (!(stopWordsCutoff >= 0 && stopWordsCutoff <= 1)) {
        throw new ZinggClientException("Stop words cutoff should be between 0 and 1");
    }
    this.stopWordsCutoff = stopWordsCutoff;
}

public boolean getShowConcise() {
return showConcise;
Expand Down
177 changes: 123 additions & 54 deletions core/src/main/java/zingg/Documenter.java
Original file line number Diff line number Diff line change
@@ -1,39 +1,45 @@
package zingg;

import java.util.ArrayList;
import java.util.Arrays;
import static org.apache.spark.sql.functions.desc;
import static org.apache.spark.sql.functions.explode;
import static org.apache.spark.sql.functions.split;

import java.io.File;
import java.io.FileWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.List;
import java.util.Scanner;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.functions;

import freemarker.template.Configuration;
import freemarker.template.Template;
import freemarker.template.TemplateExceptionHandler;
import zingg.client.FieldDefinition;
import zingg.client.MatchType;
import zingg.client.ZinggClientException;
import zingg.client.ZinggOptions;
import zingg.client.pipe.Pipe;
import zingg.client.util.ColName;
import zingg.client.util.ColValues;
import zingg.util.DSUtil;
import zingg.util.PipeUtil;
import zingg.util.RowAdapter;
import zingg.util.RowWrapper;
import freemarker.ext.rhino.RhinoWrapper;
import freemarker.template.*;
import java.util.*;
import java.io.*;

public class Documenter extends ZinggBase {

/** Identifier string for this class. */
protected static String name = "zingg.Documenter";
/** Logger shared by all instances. */
public static final Log LOG = LogFactory.getLog(Documenter.class);
/** Shared FreeMarker configuration; assigned by the constructor and, if still null, by getTemplateConfig(). */
public static Configuration config;

/** Template that renders a field's stop words as CSV. */
private static final String CSV_TEMPLATE = "stopWordsCSV.ftlh";
/** Template that renders a field's stop words as an HTML page. */
private static final String HTML_TEMPLATE = "stopWordsHTML.ftlh";

/**
 * Creates a documenter configured for the GENERATE_DOCS option and builds
 * the template configuration.
 */
public Documenter() {
setZinggOptions(ZinggOptions.GENERATE_DOCS);
// Note: this overwrites the static config field on every construction.
config = createConfigurationObject();
}

public void execute() throws ZinggClientException {
Expand All @@ -52,6 +58,7 @@ public void execute() throws ZinggClientException {
root.put("columns", markedRecords.columns());
root.put("fieldDefinitionCount", args.getFieldDefinition().size());
buildAndWriteHTML(root);
extractStopWords();
} catch (Exception e) {
e.printStackTrace();
throw new ZinggClientException(e.getMessage());
Expand All @@ -60,48 +67,110 @@ public void execute() throws ZinggClientException {

/**
 * Renders the {@code model.ftlh} template with the given data model and writes
 * the result to the file named by {@code args.getZinggDocFile()}.
 *
 * @param root data model handed to the template
 * @throws Exception if the template cannot be loaded or processed, or the
 *         output file cannot be opened or written
 */
public void buildAndWriteHTML(Map<String, Object> root) throws Exception {
    Configuration cfg = getTemplateConfig();

    /* Get the template (uses cache internally) */
    Template temp = cfg.getTemplate("model.ftlh");

    /* Merge data-model with template. try-with-resources guarantees the file
       is closed even when template processing throws. */
    try (Writer file = new FileWriter(new File(args.getZinggDocFile()))) {
        temp.process(root, file);
    }
}

/**
 * Returns the shared template configuration, creating it first if it has
 * not been set yet.
 *
 * @return the configuration held in the static {@code config} field
 */
public Configuration getTemplateConfig() {
    if (config != null) {
        return config;
    }
    config = createConfigurationObject();
    return config;
}

private Configuration createConfigurationObject() {
/* ------------------------------------------------------------------------ */
/* You should do this ONLY ONCE in the whole application life-cycle: */

/* Create and adjust the configuration singleton */
Configuration cfg = new Configuration(Configuration.VERSION_2_3_29);
/* Create and adjust the configuration singleton */
Configuration cfg = new Configuration(Configuration.VERSION_2_3_29);
cfg.setClassForTemplateLoading(this.getClass(), "/");
// cfg.setDirectoryForTemplateLoading(new File("/where/you/store/templates"));
// Recommended settings for new projects:
cfg.setDefaultEncoding("UTF-8");
cfg.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER);
cfg.setLogTemplateExceptions(false);
cfg.setWrapUncheckedExceptions(true);
cfg.setFallbackOnNullLoopVariable(false);
// cfg.setDirectoryForTemplateLoading(new File("/where/you/store/templates"));
// Recommended settings for new projects:
cfg.setDefaultEncoding("UTF-8");
cfg.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER);
cfg.setLogTemplateExceptions(false);
cfg.setWrapUncheckedExceptions(true);
cfg.setFallbackOnNullLoopVariable(false);
cfg.setObjectWrapper(new RowWrapper(cfg.getIncompatibleImprovements()));

/* ------------------------------------------------------------------------ */
/* You usually do these for MULTIPLE TIMES in the application life-cycle: */



/* Get the template (uses cache internally) */
Template temp = cfg.getTemplate("model.ftlh");

/* Merge data-model with template */
// Writer out = new OutputStreamWriter(System.out);
Writer file = new FileWriter (new File(args.getZinggDocFile()));
//StringWriter writer = new StringWriter();
temp.process(root, file);
// Note: Depending on what `out` is, you may need to call `out.close()`.
// This is usually the case for file output, but not for servlet output.
//file.flush();

//List<String> textList = Collections.singletonList(writer.toString());

//Dataset<Row> data = spark.createDataset(textList, Encoders.STRING()).toDF();

//PipeUtil.write(data, args, ctx, PipeUtil.getModelDocumentationPipe(args));
file.close();
//LOG.warn("written documentation at " + args.getZinggDocFile());
}




}
/* ------------------------------------------------------------------------ */
/* You usually do these for MULTIPLE TIMES in the application life-cycle: */
return cfg;
}

/**
 * Reads the input data and writes stop word files for every field definition
 * returned by {@code DSUtil.getFieldDefinitionFiltered(args, MatchType.DONT_USE)}.
 *
 * @throws ZinggClientException if reading the data or writing the stop words fails
 */
private void extractStopWords() throws ZinggClientException {
    LOG.info("Stop words generation starts");
    Dataset<Row> input = PipeUtil.read(spark, false, false, args.getData());
    long rowCount = input.count();
    LOG.warn("Read input data : " + rowCount);

    for (FieldDefinition fieldDef : DSUtil.getFieldDefinitionFiltered(args, MatchType.DONT_USE)) {
        findAndWriteStopWords(input, fieldDef);
    }
    LOG.info("Stop words generation finishes");
}

/**
 * Computes the most frequent words of one field and writes them out twice:
 * as CSV under {@code <docDir>/stopWords/} and as HTML under {@code <docDir>/columns/}.
 *
 * The field's values are split on whitespace, empty tokens are dropped, the
 * remaining words are counted and sorted by descending count, and only the top
 * {@code round(distinctWords * stopWordsCutoff)} rows are kept.
 *
 * @param data  input rows containing the field's column
 * @param field definition of the field to analyse
 * @throws ZinggClientException if writing either output file fails
 */
private void findAndWriteStopWords(Dataset<Row> data, FieldDefinition field) throws ZinggClientException {
    final String docDir = args.getZinggDocDir();
    final String stopWordsDir = docDir + "/stopWords/";
    final String columnsDir = docDir + "/columns/";

    checkAndCreateDir(stopWordsDir);
    checkAndCreateDir(columnsDir);

    LOG.debug("Field: " + field.fieldName);

    // Tokenise the column, one word per row, dropping empty tokens.
    Dataset<Row> tokenised = data.select(split(data.col(field.fieldName), "\\s+").as("split"));
    Dataset<Row> words = tokenised.select(explode(tokenised.col("split")).as("word"));
    Dataset<Row> nonEmpty = words.filter(words.col("word").notEqual(""));

    // Rank words by frequency and keep the configured fraction of them.
    Dataset<Row> ranked = nonEmpty.groupBy("word").count().orderBy(desc("count"));
    int keep = Math.round(ranked.count() * args.getStopWordsCutoff());
    Dataset<Row> top = ranked.limit(keep);

    String filenameCSV = stopWordsDir + field.fieldName + ".csv";
    String filenameHTML = columnsDir + field.fieldName + ".html";

    Map<String, Object> model = new HashMap<>();
    model.put("modelId", args.getModelId());
    model.put("stopWords", top.collectAsList());

    writeStopWords(CSV_TEMPLATE, model, filenameCSV);
    writeStopWords(HTML_TEMPLATE, model, filenameHTML);
}

/**
 * Ensures that the given directory exists, creating it and any missing
 * parents when necessary. A failure is logged rather than thrown, so the
 * cause is visible before the later file write fails.
 *
 * @param dirName path of the directory to create
 */
private void checkAndCreateDir(String dirName) {
    File directory = new File(dirName);
    if (directory.isDirectory()) {
        return;
    }
    // mkdirs() also returns false when another process created the directory
    // in the meantime, hence the second isDirectory() check.
    if (!directory.mkdirs() && !directory.isDirectory()) {
        LOG.warn("Unable to create directory " + dirName);
    }
}

/**
 * Renders the named template with the given data model into a file.
 *
 * @param template name of the template to load from the template configuration
 * @param root     data model handed to the template
 * @param fileName path of the file to write
 * @throws ZinggClientException if the template cannot be loaded or processed,
 *         or the file cannot be written; carries the original message
 */
public void writeStopWords(String template, Map<String, Object> root, String fileName)
        throws ZinggClientException {
    try {
        Configuration cfg = getTemplateConfig();
        Template temp = cfg.getTemplate(template);
        // try-with-resources closes the file even when processing throws,
        // which the explicit close() after process() did not.
        try (Writer file = new FileWriter(new File(fileName))) {
            temp.process(root, file);
        }
    } catch (Exception e) {
        e.printStackTrace();
        throw new ZinggClientException(e.getMessage());
    }
}
}
2 changes: 1 addition & 1 deletion core/src/main/java/zingg/util/DSUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ private static Dataset<Row> getTraining(SparkSession spark, Arguments args, Pipe
public static List<FieldDefinition> getFieldDefinitionFiltered(Arguments args, MatchType type) {
return args.getFieldDefinition()
.stream()
.filter(f -> !(f.getMatchType() == null || f.getMatchType().equals(MatchType.DONT_USE)))
.filter(f -> !(f.getMatchType() == null || f.getMatchType().equals(type)))
.collect(Collectors.toList());
}
}
2 changes: 1 addition & 1 deletion core/src/main/resources/model.ftlh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
<tr>
<th>Cluster</th>
<#list 3 ..< numColumns -1 as entityIndex>
<th> ${columns[entityIndex]!}</th>
<th> <a href="columns/${columns[entityIndex]}.html"> ${columns[entityIndex]!} </a></th>
</#list>
</tr>
</thead>
Expand Down
3 changes: 3 additions & 0 deletions core/src/main/resources/stopWordsCSV.ftlh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<#-- Writes one stop word per line. The .ftlh extension enables HTML auto-escaping, which would turn characters such as & or < into entities in this plain-text output, so escaping is disabled per value. -->
<#list stopWords as word>
${word[0]?no_esc}
</#list>
48 changes: 48 additions & 0 deletions core/src/main/resources/stopWordsHTML.ftlh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
<#-- Stop word page for one field: expects modelId and stopWords (rows of word, count) in the data model. -->
<!DOCTYPE html>
<html>
<head>
<title>Zingg model documentation</title>
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" integrity="sha384-MCw98/SFnGE8fJT3GXwEOngsV7Zt27NXFoaoApmYm81iuXoPkFOJwJ8ERdknLPMO" crossorigin="anonymous">
<style>
.header{
position:sticky;
top: 0 ;
}
</style>
</head>
<body>
<nav class="navbar navbar-light bg-light">
<a class="navbar-brand" href="https://www.zingg.ai">
<img src="https://github.com/zinggai/zingg/raw/main/assets/zinggWhiteTransparent.png" class="d-inline-block align-top" alt="">
</a>
<a href="../model.html">
<div class="justify-content-end">Model ${modelId}</div>
</a>
</nav>
<div>
<table class="table table-borderless">
<thead class="thead thead-dark">
<tr>
<th>Word</th>
<th>Count</th>
</tr>
</thead>
<tbody>

<#list stopWords as words>
<tr>
<td>
${words[0]!}
</td>
<td>
${words[1]!}
</td>
</tr>
</#list>

</tbody>
</table>
</div>
</body>
</html>