Merge pull request #274 from navinrathore/StopWordsNoHeader216

Support stop-word files whose header does not include the 'StopWord' column (the first column of the file is used instead)
zinggAI · May 19, 2022 · ab054de · ab054de
2 parents 61c9bc7 + a47f15e
commit ab054de
Show file tree

Hide file tree

Showing 3 changed files with 65 additions and 0 deletions.
diff --git a/core/src/main/java/zingg/preprocess/StopWords.java b/core/src/main/java/zingg/preprocess/StopWords.java
@@ -3,6 +3,7 @@
 import static org.apache.spark.sql.functions.udf;
 
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 import java.util.stream.Collectors;
 
@@ -25,13 +26,17 @@ public class StopWords {
 	protected static String name = "zingg.preprocess.StopWords";
 	public static final Log LOG = LogFactory.getLog(StopWords.class);
 	protected static String stopWordColumn = "StopWord";
+	protected static final int COLUMN_INDEX_DEFAULT = 0;
 
     public static Dataset<Row> preprocessForStopWords(SparkSession spark, Arguments args, Dataset<Row> ds) throws ZinggClientException {
 
 		List<String> wordList = new ArrayList<String>();
 		for (FieldDefinition def : args.getFieldDefinition()) {
 			if (!(def.getStopWords() == null || def.getStopWords() == "")) {
 				Dataset<Row> stopWords = PipeUtil.read(spark, false, false, PipeUtil.getStopWordsPipe(args, def.getStopWords()));
+				if (!Arrays.asList(stopWords.schema().fieldNames()).contains(stopWordColumn)) {
+					stopWordColumn = stopWords.columns()[COLUMN_INDEX_DEFAULT];
+				}
 				wordList = stopWords.select(stopWordColumn).as(Encoders.STRING()).collectAsList();
 				String pattern = wordList.stream().collect(Collectors.joining("|", "\\b(", ")\\b\\s?"));
 				ds = ds.withColumn(def.getFieldName(), removeStopWords(pattern.toLowerCase()).apply(ds.col(def.getFieldName())));

diff --git a/core/src/test/java/zingg/preprocess/TestStopWords.java b/core/src/test/java/zingg/preprocess/TestStopWords.java
@@ -106,6 +106,50 @@ public void testRemoveStopWordsFromDataset() {
 		}
 	}
 
+	@Test
+	public void testStopWordColumnMissingFromStopWordFile() { // stop-word file used here has no "StopWord" header line; removal on field1 is still expected to work
+		try {
+			StructType schemaOriginal = new StructType(new StructField[] {
+					new StructField(ColName.ID_COL, DataTypes.StringType, false, Metadata.empty()),
+					new StructField("field1", DataTypes.StringType, false, Metadata.empty()),
+					new StructField("field2", DataTypes.StringType, false, Metadata.empty()),
+					new StructField("field3", DataTypes.StringType, false, Metadata.empty()),
+					new StructField(ColName.SOURCE_COL, DataTypes.StringType, false, Metadata.empty())
+			});
+
+			Dataset<Row> original = spark.createDataFrame( // input rows; field1 contains words listed in the stop-word file
+					Arrays.asList(
+							RowFactory.create("10", "The zingg is a spark application", "two",
+									"Yes. a good application", "test"),
+							RowFactory.create("20", "It is very popular in Data Science", "Three", "true indeed",
+									"test"),
+							RowFactory.create("30", "It is written in java and scala", "four", "", "test"),
+							RowFactory.create("40", "Best of luck to zingg Mobile/T-Mobile", "Five", "thank you", "test")),
+					schemaOriginal);
+
+			Dataset<Row> datasetExpected = spark.createDataFrame( // same rows with field1 lower-cased and stop words removed; all other columns unchanged
+				Arrays.asList(
+						RowFactory.create("10", "zingg spark application", "two", "Yes. a good application", "test"),
+						RowFactory.create("20", "very popular data science", "Three", "true indeed", "test"),
+						RowFactory.create("30", "written java scala", "four", "", "test"),
+						RowFactory.create("40", "best luck to zingg ", "Five", "thank you", "test")),
+				schemaOriginal);
+  			String stopWordsFileName = getClass().getResource("../../stopWordsWithoutHeader.csv").getFile(); // relative resource path, resolved against this test class's package
+ 			FieldDefinition fd = new FieldDefinition();
+			fd.setStopWords(stopWordsFileName);
+			fd.setFieldName("field1"); // only field1 is configured for stop-word removal
+
+			List<FieldDefinition> fieldDefinitionList = Arrays.asList(fd);
+			args.setFieldDefinition(fieldDefinitionList);
+
+ 			Dataset<Row> newDataSet = StopWords.preprocessForStopWords(spark, args, original);
+ 			assertTrue(datasetExpected.except(newDataSet).isEmpty()); // no expected row is missing from the result
+			assertTrue(newDataSet.except(datasetExpected).isEmpty()); // no unexpected row is present in the result
+		} catch (Throwable e) { // NOTE(review): Throwable also covers failures thrown by the assertTrue calls above, so those get reported as "Unexpected exception" too
+			fail("Unexpected exception " + e.getMessage());
+		}
+	}
+
 	@Test
 	public void testForOriginalDataAfterPostprocess() {
 

diff --git a/core/src/test/resources/stopWordsWithoutHeader.csv b/core/src/test/resources/stopWordsWithoutHeader.csv
@@ -0,0 +1,16 @@
+java
+Mobile/T-Mobile
+a
+an
+the
+is
+It
+of
+and
+yes
+no
+I
+has
+have
+you
+in