Skip to content

Commit

Permalink
creating df programmatically rather than from a file
Browse files Browse the repository at this point in the history
  • Loading branch information
navinrathore committed Jul 6, 2022
1 parent 69c9cd2 commit e19add1
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 33 deletions.
41 changes: 36 additions & 5 deletions core/src/test/java/zingg/ZinggSparkTester.java
Original file line number Diff line number Diff line change
@@ -1,21 +1,24 @@
package zingg;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.IntStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;


import zingg.client.Arguments;
import zingg.preprocess.TestStopWords;
Expand All @@ -28,6 +31,9 @@ public class ZinggSparkTester {

public static final Log LOG = LogFactory.getLog(ZinggSparkTester.class);

protected static final String FIELD_INTEGER = "fieldInteger";
protected static final String FIELD_DOUBLE = "fieldDouble";

@BeforeAll
public static void setup() {
try {
Expand Down Expand Up @@ -81,5 +87,30 @@ public Dataset<Row> createDFWithDoubles(int numRows, int numCols) {

}


protected Dataset<Row> createDFWithSampleNumerics() {
StructType schema = new StructType(new StructField[] {
new StructField(FIELD_DOUBLE, DataTypes.DoubleType, true, Metadata.empty()),
new StructField(FIELD_INTEGER, DataTypes.IntegerType, true, Metadata.empty())
});
String a[] = new String[] {
"0.55,55",
"1.234,1234",
"34,gbp",
"99.56,9956",
"56gbp,56",
"23,23gbp",
",45",
"65,",
",",
"0.5 gbp,23",
"56.00,56",
"$,64.0",
"null,34",
"78,null",
"78,87",
};
Dataset<String> dsStr = spark.createDataset(Arrays.asList(a), Encoders.STRING());
Dataset<Row> df = spark.read().schema(schema).csv(dsStr);
return df;
}
}
14 changes: 1 addition & 13 deletions core/src/test/java/zingg/hash/TestGetAs.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,18 @@

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import zingg.ZinggSparkTester;

public class TestGetAs extends ZinggSparkTester {

private static final String FIELD_INTEGER = "fieldInteger";
private static final String FIELD_DOUBLE = "fieldDouble";
Dataset<Row> df;

@BeforeEach
private void setupGetAs() {
String filePath = getClass().getResource("/hash/testHash.csv").getFile();
// String filePath =
StructType schema = new StructType(new StructField[] {
new StructField(FIELD_DOUBLE, DataTypes.DoubleType, false, Metadata.empty()),
new StructField(FIELD_INTEGER, DataTypes.IntegerType, false, Metadata.empty())
});
df = spark.read().format("csv").schema(schema).load(filePath);
df = createDFWithSampleNumerics();
}

/*test values: 0.5 gbp/gbp<blank> etc.*/
Expand Down
15 changes: 0 additions & 15 deletions core/src/test/resources/hash/testHash.csv

This file was deleted.

0 comments on commit e19add1

Please sign in to comment.