Load
package com.owl.common.options;

import org.apache.commons.lang3.StringUtils;

import java.util.Properties;

/**
 * Owl Options related to data loading
 */
public class LoadOpt {
    // Options order: "unsorted",
    // "dataset scope columns", "dataset scope rows", "look back",
    // "common options for both data sources", "file as data source", "db as data source"

    public static final String SINGLE_QUOTE = "'";
    public static final String DOUBLE_QUOTE = "\"";
    public static final String BACK_TICK = "`";

    /**
     * If true, don't save any metadata.
     * TODO confirm if this is correct
     */
    public Boolean readonly = false;

    /**
     * The Password manager.
     */
    public String passwordManager = null;

    /**
     * Catalog alias (catalog name).
     */
    public String alias = StringUtils.EMPTY;

    // --- Dataset Scope Column specifications ------- //
    // Properties that select columns for Dataset activities or modify them (data types or new columns)
    // prior to and/or during loading into the Spark DF
    /**
     * Dataset scope query. (IMPORTANT)
     * The query should contain all the columns necessary to run the activities.
     * TODO: figure out if this gets used when using files
     */
    public String query = StringUtils.EMPTY;

    /**
     * Concatenated column names (sep = ",") for columns that are keys.
     * TODO: confirm
     */
    public String key = StringUtils.EMPTY;

    /**
     * SELECT expressions that transform columns, with assignment by "=" and delimited by "|".
     * e.g. colname=cast(colname as string)|colname2=cast(colname2 as date)
     */
    public String expression = StringUtils.EMPTY;

    /**
     * Add "OWL_RUN_ID" UNIX timestamp (s) column to the Spark DF using the OwlOptions.runId.
     * Does not obey timeStampDivisor (timestamp is in seconds because of Spark).
     */
    public Boolean addDateColumn = false;

    /**
     * Fill null values in the Spark DF with 0 (numeric columns only)
     */
    public Boolean zeroFillNull = false;

    /**
     * A string that indicates a null value; any value matching this string will be set to null in the Spark DF.
     * Default: "" -> NULL
     * Example: 'null' -> NULL
     * --
     * Note: to empty-string-fill nulls (replace String column null -> ""), use expression
     */
    public String replaceNulls = StringUtils.EMPTY;

    /**
     * All data types are forced to strings for type-safe processing.
     * Not implemented in activity (yet)
     */
    public Boolean stringMode = false;

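    /*
     * Illustrative sketch (not part of the original source): how the dataset-scope
     * column options above might be combined. Field names come from this class; the
     * table and column names are hypothetical.
     *
     *   LoadOpt opts = new LoadOpt();
     *   opts.query = "SELECT id, amount, updated_at FROM orders"; // all columns the activities need
     *   opts.key = "id";                                          // comma-separated key columns
     *   opts.expression = "amount=cast(amount as string)";        // "=" assignment, "|" delimited
     *   opts.zeroFillNull = true;                                 // 0-fill numeric nulls
     */
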
    // --- Dataset Scope Row specifications ------- //
    // Properties that filter rows for Dataset activities
    // prior to and/or during loading into the Spark DF
    /**
     * Convert each row into a string and only use rows containing this value.
     * Strict matching only.
     */
    public String filter = StringUtils.EMPTY;

    /**
     * Convert each row into a string and only use rows NOT containing this value.
     * Strict matching only.
     */
    public String filterNot = StringUtils.EMPTY;

    // --- Look back ------- //
    // For the Look back feature
    /**
     * Build up a history of OwlChecks. Does not include the current OwlCheck.
     * TODO: Document the relationship with unionLookBack
     */
    public Integer backRun = null;

    /**
     * Whether to load data for looking back in history.
     * How much historical data to load is based on OutlierOpt.lookback and PatternOpt.lookback.
     */
    public Boolean unionLookBack = false;

    // --- Shared Data Loading Options ------- //
    // Properties that affect data loading & pre-processing for both file and DB sources
    /**
     * Whether to use cached data for activities
     */
    public Boolean cache = true;

    /**
     * The year, month, and day format of date columns in the dataset, for loading the data only.
     * Default = "yyyy-MM-dd"
     */
    public String dateFormat = "yyyy-MM-dd";

    /**
     * The hour, minute, second, and millisecond format of date columns in the dataset, for loading the data only.
     * Default = "HH:mm:ss.SSS"
     * Not used. Questionable why a separate timeFormat variable exists when dateFormat can represent hms as well.
     */
    public String timeFormat = "HH:mm:ss.SSS";

    /**
     * Whether to convert date columns (specified by activity opts) in the dataset
     * into timestamps in ms (to make it seconds, set Props.timeStampDivisor = "s").
     * TODO: Needs LoadOpt.timeStampDivisor and fix Utils.scala date2Timestamp
     */
    public Boolean timestamp = false;

    /* TODO add timeStampDivisor here and map between owl props?
    public String timeStampDivisor = "ms"
    */

    // --- Using file as data source ------- //
    // Properties that control where & how a static file is read
    /**
     * Full path to the file.
     * If hdfs, then "hdfs://...".
     * If s3, then "s3://...", "s3a://...", or "s3n://...".
     * If parquet, then "...parquet" or "...PARQUET".
     */
    public String filePath = StringUtils.EMPTY;

    /**
     * SQL query used on the file.
     * owl_id is added if not included in the select clause.
     * If empty, then defaults to a full file query.
     * (Does not update LoadOpts.fullFile to true.)
     */
    public String fileQuery = StringUtils.EMPTY;

    /**
     * Whether to use the full file (i.e. use all columns) on data load
     */
    public Boolean fullFile = false;

    /**
     * File column names, comma-separated
     */
    public String fileHeader = null;

    /* TODO checkHeader needs to be moved here from DupeOpt
    public Boolean checkHeader = true; */

    /**
     * Whether to have Spark infer the schema of the data source.
     * If props.profile2 == true, this is overwritten to false!
     * If an xml file, this is ignored and the schema is always inferred by Spark on xml data load.
     * If an avro file, this value is respected (but may get overwritten by props.profile2).
     * (see activity2.Load.file)
     */
    public Boolean inferSchema = true;

    /**
     * Sample without replacement from the file. Valid values are fractions in [0, 1.0].
     * Only takes effect when the file type is xml or unspecified (and therefore assumed to be a delimited table).
     */
    public Double sample = 1.0;

    /**
     * File type (avro, json, orc, parquet, xml).
     * If unspecified, the file is assumed to be a delimited table.
     */
    public FileType fileType = null;

    /**
     * Delimiter for the file. If the number of characters after replacing "\" with "" is 2 or more
     * (e.g. compound delimiters like \t\t), then defaults to "\t" and attempts to read the file as tsv.
     * See Activity2.load.file for details.
     */
    public String delimiter = ",";

    /**
     * File character encoding
     */
    public String fileCharSet = "UTF-8";

    /**
     * The Avro schema for the relevant avro file. Ignored if empty string.
     */
    public String avroSchema = StringUtils.EMPTY;

    /**
     * The Xml row tag for an xml file. Ignored if empty string.
     */
    public String xmlRowTag = StringUtils.EMPTY;

    /**
     * Whether to flatten arrays in a nested schema.
     * TODO explain better. Does this only affect JSON files?
     */
    public Boolean flatten = false;

    /**
     * Whether the data contains maps in json that require extra handling.
     * TODO explain better. Does this only affect JSON files?
     */
    public Boolean handleMaps = false;

    /**
     * Whether to handle mixed json.
     * TODO explain better. Does this only affect JSON files?
     */
    public Boolean handleMixedJson = false;

    /**
     * Spark.read option "multiLine", for JSON files only
     */
    public Boolean multiLine = false;

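    /*
     * Illustrative sketch (not part of the original source): loading a delimited
     * file with the options above. Field names come from this class; the path is
     * hypothetical.
     *
     *   LoadOpt opts = new LoadOpt();
     *   opts.filePath = "hdfs://data/orders.csv";
     *   opts.delimiter = ",";        // single-character delimiter
     *   opts.inferSchema = true;     // may be overwritten to false by props.profile2
     *   opts.sample = 0.5;           // sample half the rows without replacement
     */
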
    // --- Using database as data source ------ //
    /**
     * Path to DB Driver. (e.g. /opt/owl/driver/postgres)
     */
    public String lib = StringUtils.EMPTY;

    /**
     * DB Driver name (Java namespace, e.g. org.postgresql.Driver).
     * Leave as null (default) and LoadOpts.connectionURL will resolve the driver name.
     */
    public String driverName = null;

    /**
     * Connection name in the metastore DB (public.connections.aliasname).
     * Does not refer to the "name" of the database. Refers to the "aliasname" that the user set when
     * uploading the connection config to Owl.
     */
    public String connectionName = StringUtils.EMPTY;

    /**
     * The Connection url, prefixed by jdbc.
     * e.g. "jdbc:postgresql://localhost:5432"
     */
    public String connectionUrl = StringUtils.EMPTY;

    /**
     * DB username
     */
    public String userName = StringUtils.EMPTY;

    /**
     * DB password
     */
    public String password = StringUtils.EMPTY;

    /**
     * JDBC Connection properties (e.g. fetchsize)
     */
    public Properties connectionProperties = null;

    /**
     * Whether the data source is Hive Native (not using JDBC).
     * TODO: Why is the default null as opposed to false?
     */
    public Boolean hiveNative = null;

    /**
     * Whether the data source is Hive Hadoop Web Cluster (not using JDBC)
     */
    public Boolean hiveNativeHWC = false;

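    /*
     * Illustrative sketch (not part of the original source): reading from Postgres
     * over JDBC. Field names come from this class; the connection details are
     * hypothetical.
     *
     *   LoadOpt opts = new LoadOpt();
     *   opts.lib = "/opt/owl/driver/postgres";           // driver directory
     *   opts.connectionUrl = "jdbc:postgresql://localhost:5432";
     *   opts.userName = "owl";
     *   opts.password = "****";
     *   // driverName left null so the connection URL resolves the driver
     */
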
    // --- Parallel JDBC ------- //
    /**
     * When running parallel JDBC, use LoadOpts.query and OwlOptions.dataset as the base table
     */
    public Boolean useSql = true;

    /**
     * When running parallel JDBC, specify the partition column name.
     * Note: Activity2.Load and the web have the hard-coded magic string "OWLAUTOJDBC".
     */
    public String columnName = null;

    /**
     * When running parallel JDBC, the lower bound for the partition column.
     * (e.g. "1000000")
     */
    public String lowerBound = null;

    /**
     * When running parallel JDBC, the upper bound for the partition column.
     * (e.g. "5000000")
     */
    public String upperBound = null;

    /**
     * When running parallel JDBC, the number of partitions used.
     * If 0, then the numPartitions used is based on the number of available Spark executors (1/2 ~ 2/3).
     * If > 20, then overwritten to 20 (no more than 20 concurrent connections to a database on a single dataset).
     */
    public Integer numPartitions = 0;

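    /*
     * Illustrative sketch (not part of the original source): splitting a JDBC read
     * into partitions. Field names come from this class; the column and bounds are
     * hypothetical.
     *
     *   LoadOpt opts = new LoadOpt();
     *   opts.columnName = "id";      // numeric partition column
     *   opts.lowerBound = "0";
     *   opts.upperBound = "5000000";
     *   opts.numPartitions = 10;     // values > 20 are capped at 20
     */
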
    // --- SQL Query properties ---------- //
    // TODO: does this affect DB as source or file as source as well?
    /**
     * Whether the escape character should be a back tick (`).
     * Ignored if escapeCharacter is non-empty (if using OwlCheck from Options).
     * Marked as true if props.escapeCharacter is a back tick
     * (to preserve the bijection between props and opts).
     */
    public Boolean escapeWithBackTick = false;
    /**
     * Whether the escape character should be a single quote (').
     * Ignored if escapeCharacter is non-empty (if using OwlCheck from Options).
     * Marked as true if props.escapeCharacter is a single quote
     * (to preserve the bijection between props and opts).
     */
    public Boolean escapeWithSingleQuote = false;
    /**
     * Whether the escape character should be a double quote (").
     * Ignored if escapeCharacter is non-empty (if using OwlCheck from Options).
     * Marked as true if props.escapeCharacter is a double quote
     * (to preserve the bijection between props and opts).
     */
    public Boolean escapeWithDoubleQuote = false;

    /**
     * Specify a custom escape character. This takes precedence over all other escapeWithXYZ options,
     * i.e. if non-empty, then the other escapeWithXYZ options are ignored.
     * If empty (default), no escaping attempt is made (and the SQL query may fail if it contains a reserved word).
     *
     * @deprecated Access level of this field will be changed to private. Please use {@link #setEscapeCharacter(String)} instead.
     */
    @Deprecated
    public String escapeCharacter = StringUtils.EMPTY;

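    /*
     * Illustrative sketch (not part of the original source): choosing how reserved
     * words in the SQL query get escaped. setEscapeCharacter is referenced by the
     * @deprecated note above; behavior beyond what that note states is assumed.
     *
     *   LoadOpt opts = new LoadOpt();
     *   opts.escapeWithBackTick = true;   // wrap identifiers in back ticks
     *   // or, preferred over the escapeWithXYZ flags:
     *   opts.setEscapeCharacter("`");
     */
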
    /**
     * The enum File type.
     */
    public enum FileType {
        /**
         * Avro file type.
         */
        avro,
        /**
         * Json file type.
         */
        json,
        /**
         * Orc file type.
         */
        orc,
        /**
         * Parquet file type.
         */
        parquet,
        /**
         * Xml file type.
         */
        xml
    }
}