Dupe
1
package com.owl.common.options;
2
3
/**
4
* Options for Dupe Activity
5
*/
6
public class DupeOpt {
7
8
/**
9
* Whether to run Dupe Activity
10
*/
11
public Boolean on = false; // --dupe
12
13
/**
14
* @deprecated Unused for Activity2
15
*/
16
public Boolean only = false; // --dupeonly
17
18
/**
19
* Column names to include in Dupe Activity
20
*/
21
public String[] include; // -dupeinc
22
23
/**
24
* Column names to exclude from Dupe Activity
25
*/
26
public String[] exclude; // dupeexc
27
28
/**
29
* Indicator for complexity. See Activity2.Dupe.Scala.execute()
30
* depth == 0 : exact match (sets props.dupeExactMatch = TRUE downstream)
31
*/
32
public Integer depth = 2; // -depth
33
34
/**
35
* The minimum dupe score between two duplicates (currently calculated as "edit distance", out of upperBound).
36
* Two values with a dupe score less than this lowerBound are not duplicates (i.e. "truly" different values).
37
*/
38
public Integer lowerBound = 80; // -dupelb, -dupecutoff
39
40
/**
41
* The maximum possible dupe score for duplicate records (for a given dupe detection method).
42
* Currently assumed to be 100.
43
*/
44
public Integer upperBound = 100; // -dupeub, -dupepermatchupperlimit
45
46
/**
47
* Approximate dupe score used to create block index (when DF is large)
48
*/
49
public Integer approximate = 1; // -dupeapprox
50
51
/**
52
* Number of observations per unique duplicate
53
*/
54
public Integer limitPerDupe = 15;
55
56
/**
57
* Whether to process column headers when data load uses manual column names (LoadOpts.fileHeader)
58
* TODO this belongs in LoadOpts, not DupeOpt
59
*/
60
public Boolean checkHeader = true;
61
62
/**
63
* TODO remove
64
*
65
* @deprecated not used.
66
*/
67
public String filter;
68
69
/**
70
* If true, dupe activity is case insensitive. If false, dupe activity is case sensitive.
71
* Convenience feature for when the upper and lower bounds are both set to 100.
72
*/
73
public Boolean ignoreCase = false; //-dupenocase
74
75
/**
76
* Number of points each duplicate contributes to the total schema score (in Hoot)
77
*/
78
public Double score = 1.0; //-dupescore points per duplicate found default 1
79
80
/**
81
* Number of unique duplicates to compute during dupe activity
82
*/
83
public Integer limit = 300; //-dupelimit default 300
Copied!
Last modified 20d ago
Copy link