Column Match
This example shows how one can get column level match statistics across datasources in an Owl Notebook. Supports exact and fuzzy matching.

Set ColMatch Parameters

1
%spark
2
import com.owl.common.domain._
3
import com.owl.common.Props
4
import com.owl.core.util.OwlUtils
5
import scala.collection.JavaConverters._
6
import com.owl.common.Utils
7
val c1 = new Connection()
8
c1.dataset = "silo.account"
9
c1.user = "user"
10
c1.password = "pass"
11
c1.query = "select id, networth, acc_name, acc_branch from silo.account limit 200000"
12
c1.url = "jdbc:mysql://<db url>:3306"
13
14
val c2 = new Connection()
15
c2.dataset = "silo.user_account"
16
c2.user = "user"
17
c2.password = "pass"
18
c2.query = "SELECT acc_name, acc_branch, networth FROM silo.account limit 200000"
19
c2.url = "jdbc:mysql://<db url>:3306"
20
21
val c3 = new Connection()
22
c3.dataset = "silo.user_account"
23
c3.user = "user"
24
c3.password = "pass"
25
c3.query = "SELECT acc_name as acc_name2, acc_branch, networth FROM silo.account limit 100000"
26
c3.url = "jdbc:mysql://<db url>:3306"
27
28
props.dataset = "colMatchTest1"
29
props.runId = "2017-02-04"
30
props.connectionList = List(c1,c2,c3).asJava
31
props.colMatchBatchSize = 2
32
props.colMatchDurationMins = 3
33
val owl = OwlUtils.OwlContext(spark.emptyDataFrame, props)
Copied!

Exact Match

1
%spark
2
props.colMatchLevel = "exact"
3
owl.register(props)
4
owl.colMatchDF().show
Copied!

Sample Result

1
+------------+-----------------+----------+----------+---------------+
2
| dataset_1| dataset_2| col_1| col_2|matchPercentage|
3
+------------+-----------------+----------+----------+---------------+
4
|silo.account|silo.user_account| id| acc_name| 0|
5
|silo.account|silo.user_account| id|acc_branch| 0|
6
|silo.account|silo.user_account| id| networth| 0|
7
|silo.account|silo.user_account| id| owl_id| 0|
8
|silo.account|silo.user_account| networth| acc_name| 0|
9
|silo.account|silo.user_account| networth|acc_branch| 16|
10
|silo.account|silo.user_account| networth| networth| 100|
11
|silo.account|silo.user_account| networth| owl_id| 0|
12
|silo.account|silo.user_account| acc_name| acc_name| 87|
13
|silo.account|silo.user_account| acc_name|acc_branch| 0|
14
|silo.account|silo.user_account| acc_name| networth| 0|
15
|silo.account|silo.user_account| acc_name| owl_id| 0|
16
|silo.account|silo.user_account|acc_branch| acc_name| 0|
17
|silo.account|silo.user_account|acc_branch|acc_branch| 87|
18
|silo.account|silo.user_account|acc_branch| networth| 12|
19
|silo.account|silo.user_account|acc_branch| owl_id| 0|
20
|silo.account|silo.user_account| owl_id| acc_name| 0|
21
|silo.account|silo.user_account| owl_id|acc_branch| 0|
22
|silo.account|silo.user_account| owl_id| networth| 0|
23
|silo.account|silo.user_account| owl_id| owl_id| 0|
24
+------------+-----------------+----------+----------+---------------+
25
only showing top 20 rows
Copied!

Fuzzy Match

1
%spark
2
props.colMatchLevel = "fuzzy"
3
props.colMatchFuzzyDistance = 4
4
owl.register(props)
5
owl.colMatchDF().show
Copied!

Sample Result

1
+------------+-----------------+----------+----------+---------------+
2
| dataset_1| dataset_2| col_1| col_2|matchPercentage|
3
+------------+-----------------+----------+----------+---------------+
4
|silo.account|silo.user_account| id| acc_name| 5|
5
|silo.account|silo.user_account| id|acc_branch| 27|
6
|silo.account|silo.user_account| id| networth| 22|
7
|silo.account|silo.user_account| id| owl_id| 0|
8
|silo.account|silo.user_account| networth| acc_name| 100|
9
|silo.account|silo.user_account| networth|acc_branch| 233|
10
|silo.account|silo.user_account| networth| networth| 200|
11
|silo.account|silo.user_account| networth| owl_id| 0|
12
|silo.account|silo.user_account| acc_name| acc_name| 162|
13
|silo.account|silo.user_account| acc_name|acc_branch| 262|
14
|silo.account|silo.user_account| acc_name| networth| 75|
15
|silo.account|silo.user_account| acc_name| owl_id| 0|
16
|silo.account|silo.user_account|acc_branch| acc_name| 262|
17
|silo.account|silo.user_account|acc_branch|acc_branch| 612|
18
|silo.account|silo.user_account|acc_branch| networth| 175|
19
|silo.account|silo.user_account|acc_branch| owl_id| 0|
20
|silo.account|silo.user_account| owl_id| acc_name| 0|
21
|silo.account|silo.user_account| owl_id|acc_branch| 0|
22
|silo.account|silo.user_account| owl_id| networth| 0|
23
|silo.account|silo.user_account| owl_id| owl_id| 0|
24
+------------+-----------------+----------+----------+---------------+
25
only showing top 20 rows
Copied!
Last modified 20d ago