2021.10
Collibra DIC Integration
Powered By GitBook
REST APIs
All REST APIs are available inside the application under the admin section. The APIs can be exercised against the live, running application, which is preferred over static API documentation because it demonstrates that each API actually works as deployed, rather than only as it was described at documentation time.

Product API

The product API is for end-users who want to interact with the official and supported API. You can also generate a client side SDK from the API with 4 steps below.
1
#pseudo-code example REST API
2
3
dataset = 'public.nyse'
4
runId = '2021-03-05'
5
6
#SAVE datasetDef
7
dataset = POST /v3/datasetDefs/ {json_datasetDef}
8
9
#UPDATE datasetDef
10
dataset = PUT /v3/datasetDefs/ {json_datasetDef}
11
12
#RUN JOB
13
jobId = POST /v3/jobs/run/{dataset},{runDate}
14
15
#CHECK STATUS
16
status = /v3/jobs/{jobId}/status
17
18
#GET DQ FINDINGS
19
findings = /v3/jobs/{jobId}/findings
Copied!

Generate Client SDK

    1.
    Open the Swagger editor (https://editor.swagger.io)
    2.
    Click File Import URL
    3.
    Paste a URL that looks like this https://<host>/v2/api-docs?group=Product%20API
    4.
    Click generate client (python, java, scala, C#)
1
#Python SDK Example
2
3
#GET CMDLINE
4
cmdLine = get_job_cmdline(dataset)
5
6
#SUBMIT JOB
7
job_id = run(dataset, run_date)
8
9
#CHECK STATUS
10
status = get_job_status(job_id)
11
12
#GET DQ ISSUES
13
status = get_job_findings(dataset, run_date)
Copied!

Dataset Definition

The JSON for the full dataset definition. It can be more terse to send in the cmdline string of just the variables you use for your DQ Job.
1
-df "yyyy/MM/dd" -owluser <user> -numexecutors 1 -executormemory 1g \
2
-f s3a://s3-datasets/dataset.csv -h <host>:5432/dev?currentSchema=public \
3
-fq "select * from dataset" -drivermemory 1g -master k8s:// -ds dataset_csv_1 \
4
-deploymode cluster -bhlb 10 -rd "2021-04-01" -fullfile -loglevel INFO -cxn s3test5 \
5
-sparkprinc user2@<REALM> -sparkkeytab /tmp/user2.keytab
Copied!
1
{
2
"dataset": "",
3
"runId": "",
4
"runIdEnd": "",
5
"runState": "DRAFT",
6
"passFail": 1,
7
"passFailLimit": 75,
8
"jobId": 0,
9
"coreMaxActiveConnections": null,
10
"linkId": null,
11
"licenseKey": "",
12
"logFile": "",
13
"logLevel": "",
14
"hootOnly": false,
15
"prettyPrint": true,
16
"useTemplate": false,
17
"parallel": false,
18
"plan": false,
19
"dataPreviewOff": false,
20
"datasetSafeOff": false,
21
"obslimit": 300,
22
"pgUser": "",
23
"pgPassword": "",
24
"host": null,
25
"port": null,
26
"user": "anonymous : use -owluser",
27
"alertEmail": null,
28
"scheduleTime": null,
29
"schemaScore": 1,
30
"optionAppend": "",
31
"keyDelimiter": "~~",
32
"agentId": null,
33
"load": {
34
"readonly": false,
35
"passwordManager": null,
36
"alias": "",
37
"query": "",
38
"key": "",
39
"expression": "",
40
"addDateColumn": false,
41
"zeroFillNull": false,
42
"replaceNulls": "",
43
"stringMode": false,
44
"operator": null,
45
"dateColumn": null,
46
"transform": null,
47
"filter": "",
48
"filterNot": "",
49
"sample": 1,
50
"backRun": 0,
51
"backRunBin": "DAY",
52
"unionLookBack": false,
53
"cache": true,
54
"dateFormat": "yyyy-MM-dd",
55
"timeFormat": "HH:mm:ss.SSS",
56
"timestamp": false,
57
"filePath": "",
58
"fileQuery": "",
59
"fullFile": false,
60
"fileHeader": null,
61
"inferSchema": true,
62
"fileType": null,
63
"delimiter": ",",
64
"fileCharSet": "UTF-8",
65
"skipLines": 0,
66
"avroSchema": "",
67
"xmlRowTag": "",
68
"flatten": false,
69
"handleMaps": false,
70
"handleMixedJson": false,
71
"multiLine": false,
72
"lib": "",
73
"driverName": null,
74
"connectionName": "",
75
"connectionUrl": "",
76
"userName": "",
77
"password": "",
78
"connectionProperties": {},
79
"hiveNative": null,
80
"hiveNativeHWC": false,
81
"useSql": true,
82
"columnName": null,
83
"lowerBound": null,
84
"upperBound": null,
85
"numPartitions": 0,
86
"escapeWithBackTick": false,
87
"escapeWithSingleQuote": false,
88
"escapeWithDoubleQuote": false,
89
"escapeCharacter": "",
90
"hasHeader": true
91
},
92
"outliers": [
93
{
94
"id": null,
95
"on": false,
96
"only": false,
97
"lookback": 5,
98
"key": null,
99
"include": null,
100
"exclude": null,
101
"dateColumn": null,
102
"timeColumn": null,
103
"timeBin": "DAY",
104
"timeBinQuery": "",
105
"categorical": true,
106
"by": null,
107
"limit": 300,
108
"minHistory": 3,
109
"historyLimit": 5,
110
"score": 1,
111
"aggFunc": "",
112
"aggQuery": "",
113
"query": "",
114
"q1": 0.15,
115
"q3": 0.85,
116
"categoricalColumnConcatenation": false,
117
"limitCategorical": null,
118
"measurementUnit": "",
119
"multiplierUpper": 1.35,
120
"multiplierLower": 1.35,
121
"record": true,
122
"filter": null,
123
"combine": true,
124
"categoricalConfidenceType": "",
125
"categoricalTopN": 3,
126
"categoricalBottomN": 2,
127
"categoricalMaxConfidence": 0.02,
128
"categoricalMaxFrequencyPercentile": 0.25,
129
"categoricalMinFrequency": 1,
130
"categoricalMinVariance": 0,
131
"categoricalMaxCategoryN": 1,
132
"categoricalParallel": true,
133
"categoricalAlgorithm": "",
134
"categoricalAlgorithmParameters": {}
135
}
136
],
137
"outlier": {
138
"id": null,
139
"on": false,
140
"only": false,
141
"lookback": 5,
142
"key": null,
143
"include": null,
144
"exclude": null,
145
"dateColumn": null,
146
"timeColumn": null,
147
"timeBin": "DAY",
148
"timeBinQuery": "",
149
"categorical": true,
150
"by": null,
151
"limit": 300,
152
"minHistory": 3,
153
"historyLimit": 5,
154
"score": 1,
155
"aggFunc": "",
156
"aggQuery": "",
157
"query": "",
158
"q1": 0.15,
159
"q3": 0.85,
160
"categoricalColumnConcatenation": false,
161
"limitCategorical": null,
162
"measurementUnit": "",
163
"multiplierUpper": 1.35,
164
"multiplierLower": 1.35,
165
"record": true,
166
"filter": null,
167
"combine": true,
168
"categoricalConfidenceType": "",
169
"categoricalTopN": 3,
170
"categoricalBottomN": 2,
171
"categoricalMaxConfidence": 0.02,
172
"categoricalMaxFrequencyPercentile": 0.25,
173
"categoricalMinFrequency": 1,
174
"categoricalMinVariance": 0,
175
"categoricalMaxCategoryN": 1,
176
"categoricalParallel": true,
177
"categoricalAlgorithm": "",
178
"categoricalAlgorithmParameters": {}
179
},
180
"pattern": {
181
"id": null,
182
"only": false,
183
"lookback": 5,
184
"key": null,
185
"dateColumn": null,
186
"include": null,
187
"exclude": null,
188
"score": 1,
189
"minSupport": 0.000033,
190
"confidence": 0.6,
191
"limit": 30,
192
"query": "",
193
"filter": null,
194
"timeBin": "DAY",
195
"on": false,
196
"match": true,
197
"lowFreq": false,
198
"bucketLimit": 450000,
199
"deDupe": true
200
},
201
"patterns": [
202
{
203
"id": null,
204
"only": false,
205
"lookback": 5,
206
"key": null,
207
"dateColumn": null,
208
"include": null,
209
"exclude": null,
210
"score": 1,
211
"minSupport": 0.000033,
212
"confidence": 0.6,
213
"limit": 30,
214
"query": "",
215
"filter": null,
216
"timeBin": "DAY",
217
"on": false,
218
"match": true,
219
"lowFreq": false,
220
"bucketLimit": 450000,
221
"deDupe": true
222
}
223
],
224
"dupe": {
225
"on": false,
226
"only": false,
227
"include": null,
228
"exclude": null,
229
"depth": 0,
230
"lowerBound": 99,
231
"upperBound": 100,
232
"approximate": 1,
233
"limitPerDupe": 15,
234
"checkHeader": true,
235
"filter": null,
236
"ignoreCase": true,
237
"score": 1,
238
"limit": 300
239
},
240
"profile": {
241
"on": true,
242
"only": false,
243
"include": null,
244
"exclude": null,
245
"shape": true,
246
"correlation": null,
247
"histogram": null,
248
"semantic": null,
249
"limit": 300,
250
"histogramLimit": 0,
251
"score": 1,
252
"shapeTotalScore": 0,
253
"shapeSensitivity": 0,
254
"shapeMaxPerCol": 0,
255
"shapeMaxColSize": 0,
256
"shapeGranular": null,
257
"behavioralDimension": "",
258
"behavioralDimensionGroup": "",
259
"behavioralValueColumn": "",
260
"behaviorScoreOff": false,
261
"behaviorLookback": 10,
262
"behaviorMinSupport": 4,
263
"profilePushDown": null,
264
"behaviorRowCheck": true,
265
"behaviorTimeCheck": true,
266
"behaviorMinValueCheck": true,
267
"behaviorMaxValueCheck": true,
268
"behaviorNullCheck": true,
269
"behaviorEmptyCheck": true,
270
"behaviorUniqueCheck": true,
271
"adaptiveTier": null
272
},
273
"source": {
274
"on": false,
275
"only": false,
276
"validateValues": false,
277
"matches": false,
278
"sourcePushDownCount": false,
279
"include": null,
280
"exclude": null,
281
"includeSrc": null,
282
"excludeSrc": null,
283
"key": null,
284
"map": null,
285
"score": 1,
286
"limit": 30,
287
"dataset": "",
288
"driverName": "",
289
"user": "",
290
"password": "",
291
"passwordManager": "",
292
"connectionName": "",
293
"connectionUrl": "",
294
"query": "",
295
"lib": "",
296
"checkType": true,
297
"checkCase": false,
298
"validateValuesFilter": "",
299
"validateSchemaOrder": false,
300
"connectionProperties": {},
301
"filePath": "",
302
"fileQuery": "",
303
"fullFile": false,
304
"header": null,
305
"skipLines": 0,
306
"inferSchema": true,
307
"fileType": null,
308
"delimiter": ",",
309
"fileCharSet": "UTF-8",
310
"avroSchema": "",
311
"xmlRowTag": "",
312
"flatten": false,
313
"handleMaps": false,
314
"handleMixedJson": false,
315
"multiLine": false,
316
"hasHeader": true
317
},
318
"rule": {
319
"on": true,
320
"only": false,
321
"lib": null,
322
"name": "",
323
"absoluteScoring": false,
324
"ruleBreakPreviewLimit": 6
325
},
326
"colMatch": {
327
"colMatchParallelProcesses": 3,
328
"colMatchDurationMins": 20,
329
"colMatchBatchSize": 2,
330
"level": "exact",
331
"fuzzyDistance": 1,
332
"connectionList": []
333
},
334
"spark": {
335
"numExecutors": 3,
336
"driverMemory": "",
337
"executorMemory": "",
338
"executorCores": 1,
339
"conf": "",
340
"queue": "",
341
"master": "local[*]",
342
"principal": "",
343
"keyTab": "",
344
"deployMode": "",
345
"jars": null,
346
"packages": null,
347
"files": null
348
},
349
"env": {
350
"jdbcPrincipal": "",
351
"jdbcKeyTab": ""
352
},
353
"record": {
354
"on": false,
355
"in": "",
356
"notIn": "",
357
"include": null,
358
"percDeltaLimit": 0.1,
359
"score": 1
360
},
361
"transforms": [],
362
"pipeline": []
363
}
Copied!

JWT Token For Auth

1
import requests
2
import json
3
url = "http://localhost:9000/auth/signin"
4
payload = json.dumps({
5
"username": "<user>",
6
"password": "<pass>",
7
"iss": "public"
8
})
9
headers = {
10
'Content-Type': 'application/json'
11
}
12
response = requests.request("POST", url, headers=headers, data=payload)
13
print(response.text)
Copied!
1
curl --location --request POST 'http://localhost:9000/auth/signin' \
2
--header 'Content-Type: application/json' \
3
--data-raw '{
4
"username": "<user>",
5
"password": "<pass>",
6
"iss": "public"
7
}'
Copied!

Python Example

Alternatively, you can use the rest endpoints directly. This example shows how it can be done with Python.
    1.
    Create a dataset def
      1.
      using the UI (Explorer) or
      2.
      using the dataset-def-api (https://<ip>/swagger-ui.html#/dataset-def-api)
    2.
    Confirm your Python environment has the appropriate modules and imports
    3.
    Fill-in the variables and customize to your preference
      1.
      url, user and pass
      2.
      dataset, runDate, and agentName
1
import requests
2
import json
import time
3
4
# Authenticate
5
owl = "https://<url>"
6
url = "https://<url>/auth/signin"
7
payload = json.dumps({
8
"username": "<user>", # Edit Here
9
"password": "<pass>", # Edit Here
10
"iss": "public" # Edit Here
11
})
12
headers = {
13
'Content-Type': 'application/json'
14
}
15
response = requests.request("POST", url, headers=headers, data=payload, verify=False)
16
owl_header = {'Authorization': 'Bearer ' + response.json()['token']}
17
18
19
# Run
20
dataset = '<your_dataset_name>' # Edit Here
21
runDate = '2021-08-08' # Edit Here
22
agentName = '<your_agent_name>' # Edit Here
23
24
response = requests.post(
25
url = owl + '/v3/jobs/run?agentName='+agentName+'&dataset='+dataset+'&runDate='+runDate,
26
headers=owl_header,
27
verify=False
28
)
29
30
jobId = str(response.json()['jobId'])
31
32
33
# Status
34
for stat in range(100):
35
time.sleep(1)
36
37
response = requests.get(
38
url = owl + '/v3/jobs/'+jobId,
39
headers=owl_header,
40
verify=False
41
)
42
43
job = response.json()
44
45
if job['status'] == 'FINISHED':
46
break
47
48
49
# Results
50
response = requests.get(
51
url = owl + '/v3/jobs/'+jobId+'/findings',
52
headers=owl_header,
53
verify=False
54
)
55
56
print(response.json())
Copied!
This assumes you have created a dataset definition using the UI or from the template.

Command Line instead of JSON dataset def

You can run a similar job submission using the cmd line. Please note it is easiest to get the saved command line from the dataset-def-api /v3/datasetDefs/{dataset}/cmdline (with proper escaping) and pass it to the /v3/jobs/runCmdLine endpoint.

Breaking Down The Sections

Submit the Job

Send in a dataset name, date and agent to submit the job. This kicks off the engine to go do the work.
1
# Run
2
dataset = 'API_V3'
3
runDate = '2021-08-08'
4
agentName = 'owldq-owl-agent-owldq-dev-0'
5
6
response = requests.post(
7
url = owl + '/v3/jobs/run?agentName='+agentName+'&dataset='+dataset+'&runDate='+runDate,
8
headers=owl_header
9
)
10
11
jobId = str(response.json()['jobId'])
Copied!

Get the Status

Using the jobId returned from the job submission, you can check the status. In the example above, there is an interval to wait for the job to complete. You can create your own logic and orchestrate more precisely.
1
response = requests.get(
2
url = owl + '/v3/jobs/'+jobId,
3
headers=owl_header
4
)
Copied!

Get the Results

Using the same jobId returned from the job submission, you can check the results. You will get a detailed json object with all the capabilities and detections in one payload. This is where you would decision, based on your organization and process.
1
response = requests.get(
2
url = owl + '/v3/jobs/'+jobId+'/findings',
3
headers=owl_header
4
)
Copied!

Python Example Raw

1
import requests
2
import json
import time
3
4
# Variables
5
owl = 'https://<ip_address>' #Edit
6
user = '<user>' #Edit
7
password = '<password>' #Edit
8
tenant = 'public' #Edit
9
dataset = '<your_dataset_name>' #Edit
10
runDate = '2021-08-08' #Edit
11
agentName = 'your_agent_name' #Edit
12
13
# Authenticate
14
url = owl+'/auth/signin'
15
payload = json.dumps({"username": user, "password": password, "iss": tenant })
16
headers = {'Content-Type': 'application/json'}
17
response = requests.request("POST", url, headers=headers, data=payload, verify=False)
18
owl_header = {'Authorization': 'Bearer ' + response.json()['token']}
19
20
# Run
21
response = requests.post(url = owl + '/v3/jobs/run?agentName='+agentName+'&dataset='+dataset+'&runDate='+runDate, headers=owl_header, verify=False)
22
jobId = str(response.json()['jobId'])
23
24
# Status
25
for stat in range(100):
26
time.sleep(1)
27
28
response = requests.get(url = owl + '/v3/jobs/'+jobId, headers=owl_header, verify=False)
29
status = response.json()['status']
30
31
if status == 'FINISHED':
32
break
33
34
# Results
35
response = requests.get(url = owl + '/v3/jobs/'+jobId+'/findings', headers=owl_header, verify=False)
36
Copied!

Internal API

Collibra DQ also exposes the internal API so that all potential operations are available. The caveat is that these calls may change over time or expose underlying functionality.
Last modified 1mo ago