-
Notifications
You must be signed in to change notification settings - Fork 1
/
Program.fs
517 lines (403 loc) · 16.9 KB
/
Program.fs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
open System
open System.Collections.Generic
open System.IO
open Argu
open RestSharp
open RestSharp.Authenticators.OAuth2
open FSharp.Json
open YamlDotNet.RepresentationModel
open YamlDotNet.Serialization
open AI.Dev.OpenAI.GPT
/// Completion request body sent directly to the OpenAI completions endpoint.
type OAIRequest =
    { model: string
      prompt: string
      temperature: float
      max_tokens: int }

/// Request body used when no API key is configured: the prompt plus the email the user typed.
and OAIRequestWithUserInfo =
    { prompt: string
      email: string }

/// The part of a completion response that this program reads.
and OAIResponse =
    { choices: OAIChoice array }

/// A single completion returned by the API.
and OAIChoice =
    { text: string }

/// The part of `target/manifest.json` that this program reads: nodes keyed by their id string.
and Manifest =
    { nodes: Map<string, NodeMetadata> }

/// Metadata of one manifest node, as deserialized from the manifest JSON.
and NodeMetadata =
    { original_file_path: string
      patch_path: string option
      compiled_code: string option
      raw_code: string option
      description: string option
      name: string
      unique_id: string
      fqn: string array
      columns: Map<string, ColumnMetadata>
      depends_on: Depends option }

/// Metadata of one column of a node.
and ColumnMetadata =
    { name: string
      description: string }

/// Things a node depends on; both lists may be absent.
and Depends =
    { nodes: string array option
      macros: string array option }

/// Runtime settings shared by the whole run.
and Env =
    { apiKey: KeyOrUserInfo
      basePath: string
      projectName: string
      // None means "all undocumented models"; Some restricts the run to the named models.
      models: HashSet<string> option
      dry_run: bool }

/// How requests are authorised: with an API key, or with a user-supplied email.
and KeyOrUserInfo =
    | Key of string
    | UserInfo of string

/// Command-line arguments understood by the Argu parser.
and Arguments =
    | Working_Directory of path: string
    | Gen_Undocumented
    | Gen_Specific of models_list: string
    | Dry_Run

    interface IArgParserTemplate with
        member this.Usage =
            match this with
            | Working_Directory _ -> "DBT project root (default: .)"
            | Gen_Undocumented ->
                "Generate docs for all undocumented models (default: enabled, disabled by --gen-specific)"
            | Gen_Specific _ ->
                "Generate docs only for specified model names (comma-separated list) (default: none, disabled by --gen-undocumented)"
            | Dry_Run -> "Don't write any docs, print them to the command line. (default: false)"

/// Command-line arguments folded into a single configuration value.
and ArgsConfig =
    { workingDirectory: string
      genMode: GenMode
      dry_run: bool }

/// Which models documentation is generated for.
and GenMode =
    | Undocumented
    | Specific of string list
/// Builds the completion prompt asking for markdown documentation of a whole model.
///
/// reverseDeps maps a node id to the ids of the models that depend on it.
/// node is the model being documented.
/// Returns the prompt text.
let mkPrompt (reverseDeps: Dictionary<string, List<string>>) (node: NodeMetadata) =
    // A present-but-empty dependency list is reported the same way as a missing one,
    // so the prompt never contains a blank "Depends on:" line.
    let deps =
        match node.depends_on |> Option.bind (fun d -> d.nodes) with
        | Some upstream when upstream.Length > 0 -> String.concat "," upstream
        | _ -> "(No dependencies)"

    // Single lookup instead of ContainsKey followed by the indexer.
    let rDeps =
        match reverseDeps.TryGetValue(node.unique_id) with
        | true, users when users.Count > 0 -> String.concat "," users
        | _ -> "Not used by any other models"

    let staging =
        if Array.contains "staging" node.fqn then
            "\nThis is a staging model. Be sure to mention that in the summary.\n"
        else
            ""

    let raw_code = Option.defaultValue "" node.raw_code

    $@"Write markdown documentation to explain the following DBT model. Be clear and informative, but also accurate. The only information available is the metadata below.
Explain the raw SQL, then explain the dependencies. Do not list the SQL code or column names themselves; an explanation is sufficient.
Model name: {node.name}
Raw SQL code: {raw_code}
Depends on: {deps}
Depended on by: {rDeps}
{staging}
First, generate a human-readable name for the table as the title (i.e. fct_orders -> # Orders Fact Table).
Then, describe the dependencies (both model dependencies and the warehouse tables used by the SQL.) Do this under ## Dependencies.
Then, describe what other models reference this model in ## How it's used
Then summarize the model logic in ## Summary.
"
/// Builds the completion prompt asking for markdown documentation of a single column.
///
/// node is the model that owns the column; col is the column being documented.
/// Returns the prompt text.
let mkColumnPrompt (node: NodeMetadata) (col: ColumnMetadata) : string =
    // raw_code is an option: interpolating it directly would put "Some(...)" into the prompt,
    // so it is unwrapped first (an absent value becomes an empty string, as in mkPrompt).
    let rawCode = Option.defaultValue "" node.raw_code

    $@"Write markdown documentation to explain the following DBT column in the context of the parent model and SQL code. Be clear and informative, but also accurate. The only information available is the metadata below.
Do not list the SQL code or column names themselves; an explanation is sufficient.
Column Name: {col.name}
Parent Model name: {node.name}
Raw SQL code: {rawCode}
First, explain the meaning of the column in plain, non-technical English.
Then, explain how the column is extracted in code.
"
/// Generated documentation for one model, together with the node fields needed to write it out.
type SummarizedResult =
    {
        // Copied from the node's patch_path; used to group results before writing.
        patch_path: string option
        // Generated model-level description.
        summary: string
        // Copied from the node; its directory is where the generated .md files are written.
        original_file_path: string
        // Generated column descriptions, keyed by the key of the node's columns map.
        columnSummaries: Map<string, string>
        // Model name.
        name: string
    }
/// Lock object taken around console output so that messages printed from parallel work do not interleave.
let stdoutLock = obj ()

/// Raised when a prompt leaves too little room for the completion within the model's token limit.
exception TooManyTokensError
/// Sends one prompt to the completion service and returns the text of the first choice.
///
/// With `Key k` the request goes to the OpenAI completions endpoint using `k` as a bearer token;
/// with `UserInfo email` the prompt and email are posted to the api.textql.com endpoint instead.
///
/// Raises TooManyTokensError when the tokenized prompt plus the completion budget does not fit
/// in the token limit. Raises a generic failure when the response contains no choices.
/// HTTP and deserialization exceptions are not caught here.
let runOpenAIRequest (env: Env) (prompt: string) : Async<string> =
    async {
        // Tokens reserved for the completion, and the total token limit checked against.
        let maxCompletionTokens = 1000
        let tokenLimit = 4096

        let tokens = GPT3Tokenizer.Encode(prompt)

        if tokens.Count + maxCompletionTokens >= tokenLimit then
            raise TooManyTokensError

        let baseReq: OAIRequest =
            { model = "text-davinci-003"
              prompt = prompt
              temperature = 0.2
              max_tokens = maxCompletionTokens }

        let request, options =
            match env.apiKey with
            | Key k ->
                let options = RestClientOptions("https://api.openai.com/v1/completions")
                let request = RestRequest()
                request.AddJsonBody(baseReq) |> ignore
                options.Authenticator <- OAuth2AuthorizationRequestHeaderAuthenticator(k, "Bearer")
                (request, options)
            | UserInfo email ->
                let options = RestClientOptions("https://api.textql.com/api/oai")
                let request = RestRequest()
                let body: OAIRequestWithUserInfo = { prompt = prompt; email = email }
                request.AddJsonBody(body) |> ignore
                (request, options)

        // `use` disposes the client when the computation finishes, successfully or not.
        use client = new RestClient(options)
        let! response = Async.AwaitTask(client.PostAsync(request))
        let result = Json.deserialize<OAIResponse> response.Content

        // An empty choice list gets a descriptive error instead of an index-out-of-range failure.
        match Array.tryHead result.choices with
        | Some choice -> return choice.text
        | None -> return failwith "The completion response contained no choices"
    }
/// Generates a description for every column of `node` whose current description is the empty string.
///
/// All column requests are started together with Async.Parallel.
/// Returns a map from the column's key in `node.columns` to the generated text,
/// each prefixed with "[ai-gen] ".
let genColumnSummaries (env: Env) (node: NodeMetadata) : Async<Map<string, string>> =
    let prefix = "[ai-gen] "

    // One request per column; the map key is carried along so results can be re-keyed.
    let describe (key: string, column: ColumnMetadata) =
        async {
            let! generated = runOpenAIRequest env (mkColumnPrompt node column)
            return key, prefix + generated
        }

    async {
        let! pairs =
            node.columns
            |> Map.toSeq
            |> Seq.filter (fun (_, column) -> column.description.Equals(""))
            |> Seq.map describe
            |> Async.Parallel

        return Map.ofArray pairs
    }
/// Generates the model-level summary and the column summaries for one node.
///
/// Returns Some result on success. Any exception from the requests is reported on the console
/// (with a dedicated message for TooManyTokensError) and turned into None, so one failing model
/// does not abort the others.
let openAISummarize
    (env: Env)
    (reverseDeps: Dictionary<string, List<string>>)
    (node: NodeMetadata)
    : Async<SummarizedResult option> =
    let summaryPrefix =
        "This description is generated by an AI model. Take it with a grain of salt!\n"

    // Console output goes through the shared lock because several models are processed in parallel.
    let say (message: string) =
        lock stdoutLock (fun () -> printfn "%s" message)

    // Table summary first, then the column summaries.
    let generate =
        async {
            let! tableText = runOpenAIRequest env (mkPrompt reverseDeps node)
            let! columnTexts = genColumnSummaries env node
            return tableText, columnTexts
        }

    async {
        say $"Generating docs for: {node.name}"

        let! outcome = generate |> Async.Catch

        match outcome with
        | Choice2Of2 TooManyTokensError ->
            say
                $"Prompt for {node.name} returned too many tokens to fit into GPT-3. Perhaps the SQL code or dependency map is too large?"

            return None
        | Choice2Of2 e ->
            say $"OAI request to {node.name} failed: {e.Message}"
            return None
        | Choice1Of2 (tableText, columnTexts) ->
            let summarized: SummarizedResult =
                { patch_path = node.patch_path
                  name = node.name
                  original_file_path = node.original_file_path
                  summary = summaryPrefix + tableText
                  columnSummaries = columnTexts }

            return Some summarized
    }
/// Writes the generated doc block for one column and points the column's YAML `description` at it.
///
/// modelNode is the YAML mapping of a single column entry; it is mutated in place.
/// Columns whose name is not a key of colMap are left untouched.
/// In dry-run mode the doc block is printed instead of being written to disk; the YAML node
/// is updated in both modes.
let insertColumnDescription
    env
    (nodeResult: SummarizedResult)
    (colMap: Map<string, string>)
    (modelNode: YamlNode)
    : unit =
    let columnNode = modelNode :?> YamlMappingNode
    let columnName = (columnNode.Children[YamlScalarNode("name")] :?> YamlScalarNode).Value

    // Only columns that received a generated summary are present in colMap.
    match colMap.TryFind columnName with
    | None -> ()
    | Some generatedText ->
        let docName = String.concat "__" [ "tql_generated_doc"; nodeResult.name; columnName ]

        // The .md file is placed in the directory of the model's original file.
        let root = env.basePath
        let directory = Path.GetDirectoryName(nodeResult.original_file_path)
        let mdPath = root + "/" + directory + "/" + docName + ".md"

        let docContent =
            "{% docs " + docName + " %}" + "\n" + generatedText + "\n" + "{% enddocs %}"

        lock stdoutLock (fun _ -> printfn $"Writing new docs to: {mdPath}")

        if env.dry_run then
            printfn "%s" docContent
        else
            File.WriteAllText(mdPath, docContent)

        // Replace any existing description with a reference to the generated doc block.
        columnNode.Children.Remove(YamlScalarNode("description")) |> ignore
        let docReference = YamlScalarNode("{{ doc(\"" + docName + "\") }}")
        columnNode.Children.Add(YamlScalarNode("description"), docReference)
/// Writes the generated doc block for one model and points the model's YAML `description` at it,
/// after doing the same for each of its columns.
///
/// modelNode is the YAML mapping of a single model entry; it is mutated in place.
/// Models whose name is not a key of nodeMap are left untouched.
/// In dry-run mode the doc block is printed instead of being written to disk; the YAML node
/// is updated in both modes.
let insertDescription env (nodeMap: Map<string, SummarizedResult>) (modelNode: YamlNode) : unit =
    let modelMapping = modelNode :?> YamlMappingNode
    let modelName = (modelMapping.Children[YamlScalarNode("name")] :?> YamlScalarNode).Value

    // Only models for which a summary was generated are present in nodeMap.
    match nodeMap.TryFind modelName with
    | None -> ()
    | Some node ->
        let docName = "tql_generated_doc__" + node.name

        // The .md file is placed in the directory of the model's original file.
        let root = env.basePath
        let directory = Path.GetDirectoryName(node.original_file_path)
        let mdPath = root + "/" + directory + "/" + docName + ".md"

        let docContent =
            "{% docs " + docName + " %}" + "\n" + node.summary + "\n" + "{% enddocs %}"

        lock stdoutLock (fun _ -> printfn $"Writing new docs to: {mdPath}")

        // Columns are handled before the model-level doc is written.
        let columnsKey = YamlScalarNode("columns")

        if modelMapping.Children.ContainsKey(columnsKey) then
            let columns = modelMapping.Children[columnsKey] :?> YamlSequenceNode

            for columnNode in columns do
                insertColumnDescription env node node.columnSummaries columnNode

        if env.dry_run then
            printfn "%s" docContent
        else
            File.WriteAllText(mdPath, docContent)

        // Replace any existing description with a reference to the generated doc block.
        modelMapping.Children.Remove(YamlScalarNode("description")) |> ignore
        let docReference = YamlScalarNode("{{ doc(\"" + docName + "\") }}")
        modelMapping.Children.Add(YamlScalarNode("description"), docReference)
/// Updates one YAML patch file with the descriptions generated for the models declared in it.
///
/// Takes a patch path together with the results that share it (the shape produced by Seq.groupBy).
/// A group without a patch path is skipped. In dry-run mode the resulting YAML is printed instead
/// of being written back to the file.
let insertDocs (env: Env) (patchPathMay: string option, nodes: SummarizedResult seq) : unit =
    match patchPathMay with
    | None -> ()
    | Some patchPath ->
        // The "<project name>://" prefix is removed to get a path relative to the project root.
        let relativePath = patchPath.Replace(env.projectName + "://", "")
        let path = env.basePath + "/" + relativePath

        let yamlText = File.ReadAllText(path)
        let config = DeserializerBuilder().Build().Deserialize<YamlMappingNode>(yamlText)

        // Results keyed by model name; a later result with the same name replaces an earlier one.
        let resultMap = nodes |> Seq.map (fun n -> n.name, n) |> Map.ofSeq

        let modelsNode = config.Children[YamlScalarNode("models")] :?> YamlSequenceNode

        for modelNode in modelsNode do
            insertDescription env resultMap modelNode

        let yaml = SerializerBuilder().Build().Serialize(config)

        lock stdoutLock (fun _ -> printfn $"Adding description to {Seq.length nodes} models in {path}")

        if env.dry_run then
            printfn "%s" yaml
        else
            File.WriteAllText(path, yaml)
/// Reads `dbt_project.yml` under basePath and returns the value of its top-level `name` key.
///
/// File, parsing, missing-key and cast exceptions are not handled here.
let readProjectConfig (basePath: string) : string =
    let yamlText = File.ReadAllText(basePath + "/dbt_project.yml")
    let root = DeserializerBuilder().Build().Deserialize<YamlMappingNode>(yamlText)
    (root.Children[YamlScalarNode("name")] :?> YamlScalarNode).Value
/// True when the text before the first '.' of `name` is exactly "model".
let isModel (name: string) =
    let resourceType = name.Split('.') |> Array.head
    resourceType = "model"
/// Decides whether documentation should be generated for one manifest entry.
///
/// An entry qualifies when its key denotes a model and either
///   - no explicit model list was given and the node has no description (absent or empty), or
///   - an explicit model list was given and it contains the node's name.
/// Qualifying models without a patch path are reported on the console and rejected.
let shouldWriteDoc (env: Env) (pair: KeyValuePair<string, NodeMetadata>) : bool =
    let node = pair.Value

    let selected () =
        match env.models with
        // description is an option: comparing it to "" with Equals never matches (and fails on
        // None), so an absent or empty description is checked explicitly.
        | None -> node.description |> Option.forall String.IsNullOrEmpty
        | Some models -> models.Contains node.name

    let hasPatchPath = Option.isSome node.patch_path
    let wanted = isModel pair.Key && selected ()

    if wanted && not hasPatchPath then
        printfn
            $"Model {pair.Key} doesn't appear to be declared in a .yml file. Generating docs isn't yet supported for models without a corresponding yaml declaration."

    wanted && hasPatchPath
/// Inverts the dependency edges of the manifest.
///
/// Returns a dictionary from a dependency's id to the ids of the models that list it in
/// `depends_on.nodes`. Only entries whose key denotes a model contribute edges.
let mkReverseDependencyMap (nodes: Map<string, NodeMetadata>) =
    let reverse = Dictionary<string, List<string>>()

    for KeyValue (dependent, metadata) in nodes do
        if isModel dependent then
            let upstream =
                metadata.depends_on
                |> Option.bind (fun d -> d.nodes)
                |> Option.defaultValue [||]

            for dependency in upstream do
                match reverse.TryGetValue(dependency) with
                | true, dependents -> dependents.Add dependent
                | false, _ -> reverse[dependency] <- ResizeArray [ dependent ]

    reverse
/// Raised during start-up when no OPENAI_API_KEY is set and the user answers "no" at the email prompt.
exception ApiKeyNotFound of unit
/// Parses the command line into an ArgsConfig.
///
/// Defaults: working directory "./", Undocumented mode, dry run off. Arguments are applied in the
/// order given, so a later --gen-undocumented / --gen-specific overrides an earlier one.
/// Exceptions raised by the Argu parser (for example on invalid arguments) propagate to the caller.
let parseArgs argv =
    // Splits a comma-separated model list, trimming whitespace around names and dropping empty
    // entries, so that "a, b," yields ["a"; "b"] rather than ["a"; " b"; ""].
    let parseModelNames (modelsList: string) =
        modelsList.Split(',')
        |> Array.map (fun name -> name.Trim())
        |> Array.filter (fun name -> name <> "")
        |> Array.toList

    let foldArgs (config: ArgsConfig) arg =
        match arg with
        | Working_Directory path -> { config with workingDirectory = path }
        | Gen_Undocumented -> { config with genMode = Undocumented }
        | Gen_Specific models_list -> { config with genMode = Specific(parseModelNames models_list) }
        | Dry_Run -> { config with dry_run = true }

    let config0: ArgsConfig =
        { workingDirectory = "./"
          genMode = Undocumented
          dry_run = false }

    let parser = ArgumentParser.Create<Arguments>(programName = "DbtHelper")
    let results = parser.Parse(argv)

    results.GetAllResults() |> Seq.fold foldArgs config0
/// Program entry point.
///
/// Reads the arguments, `target/manifest.json` and `dbt_project.yml`, determines how requests are
/// authorised, then generates documentation for the selected models (four at a time) and writes it
/// into the project, or prints it in dry-run mode.
/// Returns 1 when initialization fails and 0 otherwise.
[<EntryPoint>]
let main argv =
    let init: (Manifest * Env) option =
        try
            let argsEnv = parseArgs argv

            // Each step prints a hint and rethrows; reraise () keeps the original stack trace
            // for the generic handler below, which prints the full exception.
            let contents =
                try
                    File.ReadAllText(argsEnv.workingDirectory + "/target/manifest.json")
                with _ ->
                    printfn "Reading target/manifest.json failed. Please re-run from a dbt project with generated docs"
                    reraise ()

            let manifest =
                try
                    Json.deserialize<Manifest> contents
                with _ ->
                    printfn "manifest.json deserialization failed"
                    reraise ()

            let projectName =
                try
                    readProjectConfig argsEnv.workingDirectory
                with _ ->
                    printfn "Reading dbt_project.yml failed. Please re-run from a dbt project root."
                    reraise ()

            let apiKey =
                match Environment.GetEnvironmentVariable("OPENAI_API_KEY") with
                // An unset, empty or whitespace-only variable all count as "no key".
                | k when not (String.IsNullOrWhiteSpace k) -> Key k
                | _ ->
                    printfn "You haven't specified an API Key. No worries, this one's on TextQL!"

                    printfn
                        "In return, please type your email address. We don't collect any other data, nor sell your email to third parties."

                    printfn
                        "If you're okay with this, press enter. Otherwise, type 'no' and set the OPENAI_API_KEY environment variable."

                    printf "Email (type no to abort): "

                    match Console.ReadLine() with
                    // ReadLine returns null when standard input is closed; treat it as a refusal.
                    | null -> raise (ApiKeyNotFound())
                    | answer when answer.Trim().Equals("no", StringComparison.OrdinalIgnoreCase) ->
                        raise (ApiKeyNotFound())
                    | answer -> UserInfo(answer.Trim())

            let models =
                match argsEnv.genMode with
                | Undocumented -> None
                | Specific ls -> Some(HashSet(ls))

            Some(
                manifest,
                { apiKey = apiKey
                  basePath = argsEnv.workingDirectory
                  projectName = projectName
                  models = models
                  dry_run = argsEnv.dry_run }
            )
        with
        | :? ArguParseException as e ->
            printfn $"{e.Message}"
            None
        | e ->
            printfn "Initialization failed. Aborting"
            printfn $"{e}"
            None

    match init with
    | None -> 1
    | Some (manifest, env) ->
        if env.dry_run then
            printfn "Dry Run. Results will not be written."

        let rDeps = mkReverseDependencyMap manifest.nodes

        // At most four models are processed concurrently.
        let limitedPar fn = Async.Parallel(fn, 4)

        manifest.nodes
        |> Seq.filter (shouldWriteDoc env)
        |> Seq.map (fun x -> openAISummarize env rDeps x.Value)
        |> limitedPar
        |> Async.RunSynchronously
        |> Seq.choose id
        // All models declared in the same patch file are written in one pass over that file.
        |> Seq.groupBy (fun x -> x.patch_path)
        |> Seq.iter (insertDocs env)

        printfn "Success! Make sure to run `dbt docs generate`."
        0