@@ -27,8 +27,17 @@ evaluate and classify in real time the domains in the table using a query.
27
27
`The tutorial is available as a Jupyter notebook
28
28
<https://github.com/DevoInc/python-mlmodelmanager-client/blob/main/notebooks/dga-domain-classifier.ipynb> `_.
29
29
30
- Build the model
31
- ---------------
30
+ Requirements
31
+ ------------
32
+
33
+ * Python >= 3.7.
34
+ * Devo table ``demo.ecommerce.data``.
35
+
36
+ For convenience, we recommend creating a virtual environment to run the
37
+ tutorial, or using the notebook provided.
38
+
39
+ Setup
40
+ -----
32
41
33
42
Let's start by installing the required packages.
34
43
@@ -54,7 +63,7 @@ Declare some constants for convenience in the code.
54
63
.. code-block ::
55
64
56
65
# A valid Devo access token
57
- TOKEN = '<your_token_here>'
66
+ DEVO_TOKEN = '<your_token_here>'
58
67
59
68
# URL of Devo API, e.g. https://apiv2-us.devo.com/search/query/
60
69
DEVO_API_URL = '<devo_api_url_here>'
@@ -66,10 +75,10 @@ Declare some constants for convenience in the code.
66
75
DOMAIN = '<your_domain_here>'
67
76
68
77
# The name of the model
69
- NAME = 'dga_classifier'
78
+ MODEL_NAME = 'dga_classifier'
70
79
71
80
# The description of the model
72
- DESCRIPTION = 'DGA domain classifier'
81
+ MODEL_DESCRIPTION = 'DGA domain classifier'
73
82
74
83
# The path where the model file will be stored
75
84
MODELS_PATH = '~/models'
@@ -79,11 +88,13 @@ Declare some constants for convenience in the code.
79
88
80
89
VOWELS = "aeiouAEIOU"
81
90
82
- We use the `h2o <https://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/index.html >`_
83
- library to create a model capable of detecting whether a domain is malicious or
84
- not and this `dataset
91
+ Prepare the data
92
+ ----------------
93
+
94
+ This `dataset
85
95
<https://devo-ml-models-public-demos.s3.eu-west-3.amazonaws.com/legit_dga/dataset.csv> `_
86
- , which has the form: *host;domain;class;subclass. *
96
+ will help us train our model once it has been built. The dataset has the
97
+ form ``host;domain;class;subclass``.
87
98
88
99
.. code-block :: text
89
100
@@ -98,16 +109,12 @@ not and this `dataset
98
109
100bestbuy.com;100bestbuy;legit;legit
99
110
...
100
111
101
- In the dataset preparation we will add the columns ``length ``, ``entropy `` and
102
- ``vowel_proportion `` for each domain, and also the flag ``malicious `` indicating
103
- if it is a DGA domain according to the ``class `` column value.
104
-
105
- As a result we will have a model saved in a file in `~/models `.
112
+ We will add the columns ``length``, ``entropy`` and ``vowel_proportion`` for
113
+ each domain, and also the flag ``malicious`` indicating whether it is a DGA domain
114
+ according to the ``class`` column value.
106
115
107
116
.. code-block ::
108
117
109
- h2o.init()
110
-
111
118
# import dataset
112
119
domains = h2o.import_file(DATASET_URL, header=1)
113
120
@@ -122,11 +129,21 @@ As a result we will have a model saved in a file in `~/models`.
122
129
domains['malicious'] = domains['class'] != 'legit'
123
130
domains['malicious'] = domains['malicious'].asfactor()
124
131
132
+ Build the model
133
+ ---------------
134
+
135
+ We use the `h2o <https://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/index.html>`_
136
+ library to create a model capable of detecting whether a domain is malicious.
137
+
138
+ .. code-block ::
139
+
140
+ h2o.init()
141
+
125
142
# split dataset
126
143
train, valid = domains.split_frame(ratios=[.8], seed=1234)
127
144
128
145
# create and train the model
129
- model = H2OGradientBoostingEstimator(model_id=NAME )
146
+ model = H2OGradientBoostingEstimator(model_id=MODEL_NAME )
130
147
model.train(
131
148
x=['length', 'entropy', 'vowel_proportion'],
132
149
y='malicious',
@@ -140,12 +157,6 @@ As a result we will have a model saved in a file in `~/models`.
140
157
141
158
h2o.cluster().shutdown()
142
159
143
- .. note ::
144
-
145
- The aim of this tutorial is to show the integration of the ML Model
146
- Manager Client into the machine learning process not the development of
147
- an optimal and accurate machine learning model.
148
-
149
160
Register the model
150
161
------------------
151
162
@@ -156,14 +167,14 @@ Client.
156
167
.. code-block ::
157
168
158
169
# create the mlmm client
159
- mlmm = create_client_from_token(DEVO_MLMM_URL, TOKEN )
170
+ mlmm = create_client_from_token(DEVO_MLMM_URL, DEVO_TOKEN )
160
171
161
172
# register the model
162
173
mlmm.add_model(
163
- NAME ,
174
+ MODEL_NAME ,
164
175
engines.H2O,
165
- os.path.join(MODELS_PATH, f"{NAME }.zip"),
166
- description=DESCRIPTION ,
176
+ os.path.join(MODELS_PATH, f"{MODEL_NAME }.zip"),
177
+ description=MODEL_DESCRIPTION ,
167
178
force=True
168
179
)
169
180
@@ -193,7 +204,7 @@ A query that might be worthwhile would be something like this.
193
204
float(length(domain)) as length,
194
205
shannonentropy(domain) as entropy,
195
206
float(countbyfilter(domain, "{VOWELS}")) as vowel_proportion,
196
- mlevalmodel("{DOMAIN}", "{NAME }", length, entropy, vowel_proportion) as class
207
+ mlevalmodel("{DOMAIN}", "{MODEL_NAME }", length, entropy, vowel_proportion) as class
197
208
'''
198
209
199
210
.. note ::
@@ -211,7 +222,7 @@ and securely.
211
222
212
223
# create a Devo API client
213
224
api = Client(
214
- auth={"token": TOKEN },
225
+ auth={"token": DEVO_TOKEN },
215
226
address=DEVO_API_URL,
216
227
config=ClientConfig(
217
228
response="json/simple/compact",
0 commit comments