SAP · marcorosa · Apr 11, 2022 · Mar 7, 2022 · Mar 7, 2022 · Mar 7, 2022
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -10,7 +10,7 @@ jobs:
 
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: ['3.7', '3.8', '3.9', '3.10']
 
     services:
       postgres:
@@ -62,14 +62,6 @@ jobs:
       run: |
         python setup.py install --user
 
-    - name: Download and link credentialdigger's models
-      env:
-        path_model: https://github.com/SAP/credential-digger/releases/download/PM-v1.0.1/path_model-1.0.1.tar.gz
-        snippet_model: https://github.com/SAP/credential-digger/releases/download/SM-v1.0.0/snippet_model-1.0.0.tar.gz
-      run: |
-        python -m credentialdigger download path_model
-        python -m credentialdigger download snippet_model
-
     - name: Run unit tests
       run: |
         pytest tests/unit_tests
@@ -85,7 +77,5 @@ jobs:
         POSTGRES_DB: credential_digger_tests
         DBHOST: localhost
         DBPORT: 5432
-        path_model: https://github.com/SAP/credential-digger/releases/download/PM-v1.0.1/path_model-1.0.1.tar.gz
-        snippet_model: https://github.com/SAP/credential-digger/releases/download/SM-v1.0.0/snippet_model-1.0.0.tar.gz
       run: |
         pytest tests/functional_tests
diff --git a/credentialdigger/client.py b/credentialdigger/client.py
@@ -7,6 +7,7 @@
 from datetime import datetime, timezone
 
 import yaml
+from git import GitCommandError
 from github import Github
 from rich.progress import Progress
 
@@ -982,11 +983,16 @@ def scan_user(self, username, category=None, models=None, debug=False,
             # Get repo clone url without .git at the end
             repo_url = repo.clone_url[:-4]
             logger.info(f'{i}/{repos_num}) Scanning {repo.url}')
-            missing_ids[repo_url] = self._scan(repo_url, scanner,
-                                               models=models,
-                                               debug=debug,
-                                               similarity=similarity,
-                                               git_token=git_token)
+            try:
+                missing_ids[repo_url] = self._scan(repo_url, scanner,
+                                                   models=models,
+                                                   debug=debug,
+                                                   similarity=similarity,
+                                                   git_token=git_token)
+            except GitCommandError:
+                logger.warning(f'{i}/{repos_num} Ignore {repo_url} '
+                               '(it can not be cloned)')
+
         return missing_ids
 
     def scan_wiki(self, repo_url, category=None, models=None, debug=False,
@@ -1366,7 +1372,8 @@ def update_similar_snippets(self,
             # Compute similarity of target_embedding and embedding
             similarity = compute_similarity(target_embedding,
                                             embedding)
-            if similarity > threshold:
-                self.update_discovery(d['id'], state)
+            # Increase counter if similar and the update is successful
+            if (similarity > threshold and
+                    self.update_discovery(d['id'], state)):
                 n_updated_snippets += 1
         return n_updated_snippets
diff --git a/credentialdigger/client_postgres.py b/credentialdigger/client_postgres.py
@@ -508,7 +508,7 @@ def update_repo(self, url, last_scan):
         bool
             `True` if the update is successful, `False` otherwise
         """
-        super().update_repo(
+        return super().update_repo(
             url=url, last_scan=last_scan,
             query='UPDATE repos SET last_scan=%s WHERE url=%s RETURNING true'
         )
@@ -528,7 +528,7 @@ def update_discovery(self, discovery_id, new_state):
         bool
             `True` if the update is successful, `False` otherwise
         """
-        super().update_discovery(
+        return super().update_discovery(
             discovery_id=discovery_id,
             new_state=new_state,
             query='UPDATE discoveries SET state=%s WHERE id=%s RETURNING true')
@@ -548,7 +548,7 @@ def update_discoveries(self, discoveries_ids, new_state):
         bool
             `True` if the update is successful, `False` otherwise
         """
-        super().update_discoveries(
+        return super().update_discoveries(
             discoveries_ids=discoveries_ids,
             new_state=new_state,
             query='UPDATE discoveries SET state=%s WHERE id IN %s RETURNING true')
@@ -581,6 +581,6 @@ def update_discovery_group(self, new_state, repo_url, file_name, snippet=None):
         if snippet is not None:
             query += ' and snippet=%s'
         query += ' RETURNING true'
-        super().update_discovery_group(
+        return super().update_discovery_group(
             new_state=new_state, repo_url=repo_url, file_name=file_name,
             snippet=snippet, query=query)
diff --git a/credentialdigger/client_sqlite.py b/credentialdigger/client_sqlite.py
@@ -67,7 +67,7 @@ def query_check(self, query, *args):
         try:
             cursor.execute(query, args)
             self.db.commit()
-            return cursor.rowcount < 1
+            return cursor.rowcount >= 1
         except (TypeError, IndexError):
             """ A TypeError is raised if any of the required arguments is
             missing. """
@@ -544,7 +544,7 @@ def update_repo(self, url, last_scan):
         bool
             `True` if the update is successful, `False` otherwise
         """
-        super().update_repo(
+        return super().update_repo(
             url=url, last_scan=last_scan,
             query='UPDATE repos SET last_scan=? WHERE url=?'
         )
@@ -564,7 +564,7 @@ def update_discovery(self, discovery_id, new_state):
         bool
             `True` if the update is successful, `False` otherwise
         """
-        super().update_discovery(
+        return super().update_discovery(
             new_state=new_state, discovery_id=discovery_id,
             query='UPDATE discoveries SET state=? WHERE id=?'
         )
@@ -584,7 +584,7 @@ def update_discoveries(self, discoveries_ids, new_state):
         bool
             `True` if the update is successful, `False` otherwise
         """
-        super().update_discoveries(
+        return super().update_discoveries(
             discoveries_ids=discoveries_ids,
             new_state=new_state,
             query='UPDATE discoveries SET state=? WHERE id IN('
@@ -617,6 +617,6 @@ def update_discovery_group(self, new_state, repo_url, file_name, snippet=None):
             query += ' and file_name=?'
         if snippet is not None:
             query += ' and snippet=?'
-        super().update_discovery_group(
+        return super().update_discovery_group(
             new_state=new_state, repo_url=repo_url, file_name=file_name,
             snippet=snippet, query=query)
diff --git a/credentialdigger/models/password_model.py b/credentialdigger/models/password_model.py
@@ -46,17 +46,18 @@ def analyze_batch(self, discoveries):
         # We have to classify only the "new" discoveries
         new_discoveries = [d for d in discoveries if d['state'] == 'new']
         no_new_discoveries = [d for d in discoveries if d['state'] != 'new']
-        # Create a dataset with all the preprocessed (new) snippets
-        data = self._pre_process([d['snippet'] for d in new_discoveries])
-        # data = self._preprocess_batch_data(snippets)
-        # Compute a prediction for each snippet
-        outputs = self.model.predict(data)
-        logits = outputs['logits']
-        predictions = tf.argmax(logits, 1)
-        # Check predictions and set FP discoveries accordingly
-        for d, p in zip(new_discoveries, predictions):
-            if p == 0:
-                d['state'] = 'false_positive'
+        # Process new_discoveries if not empty
+        if new_discoveries:
+            # Create a dataset with all the preprocessed (new) snippets
+            data = self._pre_process([d['snippet'] for d in new_discoveries])
+            # Compute a prediction for each snippet
+            outputs = self.model.predict(data)
+            logits = outputs['logits']
+            predictions = tf.argmax(logits, 1)
+            # Check predictions and set FP discoveries accordingly
+            for d, p in zip(new_discoveries, predictions):
+                if p == 0:
+                    d['state'] = 'false_positive'
         return new_discoveries + no_new_discoveries
 
     def analyze(self, discovery):
@@ -69,11 +70,9 @@ def analyze(self, discovery):
 
         Returns
         -------
-        discoveries: list of dict
-            The discoveries, with states updated according to
-            the model's predictions
-        n_false_positives: int
-            The number of false positives detected by the model
+        bool
+            True if the snippet is safe (i.e., there is no leak).
+            False otherwise
         """
         # Preprocess the snippet
         data = self._pre_process([discovery['snippet']])
@@ -84,6 +83,7 @@ def analyze(self, discovery):
             # The model classified this snippet as a false positive
             # (i.e., spam)
             return True
+        return False
 
     def _pre_process(self, snippet):
         """ Compute encodings of snippets and format them to a standard

diff --git a/credentialdigger/models/path_model.py b/credentialdigger/models/path_model.py
@@ -21,7 +21,8 @@ def analyze(self, discovery):
         Returns
         -------
         bool
-            True if the discovery is classified as false positive (i.e., spam)
+            True if the discovery is classified as false positive (i.e., spam),
+            False otherwise
         """
         return bool(self.fp_keywords.search(discovery['file_name'].lower()))
 

diff --git a/credentialdigger/scanners/file_scanner.py b/credentialdigger/scanners/file_scanner.py
@@ -157,7 +157,7 @@ def scan_file(self, project_root, relative_path, **kwargs):
                 for row in file_to_scan:
                     rh = ResultHandler()
                     self.stream.scan(
-                        row if sys.version_info < (3, 9) else row.encode(
+                        row if sys.version_info < (3, 8) else row.encode(
                             'utf-8'),
                         match_event_handler=rh.handle_results,
                         context=[row.strip(), relative_path, commit_id,
@@ -169,6 +169,8 @@ def scan_file(self, project_root, relative_path, **kwargs):
         except UnicodeDecodeError:
             # Don't scan binary files
             pass
+        except FileNotFoundError:
+            logger.warning(f'Ignore {relative_path} (file not found)')
         return discoveries
 
     def _prune(self, rel_dir_root, dirs, files, max_depth=-1, ignore_list=[]):

diff --git a/credentialdigger/scanners/git_scanner.py b/credentialdigger/scanners/git_scanner.py
@@ -100,6 +100,7 @@ def get_git_repo(self, repo_url, local_repo):
                 GitRepo.clone_from(repo_url, project_path)
                 repo = GitRepo(project_path)
             except GitCommandError as e:
+                logger.warning('Repo can not be cloned')
                 shutil.rmtree(project_path)
                 raise e
 
@@ -405,7 +406,7 @@ def _regex_check(self, printable_diff, filename, commit_hash):
 
             rh = ResultHandler()
             self.stream.scan(
-                row if sys.version_info < (3, 9) else row.encode('utf-8'),
+                row if sys.version_info < (3, 8) else row.encode('utf-8'),
                 match_event_handler=rh.handle_results,
                 context=[row, filename, commit_hash, line_number])
             if rh.result:

diff --git a/requirements.txt b/requirements.txt
@@ -1,21 +1,21 @@
 Flask
 flask_jwt_extended
 GitPython
-hyperscan==0.2.0; python_version >= "3.9"
-hyperscan==0.1.5; python_version < "3.9"
+hyperscan==0.2.0; python_version >= "3.8"
+hyperscan==0.1.5; python_version < "3.8"
 numpy
 pandas
 psycopg2-binary
 PyGithub
 python-dotenv
 pyyaml
-rich
+rich~=12.2
 srsly>=2.4.0
-tensorflow==2.6.2; python_version >= "3.8"
-tensorflow==2.4.*; python_version < "3.8"
-tensorflow-estimator==2.6.0; python_version >= "3.8"
-tensorflow-estimator==2.4.*; python_version < "3.8"
-tensorflow-text==2.6.0; python_version >= "3.8"
-tensorflow-text==2.4.*; python_version < "3.8"
+tensorflow==2.8.0; python_version >= "3.8"
+tensorflow~=2.4; python_version < "3.8"
+tensorflow-estimator==2.8.0; python_version >= "3.8"
+tensorflow-estimator~=2.4; python_version < "3.8"
+tensorflow-text==2.8.1; python_version >= "3.8"
+tensorflow-text~=2.4; python_version < "3.8"
 tf-models-official
 transformers
diff --git a/setup.py b/setup.py
@@ -13,7 +13,7 @@ def requirements():
 
 setuptools.setup(
     name='credentialdigger',
-    version='4.6.1',
+    version='4.7.0',
     author='SAP SE',
     maintainer='Marco Rosa, Slim Trabelsi',
     maintainer_email='marco.rosa@sap.com, slim.trabelsi@sap.com',

diff --git a/tests/functional_tests/test_rules.yml b/tests/functional_tests/test_rules.yml
@@ -31,6 +31,6 @@ rules:
     category: token
     description: MailChimp API key
 
-  - regex: sshpass|password|pwd|passwd|pass
+  - regex: sshpass|password|pwd|passwd|pass[\W]
     category: password
-    description: password keywords
+    description: password keywords
diff --git a/tests/functional_tests/test_scans_postgres.py b/tests/functional_tests/test_scans_postgres.py
@@ -7,6 +7,8 @@
 from git import Repo as GitRepo
 from psycopg2 import connect
 
+TOTAL_PW_DISCOVERIES = 11
+
 
 class TestScansPostgres(unittest.TestCase):
     dotenv = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".env")
@@ -35,7 +37,7 @@ def test_scan_github(self):
             cli.main(["", "scan", "--category", "password",
                       "--dotenv", self.dotenv,
                       "--force", self.repo_url])
-        self.assertEqual(cm.exception.code, 9)
+        self.assertEqual(cm.exception.code, TOTAL_PW_DISCOVERIES)
 
     def test_scan_local(self):
         repo_path = tempfile.mkdtemp()
@@ -46,9 +48,9 @@ def test_scan_local(self):
                       "--models", "PathModel", "PasswordModel",
                       "--category", "password",
                       "--force", "--local", repo_path])
-        # When using the models, we expect to be left with less than 9
-        # discoveries to manually review
-        self.assertTrue(cm.exception.code < 9)
+        # When using the models, we expect to be left with less than
+        # TOTAL_PW_DISCOVERIES discoveries to manually review
+        self.assertTrue(cm.exception.code < TOTAL_PW_DISCOVERIES)
 
         shutil.rmtree(repo_path)
 

diff --git a/tests/functional_tests/test_scans_sqlite.py b/tests/functional_tests/test_scans_sqlite.py
@@ -6,6 +6,8 @@
 from credentialdigger.client_sqlite import SqliteClient
 from git import Repo as GitRepo
 
+TOTAL_PW_DISCOVERIES = 11
+
 
 class TestScansSqlite(unittest.TestCase):
     repo_url = 'https://github.com/SAP/credential-digger-tests'
@@ -27,7 +29,7 @@ def test_scan_github(self):
             cli.main(["", "scan", "--sqlite", self.db_path,
                       "--category", "password",
                       "--force", self.repo_url])
-        self.assertEqual(cm.exception.code, 9)
+        self.assertEqual(cm.exception.code, TOTAL_PW_DISCOVERIES)
 
     def test_scan_local(self):
         repo_path = os.path.join(self.tmp_path, "tmp_repo")
@@ -38,9 +40,9 @@ def test_scan_local(self):
                       "--models", "PathModel", "PasswordModel",
                       "--category", "password",
                       "--force", "--local", repo_path])
-        # When using the models, we expect to be left with less than 9
-        # discoveries to manually review
-        self.assertTrue(cm.exception.code < 9)
+        # When using the models, we expect to be left with less than
+        # TOTAL_PW_DISCOVERIES discoveries to manually review
+        self.assertTrue(cm.exception.code < TOTAL_PW_DISCOVERIES)
 
     def test_scan_wiki(self):
         with self.assertRaises(SystemExit) as cm: