Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

upgrade to v4.7.0 #228

Merged
merged 20 commits into from
Apr 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 1 addition & 11 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:

strategy:
matrix:
python-version: [3.6, 3.7, 3.8, 3.9]
python-version: ['3.7', '3.8', '3.9', '3.10']

services:
postgres:
Expand Down Expand Up @@ -62,14 +62,6 @@ jobs:
run: |
python setup.py install --user

- name: Download and link credentialdigger's models
env:
path_model: https://github.com/SAP/credential-digger/releases/download/PM-v1.0.1/path_model-1.0.1.tar.gz
snippet_model: https://github.com/SAP/credential-digger/releases/download/SM-v1.0.0/snippet_model-1.0.0.tar.gz
run: |
python -m credentialdigger download path_model
python -m credentialdigger download snippet_model

- name: Run unit tests
run: |
pytest tests/unit_tests
Expand All @@ -85,7 +77,5 @@ jobs:
POSTGRES_DB: credential_digger_tests
DBHOST: localhost
DBPORT: 5432
path_model: https://github.com/SAP/credential-digger/releases/download/PM-v1.0.1/path_model-1.0.1.tar.gz
snippet_model: https://github.com/SAP/credential-digger/releases/download/SM-v1.0.0/snippet_model-1.0.0.tar.gz
run: |
pytest tests/functional_tests
21 changes: 14 additions & 7 deletions credentialdigger/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from datetime import datetime, timezone

import yaml
from git import GitCommandError
from github import Github
from rich.progress import Progress

Expand Down Expand Up @@ -982,11 +983,16 @@ def scan_user(self, username, category=None, models=None, debug=False,
# Get repo clone url without .git at the end
repo_url = repo.clone_url[:-4]
logger.info(f'{i}/{repos_num}) Scanning {repo.url}')
missing_ids[repo_url] = self._scan(repo_url, scanner,
models=models,
debug=debug,
similarity=similarity,
git_token=git_token)
try:
missing_ids[repo_url] = self._scan(repo_url, scanner,
models=models,
debug=debug,
similarity=similarity,
git_token=git_token)
except GitCommandError:
logger.warning(f'{i}/{repos_num} Ignore {repo_url} '
'(it can not be cloned)')

return missing_ids

def scan_wiki(self, repo_url, category=None, models=None, debug=False,
Expand Down Expand Up @@ -1366,7 +1372,8 @@ def update_similar_snippets(self,
# Compute similarity of target_embedding and embedding
similarity = compute_similarity(target_embedding,
embedding)
if similarity > threshold:
self.update_discovery(d['id'], state)
# Increase counter if similar and the update is successful
if (similarity > threshold and
self.update_discovery(d['id'], state)):
n_updated_snippets += 1
return n_updated_snippets
8 changes: 4 additions & 4 deletions credentialdigger/client_postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ def update_repo(self, url, last_scan):
bool
`True` if the update is successful, `False` otherwise
"""
super().update_repo(
return super().update_repo(
url=url, last_scan=last_scan,
query='UPDATE repos SET last_scan=%s WHERE url=%s RETURNING true'
)
Expand All @@ -528,7 +528,7 @@ def update_discovery(self, discovery_id, new_state):
bool
`True` if the update is successful, `False` otherwise
"""
super().update_discovery(
return super().update_discovery(
discovery_id=discovery_id,
new_state=new_state,
query='UPDATE discoveries SET state=%s WHERE id=%s RETURNING true')
Expand All @@ -548,7 +548,7 @@ def update_discoveries(self, discoveries_ids, new_state):
bool
`True` if the update is successful, `False` otherwise
"""
super().update_discoveries(
return super().update_discoveries(
discoveries_ids=discoveries_ids,
new_state=new_state,
query='UPDATE discoveries SET state=%s WHERE id IN %s RETURNING true')
Expand Down Expand Up @@ -581,6 +581,6 @@ def update_discovery_group(self, new_state, repo_url, file_name, snippet=None):
if snippet is not None:
query += ' and snippet=%s'
query += ' RETURNING true'
super().update_discovery_group(
return super().update_discovery_group(
new_state=new_state, repo_url=repo_url, file_name=file_name,
snippet=snippet, query=query)
10 changes: 5 additions & 5 deletions credentialdigger/client_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def query_check(self, query, *args):
try:
cursor.execute(query, args)
self.db.commit()
return cursor.rowcount < 1
return cursor.rowcount >= 1
except (TypeError, IndexError):
""" A TypeError is raised if any of the required arguments is
missing. """
Expand Down Expand Up @@ -544,7 +544,7 @@ def update_repo(self, url, last_scan):
bool
`True` if the update is successful, `False` otherwise
"""
super().update_repo(
return super().update_repo(
url=url, last_scan=last_scan,
query='UPDATE repos SET last_scan=? WHERE url=?'
)
Expand All @@ -564,7 +564,7 @@ def update_discovery(self, discovery_id, new_state):
bool
`True` if the update is successful, `False` otherwise
"""
super().update_discovery(
return super().update_discovery(
new_state=new_state, discovery_id=discovery_id,
query='UPDATE discoveries SET state=? WHERE id=?'
)
Expand All @@ -584,7 +584,7 @@ def update_discoveries(self, discoveries_ids, new_state):
bool
`True` if the update is successful, `False` otherwise
"""
super().update_discoveries(
return super().update_discoveries(
discoveries_ids=discoveries_ids,
new_state=new_state,
query='UPDATE discoveries SET state=? WHERE id IN('
Expand Down Expand Up @@ -617,6 +617,6 @@ def update_discovery_group(self, new_state, repo_url, file_name, snippet=None):
query += ' and file_name=?'
if snippet is not None:
query += ' and snippet=?'
super().update_discovery_group(
return super().update_discovery_group(
new_state=new_state, repo_url=repo_url, file_name=file_name,
snippet=snippet, query=query)
32 changes: 16 additions & 16 deletions credentialdigger/models/password_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,18 @@ def analyze_batch(self, discoveries):
# We have to classify only the "new" discoveries
new_discoveries = [d for d in discoveries if d['state'] == 'new']
no_new_discoveries = [d for d in discoveries if d['state'] != 'new']
# Create a dataset with all the preprocessed (new) snippets
data = self._pre_process([d['snippet'] for d in new_discoveries])
# data = self._preprocess_batch_data(snippets)
# Compute a prediction for each snippet
outputs = self.model.predict(data)
logits = outputs['logits']
predictions = tf.argmax(logits, 1)
# Check predictions and set FP discoveries accordingly
for d, p in zip(new_discoveries, predictions):
if p == 0:
d['state'] = 'false_positive'
# Process new_discoveries if not empty
if new_discoveries:
# Create a dataset with all the preprocessed (new) snippets
data = self._pre_process([d['snippet'] for d in new_discoveries])
# Compute a prediction for each snippet
outputs = self.model.predict(data)
logits = outputs['logits']
predictions = tf.argmax(logits, 1)
# Check predictions and set FP discoveries accordingly
for d, p in zip(new_discoveries, predictions):
if p == 0:
d['state'] = 'false_positive'
return new_discoveries + no_new_discoveries

def analyze(self, discovery):
Expand All @@ -69,11 +70,9 @@ def analyze(self, discovery):

Returns
-------
discoveries: list of dict
The discoveries, with states updated according to
the model's predictions
n_false_positives: int
The number of false positives detected by the model
bool
True if the snippet is safe (i.e., there is no leak).
False otherwise
"""
# Preprocess the snippet
data = self._pre_process([discovery['snippet']])
Expand All @@ -84,6 +83,7 @@ def analyze(self, discovery):
# The model classified this snippet as a false positive
# (i.e., spam)
return True
return False

def _pre_process(self, snippet):
""" Compute encodings of snippets and format them to a standard
Expand Down
3 changes: 2 additions & 1 deletion credentialdigger/models/path_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ def analyze(self, discovery):
Returns
-------
bool
True if the discovery is classified as false positive (i.e., spam)
True if the discovery is classified as false positive (i.e., spam),
False otherwise
"""
return bool(self.fp_keywords.search(discovery['file_name'].lower()))

Expand Down
4 changes: 3 additions & 1 deletion credentialdigger/scanners/file_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def scan_file(self, project_root, relative_path, **kwargs):
for row in file_to_scan:
rh = ResultHandler()
self.stream.scan(
row if sys.version_info < (3, 9) else row.encode(
row if sys.version_info < (3, 8) else row.encode(
'utf-8'),
match_event_handler=rh.handle_results,
context=[row.strip(), relative_path, commit_id,
Expand All @@ -169,6 +169,8 @@ def scan_file(self, project_root, relative_path, **kwargs):
except UnicodeDecodeError:
# Don't scan binary files
pass
except FileNotFoundError:
logger.warning(f'Ignore {relative_path} (file not found)')
return discoveries

def _prune(self, rel_dir_root, dirs, files, max_depth=-1, ignore_list=[]):
Expand Down
3 changes: 2 additions & 1 deletion credentialdigger/scanners/git_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def get_git_repo(self, repo_url, local_repo):
GitRepo.clone_from(repo_url, project_path)
repo = GitRepo(project_path)
except GitCommandError as e:
logger.warning('Repo can not be cloned')
shutil.rmtree(project_path)
raise e

Expand Down Expand Up @@ -405,7 +406,7 @@ def _regex_check(self, printable_diff, filename, commit_hash):

rh = ResultHandler()
self.stream.scan(
row if sys.version_info < (3, 9) else row.encode('utf-8'),
row if sys.version_info < (3, 8) else row.encode('utf-8'),
match_event_handler=rh.handle_results,
context=[row, filename, commit_hash, line_number])
if rh.result:
Expand Down
18 changes: 9 additions & 9 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
Flask
flask_jwt_extended
GitPython
hyperscan==0.2.0; python_version >= "3.9"
hyperscan==0.1.5; python_version < "3.9"
hyperscan==0.2.0; python_version >= "3.8"
hyperscan==0.1.5; python_version < "3.8"
numpy
pandas
psycopg2-binary
PyGithub
python-dotenv
pyyaml
rich
rich~=12.2
srsly>=2.4.0
tensorflow==2.6.2; python_version >= "3.8"
tensorflow==2.4.*; python_version < "3.8"
tensorflow-estimator==2.6.0; python_version >= "3.8"
tensorflow-estimator==2.4.*; python_version < "3.8"
tensorflow-text==2.6.0; python_version >= "3.8"
tensorflow-text==2.4.*; python_version < "3.8"
tensorflow==2.8.0; python_version >= "3.8"
tensorflow~=2.4; python_version < "3.8"
tensorflow-estimator==2.8.0; python_version >= "3.8"
tensorflow-estimator~=2.4; python_version < "3.8"
tensorflow-text==2.8.1; python_version >= "3.8"
tensorflow-text~=2.4; python_version < "3.8"
tf-models-official
transformers
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def requirements():

setuptools.setup(
name='credentialdigger',
version='4.6.1',
version='4.7.0',
author='SAP SE',
maintainer='Marco Rosa, Slim Trabelsi',
maintainer_email='marco.rosa@sap.com, slim.trabelsi@sap.com',
Expand Down
4 changes: 2 additions & 2 deletions tests/functional_tests/test_rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,6 @@ rules:
category: token
description: MailChimp API key

- regex: sshpass|password|pwd|passwd|pass
- regex: sshpass|password|pwd|passwd|pass[\W]
category: password
description: password keywords
description: password keywords
10 changes: 6 additions & 4 deletions tests/functional_tests/test_scans_postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from git import Repo as GitRepo
from psycopg2 import connect

TOTAL_PW_DISCOVERIES = 11


class TestScansPostgres(unittest.TestCase):
dotenv = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".env")
Expand Down Expand Up @@ -35,7 +37,7 @@ def test_scan_github(self):
cli.main(["", "scan", "--category", "password",
"--dotenv", self.dotenv,
"--force", self.repo_url])
self.assertEqual(cm.exception.code, 9)
self.assertEqual(cm.exception.code, TOTAL_PW_DISCOVERIES)

def test_scan_local(self):
repo_path = tempfile.mkdtemp()
Expand All @@ -46,9 +48,9 @@ def test_scan_local(self):
"--models", "PathModel", "PasswordModel",
"--category", "password",
"--force", "--local", repo_path])
# When using the models, we expect to be left with less than 9
# discoveries to manually review
self.assertTrue(cm.exception.code < 9)
# When using the models, we expect to be left with less than
# TOTAL_PW_DISCOVERIES discoveries to manually review
self.assertTrue(cm.exception.code < TOTAL_PW_DISCOVERIES)

shutil.rmtree(repo_path)

Expand Down
10 changes: 6 additions & 4 deletions tests/functional_tests/test_scans_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from credentialdigger.client_sqlite import SqliteClient
from git import Repo as GitRepo

TOTAL_PW_DISCOVERIES = 11


class TestScansSqlite(unittest.TestCase):
repo_url = 'https://github.com/SAP/credential-digger-tests'
Expand All @@ -27,7 +29,7 @@ def test_scan_github(self):
cli.main(["", "scan", "--sqlite", self.db_path,
"--category", "password",
"--force", self.repo_url])
self.assertEqual(cm.exception.code, 9)
self.assertEqual(cm.exception.code, TOTAL_PW_DISCOVERIES)

def test_scan_local(self):
repo_path = os.path.join(self.tmp_path, "tmp_repo")
Expand All @@ -38,9 +40,9 @@ def test_scan_local(self):
"--models", "PathModel", "PasswordModel",
"--category", "password",
"--force", "--local", repo_path])
# When using the models, we expect to be left with less than 9
# discoveries to manually review
self.assertTrue(cm.exception.code < 9)
# When using the models, we expect to be left with less than
# TOTAL_PW_DISCOVERIES discoveries to manually review
self.assertTrue(cm.exception.code < TOTAL_PW_DISCOVERIES)

def test_scan_wiki(self):
with self.assertRaises(SystemExit) as cm:
Expand Down