Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added validators to validate URLs (fixes #1176, #1308)

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion web/reNgine/common_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from dashboard.models import *
from startScan.models import *
from targetApp.models import *
from reNgine.utilities import is_valid_url


logger = get_task_logger(__name__)
Expand Down Expand Up @@ -334,7 +335,7 @@ def get_http_urls(
endpoints = [e for e in endpoints if e.is_alive]

# Grab only http_url from endpoint objects
endpoints = [e.http_url for e in endpoints]
endpoints = [e.http_url for e in endpoints if is_valid_url(e.http_url)]
if ignore_files: # ignore all files
extensions_path = f'{RENGINE_HOME}/fixtures/extensions.txt'
with open(extensions_path, 'r') as f:
Expand Down
28 changes: 28 additions & 0 deletions web/reNgine/utilities.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import validators

from celery._state import get_current_task
from celery.utils.log import ColorFormatter
Expand Down Expand Up @@ -86,3 +87,30 @@ def replace_nulls(obj):
return {key: replace_nulls(value) for key, value in obj.items()}
else:
return obj


def is_valid_url(url, validate_only_http_scheme=True):
    """
    Validate a URL/endpoint.

    Args:
        url (str): The URL to validate.
        validate_only_http_scheme (bool): If True, only accept URLs whose
            scheme is http or https.

    Returns:
        bool: True if the URL is valid, False otherwise.
    """
    # Empty/missing values and non-string inputs are never valid URLs.
    # The isinstance guard also keeps the substring test below from raising
    # TypeError on values such as ints or bytes.
    if not url or not isinstance(url, str):
        return False

    # URLs containing a space are not valid URLs
    if ' ' in url:
        return False

    if not validators.url(url):
        return False

    if validate_only_http_scheme:
        # Other schemes (for example ftp://) can be valid URLs but may not be
        # wanted for crawling etc. URI schemes are case-insensitive, so the
        # prefix is compared on a lowercased copy.
        return url.lower().startswith(('http://', 'https://'))
    return True
Loading