From 09919c13d227d0f1ceb314cb760f38f5384e4a07 Mon Sep 17 00:00:00 2001 From: Christoph Pirkl <4711730+kaklakariada@users.noreply.github.com> Date: Mon, 9 Sep 2024 13:59:00 +0200 Subject: [PATCH] #151: Add option resolve_hostnames (#152) Co-authored-by: Nicola Coretti --- CHANGELOG.md | 9 +- README.md | 2 +- docs/DEVELOPER_GUIDE.md | 27 ++++++ docs/REFERENCE.md | 7 ++ pyexasol/connection.py | 79 ++++++++++----- pyexasol/version.py | 2 +- pyproject.toml | 2 +- test/integration/connection_test.py | 145 ++++++++++++++++++++++++++++ test/integration/export_test.py | 19 ++++ test/integration/import_test.py | 19 ++++ test/integration/proxy_test.py | 11 +++ test/integration/tls_test.py | 11 +++ 12 files changed, 303 insertions(+), 30 deletions(-) create mode 100644 docs/DEVELOPER_GUIDE.md create mode 100644 test/integration/connection_test.py diff --git a/CHANGELOG.md b/CHANGELOG.md index c4643a8..ab1c16e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,10 @@ ## [Unreleased] +## [0.27.0] - 2024-09-09 + - Relocked dependencies (Internal) +- [#151](https://github.com/exasol/pyexasol/issues/151): Added option to deactivate hostname resolution ## [0.26.0] - 2024-07-04 @@ -12,9 +15,9 @@ This driver facade should only be used if one is certain that using the dbapi2 is the right solution for their scenario, taking all implications into account. For more details on why and who should avoid using dbapi2, please refer to the [DBAPI2 compatibility section](/docs/DBAPI_COMPAT.md) in our documentation. 
-- Droped support for python 3.7 -- Droped support for Exasol 6.x -- Droped support for Exasol 7.0.x +- Dropped support for python 3.7 +- Dropped support for Exasol 6.x +- Dropped support for Exasol 7.0.x - Relocked dependencies (Internal) - Switched packaging and project workflow to poetry (internal) diff --git a/README.md b/README.md index 0276aad..4577d74 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ PyEXASOL provides API to read & write multiple data streams in parallel using se - [DB-API 2.0 compatibility](/docs/DBAPI_COMPAT.md) - [Optional dependencies](/docs/DEPENDENCIES.md) - [Changelog](/CHANGELOG.md) +- [Developer Guide](/docs/DEVELOPER_GUIDE.md) ## PyEXASOL main concepts @@ -116,4 +117,3 @@ Enjoy! ## Maintained by [Exasol](https://www.exasol.com) 2023 — Today - diff --git a/docs/DEVELOPER_GUIDE.md b/docs/DEVELOPER_GUIDE.md new file mode 100644 index 0000000..48c9e00 --- /dev/null +++ b/docs/DEVELOPER_GUIDE.md @@ -0,0 +1,27 @@ +# Developer Guide + +This guide explains how to develop `pyexasol` and run tests. + +## Initial Setup + +Create a virtual environment and install dependencies: + +```sh +poetry install --all-extras +``` + +Run the following to enter the virtual environment: + +```sh +poetry shell +``` + +## Running Integration Tests + +To run integration tests first start a local database: + +```sh +nox -s db-start +``` + +Then you can run tests as usual with `pytest`. diff --git a/docs/REFERENCE.md b/docs/REFERENCE.md index b54973c..b7a0c8d 100644 --- a/docs/REFERENCE.md +++ b/docs/REFERENCE.md @@ -114,6 +114,7 @@ Open new connection and return `ExaConnection` object. 
| `udf_output_connect_address` | `('udf_host', 8580)` | Specific SCRIPT_OUTPUT_ADDRESS value to connect from Exasol to UDF script output server (Default: inherited from TCP server) | | `udf_output_dir` | `/tmp` | Path or path-like object pointing to directory for script output log files (Default: `tempfile.gettempdir()`) | | `http_proxy` | `http://myproxy.com:3128` | HTTP proxy string in Linux [`http_proxy`](https://www.shellhacks.com/linux-proxy-server-settings-set-proxy-command-line/) format (Default: `None`) | +| `resolve_hostnames` | `False` | Explicitly resolve host names to IP addresses before connecting. Deactivating this will let the operating system resolve the host name (Default: `True`) | | `client_name` | `MyClient` | Custom name of client application displayed in Exasol sessions tables (Default: `PyEXASOL`) | | `client_version` | `1.0.0` | Custom version of client application (Default: `pyexasol.__version__`) | | `client_os_username` | `john` | Custom OS username displayed in Exasol sessions table (Default: `getpass.getuser()`) | @@ -122,6 +123,12 @@ Open new connection and return `ExaConnection` object. | `access_token` | `...` | OpenID access token to use for the login process | | `refresh_token` | `...` | OpenID refresh token to use for the login process | +### Host Name Resolution + +By default, pyexasol resolves host names to IP addresses, randomly shuffles the IP addresses, and tries to connect until a connection succeeds. See the [design documentation](/docs/DESIGN.md#automatic-resolution-and-randomization-of-connection-addresses) for details. + +If host name resolution causes problems, you can deactivate it by specifying the argument `resolve_hostnames=False`. This may be required when connecting through a proxy that allows connections only to defined host names. In all other cases, we recommend omitting the argument. 
+ ## connect_local_config() Open new connection and return `ExaConnection` object using local .ini file (usually `~/.pyexasol.ini`) to read credentials and connection parameters. Please read [local config](/docs/LOCAL_CONFIG.md) page for more details. diff --git a/pyexasol/connection.py b/pyexasol/connection.py index 4e5d7a7..f5f5dd6 100644 --- a/pyexasol/connection.py +++ b/pyexasol/connection.py @@ -16,6 +16,10 @@ from . import callback as cb +from typing import ( + NamedTuple, + Optional +) from .exceptions import * from .statement import ExaStatement from .logger import ExaLogger @@ -27,6 +31,13 @@ from .version import __version__ +class Host(NamedTuple): + """This represents a resolved host name with its IP address and port number.""" + hostname: str + ip_address: Optional[str] + port: int + fingerprint: Optional[str] + class ExaConnection(object): cls_statement = ExaStatement cls_formatter = ExaFormatter @@ -69,6 +80,7 @@ def __init__(self , udf_output_connect_address=None , udf_output_dir=None , http_proxy=None + , resolve_hostnames=True , client_name=None , client_version=None , client_os_username=None @@ -104,6 +116,7 @@ def __init__(self :param udf_output_connect_address: Specific SCRIPT_OUTPUT_ADDRESS value to connect from Exasol to UDF script output server (default: inherited from TCP server) :param udf_output_dir: Directory to store captured UDF script output logs, split by _/ :param http_proxy: HTTP proxy string in Linux http_proxy format (default: None) + :param resolve_hostnames: Explicitly resolve host names to IP addresses before connecting. 
Deactivating this will let the operating system resolve the host name (default: True) :param client_name: Custom name of client application displayed in Exasol sessions tables (Default: PyEXASOL) :param client_version: Custom version of client application (Default: pyexasol.__version__) :param client_os_username: Custom OS username displayed in Exasol sessions table (Default: getpass.getuser()) @@ -144,6 +157,7 @@ def __init__(self 'udf_output_dir': udf_output_dir, 'http_proxy': http_proxy, + 'resolve_hostnames': resolve_hostnames, 'client_name': client_name, 'client_version': client_version, @@ -652,30 +666,17 @@ def _init_ws(self): """ dsn_items = self._process_dsn(self.options['dsn']) failed_attempts = 0 - - ws_prefix = 'wss://' if self.options['encryption'] else 'ws://' - ws_options = self._get_ws_options() - for hostname, ipaddr, port, fingerprint in dsn_items: - self.logger.debug(f"Connection attempt [{ipaddr}:{port}]") - - # Use correct hostname matching IP address for each connection attempt - if self.options['encryption']: - ws_options['sslopt']['server_hostname'] = hostname - try: - self._ws = websocket.create_connection(f'{ws_prefix}{ipaddr}:{port}', **ws_options) + self._ws = self._create_websocket_connection(hostname, ipaddr, port) except Exception as e: - self.logger.debug(f'Failed to connect [{ipaddr}:{port}]: {e}') - failed_attempts += 1 - if failed_attempts == len(dsn_items): - raise ExaConnectionFailedError(self, 'Could not connect to Exasol: ' + str(e)) + raise ExaConnectionFailedError(self, 'Could not connect to Exasol: ' + str(e)) from e else: self._ws.settimeout(self.options['socket_timeout']) - self.ws_ipaddr = ipaddr + self.ws_ipaddr = ipaddr or hostname self.ws_port = port self._ws_send = self._ws.send @@ -686,6 +687,32 @@ def _init_ws(self): return + def _create_websocket_connection(self, hostname:str, ipaddr:str, port:int) -> websocket.WebSocket: + ws_options = self._get_ws_options() + # Use correct hostname matching IP address for each 
connection attempt + if self.options['encryption'] and self.options["resolve_hostnames"]: + ws_options['sslopt']['server_hostname'] = hostname + + connection_string = self._get_websocket_connection_string(hostname, ipaddr, port) + self.logger.debug(f"Connection attempt {connection_string}") + try: + return websocket.create_connection(connection_string, **ws_options) + except Exception as e: + self.logger.debug(f'Failed to connect [{connection_string}]: {e}') + raise e + + def _get_websocket_connection_string(self, hostname:str, ipaddr:Optional[str], port:int) -> str: + host = hostname + if self.options["resolve_hostnames"]: + if ipaddr is None: + raise ValueError("IP address was not resolved") + host = ipaddr + if self.options["encryption"]: + return f"wss://{host}:{port}" + else: + return f"ws://{host}:{port}" + + def _get_ws_options(self): options = { 'timeout': self.options['connection_timeout'], @@ -729,13 +756,13 @@ def _get_login_attributes(self): return attributes - def _process_dsn(self, dsn): + def _process_dsn(self, dsn: str) -> list[Host]: """ Parse DSN, expand ranges and resolve IP addresses for all hostnames Return list of (hostname, ip_address, port) tuples in random order Randomness is required to guarantee proper distribution of workload across all nodes """ - if len(dsn.strip()) == 0: + if dsn is None or len(dsn.strip()) == 0: raise ExaConnectionDsnError(self, 'Connection string is empty') current_port = constant.DEFAULT_PORT @@ -787,24 +814,28 @@ def _process_dsn(self, dsn): result.extend(self._resolve_hostname(hostname, current_port, current_fingerprint)) # Just a single hostname or single IP address else: - result.extend(self._resolve_hostname(m.group('hostname_prefix'), current_port, current_fingerprint)) + hostname = m.group('hostname_prefix') + if self.options["resolve_hostnames"]: + result.extend(self._resolve_hostname(hostname, current_port, current_fingerprint)) + else: + result.append(Host(hostname, None, current_port, 
current_fingerprint)) random.shuffle(result) return result - def _resolve_hostname(self, hostname, port, fingerprint): + def _resolve_hostname(self, hostname: str, port: int, fingerprint: Optional[str]) -> list[Host]: """ Resolve all IP addresses for hostname and add port It also implicitly checks that all hostnames mentioned in DSN can be resolved """ try: - hostname, alias_list, ipaddr_list = socket.gethostbyname_ex(hostname) - except OSError: + hostname, _, ipaddr_list = socket.gethostbyname_ex(hostname) + except OSError as e: raise ExaConnectionDsnError(self, f'Could not resolve IP address of hostname [{hostname}] ' - f'derived from connection string') + f'derived from connection string') from e - return [(hostname, ipaddr, port, fingerprint) for ipaddr in ipaddr_list] + return [Host(hostname, ipaddr, port, fingerprint) for ipaddr in ipaddr_list] def _validate_fingerprint(self, provided_fingerprint): server_fingerprint = hashlib.sha256(self._ws.sock.getpeercert(True)).hexdigest().upper() diff --git a/pyexasol/version.py b/pyexasol/version.py index 826d20e..cf7b6d6 100644 --- a/pyexasol/version.py +++ b/pyexasol/version.py @@ -1 +1 @@ -__version__ = '0.26.0' +__version__ = '0.27.0' diff --git a/pyproject.toml b/pyproject.toml index 04edcec..3cfd43a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pyexasol" -version = "0.26.0" +version = "0.27.0" license = "MIT" readme = "README.md" description = "Exasol python driver with extra features" diff --git a/test/integration/connection_test.py b/test/integration/connection_test.py new file mode 100644 index 0000000..fcc59ba --- /dev/null +++ b/test/integration/connection_test.py @@ -0,0 +1,145 @@ +import pytest +import websocket +import ssl +from unittest import mock +from dataclasses import dataclass +from typing import Optional + +from pyexasol.exceptions import ExaConnectionDsnError +from pyexasol.connection import (Host, ExaConnection) + +# pylint: disable=protected-access/W0212 
+ +@dataclass(frozen=True) +class ConnectionMockFixture: + connection: ExaConnection + get_hostname_mock: mock.Mock + create_websocket_connection_mock: mock.Mock + + def simulate_resolve_hostname(self, host: str, ips: list[str]): + self.get_hostname_mock.return_value = (host, [], ips) + + def simulate_resolve_hostnames(self, hosts: list[tuple[str, list[str], list[str]]]): + self.get_hostname_mock.side_effect = hosts + + def assert_websocket_created(self, url: str, **args: dict): + self.create_websocket_connection_mock.assert_called_once_with(url, **args) + + def resolve_hostname(self, hostname: str, port: int, fingerprint: Optional[str]): + return self.connection._resolve_hostname(hostname, port, fingerprint) + + def process_dsn(self, dsn: str): + return self.connection._process_dsn(dsn) + + def init_ws(self): + self.connection._init_ws() + + def get_websocket_connection_string(self, hostname:str, ipaddr:Optional[str], port:int) -> str: + return self.connection._get_websocket_connection_string(hostname, ipaddr, port) + + +@pytest.fixture +def connection_mock(connection): + org_ws = connection._ws + org_ws_send = connection._ws_send + org_ws_recv = connection._ws_recv + try: + with mock.patch("socket.gethostbyname_ex") as get_hostname_mock: + with mock.patch("websocket.create_connection") as create_websocket_connection_mock: + create_websocket_connection_mock.return_value = mock.Mock(websocket.WebSocket) + yield ConnectionMockFixture(connection, get_hostname_mock, create_websocket_connection_mock) + finally: + connection._ws = org_ws + connection._ws_send = org_ws_send + connection._ws_recv = org_ws_recv + +def test_resolve_hostname(connection_mock): + connection_mock.simulate_resolve_hostname("host", ["ip1", "ip2"]) + actual = connection_mock.resolve_hostname("host", 1234, "fingerprint") + expected = [("host","ip1", 1234, "fingerprint"),("host","ip2", 1234, "fingerprint")] + assert len(actual) == len(expected) + for i in range(0, len(expected)): + assert 
expected[i] in actual + + +@pytest.mark.parametrize("empty_dsn", [None, "", " ", "\t"]) +def test_process_empty_dsn_fails(connection_mock, empty_dsn): + with pytest.raises(ExaConnectionDsnError, match="Connection string is empty"): + connection_mock.process_dsn(empty_dsn) + +def test_process_dsn_resolves_hostname_to_ip_address(connection_mock): + connection_mock.simulate_resolve_hostnames([("host1", [], ["ip1"])]) + actual = connection_mock.process_dsn("host1:1234") + expected = [Host("host1", "ip1", 1234, None)] + assert expected == actual + +def test_process_dsn_does_not_resolve_hostname(connection_mock): + connection_mock.connection.options["resolve_hostnames"] = False + actual = connection_mock.process_dsn("host1:1234") + expected = [Host("host1", None, 1234, None)] + assert expected == actual + +def test_process_dsn_shuffles_hosts(connection_mock): + dsn = "host1:1234,host2:4321" + def resolve_hostname(con): + connection_mock.simulate_resolve_hostnames([("host1", [], ["ip11", "ip12"]), ("host2", [], ["ip21", "ip22"])]) + return tuple(con.process_dsn(dsn)) + count = 100 + results = {resolve_hostname(connection_mock) for _ in range(0, count)} + assert len(results) > 1 + +def test_process_dsn_with_fallback_to_default_port(connection_mock): + connection_mock.simulate_resolve_hostname("host1", ["ip1"]) + actual = connection_mock.process_dsn("host1") + expected = [Host("host1", "ip1", 8563, None)] + assert actual == expected + +def test_process_dsn_with_fingerprint(connection_mock): + connection_mock.simulate_resolve_hostname("host1", ["ip1"]) + actual = connection_mock.process_dsn("host1/135a1d2dce102de866f58267521f4232153545a075dc85f8f7596f57e588a181:1234") + expected = [Host("host1", "ip1", 1234, "135A1D2DCE102DE866F58267521F4232153545A075DC85F8F7596F57E588A181")] + assert actual == expected + +def test_init_ws_connects_via_ipaddress(connection_mock): + connection_mock.simulate_resolve_hostname("localhost", ["ip1"]) + connection_mock.init_ws() + ssl_options = 
{'cert_reqs': ssl.CERT_NONE, 'server_hostname': 'localhost'} + connection_mock.assert_websocket_created("wss://ip1:8563", timeout=10, skip_utf8_validation=True, enable_multithread=True, sslopt=ssl_options) + +def test_init_ws_connects_without_encryption(connection_mock): + connection_mock.connection.options["encryption"] = False + connection_mock.simulate_resolve_hostname("localhost", ["ip1"]) + connection_mock.init_ws() + connection_mock.assert_websocket_created("ws://ip1:8563", timeout=10, skip_utf8_validation=True, enable_multithread=True) + +def test_init_ws_connects_without_encryption_via_hostname(connection_mock): + connection_mock.connection.options["encryption"] = False + connection_mock.connection.options["resolve_hostnames"] = False + connection_mock.simulate_resolve_hostname("localhost", ["ip1"]) + connection_mock.init_ws() + connection_mock.assert_websocket_created("ws://localhost:8563", timeout=10, skip_utf8_validation=True, enable_multithread=True) + +def test_init_ws_connects_via_hostname(connection_mock): + connection_mock.connection.options["resolve_hostnames"] = False + connection_mock.simulate_resolve_hostname("localhost", ["ip1"]) + connection_mock.init_ws() + ssl_options = {'cert_reqs': ssl.CERT_NONE} + connection_mock.assert_websocket_created("wss://localhost:8563", timeout=10, skip_utf8_validation=True, enable_multithread=True, sslopt=ssl_options) + +def test_get_websocket_connection_string(connection_mock): + actual = connection_mock.get_websocket_connection_string("host1", "ip1", 1234) + assert "wss://ip1:1234" == actual + +def test_get_websocket_connection_string_unencrypted(connection_mock): + connection_mock.connection.options["encryption"] = False + actual = connection_mock.get_websocket_connection_string("host1", "ip1", 1234) + assert "ws://ip1:1234" == actual + +def test_get_websocket_connection_string_do_not_resolve_hostname(connection_mock): + connection_mock.connection.options["resolve_hostnames"] = False + actual = 
connection_mock.get_websocket_connection_string("host1", "ip1", 1234) + assert "wss://host1:1234" == actual + +def test_get_websocket_connection_string_missing_ip_address(connection_mock): + with pytest.raises(ValueError, match="IP address was not resolved"): + connection_mock.get_websocket_connection_string("host1", None, 1234) diff --git a/test/integration/export_test.py b/test/integration/export_test.py index 91d1f30..ec9b942 100644 --- a/test/integration/export_test.py +++ b/test/integration/export_test.py @@ -13,6 +13,14 @@ def connection(dsn, user, password, schema): yield con +@pytest.fixture +def connection_without_resolving_hostnames(dsn, user, password, schema): + with pyexasol.connect( + dsn=dsn, user=user, password=password, schema=schema, compression=True, resolve_hostnames=False + ) as con: + yield con + + @pytest.fixture def table_name(): yield "CLIENT_NAMES" @@ -92,6 +100,17 @@ def test_export_with_column_names(connection, table, data, export_file, expected assert actual == expected +@pytest.mark.etl +def test_export_without_resolving_hostname(connection_without_resolving_hostnames, table, data, export_file, expected_csv): + params = {"with_column_names": True} + connection_without_resolving_hostnames.export_to_file(export_file, table, export_params=params) + + expected = expected_csv(table, data, **params) + actual = export_file.read_text() + + assert actual == expected + + @pytest.mark.etl def test_custom_export_callback(connection, table, data, export_file, expected_csv): def export_cb(pipe, dst): diff --git a/test/integration/import_test.py b/test/integration/import_test.py index ac90466..72e7013 100644 --- a/test/integration/import_test.py +++ b/test/integration/import_test.py @@ -12,6 +12,14 @@ def connection(dsn, user, password, schema): yield con +@pytest.fixture +def connection_without_resolving_hostnames(dsn, user, password, schema): + with pyexasol.connect( + dsn=dsn, user=user, password=password, schema=schema, compression=True, 
resolve_hostnames=False + ) as con: + yield con + + @pytest.fixture def table_name(): yield "CLIENT_NAMES" @@ -67,6 +75,17 @@ def test_import_csv(connection, empty_table, csv_file, data): assert actual == expected +@pytest.mark.etl +def test_import_without_resolving_hostname(connection_without_resolving_hostnames, empty_table, csv_file, data): + connection_without_resolving_hostnames.import_from_file(csv_file, empty_table) + result = connection_without_resolving_hostnames.execute(f"SELECT * FROM {empty_table};") + + expected = set(data) + actual = set(result.fetchall()) + + assert actual == expected + + @pytest.mark.etl def test_import_with_reordered_columns(connection, empty_table, csv_file, swaped_data): params = {"columns": ["LASTNAME", "FIRSTNAME"]} diff --git a/test/integration/proxy_test.py b/test/integration/proxy_test.py index cc542d2..33be13d 100644 --- a/test/integration/proxy_test.py +++ b/test/integration/proxy_test.py @@ -56,6 +56,17 @@ def test_connect_through_proxy(dsn, user, password, schema, proxy): assert expected == actual +@pytest.mark.configuration +def test_connect_through_proxy_without_resolving_host_names(dsn, user, password, schema, proxy): + with pyexasol.connect( + dsn=dsn, user=user, password=password, schema=schema, http_proxy=proxy, resolve_hostnames=False + ) as connection: + result = connection.execute("SELECT 1;") + expected = 1 + actual = result.fetchval() + assert expected == actual + + @pytest.mark.configuration def test_connect_through_proxy_with_authentication( dsn, user, password, schema, proxy_with_auth diff --git a/test/integration/tls_test.py b/test/integration/tls_test.py index cff2b1b..6498c09 100644 --- a/test/integration/tls_test.py +++ b/test/integration/tls_test.py @@ -60,6 +60,17 @@ def test_connect_with_tls(dsn, user, password, schema): assert actual == expected +@pytest.mark.tls +def test_connect_with_tls_without_resolving_hostname(dsn, user, password, schema): + expected = 1 + with pyexasol.connect( + dsn=dsn, 
user=user, password=password, schema=schema, encryption=True, resolve_hostnames=False + ) as connection: + actual = connection.execute("SELECT 1;").fetchval() + + assert actual == expected + + @pytest.mark.tls def test_connect_with_valid_fingerprint( dsn_with_valid_fingerprint, user, password, schema