-
Notifications
You must be signed in to change notification settings - Fork 108
adlfs user agent #501
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
adlfs user agent #501
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -73,6 +73,12 @@ | |
_SOCKET_TIMEOUT_DEFAULT = object() | ||
|
||
|
||
def _get_user_agent(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In order to break the circular loop here, I think it would be worth moving this logic: https://github.com/anjaliratnam-msft/adlfs/blob/f64dff2b2ce62dd2d194e0be02804104b4505d7d/adlfs/__init__.py#L6-L11 to the Generally, a |
||
from adlfs import __version__ | ||
|
||
return f"adlfs/{__version__}" | ||
|
||
|
||
# https://github.com/Azure/azure-sdk-for-python/issues/11419#issuecomment-628143480 | ||
def make_callback(key, callback): | ||
if callback is None: | ||
|
@@ -118,6 +124,41 @@ def _coalesce_version_id(*args) -> Optional[str]: | |
return version_ids.pop() | ||
|
||
|
||
def _create_aio_blob_service_client( | ||
account_url: str, | ||
location_mode: Optional[str] = None, | ||
credential: Optional[str] = None, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm technically this is not the correct type; it can be more than a string. But it looks like the initializer for the filesystem use this type as well. I think it is fine to keep it as is but it may be worth doing a pass at a later time to add correct typing to the project and integrate |
||
) -> AIOBlobServiceClient: | ||
if credential is not None: | ||
return AIOBlobServiceClient( | ||
account_url=account_url, | ||
credential=credential, | ||
_location_mode=location_mode, | ||
user_agent=_get_user_agent(), | ||
) | ||
elif location_mode is not None: | ||
return AIOBlobServiceClient( | ||
account_url=account_url, | ||
credential=None, | ||
_location_mode=location_mode, | ||
user_agent=_get_user_agent(), | ||
) | ||
else: | ||
return AIOBlobServiceClient( | ||
account_url=account_url, | ||
user_agent=_get_user_agent(), | ||
) | ||
Comment on lines
+132
to
+150
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For this factory, it would be interesting to see if we can consolidate the actual client instantiation to one instance. To do this we should be able to leverage keyword arguments, for example: service_client_kwargs = {
"account_url": account_url,
"user_agent": _get_user_agent(),
}
if credential is not None:
service_client_kwargs["credentials"] = credentials
if location_mode is not None:
service_client_kwargs["_location_mode"] = location_mode
return AIOBlobServiceClient(**service_client_kwargs) The main reason that I'm suggesting we do this is it reduces the verbosity and makes it easier to follow/add to the logic when there is only one path to instantiating the client. |
||
|
||
|
||
def _create_aio_blob_service_client_from_connection_string( | ||
connection_string: str, | ||
) -> AIOBlobServiceClient: | ||
return AIOBlobServiceClient.from_connection_string( | ||
conn_str=connection_string, | ||
user_agent=_get_user_agent(), | ||
) | ||
|
||
|
||
class AzureBlobFileSystem(AsyncFileSystem): | ||
""" | ||
Access Azure Datalake Gen2 and Azure Storage if it were a file system using Multiprotocol Access | ||
|
@@ -473,8 +514,10 @@ def do_connect(self): | |
|
||
try: | ||
if self.connection_string is not None: | ||
self.service_client = AIOBlobServiceClient.from_connection_string( | ||
conn_str=self.connection_string | ||
self.service_client = ( | ||
_create_aio_blob_service_client_from_connection_string( | ||
connection_string=self.connection_string, | ||
) | ||
) | ||
elif self.account_name is not None: | ||
if hasattr(self, "account_host"): | ||
|
@@ -487,26 +530,25 @@ def do_connect(self): | |
creds = [self.credential, self.account_key] | ||
if any(creds): | ||
self.service_client = [ | ||
AIOBlobServiceClient( | ||
_create_aio_blob_service_client( | ||
account_url=self.account_url, | ||
location_mode=self.location_mode, | ||
credential=cred, | ||
_location_mode=self.location_mode, | ||
) | ||
for cred in creds | ||
if cred is not None | ||
][0] | ||
elif self.sas_token is not None: | ||
if not self.sas_token.startswith("?"): | ||
self.sas_token = f"?{self.sas_token}" | ||
self.service_client = AIOBlobServiceClient( | ||
self.service_client = _create_aio_blob_service_client( | ||
account_url=self.account_url + self.sas_token, | ||
credential=None, | ||
_location_mode=self.location_mode, | ||
location_mode=self.location_mode, | ||
) | ||
else: | ||
# Fall back to anonymous login, and assume public container | ||
self.service_client = AIOBlobServiceClient( | ||
account_url=self.account_url | ||
self.service_client = _create_aio_blob_service_client( | ||
account_url=self.account_url, | ||
) | ||
else: | ||
raise ValueError( | ||
|
@@ -2047,27 +2089,28 @@ def connect_client(self): | |
creds = [self.fs.sync_credential, self.fs.account_key, self.fs.credential] | ||
if any(creds): | ||
self.container_client = [ | ||
AIOBlobServiceClient( | ||
_create_aio_blob_service_client( | ||
account_url=self.fs.account_url, | ||
credential=cred, | ||
_location_mode=self.fs.location_mode, | ||
location_mode=self.fs.location_mode, | ||
).get_container_client(self.container_name) | ||
for cred in creds | ||
if cred is not None | ||
][0] | ||
elif self.fs.connection_string is not None: | ||
self.container_client = AIOBlobServiceClient.from_connection_string( | ||
conn_str=self.fs.connection_string | ||
).get_container_client(self.container_name) | ||
self.container_client = ( | ||
_create_aio_blob_service_client_from_connection_string( | ||
connection_string=self.fs.connection_string, | ||
).get_container_client(self.container_name) | ||
) | ||
elif self.fs.sas_token is not None: | ||
self.container_client = AIOBlobServiceClient( | ||
account_url=self.fs.account_url + self.fs.sas_token, credential=None | ||
self.container_client = _create_aio_blob_service_client( | ||
account_url=self.fs.account_url + self.fs.sas_token, | ||
).get_container_client(self.container_name) | ||
else: | ||
self.container_client = AIOBlobServiceClient( | ||
account_url=self.fs.account_url | ||
self.container_client = _create_aio_blob_service_client( | ||
account_url=self.fs.account_url, | ||
).get_container_client(self.container_name) | ||
|
||
except Exception as e: | ||
raise ValueError( | ||
f"Unable to fetch container_client with provided params for {e}!!" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,13 +2,15 @@ | |
import os | ||
import tempfile | ||
from unittest import mock | ||
from unittest.mock import patch | ||
|
||
import azure.storage.blob.aio | ||
import dask.dataframe as dd | ||
import fsspec | ||
import numpy as np | ||
import pandas as pd | ||
import pytest | ||
from azure.storage.blob.aio import BlobServiceClient as AIOBlobServiceClient | ||
from packaging.version import parse as parse_version | ||
from pandas.testing import assert_frame_equal | ||
|
||
|
@@ -2045,3 +2047,100 @@ def test_open_file_x(storage: azure.storage.blob.BlobServiceClient, tmpdir): | |
with fs.open("data/afile", "xb") as f: | ||
pass | ||
assert fs.cat_file("data/afile") == b"data" | ||
|
||
|
||
def test_user_agent_blob_file_connection_str( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For all of these test cases that we are adding, I'm thinking we just move them to a
|
||
storage: azure.storage.blob.BlobServiceClient, mocker | ||
): | ||
from adlfs import __version__ | ||
|
||
fs = AzureBlobFileSystem( | ||
account_name=storage.account_name, connection_string=CONN_STR | ||
) | ||
mock_client = mocker.patch.object( | ||
AIOBlobServiceClient, "from_connection_string", return_value=mocker.MagicMock() | ||
) | ||
Comment on lines
+2060
to
+2062
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Playing around with these mocks, I think it would be worth just having two new patch fixtures: one for the @pytest.fixture()
def mock_from_connection_string(mocker):
return mocker.patch.object(
AIOBlobServiceClient, "from_connection_string", autospec=True,
side_effect=AIOBlobServiceClient.from_connection_string
)
@pytest.fixture()
def mock_service_client_init(mocker):
return mocker.patch.object(
AIOBlobServiceClient, "__init__", autospec=True,
side_effect=AIOBlobServiceClient.__init__
) This is a pattern used in another test case in |
||
mock_client.return_value.get_container_client = mocker.MagicMock() | ||
|
||
f = AzureBlobFile(fs, "data/root/a/file.txt", mode="rb") | ||
f.container_name = "data" | ||
f.connect_client() | ||
|
||
mock_client.assert_called_once() | ||
assert "user_agent" in mock_client.call_args.kwargs | ||
assert mock_client.call_args.kwargs["user_agent"] == f"adlfs/{__version__}" | ||
Comment on lines
+2069
to
+2071
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would probably make sense to consolidate these assertions to a helper function (e.g., |
||
|
||
|
||
def test_user_agent_blob_file_initializer( | ||
storage: azure.storage.blob.BlobServiceClient, mocker | ||
): | ||
from adlfs import __version__ | ||
|
||
path = "root/a/file.txt" | ||
mocker.patch("adlfs.spec.filter_blobs", []) | ||
|
||
mock_details = mocker.PropertyMock( | ||
return_value={ | ||
"name": path, | ||
"metadata": {}, | ||
"size": 0, | ||
"content_settings": {"content_type": ""}, | ||
} | ||
) | ||
mocker.patch.object(AzureBlobFile, "details", mock_details) | ||
mock_client = mocker.patch("adlfs.spec.AIOBlobServiceClient", spec=True) | ||
|
||
mock_container_client = mocker.MagicMock() | ||
mock_blob_client = mocker.MagicMock() | ||
mock_blob_client.get_blob_properties = mocker.AsyncMock(return_value={}) | ||
mock_blob_client.__aenter__ = mocker.AsyncMock(return_value=mock_blob_client) | ||
mock_blob_client.__aexit__ = mocker.AsyncMock() | ||
mock_blob_client.close = mocker.AsyncMock() | ||
mock_container_client.get_blob_client = mocker.MagicMock( | ||
return_value=mock_blob_client | ||
) | ||
mock_client.return_value.get_container_client = mocker.MagicMock( | ||
return_value=mock_container_client | ||
) | ||
|
||
fs = AzureBlobFileSystem( | ||
account_name=storage.account_name, | ||
) | ||
AzureBlobFile(fs, "data/root/a/file.txt", mode="rb") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For this test case, I'm thinking we either do the following to test the case where we are not using
The main reason that I think we should explore that approach is that because we are already using Azurite, it seems ideal to try to avoid building up mocks/fakes if we can avoid it to reduce the overall amount of scaffolding needed. |
||
|
||
mock_client.assert_called_once() | ||
assert "user_agent" in mock_client.call_args.kwargs | ||
assert mock_client.call_args.kwargs["user_agent"] == f"adlfs/{__version__}" | ||
|
||
|
||
@patch("adlfs.spec.AIOBlobServiceClient") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you elaborate on why this patch outside of the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When I run it on its own the test passes but when I run the whole test directory it fails without patching the class. For some reason it's only this test that's causing issues. I'm pretty sure it's happening because the AIOBlobServiceClient class is getting patched in another test and it's affecting this, but I couldn't figure out where it's coming from. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah that sounds like a patch that is not properly being cleaned up. I'll take a look to see if I can find anything There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, in terms of state remaining, we may be running up against the caching feature of fsspec: https://filesystem-spec.readthedocs.io/en/latest/features.html#instance-caching. For any of the calls we make to instantiating an |
||
def test_user_agent_connection_str( | ||
storage: azure.storage.blob.BlobServiceClient, mocker | ||
): | ||
from adlfs import __version__ | ||
|
||
mock_client_instance = mocker.MagicMock() | ||
mock_client_instance.close = mocker.AsyncMock() | ||
mock_client = mocker.patch( | ||
"adlfs.spec.AIOBlobServiceClient.from_connection_string", | ||
return_value=mock_client_instance, | ||
) | ||
|
||
AzureBlobFileSystem(account_name=storage.account_name, connection_string=CONN_STR) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Whenever creating an |
||
mock_client.assert_called_once() | ||
assert "user_agent" in mock_client.call_args.kwargs | ||
assert mock_client.call_args.kwargs["user_agent"] == f"adlfs/{__version__}" | ||
|
||
|
||
def test_user_agent_initializer(storage: azure.storage.blob.BlobServiceClient, mocker): | ||
from adlfs import __version__ | ||
|
||
fs = AzureBlobFileSystem( | ||
account_name=storage.account_name, | ||
) | ||
mock_client = mocker.patch("adlfs.spec.AIOBlobServiceClient", spec=True) | ||
|
||
fs.do_connect() | ||
mock_client.assert_called_once() | ||
assert "user_agent" in mock_client.call_args.kwargs | ||
assert mock_client.call_args.kwargs["user_agent"] == f"adlfs/{__version__}" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should also make sure to add a changelog entry for this update here: https://github.com/fsspec/adlfs/blob/main/CHANGELOG.md