Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

box: Add searchoptions for BoxRetriever, documentation for BoxRetriever as agent tool #26181

Merged
merged 4 commits into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
244 changes: 226 additions & 18 deletions docs/docs/integrations/retrievers/box.ipynb

Large diffs are not rendered by default.

12 changes: 11 additions & 1 deletion libs/partners/box/langchain_box/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,14 @@

from langchain_box.document_loaders import BoxLoader
from langchain_box.retrievers import BoxRetriever
from langchain_box.utilities import BoxAuth, BoxAuthType, _BoxAPIWrapper
from langchain_box.utilities.box import (
BoxAuth,
BoxAuthType,
BoxSearchOptions,
DocumentFiles,
SearchTypeFilter,
_BoxAPIWrapper,
)

try:
__version__ = metadata.version(__package__)
Expand All @@ -16,6 +23,9 @@
"BoxRetriever",
"BoxAuth",
"BoxAuthType",
"BoxSearchOptions",
"DocumentFiles",
"SearchTypeFilter",
"_BoxAPIWrapper",
"__version__",
]
8 changes: 6 additions & 2 deletions libs/partners/box/langchain_box/retrievers/box.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pydantic import ConfigDict, model_validator
from typing_extensions import Self

from langchain_box.utilities import BoxAuth, _BoxAPIWrapper
from langchain_box.utilities import BoxAuth, BoxSearchOptions, _BoxAPIWrapper


class BoxRetriever(BaseRetriever):
Expand Down Expand Up @@ -128,7 +128,10 @@ def format_docs(docs):
"""character_limit is an int that caps the number of characters to
return per document."""

_box: Optional[_BoxAPIWrapper] = None
box_search_options: Optional[BoxSearchOptions] = None
"""Search options to configure BoxRetriever to narrow search results."""

_box: Optional[_BoxAPIWrapper]

model_config = ConfigDict(
arbitrary_types_allowed=True,
Expand All @@ -150,6 +153,7 @@ def validate_box_loader_inputs(self) -> Self:
box_developer_token=self.box_developer_token,
box_auth=self.box_auth,
character_limit=self.character_limit,
box_search_options=self.box_search_options,
)

self._box = _box
Expand Down
18 changes: 16 additions & 2 deletions libs/partners/box/langchain_box/utilities/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
"""Box API Utilities."""

from langchain_box.utilities.box import BoxAuth, BoxAuthType, _BoxAPIWrapper
from langchain_box.utilities.box import (
BoxAuth,
BoxAuthType,
BoxSearchOptions,
DocumentFiles,
SearchTypeFilter,
_BoxAPIWrapper,
)

__all__ = ["BoxAuth", "BoxAuthType", "_BoxAPIWrapper"]
__all__ = [
"BoxAuth",
"BoxAuthType",
"BoxSearchOptions",
"DocumentFiles",
"SearchTypeFilter",
"_BoxAPIWrapper",
]
149 changes: 145 additions & 4 deletions libs/partners/box/langchain_box/utilities/box.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,128 @@ def get_client(self) -> box_sdk_gen.BoxClient:
return self._box_client


class SearchTypeFilter(Enum):
    """Parts of a Box item that a search query may be matched against.

    Each member's value is the plain string used to identify that part of
    the item (for example ``"file_content"``).
    """

    NAME = "name"
    """Match against the item's ``name`` field."""

    DESCRIPTION = "description"
    """Match against the item's ``description`` field."""

    FILE_CONTENT = "file_content"
    """Match against the actual content of the file."""

    COMMENTS = "comments"
    """Match against the content of comments on a file or folder."""

    TAGS = "tags"
    """Match against tags applied to the item (its ``tags`` field)."""


def _date_range_is_reversed(date_range: Optional[List[str]]) -> bool:
    """Return True when a ``[start, end]`` range has its start after its end.

    A range that is missing, has fewer than two entries, or has an empty
    start or end is treated as open-ended and never counts as reversed.

    Timestamps are compared as datetimes so that differing UTC offsets are
    ordered correctly. If either value cannot be parsed (or one is
    timezone-aware and the other is not), the check falls back to a plain
    string comparison so inputs that were previously accepted stay accepted.
    """
    # Local import keeps this helper self-contained within the module.
    from datetime import datetime

    if not date_range or len(date_range) < 2:
        return False

    start, end = date_range[0], date_range[1]
    if not start or not end:
        return False

    try:
        # ``fromisoformat`` only understands a trailing "Z" on Python 3.11+.
        start_dt = datetime.fromisoformat(start.replace("Z", "+00:00"))
        end_dt = datetime.fromisoformat(end.replace("Z", "+00:00"))
        return start_dt > end_dt
    except (ValueError, TypeError):
        return start > end


class BoxSearchOptions(BaseModel):
    """Optional filters used to narrow the results of a Box search.

    All fields default to "no restriction" except ``k``, which caps the
    number of results at 100 unless overridden.
    """

    ancestor_folder_ids: Optional[List[str]] = None
    """Limits the search results to items within the given folders, given
    as a list of folder IDs."""

    search_type_filter: Optional[List[SearchTypeFilter]] = None
    """Limits the search results to any items that match the search query for a
    specific part of the file, for example the file description.

    Given as a list of ``SearchTypeFilter`` entries. Default is all."""

    created_date_range: Optional[List[str]] = None
    """Limits the search results to any items created within a given date range.

    Given as a two-element list of RFC3339 timestamps: ``[start, end]``.

    If the start date is left empty (``["", "2014-05-17T13:35:01-07:00"]``)
    anything created before the end date will be returned.

    If the end date is left empty (``["2014-05-15T13:35:01-07:00", ""]``) the
    current date will be used as the end date instead."""

    file_extensions: Optional[List[DocumentFiles]] = None
    """Limits the search results to any files that match any of the provided
    file extensions. Given as a list of
    ``langchain_box.utilities.DocumentFiles`` entries."""

    k: Optional[int] = 100
    """Defines the maximum number of items to return. Defaults to 100, maximum
    is 200."""

    size_range: Optional[List[int]] = None
    """Limits the search results to any items with a size within a given file
    size range. This applies to files and folders.

    Given as a list of a lower and upper byte size limit (inclusive).

    The upper and lower bound can be omitted to create open ranges."""

    updated_date_range: Optional[List[str]] = None
    """Limits the search results to any items updated within a given date range.

    Given as a two-element list of RFC3339 timestamps: ``[start, end]``.

    If the start date is left empty (``["", "2014-05-17T13:35:01-07:00"]``)
    anything updated before the end date will be returned.

    If the end date is left empty (``["2014-05-15T13:35:01-07:00", ""]``) the
    current date will be used as the end date instead."""

    # Same settings as before, expressed with ``ConfigDict`` to match the
    # other models in this module.
    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        use_enum_values=True,
        extra="allow",
    )

    @model_validator(mode="after")
    def validate_search_options(self) -> Self:
        """Validate ``k`` and the ordering of both date ranges.

        Raises:
            ValueError: If ``k`` is set and is not between 1 and 200, or if
                a date range has both ends set with the start after the end.
        """
        # ``k`` is Optional; only range-check it when a value was supplied.
        if self.k is not None and not 1 <= self.k <= 200:
            raise ValueError(
                f"Invalid setting of k {self.k}. Value must be between 1 and 200."
            )

        for date_range in (self.created_date_range, self.updated_date_range):
            if _date_range_is_reversed(date_range):
                raise ValueError("Start date must be before end date.")

        return self


class _BoxAPIWrapper(BaseModel):
"""Wrapper for Box API."""

Expand All @@ -485,7 +607,10 @@ class _BoxAPIWrapper(BaseModel):
"""character_limit is an int that caps the number of characters to
return per document."""

_box: Optional[box_sdk_gen.BoxClient] = None
box_search_options: Optional[BoxSearchOptions] = None
"""Search options to configure BoxRetriever to narrow search results."""

_box: Optional[box_sdk_gen.BoxClient]

model_config = ConfigDict(
arbitrary_types_allowed=True,
Expand Down Expand Up @@ -636,9 +761,25 @@ def search_box(self, query: str) -> List[Document]:
files = []

try:
results = self._box.search.search_for_content( # type: ignore[union-attr]
query=query, fields=["id", "type", "extension"]
)
results = None

if self.box_search_options is None:
results = self._box.search.search_for_content( # type: ignore[union-attr]
query=query, fields=["id", "type", "extension"], type="file"
)
else:
results = self._box.search.search_for_content( # type: ignore[union-attr]
query=query,
fields=["id", "type", "extension"],
type="file",
ancestor_folder_ids=self.box_search_options.ancestor_folder_ids, # type: ignore[union-attr]
content_types=self.box_search_options.search_type_filter, # type: ignore[union-attr]
created_at_range=self.box_search_options.created_date_range, # type: ignore[union-attr]
file_extensions=self.box_search_options.file_extensions, # type: ignore[union-attr]
limit=self.box_search_options.k, # type: ignore[union-attr]
size_range=self.box_search_options.size_range, # type: ignore[union-attr]
updated_at_range=self.box_search_options.updated_date_range, # type: ignore[union-attr]
)

if results.entries is None or len(results.entries) <= 0:
return None # type: ignore[return-value]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@
from pytest_mock import MockerFixture

from langchain_box.retrievers import BoxRetriever
from langchain_box.utilities import BoxAuth, BoxAuthType
from langchain_box.utilities import (
BoxAuth,
BoxAuthType,
BoxSearchOptions,
DocumentFiles,
SearchTypeFilter,
)


# Test auth types
Expand Down Expand Up @@ -62,6 +68,44 @@ def test_search(mocker: MockerFixture) -> None:
]


# test search options
def test_search_options(mocker: MockerFixture) -> None:
    """BoxRetriever accepts ``BoxSearchOptions`` and returns the found docs.

    ``_BoxAPIWrapper.search_box`` is patched, so this checks that the options
    validate and are accepted by the retriever, not that Box applies them.
    """
    expected_documents = [
        Document(
            page_content="Test file mode\ndocument contents",
            metadata={"title": "Testing Files"},
        )
    ]

    mocker.patch(
        "langchain_box.utilities._BoxAPIWrapper.search_box",
        return_value=(
            [
                Document(
                    page_content="Test file mode\ndocument contents",
                    metadata={"title": "Testing Files"},
                )
            ]
        ),
    )

    box_search_options = BoxSearchOptions(
        ancestor_folder_ids=["box_folder_id"],
        search_type_filter=[SearchTypeFilter.FILE_CONTENT],
        # Both ends are plain RFC3339 timestamps; the stray trailing comma
        # that used to sit inside the end timestamp has been removed.
        created_date_range=[
            "2023-01-01T00:00:00-07:00",
            "2024-08-01T00:00:00-07:00",
        ],
        file_extensions=[DocumentFiles.DOCX, DocumentFiles.PDF],
        k=200,
        size_range=[1, 1000000],
        updated_date_range=None,
    )

    retriever = BoxRetriever(  # type: ignore[call-arg]
        box_developer_token="box_developer_token",
        box_search_options=box_search_options,
    )

    documents = retriever.invoke("query")

    assert documents == expected_documents


# test ai retrieval
def test_ai(mocker: MockerFixture) -> None:
mocker.patch(
Expand Down
3 changes: 3 additions & 0 deletions libs/partners/box/tests/unit_tests/test_imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
"BoxRetriever",
"BoxAuth",
"BoxAuthType",
"BoxSearchOptions",
"DocumentFiles",
"SearchTypeFilter",
"_BoxAPIWrapper",
"__version__",
]
Expand Down
Loading