Skip to content

Commit

Permalink
box: Add searchoptions for BoxRetriever, documentation for BoxRetriev…
Browse files Browse the repository at this point in the history
…er as agent tool (#26181)

Thank you for contributing to LangChain!

- [x] **PR title**: "package: description"
- Where "package" is whichever of langchain, community, core,
experimental, etc. is being modified. Use "docs: ..." for purely docs
changes, "templates: ..." for template changes, "infra: ..." for CI
changes.
  - Example: "community: add foobar LLM"


Added search options for BoxRetriever and added documentation to
demonstrate how to use BoxRetriever as an agent tool - @BoxPlatform


- [x] **Add tests and docs**: If you're adding a new integration, please
include
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.


- [x] **Lint and test**: Run `make format`, `make lint` and `make test`
from the root of the package(s) you've modified. See contribution
guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional
ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in
langchain.

If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
  • Loading branch information
shurrey committed Sep 19, 2024
1 parent e0c36af commit acbb4e4
Show file tree
Hide file tree
Showing 7 changed files with 452 additions and 28 deletions.
244 changes: 226 additions & 18 deletions docs/docs/integrations/retrievers/box.ipynb

Large diffs are not rendered by default.

12 changes: 11 additions & 1 deletion libs/partners/box/langchain_box/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,14 @@

from langchain_box.document_loaders import BoxLoader
from langchain_box.retrievers import BoxRetriever
from langchain_box.utilities import BoxAuth, BoxAuthType, _BoxAPIWrapper
from langchain_box.utilities.box import (
BoxAuth,
BoxAuthType,
BoxSearchOptions,
DocumentFiles,
SearchTypeFilter,
_BoxAPIWrapper,
)

try:
__version__ = metadata.version(__package__)
Expand All @@ -16,6 +23,9 @@
"BoxRetriever",
"BoxAuth",
"BoxAuthType",
"BoxSearchOptions",
"DocumentFiles",
"SearchTypeFilter",
"_BoxAPIWrapper",
"__version__",
]
8 changes: 6 additions & 2 deletions libs/partners/box/langchain_box/retrievers/box.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pydantic import ConfigDict, model_validator
from typing_extensions import Self

from langchain_box.utilities import BoxAuth, _BoxAPIWrapper
from langchain_box.utilities import BoxAuth, BoxSearchOptions, _BoxAPIWrapper


class BoxRetriever(BaseRetriever):
Expand Down Expand Up @@ -128,7 +128,10 @@ def format_docs(docs):
"""character_limit is an int that caps the number of characters to
return per document."""

_box: Optional[_BoxAPIWrapper] = None
box_search_options: Optional[BoxSearchOptions] = None
"""Search options to configure BoxRetriever to narrow search results."""

_box: Optional[_BoxAPIWrapper]

model_config = ConfigDict(
arbitrary_types_allowed=True,
Expand All @@ -150,6 +153,7 @@ def validate_box_loader_inputs(self) -> Self:
box_developer_token=self.box_developer_token,
box_auth=self.box_auth,
character_limit=self.character_limit,
box_search_options=self.box_search_options,
)

self._box = _box
Expand Down
18 changes: 16 additions & 2 deletions libs/partners/box/langchain_box/utilities/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
"""Box API Utilities."""

from langchain_box.utilities.box import BoxAuth, BoxAuthType, _BoxAPIWrapper
from langchain_box.utilities.box import (
BoxAuth,
BoxAuthType,
BoxSearchOptions,
DocumentFiles,
SearchTypeFilter,
_BoxAPIWrapper,
)

__all__ = ["BoxAuth", "BoxAuthType", "_BoxAPIWrapper"]
__all__ = [
"BoxAuth",
"BoxAuthType",
"BoxSearchOptions",
"DocumentFiles",
"SearchTypeFilter",
"_BoxAPIWrapper",
]
149 changes: 145 additions & 4 deletions libs/partners/box/langchain_box/utilities/box.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,128 @@ def get_client(self) -> box_sdk_gen.BoxClient:
return self._box_client


class SearchTypeFilter(Enum):
    """Parts of a Box item that a search query may be matched against.

    Each member's value is the string used to select that part of the
    item, so a search can be restricted to, for example, file content only.
    """

    NAME = "name"
    """Match against the item's ``name`` field."""

    DESCRIPTION = "description"
    """Match against the item's ``description`` field."""

    FILE_CONTENT = "file_content"
    """Match against the actual content of the file."""

    COMMENTS = "comments"
    """Match against the content of comments on a file or folder."""

    TAGS = "tags"
    """Match against tags applied to the item (its ``tags`` field)."""


def _date_range_is_reversed(date_range: List[str]) -> bool:
    """Return True when both bounds are present and the start is after the end.

    A range with fewer than two entries, or with an empty/``None`` bound, is
    treated as open-ended and is never considered reversed.
    """
    # Local import so the file-level import block does not need to change.
    from datetime import datetime

    if len(date_range) < 2:
        return False

    start, end = date_range[0], date_range[1]
    if not start or not end:
        return False

    def _parse(stamp: str) -> datetime:
        # ``fromisoformat`` rejects a trailing "Z" before Python 3.11.
        if stamp.endswith("Z"):
            stamp = stamp[:-1] + "+00:00"
        return datetime.fromisoformat(stamp)

    try:
        # Compare real instants: plain string comparison gives the wrong
        # answer when the two timestamps carry different UTC offsets.
        return _parse(start) > _parse(end)
    except (TypeError, ValueError):
        # Unparseable values (or a naive/aware mix) fall back to the
        # lexicographic comparison so previously accepted input still works.
        return start > end


class BoxSearchOptions(BaseModel):
    """Optional filters used to narrow the results of a Box search.

    Every field defaults to "no restriction" except ``k``, which caps the
    number of returned items at 100. Validation rejects a ``k`` outside
    1..200 and any date range whose start is after its end.
    """

    ancestor_folder_ids: Optional[List[str]] = None
    """Limits the search results to items within the given list of folders,
    defined as a list of folder IDs."""

    search_type_filter: Optional[List[SearchTypeFilter]] = None
    """Limits the search results to any items that match the search query for a
    specific part of the file, for example the file description.
    Content types are defined as a list of ``SearchTypeFilter`` entries.
    Default is all."""

    created_date_range: Optional[List[str]] = None
    """Limits the search results to any items created within a given date range.
    Date ranges are defined as a ``[start, end]`` list of RFC3339 timestamps.
    If the start date is omitted (empty string) anything created before the
    end date will be returned.
    If the end date is omitted (empty string) the current date will be used
    as the end date instead."""

    file_extensions: Optional[List[DocumentFiles]] = None
    """Limits the search results to any files that match any of the provided
    file extensions. This is a list of
    ``langchain_box.utilities.DocumentFiles`` entries."""

    k: Optional[int] = 100
    """Defines the maximum number of items to return. Defaults to 100, maximum
    is 200."""

    size_range: Optional[List[int]] = None
    """Limits the search results to any items with a size within a given file
    size range. This applies to files and folders.
    Size ranges are defined as a list of a lower and upper byte size limit
    (inclusive).
    The upper and lower bound can be omitted to create open ranges."""

    updated_date_range: Optional[List[str]] = None
    """Limits the search results to any items updated within a given date range.
    Date ranges are defined as a ``[start, end]`` list of RFC3339 timestamps.
    If the start date is omitted (empty string) anything updated before the
    end date will be returned.
    If the end date is omitted (empty string) the current date will be used
    as the end date instead."""

    # Pydantic v2 style configuration, matching the other models in this file.
    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        use_enum_values=True,
        extra="allow",
    )

    @model_validator(mode="after")
    def validate_search_options(self) -> Self:
        """Validate ``k`` and the ordering of both date ranges.

        Returns:
            The validated model instance.

        Raises:
            ValueError: If ``k`` is set and is not between 1 and 200, or if a
                date range has both bounds and the start is after the end.
        """
        # ``k`` is Optional: only range-check it when a value was supplied.
        if self.k is not None and not 1 <= self.k <= 200:
            raise ValueError(
                f"Invalid setting of k {self.k}. " "Value must be between 1 and 200."
            )

        for date_range in (self.created_date_range, self.updated_date_range):
            if date_range and _date_range_is_reversed(date_range):
                raise ValueError("Start date must be before end date.")

        return self


class _BoxAPIWrapper(BaseModel):
"""Wrapper for Box API."""

Expand All @@ -485,7 +607,10 @@ class _BoxAPIWrapper(BaseModel):
"""character_limit is an int that caps the number of characters to
return per document."""

_box: Optional[box_sdk_gen.BoxClient] = None
box_search_options: Optional[BoxSearchOptions] = None
"""Search options to configure BoxRetriever to narrow search results."""

_box: Optional[box_sdk_gen.BoxClient]

model_config = ConfigDict(
arbitrary_types_allowed=True,
Expand Down Expand Up @@ -636,9 +761,25 @@ def search_box(self, query: str) -> List[Document]:
files = []

try:
results = self._box.search.search_for_content( # type: ignore[union-attr]
query=query, fields=["id", "type", "extension"]
)
results = None

if self.box_search_options is None:
results = self._box.search.search_for_content( # type: ignore[union-attr]
query=query, fields=["id", "type", "extension"], type="file"
)
else:
results = self._box.search.search_for_content( # type: ignore[union-attr]
query=query,
fields=["id", "type", "extension"],
type="file",
ancestor_folder_ids=self.box_search_options.ancestor_folder_ids, # type: ignore[union-attr]
content_types=self.box_search_options.search_type_filter, # type: ignore[union-attr]
created_at_range=self.box_search_options.created_date_range, # type: ignore[union-attr]
file_extensions=self.box_search_options.file_extensions, # type: ignore[union-attr]
limit=self.box_search_options.k, # type: ignore[union-attr]
size_range=self.box_search_options.size_range, # type: ignore[union-attr]
updated_at_range=self.box_search_options.updated_date_range, # type: ignore[union-attr]
)

if results.entries is None or len(results.entries) <= 0:
return None # type: ignore[return-value]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@
from pytest_mock import MockerFixture

from langchain_box.retrievers import BoxRetriever
from langchain_box.utilities import BoxAuth, BoxAuthType
from langchain_box.utilities import (
BoxAuth,
BoxAuthType,
BoxSearchOptions,
DocumentFiles,
SearchTypeFilter,
)


# Test auth types
Expand Down Expand Up @@ -62,6 +68,44 @@ def test_search(mocker: MockerFixture) -> None:
]


# test search options
def test_search_options(mocker: MockerFixture) -> None:
    """BoxRetriever accepts a populated BoxSearchOptions and returns results.

    ``_BoxAPIWrapper.search_box`` is patched, so no network access happens;
    the test checks that the options object validates and that the retriever
    passes the mocked documents through unchanged.
    """
    mocker.patch(
        "langchain_box.utilities._BoxAPIWrapper.search_box",
        return_value=(
            [
                Document(
                    page_content="Test file mode\ndocument contents",
                    metadata={"title": "Testing Files"},
                )
            ]
        ),
    )

    box_search_options = BoxSearchOptions(
        ancestor_folder_ids=["box_folder_id"],
        search_type_filter=[SearchTypeFilter.FILE_CONTENT],
        # Both bounds are plain RFC3339 timestamps (no stray separators).
        created_date_range=["2023-01-01T00:00:00-07:00", "2024-08-01T00:00:00-07:00"],
        file_extensions=[DocumentFiles.DOCX, DocumentFiles.PDF],
        k=200,
        size_range=[1, 1000000],
        updated_date_range=None,
    )

    retriever = BoxRetriever(  # type: ignore[call-arg]
        box_developer_token="box_developer_token", box_search_options=box_search_options
    )

    documents = retriever.invoke("query")

    assert documents == [
        Document(
            page_content="Test file mode\ndocument contents",
            metadata={"title": "Testing Files"},
        )
    ]


# test ai retrieval
def test_ai(mocker: MockerFixture) -> None:
mocker.patch(
Expand Down
3 changes: 3 additions & 0 deletions libs/partners/box/tests/unit_tests/test_imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
"BoxRetriever",
"BoxAuth",
"BoxAuthType",
"BoxSearchOptions",
"DocumentFiles",
"SearchTypeFilter",
"_BoxAPIWrapper",
"__version__",
]
Expand Down

0 comments on commit acbb4e4

Please sign in to comment.