Skip to content

Adding Glob support for Data lake #36

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ocifs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

from .core import OCIFileSystem
Expand Down
67 changes: 52 additions & 15 deletions ocifs/core.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
import os
from ast import literal_eval
Expand Down Expand Up @@ -359,6 +359,20 @@ def split_path(self, path, **kwargs):
obj_path = obj_path.rstrip("/")
return bucket, namespace, obj_path

def setup_oci_client(self, config, **kwargs):
    """Create the Lakesharing Object Storage client.

    ``config`` is the OCI configuration handed to
    ``LakeSharingObjectStorageClient``; when it is ``None`` the instance's
    ``self.config`` is used. ``kwargs`` are forwarded to the client
    constructor; when none are given ``self.config_kwargs`` is used.

    Returns the newly constructed client. Any exception raised while
    building the client is logged and re-raised unchanged.
    """
    # Honour the arguments the caller passed in. Previously they were
    # accepted but ignored in favour of self.config / self.config_kwargs.
    if config is None:
        config = self.config
    if not kwargs:
        kwargs = self.config_kwargs
    try:
        logger.debug(
            f"Lakesharing Object Storage Client is being set up for supporting data lake support and "
            f"interacting with object storage using the config passed in: {config}"
        )
        return LakeSharingObjectStorageClient(config, **kwargs)
    except Exception:
        logger.error(
            f"Exception encountered when attempting to initialize the Lakesharing Object Storage Client "
            f"using the config:{config}"
        )
        # Bare raise keeps the original traceback intact.
        raise

def connect(self, refresh=True):
"""Establish oci connection object.

Expand Down Expand Up @@ -386,20 +400,7 @@ def connect(self, refresh=True):
{"additional_user_agent": f"Oracle-ocifs/version={__version__}"}
)
self._get_region()
try:
self.oci_client = LakeSharingObjectStorageClient(
self.config, **self.config_kwargs
)
logger.debug(
f"Lakesharing Object Storage Client is being set up for supporting data lake support and "
f"interacting with object storage using the config passed in: {self.config}"
)
except Exception as e:
logger.error(
"Exception encountered when attempting to initialize the Lakesharing Object Storage Client"
" using the config:{self.config}"
)
raise e
self.oci_client = self.setup_oci_client(self.config, **self.config_kwargs)
return self.oci_client

def invalidate_cache(self, path=None):
Expand Down Expand Up @@ -1274,6 +1275,42 @@ def walk(self, path, maxdepth=None, **kwargs):
raise ValueError("Cannot crawl all of OCI Object Storage")
return super().walk(path, maxdepth=maxdepth, **kwargs)

def glob(self, path, maxdepth=None, **kwargs):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be in the ocilake specific implementation file?

Copy link
Member Author

@ramackri ramackri Feb 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the URI contains lake, the customized logic is executed; otherwise the fsspec implementation is invoked. We use self.split_path(path) (defined in core.py), which fetches the bucket/namespace info from the lake service. This custom logic does not have any impact on the existing use cases.

"""
Find files by glob-matching.

If the path ends with '/', only folders are returned.

We support ``"**"``,
``"?"`` and ``"[..]"``. We do not support ^ for pattern negation.

The `maxdepth` option is applied on the first `**` found in the path.

Search path names that contain embedded characters special to this
implementation of glob may not produce expected results;
e.g., 'foo/bar/*starredfilename*'.

kwargs are passed to ``ls``.
"""
path_sans_protocol = self._strip_protocol(path)
full_bucket, _, obj_path = path_sans_protocol.partition("/")
# Added the below check for lake support
if "@ocid1.lake" in full_bucket:
ocifs_url = full_bucket
ocifs_url = f"ocilake://{ocifs_url}"
bucket, namespace, key = self.split_path(path)
bucket_full_path = _build_full_path(bucket, namespace, key)
bucket_with_namespace_path = _build_full_path(bucket, namespace)
path_list = super().glob(bucket_full_path, maxdepth=maxdepth, **kwargs)
formatted_path_list = []
for path in path_list:
formatted_path_list.append(
ocifs_url + path.removeprefix(bucket_with_namespace_path)
)
return formatted_path_list
else:
return super().glob(path, maxdepth=maxdepth, **kwargs)

def cat(self, path, recursive=False, on_error="raise", **kwargs):
"""Fetch (potentially multiple) paths' contents

Expand Down
2 changes: 1 addition & 1 deletion ocifs/data_lake/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# coding: utf-8
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
2 changes: 1 addition & 1 deletion ocifs/data_lake/lake_mount.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
from oci.util import (
formatted_flat_dict,
Expand Down
2 changes: 1 addition & 1 deletion ocifs/data_lake/lake_sharing_client.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
import logging
import os
Expand Down
2 changes: 1 addition & 1 deletion ocifs/data_lake/lake_sharing_object_storage_client.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
import logging
import os
Expand Down
2 changes: 1 addition & 1 deletion ocifs/data_lake/lakehouse.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
from oci.util import (
formatted_flat_dict,
Expand Down
2 changes: 1 addition & 1 deletion ocifs/data_lake/lakehouse_client.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
from __future__ import absolute_import

Expand Down
2 changes: 1 addition & 1 deletion ocifs/data_lake/managed_prefix_collection.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
from oci.util import (
formatted_flat_dict,
Expand Down
2 changes: 1 addition & 1 deletion ocifs/data_lake/managed_prefix_summary.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
from oci.util import (
formatted_flat_dict,
Expand Down
2 changes: 1 addition & 1 deletion ocifs/data_lake/mount_specification.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
from oci.util import (
formatted_flat_dict,
Expand Down
2 changes: 1 addition & 1 deletion ocifs/data_lake/par_response.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
from oci.util import (
formatted_flat_dict,
Expand Down
2 changes: 1 addition & 1 deletion ocifs/data_lake/rename_object_details.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
from oci.util import (
formatted_flat_dict,
Expand Down
2 changes: 1 addition & 1 deletion ocifs/errors.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
import errno
import functools
Expand Down
2 changes: 1 addition & 1 deletion ocifs/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# coding: utf-8
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
from ..core import OCIFileSystem
2 changes: 1 addition & 1 deletion ocifs/tests/test_integration.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
import pandas as pd
import numpy as np
Expand Down
2 changes: 1 addition & 1 deletion ocifs/tests/test_integration_lake.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
import pandas as pd
import os
Expand Down
2 changes: 1 addition & 1 deletion ocifs/tests/test_spec.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2023 Oracle and/or its affiliates.
# Copyright (c) 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

from contextlib import contextmanager
Expand Down
28 changes: 27 additions & 1 deletion ocifs/tests/test_spec_lake.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2023 Oracle and/or its affiliates.
# Copyright (c) 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import io
Expand Down Expand Up @@ -202,6 +202,32 @@ def test_oci_ls(fs):
assert nested_file1_path in fs.ls(full_external_mount_name + "/nested")


def test_glob(fs):
    """Exercise ``fs.glob`` under the external mount: depth, wildcards and literal dots."""
    root = full_external_mount_name
    nested_file = root + "/nested/file1"

    # Patterns that stop above the contents of ``nested`` must not report the file.
    for shallow_suffix in ("/", "/*", "/nested"):
        assert nested_file not in fs.glob(root + shallow_suffix)

    # Patterns that reach into ``nested`` must report it.
    for deep_suffix in ("/nested/*", "/nested/file*", "/*/*"):
        assert nested_file in fs.glob(root + deep_suffix)

    # A pattern without wildcards resolves to exactly that path.
    nested2_dir = root + "/nested/nested2"
    assert [nested2_dir] == fs.glob(root + "/nested/nested2")

    nested2_children = fs.glob(root + "/nested/nested2/*")
    expected_children = {
        f"{root}/nested/nested2/file1",
        f"{root}/nested/nested2/file2",
    }
    assert expected_children == set(nested2_children)

    # Make sure glob() deals with the dot character (.) correctly.
    dot_pattern = root + "/file.*"
    assert root + "/file.dat" in fs.glob(dot_pattern)
    assert root + "/filexdat" not in fs.glob(dot_pattern)


def test_oci_ls_detail(fs):
L = fs.ls(full_external_mount_name + "/nested", detail=True)
assert all(isinstance(item, CaseInsensitiveDict) for item in L)
Expand Down
2 changes: 1 addition & 1 deletion ocifs/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import sys
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

### File setup.py obsolete and must not be used. Please update pyproject.toml instead.
Expand Down