Merge pull request #62 from polaris-hub/feat/improve-code-examples

cwognum · web-flow · commit c6f758b0112b · 2023-11-28T11:59:00.000-05:00
Updated code-example to work out-of-the-box
diff --git a/README.md b/README.md
@@ -49,20 +49,26 @@ This library is a Python client to interact with the [Polaris Hub](https://polar
 ```python
 import polaris as po
 
-# Download a benchmark (the associated dataset will be transparently downloaded)
-benchmark = po.load_benchmark("org_or_user/name")
+# Load the benchmark from the Hub
+benchmark = po.load_benchmark("polaris/hello_world_benchmark")
 
-# Retrieve the splits
+# Get the train and test data-loaders
 train, test = benchmark.get_train_test_split()
 
-# Work your magic!
-y_pred = ...
+# Use the training data to train your model
+# Get the inputs and targets as arrays with 'train.inputs' and 'train.targets'
+# Or simply iterate over the train object.
+for x, y in train:
+    ...
 
-# Run the evaluation procedure
-results = benchmark.evaluate(y_pred)
+# Work your magic to accurately predict the test set
+predictions = [0.0 for x in test]
 
-# Upload your results to the hub
-results.upload_to_hub()
+# Evaluate your predictions
+results = benchmark.evaluate(predictions)
+
+# Submit your results
+results.upload_to_hub(owner="dummy-user")
 ```
 
 ## Documentation
diff --git a/docs/api/load.md b/docs/api/load.md
@@ -0,0 +1,8 @@
+ 
+::: polaris.load_dataset
+
+---
+
+::: polaris.load_benchmark
+
+---
diff --git a/docs/quickstart.md b/docs/quickstart.md
@@ -24,13 +24,26 @@ If all you care about is to partake in a benchmark that is hosted on the hub, it
 ```python
 import polaris as po
 
-benchmark = po.load_benchmark("org_or_user/name")
+# Load the benchmark from the Hub
+benchmark = po.load_benchmark("polaris/hello_world_benchmark")
+
+# Get the train and test data-loaders
 train, test = benchmark.get_train_test_split()
 
-y_pred = ...  # Work your magic!
+# Use the training data to train your model
+# Get the inputs and targets as arrays with 'train.inputs' and 'train.targets'
+# Or simply iterate over the train object.
+for x, y in train:
+    ...
+
+# Work your magic to accurately predict the test set
+predictions = [0.0 for x in test]
+
+# Evaluate your predictions
+results = benchmark.evaluate(predictions)
 
-results = benchmark.evaluate(y_pred)
-results.upload_to_hub()
+# Submit your results
+results.upload_to_hub(owner="dummy-user")
 ```
 
 That's all it takes to partake in a benchmark. No complicated, custom data-loaders or evaluation protocol. With just a few lines of code, you can feel confident that you are properly evaluating your model and focus on what you do best: Solving the hard problems in our domain!
diff --git a/docs/tutorials/basics.ipynb b/docs/tutorials/basics.ipynb
@@ -63,7 +63,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[32m2023-11-06 17:37:18.375\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpolaris.hub.client\u001b[0m:\u001b[36mlogin\u001b[0m:\u001b[36m262\u001b[0m - \u001b[1mYou are already logged in to the Polaris Hub as lu-valencelabs (lu@valencediscovery.com). Set `overwrite=True` to force re-authentication.\u001b[0m\n"
+      "\u001b[32m2023-11-27 14:54:08.788\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpolaris.hub.client\u001b[0m:\u001b[36mlogin\u001b[0m:\u001b[36m262\u001b[0m - \u001b[1mYou are already logged in to the Polaris Hub as cwognum (cas@valencediscovery.com). Set `overwrite=True` to force re-authentication.\u001b[0m\n"
      ]
     }
    ],
@@ -285,7 +285,7 @@
     {
      "data": {
       "text/html": [
-       "<table border=\"1\"><tr><th>name</th><td>None</td></tr><tr><th>description</th><td></td></tr><tr><th>tags</th><td></td></tr><tr><th>user_attributes</th><td></td></tr><tr><th>owner</th><td>None</td></tr><tr><th>benchmark_name</th><td>hello_world_benchmark</td></tr><tr><th>benchmark_owner</th><td><table border=\"1\"><tr><th>slug</th><td>polaris</td></tr><tr><th>organization_id</th><td>org_2WG9hRFgKNIRtGw4orsMPcr1F4S</td></tr><tr><th>user_id</th><td>None</td></tr><tr><th>owner</th><td>org_2WG9hRFgKNIRtGw4orsMPcr1F4S</td></tr></table></td></tr><tr><th>github_url</th><td>None</td></tr><tr><th>paper_url</th><td>None</td></tr><tr><th>contributors</th><td>None</td></tr><tr><th>results</th><td><table border=\"1\"><thead><tr><th>Test set</th><th>Target label</th><th>Metric</th><th>Score</th></tr></thead><tbody><tr><td>test</td><td>SOL</td><td>mean_squared_error</td><td>2.6875139821</td></tr><tr><td>test</td><td>SOL</td><td>mean_absolute_error</td><td>1.2735690161</td></tr></tbody></table></td></tr></table>"
+       "<table border=\"1\"><tr><th>name</th><td>None</td></tr><tr><th>description</th><td></td></tr><tr><th>tags</th><td></td></tr><tr><th>user_attributes</th><td></td></tr><tr><th>owner</th><td>None</td></tr><tr><th>benchmark_name</th><td>hello_world_benchmark</td></tr><tr><th>benchmark_owner</th><td><table border=\"1\"><tr><th>slug</th><td>polaris</td></tr><tr><th>external_id</th><td>org_2WG9hRFgKNIRtGw4orsMPcr1F4S</td></tr><tr><th>type</th><td>organization</td></tr></table></td></tr><tr><th>github_url</th><td>None</td></tr><tr><th>paper_url</th><td>None</td></tr><tr><th>contributors</th><td>None</td></tr><tr><th>artifact_id</th><td>None</td></tr><tr><th>benchmark_artifact_id</th><td>polaris/hello-world-benchmark</td></tr><tr><th>results</th><td><table border=\"1\"><thead><tr><th>Test set</th><th>Target label</th><th>Metric</th><th>Score</th></tr></thead><tbody><tr><td>test</td><td>SOL</td><td>mean_squared_error</td><td>2.6875139821</td></tr><tr><td>test</td><td>SOL</td><td>mean_absolute_error</td><td>1.2735690161</td></tr></tbody></table></td></tr></table>"
       ],
       "text/plain": [
        "{\n",
@@ -297,13 +297,14 @@
        "  \"benchmark_name\": \"hello_world_benchmark\",\n",
        "  \"benchmark_owner\": {\n",
        "    \"slug\": \"polaris\",\n",
-       "    \"organization_id\": \"org_2WG9hRFgKNIRtGw4orsMPcr1F4S\",\n",
-       "    \"user_id\": null,\n",
-       "    \"owner\": \"org_2WG9hRFgKNIRtGw4orsMPcr1F4S\"\n",
+       "    \"external_id\": \"org_2WG9hRFgKNIRtGw4orsMPcr1F4S\",\n",
+       "    \"type\": \"organization\"\n",
        "  },\n",
        "  \"github_url\": null,\n",
        "  \"paper_url\": null,\n",
        "  \"contributors\": null,\n",
+       "  \"artifact_id\": null,\n",
+       "  \"benchmark_artifact_id\": \"polaris/hello-world-benchmark\",\n",
        "  \"results\": [\n",
        "    {\n",
        "      \"Test set\": \"test\",\n",
@@ -341,7 +342,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 15,
    "id": "a601f415-c563-4efe-94c3-0d44f3fd6576",
    "metadata": {},
    "outputs": [],
@@ -362,7 +363,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 16,
    "id": "60cbf4b9-8514-480d-beda-8a50e5f7c9a6",
    "metadata": {
     "scrolled": true
@@ -372,16 +373,16 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/Users/lu.zhu/miniconda3/envs/pov3/lib/python3.11/site-packages/pydantic/main.py:309: UserWarning: Pydantic serializer warnings:\n",
+      "/home/cas/micromamba/envs/polaris/lib/python3.12/site-packages/pydantic/main.py:308: UserWarning: Pydantic serializer warnings:\n",
       "  Expected `url` but got `str` - serialized value may not be as expected\n",
       "  Expected `url` but got `str` - serialized value may not be as expected\n",
       "  return self.__pydantic_serializer__.to_python(\n",
-      "\u001b[32m2023-11-06 17:38:06.152\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mpolaris.hub.client\u001b[0m:\u001b[36mupload_results\u001b[0m:\u001b[36m413\u001b[0m - \u001b[32m\u001b[1mYour result has been successfully uploaded to the Hub. View it here: https://polarishub.io/benchmarks/polaris/hello_world_benchmark/YYH033LKM1BaT8byAC5Jc\u001b[0m\n"
+      "\u001b[32m2023-11-27 14:54:46.649\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mpolaris.hub.client\u001b[0m:\u001b[36mupload_results\u001b[0m:\u001b[36m428\u001b[0m - \u001b[32m\u001b[1mYour result has been successfully uploaded to the Hub. View it here: https://polarishub.io/benchmarks/polaris/hello_world_benchmark/ns4JrC3hQNK9M1hbVPchy\u001b[0m\n"
      ]
     }
    ],
    "source": [
-    "client.upload_results(results)\n",
+    "client.upload_results(results, owner=\"cwognum\")\n",
     "client.close()"
    ]
   },
@@ -396,14 +397,6 @@
     "\n",
     "---"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0868ff53-7a42-4e4c-bae4-29fb04c513c7",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
@@ -422,7 +415,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.5"
+   "version": "3.12.0"
   }
  },
  "nbformat": 4,
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -22,6 +22,7 @@ nav:
       - Custom Datasets and Benchmarks: tutorials/custom_dataset_benchmark.ipynb
       # - Creating Datasets with zarr: tutorials/dataset_zarr.ipynb
   - API Reference:
+      - Load: api/load.md
       - Core:
           - Dataset: api/dataset.md
           - Benchmark: api/benchmark.md
diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py
@@ -22,7 +22,7 @@
 from polaris.utils.dict2html import dict2html
 from polaris.utils.errors import InvalidBenchmarkError, PolarisChecksumError
 from polaris.utils.misc import listit
-from polaris.utils.types import AccessType, DataFormat, PredictionsType, SplitType
+from polaris.utils.types import AccessType, DataFormat, HubOwner, PredictionsType, SplitType
 
 ColumnsType = Union[str, list[str]]
 
@@ -371,6 +371,7 @@ def upload_to_hub(
         settings: Optional[PolarisHubSettings] = None,
         cache_auth_token: bool = True,
         access: Optional[AccessType] = "private",
+        owner: Optional[Union[HubOwner, str]] = None,
         **kwargs: dict,
     ):
         """
@@ -382,7 +383,7 @@ def upload_to_hub(
         with PolarisHubClient(
             env_file=env_file, settings=settings, cache_auth_token=cache_auth_token, **kwargs
         ) as client:
-            return client.upload_benchmark(self, access)
+            return client.upload_benchmark(self, access=access, owner=owner)
 
     def to_json(self, destination: str) -> str:
         """Save the benchmark to a destination directory as a JSON file.
diff --git a/polaris/dataset/_dataset.py b/polaris/dataset/_dataset.py
@@ -23,7 +23,7 @@
 from polaris.utils.dict2html import dict2html
 from polaris.utils.errors import InvalidDatasetError, PolarisChecksumError
 from polaris.utils.io import get_zarr_root, robust_copy
-from polaris.utils.types import AccessType, HttpUrlString, License
+from polaris.utils.types import AccessType, HttpUrlString, HubOwner, License
 
 # Constants
 _SUPPORTED_TABLE_EXTENSIONS = ["parquet"]
@@ -201,6 +201,7 @@ def upload_to_hub(
         settings: Optional[PolarisHubSettings] = None,
         cache_auth_token: bool = True,
         access: Optional[AccessType] = "private",
+        owner: Optional[Union[HubOwner, str]] = None,
         **kwargs: dict,
     ):
         """
@@ -212,7 +213,7 @@ def upload_to_hub(
         with PolarisHubClient(
             env_file=env_file, settings=settings, cache_auth_token=cache_auth_token, **kwargs
         ) as client:
-            return client.upload_dataset(self, access)
+            return client.upload_dataset(self, access=access, owner=owner)
 
     @classmethod
     def from_zarr(cls, path: str) -> "Dataset":
diff --git a/polaris/evaluate/_results.py b/polaris/evaluate/_results.py
@@ -182,6 +182,7 @@ def upload_to_hub(
         settings: Optional[PolarisHubSettings] = None,
         cache_auth_token: bool = True,
         access: Optional[AccessType] = "private",
+        owner: Optional[Union[HubOwner, str]] = None,
         **kwargs: dict,
     ):
         """
@@ -193,7 +194,7 @@ def upload_to_hub(
         with PolarisHubClient(
             env_file=env_file, settings=settings, cache_auth_token=cache_auth_token, **kwargs
         ) as client:
-            return client.upload_results(self, access)
+            return client.upload_results(self, access=access, owner=owner)
 
     def _repr_dict_(self) -> dict:
         """Utility function for pretty-printing to the command line and jupyter notebooks"""
diff --git a/polaris/hub/client.py b/polaris/hub/client.py
@@ -373,7 +373,12 @@ def get_benchmark(self, owner: Union[str, HubOwner], name: str) -> BenchmarkSpec
         )
         return benchmark_cls(**response)
 
-    def upload_results(self, results: BenchmarkResults, access: AccessType = "private"):
+    def upload_results(
+        self,
+        results: BenchmarkResults,
+        access: AccessType = "private",
+        owner: Optional[Union[HubOwner, str]] = None,
+    ):
         """Upload the results to the Polaris Hub.
 
         Info: Owner
@@ -395,9 +400,19 @@ def upload_results(self, results: BenchmarkResults, access: AccessType = "privat
         Args:
             results: The results to upload.
             access: Grant public or private access to result
+            owner: Which Hub user or organization owns the artifact.
+                Optional if and only if the `results.owner` attribute is set.
         """
 
         # Get the serialized model data-structure
+
+        if results.owner is None:
+            if owner is None:
+                raise ValueError(
+                    "The `owner` argument must be specified if the `results.owner` attribute is not set."
+                )
+            results.owner = owner if isinstance(owner, HubOwner) else HubOwner(slug=owner)
+
         result_json = results.model_dump(by_alias=True, exclude_none=True)
 
         # Make a request to the hub
@@ -414,7 +429,11 @@ def upload_results(self, results: BenchmarkResults, access: AccessType = "privat
         return response
 
     def upload_dataset(
-        self, dataset: Dataset, access: AccessType = "private", timeout: TimeoutTypes = (10, 200)
+        self,
+        dataset: Dataset,
+        access: AccessType = "private",
+        timeout: TimeoutTypes = (10, 200),
+        owner: Optional[Union[HubOwner, str]] = None,
     ):
         """Upload the dataset to the Polaris Hub.
 
@@ -432,8 +451,21 @@ def upload_dataset(
             dataset: The dataset to upload.
             access: Grant public or private access to the dataset
             timeout: Request timeout values. User can modify the value when uploading large dataset as needed.
+                This can be a single value with the timeout in seconds for all IO operations, or a more granular
+                tuple with (connect_timeout, write_timeout). The type of the timeout parameter comes from `httpx`.
+                Since datasets can get large, you may need to increase the write timeout for larger datasets.
+                See also: https://www.python-httpx.org/advanced/#timeout-configuration
+            owner: Which Hub user or organization owns the artifact.
+                Optional if and only if the `dataset.owner` attribute is set.
         """
 
+        if dataset.owner is None:
+            if owner is None:
+                raise ValueError(
+                    "The `owner` argument must be specified if the `dataset.owner` attribute is not set."
+                )
+            dataset.owner = owner if isinstance(owner, HubOwner) else HubOwner(slug=owner)
+
         # Get the serialized data-model
         # We exclude the table as it handled separately and the cache_dir as it is user-specific
         dataset_json = dataset.model_dump(exclude={"cache_dir", "table"}, exclude_none=True, by_alias=True)
@@ -500,7 +532,12 @@ def upload_dataset(
 
         return response
 
-    def upload_benchmark(self, benchmark: BenchmarkSpecification, access: AccessType = "private"):
+    def upload_benchmark(
+        self,
+        benchmark: BenchmarkSpecification,
+        access: AccessType = "private",
+        owner: Optional[Union[HubOwner, str]] = None,
+    ):
         """Upload the benchmark to the Polaris Hub.
 
         Info: Owner
@@ -520,7 +557,15 @@ def upload_benchmark(self, benchmark: BenchmarkSpecification, access: AccessType
         Args:
             benchmark: The benchmark to upload.
             access: Grant public or private access to the benchmark
+            owner: Which Hub user or organization owns the artifact.
+                Optional if and only if the `benchmark.owner` attribute is set.
         """
+        if benchmark.owner is None:
+            if owner is None:
+                raise ValueError(
+                    "The `owner` argument must be specified if the `benchmark.owner` attribute is not set."
+                )
+            benchmark.owner = owner if isinstance(owner, HubOwner) else HubOwner(slug=owner)
 
         # Get the serialized data-model
         # We exclude the dataset as we expect it to exist on the hub already.
diff --git a/polaris/loader/__init__.py b/polaris/loader/__init__.py
diff --git a/polaris/loader/load.py b/polaris/loader/load.py

-Original file line number
+Diff line change
@@ @@ -0,0 +1,8 @@ @@
++
 +::: polaris.load_dataset
++
 +---
++
 +::: polaris.load_benchmark
++
 +---