Commit

add lint workflow (#22)
* add lint workflow

* lint

* fix type check bug

* skip test data for spell check

* add codespell rule to pre commit config

* fix spelling
edknv committed Nov 4, 2023
1 parent 050f604 commit 8483255
Showing 66 changed files with 297 additions and 416 deletions.
14 changes: 14 additions & 0 deletions .github/workflows/lint.yaml
@@ -0,0 +1,14 @@
name: lint

on:
pull_request:
push:
branches: [main]

jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
- uses: pre-commit/action@v3.0.0
65 changes: 18 additions & 47 deletions .pre-commit-config.yaml
@@ -5,54 +5,25 @@ repos:
hooks:
- id: absolufy-imports
- repo: https://github.com/python/black
rev: 22.10.0
rev: 23.10.1
hooks:
- id: black
# - repo: https://github.com/timothycrosley/isort
# rev: 5.10.1
# hooks:
# - id: isort
# additional_dependencies: [toml]
# exclude: examples/*
# types
# - repo: https://github.com/pre-commit/mirrors-mypy
# rev: 'v0.940'
# hooks:
# - id: mypy
# language_version: python3
# args: [--no-strict-optional, --ignore-missing-imports, --show-traceback, --install-types, --non-interactive]
# exclude: docs/*
# code style
# - repo: https://github.com/pycqa/pylint
# rev: pylint-2.7.4
# hooks:
# - id: pylint
# exlude: notebooks/*
- repo: https://gitlab.com/pycqa/flake8
rev: 3.9.2
- repo: https://github.com/timothycrosley/isort
rev: 5.12.0
hooks:
- id: isort
- repo: https://github.com/pycqa/flake8
rev: 6.1.0
hooks:
- id: flake8
exclude: notebooks/*
# notebooks
# - repo: https://github.com/s-weigand/flake8-nb
# rev: v0.3.0
# hooks:
# - id: flake8-nb
# files: \.ipynb$
# documentation
# - repo: https://github.com/econchick/interrogate
# rev: 1.5.0
# hooks:
# - id: interrogate
# exclude: ^(build|docs|merlin/io|tests|setup.py|versioneer.py)
# args: [--config=pyproject.toml]
# - repo: https://github.com/codespell-project/codespell
# rev: v2.1.0
# hooks:
# - id: codespell
# # security
# - repo: https://github.com/PyCQA/bandit
# rev: 1.7.0
# hooks:
# - id: bandit
# args: [--verbose, -ll, -x, tests,examples,bench]
- repo: https://github.com/codespell-project/codespell
rev: v2.2.6
hooks:
- id: codespell
exclude: tests/testdata
# security
- repo: https://github.com/PyCQA/bandit
rev: 1.7.0
hooks:
- id: bandit
args: [--verbose, -ll, -x, tests,examples,bench]
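For context, bandit scans Python source for common security smells: -ll limits the report to medium-and-higher severity findings, and -x tests,examples,bench excludes those directories. A toy snippet of the kind of code it flags (hypothetical, for illustration only):

import subprocess

def run_user_command(cmd: str) -> int:
    # bandit flags shell=True (check B602): passing unsanitized input to a
    # shell enables command injection, so this survives the -ll threshold.
    return subprocess.call(cmd, shell=True)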
1 change: 1 addition & 0 deletions ci/ignore_codespell_words.txt
@@ -0,0 +1 @@
nin
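The single ignored word, nin, is presumably a legitimate identifier rather than a typo — NumPy ufuncs, for instance, expose an nin attribute (the number of input operands) that codespell would otherwise try to "correct". A quick check, assuming NumPy is installed:

import numpy as np

# "nin" is a real ufunc attribute (count of input operands), not a misspelling.
print(np.add.nin)       # 2
print(np.negative.nin)  # 1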
37 changes: 20 additions & 17 deletions crossfit/__init__.py
@@ -1,3 +1,5 @@
# flake8: noqa

from crossfit import backend, metric, op
from crossfit.backend.dask.cluster import Distributed, Serial
from crossfit.calculate.aggregate import Aggregator
@@ -6,9 +8,8 @@
from crossfit.data.array.dispatch import crossarray
from crossfit.data.dataframe.core import FrameBackend
from crossfit.data.dataframe.dispatch import CrossFrame
from crossfit.metric import * # noqa
from crossfit.op import * # noqa

from crossfit.metric import *
from crossfit.op import *

__all__ = [
"Aggregator",
@@ -27,21 +28,23 @@


try:
from crossfit.backend.torch import SentenceTransformerModel, TorchExactSearch, HFModel
from crossfit.backend.torch import HFModel, SentenceTransformerModel, TorchExactSearch
from crossfit.dataset.base import IRDataset, MultiDataset
from crossfit.dataset.load import load_dataset
from crossfit.report.beir.embed import embed
from crossfit.report.beir.report import beir_report
from crossfit.dataset.load import load_dataset
from crossfit.dataset.base import IRDataset, MultiDataset

__all__.extend([
"embed",
"beir_report",
"load_dataset",
"TorchExactSearch",
"SentenceTransformerModel",
"HFModel",
"MultiDataset",
"IRDataset",
])

__all__.extend(
[
"embed",
"beir_report",
"load_dataset",
"TorchExactSearch",
"SentenceTransformerModel",
"HFModel",
"MultiDataset",
"IRDataset",
]
)
except ImportError as e:
pass
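The try/except above is the usual optional-dependency pattern: the torch-backed classes are imported and appended to __all__ only when their imports succeed, so the core package stays importable without the GPU extras. A minimal self-contained sketch of the same idea, with hypothetical names:

__all__ = ["core_function"]

def core_function():
    return "always available"

try:
    # "optional_backend" stands in for a heavyweight extra such as torch.
    from optional_backend import fancy_function

    __all__.append("fancy_function")
except ImportError:
    pass  # the core API above remains usable without the extra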
5 changes: 3 additions & 2 deletions crossfit/backend/__init__.py
@@ -1,9 +1,10 @@
from crossfit.backend.numpy.sparse import *
# flake8: noqa

from crossfit.backend.dask.dataframe import *
from crossfit.backend.numpy.sparse import *
from crossfit.backend.pandas.array import *
from crossfit.backend.pandas.dataframe import *


try:
from crossfit.backend.cudf.array import *
from crossfit.backend.cudf.dataframe import *
2 changes: 1 addition & 1 deletion crossfit/backend/cudf/array.py
@@ -1,7 +1,7 @@
import logging

from crossfit.data.array import conversion
from crossfit.data.array.dispatch import np_backend_dispatch, ArrayBackend
from crossfit.data.array.dispatch import ArrayBackend, np_backend_dispatch


@np_backend_dispatch.register_lazy("cudf")
3 changes: 1 addition & 2 deletions crossfit/backend/cudf/dataframe.py
@@ -1,9 +1,8 @@
from typing import Callable


from crossfit.backend.pandas.dataframe import PandasDataFrame
from crossfit.data.array.dispatch import crossarray
from crossfit.data.dataframe.dispatch import CrossFrame
from crossfit.backend.pandas.dataframe import PandasDataFrame


class CudfDataFrame(PandasDataFrame):
2 changes: 1 addition & 1 deletion crossfit/backend/cudf/series.py
@@ -1,5 +1,5 @@
import cupy as cp
import cudf
import cupy as cp
from cudf.core.column import as_column


2 changes: 1 addition & 1 deletion crossfit/backend/cupy/array.py
@@ -1,7 +1,7 @@
import logging

from crossfit.data.array import conversion
from crossfit.data.array.dispatch import np_backend_dispatch, ArrayBackend
from crossfit.data.array.dispatch import ArrayBackend, np_backend_dispatch


@np_backend_dispatch.register_lazy("cupy")
2 changes: 1 addition & 1 deletion crossfit/backend/dask/aggregate.py
@@ -1,8 +1,8 @@
from functools import partial

import dask.dataframe as dd
from dask.delayed import Delayed
from dask.highlevelgraph import HighLevelGraph
import dask.dataframe as dd

from crossfit.calculate.aggregate import Aggregator
from crossfit.data.dataframe.dispatch import CrossFrame
13 changes: 6 additions & 7 deletions crossfit/backend/dask/cluster.py
@@ -1,17 +1,16 @@
from typing import Callable, Optional, Any
from contextvars import ContextVar
import importlib
import gc
import importlib
import warnings
from contextvars import ContextVar
from typing import Any, Callable, Optional

import dask
from dask.distributed import Client, get_client
from dask.dataframe.optimize import optimize as dd_optimize
import distributed
from dask.dataframe.optimize import optimize as dd_optimize
from dask.distributed import Client, get_client

from crossfit.backend.gpu import HAS_GPU


_crossfit_dask_client = ContextVar("_crossfit_dask_client", default="auto")


@@ -174,7 +173,7 @@ class only supports the automatic generation of
The easiest way to use `Distributed` is within a
conventional `with` statement::
from merlin.core.utils import Disributed
from merlin.core.utils import Distributed
workflow = nvt.Workflow(["col"] >> ops.Normalize())
dataset = nvt.Dataset(...)
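The module-level ContextVar above (_crossfit_dask_client, defaulting to "auto") is what lets Distributed and Serial swap the active client in and out without touching a global. A self-contained sketch of the pattern — names here are illustrative, not crossfit's API:

from contextvars import ContextVar

_client_mode = ContextVar("_client_mode", default="auto")

def current_mode() -> str:
    return _client_mode.get()

# e.g. what a context manager's __enter__/__exit__ would do:
token = _client_mode.set("distributed")
try:
    assert current_mode() == "distributed"
finally:
    _client_mode.reset(token)  # restore the previous value
assert current_mode() == "auto"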
2 changes: 1 addition & 1 deletion crossfit/backend/dask/dataframe.py
@@ -3,10 +3,10 @@
from typing import Callable, List

import dask.dataframe as dd

from crossfit.data.dataframe.core import FrameBackend
from crossfit.data.dataframe.dispatch import CrossFrame


# @CrossFrame.register_lazy("dask")
# def register_dask_backend():
# import dask.dataframe as dd
6 changes: 3 additions & 3 deletions crossfit/backend/numpy/sparse.py
@@ -1,12 +1,12 @@
import itertools
from crossfit.data.array.masked import MaskedArray

import numba
import numpy as np
import scipy.sparse as sp
import numba

from crossfit.data.sparse.dispatch import CrossSparse
from crossfit.data.array.masked import MaskedArray
from crossfit.data.sparse.core import SparseMatrixBackend
from crossfit.data.sparse.dispatch import CrossSparse


class NPSparseMatrixBackend(SparseMatrixBackend):
4 changes: 2 additions & 2 deletions crossfit/backend/pandas/dataframe.py
@@ -36,7 +36,7 @@ def concat(cls, frames: List[FrameBackend], axis: int = 0):
if len(frames) == 0:
raise TypeError(f"Expected non-empty list, got {frames}")
for frame in frames:
if type(frame) != cls:
if type(frame) is not cls:
raise TypeError(f"All frames should be type {cls}, got {type(frame)}")

return cls(
@@ -103,8 +103,8 @@ def groupby_indices(self, by: list) -> dict:
@CrossFrame.register_lazy("numpy")
def register_numpy_backend():
try:
import pandas as pd
import numpy as np
import pandas as pd

@CrossFrame.register(np.ndarray)
def _numpy_to_pandas(data, name="data"):
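The one non-cosmetic edit in this file — type(frame) != cls becoming type(frame) is not cls — is what flake8's E721 check asks for: exact type checks compare by identity, while isinstance is reserved for subclass-tolerant checks. A small illustration with stand-in classes:

class FrameBackend:
    pass

class PandasDataFrame(FrameBackend):
    pass

frame = PandasDataFrame()

# Identity rejects subclasses, which is what concat() wants here:
# every frame must be exactly the same concrete type.
assert type(frame) is PandasDataFrame
assert type(frame) is not FrameBackend

# isinstance() accepts subclasses — the right tool for looser checks.
assert isinstance(frame, FrameBackend)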
2 changes: 1 addition & 1 deletion crossfit/backend/torch/array.py
@@ -1,7 +1,7 @@
import logging

from crossfit.data.array import conversion
from crossfit.data.array.dispatch import np_backend_dispatch, ArrayBackend
from crossfit.data.array.dispatch import ArrayBackend, np_backend_dispatch

try:
import torch
23 changes: 9 additions & 14 deletions crossfit/backend/torch/hf/model.py
@@ -1,15 +1,16 @@
from functools import lru_cache
import gc
import os
from crossfit.dataset.home import CF_HOME
import joblib
from functools import lru_cache

import joblib
import numpy as np
import torch
from sklearn.linear_model import LinearRegression
from tqdm import tqdm
from transformers import AutoConfig, AutoModel, AutoTokenizer
from sklearn.linear_model import LinearRegression

from crossfit.backend.torch.model import Model
from crossfit.dataset.home import CF_HOME


class HFModel(Model):
@@ -68,19 +69,13 @@ def fit_memory_estimate_curve(self, model=None):
torch.cuda.reset_peak_memory_stats()

batch = {
"input_ids": torch.randint(1, 501, (batch_size, seq_len)).to(
device=device
),
"attention_mask": torch.ones((batch_size, seq_len)).to(
device=device
),
"input_ids": torch.randint(1, 501, (batch_size, seq_len)).to(device=device),
"attention_mask": torch.ones((batch_size, seq_len)).to(device=device),
}

try:
outputs = model(batch)
memory_used = torch.cuda.max_memory_allocated() / (
1024**2
) # Convert to MB
_ = model(batch)
memory_used = torch.cuda.max_memory_allocated() / (1024**2) # Convert to MB
X.append([batch_size, seq_len, seq_len**2])
y.append(memory_used)

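fit_memory_estimate_curve (reformatted above) probes peak GPU memory over a grid of batch sizes and sequence lengths, then fits a linear model on the features [batch_size, seq_len, seq_len**2], the squared term tracking attention's quadratic cost in sequence length. A CPU-only sketch of the fitting step, with synthetic numbers standing in for torch.cuda.max_memory_allocated():

import numpy as np
from sklearn.linear_model import LinearRegression

X, y = [], []
for batch_size in (1, 8, 32):
    for seq_len in (128, 256, 512):
        X.append([batch_size, seq_len, seq_len**2])
        # synthetic stand-in for measured peak memory in MB
        y.append(2.0 * batch_size + 0.05 * seq_len + 1e-4 * seq_len**2)

mem_model = LinearRegression().fit(np.asarray(X), np.asarray(y))
# Estimate peak memory (MB) for an unseen batch_size/seq_len combination.
print(mem_model.predict([[16, 384, 384**2]])[0])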