diff --git a/.env.template b/.env.template
index ff6a4cb..d41cd68 100644
--- a/.env.template
+++ b/.env.template
@@ -1,3 +1,4 @@
 OPENAI_API_KEY=sk-proj-
 HF_TOKEN=hf_B-
 WANDB_API_KEY=
+PYTHONPATH=.
diff --git a/.gitattributes b/.gitattributes
index 5948bbd..faf1212 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +1,40 @@
 # Ignore Jupyter Notebooks from Github Linguist Stats
 *.ipynb linguist-vendored
+
+# Ignore Large File Storage objects
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
index c029eef..8a83fe9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -167,8 +167,6 @@ cython_debug/
 # Data
 /data
 /temp
-*.parquet
-*.csv
 
 # Write up
 *pdf/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 69559ab..2511889 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -10,7 +10,26 @@ repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v5.0.0
     hooks:
+      - id: check-added-large-files
+      - id: check-executables-have-shebangs
+      - id: check-json
+      - id: check-merge-conflict
+      - id: check-toml
       - id: check-yaml
       - id: end-of-file-fixer
+      - id: mixed-line-ending
+        args: ["--fix=lf"]
+      - id: requirements-txt-fixer
       - id: trailing-whitespace
-      - id: check-added-large-files
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.13.0
+    hooks:
+      - id: mypy
+        args: ["--ignore-missing-imports"]
+        additional_dependencies:
+          [
+            "types-python-slugify",
+            "types-requests",
+            "types-PyYAML",
+            "types-pytz",
+          ]
diff --git a/README.md b/README.md
index 7a7001e..722efc0 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ Otherwise, you can try the setup script:
 bash setup.sh
 ```
 
-It attempts to install with [uv](https://docs.astral.sh/uv/) (a fast, Rust-based Python package and project manager) using `.python-version` file and `pyproject.toml` file. This is the recommended way to manage the project, since its resolver is faster and more reliable than `pip`.
+It attempts to install with [uv](https://docs.astral.sh/uv/) (a fast, Rust-based Python package and project manager) using the `pyproject.toml` file. This is the recommended way to manage the project, since uv's dependency resolver is faster and more reliable than `pip`'s. Otherwise, the script falls back to a `pip` installation.
diff --git a/pyproject.toml b/pyproject.toml
index bab8427..307081d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,9 +5,11 @@ description = "Generate mnemonic sentences for English words"
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
+    "accelerate>=1.0.1",
     "datasets", # Hugging Face datasets
     "evaluate", # HF evaluation
-    "gradio>=4.26.0", # Web app
+    "gradio>=4.26.0", # Web app
+    "hf-transfer>=0.1.8", # Faster transfers with HF Hub
     "numpy<2.0.0", # Wait for other packages to update
     "openai>=1.57.0",
     "peft", # HF parameter-efficient training
@@ -15,8 +17,9 @@ dependencies = [
     "python-dotenv>=1.0.1", # Load environment variables
     "pyyaml>=6.0.2", # YAML config
     "ruff>=0.7.1",
+    "spaces>=0.31.0",
     "tenacity>=9.0.0", # Retry (e.g. API calls)
-    "torch>=2.5.1", # PyTorch
+    "torch>=2.4.0", # PyTorch
     "tqdm>=4.67.1", # Progress bar
     "transformers", # HF transformers
     "trl", # HF transformer reinforcement learning
diff --git a/requirements.txt b/requirements.txt
index 33df715..11d5754 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -98,6 +98,7 @@ gitdb==4.0.11
 gitpython==3.1.43
     # via wandb
 gradio==4.26.0
+    # via spaces
 gradio-client==0.15.1
     # via gradio
 h11==0.14.0
@@ -111,6 +112,8 @@ httpx==0.27.2
     #   gradio
     #   gradio-client
     #   openai
+    #   safehttpx
+    #   spaces
 huggingface-hub==0.26.1
     # via
     #   accelerate
@@ -196,6 +199,7 @@ packaging==24.1
     #   huggingface-hub
     #   matplotlib
     #   peft
+    #   spaces
     #   transformers
 pandas==2.2.3
     # via
@@ -220,6 +224,7 @@ psutil==5.9.8
     # via
     #   accelerate
     #   peft
+    #   spaces
     #   wandb
 pyarrow==18.1.0
     # via datasets
@@ -230,6 +235,7 @@ pydantic==2.9.2
     #   fastapi
     #   gradio
     #   openai
+    #   spaces
     #   wandb
 pydantic-core==2.23.4
     # via pydantic
@@ -269,6 +275,7 @@ requests==2.31.0
     #   datasets
     #   evaluate
     #   huggingface-hub
+    #   spaces
     #   transformers
     #   wandb
 rich==13.9.3
@@ -281,6 +288,7 @@ rpds-py==0.22.3
     #   referencing
 ruff==0.8.2
     # via gradio
+safehttpx==0.1.6
 safetensors==0.4.5
     # via
     #   accelerate
@@ -309,6 +317,7 @@ sniffio==1.3.1
     #   anyio
     #   httpx
     #   openai
+spaces==0.31.0
 starlette==0.41.2
     # via fastapi
 sympy==1.13.1
@@ -353,6 +362,7 @@ typing-extensions==4.12.2
     #   pydantic
     #   pydantic-core
     #   rich
+    #   spaces
     #   torch
     #   typeguard
     #   typer
diff --git a/src/app/README.md b/src/app/README.md
new file mode 100644
index 0000000..47099eb
--- /dev/null
+++ b/src/app/README.md
@@ -0,0 +1,14 @@
+---
+title: Gemma 2 9B IT
+emoji: 😻
+colorFrom: indigo
+colorTo: pink
+sdk: gradio
+sdk_version: 5.8.0
+python_version: 3.10
+app_file: app.py
+pinned: false
+short_description: Chatbot
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/src/app/app.py b/src/app/app.py
new file mode 100644
index 0000000..df6975e
--- /dev/null
+++ b/src/app/app.py
@@ -0,0 +1,162 @@
+"""Chat interface demo for the Google Gemma 2 9B IT model.
+
+Cloned and adapted from the demo: https://huggingface.co/spaces/huggingface-projects/gemma-2-9b-it/tree/main/app.py
+"""
+
+import os
+from threading import Thread
+from typing import Iterator
+
+import gradio as gr
+import spaces
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+
+DESCRIPTION = """
+This is a demo for the Google Gemma 2 9B IT model.
+Use it to generate mnemonics for English words you want to learn and remember.
+Input your instructions or start with one of the examples provided. The input supports a subset of Markdown formatting, such as bold, italics, code, and tables.
+"""
+
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+model_id = "google/gemma-2-9b-it"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+)
+model.config.sliding_window = 4096
+model.eval()
+
+
+@spaces.GPU(duration=90)
+def generate(
+    message: str,
+    chat_history: list[dict],
+    max_new_tokens: int = 1024,
+    temperature: float = 0.6,
+    top_p: float = 0.9,
+    top_k: int = 50,
+    repetition_penalty: float = 1.2,
+) -> Iterator[str]:
+    """Generate a response to a message using the model.
+
+    Args:
+        message: The message to respond to.
+        chat_history: The conversation history.
+        max_new_tokens: The maximum number of tokens to generate.
+        temperature: The temperature for sampling.
+        top_p: The top-p value for nucleus sampling.
+        top_k: The top-k value for sampling.
+        repetition_penalty: The repetition penalty.
+
+    Yields:
+        Iterator[str]: The generated response.
+    """
+    conversation = chat_history.copy()
+    conversation.append({"role": "user", "content": message})
+
+    input_ids = tokenizer.apply_chat_template(
+        conversation, add_generation_prompt=True, return_tensors="pt"
+    )
+    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+        gr.Warning(
+            f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens."
+        )
+    input_ids = input_ids.to(model.device)
+
+    streamer = TextIteratorStreamer(
+        tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True
+    )
+    generate_kwargs = dict(
+        {"input_ids": input_ids},
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        num_beams=1,
+        repetition_penalty=repetition_penalty,
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
+
+
+chat_interface = gr.ChatInterface(
+    fn=generate,
+    additional_inputs=[
+        gr.Slider(
+            label="Max new tokens",
+            minimum=1,
+            maximum=MAX_MAX_NEW_TOKENS,
+            step=1,
+            value=DEFAULT_MAX_NEW_TOKENS,
+        ),
+        gr.Slider(
+            label="Temperature",
+            minimum=0.1,
+            maximum=4.0,
+            step=0.1,
+            value=0.6,
+        ),
+        gr.Slider(
+            label="Top-p (nucleus sampling)",
+            minimum=0.05,
+            maximum=1.0,
+            step=0.05,
+            value=0.9,
+        ),
+        gr.Slider(
+            label="Top-k",
+            minimum=1,
+            maximum=1000,
+            step=1,
+            value=50,
+        ),
+        gr.Slider(
+            label="Repetition penalty",
+            minimum=1.0,
+            maximum=2.0,
+            step=0.05,
+            value=1.2,
+        ),
+    ],
+    stop_btn=True,
+    examples=[
+        [
+            "Produce a cue to help me learn and retrieve the meaning of this word whenever I look at it (and nothing else): preposterous"
+        ],
+        [
+            "Create a cue that elicits a vivid mental image for the word 'observient' so I can remember its meaning."
+        ],
+        [
+            "I need a mnemonic for 'dilapidated' to learn its meaning and contextual usage."
+        ],
+        [
+            "Help me remember the meaning of 'encapsulate' by connecting it to its etymology or related words."
+        ],
+    ],
+    cache_examples=False,
+    type="messages",
+)
+
+with gr.Blocks(css_paths="style.css", fill_height=True) as demo:
+    gr.Markdown(DESCRIPTION)
+    chat_interface.render()
+    gr.ClearButton(elem_id="clear-button")
+
+
+if __name__ == "__main__":
+    demo.queue(max_size=20).launch()
diff --git a/src/app/app2.py b/src/app/app2.py
new file mode 100644
index 0000000..0d67a78
--- /dev/null
+++ b/src/app/app2.py
@@ -0,0 +1,70 @@
+"""Gradio interface for generating mnemonics from instructions.
+
+TODO: Combine this interface with the chatbot interface in app.py.
+"""
+
+import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_name = "chiffonng/gemma2-9b-it-mnemonics"
+
+# Load model and tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name)
+
+
+def generate_text(instruction: str) -> str:
+    """Generate mnemonic from user input/instruction.
+
+    Args:
+        instruction (str): User instructions to generate mnemonic.
+
+    Returns:
+        str: Generated mnemonic text.
+    """
+    inputs = tokenizer.encode(instruction, return_tensors="pt")
+    outputs = model.generate(inputs, max_length=256)
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+
+# Create simple Gradio interface
+demo = gr.Interface(
+    fn=generate_text,
+    inputs=gr.Textbox(label="Instruction"),
+    outputs=gr.Textbox(label="Output"),
+    title="Mnemonic Generation",
+    description="Enter an instruction to generate mnemonic text.",
+)
+
+
+def chatbot_response(message: str, history: list) -> str:
+    """Generate a response from the chatbot based on the input message.
+
+    Args:
+        message (str): The input message from the user.
+        history (list): The conversation history, managed and passed in by Gradio.
+
+    Returns:
+        str: The model's response to the message.
+ """ + inputs = tokenizer.encode(message, return_tensors="pt") + outputs = model.generate(inputs, max_length=100) + response = tokenizer.decode(outputs[0], skip_special_tokens=True) + history.append((message, response)) + return history + + +# Create Gradio ChatInterface +chatbot = gr.ChatInterface( + fn=chatbot_response, + title="Mnemonic Generation Chatbot", + description="Chat with the model to generate mnemonics.", + retry_btn=True, + undo_btn=True, + clear_btn=True, +) + + +# Launch the interface +demo.launch() +# chatbot.launch() diff --git a/src/data/data_loaders.py b/src/data/data_loaders.py index 18d71b7..a2bd99f 100644 --- a/src/data/data_loaders.py +++ b/src/data/data_loaders.py @@ -6,6 +6,8 @@ from datasets import ClassLabel, DatasetDict, load_dataset if TYPE_CHECKING: + from typing import Optional + from datasets import Dataset import utils.constants as c @@ -59,7 +61,7 @@ def load_local_dataset(file_path: PathLike, **kwargs) -> "Dataset": def load_hf_dataset( - repo_id: str = None, + repo_id: Optional[str] = None, to_csv: bool = False, file_path: PathLike = None, **kwargs, @@ -78,7 +80,7 @@ def load_hf_dataset( login_hf_hub() if repo_id is None: - repo_id = c.HF_DATASET_REPO + repo_id = c.HF_DATASET_NAME logger.info(f"Loading dataset from {repo_id}.") dataset = load_dataset(repo_id, **kwargs) diff --git a/src/data/mnemonic_processing.py b/src/data/mnemonic_processing.py index 5706a97..46253cb 100644 --- a/src/data/mnemonic_processing.py +++ b/src/data/mnemonic_processing.py @@ -2,6 +2,7 @@ import logging from pathlib import Path +from typing import TYPE_CHECKING, no_type_check from warnings import warn import pandas as pd @@ -21,6 +22,10 @@ from typing_extensions import Annotated from yaml import safe_load +if TYPE_CHECKING: + from openai import Response + +from utils.aliases import PathLike from utils.constants import ( CLASSIFIED_DATASET_CSV, CLASSIFIED_DATASET_PARQUET, @@ -46,7 +51,7 @@ client = OpenAI() # Load config and prompts -with Path.open("config/classify_mnemonics.yaml", "r") as f: +with Path("config/classify_mnemonics.yaml").open("r") as f: classification_conf = safe_load(f) # dict of config batch_size = classification_conf["batch_size"] @@ -66,20 +71,20 @@ class ClassificationSchema(BaseModel): classifications: list[ValidClassification] -def combine_key_value(path: str) -> list[str]: +def combine_key_value(path: PathLike) -> list[str]: """Load 2-column data from a file, to format: key: value. Args: - path (str): The path to the file containing the 2-column data. + path (PathLike): The path to the file containing the 2-column data. Returns: combined_col (list[str]): The combined key and value columns. """ - path = check_file_path(path, extensions=[PARQUET_EXT, CSV_EXT]) + path_obj: Path = check_file_path(path, extensions=[PARQUET_EXT, CSV_EXT]) - if path.suffix == PARQUET_EXT: + if path_obj.suffix == PARQUET_EXT: df = pd.read_parquet(path, engine="pyarrow") - elif path.suffix == CSV_EXT: + elif path_obj.suffix == CSV_EXT: df = pd.read_csv(path, header="infer", quotechar='"') logger.info(f"Read {df.shape[0]} rows from {str(path)}.") @@ -137,7 +142,7 @@ def create_batches(data: list[str], batch_size=batch_size) -> list[str]: before=before_log(logger, logging.WARNING), after=after_log(logger, logging.WARNING), ) -def classify_mnemonics_api(batches: list[str]): +def classify_mnemonics_api(batches: str | list[str]): """Classify mnemonics using OpenAI's API, GPT-4o mini and return the responses as JSON array of numbers. Retry up to 3 times if rate limited. 
     Args:
@@ -182,7 +184,7 @@ def get_structured_response(
     batch: str,
     model_config: dict,
     response_format: BaseModel = ClassificationSchema,
-):
+) -> Any:
     """Get response from OpenAI API. Documentation: https://platform.openai.com/docs/guides/structured-outputs/how-to-use.
 
     Args:
@@ -192,7 +194,7 @@ def get_structured_response(
         response_format (BaseModel, optional): The response format. Defaults to ClassificationSchema.
 
     Returns:
-        structure_msg (message object from OpenAI's Response object): A structured message object.
+        structure_msg (Any): The structured message object parsed from the OpenAI response.
     """
     try:
         structure_msg = (
@@ -202,7 +204,7 @@ def get_structured_response(
                     {"role": "system", "content": model_config["prompts"]["system"]},
                     {
                         "role": "user",
-                        "content": f"{model_config["prompts"]["user"]}{batch}",
+                        "content": model_config["prompts"]["user"] + batch,
                     },
                 ],
                 max_tokens=batch_size * 3 + 1,  # 3 tokens per mnemonic
@@ -230,18 +232,21 @@ def get_structured_response(
         raise e
 
 
+@no_type_check
 def parse_structured_response(
-    structure_msg: object, batch: str, batch_index: int
-) -> list[int | str]:
+    structure_msg: object,
+    batch: str,
+    batch_index: int,
+) -> list[int]:
     """Parse the structured message from OpenAI's API.
 
     Args:
-        structure_msg (message object from OpenAI's Response object): A structured message object.
+        structure_msg: The structured message object parsed from the OpenAI response.
         batch (str): The batch of mnemonics.
         batch_index (int): The index of the batch.
 
     Returns:
-        classification_batch_i (list[int|str]): The list of parsed categories.
+        list[int]: The list of parsed categories.
     """
     try:
         if structure_msg.parsed:
@@ -286,14 +291,14 @@
 def save_structured_outputs(
-    outputs: list[ValidClassification], input_path: str | Path, output_path: str | Path
+    outputs: list[ValidClassification], input_path: PathLike, output_path: PathLike
 ):
     """Save the classification results to an existing file of mnemonics.
 
     Args:
         outputs (list[ValidClassification]): The list of parsed categories.
-        input_path (str | Path): The path to the file containing the mnemonics.
-        output_path (str | Path): The path to .csv or .parquet file to write the parsed.
+        input_path (PathLike): The path to the file containing the mnemonics.
+        output_path (PathLike): The path to the .csv or .parquet file to write the parsed results to.
 
     Raises:
         ValueError: If the output file is not in parquet or csv format.
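For reference, `get_structured_response` above wraps the OpenAI SDK's Pydantic-backed structured-outputs helper. The sketch below shows that pattern in isolation; the prompts, model name, and single-item batch are placeholders rather than the project's YAML-driven config, and the schema is simplified to plain ints.

```python
# Hedged sketch of the structured-outputs pattern; not the project's code.
from openai import OpenAI
from pydantic import BaseModel


class ClassificationSchema(BaseModel):
    """Simplified schema: one category per mnemonic in the batch."""

    classifications: list[int]


client = OpenAI()
completion = client.beta.chat.completions.parse(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "Classify each mnemonic from 0 to 3."},
        {"role": "user", "content": "1. preposterous: 'pre' and 'post' mixed up"},
    ],
    response_format=ClassificationSchema,
)
structure_msg = completion.choices[0].message
if structure_msg.parsed:  # output validated against the Pydantic schema
    print(structure_msg.parsed.classifications)
elif structure_msg.refusal:  # the model declined to answer
    print(structure_msg.refusal)
```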
diff --git a/src/utils/constants.py b/src/utils/constants.py
index 114a900..5993da8 100644
--- a/src/utils/constants.py
+++ b/src/utils/constants.py
@@ -24,8 +24,9 @@
 CATEGORY_NAMES = ["unsure", "shallow-encoding", "deep-encoding", "mixed"]
 CATEGORY_DICT = {name: i for i, name in enumerate(CATEGORY_NAMES)}
 
-# Hugging Face datasets
-HF_DATASET_REPO = "chiffonng/mnemonic-sft"  # <user>/<dataset>
+# Hugging Face collection
+HF_DATASET_NAME = "chiffonng/en-vocab-mnemonics"  # <user>/<dataset>
+HF_MODEL_NAME = "chiffonng/gemma2-9b-it-mnemonics"  # <user>/<model>
 
 # Model paths
-CHECKPOINT_DIR = "ckpt"
+OUTPUT_DIR = "output"
diff --git a/src/utils/error_handling.py b/src/utils/error_handling.py
index 3fc1761..458366d 100644
--- a/src/utils/error_handling.py
+++ b/src/utils/error_handling.py
@@ -2,7 +2,7 @@
 
 from enum import Enum
 from pathlib import Path
-from typing import TypeAlias
+from typing import Optional, TypeAlias
 from warnings import warn
 
 from utils.aliases import ExtensionsType, PathLike
@@ -51,7 +51,7 @@ def validate_and_normalize_extensions(extensions: ExtensionsType) -> list[str]:
     return extensions
 
 
-def check_extension(path: Path, extensions: list[str]):
+def check_extension(path: Path, extensions: ExtensionsType) -> None:
     """Check if the path has one of the allowed extensions."""
     if extensions and path.suffix not in extensions:
         raise ValueError(
@@ -92,7 +92,7 @@ def check_file_path(
 def check_dir_path(
     dir_path: PathLike,
     new_ok: bool = False,
-    extensions: list[str] = None,
+    extensions: Optional[list[str]] = None,
 ) -> Path | list[Path]:
     """Check if the directory path exists, convert it to a Path object if it is a string, and return it. Optionally, check if the directory contains files with the specified extensions.
@@ -129,21 +129,21 @@
 def which_file_exists(
-    *files: list[Path] | list[str], extensions: ExtensionsType = None
+    *files: PathLike, extensions: Optional[ExtensionsType] = None
 ) -> Path:
     """Return the first file found in the list of files. Optionally, return the first file with the specified extensions.
 
     Args:
-        files (list[Path] | list[str]): The list of files to check.
+        *files (PathLike): The files to check.
         extensions (list[str], optional): A list of allowed file extensions. Defaults to [].
 
     Returns:
-        file (Path): The first file found in the list.
+        file_path (Path): The first file found in the list.
     """
     for file in files:
-        file = check_file_path(file, new_ok=True, extensions=extensions)
-        if file.exists():
-            return file
+        file_path: Path = check_file_path(file, new_ok=True, extensions=extensions)
+        if file_path.exists():
+            return file_path
 
     raise FileNotFoundError(
         f"None of the specified files were found: {[str(p) for p in files]}."
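A quick usage sketch for the retyped helper: `which_file_exists` takes candidate paths as varargs and returns the first that exists. The snippet below is illustrative only; it assumes `CLASSIFIED_DATASET_PARQUET` and `CLASSIFIED_DATASET_CSV` (imported by `mnemonic_processing.py` above) are path-like constants, which this diff does not show.

```python
# Illustrative only: resolve whichever classified dataset file exists first.
from pathlib import Path

from utils.constants import CLASSIFIED_DATASET_CSV, CLASSIFIED_DATASET_PARQUET
from utils.error_handling import which_file_exists

dataset_path: Path = which_file_exists(
    CLASSIFIED_DATASET_PARQUET,  # preferred format, checked first
    CLASSIFIED_DATASET_CSV,      # fallback
    extensions=[".parquet", ".csv"],
)
print(f"Using dataset at {dataset_path}")
```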
diff --git a/uv.lock b/uv.lock index af1e6e4..ea03586 100644 --- a/uv.lock +++ b/uv.lock @@ -1006,6 +1006,43 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/95/04/ff642e65ad6b90db43e668d70ffb6736436c7ce41fcc549f4e9472234127/h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761", size = 58259 }, ] +[[package]] +name = "hf-transfer" +version = "0.1.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/0e/ba51e31148f0a9bc8d44878086535c2dc6d9a8dce321250e9bcdd3c110ea/hf_transfer-0.1.8.tar.gz", hash = "sha256:26d229468152e7a3ec12664cac86b8c2800695fd85f9c9a96677a775cc04f0b3", size = 23595 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4f/eb/469e68c4259c4f4ad8e00967ad2f72ff1ba5e2712b4e1093e3e03c5cbc3d/hf_transfer-0.1.8-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:70858f9e94286738ed300484a45beb5cfee6a7ddac4c5886f9c6fce7823ac5ab", size = 1422386 }, + { url = "https://files.pythonhosted.org/packages/bd/3d/5e8966b47aa86cd50f2017c76c2634aa09a437224567f379bc28d6580d7c/hf_transfer-0.1.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:38adc73f0a8526319d90f7cc5dc2d5e4bb66f487a513d94b98aa6725be732e4a", size = 1406027 }, + { url = "https://files.pythonhosted.org/packages/61/e0/fd5f849ed7b2bf9b2bb008f3df3ee5a8773ca98362302833708cce26c337/hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:44d2f0c08198d8d899fe9d66e86aee2dd844bd7ce33888f261373fcec81d2a54", size = 3781136 }, + { url = "https://files.pythonhosted.org/packages/d5/e9/fad10fb8b04c91cb8775b850f2bc578a1fb6168e2ab2b04ebb8525466159/hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1de2a4ef36f9e60b3d3bec00193c0aafd75771709f2ca51b9b162373f5af3d32", size = 3099910 }, + { url = "https://files.pythonhosted.org/packages/8c/ae/8a608949a87280ed14f0f5e0adbeccab54a7ea3d3aabdf77ec38544dd44f/hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e319269e3606a5ff2979296841766649ac73598a4a8eee2a968f86c8071fea5a", size = 3589277 }, + { url = "https://files.pythonhosted.org/packages/81/ca/855ea35c9f997b500acd1baf6d6920ead00a0b7a8fccdcac74fe7e4f66d9/hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0f6026cf3be6a53ea42f92172f60c1c0675baaa9073f865e671b661dde5fd157", size = 3409983 }, + { url = "https://files.pythonhosted.org/packages/5e/89/863f333b49603cc8d3c8862a428cc8fbaa9388ac8f076e9fa5ef3e729c3c/hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f865c33ada5bd3650c2b46e59979f2d7755c3f517f8d0facc78576a0c7d26406", size = 3562732 }, + { url = "https://files.pythonhosted.org/packages/95/93/8137b83bd4ca6b1b4dab36e42af8c19d62c98ff8837306429547a92cbde0/hf_transfer-0.1.8-cp310-none-win32.whl", hash = "sha256:2054730e8d8ed21917c64be7199e06424b2bd08df1c43a72766afaed7992f2d3", size = 1129924 }, + { url = "https://files.pythonhosted.org/packages/da/36/7583964f7cb0671071488f358dd388a8ef21f3a9bfe2e3596dac199010fc/hf_transfer-0.1.8-cp310-none-win_amd64.whl", hash = "sha256:2b4f1a9446ba31170b5b1eca4e916504d18378a6b5fe959896bdac8a736a5ecb", size = 1209808 }, + { url = "https://files.pythonhosted.org/packages/72/94/d1c3d383536051f61a5d1d50bbc848a5c165d67d94bde0286ea343d5e00a/hf_transfer-0.1.8-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:e27c15fcc5869ad7e52bbc0bdec6106b288d1c463f8d2da92f28615a3b181361", size = 
1422132 }, + { url = "https://files.pythonhosted.org/packages/a0/a0/d10411151752499381052dbaf99fcbaefa8aaa3b5912b0535eea92d4699c/hf_transfer-0.1.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:871a0032d011ebc6409a73a8406b98b84ff2cd3ed7d9e1af8cdf4d660b9fab9b", size = 1405922 }, + { url = "https://files.pythonhosted.org/packages/85/df/70543e805988b8a1085830e7f5ca290cc7a72c869b4ac2be1a4b619435aa/hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:686fa756e1e0214bb6327d33c66732c52274d94a8460beb50604ad988b391cf6", size = 3780881 }, + { url = "https://files.pythonhosted.org/packages/93/c9/6920e63df88b2acaa3a4b0b616edca476ef8525d38d6f71437c0c9992b5d/hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:36a03b1b2911b0cf15b1b9d971a34b32dadcc4f2fd979aaff5979d6ce4017c34", size = 3099659 }, + { url = "https://files.pythonhosted.org/packages/7d/b0/f2a85771491de8f887e71ba8769d9fa15c53cadf4c0959954735f5f6e71b/hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:079db90c81f41f4cf3227dfaaa855a9b8e9aef45bc7c2be29ce7232cd83ff881", size = 3588878 }, + { url = "https://files.pythonhosted.org/packages/d8/36/cf7bd093988bdb530abbbfddd4cac80e3ccee4d80454af24fc0913bf2033/hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac08a4524127fdd14c234d4bcbe49d1c498acf5335c781714823179bcc8dc039", size = 3409342 }, + { url = "https://files.pythonhosted.org/packages/30/61/b38643f305e1f0f76c8894cec38d5d39d0d6265a75cc9de0a94917ddff3d/hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:837432e73cb17274a6782b6216e8ce058aa325a475dc44a5a6a753d48b86d18a", size = 3562382 }, + { url = "https://files.pythonhosted.org/packages/cd/66/723bc1eeca445a1ce5cf72026f45f8a7ae656a1e47fce026cca92e31dbd5/hf_transfer-0.1.8-cp311-none-win32.whl", hash = "sha256:b180f9823dde35aba9bc0f1d0c04ac8a873baebd3732a7ffe4f11940abc7df0d", size = 1129916 }, + { url = "https://files.pythonhosted.org/packages/dd/7e/139527d276416bdeb08546cdcbd6f3e02326f3a6a6c2f00c71300a709e71/hf_transfer-0.1.8-cp311-none-win_amd64.whl", hash = "sha256:37907d2135cebcf8b6d419bb575148d89c224f16b69357f027bd29d0e85c6529", size = 1209794 }, + { url = "https://files.pythonhosted.org/packages/5b/d6/54c9ea16c782cb79cdae78500c0a4bc7474236f94537ee954771e6e86c8c/hf_transfer-0.1.8-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:baf948f4f493949309cbe60529620b9b0aef854a22b6e526753364acc57c09b6", size = 1424195 }, + { url = "https://files.pythonhosted.org/packages/63/57/09e2aa7fa63bc640d9c3fda2cc724744b46227d239bb4ae9bf33efc338c2/hf_transfer-0.1.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0bce5c8bdefa478c5d5eaa646cc4ce1df5cfe764d98572ad0c6b8773e98d49f6", size = 1408105 }, + { url = "https://files.pythonhosted.org/packages/19/72/f247f9632410d8b9655332b2007924557c293094ea91648336f49403afe7/hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54d6f8a1a86128d651a3799e1267c343d60f81f2c565d7c5416eb8e674e4cf0e", size = 3782066 }, + { url = "https://files.pythonhosted.org/packages/d0/cf/8eccb6fcff8eedd79334ffaf65c44109e8bece1ecc232c1036de697d51fa/hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f79fd1b0c2ed93efb4c5f684118d7a762ecdd218e170df8208c4e13d3dcd4959", size = 3103992 }, + { url = 
"https://files.pythonhosted.org/packages/23/e8/f5d4ef6febc9ece1099e1f8de64f05f4d9f5b62461c4e54aac324a94d1ab/hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:414df35692670683bf5623498ef9d88a8df5d77e9516515da6e2b34d1054c11f", size = 3590083 }, + { url = "https://files.pythonhosted.org/packages/aa/de/cd8b36ecfd1c40119f307cb0dfd4ca5cd437beb8c92219d52a4253e0059a/hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c9798d5f951f66b96d40a7a53910260cb5874fda56cf5944dddb7c571f37ec3", size = 3406261 }, + { url = "https://files.pythonhosted.org/packages/37/7f/914b684779dae9d2db4cdb6efa50426da7411754d820b8ddc9c10eef5042/hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:060c661691f85a61392e57579c80eb64b5ee277434e81fb582f605c1c8ff05d5", size = 3560705 }, + { url = "https://files.pythonhosted.org/packages/de/17/e9ff11be0ab52d113091462f65fa280bd5c04c80e5b1dadb7f8de9645848/hf_transfer-0.1.8-cp312-none-win32.whl", hash = "sha256:f7840e32379820c3e1571a480238e05ea043e970c99d2e999578004a2eb17788", size = 1130448 }, + { url = "https://files.pythonhosted.org/packages/58/60/04c18bbeb46cc2dc6fd237323c03f2e4c700bca122f28567dbb344ff5bab/hf_transfer-0.1.8-cp312-none-win_amd64.whl", hash = "sha256:9a3204ec423cc5e659872e8179f8704ad9ce2abb1e6a991f8838aedf1dc07830", size = 1206317 }, + { url = "https://files.pythonhosted.org/packages/ae/e1/647dbd310042c11638ef330060777084f3394a82adc8274624b0f0601198/hf_transfer-0.1.8-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:928ff036c3e98e10dcfbdb4fcdfc4592d37a5cc8e365a7ba8dfd4337e849d675", size = 3591149 }, + { url = "https://files.pythonhosted.org/packages/13/c4/aaf060b26e720a7b4cb90d7f02dc18a56b18894cbd72fb610f75b11fb9dc/hf_transfer-0.1.8-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d49ba3ce67035f460ae1924fe2feafec155cb535eec7f31ed5109c19064cd294", size = 3564510 }, +] + [[package]] name = "httpcore" version = "1.0.6" @@ -1466,10 +1503,12 @@ name = "mnemonic-gen" version = "0.2.0" source = { virtual = "." 
} dependencies = [ + { name = "accelerate" }, { name = "datasets" }, { name = "evaluate" }, { name = "gradio", version = "4.26.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" }, { name = "gradio", version = "5.4.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, + { name = "hf-transfer" }, { name = "numpy" }, { name = "openai" }, { name = "peft" }, @@ -1477,6 +1516,7 @@ dependencies = [ { name = "python-dotenv" }, { name = "pyyaml" }, { name = "ruff" }, + { name = "spaces" }, { name = "tenacity" }, { name = "torch" }, { name = "tqdm" }, @@ -1495,9 +1535,11 @@ dev = [ [package.metadata] requires-dist = [ + { name = "accelerate", specifier = ">=1.0.1" }, { name = "datasets" }, { name = "evaluate" }, { name = "gradio", specifier = ">=4.26.0" }, + { name = "hf-transfer", specifier = ">=0.1.8" }, { name = "numpy", specifier = "<2.0.0" }, { name = "openai", specifier = ">=1.57.0" }, { name = "peft" }, @@ -1505,6 +1547,7 @@ requires-dist = [ { name = "python-dotenv", specifier = ">=1.0.1" }, { name = "pyyaml", specifier = ">=6.0.2" }, { name = "ruff", specifier = ">=0.7.1" }, + { name = "spaces", specifier = ">=0.31.0" }, { name = "tenacity", specifier = ">=9.0.0" }, { name = "torch", specifier = ">=2.5.1" }, { name = "tqdm", specifier = ">=4.67.1" }, @@ -2943,6 +2986,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235 }, ] +[[package]] +name = "spaces" +version = "0.31.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "gradio", version = "4.26.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" }, + { name = "gradio", version = "5.4.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, + { name = "httpx" }, + { name = "packaging" }, + { name = "psutil" }, + { name = "pydantic" }, + { name = "requests" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b9/57/73a23d582c7ba2f53a9394d7efa482144f026f549035ba2b70e3cb085e47/spaces-0.31.0.tar.gz", hash = "sha256:28c8ceee2437231e9279eedc057a13870432903c3ee7fcfb57b636bf34db2278", size = 21759 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ad/b4/9f3c2ba72d25bd54aa2f41d13ff840961d1c7cd3ab8aa3b82be933e1a87d/spaces-0.31.0-py3-none-any.whl", hash = "sha256:e0acf655d3a0209a3cec73b83ef1510a0b10b682cfb0b720a68d3ca1bb0f84e2", size = 28415 }, +] + [[package]] name = "stack-data" version = "0.6.3"
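One note on the new `hf-transfer` dependency locked above: it is opt-in, and `huggingface_hub` only uses the Rust transfer backend when the corresponding environment variable is set before any download starts. A minimal sketch, using the `HF_MODEL_NAME` repo from `src/utils/constants.py`:

```python
# Hedged sketch: enable hf_transfer before huggingface_hub performs any
# transfer; without this environment variable the dependency is unused.
import os

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import snapshot_download

# HF_MODEL_NAME from src/utils/constants.py
snapshot_download(repo_id="chiffonng/gemma2-9b-it-mnemonics")
```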