diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..2ac58cd --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,23 @@ +name: Tests + +on: + pull_request: + branches: [main] + push: + branches: [main] + +jobs: + test: + runs-on: self-hosted # Requires a self-hosted runner with a GPU and model weights. + # Register one at: Settings -> Actions -> Runners -> New self-hosted runner. + + steps: + - uses: actions/checkout@v4 + + - name: Install package + run: uv sync --extra dev --extra openai --extra microsoft + + - name: Run tests + run: uv run pytest -v + env: + LLM_MODEL_CACHE: ${{ secrets.LLM_MODEL_CACHE }} diff --git a/README.md b/README.md index 197807e..d4e0583 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ uv sync ...with optional libraries: ``` -uv sync --extra +uv sync --extra --extra ``` ### ...with pip: diff --git a/pyproject.toml b/pyproject.toml index f7455e5..a49c4a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "llm" # The pip install . -version = "0.3.0" +version = "0.4.0" description = "Library for easy use of LLMs." 
readme = "README.md" authors = [ @@ -15,6 +15,7 @@ dependencies = [ "torchvision", "sentence-transformers>=5.2.2", "llm-conversation", + "pytest>=8.0", ] [project.optional-dependencies] @@ -49,6 +50,9 @@ torch = { index = "pytorch-cu130" } torchvision = { index = "pytorch-cu130" } llm-conversation = { git = "https://github.com/EricApgar/llm-conversation", rev = "v0.2.0" } +[tool.pytest.ini_options] +testpaths = ["tests"] + [build-system] requires = ["uv_build>=0.9.7,<0.10.0"] build-backend = "uv_build" diff --git a/src/llm/models/gpt_oss_20b.py b/src/llm/models/gpt_oss_20b.py index b349da5..4141c36 100644 --- a/src/llm/models/gpt_oss_20b.py +++ b/src/llm/models/gpt_oss_20b.py @@ -1,15 +1,6 @@ -import torch -from transformers import AutoTokenizer, AutoModelForCausalLM -from openai_harmony import ( - Conversation as HarmonyConversation, - RenderConversationConfig, - load_harmony_encoding, - HarmonyEncodingName, - DeveloperContent, - ReasoningEffort, - SystemContent, - Message, - Role) +import os + +from transformers import pipeline from llm.models.template import Template from llm_conversation import Conversation @@ -21,7 +12,7 @@ def __init__(self, hf_token: str=None): super().__init__(hf_token=hf_token) self.name = 'openai/gpt-oss-20b' - self.tokenizer = None + self.model: pipeline = None def load(self, @@ -31,6 +22,9 @@ def load(self, quantization: str=None, device: str=None): + if (not remote) and (not os.path.isdir(location)): + raise ValueError(f'Nonexistant location ({location}) - fix or set remote=True.') + self.location = location self.remote = remote self.commit = commit @@ -38,161 +32,87 @@ def load(self, self._set_device(device=device) - self.model = AutoModelForCausalLM.from_pretrained( - pretrained_model_name_or_path=self.name, + model_kwargs = { + 'cache_dir': self.location, + 'local_files_only': not self.remote} + + self.model = pipeline( + task='text-generation', + model=self.name, + dtype='auto', + device_map=self.device, token=self.hf_token, - 
def ask(self,
        prompt: 'str | Conversation',
        max_tokens: int=512,
        temperature: float=0.9,
        reasoning_level: str=None) -> str:
    '''
    Ask the LLM a query and return only its final answer text.

    prompt: plain text or a Conversation. It is passed to
        self._format_prompt() which builds the chat messages.
    max_tokens: maximum number of newly generated tokens.
    temperature: sampling temperature. 0 turns sampling off (greedy).
    reasoning_level: optional reasoning hint forwarded to _format_prompt().

    Returns the text found after the "assistantfinal" marker, stripped of
    surrounding whitespace.

    Raises ValueError if the model has not been loaded, or if the
    generated text does not contain the "assistantfinal" marker.
    '''

    # Same guard (and message) the other models use; without it an unloaded
    # model fails with "'NoneType' object is not callable".
    if self.model is None:
        raise ValueError('Must load model before using! (see model.load())')

    formatted_messages = self._format_prompt(prompt=prompt, reasoning_level=reasoning_level)

    if temperature == 0:
        sampling_kwargs = {'do_sample': False}
    else:
        # Be explicit about sampling so the temperature is honoured no matter
        # what the checkpoint's default generation config says.
        sampling_kwargs = {'do_sample': True, 'temperature': temperature}

    model_output = self.model(
        formatted_messages,
        max_new_tokens=max_tokens,
        **sampling_kwargs)

    # Chat-style output: the last message of the first result is the reply.
    full_text_response = model_output[0]['generated_text'][-1]['content']

    # Everything before the first marker is treated as reasoning and dropped.
    _, marker, answer = full_text_response.partition('assistantfinal')
    if not marker:
        raise ValueError(f'Mangled LLM output. Could not find expected end marker "assistantfinal" in generated text: {full_text_response}')

    return answer.strip()
- if conversation.context: - context_block = '\n'.join([ - "BACKGROUND CONTEXT (not part of the dialogue):", - '\n'.join(conversation.context), - "END BACKGROUND CONTEXT"]) - - msgs.append(Message.from_role_and_content(Role.USER, context_block)) - - # Conversation history between user and AI. - for turn in conversation.history: - if turn.role == "user": - msgs.append(Message.from_role_and_content(Role.USER, turn.text)) - else: - msgs.append(Message.from_role_and_content(Role.ASSISTANT, turn.text)) + if isinstance(prompt, str): + convo = Conversation() + convo.add_response(role='user', text=prompt) + else: + convo = prompt - harmony_convo = HarmonyConversation.from_messages(msgs) + system_pieces = [] + formatted_messages = [] - return harmony_convo + if reasoning_level: + system_pieces.append(f'Reasoning level: {reasoning_level}.') + if convo.overall_prompt: + system_pieces.append(convo.overall_prompt) -def get_good_token_start(token_list: list[str]): - ''' - This is a patch for handling bad generated tokens which break - the openai-harmony prompt formatter. + if convo.context: + for context in convo.context: + system_pieces.append(context) - It finds the start of rational thought in the generated output, skipping - over the nonsense content that's generated. - ''' - for i in range(len(token_list) - 1): - if token_list[i] == "" and token_list[i + 1] == "analysis": - return i + if system_pieces: # Merge background context pieces. 
+ formatted_messages.append({'role': 'system', 'content': ' '.join(system_pieces)}) - raise ValueError("Could not find ['', 'analysis'] in the list") + if convo.history: + for response in convo.history: + formatted_messages.append({'role': response.role, 'content': response.text}) - return + return formatted_messages if __name__ == '__main__': diff --git a/src/llm/models/gpt_oss_20b_dev.py b/src/llm/models/gpt_oss_20b_dev.py new file mode 100644 index 0000000..2855955 --- /dev/null +++ b/src/llm/models/gpt_oss_20b_dev.py @@ -0,0 +1,213 @@ +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM +from openai_harmony import ( + Conversation as HarmonyConversation, + RenderConversationConfig, + load_harmony_encoding, + HarmonyEncodingName, + DeveloperContent, + ReasoningEffort, + SystemContent, + Message, + Role) + +from llm.models.template import Template +from llm_conversation import Conversation + + +class GptOss20bDev(Template): + ''' + This development version of the GPT-OSS-20B model uses a lower level + method for generating tokens compared to the transformers.pipeline() + method. + ''' + + def __init__(self, hf_token: str=None): + super().__init__(hf_token=hf_token) + + self.name = 'openai/gpt-oss-20b' + self.tokenizer = None + + + def load(self, + location: str, + remote: bool=False, + commit: str=None, + device: str=None): + + self.location = location + self.remote = remote + self.commit = commit + + self._set_device(device=device) + + self.model = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=self.name, + token=self.hf_token, + cache_dir=self.location, + local_files_only=not self.remote, + revision=self.commit, + low_cpu_mem_usage=True, + device_map=self.device, + trust_remote_code=True, # self.remote, TODO + _attn_implementation='eager', + dtype='auto') # Might be obsolete. Change to "dtype"? 
def ask(self,
        prompt: 'str | Conversation',
        max_tokens: int=256,
        temperature: float=1.0,
        reasoning_level: str='low',
        repetition_penalty: float=1.15,
        top_p: float=0.95) -> str:
    '''
    Call an LLM with a prompt and generate a response.

    This model works best when the input is formatted into an
    openai-harmony conversation structure, so all inputs are converted
    into a generic Conversation structure (if not already one) and then
    converted into the harmony structure.

    prompt: plain text or a Conversation.
    max_tokens: maximum number of newly generated tokens.
    temperature: sampling temperature. 0 turns sampling off (greedy).
    reasoning_level: 'low', 'medium' or 'high'; passed to _to_harmony().
    repetition_penalty: forwarded to model.generate().
    top_p: nucleus sampling cutoff, only used when sampling.

    Returns the text of the first message on the "final" channel.

    Raises ValueError if the model is not loaded, if the generated tokens
    do not contain the expected start tokens (see get_good_token_start),
    or if no "final" channel message was generated.
    '''

    if not self.model:
        raise ValueError('Must load model before using! (see model.load())')

    encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)

    if isinstance(prompt, str):  # Create a structured conversation from input.
        convo = Conversation()
        convo.add_response(role='user', text=prompt)
    else:
        convo = prompt

    convo_harmony = self._to_harmony(conversation=convo, reasoning_level=reasoning_level)

    render_cfg = RenderConversationConfig(auto_drop_analysis=True)
    prefill_ids = encoding.render_conversation_for_completion(
        convo_harmony,
        Role.ASSISTANT,
        config=render_cfg)
    stop_token_ids = encoding.stop_tokens_for_assistant_actions()

    input_ids = torch.tensor([prefill_ids], device=self.model.device)
    attention_mask = torch.ones_like(input_ids)

    # Sampling with temperature 0 is rejected by generate(), so treat 0 as
    # "greedy decoding" the same way GptOss20b.ask() does.
    if temperature == 0:
        sampling_kwargs = {'do_sample': False}
    else:
        sampling_kwargs = {
            'do_sample': True,
            'temperature': temperature,
            'top_p': top_p}

    # Generate new tokens from the LLM.
    output_ids = self.model.generate(
        input_ids=input_ids,
        max_new_tokens=max_tokens,
        repetition_penalty=repetition_penalty,
        eos_token_id=stop_token_ids,
        attention_mask=attention_mask,
        pad_token_id=self.tokenizer.pad_token_id,
        **sampling_kwargs)

    # Keep only the newly generated tokens (drop the prompt prefix).
    generated_tokens = output_ids[0, input_ids.shape[-1]:].tolist()

    # Decode each token id on its own. This is a tangent we take only to
    # check whether the output is mangled and needs to be trimmed.
    text_tokens = self.tokenizer.batch_decode(
        generated_tokens,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False)

    # Patch for mangled harmony output: assume the real message is in the
    # generated tokens and discard everything before the first occurrence
    # of [..., '', 'analysis', ...].
    generated_tokens = generated_tokens[get_good_token_start(token_list=text_tokens):]

    # Parse tokens into harmony messages (includes model reasoning).
    full_response = encoding.parse_messages_from_completion_tokens(generated_tokens, role=Role.ASSISTANT)

    # Extract the actual response (sans reasoning). A missing "final" message
    # (e.g. generation stopped early) is reported as a ValueError instead of
    # leaking a bare StopIteration to the caller.
    final_response = next((m for m in full_response if m.channel == "final"), None)
    if final_response is None:
        raise ValueError('Mangled LLM output. Could not find a "final" channel message in the generated response (max_tokens may be too low).')

    return final_response.content[0].text
def get_good_token_start(token_list: list[str]) -> int:
    '''
    This is a patch for handling bad generated tokens which break
    the openai-harmony prompt formatter.

    It finds the start of rational thought in the generated output, skipping
    over the nonsense content that's generated.

    token_list: decoded text of each generated token, in order.

    Returns the index of the first empty-string token that is immediately
    followed by the token "analysis".
    NOTE(review): the empty string is presumably a special token blanked
    out by skip_special_tokens=True when decoding - confirm against caller.

    Raises ValueError if no such pair exists (including for lists with
    fewer than two tokens).
    '''
    # Walk adjacent (token, next token) pairs; zip stops one short of the end,
    # so the last token is never checked as a start.
    for i, (token, next_token) in enumerate(zip(token_list, token_list[1:])):
        if token == "" and next_token == "analysis":
            return i

    raise ValueError(f'Mangled LLM output. Could not find expected start tokens ["", "analysis"] in generated tokens: {token_list}')
def ask(self,
        prompt: 'str | Conversation',
        images: 'list[PillowImage.Image]'=None,
        max_tokens: int=1024,
        repetition_penalty: float=1.12,
        temperature: float=0.0) -> str:
    '''
    Ask the LLM a query, optionally about one or more images.

    prompt: plain text or a Conversation.
    images: optional list of images, forwarded to _structure_inputs().
    max_tokens: maximum number of newly generated tokens.
    repetition_penalty: forwarded to model.generate().
    temperature: sampling temperature. The default 0.0 keeps decoding
        greedy (do_sample=False); any other value turns sampling on.
        Accepted again so that callers written against Template.ask()
        (which takes a temperature) keep working.

    Returns the decoded text of the newly generated tokens.

    Raises ValueError if the model has not been loaded.
    '''

    if not self.model:
        raise ValueError('Must load model before using! (see model.load())')

    if isinstance(prompt, str):  # Create a structured conversation from input.
        convo = Conversation()
        convo.add_response(role='user', text=prompt)
    else:
        convo = prompt

    embedding = self._structure_inputs(convo=convo, images=images)

    generation_args = {'max_new_tokens': max_tokens}
    if temperature == 0:
        generation_args['do_sample'] = False
    else:
        generation_args['do_sample'] = True
        generation_args['temperature'] = temperature

    # Generate new tokens from the input via an LLM.
    generated_tokens = self.model.generate(
        **embedding,
        eos_token_id=self.processor.tokenizer.eos_token_id,
        repetition_penalty=repetition_penalty,
        **generation_args)

    # The output starts with the prompt tokens; keep only what comes after.
    prompt_length = embedding['input_ids'].shape[1]
    response_tokens = generated_tokens[:, prompt_length:]

    # Translate the tokens back to text; one sequence in, so take item 0.
    response = self.processor.batch_decode(
        response_tokens,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False)[0]

    return response
self.processor = AutoProcessor.from_pretrained( pretrained_model_name_or_path=self.name, trust_remote_code=True, # self.remote, - num_crops=4) - + num_crops=num_crops) + return - + def _patch_dynamic_cache(self): ''' @@ -165,8 +199,22 @@ def get_usable_length(self, new_seq_length: int, layer_idx: int=0) -> int: if __name__ == '__main__': model = Phi4MultimodalInstruct() - model.load(location=r'/home/eric/Repos/model_cache') # - response = model.ask(prompt='Name a primary color.') - print(response) + model.load(location=r'/home/eric/Repos/model_cache') # NOTE: set . + + response = model.ask(prompt='Name a primary color. Be brief.', max_tokens=256) + print(f'{response}\n') + + convo = Conversation() + convo.set_overall_prompt(text='You are a helpful assistant.') + convo.add_context(text='Your favorite color is red.') + convo.add_context(text='Your favorite shape is the hexagon.') + convo.add_response(role='user', text='What is your favorite color-shape combination?') + response = model.ask(prompt=convo, max_tokens=256) + print(f'{response}\n') + + from PIL import Image as PillowImage + image = PillowImage.open(r'/home/eric/Desktop/monkey.png') # NOTE: Point to existing image. + response = model.ask(prompt='Describe the image.', images=[image], max_tokens=256) + print(f'{response}\n') pass \ No newline at end of file diff --git a/src/llm/models/template.py b/src/llm/models/template.py index 6dc2abf..f97ca7b 100644 --- a/src/llm/models/template.py +++ b/src/llm/models/template.py @@ -24,7 +24,8 @@ def load(self, location: str, remote: bool=False, commit: str=None, - quantization: str=None): + quantization: str=None, + device: str=None): ''' Load in the LLM with specific settings. @@ -35,6 +36,7 @@ def load(self, models are loaded from. commit: specific git commit for a model. quantization: work in progress. Currently unused. + device: which device to store the model on (typically a GPU). 
''' pass @@ -46,7 +48,8 @@ def ask( max_tokens: int=256, temperature: float=0.0) -> str: ''' - Ask the LLM a query. + Ask the LLM a query. Most arguments are common, with Image input being the + exception for multimodal LLMs. prompt: text question or statement to make. max_tokens: maximum number of generated tokens. If an answer requires diff --git a/src/llm/other/demo.py b/src/llm/other/demo.py index adca571..3737f5c 100644 --- a/src/llm/other/demo.py +++ b/src/llm/other/demo.py @@ -9,6 +9,7 @@ if __name__ == '__main__': + # model = GptOss20b() model = GptOss20b() model.load(location=r'/home/eric/Repos/model_cache') @@ -32,7 +33,7 @@ c.add_response(role='user', text=user_response) - system_response = model.ask(prompt=c, max_tokens=1024) + system_response = model.ask(prompt=c) print(f'[Seamus]: {system_response}\n') diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_gpt_oss_20b.py b/tests/test_gpt_oss_20b.py new file mode 100644 index 0000000..49eb7a6 --- /dev/null +++ b/tests/test_gpt_oss_20b.py @@ -0,0 +1,40 @@ +""" +Tests for GptOss20b. + +These tests load a real model and call ask(), so they require local model +weights. Sync the environment first, then pass LLM_MODEL_CACHE inline when +invoking pytest so it only exists for that one command and does not persist +in your shell: + + uv sync --extra dev --extra openai + LLM_MODEL_CACHE=/home/yourname/Repos/model_cache pytest + +Tests are skipped automatically if LLM_MODEL_CACHE is not set. 
+""" + +import os + +import pytest +import torch + +from llm.models.gpt_oss_20b import GptOss20b + + +MODEL_CACHE = os.environ.get('LLM_MODEL_CACHE') + + +@pytest.fixture(scope='module') +def model(): + if not MODEL_CACHE: + pytest.skip('LLM_MODEL_CACHE environment variable not set.') + m = GptOss20b() + m.load(location=MODEL_CACHE) + yield m + del m + torch.cuda.empty_cache() + + +def test_ask(model): + response = model.ask(prompt='Name a primary color.') + assert isinstance(response, str) + assert len(response) > 0 diff --git a/tests/test_phi4_multimodal_instruct.py b/tests/test_phi4_multimodal_instruct.py new file mode 100644 index 0000000..b399b08 --- /dev/null +++ b/tests/test_phi4_multimodal_instruct.py @@ -0,0 +1,46 @@ +""" +Tests for Phi4MultimodalInstruct. + +These tests load a real model and call ask(), so they require local model +weights. Sync the environment first, then pass LLM_MODEL_CACHE inline when +invoking pytest so it only exists for that one command and does not persist +in your shell: + + uv sync --extra dev --extra microsoft + LLM_MODEL_CACHE=/home/yourname/Repos/model_cache pytest + +Tests are skipped automatically if LLM_MODEL_CACHE is not set. 
+""" + +import os + +import pytest +import torch +from PIL import Image as PillowImage + +from llm.models.phi4_multimodal_instruct import Phi4MultimodalInstruct + + +MODEL_CACHE = os.environ.get('LLM_MODEL_CACHE') + + +@pytest.fixture(scope='module') +def model(): + if not MODEL_CACHE: + pytest.skip('LLM_MODEL_CACHE environment variable not set.') + m = Phi4MultimodalInstruct() + m.load(location=MODEL_CACHE) + yield m + del m + torch.cuda.empty_cache() + + +@pytest.fixture(scope='module') +def image(): + return PillowImage.new('RGB', (64, 64), color=(255, 0, 0)) + + +def test_ask_with_image(model, image): + response = model.ask(prompt='Describe the image.', images=[image], max_tokens=256) + assert isinstance(response, str) + assert len(response) > 0 diff --git a/uv.lock b/uv.lock index cf7eb2a..05bacc9 100644 --- a/uv.lock +++ b/uv.lock @@ -146,6 +146,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, ] +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + [[package]] name = "jinja2" version = "3.1.6" @@ -183,10 +192,11 @@ wheels = [ [[package]] name = "llm" -version = "0.3.0" 
+version = "0.4.0" source = { editable = "." } dependencies = [ { name = "llm-conversation" }, + { name = "pytest" }, { name = "sentence-transformers" }, { name = "torch" }, { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu130" }, marker = "platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'" }, @@ -227,6 +237,7 @@ requires-dist = [ { name = "openai-harmony", marker = "extra == 'openai'", specifier = ">=0.0.8" }, { name = "peft", marker = "extra == 'all'", specifier = ">=0.18.1" }, { name = "peft", marker = "extra == 'microsoft'", specifier = ">=0.18.1" }, + { name = "pytest", specifier = ">=8.0" }, { name = "sentence-transformers", specifier = ">=5.2.2" }, { name = "torch", index = "https://download.pytorch.org/whl/cu130" }, { name = "torchvision", index = "https://download.pytorch.org/whl/cu130" }, @@ -518,6 +529,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bc/96/aaa61ce33cc98421fb6088af2a03be4157b1e7e0e87087c888e2370a7f45/pillow-12.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:7dfb439562f234f7d57b1ac6bc8fe7f838a4bd49c79230e0f6a1da93e82f1fad", size = 2436012, upload-time = "2025-10-15T18:22:23.621Z" }, ] +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + [[package]] name = "psutil" version = "7.1.3" @@ 
-576,6 +596,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f7/07/34573da085946b6a313d7c42f82f16e8920bfd730665de2d11c0c37a74b5/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b", size = 2139017, upload-time = "2025-11-04T13:42:59.471Z" }, ] +[[package]] +name = "pygments" +version = "2.19.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, +] + +[[package]] +name = "pytest" +version = "9.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, +] + [[package]] name = 
"pyyaml" version = "6.0.3"