diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 0000000..2ac58cd
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,33 @@
+name: Tests
+
+on:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+
+# The job only checks out code and runs tests, so a read-only token suffices.
+permissions:
+  contents: read
+
+jobs:
+  test:
+    # Requires a self-hosted runner with a GPU and model weights.
+    # Register one at: Settings -> Actions -> Runners -> New self-hosted runner.
+    runs-on: self-hosted
+    # Skip pull requests opened from forks: they would execute untrusted code
+    # on the self-hosted machine and do not receive repository secrets anyway.
+    if: >-
+      github.event_name == 'push' ||
+      github.event.pull_request.head.repo.full_name == github.repository
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install package
+        run: uv sync --extra dev --extra openai --extra microsoft
+
+      - name: Run tests
+        run: uv run pytest -v
+        env:
+          LLM_MODEL_CACHE: ${{ secrets.LLM_MODEL_CACHE }}
diff --git a/README.md b/README.md
index 197807e..d4e0583 100644
--- a/README.md
+++ b/README.md
@@ -47,7 +47,7 @@ uv sync
 
 ...with optional libraries:
 ```
-uv sync --extra <tag-1> <tag-2>
+uv sync --extra <tag-1> --extra <tag-2>
 ```
 
 ### ...with pip:
diff --git a/pyproject.toml b/pyproject.toml
index f7455e5..a49c4a8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "llm"  # The pip install <name>.
-version = "0.3.0"
+version = "0.4.0"
 description = "Library for easy use of LLMs."
 readme = "README.md"
 authors = [
@@ -15,6 +15,7 @@ dependencies = [
     "torchvision",
     "sentence-transformers>=5.2.2",
     "llm-conversation",
+    "pytest>=8.0",
 ]
 
 [project.optional-dependencies]
@@ -49,6 +50,9 @@ torch = { index = "pytorch-cu130" }
 torchvision = { index = "pytorch-cu130" }
 llm-conversation = { git = "https://github.com/EricApgar/llm-conversation", rev = "v0.2.0" }
 
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+
 [build-system]
 requires = ["uv_build>=0.9.7,<0.10.0"]
 build-backend = "uv_build"
diff --git a/src/llm/models/gpt_oss_20b.py b/src/llm/models/gpt_oss_20b.py
index b349da5..4141c36 100644
--- a/src/llm/models/gpt_oss_20b.py
+++ b/src/llm/models/gpt_oss_20b.py
@@ -1,15 +1,6 @@
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from openai_harmony import (
-    Conversation as HarmonyConversation,
-    RenderConversationConfig,
-    load_harmony_encoding,
-    HarmonyEncodingName,
-    DeveloperContent,
-    ReasoningEffort,
-    SystemContent,
-    Message,
-    Role)
+import os
+
+from transformers import pipeline
 
 from llm.models.template import Template
 from llm_conversation import Conversation
@@ -21,7 +12,7 @@ def __init__(self, hf_token: str=None):
         super().__init__(hf_token=hf_token)
 
         self.name = 'openai/gpt-oss-20b'
-        self.tokenizer = None
+        self.model: pipeline = None
 
 
     def load(self,
@@ -31,6 +22,9 @@ def load(self,
         quantization: str=None,
         device: str=None):
 
+        if (not remote) and (not os.path.isdir(location)):
+            raise ValueError(f'Nonexistant location ({location}) - fix or set remote=True.')
+
         self.location = location
         self.remote = remote
         self.commit = commit
@@ -38,161 +32,87 @@ def load(self,
 
         self._set_device(device=device)
 
-        self.model = AutoModelForCausalLM.from_pretrained(
-            pretrained_model_name_or_path=self.name,
+        model_kwargs = {
+            'cache_dir': self.location,
+            'local_files_only': not self.remote}
+        
+        self.model = pipeline(
+            task='text-generation',
+            model=self.name,
+            dtype='auto',
+            device_map=self.device,
             token=self.hf_token,
-            cache_dir=self.location,
-            local_files_only=not self.remote,
             revision=self.commit,
-            low_cpu_mem_usage=True,
-            # quantization_config=quantization_config,
-            device_map=self.device,
-            trust_remote_code=True,  # self.remote, TODO
-            _attn_implementation='eager',
-            torch_dtype='auto')  # Might be obsolete. Change to "dtype"?
-
-        self.tokenizer = AutoTokenizer.from_pretrained(self.name)
-
-        if self.tokenizer.pad_token_id is None or self.tokenizer.pad_token_id == self.tokenizer.eos_token_id:
-            self.tokenizer.pad_token_id = 0  # use a dedicated ID that isn't EOS
+            # trust_remote_code=self.remote,
+            model_kwargs=model_kwargs)
 
         return
-
+    
 
     def ask(self,
         prompt: str | Conversation,
-        max_tokens: int=1024,
-        temperature: float=0.5,
-        reasoning_level: str='low',
-        repetition_penalty: float=1.12,
-        top_p: float=0.95):
-        '''
-        Call an LLM with a prompt and generate a response.
+        max_tokens: int=512,
+        temperature: float=0.9,
+        reasoning_level: str=None):
 
-        This model works best when the input is formatted into an
-        openai-harmony conversation structure, so all inputs are converted
-        into a generic Conversation structure (if not already one) and then
-        converted into the harmony structure.
-        '''
-
-        if not self.model:
-            raise ValueError('Must load model before using! (see model.load())')
-        
-        encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+        formatted_messages = self._format_prompt(prompt=prompt, reasoning_level=reasoning_level)
 
-        if isinstance(prompt, str):  # Create a structured conversation from 
-            convo = Conversation()
-            convo.add_response(role='user', text=prompt)
+        kwargs = {}
+        if temperature == 0:
+            kwargs['do_sample'] = False
         else:
-            convo = prompt
-
-        convo_harmony = self._to_harmony(conversation=convo, reasoning_level=reasoning_level)
-
-        render_cfg = RenderConversationConfig(auto_drop_analysis=True)
-        prefill_ids = encoding.render_conversation_for_completion(
-            convo_harmony,
-            Role.ASSISTANT,
-            config=render_cfg)
-        stop_token_ids = encoding.stop_tokens_for_assistant_actions()
+            kwargs['temperature'] = temperature
 
-        input_ids = torch.tensor([prefill_ids], device=self.model.device)
-        attention_mask = torch.ones_like(input_ids)
-
-        out = self.model.generate(
-            input_ids=input_ids,
+        model_output = self.model(
+            formatted_messages,
             max_new_tokens=max_tokens,
-            do_sample=True,
-            temperature=temperature,
-            top_p=top_p,
-            # min_p=min_p,
-            repetition_penalty=repetition_penalty,
-            eos_token_id=stop_token_ids,
-            # pad_token_id=self.tokenizer.eos_token_id,
-            attention_mask=attention_mask,
-            pad_token_id=self.tokenizer.pad_token_id)
-
-        generated_tokens = out[0, input_ids.shape[-1]:].tolist()
-
-        # NOTE: Translate tokens directly to output (Debugging Only)
-        text_tokens = self.tokenizer.batch_decode(
-            generated_tokens,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False)
-
-        # I can patch the harmony token problem by assuming that the message is in the generated tokens, and just
-        # wiping out everything up until the first occurrence of ['', 'analysis', ].
-        generated_tokens = generated_tokens[get_good_token_start(token_list=text_tokens):]
-
-        # Transform tokens into the text equivalent (contains model reasoning and thinking).
-        full_response = encoding.parse_messages_from_completion_tokens(generated_tokens, role=Role.ASSISTANT)
+            **kwargs)
+        
+        full_text_response = model_output[0]['generated_text'][-1]['content']
 
-        # Extract the actual response from the full set of generated text.
-        final_response = next(m for m in full_response if m.channel == "final")
-        text_response = final_response.content[0].text
+        if 'assistantfinal' in full_text_response:
+            text = full_text_response.split("assistantfinal", 1)[1].strip()
+        else:
+            raise ValueError(f'Mangled LLM output. Could not find expected end marker "assistantfinal" in generated text: {full_text_response}')
 
-        return text_response
+        return text
 
 
     @staticmethod
-    def _to_harmony(conversation: Conversation, reasoning_level: str) -> HarmonyConversation:
+    def _format_prompt(prompt: str | Conversation, reasoning_level: str=None) -> list[dict]:
         '''
-        Build a Harmony-Conversation object from a Generic Conversation object.
+        Convert the prompt (plain string or Conversation) into the list of
+        role/content message dicts passed to the text-generation pipeline.
+        Reasoning level, overall prompt, and context merge into one system message.
         '''
 
-        if reasoning_level == 'low':
-            reasoning_level = ReasoningEffort.LOW
-        elif reasoning_level == 'medium':
-            reasoning_level = ReasoningEffort.MEDIUM
-        elif reasoning_level == 'high':
-            reasoning_level = ReasoningEffort.HIGH
-
-        # System Details about the Overall Conversation.
-        system_msg = Message.from_role_and_content(
-            Role.SYSTEM,
-            SystemContent.new().with_reasoning_effort(reasoning_level))
-        
-        developer_msg = Message.from_role_and_content(
-            Role.DEVELOPER,
-            DeveloperContent.new().with_instructions(conversation.overall_prompt))
-
-        msgs = [system_msg, developer_msg]
-
-        # Background Context Information.
-        if conversation.context:
-            context_block = '\n'.join([
-                "BACKGROUND CONTEXT (not part of the dialogue):",
-                '\n'.join(conversation.context),
-                "END BACKGROUND CONTEXT"])
-
-            msgs.append(Message.from_role_and_content(Role.USER, context_block))
-
-        # Conversation history between user and AI.
-        for turn in conversation.history:
-            if turn.role == "user":
-                msgs.append(Message.from_role_and_content(Role.USER, turn.text))
-            else:
-                msgs.append(Message.from_role_and_content(Role.ASSISTANT, turn.text))
+        if isinstance(prompt, str):
+            convo = Conversation()
+            convo.add_response(role='user', text=prompt)
+        else:
+            convo = prompt
 
-        harmony_convo = HarmonyConversation.from_messages(msgs)
+        system_pieces = []
+        formatted_messages = []
 
-        return harmony_convo
+        if reasoning_level:
+            system_pieces.append(f'Reasoning level: {reasoning_level}.')
 
+        if convo.overall_prompt:
+            system_pieces.append(convo.overall_prompt)
 
-def get_good_token_start(token_list: list[str]):
-    '''
-    This is a patch for handling bad generated tokens which break
-    the openai-harmony prompt formatter.
+        if convo.context:
+            for context in convo.context:
+                system_pieces.append(context)
 
-    It finds the start of rational thought in the generated output, skipping
-    over the nonsense content that's generated.
-    '''
-    for i in range(len(token_list) - 1):
-        if token_list[i] == "" and token_list[i + 1] == "analysis":
-            return i
+        if system_pieces:  # Merge background context pieces.
+            formatted_messages.append({'role': 'system', 'content': ' '.join(system_pieces)})
 
-    raise ValueError("Could not find ['', 'analysis'] in the list")
+        if convo.history:
+            for response in convo.history:
+                formatted_messages.append({'role': response.role, 'content': response.text})
 
-    return
+        return formatted_messages
 
 
 if __name__ == '__main__':
diff --git a/src/llm/models/gpt_oss_20b_dev.py b/src/llm/models/gpt_oss_20b_dev.py
new file mode 100644
index 0000000..2855955
--- /dev/null
+++ b/src/llm/models/gpt_oss_20b_dev.py
@@ -0,0 +1,213 @@
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from openai_harmony import (
+    Conversation as HarmonyConversation,
+    RenderConversationConfig,
+    load_harmony_encoding,
+    HarmonyEncodingName,
+    DeveloperContent,
+    ReasoningEffort,
+    SystemContent,
+    Message,
+    Role)
+
+from llm.models.template import Template
+from llm_conversation import Conversation
+
+
+class GptOss20bDev(Template):
+    '''
+    This development version of the GPT-OSS-20B model uses a lower level
+    method for generating tokens compared to the transformers.pipeline()
+    method.
+    '''
+
+    def __init__(self, hf_token: str=None):
+        super().__init__(hf_token=hf_token)
+
+        self.name = 'openai/gpt-oss-20b'
+        self.tokenizer = None
+
+
+    def load(self,
+        location: str,
+        remote: bool=False,
+        commit: str=None,
+        device: str=None):
+        '''
+        Load the model weights and the tokenizer, both resolved from the same
+        cache location, commit, and remote/local setting.
+        '''
+        self.location = location
+        self.remote = remote
+        self.commit = commit
+        self._set_device(device=device)
+
+        # Source settings shared by the model and tokenizer so both resolve alike.
+        source_kwargs = {
+            'token': self.hf_token,
+            'cache_dir': self.location,
+            'local_files_only': not self.remote,
+            'revision': self.commit}
+
+        self.model = AutoModelForCausalLM.from_pretrained(
+            pretrained_model_name_or_path=self.name, low_cpu_mem_usage=True,
+            device_map=self.device, trust_remote_code=True,  # self.remote, TODO
+            _attn_implementation='eager', dtype='auto', **source_kwargs)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.name, **source_kwargs)
+
+        # Padding must be distinguishable from EOS; fall back to token ID 0.
+        if self.tokenizer.pad_token_id is None or self.tokenizer.pad_token_id == self.tokenizer.eos_token_id:
+            self.tokenizer.pad_token_id = 0
+
+
+    def ask(self,
+        prompt: str | Conversation,
+        max_tokens: int=256,
+        temperature: float=1.0,
+        reasoning_level: str='low',
+        repetition_penalty: float=1.15,
+        top_p: float=0.95):
+        '''
+        Call an LLM with a prompt and return the text of its final answer.
+
+        The input (string or Conversation) is converted into an openai-harmony
+        conversation, rendered to tokens, and passed to model.generate().
+        temperature == 0 switches to greedy decoding (sampling rejects 0).
+        Raises ValueError if the model is unloaded or the output is mangled.
+        '''
+        if not self.model:
+            raise ValueError('Must load model before using! (see model.load())')
+
+        encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+
+        if isinstance(prompt, str):  # Create a structured conversation from input.
+            convo = Conversation()
+            convo.add_response(role='user', text=prompt)
+        else:
+            convo = prompt
+
+        convo_harmony = self._to_harmony(conversation=convo, reasoning_level=reasoning_level)
+        render_cfg = RenderConversationConfig(auto_drop_analysis=True)
+        prefill_ids = encoding.render_conversation_for_completion(
+            convo_harmony,
+            Role.ASSISTANT,
+            config=render_cfg)
+        stop_token_ids = encoding.stop_tokens_for_assistant_actions()
+
+        input_ids = torch.tensor([prefill_ids], device=self.model.device)
+        attention_mask = torch.ones_like(input_ids)
+
+        # Sampling needs temperature > 0; treat 0 as a request for greedy decoding.
+        if temperature == 0:
+            sampling_kwargs = {'do_sample': False}
+        else:
+            sampling_kwargs = {'do_sample': True, 'temperature': temperature, 'top_p': top_p}
+
+        generated_tokens = self.model.generate(
+            input_ids=input_ids,
+            max_new_tokens=max_tokens,
+            repetition_penalty=repetition_penalty,
+            eos_token_id=stop_token_ids,
+            attention_mask=attention_mask,
+            pad_token_id=self.tokenizer.pad_token_id,
+            **sampling_kwargs)
+
+        # Keep only the newly generated token IDs (drop the echoed prompt).
+        generated_tokens = generated_tokens[0, input_ids.shape[-1]:].tolist()
+
+        # Decode each token ID separately so mangled leading output can be located.
+        text_tokens = self.tokenizer.batch_decode(
+            generated_tokens,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False)
+
+        # Drop everything before the first [..., '', 'analysis', ...] token pair.
+        generated_tokens = generated_tokens[get_good_token_start(token_list=text_tokens):]
+
+        # Parse tokens into harmony messages (includes the model's reasoning).
+        full_response = encoding.parse_messages_from_completion_tokens(generated_tokens, role=Role.ASSISTANT)
+
+        # The answer lives in the "final" channel; absent if generation was cut short.
+        final_response = next((m for m in full_response if m.channel == "final"), None)
+        if final_response is None:
+            raise ValueError('Mangled LLM output. No "final" channel message in generated output.')
+
+        return final_response.content[0].text
+
+
+    @staticmethod
+    def _to_harmony(conversation: Conversation, reasoning_level: str) -> HarmonyConversation:
+        '''
+        Convert a generic Conversation into an openai-harmony conversation.
+
+        Message order: system (reasoning effort), developer (overall prompt),
+        an optional user message holding background context, then the
+        conversation history with each turn mapped to USER or ASSISTANT.
+        '''
+
+        # Map the known level names to enum members; other values pass through as-is.
+        effort_by_name = {
+            'low': ReasoningEffort.LOW,
+            'medium': ReasoningEffort.MEDIUM,
+            'high': ReasoningEffort.HIGH}
+        effort = effort_by_name.get(reasoning_level, reasoning_level)
+
+        # Conversation-wide details: reasoning effort and developer instructions.
+        system_content = SystemContent.new().with_reasoning_effort(effort)
+        developer_content = DeveloperContent.new().with_instructions(conversation.overall_prompt)
+
+        messages = [
+            Message.from_role_and_content(Role.SYSTEM, system_content),
+            Message.from_role_and_content(Role.DEVELOPER, developer_content)]
+
+        # Background context goes in as a clearly delimited user message.
+        if conversation.context:
+            context_lines = [
+                "BACKGROUND CONTEXT (not part of the dialogue):",
+                '\n'.join(conversation.context),
+                "END BACKGROUND CONTEXT"]
+            context_block = '\n'.join(context_lines)
+
+            messages.append(Message.from_role_and_content(Role.USER, context_block))
+
+        # Dialogue turns: anything that is not a "user" turn is attributed to the assistant.
+        messages.extend(
+            Message.from_role_and_content(
+                Role.USER if turn.role == "user" else Role.ASSISTANT,
+                turn.text)
+            for turn in conversation.history)
+
+        return HarmonyConversation.from_messages(messages)
+
+
+def get_good_token_start(token_list: list[str]) -> int:
+    '''
+    Locate where usable model output begins in a list of decoded tokens.
+
+    Patch for mangled generations that break the openai-harmony parser:
+    returns the index of the first empty-string token that is immediately
+    followed by the token "analysis". Raises ValueError if no such pair exists.
+    '''
+    marker = ("", "analysis")
+
+    # Pair each token with its successor and take the first matching position.
+    i_start = next(
+        (i for i, pair in enumerate(zip(token_list, token_list[1:])) if pair == marker),
+        None)
+
+    if i_start is None:
+        raise ValueError(f'Mangled LLM output. Could not find expected start tokens ["", "analysis"] in generated tokens: {token_list}')
+
+    return i_start
+
+
+if __name__ == '__main__':
+
+    model = GptOss20bDev()
+    model.load(location=r'/home/eric/Repos/model_cache')
+    response = model.ask(prompt='Name a primary color.')
+
+    print(response)
+
+    pass
\ No newline at end of file
diff --git a/src/llm/models/phi4_multimodal_instruct.py b/src/llm/models/phi4_multimodal_instruct.py
index 6fa4dc0..9b3a601 100644
--- a/src/llm/models/phi4_multimodal_instruct.py
+++ b/src/llm/models/phi4_multimodal_instruct.py
@@ -1,8 +1,13 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+
 from transformers import AutoProcessor, AutoModelForCausalLM
 
 from llm.models.template import Template
+from llm_conversation import Conversation
 
-# from PIL import image as PillowImage
+if TYPE_CHECKING:
+    from PIL import Image as PillowImage
 
 
 class Phi4MultimodalInstruct(Template):
@@ -21,6 +26,14 @@ def load(self,
         commit: str='0cb22ab20b10ac01c49ecd8b7138dcd98bc00548',
         quantization: str=None,
         device: str=None):
+        '''
+        The commit is locked to this because this is the version of Phi4 with patches
+        needed to run. There's a whole mess regarding Phi-4 getting out of date with
+        transformers and breaking, and no one updating the HF Phi-4. See the discussions
+        (https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions) for details.
+
+        Just know that it's a huge hassle to try to get Phi4 working as the transformers lib updates.
+        '''
 
         self.location = location
         self.remote = remote
@@ -30,6 +43,7 @@ def load(self,
         self._set_device(device=device)
         self._patch_dynamic_cache()
 
+        # TODO: Turn on quantization at some point.
         # quantization_config = BitsAndBytesConfig(
         #     load_in_4bit=True,
         #     bnb_4bit_compute_dtype=torch.float16)
@@ -53,52 +67,69 @@ def load(self,
 
 
     def ask(self,
-        prompt: str,
-        images: list=None,  # list[PillowImage.Image]
-        max_tokens: int=256,
-        temperature: float=0.1):
+        prompt: str | Conversation,
+        images: list[PillowImage.Image]=None,
+        max_tokens: int=1024,
+        # temperature: float=0.5,
+        # reasoning_level: str='low',
+        # top_p: float=0.95,
+        repetition_penalty: float=1.12) -> str:
 
         if not self.model:
             raise ValueError('Must load model before using! (see model.load())')
+        
+        if isinstance(prompt, str):  # Create a structured conversation from input.
+            convo = Conversation()
+            convo.add_response(role='user', text=prompt)
+        else:
+            convo = prompt
 
-        embedding = self.embed(text=prompt, images=images)
+        embedding = self._structure_inputs(convo=convo, images=images)
 
         generation_args = {
             'max_new_tokens': max_tokens,
-            # temperature: 0.0,
             'do_sample': False}
-        
-        generate_ids = self.model.generate(
+
+        # Generate new tokens from the input via an LLM.
+        generated_tokens = self.model.generate(
             **embedding,
             eos_token_id=self.processor.tokenizer.eos_token_id,
+            repetition_penalty=repetition_penalty,
             **generation_args)
-        
-        # Decode the output (un-embed the output to convert to text).
-        generate_ids = generate_ids[:, embedding['input_ids'].shape[1]:]
 
+        # Extract the explicit response tokens (sans thinking and original question).
+        response_tokens = generated_tokens[:, embedding['input_ids'].shape[1]:]
+
+        # Translate the tokens to text (essentially a look up table).
         response = self.processor.batch_decode(
-            generate_ids,
+            response_tokens,
             skip_special_tokens=True,
             clean_up_tokenization_spaces=False)[0]
-        
+
         return response
     
 
-    def embed(self, text: str=None, images: list=None) -> dict:
+    def _structure_inputs(self, convo: Conversation, images: list=None) -> dict:
         '''
-        Background:
-        <|user|>\n<|image_1|>\n<|image_2|\n<|image_3|\n{prompt}<|end|>\n<|assistant|>\n
+        Structure the input convo and images into the expected format
+        to get a good clean LLM response. Embed it and prepare for LLM
+        token generation.
         '''
 
-        # TODO: Image list might not need chat template applied again.
-        if images is not None:
-            image_tags = ''.join([f'<|image_{i+1}|>\n' for i, _ in enumerate(images)])
-            content_wrap = '<|user|\n>' + image_tags + f'{text}<|end|>\n<|assistant|>\n'
-        else:
-            # content_wrap = f'<|user|>\n{text}<|end|>\n<|assistant|>\n'
-            content_wrap = text
+        if not convo.overall_prompt:
+            convo.set_overall_prompt(text='')
 
-        messages = [{'role': 'user', 'content': content_wrap}]
+        system_prompt = ' '.join(filter(None, [convo.overall_prompt, *convo.context]))
+        messages = [{
+            'role': 'system',
+            'content': system_prompt}] + [{
+                'role': i.role,
+                'content': i.text} for i in convo.history]
+
+        if images:  # Modify last item in convo to carry image tags.
+            image_tags = ''.join([f'<|image_{i+1}|>' for i, _ in enumerate(images)])
+            last_role = messages[-1]['role']
+            messages[-1] = {'role': last_role, 'content': image_tags + messages[-1]['content']}
 
         structured_prompt = self.processor.tokenizer.apply_chat_template(
             messages,
@@ -113,17 +144,20 @@ def embed(self, text: str=None, images: list=None) -> dict:
         return embedding
 
 
-    def _load_processor(self):
+    def _load_processor(self, num_images: int=1):
+
+        if num_images == 1:
+            num_crops = 16
+        else:
+            num_crops = 4
 
-        # TODO: For best performance, supposed to use num_crops=4 for multi
-        # frame and 16 for single frame.
         self.processor = AutoProcessor.from_pretrained(
             pretrained_model_name_or_path=self.name,
             trust_remote_code=True,  # self.remote,
-            num_crops=4)
-        
+            num_crops=num_crops)
+
         return
-    
+
 
     def _patch_dynamic_cache(self):
         '''
@@ -165,8 +199,22 @@ def get_usable_length(self, new_seq_length: int, layer_idx: int=0) -> int:
 if __name__ == '__main__':
 
     model = Phi4MultimodalInstruct()
-    model.load(location=r'/home/eric/Repos/model_cache')  # <path to model cache>
-    response = model.ask(prompt='Name a primary color.')
-    print(response)
+    model.load(location=r'/home/eric/Repos/model_cache')  # NOTE: set <path to model cache>.
+
+    response = model.ask(prompt='Name a primary color. Be brief.', max_tokens=256)
+    print(f'{response}\n')
+
+    convo = Conversation()
+    convo.set_overall_prompt(text='You are a helpful assistant.')
+    convo.add_context(text='Your favorite color is red.')
+    convo.add_context(text='Your favorite shape is the hexagon.')
+    convo.add_response(role='user', text='What is your favorite color-shape combination?')
+    response = model.ask(prompt=convo, max_tokens=256)
+    print(f'{response}\n')
+
+    from PIL import Image as PillowImage
+    image = PillowImage.open(r'/home/eric/Desktop/monkey.png')  # NOTE: Point to existing image.
+    response = model.ask(prompt='Describe the image.', images=[image], max_tokens=256)
+    print(f'{response}\n')
 
     pass
\ No newline at end of file
diff --git a/src/llm/models/template.py b/src/llm/models/template.py
index 6dc2abf..f97ca7b 100644
--- a/src/llm/models/template.py
+++ b/src/llm/models/template.py
@@ -24,7 +24,8 @@ def load(self,
         location: str,
         remote: bool=False,
         commit: str=None,
-        quantization: str=None):
+        quantization: str=None,
+        device: str=None):
         '''
         Load in the LLM with specific settings.
 
@@ -35,6 +36,7 @@ def load(self,
             models are loaded from.
         commit: specific git commit for a model.
         quantization: work in progress. Currently unused.
+        device: which device to store the model on (typically a GPU).
         '''
 
         pass
@@ -46,7 +48,8 @@ def ask(
         max_tokens: int=256,
         temperature: float=0.0) -> str:
         '''
-        Ask the LLM a query.
+        Ask the LLM a query. Most arguments are common, with Image input being the
+        exception for multimodal LLMs.
 
         prompt: text question or statement to make.
         max_tokens: maximum number of generated tokens. If an answer requires
diff --git a/src/llm/other/demo.py b/src/llm/other/demo.py
index adca571..3737f5c 100644
--- a/src/llm/other/demo.py
+++ b/src/llm/other/demo.py
@@ -9,6 +9,7 @@
 
 if __name__ == '__main__':
 
+    # model = GptOss20b()
     model = GptOss20b()
     model.load(location=r'/home/eric/Repos/model_cache')
 
@@ -32,7 +33,7 @@
 
         c.add_response(role='user', text=user_response)
 
-        system_response = model.ask(prompt=c, max_tokens=1024)
+        system_response = model.ask(prompt=c)
 
         print(f'[Seamus]: {system_response}\n')
 
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_gpt_oss_20b.py b/tests/test_gpt_oss_20b.py
new file mode 100644
index 0000000..49eb7a6
--- /dev/null
+++ b/tests/test_gpt_oss_20b.py
@@ -0,0 +1,40 @@
+"""
+Tests for GptOss20b.
+
+These tests load a real model and call ask(), so they require local model
+weights. Sync the environment first, then pass LLM_MODEL_CACHE inline when
+invoking pytest so it only exists for that one command and does not persist
+in your shell:
+
+    uv sync --extra dev --extra openai
+    LLM_MODEL_CACHE=/home/yourname/Repos/model_cache uv run pytest
+
+Tests are skipped automatically if LLM_MODEL_CACHE is not set.
+"""
+
+import os
+
+import pytest
+import torch
+
+from llm.models.gpt_oss_20b import GptOss20b
+
+
+MODEL_CACHE = os.environ.get('LLM_MODEL_CACHE')
+
+
+@pytest.fixture(scope='module')
+def model():
+    if not MODEL_CACHE:
+        pytest.skip('LLM_MODEL_CACHE environment variable not set.')
+    m = GptOss20b()
+    m.load(location=MODEL_CACHE)
+    yield m
+    del m
+    torch.cuda.empty_cache()
+
+
+def test_ask(model):
+    response = model.ask(prompt='Name a primary color.')
+    assert isinstance(response, str)
+    assert len(response) > 0
diff --git a/tests/test_phi4_multimodal_instruct.py b/tests/test_phi4_multimodal_instruct.py
new file mode 100644
index 0000000..b399b08
--- /dev/null
+++ b/tests/test_phi4_multimodal_instruct.py
@@ -0,0 +1,46 @@
+"""
+Tests for Phi4MultimodalInstruct.
+
+These tests load a real model and call ask(), so they require local model
+weights. Sync the environment first, then pass LLM_MODEL_CACHE inline when
+invoking pytest so it only exists for that one command and does not persist
+in your shell:
+
+    uv sync --extra dev --extra microsoft
+    LLM_MODEL_CACHE=/home/yourname/Repos/model_cache uv run pytest
+
+Tests are skipped automatically if LLM_MODEL_CACHE is not set.
+"""
+
+import os
+
+import pytest
+import torch
+from PIL import Image as PillowImage
+
+from llm.models.phi4_multimodal_instruct import Phi4MultimodalInstruct
+
+
+MODEL_CACHE = os.environ.get('LLM_MODEL_CACHE')
+
+
+@pytest.fixture(scope='module')
+def model():
+    if not MODEL_CACHE:
+        pytest.skip('LLM_MODEL_CACHE environment variable not set.')
+    m = Phi4MultimodalInstruct()
+    m.load(location=MODEL_CACHE)
+    yield m
+    del m
+    torch.cuda.empty_cache()
+
+
+@pytest.fixture(scope='module')
+def image():
+    return PillowImage.new('RGB', (64, 64), color=(255, 0, 0))
+
+
+def test_ask_with_image(model, image):
+    response = model.ask(prompt='Describe the image.', images=[image], max_tokens=256)
+    assert isinstance(response, str)
+    assert len(response) > 0
diff --git a/uv.lock b/uv.lock
index cf7eb2a..05bacc9 100644
--- a/uv.lock
+++ b/uv.lock
@@ -146,6 +146,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" },
 ]
 
+[[package]]
+name = "iniconfig"
+version = "2.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" },
+]
+
 [[package]]
 name = "jinja2"
 version = "3.1.6"
@@ -183,10 +192,11 @@ wheels = [
 
 [[package]]
 name = "llm"
-version = "0.3.0"
+version = "0.4.0"
 source = { editable = "." }
 dependencies = [
     { name = "llm-conversation" },
+    { name = "pytest" },
     { name = "sentence-transformers" },
     { name = "torch" },
     { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu130" }, marker = "platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'" },
@@ -227,6 +237,7 @@ requires-dist = [
     { name = "openai-harmony", marker = "extra == 'openai'", specifier = ">=0.0.8" },
     { name = "peft", marker = "extra == 'all'", specifier = ">=0.18.1" },
     { name = "peft", marker = "extra == 'microsoft'", specifier = ">=0.18.1" },
+    { name = "pytest", specifier = ">=8.0" },
     { name = "sentence-transformers", specifier = ">=5.2.2" },
     { name = "torch", index = "https://download.pytorch.org/whl/cu130" },
     { name = "torchvision", index = "https://download.pytorch.org/whl/cu130" },
@@ -518,6 +529,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/bc/96/aaa61ce33cc98421fb6088af2a03be4157b1e7e0e87087c888e2370a7f45/pillow-12.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:7dfb439562f234f7d57b1ac6bc8fe7f838a4bd49c79230e0f6a1da93e82f1fad", size = 2436012, upload-time = "2025-10-15T18:22:23.621Z" },
 ]
 
+[[package]]
+name = "pluggy"
+version = "1.6.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" },
+]
+
 [[package]]
 name = "psutil"
 version = "7.1.3"
@@ -576,6 +596,31 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f7/07/34573da085946b6a313d7c42f82f16e8920bfd730665de2d11c0c37a74b5/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b", size = 2139017, upload-time = "2025-11-04T13:42:59.471Z" },
 ]
 
+[[package]]
+name = "pygments"
+version = "2.19.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" },
+]
+
+[[package]]
+name = "pytest"
+version = "9.0.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+    { name = "iniconfig" },
+    { name = "packaging" },
+    { name = "pluggy" },
+    { name = "pygments" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" },
+]
+
 [[package]]
 name = "pyyaml"
 version = "6.0.3"