From 37405083755afa94e638d74da08bae30859c62c1 Mon Sep 17 00:00:00 2001
From: Char15Xu <charlesxuty3@berkeley.edu>
Date: Fri, 9 Aug 2024 17:59:05 -0700
Subject: [PATCH 1/6] add anthropic model support, testing needed

---
 .DS_Store                    | Bin 0 -> 6148 bytes
 Makefile                     |   4 +
 benchmark/.DS_Store          | Bin 0 -> 6148 bytes
 examples/model/claude.py     |  34 +++++
 libem/core/model/__init__.py |   6 +-
 libem/core/model/claude.py   | 247 +++++++++++++++++++++++++++++++++++
 libem/parameter.py           |   2 +-
 7 files changed, 291 insertions(+), 2 deletions(-)
 create mode 100644 .DS_Store
 create mode 100644 benchmark/.DS_Store
 create mode 100644 examples/model/claude.py
 create mode 100644 libem/core/model/claude.py
diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..2108caa4ed676c146b85d919c6d3217b69c16dd1
GIT binary patch
literal 6148
zcmeHKJ5EC}5S%3`f@o4w`U>2@ioyxFKz>L_1R{abzlw8lG-f|Vq6b}QqFHG@_Ik&b
zr+E7ofGv)@2Vf3hMt8)Q4`cIv_mSOH#E5jB@q#z3@qyRf=P3L4fOBuL!86{mJ>_rr
zo5Nx6+cGI21*Cu!kOETRf(le|o!wpVOdTZ!q`-A3;NOQvckG2jVthI{L<>NiG91Qv
z^b*A80b(y45*eXcQi(~mYB4P7jJL|`g+pS}Vby$C-E7sNSlrI@TcpE!qDCno1<n<i
z<Z|xy|B?Pj|9?)>N(xAUt5U$G%ctdnPpaBFdYsqVMqklA=bP@vc~Ce+IVMIq=EBSI
deI#XG^Evl>;gA?~#)D4O&w%S9lLG&(zz<2<6@LH#

literal 0
HcmV?d00001

diff --git a/Makefile b/Makefile
index 6314069..dbf84c8 100644
--- a/Makefile
+++ b/Makefile
@@ -153,3 +153,7 @@ duckdb:
 	python examples/apps/integration/duckdb_cluster.py
 mongodb:
 	python examples/apps/integration/mongodb_cluster.py
+
+.PHONY: claude
+claude:
+	python examples/model/claude.py
\ No newline at end of file
diff --git a/benchmark/.DS_Store b/benchmark/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..601d5e8aa0d9ab99cb009600950a5548a9560fbb
GIT binary patch
literal 6148
zcmeHKJ5EC}5S)cbM50NV(pTUHRuoRa1rQ|NlOpj*|0>SK(K7ofh#quF5i~2U$6oK)
z@)U310<g_-_XMl}Ea{H;@?mbi?>@7;iWrg3GX{KO+>P(U+hLadd%(F5_`n_`jz9T_
z!M9{mKnh3!DIf);z=aj40=s;?@VPop3P^$5SHQmyjqcbBr^NVlaEKOwxL`Pp^XMgr
z%>%?<I3+Sdv!oJ}YSm&`(iv}+*9)h_q{FKDu)5i*L$SD>=eJ0Q^+b(QKnk2IaG%SS
z*Z*hwAN~J1Nh>KJ1^$%+w%ERGH+)jn*4g8{);9Vp-E+R_Zkz{&LzH7;lw&Tu9IqoO
a^P11O-wUV2pfetHqJ9Qk7nu~ewgM;M))mA6

literal 0
HcmV?d00001

diff --git a/examples/model/claude.py b/examples/model/claude.py
new file mode 100644
index 0000000..940ea22
--- /dev/null
+++ b/examples/model/claude.py
@@ -0,0 +1,34 @@
+import libem
+
+from libem.match.prompt import rules
+
+def positive():
+    """Print the libem.match answer for the example's positive pair."""
+    left = "Dyson Hot+Cool AM09 Jet Focus heater and fan, White/Silver"
+    right = "Dyson AM09 Hot + Cool Jet Focus Fan Heater - W/S"
+    result = libem.match(left, right)
+
+    print("Entity 1:", left)
+    print("Entity 2:", right)
+    print("Match:", result['answer'])
+
+def negative():
+    """Print the libem.match answer for a pair that differs in color."""
+    left = "Dyson Hot+Cool AM09 Jet Focus heater and fan, White/Silver"
+    right = "Dyson AM09 Hot + Cool Jet Focus Fan Heater - Black japan"
+    rules.add("Color differentiates entities.")
+    result = libem.match(left, right)
+
+    print("Entity 1:", left)
+    print("Entity 2:", right)
+    print("Match:", result['answer'])
+
+def main():
+    """Select the Claude model for matching, then run both examples."""
+    settings = {"libem.match.parameter.model": "claude-3-5-sonnet-20240620"}
+    libem.calibrate(settings, verbose=True)
+    positive()
+    negative()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/libem/core/model/__init__.py b/libem/core/model/__init__.py
index 79f6d6c..51cac24 100644
--- a/libem/core/model/__init__.py
+++ b/libem/core/model/__init__.py
@@ -1,5 +1,5 @@
 from libem.core.model import (
-    openai, llama
+    openai, llama, claude
 )
 from libem.core import exec
 import libem
@@ -15,6 +15,10 @@ async def async_call(*args, **kwargs) -> dict:
         return llama.call(*args, **kwargs)
     elif kwargs.get("model", "") == "llama3.1":
         return llama.call(*args, **kwargs)
+    elif kwargs.get("model", "") == "llama3.1":
+        return llama.call(*args, **kwargs)
+    elif kwargs.get("model", "") == "claude-3-5-sonnet-20240620":
+        return await claude.call(*args, **kwargs) 
     else:
         return await openai.async_call(*args, **kwargs)
 
diff --git a/libem/core/model/claude.py b/libem/core/model/claude.py
new file mode 100644
index 0000000..e3d7b4f
--- /dev/null
+++ b/libem/core/model/claude.py
@@ -0,0 +1,247 @@
+import os
+import json
+import httpx
+import importlib
+import inspect
+import asyncio
+from anthropic import (
+    AsyncAnthropic, APITimeoutError
+)
+
+import libem
+from libem.core import exec
+
+os.environ.setdefault(  # keep an already-exported key; otherwise seed it from libem's config
+    "CLAUDE_API_KEY",
+    libem.LIBEM_CONFIG.get("CLAUDE_API_KEY", "")
+)
+
+_client = None  # shared client, built on the first get_client() call and cleared by reset()
+
+
+def get_client():
+    """Return the shared AsyncAnthropic client, creating it on first use."""
+    global _client
+
+    api_key = os.environ.get("CLAUDE_API_KEY")
+    if not api_key:
+        raise EnvironmentError("CLAUDE_API_KEY is not set.")
+
+    if _client:
+        return _client
+
+    # the client is cached in _client and reused until reset() clears it
+    pool = httpx.Limits(max_connections=1000, max_keepalive_connections=100)
+    _client = AsyncAnthropic(
+        api_key=api_key, http_client=httpx.AsyncClient(limits=pool),
+    )
+    return _client
+
+
+def call(*args, **kwargs) -> dict:
+    """Run async_call through exec.run_async_task and return its result."""
+    pending = async_call(*args, **kwargs)
+    return exec.run_async_task(pending)
+
+
+# Model call with multiple rounds of tool use
+async def async_call(
+        prompt: str | list | dict,
+        tools: list[str] = None,
+        context: list = None,
+        model: str = "claude-3-5-sonnet-20240620",
+        temperature: float = 0.0,
+        seed: int = None,
+        max_model_call: int = 3,
+) -> dict:
+    """Call a Claude model, looping over rounds of tool use if needed.
+
+    Args:
+        prompt: a user string, a list of {"role", "content"} messages,
+            or a {role: content} dict; "system" entries become the
+            system prompt instead of a message.
+        tools: import paths of tool modules exposing `name`, `schema`
+            and `func` (optionally `async_func`).
+        context: earlier messages, placed in order before the prompt;
+            its "system" entries are joined onto the system prompt.
+        model: model name sent to the Messages API.
+        temperature: sampling temperature sent to the API.
+        seed: currently unused by this backend.
+        max_model_call: upper bound on model requests per invocation.
+
+    Returns:
+        A dict with "output" (the reply text), "tool_outputs",
+        "messages" (the full exchange) and "stats" (request and
+        token counts).
+
+    Raises:
+        ValueError: if prompt is not a str, list or dict.
+        libem.ModelTimedoutException: if a request times out.
+    """
+    client = get_client()
+
+    # format the prompt to messages
+    system_message = None
+    messages = []
+
+    match prompt:
+        case list():
+            for msg in prompt:
+                if msg["role"] == "system":
+                    system_message = msg["content"]
+                else:
+                    messages.append(msg)
+        case dict():
+            for role, content in prompt.items():
+                if role == "system":
+                    system_message = content
+                else:
+                    messages.append({"role": role, "content": content})
+        case str():
+            messages = [{"role": "user", "content": prompt}]
+        case _:
+            raise ValueError(f"Invalid prompt type: {type(prompt)}")
+
+    # Handle context: system entries extend the system prompt, all
+    # other entries go before the prompt in their original order
+    history = []
+    for msg in context or []:
+        if msg["role"] != "system":
+            history.append(msg)
+        elif system_message is None:
+            system_message = msg["content"]
+        else:
+            system_message += "\n" + msg["content"]
+    messages = history + messages
+
+    # arguments shared by every request in this invocation
+    request = {
+        "model": model,
+        "temperature": temperature,
+        "max_tokens": 1000,
+    }
+    if system_message is not None:
+        # only send a system prompt when one was supplied
+        request["system"] = system_message
+
+    available_functions = {}
+    if tools:
+        # Load the tool modules
+        modules = [importlib.import_module(tool) for tool in tools]
+        # Get the functions from the tools and
+        # prefer async functions if available
+        available_functions = {
+            tool.name: getattr(tool, 'async_func', tool.func)
+            for tool in modules
+        }
+
+        # NOTE(review): schemas are forwarded unchanged -- confirm they
+        # use the Messages API layout (name/description/input_schema)
+        request["tools"] = [tool.schema for tool in modules]
+        request["tool_choice"] = {"type": "auto"}
+
+    # trace variables
+    num_model_calls = 0
+    num_input_tokens, num_output_tokens = 0, 0
+    tool_usages, tool_outputs = [], []
+
+    while True:
+        try:
+            response = await client.messages.create(
+                messages=messages, **request,
+            )
+        except APITimeoutError as e:  # catch timeout error
+            raise libem.ModelTimedoutException(e)
+
+        num_model_calls += 1
+        num_input_tokens += response.usage.input_tokens
+        num_output_tokens += response.usage.output_tokens
+
+        # a reply is a list of content blocks: text and/or tool_use
+        response_message = "".join(
+            block.text for block in response.content
+            if block.type == "text"
+        )
+        tool_uses = [
+            block for block in response.content
+            if block.type == "tool_use"
+        ]
+
+        if not tool_uses:
+            messages.append(
+                {"role": "assistant", "content": response_message}
+            )
+            break
+
+        # keep the assistant turn: the tool results below answer it
+        messages.append(
+            {"role": "assistant", "content": response.content}
+        )
+
+        # Call tools
+        tool_results = []
+        for tool_use in tool_uses:
+            function_name = tool_use.name
+            function_to_call = available_functions[function_name]
+            # the SDK hands over parsed input; accept a JSON string too
+            function_args = tool_use.input
+            if isinstance(function_args, str):
+                function_args = json.loads(function_args)
+
+            libem.debug(f"[{function_name}] {function_args}")
+
+            if inspect.iscoroutinefunction(function_to_call):
+                function_response = await function_to_call(**function_args)
+            else:
+                function_response = function_to_call(**function_args)
+
+            tool_results.append({
+                "type": "tool_result",
+                "tool_use_id": tool_use.id,
+                "content": str(function_response),
+            })
+
+            tool_usages.append({
+                "id": tool_use.id,
+                'name': function_name,
+                "arguments": function_args,
+                "response": function_response,
+            })
+
+            tool_outputs.append({
+                function_name: function_response,
+            })
+
+        # tool results are returned to the model as a single user turn
+        messages.append({"role": "user", "content": tool_results})
+
+        if num_model_calls >= max_model_call:
+            libem.debug(f"[model] max call reached: "
+                        f"{messages}\n{response_message}")
+            break
+
+    libem.trace.add({
+        "model": {
+            "messages": messages,
+            "tool_usages": tool_usages,
+            "num_model_calls": num_model_calls,
+            "num_input_tokens": num_input_tokens,
+            "num_output_tokens": num_output_tokens,
+        }
+    })
+
+    return {
+        "output": response_message,
+        "tool_outputs": tool_outputs,
+        "messages": messages,
+        "stats": {
+            "num_model_calls": num_model_calls,
+            "num_input_tokens": num_input_tokens,
+            "num_output_tokens": num_output_tokens,
+        }
+    }
+
+
+def reset():  # forget the cached client so the next get_client() builds a new one
+    global _client
+    _client = None
\ No newline at end of file
diff --git a/libem/parameter.py b/libem/parameter.py
index 60b50c2..1535013 100644
--- a/libem/parameter.py
+++ b/libem/parameter.py
@@ -4,7 +4,7 @@
     default="gpt-4o-2024-08-06",
     options=["gpt-4o","gpt-4o-mini", "gpt-4",
              "gpt-4-turbo", "gpt-3.5-turbo",
-             "llama3", "llama3.1"]
+             "llama3", "llama3.1", "claude-3-5-sonnet-20240620"]
 )
 temperature = Parameter(
     default=0,

From eb1f549375ee083b43cf1e71f1e3db91e39e33e7 Mon Sep 17 00:00:00 2001
From: Char15Xu <charlesxuty3@berkeley.edu>
Date: Fri, 9 Aug 2024 18:02:43 -0700
Subject: [PATCH 2/6] fixed conflict error in libem/core/model/__init__.py

---
 libem/core/model/__init__.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/libem/core/model/__init__.py b/libem/core/model/__init__.py
index 51cac24..4e68086 100644
--- a/libem/core/model/__init__.py
+++ b/libem/core/model/__init__.py
@@ -15,8 +15,6 @@ async def async_call(*args, **kwargs) -> dict:
         return llama.call(*args, **kwargs)
     elif kwargs.get("model", "") == "llama3.1":
         return llama.call(*args, **kwargs)
-    elif kwargs.get("model", "") == "llama3.1":
-        return llama.call(*args, **kwargs)
     elif kwargs.get("model", "") == "claude-3-5-sonnet-20240620":
         return await claude.call(*args, **kwargs) 
     else:

From 963c3b1655a8f4d8c1b47cab2445c6794908ad67 Mon Sep 17 00:00:00 2001
From: Char15Xu <charlesxuty3@berkeley.edu>
Date: Tue, 13 Aug 2024 00:05:41 -0700
Subject: [PATCH 3/6] merge changes

---
 .DS_Store           | Bin 6148 -> 0 bytes
 benchmark/.DS_Store | Bin 6148 -> 0 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 .DS_Store
 delete mode 100644 benchmark/.DS_Store

diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index 2108caa4ed676c146b85d919c6d3217b69c16dd1..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeHKJ5EC}5S%3`f@o4w`U>2@ioyxFKz>L_1R{abzlw8lG-f|Vq6b}QqFHG@_Ik&b
zr+E7ofGv)@2Vf3hMt8)Q4`cIv_mSOH#E5jB@q#z3@qyRf=P3L4fOBuL!86{mJ>_rr
zo5Nx6+cGI21*Cu!kOETRf(le|o!wpVOdTZ!q`-A3;NOQvckG2jVthI{L<>NiG91Qv
z^b*A80b(y45*eXcQi(~mYB4P7jJL|`g+pS}Vby$C-E7sNSlrI@TcpE!qDCno1<n<i
z<Z|xy|B?Pj|9?)>N(xAUt5U$G%ctdnPpaBFdYsqVMqklA=bP@vc~Ce+IVMIq=EBSI
deI#XG^Evl>;gA?~#)D4O&w%S9lLG&(zz<2<6@LH#

diff --git a/benchmark/.DS_Store b/benchmark/.DS_Store
deleted file mode 100644
index 601d5e8aa0d9ab99cb009600950a5548a9560fbb..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeHKJ5EC}5S)cbM50NV(pTUHRuoRa1rQ|NlOpj*|0>SK(K7ofh#quF5i~2U$6oK)
z@)U310<g_-_XMl}Ea{H;@?mbi?>@7;iWrg3GX{KO+>P(U+hLadd%(F5_`n_`jz9T_
z!M9{mKnh3!DIf);z=aj40=s;?@VPop3P^$5SHQmyjqcbBr^NVlaEKOwxL`Pp^XMgr
z%>%?<I3+Sdv!oJ}YSm&`(iv}+*9)h_q{FKDu)5i*L$SD>=eJ0Q^+b(QKnk2IaG%SS
z*Z*hwAN~J1Nh>KJ1^$%+w%ERGH+)jn*4g8{);9Vp-E+R_Zkz{&LzH7;lw&Tu9IqoO
a^P11O-wUV2pfetHqJ9Qk7nu~ewgM;M))mA6


From 7d2d09140e2822fb4bd9e55a3551fdc40a93d489 Mon Sep 17 00:00:00 2001
From: Char15Xu <charlesxuty3@berkeley.edu>
Date: Tue, 13 Aug 2024 00:12:49 -0700
Subject: [PATCH 4/6] add claude key configure support in libem.cli and syntax
 revisions

---
 Makefile                     | 11 ++++++-----
 cli/libem                    | 20 +++++++++++++++-----
 libem/core/model/__init__.py | 16 ++++++++--------
 3 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/Makefile b/Makefile
index dbf84c8..dd49133 100644
--- a/Makefile
+++ b/Makefile
@@ -60,6 +60,11 @@ async_batch:
 	python examples/optimize/async_batch.py
 batch: async_batch
 
+# claude model example
+.PHONY: claude
+claude:
+	python examples/model/claude.py
+
 # opensource model examples
 .PHONY: mlx_lm llama_cpp llama_ex local
 mlx_lm:
@@ -152,8 +157,4 @@ link:
 duckdb:
 	python examples/apps/integration/duckdb_cluster.py
 mongodb:
-	python examples/apps/integration/mongodb_cluster.py
-
-.PHONY: claude
-claude:
-	python examples/model/claude.py
\ No newline at end of file
+	python examples/apps/integration/mongodb_cluster.py
\ No newline at end of file
diff --git a/cli/libem b/cli/libem
index 81b870d..11c8b3f 100644
--- a/cli/libem
+++ b/cli/libem
@@ -82,13 +82,23 @@ def configure(args):
         config = {}
 
     # Prompt for OPENAI_API_KEY
-    existing_key = config.get('OPENAI_API_KEY', '')
-    new_key = input(f"Enter OPENAI_API_KEY (press Enter to keep the existing key: "
-                    f"'{mask_key(existing_key)}'): ").strip()
+    existing_openai_key = config.get('OPENAI_API_KEY', '')
+    new_openai_key = input(f"Enter OPENAI_API_KEY (press Enter to keep the existing key: "
+                    f"'{mask_key(existing_openai_key)}'): ").strip()
 
     # If no input, keep the existing key; otherwise, update
-    if new_key:
-        config['OPENAI_API_KEY'] = new_key
+    if new_openai_key:
+        config['OPENAI_API_KEY'] = new_openai_key
+    else:
+        print("No input provided. Using existing API key.")
+
+    existing_claude_key = config.get('CLAUDE_API_KEY', '')
+    new_claude_key = input(f"Enter CLAUDE_API_KEY (press Enter to keep the existing key: "
+                    f"'{mask_key(existing_claude_key)}'): ").strip()
+
+    # If no input, keep the existing key; otherwise, update
+    if new_claude_key:
+        config['CLAUDE_API_KEY'] = new_claude_key
     else:
         print("No input provided. Using existing API key.")
 
diff --git a/libem/core/model/__init__.py b/libem/core/model/__init__.py
index 4e68086..807e9d5 100644
--- a/libem/core/model/__init__.py
+++ b/libem/core/model/__init__.py
@@ -11,16 +11,16 @@ def call(*args, **kwargs) -> dict:
 
 
 async def async_call(*args, **kwargs) -> dict:
-    if kwargs.get("model", "") == "llama3":
-        return llama.call(*args, **kwargs)
-    elif kwargs.get("model", "") == "llama3.1":
-        return llama.call(*args, **kwargs)
-    elif kwargs.get("model", "") == "claude-3-5-sonnet-20240620":
-        return await claude.call(*args, **kwargs) 
-    else:
-        return await openai.async_call(*args, **kwargs)
+    match kwargs.get("model", ""):
+        case "llama3" | "llama3.1":
+            return llama.call(*args, **kwargs)
+        case "claude-3-5-sonnet-20240620":  # awaited directly, like openai
+            return await claude.async_call(*args, **kwargs)
+        case _:
+            return await openai.async_call(*args, **kwargs)
 
 
 def reset():
     openai.reset()
+    claude.reset()
     llama.reset()

From d2e5c9e9781fbfb20e30f33dcdc1ea5ea12e8c87 Mon Sep 17 00:00:00 2001
From: Char15Xu <charlesxuty3@berkeley.edu>
Date: Tue, 20 Aug 2024 15:17:53 -0700
Subject: [PATCH 5/6] claude successfully works on examples/model/claude.py

---
 libem/core/model/claude.py | 2 +-
 libem/match/prompt.py      | 2 +-
 requirements.txt           | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/libem/core/model/claude.py b/libem/core/model/claude.py
index e3d7b4f..59c5e7b 100644
--- a/libem/core/model/claude.py
+++ b/libem/core/model/claude.py
@@ -3,7 +3,7 @@
 import httpx
 import importlib
 import inspect
-import asyncio
+
 from anthropic import (
     AsyncAnthropic, APITimeoutError
 )
diff --git a/libem/match/prompt.py b/libem/match/prompt.py
index c4548d8..ba4f58c 100644
--- a/libem/match/prompt.py
+++ b/libem/match/prompt.py
@@ -26,7 +26,7 @@
         lambda: "strict"
         if model() in {
             "llama3", "llama3.1",
-            "gpt-4o-2024-08-06"
+            "gpt-4o-2024-08-06", "claude-3-5-sonnet-20240620",
         }
         else "standard"
     ),
diff --git a/requirements.txt b/requirements.txt
index 34e7c90..976590b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,3 +22,4 @@ tqdm
 recordlinkage
 duckdb
 pymongo
+anthropic

From 9b4c0a5ed9297e90b236765afe0015886b7efc00 Mon Sep 17 00:00:00 2001
From: Char15Xu <charlesxuty3@berkeley.edu>
Date: Sat, 24 Aug 2024 15:17:54 -0700
Subject: [PATCH 6/6] delete print statement in cli/libem

---
 benchmark/README.md | 2 +-
 cli/libem           | 4 ----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/benchmark/README.md b/benchmark/README.md
index e2936a6..77a2e7b 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -176,7 +176,7 @@ python -m benchmark.run -s <suite-name>
 
 ### Meta-Llama3-8B-Instruct-8bit
 
-> Llama model runs on Apple M2 silicon
+> Llama model runs on Apple M1 silicon
 
 |   Benchmark    | Precision | Recall |  F1   | Cost ($) | Pairs per $ | Throughput (pps) |
 |:--------------:|:---------:|:------:|:-----:|:----------------:|:----------------:|:----------------:|
diff --git a/cli/libem b/cli/libem
index 11c8b3f..4db115d 100644
--- a/cli/libem
+++ b/cli/libem
@@ -89,8 +89,6 @@ def configure(args):
     # If no input, keep the existing key; otherwise, update
     if new_openai_key:
         config['OPENAI_API_KEY'] = new_openai_key
-    else:
-        print("No input provided. Using existing API key.")
 
     existing_claude_key = config.get('CLAUDE_API_KEY', '')
     new_claude_key = input(f"Enter CLAUDE_API_KEY (press Enter to keep the existing key: "
@@ -99,8 +97,6 @@ def configure(args):
     # If no input, keep the existing key; otherwise, update
     if new_claude_key:
         config['CLAUDE_API_KEY'] = new_claude_key
-    else:
-        print("No input provided. Using existing API key.")
 
     ...