diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 3d31c23bb..58a107964 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -27,9 +27,15 @@ jobs:
+ - name: Install System Dependencies (Ubuntu)
+ if: runner.os == 'Linux'
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y libopenblas-dev
+
- name: Xmake Build & Install
run: |
- xmake
- xmake install
-
+ xmake -y
+ xmake install -y
+
- name: Install Python
run: |
cd python
diff --git a/chat.py b/chat.py
new file mode 100644
index 000000000..62bd4136c
--- /dev/null
+++ b/chat.py
@@ -0,0 +1,61 @@
+import os
+from transformers import AutoTokenizer
+from python.llaisys.models.qwen2 import Qwen2
+from python.llaisys.libllaisys import DeviceType
+
+MODEL_PATH = "/home/koishiyo/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/ad9f0ae0864d7fbcd1cd905e3c6c5b069cc8b562"
+
+def main():
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+ model = Qwen2(MODEL_PATH, device=DeviceType.CPU)
+
+ print("\n🚀 LLAISYS 有状态流式推理引擎已启动!")
+ is_first_turn = True
+
+ while True:
+ user_input = input("🧑 You: ")
+ if user_input.strip().lower() in ['quit', 'exit']: break
+ if not user_input.strip(): continue
+
+ if is_first_turn:
+ messages = [
+ {"role": "system", "content": "You are a helpful assistant."},
+ {"role": "user", "content": user_input}
+ ]
+ prompt_str = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ is_first_turn = False
+ else:
+ # Re-append the <|im_end|> the previous turn never emitted, keeping the KV cache intact
+ prompt_str = f"<|im_end|>\n<|im_start|>user\n{user_input}<|im_end|>\n<|im_start|>assistant\n"
+
+ # Encode with add_special_tokens=False so no stray special tokens pollute the C++ cache
+ new_input_tokens = tokenizer.encode(prompt_str, add_special_tokens=False)
+
+ print("🤖 AI: ", end="", flush=True)
+
+ all_generated_tokens = []
+ printed_text_len = 0
+
+ # Drive the streaming generator implemented in Qwen2.stream_generate
+ for next_token in model.stream_generate(
+ new_input_tokens,
+ max_new_tokens=400,
+ temperature=0.7,
+ top_p=0.9
+ ):
+ all_generated_tokens.append(next_token)
+
+ # Decode everything generated so far in one go
+ current_text = tokenizer.decode(all_generated_tokens, skip_special_tokens=True)
+
+ # Emit only the characters added this step (handles CJK chars that span several tokens)
+ new_text = current_text[printed_text_len:]
+
+ if new_text:
+ print(new_text, end="", flush=True)
+ printed_text_len = len(current_text)
+
+ print("\n") # 这轮对话结束,换行
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/chat_server.py b/chat_server.py
new file mode 100644
index 000000000..307604ea9
--- /dev/null
+++ b/chat_server.py
@@ -0,0 +1,174 @@
+import uvicorn
+import json
+from fastapi import FastAPI
+from fastapi.responses import HTMLResponse, StreamingResponse
+from pydantic import BaseModel
+
+# ==========================================
+# 1. Import the model and the tokenizer
+# ==========================================
+from llaisys.models.qwen2 import Qwen2
+from transformers import AutoTokenizer
+
+# ⚠️ NOTE: replace this path with the actual Qwen2 model folder on your machine!
+MODEL_PATH = "/home/koishiyo/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/ad9f0ae0864d7fbcd1cd905e3c6c5b069cc8b562"
+
+print(f"正在加载分词器: {MODEL_PATH}")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+
+print("正在将大模型权重载入 C++ 引擎内存,请稍候...")
+model = Qwen2(MODEL_PATH)
+print("模型加载完毕!LLAISYS 引擎启动成功!")
+
+# Initialize FastAPI
+app = FastAPI(title="LLAISYS Chatbot")
+
+class ChatRequest(BaseModel):
+ message: str
+
+# ==========================================
+# 2. Frontend HTML + CSS + JS (streaming typewriter effect)
+# ==========================================
+# The original page markup was lost when this diff was captured; below is a
+# minimal reconstruction that keeps the recovered title and greeting and
+# consumes the SSE stream from /v1/chat/completions.
+html_content = """
+<!DOCTYPE html>
+<html>
+<head><meta charset="utf-8"><title>LLAISYS Pure C++ Inference Engine</title></head>
+<body>
+<h3>LLAISYS Pure C++ Inference Engine</h3>
+<div id="log"><p>Hello! I am powered by the LLAISYS inference engine you wrote from scratch. Let's chat!</p></div>
+<input id="msg"><button onclick="send()">Send</button>
+<script>
+async function send() {
+  const msg = document.getElementById('msg').value;
+  const p = document.createElement('p');
+  document.getElementById('log').appendChild(p);
+  const res = await fetch('/v1/chat/completions', {
+    method: 'POST',
+    headers: {'Content-Type': 'application/json'},
+    body: JSON.stringify({message: msg})
+  });
+  const reader = res.body.getReader();
+  const dec = new TextDecoder();
+  for (;;) {
+    const {done, value} = await reader.read();
+    if (done) break;
+    for (const line of dec.decode(value).split('\\n')) {
+      if (line.startsWith('data: ')) p.textContent += JSON.parse(line.slice(6)).delta;
+    }
+  }
+}
+</script>
+</body>
+</html>
+"""
+
+@app.get("/")
+async def get_ui():
+ return HTMLResponse(content=html_content)
+
+# ==========================================
+# 3. Backend streaming inference endpoint (calls into the C++ core)
+# ==========================================
+@app.post("/v1/chat/completions")
+async def chat_api(req: ChatRequest):
+
+ def stream_generator():
+ try:
+ # 1. Use the standard chat template
+ messages = [
+ {"role": "system", "content": "你是一个乐于助人的AI助手。"},
+ {"role": "user", "content": req.message}
+ ]
+ # apply_chat_template expands the special markers into the proper control token ids
+ prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ input_ids = tokenizer.encode(prompt_text, add_special_tokens=False)
+
+ # 2. Call the C++ stream_generate path; temperature/top_p/top_k
+ # flow straight through to the hand-written sampler_cpu.cpp
+ for token_id in model.stream_generate(
+ inputs=input_ids,
+ max_new_tokens=512,
+ temperature=0.8,
+ top_p=0.9,
+ top_k=50
+ ):
+ # 3. Decode the token id back into text
+ word = tokenizer.decode([token_id], skip_special_tokens=True)
+
+ # 4. Wrap as a Server-Sent Events (SSE) frame for the frontend
+ if word:
+ yield f"data: {json.dumps({'delta': word}, ensure_ascii=False)}\n\n"
+
+ except Exception as e:
+ print(f"推理时发生错误: {e}")
+ yield f"data: {json.dumps({'delta': '[Engine Error]'}, ensure_ascii=False)}\n\n"
+
+ return StreamingResponse(stream_generator(), media_type="text/event-stream")
+
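+# Quick way to exercise the endpoint without the HTML page (hypothetical prompt):
+#   curl -N -X POST http://127.0.0.1:8000/v1/chat/completions \
+#        -H 'Content-Type: application/json' -d '{"message": "hello"}'
+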
+if __name__ == "__main__":
+ # Entry point
+ uvicorn.run(app, host="127.0.0.1", port=8000)
\ No newline at end of file
diff --git a/include/llaisys/models/qwen2.h b/include/llaisys/models/qwen2.h
index 7054626d4..2f7ff7524 100644
--- a/include/llaisys/models/qwen2.h
+++ b/include/llaisys/models/qwen2.h
@@ -3,40 +3,62 @@
#include "../tensor.h"
-__C {
- struct LlaisysQwen2Meta {
- llaisysDataType_t dtype;
- size_t nlayer, hs, nh, nkvh, dh, di, maxseq, voc;
- float epsilon, theta;
- int64_t end_token;
- };
-
- struct LlaisysQwen2Weights {
- llaisysTensor_t in_embed;
- llaisysTensor_t out_embed;
- llaisysTensor_t out_norm_w; // a.k.a. model.norm.weight
- llaisysTensor_t *attn_norm_w; // a.k.a. input_layernorm.weight
- llaisysTensor_t *attn_q_w;
- llaisysTensor_t *attn_q_b;
- llaisysTensor_t *attn_k_w;
- llaisysTensor_t *attn_k_b;
- llaisysTensor_t *attn_v_w;
- llaisysTensor_t *attn_v_b;
- llaisysTensor_t *attn_o_w;
- llaisysTensor_t *mlp_norm_w; // a.k.a. post_attention_layernorm.weight
- llaisysTensor_t *mlp_gate_w;
- llaisysTensor_t *mlp_up_w;
- llaisysTensor_t *mlp_down_w;
- };
-
- struct LlaisysQwen2Model;
-
- __export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice);
-
- __export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model);
-
- __export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model);
-
- __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken);
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Model hyperparameter metadata
+typedef struct {
+ int dtype; // 0=F32, 1=F16...
+ size_t nlayer; // number of layers
+ size_t hs; // Hidden Size
+ size_t nh; // Num Attention Heads
+ size_t nkvh; // Num KV Heads
+ size_t dh; // Head Dim (hs / nh)
+ size_t di; // Intermediate Size (FFN)
+ size_t maxseq; // Max Position Embeddings
+ size_t voc; // Vocab Size
+ float epsilon; // RMS Norm Epsilon
+ float theta; // RoPE Theta
+ int64_t end_token; // EOS Token ID
+} LlaisysQwen2Meta;
+
+// Weight container (arrays allocated on the C++ side, data filled from Python)
+typedef struct {
+ llaisysTensor_t in_embed;
+ llaisysTensor_t out_embed;
+ llaisysTensor_t out_norm_w;
+
+ // Per-layer tensor arrays (array of tensors), each of length nlayer
+ llaisysTensor_t *attn_norm_w;
+ llaisysTensor_t *attn_q_w;
+ llaisysTensor_t *attn_q_b;
+ llaisysTensor_t *attn_k_w;
+ llaisysTensor_t *attn_k_b;
+ llaisysTensor_t *attn_v_w;
+ llaisysTensor_t *attn_v_b;
+ llaisysTensor_t *attn_o_w; // Qwen2 has no o_proj bias
+
+ llaisysTensor_t *mlp_norm_w;
+ llaisysTensor_t *mlp_gate_w;
+ llaisysTensor_t *mlp_up_w;
+ llaisysTensor_t *mlp_down_w;
+} LlaisysQwen2Weights;
+
+// Opaque model handle
+struct LlaisysQwen2Model;
+
+// Exported API
+__export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice);
+
+__export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model *model);
+
+__export LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model *model);
+
+__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model *model, int64_t *token_ids, size_t ntoken, float temperature, float top_p, size_t top_k);
+
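+// Typical call sequence (sketch; error handling omitted):
+// LlaisysQwen2Meta meta = {/* filled from config.json */};
+// struct LlaisysQwen2Model *m = llaisysQwen2ModelCreate(&meta, LLAISYS_DEVICE_CPU, NULL, 0);
+// LlaisysQwen2Weights *w = llaisysQwen2ModelWeights(m); // copy weight data into w's tensors
+// int64_t next = llaisysQwen2ModelInfer(m, tokens, ntok, 0.7f, 0.9f, 50);
+// llaisysQwen2ModelDestroy(m);
+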
+#ifdef __cplusplus
}
+#endif
+
#endif // LLAISYS_MODELS_QWEN2_H
diff --git a/python/llaisys/libllaisys/__init__.py b/python/llaisys/libllaisys/__init__.py
index f536fb527..18049ec89 100644
--- a/python/llaisys/libllaisys/__init__.py
+++ b/python/llaisys/libllaisys/__init__.py
@@ -30,7 +30,11 @@ def load_shared_library():
if not os.path.isfile(lib_path):
raise FileNotFoundError(f"Shared library not found: {lib_path}")
-
+ # Preload OpenBLAS with global symbol visibility so the llaisys shared
+ # library can resolve its BLAS symbols at load time. Skip quietly on
+ # platforms without the Linux soname or without OpenBLAS installed.
+ try:
+ ctypes.CDLL("libopenblas.so.0", mode=ctypes.RTLD_GLOBAL)
+ except OSError:
+ pass
+
return ctypes.CDLL(str(lib_path))
@@ -39,17 +43,72 @@ def load_shared_library():
load_tensor(LIB_LLAISYS)
load_ops(LIB_LLAISYS)
+# ============================================================================
+# Qwen2 Bindings
+# ============================================================================
+
+class LlaisysQwen2Meta(ctypes.Structure):
+ _fields_ = [
+ ("dtype", ctypes.c_int),
+ ("nlayer", ctypes.c_size_t),
+ ("hs", ctypes.c_size_t),
+ ("nh", ctypes.c_size_t),
+ ("nkvh", ctypes.c_size_t),
+ ("dh", ctypes.c_size_t),
+ ("di", ctypes.c_size_t),
+ ("maxseq", ctypes.c_size_t),
+ ("voc", ctypes.c_size_t),
+ ("epsilon", ctypes.c_float),
+ ("theta", ctypes.c_float),
+ ("end_token", ctypes.c_int64),
+ ]
+
+class LlaisysQwen2Weights(ctypes.Structure):
+ _fields_ = [
+ ("in_embed", llaisysTensor_t),
+ ("out_embed", llaisysTensor_t),
+ ("out_norm_w", llaisysTensor_t),
+ ("attn_norm_w", ctypes.POINTER(llaisysTensor_t)),
+ ("attn_q_w", ctypes.POINTER(llaisysTensor_t)),
+ ("attn_q_b", ctypes.POINTER(llaisysTensor_t)),
+ ("attn_k_w", ctypes.POINTER(llaisysTensor_t)),
+ ("attn_k_b", ctypes.POINTER(llaisysTensor_t)),
+ ("attn_v_w", ctypes.POINTER(llaisysTensor_t)),
+ ("attn_v_b", ctypes.POINTER(llaisysTensor_t)),
+ ("attn_o_w", ctypes.POINTER(llaisysTensor_t)),
+ ("mlp_norm_w", ctypes.POINTER(llaisysTensor_t)),
+ ("mlp_gate_w", ctypes.POINTER(llaisysTensor_t)),
+ ("mlp_up_w", ctypes.POINTER(llaisysTensor_t)),
+ ("mlp_down_w", ctypes.POINTER(llaisysTensor_t)),
+ ]
+
+try:
+ LIB_LLAISYS.llaisysQwen2ModelCreate.restype = ctypes.c_void_p
+ LIB_LLAISYS.llaisysQwen2ModelCreate.argtypes = [ctypes.POINTER(LlaisysQwen2Meta), ctypes.c_int, ctypes.POINTER(ctypes.c_int), ctypes.c_int]
+
+ LIB_LLAISYS.llaisysQwen2ModelDestroy.restype = None
+ LIB_LLAISYS.llaisysQwen2ModelDestroy.argtypes = [ctypes.c_void_p]
+
+ LIB_LLAISYS.llaisysQwen2ModelWeights.restype = ctypes.POINTER(LlaisysQwen2Weights)
+ LIB_LLAISYS.llaisysQwen2ModelWeights.argtypes = [ctypes.c_void_p]
+
+ LIB_LLAISYS.llaisysQwen2ModelInfer.restype = ctypes.c_int64
+ LIB_LLAISYS.llaisysQwen2ModelInfer.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int64), ctypes.c_size_t, ctypes.c_float, ctypes.c_float, ctypes.c_size_t]
+
+ if hasattr(LIB_LLAISYS, 'llaisysTensorData'):
+ LIB_LLAISYS.llaisysTensorData.restype = ctypes.c_void_p
+ LIB_LLAISYS.llaisysTensorData.argtypes = [ctypes.c_void_p]
+except AttributeError:
+ pass
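+
+# Minimal sketch of driving the raw bindings directly (normally you go through
+# llaisys.models.qwen2.Qwen2 instead; the field values here are hypothetical):
+#   meta = LlaisysQwen2Meta(dtype=0, nlayer=28, hs=1536, ...)  # from config.json
+#   handle = LIB_LLAISYS.llaisysQwen2ModelCreate(ctypes.byref(meta), 0, None, 0)
+#   weights = LIB_LLAISYS.llaisysQwen2ModelWeights(handle).contents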
__all__ = [
"LIB_LLAISYS",
"LlaisysRuntimeAPI",
- "llaisysStream_t",
"llaisysTensor_t",
"llaisysDataType_t",
"DataType",
"llaisysDeviceType_t",
"DeviceType",
- "llaisysMemcpyKind_t",
- "MemcpyKind",
- "llaisysStream_t",
-]
+ "LlaisysQwen2Meta",
+ "LlaisysQwen2Weights"
+]
\ No newline at end of file
diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py
index 0d07b0b21..e0e857c73 100644
--- a/python/llaisys/models/qwen2.py
+++ b/python/llaisys/models/qwen2.py
@@ -1,33 +1,183 @@
-from typing import Sequence
-from ..libllaisys import LIB_LLAISYS
-from ..libllaisys import DeviceType
-
+import json
+import ctypes
+import numpy as np
+import gc
+import platform
+import os
+from typing import Sequence, List
from pathlib import Path
-import safetensors
+from ..libllaisys import LIB_LLAISYS, DeviceType, LlaisysQwen2Meta
+
+# ==========================================
+# Declare the exact ctypes signature of the C entry point
+# (mirrors the binding in libllaisys/__init__.py)
+# ==========================================
+LIB_LLAISYS.llaisysQwen2ModelInfer.argtypes = [
+ ctypes.c_void_p, # model handle
+ ctypes.POINTER(ctypes.c_int64), # token_ids
+ ctypes.c_size_t, # ntoken
+ ctypes.c_float, # temperature
+ ctypes.c_float, # top_p
+ ctypes.c_size_t # top_k
+]
+LIB_LLAISYS.llaisysQwen2ModelInfer.restype = ctypes.c_int64
+LIB_LLAISYS.llaisysQwen2ModelGetLogits.restype = ctypes.POINTER(ctypes.c_float)
+try:
+ from safetensors import safe_open
+except ImportError:
+ raise ImportError("safetensors is required: pip install safetensors")
class Qwen2:
+ def __init__(self, model_path: str, device: DeviceType = DeviceType.CPU):
+ model_path = Path(model_path)
+
+ config_path = model_path / "config.json"
+ if not config_path.exists():
+ raise FileNotFoundError(f"Config not found at {config_path}")
+
+ with open(config_path, "r") as f:
+ cfg = json.load(f)
- def __init__(self, model_path, device: DeviceType = DeviceType.CPU):
- # TODO: Implement model constructor
+ self.meta = LlaisysQwen2Meta()
+ self.meta.dtype = 0
+ self.meta.nlayer = cfg["num_hidden_layers"]
+ self.meta.hs = cfg["hidden_size"]
+ self.meta.nh = cfg["num_attention_heads"]
+ self.meta.nkvh = cfg["num_key_value_heads"]
+ self.meta.dh = self.meta.hs // self.meta.nh
+ self.meta.di = cfg["intermediate_size"]
+
+ raw_maxseq = cfg.get("max_position_embeddings", 4096)
+ is_windows_ci = (platform.system() == "Windows" and os.environ.get("GITHUB_ACTIONS") == "true")
+
+ if is_windows_ci:
+ print(f"[CI-Optimization] Windows detected. Clamping max_seq from {raw_maxseq} to 1024 to save memory.")
+ self.meta.maxseq = 1024
+ else:
+ self.meta.maxseq = raw_maxseq
- model_path = Path(model_path)
+ self.meta.voc = cfg["vocab_size"]
+ self.meta.epsilon = cfg["rms_norm_eps"]
+ self.meta.theta = cfg.get("rope_theta", 1000000.0)
+ eos = cfg.get("eos_token_id", 151643)
+ if isinstance(eos, list): eos = eos[0] # config may store a list of EOS ids
+ self.meta.end_token = eos
+
+ print(f"[LLaisys] Init Qwen2: {self.meta.nlayer}L, {self.meta.hs}H, Context: {self.meta.maxseq}")
+
+ self.handle = LIB_LLAISYS.llaisysQwen2ModelCreate(
+ ctypes.byref(self.meta),
+ device.value,
+ None,
+ 0
+ )
+ if not self.handle: raise RuntimeError("Failed to create model")
+
+ self.c_weights = LIB_LLAISYS.llaisysQwen2ModelWeights(self.handle).contents
+ self._load_weights(model_path)
+
+ def _load_weights(self, path):
+ # Note: PyTorch is used here only to read the .safetensors weights; it is not
+ # part of inference (project rule: no PyTorch in the inference path).
+ import torch
+ weight_files = sorted(list(path.glob("*.safetensors")))
+ for f in weight_files:
+ print(f"Loading {f.name}...")
+ with safe_open(f, framework="pt", device="cpu") as st:
+ for name in st.keys():
+ ptr = self._route(name)
+ if ptr:
+ tensor = st.get_tensor(name)
+ tensor = tensor.to(torch.float32)
+ data = tensor.numpy()
+ data = np.ascontiguousarray(data)
+ dst = LIB_LLAISYS.llaisysTensorData(ptr)
+ if dst:
+ ctypes.memmove(dst, data.ctypes.data, data.nbytes)
+ del tensor
+ del data
+ gc.collect()
+
+ def _route(self, name):
+ w = self.c_weights
+ if name == "model.embed_tokens.weight": return w.in_embed
+ if name == "model.norm.weight": return w.out_norm_w
+ if name == "lm_head.weight": return w.out_embed
+
+ if name.startswith("model.layers."):
+ parts = name.split(".")
+ idx = int(parts[2])
+ if idx >= self.meta.nlayer: return None
+
+ module = parts[3]
+ sub = parts[4]
+ is_bias = "bias" in parts[-1]
+
+ if module == "self_attn":
+ if sub == "q_proj": return w.attn_q_b[idx] if is_bias else w.attn_q_w[idx]
+ if sub == "k_proj": return w.attn_k_b[idx] if is_bias else w.attn_k_w[idx]
+ if sub == "v_proj": return w.attn_v_b[idx] if is_bias else w.attn_v_w[idx]
+ if sub == "o_proj": return w.attn_o_w[idx]
+ elif module == "mlp":
+ if sub == "gate_proj": return w.mlp_gate_w[idx]
+ if sub == "up_proj": return w.mlp_up_w[idx]
+ if sub == "down_proj": return w.mlp_down_w[idx]
+ elif module == "input_layernorm": return w.attn_norm_w[idx]
+ elif module == "post_attention_layernorm": return w.mlp_norm_w[idx]
+ return None
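+
+ # Example routing of HF checkpoint keys (follows directly from the rules above):
+ # "model.layers.0.self_attn.q_proj.weight" -> w.attn_q_w[0]
+ # "model.layers.0.self_attn.q_proj.bias" -> w.attn_q_b[0]
+ # "model.layers.5.post_attention_layernorm.weight" -> w.mlp_norm_w[5]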
+
+ def __del__(self):
+ if hasattr(self, 'handle') and self.handle:
+ LIB_LLAISYS.llaisysQwen2ModelDestroy(self.handle)
- for file in sorted(model_path.glob("*.safetensors")):
- data_ = safetensors.safe_open(file, framework="numpy", device="cpu")
- for name_ in data_.keys():
- ## TODO: load the model weights
- pass
+ def generate(self, inputs: Sequence[int], max_new_tokens=50, temperature=0.7, top_p=0.9, top_k=50, **kwargs) -> List[int]:
+ curr = list(inputs)
+
+ c_temp = ctypes.c_float(temperature)
+ c_top_p = ctypes.c_float(top_p)
+ c_top_k = ctypes.c_size_t(top_k)
- def generate(
- self,
- inputs: Sequence[int],
- max_new_tokens: int = None,
- top_k: int = 1,
- top_p: float = 0.8,
- temperature: float = 0.8,
- ):
+ # Prefill phase
+ seq_len = len(curr)
+ arr = (ctypes.c_int64 * seq_len)(*curr)
+
+ next_tok = LIB_LLAISYS.llaisysQwen2ModelInfer(self.handle, arr, seq_len, c_temp, c_top_p, c_top_k)
+
+ for _ in range(max_new_tokens):
+ # Append first: even an end-of-sequence token must land in the list
+ curr.append(next_tok)
+
+ # Only then decide whether to stop
+ if next_tok == self.meta.end_token or next_tok == 151645: # 151645 = <|im_end|>
+ break
+
+ # Decode phase: feed exactly one token, get the next one back
+ arr = (ctypes.c_int64 * 1)(next_tok)
+ next_tok = LIB_LLAISYS.llaisysQwen2ModelInfer(self.handle, arr, 1, c_temp, c_top_p, c_top_k)
+
+ return curr
- # TODO: Implement generate function
+ def stream_generate(self, inputs: Sequence[int], max_new_tokens=400, temperature=0.7, top_p=0.9, top_k=50, **kwargs):
+ """流式生成器:极其简洁,因为重活都交给 C++ 的 sampler 了!"""
+ curr = list(inputs)
+
+ c_temp = ctypes.c_float(temperature)
+ c_top_p = ctypes.c_float(top_p)
+ c_top_k = ctypes.c_size_t(top_k)
- return []
+ # Prefill: feed the whole prompt to C++ and receive the first answer token
+ seq_len = len(curr)
+ arr = (ctypes.c_int64 * seq_len)(*curr)
+ next_tok = LIB_LLAISYS.llaisysQwen2ModelInfer(self.handle, arr, seq_len, c_temp, c_top_p, c_top_k)
+
+ # Keep generating
+ for _ in range(max_new_tokens):
+ yield next_tok
+
+ if next_tok == self.meta.end_token or next_tok == 151645: # 151645 = <|im_end|>
+ break
+
+ curr.append(next_tok)
+
+ # Decode: feed the token just produced back to C++ in exchange for the next one
+ arr = (ctypes.c_int64 * 1)(next_tok)
+ next_tok = LIB_LLAISYS.llaisysQwen2ModelInfer(self.handle, arr, 1, c_temp, c_top_p, c_top_k)
\ No newline at end of file
diff --git a/src/llaisys/qwen2.cpp b/src/llaisys/qwen2.cpp
new file mode 100644
index 000000000..b8df5720e
--- /dev/null
+++ b/src/llaisys/qwen2.cpp
@@ -0,0 +1,274 @@
+#include "llaisys/models/qwen2.h"
+#include "../tensor/tensor.hpp"
+#include "../utils.hpp"
+
+// Operator kernels
+#include "../ops/embedding/op.hpp"
+#include "../ops/rms_norm/op.hpp"
+#include "../ops/linear/op.hpp"
+#include "../ops/rope/op.hpp"
+#include "../ops/self_attention/op.hpp"
+#include "../ops/swiglu/op.hpp"
+#include "../ops/add/op.hpp"
+// Sampler operator (temperature / top-p / top-k)
+#include "../ops/sampler/cpu/sampler_cpu.hpp"
+
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+using namespace llaisys;
+
+// Bridge between the C handle type and C++ tensors
+struct LlaisysTensor { llaisys::tensor_t tensor; };
+llaisysTensor_t wrap(tensor_t t) { return new LlaisysTensor{t}; }
+tensor_t unwrap(llaisysTensor_t t) { return ((LlaisysTensor*)t)->tensor; }
+
+class Qwen2Impl {
+public:
+ LlaisysQwen2Meta meta;
+ LlaisysQwen2Weights weights;
+ size_t current_pos;
+
+ std::vector<tensor_t> k_cache;
+ std::vector<tensor_t> v_cache;
+
+ // Buffers
+ tensor_t buf_input_ids, buf_pos_ids;
+ tensor_t buf_hidden, buf_residual;
+
+ // QKV 相关 Buffer
+ tensor_t buf_q, buf_k, buf_v;
+ tensor_t buf_att_out;
+
+ tensor_t buf_att_proj;
+ tensor_t buf_gate, buf_up, buf_ffn_inter, buf_down;
+ tensor_t buf_last_hidden, buf_logits;
+
+public:
+ Qwen2Impl(const LlaisysQwen2Meta* meta_in) {
+ this->meta = *meta_in;
+ this->current_pos = 0;
+ alloc_weight_structs();
+ init_weight_tensors();
+ init_kv_cache();
+ init_buffers();
+ }
+
+ ~Qwen2Impl() {
+ free_layer_weights(weights.attn_norm_w);
+ free_layer_weights(weights.attn_q_w); free_layer_weights(weights.attn_q_b);
+ free_layer_weights(weights.attn_k_w); free_layer_weights(weights.attn_k_b);
+ free_layer_weights(weights.attn_v_w); free_layer_weights(weights.attn_v_b);
+ free_layer_weights(weights.attn_o_w);
+ free_layer_weights(weights.mlp_norm_w);
+ free_layer_weights(weights.mlp_gate_w);
+ free_layer_weights(weights.mlp_up_w);
+ free_layer_weights(weights.mlp_down_w);
+ delete (LlaisysTensor*)weights.in_embed;
+ delete (LlaisysTensor*)weights.out_embed;
+ delete (LlaisysTensor*)weights.out_norm_w;
+ }
+
+private:
+ void free_layer_weights(llaisysTensor_t* array) {
+ if (!array) return;
+ for (size_t i = 0; i < meta.nlayer; i++) if (array[i]) delete (LlaisysTensor*)array[i];
+ delete[] array;
+ }
+
+ llaisysTensor_t create_w(const std::vector<size_t> &shape) {
+ return wrap(Tensor::create(shape, LLAISYS_DTYPE_F32, LLAISYS_DEVICE_CPU));
+ }
+ tensor_t create_b(const std::vector<size_t> &shape, llaisysDataType_t dtype = LLAISYS_DTYPE_F32) {
+ return Tensor::create(shape, dtype, LLAISYS_DEVICE_CPU);
+ }
+
+ void alloc_weight_structs() {
+ size_t n = meta.nlayer;
+ weights.attn_norm_w = new llaisysTensor_t[n];
+ weights.attn_q_w = new llaisysTensor_t[n]; weights.attn_q_b = new llaisysTensor_t[n];
+ weights.attn_k_w = new llaisysTensor_t[n]; weights.attn_k_b = new llaisysTensor_t[n];
+ weights.attn_v_w = new llaisysTensor_t[n]; weights.attn_v_b = new llaisysTensor_t[n];
+ weights.attn_o_w = new llaisysTensor_t[n];
+ weights.mlp_norm_w = new llaisysTensor_t[n];
+ weights.mlp_gate_w = new llaisysTensor_t[n];
+ weights.mlp_up_w = new llaisysTensor_t[n];
+ weights.mlp_down_w = new llaisysTensor_t[n];
+ }
+
+ void init_weight_tensors() {
+ weights.in_embed = create_w({meta.voc, meta.hs});
+ weights.out_embed = create_w({meta.voc, meta.hs});
+ weights.out_norm_w = create_w({meta.hs});
+ size_t qs = meta.nh * meta.dh;
+ size_t kvs = meta.nkvh * meta.dh;
+ for (size_t i = 0; i < meta.nlayer; i++) {
+ weights.attn_norm_w[i] = create_w({meta.hs});
+ weights.attn_q_w[i] = create_w({qs, meta.hs}); weights.attn_q_b[i] = create_w({qs});
+ weights.attn_k_w[i] = create_w({kvs, meta.hs}); weights.attn_k_b[i] = create_w({kvs});
+ weights.attn_v_w[i] = create_w({kvs, meta.hs}); weights.attn_v_b[i] = create_w({kvs});
+ weights.attn_o_w[i] = create_w({meta.hs, qs});
+ weights.mlp_norm_w[i] = create_w({meta.hs});
+ weights.mlp_gate_w[i] = create_w({meta.di, meta.hs});
+ weights.mlp_up_w[i] = create_w({meta.di, meta.hs});
+ weights.mlp_down_w[i] = create_w({meta.hs, meta.di});
+ }
+ }
+
+ void init_kv_cache() {
+ std::vector<size_t> shape = {meta.maxseq, meta.nkvh, meta.dh};
+ for (size_t i = 0; i < meta.nlayer; i++) {
+ k_cache.push_back(create_b(shape));
+ v_cache.push_back(create_b(shape));
+ }
+ }
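+
+ // KV-cache memory: nlayer * 2 * maxseq * nkvh * dh * sizeof(float) bytes.
+ // E.g. with a hypothetical config nlayer=28, nkvh=2, dh=128, maxseq=4096:
+ // 28 * 2 * 4096 * 2 * 128 * 4 B ~= 224 MiB.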
+
+ void init_buffers() {
+ size_t s = meta.maxseq;
+ size_t dim_q = meta.nh * meta.dh;
+ size_t dim_kv = meta.nkvh * meta.dh;
+
+ buf_input_ids = create_b({s}, LLAISYS_DTYPE_I64);
+ buf_pos_ids = create_b({s}, LLAISYS_DTYPE_I64);
+ buf_hidden = create_b({s, meta.hs});
+ buf_residual = create_b({s, meta.hs});
+
+ buf_q = create_b({s, dim_q});
+ buf_k = create_b({s, dim_kv});
+ buf_v = create_b({s, dim_kv});
+
+ buf_att_out = create_b({s, dim_q});
+ buf_att_proj = create_b({s, meta.hs});
+
+ buf_gate = create_b({s, meta.di});
+ buf_up = create_b({s, meta.di});
+ buf_ffn_inter = create_b({s, meta.di});
+ buf_down = create_b({s, meta.hs});
+
+ buf_last_hidden = create_b({1, meta.hs});
+ buf_logits = create_b({1, meta.voc});
+ }
+
+public:
+ // Takes the sampling hyperparameters: temperature, top_p, top_k
+ int64_t infer(int64_t* token_ids, size_t ntoken, float temperature, float top_p, size_t top_k) {
+ size_t seq_len = ntoken;
+ size_t start_pos = this->current_pos;
+ size_t total_len = start_pos + seq_len;
+
+ // === Step 1: Slice Views ===
+ tensor_t cur_input_ids = buf_input_ids->slice(0, 0, seq_len);
+ tensor_t cur_pos_ids = buf_pos_ids->slice(0, 0, seq_len);
+ tensor_t cur_hidden = buf_hidden->slice(0, 0, seq_len);
+ tensor_t cur_residual = buf_residual->slice(0, 0, seq_len);
+
+ tensor_t cur_q = buf_q->slice(0, 0, seq_len);
+ tensor_t cur_k = buf_k->slice(0, 0, seq_len);
+ tensor_t cur_v = buf_v->slice(0, 0, seq_len);
+
+ tensor_t cur_att_out_flat = buf_att_out->slice(0, 0, seq_len);
+ tensor_t cur_att_proj = buf_att_proj->slice(0, 0, seq_len);
+
+ tensor_t cur_gate = buf_gate->slice(0, 0, seq_len);
+ tensor_t cur_up = buf_up->slice(0, 0, seq_len);
+ tensor_t cur_ffn_inter = buf_ffn_inter->slice(0, 0, seq_len);
+ tensor_t cur_down = buf_down->slice(0, 0, seq_len);
+
+ // === Step 2: Inference Loop ===
+ std::memcpy(cur_input_ids->data(), token_ids, seq_len * sizeof(int64_t));
+ int64_t* pos_ptr = (int64_t*)cur_pos_ids->data();
+ for (size_t i = 0; i < seq_len; ++i) pos_ptr[i] = start_pos + i;
+
+ ops::embedding(cur_hidden, cur_input_ids, unwrap(weights.in_embed));
+
+ for (size_t i = 0; i < meta.nlayer; i++) {
+ size_t bytes_hidden = cur_hidden->numel() * sizeof(float);
+ std::memcpy(cur_residual->data(), cur_hidden->data(), bytes_hidden);
+
+ ops::rms_norm(cur_hidden, cur_hidden, unwrap(weights.attn_norm_w[i]), meta.epsilon);
+
+ ops::linear(cur_q, cur_hidden, unwrap(weights.attn_q_w[i]), unwrap(weights.attn_q_b[i]));
+ ops::linear(cur_k, cur_hidden, unwrap(weights.attn_k_w[i]), unwrap(weights.attn_k_b[i]));
+ ops::linear(cur_v, cur_hidden, unwrap(weights.attn_v_w[i]), unwrap(weights.attn_v_b[i]));
+
+ tensor_t q_3d = cur_q->reshape({seq_len, meta.nh, meta.dh});
+ tensor_t k_3d = cur_k->reshape({seq_len, meta.nkvh, meta.dh});
+
+ ops::rope(q_3d, q_3d, cur_pos_ids, meta.theta);
+ ops::rope(k_3d, k_3d, cur_pos_ids, meta.theta);
+
+ tensor_t kc = k_cache[i];
+ tensor_t vc = v_cache[i];
+ size_t bytes_copy = seq_len * meta.nkvh * meta.dh * sizeof(float);
+ size_t offset = start_pos * meta.nkvh * meta.dh * sizeof(float);
+ std::memcpy(kc->data() + offset, cur_k->data(), bytes_copy);
+ std::memcpy(vc->data() + offset, cur_v->data(), bytes_copy);
+
+ tensor_t kc_view = kc->slice(0, 0, total_len)->reshape({total_len, meta.nkvh, meta.dh});
+ tensor_t vc_view = vc->slice(0, 0, total_len)->reshape({total_len, meta.nkvh, meta.dh});
+ tensor_t att_out_3d = cur_att_out_flat->reshape({seq_len, meta.nh, meta.dh});
+ tensor_t v_3d = cur_v->reshape({seq_len, meta.nkvh, meta.dh});
+
+ float scale = 1.0f / std::sqrt((float)meta.dh);
+ ops::self_attention(att_out_3d, q_3d, kc_view, vc_view, scale);
+
+ ops::linear(cur_att_proj, cur_att_out_flat, unwrap(weights.attn_o_w[i]), nullptr);
+ ops::add(cur_hidden, cur_att_proj, cur_residual);
+
+ std::memcpy(cur_residual->data(), cur_hidden->data(), bytes_hidden);
+
+ ops::rms_norm(cur_hidden, cur_hidden, unwrap(weights.mlp_norm_w[i]), meta.epsilon);
+
+ ops::linear(cur_gate, cur_hidden, unwrap(weights.mlp_gate_w[i]), nullptr);
+ ops::linear(cur_up, cur_hidden, unwrap(weights.mlp_up_w[i]), nullptr);
+ ops::swiglu(cur_ffn_inter, cur_gate, cur_up);
+ ops::linear(cur_down, cur_ffn_inter, unwrap(weights.mlp_down_w[i]), nullptr);
+
+ ops::add(cur_hidden, cur_down, cur_residual);
+ }
+
+ ops::rms_norm(cur_hidden, cur_hidden, unwrap(weights.out_norm_w), meta.epsilon);
+
+ std::byte* last_row = cur_hidden->data() + (seq_len - 1) * meta.hs * sizeof(float);
+ std::memcpy(buf_last_hidden->data(), last_row, meta.hs * sizeof(float));
+ ops::linear(buf_logits, buf_last_hidden, unwrap(weights.out_embed), nullptr);
+
+ // ==========================================
+ // Sampling: the old argmax is replaced by the full sampler
+ // ==========================================
+ float* logits = (float*)buf_logits->data();
+ int64_t next_token = ops::cpu::sample(logits, meta.voc, temperature, top_p, top_k);
+
+ this->current_pos += seq_len;
+ return next_token;
+ }
+};
+
+extern "C" {
+__export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice) {
+ return (struct LlaisysQwen2Model *) new Qwen2Impl(meta);
+}
+__export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model) {
+ if (model) delete (Qwen2Impl*)model;
+}
+__export LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model) {
+ if (!model) return nullptr;
+ return &((Qwen2Impl*)model)->weights;
+}
+// C API: signature extended with the sampling parameters
+__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken, float temperature, float top_p, size_t top_k) {
+ if (!model) return -1;
+ return ((Qwen2Impl*)model)->infer(token_ids, ntoken, temperature, top_p, top_k);
+}
+__export void* llaisysTensorData(void* t) {
+ if (!t) return nullptr;
+ return ((LlaisysTensor*)t)->tensor->data();
+}
+__export float* llaisysQwen2ModelGetLogits(struct LlaisysQwen2Model * model) {
+ if (!model) return nullptr;
+ return (float*)((Qwen2Impl*)model)->buf_logits->data();
+}
+}
diff --git a/src/ops/argmax/cpu/argmax_cpu.cpp b/src/ops/argmax/cpu/argmax_cpu.cpp
new file mode 100644
index 000000000..fe92f9942
--- /dev/null
+++ b/src/ops/argmax/cpu/argmax_cpu.cpp
@@ -0,0 +1,54 @@
+#include "argmax_cpu.hpp"
+#include "../../../utils.hpp"
+#include <cstdint>
+
+template <typename T>
+void argmax_kernel(const T *vals, size_t numel, T *out_val, int64_t *out_idx) {
+ if (numel == 0) {
+ return;
+ }
+
+ T max_v_raw = vals[0];
+ int64_t max_i = 0;
+
+ float max_v_f32 = llaisys::utils::cast<float>(vals[0]);
+
+ for (size_t i = 1; i < numel; i++) {
+
+ float current_val = llaisys::utils::cast<float>(vals[i]);
+
+ if (current_val > max_v_f32) {
+ max_v_f32 = current_val;
+ max_v_raw = vals[i];
+ max_i = i;
+ }
+ }
+
+ *out_val = max_v_raw;
+ *out_idx = max_i;
+}
+
+namespace llaisys::ops::cpu {
+
+void argmax(const std::byte *vals, std::byte *max_val, std::byte *max_idx, llaisysDataType_t dtype, size_t numel) {
+
+ int64_t *idx_ptr = reinterpret_cast<int64_t *>(max_idx);
+
+ switch (dtype) {
+ case LLAISYS_DTYPE_F32:
+ return argmax_kernel(reinterpret_cast<const float *>(vals), numel, reinterpret_cast<float *>(max_val), idx_ptr);
+
+ case LLAISYS_DTYPE_F16:
+ return argmax_kernel(reinterpret_cast<const llaisys::fp16_t *>(vals), numel, reinterpret_cast<llaisys::fp16_t *>(max_val), idx_ptr);
+
+ case LLAISYS_DTYPE_BF16:
+ return argmax_kernel(reinterpret_cast<const llaisys::bf16_t *>(vals), numel, reinterpret_cast<llaisys::bf16_t *>(max_val), idx_ptr);
+
+ default:
+ EXCEPTION_UNSUPPORTED_DATATYPE(dtype);
+ }
+}
+} // namespace llaisys::ops::cpu
\ No newline at end of file
diff --git a/src/ops/argmax/cpu/argmax_cpu.hpp b/src/ops/argmax/cpu/argmax_cpu.hpp
new file mode 100644
index 000000000..847da4d82
--- /dev/null
+++ b/src/ops/argmax/cpu/argmax_cpu.hpp
@@ -0,0 +1,6 @@
+#pragma once
+#include "llaisys.h"
+
+namespace llaisys::ops::cpu {
+void argmax(const std::byte *vals, std::byte *max_val, std::byte *max_idx, llaisysDataType_t dtype, size_t numel);
+}
\ No newline at end of file
diff --git a/src/ops/argmax/op.cpp b/src/ops/argmax/op.cpp
index 6dc37d426..5074f6dd3 100644
--- a/src/ops/argmax/op.cpp
+++ b/src/ops/argmax/op.cpp
@@ -1,7 +1,43 @@
#include "op.hpp"
+#include "../../core/llaisys_core.hpp"
+#include "../../utils.hpp"
+
+#include "cpu/argmax_cpu.hpp"
namespace llaisys::ops {
void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) {
- TO_BE_IMPLEMENTED();
+ CHECK_SAME_DEVICE(max_idx, max_val, vals);
+
+ ASSERT(vals->isContiguous() && max_idx->isContiguous() && max_val->isContiguous(),
+ "Argmax: all tensors must be contiguous.");
+
+ ASSERT(vals->ndim() == 1, "Argmax: input 'vals' must be a 1D tensor.");
+
+ ASSERT(max_val->numel() == 1 && max_idx->numel() == 1,
+ "Argmax: output tensors must be scalars (single element).");
+
+ CHECK_SAME_DTYPE(max_val->dtype(), vals->dtype());
+
+ ASSERT(max_idx->dtype() == LLAISYS_DTYPE_I64,
+ "Argmax: 'max_idx' must be Int64.");
+
+ if (vals->deviceType() == LLAISYS_DEVICE_CPU) {
+ return cpu::argmax(vals->data(), max_val->data(), max_idx->data(), vals->dtype(), vals->numel());
+ }
+
+ llaisys::core::context().setDevice(vals->deviceType(), vals->deviceId());
+
+ switch (vals->deviceType()) {
+ case LLAISYS_DEVICE_CPU:
+
+ return cpu::argmax(vals->data(), max_val->data(), max_idx->data(), vals->dtype(), vals->numel());
+#ifdef ENABLE_NVIDIA_API
+ case LLAISYS_DEVICE_NVIDIA:
+ TO_BE_IMPLEMENTED();
+ return;
+#endif
+ default:
+ EXCEPTION_UNSUPPORTED_DEVICE;
+ }
}
} // namespace llaisys::ops
diff --git a/src/ops/embedding/cpu/embedding_cpu.cpp b/src/ops/embedding/cpu/embedding_cpu.cpp
new file mode 100644
index 000000000..770872814
--- /dev/null
+++ b/src/ops/embedding/cpu/embedding_cpu.cpp
@@ -0,0 +1,57 @@
+#include "embedding_cpu.hpp"
+#include "../../../utils.hpp"
+#include <cstring>
+
+
+template <typename T>
+void embedding_(const T *weight_data, const int64_t *index_data, T *out_data,
+ size_t embedding_dim, size_t num_embeddings, size_t num_indices) {
+
+ for (size_t i = 0; i < num_indices; i++) {
+ int64_t idx = index_data[i];
+
+ // Bounds check
+ if (idx < 0 || static_cast(idx) >= num_embeddings) {
+ continue;
+ }
+
+ const T *src = weight_data + idx * embedding_dim;
+ T *dst = out_data + i * embedding_dim;
+
+ std::memcpy(dst, src, embedding_dim * sizeof(T));
+ }
+}
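+
+// Example: with index = {3, 1}, out row 0 becomes weight row 3 and
+// out row 1 becomes weight row 1 (a pure gather of embedding rows).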
+
+namespace llaisys::ops::cpu {
+
+void embedding(const std::byte *weight_data, const std::byte *index_data, std::byte *out_data,
+ llaisysDataType_t dtype,
+ size_t embedding_dim, size_t num_embeddings, size_t num_indices) {
+
+ const int64_t *idx_ptr = reinterpret_cast<const int64_t *>(index_data);
+
+ switch (dtype) {
+ case LLAISYS_DTYPE_F32:
+ return embedding_(
+ reinterpret_cast<const float *>(weight_data),
+ idx_ptr,
+ reinterpret_cast<float *>(out_data),
+ embedding_dim, num_embeddings, num_indices);
+
+ case LLAISYS_DTYPE_BF16:
+ return embedding_(
+ reinterpret_cast<const llaisys::bf16_t *>(weight_data),
+ idx_ptr,
+ reinterpret_cast<llaisys::bf16_t *>(out_data),
+ embedding_dim, num_embeddings, num_indices);
+ case LLAISYS_DTYPE_F16:
+ return embedding_(
+ reinterpret_cast<const llaisys::fp16_t *>(weight_data),
+ idx_ptr,
+ reinterpret_cast<llaisys::fp16_t *>(out_data),
+ embedding_dim, num_embeddings, num_indices);
+ default:
+ EXCEPTION_UNSUPPORTED_DATATYPE(dtype);
+ }
+}
+
+} // namespace llaisys::ops::cpu
\ No newline at end of file
diff --git a/src/ops/embedding/cpu/embedding_cpu.hpp b/src/ops/embedding/cpu/embedding_cpu.hpp
new file mode 100644
index 000000000..70a9468ef
--- /dev/null
+++ b/src/ops/embedding/cpu/embedding_cpu.hpp
@@ -0,0 +1,14 @@
+#pragma once
+#include "../../../tensor/tensor.hpp"
+
+namespace llaisys::ops::cpu {
+
+void embedding(const std::byte *weight_data,
+ const std::byte *index_data,
+ std::byte *out_data,
+ llaisysDataType_t dtype,
+ size_t embedding_dim,
+ size_t num_embeddings,
+ size_t num_indices);
+
+}
\ No newline at end of file
diff --git a/src/ops/embedding/op.cpp b/src/ops/embedding/op.cpp
index 84b9a5d06..7e61e5727 100644
--- a/src/ops/embedding/op.cpp
+++ b/src/ops/embedding/op.cpp
@@ -1,7 +1,61 @@
#include "op.hpp"
+#include "../../core/llaisys_core.hpp"
+#include "../../utils.hpp"
+
+#include "cpu/embedding_cpu.hpp"
+
namespace llaisys::ops {
void embedding(tensor_t out, tensor_t index, tensor_t weight) {
- TO_BE_IMPLEMENTED();
+
+ CHECK_SAME_DEVICE(out, index, weight);
+
+ ASSERT(out->isContiguous() && index->isContiguous() && weight->isContiguous(), "Embedding: inputs must be contiguous.");
+ ASSERT(index->dtype() == LLAISYS_DTYPE_I64, "Embedding: index must be Int64.");
+
+ CHECK_SAME_DTYPE(out->dtype(), weight->dtype());
+
+ ASSERT(weight->ndim() == 2, "Embedding: weight must be 2D.");
+ ASSERT(index->ndim() == 1, "Embedding: index must be 1D.");
+ ASSERT(out->ndim() == 2, "Embedding: output must be 2D.");
+ ASSERT(out->shape()[0] == index->shape()[0], "Embedding: output dim 0 must match index length.");
+ ASSERT(out->shape()[1] == weight->shape()[1], "Embedding: output dim 1 must match weight dim 1.");
+
+ size_t embedding_dim = weight->shape()[1];
+ size_t num_embeddings = weight->shape()[0];
+ size_t num_indices = index->numel();
+
+ if (out->deviceType() == LLAISYS_DEVICE_CPU) {
+ return cpu::embedding(
+ weight->data(),
+ index->data(),
+ out->data(),
+ weight->dtype(),
+ embedding_dim,
+ num_embeddings,
+ num_indices);
+ }
+
+ llaisys::core::context().setDevice(out->deviceType(), out->deviceId());
+
+ switch (out->deviceType()) {
+ case LLAISYS_DEVICE_CPU:
+ return cpu::embedding(
+ weight->data(),
+ index->data(),
+ out->data(),
+ weight->dtype(),
+ embedding_dim,
+ num_embeddings,
+ num_indices);
+
+#ifdef ENABLE_NVIDIA_API
+ case LLAISYS_DEVICE_NVIDIA:
+ TO_BE_IMPLEMENTED();
+ return;
+#endif
+ default:
+ EXCEPTION_UNSUPPORTED_DEVICE;
+ }
}
} // namespace llaisys::ops
diff --git a/src/ops/linear/cpu/linear_cpu.cpp b/src/ops/linear/cpu/linear_cpu.cpp
new file mode 100644
index 000000000..e2654242c
--- /dev/null
+++ b/src/ops/linear/cpu/linear_cpu.cpp
@@ -0,0 +1,130 @@
+#include "linear_cpu.hpp"
+#include <cblas.h>
+#include <cstring>
+#include <type_traits>
+#include "../../../utils.hpp"
+#include <vector>
+
+template <typename T>
+void linear_kernel(const T *input, const T *weight, const T *bias, T *out,
+ size_t M, size_t K, size_t N) {
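+ // Computes out[M,N] = input[M,K] * weight[N,K]^T (+ bias), i.e. the usual
+ // row-major nn.Linear layout: output feature n is a dot product with weight row n.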
+ if constexpr (std::is_same_v<T, float>) {
+ const float* f_input = (const float*)input;
+ const float* f_weight = (const float*)weight;
+ float* f_out = (float*)out;
+
+ if (M == 1) {
+ // Decode path: with M == 1 the input is a single vector, so use the
+ // dedicated matrix-vector routine cblas_sgemv.
+ // Weight is N x K, input has length K, output has length N.
+ cblas_sgemv(CblasRowMajor, CblasNoTrans,
+ (int)N, (int)K, 1.0f,
+ f_weight, (int)K,
+ f_input, 1,
+ 0.0f,
+ f_out, 1);
+ } else {
+ // Prefill path: full matrix-matrix product
+ cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+ (int)M, (int)N, (int)K, 1.0f,
+ f_input, (int)K, f_weight, (int)K,
+ 0.0f, f_out, (int)N);
+ }
+
+ // Add the bias
+ if (bias != nullptr) {
+ const float* f_bias = (const float*)bias;
+ for (size_t m = 0; m < M; m++) {
+ for (size_t n = 0; n < N; n++) {
+ f_out[m * N + n] += f_bias[n];
+ }
+ }
+ }
+ }
+ // Safe fallback: no OpenMP here, so no risk of nested-thread deadlocks
+ else if constexpr (std::is_same_v<T, llaisys::fp16_t> || std::is_same_v<T, llaisys::bf16_t>) {
+
+ // ==========================================
+ // Optimization: thread-local scratch buffers (workspace).
+ // thread_local means the three buffers are allocated on first use and
+ // reused by every later call on the same thread, so the steady-state
+ // allocation cost is essentially zero.
+ // ==========================================
+ thread_local std::vector<float> f32_x;
+ thread_local std::vector<float> f32_w;
+ thread_local std::vector<float> f32_out;
+
+ // resize is nearly free once the reserved capacity is large enough
+ f32_x.resize(M * K);
+ f32_w.resize(N * K);
+ f32_out.resize(M * N);
+
+ // 1. Single-threaded upcast to f32 (these loops are cheap on CPU)
+ for (size_t i = 0; i < M * K; i++) {
+ f32_x[i] = llaisys::utils::cast<float>(input[i]);
+ }
+
+ for (size_t i = 0; i < N * K; i++) {
+ f32_w[i] = llaisys::utils::cast<float>(weight[i]);
+ }
+
+ // 2. Leave all the parallelism to OpenBLAS
+ cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+ (int)M, (int)N, (int)K, 1.0f,
+ f32_x.data(), (int)K,
+ f32_w.data(), (int)K,
+ 0.0f,
+ f32_out.data(), (int)N);
+
+ // 3. Cast back to the target dtype (single-threaded)
+ for (size_t m = 0; m < M; m++) {
+ for (size_t n = 0; n < N; n++) {
+ float val = f32_out[m * N + n];
+ if (bias != nullptr) {
+ val += llaisys::utils::cast<float>(bias[n]);
+ }
+ out[m * N + n] = llaisys::utils::cast<T>(val);
+ }
+ }
+ }
+}
+
+namespace llaisys::ops::cpu {
+
+void linear(const std::byte *input_data, const std::byte *weight_data, const std::byte *bias_data,
+ std::byte *out_data, llaisysDataType_t dtype,
+ size_t M, size_t K, size_t N) {
+
+ switch (dtype) {
+ case LLAISYS_DTYPE_F32:
+ linear_kernel(
+ reinterpret_cast<const float *>(input_data),
+ reinterpret_cast<const float *>(weight_data),
+ reinterpret_cast<const float *>(bias_data),
+ reinterpret_cast<float *>(out_data),
+ M, K, N);
+ break;
+
+ case LLAISYS_DTYPE_F16:
+ linear_kernel(
+ reinterpret_cast<const llaisys::fp16_t *>(input_data),
+ reinterpret_cast<const llaisys::fp16_t *>(weight_data),
+ reinterpret_cast<const llaisys::fp16_t *>(bias_data),
+ reinterpret_cast<llaisys::fp16_t *>(out_data),
+ M, K, N);
+ break;
+
+ case LLAISYS_DTYPE_BF16:
+ linear_kernel(
+ reinterpret_cast<const llaisys::bf16_t *>(input_data),
+ reinterpret_cast<const llaisys::bf16_t *>(weight_data),
+ reinterpret_cast<const llaisys::bf16_t *>(bias_data),
+ reinterpret_cast<llaisys::bf16_t *>(out_data),
+ M, K, N);
+ break;
+
+ default:
+ EXCEPTION_UNSUPPORTED_DATATYPE(dtype);
+ }
+}
+
+} // namespace llaisys::ops::cpu
\ No newline at end of file
diff --git a/src/ops/linear/cpu/linear_cpu.hpp b/src/ops/linear/cpu/linear_cpu.hpp
new file mode 100644
index 000000000..83d407ec4
--- /dev/null
+++ b/src/ops/linear/cpu/linear_cpu.hpp
@@ -0,0 +1,13 @@
+#pragma once
+#include "../../../tensor/tensor.hpp"
+
+namespace llaisys::ops::cpu {
+
+void linear(const std::byte *input_data,
+ const std::byte *weight_data,
+ const std::byte *bias_data, // may be nullptr
+ std::byte *out_data,
+ llaisysDataType_t dtype,
+ size_t M, size_t K, size_t N);
+
+}
\ No newline at end of file
diff --git a/src/ops/linear/op.cpp b/src/ops/linear/op.cpp
index 97d1f8655..54322e04f 100644
--- a/src/ops/linear/op.cpp
+++ b/src/ops/linear/op.cpp
@@ -1,7 +1,78 @@
#include "op.hpp"
+#include "../../core/llaisys_core.hpp"
+#include "../../utils.hpp"
+#include "cpu/linear_cpu.hpp"
namespace llaisys::ops {
-void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) {
- TO_BE_IMPLEMENTED();
+
+void linear(tensor_t out, tensor_t input, tensor_t weight, tensor_t bias) {
+
+ bool has_bias = (bias != nullptr && bias->numel() > 0);
+
+ if (has_bias) {
+ CHECK_SAME_DEVICE(out, input, weight, bias);
+ CHECK_SAME_DTYPE(out->dtype(), input->dtype(), weight->dtype(), bias->dtype());
+ ASSERT(bias->isContiguous(), "Linear: bias must be contiguous.");
+ ASSERT(bias->ndim() == 1, "Linear: bias must be 1D.");
+ } else {
+ CHECK_SAME_DEVICE(out, input, weight);
+ CHECK_SAME_DTYPE(out->dtype(), input->dtype(), weight->dtype());
+ }
+
+ ASSERT(out->isContiguous() && input->isContiguous() && weight->isContiguous(),
+ "Linear: tensors must be contiguous.");
+
+ ASSERT(input->ndim() == 2, "Linear: input must be 2D [M, K].");
+ ASSERT(weight->ndim() == 2, "Linear: weight must be 2D [N, K].");
+ ASSERT(out->ndim() == 2, "Linear: output must be 2D [M, N].");
+
+ size_t M = input->shape()[0];
+ size_t K = input->shape()[1];
+ size_t N = weight->shape()[0];
+
+ // Matmul dimension constraint
+ ASSERT(weight->shape()[1] == K,
+ "Linear: weight input features (dim 1) must match input features (dim 1).");
+
+ // Output shape
+ ASSERT(out->shape()[0] == M, "Linear: output dim 0 must match M.");
+ ASSERT(out->shape()[1] == N, "Linear: output dim 1 must match N.");
+
+ // Bias shape
+ if (has_bias) {
+ ASSERT(bias->shape()[0] == N, "Linear: bias size must match output features N.");
+ }
+
+ const std::byte *bias_ptr = has_bias ? bias->data() : nullptr;
+
+ if (out->deviceType() == LLAISYS_DEVICE_CPU) {
+ ::llaisys::ops::cpu::linear(
+ input->data(),
+ weight->data(),
+ bias_ptr,
+ out->data(),
+ out->dtype(),
+ M, K, N);
+ return;
+ }
+
+ llaisys::core::context().setDevice(out->deviceType(), out->deviceId());
+
+ switch (out->deviceType()) {
+ case LLAISYS_DEVICE_CPU:
+ ::llaisys::ops::cpu::linear(
+ input->data(), weight->data(), bias_ptr, out->data(),
+ out->dtype(), M, K, N);
+ return;
+
+#ifdef ENABLE_NVIDIA_API
+ case LLAISYS_DEVICE_NVIDIA:
+ TO_BE_IMPLEMENTED();
+ return;
+#endif
+ default:
+ EXCEPTION_UNSUPPORTED_DEVICE;
+ }
}
+
} // namespace llaisys::ops
diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.cpp b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp
new file mode 100644
index 000000000..9bd8e8baf
--- /dev/null
+++ b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp
@@ -0,0 +1,75 @@
+#include "rms_norm_cpu.hpp"
+#include "../../../utils.hpp"
+#include <cmath>
+
+
+template <typename T>
+void rms_norm_kernel(const T *input, const T *weight, T *out,
+ size_t num_rows, size_t hidden_dim, float eps) {
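+ // Per row: y = x / sqrt(mean(x^2) + eps) * w, accumulated in f32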
+
+
+ for (size_t i = 0; i < num_rows; i++) {
+
+ const T *row_in = input + i * hidden_dim;
+ T *row_out = out + i * hidden_dim;
+
+ // Sum of squares
+ float sum_sq = 0.0f;
+ for (size_t j = 0; j < hidden_dim; j++) {
+ float val = llaisys::utils::cast<float>(row_in[j]);
+ sum_sq += val * val;
+ }
+
+ // Reciprocal of the RMS
+
+ float mean_sq = sum_sq / static_cast<float>(hidden_dim);
+ float inv_rms = 1.0f / std::sqrt(mean_sq + eps);
+
+ // Normalize and scale
+
+ for (size_t j = 0; j < hidden_dim; j++) {
+ float val = llaisys::utils::cast<float>(row_in[j]);
+ float w = llaisys::utils::cast<float>(weight[j]);
+
+ row_out[j] = llaisys::utils::cast<T>(val * inv_rms * w);
+ }
+ }
+}
+
+namespace llaisys::ops::cpu {
+
+void rms_norm(const std::byte *input_data, const std::byte *weight_data, std::byte *out_data,
+ llaisysDataType_t dtype,
+ size_t num_rows, size_t hidden_dim, float eps) {
+
+ switch (dtype) {
+ case LLAISYS_DTYPE_F32:
+ rms_norm_kernel(
+ reinterpret_cast<const float *>(input_data),
+ reinterpret_cast<const float *>(weight_data),
+ reinterpret_cast<float *>(out_data),
+ num_rows, hidden_dim, eps);
+ break;
+
+ case LLAISYS_DTYPE_F16:
+ rms_norm_kernel(
+ reinterpret_cast<const llaisys::fp16_t *>(input_data),
+ reinterpret_cast<const llaisys::fp16_t *>(weight_data),
+ reinterpret_cast<llaisys::fp16_t *>(out_data),
+ num_rows, hidden_dim, eps);
+ break;
+
+ case LLAISYS_DTYPE_BF16:
+ rms_norm_kernel(
+ reinterpret_cast<const llaisys::bf16_t *>(input_data),
+ reinterpret_cast<const llaisys::bf16_t *>(weight_data),
+ reinterpret_cast<llaisys::bf16_t *>(out_data),
+ num_rows, hidden_dim, eps);
+ break;
+
+ default:
+ EXCEPTION_UNSUPPORTED_DATATYPE(dtype);
+ }
+}
+
+} // namespace llaisys::ops::cpu
\ No newline at end of file
diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.hpp b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp
new file mode 100644
index 000000000..33b08006f
--- /dev/null
+++ b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp
@@ -0,0 +1,14 @@
+#pragma once
+#include "../../../tensor/tensor.hpp"
+
+namespace llaisys::ops::cpu {
+
+void rms_norm(const std::byte *input_data,
+ const std::byte *weight_data,
+ std::byte *out_data,
+ llaisysDataType_t dtype,
+ size_t num_rows,
+ size_t hidden_dim,
+ float eps);
+
+}
\ No newline at end of file
diff --git a/src/ops/rms_norm/op.cpp b/src/ops/rms_norm/op.cpp
index 529553d9d..b0f715955 100644
--- a/src/ops/rms_norm/op.cpp
+++ b/src/ops/rms_norm/op.cpp
@@ -1,7 +1,60 @@
#include "op.hpp"
+#include "../../core/llaisys_core.hpp"
+#include "../../utils.hpp"
+#include "cpu/rms_norm_cpu.hpp"
namespace llaisys::ops {
+
void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) {
- TO_BE_IMPLEMENTED();
+
+ CHECK_SAME_DEVICE(out, in, weight);
+ CHECK_SAME_DTYPE(out->dtype(), in->dtype(), weight->dtype());
+
+ ASSERT(out->isContiguous() && in->isContiguous() && weight->isContiguous(),
+ "RMSNorm: tensors must be contiguous.");
+
+ ASSERT(in->ndim() == 2, "RMSNorm: input must be 2D.");
+ ASSERT(out->ndim() == 2, "RMSNorm: output must be 2D.");
+ ASSERT(weight->ndim() == 1, "RMSNorm: weight must be 1D.");
+
+ ASSERT(in->shape()[0] == out->shape()[0], "RMSNorm: input/output rows mismatch.");
+ ASSERT(in->shape()[1] == out->shape()[1], "RMSNorm: input/output cols mismatch.");
+
+ ASSERT(weight->shape()[0] == in->shape()[1],
+ "RMSNorm: weight size must match input hidden dim.");
+
+ size_t num_rows = in->shape()[0];
+ size_t hidden_dim = in->shape()[1];
+
+ if (out->deviceType() == LLAISYS_DEVICE_CPU) {
+ ::llaisys::ops::cpu::rms_norm(
+ in->data(),
+ weight->data(),
+ out->data(),
+ in->dtype(),
+ num_rows,
+ hidden_dim,
+ eps);
+ return;
+ }
+
+ llaisys::core::context().setDevice(out->deviceType(), out->deviceId());
+
+ switch (out->deviceType()) {
+ case LLAISYS_DEVICE_CPU:
+ ::llaisys::ops::cpu::rms_norm(
+ in->data(), weight->data(), out->data(),
+ in->dtype(), num_rows, hidden_dim, eps);
+ return;
+
+#ifdef ENABLE_NVIDIA_API
+ case LLAISYS_DEVICE_NVIDIA:
+ TO_BE_IMPLEMENTED();
+ return;
+#endif
+ default:
+ EXCEPTION_UNSUPPORTED_DEVICE;
+ }
}
+
} // namespace llaisys::ops
diff --git a/src/ops/rope/cpu/rope_cpu.cpp b/src/ops/rope/cpu/rope_cpu.cpp
new file mode 100644
index 000000000..7b7bfb6ff
--- /dev/null
+++ b/src/ops/rope/cpu/rope_cpu.cpp
@@ -0,0 +1,93 @@
+#include "rope_cpu.hpp"
+#include "../../../utils.hpp"
+#include <cmath> // sin, cos, pow
+#include <cstdint>
+
+template <typename T>
+void rope_kernel(T *out, const T *in, const int64_t *pos_ids,
+ size_t seq_len, size_t n_heads, size_t head_dim, float theta) {
+
+ size_t half_dim = head_dim / 2;
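+
+ // Each pair (x_j, x_{j+half}) is rotated by angle a = pos * theta^(-2j/head_dim):
+ // x_j' = x_j * cos(a) - x_{j+half} * sin(a)
+ // x_{j+half}' = x_{j+half} * cos(a) + x_j * sin(a)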
+
+// For each token
+#pragma omp parallel for
+ for (size_t s = 0; s < seq_len; s++) {
+ int64_t pos = pos_ids[s];
+
+ // For each head
+ for (size_t h = 0; h < n_heads; h++) {
+
+ size_t offset = s * (n_heads * head_dim) + h * head_dim;
+ const T *src_vec = in + offset;
+ T *dst_vec = out + offset;
+
+ // For each rotation pair j
+ for (size_t j = 0; j < half_dim; j++) {
+
+ // --- 1. Precision: do the intermediate math in double (64-bit),
+ // matching PyTorch's numerical behaviour ---
+ double j_d = static_cast<double>(j);
+ double d_d = static_cast<double>(head_dim);
+ double theta_d = static_cast<double>(theta);
+ double pos_d = static_cast<double>(pos);
+
+ // Frequency (high precision)
+ double freq = std::pow(theta_d, -2.0 * j_d / d_d);
+
+ // Angle (high precision)
+ double angle = pos_d * freq;
+
+ // Sin/Cos (high precision)
+ double cos_val = std::cos(angle);
+ double sin_val = std::sin(angle);
+
+ // --- 2. Load the pair and rotate ---
+ // Also carried out in double to keep the accumulated error small
+ double a = static_cast<double>(llaisys::utils::cast<float>(src_vec[j]));
+ double b = static_cast<double>(llaisys::utils::cast<float>(src_vec[j + half_dim]));
+
+ double a_out = a * cos_val - b * sin_val;
+ double b_out = b * cos_val + a * sin_val;
+
+ // --- 3. Store the result ---
+ // Convert back to the target type T (f32/fp16/bf16) only at the very end
+ dst_vec[j] = llaisys::utils::cast<T>(static_cast<float>(a_out));
+ dst_vec[j + half_dim] = llaisys::utils::cast<T>(static_cast<float>(b_out));
+ }
+ }
+ }
+}
+
+namespace llaisys::ops::cpu {
+
+void rope(std::byte *out_data, const std::byte *in_data, const std::byte *pos_ids,
+ llaisysDataType_t dtype,
+ size_t seq_len, size_t n_heads, size_t head_dim, float theta) {
+
+ const int64_t *pos_ptr = reinterpret_cast<const int64_t *>(pos_ids);
+
+ switch (dtype) {
+ case LLAISYS_DTYPE_F32:
+ return rope_kernel(
+ reinterpret_cast<float *>(out_data),
+ reinterpret_cast<const float *>(in_data),
+ pos_ptr, seq_len, n_heads, head_dim, theta);
+
+ case LLAISYS_DTYPE_F16:
+ return rope_kernel(
+ reinterpret_cast<llaisys::fp16_t *>(out_data),
+ reinterpret_cast<const llaisys::fp16_t *>(in_data),
+ pos_ptr, seq_len, n_heads, head_dim, theta);
+
+ case LLAISYS_DTYPE_BF16:
+ return rope_kernel(
+ reinterpret_cast<llaisys::bf16_t *>(out_data),
+ reinterpret_cast<const llaisys::bf16_t *>(in_data),
+ pos_ptr, seq_len, n_heads, head_dim, theta);
+
+ default:
+ EXCEPTION_UNSUPPORTED_DATATYPE(dtype);
+ }
+}
+
+} // namespace llaisys::ops::cpu
\ No newline at end of file
diff --git a/src/ops/rope/cpu/rope_cpu.hpp b/src/ops/rope/cpu/rope_cpu.hpp
new file mode 100644
index 000000000..e749605ae
--- /dev/null
+++ b/src/ops/rope/cpu/rope_cpu.hpp
@@ -0,0 +1,15 @@
+#pragma once
+#include "../../../tensor/tensor.hpp"
+
+namespace llaisys::ops::cpu {
+
+void rope(std::byte *out_data,
+ const std::byte *in_data,
+ const std::byte *pos_ids,
+ llaisysDataType_t dtype,
+ size_t seq_len,
+ size_t n_heads,
+ size_t head_dim,
+ float theta);
+
+} // namespace llaisys::ops::cpu
\ No newline at end of file
diff --git a/src/ops/rope/op.cpp b/src/ops/rope/op.cpp
index d60dbe64e..a2b1ef02a 100644
--- a/src/ops/rope/op.cpp
+++ b/src/ops/rope/op.cpp
@@ -1,7 +1,67 @@
#include "op.hpp"
+#include "../../core/llaisys_core.hpp"
+#include "../../utils.hpp"
+#include "cpu/rope_cpu.hpp"
namespace llaisys::ops {
+
void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) {
- TO_BE_IMPLEMENTED();
+
+ CHECK_SAME_DEVICE(out, in, pos_ids);
+ CHECK_SAME_DTYPE(out->dtype(), in->dtype());
+
+ ASSERT(pos_ids->dtype() == LLAISYS_DTYPE_I64, "RoPE: pos_ids must be Int64.");
+
+ ASSERT(out->isContiguous() && in->isContiguous() && pos_ids->isContiguous(),
+ "RoPE: all tensors must be contiguous.");
+
+ ASSERT(in->ndim() == 3, "RoPE: input must be 3D [seq, head, dim].");
+ ASSERT(out->ndim() == 3, "RoPE: output must be 3D [seq, head, dim].");
+
+ ASSERT(pos_ids->ndim() == 1, "RoPE: pos_ids must be 1D [seq].");
+
+ ASSERT(in->shape()[0] == out->shape()[0], "RoPE: input/output seq_len mismatch.");
+ ASSERT(in->shape()[1] == out->shape()[1], "RoPE: input/output n_heads mismatch.");
+ ASSERT(in->shape()[2] == out->shape()[2], "RoPE: input/output head_dim mismatch.");
+
+ ASSERT(pos_ids->shape()[0] == in->shape()[0], "RoPE: pos_ids length must match seq_len.");
+
+ size_t head_dim = in->shape()[2];
+ ASSERT(head_dim % 2 == 0, "RoPE: head_dim must be even.");
+
+ size_t seq_len = in->shape()[0];
+ size_t n_heads = in->shape()[1];
+
+ if (out->deviceType() == LLAISYS_DEVICE_CPU) {
+ ::llaisys::ops::cpu::rope(
+ out->data(),
+ in->data(),
+ pos_ids->data(),
+ out->dtype(),
+ seq_len,
+ n_heads,
+ head_dim,
+ theta);
+ return;
+ }
+
+ llaisys::core::context().setDevice(out->deviceType(), out->deviceId());
+
+ switch (out->deviceType()) {
+ case LLAISYS_DEVICE_CPU:
+ ::llaisys::ops::cpu::rope(
+ out->data(), in->data(), pos_ids->data(),
+ out->dtype(), seq_len, n_heads, head_dim, theta);
+ return;
+
+#ifdef ENABLE_NVIDIA_API
+ case LLAISYS_DEVICE_NVIDIA:
+ TO_BE_IMPLEMENTED();
+ return;
+#endif
+ default:
+ EXCEPTION_UNSUPPORTED_DEVICE;
+ }
}
+
} // namespace llaisys::ops
diff --git a/src/ops/sampler/cpu/sampler_cpu.cpp b/src/ops/sampler/cpu/sampler_cpu.cpp
new file mode 100644
index 000000000..9c810900a
--- /dev/null
+++ b/src/ops/sampler/cpu/sampler_cpu.cpp
@@ -0,0 +1,117 @@
+#include "sampler_cpu.hpp"
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <random>
+#include <vector>
+
+namespace llaisys::ops::cpu {
+
+// Bundle the original index with the computed probability so that sorting
+// never loses track of which token a probability belongs to.
+struct TokenProb {
+ int32_t index;
+ float prob;
+};
+
+int32_t sample(const float *logits, size_t vocab_size, float temperature, float top_p, size_t top_k) {
+ // ==========================================
+ // Step 0: degenerate cases (argmax)
+ // With temperature near 0 or top_k == 1, fall back to greedy search
+ // and skip all the work below
+ // ==========================================
+ if (temperature <= 1e-5f || top_k == 1) {
+ int32_t best_idx = 0;
+ float max_val = logits[0];
+ for (size_t i = 1; i < vocab_size; i++) {
+ if (logits[i] > max_val) {
+ max_val = logits[i];
+ best_idx = static_cast<int32_t>(i);
+ }
+ }
+ return best_idx;
+ }
+
+ // ==========================================
+ // Steps 1 & 2: apply temperature and a numerically stable softmax
+ // ==========================================
+ // First find the largest logit
+ float max_logit = logits[0];
+ for (size_t i = 1; i < vocab_size; i++) {
+ if (logits[i] > max_logit) {
+ max_logit = logits[i];
+ }
+ }
+
+ std::vector<TokenProb> probs(vocab_size);
+ float sum_exp = 0.0f;
+
+ for (size_t i = 0; i < vocab_size; i++) {
+ probs[i].index = static_cast<int32_t>(i);
+ // Key point: subtract max_logit so exp() cannot overflow, and divide by the temperature
+ probs[i].prob = std::exp((logits[i] - max_logit) / temperature);
+ sum_exp += probs[i].prob;
+ }
+
+ // Normalize into a true probability distribution (sums to 1)
+ for (size_t i = 0; i < vocab_size; i++) {
+ probs[i].prob /= sum_exp;
+ }
+
+ // ==========================================
+ // Step 3: sort by probability, descending
+ // ==========================================
+ // The lambda tells std::sort to order by prob, largest first
+ std::sort(probs.begin(), probs.end(), [](const TokenProb &a, const TokenProb &b) {
+ return a.prob > b.prob;
+ });
+
+ // ==========================================
+ // Step 4: Top-K and Top-P truncation
+ // ==========================================
+ size_t valid_count = 0;
+ float cum_prob = 0.0f;
+
+ // top_k == 0 (or an oversized value) means "no Top-K limit"
+ if (top_k == 0 || top_k > vocab_size) {
+ top_k = vocab_size;
+ }
+
+ // Find the cut-off point
+ for (size_t i = 0; i < vocab_size; i++) {
+ valid_count++;
+ cum_prob += probs[i].prob;
+ // Stop as soon as the cumulative mass reaches top_p or top_k tokens are kept
+ if (cum_prob >= top_p || valid_count >= top_k) {
+ break;
+ }
+ }
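+
+ // Worked example: sorted probs = {0.5, 0.3, 0.1, ...}, top_p = 0.8, top_k = 50.
+ // After the second token the cumulative mass hits 0.8 >= top_p, so
+ // valid_count = 2 and only those two tokens enter the lottery below.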
+
+ // ==========================================
+ // Step 5: roll the dice (roulette-wheel sampling)
+ // ==========================================
+ // After truncation the surviving candidates no longer sum to 1; recompute their mass
+ float truncated_sum = 0.0f;
+ for (size_t i = 0; i < valid_count; i++) {
+ truncated_sum += probs[i].prob;
+ }
+
+ // C++11 Mersenne Twister generator (std::mt19937)
+ std::random_device rd;
+ std::mt19937 gen(rd());
+ // Draw a uniform random float r in [0, truncated_sum]
+ std::uniform_real_distribution<float> dis(0.0f, truncated_sum);
+ float r = dis(gen);
+
+ // Spin the wheel: r falls into some token's probability slot; that token wins
+ float current_sum = 0.0f;
+ for (size_t i = 0; i < valid_count; i++) {
+ current_sum += probs[i].prob;
+ if (r <= current_sum) {
+ return probs[i].index;
+ }
+ }
+
+ // Fallback (reachable only via floating-point rounding): return the last candidate
+ return probs[valid_count - 1].index;
+}
+
+} // namespace llaisys::ops::cpu
\ No newline at end of file
diff --git a/src/ops/sampler/cpu/sampler_cpu.hpp b/src/ops/sampler/cpu/sampler_cpu.hpp
new file mode 100644
index 000000000..5af757b9b
--- /dev/null
+++ b/src/ops/sampler/cpu/sampler_cpu.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace llaisys::ops::cpu {
+
+// Random sampler interface
+// logits: pointer to the final-layer scores
+// vocab_size: vocabulary size (e.g. 151936)
+// temperature: sampling temperature (typically 0.8 to 1.0)
+// top_p: nucleus sampling threshold (typically 0.9)
+// top_k: truncation count (0 disables the Top-K limit)
+int32_t sample(const float *logits, size_t vocab_size, float temperature, float top_p, size_t top_k);
+
+} // namespace llaisys::ops::cpu
\ No newline at end of file
diff --git a/src/ops/self_attention/cpu/self_attention_cpu.cpp b/src/ops/self_attention/cpu/self_attention_cpu.cpp
new file mode 100644
index 000000000..fbbb2213d
--- /dev/null
+++ b/src/ops/self_attention/cpu/self_attention_cpu.cpp
@@ -0,0 +1,133 @@
+#include "self_attention_cpu.hpp"
+#include "../../../utils.hpp"
+#include <cmath>
+#include <cstddef>
+#include <limits>
+#include <vector>
+
+template <typename T>
+void self_attention_kernel(const T *q, const T *k, const T *v, T *out,
+ size_t seqlen, size_t total_len,
+ size_t nhead, size_t nkvhead,
+ size_t head_dim, size_t head_dim_v,
+ float scale) {
+
+ size_t group_size = nhead / nkvhead;
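+
+ // GQA mapping example: nhead = 12, nkvhead = 2 -> group_size = 6;
+ // query heads 0..5 read KV head 0, heads 6..11 read KV head 1.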
+
+ for (size_t i = 0; i < seqlen; i++) {
+ for (size_t h = 0; h < nhead; h++) {
+
+ size_t q_offset = i * (nhead * head_dim) + h * head_dim;
+ const T *q_vec = q + q_offset;
+
+ size_t kv_h = h / group_size;
+
+ std::vector<float> scores(total_len);
+ float max_score = -std::numeric_limits<float>::infinity();
+
+ for (size_t t = 0; t < total_len; t++) {
+
+ size_t global_q_idx = (total_len - seqlen) + i;
+
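+ // Causal mask: global_q_idx is this query's absolute position in the full
+ // (cached + new) sequence; keys at t > global_q_idx lie in the future.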
+ if (t > global_q_idx) {
+ scores[t] = -std::numeric_limits<float>::infinity();
+ continue;
+ }
+
+ // Locate the K vector at [t, kv_h, head_dim]
+ size_t k_offset = t * (nkvhead * head_dim) + kv_h * head_dim;
+ const T *k_vec = k + k_offset;
+
+ // Dot product
+ float dot = 0.0f;
+ for (size_t d = 0; d < head_dim; d++) {
+ float q_val = llaisys::utils::cast<float>(q_vec[d]);
+ float k_val = llaisys::utils::cast<float>(k_vec[d]);
+ dot += q_val * k_val;
+ }
+
+ // Apply the scale factor
+ scores[t] = dot * scale;
+
+ // Track the running max for numerically stable softmax
+ if (scores[t] > max_score) {
+ max_score = scores[t];
+ }
+ }
+
+ float sum_exp = 0.0f;
+ for (size_t t = 0; t < total_len; t++) {
+ if (scores[t] == -std::numeric_limits<float>::infinity()) {
+ scores[t] = 0.0f; // exp(-inf) = 0
+ } else {
+ // Subtract the max to avoid overflow in exp()
+ scores[t] = std::exp(scores[t] - max_score);
+ sum_exp += scores[t];
+ }
+ }
+
+ // Normalize
+ for (size_t t = 0; t < total_len; t++) {
+ scores[t] /= sum_exp;
+ }
+
+ size_t out_offset = i * (nhead * head_dim_v) + h * head_dim_v;
+ T *out_vec = out + out_offset;
+
+ for (size_t dv = 0; dv < head_dim_v; dv++) {
+ float acc = 0.0f;
+ for (size_t t = 0; t < total_len; t++) {
+
+ if (scores[t] == 0.0f) {
+ continue;
+ }
+
+ size_t v_offset = t * (nkvhead * head_dim_v) + kv_h * head_dim_v;
+ float v_val = llaisys::utils::cast<float>(v[v_offset + dv]);
+
+ acc += scores[t] * v_val;
+ }
+ out_vec[dv] = llaisys::utils::cast<T>(acc);
+ }
+ }
+ }
+}
+
+namespace llaisys::ops::cpu {
+
+void self_attention(const std::byte *q_data, const std::byte *k_data, const std::byte *v_data,
+ std::byte *attn_val_data, llaisysDataType_t dtype,
+ size_t seqlen, size_t total_len, size_t nhead, size_t nkvhead,
+ size_t head_dim, size_t head_dim_v, float scale) {
+
+ switch (dtype) {
+ case LLAISYS_DTYPE_F32:
+ self_attention_kernel<float>(
+ reinterpret_cast<const float *>(q_data),
+ reinterpret_cast<const float *>(k_data),
+ reinterpret_cast<const float *>(v_data),
+ reinterpret_cast<float *>(attn_val_data),
+ seqlen, total_len, nhead, nkvhead, head_dim, head_dim_v, scale);
+ break;
+ case LLAISYS_DTYPE_F16:
+ self_attention_kernel<llaisys::fp16_t>(
+ reinterpret_cast<const llaisys::fp16_t *>(q_data),
+ reinterpret_cast<const llaisys::fp16_t *>(k_data),
+ reinterpret_cast<const llaisys::fp16_t *>(v_data),
+ reinterpret_cast<llaisys::fp16_t *>(attn_val_data),
+ seqlen, total_len, nhead, nkvhead, head_dim, head_dim_v, scale);
+ break;
+ case LLAISYS_DTYPE_BF16:
+ self_attention_kernel<llaisys::bf16_t>(
+ reinterpret_cast<const llaisys::bf16_t *>(q_data),
+ reinterpret_cast<const llaisys::bf16_t *>(k_data),
+ reinterpret_cast<const llaisys::bf16_t *>(v_data),
+ reinterpret_cast<llaisys::bf16_t *>(attn_val_data),
+ seqlen, total_len, nhead, nkvhead, head_dim, head_dim_v, scale);
+ break;
+ default:
+ EXCEPTION_UNSUPPORTED_DATATYPE(dtype);
+ }
+}
+
+} // namespace llaisys::ops::cpu
\ No newline at end of file
diff --git a/src/ops/self_attention/cpu/self_attention_cpu.hpp b/src/ops/self_attention/cpu/self_attention_cpu.hpp
new file mode 100644
index 000000000..b7836c7e6
--- /dev/null
+++ b/src/ops/self_attention/cpu/self_attention_cpu.hpp
@@ -0,0 +1,19 @@
+#pragma once
+#include "../../../tensor/tensor.hpp"
+
+namespace llaisys::ops::cpu {
+
+void self_attention(const std::byte *q_data,
+ const std::byte *k_data,
+ const std::byte *v_data,
+ std::byte *attn_val_data,
+ llaisysDataType_t dtype,
+ size_t seqlen,
+ size_t total_len,
+ size_t nhead,
+ size_t nkvhead,
+ size_t head_dim, // d (for Q and K)
+ size_t head_dim_v, // dv (for V and Output)
+ float scale);
+
+} // namespace llaisys::ops::cpu
\ No newline at end of file
diff --git a/src/ops/self_attention/op.cpp b/src/ops/self_attention/op.cpp
index 43d620142..55cb71409 100644
--- a/src/ops/self_attention/op.cpp
+++ b/src/ops/self_attention/op.cpp
@@ -1,7 +1,83 @@
#include "op.hpp"
+#include "../../core/llaisys_core.hpp"
+#include "../../utils.hpp"
+#include "cpu/self_attention_cpu.hpp"
namespace llaisys::ops {
+
void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float scale) {
- TO_BE_IMPLEMENTED();
+
+ CHECK_SAME_DEVICE(attn_val, q, k, v);
+ CHECK_SAME_DTYPE(attn_val->dtype(), q->dtype(), k->dtype(), v->dtype());
+
+ ASSERT(attn_val->isContiguous() && q->isContiguous() && k->isContiguous() && v->isContiguous(),
+ "SelfAttention: all tensors must be contiguous.");
+
+ // Q: [seqlen, nhead, d]
+ // K: [total_len, nkvhead, d]
+ // V: [total_len, nkvhead, dv]
+ // Out: [seqlen, nhead, dv]
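+ // For reference, DeepSeek-R1-Distill-Qwen-1.5B uses nhead = 12, nkvhead = 2,
+ // and head_dim = head_dim_v = 128.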
+ ASSERT(q->ndim() == 3, "SelfAttention: q must be 3D.");
+ ASSERT(k->ndim() == 3, "SelfAttention: k must be 3D.");
+ ASSERT(v->ndim() == 3, "SelfAttention: v must be 3D.");
+ ASSERT(attn_val->ndim() == 3, "SelfAttention: attn_val must be 3D.");
+
+ size_t seqlen = q->shape()[0];
+ size_t nhead = q->shape()[1];
+ size_t head_dim = q->shape()[2]; // d
+
+ size_t total_len = k->shape()[0];
+ size_t nkvhead = k->shape()[1];
+
+ size_t head_dim_v = v->shape()[2]; // dv
+
+ ASSERT(nhead % nkvhead == 0, "SelfAttention: nhead must be divisible by nkvhead (GQA).");
+
+ // Check that dimensions match
+ ASSERT(k->shape()[2] == head_dim, "SelfAttention: k head_dim must match q head_dim.");
+ ASSERT(v->shape()[0] == total_len, "SelfAttention: v total_len must match k total_len.");
+ ASSERT(v->shape()[1] == nkvhead, "SelfAttention: v nkvhead must match k nkvhead.");
+
+ // Check the output dimensions
+ ASSERT(attn_val->shape()[0] == seqlen, "SelfAttention: output seqlen mismatch.");
+ ASSERT(attn_val->shape()[1] == nhead, "SelfAttention: output nhead mismatch.");
+ ASSERT(attn_val->shape()[2] == head_dim_v, "SelfAttention: output head_dim_v mismatch.");
+
+ if (q->deviceType() == LLAISYS_DEVICE_CPU) {
+ ::llaisys::ops::cpu::self_attention(
+ q->data(),
+ k->data(),
+ v->data(),
+ attn_val->data(),
+ q->dtype(),
+ seqlen,
+ total_len,
+ nhead,
+ nkvhead,
+ head_dim,
+ head_dim_v,
+ scale);
+ return;
+ }
+
+ llaisys::core::context().setDevice(q->deviceType(), q->deviceId());
+
+ switch (q->deviceType()) {
+ case LLAISYS_DEVICE_CPU:
+ ::llaisys::ops::cpu::self_attention(
+ q->data(), k->data(), v->data(), attn_val->data(),
+ q->dtype(), seqlen, total_len, nhead, nkvhead,
+ head_dim, head_dim_v, scale);
+ return;
+
+#ifdef ENABLE_NVIDIA_API
+ case LLAISYS_DEVICE_NVIDIA:
+ TO_BE_IMPLEMENTED();
+ return;
+#endif
+ default:
+ EXCEPTION_UNSUPPORTED_DEVICE;
+ }
}
+
} // namespace llaisys::ops
diff --git a/src/ops/swiglu/cpu/swiglu_cpu.cpp b/src/ops/swiglu/cpu/swiglu_cpu.cpp
new file mode 100644
index 000000000..5d9aba729
--- /dev/null
+++ b/src/ops/swiglu/cpu/swiglu_cpu.cpp
@@ -0,0 +1,53 @@
+#include "swiglu_cpu.hpp"
+#include "../../../utils.hpp"
+#include <cmath> // std::exp
+
+template <typename T>
+void swiglu_kernel(const T *gate, const T *up, T *out, size_t numel) {
+
+ for (size_t i = 0; i < numel; i++) {
+
+ float g_val = llaisys::utils::cast<float>(gate[i]);
+ float u_val = llaisys::utils::cast<float>(up[i]);
+
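+ // SiLU(x) = x * sigmoid(x) = x / (1 + e^(-x)); SwiGLU output is up * SiLU(gate).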
+ float silu_val = g_val / (1.0f + std::exp(-g_val));
+
+ float res = u_val * silu_val;
+
+ out[i] = llaisys::utils::cast<T>(res);
+ }
+}
+
+namespace llaisys::ops::cpu {
+
+void swiglu(const std::byte *gate_data, const std::byte *up_data, std::byte *out_data,
+ llaisysDataType_t dtype, size_t numel) {
+
+ switch (dtype) {
+ case LLAISYS_DTYPE_F32:
+ swiglu_kernel<float>(
+ reinterpret_cast<const float *>(gate_data),
+ reinterpret_cast<const float *>(up_data),
+ reinterpret_cast<float *>(out_data),
+ numel);
+ break;
+ case LLAISYS_DTYPE_F16:
+ swiglu_kernel<llaisys::fp16_t>(
+ reinterpret_cast<const llaisys::fp16_t *>(gate_data),
+ reinterpret_cast<const llaisys::fp16_t *>(up_data),
+ reinterpret_cast<llaisys::fp16_t *>(out_data),
+ numel);
+ break;
+ case LLAISYS_DTYPE_BF16:
+ swiglu_kernel<llaisys::bf16_t>(
+ reinterpret_cast<const llaisys::bf16_t *>(gate_data),
+ reinterpret_cast<const llaisys::bf16_t *>(up_data),
+ reinterpret_cast<llaisys::bf16_t *>(out_data),
+ numel);
+ break;
+ default:
+ EXCEPTION_UNSUPPORTED_DATATYPE(dtype);
+ }
+}
+
+} // namespace llaisys::ops::cpu
\ No newline at end of file
diff --git a/src/ops/swiglu/cpu/swiglu_cpu.hpp b/src/ops/swiglu/cpu/swiglu_cpu.hpp
new file mode 100644
index 000000000..d663f4989
--- /dev/null
+++ b/src/ops/swiglu/cpu/swiglu_cpu.hpp
@@ -0,0 +1,13 @@
+#pragma once
+#include "../../../tensor/tensor.hpp"
+
+namespace llaisys::ops::cpu {
+
+
+void swiglu(const std::byte* gate_data,
+ const std::byte* up_data,
+ std::byte* out_data,
+ llaisysDataType_t dtype,
+ size_t numel);
+
+} // namespace llaisys::ops::cpu
\ No newline at end of file
diff --git a/src/ops/swiglu/op.cpp b/src/ops/swiglu/op.cpp
index 47edbcc97..c42001a44 100644
--- a/src/ops/swiglu/op.cpp
+++ b/src/ops/swiglu/op.cpp
@@ -1,7 +1,54 @@
#include "op.hpp"
+#include "../../core/llaisys_core.hpp"
+#include "../../utils.hpp"
+#include "cpu/swiglu_cpu.hpp"
namespace llaisys::ops {
+
void swiglu(tensor_t out, tensor_t gate, tensor_t up) {
- TO_BE_IMPLEMENTED();
+
+ CHECK_SAME_DEVICE(out, gate, up);
+ CHECK_SAME_DTYPE(out->dtype(), gate->dtype(), up->dtype());
+
+ ASSERT(out->isContiguous() && gate->isContiguous() && up->isContiguous(),
+ "SwiGLU: tensors must be contiguous.");
+
+ ASSERT(gate->ndim() == 2, "SwiGLU: gate must be 2D.");
+ ASSERT(up->ndim() == 2, "SwiGLU: up must be 2D.");
+ ASSERT(out->ndim() == 2, "SwiGLU: out must be 2D.");
+
+ ASSERT(gate->shape() == up->shape(), "SwiGLU: gate and up shapes mismatch.");
+ ASSERT(out->shape() == gate->shape(), "SwiGLU: out shape mismatch.");
+
+ size_t numel = gate->numel();
+
+ if (out->deviceType() == LLAISYS_DEVICE_CPU) {
+ ::llaisys::ops::cpu::swiglu(
+ gate->data(),
+ up->data(),
+ out->data(),
+ out->dtype(),
+ numel);
+ return;
+ }
+
+ llaisys::core::context().setDevice(out->deviceType(), out->deviceId());
+
+ switch (out->deviceType()) {
+ case LLAISYS_DEVICE_CPU:
+ ::llaisys::ops::cpu::swiglu(
+ gate->data(), up->data(), out->data(),
+ out->dtype(), numel);
+ return;
+
+#ifdef ENABLE_NVIDIA_API
+ case LLAISYS_DEVICE_NVIDIA:
+ TO_BE_IMPLEMENTED();
+ return;
+#endif
+ default:
+ EXCEPTION_UNSUPPORTED_DEVICE;
+ }
}
+
} // namespace llaisys::ops
diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp
index 2f594bb65..aa1855bd9 100644
--- a/src/tensor/tensor.cpp
+++ b/src/tensor/tensor.cpp
@@ -164,27 +164,114 @@ void Tensor::debug() const {
}
bool Tensor::isContiguous() const {
- TO_BE_IMPLEMENTED();
+ size_t accumulated = 1;
+ int ndim = static_cast<int>(this->shape().size());
+
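+ // Contiguous means stride[i] == product(shape[i+1 .. ndim-1]) for every dim,
+ // i.e. exactly the strides a freshly allocated row-major tensor would have.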
+ for (int i = ndim - 1; i >= 0; i--) {
+ size_t cnt_stride = this->strides()[i]; // actual stride of this dim
+ size_t cnt_shape = this->shape()[i]; // extent of this dim
+ if (cnt_stride != accumulated) {
+ return false;
+ }
+ accumulated *= cnt_shape;
+ }
return true;
}
tensor_t Tensor::permute(const std::vector<size_t> &order) const {
- TO_BE_IMPLEMENTED();
- return std::shared_ptr<Tensor>(new Tensor(_meta, _storage));
+
+ if (order.size() != this->ndim()) {
+ throw std::runtime_error("Permute order size mismatch");
+ }
+ std::vector<size_t> new_shape;
+ std::vector<size_t> new_strides;
+
+ new_shape.reserve(this->ndim());
+ new_strides.reserve(this->ndim());
+
+ for (size_t original_dim_index : order) {
+ new_shape.push_back(this->shape()[original_dim_index]);
+ new_strides.push_back(this->strides()[original_dim_index]);
+ }
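+ // Permute only rewires metadata; no data moves. E.g. a contiguous [2, 3] tensor
+ // permuted with order {1, 0} gets shape [3, 2] and strides [1, 3].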
+
+ TensorMeta meta{this->dtype(), new_shape, new_strides};
+
+ return std::shared_ptr<Tensor>(new Tensor(std::move(meta), this->_storage, this->_offset));
}
tensor_t Tensor::view(const std::vector<size_t> &shape) const {
- TO_BE_IMPLEMENTED();
- return std::shared_ptr<Tensor>(new Tensor(_meta, _storage));
+ // Require contiguity
+ if (!this->isContiguous()) {
+ throw std::runtime_error("View: tensor is not contiguous");
+ }
+ // Total element count of the new shape
+ size_t new_numel = 1;
+ for (size_t s : shape) {
+ new_numel *= s;
+ }
+ if (new_numel != this->numel()) {
+ throw std::runtime_error("Shape mismatch");
+ }
+
+ int ndim_ = static_cast<int>(shape.size());
+ std::vector<size_t> new_strides(ndim_);
+ size_t new_stride = 1;
+ for (int i = ndim_ - 1; i >= 0; i--) {
+ new_strides[i] = new_stride;
+ new_stride *= shape[i];
+ }
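+ // E.g. viewing a contiguous [2, 3, 4] tensor as {6, 4} yields strides {4, 1}.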
+ // Build the new metadata
+ TensorMeta meta{this->dtype(), shape, new_strides};
+
+ return std::shared_ptr<Tensor>(new Tensor(std::move(meta), this->_storage, this->_offset));
}
tensor_t Tensor::slice(size_t dim, size_t start, size_t end) const {
- TO_BE_IMPLEMENTED();
- return std::shared_ptr<Tensor>(new Tensor(_meta, _storage));
+
+ if (dim >= this->ndim()) {
+ throw std::runtime_error("Slice dimension is out of bounds");
+ }
+
+ // Validate the slice range
+ size_t dim_size = this->shape()[dim];
+ if (start > end) {
+ throw std::runtime_error("Slice start must not be greater than end");
+ }
+ if (end > dim_size) {
+ throw std::runtime_error("Slice end must be less than or equal to dimension size");
+ }
+
+ // Compute the new shape
+ std::vector<size_t> new_shape = this->shape();
+ new_shape[dim] = end - start;
+
+ std::vector<size_t> new_strides = this->strides();
+
+ // Compute the new offset
+ size_t skipped_elements = start * new_strides[dim];
+ size_t shift_bytes = skipped_elements * this->elementSize();
+ // New byte offset into the shared storage
+ size_t new_offset = this->_offset + shift_bytes;
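+ // E.g. slicing dim 0 of an f32 [10, 4096] tensor at start = 2 skips
+ // 2 * 4096 = 8192 elements, i.e. 32768 bytes past the current offset.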
+
+ TensorMeta meta{this->dtype(), new_shape, new_strides};
+ return std::shared_ptr<Tensor>(new Tensor(std::move(meta), this->_storage, new_offset));
}
void Tensor::load(const void *src_) {
- TO_BE_IMPLEMENTED();
+
+ size_t total_bytes = this->numel() * this->elementSize(); // total data size in bytes
+ void *dst_ = this->data();
+ if (this->deviceType() == LLAISYS_DEVICE_CPU) {
+ std::memcpy(dst_, src_, total_bytes);
+ } // destination lives on the CPU
+ else {
+ core::context().setDevice(this->deviceType(), this->deviceId());
+ core::context().runtime().api()->memcpy_sync(
+ dst_, // destination address
+ src_, // source address
+ total_bytes,
+ LLAISYS_MEMCPY_H2D);
+ }
}
tensor_t Tensor::contiguous() const {
@@ -192,9 +279,9 @@ tensor_t Tensor::contiguous() const {
return std::shared_ptr<Tensor>(new Tensor(_meta, _storage));
}
-tensor_t Tensor::reshape(const std::vector<size_t> &shape) const {
- TO_BE_IMPLEMENTED();
- return std::shared_ptr<Tensor>(new Tensor(_meta, _storage));
+tensor_t Tensor::reshape(const std::vector<size_t>& shape) const {
+
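+ // Delegating to view() means reshape only works on contiguous tensors;
+ // a fuller implementation could fall back to contiguous() first.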
+ return this->view(shape);
}
tensor_t Tensor::to(llaisysDeviceType_t device_type, int device) const {
diff --git a/xmake.lua b/xmake.lua
index 1f65f7a95..afa95bc87 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -1,122 +1,69 @@
add_rules("mode.debug", "mode.release")
set_encodings("utf-8")
+add_requires("openmp")
+add_requires("openblas") -- 自动处理 Windows/Linux 差异
+add_packages("openmp", "openblas")
add_includedirs("include")
--- CPU --
+-- CPU / GPU configuration stays unchanged...
includes("xmake/cpu.lua")
--- NVIDIA --
-option("nv-gpu")
- set_default(false)
- set_showmenu(true)
- set_description("Whether to compile implementations for Nvidia GPU")
-option_end()
-
-if has_config("nv-gpu") then
- add_defines("ENABLE_NVIDIA_API")
- includes("xmake/nvidia.lua")
+-- ==========================================
+-- 1. Static components: use the on_install trick to keep them out of system directories
+-- ==========================================
+local static_targets = {"llaisys-utils", "llaisys-device", "llaisys-core", "llaisys-tensor", "llaisys-ops"}
+
+for _, name in ipairs(static_targets) do
+ target(name)
+ set_kind("static")
+ set_languages("cxx17")
+ if not is_plat("windows") then add_cxflags("-fPIC") end
+
+ -- Core: apply the same on_install scheme directly
+ on_install(function (target) end)
+
+ -- Add target-specific files and dependencies by name
+ if name == "llaisys-utils" then add_files("src/utils/*.cpp") end
+ if name == "llaisys-device" then
+ add_deps("llaisys-utils", "llaisys-device-cpu")
+ add_files("src/device/*.cpp")
+ end
+ if name == "llaisys-core" then
+ add_deps("llaisys-utils", "llaisys-device")
+ add_files("src/core/*/*.cpp")
+ end
+ if name == "llaisys-tensor" then
+ add_deps("llaisys-core")
+ add_files("src/tensor/*.cpp")
+ end
+ if name == "llaisys-ops" then
+ add_deps("llaisys-ops-cpu")
+ add_packages("openmp", "openblas")
+ add_files("src/ops/*/*.cpp")
+ end
+ target_end()
end
-target("llaisys-utils")
- set_kind("static")
-
- set_languages("cxx17")
- set_warnings("all", "error")
- if not is_plat("windows") then
- add_cxflags("-fPIC", "-Wno-unknown-pragmas")
- end
-
- add_files("src/utils/*.cpp")
-
- on_install(function (target) end)
-target_end()
-
-
-target("llaisys-device")
- set_kind("static")
- add_deps("llaisys-utils")
- add_deps("llaisys-device-cpu")
-
- set_languages("cxx17")
- set_warnings("all", "error")
- if not is_plat("windows") then
- add_cxflags("-fPIC", "-Wno-unknown-pragmas")
- end
-
- add_files("src/device/*.cpp")
-
- on_install(function (target) end)
-target_end()
-
-target("llaisys-core")
- set_kind("static")
- add_deps("llaisys-utils")
- add_deps("llaisys-device")
-
- set_languages("cxx17")
- set_warnings("all", "error")
- if not is_plat("windows") then
- add_cxflags("-fPIC", "-Wno-unknown-pragmas")
- end
-
- add_files("src/core/*/*.cpp")
-
- on_install(function (target) end)
-target_end()
-
-target("llaisys-tensor")
- set_kind("static")
- add_deps("llaisys-core")
-
- set_languages("cxx17")
- set_warnings("all", "error")
- if not is_plat("windows") then
- add_cxflags("-fPIC", "-Wno-unknown-pragmas")
- end
-
- add_files("src/tensor/*.cpp")
-
- on_install(function (target) end)
-target_end()
-
-target("llaisys-ops")
- set_kind("static")
- add_deps("llaisys-ops-cpu")
-
- set_languages("cxx17")
- set_warnings("all", "error")
- if not is_plat("windows") then
- add_cxflags("-fPIC", "-Wno-unknown-pragmas")
- end
-
- add_files("src/ops/*/*.cpp")
-
- on_install(function (target) end)
-target_end()
-
+-- ==========================================
+-- 2. Main shared library: install locally only
+-- ==========================================
target("llaisys")
set_kind("shared")
- add_deps("llaisys-utils")
- add_deps("llaisys-device")
- add_deps("llaisys-core")
- add_deps("llaisys-tensor")
- add_deps("llaisys-ops")
+ add_deps("llaisys-utils", "llaisys-device", "llaisys-core", "llaisys-tensor", "llaisys-ops")
+ add_packages("openmp", "openblas")
+ add_files("src/llaisys/*.cpp", "src/llaisys/*.cc")
set_languages("cxx17")
- set_warnings("all", "error")
- add_files("src/llaisys/*.cc")
- set_installdir(".")
+ set_installdir(".") -- 强制安装到当前目录
-
after_install(function (target)
- -- copy shared library to python package
print("Copying llaisys to python/llaisys/libllaisys/ ..")
+ local lib_dir = "python/llaisys/libllaisys/"
if is_plat("windows") then
- os.cp("bin/*.dll", "python/llaisys/libllaisys/")
- end
- if is_plat("linux") then
- os.cp("lib/*.so", "python/llaisys/libllaisys/")
+ os.cp("bin/*.dll", lib_dir)
+ else
+ os.cp("lib/*.so", lib_dir)
end
end)
target_end()
\ No newline at end of file