[Prototype] Sandbox for Implementation of generate and integration of lm_eval (evaluation harness) #222
Draft: bigximik wants to merge 47 commits into main from denis/generate (base: main).
Commits (47, all authored by bigximik):
b8f6b62 changes for debugging
4022249 temporal hack to save logits
505e658 simple generate function and less hacky way to save logits
a53855b moved mkdir
c14ca4d fast llm classes
6a72203 refactored logits saving, test and added hidden_test return from the …
e713aa2 added notebook to check logits and hidden states diffs
04e914c Merge branch 'denis/generate' of github.com:ServiceNow/Fast-LLM into …
51f59f8 fix to a document mask for attention mask
9c01471 fix for an absent attention_mask
7f1ca8a updated classes and funcions naming, removed temporal param from init
0488fdb updated manual test
c65d9ba evaluation abstraction implementation
1543a56 fixes for evaluation only in trainer
dc2b5e0 added evaluate command
0cb3ad7 lm_eval integration, one gpu
c86ae20 fixing typos
145ee50 fixes to make lm_eval reporting to work with wrapper object instead o…
85b19d8 comments and some code formatting
b5603ed merge from main
66a45ca steps towards distributed inference
938a273 more manual tests
eb734d9 partial implementation of data parallel lm_eval integration
4e2175a more communication primitives added
d1addda temporarily create hf model wrapper in training the same as standalone
a880cd3 finished batch data parallel support for lm_eval integration
4b148e0 cleaned up lm_eval arg parser, partially wrapper and renamed wrapper …
047852e removed HF hub params handling and tokenizer parallelism setting
0ae1b12 renamed evaluation to evaluator for relevant classes
c06d652 moved get_flops to model and made evaluator training vs evaluation on…
a535aae more general run interface evaluator and class for return instead of …
929a1de moved instantiation to get_evaluator, configurable cleanup
371d46b fix to support models with different vocab_sizes in model config and …
c0569a4 refactored for wrapper to have from_model facotry class
2be37ac added TrainingEvaluatorConfig, changed api to accept separate compone…
fcba8f7 new api examples changes
8005299 refactored returning of sampleing params from evaluators, renamed Tra…
7ed6375 removed done todo
9ef3f1c remved can_generate override as default implementaion works
5630571 removed outputs from a notebook and the big image
993c84e restored HuggingfaceBaseModel class name
98bb14e streamlined hidden_states return
3787ed3 moved evaluation stuff to engine/evaluation
45f15e0 added backward compatibility for evaluations config field
7a1e773 merge from main
720f169 update of distribtuted manual test
b784c98 fix for optmizer load
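Several of these commits (0cb3ad7, eb734d9, a880cd3) center on exposing the model to lm_eval through a wrapper object. For orientation, here is a minimal sketch of the interface lm_eval expects from such a wrapper, using lm_eval's public LM base class; the Fast-LLM-side names are hypothetical placeholders, not the classes actually introduced in this PR:

from lm_eval.api.instance import Instance
from lm_eval.api.model import LM


class FastLLMEvalWrapper(LM):  # hypothetical name, not this PR's actual class
    def __init__(self, model):
        super().__init__()
        self._model = model  # assumed: an HF-style wrapper around a Fast-LLM model

    def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
        # One (log-prob, is-greedy) pair per (context, continuation) request.
        raise NotImplementedError

    def loglikelihood_rolling(self, requests: list[Instance]) -> list[float]:
        # Whole-sequence log-likelihoods, used by perplexity-style tasks.
        raise NotImplementedError

    def generate_until(self, requests: list[Instance]) -> list[str]:
        # Free-form generation up to stop sequences, used by generative tasks.
        raise NotImplementedError

A wrapper instance can then be passed directly as the model argument of lm_eval.simple_evaluate, which avoids going through a Hugging Face hub model name.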
The PR adds one new file, a Jupyter notebook (387 added lines; the filename is not shown in this view) for checking logits and hidden-state diffs between Fast-LLM and Hugging Face models. Its code cells are reproduced below as a Python script with # %% cell markers:
# %%
from fast_llm.data.dataset.gpt.memmap import GPTMemmapDataset
from pathlib import Path
import numpy as np
from transformers import AutoTokenizer
import torch
import pickle

import matplotlib.pyplot as plt
# %% Root of the saved tensor dumps.
files_root = Path("/mnt/datasets/tests/denis/tensors_f32/")
# files_root = Path("/mnt/datasets/tests/denis/tensors/")
# %% Index the saved logits tensors from both backends by step number.
fm_files = {int(file.stem.split("tensor")[1]): file for file in (files_root / "fast_llm/logits/").glob("tensor*.pt")}
hf_files = {int(file.stem.split("tensor")[1]): file for file in (files_root / "hf/logits").glob("tensor*.pt")}
assert len(fm_files) == len(hf_files)
len(fm_files)
# %% Compare the last-position logits at every step: greedy tokens and absolute diffs.
hf_tokens = []
fm_tokens = []
max_adiff = []
mean_adiff = []
sum_adiff = []
for i in range(len(fm_files)):
    fm_data = torch.load(fm_files[i])
    hf_data = torch.load(hf_files[i])

    hf_tokens.append(hf_data[0, -1, :].argmax().item())
    fm_tokens.append(fm_data[0, -1, :].argmax().item())

    adiff = torch.abs(hf_data[0, -1, :] - fm_data[0, -1, :])
    max_adiff.append(adiff.max().item())
    mean_adiff.append(adiff.mean().item())
    sum_adiff.append(adiff.sum().item())

all(a == b for a, b in zip(hf_tokens, fm_tokens))
# %% Index of the first position where the greedy tokens diverge
# (len(hf_tokens) + 1 if they never do).
min(len(hf_tokens)+1 if ab[0] == ab[1] else i for i, ab in enumerate(zip(hf_tokens, fm_tokens)))
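# An equivalent, arguably clearer formulation of the divergence check above;
# first_divergence is a hypothetical helper, not part of the original notebook.
def first_divergence(a, b):
    # Index of the first mismatch; len(a) + 1 if the sequences agree everywhere.
    return next((i for i, (x, y) in enumerate(zip(a, b)) if x != y), len(a) + 1)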
# %% Plot the logit differences per token position.
fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharex=True)

# Left plot: max and mean absolute differences
axes[0].plot(max_adiff, label='max')
axes[0].plot(mean_adiff, label='mean')
axes[0].set_title('Max and Mean Absolute Difference')
axes[0].set_xlabel('Token Position Index')
axes[0].set_ylabel('Absolute Difference')
axes[0].legend()
axes[0].grid(True)

# Right plot: sum absolute difference
axes[1].plot(sum_adiff, label='sum', color='tab:orange')
axes[1].set_title('Sum Absolute Difference')
axes[1].set_xlabel('Token Position Index')
axes[1].set_ylabel('Absolute Difference')
axes[1].legend()
axes[1].grid(True)

plt.tight_layout()
plt.show()
# %% Index the saved hidden-state dumps by new-token index.
fm_hidden_files = {int(file.stem.split("data")[1]): file for file in (files_root / "fast_llm/hidden_states/").glob("data*.pickle")}
hf_hidden_files = {int(file.stem.split("data")[1]): file for file in (files_root / "hf/hidden_states").glob("data*.pickle")}
# %%
def mad(new_token_index, fm_hidden_files, hf_hidden_files):
    """Per-layer max absolute difference between HF and Fast-LLM hidden states."""
    with fm_hidden_files[new_token_index].open("rb") as f:
        fm_data = pickle.load(f)
    with hf_hidden_files[new_token_index].open("rb") as f:
        hf_data = pickle.load(f)
    max_adiffs_hidden_layers = []
    for i in range(len(hf_data)):
        max_adiff = torch.abs(hf_data[i][0, -1, :] - fm_data[i]['tensor'][0, -1, :]).max().item()
        max_adiffs_hidden_layers.append(max_adiff)
    return max_adiffs_hidden_layers
# %%
new_token_index = 107
new_token_index1 = 108
max_adiffs_hidden_layers = mad(0, fm_hidden_files, hf_hidden_files)
max_adiffs_hidden_layers2 = mad(new_token_index, fm_hidden_files, hf_hidden_files)
max_adiffs_hidden_layers3 = mad(new_token_index1, fm_hidden_files, hf_hidden_files)
# %% Per-layer max absolute differences at the selected new-token positions.
# (The original cell titled both panels 'Max and Mean Absolute Difference' and
# set the overall title via plt.title, which only retitled the last axes;
# fixed here with accurate panel titles and fig.suptitle.)
fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharex=True)

axes[0].plot(max_adiffs_hidden_layers, label='new_token_0', color='blue')
axes[0].plot(max_adiffs_hidden_layers2, label=f'new_token_{new_token_index}', color='green')
axes[0].set_title('Max Absolute Difference')
axes[0].set_xlabel('Hidden Layer Index')
axes[0].set_ylabel('Max Absolute Difference')
axes[0].legend()
axes[0].grid(True)

axes[1].plot(max_adiffs_hidden_layers, label='new_token_0', color='blue')
axes[1].plot(max_adiffs_hidden_layers3, label=f'new_token_{new_token_index1}', color='green')
axes[1].set_title('Max Absolute Difference')
axes[1].set_xlabel('Hidden Layer Index')
axes[1].set_ylabel('Max Absolute Difference')
axes[1].legend()
axes[1].grid(True)

fig.suptitle('Per-layer Max Absolute Differences')
plt.tight_layout()
plt.show()
# %% Note: hf_tokens_bf16 and fm_tokens_b16 are only assigned two cells below;
# this cell depends on state kept from a previous run of the notebook.
print(hf_tokens_bf16[106:120])
print(fm_tokens_b16[106:120])

# %%
print(hf_tokens[106:120])
print(fm_tokens[106:120])

# %% Keep the current token streams around under dtype-specific names.
hf_tokens_bf16 = hf_tokens
fm_tokens_b16 = fm_tokens
# %% First divergence: HF vs. Fast-LLM tokens.
min(len(hf_tokens)+1 if ab[0] == ab[1] else i for i, ab in enumerate(zip(hf_tokens, fm_tokens)))

# %% First divergence: HF tokens vs. the saved bf16 HF tokens.
min(len(hf_tokens)+1 if ab[0] == ab[1] else i for i, ab in enumerate(zip(hf_tokens, hf_tokens_bf16)))

# %% First divergence: Fast-LLM tokens vs. the saved bf16 Fast-LLM tokens.
min(len(hf_tokens)+1 if ab[0] == ab[1] else i for i, ab in enumerate(zip(fm_tokens, fm_tokens_b16)))

# %% First divergence: HF vs. Fast-LLM again.
min(len(hf_tokens)+1 if ab[0] == ab[1] else i for i, ab in enumerate(zip(hf_tokens, fm_tokens)))
# %%
import safetensors

# %% This is just to show the possibility; it assumes no conversion of key names
# or tensors, and no aggregation of tensors, is needed.
def load(path, model):
    with safetensors.safe_open(path, 'pt', device=model.distributed.device) as f:
        key = 'model.embed_tokens.weight'
        # This would load only this (tensor-parallel, etc.) rank's part of the
        # tensor; get_local_slice_ranges would return a multidimensional range object.
        tensor = f.get_slice(key)[model.get_local_slice_ranges(key)]
        model.import_tensor(key, tensor)
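# For contrast, a full (non-sharded) load with safetensors' public API;
# a minimal sketch with a hypothetical checkpoint path, not part of the original notebook.
def load_full(path="model.safetensors"):
    # Materialize every tensor in the checkpoint on the CPU.
    with safetensors.safe_open(path, 'pt', device='cpu') as f:
        return {key: f.get_tensor(key) for key in f.keys()}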
# %%
from fast_llm.engine.distributed.config import DistributedConfig

# %% Print a Markdown table showing how each global rank maps onto the parallel dimensions.
print("| rank | local_rank | tensor_rank | pipeline_rank | data_rank | sequence_data_rank | batch_data_rank | | | | | | |")
print("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |")
for rank in range(16):
    cfg = DistributedConfig(rank=rank, world_size=16, local_world_size=8, tensor_parallel=2, pipeline_parallel=2, sequence_data_parallel=2, pipeline_first=True)
    res = f"| {cfg.rank} | {cfg.local_rank} | {cfg.tensor_rank} | {cfg.pipeline_rank} | {cfg.data_rank} | {cfg.sequence_data_rank} | {cfg.batch_data_rank} |"
    for name, dm in cfg.distributed_dims.items():
        if name == 'world':
            continue
        res += f"{name}_{dm.id} |"
    print(res)
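# Assuming the usual factorization, world_size = tensor_parallel * pipeline_parallel
# * data_parallel = 2 * 2 * 4 = 16, with data_parallel itself split into
# sequence_data_parallel = 2 and batch_data_parallel = 2.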
# %% Rebuild just the per-dimension group ids for the last cfg.
res = '|'
for name, dm in cfg.distributed_dims.items():
    if name == 'world':
        continue
    res += f"{name}_{dm.id} |"

# %%
res
# %%
import pickle

# %% Inspect a batch dumped during an lm_eval run.
with open("/mnt/checkpoints/test/denis/smol_eval_experiment_test/lm_eval/batch_0.pkl", 'rb') as f:
    data = pickle.load(f)

# %%
data[0]

# %%
data[1:]
(The notebook ends with an empty cell; kernel ".venv", Python 3.12.3.)