Pull request #1 (Open): wants to merge 150 commits into base: master
c9ad067
parallelize multi-head attention
leloykun Jul 24, 2023
c9b1f10
Speed up rmsnorm by using sqrtf/expf
kris-jusiak Jul 24, 2023
90ae37c
git push origin masterMerge branch 'admu-progvar-master'
karpathy Jul 24, 2023
791be9d
tweak argparse. fix steps=256, even if some models may support longer…
karpathy Jul 24, 2023
687473c
Update README.md with TinyStories model series
karpathy Jul 24, 2023
669b75d
Merge pull request #43 from krzysztof-jusiak/rmsnorm
karpathy Jul 24, 2023
bf9f6f2
Add discord link to Readme
karpathy Jul 24, 2023
16edfe6
add a simple makefile
karpathy Jul 24, 2023
2be7d78
MSVC Compatibility fix for timer
richinseattle Jul 24, 2023
e6e3f13
candidate memmap implementation
karpathy Jul 24, 2023
496466f
add rundebug to makefile, useful for spotting issues and such
karpathy Jul 24, 2023
cae88df
tune readme around timings etc
karpathy Jul 24, 2023
f121f5f
Merge branch 'karpathy:master' into richinseattle-patch-1
richinseattle Jul 24, 2023
b2857c6
Switch to using timespec_get() for cross OS compatibility
richinseattle Jul 24, 2023
d18e9ef
Merge pull request #48 from richinseattle/richinseattle-patch-1
karpathy Jul 24, 2023
a1f6b46
merge conflict resolve with imports
karpathy Jul 25, 2023
133ad3f
Merge pull request #50 from karpathy/memmap
karpathy Jul 25, 2023
c3e0d73
we can inference Meta's Llama 2 7B, yay
karpathy Jul 25, 2023
cf625ec
Update README.md
karpathy Jul 25, 2023
81c90bf
Update README.md: small tweaks
karpathy Jul 25, 2023
98ec4ba
Update README.md
karpathy Jul 25, 2023
6ce91b1
Fixed time_in_ms() compile time error (termux and neoterm)
emma-eva Jul 25, 2023
f3a1e22
intimately
RichardScottOZ Jul 25, 2023
d359fae
Merge pull request #69 from RichardScottOZ/patch-1
karpathy Jul 25, 2023
05ee4cb
fix bug in timing - use steps not max seq len doh
karpathy Jul 25, 2023
94730f1
add the 110m model, as it finished training
karpathy Jul 25, 2023
34ccb64
fix typo in readme after adding the 110m model
karpathy Jul 25, 2023
6cf34d6
Update README.md
karpathy Jul 25, 2023
ac22fbc
Update README.md: formate output samples
madroidmaq Jul 25, 2023
4d1fa2f
Export llama without llama
python273 Jul 25, 2023
366711a
Merge pull request #77 from madroidmaq/master
karpathy Jul 25, 2023
614bf91
Merge pull request #60 from emma-eva/patch-1
karpathy Jul 25, 2023
5bcd19a
Merge pull request #85 from python273/export-llama-without-llama
karpathy Jul 25, 2023
7f9f5ca
Update README.md: new llama model export
karpathy Jul 25, 2023
f565089
honestly at this point this is a lot more my nanogpt code than llama …
karpathy Jul 25, 2023
36c522a
Improve locality
aegkmq Jul 26, 2023
36bf904
Refactor freqs_cis into freqs_cos and freqs_sin, and remove complex64…
ai-doge Jul 26, 2023
8986005
Minor cleanup
aegkmq Jul 26, 2023
3aedfe5
Merge branch 'aegkmq-master'
karpathy Jul 26, 2023
f5d8797
Update README.md
karpathy Jul 26, 2023
7496ea8
Update README.md
karpathy Jul 26, 2023
7059d7d
Update README.md
karpathy Jul 26, 2023
2711ae8
make compiler tunable in Makefile, i think potentially nice and useful
karpathy Jul 26, 2023
f0f43b7
small note on traing times
karpathy Jul 26, 2023
5703448
add some code comments
kroggen Jul 26, 2023
4085e89
Merge pull request #119 from kroggen/code-comments
karpathy Jul 26, 2023
7a4ca4a
add contributing section to readme, and also notable forks section
karpathy Jul 26, 2023
c2bbe9c
link to the huggingface hub models instead
karpathy Jul 27, 2023
5f681b6
oops missed a section somehow, updating readme
karpathy Jul 27, 2023
7887133
Center align cute llama image in README
som-sama Jul 27, 2023
7f7a3b2
update openmp pragmas for MSVC compatibility
richinseattle Jul 27, 2023
530ef8e
light touchups to export script so one doesn't need to pass in a slas…
karpathy Jul 27, 2023
34cce6a
Merge pull request #126 from som-sama/patch-1
karpathy Jul 27, 2023
539dc73
fix whitespace
richinseattle Jul 27, 2023
815ce33
Merge branch 'patch-1' of https://github.com/richinseattle/llama2.c i…
karpathy Jul 27, 2023
b35e82f
Merge branch 'richinseattle-patch-1'
karpathy Jul 27, 2023
37e8c20
Windows compat: Use GetTickCount for delta timer
richinseattle Jul 27, 2023
5c55d59
Merge pull request #128 from richinseattle/patch-1
karpathy Jul 27, 2023
eff1c1b
Merge branch 'master' of github.com:karpathy/llama2.c
karpathy Jul 27, 2023
0d18fa7
Merge branch 'patch-2' of https://github.com/richinseattle/llama2.c i…
karpathy Jul 27, 2023
b7efb1b
Merge branch 'richinseattle-patch-2'
karpathy Jul 27, 2023
4a6b7a4
Include windows support header (for mmap)
richinseattle Jul 27, 2023
5b405a7
Add Windows support files with mmap impl
richinseattle Jul 27, 2023
01c06fa
readme: Include reference to go port
tmc Jul 27, 2023
14e90b5
Merge pull request #131 from tmc/patch-2
karpathy Jul 27, 2023
de6f2fc
Merge pull request #130 from richinseattle/patch-3
karpathy Jul 27, 2023
b18d325
add windows build commands
richinseattle Jul 27, 2023
a03ce1e
Merge pull request #132 from richinseattle/master
karpathy Jul 27, 2023
f19f50a
stylistic changes for the windows support ifdefs
karpathy Jul 27, 2023
4e23ad8
touchups to readme: reshuffle todos, and add a windows note
karpathy Jul 27, 2023
d281777
Update README.md
nikolaydubina Jul 27, 2023
9c0850d
add llama2.c-android to readme
Manuel030 Jul 27, 2023
abfcdf1
Improve readme: clarify dependencies and other things to install
tatellos Jul 27, 2023
1bdf5af
Replace the rand() with a portable PRNG
aegkmq Jul 27, 2023
bddde33
add Makefile option to support builds on amazon linux & centos
tairov Jul 27, 2023
2566ddf
add README section for centos 7 & amazon linux make target
tairov Jul 27, 2023
e970c27
Update README.md
nikolaydubina Jul 27, 2023
4a4663a
Merge pull request #134 from Manuel030/sync-with-upstream
karpathy Jul 27, 2023
79933a8
Merge pull request #137 from tatellos/master
karpathy Jul 27, 2023
acf1e18
remove second ifdefs for windows timing by introducing ported version…
tairov Jul 27, 2023
3435726
minor whitespaces cleanup
tairov Jul 27, 2023
9253d45
Merge pull request #139 from tairov/gnu
karpathy Jul 27, 2023
71200f3
Fix random_f32
aegkmq Jul 27, 2023
b4b9ef5
add github actions workflow to validate builds on changes in *.c, *.h…
tairov Jul 25, 2023
677bb8f
Merge branch 'win-timing' of https://github.com/tairov/llama2.c into …
karpathy Jul 27, 2023
b6d63a9
Merge branch 'tairov-win-timing'
karpathy Jul 27, 2023
cc66a20
Merge pull request #86 from tairov/master
karpathy Jul 27, 2023
459b9c8
Merge branch 'master' into patch-1
nikolaydubina Jul 27, 2023
b63cb91
Add llama2.cpp to notable forks section
leloykun Jul 27, 2023
6b3a689
Merge pull request #146 from admu-progvar/master
karpathy Jul 27, 2023
747db60
Merge pull request #133 from nikolaydubina/patch-1
karpathy Jul 27, 2023
5177633
HF checkpoints i removed the optimizer to save space, init Adam witho…
karpathy Jul 27, 2023
78952fb
propagate the dropout flag
karpathy Jul 27, 2023
0e1b0d4
Merge branch 'master' of github.com:karpathy/llama2.c
karpathy Jul 27, 2023
25b50ee
small stylistic fixes and adjustments, fix bug in Makefile, and chang…
karpathy Jul 27, 2023
e5752e1
strip leading whitespace
karpathy Jul 27, 2023
568a651
slightly tune todos of the project
karpathy Jul 27, 2023
72ba34c
fix: Use correct compiler for Win64 GCC in Makefile
murilocurti Jul 28, 2023
b4bb47b
big change: adding prompting. many LOC, but critical. ty @atamurad fo…
karpathy Jul 28, 2023
7cbb47c
update export_meta_llama_bin, get freqs_cos, freqs_sin independently.
ai-doge Jul 28, 2023
905c5c5
add mention of prompting into readme
karpathy Jul 28, 2023
2efc197
oops readme smallfix
karpathy Jul 28, 2023
9949c50
readme tweaks
karpathy Jul 28, 2023
6ce28fb
Merge branch 'master' into better-rng
aegkmq Jul 28, 2023
d04336c
Merge pull request #138 from aegkmq/better-rng
karpathy Jul 28, 2023
fd68dd2
reshuffle blocks of code a bit
karpathy Jul 28, 2023
3418fed
added repository in readme
Jul 28, 2023
356f74c
Add fp fast for better performance on windows
GabrielJadderson Jul 28, 2023
43d19ed
Update README.md
epicure Jul 28, 2023
f61807d
Merge pull request #163 from epicure/patch-1
karpathy Jul 28, 2023
6f156fd
Added Julia port to notable forks section in README.md
juvi21 Jul 29, 2023
bc36686
Add build step for win64 msys2/mingw
tairov Jul 29, 2023
ab39930
Update README.md
cgbur Jul 30, 2023
cddb05d
use ssize_t/int64 and 64bit version of ftell on windows
richinseattle Jul 30, 2023
13789ff
Added julia port to notable forks section in README.md
juvi21 Jul 30, 2023
6b6ed3d
update mmap.c to use ssize_t instead of off_t for 64bit
richinseattle Jul 30, 2023
b63dfd5
clean up windows mmap, drop 32bit support
richinseattle Jul 30, 2023
ce05cc2
Merge pull request #178 from cgbur/patch-1
karpathy Jul 31, 2023
68fc522
add vodkaslime llama.zig to readme
vodkaslime Jul 31, 2023
3b446ba
update readme
leo-du Jul 31, 2023
d0702ed
README.md - Update notable forks section
trholding Jul 31, 2023
4c0a882
add link to scala port
jrudolph Jul 31, 2023
883cda1
fix freq_cos, freq_sin serialize
ai-doge Aug 1, 2023
338f606
Merge branch 'master' into patch-1
juvi21 Aug 1, 2023
163e264
Merge pull request #197 from jrudolph/add-scala-port
karpathy Aug 1, 2023
13d22ef
Merge branch 'master' into llama2.c
karpathy Aug 1, 2023
9942a33
Merge pull request #194 from celikin/patch-1
karpathy Aug 1, 2023
f971b76
Merge pull request #188 from leo-du/llama2.c
karpathy Aug 1, 2023
502f681
Merge branch 'master' of https://github.com/ai-doge/llama2.c into ai-…
karpathy Aug 1, 2023
71e5de2
Merge branch 'ai-doge-master'
karpathy Aug 1, 2023
4a1250e
Merge pull request #149 from murilocurti/fix/makefile-win64-gcc
karpathy Aug 1, 2023
9023840
Merge branch 'master' into master
karpathy Aug 1, 2023
217667d
Merge branch 'master' into notable-forks-patch
karpathy Aug 1, 2023
c1a0c6e
Merge pull request #198 from trholding/notable-forks-patch
karpathy Aug 1, 2023
e06ff42
Merge pull request #160 from GabrielJadderson/add-fp-fast
karpathy Aug 1, 2023
221f4f9
Merge branch 'master' into patch-1
karpathy Aug 1, 2023
def12a2
Merge pull request #173 from juvi21/patch-1
karpathy Aug 1, 2023
23f6083
Merge branch 'master' into master
karpathy Aug 1, 2023
e270c6e
Update README.md: add mention of -f unroll loops option for gcc
karpathy Aug 1, 2023
e2d4a38
Merge pull request #186 from vodkaslime/master
karpathy Aug 1, 2023
b7f026f
Merge pull request #179 from richinseattle/windows-ftell64-fix
karpathy Aug 1, 2023
a8f3e1c
Merge pull request #175 from tairov/ci-mingw
karpathy Aug 1, 2023
e592ed5
Add tinyshakespeare dataset
wlamond Jul 30, 2023
b2b5514
Add link to Emscripten port in README
gohai Aug 2, 2023
8dd9bad
Update README.md
gohai Aug 2, 2023
3097430
Add Java port.
mukel Aug 2, 2023
574be29
Merge pull request #217 from mukel/llama2.java
karpathy Aug 2, 2023
5b47cd1
Merge pull request #211 from wlamond/tinyshakespeare
karpathy Aug 2, 2023
9819ae4
Merge branch 'master' into patch-1
karpathy Aug 2, 2023
af8708d
Merge pull request #216 from gohai/patch-1
karpathy Aug 2, 2023
124 changes: 124 additions & 0 deletions .github/workflows/build.yml
@@ -0,0 +1,124 @@
name: Continuous Integration

on:
  push:
    branches:
      - master
    paths: ['.github/workflows/**', '**/Makefile', '**/*.c', '**/*.h']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['**/Makefile', '**/*.c', '**/*.h']

env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

jobs:
  # check basic builds to avoid breaking changes
  ubuntu-focal-make:
    runs-on: ubuntu-20.04

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential -y

      - name: Build
        id: make_build
        run: |
          make

      - name: Build runfast
        id: make_build_runfast
        run: |
          make runfast

  macOS-latest-make:
    runs-on: macos-latest

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
        continue-on-error: true
        run: |
          brew update

      - name: Build
        id: make_build
        run: |
          make

      - name: Build runfast
        id: make_build_runfast
        run: |
          make runfast

      - name: Build clang
        id: make_build_clang
        run: |
          make run CC=clang

  windows-latest-make:
    runs-on: windows-latest

    strategy:
      matrix:
        arch:
          - amd64
          - amd64_x86
          - amd64_arm64

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v3

      - name: Setup MSBuild
        uses: microsoft/setup-msbuild@v1

      - name: Setup MSVC ${{ matrix.arch }}
        uses: ilammy/msvc-dev-cmd@v1
        with:
          arch: ${{ matrix.arch }}

      - name: Build ${{ matrix.arch }}
        id: build_msvc
        run: |
          .\build_msvc.bat

  windows-latest-mingw:
    runs-on: windows-latest

    defaults:
      run:
        shell: msys2 {0}

    strategy:
      matrix:
        include:
          - { sys: mingw64, env: x86_64 }

    steps:
      - name: Checkout
        id: checkout
        uses: actions/checkout@v3

      - uses: msys2/setup-msys2@v2
        id: setup-msys2
        with:
          msystem: ${{ matrix.sys }}
          install: mingw-w64-${{matrix.env}}-gcc make

      - name: Build ${{ matrix.sys }} ${{ matrix.env }}
        id: build_mingw
        run: |
          make win64
50 changes: 50 additions & 0 deletions Makefile
@@ -0,0 +1,50 @@
# choose your compiler, e.g. gcc/clang
# example override to clang: make run CC=clang
CC = gcc

# the most basic way of building that is most likely to work on most systems
.PHONY: run
run: run.c
	$(CC) -O3 -o run run.c -lm

# useful for a debug build, can then e.g. analyze with valgrind, example:
# $ valgrind --leak-check=full ./run out/model.bin 1.0 3
rundebug: run.c
	$(CC) -g -o run run.c -lm

# https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html
# https://simonbyrne.github.io/notes/fastmath/
# -Ofast enables all -O3 optimizations.
# It also disregards strict standards compliance, enabling optimizations
# that are not valid for all standards-compliant programs.
# It turns on -ffast-math, -fallow-store-data-races and the Fortran-specific
# -fstack-arrays (unless -fmax-stack-var-size is specified) and -fno-protect-parens.
# It turns off -fsemantic-interposition.
# In our specific application this is *probably* okay to use
.PHONY: runfast
runfast: run.c
	$(CC) -Ofast -o run run.c -lm

# additionally compiles with OpenMP, allowing multithreaded runs
# make sure to also enable multiple threads when running, e.g.:
# OMP_NUM_THREADS=4 ./run out/model.bin
.PHONY: runomp
runomp: run.c
	$(CC) -Ofast -fopenmp -march=native run.c -lm -o run

.PHONY: win64
win64:
	x86_64-w64-mingw32-gcc -Ofast -D_WIN32 -o run.exe -I. run.c win.c

# compiles with gnu11 standard flags for amazon linux, coreos, etc. compatibility
.PHONY: rungnu
rungnu:
	$(CC) -Ofast -std=gnu11 -o run run.c -lm

.PHONY: runompgnu
runompgnu:
	$(CC) -Ofast -fopenmp -std=gnu11 run.c -lm -o run

.PHONY: clean
clean:
	rm -f run
193 changes: 137 additions & 56 deletions README.md
(large diff not rendered)
1 change: 1 addition & 0 deletions build_msvc.bat
@@ -0,0 +1 @@
cl.exe /fp:fast /Ox /openmp /I. run.c win.c
112 changes: 112 additions & 0 deletions export_meta_llama_bin.py
@@ -0,0 +1,112 @@
"""
This script exports the Llama 2 weights in llama2c.bin format.
"""
import os
import sys
import struct
from pathlib import Path
import json

import torch

from model import precompute_freqs_cis


def export(p, state_dict, filepath='model.bin'):
"""export the model weights in fp32 into .bin file to be read from C"""
f = open(filepath, 'wb')

def serialize(key):
print(f"writing {key}...")
t = state_dict[key].contiguous().view(-1).type(torch.float32).numpy()
f.write(memoryview(t))
del state_dict[key]

# first write out the header
hidden_dim = state_dict['layers.0.feed_forward.w1.weight'].shape[0]
p['vocab_size'] = 32000
p['max_seq_len'] = 2048

n_kv_heads = p.get('n_kv_heads') or p['n_heads']
header = struct.pack(
'iiiiiii',
p['dim'], hidden_dim, p['n_layers'], p['n_heads'],
n_kv_heads, -p['vocab_size'], p['max_seq_len']
)
# NOTE ABOVE: -ve vocab_size is indicating that the classifier weights are present
# in the checkpoint and should be loaded.
f.write(header)

# next write out the embedding weights
print("writing tok_embeddings...")
serialize('tok_embeddings.weight')

# now all the layers
# attention weights
for i in range(p['n_layers']): serialize(f'layers.{i}.attention_norm.weight')
for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wq.weight')
for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wk.weight')
for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wv.weight')
for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wo.weight')
# ffn weights
for i in range(p['n_layers']): serialize(f'layers.{i}.ffn_norm.weight')
for i in range(p['n_layers']): serialize(f'layers.{i}.feed_forward.w1.weight')
for i in range(p['n_layers']): serialize(f'layers.{i}.feed_forward.w2.weight')
for i in range(p['n_layers']): serialize(f'layers.{i}.feed_forward.w3.weight')

# final rmsnorm
serialize('norm.weight')
# freqs_cos, freqs_sin
freqs_cos, freqs_sin = precompute_freqs_cis(p['dim'] // p['n_heads'], p['max_seq_len'] * 2)
state_dict['freqs_cos'] = freqs_cos[:p['max_seq_len']]
state_dict['freqs_sin'] = freqs_sin[:p['max_seq_len']]
serialize('freqs_cos')
serialize('freqs_sin')

# finally write the output weights
serialize('output.weight')

f.close()
print(f"wrote {filepath}")


def concat_weights(models):
state_dict = {}
for name in list(models[0]):
tensors = [model[name] for model in models]
if len(tensors) == 1 or len(tensors[0].shape) == 1:
state_dict[name] = tensors[0]
continue
is_axis_1 = (
name.startswith('tok_embeddings.')
or name.endswith('.attention.wo.weight')
or name.endswith('.feed_forward.w2.weight')
)
axis = 1 if is_axis_1 else 0
state_dict[name] = torch.cat(tensors, dim=axis)
for model in models:
del model[name]
return state_dict


def load_and_export(model_path, output_path):
params_path = os.path.join(model_path, 'params.json')
with open(params_path) as f:
params = json.load(f)
print(params)

model_paths = sorted(list(Path(model_path).glob('consolidated.*.pth')))
models = [torch.load(p, map_location='cpu') for p in model_paths]
state_dict = concat_weights(models)
del models
export(params, state_dict, output_path)


if __name__ == '__main__':
if len(sys.argv) == 1:
print('[Llama model folder path] [output path]')
exit()

model_path = sys.argv[1]
output_path = sys.argv[2]
load_and_export(model_path, output_path)
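The header written by `export()` above is just seven little-endian int32 values (28 bytes). A minimal sketch of packing and re-reading it, using made-up config numbers (these are illustrative only, not from a real checkpoint):

```python
import struct

# Hypothetical config values, for illustration only.
dim, hidden_dim, n_layers, n_heads = 288, 768, 6, 6
n_kv_heads, vocab_size, max_seq_len = 6, 32000, 256

# As in export(): a negative vocab_size flags that the classifier
# weights ('output.weight') are stored in the checkpoint.
header = struct.pack('iiiiiii', dim, hidden_dim, n_layers, n_heads,
                     n_kv_heads, -vocab_size, max_seq_len)

fields = struct.unpack('iiiiiii', header)
shared_classifier = fields[5] > 0  # False here: classifier weights follow
print(len(header), abs(fields[5]), shared_classifier)  # prints: 28 32000 False
```

The C side reads these same seven ints into its `Config` struct before mmap-ing the weights that follow.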