"""Merge Llama ``.safetensors`` shards into a single NumPy ``.npz`` archive.

Every tensor found in ``MODEL_DIR`` is cast to ``DTYPE``, merged by key into
the module-level ``weights`` dict, renamed from the ``model.layers.N...`` key
scheme to the ``layers.N.attention...`` / ``layers.N.feed_forward...`` scheme,
and written to ``OUT_PATH``.

Run as a script: ``python convert.py``.
"""
# NOTE: this file was reconstructed from a collapsed git diff.  The same diff
# also changed .gitignore so that it lists:
#   .vscode
#   llama-weights
#   llm-env
#   weights.npz

import os

import numpy as np
import torch

# -------- CONFIG --------
MODEL_DIR = "./llama-weights"  # folder containing all safetensors
OUT_PATH = "weights.npz"

DTYPE = torch.float16  # use float16 (recommended) or torch.float32
# ------------------------

# Merged tensors keyed by their original checkpoint name; filled by
# merge_tensor().
weights = {}

# Keys containing one of these tags are merged along axis 1 instead of axis 0.
# NOTE(review): presumably these weights are stored as (out_features,
# in_features), which would make axis 1 the *input* dimension -- confirm
# against how the shards were actually split.
_AXIS1_TAGS = ("o_proj", "down_proj")

# Exact checkpoint key -> output name for tensors that live outside the layers.
_TOP_LEVEL_NAMES = {
    "model.embed_tokens.weight": "tok_embeddings",
    "model.norm.weight": "norm",
    "lm_head.weight": "output",
}

# Per-layer key suffix (the part after "model.layers.<id>.") -> output suffix.
_LAYER_NAMES = {
    "self_attn.q_proj.weight": "attention.wq",
    "self_attn.k_proj.weight": "attention.wk",
    "self_attn.v_proj.weight": "attention.wv",
    "self_attn.o_proj.weight": "attention.wo",
    "mlp.gate_proj.weight": "feed_forward.w1",
    "mlp.down_proj.weight": "feed_forward.w2",
    "mlp.up_proj.weight": "feed_forward.w3",
    "input_layernorm.weight": "attention_norm",
    "post_attention_layernorm.weight": "ffn_norm",
}


# -------- MERGE FUNCTION --------
def merge_tensor(key, tensor):
    """Store ``tensor`` under ``key`` in ``weights``, merging with any earlier one.

    The first tensor seen for a key is stored unchanged.  When the key is seen
    again:

    * 2-D tensors are concatenated onto the stored one, along axis 1 if the
      key contains ``"o_proj"`` or ``"down_proj"``, otherwise along axis 0;
    * 1-D tensors (norms / biases) are ignored, so the first one is kept;
    * any other rank raises ``ValueError``.

    Args:
        key: Checkpoint key of the tensor.
        tensor: NumPy array to store or merge.

    Raises:
        ValueError: If ``key`` is already present and ``tensor`` is neither
            1-D nor 2-D.
    """
    if key not in weights:
        weights[key] = tensor
        return

    if tensor.ndim == 2:
        axis = 1 if any(tag in key for tag in _AXIS1_TAGS) else 0
        weights[key] = np.concatenate([weights[key], tensor], axis=axis)
    elif tensor.ndim == 1:
        # norms / biases -> keep first
        return
    else:
        raise ValueError(f"Unknown tensor shape: {key} {tensor.shape}")


# -------- RENAME KEYS --------
def rename_key(key):
    """Return the output name for checkpoint ``key``, or ``None`` to skip it.

    Layer keys are matched on their exact suffix after ``model.layers.<id>.``
    rather than by substring, so a sibling entry such as
    ``...q_proj.weight_scale`` is skipped instead of overwriting the real
    weight under the same output name.  Malformed keys (for example a bare
    ``"model.layers"``) return ``None`` instead of raising ``IndexError``.
    """
    if key in _TOP_LEVEL_NAMES:
        return _TOP_LEVEL_NAMES[key]

    # "model.layers.<id>.<suffix>" -> ["model", "layers", "<id>", "<suffix>"]
    parts = key.split(".", 3)
    if len(parts) == 4 and parts[0] == "model" and parts[1] == "layers":
        layer_name = _LAYER_NAMES.get(parts[3])
        if layer_name is not None:
            return f"layers.{parts[2]}.{layer_name}"
    return None


def rename_weights(merged):
    """Return a new dict holding the tensors of ``merged`` under renamed keys.

    Keys for which ``rename_key`` returns ``None`` are reported with a
    ``Skipping:`` line and left out; every kept mapping is printed together
    with the tensor's shape.
    """
    renamed = {}
    for key, tensor in merged.items():
        new_key = rename_key(key)
        if new_key is None:
            print("Skipping:", key)
            continue
        renamed[new_key] = tensor
        print(f"{key} -> {new_key} {tensor.shape}")
    return renamed


# -------- LOAD SHARDS --------
def load_shards(model_dir=MODEL_DIR, dtype=DTYPE):
    """Read every ``.safetensors`` file in ``model_dir`` into ``weights``.

    Files are processed in sorted filename order.  Each tensor is cast to
    ``dtype`` (this also covers bfloat16, which is why the cast happens before
    the NumPy conversion), converted to a NumPy array and handed to
    ``merge_tensor``.

    Args:
        model_dir: Directory that contains the shard files.
        dtype: Torch dtype every tensor is cast to.

    Returns:
        The module-level ``weights`` dict.

    Raises:
        FileNotFoundError: If ``model_dir`` does not exist (raised by
            ``os.listdir``) or contains no ``.safetensors`` file.
    """
    # Imported here so the pure helpers above stay importable on machines
    # that do not have safetensors installed.
    from safetensors import safe_open

    files = sorted(f for f in os.listdir(model_dir) if f.endswith(".safetensors"))
    print("Found shards:", files)
    if not files:
        # Without this check the script would "succeed" with an empty archive.
        raise FileNotFoundError(f"No .safetensors files found in {model_dir!r}")

    for file in files:
        path = os.path.join(model_dir, file)
        print(f"\nLoading {file}")

        with safe_open(path, framework="pt", device="cpu") as f:
            for key in f.keys():
                tensor = f.get_tensor(key).to(dtype)
                merge_tensor(key, tensor.numpy())

    return weights


def main():
    """Load all shards, rename the merged tensors and save them to ``OUT_PATH``."""
    load_shards()
    final_weights = rename_weights(weights)

    # -------- SAVE --------
    np.savez(OUT_PATH, **final_weights)

    print(f"\n✅ SUCCESS: Saved merged weights to {OUT_PATH}")


if __name__ == "__main__":
    main()