// Copyright (c) 2023-2026 Chris (boreddevnl)
// This software is released under the GNU General Public License v3.0. See LICENSE file for details.
// This header must be retained in any file it is present in, as per the GPL license terms.
// Kernel memory manager: provides kmalloc/kfree/krealloc for the rest of the kernel.
// Uses a slab allocator for small objects (<= 512 B) and a sorted block-list allocator for everything else.

#include "memory_manager.h"
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include "limine.h"
#include "platform.h"
#include "spinlock.h"

#define PAGE_SIZE 4096UL
#define SLAB_PAGE_MAGIC 0x534C4142U
#define SLAB_PAGE_MAGIC_INV (~SLAB_PAGE_MAGIC)

static const uint16_t slab_sizes[SLAB_CLASSES] = {8, 16, 32, 64, 128, 256, 512};

// Each slab page is exactly PAGE_SIZE. Header lives at the start, object slots follow.
// Free slots store a next-pointer at offset 0 (intrusive LIFO free-list).
typedef struct SlabPage {
    uint32_t magic;
    uint32_t magic_inv;
    uint16_t obj_size;
    uint16_t free_count;
    uint16_t total_count;
    uint16_t obj_start;   // byte offset from page base to first slot
    uint16_t class_idx;
    uint16_t _pad;
    struct SlabPage *next;
    void *freelist;
} SlabPage;

typedef struct {
    SlabPage *pages;
    size_t total_allocs;
    size_t total_frees;
} SlabCache;

// Block list starts in BSS and migrates to a heap allocation once it fills up.
static MemBlock _bootstrap_blocks[BLOCK_LIST_INITIAL_CAPACITY];
static MemBlock *block_list = _bootstrap_blocks;
static int block_capacity = BLOCK_LIST_INITIAL_CAPACITY;
static int block_count = 0;
static bool on_heap = false;
static bool growing = false;

static size_t memory_pool_size = 0;
static size_t total_allocated = 0;
static size_t peak_allocated = 0;
static uint32_t allocation_counter = 0;
static bool initialized = false;
static spinlock_t mm_lock = SPINLOCK_INIT;

static SlabCache slab_caches[SLAB_CLASSES];
static size_t slab_total_allocs = 0;
static size_t slab_total_frees = 0;

extern void serial_write(const char *str);
extern void serial_write_num(uint32_t n);

void mem_memset(void *dest, int val, size_t len) {
    uint8_t *p = (uint8_t *)dest;
    while (len--) *p++ = (uint8_t)val;
}

void mem_memcpy(void *dest, const void *src, size_t len) {
    uint8_t *d = (uint8_t *)dest;
    const uint8_t *s = (const uint8_t *)src;
    while (len--) *d++ = *s++;
}

static void mem_memmove(void *dest, const void *src, size_t len) {
    uint8_t *d = (uint8_t *)dest;
    const uint8_t *s = (const uint8_t *)src;
    if (d < s) {
        while (len--) *d++ = *s++;
    } else {
        d += len;
        s += len;
        while (len--) *(--d) = *(--s);
    }
}

static void *_kmalloc_locked(size_t size, size_t alignment);
static void _kfree_locked(void *ptr);
static bool insert_block_at(int idx, void *addr, size_t size, bool allocated, uint32_t id);
static void remove_block_at(int idx);
static bool grow_block_list(void);

// Returns a sequential tick used as an allocation timestamp.
static uint32_t get_timestamp(void) {
    static uint32_t tick = 0;
    return tick++;
}

static bool insert_block_at(int idx, void *addr, size_t size, bool allocated, uint32_t id) {
    // Proactive growth: if we're within 10 slots of full, grow the list now.
    // This ensures we always have metadata space to split blocks even during a nested kmalloc.
    if (block_count >= block_capacity - 10 && !growing) {
        grow_block_list();
    }
    if (block_count >= block_capacity) return false;
    for (int j = block_count; j > idx; j--)
        block_list[j] = block_list[j - 1];
    block_list[idx] = (MemBlock){
        .address = addr,
        .size = size,
        .allocated = allocated,
        .allocation_id = id,
        .timestamp = allocated ? get_timestamp() : 0,
    };
    block_count++;
    return true;
}
static void remove_block_at(int idx) {
    for (int j = idx; j < block_count - 1; j++)
        block_list[j] = block_list[j + 1];
    block_count--;
}

// Splits the chosen block into [head padding | allocation | tail remainder].
// All three parts are tracked as separate MemBlock entries. New memory is zero-filled.
static void *_kmalloc_locked(size_t size, size_t alignment) {
    // Default to 8-byte alignment; this satisfies the strictest scalar type (double/pointer) on x86-64.
    if (alignment == 0) alignment = 8;
    // Round size up to the next 8-byte boundary so every returned block stays naturally aligned.
    size = (size + 7) & ~7ULL;
    // First-fit search for a suitable block.
restart:
    for (int i = 0; i < block_count; i++) {
        if (block_list[i].allocated) continue;
        uintptr_t base = (uintptr_t)block_list[i].address;
        size_t bsize = block_list[i].size;
        uintptr_t aligned = (base + alignment - 1) & ~(uintptr_t)(alignment - 1);
        size_t padding = aligned - base;
        if (bsize < size + padding) continue;
        size_t tail = bsize - (size + padding);
        int extra = (padding > 0) + (tail > 0); // up to 2 new blocks: head padding + tail remainder
        // Need room for the up-to-'extra' new entries plus two slack slots.
        // Grow if needed and restart so indices stay valid.
        if (block_count + extra + 2 > block_capacity) {
            if (grow_block_list()) goto restart;
            if (block_count + extra > block_capacity) continue;
        }
        void *ptr = (void *)aligned;
        uint32_t id = ++allocation_counter;
        int cur = i;
        if (padding > 0) {
            block_list[i].size = padding;
            if (!insert_block_at(i + 1, ptr, size, true, id)) {
                block_list[i].size = bsize; // undo the shrink so the tail of the free block is not leaked
                continue;
            }
            cur = i + 1;
        } else {
            block_list[i] = (MemBlock){ ptr, size, true, id, get_timestamp() };
        }
        if (tail > 0)
            insert_block_at(cur + 1, (void *)((uintptr_t)ptr + size), tail, false, 0);
        total_allocated += size;
        if (total_allocated > peak_allocated) peak_allocated = total_allocated;
        mem_memset(ptr, 0, size);
        return ptr;
    }
    return NULL;
}

// Frees and coalesces with adjacent free neighbours (right first, then left).
static void _kfree_locked(void *ptr) {
    int i = -1;
    for (int j = 0; j < block_count; j++) {
        if (block_list[j].allocated && block_list[j].address == ptr) { i = j; break; }
    }
    if (i < 0) return;
    total_allocated -= block_list[i].size;
    block_list[i].allocated = false;
    block_list[i].allocation_id = 0;
    if (i + 1 < block_count && !block_list[i + 1].allocated &&
        (uintptr_t)block_list[i].address + block_list[i].size == (uintptr_t)block_list[i + 1].address) {
        block_list[i].size += block_list[i + 1].size;
        remove_block_at(i + 1);
    }
    if (i > 0 && !block_list[i - 1].allocated &&
        (uintptr_t)block_list[i - 1].address + block_list[i - 1].size == (uintptr_t)block_list[i].address) {
        block_list[i - 1].size += block_list[i].size;
        remove_block_at(i);
    }
}

// _kmalloc_locked can call grow_block_list again if the block list fills
// during the allocation of the new array, causing infinite recursion without this flag.
static bool grow_block_list(void) {
    if (growing) return false;
    growing = true;
    int new_cap = block_capacity * 2;
    MemBlock *nl = (MemBlock *)_kmalloc_locked((size_t)new_cap * sizeof(MemBlock), 8);
    if (!nl) { growing = false; return false; }
    mem_memcpy(nl, block_list, (size_t)block_count * sizeof(MemBlock));
    MemBlock *old_ptr = block_list;
    bool old_on_heap = on_heap;
    block_list = nl;
    block_capacity = new_cap;
    on_heap = true;
    growing = false;
    if (old_on_heap) _kfree_locked(old_ptr);
    return true;
}
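
// Worked example for the block-list path above (illustrative only; addresses are made up).
// Starting from a single free entry [0x1000, 4096], a call to _kmalloc_locked(100, 64):
//   - rounds the size up to 104 and computes the aligned start 0x1000 (already 64-byte
//     aligned), so padding = 0 and tail = 4096 - 104 = 3992;
//   - rewrites the free entry in place as an allocated [0x1000, 104] and inserts a new
//     free entry [0x1068, 3992] right after it, keeping the list sorted by address;
//   - a later _kfree_locked(0x1000) clears the allocated flag, and the right-then-left
//     coalescing merges the two entries back into one 4096-byte free block.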

// Uses insertion sort. Only called once at init on a list that is already nearly sorted by address.
static void sort_block_list(void) {
    for (int i = 1; i < block_count; i++) {
        MemBlock key = block_list[i];
        int j = i - 1;
        while (j >= 0 && (uintptr_t)block_list[j].address > (uintptr_t)key.address)
            block_list[j + 1] = block_list[j--];
        block_list[j + 1] = key;
    }
}

// Fragmentation = percentage of free memory stranded outside the largest free block.
static size_t calculate_fragmentation(void) {
    size_t free_total = memory_pool_size - total_allocated;
    if (!free_total || !total_allocated) return 0;
    size_t largest = 0;
    for (int i = 0; i < block_count; i++)
        if (!block_list[i].allocated && block_list[i].size > largest) largest = block_list[i].size;
    return 100 - (largest * 100) / free_total;
}

static int slab_class_for_size(size_t size) {
    for (int i = 0; i < SLAB_CLASSES; i++)
        if (size <= slab_sizes[i]) return i;
    return -1;
}

static inline bool slab_ptr_belongs_to_page(const SlabPage *page, const void *ptr) {
    if (!ptr) return false;
    uintptr_t uptr = (uintptr_t)ptr;
    uintptr_t base = (uintptr_t)page;
    uintptr_t off = uptr - base;
    if (off < page->obj_start || off >= PAGE_SIZE) return false;
    return ((off - page->obj_start) % page->obj_size) == 0;
}

static bool slab_page_in_cache(const SlabCache *cache, const SlabPage *target) {
    for (const SlabPage *p = cache->pages; p; p = p->next)
        if (p == target) return true;
    return false;
}

// Walk the free-list to catch double-frees before they corrupt the allocator.
static bool slab_ptr_is_free_in_page(const SlabPage *page, const void *ptr) {
    const void *it = page->freelist;
    uint16_t seen = 0;
    while (it && seen < page->total_count) {
        if (it == ptr) return true;
        if (!slab_ptr_belongs_to_page(page, it)) return false;
        it = *(void * const *)it;
        seen++;
    }
    return false;
}

static SlabPage *slab_new_page(int cls) {
    uint16_t obj_size = slab_sizes[cls];
    SlabPage *page = (SlabPage *)_kmalloc_locked(PAGE_SIZE, PAGE_SIZE);
    if (!page) return NULL;
    size_t hdr_end = sizeof(SlabPage);
    size_t obj_start = (hdr_end + obj_size - 1) & ~(size_t)(obj_size - 1);
    if (obj_start >= PAGE_SIZE) { _kfree_locked(page); return NULL; }
    uint16_t count = (uint16_t)((PAGE_SIZE - obj_start) / obj_size);
    if (!count) { _kfree_locked(page); return NULL; }
    page->magic = SLAB_PAGE_MAGIC;
    page->magic_inv = SLAB_PAGE_MAGIC_INV;
    page->obj_size = obj_size;
    page->free_count = count;
    page->total_count = count;
    page->obj_start = (uint16_t)obj_start;
    page->class_idx = (uint16_t)cls;
    page->_pad = 0;
    page->next = NULL;
    uintptr_t base = (uintptr_t)page + obj_start;
    for (uint16_t k = 0; k < count - 1; k++)
        *(void **)(base + (size_t)k * obj_size) = (void *)(base + (size_t)(k + 1) * obj_size);
    *(void **)(base + (size_t)(count - 1) * obj_size) = NULL;
    page->freelist = (void *)base;
    return page;
}
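
// Sketch of the resulting page layout (illustrative only; the exact numbers depend on
// sizeof(SlabPage), which is 40 bytes assuming the usual x86-64 struct padding):
//
//   +---------------+-------+--------+--------+-----+--------+
//   | SlabPage hdr  | (pad) | slot 0 | slot 1 | ... | slot N |
//   +---------------+-------+--------+--------+-----+--------+
//   0               40      obj_start = header size rounded up to obj_size
//
// For the 64-byte class this gives obj_start = 64 and (4096 - 64) / 64 = 63 slots.
// Each free slot's first 8 bytes hold the pointer to the next free slot (intrusive LIFO).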

// Locate the owning SlabPage by masking the lower 12 bits of ptr (pages are PAGE_SIZE-aligned).
// Runs a battery of header checks before trusting the page, guarding against wild pointers.
static inline bool slab_owns(void *ptr, SlabPage **out) {
    if (!ptr) return false;
    uintptr_t uptr = (uintptr_t)ptr;
    SlabPage *page = (SlabPage *)(uptr & ~(PAGE_SIZE - 1));
    if (page->magic != SLAB_PAGE_MAGIC) return false;
    if (page->magic_inv != SLAB_PAGE_MAGIC_INV) return false;
    if (!page->obj_size || !page->total_count) return false;
    if (page->obj_start < sizeof(SlabPage) || page->obj_start >= PAGE_SIZE) return false;
    if (page->class_idx >= SLAB_CLASSES) return false;
    if (slab_sizes[page->class_idx] != page->obj_size) return false;
    if (page->free_count > page->total_count) return false;
    uint16_t expected = (uint16_t)((PAGE_SIZE - page->obj_start) / page->obj_size);
    if (expected != page->total_count) return false;
    if (page->freelist && !slab_ptr_belongs_to_page(page, page->freelist)) return false;
    // slab_page_in_cache is checked last. It walks the linked list, so the cheap magic/bounds
    // checks above reject the vast majority of wild pointers before reaching it.
    if (!slab_page_in_cache(&slab_caches[page->class_idx], page)) return false;
    if (!slab_ptr_belongs_to_page(page, ptr)) return false;
    *out = page;
    return true;
}

static void *slab_alloc(int cls) {
    SlabCache *cache = &slab_caches[cls];
    SlabPage *page = cache->pages;
    while (page && page->free_count == 0) page = page->next;
    if (!page) {
        page = slab_new_page(cls);
        if (!page) return NULL;
        page->next = cache->pages;
        cache->pages = page;
    }
    void *obj = page->freelist;
    // Freelist head must be a kernel higher-half address. Treat anything below the conservative
    // threshold 0xFFFF000000000000 as corruption (canonical boundary is 0xFFFF800000000000).
    if ((uintptr_t)obj < 0xFFFF000000000000ULL) {
        char b[17];
        extern void itoa_hex(uint64_t, char *);
        serial_write("[SLAB] corrupt freelist cls=");
        itoa_hex((uint64_t)cls, b); serial_write(b);
        serial_write(" page=");
        itoa_hex((uint64_t)page, b); serial_write(b);
        serial_write(" fl=");
        itoa_hex((uint64_t)obj, b); serial_write(b);
        serial_write("\n");
        // Remove the corrupted page from the list to avoid hitting it again
        if (cache->pages == page) {
            cache->pages = page->next;
        } else {
            SlabPage *prev = cache->pages;
            while (prev && prev->next != page) prev = prev->next;
            if (prev) prev->next = page->next;
        }
        page->free_count = 0;
        page->freelist = NULL;
        page->next = NULL; // Isolate it
        page = slab_new_page(cls);
        if (!page) return NULL;
        page->next = cache->pages;
        cache->pages = page;
        obj = page->freelist;
    }
    page->freelist = *(void **)obj;
    page->free_count--;
    cache->total_allocs++;
    slab_total_allocs++;
    mem_memset(obj, 0, slab_sizes[cls]);
    return obj;
}

static void slab_free(void *ptr) {
    SlabPage *page;
    if (!slab_owns(ptr, &page)) return;
    // Fast over-free guard: if the page is already completely free there is nothing valid to free.
    if (page->free_count >= page->total_count) return;
    if (slab_ptr_is_free_in_page(page, ptr)) return;
    *(void **)ptr = page->freelist;
    page->freelist = ptr;
    page->free_count++;
    int cls = slab_class_for_size(page->obj_size);
    if (cls >= 0) {
        slab_caches[cls].total_frees++;
        slab_total_frees++;
    }
}

void memory_manager_init_from_memmap(struct limine_memmap_response *memmap) {
    if (initialized || !memmap) return;
    mem_memset(_bootstrap_blocks, 0, sizeof(_bootstrap_blocks));
    block_list = _bootstrap_blocks;
    block_capacity = BLOCK_LIST_INITIAL_CAPACITY;
    block_count = 0;
    on_heap = false;
    total_allocated = peak_allocated = allocation_counter = 0;
    memory_pool_size = 0;
    mem_memset(slab_caches, 0, sizeof(slab_caches));
    slab_total_allocs = slab_total_frees = 0;
    for (uint64_t i = 0; i < memmap->entry_count; i++) {
        struct limine_memmap_entry *e = memmap->entries[i];
        if (e->type != LIMINE_MEMMAP_USABLE) continue;
        uint64_t base = e->base, size = e->length;
        // Skip the first 1 MiB. Real-mode IVT, BIOS data, and legacy ROM regions live here.
        if (base < 0x100000) {
            if (base + size <= 0x100000) continue;
            size -= 0x100000 - base;
            base = 0x100000;
        }
        if (size < PAGE_SIZE) continue;
        if (block_count >= block_capacity) break;
        block_list[block_count++] = (MemBlock){
            .address = (void *)p2v(base),
            .size = size,
        };
        memory_pool_size += size;
    }
    sort_block_list();
    initialized = true;
    serial_write("[MEM] Total usable memory: ");
    serial_write_num((uint32_t)(memory_pool_size / 1024 / 1024));
    serial_write(" MB\n");
}

// Routes small (<= 512 B, alignment <= 8) requests to the slab allocator; everything else uses the block list.
void *kmalloc_aligned(size_t size, size_t alignment) {
    if (!initialized || size == 0) return NULL;
    uint64_t rflags = spinlock_acquire_irqsave(&mm_lock);
    void *ptr;
    if (alignment <= 8) {
        int cls = slab_class_for_size(size);
        if (cls >= 0) {
            ptr = slab_alloc(cls);
            spinlock_release_irqrestore(&mm_lock, rflags);
            return ptr;
        }
    }
    ptr = _kmalloc_locked(size, alignment);
    spinlock_release_irqrestore(&mm_lock, rflags);
    return ptr;
}

void *kmalloc(size_t size) { return kmalloc_aligned(size, 8); }

// kcalloc ensures memory is zeroed, which is critical for many kernel and library
// structures (like lwIP PCBs) that assume a null-initialized state.
void *kcalloc(size_t n, size_t size) {
    if (size && n > SIZE_MAX / size) return NULL; // reject requests whose n * size would overflow
    size_t total = n * size;
    void *ptr = kmalloc(total);
    if (ptr) mem_memset(ptr, 0, total);
    return ptr;
}

void kfree(void *ptr) {
    if (!ptr || !initialized) return;
    uint64_t rflags = spinlock_acquire_irqsave(&mm_lock);
    SlabPage *page;
    if (slab_owns(ptr, &page)) slab_free(ptr);
    else _kfree_locked(ptr);
    spinlock_release_irqrestore(&mm_lock, rflags);
}

void *krealloc(void *ptr, size_t new_size) {
    if (new_size == 0) { kfree(ptr); return NULL; }
    if (!ptr) return kmalloc(new_size);
    new_size = (new_size + 7) & ~7ULL;
    uint64_t rflags = spinlock_acquire_irqsave(&mm_lock);
    size_t old_size = 0;
    SlabPage *page = NULL;
    int block_idx = -1;
    bool is_slab = slab_owns(ptr, &page);
    if (is_slab) {
        old_size = page->obj_size;
    } else {
        for (int i = 0; i < block_count; i++) {
            if (block_list[i].allocated && block_list[i].address == ptr) {
                old_size = block_list[i].size;
                block_idx = i;
                break;
            }
        }
    }
    if (!old_size) {
        spinlock_release_irqrestore(&mm_lock, rflags);
        return NULL;
    }
    // Shrink-in-place and migration logic
    if (old_size > new_size) {
        if (is_slab) {
            int new_cls = slab_class_for_size(new_size);
            // If the shrink moves the allocation into a smaller slab class, fall through
            // this check to trigger standard copy-migration and free the bigger slot.
            if (new_cls < 0 || slab_sizes[new_cls] >= page->obj_size) {
                spinlock_release_irqrestore(&mm_lock, rflags);
                return ptr;
            }
        } else if (block_idx >= 0) {
            // Block allocator: only shrink in place when at least 32 bytes would be reclaimed,
            // to prevent micro-fragmentation.
            size_t diff = old_size - new_size;
            if (diff >= 32) {
                block_list[block_idx].size = new_size;
                void *tail_addr = (void *)((uintptr_t)ptr + new_size);
                if (insert_block_at(block_idx + 1, tail_addr, diff, false, 0)) {
                    total_allocated -= diff;
                    int f_idx = block_idx + 1;
                    if (f_idx + 1 < block_count && !block_list[f_idx + 1].allocated &&
                        (uintptr_t)block_list[f_idx].address + block_list[f_idx].size ==
                            (uintptr_t)block_list[f_idx + 1].address) {
                        block_list[f_idx].size += block_list[f_idx + 1].size;
                        remove_block_at(f_idx + 1);
                    }
                } else {
                    block_list[block_idx].size = old_size;
                }
            }
            spinlock_release_irqrestore(&mm_lock, rflags);
            return ptr;
        }
    }
    if (old_size == new_size) {
        spinlock_release_irqrestore(&mm_lock, rflags);
        return ptr;
    }
    int cls = slab_class_for_size(new_size);
    void *np = (cls >= 0) ? slab_alloc(cls) : _kmalloc_locked(new_size, 8);
    if (!np) {
        spinlock_release_irqrestore(&mm_lock, rflags);
        return NULL;
    }
    // Hold the lock across both the new alloc and the free of the old pointer
    // to keep the operation atomic (no other CPU can observe a partial realloc).
    // Copy only as many bytes as fit in the destination: when migrating to a smaller
    // slab class, old_size is larger than the new slot.
    mem_memmove(np, ptr, old_size < new_size ? old_size : new_size);
    if (slab_owns(ptr, &page)) slab_free(ptr);
    else _kfree_locked(ptr);
    spinlock_release_irqrestore(&mm_lock, rflags);
    return np;
}

MemStats memory_get_stats(void) {
    MemStats s = {0};
    s.total_memory = memory_pool_size;
    s.used_memory = total_allocated;
    s.available_memory = memory_pool_size - total_allocated;
    s.peak_memory_used = peak_allocated;
    s.smallest_free_block = memory_pool_size;
    for (int i = 0; i < block_count; i++) {
        if (block_list[i].allocated) {
            s.allocated_blocks++;
        } else {
            s.free_blocks++;
            if (block_list[i].size > s.largest_free_block) s.largest_free_block = block_list[i].size;
            if (block_list[i].size < s.smallest_free_block) s.smallest_free_block = block_list[i].size;
        }
    }
    if (!s.free_blocks) s.smallest_free_block = 0;
    s.fragmentation_percent = calculate_fragmentation();
    s.slab_allocs = slab_total_allocs;
    s.slab_frees = slab_total_frees;
    return s;
}
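
// The block below is an illustrative usage sketch, kept out of the build with #if 0.
// It exercises the public entry points defined above with hypothetical sizes and a
// hypothetical function name, purely to document the intended call pattern.
#if 0
static void memory_manager_smoke_test(void) {
    // Small request: served by the 64-byte slab class, returned zero-filled.
    uint32_t *counters = (uint32_t *)kmalloc(16 * sizeof(uint32_t));

    // Large, page-aligned request: served by the block-list allocator.
    void *dma_buffer = kmalloc_aligned(2 * PAGE_SIZE, PAGE_SIZE);

    // Growing a slab object past 512 B migrates it (copy + free) into the block list.
    counters = (uint32_t *)krealloc(counters, 300 * sizeof(uint32_t));

    kfree(dma_buffer);
    kfree(counters);

    MemStats s = memory_get_stats();
    serial_write("[MEM] fragmentation %: ");
    serial_write_num((uint32_t)s.fragmentation_percent);
    serial_write("\n");
}
#endif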