STABILITY: SMP improvements — per-CPU state via GS base (swapgs on interrupt/syscall entry and exit), per-CPU kernel syscall stacks, and per-CPU deferred stack/PML4 cleanup in the scheduler

This commit is contained in:
boreddevnl 2026-04-12 00:26:04 +02:00
parent 38ed0b5ffa
commit afc4e16fcf
11 changed files with 152 additions and 80 deletions

3
.gitignore vendored
View file

@ -29,4 +29,5 @@ limine
**/.DS_Store **/.DS_Store
.DS_Store .DS_Store
/build/ /build/
*.o *.o
disk.img

BIN
disk.img

Binary file not shown.

View file

@ -45,7 +45,11 @@ isr%2_wrapper:
push r14 push r14
push r15 push r15
; Save SSE/FPU state (fxsave requires 16-byte alignment) test qword [rsp + 144], 3
jz %%skip_swap
swapgs
%%skip_swap:
sub rsp, 512 sub rsp, 512
fxsave [rsp] fxsave [rsp]
@ -76,6 +80,12 @@ isr%2_wrapper:
pop rcx pop rcx
pop rbx pop rbx
pop rax pop rax
test qword [rsp + 24], 3
jz %%skip_swap_back
swapgs
%%skip_swap_back:
add rsp, 16 ; drop dummy vector and error code add rsp, 16 ; drop dummy vector and error code
iretq iretq
%endmacro %endmacro
@ -163,8 +173,12 @@ exception_common:
push r13 push r13
push r14 push r14
push r15 push r15
test qword [rsp + 144], 3
jz .skip_swap_exc
swapgs
.skip_swap_exc:
; Save SSE/FPU state (fxsave requires 16-byte alignment)
sub rsp, 512 sub rsp, 512
fxsave [rsp] fxsave [rsp]
@ -196,6 +210,12 @@ exception_common:
pop rcx pop rcx
pop rbx pop rbx
pop rax pop rax
test qword [rsp + 24], 3
jz .skip_swap_back_exc
swapgs
.skip_swap_back_exc:
add rsp, 16 ; drop vector and error code add rsp, 16 ; drop vector and error code
iretq iretq

View file

@ -15,15 +15,14 @@ section .text
; R9 = arg5 ; R9 = arg5
syscall_entry: syscall_entry:
; 1. Switch to Kernel Stack safely swapgs
; Note: For true SMP safety, we need per-CPU storage (via swapgs).
; For now, we use a global scratch which is only safe because we mask interrupts on entry. mov [gs:40], rsp
mov [rel user_rsp_scratch], rsp mov rsp, [gs:48]
mov rsp, [rel kernel_syscall_stack]
; 2. Build iretq frame (compatible with registers_t) ; 2. Build iretq frame
push 0x1B ; SS (User Data) push 0x1B ; SS (User Data)
push qword [rel user_rsp_scratch] ; RSP push qword [gs:40] ; RSP
push r11 ; RFLAGS (captured by syscall) push r11 ; RFLAGS (captured by syscall)
push 0x23 ; CS (User Code) push 0x23 ; CS (User Code)
push rcx ; RIP (return address from syscall) push rcx ; RIP (return address from syscall)
@ -81,14 +80,7 @@ syscall_entry:
pop rax pop rax
add rsp, 16 ; drop int_no/err_code add rsp, 16 ; drop int_no/err_code
; Debug: check RIP before iretq swapgs
; We can't easily print from here without destroying registers,
; but we can at least check if it's canonical.
iretq iretq
section .bss section .bss
global kernel_syscall_stack
global user_rsp_scratch
kernel_syscall_stack: resq 1
user_rsp_scratch: resq 1

View file

@ -158,6 +158,8 @@ void kmain(void) {
// The memory manager will now scan the memory map and manage all usable regions. // The memory manager will now scan the memory map and manage all usable regions.
memory_manager_init_from_memmap(memmap_request.response); memory_manager_init_from_memmap(memmap_request.response);
serial_write("[DEBUG] memory_manager_init OK\n"); serial_write("[DEBUG] memory_manager_init OK\n");
smp_init_bsp();
serial_write("[DEBUG] smp_init_bsp OK\n");
} else { } else {
serial_write("[DEBUG] ERROR: No usable memory for heap! Check Limine memmap.\n"); serial_write("[DEBUG] ERROR: No usable memory for heap! Check Limine memmap.\n");
hcf(); hcf();

View file

@ -1561,15 +1561,11 @@ static int vfs_realfs_read(void *fs_private, void *file_handle, void *buf, int s
uint8_t *cluster_buf = (uint8_t*)kmalloc(cluster_size); uint8_t *cluster_buf = (uint8_t*)kmalloc(cluster_size);
if (!cluster_buf) return -1; if (!cluster_buf) return -1;
extern void serial_write(const char*);
serial_write("[VFS] vfs_realfs_read enter\n");
uint64_t rflags = spinlock_acquire_irqsave(&vol->lock); uint64_t rflags = spinlock_acquire_irqsave(&vol->lock);
int ret = realfs_read_file(handle, buf, size, cluster_buf); int ret = realfs_read_file(handle, buf, size, cluster_buf);
spinlock_release_irqrestore(&vol->lock, rflags); spinlock_release_irqrestore(&vol->lock, rflags);
kfree(cluster_buf); kfree(cluster_buf);
serial_write("[VFS] vfs_realfs_read exit\n");
return ret; return ret;
} }
@ -1583,15 +1579,11 @@ static int vfs_realfs_write(void *fs_private, void *file_handle, const void *buf
uint8_t *cluster_buf = (uint8_t*)kmalloc(cluster_size); uint8_t *cluster_buf = (uint8_t*)kmalloc(cluster_size);
if (!cluster_buf) return -1; if (!cluster_buf) return -1;
extern void serial_write(const char*);
serial_write("[VFS] vfs_realfs_write enter\n");
uint64_t rflags = spinlock_acquire_irqsave(&vol->lock); uint64_t rflags = spinlock_acquire_irqsave(&vol->lock);
int ret = realfs_write_file(handle, buf, size, cluster_buf); int ret = realfs_write_file(handle, buf, size, cluster_buf);
spinlock_release_irqrestore(&vol->lock, rflags); spinlock_release_irqrestore(&vol->lock, rflags);
kfree(cluster_buf); kfree(cluster_buf);
serial_write("[VFS] vfs_realfs_write exit\n");
return ret; return ret;
} }

View file

@ -23,10 +23,10 @@ process_t processes[MAX_PROCESSES] __attribute__((aligned(16)));
int process_count = 0; int process_count = 0;
static process_t* current_process[MAX_CPUS_SCHED] = {0}; // Per-CPU static process_t* current_process[MAX_CPUS_SCHED] = {0}; // Per-CPU
static uint32_t next_pid = 0; static uint32_t next_pid = 0;
static void *free_kernel_stack_later = NULL; static void *free_kernel_stack_later[MAX_CPUS_SCHED] = {0};
static uint64_t free_pml4_later = 0; static uint64_t free_pml4_later[MAX_CPUS_SCHED] = {0};
static spinlock_t runqueue_lock = SPINLOCK_INIT; static spinlock_t runqueue_lock = SPINLOCK_INIT;
static uint32_t next_cpu_assign = 1; // Round-robin CPU assignment (start from CPU 1) static uint32_t next_cpu_assign = 1;
void process_init(void) { void process_init(void) {
for (int i = 0; i < MAX_PROCESSES; i++) { for (int i = 0; i < MAX_PROCESSES; i++) {
@ -376,21 +376,35 @@ process_t* process_get_current(void) {
} }
uint64_t process_schedule(uint64_t current_rsp) { uint64_t process_schedule(uint64_t current_rsp) {
if (free_kernel_stack_later) { uint32_t my_cpu = smp_this_cpu_id();
kfree(free_kernel_stack_later); uint64_t rflags = spinlock_acquire_irqsave(&runqueue_lock);
free_kernel_stack_later = NULL;
void *cleanup_stack = NULL;
uint64_t cleanup_pml4 = 0;
if (free_kernel_stack_later[my_cpu]) {
cleanup_stack = free_kernel_stack_later[my_cpu];
free_kernel_stack_later[my_cpu] = NULL;
} }
if (free_pml4_later) { if (free_pml4_later[my_cpu]) {
extern void paging_destroy_user_pml4_phys(uint64_t pml4_phys); cleanup_pml4 = free_pml4_later[my_cpu];
paging_destroy_user_pml4_phys(free_pml4_later); free_pml4_later[my_cpu] = 0;
free_pml4_later = 0;
} }
uint32_t my_cpu = smp_this_cpu_id();
process_t *cur = current_process[my_cpu]; process_t *cur = current_process[my_cpu];
if (!cur || !cur->next || cur == cur->next) if (!cur || !cur->next || cur == cur->next) {
spinlock_release_irqrestore(&runqueue_lock, rflags);
// Perform cleanup outside the lock
if (cleanup_stack) kfree(cleanup_stack);
if (cleanup_pml4) {
extern void paging_destroy_user_pml4_phys(uint64_t pml4_phys);
paging_destroy_user_pml4_phys(cleanup_pml4);
}
return current_rsp; return current_rsp;
}
// Save context // Save context
cur->rsp = current_rsp; cur->rsp = current_rsp;
@ -412,11 +426,8 @@ uint64_t process_schedule(uint64_t current_rsp) {
next_proc = next_proc->next; next_proc = next_proc->next;
} }
// If we didn't find a ready process for our CPU, stay on current (unless we are terminated)
if (next_proc->cpu_affinity != my_cpu || next_proc->pid == 0xFFFFFFFF) { if (next_proc->cpu_affinity != my_cpu || next_proc->pid == 0xFFFFFFFF) {
// Fallback to idle if current is terminated
if (cur && cur->pid == 0xFFFFFFFF) { if (cur && cur->pid == 0xFFFFFFFF) {
// Find the idle process for this CPU
for (int i = 0; i < MAX_PROCESSES; i++) { for (int i = 0; i < MAX_PROCESSES; i++) {
if (processes[i].pid == 0 || (processes[i].cpu_affinity == my_cpu && processes[i].is_user == false)) { if (processes[i].pid == 0 || (processes[i].cpu_affinity == my_cpu && processes[i].is_user == false)) {
next_proc = &processes[i]; next_proc = &processes[i];
@ -424,18 +435,25 @@ uint64_t process_schedule(uint64_t current_rsp) {
} }
} }
} else { } else {
spinlock_release_irqrestore(&runqueue_lock, rflags);
if (cleanup_stack) kfree(cleanup_stack);
if (cleanup_pml4) {
extern void paging_destroy_user_pml4_phys(uint64_t pml4_phys);
paging_destroy_user_pml4_phys(cleanup_pml4);
}
return current_rsp; return current_rsp;
} }
} }
current_process[my_cpu] = next_proc; current_process[my_cpu] = next_proc;
// Update Kernel Stack for User Mode interrupts and System Calls
if (current_process[my_cpu]->is_user && current_process[my_cpu]->kernel_stack) { if (current_process[my_cpu]->is_user && current_process[my_cpu]->kernel_stack) {
tss_set_stack_cpu(my_cpu, current_process[my_cpu]->kernel_stack); tss_set_stack_cpu(my_cpu, current_process[my_cpu]->kernel_stack);
if (my_cpu == 0) { cpu_state_t *cpu_state = smp_get_cpu(my_cpu);
extern uint64_t kernel_syscall_stack; if (cpu_state) {
kernel_syscall_stack = current_process[my_cpu]->kernel_stack; cpu_state->kernel_syscall_stack = current_process[my_cpu]->kernel_stack;
} }
} }
@ -443,8 +461,16 @@ uint64_t process_schedule(uint64_t current_rsp) {
paging_switch_directory(current_process[my_cpu]->pml4_phys); paging_switch_directory(current_process[my_cpu]->pml4_phys);
current_process[my_cpu]->ticks++; current_process[my_cpu]->ticks++;
uint64_t next_rsp = current_process[my_cpu]->rsp;
return current_process[my_cpu]->rsp; spinlock_release_irqrestore(&runqueue_lock, rflags);
if (cleanup_stack) kfree(cleanup_stack);
if (cleanup_pml4) {
extern void paging_destroy_user_pml4_phys(uint64_t pml4_phys);
paging_destroy_user_pml4_phys(cleanup_pml4);
}
return next_rsp;
} }
process_t* process_get_by_pid(uint32_t pid) { process_t* process_get_by_pid(uint32_t pid) {
@ -600,25 +626,22 @@ uint64_t process_terminate_current(void) {
// 4. Load context for the NEXT process // 4. Load context for the NEXT process
if (current_process[my_cpu]->is_user && current_process[my_cpu]->kernel_stack) { if (current_process[my_cpu]->is_user && current_process[my_cpu]->kernel_stack) {
tss_set_stack_cpu(my_cpu, current_process[my_cpu]->kernel_stack); tss_set_stack_cpu(my_cpu, current_process[my_cpu]->kernel_stack);
if (my_cpu == 0) { cpu_state_t *cpu_state = smp_get_cpu(my_cpu);
extern uint64_t kernel_syscall_stack; if (cpu_state) {
kernel_syscall_stack = current_process[my_cpu]->kernel_stack; cpu_state->kernel_syscall_stack = current_process[my_cpu]->kernel_stack;
} }
} }
paging_switch_directory(current_process[my_cpu]->pml4_phys); paging_switch_directory(current_process[my_cpu]->pml4_phys);
// 5. Free memory
if (to_delete->user_stack_alloc) kfree(to_delete->user_stack_alloc); kfree(to_delete->user_stack_alloc);
to_delete->user_stack_alloc = NULL;
extern void paging_destroy_user_pml4_phys(uint64_t pml4_phys);
if (to_delete->pml4_phys && to_delete->is_user) {
paging_destroy_user_pml4_phys(to_delete->pml4_phys);
} }
to_delete->user_stack_alloc = NULL; free_kernel_stack_later[my_cpu] = to_delete->kernel_stack_alloc;
free_kernel_stack_later = to_delete->kernel_stack_alloc;
to_delete->kernel_stack_alloc = NULL; to_delete->kernel_stack_alloc = NULL;
free_pml4_later[my_cpu] = to_delete->pml4_phys;
to_delete->pml4_phys = 0; to_delete->pml4_phys = 0;
uint64_t next_rsp = current_process[my_cpu]->rsp; uint64_t next_rsp = current_process[my_cpu]->rsp;
@ -666,4 +689,3 @@ process_t* process_get_by_ui_window(void *win) {
} }
return NULL; return NULL;
} }

View file

@ -18,6 +18,16 @@ extern void serial_write_hex(uint64_t n);
static cpu_state_t *cpu_states = NULL; static cpu_state_t *cpu_states = NULL;
static uint32_t total_cpus = 0; static uint32_t total_cpus = 0;
static uint32_t bsp_lapic_id = 0; static uint32_t bsp_lapic_id = 0;
static cpu_state_t *bsp_cpu_state = NULL;
#define MSR_GS_BASE 0xC0000101
#define MSR_KERNEL_GS_BASE 0xC0000102
/* Write a 64-bit value to a Model-Specific Register.
 * The wrmsr instruction takes the MSR index in ECX and the value split
 * across EDX:EAX (high:low 32 bits), so we split `value` accordingly.
 * Privileged: must run at CPL 0. */
static inline void wrmsr(uint32_t msr, uint64_t value) {
uint32_t low = (uint32_t)value;
uint32_t high = (uint32_t)(value >> 32);
asm volatile("wrmsr" : : "c"(msr), "a"(low), "d"(high));
}
static uint32_t read_lapic_id(void) { static uint32_t read_lapic_id(void) {
uint32_t eax, ebx, ecx, edx; uint32_t eax, ebx, ecx, edx;
@ -27,6 +37,12 @@ static uint32_t read_lapic_id(void) {
uint32_t smp_this_cpu_id(void) { uint32_t smp_this_cpu_id(void) {
if (total_cpus <= 1) return 0; if (total_cpus <= 1) return 0;
// Use GS-based self-pointer to get the structure first
cpu_state_t *state;
asm volatile("movq %%gs:0, %0" : "=r"(state) : : "memory");
if (state) return state->cpu_id;
uint32_t lapic = read_lapic_id(); uint32_t lapic = read_lapic_id();
for (uint32_t i = 0; i < total_cpus; i++) { for (uint32_t i = 0; i < total_cpus; i++) {
if (cpu_states[i].lapic_id == lapic) return i; if (cpu_states[i].lapic_id == lapic) return i;
@ -68,13 +84,21 @@ static void ap_entry(struct limine_smp_info *info) {
extern void idt_load(void); extern void idt_load(void);
idt_load(); idt_load();
extern void syscall_init(void);
syscall_init();
uint64_t kernel_cr3 = paging_get_pml4_phys(); uint64_t kernel_cr3 = paging_get_pml4_phys();
asm volatile("mov %0, %%cr3" : : "r"(kernel_cr3)); asm volatile("mov %0, %%cr3" : : "r"(kernel_cr3));
extern void lapic_enable(void); extern void lapic_enable(void);
lapic_enable(); lapic_enable();
cpu_states[my_id].self = &cpu_states[my_id];
cpu_states[my_id].online = true; cpu_states[my_id].online = true;
cpu_states[my_id].kernel_syscall_stack = cpu_states[my_id].kernel_stack;
wrmsr(MSR_GS_BASE, (uint64_t)&cpu_states[my_id]);
wrmsr(MSR_KERNEL_GS_BASE, (uint64_t)&cpu_states[my_id]);
serial_write("[SMP] AP "); serial_write("[SMP] AP ");
serial_write_num(my_id); serial_write_num(my_id);
@ -90,6 +114,19 @@ static void ap_entry(struct limine_smp_info *info) {
work_queue_drain_loop(); work_queue_drain_loop();
} }
/* Early per-CPU setup for the bootstrap processor (BSP).
 *
 * Installs a statically-allocated cpu_state_t as the BSP's per-CPU
 * structure and points both GS_BASE and KERNEL_GS_BASE at it, so that
 * GS-relative accesses (e.g. the self-pointer read in smp_this_cpu_id,
 * and [gs:N] loads in the syscall entry path) work before the full
 * cpu_states table is allocated by smp_init().
 *
 * The structure is remembered in bsp_cpu_state so smp_init() can later
 * copy this early state into the real per-CPU table entry.
 *
 * Call once on the BSP, after the IDT/memory are up but before any code
 * that relies on gs-based per-CPU access. */
void smp_init_bsp(void) {
static cpu_state_t bsp_state_static = {0};
bsp_state_static.cpu_id = 0;                       // BSP is always logical CPU 0
bsp_state_static.lapic_id = read_lapic_id();
bsp_state_static.self = &bsp_state_static;         // gs:0 self-pointer
bsp_state_static.online = true;
// Set both MSRs so the value survives a swapgs in either direction.
wrmsr(MSR_GS_BASE, (uint64_t)&bsp_state_static);
wrmsr(MSR_KERNEL_GS_BASE, (uint64_t)&bsp_state_static);
bsp_cpu_state = &bsp_state_static;
}
// --- SMP Initialization --- // --- SMP Initialization ---
uint32_t smp_init(struct limine_smp_response *smp_resp) { uint32_t smp_init(struct limine_smp_response *smp_resp) {
if (!smp_resp || smp_resp->cpu_count <= 1) { if (!smp_resp || smp_resp->cpu_count <= 1) {
@ -132,8 +169,15 @@ uint32_t smp_init(struct limine_smp_response *smp_resp) {
cpu_states[i].lapic_id = cpu->lapic_id; cpu_states[i].lapic_id = cpu->lapic_id;
if (cpu->lapic_id == bsp_lapic_id) { if (cpu->lapic_id == bsp_lapic_id) {
cpu_states[i].online = true; cpu_states[i] = *bsp_cpu_state; // Copy early BSP state
cpu_states[i].self = &cpu_states[i];
cpu_states[i].kernel_stack = 0; // Limine stack for now
cpu_states[i].kernel_syscall_stack = 0;
bsp_index = i; bsp_index = i;
wrmsr(MSR_GS_BASE, (uint64_t)&cpu_states[i]);
wrmsr(MSR_KERNEL_GS_BASE, (uint64_t)&cpu_states[i]);
serial_write("[SMP] BSP CPU "); serial_write("[SMP] BSP CPU ");
serial_write_num(i); serial_write_num(i);
serial_write(" (LAPIC "); serial_write(" (LAPIC ");

View file

@ -8,29 +8,27 @@
#include <stdbool.h> #include <stdbool.h>
#include "spinlock.h" #include "spinlock.h"
// Per-CPU state. Dynamically allocated at boot based on actual CPU count.
typedef struct cpu_state { typedef struct cpu_state {
uint32_t cpu_id; // Logical CPU index (0 = BSP) struct cpu_state *self;
uint32_t lapic_id; // Local APIC ID from Limine uint32_t cpu_id;
uint64_t kernel_stack; // Top of kernel stack for this CPU uint32_t lapic_id;
void *kernel_stack_alloc; // Base allocation for kfree uint64_t kernel_stack;
volatile bool online; // True once AP is fully initialized void *kernel_stack_alloc;
volatile bool online;
uint64_t user_rsp_scratch;
uint64_t kernel_syscall_stack;
} cpu_state_t; } cpu_state_t;
void smp_init_bsp(void);
// Initialize SMP — call after GDT/IDT/memory init but before wm_init.
// Pass the Limine SMP response. APs will be started and will enter their
// idle loops. Returns the number of CPUs brought online.
struct limine_smp_response; struct limine_smp_response;
uint32_t smp_init(struct limine_smp_response *smp_resp); uint32_t smp_init(struct limine_smp_response *smp_resp);
// Get the current CPU index (0 = BSP). Uses CPUID to read LAPIC ID,
// then looks up in the cpu table.
uint32_t smp_this_cpu_id(void); uint32_t smp_this_cpu_id(void);
// Total number of CPUs online.
uint32_t smp_cpu_count(void); uint32_t smp_cpu_count(void);
// Get per-CPU state by index.
cpu_state_t *smp_get_cpu(uint32_t cpu_id); cpu_state_t *smp_get_cpu(uint32_t cpu_id);
#endif #endif

View file

@ -77,6 +77,14 @@ static void smp_user_wrapper(void *arg) {
} }
/* Configure the SYSCALL/SYSRET fast system-call mechanism on the current
 * CPU. Called on the BSP and on each AP (see ap_entry).
 *
 * NOTE(review): the doubled text on some lines below is an artifact of the
 * diff rendering (unchanged lines shown in both columns), not real code.
 *
 * - EFER bit 0 (SCE) enables the SYSCALL instruction.
 * - STAR[63:48] = 0x1B selects the user segment base for SYSRET
 *   (per the AMD64 convention SYSRET loads CS/SS relative to this
 *   selector); STAR[47:32] = 0x08 is the kernel code selector used
 *   by SYSCALL.
 * - LSTAR holds the 64-bit syscall entry point (syscall_entry in asm).
 * - FMASK = 0x200 clears RFLAGS.IF (bit 9) on entry, so syscalls start
 *   with interrupts disabled. */
void syscall_init(void) { void syscall_init(void) {
uint64_t efer = rdmsr(MSR_EFER);
efer |= 1;                                   // EFER.SCE: enable SYSCALL/SYSRET
wrmsr(MSR_EFER, efer);
uint64_t star = ((uint64_t)0x001B << 48) | ((uint64_t)0x0008 << 32);
wrmsr(MSR_STAR, star);
extern void syscall_entry(void);             // asm entry stub (syscall.asm)
wrmsr(MSR_LSTAR, (uint64_t)syscall_entry);
wrmsr(MSR_FMASK, 0x200);                     // mask IF on syscall entry
} }
static void user_window_close(Window *win) { static void user_window_close(Window *win) {

View file

@ -710,12 +710,7 @@ static void explorer_load_directory(Window *win, const char *path) {
int count = vfs_list_directory(path, entries, capacity); int count = vfs_list_directory(path, entries, capacity);
// Trace string to see if we reached here
extern void serial_write(const char*); extern void serial_write_num(uint32_t);
serial_write("[EXPLORER] load_directory: "); serial_write(path); serial_write(" | loop start. count: "); serial_write_num(count); serial_write("\n");
while (count == capacity) { while (count == capacity) {
serial_write("[EXPLORER] Doubling capacity to: "); serial_write_num(capacity * 2); serial_write("\n");
capacity *= 2; capacity *= 2;
vfs_dirent_t *new_entries = (vfs_dirent_t*)krealloc(entries, capacity * sizeof(vfs_dirent_t)); vfs_dirent_t *new_entries = (vfs_dirent_t*)krealloc(entries, capacity * sizeof(vfs_dirent_t));
if (!new_entries) { kfree(entries); return; } if (!new_entries) { kfree(entries); return; }
@ -723,8 +718,6 @@ static void explorer_load_directory(Window *win, const char *path) {
count = vfs_list_directory(path, entries, capacity); count = vfs_list_directory(path, entries, capacity);
} }
serial_write("[EXPLORER] load_directory loop complete.\n");
if (state->items_capacity < count) { if (state->items_capacity < count) {
int new_cap = count < EXPLORER_INITIAL_CAPACITY ? EXPLORER_INITIAL_CAPACITY : count; int new_cap = count < EXPLORER_INITIAL_CAPACITY ? EXPLORER_INITIAL_CAPACITY : count;
ExplorerItem *new_items = (ExplorerItem*)krealloc(state->items, new_cap * sizeof(ExplorerItem)); ExplorerItem *new_items = (ExplorerItem*)krealloc(state->items, new_cap * sizeof(ExplorerItem));