From 080db36f89787a7aecdfbdb0ad5fddbcb5d73250 Mon Sep 17 00:00:00 2001
From: Yasser Arguelles Snape
Date: Sat, 8 Jul 2023 02:57:10 -0700
Subject: [PATCH] x64: fixed phi parallel copies (might do swaps later)

---
 tb/src/x64/generic_cg.h | 62 ++++++++++++++++++++++++++++++++++-------
 tb/src/x64/reg_alloc.h  |  6 ++--
 tb/src/x64/x64.c        | 35 ++++++++++-------------
 3 files changed, 70 insertions(+), 33 deletions(-)

diff --git a/tb/src/x64/generic_cg.h b/tb/src/x64/generic_cg.h
index 85c8dc83..6dcb6ff0 100644
--- a/tb/src/x64/generic_cg.h
+++ b/tb/src/x64/generic_cg.h
@@ -107,6 +107,11 @@ typedef struct Def {
     Clobbers* clobbers;
 } Def;
 
+typedef struct {
+    // tmp is usually -1 unless there are weird parallel copies
+    int val, tmp;
+} PhiVal;
+
 typedef NL_Map(TB_Node*, MachineBB) MachineBBs;
 typedef DynArray(DefIndex) RegAllocWorklist;
 
@@ -122,6 +127,9 @@ typedef struct {
     TB_Node* fallthrough;
     TB_PostorderWalk order;
 
+    // temporary but still precious
+    DynArray(PhiVal) phi_vals;
+
     // machine output sequences
     Inst *first, *last;
     DynArray(Def) defs;
@@ -151,6 +159,13 @@ typedef struct {
     TB_SafepointKey* safepoints;
 } Ctx;
 
+enum {
+    // dst = COPY src
+    INST_COPY = 1022,
+    INST_MOVE = 1021,
+    INST_USE = 1020,
+};
+
 #if 1
 #define ASM if (ctx->emit.emit_asm)
 #else
@@ -191,6 +206,7 @@ static bool fits_into_int32(uint64_t x) {
 }
 
 static bool wont_spill_around(int type);
+static Inst inst_move(TB_DataType dt, int lhs, int rhs);
 static int classify_reg_class(TB_DataType dt);
 static int isel(Ctx* restrict ctx, TB_Node* n);
 static void finna_use_reg(Ctx* restrict ctx, int reg_class, int reg_num);
@@ -417,14 +433,14 @@ static RegAllocWorklist liveness(Ctx* restrict ctx, TB_Function* f) {
            timeline += 2;
 
            // convert initial move into copy
-           if (inst->type == X86_INST_MOVE) {
+           if (inst->type == INST_MOVE) {
                assert(inst->regs[1] < -1);
                int di = -inst->regs[1] - 2;
 
                if (!set_get(&copy_init, di)) {
                    set_put(&copy_init, di);
 
-                   inst->type = (int) X86_INST_COPY;
+                   inst->type = INST_COPY;
                    inst->regs[0] = USE(inst->regs[1]);
                    inst->regs[1] = inst->regs[2];
                    inst->regs[2] = 0;
@@ -536,9 +552,11 @@ static void hint(Ctx* restrict ctx, DefIndex di, int reg) {
 static void phi_edge(Ctx* restrict ctx, TB_Node* dst, int index) {
     TB_NodeRegion* region = TB_NODE_GET_EXTRA(dst);
 
+    DynArray(PhiVal) phi_vals = ctx->phi_vals;
+    dyn_array_clear(phi_vals);
+
     FOREACH_N(i, 0, region->proj_count) {
         TB_Node* n = region->projs[i];
-        assert(n->type == TB_PHI);
 
         // allocate virtual register
         ptrdiff_t search = nl_map_get(ctx->values, n);
@@ -546,17 +564,40 @@
         if (search < 0) {
             dst_vreg = DEF(n, classify_reg_class(n->dt));
             nl_map_put(ctx->values, n, dst_vreg);
-
-            // log_debug("values[%p] = %d", n, dst_vreg);
         } else {
             dst_vreg = ctx->values[search].v;
-            // log_debug("reuse values[%p] (%d)", n, dst_vreg);
         }
 
-        // handle phis
-        // log_debug("phi %p: %d", n, dst_vreg);
-        copy_value(ctx, n, USE(dst_vreg), n->inputs[1 + index], n->dt);
+        PhiVal p = { dst_vreg, -1 };
+        dyn_array_put(phi_vals, p);
+    }
+
+    // do copies which depend on parallel phis (usually swaps, but we don't do those yet)
+    FOREACH_N(i, 0, region->proj_count) {
+        TB_Node* n = region->projs[i];
+        assert(n->type == TB_PHI);
+
+        if (n->inputs[1 + index]->type == TB_PHI && n->inputs[1 + index]->inputs[0] == dst) {
+            int tmp = DEF(n, classify_reg_class(n->dt));
+            copy_value(ctx, n, USE(tmp), n->inputs[1 + index], n->dt);
+            phi_vals[i].tmp = tmp;
+        }
+    }
+
+    // do normal copies
+    FOREACH_N(i, 0, region->proj_count) {
+        TB_Node* n = region->projs[i];
+
+        int dst = USE(phi_vals[i].val);
+        if (phi_vals[i].tmp >= 0) {
+            int src = USE(phi_vals[i].tmp);
+            SUBMIT(inst_move(n->dt, dst, src));
+        } else {
+            copy_value(ctx, n, dst, n->inputs[1 + index], n->dt);
+        }
     }
+
+    ctx->phi_vals = phi_vals;
 }
 
 static void schedule_effect(Ctx* restrict ctx, TB_Node* parent, TB_Node* n) {
@@ -615,7 +656,7 @@ static void compile_function(TB_Function* restrict f, TB_FunctionOutput* restric
         }
     };
 
-    // ctx.emit.emit_asm = true;
+    ctx.emit.emit_asm = true;
     /* if (ctx.emit.emit_asm) {
         tb_function_print(f, tb_default_print_callback, stdout);
     }*/
@@ -698,6 +739,7 @@ static void compile_function(TB_Function* restrict f, TB_FunctionOutput* restric
     nl_map_free(ctx.emit.labels);
     nl_map_free(ctx.values);
     nl_map_free(ctx.machine_bbs);
+    dyn_array_destroy(ctx.phi_vals);
 
     if (dyn_array_length(f->lines)) {
         f->lines[0].pos = 0;
diff --git a/tb/src/x64/reg_alloc.h b/tb/src/x64/reg_alloc.h
index eb3af03c..3d432870 100644
--- a/tb/src/x64/reg_alloc.h
+++ b/tb/src/x64/reg_alloc.h
@@ -1,6 +1,6 @@
 // TODO(NeGate): We should switch to Efficient global regsiter allocation, 2011
 // https://arxiv.org/pdf/2011.05608.pdf
-#define REG_ALLOC_LOG if (0)
+#define REG_ALLOC_LOG if (1)
 
 // returns true if used in the next n instructions
 static bool check_if_used(Ctx* restrict ctx, Inst* inst, int def_i, int n) {
@@ -81,7 +81,7 @@ static int spill_register(Ctx* restrict ctx, RegAllocWorklist* worklist, Inst* s
         // if it's used, refer to reload
         bool skip_next = false;
         FOREACH_REVERSE_N(j, 1, 4) if (inst->regs[j] == USE(split_def)) {
-            if (inst->type == X86_INST_MOVE && j == 1) {
+            if (inst->type == INST_MOVE && j == 1) {
                 skip_next = true;
                 r.old = split_def;
                 spill(ctx, inst, &r);
@@ -197,7 +197,7 @@ static void reg_alloc(Ctx* restrict ctx, TB_Function* f, RegAllocWorklist workli
         int time = d->start;
         REG_ALLOC_LOG {
             printf(" \x1b[32m# D%zu t=[%d,%d) ", di, time, d->end);
-            if (d->node) printf("%p", d->node);
+            if (d->node) printf("%p %s", d->node, tb_node_get_name(d->node));
             printf("\x1b[0m\n");
         }
 
diff --git a/tb/src/x64/x64.c b/tb/src/x64/x64.c
index 101807e6..ded227fc 100644
--- a/tb/src/x64/x64.c
+++ b/tb/src/x64/x64.c
@@ -13,12 +13,7 @@ enum {
     REG_CLASS_XMM
 };
 
-typedef enum X86_InstType {
-    // dst = COPY src
-    X86_INST_COPY = 1022,
-    X86_INST_MOVE = 1021,
-    X86_INST_USE = 1020,
-} X86_InstType;
+typedef int X86_InstType;
 
 // for memory operands imm[0] is two fields:
 //   top 32bits is scale, bottom 32bits is displacement
@@ -167,18 +162,9 @@ static Inst inst_u(int op, TB_DataType dt) {
     };
 }
 
-static Inst inst_move(TB_DataType dt, int lhs, int rhs) {
-    return (Inst){
-        .type = (int)X86_INST_MOVE,
-        .layout = X86_OP_RR,
-        .data_type = legalize(dt),
-        .regs = { -1, lhs, rhs }
-    };
-}
-
 static Inst inst_use(int src) {
     return (Inst){
-        .type = (int)X86_INST_USE,
+        .type = INST_USE,
         .layout = X86_OP_NONE,
         .data_type = TB_X86_TYPE_NONE,
         .regs = { src },
@@ -214,9 +200,18 @@ static Inst inst_g(int op, TB_DataType dt, int dst, const TB_Symbol* sym) {
     };
 }
 
+static Inst inst_move(TB_DataType dt, int lhs, int rhs) {
+    return (Inst){
+        .type = (int)INST_MOVE,
+        .layout = X86_OP_RR,
+        .data_type = legalize(dt),
+        .regs = { -1, lhs, rhs }
+    };
+}
+
 static Inst inst_copy(TB_DataType dt, int lhs, int rhs) {
     return (Inst){
-        .type = (int) X86_INST_COPY,
+        .type = INST_COPY,
         .layout = X86_OP_RR,
         .data_type = legalize(dt),
         .regs = { lhs, rhs }
@@ -1516,7 +1511,7 @@ static void emit_code(Ctx* restrict ctx) {
             };
             dyn_array_put(f->lines, l);
             continue;
-        } else if (inst->type == X86_INST_USE) {
+        } else if (inst->type == INST_USE) {
             continue;
         }
 
@@ -1615,11 +1610,11 @@ static void emit_code(Ctx* restrict ctx) {
         // TODO(NeGate): this can potentially place the prefix too early
         if (inst->prefix & INST_REP) EMIT1(&ctx->emit, 0xF3);
 
-        if (inst->type == X86_INST_MOVE) {
+        if (inst->type == INST_MOVE) {
            if (!is_value_match(&ops[1], &ops[2])) {
                inst2_print(ctx, is_fp ? FP_MOV : MOV, &ops[1], &ops[2], inst->data_type);
            }
-        } else if (inst->type == X86_INST_COPY) {
+        } else if (inst->type == INST_COPY) {
            if (!is_value_match(&ops[0], &ops[1])) {
                inst2_print(ctx, is_fp ? FP_MOV : MOV, &ops[0], &ops[1], inst->data_type);
            }
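
Note on the fix: the phi copies on a CFG edge form one parallel move, so lowering them one at a time can overwrite a register that a later copy still needs to read; a swap between two phis is the worst case (hence "might do swaps later" in the subject: real swap instructions aren't emitted yet, cycles go through temporaries instead). Below is a minimal standalone C sketch of the hazard and of the two-pass scheme phi_edge now uses. It is illustrative only; the tmp_a/tmp_b names are not from the patch.

#include <stdio.h>

int main(void) {
    // two loop-carried phis that swap on the backedge: (a, b) = (b, a)
    int a = 1, b = 2;

    // naive sequentialization: the second copy reads a value that the
    // first copy already clobbered, so the old value of `a` is lost
    int x = a, y = b;
    x = y; // x = 2
    y = x; // y = 2, should have been 1
    printf("naive:    x=%d y=%d\n", x, y);

    // the two-pass scheme from phi_edge: first save each phi-on-phi
    // input into a fresh temp (pass 1), then run the "normal copies"
    // reading from those temps (pass 2)
    int tmp_a = b; // pass 1: like the moves into phi_vals[i].tmp
    int tmp_b = a;
    a = tmp_a;     // pass 2: parallel semantics preserved
    b = tmp_b;
    printf("two-pass: a=%d b=%d\n", a, b); // a=2 b=1, a correct swap

    return 0;
}

The guard in pass one (n->inputs[1 + index]->type == TB_PHI && n->inputs[1 + index]->inputs[0] == dst) is what restricts the temporaries to inputs that are themselves phis of the same region, so ordinary phi copies still lower to a single move.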