[PATCH 5/5] d3dcompiler: Allocate temporary registers for variables.

Zebediah Figura zfigura at codeweavers.com
Wed Apr 1 14:12:24 CDT 2020


On 4/1/20 1:37 PM, Matteo Bruni wrote:
> On Mon, Mar 30, 2020 at 4:54 AM Zebediah Figura <z.figura12 at gmail.com> wrote:
> 
> Hi Zeb,
> 
> I have a number of questions / comments, inline.
> 
>> diff --git a/dlls/d3dcompiler_43/hlsl.y b/dlls/d3dcompiler_43/hlsl.y
>> index d6c64edcace..f828e05e335 100644
>> --- a/dlls/d3dcompiler_43/hlsl.y
>> +++ b/dlls/d3dcompiler_43/hlsl.y
>> @@ -2732,6 +2732,177 @@ static void compute_liveness(struct hlsl_ir_function_decl *entry_func)
>>       compute_liveness_recurse(entry_func->body, 0, 0);
>>   }
>>
>> +struct liveness_ctx
>> +{
>> +    size_t count;
>> +    struct
>> +    {
>> +        /* 0 if not live yet. */
>> +        unsigned int last_read;
>> +    } *regs;
>> +};
>> +
>> +static unsigned char get_dead_writemask(struct liveness_ctx *liveness,
>> +        unsigned int first_write, unsigned int index, unsigned int components)
> 
> Maybe get_available_writemask()? Same for the other similarly named functions.

Sure.

> 
>> +{
>> +    unsigned char i, writemask = 0, count = 0;
>> +
>> +    for (i = 0; i < 4; ++i)
>> +    {
>> +        if (liveness->regs[index + i].last_read <= first_write)
>> +        {
>> +            writemask |= 1 << i;
>> +            if (++count == components)
>> +                return writemask;
>> +        }
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static struct hlsl_reg allocate_temp_register(struct liveness_ctx *liveness,
>> +        unsigned int first_write, unsigned int last_read, unsigned char components)
> 
> How is this going to be different to the non-temp register allocation?

The main difference is with uniforms (and anonymous constants, in sm1), 
since they're essentially live from program entry.

That said, after rereading my uniform allocation path, I'm not sure why 
I did write it any differently. The only real reason would seem to be to 
avoid looping through the entire register array every time, but I didn't 
even bother making that optimization...

> 
>> +{
>> +    struct hlsl_reg ret = {.allocated = TRUE};
>> +    unsigned char writemask, i;
>> +    unsigned int regnum;
>> +
>> +    for (regnum = 0; regnum < liveness->count; regnum += 4)
>> +    {
>> +        if ((writemask = get_dead_writemask(liveness, first_write, regnum, components)))
>> +            break;
>> +    }
>> +    if (regnum == liveness->count)
>> +    {
>> +        liveness->count = max(liveness->count * 2, 32);
>> +        liveness->regs = d3dcompiler_realloc(liveness->regs, liveness->count * sizeof(*liveness->regs));
> 
> Do we want to use array_reserve() here?

Yes, definitely. I wrote this code before adding that, and forgot about 
it...

> 
>> +        writemask = (1 << components) - 1;
>> +    }
>> +    for (i = 0; i < 4; ++i)
>> +    {
>> +        if (writemask & (1 << i))
>> +            liveness->regs[regnum + i].last_read = last_read;
>> +    }
>> +    ret.reg = regnum / 4;
>> +    ret.writemask = writemask;
>> +    return ret;
>> +}
>> +
>> +static BOOL is_range_dead(struct liveness_ctx *liveness, unsigned int first_write,
>> +        unsigned int index, unsigned int elements)
>> +{
>> +    unsigned int i;
>> +
>> +    for (i = 0; i < elements; i += 4)
>> +    {
>> +        if (!get_dead_writemask(liveness, first_write, index + i, 4))
>> +            return FALSE;
>> +    }
>> +    return TRUE;
>> +}
>> +
>> +/* "elements" is the total number of consecutive whole registers needed. */
>> +static struct hlsl_reg allocate_temp_range(struct liveness_ctx *liveness,
>> +        unsigned int first_write, unsigned int last_read, unsigned int elements)
>> +{
>> +    struct hlsl_reg ret = {.allocated = TRUE};
>> +    unsigned int i, regnum;
>> +
>> +    elements *= 4;
>> +
>> +    for (regnum = 0; regnum < liveness->count; regnum += 4)
>> +    {
>> +        if (is_range_dead(liveness, first_write, regnum, min(elements, liveness->count - regnum)))
>> +            break;
>> +    }
>> +    if (regnum + elements >= liveness->count)
>> +    {
>> +        liveness->count = max(liveness->count * 2, regnum + elements);
>> +        liveness->regs = d3dcompiler_realloc(liveness->regs, liveness->count * sizeof(*liveness->regs));
>> +    }
>> +    for (i = 0; i < elements; ++i)
>> +        liveness->regs[regnum + i].last_read = last_read;
>> +    ret.reg = regnum / 4;
>> +    return ret;
> 
> It feels like the for should execute elements * 4 iterations, instead.

It does; "elements *= 4" above. Maybe explicitly multiplying by 4 every 
time would be clearer?

> 
>> +}
>> +
>> +static void allocate_variable_temp_register(struct hlsl_ir_var *var, struct liveness_ctx *liveness)
>> +{
>> +    if (!var->reg.allocated && var->last_read)
>> +    {
>> +        if (var->data_type->reg_size > 1)
>> +        {
>> +            var->reg = allocate_temp_range(liveness, var->first_write,
>> +                    var->last_read, var->data_type->reg_size);
>> +            TRACE("Allocated r%u-r%u to %s (liveness %u-%u).\n", var->reg.reg,
>> +                    var->reg.reg + var->data_type->reg_size - 1, var->name, var->first_write, var->last_read);
>> +        }
>> +        else
>> +        {
>> +            var->reg = allocate_temp_register(liveness, var->first_write,
>> +                    var->last_read, var->data_type->dimx);
>> +            TRACE("Allocated r%u%s to %s (liveness %u-%u).\n", var->reg.reg,
>> +                    debug_writemask(var->reg.writemask), var->name, var->first_write, var->last_read);
>> +        }
>> +    }
>> +}
>> +
>> +static void allocate_temp_registers_recurse(struct list *instrs, struct liveness_ctx *liveness)
>> +{
>> +    struct hlsl_ir_node *instr;
>> +
>> +    LIST_FOR_EACH_ENTRY(instr, instrs, struct hlsl_ir_node, entry)
>> +    {
>> +        switch (instr->type)
>> +        {
>> +        case HLSL_IR_ASSIGNMENT:
>> +        {
>> +            struct hlsl_ir_assignment *assignment = assignment_from_node(instr);
>> +            allocate_variable_temp_register(hlsl_var_from_deref(&assignment->lhs), liveness);
>> +            break;
>> +        }
>> +        case HLSL_IR_IF:
>> +        {
>> +            struct hlsl_ir_if *iff = if_from_node(instr);
>> +            allocate_temp_registers_recurse(iff->then_instrs, liveness);
>> +            if (iff->else_instrs)
>> +                allocate_temp_registers_recurse(iff->else_instrs, liveness);
>> +            break;
>> +        }
>> +        case HLSL_IR_LOOP:
>> +        {
>> +            struct hlsl_ir_loop *loop = loop_from_node(instr);
>> +            allocate_temp_registers_recurse(loop->body, liveness);
>> +            break;
>> +        }
>> +        default:
>> +            break;
>> +        }
>> +    }
>> +}
> 
> Do we need to allocate temporary registers for other instructions too,
> like expressions?

Yes, that's in a separate patch. I was torn between "submit everything 
related to RA at once so there's enough context" and "keep patch set 
sizes reviewable" :-/

> 
>> +
>> +/* Simple greedy temporary register allocation pass that just assigns a unique
>> + * index to all (simultaneously live) variables or intermediate values. Agnostic
>> + * as to how many registers are actually available for the current backend, and
>> + * does not handle constants. */
>> +static void allocate_temp_registers(struct hlsl_ir_function_decl *entry_func)
>> +{
>> +    struct liveness_ctx liveness = {};
> 
> I'm pretty sure that this is a gcc extension: you want '{0}' instead.
> 

Huh, indeed it is. I'll avoid that in the future.



More information about the wine-devel mailing list