src/shader_recompiler/frontend/maxwell/translate/impl/logic_operation_three_input_lut3.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92

# Copyright © 2022 degasus <markus@selfnet.de>
# This work is free. You can redistribute it and/or modify it under the
# terms of the Do What The Fuck You Want To Public License, Version 2,
# as published by Sam Hocevar. See http://www.wtfpl.net/ for more details.

from itertools import product

# The primitive instructions
OPS = {
    'ir.BitwiseAnd({}, {})' : (2, 1, lambda a,b: a&b),
    'ir.BitwiseOr({}, {})' : (2, 1, lambda a,b: a|b),
    'ir.BitwiseXor({}, {})' : (2, 1, lambda a,b: a^b),
    'ir.BitwiseNot({})' : (1, 0.1, lambda a: (~a) & 255), # Only tiny cost, as this can often inlined in other instructions
}

# Our database of combination of instructions
optimized_calls = {}
def cmp(lhs, rhs):
    if lhs is None: # new entry
        return True
    if lhs[3] > rhs[3]: # costs
        return True
    if lhs[3] < rhs[3]: # costs
        return False
    if len(lhs[0]) > len(rhs[0]): # string len
        return True
    if len(lhs[0]) < len(rhs[0]): # string len
        return False
    if lhs[0] > rhs[0]: # string sorting
        return True
    if lhs[0] < rhs[0]: # string sorting
        return False
    assert lhs == rhs, "redundant instruction, bug in brute force"
    return False
def register(imm, instruction, count, latency):
    # Use the sum of instruction count and latency as costs to evaluate which combination is best
    costs = count + latency

    old = optimized_calls.get(imm, None)
    new = (instruction, count, latency, costs)

    # Update if new or better
    if cmp(old, new):
        optimized_calls[imm] = new
        return True

    return False

# Constants: 0, 1 (for free)
register(0, 'ir.Imm32(0)', 0, 0)
register(255, 'ir.Imm32(0xFFFFFFFF)', 0, 0)

# Inputs: a, b, c (for free)
ta = 0xF0
tb = 0xCC
tc = 0xAA
inputs = {
    ta : 'a',
    tb : 'b',
    tc : 'c',
}
for imm, instruction in inputs.items():
    register(imm, instruction, 0, 0)
    register((~imm) & 255, 'ir.BitwiseNot({})'.format(instruction), 0.099, 0.099) # slightly cheaper NEG on inputs

# Try to combine two values from the db with an instruction.
# If it is better than the old method, update it.
while True:
    registered = 0
    calls_copy = optimized_calls.copy()
    for OP, (argc, cost, f) in OPS.items():
        for args in product(calls_copy.items(), repeat=argc):
            # unpack(transponse) the arrays
            imm = [arg[0] for arg in args]
            value = [arg[1][0] for arg in args]
            count = [arg[1][1] for arg in args]
            latency = [arg[1][2] for arg in args]

            registered += register(
                f(*imm),
                OP.format(*value),
                sum(count) + cost,
                max(latency) + cost)
    if registered == 0:
        # No update at all? So terminate
        break

# Hacky output. Please improve me to output valid C++ instead.
s = """    case {imm}:
        return {op};"""
for imm in range(256):
    print(s.format(imm=imm, op=optimized_calls[imm][0]))