Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 23 additions & 25 deletions Exercises/assignment5.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ where:
opcode: opcode reserved for custom instructions.
funct3 and funct7: opcode modifiers.
```
Use custom extension opcode=0x0B with func7=1 and func3=0;
Use custom extension opcode=0x0B with funct7=9 and funct3=0.

You will need to modify `vx_intrinsics.h` to add your new VX_DOT8 instruction.

Expand Down Expand Up @@ -97,37 +97,35 @@ We recommend checking out how VX_SPLIT and VX_PRED instructions are decoded in S
- Update `Emulator::decode()` in `decode.cpp` to decode the new instruction format.

``` c++
switch (func7) {
case 1:
switch (func3) {
case 0: // DOT8
case 9: {
switch (funct3) {
case 0: { // DOT8
auto instr = std::allocate_shared<Instr>(instr_pool_, uuid, FUType::ALU);
instr->setDestReg(rd, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
break;
instr->setSrcReg(0, rs1, RegType::Integer);
instr->setSrcReg(1, rs2, RegType::Integer);
instr->setOpType(AluType::DOT8);
ibuffer.push_back(instr);
} break;

default:
std::abort();
}
} break;
```

- Update the `AluType` enum in `types.h` to add the `DOT8` type.
- Update `Emulator::execute()` in `execute.cpp` to implement the actual `VX_DOT8` emulation. You will execute the new instruction on the ALU functional unit.

``` c++
switch (func7) {
case 1:
switch (func3) {
case 0: { // DOT8
trace->fu_type = FUType::ALU;
trace->alu_type = AluType::DOT8;
trace->src_regs[0] = {RegType::Integer, rsrc0};
trace->src_regs[1] = {RegType::Integer, rsrc1};
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!warp.tmask.test(t))
continue;
// TODO:
}
rd_write = true;
} break;
} break;
}
case AluType::DOT8: {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!warp.tmask.test(t))
continue;
// TODO:
}
rd_write = true;
} break;
```

- Update `AluUnit::tick()` in `func_unit.cpp` to implement the timing of `VX_DOT8`.
Expand Down
74 changes: 40 additions & 34 deletions Exercises/assignment6.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ where:
opcode: opcode reserved for custom instructions.
funct3 and funct7: opcode modifiers.
```
Use custom extension opcode=0x0B with func7=1 and func3=0;
Use custom extension opcode=0x0B with funct7=9 and funct3=0.

You will need to modify `vx_intrinsics.h` to add your new VX_DOT8 instruction.

Expand Down Expand Up @@ -98,26 +98,32 @@ Modify the RTL code to implement the custom ISA extension. We recommend checking
- Update `hw/rtl/core/VX_decode.sv` to decode the new instruction. Select the ALU functional unit for executing this new instruction.

``` verilog
7'h01: begin
case (func3)
3'h0: begin // DOT8
ex_type = // TODO: destination functional unit
op_type = // TODO: instruction type
use_rd = // TODO: writing back to rd
// TODO: set using rd
// TODO: set using rs1
// TODO: set using rs2
end
default:;
endcase
end
`ifdef EXT_DOT8_ENABLE
7'h09: begin
case (funct3)
3'h0: begin // DOT8
ex_type = // TODO: destination functional unit
op_type = // TODO: instruction type
op_args.alu.xtype = // TODO
use_rd = // TODO: writing back to rd
// TODO: set using rd
// TODO: set using rs1
// TODO: set using rs2

end
default:;
endcase
end
`endif


```
- Create a new `VX_alu_dot8.sv` module that implements DOT8.

``` verilog
`include "VX_define.vh"

module VX_alu_dot8 #(
module VX_alu_dot8 import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter NUM_LANES = 1
) (
Expand All @@ -128,23 +134,23 @@ module VX_alu_dot8 #(
VX_execute_if.slave execute_if,

// Outputs
VX_commit_if.master commit_if
VX_result_if.master result_if
);
`UNUSED_SPARAM (INSTANCE_ID)
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam TAG_WIDTH = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + PID_WIDTH + 1 + 1;
localparam TAG_WIDTH = UUID_WIDTH + NW_WIDTH + NUM_LANES + PC_BITS + NUM_REGS_BITS + 1 + PID_WIDTH + 1 + 1;
localparam LATENCY_DOT8 = `LATENCY_DOT8;
localparam PE_RATIO = 2;
localparam NUM_PES = `UP(NUM_LANES / PE_RATIO);
// localparam MUL_LATENCY = 1;

`UNUSED_VAR (execute_if.data.op_type)
`UNUSED_VAR (execute_if.data.tid)
`UNUSED_VAR (execute_if.data.rs3_data)

wire [NUM_LANES-1:0][2*`XLEN-1:0] data_in;

for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_dot8_lanes
assign data_in[i][0 +: `XLEN] = execute_if.data.rs1_data[i];
assign data_in[i][`XLEN +: `XLEN] = execute_if.data.rs2_data[i];
end
Expand Down Expand Up @@ -182,26 +188,26 @@ module VX_alu_dot8 #(
.pe_enable (pe_enable),
.pe_data_in (pe_data_out),
.pe_data_out(pe_data_in),
.valid_out (commit_if.valid),
.data_out (commit_if.data.data),
.valid_out (result_if.valid),
.data_out (result_if.data.data),
.tag_out ({
commit_if.data.uuid,
commit_if.data.wid,
commit_if.data.tmask,
commit_if.data.PC,
commit_if.data.rd,
commit_if.data.wb,
commit_if.data.pid,
commit_if.data.sop,
commit_if.data.eop
result_if.data.uuid,
result_if.data.wid,
result_if.data.tmask,
result_if.data.PC,
result_if.data.rd,
result_if.data.wb,
result_if.data.pid,
result_if.data.sop,
result_if.data.eop
}),
.ready_out (commit_if.ready)
.ready_out (result_if.ready)
);

// PEs instancing
for (genvar i = 0; i < NUM_PES; ++i) begin
wire [XLEN-1:0] a = pe_data_in[i][0 +: XLEN];
wire [XLEN-1:0] b = pe_data_in[i][XLEN +: XLEN];
for (genvar i = 0; i < NUM_PES; ++i) begin : g_dot8
wire [XLEN-1:0] a = pe_data_in[i][0 +: XLEN]; // rs1
wire [XLEN-1:0] b = pe_data_in[i][XLEN +: XLEN]; // rs2
// TODO:
wire [31:0] result;
`BUFFER_EX(result, c, pe_enable, 1, LATENCY_DOT8); // c is the result of the dot product
Expand Down