diff --git a/src/emulator/pin/causalityTool.cpp b/src/emulator/pin/causalityTool.cpp index 216fd5d..dbd1e51 100755 --- a/src/emulator/pin/causalityTool.cpp +++ b/src/emulator/pin/causalityTool.cpp @@ -24,7 +24,7 @@ #include #include #include - +using namespace std; #include #ifndef _WIN32 @@ -688,7 +688,7 @@ VOID Instruction(INS ins, VOID *v) { UINT32 memOperands = INS_MemoryOperandCount(ins); - if (INS_IsBranchOrCall(ins))//INS_IsIndirectBranchOrCall(ins)) + if (INS_IsControlFlow(ins))//INS_IsBranchOrCall(ins))//INS_IsIndirectBranchOrCall(ins)) { INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR) BrnFun, IARG_THREAD_ID, IARG_BRANCH_TARGET_ADDR, IARG_BRANCH_TAKEN, IARG_INST_PTR, diff --git a/src/emulator/pin/makefile_linux_mac b/src/emulator/pin/makefile_linux_mac index ead1ffa..abd8b93 100755 --- a/src/emulator/pin/makefile_linux_mac +++ b/src/emulator/pin/makefile_linux_mac @@ -1,4 +1,4 @@ -PIN_KIT ?=/home/rajshekar/softwares/pin-97554/ +PIN_KIT ?=/home/rajshekar/softwares/pin-98332/ CXX=$(shell make PIN_ROOT=$(PIN_KIT) VAR=CXX -f pin_makefile print_var) LINKER=$(shell make PIN_ROOT=$(PIN_KIT) VAR=LINKER -f pin_makefile print_var) TOOL_CXXFLAGS=$(shell make PIN_ROOT=$(PIN_KIT) VAR=TOOL_CXXFLAGS -f pin_makefile print_var) @@ -50,6 +50,8 @@ $(BINDIR)/causalityTool.$(LIB_EXTENSION): $(BINDIR)/causalityTool.$(OBJ_EXTENSIO $(BINDIR)/causalityTool.$(OBJ_EXTENSION): causalityTool.cpp $(COMMDIR)/IPCBase.h $(COMMDIR)/shm/shmem.h $(COMMDIR)/filePacket/filePacket.h $(COMMDIR)/shm/shmem.cc $(CXX) $(TOOL_CXXFLAGS) $(COMM_INCLUDE) -c causalityTool.cpp ../../simulator/emulatorinterface/communication/shm/shmem.cc + mkdir $(JNIBINDIR) + mkdir $(BINDIR) mv causalityTool.$(OBJ_EXTENSION) $(BINDIR)/causalityTool.$(OBJ_EXTENSION) mv shmem.$(OBJ_EXTENSION) $(BINDIR)/shmem.$(OBJ_EXTENSION) @@ -58,16 +60,21 @@ $(BINDIR)/shmem.$(OBJ_EXTENSION): $(COMMDIR)/IPCBase.h $(COMMDIR)/shm/shmem.h $ ################################ JNI stuff comes here ############################################ 
-JNIPACKAGE = emulatorinterface.communication.shm.SharedMem -JNINCLUDE =-I/usr/lib/jvm/java-8-openjdk-amd64/include/linux -I/usr/lib/jvm/java-8-openjdk-amd64/include +#JNIPACKAGE = emulatorinterface.communication.shm.SharedMem #use this for java-8 +JNIPACKAGE = ../../simulator/emulatorinterface/communication/shm/SharedMem.java #use this for java-11 +#JNINCLUDE =-I/usr/lib/jvm/java-8-openjdk-amd64/include/linux -I/usr/lib/jvm/java-8-openjdk-amd64/include #use this for java-8 +JNINCLUDE =-I/usr/lib/jvm/java-11-openjdk-amd64/include/linux -I/usr/lib/jvm/java-11-openjdk-amd64/include #use this for java-11 JNILinkingFlags = -shared -Wall $(POSITION_INDEPENDENCE) -JAVAH = javah -jni +#JAVAH = javah -jni #use this for java-8 +JAVAH = javac #use this for java-11 $(JNIBINDIR)/libshmlib.$(LIB_EXTENSION): $(JNIBINDIR)/SharedMem.h $(COMMDIR)/shm/JNIShm.c $(COMMDIR)/common.h $(shell $(JNICOMMAND)) $(JNIBINDIR)/SharedMem.h: $(TOPBINDIR)/emulatorinterface/communication/shm/SharedMem.class - $(JAVAH) -classpath $(TOPBINDIR) -o $(JNIBINDIR)/SharedMem.h $(JNIPACKAGE) + #$(JAVAH) -classpath $(TOPBINDIR) -o $(JNIBINDIR)/SharedMem.h $(JNIPACKAGE) #use this for java-8 + $(JAVAH) -classpath $(TOPBINDIR) -h $(JNIBINDIR) $(JNIPACKAGE) #use this line and the next for java-11 + mv $(JNIBINDIR)/emulatorinterface_communication_shm_SharedMem.h $(JNIBINDIR)/SharedMem.h clean: - rm -rf $(BINDIR)/* $(JNIBINDIR)/* + rm -rf $(BINDIR) $(JNIBINDIR) diff --git a/src/simulator/config/BranchPredictorConfig.java b/src/simulator/config/BranchPredictorConfig.java index af53543..8fb28d7 100755 --- a/src/simulator/config/BranchPredictorConfig.java +++ b/src/simulator/config/BranchPredictorConfig.java @@ -5,8 +5,9 @@ public class BranchPredictorConfig { public int BHRsize; public int saturating_bits; public BP predictorMode; + public String TAGESCLLibDirectory; public static enum BP { - NoPredictor, PerfectPredictor, AlwaysTaken, AlwaysNotTaken, Tournament, Bimodal, GShare, GAg, GAp, PAg, PAp,TAGE + 
NoPredictor, PerfectPredictor, AlwaysTaken, AlwaysNotTaken, Tournament, Bimodal, GShare, GAg, GAp, PAg, PAp,TAGE, TAGE_SC_L, } } diff --git a/src/simulator/config/CoreConfig.java b/src/simulator/config/CoreConfig.java index 46e2113..0a878be 100755 --- a/src/simulator/config/CoreConfig.java +++ b/src/simulator/config/CoreConfig.java @@ -58,7 +58,9 @@ public class CoreConfig public int STLBAccessPorts; public int STLBPortOccupancy; + public int NoOfMicroOpCacheEntries; public int DecodeWidth; + public int RenameWidth; public int IssueWidth; public int RetireWidth; public int ROBSize; diff --git a/src/simulator/config/XMLParser.java b/src/simulator/config/XMLParser.java index 9801662..f612033 100755 --- a/src/simulator/config/XMLParser.java +++ b/src/simulator/config/XMLParser.java @@ -481,6 +481,9 @@ public class XMLParser core.STLBPortOccupancy = Integer.parseInt(getImmediateString("PortOccupancy", sTLBElmnt)); core.sTLBPower = getEnergyConfig(sTLBElmnt); + Element microOpCacheElmnt = (Element)(coreElmnt.getElementsByTagName("MicroOpCache")).item(0); + core.NoOfMicroOpCacheEntries = Integer.parseInt(getImmediateString("NumberOfMicroOps", microOpCacheElmnt)); + Element decodeElmnt = (Element)(coreElmnt.getElementsByTagName("Decode")).item(0); core.DecodeWidth = Integer.parseInt(getImmediateString("Width", decodeElmnt)); core.decodePower = getEnergyConfig(decodeElmnt); @@ -499,6 +502,7 @@ public class XMLParser core.resultsBroadcastBusPower = getEnergyConfig(resultsBroadcastBusElmnt); Element renameElmnt = (Element)(coreElmnt.getElementsByTagName("Rename")).item(0); + core.RenameWidth = Integer.parseInt(getImmediateString("Width", renameElmnt)); Element ratElmnt = (Element)(renameElmnt.getElementsByTagName("RAT")).item(0); core.intRATPower = getEnergyConfig((Element)ratElmnt.getElementsByTagName("Integer").item(0)); @@ -1066,9 +1070,14 @@ public class XMLParser { branchPredictor.predictorMode = BP.TAGE; } + else if(tempStr.equalsIgnoreCase("TAGE-SC-L")) + { + 
branchPredictor.predictorMode = BP.TAGE_SC_L; + } branchPredictor.PCBits = Integer.parseInt(getImmediateString("PCBits", predictorElmnt)); branchPredictor.BHRsize = Integer.parseInt(getImmediateString("BHRsize", predictorElmnt)); branchPredictor.saturating_bits = Integer.parseInt(getImmediateString("SaturatingBits", predictorElmnt)); + branchPredictor.TAGESCLLibDirectory = getImmediateString("TAGESCLLibDirectory", predictorElmnt); } private static boolean setDirectoryCoherent(String immediateString) { diff --git a/src/simulator/config/config.xml b/src/simulator/config/config.xml index ffa48bb..affeeb5 100755 --- a/src/simulator/config/config.xml +++ b/src/simulator/config/config.xml @@ -47,12 +47,12 @@ TDP = 15W - /home/rajshekar/tmp/gcc_trace + /home/rajshekar/projects/tejas/tests/test1_trace - /home/rajshekar/softwares/pin-97554/ + /home/rajshekar/softwares/pin-98332/ /home/rajshekar/projects/tejas/workspace/Tejas/src/emulator/pin/obj-pin/causalityTool.so TODO/home/prathmesh/workspace/qemu/x86_64-linux-user/qemu-x86_64 /home/prathmesh/tmp/testQemu.o - /home/rajshekar/resources/tejas_configs/ + /home/rajshekar/projects/tejas/workspace/Tejas/src/emulator/pin/obj-comm/ /home/rajshekar_resources/tejas_configs/getBenchmarkPID.sh /home/rajshekar/resources/tejas_configs/killAllDescendents.sh @@ -112,11 +112,12 @@ TDP = 15W outOfOrder - TAGE + TAGE 8 16 17 2 + /home/rajshekar/projects/tejas/workspace/Tejas/src/simulator/pipeline/branchpredictor/TAGESCL/ 0.0178 0.0962 @@ -164,6 +165,10 @@ TDP = 15W 0.00546275 0.06792852941 + + + 2304 + 6 @@ -172,6 +177,7 @@ TDP = 15W + 6 0.0045 diff --git a/src/simulator/generic/Core.java b/src/simulator/generic/Core.java index d7f8cea..ced4ed7 100755 --- a/src/simulator/generic/Core.java +++ b/src/simulator/generic/Core.java @@ -206,6 +206,10 @@ public class Core extends SimulationElement{ public int getDecodeWidth() { return coreConfig.DecodeWidth; } + + public int getRenameWidth() { + return coreConfig.RenameWidth; + } public int 
getVectorRegisterFileSize() { return coreConfig.VectorRegFileSize; diff --git a/src/simulator/generic/Statistics.java b/src/simulator/generic/Statistics.java index 5b0a736..fa72323 100755 --- a/src/simulator/generic/Statistics.java +++ b/src/simulator/generic/Statistics.java @@ -22,6 +22,7 @@ import memorysystem.nuca.NucaCache; import memorysystem.nuca.NucaCache.NucaType; import net.NocInterface; import net.Router; +import pipeline.outoforder.OutOrderExecutionEngine; import config.CoreConfig; import config.EmulatorConfig; import config.EnergyConfig; @@ -33,6 +34,7 @@ import emulatorinterface.translator.qemuTranslationCache.TranslatedInstructionCa import dram.MainMemoryDRAMController; import config.MainMemoryConfig; +import config.PipelineType; public class Statistics { @@ -199,12 +201,22 @@ public class Statistics { outputFileWriter.write("time taken\t=\t" + formatDouble((double)coreCyclesTaken[i]/GlobalClock.effectiveGlobalClockFrequency) + " microseconds\n"); outputFileWriter.write("\n"); + if(cores[i].getCoreConfig().pipelineType == PipelineType.outOfOrder) + { + outputFileWriter.write("number of micro-op cache accesses = " + ((OutOrderExecutionEngine)cores[i].getExecEngine()).getMicroOpCache().numSearches + "\n"); + outputFileWriter.write("micro-op cache hit rate = " + formatDouble((double)((OutOrderExecutionEngine)cores[i].getExecEngine()).getMicroOpCache().numHits/(double)((OutOrderExecutionEngine)cores[i].getExecEngine()).getMicroOpCache().numSearches) + "\n"); + outputFileWriter.write("\n"); + } + outputFileWriter.write("number of branches\t=\t" + cores[i].getExecEngine().getNumberOfBranches() + "\n"); outputFileWriter.write("number of mispredicted branches\t=\t" + cores[i].getExecEngine().getNumberOfMispredictedBranches() + "\n"); outputFileWriter.write("branch predictor accuracy\t=\t" + formatDouble((double)((double)(1.0 - (double)cores[i].getExecEngine().getNumberOfMispredictedBranches()/(double)cores[i].getExecEngine().getNumberOfBranches())*100.0)) + " 
%\n"); outputFileWriter.write("number of jumps\t=\t" + cores[i].getExecEngine().getNumberOfJumps() + "\n"); outputFileWriter.write("number of mispredicted jump targets\t=\t" + cores[i].getExecEngine().getNumberOfMispredictedTargets() + "\n"); outputFileWriter.write("target predictor accuracy\t=\t" + formatDouble((double)((double)(1.0 - (double)cores[i].getExecEngine().getNumberOfMispredictedTargets()/(double)cores[i].getExecEngine().getNumberOfJumps())*100.0)) + " %\n"); + outputFileWriter.write("number of predicate instructions\t=\t" + ((OutOrderExecutionEngine)cores[i].getExecEngine()).getReorderBuffer().predicateCount + "\n"); + outputFileWriter.write("number of mispredicted predicate instructions\t=\t" + ((OutOrderExecutionEngine)cores[i].getExecEngine()).getReorderBuffer().predicateMispredCount + "\n"); + outputFileWriter.write("predicate predictor accuracy\t=\t" + formatDouble((double)((double)(1.0 - (double)((OutOrderExecutionEngine)cores[i].getExecEngine()).getReorderBuffer().predicateMispredCount/(double)((OutOrderExecutionEngine)cores[i].getExecEngine()).getReorderBuffer().predicateCount)*100.0)) + " %\n"); outputFileWriter.write("\n"); outputFileWriter.write("predictor type = " + coreConfig.branchPredictor.predictorMode + "\n"); @@ -213,6 +225,14 @@ public class Statistics { outputFileWriter.write("Saturating bits = " + coreConfig.branchPredictor.saturating_bits + "\n"); outputFileWriter.write("\n"); + outputFileWriter.write("\nIW Full stall = " + ((OutOrderExecutionEngine)cores[i].getExecEngine()).getReorderBuffer().getStall1Count()); + outputFileWriter.write("\nrename stall = " + ((OutOrderExecutionEngine)cores[i].getExecEngine()).getReorderBuffer().getStall2Count()); + outputFileWriter.write("\nLSQ Full stall = " + ((OutOrderExecutionEngine)cores[i].getExecEngine()).getReorderBuffer().getStall3Count()); + outputFileWriter.write("\nROB Full stall = " + ((OutOrderExecutionEngine)cores[i].getExecEngine()).getReorderBuffer().getStall4Count()); + 
outputFileWriter.write("\nMispred stall = " + ((OutOrderExecutionEngine)cores[i].getExecEngine()).getReorderBuffer().getStall5Count()); + outputFileWriter.write("\nSerialization instruction stall = " + ((OutOrderExecutionEngine)cores[i].getExecEngine()).getReorderBuffer().getStall6Count()); + outputFileWriter.write("\n"); + } outputFileWriter.write("\n"); } diff --git a/src/simulator/memorysystem/LSQ.java b/src/simulator/memorysystem/LSQ.java index 8fe8cd4..5bac7c0 100755 --- a/src/simulator/memorysystem/LSQ.java +++ b/src/simulator/memorysystem/LSQ.java @@ -301,6 +301,13 @@ public class LSQ extends SimulationElement } public boolean isFull(boolean isLoad) + { + if(privIsFull(isLoad)) + freeOneEntry(isLoad); + return privIsFull(isLoad); + } + + private boolean privIsFull(boolean isLoad) { if(isLoad) { @@ -495,52 +502,10 @@ committed LSQEntry tmpEntry = lsqueue[i]; // if it is a store, send the request to the cache - if(tmpEntry.getType() == LSQEntry.LSQEntryType.STORE) + if(tmpEntry.getType() == LSQEntry.LSQEntryType.STORE + || tmpEntry.getType() == LSQEntryType.LOAD && tmpEntry.isForwarded()) { - if(tmpEntry.isValid() == false) - { - misc.Error.showErrorAndExit("store not ready to be committed"); - } - - boolean requestIssued = - containingMemSys.issueRequestToL1Cache(RequestType.Cache_Write, - tmpEntry.getAddr()); - - if(requestIssued == false) - { - event.addEventTime(1); - event.getEventQ().addEvent(event); - break; //removals must be in-order : if u can't commit the operation at the head, u can't commit the ones that follow it - } - - else - { - if(head == tail) - { - head = tail = -1; - } - else - { - this.head = this.incrementQ(this.head); - } - this.curNumStoresInQ--; - tmpEntry.setRemoved(true); - } - } - - //If it is a LOAD which has received its value - else if (tmpEntry.isForwarded()) - { - if(head == tail) - { - head = tail = -1; - } - else - { - this.head = this.incrementQ(this.head); - } - this.curNumLoadsInQ--; - tmpEntry.setRemoved(true); + 
tmpEntry.setCanBeRemoved(true); } //If it is a LOAD which has not yet received its value @@ -559,6 +524,82 @@ committed //incrementNumAccesses(1); } + public void freeOneEntry(boolean isLoadToBeRemoved) + { + boolean removedEnough = false; + + while(removedEnough == false) + { + LSQEntry tmpEntry = lsqueue[head]; + + if(tmpEntry.isCanBeRemoved() == false) + { + return; + } + + // if it is a store, send the request to the cache + if(tmpEntry.getType() == LSQEntry.LSQEntryType.STORE) + { + if(tmpEntry.isValid() == false) + { + misc.Error.showErrorAndExit("store not ready to be committed"); + } + + boolean requestIssued = + containingMemSys.issueRequestToL1Cache(RequestType.Cache_Write, + tmpEntry.getAddr()); + + if(requestIssued == false) + { + return; //removals must be in-order : if u can't commit the operation at the head, u can't commit the ones that follow it + } + + else + { + if(head == tail) + { + head = tail = -1; + } + else + { + this.head = this.incrementQ(this.head); + } + this.curNumStoresInQ--; + tmpEntry.setRemoved(true); + + if(isLoadToBeRemoved == false) + removedEnough = true; + } + } + + //If it is a LOAD which has received its value + else if (tmpEntry.isForwarded()) + { + if(head == tail) + { + head = tail = -1; + } + else + { + this.head = this.incrementQ(this.head); + } + this.curNumLoadsInQ--; + tmpEntry.setRemoved(true); + + if(isLoadToBeRemoved == true) + removedEnough = true; + } + + //If it is a LOAD which has not yet received its value + else + { + System.err.println("Error in LSQ " +this.containingMemSys.coreID+ " : ROB sent commit for a load which has not received its value"); + misc.Error.showErrorAndExit(tmpEntry.getIndexInQ() + " : load : " + tmpEntry.getAddr()); + } + //incrementNumAccesses(1); + } + } + void incrementNumAccesses(int incrementBy) { numAccesses += incrementBy; @@ -570,4 +611,4 @@ committed power.printEnergyStats(outputFileWriter, componentName); return power; } -} \ No newline at end of file +} diff --git 
a/src/simulator/memorysystem/LSQEntry.java b/src/simulator/memorysystem/LSQEntry.java index 415c2d2..fbe8dca 100755 --- a/src/simulator/memorysystem/LSQEntry.java +++ b/src/simulator/memorysystem/LSQEntry.java @@ -33,7 +33,7 @@ public class LSQEntry private boolean valid; private boolean issued; private boolean forwarded;//Whether the load has got its value or not - + private boolean canBeRemoved; private boolean removed; //If the entry has been committed and removed from the LSQ public enum LSQEntryType {LOAD, STORE}; @@ -45,6 +45,7 @@ public class LSQEntry valid = false; issued = false; forwarded = false; + canBeRemoved = false; removed = true; } @@ -54,6 +55,7 @@ public class LSQEntry valid = false; issued = false; forwarded = false; + canBeRemoved = false; removed = false; } @@ -108,6 +110,14 @@ public class LSQEntry this.forwarded = forwarded; } + public boolean isCanBeRemoved() { + return canBeRemoved; + } + + public void setCanBeRemoved(boolean canBeRemoved) { + this.canBeRemoved = canBeRemoved; + } + protected boolean isRemoved() { return removed; } diff --git a/src/simulator/pipeline/ExecutionEngine.java b/src/simulator/pipeline/ExecutionEngine.java index dc94c6c..ec6a78a 100755 --- a/src/simulator/pipeline/ExecutionEngine.java +++ b/src/simulator/pipeline/ExecutionEngine.java @@ -20,6 +20,7 @@ import pipeline.branchpredictor.PApPredictor; import pipeline.branchpredictor.PerfectPredictor; import pipeline.branchpredictor.TournamentPredictor; import pipeline.branchpredictor.TAGE; +import pipeline.branchpredictor.TAGESCL.TAGESCL; import pipeline.branchpredictor.BTB; import generic.Core; import generic.GenericCircularQueue; @@ -88,6 +89,8 @@ public abstract class ExecutionEngine { this.branchPredictor = new TAGE(this, coreConfig.branchPredictor.PCBits, coreConfig.branchPredictor.saturating_bits); + else if(coreConfig.branchPredictor.predictorMode == BP.TAGE_SC_L) + this.branchPredictor = new TAGESCL(this); BTB = new BTB(coreConfig.branchPredictor.PCBits, 
coreConfig.branchPredictor.BHRsize); } diff --git a/src/simulator/pipeline/OpTypeToFUTypeMapping.java b/src/simulator/pipeline/OpTypeToFUTypeMapping.java index 931afa1..0582e77 100755 --- a/src/simulator/pipeline/OpTypeToFUTypeMapping.java +++ b/src/simulator/pipeline/OpTypeToFUTypeMapping.java @@ -10,13 +10,13 @@ public class OpTypeToFUTypeMapping { public static FunctionalUnitType[] intALUFUs = {FunctionalUnitType.integerALU}; public static FunctionalUnitType[] intMulFUs = {FunctionalUnitType.integerMul}; public static FunctionalUnitType[] intDivFUs = {FunctionalUnitType.integerDiv}; - public static FunctionalUnitType[] floatALUFUs = {FunctionalUnitType.floatALU, FunctionalUnitType.FMA}; - public static FunctionalUnitType[] floatMulFUs = {FunctionalUnitType.floatMul, FunctionalUnitType.FMA}; + public static FunctionalUnitType[] floatALUFUs = {FunctionalUnitType.FMA, FunctionalUnitType.floatALU}; + public static FunctionalUnitType[] floatMulFUs = {FunctionalUnitType.FMA, FunctionalUnitType.floatMul}; public static FunctionalUnitType[] floatDivFUs = {FunctionalUnitType.floatDiv}; public static FunctionalUnitType[] intVectorALUFUs = {FunctionalUnitType.integerVectorALU}; public static FunctionalUnitType[] intVectorMulFUs = {FunctionalUnitType.integerVectorMul}; - public static FunctionalUnitType[] floatVectorALUFUs = {FunctionalUnitType.floatVectorALU, FunctionalUnitType.FMA}; - public static FunctionalUnitType[] floatVectorMulFUs = {FunctionalUnitType.floatVectorMul, FunctionalUnitType.FMA}; + public static FunctionalUnitType[] floatVectorALUFUs = {FunctionalUnitType.FMA, FunctionalUnitType.floatVectorALU}; + public static FunctionalUnitType[] floatVectorMulFUs = {FunctionalUnitType.FMA, FunctionalUnitType.floatVectorMul}; public static FunctionalUnitType[] FMAFUs = {FunctionalUnitType.FMA}; public static FunctionalUnitType[] VectorFMAFUs = {FunctionalUnitType.FMA}; public static FunctionalUnitType[] loadFUs = {FunctionalUnitType.load}; diff --git 
a/src/simulator/pipeline/branchpredictor/TAGESCL/Makefile b/src/simulator/pipeline/branchpredictor/TAGESCL/Makefile new file mode 100644 index 0000000..1b34d5b --- /dev/null +++ b/src/simulator/pipeline/branchpredictor/TAGESCL/Makefile @@ -0,0 +1,8 @@ +all: + javac -h . TAGESCLInvoker.java + #gcc -fPIC -I/usr/lib/jvm/java-8-openjdk-amd64/include/ -I/usr/lib/jvm/java-8-openjdk-amd64/include/linux/ -shared -o libnative.so pipeline_branchpredictor_TAGESCL_TAGESCLInvoker.cc + gcc -fPIC -I/usr/lib/jvm/java-11-openjdk-amd64/include/ -I/usr/lib/jvm/java-11-openjdk-amd64/include/linux/ -shared -o libnative.so pipeline_branchpredictor_TAGESCL_TAGESCLInvoker.cc +clean: + rm -f *.h + rm -f *.class + rm -f *.so diff --git a/src/simulator/pipeline/branchpredictor/TAGESCL/TAGESCL.java b/src/simulator/pipeline/branchpredictor/TAGESCL/TAGESCL.java new file mode 100644 index 0000000..f739e64 --- /dev/null +++ b/src/simulator/pipeline/branchpredictor/TAGESCL/TAGESCL.java @@ -0,0 +1,32 @@ +package pipeline.branchpredictor.TAGESCL; + +import pipeline.ExecutionEngine; +import pipeline.branchpredictor.BranchPredictor; + +public class TAGESCL extends BranchPredictor { + + TAGESCLInvoker ti; + public TAGESCL(ExecutionEngine containingExecEngine) + { + super(containingExecEngine); + ti = new TAGESCLInvoker(containingExecEngine.getContainingCore().getCoreConfig().branchPredictor.TAGESCLLibDirectory); + } + + public boolean predict(long address, boolean outcome) + { + return ti.invokerPredict(address); + } + + public void Train(long address, boolean outcome, boolean predict) + { + misc.Error.showErrorAndExit("use the other Train() function"); + //don't use this!! 
+ //use Train(long PC, int opType, boolean resolveDir, boolean predDir, long branchTarget) + } + + public void Train(long PC, int opType, boolean resolveDir, boolean predDir, long branchTarget) + { + ti.invokerTrain (PC, opType, resolveDir, predDir, branchTarget); + } + +} diff --git a/src/simulator/pipeline/branchpredictor/TAGESCL/TAGESCLInvoker.java b/src/simulator/pipeline/branchpredictor/TAGESCL/TAGESCLInvoker.java new file mode 100644 index 0000000..5c712b3 --- /dev/null +++ b/src/simulator/pipeline/branchpredictor/TAGESCL/TAGESCLInvoker.java @@ -0,0 +1,25 @@ +package pipeline.branchpredictor.TAGESCL; + +public class TAGESCLInvoker { + + private native void initialize(); + private native boolean predict (long PC); + private native void train (long PC, int opType, boolean resolveDir, boolean predDir, long branchTarget); + + public TAGESCLInvoker(String TAGESCLLibDirectory) + { + System.load(TAGESCLLibDirectory + "/libnative.so"); + initialize(); + } + + public boolean invokerPredict(long address) + { + return predict(address); + } + + public void invokerTrain(long PC, int opType, boolean resolveDir, boolean predDir, long branchTarget) + { + train (PC, opType, resolveDir, predDir, branchTarget); + } + +} diff --git a/src/simulator/pipeline/branchpredictor/TAGESCL/pipeline_branchpredictor_TAGESCL_TAGESCLInvoker.cc b/src/simulator/pipeline/branchpredictor/TAGESCL/pipeline_branchpredictor_TAGESCL_TAGESCLInvoker.cc new file mode 100644 index 0000000..3f9323c --- /dev/null +++ b/src/simulator/pipeline/branchpredictor/TAGESCL/pipeline_branchpredictor_TAGESCL_TAGESCLInvoker.cc @@ -0,0 +1,1778 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include "TAGESCL.h" +#include "pipeline_branchpredictor_TAGESCL_TAGESCLInvoker.h" + +using namespace std; + +#define UINT32 unsigned int +#define INT32 int +#define UINT64 unsigned long long +#define COUNTER unsigned long long + + +#define NOT_TAKEN 0 +#define TAKEN 1 + 
+#define FAILURE 0 +#define SUCCESS 1 + +//JD2_17_2016 break down types into COND/UNCOND +typedef enum { + OPTYPE_OP = 2, + + OPTYPE_RET_UNCOND, + OPTYPE_JMP_DIRECT_UNCOND, + OPTYPE_JMP_INDIRECT_UNCOND, + OPTYPE_CALL_DIRECT_UNCOND, + OPTYPE_CALL_INDIRECT_UNCOND, + + OPTYPE_RET_COND, + OPTYPE_JMP_DIRECT_COND, + OPTYPE_JMP_INDIRECT_COND, + OPTYPE_CALL_DIRECT_COND, + OPTYPE_CALL_INDIRECT_COND, + + OPTYPE_ERROR, + + OPTYPE_MAX +} OpType; + + +static inline UINT32 SatIncrement(UINT32 x, UINT32 max) { + if (x < max) return x + 1; + return x; +} + +static inline UINT32 SatDecrement(UINT32 x) { + if (x > 0) return x - 1; + return x; +} + + +#define BORNTICK 1024 +//To get the predictor storage budget on stderr uncomment the next line +#define PRINTSIZE +#include +long long IMLIcount; // use to monitor the iteration number + +#define SC // 8.2 % if TAGE alone +#define IMLI // 0.2 % +#define LOCALH + +#ifdef LOCALH // 2.7 % +#define LOOPPREDICTOR //loop predictor enable +#define LOCALS //enable the 2nd local history +#define LOCALT //enables the 3rd local history + +#endif + + + +//The statistical corrector components + +#define PERCWIDTH 6 //Statistical corrector counter width 5 -> 6 : 0.6 % +//The three BIAS tables in the SC component +//We play with the TAGE confidence here, with the number of the hitting bank +#define LOGBIAS 8 +int8_t Bias[(1 << LOGBIAS)]; +#define INDBIAS (((((PC ^(PC >>2))<<1) ^ (LowConf &(LongestMatchPred!=alttaken))) <<1) + pred_inter) & ((1<>(LOGBIAS-2)))<<1) ^ (HighConf))<<1) + pred_inter) & ((1<>2))<<7)) & ((1< Micro 2015 paper: a big disappointment on CBP2016 traces +#ifdef IMLI +#define LOGINB 8 // 128-entry +#define INB 1 +int Im[INB] = { 8 }; +int8_t IGEHLA[INB][(1 << LOGINB)] = { {0} }; + +int8_t *IGEHL[INB]; + +#define LOGIMNB 9 // 2* 256 -entry +#define IMNB 2 + +int IMm[IMNB] = { 10, 4 }; +int8_t IMGEHLA[IMNB][(1 << LOGIMNB)] = { {0} }; + +int8_t *IMGEHL[IMNB]; +long long IMHIST[256]; + +#endif + +//global branch GEHL +#define LOGGNB 10 
// 1 1K + 2 * 512-entry tables +#define GNB 3 +int Gm[GNB] = { 40, 24, 10 }; +int8_t GGEHLA[GNB][(1 << LOGGNB)] = { {0} }; + +int8_t *GGEHL[GNB]; + +//variation on global branch history +#define PNB 3 +#define LOGPNB 9 // 1 1K + 2 * 512-entry tables +int Pm[PNB] = { 25, 16, 9 }; +int8_t PGEHLA[PNB][(1 << LOGPNB)] = { {0} }; + +int8_t *PGEHL[PNB]; + +//first local history +#define LOGLNB 10 // 1 1K + 2 * 512-entry tables +#define LNB 3 +int Lm[LNB] = { 11, 6, 3 }; +int8_t LGEHLA[LNB][(1 << LOGLNB)] = { {0} }; + +int8_t *LGEHL[LNB]; +#define LOGLOCAL 8 +#define NLOCAL (1<>2)) & (NLOCAL-1)) +long long L_shist[NLOCAL]; //local histories + +// second local history +#define LOGSNB 9 // 1 1K + 2 * 512-entry tables +#define SNB 3 +int Sm[SNB] = { 16, 11, 6 }; +int8_t SGEHLA[SNB][(1 << LOGSNB)] = { {0} }; + +int8_t *SGEHL[SNB]; +#define LOGSECLOCAL 4 +#define NSECLOCAL (1<>5))) & (NSECLOCAL-1)) +long long S_slhist[NSECLOCAL]; + +//third local history +#define LOGTNB 10 // 2 * 512-entry tables +#define TNB 2 +int Tm[TNB] = { 9, 4 }; +int8_t TGEHLA[TNB][(1 << LOGTNB)] = { {0} }; + +int8_t *TGEHL[TNB]; +#define NTLOCAL 16 +#define INDTLOCAL (((PC ^ (PC >>(LOGTNB)))) & (NTLOCAL-1)) // different hash for the history +long long T_slhist[NTLOCAL]; + + + + + +// playing with putting more weights (x2) on some of the SC components +// playing on using different update thresholds on SC +//update threshold for the statistical corrector +#define VARTHRES +#define WIDTHRES 12 +#define WIDTHRESP 8 +#ifdef VARTHRES +#define LOGSIZEUP 6 //not worth increasing +#else +#define LOGSIZEUP 0 +#endif +#define LOGSIZEUPS (LOGSIZEUP/2) +int updatethreshold; +int Pupdatethreshold[(1 << LOGSIZEUP)]; //size is fixed by LOGSIZEUP +#define INDUPD (PC ^ (PC >>2)) & ((1 << LOGSIZEUP) - 1) +#define INDUPDS ((PC ^ (PC >>2)) & ((1 << (LOGSIZEUPS)) - 1)) +int8_t WG[(1 << LOGSIZEUPS)]; +int8_t WL[(1 << LOGSIZEUPS)]; +int8_t WS[(1 << LOGSIZEUPS)]; +int8_t WT[(1 << LOGSIZEUPS)]; +int8_t WP[(1 << LOGSIZEUPS)]; 
+int8_t WI[(1 << LOGSIZEUPS)]; +int8_t WIM[(1 << LOGSIZEUPS)]; +int8_t WB[(1 << LOGSIZEUPS)]; +#define EWIDTH 6 +int LSUM; + +// The two counters used to choose between TAGE and SC on Low Conf SC +int8_t FirstH, SecondH; +bool MedConf; // is the TAGE prediction medium confidence + + +#define CONFWIDTH 7 //for the counters in the choser +#define HISTBUFFERLENGTH 4096 // we use a 4K entries history buffer to store the branch history (this allows us to explore using history length up to 4K) + + + + + +// utility class for index computation +// this is the cyclic shift register for folding +// a long global history into a smaller number of bits; see P. Michaud's PPM-like predictor at CBP-1 +class folded_history +{ +public: + + + unsigned comp; + int CLENGTH; + int OLENGTH; + int OUTPOINT; + + folded_history () + { + } + + + void init (int original_length, int compressed_length) + { + comp = 0; + OLENGTH = original_length; + CLENGTH = compressed_length; + OUTPOINT = OLENGTH % CLENGTH; + + } + + void update (uint8_t * h, int PT) + { + comp = (comp << 1) ^ h[PT & (HISTBUFFERLENGTH - 1)]; + comp ^= h[(PT + OLENGTH) & (HISTBUFFERLENGTH - 1)] << OUTPOINT; + comp ^= (comp >> CLENGTH); + comp = (comp) & ((1 << CLENGTH) - 1); + } + +}; + + + + +class bentry // TAGE bimodal table entry +{ +public: + int8_t hyst; + int8_t pred; + + + bentry () + { + pred = 0; + + hyst = 1; + } + +}; +class gentry // TAGE global table entry +{ +public: + int8_t ctr; + uint tag; + int8_t u; + + gentry () + { + ctr = 0; + u = 0; + tag = 0; + + + } +}; + + + +#define POWER +//use geometric history length + +#define NHIST 36 // twice the number of different histories + +#define NBANKLOW 10 // number of banks in the shared bank-interleaved for the low history lengths +#define NBANKHIGH 20 // number of banks in the shared bank-interleaved for the history lengths + +int SizeTable[NHIST + 1]; + + +#define BORN 13 // below BORN in the table for low history lengths, >= BORN in the table for high history 
lengths, + +// we use 2-way associativity for the medium history lengths +#define BORNINFASSOC 9 //2 -way assoc for those banks 0.4 % +#define BORNSUPASSOC 23 + +/*in practice 2 bits or 3 bits par branch: around 1200 cond. branchs*/ + +#define MINHIST 6 //not optimized so far +#define MAXHIST 3000 + + +#define LOGG 10 /* logsize of the banks in the tagged TAGE tables */ +#define TBITS 8 //minimum width of the tags (low history lengths), +4 for high history lengths + + +bool NOSKIP[NHIST + 1]; // to manage the associativity for different history lengths +bool LowConf; +bool HighConf; + + + +#define NNN 1 // number of extra entries allocated on a TAGE misprediction (1+NNN) +#define HYSTSHIFT 2 // bimodal hysteresis shared by 4 entries +#define LOGB 13 // log of number of entries in bimodal predictor + + +#define PHISTWIDTH 27 // width of the path history used in TAGE +#define UWIDTH 1 // u counter width on TAGE (2 bits not worth the effort for a 512 Kbits predictor 0.2 %) +#define CWIDTH 3 // predictor counter width on the TAGE tagged tables + + +//the counter(s) to chose between longest match and alternate prediction on TAGE when weak counters +#define LOGSIZEUSEALT 4 +bool AltConf; // Confidence on the alternate prediction +#define ALTWIDTH 5 +#define SIZEUSEALT (1<<(LOGSIZEUSEALT)) +#define INDUSEALT (((((HitBank-1)/8)<<1)+AltConf) % (SIZEUSEALT-1)) +int8_t use_alt_on_na[SIZEUSEALT]; +//very marginal benefit +long long GHIST; +int8_t BIM; + +int TICK; // for the reset of the u counter +uint8_t ghist[HISTBUFFERLENGTH]; +int ptghist; +long long phist; //path history +folded_history ch_i[NHIST + 1]; //utility for computing TAGE indices +folded_history ch_t[2][NHIST + 1]; //utility for computing TAGE tags + +//For the TAGE predictor +bentry *btable; //bimodal TAGE table +gentry *gtable[NHIST + 1]; // tagged TAGE tables +int m[NHIST + 1]; +int TB[NHIST + 1]; +int logg[NHIST + 1]; + +int GI[NHIST + 1]; // indexes to the different tables are computed only once +uint 
GTAG[NHIST + 1]; // tags for the different tables are computed only once +int BI; // index of the bimodal table +bool pred_taken; // prediction +bool alttaken; // alternate TAGEprediction +bool tage_pred; // TAGE prediction +bool LongestMatchPred; +int HitBank; // longest matching bank +int AltBank; // alternate matching bank +int Seed; // for the pseudo-random number generator +bool pred_inter; + + +#ifdef LOOPPREDICTOR +//parameters of the loop predictor +#define LOGL 5 +#define WIDTHNBITERLOOP 10 // we predict only loops with less than 1K iterations +#define LOOPTAG 10 //tag width in the loop predictor + +class lentry //loop predictor entry +{ +public: + uint16_t NbIter; //10 bits + uint8_t confid; // 4bits + uint16_t CurrentIter; // 10 bits + + uint16_t TAG; // 10 bits + uint8_t age; // 4 bits + bool dir; // 1 bit + + //39 bits per entry + lentry () + { + confid = 0; + CurrentIter = 0; + NbIter = 0; + TAG = 0; + age = 0; + dir = false; + + + + } + +}; + +lentry *ltable; //loop predictor table +//variables for the loop predictor +bool predloop; // loop predictor prediction +int LIB; +int LI; +int LHIT; //hitting way in the loop predictor +int LTAG; //tag on the loop predictor +bool LVALID; // validity of the loop predictor prediction +int8_t WITHLOOP; // counter to monitor whether or not loop prediction is beneficial + +#endif + +int +predictorsize () +{ + int STORAGESIZE = 0; + int inter = 0; + + + STORAGESIZE += + NBANKHIGH * (1 << (logg[BORN])) * (CWIDTH + UWIDTH + TB[BORN]); + STORAGESIZE += NBANKLOW * (1 << (logg[1])) * (CWIDTH + UWIDTH + TB[1]); + + STORAGESIZE += (SIZEUSEALT) * ALTWIDTH; + STORAGESIZE += (1 << LOGB) + (1 << (LOGB - HYSTSHIFT)); + STORAGESIZE += m[NHIST]; + STORAGESIZE += PHISTWIDTH; + STORAGESIZE += 10; //the TICK counter + + fprintf (stderr, " (TAGE %d) ", STORAGESIZE); +#ifdef SC +#ifdef LOOPPREDICTOR + + inter = (1 << LOGL) * (2 * WIDTHNBITERLOOP + LOOPTAG + 4 + 4 + 1); + fprintf (stderr, " (LOOP %d) ", inter); + STORAGESIZE += inter; 
+ +#endif + + inter += WIDTHRES; + inter = WIDTHRESP * ((1 << LOGSIZEUP)); //the update threshold counters + inter += 3 * EWIDTH * (1 << LOGSIZEUPS); // the extra weight of the partial sums + inter += (PERCWIDTH) * 3 * (1 << (LOGBIAS)); + + inter += + (GNB - 2) * (1 << (LOGGNB)) * (PERCWIDTH) + + (1 << (LOGGNB - 1)) * (2 * PERCWIDTH); + inter += Gm[0]; //global histories for SC + inter += (PNB - 2) * (1 << (LOGPNB)) * (PERCWIDTH) + + (1 << (LOGPNB - 1)) * (2 * PERCWIDTH); +//we use phist already counted for these tables + +#ifdef LOCALH + inter += + (LNB - 2) * (1 << (LOGLNB)) * (PERCWIDTH) + + (1 << (LOGLNB - 1)) * (2 * PERCWIDTH); + inter += NLOCAL * Lm[0]; + inter += EWIDTH * (1 << LOGSIZEUPS); +#ifdef LOCALS + inter += + (SNB - 2) * (1 << (LOGSNB)) * (PERCWIDTH) + + (1 << (LOGSNB - 1)) * (2 * PERCWIDTH); + inter += NSECLOCAL * (Sm[0]); + inter += EWIDTH * (1 << LOGSIZEUPS); + +#endif +#ifdef LOCALT + inter += + (TNB - 2) * (1 << (LOGTNB)) * (PERCWIDTH) + + (1 << (LOGTNB - 1)) * (2 * PERCWIDTH); + inter += NTLOCAL * Tm[0]; + inter += EWIDTH * (1 << LOGSIZEUPS); +#endif + + + + + + + + + +#endif + + + +#ifdef IMLI + + inter += (1 << (LOGINB - 1)) * PERCWIDTH; + inter += Im[0]; + + inter += IMNB * (1 << (LOGIMNB - 1)) * PERCWIDTH; + inter += 2 * EWIDTH * (1 << LOGSIZEUPS); // the extra weight of the partial sums + inter += 256 * IMm[0]; +#endif + inter += 2 * CONFWIDTH; //the 2 counters in the choser + STORAGESIZE += inter; + + + fprintf (stderr, " (SC %d) ", inter); +#endif +#ifdef PRINTSIZE + fprintf (stderr, " (TOTAL %d bits %d Kbits) ", STORAGESIZE, + STORAGESIZE / 1024); + fprintf (stdout, " (TOTAL %d bits %d Kbits) ", STORAGESIZE, + STORAGESIZE / 1024); +#endif + + + return (STORAGESIZE); + + +} + + + + + + +class PREDICTOR +{ +public: + int THRES; + + PREDICTOR (void) + { + + reinit (); +#ifdef PRINTSIZE + predictorsize (); +#endif + } + + + void reinit () + { + + m[1] = MINHIST; + m[NHIST / 2] = MAXHIST; + for (int i = 2; i <= NHIST / 2; i++) + { + m[i] = 
+ (int) (((double) MINHIST * + pow ((double) (MAXHIST) / (double) MINHIST, + (double) (i - 1) / (double) (((NHIST / 2) - 1)))) + + 0.5); + // fprintf(stderr, "(%d %d)", m[i],i); + + } + for (int i = 1; i <= NHIST; i++) + { + NOSKIP[i] = ((i - 1) & 1) + || ((i >= BORNINFASSOC) & (i < BORNSUPASSOC)); + + } + + NOSKIP[4] = 0; + NOSKIP[NHIST - 2] = 0; + NOSKIP[8] = 0; + NOSKIP[NHIST - 6] = 0; + // just eliminate some extra tables (very very marginal) + + for (int i = NHIST; i > 1; i--) + { + m[i] = m[(i + 1) / 2]; + + + } + for (int i = 1; i <= NHIST; i++) + { + TB[i] = TBITS + 4 * (i >= BORN); + logg[i] = LOGG; + + } + + +#ifdef LOOPPREDICTOR + ltable = new lentry[1 << (LOGL)]; +#endif + + + gtable[1] = new gentry[NBANKLOW * (1 << LOGG)]; + SizeTable[1] = NBANKLOW * (1 << LOGG); + + gtable[BORN] = new gentry[NBANKHIGH * (1 << LOGG)]; + SizeTable[BORN] = NBANKHIGH * (1 << LOGG); + + for (int i = BORN + 1; i <= NHIST; i++) + gtable[i] = gtable[BORN]; + for (int i = 2; i <= BORN - 1; i++) + gtable[i] = gtable[1]; + btable = new bentry[1 << LOGB]; + + for (int i = 1; i <= NHIST; i++) + { + ch_i[i].init (m[i], (logg[i])); + ch_t[0][i].init (ch_i[i].OLENGTH, TB[i]); + ch_t[1][i].init (ch_i[i].OLENGTH, TB[i] - 1); + + } +#ifdef LOOPPREDICTOR + LVALID = false; + WITHLOOP = -1; +#endif + Seed = 0; + + TICK = 0; + phist = 0; + Seed = 0; + + for (int i = 0; i < HISTBUFFERLENGTH; i++) + ghist[0] = 0; + ptghist = 0; + updatethreshold=35<<3; + + for (int i = 0; i < (1 << LOGSIZEUP); i++) + Pupdatethreshold[i] = 0; + for (int i = 0; i < GNB; i++) + GGEHL[i] = &GGEHLA[i][0]; + for (int i = 0; i < LNB; i++) + LGEHL[i] = &LGEHLA[i][0]; + + for (int i = 0; i < GNB; i++) + for (int j = 0; j < ((1 << LOGGNB) - 1); j++) + { + if (!(j & 1)) + { + GGEHL[i][j] = -1; + + } + } + for (int i = 0; i < LNB; i++) + for (int j = 0; j < ((1 << LOGLNB) - 1); j++) + { + if (!(j & 1)) + { + LGEHL[i][j] = -1; + + } + } + + for (int i = 0; i < SNB; i++) + SGEHL[i] = &SGEHLA[i][0]; + for (int i = 0; i < 
TNB; i++) + TGEHL[i] = &TGEHLA[i][0]; + for (int i = 0; i < PNB; i++) + PGEHL[i] = &PGEHLA[i][0]; +#ifdef IMLI +#ifdef IMLIOH + for (int i = 0; i < FNB; i++) + FGEHL[i] = &FGEHLA[i][0]; + + for (int i = 0; i < FNB; i++) + for (int j = 0; j < ((1 << LOGFNB) - 1); j++) + { + if (!(j & 1)) + { + FGEHL[i][j] = -1; + + } + } +#endif + for (int i = 0; i < INB; i++) + IGEHL[i] = &IGEHLA[i][0]; + for (int i = 0; i < INB; i++) + for (int j = 0; j < ((1 << LOGINB) - 1); j++) + { + if (!(j & 1)) + { + IGEHL[i][j] = -1; + + } + } + for (int i = 0; i < IMNB; i++) + IMGEHL[i] = &IMGEHLA[i][0]; + for (int i = 0; i < IMNB; i++) + for (int j = 0; j < ((1 << LOGIMNB) - 1); j++) + { + if (!(j & 1)) + { + IMGEHL[i][j] = -1; + + } + } + +#endif + for (int i = 0; i < SNB; i++) + for (int j = 0; j < ((1 << LOGSNB) - 1); j++) + { + if (!(j & 1)) + { + SGEHL[i][j] = -1; + + } + } + for (int i = 0; i < TNB; i++) + for (int j = 0; j < ((1 << LOGTNB) - 1); j++) + { + if (!(j & 1)) + { + TGEHL[i][j] = -1; + + } + } + for (int i = 0; i < PNB; i++) + for (int j = 0; j < ((1 << LOGPNB) - 1); j++) + { + if (!(j & 1)) + { + PGEHL[i][j] = -1; + + } + } + + + for (int i = 0; i < (1 << LOGB); i++) + { + btable[i].pred = 0; + btable[i].hyst = 1; + } + + + + + for (int j = 0; j < (1 << LOGBIAS); j++) + { + switch (j & 3) + { + case 0: + BiasSK[j] = -8; + break; + case 1: + BiasSK[j] = 7; + break; + case 2: + BiasSK[j] = -32; + + break; + case 3: + BiasSK[j] = 31; + break; + } + } + for (int j = 0; j < (1 << LOGBIAS); j++) + { + switch (j & 3) + { + case 0: + Bias[j] = -32; + + break; + case 1: + Bias[j] = 31; + break; + case 2: + Bias[j] = -1; + break; + case 3: + Bias[j] = 0; + break; + } + } + for (int j = 0; j < (1 << LOGBIAS); j++) + { + switch (j & 3) + { + case 0: + BiasBank[j] = -32; + + break; + case 1: + BiasBank[j] = 31; + break; + case 2: + BiasBank[j] = -1; + break; + case 3: + BiasBank[j] = 0; + break; + } + } + for (int i = 0; i < SIZEUSEALT; i++) + { + use_alt_on_na[i] = 0; + + } + for 
(int i = 0; i < (1 << LOGSIZEUPS); i++) + { + WG[i] = 7; + WL[i] = 7; + WS[i] = 7; + WT[i] = 7; + WP[i] = 7; + WI[i] = 7; + WB[i] = 4; + } + TICK = 0; + for (int i = 0; i < NLOCAL; i++) + { + L_shist[i] = 0; + } + for (int i = 0; i < NSECLOCAL; i++) + { + S_slhist[i] = 0; + + } + GHIST = 0; + ptghist = 0; + phist = 0; + + } + + + + + // index function for the bimodal table + + int bindex (UINT64 PC) + { + return ((PC ^ (PC >> LOGB)) & ((1 << (LOGB)) - 1)); + } + + +// the index functions for the tagged tables uses path history as in the OGEHL predictor +//F serves to mix path history: not very important impact + + int F (long long A, int size, int bank) + { + int A1, A2; + A = A & ((1 << size) - 1); + A1 = (A & ((1 << logg[bank]) - 1)); + A2 = (A >> logg[bank]); + + if (bank < logg[bank]) + A2 = + ((A2 << bank) & ((1 << logg[bank]) - 1)) + + (A2 >> (logg[bank] - bank)); + A = A1 ^ A2; + if (bank < logg[bank]) + A = + ((A << bank) & ((1 << logg[bank]) - 1)) + (A >> (logg[bank] - bank)); + return (A); + } + +// gindex computes a full hash of PC, ghist and phist + int gindex (unsigned int PC, int bank, long long hist, + folded_history * ch_i) + { + int index; + int M = (m[bank] > PHISTWIDTH) ? 
PHISTWIDTH : m[bank]; + index = + PC ^ (PC >> (abs (logg[bank] - bank) + 1)) + ^ ch_i[bank].comp ^ F (hist, M, bank); + + return (index & ((1 << (logg[bank])) - 1)); + } + + // tag computation + uint16_t gtag (unsigned int PC, int bank, folded_history * ch0, + folded_history * ch1) + { + int tag = (PC) ^ ch0[bank].comp ^ (ch1[bank].comp << 1); + return (tag & ((1 << (TB[bank])) - 1)); + } + + // up-down saturating counter + void ctrupdate (int8_t & ctr, bool taken, int nbits) + { + if (taken) + { + if (ctr < ((1 << (nbits - 1)) - 1)) + ctr++; + } + else + { + if (ctr > -(1 << (nbits - 1))) + ctr--; + } + } + + + bool getbim () + { + BIM = (btable[BI].pred << 1) + (btable[BI >> HYSTSHIFT].hyst); + HighConf = (BIM == 0) || (BIM == 3); + LowConf = !HighConf; + AltConf = HighConf; + MedConf = false; + return (btable[BI].pred > 0); + } + + void baseupdate (bool Taken) + { + int inter = BIM; + if (Taken) + { + if (inter < 3) + inter += 1; + } + else if (inter > 0) + inter--; + btable[BI].pred = inter >> 1; + btable[BI >> HYSTSHIFT].hyst = (inter & 1); + }; + +//just a simple pseudo random number generator: use available information +// to allocate entries in the loop predictor + int MYRANDOM () + { + Seed++; + Seed ^= phist; + Seed = (Seed >> 21) + (Seed << 11); + Seed ^= ptghist; + Seed = (Seed >> 10) + (Seed << 22); + return (Seed); + }; + + + // TAGE PREDICTION: same code at fetch or retire time but the index and tags must recomputed + void Tagepred (UINT64 PC) + { + HitBank = 0; + AltBank = 0; + for (int i = 1; i <= NHIST; i += 2) + { + GI[i] = gindex (PC, i, phist, ch_i); + GTAG[i] = gtag (PC, i, ch_t[0], ch_t[1]); + GTAG[i + 1] = GTAG[i]; + GI[i + 1] = GI[i] ^ (GTAG[i] & ((1 << LOGG) - 1)); + } +int T = (PC ^ (phist & ((1 << m[BORN]) - 1))) % NBANKHIGH; +//int T = (PC ^ phist) % NBANKHIGH; + for (int i = BORN; i <= NHIST; i++) + if (NOSKIP[i]) + { + GI[i] += (T << LOGG); + T++; + T = T % NBANKHIGH; + + } + T = (PC ^ (phist & ((1 << m[1]) - 1))) % NBANKLOW; + + for 
// Computes the final direction prediction for the branch at PC.
//
// Steps, in order:
//   1. Tagepred(PC) computes the TAGE prediction (tage_pred) together with the
//      HighConf / MedConf / LowConf flags read by the chooser below.
//   2. When SC is not defined the TAGE prediction is returned as is.
//   3. With LOOPPREDICTOR, the loop prediction replaces the TAGE one when the
//      loop entry is valid (LVALID) and WITHLOOP is non-negative.
//   4. The statistical corrector sum LSUM is accumulated from the three bias
//      tables and the GEHL components; its sign is the SC prediction.
//   5. When SC and the intermediate prediction disagree, a chooser based on
//      the TAGE confidence, |LSUM| relative to THRES, and the FirstH /
//      SecondH counters picks between them.
//
// Side effects: writes the globals pred_taken, pred_inter, LSUM (and predloop
// under LOOPPREDICTOR) and the member THRES.
// Returns the predicted direction (true = taken).
  bool GetPrediction (UINT64 PC)
//  JNIEXPORT jboolean JNICALL Java_TAGESCL_GetPrediction
//   (JNIEnv *env, jobject thisObj, jlong PC)
  {
// computes the TAGE table addresses and the partial tags


    Tagepred (PC);
    pred_taken = tage_pred;
#ifndef SC
    // TAGE-only build: everything below is compiled but never reached
    return (tage_pred);
#endif

#ifdef LOOPPREDICTOR
    predloop = getloop (PC);	// loop prediction
    // the loop prediction is used only for a valid hit and while WITHLOOP is non-negative
    pred_taken = ((WITHLOOP >= 0) && (LVALID)) ? predloop : pred_taken;
#endif
    // prediction before statistical correction; also fed into the first GEHL index below
    pred_inter = pred_taken;

//Compute the SC prediction

    LSUM = 0;

//integrate BIAS prediction
    // each counter contributes the centered value 2*ctr+1, which is never zero
    int8_t ctr = Bias[INDBIAS];

    LSUM += (2 * ctr + 1);
    ctr = BiasSK[INDBIASSK];
    LSUM += (2 * ctr + 1);
    ctr = BiasBank[INDBIASBANK];
    LSUM += (2 * ctr + 1);
#ifdef VARTHRES
    // the bias contribution is doubled when its weight counter is non-negative
    LSUM = (1 + (WB[INDUPDS] >= 0)) * LSUM;
#endif
//integrate the GEHL predictions
    LSUM +=
      Gpredict ((PC << 1) + pred_inter, GHIST, Gm, GGEHL, GNB, LOGGNB, WG);
    LSUM += Gpredict (PC, phist, Pm, PGEHL, PNB, LOGPNB, WP);
#ifdef LOCALH
    LSUM += Gpredict (PC, L_shist[INDLOCAL], Lm, LGEHL, LNB, LOGLNB, WL);
#ifdef LOCALS
    LSUM += Gpredict (PC, S_slhist[INDSLOCAL], Sm, SGEHL, SNB, LOGSNB, WS);
#endif
#ifdef LOCALT
    LSUM += Gpredict (PC, T_slhist[INDTLOCAL], Tm, TGEHL, TNB, LOGTNB, WT);
#endif
#endif

#ifdef IMLI
    LSUM +=
      Gpredict (PC, IMHIST[(IMLIcount)], IMm, IMGEHL, IMNB, LOGIMNB, WIM);
    LSUM += Gpredict (PC, IMLIcount, Im, IGEHL, INB, LOGINB, WI);
#endif
    // the sign of the sum is the statistical corrector prediction
    bool SCPRED = (LSUM >= 0);
//just a heuristic: whether the respective contribution of component groups can be multiplied by 2 or not
    // THRES = global + per-index update threshold, plus (under VARTHRES)
    // 12 for every component weight that is currently non-negative
    THRES = (updatethreshold>>3)+Pupdatethreshold[INDUPD]
#ifdef VARTHRES
      + 12 * ((WB[INDUPDS] >= 0) + (WP[INDUPDS] >= 0)
#ifdef LOCALH
	      + (WS[INDUPDS] >= 0) + (WT[INDUPDS] >= 0) + (WL[INDUPDS] >= 0)
#endif
	      + (WG[INDUPDS] >= 0)
#ifdef IMLI
	      + (WI[INDUPDS] >= 0)
#endif
      )
#endif
      ;

    //Minimal benefit in trying to avoid accuracy loss on low confidence SC prediction and high/medium confidence on TAGE
    // but just uses 2 counters 0.3 % MPKI reduction
    if (pred_inter != SCPRED)
      {
//Chooser uses TAGE confidence and |LSUM|
	// default: follow the statistical corrector
	pred_taken = SCPRED;
	if (HighConf)
	  {
	    // high TAGE confidence and a very weak SC sum: keep the TAGE/loop prediction
	    if ((abs (LSUM) < THRES / 4))
	      {
		pred_taken = pred_inter;
	      }

	    // moderately weak SC sum: the SecondH counter arbitrates
	    else if ((abs (LSUM) < THRES / 2))
	      pred_taken = (SecondH < 0) ? SCPRED : pred_inter;
	  }

	// medium TAGE confidence and a very weak SC sum: the FirstH counter arbitrates
	if (MedConf)
	  if ((abs (LSUM) < THRES / 4))
	    {
	      pred_taken = (FirstH < 0) ? SCPRED : pred_inter;
	    }

      }

    return pred_taken;
  }
PATHBIT; + + + for (int i = 1; i <= NHIST; i++) + { + + H[i].update (ghist, Y); + G[i].update (ghist, Y); + J[i].update (ghist, Y); + + + } + } + + X = (X & ((1<= 0); + if (pred_inter != SCPRED) + { + if ((abs (LSUM) < THRES)) + if ((HighConf)) + { + + + if ((abs (LSUM) < THRES / 2)) + if ((abs (LSUM) >= THRES / 4)) + ctrupdate (SecondH, (pred_inter == resolveDir), CONFWIDTH); + } + if ((MedConf)) + if ((abs (LSUM) < THRES / 4)) + { + ctrupdate (FirstH, (pred_inter == resolveDir), CONFWIDTH); + } + } + + if ((SCPRED != resolveDir) || ((abs (LSUM) < THRES))) + { + { + if (SCPRED != resolveDir) + {Pupdatethreshold[INDUPD] += 1;updatethreshold+=1; + } + + else + {Pupdatethreshold[INDUPD] -= 1;updatethreshold -= 1; + } + + + if (Pupdatethreshold[INDUPD] >= (1 << (WIDTHRESP - 1))) + Pupdatethreshold[INDUPD] = (1 << (WIDTHRESP - 1)) - 1; +//Pupdatethreshold[INDUPD] could be negative + if (Pupdatethreshold[INDUPD] < -(1 << (WIDTHRESP - 1))) + Pupdatethreshold[INDUPD] = -(1 << (WIDTHRESP - 1)); + if (updatethreshold >= (1 << (WIDTHRES - 1))) + updatethreshold = (1 << (WIDTHRES - 1)) - 1; +//updatethreshold could be negative + if (updatethreshold < -(1 << (WIDTHRES - 1))) + updatethreshold = -(1 << (WIDTHRES - 1)); + } +#ifdef VARTHRES + { + int XSUM = + LSUM - ((WB[INDUPDS] >= 0) * ((2 * Bias[INDBIAS] + 1) + + (2 * BiasSK[INDBIASSK] + 1) + + (2 * BiasBank[INDBIASBANK] + 1))); + if ((XSUM + + ((2 * Bias[INDBIAS] + 1) + (2 * BiasSK[INDBIASSK] + 1) + + (2 * BiasBank[INDBIASBANK] + 1)) >= 0) != (XSUM >= 0)) + ctrupdate (WB[INDUPDS], + (((2 * Bias[INDBIAS] + 1) + + (2 * BiasSK[INDBIASSK] + 1) + + (2 * BiasBank[INDBIASBANK] + 1) >= 0) == resolveDir), + EWIDTH); + } +#endif + ctrupdate (Bias[INDBIAS], resolveDir, PERCWIDTH); + ctrupdate (BiasSK[INDBIASSK], resolveDir, PERCWIDTH); + ctrupdate (BiasBank[INDBIASBANK], resolveDir, PERCWIDTH); + Gupdate ((PC << 1) + pred_inter, resolveDir, + GHIST, Gm, GGEHL, GNB, LOGGNB, WG); + Gupdate (PC, resolveDir, phist, Pm, PGEHL, PNB, LOGPNB, 
WP); +#ifdef LOCALH + Gupdate (PC, resolveDir, L_shist[INDLOCAL], Lm, LGEHL, LNB, LOGLNB, + WL); +#ifdef LOCALS + Gupdate (PC, resolveDir, S_slhist[INDSLOCAL], Sm, + SGEHL, SNB, LOGSNB, WS); +#endif +#ifdef LOCALT + + Gupdate (PC, resolveDir, T_slhist[INDTLOCAL], Tm, TGEHL, TNB, LOGTNB, + WT); +#endif +#endif + + +#ifdef IMLI + Gupdate (PC, resolveDir, IMHIST[(IMLIcount)], IMm, IMGEHL, IMNB, + LOGIMNB, WIM); + Gupdate (PC, resolveDir, IMLIcount, Im, IGEHL, INB, LOGINB, WI); +#endif + + + + } +#endif + +//TAGE UPDATE + bool ALLOC = ((tage_pred != resolveDir) & (HitBank < NHIST)); + + + //do not allocate too often if the overall prediction is correct + + if (HitBank > 0) + { +// Manage the selection between longest matching and alternate matching +// for "pseudo"-newly allocated longest matching entry + // this is extremely important for TAGE only, not that important when the overall predictor is implemented + bool PseudoNewAlloc = + (abs (2 * gtable[HitBank][GI[HitBank]].ctr + 1) <= 1); +// an entry is considered as newly allocated if its prediction counter is weak + if (PseudoNewAlloc) + { + if (LongestMatchPred == resolveDir) + ALLOC = false; +// if it was delivering the correct prediction, no need to allocate a new entry +//even if the overall prediction was false + + + if (LongestMatchPred != alttaken) + { + ctrupdate (use_alt_on_na[INDUSEALT], (alttaken == resolveDir), + ALTWIDTH); + } + + + + } + + + } + + if (pred_taken == resolveDir) + if ((MYRANDOM () & 31) != 0) + ALLOC = false; + + if (ALLOC) + { + + int T = NNN; + + int A = 1; + if ((MYRANDOM () & 127) < 32) + A = 2; + int Penalty = 0; + int NA = 0; + int DEP = ((((HitBank - 1 + 2 * A) & 0xffe)) ^ (MYRANDOM () & 1)); +// just a complex formula to chose between X and X+1, when X is odd: sorry + + for (int I = DEP; I < NHIST; I += 2) + { + int i = I + 1; + bool Done = false; + if (NOSKIP[i]) + { + if (gtable[i][GI[i]].u == 0) + + { +#define OPTREMP +// the replacement is optimized with a single u bit: 0.2 
% +#ifdef OPTREMP + if (abs (2 * gtable[i][GI[i]].ctr + 1) <= 3) +#endif + { + gtable[i][GI[i]].tag = GTAG[i]; + gtable[i][GI[i]].ctr = (resolveDir) ? 0 : -1; + NA++; + if (T <= 0) + { + break; + } + I += 2; + Done = true; + T -= 1; + } +#ifdef OPTREMP + else + { + if (gtable[i][GI[i]].ctr > 0) + gtable[i][GI[i]].ctr--; + else + gtable[i][GI[i]].ctr++; + } + +#endif + + } + + + + else + { + Penalty++; + } + } + + if (!Done) + { + i = (I ^ 1) + 1; + if (NOSKIP[i]) + { + + if (gtable[i][GI[i]].u == 0) + { +#ifdef OPTREMP + if (abs (2 * gtable[i][GI[i]].ctr + 1) <= 3) +#endif + + { + gtable[i][GI[i]].tag = GTAG[i]; + gtable[i][GI[i]].ctr = (resolveDir) ? 0 : -1; + NA++; + if (T <= 0) + { + break; + } + I += 2; + T -= 1; + } +#ifdef OPTREMP + else + { + if (gtable[i][GI[i]].ctr > 0) + gtable[i][GI[i]].ctr--; + else + gtable[i][GI[i]].ctr++; + } + +#endif + + + } + else + { + Penalty++; + } + } + + } + + } + TICK += (Penalty - 2 * NA); + + +//just the best formula for the Championship: + //In practice when one out of two entries are useful + if (TICK < 0) + TICK = 0; + if (TICK >= BORNTICK) + { + + for (int i = 1; i <= BORN; i += BORN - 1) + for (int j = 0; j < SizeTable[i]; j++) + gtable[i][j].u >>= 1; + TICK = 0; + + + } + } + +//update predictions + if (HitBank > 0) + { + if (abs (2 * gtable[HitBank][GI[HitBank]].ctr + 1) == 1) + if (LongestMatchPred != resolveDir) + + { // acts as a protection + if (AltBank > 0) + { + ctrupdate (gtable[AltBank][GI[AltBank]].ctr, + resolveDir, CWIDTH); + } + if (AltBank == 0) + baseupdate (resolveDir); + + } + ctrupdate (gtable[HitBank][GI[HitBank]].ctr, resolveDir, CWIDTH); +//sign changes: no way it can have been useful + if (abs (2 * gtable[HitBank][GI[HitBank]].ctr + 1) == 1) + gtable[HitBank][GI[HitBank]].u = 0; + if (alttaken == resolveDir) + if (AltBank > 0) + if (abs (2 * gtable[AltBank][GI[AltBank]].ctr + 1) == 7) + if (gtable[HitBank][GI[HitBank]].u == 1) + { + if (LongestMatchPred == resolveDir) + { + 
gtable[HitBank][GI[HitBank]].u = 0; + } + } + } + + else + baseupdate (resolveDir); + + if (LongestMatchPred != alttaken) + if (LongestMatchPred == resolveDir) + { + if (gtable[HitBank][GI[HitBank]].u < (1 << UWIDTH) - 1) + gtable[HitBank][GI[HitBank]].u++; + } +//END TAGE UPDATE + + + HistoryUpdate (PC, opType, resolveDir, branchTarget, + phist, ptghist, ch_i, ch_t[0], ch_t[1]); + + +//END PREDICTOR UPDATE + + + } +#define GINDEX (((long long) PC) ^ bhist ^ (bhist >> (8 - i)) ^ (bhist >> (16 - 2 * i)) ^ (bhist >> (24 - 3 * i)) ^ (bhist >> (32 - 3 * i)) ^ (bhist >> (40 - 4 * i))) & ((1 << (logs - (i >= (NBR - 2)))) - 1) + int Gpredict (UINT64 PC, long long BHIST, int *length, + int8_t ** tab, int NBR, int logs, int8_t * W) + { + int PERCSUM = 0; + for (int i = 0; i < NBR; i++) + { + long long bhist = BHIST & ((long long) ((1 << length[i]) - 1)); + long long index = GINDEX; + + int8_t ctr = tab[i][index]; + + PERCSUM += (2 * ctr + 1); + + + } +#ifdef VARTHRES + PERCSUM = (1 + (W[INDUPDS] >= 0)) * PERCSUM; +#endif + return ((PERCSUM)); + } + void Gupdate (UINT64 PC, bool taken, long long BHIST, int *length, + int8_t ** tab, int NBR, int logs, int8_t * W) + { + + int PERCSUM = 0; + + for (int i = 0; i < NBR; i++) + { + long long bhist = BHIST & ((long long) ((1 << length[i]) - 1)); + long long index = GINDEX; + + PERCSUM += (2 * tab[i][index] + 1); + ctrupdate (tab[i][index], taken, PERCWIDTH); + } +#ifdef VARTHRES + { + int XSUM = LSUM - ((W[INDUPDS] >= 0)) * PERCSUM; + if ((XSUM + PERCSUM >= 0) != (XSUM >= 0)) + ctrupdate (W[INDUPDS], ((PERCSUM >= 0) == taken), EWIDTH); + } +#endif + } + + + void TrackOtherInst (UINT64 PC, OpType opType, bool taken, + UINT64 branchTarget) + { + + + HistoryUpdate (PC, opType, taken, branchTarget, phist, + ptghist, ch_i, ch_t[0], ch_t[1]); + + + + } + +#ifdef LOOPPREDICTOR + int lindex (UINT64 PC) + { + return (((PC ^ (PC >> 2)) & ((1 << (LOGL - 2)) - 1)) << 2); + } + + +//loop prediction: only used if high confidence +//skewed 
associative 4-way +//At fetch time: speculative +#define CONFLOOP 15 + bool getloop (UINT64 PC) + { + LHIT = -1; + + LI = lindex (PC); + LIB = ((PC >> (LOGL - 2)) & ((1 << (LOGL - 2)) - 1)); + LTAG = (PC >> (LOGL - 2)) & ((1 << 2 * LOOPTAG) - 1); + LTAG ^= (LTAG >> LOOPTAG); + LTAG = (LTAG & ((1 << LOOPTAG) - 1)); + + for (int i = 0; i < 4; i++) + { + int index = (LI ^ ((LIB >> i) << 2)) + i; + + if (ltable[index].TAG == LTAG) + { + LHIT = i; + LVALID = ((ltable[index].confid == CONFLOOP) + || (ltable[index].confid * ltable[index].NbIter > 128)); + + + if (ltable[index].CurrentIter + 1 == ltable[index].NbIter) + return (!(ltable[index].dir)); + return ((ltable[index].dir)); + + } + } + + LVALID = false; + return (false); + + } + + + + void loopupdate (UINT64 PC, bool Taken, bool ALLOC) + { + if (LHIT >= 0) + { + int index = (LI ^ ((LIB >> LHIT) << 2)) + LHIT; +//already a hit + if (LVALID) + { + if (Taken != predloop) + { +// free the entry + ltable[index].NbIter = 0; + ltable[index].age = 0; + ltable[index].confid = 0; + ltable[index].CurrentIter = 0; + return; + + } + else if ((predloop != tage_pred) || ((MYRANDOM () & 7) == 0)) + if (ltable[index].age < CONFLOOP) + ltable[index].age++; + } + + ltable[index].CurrentIter++; + ltable[index].CurrentIter &= ((1 << WIDTHNBITERLOOP) - 1); + //loop with more than 2** WIDTHNBITERLOOP iterations are not treated correctly; but who cares :-) + if (ltable[index].CurrentIter > ltable[index].NbIter) + { + ltable[index].confid = 0; + ltable[index].NbIter = 0; +//treat like the 1st encounter of the loop + } + if (Taken != ltable[index].dir) + { + if (ltable[index].CurrentIter == ltable[index].NbIter) + { + if (ltable[index].confid < CONFLOOP) + ltable[index].confid++; + if (ltable[index].NbIter < 3) + //just do not predict when the loop count is 1 or 2 + { +// free the entry + ltable[index].dir = Taken; + ltable[index].NbIter = 0; + ltable[index].age = 0; + ltable[index].confid = 0; + } + } + else + { + if (ltable[index].NbIter 
== 0) + { +// first complete nest; + ltable[index].confid = 0; + ltable[index].NbIter = ltable[index].CurrentIter; + } + else + { +//not the same number of iterations as last time: free the entry + ltable[index].NbIter = 0; + ltable[index].confid = 0; + } + } + ltable[index].CurrentIter = 0; + } + + } + else if (ALLOC) + + { + UINT64 X = MYRANDOM () & 3; + + if ((MYRANDOM () & 3) == 0) + for (int i = 0; i < 4; i++) + { + int LHIT = (X + i) & 3; + int index = (LI ^ ((LIB >> LHIT) << 2)) + LHIT; + if (ltable[index].age == 0) + { + ltable[index].dir = !Taken; +// most of mispredictions are on last iterations + ltable[index].TAG = LTAG; + ltable[index].NbIter = 0; + ltable[index].age = 7; + ltable[index].confid = 0; + ltable[index].CurrentIter = 0; + break; + + } + else + ltable[index].age--; + break; + } + } + } +#endif +}; + +PREDICTOR *p; +JNIEXPORT void JNICALL Java_pipeline_branchpredictor_TAGESCL_TAGESCLInvoker_initialize + (JNIEnv *, jobject) +{ + p = new PREDICTOR(); +} + +JNIEXPORT jboolean JNICALL Java_pipeline_branchpredictor_TAGESCL_TAGESCLInvoker_predict + (JNIEnv *env, jobject thisObject, jlong PC) +{ + return p->GetPrediction(PC); +} + +JNIEXPORT void JNICALL Java_pipeline_branchpredictor_TAGESCL_TAGESCLInvoker_train + (JNIEnv *env, jobject thisObject, jlong PC, jint opTypeArg, jboolean resolveDir, jboolean predDir, jlong branchTarget) +{ + OpType opType = static_cast(opTypeArg); + p->UpdatePredictor(PC, opType, resolveDir, predDir, branchTarget); +} diff --git a/src/simulator/pipeline/outoforder/DecodeLogic.java b/src/simulator/pipeline/outoforder/DecodeLogic.java index e774e69..daa6486 100755 --- a/src/simulator/pipeline/outoforder/DecodeLogic.java +++ b/src/simulator/pipeline/outoforder/DecodeLogic.java @@ -39,9 +39,7 @@ public class DecodeLogic extends SimulationElement { public void performDecode() { - if(containingExecutionEngine.isToStall5() == true /*pipeline stalled due to branch mis-prediction*/ - || containingExecutionEngine.isToStall1() == 
true /*IW full*/ - || containingExecutionEngine.isToStall2() == true /*rename stall*/) + if(containingExecutionEngine.isToStall5() == true /*pipeline stalled due to branch mis-prediction*/) { return; } diff --git a/src/simulator/pipeline/outoforder/FetchLogic.java b/src/simulator/pipeline/outoforder/FetchLogic.java index 24b754e..7129a55 100755 --- a/src/simulator/pipeline/outoforder/FetchLogic.java +++ b/src/simulator/pipeline/outoforder/FetchLogic.java @@ -1,6 +1,7 @@ package pipeline.outoforder; import config.SimulationConfig; +import config.SystemConfig; import emulatorinterface.translator.x86.registers.Registers; import main.ArchitecturalComponent; import main.CustomObjectPool; @@ -11,6 +12,7 @@ import generic.Core; import generic.Event; import generic.EventQueue; import generic.GenericCircularQueue; +import generic.GlobalClock; import generic.Instruction; import generic.OperationType; import generic.PortType; @@ -25,12 +27,14 @@ public class FetchLogic extends SimulationElement { GenericCircularQueue[] inputToPipeline; int inputPipeToReadNext; ICacheBuffer iCacheBuffer; + MicroOpCache microOpCache; GenericCircularQueue fetchBuffer; int fetchWidth; OperationType[] instructionsToBeDropped; boolean sleep; long serialNo; + long lastValidIPSeen = -1; public FetchLogic(Core core, OutOrderExecutionEngine execEngine) { @@ -38,6 +42,7 @@ public class FetchLogic extends SimulationElement { this.core = core; this.execEngine = execEngine; fetchBuffer = execEngine.getFetchBuffer(); + microOpCache = execEngine.getMicroOpCache(); fetchWidth = core.getDecodeWidth(); inputPipeToReadNext = 0; sleep = false; @@ -77,33 +82,33 @@ public class FetchLogic extends SimulationElement { Instruction newInstruction; - if(!execEngine.isToStall1() && - !execEngine.isToStall2() && - !execEngine.isToStall3() && - !execEngine.isToStall4() && - !execEngine.isToStall5() && - !execEngine.isToStall6()) + if(execEngine.isToStall5()) + return; + + //add instructions, for whom "fetch" from iCache 
has completed, to fetch buffer + //decode stage reads from this buffer + for(int i = 0; i < fetchWidth; i++) { - //add instructions, for whom "fetch" from iCache has completed, to fetch buffer - //decode stage reads from this buffer - for(int i = 0; i < fetchWidth; i++) + if(fetchBuffer.isFull() == true) { - if(fetchBuffer.isFull() == true) - { - break; - } + break; + } + + newInstruction = iCacheBuffer.getNextInstruction(); + if(newInstruction != null) + { + fetchBuffer.enqueue(newInstruction); - newInstruction = iCacheBuffer.getNextInstruction(); - if(newInstruction != null) + if(SimulationConfig.debugMode) { - fetchBuffer.enqueue(newInstruction); - } - else - { - this.core.getExecEngine().incrementInstructionMemStall(1); - break; + System.out.println("fetched : " + GlobalClock.getCurrentTime()/core.getStepSize() + " : " + newInstruction); } } + else + { + this.core.getExecEngine().incrementInstructionMemStall(1); + break; + } } //this loop reads from inputToPipeline and places the instruction in iCacheBuffer @@ -191,9 +196,31 @@ public class FetchLogic extends SimulationElement { { // The first micro-operation of an instruction has a valid CISC IP. All the subsequent // micro-ops will have IP = -1(meaning invalid). We must not forward this requests to iCache. 
+ // If the micro-ops are available in the micro-op cache, we don't need to access the i-cache + if(newInstruction.getCISCProgramCounter()!=-1 && newInstruction.getCISCProgramCounter() != lastValidIPSeen + && microOpCache.isPresentInCache(newInstruction.getCISCProgramCounter()) == false) + { + execEngine.getCoreMemorySystem().issueRequestToInstrCache(newInstruction.getCISCProgramCounter()); + } + else + { + iCacheBuffer.fetchComplete[iCacheBuffer.tail] = true; + if(newInstruction.getCISCProgramCounter()==-1 || newInstruction.getCISCProgramCounter() == lastValidIPSeen) + microOpCache.isPresentInCache(lastValidIPSeen); //accessing micro-op cache just to get the micro-op cache LRU and counters right + } + if(newInstruction.getCISCProgramCounter()!=-1) { - execEngine.getCoreMemorySystem().issueRequestToInstrCache(newInstruction.getCISCProgramCounter()); + lastValidIPSeen = newInstruction.getCISCProgramCounter(); + } + else + { + newInstruction.setCISCProgramCounter(lastValidIPSeen); + } + + if(SimulationConfig.debugMode) + { + System.out.println("fetch_initiated : " + GlobalClock.getCurrentTime()/core.getStepSize() + " : " + newInstruction); } } } @@ -235,7 +262,8 @@ public class FetchLogic extends SimulationElement { public void processCompletionOfMemRequest(long address) { - iCacheBuffer.updateFetchComplete(address); + int numberOfMicroOpsUpdated = iCacheBuffer.updateFetchComplete(address); + microOpCache.addToCache(address, numberOfMicroOpsUpdated); } public GenericCircularQueue[] getInputToPipeline() { diff --git a/src/simulator/pipeline/outoforder/ICacheBuffer.java b/src/simulator/pipeline/outoforder/ICacheBuffer.java index 2b9f716..857ea0f 100755 --- a/src/simulator/pipeline/outoforder/ICacheBuffer.java +++ b/src/simulator/pipeline/outoforder/ICacheBuffer.java @@ -78,21 +78,26 @@ public class ICacheBuffer { return toBeReturned; } - public void updateFetchComplete(long programCounter) + public int updateFetchComplete(long programCounter) { + int 
numberOfMicroOpsUpdated = 0; + if(head == -1) - return; + return numberOfMicroOpsUpdated; for(int i = head; ; i = (i + 1)%size) { if(buffer[i] != null && buffer[i].getCISCProgramCounter() == programCounter) { fetchComplete[i] = true; + numberOfMicroOpsUpdated++; } if(i == tail) break; } + + return numberOfMicroOpsUpdated; } public boolean isFull() diff --git a/src/simulator/pipeline/outoforder/IWPushLogic.java b/src/simulator/pipeline/outoforder/IWPushLogic.java index b2473c0..dabe4c7 100755 --- a/src/simulator/pipeline/outoforder/IWPushLogic.java +++ b/src/simulator/pipeline/outoforder/IWPushLogic.java @@ -16,7 +16,7 @@ public class IWPushLogic extends SimulationElement { OutOrderExecutionEngine execEngine; GenericCircularQueue renameBuffer; InstructionWindow IW; - int decodeWidth; + int renameWidth; public IWPushLogic(Core core, OutOrderExecutionEngine execEngine) { @@ -25,7 +25,7 @@ public class IWPushLogic extends SimulationElement { this.execEngine = execEngine; renameBuffer = execEngine.getRenameBuffer(); IW = execEngine.getInstructionWindow(); - decodeWidth = core.getDecodeWidth(); + renameWidth = core.getRenameWidth(); } /* @@ -39,7 +39,7 @@ public class IWPushLogic extends SimulationElement { return; } - for(int i = 0; i < decodeWidth; i++) + for(int i = 0; i < renameWidth; i++) { ReorderBufferEntry headROBEntry = renameBuffer.peek(0); if(headROBEntry != null) diff --git a/src/simulator/pipeline/outoforder/MicroOpCache.java b/src/simulator/pipeline/outoforder/MicroOpCache.java new file mode 100644 index 0000000..ab8d76a --- /dev/null +++ b/src/simulator/pipeline/outoforder/MicroOpCache.java @@ -0,0 +1,107 @@ +package pipeline.outoforder; + +import java.util.HashMap; +import java.util.Map; +import java.util.Vector; + +import config.SimulationConfig; +import generic.Event; +import generic.EventQueue; +import generic.GlobalClock; +import generic.PortType; +import generic.SimulationElement; + +public class MicroOpCache extends SimulationElement { + + int 
maxSize; //in terms of number of micro-ops + int curSize; + HashMap uopCache; + + public long numAdditions; + public long numSearches; + public long numHits; + + public MicroOpCache(int maxSize) { + super(PortType.Unlimited, -1, -1, -1, -1); + this.maxSize = maxSize; + uopCache = new HashMap(); + } + + @Override + public void handleEvent(EventQueue eventQ, Event event) { + // TODO Auto-generated method stub + + } + + public boolean isPresentInCache(long searchPC) //will be called for each micro-op (and not each CISC instruction) + { + numSearches++; + + MicroOpCacheEntry entry = uopCache.get(searchPC); + if(entry != null) + { + entry.timeLastUsed = GlobalClock.getCurrentTime(); + numHits++; + if(SimulationConfig.debugMode) + { + System.out.println("hit in microp-cache : " + GlobalClock.getCurrentTime()/24 + " : " + Long.toHexString(searchPC)); + } + return true; + } + + return false; + } + + public void addToCache(long newPC, int numberOfMicroOps) + { + if(uopCache.containsKey(newPC) == false) + { + //remove old entries to make place for the new one + while(curSize + numberOfMicroOps > maxSize) + { + //find LRU PC + long LRU_PC = -1; + MicroOpCacheEntry LRUEntry = null; + for(Map.Entry entry : uopCache.entrySet()) + { + if(LRUEntry == null) + { + LRUEntry = entry.getValue(); + LRU_PC = entry.getKey(); + } + else + { + if(entry.getValue().timeLastUsed < LRUEntry.timeLastUsed) + { + LRUEntry = entry.getValue(); + LRU_PC = entry.getKey(); + } + } + } + + //remove all micro-ops corresponding to LRU PC + uopCache.remove(LRU_PC); + curSize -= LRUEntry.numberOfMicroOps; + } + + //add new micro-ops + MicroOpCacheEntry newEntry = new MicroOpCacheEntry(); + newEntry.numberOfMicroOps = numberOfMicroOps; + newEntry.timeLastUsed = GlobalClock.getCurrentTime(); + uopCache.put(newPC, newEntry); + curSize += numberOfMicroOps; + + numAdditions += numberOfMicroOps; + if(SimulationConfig.debugMode) + { + System.out.println("add to microp-cache : " + GlobalClock.getCurrentTime()/24 + 
" : " + Long.toHexString(newPC)); + } + } + } +} + +class MicroOpCacheEntry +{ + int numberOfMicroOps; + long timeLastUsed; +} \ No newline at end of file diff --git a/src/simulator/pipeline/outoforder/OutOrderExecutionEngine.java b/src/simulator/pipeline/outoforder/OutOrderExecutionEngine.java index eb07b6b..48fceeb 100755 --- a/src/simulator/pipeline/outoforder/OutOrderExecutionEngine.java +++ b/src/simulator/pipeline/outoforder/OutOrderExecutionEngine.java @@ -22,6 +22,7 @@ public class OutOrderExecutionEngine extends ExecutionEngine { //components of the execution engine private ICacheBuffer iCacheBuffer; private FetchLogic fetcher; + private MicroOpCache microOpCache; private GenericCircularQueue fetchBuffer; private DecodeLogic decoder; private GenericCircularQueue decodeBuffer; @@ -81,11 +82,12 @@ public class OutOrderExecutionEngine extends ExecutionEngine { vectorRegisterFile = new RegisterFile(core, core.getVectorRegisterFileSize()); vectorRenameTable = new RenameTable(this, core.getNVectorArchitecturalRegisters(), core.getVectorRegisterFileSize(), vectorRegisterFile, core.getNo_of_input_pipes()); - fetchBuffer = new GenericCircularQueue(Instruction.class, core.getDecodeWidth()); + fetchBuffer = new GenericCircularQueue(Instruction.class, (core.getDecodeWidth()>core.getRenameWidth()?core.getDecodeWidth():core.getRenameWidth())); + microOpCache = new MicroOpCache(core.getCoreConfig().NoOfMicroOpCacheEntries); fetcher = new FetchLogic(core, this); - decodeBuffer = new GenericCircularQueue(ReorderBufferEntry.class, core.getDecodeWidth()); + decodeBuffer = new GenericCircularQueue(ReorderBufferEntry.class, (core.getDecodeWidth()>core.getRenameWidth()?core.getDecodeWidth():core.getRenameWidth())); decoder = new DecodeLogic(core, this); - renameBuffer = new GenericCircularQueue(ReorderBufferEntry.class, core.getDecodeWidth()); + renameBuffer = new GenericCircularQueue(ReorderBufferEntry.class, 
(core.getDecodeWidth()>core.getRenameWidth()?core.getDecodeWidth():core.getRenameWidth())); renamer = new RenameLogic(core, this); IWPusher = new IWPushLogic(core, this); selector = new SelectLogic(core, this); @@ -201,6 +203,10 @@ public class OutOrderExecutionEngine extends ExecutionEngine { public GenericCircularQueue getRenameBuffer() { return renameBuffer; } + + public MicroOpCache getMicroOpCache() { + return microOpCache; + } public FetchLogic getFetcher() { return fetcher; diff --git a/src/simulator/pipeline/outoforder/RenameLogic.java b/src/simulator/pipeline/outoforder/RenameLogic.java index 9f407fb..f3a9c9e 100755 --- a/src/simulator/pipeline/outoforder/RenameLogic.java +++ b/src/simulator/pipeline/outoforder/RenameLogic.java @@ -23,7 +23,7 @@ public class RenameLogic extends SimulationElement { OutOrderExecutionEngine execEngine; GenericCircularQueue decodeBuffer; GenericCircularQueue renameBuffer; - int decodeWidth; + int renameWidth; int threadID; Instruction instruction; @@ -37,18 +37,17 @@ public class RenameLogic extends SimulationElement { this.execEngine = execEngine; decodeBuffer = execEngine.getDecodeBuffer(); renameBuffer = execEngine.getRenameBuffer(); - decodeWidth = core.getDecodeWidth(); + renameWidth = core.getRenameWidth(); } public void performRename() { - if(execEngine.isToStall5() == true /*pipeline stalled due to branch mis-prediction*/ - || execEngine.isToStall1() == true /*IW full*/) + if(execEngine.isToStall5() == true /*pipeline stalled due to branch mis-prediction*/) { return; } - for(int i = 0; i < decodeWidth; i++) + for(int i = 0; i < renameWidth; i++) { if(renameBuffer.isFull() == true) { diff --git a/src/simulator/pipeline/outoforder/ReorderBuffer.java b/src/simulator/pipeline/outoforder/ReorderBuffer.java index 371c858..34e0c9a 100755 --- a/src/simulator/pipeline/outoforder/ReorderBuffer.java +++ b/src/simulator/pipeline/outoforder/ReorderBuffer.java @@ -21,9 +21,11 @@ import java.io.OutputStreamWriter; import 
java.util.zip.GZIPOutputStream; import main.CustomObjectPool; +import pipeline.branchpredictor.TAGESCL.TAGESCL; import config.EmulatorConfig; import config.EnergyConfig; import config.SimulationConfig; +import config.BranchPredictorConfig.BP; public class ReorderBuffer extends SimulationElement{ @@ -49,6 +51,8 @@ public class ReorderBuffer extends SimulationElement{ int stall6Count; long branchCount; long mispredCount; + public long predicateCount; + public long predicateMispredCount; long jumpCount; long targetMispredCount; long lastValidIPSeen; @@ -249,45 +253,120 @@ public class ReorderBuffer extends SimulationElement{ if(firstOpType == OperationType.branch) { //perform prediction - boolean prediction = execEngine.getBranchPredictor().predict( - lastValidIPSeen, - first.getInstruction().isBranchTaken()); + boolean prediction; + prediction = execEngine.getBranchPredictor().predict( + lastValidIPSeen, + first.getInstruction().isBranchTaken()); + if(prediction != first.getInstruction().isBranchTaken()) - { + { + if(SimulationConfig.debugMode) + { + System.out.println("branch mispredicted : " + firstInstruction.getSerialNo()); + } + anyMispredictedBranch = true; mispredCount++; - } - this.execEngine.getBranchPredictor().incrementNumAccesses(1); + } //train predictor - execEngine.getBranchPredictor().Train( - lastValidIPSeen, - firstInstruction.isBranchTaken(), - prediction - ); - this.execEngine.getBranchPredictor().incrementNumAccesses(1); + if(core.getCoreConfig().branchPredictor.predictorMode != BP.TAGE_SC_L) + { + execEngine.getBranchPredictor().Train( + lastValidIPSeen, + firstInstruction.isBranchTaken(), + prediction + ); + } + else + { + ((TAGESCL)execEngine.getBranchPredictor()).Train( + lastValidIPSeen, + 9, //OPTYPE_JMP_DIRECT_COND, + firstInstruction.isBranchTaken(), + prediction, + firstInstruction.getBranchTargetAddress() + ); + } this.execEngine.getBTB().GHRTrain(firstInstruction.isBranchTaken()); - + 
this.execEngine.getBranchPredictor().incrementNumAccesses(2); + branchCount++; } //jump operation if(firstOpType == OperationType.jump) { - long actualTarget = firstInstruction.getBranchTargetAddress(); + long actualTarget = first.getInstruction().getBranchTargetAddress(); long predictedTarget = this.execEngine.getBTB().BTBPredict(lastValidIPSeen); if(actualTarget != predictedTarget) { + if(SimulationConfig.debugMode) + { + System.out.println("jump target mispredicted : " + firstInstruction.getSerialNo()); + } + anyMispredictedBranch = true; targetMispredCount++; } - + this.execEngine.getBTB().BTBTrain(lastValidIPSeen, actualTarget); + + if(core.getCoreConfig().branchPredictor.predictorMode == BP.TAGE_SC_L) + { + ((TAGESCL)execEngine.getBranchPredictor()).Train( + lastValidIPSeen, + 4, //OPTYPE_JMP_DIRECT_UNCOND, + true, + true, + firstInstruction.getBranchTargetAddress() + ); + } jumpCount++; } + //predicate prediction + if(firstInstruction.isPredicate()) + { + //perform prediction + boolean prediction = execEngine.getBranchPredictor().predict( + lastValidIPSeen, + !first.getInstruction().isPredicateAndNotExecuted()); + if(prediction != !first.getInstruction().isPredicateAndNotExecuted()) + { + if(SimulationConfig.debugMode) + { + System.out.println("predicate mispredicted : " + firstInstruction.getSerialNo()); + } + + anyMispredictedBranch = true; + predicateMispredCount++; + } + + //train predictor + if(core.getCoreConfig().branchPredictor.predictorMode != BP.TAGE_SC_L) + { + execEngine.getBranchPredictor().Train( + lastValidIPSeen, + !firstInstruction.isPredicateAndNotExecuted(), + prediction + ); + } + else + { + ((TAGESCL)execEngine.getBranchPredictor()).Train( + lastValidIPSeen, + 9, //OPTYPE_JMP_DIRECT_COND, + !firstInstruction.isPredicateAndNotExecuted(), + prediction, + firstInstruction.getBranchTargetAddress() + ); + } + predicateCount++; + } + //Signal LSQ for committing the Instruction at the queue head if(firstOpType == OperationType.load || 
firstOpType == OperationType.store) { @@ -648,6 +727,10 @@ public class ReorderBuffer extends SimulationElement{ return stall5Count; } + public int getStall6Count() { + return stall6Count; + } + public long getBranchCount() { return branchCount; }