LLVM学习笔记(43-2)

V7.0的变化

V7.0 SubtargetEmitter::EmitProcessorModels()改写颇多,因为对处理器的描述进行了相当程度的增强。

1344  void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {

1345    // For each processor model.

1346    for (const CodeGenProcModel &PM : SchedModels.procModels()) {

1347      // Emit extra processor info if available.

1348      if (PM.hasExtraProcessorInfo())

1349        EmitExtraProcessorInfo(PM, OS);

1350      // Emit processor resource table.

1351      if (PM.hasInstrSchedModel())

1352        EmitProcessorResources(PM, OS);

1353      else if(!PM.ProcResourceDefs.empty())

1354        PrintFatalError(PM.ModelDef->getLoc(), "SchedMachineModel defines "

1355                      "ProcResources without defining WriteRes SchedWriteRes");

1356 

1357      // Begin processor itinerary properties

1358      OS << "\n";

1359      OS << "static const llvm::MCSchedModel " << PM.ModelName << " = {\n";

1360      EmitProcessorProp(OS, PM.ModelDef, "IssueWidth", ',');

1361      EmitProcessorProp(OS, PM.ModelDef, "MicroOpBufferSize", ',');

1362      EmitProcessorProp(OS, PM.ModelDef, "LoopMicroOpBufferSize", ',');

1363      EmitProcessorProp(OS, PM.ModelDef, "LoadLatency", ',');

1364      EmitProcessorProp(OS, PM.ModelDef, "HighLatency", ',');

1365      EmitProcessorProp(OS, PM.ModelDef, "MispredictPenalty", ',');

1366 

1367      bool PostRAScheduler =

1368        (PM.ModelDef ? PM.ModelDef->getValueAsBit("PostRAScheduler") : false);

1369 

1370      OS << "  " << (PostRAScheduler ? "true" : "false")  << ", // "

1371         << "PostRAScheduler\n";

1372 

1373      bool CompleteModel =

1374        (PM.ModelDef ? PM.ModelDef->getValueAsBit("CompleteModel") : false);

1375 

1376      OS << "  " << (CompleteModel ? "true" : "false") << ", // "

1377         << "CompleteModel\n";

1378 

1379      OS << "  " << PM.Index << ", // Processor ID\n";

1380      if (PM.hasInstrSchedModel())

1381        OS << "  " << PM.ModelName << "ProcResources" << ",\n"

1382           << "  " << PM.ModelName << "SchedClasses" << ",\n"

1383           << "  " << PM.ProcResourceDefs.size()+1 << ",\n"

1384           << "  " << (SchedModels.schedClassEnd()

1385                       - SchedModels.schedClassBegin()) << ",\n";

1386      else

1387        OS << "  nullptr, nullptr, 0, 0,"

1388           << " // No instruction-level machine model.\n";

1389      if (PM.hasItineraries())

1390        OS << "  " << PM.ItinsDef->getName() << ",\n";

1391      else

1392        OS << "  nullptr, // No Itinerary\n";

1393      if (PM.hasExtraProcessorInfo())

1394        OS << "  &" << PM.ModelName << "ExtraInfo,\n";

1395      else

1396        OS << "  nullptr // No extra processor descriptor\n";

1397      OS << "};\n";

1398    }

1399  }

我们知道在CodeGenSchedModels的ProcModels容器里,第一个调度模型是等价于空指针的NoSchedModel。首先看一下1359行以下的代码为这个模型输出的内容:

static const llvm::MCSchedModel NoSchedModel = {

  MCSchedModel::DefaultIssueWidth,

  MCSchedModel::DefaultMicroOpBufferSize,

  MCSchedModel::DefaultLoopMicroOpBufferSize,

  MCSchedModel::DefaultLoadLatency,

  MCSchedModel::DefaultHighLatency,

  MCSchedModel::DefaultMispredictPenalty,

  false, // PostRAScheduler

  false, // CompleteModel

  0, // Processor ID

  nullptr, nullptr, 0, 0, // No instruction-level machine model.

  nullptr, // No Itinerary

  nullptr // No extra processor descriptor

};

而对于定义了回收控制单元、寄存器文件、性能计数器的处理器来说,还需要上面1349行的EmitExtraProcessorInfo()来输出额外的处理器信息:

753     void SubtargetEmitter::EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,

754                                                   raw_ostream &OS) {

755       // Generate a table of register file descriptors (one entry per each user

756       // defined register file), and a table of register costs.

757       unsigned NumCostEntries = EmitRegisterFileTables(ProcModel, OS);

758    

759       // Generate a table of ProcRes counter names.

760       const bool HasPfmIssueCounters = EmitPfmIssueCountersTable(ProcModel, OS);

761    

762       // Now generate a table for the extra processor info.

763       OS << "\nstatic const llvm::MCExtraProcessorInfo " << ProcModel.ModelName

764          << "ExtraInfo = {\n  ";

765    

766       // Add information related to the retire control unit.

767       EmitRetireControlUnitInfo(ProcModel, OS);

768    

769       // Add information related to the register files (i.e. where to find register

770       // file descriptors and register costs).

771       EmitRegisterFileInfo(ProcModel, ProcModel.RegisterFiles.size(),

772                            NumCostEntries, OS);

773    

774       EmitPfmCounters(ProcModel, HasPfmIssueCounters, OS);

775    

776       OS << "};\n";

777     }

首先由EmitRegisterFileTables()输出寄存器重命名代价表与寄存器文件描述表。

646     unsigned

647     SubtargetEmitter::EmitRegisterFileTables(const CodeGenProcModel &ProcModel,

648                                              raw_ostream &OS) {

649       if (llvm::all_of(ProcModel.RegisterFiles, [](const CodeGenRegisterFile &RF) {

650             return RF.hasDefaultCosts();

651           }))

652         return 0;

653    

654       // Print the RegisterCost table first.

655       OS << "\n// {RegisterClassID, Register Cost}\n";

656       OS << "static const llvm::MCRegisterCostEntry " << ProcModel.ModelName

657          << "RegisterCosts"

658          << "[] = {\n";

659    

660       for (const CodeGenRegisterFile &RF : ProcModel.RegisterFiles) {

661         // Skip register files with a default cost table.

662         if (RF.hasDefaultCosts())

663           continue;

664         // Add entries to the cost table.

665         for (const CodeGenRegisterCost &RC : RF.Costs) {

666           OS << "  { ";

667           Record *Rec = RC.RCDef;

668           if (Rec->getValue("Namespace"))

669             OS << Rec->getValueAsString("Namespace") << "::";

670           OS << Rec->getName() << "RegClassID, " << RC.Cost << "},\n";

671         }

672       }

673       OS << "};\n";

674    

675       // Now generate a table with register file info.

676       OS << "\n // {Name, #PhysRegs, #CostEntries, IndexToCostTbl}\n";

677       OS << "static const llvm::MCRegisterFileDesc " << ProcModel.ModelName

678          << "RegisterFiles"

679          << "[] = {\n"

680          << "  { \"InvalidRegisterFile\", 0, 0, 0 },\n";

681       unsigned CostTblIndex = 0;

682    

683       for (const CodeGenRegisterFile &RD : ProcModel.RegisterFiles) {

684         OS << "  { ";

685         OS << '"' << RD.Name << '"' << ", " << RD.NumPhysRegs << ", ";

686         unsigned NumCostEntries = RD.Costs.size();

687         OS << NumCostEntries << ", " << CostTblIndex << "},\n";

688         CostTblIndex += NumCostEntries;

689       }

690       OS << "};\n";

691    

692       return CostTblIndex;

693     }

在RegisterFile的定义里(参考RegisterFile的定义一节),如果RegCosts部分是缺省的,表示重命名代价是一个物理寄存器,这时CodeGenSchedModels::collectRegisterFiles()会自动给相应的CodeGenRegisterFile实例的Costs里赋值1。

上面这段代码,以Zen处理器为例,输出这两个表:

// {RegisterClassID, Register Cost}

static const llvm::MCRegisterCostEntry Znver1ModelRegisterCosts[] = {

  { X86::VR64RegClassID, 1},

  { X86::VR128RegClassID, 1},

  { X86::VR256RegClassID, 2},

  { X86::GR64RegClassID, 1},

  { X86::CCRRegClassID, 1},

};

 

 // {Name, #PhysRegs, #CostEntries, IndexToCostTbl}

static const llvm::MCRegisterFileDesc Znver1ModelRegisterFiles[] = {

  { "InvalidRegisterFile", 0, 0, 0 },

  { "ZnFpuPRF", 160, 3, 0},

  { "ZnIntegerPRF", 168, 2, 3},

};

接下来,对于定义了性能计数器的处理器,比如SandyBridge,输出计数器表(参考处理器的性能计数器定义一节)。

695     static bool EmitPfmIssueCountersTable(const CodeGenProcModel &ProcModel,

696                                           raw_ostream &OS) {

697       unsigned NumCounterDefs = 1 + ProcModel.ProcResourceDefs.size();

698       std::vector<const Record *> CounterDefs(NumCounterDefs);

699       bool HasCounters = false;

700       for (const Record *CounterDef : ProcModel.PfmIssueCounterDefs) {

701         const Record *&CD = CounterDefs[ProcModel.getProcResourceIdx(

702             CounterDef->getValueAsDef("Resource"))];

703         if (CD) {

704           PrintFatalError(CounterDef->getLoc(),

705                           "multiple issue counters for " +

706                               CounterDef->getValueAsDef("Resource")->getName());

707         }

708         CD = CounterDef;

709         HasCounters = true;

710       }

711       if (!HasCounters) {

712         return false;

713       }

714       OS << "\nstatic const char* " << ProcModel.ModelName

715          << "PfmIssueCounters[] = {\n";

716       for (unsigned i = 0; i != NumCounterDefs; ++i) {

717         const Record *CounterDef = CounterDefs[i];

718         if (CounterDef) {

719           const auto PfmCounters = CounterDef->getValueAsListOfStrings("Counters");

720           if (PfmCounters.empty())

721             PrintFatalError(CounterDef->getLoc(), "empty counter list");

722           OS << "  \"" << PfmCounters[0];

723           for (unsigned p = 1, e = PfmCounters.size(); p != e; ++p)

724             OS << ",\" \"" << PfmCounters[p];

725           OS << "\",  // #" << i << " = ";

726           OS << CounterDef->getValueAsDef("Resource")->getName() << "\n";

727         } else {

728           OS << "  nullptr, // #" << i << "\n";

729         }

730       }

731       OS << "};\n";

732       return true;

733     }

以SandyBridge的一个定义为例,这个定义是(X86PfmCounters.td):

14       let SchedModel = SandyBridgeModel in {

15       def SBCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;

16       def SBPort0Counter : PfmIssueCounter<SBPort0, ["uops_dispatched_port:port_0"]>;

17       def SBPort1Counter : PfmIssueCounter<SBPort1, ["uops_dispatched_port:port_1"]>;

18       def SBPort23Counter : PfmIssueCounter<SBPort23,

19                                             ["uops_dispatched_port:port_2",

20                                              "uops_dispatched_port:port_3"]>;

21       def SBPort4Counter : PfmIssueCounter<SBPort4, ["uops_dispatched_port:port_4"]>;

22       def SBPort5Counter : PfmIssueCounter<SBPort5, ["uops_dispatched_port:port_5"]>;

23       }

上述代码将输出这样的数组,这个数组的大小是资源单元定义的个数加1,因此数组是以处理器资源的序号为索引的:

static const char* SandyBridgeModelPfmIssueCounters[] = {

  nullptr, // #0

  nullptr, // #1

  nullptr, // #2

  "uops_dispatched_port:port_0",  // #3 = SBPort0

  "uops_dispatched_port:port_1",  // #4 = SBPort1

  "uops_dispatched_port:port_4",  // #5 = SBPort4

  "uops_dispatched_port:port_5",  // #6 = SBPort5

  nullptr, // #7

  nullptr, // #8

  nullptr, // #9

  "uops_dispatched_port:port_2," "uops_dispatched_port:port_3",  // #10 = SBPort23

  nullptr, // #11

  nullptr, // #12

};

接着通过下面的EmitRetireControlUnitInfo()、EmitRegisterFileInfo()与EmitPfmCounters()输出所谓的ExtraInfo结构体。这个结构体的类型是MCExtraProcessorInfo:

170     struct MCExtraProcessorInfo {

171       // Actual size of the reorder buffer in hardware.

172       unsigned ReorderBufferSize;

173       // Number of instructions retired per cycle.

174       unsigned MaxRetirePerCycle;

175       const MCRegisterFileDesc *RegisterFiles;

176       unsigned NumRegisterFiles;

177       const MCRegisterCostEntry *RegisterCostTable;

178       unsigned NumRegisterCostEntries;

179    

180       struct PfmCountersInfo {

181         // An optional name of a performance counter that can be used to measure

182         // cycles.

183         const char *CycleCounter;

184    

185         // For each MCProcResourceDesc defined by the processor, an optional list of

186         // names of performance counters that can be used to measure the resource

187         // utilization.

188         const char **IssueCounters;

189       };

190       PfmCountersInfo PfmCounters;

191     };

因此,将输出这样的一个结构体定义:

static const llvm::MCExtraProcessorInfo SandyBridgeModelExtraInfo = {

  0, // ReorderBufferSize

  0, // MaxRetirePerCycle

  nullptr,

  0, // Number of register files.

  nullptr,

  0, // Number of register cost entries.

  {

    "unhalted_core_cycles",  // Cycle counter.

    SandyBridgeModelPfmIssueCounters

  }

};

最后对有调度类型被重新映射的处理器模型,由下面的方法输出特定的数据结构。

779     void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel,

780                                                   raw_ostream &OS) {

781       EmitProcessorResourceSubUnits(ProcModel, OS);

782    

783       OS << "\n// {Name, NumUnits, SuperIdx, IsBuffered, SubUnitsIdxBegin}\n";

784       OS << "static const llvm::MCProcResourceDesc " << ProcModel.ModelName

785          << "ProcResources"

786          << "[] = {\n"

787          << "  {\"InvalidUnit\", 0, 0, 0, 0},\n";

788    

789       unsigned SubUnitsOffset = 1;

790       for (unsigned i = 0, e = ProcModel.ProcResourceDefs.size(); i < e; ++i) {

791         Record *PRDef = ProcModel.ProcResourceDefs[i];

792    

793         Record *SuperDef = nullptr;

794         unsigned SuperIdx = 0;

795         unsigned NumUnits = 0;

796         const unsigned SubUnitsBeginOffset = SubUnitsOffset;

797         int BufferSize = PRDef->getValueAsInt("BufferSize");

798         if (PRDef->isSubClassOf("ProcResGroup")) {

799           RecVec ResUnits = PRDef->getValueAsListOfDefs("Resources");

800           for (Record *RU : ResUnits) {

801             NumUnits += RU->getValueAsInt("NumUnits");

802             SubUnitsOffset += RU->getValueAsInt("NumUnits");

803           }

804         }

805         else {

806           // Find the SuperIdx

807           if (PRDef->getValueInit("Super")->isComplete()) {

808             SuperDef =

809                 SchedModels.findProcResUnits(PRDef->getValueAsDef("Super"),

810                                              ProcModel, PRDef->getLoc());

811             SuperIdx = ProcModel.getProcResourceIdx(SuperDef);

812           }

813           NumUnits = PRDef->getValueAsInt("NumUnits");

814         }

815         // Emit the ProcResourceDesc

816         OS << "  {\"" << PRDef->getName() << "\", ";

817         if (PRDef->getName().size() < 15)

818           OS.indent(15 - PRDef->getName().size());

819         OS << NumUnits << ", " << SuperIdx << ", " << BufferSize << ", ";

820         if (SubUnitsBeginOffset != SubUnitsOffset) {

821           OS << ProcModel.ModelName << "ProcResourceSubUnits + "

822              << SubUnitsBeginOffset;

823         } else {

824           OS << "nullptr";

825         }

826         OS << "}, // #" << i+1;

827         if (SuperDef)

828           OS << ", Super=" << SuperDef->getName();

829         OS << "\n";

830       }

831       OS << "};\n";

832     }

首先是输出资源子单元的描述。前面看到TD通过ProcResGroup或者ProcResource来组织与描述资源,资源描述的最小单位是资源单元(ProcResourceUnits),因此这里尝试输出一个以资源单元描述的资源组。

593     void SubtargetEmitter::EmitProcessorResourceSubUnits(

594         const CodeGenProcModel &ProcModel, raw_ostream &OS) {

595       OS << "\nstatic const unsigned " << ProcModel.ModelName

596          << "ProcResourceSubUnits[] = {\n"

597          << "  0,  // Invalid\n";

598    

599       for (unsigned i = 0, e = ProcModel.ProcResourceDefs.size(); i < e; ++i) {

600         Record *PRDef = ProcModel.ProcResourceDefs[i];

601         if (!PRDef->isSubClassOf("ProcResGroup"))

602           continue;

603         RecVec ResUnits = PRDef->getValueAsListOfDefs("Resources");

604         for (Record *RUDef : ResUnits) {

605           Record *const RU =

606               SchedModels.findProcResUnits(RUDef, ProcModel, PRDef->getLoc());

607           for (unsigned J = 0; J < RU->getValueAsInt("NumUnits"); ++J) {

608             OS << "  " << ProcModel.getProcResourceIdx(RU) << ", ";

609           }

610         }

611         OS << "  // " << PRDef->getName() << "\n";

612       }

613       OS << "};\n";

614     }

以SandyBridge为例,将输出这样的一个数组:

static const unsigned SandyBridgeModelProcResourceSubUnits[] = {

  0,  // Invalid

  3,   4,   // SBPort01

  3,   6,   // SBPort05

  4,   6,   // SBPort15

  3,   4,   6,   // SBPort015

  3,   4,   10,   10,   5,   6,   // SBPortAny

};

SubtargetEmitter::EmitProcessorResources()的主体输出资源单元的描述结构体。这个结构体的类型是MCProcResourceDesc:

32       struct MCProcResourceDesc {

33         const char *Name;

34         unsigned NumUnits; // Number of resource of this kind

35         unsigned SuperIdx; // Index of the resources kind that contains this kind.

36      

37         // Number of resources that may be buffered.

38         //

39         // Buffered resources (BufferSize != 0) may be consumed at some indeterminate

40         // cycle after dispatch. This should be used for out-of-order cpus when

41         // instructions that use this resource can be buffered in a reservaton

42         // station.

43         //

44         // Unbuffered resources (BufferSize == 0) always consume their resource some

45         // fixed number of cycles after dispatch. If a resource is unbuffered, then

46         // the scheduler will avoid scheduling instructions with conflicting resources

47         // in the same cycle. This is for in-order cpus, or the in-order portion of

48         // an out-of-order cpus.

49         int BufferSize;

50      

51         // If the resource has sub-units, a pointer to the first element of an array

52         // of `NumUnits` elements containing the ProcResourceIdx of the sub units.

53         // nullptr if the resource does not have sub-units.

54         const unsigned *SubUnitsIdxBegin;

55      

56         bool operator==(const MCProcResourceDesc &Other) const {

57           return NumUnits == Other.NumUnits && SuperIdx == Other.SuperIdx

58             && BufferSize == Other.BufferSize;

59         }

60       };

因此,输出的数据是:

// {Name, NumUnits, SuperIdx, IsBuffered, SubUnitsIdxBegin}

static const llvm::MCProcResourceDesc SandyBridgeModelProcResources[] = {

  {"InvalidUnit", 0, 0, 0, 0},

  {"SBDivider",       1, 0, -1, nullptr}, // #1

  {"SBFPDivider",     1, 0, -1, nullptr}, // #2

  {"SBPort0",         1, 0, -1, nullptr}, // #3

  {"SBPort1",         1, 0, -1, nullptr}, // #4

  {"SBPort4",         1, 0, -1, nullptr}, // #5

  {"SBPort5",         1, 0, -1, nullptr}, // #6

  {"SBPort01",        2, 0, -1, SandyBridgeModelProcResourceSubUnits + 1}, // #7

  {"SBPort05",        2, 0, -1, SandyBridgeModelProcResourceSubUnits + 3}, // #8

  {"SBPort15",        2, 0, -1, SandyBridgeModelProcResourceSubUnits + 5}, // #9

  {"SBPort23",        2, 0, -1, nullptr}, // #10

  {"SBPort015",       3, 0, -1, SandyBridgeModelProcResourceSubUnits + 7}, // #11

  {"SBPortAny",       6, 0, 54, SandyBridgeModelProcResourceSubUnits + 10}, // #12

};

SandyBridge没有使用超级资源的描述方法,因此SuperIdx域都是0。而IsBuffered域实际上是缓冲的大小,BufferSize = -1表示发布端口由一体化保留站填充。

回到SubtargetEmitter::run(),下面的代码输出一个重要的方法:InitX86MCSubtargetInfo()。

SubtargetEmitter::run(续)

1437    // MCInstrInfo initialization routine.                                                                                     <- v7.0删除

1438    OS << "static inline void Init" << Target

1439       << "MCSubtargetInfo(MCSubtargetInfo *II, "

1440       << "const Triple &TT, StringRef CPU, StringRef FS) {\n";

1441    OS << "  II->InitMCSubtargetInfo(TT, CPU, FS, ";

 

  // MCInstrInfo initialization routine.                                                                                     <- v7.0增加

  emitGenMCSubtargetInfo(OS);

 

  OS << "\nstatic inline MCSubtargetInfo *create" << Target

     << "MCSubtargetInfoImpl("

     << "const Triple &TT, StringRef CPU, StringRef FS) {\n";

  OS << "  return new " << Target << "GenMCSubtargetInfo(TT, CPU, FS, ";

1442    if (NumFeatures)

1443      OS << Target << "FeatureKV, ";

1444    else

1445      OS << "None, ";

1446    if (NumProcs)

1447      OS << Target << "SubTypeKV, ";

1448    else

1449      OS << "None, ";

1450    OS << '\n'; OS.indent(22);

1451    OS << Target << "ProcSchedKV, "

1452       << Target << "WriteProcResTable, "

1453       << Target << "WriteLatencyTable, "

1454       << Target << "ReadAdvanceTable, ";

1455    if (SchedModels.hasItineraries()) {

1456      OS << '\n'; OS.indent(22);

1457      OS << Target << "Stages, "

1458         << Target << "OperandCycles, "

1459         << Target << "ForwardingPaths";

1460    } else

1461      OS << "0, 0, 0";

1462    OS << ");\n}\n\n";

1463 

1464    OS << "} // End llvm namespace \n";

1465 

1466    OS << "#endif // GET_SUBTARGETINFO_MC_DESC\n\n";

生成的InitX86MCSubtargetInfo()的定义如下(连带收尾代码):

#undef DBGFIELD

static inline void InitX86MCSubtargetInfo(MCSubtargetInfo *II, const Triple &TT, StringRef CPU, StringRef FS) {

  II->InitMCSubtargetInfo(TT, CPU, FS, X86FeatureKV, X86SubTypeKV,

                      X86ProcSchedKV, X86WriteProcResTable, X86WriteLatencyTable, X86ReadAdvanceTable,

                      X86Stages, X86OperandCycles, X86ForwardingPaths);

}

} // End llvm namespace

#endif // GET_SUBTARGETINFO_MC_DESC

通过这个方法,X86目标机器的数据就与机器无关的MC框架挂上钩。

V7.0的变化——resolveVariantSchedClassImpl()等

V7.0不再输出InitX86MCSubtargetInfo()方法,现在的方法是createX86MCSubtargetInfoImpl()。方法emitGenMCSubtargetInfo()用于构建这个方法所需要的X86GenMCSubtargetInfo类型,以及现在需要的X86GenMCSubtargetInfo::resolveVariantSchedClass()方法。

1678  void SubtargetEmitter::emitGenMCSubtargetInfo(raw_ostream &OS) {

1679    OS << "namespace " << Target << "_MC {\n"

1680       << "unsigned resolveVariantSchedClassImpl(unsigned SchedClass,\n"

1681       << "    const MCInst *MI, unsigned CPUID) {\n";

1682    emitSchedModelHelpersImpl(OS, /* OnlyExpandMCPredicates */ true);

1683    OS << "}\n";

1684    OS << "} // end of namespace " << Target << "_MC\n\n";

1685 

1686    OS << "struct " << Target

1687       << "GenMCSubtargetInfo : public MCSubtargetInfo {\n";

1688    OS << "  " << Target << "GenMCSubtargetInfo(const Triple &TT, \n"

1689       << "    StringRef CPU, StringRef FS, ArrayRef<SubtargetFeatureKV> PF,\n"

1690       << "    ArrayRef<SubtargetFeatureKV> PD,\n"

1691       << "    const SubtargetInfoKV *ProcSched,\n"

1692       << "    const MCWriteProcResEntry *WPR,\n"

1693       << "    const MCWriteLatencyEntry *WL,\n"

1694       << "    const MCReadAdvanceEntry *RA, const InstrStage *IS,\n"

1695       << "    const unsigned *OC, const unsigned *FP) :\n"

1696       << "      MCSubtargetInfo(TT, CPU, FS, PF, PD, ProcSched,\n"

1697       << "                      WPR, WL, RA, IS, OC, FP) { }\n\n"

1698       << "  unsigned resolveVariantSchedClass(unsigned SchedClass,\n"

1699       << "      const MCInst *MI, unsigned CPUID) const override {\n"

1700       << "    return " << Target << "_MC"

1701       << "::resolveVariantSchedClassImpl(SchedClass, MI, CPUID); \n";

1702    OS << "  }\n";

1703    OS << "};\n";

1704  }

上面1682行的emitSchedModelHelpersImpl()输出resolveVariantSchedClassImpl()的主体。1464行的循环首先收集发生了迁移的调度类,即被InstRW定义改写的调度方案。在存在这样的调度类的情形下,在1476行循环遍历当前调度类的迁移方案。因为现在是为MCSubtargetInfo生成代码,参数OnlyExpandMCInstPredicates为true,当前CodeGenSchedTransition实例里的谓词必须都是MCSchedPredicate定义,否则该迁移被跳过(1477~1481行)。

1460  void SubtargetEmitter::emitSchedModelHelpersImpl(

1461      raw_ostream &OS, bool OnlyExpandMCInstPredicates) {

1462    // Collect Variant Classes.

1463    IdxVec VariantClasses;

1464    for (const CodeGenSchedClass &SC : SchedModels.schedClasses()) {

1465      if (SC.Transitions.empty())

1466        continue;

1467      VariantClasses.push_back(SC.Index);

1468    }

1469 

1470    if (!VariantClasses.empty()) {

1471      bool FoundPredicates = false;

1472     for (unsigned VC : VariantClasses) {

1473        // Emit code for each variant scheduling class.

1474        const CodeGenSchedClass &SC = SchedModels.getSchedClass(VC);

1475        IdxVec ProcIndices;

1476        for (const CodeGenSchedTransition &T : SC.Transitions) {

1477          if (OnlyExpandMCInstPredicates &&

1478              !all_of(T.PredTerm, [](const Record *Rec) {

1479                return Rec->isSubClassOf("MCSchedPredicate");

1480              }))

1481            continue;

1482 

1483          IdxVec PI;

1484          std::set_union(T.ProcIndices.begin(), T.ProcIndices.end(),

1485                         ProcIndices.begin(), ProcIndices.end(),

1486                         std::back_inserter(PI));

1487          ProcIndices.swap(PI);

1488        }

1489        if (ProcIndices.empty())

1490          continue;

1491 

1492        // Emit a switch statement only if there are predicates to expand.

1493        if (!FoundPredicates) {

1494          OS << "  switch (SchedClass) {\n";

1495          FoundPredicates = true;

1496        }

1497 

1498        OS << "  case " << VC << ": // " << SC.Name << '\n';

1499        PredicateExpander PE;

1500        PE.setByRef(false);

1501        PE.setExpandForMC(OnlyExpandMCInstPredicates);

1502        for (unsigned PI : ProcIndices) {

1503          OS << "    ";

1504          if (PI != 0) {

1505            OS << (OnlyExpandMCInstPredicates

1506                       ? "if (CPUID == "

1507                       : "if (SchedModel->getProcessorID() == ");

1508            OS << PI << ") ";

1509          }

1510          OS << "{ // " << (SchedModels.procModelBegin() + PI)->ModelName << '\n';

1511 

1512          for (const CodeGenSchedTransition &T : SC.Transitions) {

1513            if (PI != 0 && !count(T.ProcIndices, PI))

1514              continue;

1515            PE.setIndentLevel(4);

1516            emitPredicates(T, SchedModels.getSchedClass(T.ToClassIdx), PE, OS);

1517          }

1518 

1519          OS << "    }\n";

1520          if (PI == 0)

1521            break;

1522        }

1523        if (SC.isInferred())

1524          OS << "    return " << SC.Index << ";\n";

1525        OS << "    break;\n";

1526      }

1527 

1528      if (FoundPredicates)

1529       OS << "  };\n";

1530    }

1531 

1532    if (OnlyExpandMCInstPredicates) {

1533      OS << "  // Don't know how to resolve this scheduling class.\n"

1534         << "  return 0;\n";

1535      return;

1536    }

1537 

1538    OS << "  report_fatal_error(\"Expected a variant SchedClass\");\n";

1539  }

另外,1484行的set_union()操作将一个调度类所有迁移支持的处理器取并集,并为每个记录下来的CodeGenSchedTransition对象调用emitPredicates()方法(1516行)来生成谓词代码:

1482  static void emitPredicates(const CodeGenSchedTransition &T,

1483                             const CodeGenSchedClass &SC,

1484                             PredicateExpander &PE,

1485                             raw_ostream &OS) {

1486    std::string Buffer;

1487    raw_string_ostream StringStream(Buffer);

1488    formatted_raw_ostream FOS(StringStream);

1489 

1490    FOS.PadToColumn(6);

1491    FOS << "if (";

1492    for (RecIter RI = T.PredTerm.begin(), RE = T.PredTerm.end(); RI != RE; ++RI) {

1493      if (RI != T.PredTerm.begin()) {

1494        FOS << "\n";

1495        FOS.PadToColumn(8);

1496        FOS << "&& ";

1497      }

1498      const Record *Rec = *RI;

1499      if (Rec->isSubClassOf("MCSchedPredicate"))

1500        PE.expandPredicate(FOS, Rec->getValueAsDef("Pred"));

1501      else

1502        FOS << "(" << Rec->getValueAsString("Predicate") << ")";

1503    }

1504 

1505    FOS << ")\n";

1506    FOS.PadToColumn(8);

1507    FOS << "return " << T.ToClassIdx << "; // " << SC.Name << '\n';

1508    FOS.flush();

1509    OS << Buffer;

1510  }

那么上面的方法输出这样的代码:

namespace X86_MC {

unsigned resolveVariantSchedClassImpl(unsigned SchedClass,

    const MCInst *MI, unsigned CPUID) {

  switch (SchedClass) {

  case 577: // MMX_PADDQirr_MMX_PSUBQirr

    if (CPUID == 4) { // BtVer2Model

      if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg())

        return 1203; // JWriteZeroLatency

      if (true)

        return 1204; // WriteVecALU

    }

    break;

  case 703: // PCMPGTQrr_VPCMPGTQrr

    if (CPUID == 4) { // BtVer2Model

      if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg())

        return 1203; // JWriteZeroLatency

      if (true)

        return 1205; // WriteVecALUX

    }

    break;

  case 986: // SUB32rr_SUB64rr_XOR32rr_XOR64rr

    if (CPUID == 4) { // BtVer2Model

      if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg())

        return 1203; // JWriteZeroLatency

      if (true)

        return 1206; // WriteALU

    }

    break;

  case 987: // XORPSrr_VXORPSrr_XORPDrr_VXORPDrr_ANDNPSrr_VANDNPSrr_ANDNPDrr_VANDNPDrr

    if (CPUID == 4) { // BtVer2Model

      if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg())

        return 1203; // JWriteZeroLatency

      if (true)

        return 1207; // WriteFLogic

    }

    break;

  case 988: // MMX_PXORirr_MMX_PANDNirr

    if (CPUID == 4) { // BtVer2Model

      if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg())

        return 1203; // JWriteZeroLatency

      if (true)

        return 1208; // WriteVecLogic

    }

    break;

  case 989: // PXORrr_VPXORrr_PANDNrr_VPANDNrr

    if (CPUID == 4) { // BtVer2Model

      if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg())

        return 1203; // JWriteZeroLatency

      if (true)

        return 1209; // WriteVecLogicX

    }

    break;

  case 990: // MMX_PSUBBirr_MMX_PSUBDirr_MMX_PSUBWirr_MMX_PCMPGTBirr_MMX_PCMPGTDirr_MMX_PCMPGTWirr

    if (CPUID == 4) { // BtVer2Model

      if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg())

        return 1203; // JWriteZeroLatency

      if (true)

        return 1204; // WriteVecALU

    }

    break;

  case 991: // PSUBBrr_VPSUBBrr_PSUBDrr_VPSUBDrr_VPSUBQrr_PSUBWrr_VPSUBWrr_PCMPGTBrr_VPCMPGTBrr_PCMPGTDrr_VPCMPGTDrr_PCMPGTWrr_VPCMPGTWrr

    if (CPUID == 4) { // BtVer2Model

      if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg())

        return 1203; // JWriteZeroLatency

      if (true)

        return 1205; // WriteVecALUX

    }

    break;

  case 992: // PSUBQrr

    if (CPUID == 4) { // BtVer2Model

      if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg())

        return 1203; // JWriteZeroLatency

      if (true)

        return 1205; // WriteVecALUX

    }

    break;

  case 993: // LEA32r_LEA64r_LEA64_32r

    if (CPUID == 4) { // BtVer2Model

      if ((

          X86_MC::isThreeOperandsLEA(*MI)

          || (

            MI->getOperand(2).isImm()

            && MI->getOperand(2).getImm() != 1

          )

        ))

        return 1210; // JWrite3OpsLEA

      if (true)

        return 1211; // WriteLEA

    }

    break;

  case 996: // MMX_PCMPGTBirr_MMX_PCMPGTDirr_MMX_PCMPGTWirr

    if (CPUID == 4) { // BtVer2Model

      if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg())

        return 1203; // JWriteZeroLatency

      if (true)

        return 1204; // WriteVecALU

    }

    break;

  case 1000: // PSUBBrr_PSUBDrr_PSUBWrr_VPSUBBrr_VPSUBDrr_VPSUBQrr_VPSUBWrr

    if (CPUID == 4) { // BtVer2Model

      if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg())

        return 1203; // JWriteZeroLatency

      if (true)

        return 1205; // WriteVecALUX

    }

    break;

  };

  // Don't know how to resolve this scheduling class.

  return 0;

}

} // end of namespace X86_MC

 

struct X86GenMCSubtargetInfo : public MCSubtargetInfo {

  X86GenMCSubtargetInfo(const Triple &TT,

    StringRef CPU, StringRef FS, ArrayRef<SubtargetFeatureKV> PF,

    ArrayRef<SubtargetFeatureKV> PD,

    const SubtargetInfoKV *ProcSched,

    const MCWriteProcResEntry *WPR,

    const MCWriteLatencyEntry *WL,

    const MCReadAdvanceEntry *RA, const InstrStage *IS,

    const unsigned *OC, const unsigned *FP) :

      MCSubtargetInfo(TT, CPU, FS, PF, PD, ProcSched,

                      WPR, WL, RA, IS, OC, FP) { }

 

  unsigned resolveVariantSchedClass(unsigned SchedClass,

      const MCInst *MI, unsigned CPUID) const override {

    return X86_MC::resolveVariantSchedClassImpl(SchedClass, MI, CPUID);

  }

};

 

static inline MCSubtargetInfo *createX86MCSubtargetInfoImpl(const Triple &TT, StringRef CPU, StringRef FS) {

  return new X86GenMCSubtargetInfo(TT, CPU, FS, X86FeatureKV, X86SubTypeKV,

                      X86ProcSchedKV, X86WriteProcResTable, X86WriteLatencyTable, X86ReadAdvanceTable,

                      nullptr, nullptr, nullptr);

}

要映射的处理器是BtVer2Model,在X86ScheduleBtVer2.td文件里我们可以看到这样的定义:

631     def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBirr, MMX_PSUBDirr,

632                                                 MMX_PSUBQirr, MMX_PSUBWirr,

633                                                 MMX_PCMPGTBirr, MMX_PCMPGTDirr,

634                                                 MMX_PCMPGTWirr)>;

这个调度类的处理就是上面“case 990”处。我们来看看这个映射过程,顺便回顾一下相关的代码生成过程。首先看一下其中一个被映射调度类MMX_PSUBBirr的定义,这个定义在文件X86InstrMMX.td中:

318     defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,

319                                        SchedWriteVecALU.MMX>;

MMXI_binop_rm_int是定义在文件X86InstrMMX.td中的指令定义,在这里不是特别有趣。而MMX_PSUBB定义里的SchedWriteVecALU.MMX是一个在文件X86Schedule.td中的SchedWrite定义,描述该指令的调度细节:

567     def SchedWriteVecALU

568     : X86SchedWriteWidths<WriteVecALU, WriteVecALUX, WriteVecALUY, WriteVecALUZ>;

X86SchedWriteWidths的定义来自同一个文件,描述了不同类型指令适用的资源使用情况:

66       class X86SchedWriteWidths<X86FoldableSchedWrite sScl,

67                                 X86FoldableSchedWrite s128,

68                                 X86FoldableSchedWrite s256,

69                                 X86FoldableSchedWrite s512> {

70         X86FoldableSchedWrite Scl = sScl; // Scalar float/double operations.

71         X86FoldableSchedWrite MMX = sScl; // MMX operations.

72         X86FoldableSchedWrite XMM = s128; // XMM operations.

73         X86FoldableSchedWrite YMM = s256; // YMM operations.

74         X86FoldableSchedWrite ZMM = s512; // ZMM operations.

75       }

71行的MMX即是WriteVecALU,它来自X86Schedule.td,是MMX整型向量指令使用的资源的描述:

302     defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.

基类X86SchedWritePair也定义在同一个文件:

43       multiclass X86SchedWritePair {

44         // Register-Memory operation.

45         def Ld : SchedWrite;

46         // Register-Register operation.

47         def NAME : X86FoldableSchedWrite {

48           let Folded = !cast<SchedWrite>(NAME#"Ld");

49         }

50       }

47行给出了WriteVecALU的定义,而X86FoldableSchedWrite只是SchedWrite的简单派生定义。文件X86ScheduleBtVer2.td给出它资源使用的情况:

443     defm : JWriteResFpuPair<WriteVecALU, [JFPU01, JVALU], 1>;

JWriteResFpuPair将WriteVecALU与资源组[JFPU01, JVALU]关联起来,时延为1。132行的定义对应上面47行定义中的Folded(即45行的Ld定义),它多使用一个JLAGU资源,时延在Lat的基础上加5(133行),即6:

120     multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,

121                                 list<ProcResourceKind> ExePorts,

122                                 int Lat, list<int> Res = [], int UOps = 1> {

123       // Register variant is using a single cycle on ExePort.

124       def : WriteRes<SchedRW, ExePorts> {

125         let Latency = Lat;

126         let ResourceCycles = Res;

127         let NumMicroOps = UOps;

128       }

129    

130       // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the

131       // latency.

132       def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {

133         let Latency = !add(Lat, 5);

134         let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));

135         let NumMicroOps = UOps;

136       }

137     }

看到正是通过这些定义,指令的选择和指令的调度联系起来了。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值