文章目录
1. Background
为了弥补内存与处理器之间巨大的速度差距,研究人员使用缓存来满足“大而快”的存储结构需求。一般来讲,缓存研究主要关注以下几个方面:
- 缓存命中率
- 缓存替换算法
- 缓存缺失代价
- 数据一致性 (多核)
此外,最近的研究还涉及缓存的安全性。由于硬件设计周期较长、代价较高,上述研究一般会先使用模拟器分析策略或算法的效率,之后再考虑硬件实现。
主流模拟器包括 gem5、ChampSim、zsim 等,这里重点理解 zsim 中的存储器层次结构,尤其是缓存的层次结构。
1.1代码结构
注:图片来自 zsim 官方PPT
2. 整体结构
2.1 基本类
2.1.1 GlobAlloc
// galloc.h
// Base class that routes object allocation and deallocation through
// gm_malloc / gm_free by overloading operator new and operator delete.
class GlobAlloc {
public:
    virtual ~GlobAlloc() {}

    // Regular new: the storage is obtained from gm_malloc.
    void* operator new(size_t bytes) { return gm_malloc(bytes); }

    // Placement new: construct inside storage supplied by the caller.
    void* operator new(size_t /*bytes*/, void* where) { return where; }

    // Regular delete: the storage is handed back through gm_free.
    void operator delete(void* obj, size_t /*bytes*/) { gm_free(obj); }

    // Placement delete... make ICC happy. This would only fire on an exception
    void operator delete(void* /*obj*/, void* /*where*/) {}
};
2.1.2 MemObject
包括了所有存储对象(cache,memory)的基类
// memory_hierarchy.h
// Common base class of every object in the memory hierarchy (caches and memories).
class MemObject : public GlobAlloc {
public:
    // Performs the access described by req. Returns response cycle.
    virtual uint64_t access(MemReq& req) = 0;

    // Skew-related access hook; the default implementation does nothing and returns 0.
    virtual uint64_t accessSkew(MemReq& req) { return 0; }

    // Statistics hook; the default implementation registers nothing.
    virtual void initStats(AggregateStat* parentStat) {}

    virtual const char* getName() = 0;

    // Nothing in this class assigns the flag, so give it a defined default instead of
    // leaving it indeterminate until some external code sets it.
    // NOTE(review): presumably marks the last-level cache -- confirm against the code that sets it.
    bool isLLC = false;

    // Skew-related hooks; every default implementation is a no-op that returns 0.
    virtual int checkSkew(MemReq& req, int *raceDetected) { return 0; }
    virtual int lockSkew(MemReq& req) { return 0; }
    virtual int unlockSkew(MemReq& req) { return 0; }
};
2.1.3 BaseCache
// Interface shared by all cache-like objects: on top of MemObject it adds
// hierarchy wiring (parents / children), invalidation and a refresh hook.
class BaseCache : public MemObject {
public:
    // Hierarchy wiring.
    virtual void setParents(uint32_t _childId, const g_vector<MemObject*>& parents, Network* network) = 0;
    virtual void setChildren(const g_vector<BaseCache*>& children, Network* network) = 0;

    // Handles an invalidation request; returns a cycle count (uint64_t).
    virtual uint64_t invalidate(const InvReq& req) = 0;

    // Refresh hook; the default implementation does nothing and reports true.
    virtual bool refresh(Address lineAddr, uint32_t lineId) {
        return true;
    }
};
BaseCache 基类一共派生出三种不同的 cache 结构:
- class Cache
- class StreamPrefetcher
- class TraceDriverProxyCache
首先关注最基本的 Cache 类
2.2 Cache 类
2.2.1 Cache 类声明
// Basic cache: ties together a coherence controller (cc), a tag array (array)
// and a replacement policy (rp), and implements the BaseCache interface.
class Cache : public BaseCache {
protected:
CC* cc; // coherence controller
CacheArray* array; // tag array: lookup / preinsert / postinsert
ReplPolicy* rp; // replacement policy
uint32_t numLines;
//Latencies
uint32_t accLat; //latency of a normal access (could split in get/put, probably not needed)
uint32_t invLat; //latency of an invalidation
g_string name;
public:
Cache(uint32_t _numLines, CC* _cc, CacheArray* _array, ReplPolicy* _rp, uint32_t _accLat,
uint32_t _invLat, const g_string& _name);
const char* getName();
// Hierarchy wiring, same signatures as in BaseCache.
void setParents(uint32_t _childId, const g_vector<MemObject*>& parents, Network* network);
void setChildren(const g_vector<BaseCache*>& children, Network* network);
void initStats(AggregateStat* parentStat);
// Services one request; returns the response cycle.
virtual uint64_t access(MemReq& req);
//NOTE: reqWriteback is pulled up to true, but not pulled down to false.
// Two-step invalidation: startInvalidate() runs first, then the result of
// finishInvalidate(req) is returned.
virtual uint64_t invalidate(const InvReq& req) {
startInvalidate();
return finishInvalidate(req);
}
protected:
void initCacheStats(AggregateStat* cacheStat);
void startInvalidate(); // grabs cc's downLock
uint64_t finishInvalidate(const InvReq& req); // performs inv and releases downLock
};
cache 类继承自 BaseCache,其中增加了一些 FTM的功能,现在对其中的主要 interface 进行描述
2.2.2 CC
Coherence controllers
// Generic, integrated controller interface
// Abstract coherence-controller interface used by Cache. It groups four families of calls:
// initialization, access handling, invalidation handling and replacement-policy queries.
class CC : public GlobAlloc {
public:
//Initialization
virtual void setParents(uint32_t childId, const g_vector<MemObject*>& parents, Network* network) = 0;
virtual void setChildren(const g_vector<BaseCache*>& children, Network* network) = 0;
virtual void initStats(AggregateStat* cacheStat) = 0;
//Access methods; see Cache for call sequence
// (Cache::access calls them in this order: startAccess, shouldAllocate, processEviction,
// processAccess, endAccess.)
virtual bool startAccess(MemReq& req) = 0; //initial locking, address races; returns true if access should be skipped; may change req!
virtual bool shouldAllocate(const MemReq& req) = 0; //called when we don't find req's lineAddr in the array
virtual uint64_t processEviction(const MemReq& triggerReq, Address wbLineAddr, int32_t lineId, uint64_t startCycle) = 0; //called iff shouldAllocate returns true
virtual uint64_t processAccess(const MemReq& req, int32_t lineId, uint64_t startCycle, uint64_t* getDoneCycle = nullptr) = 0;
virtual void endAccess(const MemReq& req) = 0;
//Inv methods
virtual void startInv() = 0;
virtual uint64_t processInv(const InvReq& req, int32_t lineId, uint64_t startCycle) = 0;
//Repl policy interface
virtual uint32_t numSharers(uint32_t lineId) = 0;
virtual bool isValid(uint32_t lineId) = 0;
};
由 CC 基类共派生出两个不同的子类:
// CC implementation that owns both a top controller (tcc) and a bottom controller (bcc).
// The rest of the class body is elided in this excerpt.
class MESICC : public CC {
MESITopCC* tcc;
MESIBottomCC* bcc;
...
};
// CC implementation that owns only a bottom controller (bcc); there is no top controller.
// The rest of the class body is elided in this excerpt.
class MESITerminalCC : public CC{
MESIBottomCC* bcc;
...
};
此外,还包括两种不同的用于管理层次结构(向上、向下)的接口
// Implements the "top" part: Keeps directory information, handles downgrades and invalidates
// (class body elided in this excerpt)
class MESITopCC : public GlobAlloc {...};
// "Bottom" counterpart of MESITopCC (class body elided in this excerpt).
// NOTE(review): presumably tracks each line's own coherence state -- the controllers call
// isValid / isExclusive / processAccess on it; confirm against the full declaration.
class MESIBottomCC : public GlobAlloc {...};
2.2.3 array
cache 储存数据部分,并提供 lookup等接口,用于插入新数据,查找等操作。
// Abstract tag-array interface: a lookup plus an insertion that is split into two
// phases (preinsert, then postinsert).
class CacheArray : public GlobAlloc {
public:
/* Returns tag's ID if present, -1 otherwise. If updateReplacement is set,
call the replacement policy's update() on the line accessed*/
virtual int32_t lookup(const Address lineAddr, const MemReq* req, bool updateReplacement) = 0;
/* Runs replacement scheme, returns tag ID of new pos and address of line to write back*/
virtual uint32_t preinsert(const Address lineAddr, const MemReq* req, Address* wbLineAddr) = 0;
/* Second phase of an insertion; Cache::access passes the ID returned by preinsert as lineId */
virtual void postinsert(const Address lineAddr, const MemReq* req, uint32_t lineId) = 0;
/* Statistics hook; the default implementation registers nothing */
virtual void initStats(AggregateStat* parent) {}
};
CacheArray 是一个抽象类(含纯虚函数,不能直接实例化),不同的结构、不同的实现方式需要派生出不同的子类,例如常用的 SetAssocArray:
// Set-associative implementation of the CacheArray interface.
class SetAssocArray : public CacheArray {
protected:
Address* array; // NOTE(review): presumably one stored line address (tag) per line -- confirm
ReplPolicy* rp; // replacement policy passed to the constructor
HashFamily* hf; // hash family passed to the constructor
uint32_t numLines;
uint32_t numSets;
uint32_t assoc;
uint32_t setMask; // NOTE(review): presumably numSets - 1, used to select a set -- confirm
public:
SetAssocArray(uint32_t _numLines, uint32_t _assoc, ReplPolicy* _rp, HashFamily* _hf);
// The three CacheArray operations; postinsert names its last parameter "candidate"
// where the base class calls it "lineId".
int32_t lookup(const Address lineAddr, const MemReq* req, bool updateReplacement);
uint32_t preinsert(const Address lineAddr, const MemReq* req, Address* wbLineAddr);
void postinsert(const Address lineAddr, const MemReq* req, uint32_t candidate);
};
2.2.4 ReplPolicy
缓存替换算法的位置,维护一些缓存状态信息
/* Generic replacement policy interface. A replacement policy receives its coherence
 * controller through setCC() and is used by the cache array. Usage:
 * - On lookups, update() is called if the replacement policy is to be updated on a hit
 * - On each replacement, rankCands() is called with the req and a list of replacement candidates.
 * - When the replacement is done, replaced() is called.
 */
class ReplPolicy : public GlobAlloc {
protected:
CC* cc; //coherence controller, used to figure out whether candidates are valid
// or number of sharers
public:
// cc stays null until setCC() is called.
ReplPolicy() : cc(nullptr) {}
virtual void setCC(CC* _cc) {cc = _cc;}
virtual void update(uint32_t id, const MemReq* req) = 0;
virtual void replaced(uint32_t id) = 0;
// One overload per candidate-set type, so these must be kept in sync with the
// array-related candidate structures (SetAssocCands, ZCands).
virtual uint32_t rankCands(const MemReq* req, SetAssocCands cands) = 0;
virtual uint32_t rankCands(const MemReq* req, ZCands cands) = 0;
virtual void initStats(AggregateStat* parent) {}
};
一些相关子类,表示替换算法的逻辑,例:LRU 替换算法
// Example replacement policy: LRU, parameterized on the compile-time flag sharersAware
// (class body elided in this excerpt).
// NOTE(review): sharersAware presumably makes the ranking consider the number of sharers -- confirm.
template <bool sharersAware>
class LRUReplPolicy : public ReplPolicy {...};
2.3 Cache 子类
zsim 中,cache 的子类共有三种:
// Cache subclass (class body elided in this excerpt).
class FilterCache : public Cache {...};
// Cache subclass (class body elided in this excerpt).
class TimingCache : public Cache {...};
// Cache subclass (class body elided in this excerpt).
class TracingCache : public Cache{...};
根据后续中提到的缓存类型实例化不同种类的 cache
3 Cache 中的重要 interface 及工作过程
3.1 Cache::access()
对于存储结构而言,比较重要的就是读写操作,在 zsim 中,采用的是 access 接口
具体的代码为:
// Services one memory request at this cache and returns the response cycle.
// Call sequence: cc->startAccess, array->lookup, then (only on a miss for which
// cc->shouldAllocate is true) array->preinsert / cc->processEviction / array->postinsert,
// then cc->processAccess and the timing-record stitching. cc->endAccess is always called,
// even when startAccess asked for the access to be skipped; in that case req.cycle is returned.
uint64_t Cache::access(MemReq& req) {
uint64_t respCycle = req.cycle;
bool skipAccess = cc->startAccess(req);
if (likely(!skipAccess)) {
// Only GETS / GETX requests update the replacement state during the lookup.
bool updateReplacement = (req.type == GETS) || (req.type == GETX);
int32_t lineId = array->lookup(req.lineAddr, &req, updateReplacement);
// The access latency is charged whether or not the lookup found the line.
respCycle += accLat;
if (lineId == -1 && cc->shouldAllocate(req)) {
//Make space for new line
Address wbLineAddr;
//find the lineId to replace
lineId = array->preinsert(req.lineAddr, &req, &wbLineAddr);
trace(Cache, "[%s] Evicting 0x%lx", name.c_str(), wbLineAddr);
//1. if needed, send invalidates/downgrades to lower level
// (the cycle returned by processEviction is not used, so respCycle is unaffected)
cc->processEviction(req, wbLineAddr, lineId, respCycle);
//do the actual insertion. NOTE: Now we must split insert into a 2-phase
// thing because cc unlocks us.
array->postinsert(req.lineAddr, &req, lineId);
}
// Enforce single-record invariant: Writeback access may have a timing
// record. If so, read it.
EventRecorder* evRec = zinfo->eventRecorders[req.srcId];
TimingRecord wbAcc;
wbAcc.clear();
if (unlikely(evRec && evRec->hasRecord())) {
wbAcc = evRec->popRecord();
}
respCycle = cc->processAccess(req, lineId, respCycle);
// Access may have generated another timing record. If *both* access
// and wb have records, stitch them together
if (unlikely(wbAcc.isValid())) {
if (!evRec->hasRecord()) {
// Downstream should not care about endEvent for PUTs
wbAcc.endEvent = nullptr;
evRec->pushRecord(wbAcc);
} else {
// Connect both events
TimingRecord acc = evRec->popRecord();
assert(wbAcc.reqCycle >= req.cycle);
assert(acc.reqCycle >= req.cycle);
// A zero-delay start event fans out into two delay events, one per record; each delay
// is the distance between req.cycle and that record's reqCycle.
DelayEvent* startEv = new (evRec) DelayEvent(0);
DelayEvent* dWbEv = new (evRec) DelayEvent(wbAcc.reqCycle-req.cycle);
DelayEvent* dAccEv = new (evRec) DelayEvent(acc.reqCycle-req.cycle);
startEv->setMinStartCycle(req.cycle);
dWbEv->setMinStartCycle(req.cycle);
dAccEv->setMinStartCycle(req.cycle);
startEv->addChild(dWbEv, evRec)->addChild(wbAcc.startEvent, evRec);
startEv->addChild(dAccEv, evRec)->addChild(acc.startEvent, evRec);
// The merged record now starts at req.cycle with the new start event.
acc.reqCycle = req.cycle;
acc.startEvent = startEv;
// endEvent / endCycle stay the same; wbAcc's endEvent not connected
evRec->pushRecord(acc);
}
}
}
cc->endAccess(req);
return respCycle;
}
[tip]:
// Branch-prediction hints for the compiler, built on __builtin_expect.
// The condition is normalized with !! so that any truthy value (pointer, masked flag,
// count) is compared against the expected 1 / 0 rather than its raw value; this also
// lets pointer conditions compile in C++. Each macro evaluates to 1 or 0.
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
3.1.1 cc->startAccess
为缓存访问提供准备,重点是一些锁的机制,以及判断访问合理性。
// may need to skip access due to races
// Prepares an access: releases the child's lock (if the request carries one), takes this
// controller's own locks (tcc before bcc), then checks for a MESI race.
// Returns true when the access should be skipped. Part of the body is elided ("...").
bool startAccess(MemReq& req) {
...
// Let go of the child's lock before acquiring our own.
if (req.childLock) {
futex_unlock(req.childLock);
}
tcc->lock(); //must lock tcc FIRST
bcc->lock();
// The race check may rewrite req.type.
bool skipAccess = CheckForMESIRace(req.type /*may change*/,
req.state, req.initialState);
return skipAccess;
}
3.1.2 cc->processAccess
根据存储器层次结构,缓存替换算法需要维护状态信息,同时下一级缓存中也应该存在处理 access 的操作
// MESICC
// Processes one access once the line slot is known and returns the response cycle.
//  - lineId == -1, or a PUTS/PUTX to a line the bottom CC reports as invalid, is handled
//    as a non-inclusive writeback by the bottom CC alone.
//  - Otherwise the bottom CC handles the access first; unless the request is a prefetch,
//    the top CC then updates its side and may request a writeback on the bottom CC.
// If getDoneCycle is non-null it receives the cycle returned by the bottom CC.
uint64_t processAccess(const MemReq& req, int32_t lineId, uint64_t startCycle,
                       uint64_t* getDoneCycle = nullptr)
{
    const bool isPut = (req.type == PUTS) || (req.type == PUTX);

    if (lineId == -1 || (isPut && !bcc->isValid(lineId))) {
        //can only be a non-inclusive wback
        assert(nonInclusiveHack);
        assert(isPut);
        return bcc->processNonInclusiveWriteback(req.lineAddr, req.type, startCycle,
                                                 req.state, req.srcId, req.flags);
    }

    //Prefetches are side requests and get handled a bit differently
    bool prefetch = req.flags & MemReq::PREFETCH;
    assert(!prefetch || req.type == GETS);
    uint32_t strippedFlags = req.flags & ~MemReq::PREFETCH;  // PREFETCH bit is not passed down

    uint64_t doneCycle = bcc->processAccess(req.lineAddr, lineId, req.type, startCycle,
                                            req.srcId, strippedFlags);
    if (getDoneCycle) {
        *getDoneCycle = doneCycle;
    }
    if (prefetch) {
        return doneCycle;
    }

    //change directory info, invalidate other children if needed,
    // tell requester about its state
    bool lowerLevelWriteback = false;
    doneCycle = tcc->processAccess(req.lineAddr, lineId, req.type, req.childId,
                                   bcc->isExclusive(lineId), req.state,
                                   &lowerLevelWriteback, doneCycle, req.srcId, strippedFlags);
    if (lowerLevelWriteback) {
        bcc->processWritebackOnAccess(req.lineAddr, lineId, req.type);
    }
    return doneCycle;
}
// MESITerminal
// Processes one access and returns the response cycle. Requires a valid lineId and a
// null getDoneCycle (both asserted).
uint64_t processAccess(const MemReq& req, int32_t lineId, uint64_t startCycle,
                       uint64_t* getDoneCycle = nullptr) {
    assert(lineId != -1);
    assert(!getDoneCycle);

    // Only a bottom CC is involved here: it fetches the line or upgrades it from the
    // upper level if needed, after which the line is in a good state w.r.t. upper levels.
    return bcc->processAccess(req.lineAddr, lineId, req.type, startCycle,
                              req.srcId, req.flags);
}
3.1.3 cc->endAccess
解锁操作
// Variant for a controller with both bcc and tcc (MESICC holds both).
// Finishes an access: re-acquires the child's lock if the request carries one, then
// releases this controller's locks, bcc first and tcc second.
void endAccess(const MemReq& req) {
//Relock child before we unlock ourselves (hand-over-hand)
if (req.childLock) {
futex_lock(req.childLock);
}
bcc->unlock();
tcc->unlock();
}
// Variant for a controller with only bcc (MESITerminalCC holds no tcc).
// Finishes an access: re-acquires the child's lock if the request carries one, then
// releases the bcc lock.
void endAccess(const MemReq& req) {
//Relock child before we unlock ourselves (hand-over-hand)
if (req.childLock) {
futex_lock(req.childLock);
}
bcc->unlock();
}
3.2 当需要发生缓存替换
- 从 array 中找到需要替换的 ID
- 维护一致性
- 写回(如果需要)
- 插入新数据
// Replacement path, excerpted from Cache::access: taken when the lookup missed
// (lineId == -1) and the coherence controller agrees to allocate.
if (lineId == -1 && cc->shouldAllocate(req)) {
//Make space for new line
Address wbLineAddr;
lineId = array->preinsert(req.lineAddr, &req, &wbLineAddr); //find the lineId to replace
//Evictions are not in the critical path in any sane implementation
// -- we do not include their delays
// NOTE: We might be "evicting" an invalid line for all we know.
// Coherence controllers will know what to do
//1. if needed, send invalidates/downgrades to lower level
cc->processEviction(req, wbLineAddr, lineId, respCycle);
//2. do the actual insertion.
// NOTE: Now we must split insert into a 2-phase thing because cc unlocks us.
array->postinsert(req.lineAddr, &req, lineId);
}