|
@ -0,0 +1,391 @@ |
|
|
|
|
|
# Cache Lab
|
|
|
|
|
|
|
|
|
|
|
|
10225501432 邓博昊 |
|
|
|
|
|
|
|
|
|
|
|
## Part A
|
|
|
|
|
|
|
|
|
|
|
|
要求:实现一个缓存模拟器,根据给定的 trace 文件来输出对应的操作 |
|
|
|
|
|
|
|
|
|
|
|
讲义提供了一个程序示例,在安装valgrind后,使用如下命令 |
|
|
|
|
|
|
|
|
|
|
|
```bash |
|
|
|
|
|
valgrind --log-fd=1 --tool=lackey -v --trace-mem=yes ls -l |
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
输出的trace文件内容如下 |
|
|
|
|
|
|
|
|
|
|
|
```bash |
|
|
|
|
|
I 04ead900,3 |
|
|
|
|
|
I 04ead903,3 |
|
|
|
|
|
I 04ead906,5 |
|
|
|
|
|
I 04ead838,3 |
|
|
|
|
|
I 04ead83b,3 |
|
|
|
|
|
I 04ead83e,5 |
|
|
|
|
|
L 1ffefff968,8 |
|
|
|
|
|
I 04ead843,3 |
|
|
|
|
|
I 04ead846,3 |
|
|
|
|
|
I 04ead849,5 |
|
|
|
|
|
L 1ffefff960,8 |
|
|
|
|
|
I 04ead84e,3 |
|
|
|
|
|
I 04ead851,3 |
|
|
|
|
|
...... |
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
trace文件中记载着每一次对内存的操作,前面的字母代表操作类型,统一的格式是: |
|
|
|
|
|
|
|
|
|
|
|
``` |
|
|
|
|
|
[空格][操作类型][空格][内存地址][逗号][大小] |
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
如果一行的第一个字符不是空格而是 I,则表示这是一次指令加载(instruction load);本实验只模拟数据缓存,因此这类行可以直接忽略。 |
|
|
|
|
|
|
|
|
|
|
|
操作类型主要分布于以下三种: |
|
|
|
|
|
|
|
|
|
|
|
1. L:读取,从内存中检索 |
|
|
|
|
|
2. S:存储,向内存中写入 |
|
|
|
|
|
3. M:修改,此过程包括一次读取及一次存储操作 |
|
|
|
|
|
|
|
|
|
|
|
地址则指向一个64位16进制内存地址;而大小则用以表示该操作所需访问的内存字节数。需要注意的是,I指令无需插入空格,而M/S/L指令之前需添加一个空格用于解析指令。 |
|
|
|
|
|
|
|
|
|
|
|
随后,实验为我们提供了一个名为csim-ref的程序,任务便是撰写一份与之功能一致的程序。 |
|
|
|
|
|
|
|
|
|
|
|
```bash |
|
|
|
|
|
Usage: ./csim-ref [-hv] -s <num> -E <num> -b <num> -t <file> |
|
|
|
|
|
Options: |
|
|
|
|
|
-h Print this help message. |
|
|
|
|
|
-v Optional verbose flag. |
|
|
|
|
|
-s <num> Number of set index bits. |
|
|
|
|
|
-E <num> Number of lines per set. |
|
|
|
|
|
-b <num> Number of block offset bits. |
|
|
|
|
|
-t <file> Trace file. |
|
|
|
|
|
|
|
|
|
|
|
Examples: |
|
|
|
|
|
linux> ./csim-ref -s 4 -E 1 -b 4 -t traces/yi.trace |
|
|
|
|
|
linux> ./csim-ref -v -s 8 -E 2 -b 4 -t traces/yi.trace |
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
**分析** |
|
|
|
|
|
`getopt`获取命令行参数 |
|
|
|
|
|
|
|
|
|
|
|
`fscanf`读入trace文件内容 |
|
|
|
|
|
|
|
|
|
|
|
`malloc`分配空间给cache |
|
|
|
|
|
|
|
|
|
|
|
数据访问带来的miss: |
|
|
|
|
|
|
|
|
|
|
|
* L:Load,数据载入,可能发生1次miss |
|
|
|
|
|
* S:Store,可能发生1次miss |
|
|
|
|
|
* M:Modify,先load后store,两次访存。第一次load可能miss(并可能伴随eviction),第二次store必定hit |
|
|
|
|
|
|
|
|
|
|
|
所以L/S指令结果是miss或者hit或者miss+eviction;而M指令结果是hit+hit或者miss+hit 或者 miss+eviction+hit |
|
|
|
|
|
|
|
|
|
|
|
### Cache结构
|
|
|
|
|
|
|
|
|
|
|
|
设计Cache基本单元为 `block`,cache由cacheblock组成 |
|
|
|
|
|
|
|
|
|
|
|
```c |
|
|
|
|
|
/*
 * One line of the simulated cache. Only bookkeeping is kept; the struct
 * has no field for the cached data itself.
 *
 * NOTE(review): tag is an unsigned (typically 32 bits) while the trace
 * format described in this report carries 64-bit addresses -- confirm that
 * truncating the tag is acceptable for the traces being graded.
 */
typedef struct cache_line
{
    unsigned tag;      /* tag bits of the address currently held by the line */
    unsigned usedtime; /* 0 = never used (treated as invalid); otherwise the
                          time stamp of the latest access, larger = more recent */
} block;

/* Base of the line array that forms the whole cache. */
block *cache;
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
其中`usedtime`用于实现LRU替换策略。初始值为0表示该行从未使用过,相当于invalid;非零值越小表示距离上次使用越久,`usedtime`最大的行就是最近刚使用的行。 |
|
|
|
|
|
|
|
|
|
|
|
### 命令行参数解析
|
|
|
|
|
|
|
|
|
|
|
|
首先对命令行参数进行解析 |
|
|
|
|
|
|
|
|
|
|
|
```C |
|
|
|
|
|
/*
 * Parse a decimal option argument that must be a non-negative int.
 * Returns 0 and stores the value in *out on success; returns -1 and leaves
 * *out untouched when the text is empty, has trailing junk, is negative or
 * does not fit in an int.
 */
static int parseCount(const char *text, int *out)
{
    char *end;
    long value = strtol(text, &end, 10);

    /* (~0u >> 1) equals the largest int on the usual targets; it is used
       here so that no extra header is required. */
    if (end == text || *end != '\0' || value < 0 || value > (long)(~0u >> 1))
        return -1;
    *out = (int)value;
    return 0;
}

/*
 * Parse the simulator command line: [-hv] -s <num> -E <num> -b <num> -t <file>.
 *
 * s, E, b    receive the numeric arguments of -s, -E and -b.
 * verbose    is set to 1 when -v is present (left untouched otherwise).
 * tracefile  receives a copy of the -t argument.
 *
 * Returns 0 when -s, -E, -b and -t were all supplied with valid values
 * (E must be at least 1), and -1 otherwise. "input error" is printed for
 * a failed parse unless the help menu was explicitly requested with -h.
 *
 * NOTE(review): the size of the tracefile buffer is not visible from here,
 * so the copy stays unbounded -- confirm the caller's buffer can hold any
 * path given on the command line.
 */
int getOpt(int argc,char **argv,int *s,int *E,int *b,int *verbose,char *tracefile)
{
    int oc;
    int bad = 0;                 /* an option was unknown or malformed */
    int help = 0;                /* -h was given */
    int seen_s = 0, seen_E = 0, seen_b = 0, seen_t = 0;

    while ((oc = getopt(argc, argv, "hvs:E:b:t:")) != -1) {
        switch (oc) {
        case 'h':
            printHelpMenu();     // print usage
            help = 1;
            break;
        case 'v':
            *verbose = 1;
            break;
        case 's':
            if (parseCount(optarg, s) == 0) seen_s = 1; else bad = 1;
            break;
        case 'E':
            /* a set with zero lines cannot hold anything */
            if (parseCount(optarg, E) == 0 && *E > 0) seen_E = 1; else bad = 1;
            break;
        case 'b':
            if (parseCount(optarg, b) == 0) seen_b = 1; else bad = 1;
            break;
        case 't':
            strcpy(tracefile, optarg);
            seen_t = 1;
            break;
        default:
            bad = 1;
            break;
        }
    }

    if (!bad && seen_s && seen_E && seen_b && seen_t)
        return 0;

    /* A bare "-h" has already printed the usage text; stay quiet then. */
    if (bad || !help)
        printf("input error\n");
    return -1;
}
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
### 初始化cache
|
|
|
|
|
|
|
|
|
|
|
|
然后初始化cache |
|
|
|
|
|
|
|
|
|
|
|
```C |
|
|
|
|
|
cache = (block *)malloc(sizeof(block)* E<<s); |
|
|
|
|
|
memset(cache,0,sizeof(block)* E<<s); |
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
### 读取文件参数
|
|
|
|
|
|
|
|
|
|
|
|
`fscanf`读取trace文件中的指令、地址 |
|
|
|
|
|
|
|
|
|
|
|
```c |
|
|
|
|
|
fp = fopen (tracefile,"r"); |
|
|
|
|
|
while(fscanf(fp,"%s%x,%d\n",op,&addr,&size) > 0){ |
|
|
|
|
|
if(verbose) |
|
|
|
|
|
printf("%s %x,%d ",op,addr,size); |
|
|
|
|
|
switch(op[0]){ |
|
|
|
|
|
case 'M': hit++; |
|
|
|
|
|
case 'L': |
|
|
|
|
|
case 'S': find(op[0],addr,size,++t); |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
### 数据访问
|
|
|
|
|
|
|
|
|
|
|
|
获取`tag`和 `set index` |
|
|
|
|
|
|
|
|
|
|
|
```C |
|
|
|
|
|
/* Tag: what remains after dropping the b block-offset bits and then the
   s set-index bits. */
unsigned tag = (addr >> b) >> s;
/* Set index: the s bits directly above the block offset. */
unsigned set_index = (addr >> b) & ((1 << s) - 1);
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
找到对应的set |
|
|
|
|
|
|
|
|
|
|
|
```c |
|
|
|
|
|
block *cache_set = cache + E * set_index ; // set address |
|
|
|
|
|
block *eviction_block = cache_set; // LRU cacheline |
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
进行数据查找:在一个set里面遍历所有cache行,其中eviction_block记录遍历过程中遇到的LRU cache行,也就是usedtime最小(但非0)的那一行。 |
|
|
|
|
|
|
|
|
|
|
|
* 如果`usedtime`!=0且tag匹配:hit |
|
|
|
|
|
* 如果`usedtime`=0,是个空block,使用这个block:miss |
|
|
|
|
|
* 如果`usedtime`!=0,tag不匹配,跟`eviction_block.usedtime`比较,如果时间更小,更新`eviction_block`=该cacheblock |
|
|
|
|
|
|
|
|
|
|
|
如果循环结束,也就证明该set的所有cache行都满了,就替换LRU cache行。 |
|
|
|
|
|
|
|
|
|
|
|
```c |
|
|
|
|
|
void find(char op, unsigned addr,unsigned size,int time){ |
|
|
|
|
|
int i; |
|
|
|
|
|
unsigned tag = addr >>b >>s ; |
|
|
|
|
|
unsigned set_index = addr >> b &((1<<s) -1); |
|
|
|
|
|
block *cache_set = cache + E * set_index ; // set address |
|
|
|
|
|
block *eviction_block = cache_set; // LRU cacheline |
|
|
|
|
|
for(i = 0;i<E;i++){ |
|
|
|
|
|
if(cache_set[i].usedtime>0 && cache_set[i].tag ==tag){ //hit |
|
|
|
|
|
cache_set[i].usedtime = time; |
|
|
|
|
|
hit++; |
|
|
|
|
|
if(verbose) cacheStateOut(op,0); |
|
|
|
|
|
return; |
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
|
|
|
else if(!cache_set[i].usedtime){ // empty block |
|
|
|
|
|
miss++; |
|
|
|
|
|
cache_set[i].tag = tag; |
|
|
|
|
|
cache_set[i].usedtime = time; |
|
|
|
|
|
if(verbose) cacheStateOut(op,1); |
|
|
|
|
|
return; |
|
|
|
|
|
} |
|
|
|
|
|
else if(cache_set[i].usedtime < eviction_block->usedtime) // !=tag , current block is older |
|
|
|
|
|
eviction_block = cache_set+i; |
|
|
|
|
|
} |
|
|
|
|
|
miss ++; |
|
|
|
|
|
eviction ++; |
|
|
|
|
|
eviction_block->tag = tag; // replace sacrifice cacheline |
|
|
|
|
|
eviction_block->usedtime = time; |
|
|
|
|
|
if(verbose) cacheStateOut(op,2); |
|
|
|
|
|
return ; |
|
|
|
|
|
} |
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## Part B
|
|
|
|
|
|
|
|
|
|
|
|
Part B 要我们实作矩阵转置,并将 cache miss 尽可能降低,Part B 的程序限制如下 |
|
|
|
|
|
|
|
|
|
|
|
- 在 stack 中至多 12 个整数型态的局部变量 |
|
|
|
|
|
- 不得使用 long 或位操作,将 2 个整数型态变量存在 1 个变量中 |
|
|
|
|
|
- 不得使用递归 |
|
|
|
|
|
- 不得修改矩阵 A ,但可以修改矩阵 B |
|
|
|
|
|
- 不得定义任何数组,也不得使用`malloc`动态分配内存 |
|
|
|
|
|
|
|
|
|
|
|
缓存参数 |
|
|
|
|
|
|
|
|
|
|
|
- 缓存大小 1KB |
|
|
|
|
|
- 采用直映射(E=1) |
|
|
|
|
|
- Block 大小为 32 Byte(b=5) |
|
|
|
|
|
- Set 共 32 组(s=5) |
|
|
|
|
|
|
|
|
|
|
|
Eviction 的策略 |
|
|
|
|
|
|
|
|
|
|
|
- 矩阵 A & B 的第一行在 cache 中为同一组 |
|
|
|
|
|
- 对角线元素互相 evict |
|
|
|
|
|
|
|
|
|
|
|
测试矩阵大小及分数 |
|
|
|
|
|
|
|
|
|
|
|
- 32 x 32: cache miss < 300 满分 |
|
|
|
|
|
- 64 x 64: cache miss < 1300 满分 |
|
|
|
|
|
- 61 x 67: cache miss < 2000 满分 |
|
|
|
|
|
|
|
|
|
|
|
**分析:** |
|
|
|
|
|
|
|
|
|
|
|
在该实验中,缓存采用的是直接映射高速缓存,s = 5,b = 5,E = 1。对于该缓存,总共存在32个组,每个组共32个字节,可以装入8个int型变量,是非常有限的缓存,矩阵大小>cache大小。 |
|
|
|
|
|
|
|
|
|
|
|
主要需要解决以下两个问题: |
|
|
|
|
|
|
|
|
|
|
|
* 直接映射缓存所带来的冲突不命中。观察程序中矩阵存储的位置即可以发现,矩阵A和矩阵B的同一行实际上被映射到了同一个缓存组。当进行对角线的引用时,一定会发生缓存的冲突不命中。需要仔细地处理对角线上的元素。 |
|
|
|
|
|
* 所需优化的矩阵的总大小超出了缓存的总大小。必然导致程序的访存效率低下。 |
|
|
|
|
|
|
|
|
|
|
|
为了解决第一个问题,我们需要仔细地考虑对于矩阵访问顺序;第二个问题,采用矩阵的分块(Blocking)方法降低miss |
|
|
|
|
|
|
|
|
|
|
|
### 32 * 32
|
|
|
|
|
|
|
|
|
|
|
|
缓存一个块的大小为 32 Bytes,可放入 8 个整数类型,又整个缓存有 32 组,代表缓存一次可以存放 32 x 8 = 256 个连续位置的整数。 对于32 x 32的矩阵来说,每行有 32 个整数,因此每隔 8 行(256/32)的元素就会映射到同一组而发生冲突,所以理想的分块大小应该为**8 x 8** |
|
|
|
|
|
|
|
|
|
|
|
另外,因为假设为直接映射,每组都只有一行,等于说只要发生冲突一定有 eviction,代表我们必需尽可能降低行替换的次数。 作业特别说明对角线元素互相evict,我们画图观察转置对角线元素会发生什么情况,为了简化以4 x 4的状况来呈现 |
|
|
|
|
|
|
|
|
|
|
|
- T1: 第一次置换,都是 cache miss |
|
|
|
|
|
- T2: 第二次置换,A 是 cache hit,但 B 矩阵第二行不在快取中为 cache miss |
|
|
|
|
|
- T3: 第二次置换,为了将 B 矩阵第二行读进快取,必需将 A 矩阵第二行替换掉 |
|
|
|
|
|
- T4: 第三次置换,因为 T3 替换了 A 矩阵第二行,在 T4 又必需加载回来 |
|
|
|
|
|
|
|
|
|
|
|
从以上分析可以发现,快取在A &B**对角线**元素的那一行发生**冲突**,所以对角线元素的替换会产生2次的miss及eviction。 |
|
|
|
|
|
|
|
|
|
|
|
简单`8 * 8`分块: |
|
|
|
|
|
|
|
|
|
|
|
```c |
|
|
|
|
|
/*
 * Plain 8 x 8 blocking for the 32-column case: the matrix is cut into
 * tiles and every tile is copied element by element, B[l][k] = A[k][l].
 * Elements are still moved one at a time, so tiles on the diagonal keep
 * alternating between a line of A and a line of B.
 */
if (M == 32) {
    for (i = 0; i < N; i += 8) {            /* tile row origin in A */
        for (j = 0; j < M; j += 8) {        /* tile column origin in A */
            for (k = i; k < N && k < i + 8; k++) {
                for (l = j; l < M && l < j + 8; l++) {
                    a0 = A[k][l];
                    B[l][k] = a0;
                }
            }
        }
    }
}
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
测试结果超过了300miss,原因是**对角线访问冲突问题** |
|
|
|
|
|
|
|
|
|
|
|
#### 对角线访问冲突问题
|
|
|
|
|
|
|
|
|
|
|
|
矩阵A和矩阵B的同一行实际上被映射到了同一个cache block。当进行对角线的引用时,一定会发生缓存的冲突不命中。并且,由于A和B的元素是一个一个处理的,必定会造成反复多次的冲突不命中。(如下图A第一个元素读miss,B第一个元素存miss,A读第二个元素miss) |
|
|
|
|
|
|
|
|
|
|
|
**解决方法:通过变量一次性读出A的一整行,再存入B** |
|
|
|
|
|
|
|
|
|
|
|
```c |
|
|
|
|
|
/*
 * 8 x 8 blocking with special handling of the tiles on the diagonal.
 * Off-diagonal tiles are copied element by element. For a diagonal tile
 * (i == j) a whole row of A is first loaded into a0..a7 and only then
 * written to B, so that the stores to B come after all eight loads of
 * that A row.
 */
for (i = 0; i < N; i += 8) {
    for (j = 0; j < M; j += 8) {
        if (i != j) {
            /* off-diagonal tile: plain element-wise copy */
            for (k = i; k < i + 8 && k < N; k++) {
                for (l = j; l < j + 8 && l < M; l++) {
                    B[l][k] = A[k][l];
                }
            }
        } else {
            /* diagonal tile: read the complete row, then scatter it */
            for (k = i; k < i + 8 && k < N; k++) {
                a0 = A[k][j];
                a1 = A[k][j + 1];
                a2 = A[k][j + 2];
                a3 = A[k][j + 3];
                a4 = A[k][j + 4];
                a5 = A[k][j + 5];
                a6 = A[k][j + 6];
                a7 = A[k][j + 7];

                B[j][k] = a0;
                B[j + 1][k] = a1;
                B[j + 2][k] = a2;
                B[j + 3][k] = a3;
                B[j + 4][k] = a4;
                B[j + 5][k] = a5;
                B[j + 6][k] = a6;
                B[j + 7][k] = a7;
            }
        }
    }
}
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
### 64 * 64
|
|
|
|
|
|
|
|
|
|
|
|
方法:将8 * 8 块再分成4个4 * 4的块进一步处理 |
|
|
|
|
|
|
|
|
|
|
|
* 首先对左上角和右上角进行处理: |
|
|
|
|
|
|
|
|
|
|
|
1. B左上角 = A左上角转置。B右上角=A右上角转置。 |
|
|
|
|
|
2. 此时B右上角暂存的是A右上角的转置,它最终应当位于B的左下角,所以之后只需要把这部分平移到B的左下角就好。 |
|
|
|
|
|
|
|
|
|
|
|
* 现在B左上角完成 |
|
|
|
|
|
|
|
|
|
|
|
1. 首先用四个变量存储A的左下角的一列。 |
|
|
|
|
|
2. 再用四个变量存储B的右上角的一行。 |
|
|
|
|
|
3. 把四个变量存储的A的左下角的一列移动到B右上角的一行 |
|
|
|
|
|
4. 把四个变量存储的B的右上角的一行平移到B左下角的一列 |
|
|
|
|
|
5. B的右下角=A的右下角转置 |
|
|
|
|
|
|
|
|
|
|
|
```c |
|
|
|
|
|
/*
 * 64 x 64 transpose: every 8 x 8 tile is handled as four 4 x 4 quadrants.
 * The statement order below is deliberate and must not be rearranged.
 */
for (i = 0; i < N; i += 8) {
    for (j = 0; j < M; j += 8) {

        /*
         * Phase 1 -- upper four rows of the A tile.
         * The left half of each row goes to its final place in the
         * upper-left quadrant of B; the right half is parked, already
         * transposed, in the upper-right quadrant of B.
         */
        for (k = i; k < i + 4; k++) {
            a0 = A[k][j];
            a1 = A[k][j + 1];
            a2 = A[k][j + 2];
            a3 = A[k][j + 3];
            a4 = A[k][j + 4];
            a5 = A[k][j + 5];
            a6 = A[k][j + 6];
            a7 = A[k][j + 7];

            B[j][k] = a0;
            B[j + 1][k] = a1;
            B[j + 2][k] = a2;
            B[j + 3][k] = a3;

            B[j][k + 4] = a4;
            B[j + 1][k + 4] = a5;
            B[j + 2][k + 4] = a6;
            B[j + 3][k + 4] = a7;
        }

        /*
         * Phase 2 -- lower four rows of the A tile, one B row pair per
         * iteration (rows l - 4 and l of B).
         */
        for (l = j + 4; l < j + 8; l++) {

            /* one column of the lower-left quadrant of A */
            a4 = A[i + 4][l - 4];
            a5 = A[i + 5][l - 4];
            a6 = A[i + 6][l - 4];
            a7 = A[i + 7][l - 4];

            /* the values parked in the upper-right quadrant of B */
            a0 = B[l - 4][i + 4];
            a1 = B[l - 4][i + 5];
            a2 = B[l - 4][i + 6];
            a3 = B[l - 4][i + 7];

            /* upper-right quadrant of B receives its final contents */
            B[l - 4][i + 4] = a4;
            B[l - 4][i + 5] = a5;
            B[l - 4][i + 6] = a6;
            B[l - 4][i + 7] = a7;

            /* parked values move down to the lower-left quadrant of B */
            B[l][i] = a0;
            B[l][i + 1] = a1;
            B[l][i + 2] = a2;
            B[l][i + 3] = a3;

            /* lower-right quadrant: direct transpose from A */
            B[l][i + 4] = A[i + 4][l];
            B[l][i + 5] = A[i + 5][l];
            B[l][i + 6] = A[i + 6][l];
            B[l][i + 7] = A[i + 7][l];
        }
    }
}
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
### 61 * 67
|
|
|
|
|
|
|
|
|
|
|
|
对于不规则的矩阵,其核心依然是通过分块的方式优化Cache的读写效率。然而,要找到非常明显的规律来判断何时能填满一个Cache却并非易事。鉴于要求较为宽松,我们无需考虑处理对角线的情况,而是直接执行转置操作。只需尝试并更换不同的边长分块就能达到期望效果。实际上,采用16 × 16的分块规模就足以确保获得满分。 |
|
|
|
|
|
|