disk (磁盘)

布局

sector: 磁盘存取的最小单位。在xv6中为1kb

block: 文件系统存取的最小单位,为sector的任意整数倍。在xv6中为1kb

disk_layout

  • boot block: 启动操作系统的代码
  • super block: 描述文件系统信息
  • log blocks: 日志信息
  • inode blocks: 存放所有inode
  • bitmap block: 记录data block是否空闲
  • data blocks: 存储文件和目录的内容

supber block

// kernel/fs.h
struct superblock {
  uint magic;        // Must be FSMAGIC
  uint size;         // Size of file system image (blocks)
  uint nblocks;      // Number of data blocks
  uint ninodes;      // Number of inodes.
  uint nlog;         // Number of log blocks
  uint logstart;     // Block number of first log block
  uint inodestart;   // Block number of first inode block
  uint bmapstart;    // Block number of first free map block
};
#define FSSIZE       2000  // size of file system in blocks
#define MAXOPBLOCKS  10  // max # of blocks any FS op writes
#define LOGSIZE      (MAXOPBLOCKS*3)  // max data blocks in on-disk log
#define NINODES 200
// kernel/fs.c
// there should be one superblock per disk device, 
// but we run with only one device
struct superblock sb; 

// Init fs
void
fsinit(int dev) {
  readsb(dev, &sb);
  if(sb.magic != FSMAGIC)
    panic("invalid file system");
  initlog(dev, &sb);
}

// Read the super block.
static void
readsb(int dev, struct superblock *sb)
{
  struct buf *bp;

  bp = bread(dev, 1);
  memmove(sb, bp->data, sizeof(*sb));
  brelse(bp);
}

buffer cache

struct buf

// kernel/buf.h
struct buf {
  int valid;   // has data been read from disk?
  int disk;    // does disk "own" buf?
  uint dev;
  uint blockno;
  struct sleeplock lock;
  uint refcnt;      // 
  struct buf *prev; // LRU cache list
  struct buf *next;
  uchar data[BSIZE];
};

bcache

// kernel/bio.c
struct {
  struct spinlock lock;
  struct buf buf[NBUF];

  // Linked list of all buffers, through prev/next.
  // Sorted by how recently the buffer was used.
  // head.next is most recent, head.prev is least.
  // 双向循环链表
  struct buf head;
} bcache;

bread()

// kernel/bio.c
struct buf*
bread(uint dev, uint blockno)
{
  struct buf *b;

  b = bget(dev, blockno);
  if(!b->valid) {
    virtio_disk_rw(b, 0);
    b->valid = 1;
  }
  return b;
}

bget()

// kernel/bio.c
static struct buf*
bget(uint dev, uint blockno)
{
  struct buf *b;

  acquire(&bcache.lock);

  // Is the block already cached?
  for(b = bcache.head.next; b != &bcache.head; b = b->next){
    if(b->dev == dev && b->blockno == blockno){
      b->refcnt++;
      release(&bcache.lock);
      acquiresleep(&b->lock);
      return b;
    }
  }

  // Not cached.
  // Recycle the least recently used (LRU) unused buffer.
  // 逆序遍历
  for(b = bcache.head.prev; b != &bcache.head; b = b->prev){
    if(b->refcnt == 0) {
      b->dev = dev;
      b->blockno = blockno;
      // 还未从磁盘读取数据
      b->valid = 0;
      b->refcnt = 1;
      release(&bcache.lock);
      acquiresleep(&b->lock);
      return b;
    }
  }
  panic("bget: no buffers");
}

brelese()

// kernel/bio.c
void
brelse(struct buf *b)
{
  if(!holdingsleep(&b->lock))
    panic("brelse");

  releasesleep(&b->lock);

  acquire(&bcache.lock);
  b->refcnt--;
  if (b->refcnt == 0) {
    // no one is waiting for it.
    // b移动到链表表头
    b->next->prev = b->prev;
    b->prev->next = b->next;
    b->next = bcache.head.next;
    b->prev = &bcache.head;
    bcache.head.next->prev = b;
    bcache.head.next = b;
  }
  
  release(&bcache.lock);
}

logging

why

  • case 1

    // kernel/sysfile.c
    static struct inode*
    create(char *path, short type, short major, short minor)
    {
    ...
      if((ip = ialloc(dp->dev, type)) == 0){
        iunlockput(dp);
        return 0;
      }
      <- crashed here, what will happen
    ...
    }
    

    crash会导致我们会丢失这个inode

  • case 2

    在为文件分配block时

    1. 从 data blocks 中找到一块空闲 block
    2. 将该 block number 写入到文件的 inode 中
    3. 在bitmap中标记该block已使用

    如果2,3之间 crash 会怎么样

    crash 可能会导致这个 block 被分配给多个文件

    fatal !

what

buffer cache 之上的一种机制,用来保证系统调用的原子性,同时能够在系统 crash 之后进行 Fast Recovery

how

// kernel/log.c
struct logheader {
  int n;
  int block[LOGSIZE];
};

struct log {
  struct spinlock lock;
  int start;       // start of log blocks
  int size;         // number of log blocks
  int outstanding; // how many FS sys calls are executing.
  int committing;  // in commit(), please wait.
  int dev;
  struct logheader lh;
};
struct log log;

log_layout

log 实现

  • log write4

    当需要更新 inode block 或 bitmap block 或 data block 时,我们并不直接写入到磁盘对应的位置,而是记录一条 log 到磁盘的 log 分区

    // kernel/log.c
    void
    log_write(struct buf *b)
    {
      int i;
    
      acquire(&log.lock);
      if (log.lh.n >= LOGSIZE || log.lh.n >= log.size - 1)
        panic("too big a transaction");
      if (log.outstanding < 1)
        panic("log_write outside of trans");
      // 要写入的 block number 已存在
      for (i = 0; i < log.lh.n; i++) {
        if (log.lh.block[i] == b->blockno)   // log absorption
          break;
      }
      log.lh.block[i] = b->blockno;
      if (i == log.lh.n) {  // Add new block to log?
        bpin(b);
        log.lh.n++;
      }
      // i != log.lh.n
      // log 已存在并且未 commit,nothing to do
      release(&log.lock);
    }
    
  • commit

    // kernel/log.c
    static void
    commit()
    {
      if (log.lh.n > 0) {
        write_log();     // Write modified blocks from cache to log
        write_head();    // Write header to disk -- the real commit
        install_trans(0); // Now install writes to home locations
        log.lh.n = 0;
        write_head();    // Erase the transaction from the log
      }
    }
    

    对单个 disk block 的读写具有原子性

    commit 可保证系统调用的原子性

    // kernel/log.c
    static void
    write_log(void)
    {
      int tail;
    
      for (tail = 0; tail < log.lh.n; tail++) {
        struct buf *to = bread(log.dev, log.start+tail+1); // log block
        struct buf *from = bread(log.dev, log.lh.block[tail]); // cache block
        // 将 log 中记录的缓冲块号的缓冲块复制到 log 缓冲块
        memmove(to->data, from->data, BSIZE);
        // 将 log 缓冲块写出到磁盘
        bwrite(to);  // write the log
        brelse(from);
        brelse(to);
      }
    }
    
    // kernel/log.c
    static void
    write_head(void)
    {
      struct buf *buf = bread(log.dev, log.start);
      struct logheader *hb = (struct logheader *) (buf->data);
      int i;
      // 将内存中的 logheader 复制到 log head 的缓冲块
      hb->n = log.lh.n;
      for (i = 0; i < log.lh.n; i++) {
        hb->block[i] = log.lh.block[i];
      }
      // 将 log head 的缓冲块写出到磁盘
      bwrite(buf);
      brelse(buf);
    }
    
  • install trans

    // kernel/log.c
    static void
    install_trans(int recovering)
    {
      int tail;
    
      for (tail = 0; tail < log.lh.n; tail++) {
        struct buf *lbuf = bread(log.dev, log.start+tail+1); // read log block
        struct buf *dbuf = bread(log.dev, log.lh.block[tail]); // read dst
        memmove(dbuf->data, lbuf->data, BSIZE);  // copy block to dst
        bwrite(dbuf);  // write dst to disk
        if(recovering == 0)
          bunpin(dbuf);
        brelse(lbuf);
        brelse(dbuf);
      }
    }
    
  • clean log

    // kernel/log.c
    static void
    commit()
    {
    ...
      log.lh.n = 0;
      write_head();    // Erase the transaction from the log
    }
    
  • recovery

    // kernel/log.c
    static void
    recover_from_log(void)
    {
      read_head();
      install_trans(1); // if committed, copy from log to disk
      log.lh.n = 0;
      write_head(); // clear the log
    }
    

usage

uint64
sys_open()
{
...
  // 合法性检查
  begin_op();
  ...
  log_write();
  ...
  log_write();
  ...
  end_op();
...
}
// kernel/log.c
void
begin_op(void)
{
  acquire(&log.lock);
  while(1){
    // 有系统调用正在 commit
    if(log.committing){
      sleep(&log, &log.lock);
      // 可能超出 log 大小限制
    } else if(log.lh.n + (log.outstanding+1)*MAXOPBLOCKS > LOGSIZE){
      // this op might exhaust log space; wait for commit.
      sleep(&log, &log.lock);
    } else {
      log.outstanding += 1;
      release(&log.lock);
      break;
    }
  }
}
// kernel/log.c
void
end_op(void)
{
  int do_commit = 0;

  acquire(&log.lock);
  log.outstanding -= 1;
  if(log.committing)
    panic("log.committing");
  // 所有系统调用都已经 end_op()
  if(log.outstanding == 0){
    do_commit = 1;
    log.committing = 1;
  } else {
    // begin_op() may be waiting for log space,
    // and decrementing log.outstanding has decreased
    // the amount of reserved space.
    wakeup(&log);
  }
  release(&log.lock);
  
  // if log.outstanding != 0
  // noting to do

  if(do_commit){
    // call commit w/o holding locks, since not allowed
    // to sleep with locks.
    commit();
    acquire(&log.lock);
    log.committing = 0;
    wakeup(&log);
    release(&log.lock);
  }
}