posix_fadvise中，willneed流程

yuanlanjun

浏览: 1185373 次

最近访客更多访客>>

iptcp

u012363178

libihan

yych007

博主相关

博客

微博

相册

留言

关于我

文章分类

全部博客 (1560)

社区版块

存档分类

2012-08 ( 30)
2012-07 ( 442)
2012-06 ( 101)
更多存档...

调用posix_fadvise函数，并使用POSIX_FADV_WILLNEED选项，内核会把文件中指定区域的数据从磁盘预读到page cache中。下面就来分析一下willneed的工作流程：

/* fadvise系统调用的入口 */

SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)

{

/* 省略参数合法性检查部分 */

......

/* WILLNEED选项 */

case POSIX_FADV_WILLNEED:

/* 检查是否存在readpage函数的指针 */
if (!mapping->a_ops->readpage) {
ret = -EINVAL;
break;
}

/* 计算起始的页号，结束的页号 */
/* First and last PARTIAL page! */
start_index = offset >> PAGE_CACHE_SHIFT;
end_index = endbyte >> PAGE_CACHE_SHIFT;

/* 算出要读取的页数：起始页和结束页都包含在内，所以要加1；注意这个"+1"本身可能发生溢出（回绕为0） */
/* Careful about overflow on the "+1" */
nrpages = end_index - start_index + 1;

/* 页数为0只可能是上面的"+1"溢出回绕所致，即请求的范围覆盖了全部页号；此时把页数设为最大值~0UL，之后再由max_sane_readahead()收敛到合理范围 */
if (!nrpages)
nrpages = ~0UL;

/* 调用force_page_cache_readahead函数，开始预取 */
ret = force_page_cache_readahead(mapping, file,
start_index,
nrpages);

/* 预取成功，将返回值设为0 */

if (ret > 0)
ret = 0;
break;

......

}

接下去，就来分析force_page_cache_readahead函数了。

/*
 * Force readahead of nr_to_read pages starting at page index offset.
 *
 * The request is first clamped by max_sane_readahead() (which also tames a
 * caller-supplied ~0UL) and is then issued in chunks of at most 2 megabytes,
 * so that we don't pin too much memory at once.
 *
 * Returns -EINVAL when the mapping provides neither ->readpage nor
 * ->readpages, the first negative value returned by
 * __do_page_cache_readahead() if a chunk fails, and otherwise the sum of the
 * values returned for all chunks.
 */
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
		pgoff_t offset, unsigned long nr_to_read)
{
	/* Number of pages that make up one 2 megabyte chunk. */
	const unsigned long chunk_pages = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
	unsigned long remaining;
	int total = 0;

	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
		return -EINVAL;

	for (remaining = max_sane_readahead(nr_to_read); remaining != 0; ) {
		/* Never ask for more than what is still outstanding. */
		unsigned long batch = remaining < chunk_pages ? remaining : chunk_pages;
		int err = __do_page_cache_readahead(mapping, filp, offset, batch, 0);

		/* A failing chunk aborts the walk; its error code is the result. */
		if (err < 0)
			return err;

		total += err;		/* accumulate the per-chunk result */
		offset += batch;	/* advance to the next chunk */
		remaining -= batch;
	}
	return total;
}

下面看__do_page_cache_readahead这个函数：

/*
* __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
* the pages first, then submits them all for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that.
*
* Walks the page indexes offset .. offset + nr_to_read - 1, stopping early at
* the last page of the file or when a page allocation fails. Indexes that
* already have an entry in the page cache radix tree are skipped; every other
* index gets a newly allocated page queued on a local list, and the whole
* list is then handed to read_pages().
*
* Returns the number of pages that were newly allocated and queued (0 when
* the file is empty or nothing had to be queued).
*/
static int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read,
unsigned long lookahead_size)
{
struct inode *inode = mapping->host;
struct page *page;
unsigned long end_index; /* The last page we want to read */
LIST_HEAD(page_pool);
int page_idx;
int ret = 0;
loff_t isize = i_size_read(inode); /* current size of the file */

if (isize == 0) /* empty file: nothing to read, return 0 */
goto out;

end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); /* index of the page holding the last byte of the file (not a page count) */

/*
* Preallocate as many pages as we will need.
*/

for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
pgoff_t page_offset = offset + page_idx;

if (page_offset > end_index) /* past the last page of the file: stop */
break;

rcu_read_lock(); /* the radix tree lookup below runs under the RCU read lock */

/* Look the index up in the mapping's page cache radix tree */

page = radix_tree_lookup(&mapping->page_tree, page_offset);

rcu_read_unlock();

/* An entry already exists for this index: skip it, no new page is needed */

if (page)
continue;

/* Allocate a new page; on failure stop and submit what was queued so far */
page = page_cache_alloc_cold(mapping);
if (!page)
break;

/* Record which file page this is and queue it for I/O */

page->index = page_offset;
list_add(&page->lru, &page_pool); /* queue the page on the local page_pool list */
if (page_idx == nr_to_read - lookahead_size)
SetPageReadahead(page); /* flag the page that sits lookahead_size pages before the end of the window (PG_readahead, per the original annotation) */
ret++; /* one more page queued */
}

/*
* Now start the IO. We ignore I/O errors - if the page is not
* uptodate then the caller will launch readpage again, and
* will then handle the error.
*/
if (ret)
read_pages(mapping, filp, &page_pool, ret); /* submit all queued pages; the return value is deliberately ignored */
BUG_ON(!list_empty(&page_pool)); /* every queued page must have been taken off the list by now */
out:
return ret;
}

再下一步是read_pages函数，

/*
 * Start reading the nr_pages pages queued on the list pages.
 *
 * If the mapping implements ->readpages, the whole list is handed to it in
 * one call, whatever it leaves on the list is released with
 * put_pages_list(), and its return value is passed back to the caller.
 *
 * Otherwise the pages are processed one at a time: each page is taken off
 * the list and inserted into the page cache with add_to_page_cache_lru();
 * when the insertion returns 0, ->readpage is called for it (its result is
 * not checked). page_cache_release() is called on every page either way.
 * This path always returns 0.
 */
static int read_pages(struct address_space *mapping, struct file *filp,
		struct list_head *pages, unsigned nr_pages)
{
	unsigned remaining;

	if (mapping->a_ops->readpages) {
		int err = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);

		/* Clean up the remaining pages */
		put_pages_list(pages);
		return err;
	}

	/* No batch interface available: fall back to one ->readpage per page. */
	for (remaining = nr_pages; remaining > 0; remaining--) {
		struct page *page = list_to_page(pages);

		list_del(&page->lru);
		if (add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL) == 0)
			mapping->a_ops->readpage(filp, page);
		page_cache_release(page);
	}
	return 0;
}

到此，WILLNEED对应的预取流程告一段落了。其中readpages、readpage对应的实现，要参考具体的文件系统或者块设备，比如ext3、ext4等。

分享到：