goto out;
}
- /* If the non-strided (ria_pages == 0) readahead window
- * (ria_start + ret) has grown across an RPC boundary, then trim
- * readahead size by the amount beyond the RPC so it ends on an
- * RPC boundary. If the readahead window is already ending on
- * an RPC boundary (beyond_rpc == 0), or smaller than a full
- * RPC (beyond_rpc < ret) the readahead size is unchanged.
- * The (beyond_rpc != 0) check is skipped since the conditional
- * branch is more expensive than subtracting zero from the result.
- *
- * Strided read is left unaligned to avoid small fragments beyond
- * the RPC boundary from needing an extra read RPC.
- */
- if (ria->ria_pages == 0) {
- long beyond_rpc = (ria->ria_start + ret) % PTLRPC_MAX_BRW_PAGES;
-
- if (/* beyond_rpc != 0 && */ beyond_rpc < ret)
- ret -= beyond_rpc;
- }
-
if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
atomic_sub(ret, &ra->ra_cur_pages);
ret = 0;
#define RAS_CDEBUG(ras) \
CDEBUG(D_READA, \
- "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu" \
- "csr %lu sf %lu sp %lu sl %lu\n", \
+ "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu rpc %lu " \
+ "r %lu ri %lu csr %lu sf %lu sp %lu sl %lu\n", \
ras->ras_last_readpage, ras->ras_consecutive_requests, \
ras->ras_consecutive_pages, ras->ras_window_start, \
ras->ras_window_len, ras->ras_next_readahead, \
+ ras->ras_rpc_size, \
ras->ras_requests, ras->ras_request_index, \
ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
ras->ras_stride_pages, ras->ras_stride_length)
ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\
ria->ria_pages)
-/* Limit this to the blocksize instead of PTLRPC_BRW_MAX_SIZE, since we don't
- * know what the actual RPC size is. If this needs to change, it makes more
- * sense to tune the i_blkbits value for the file based on the OSTs it is
- * striped over, rather than having a constant value for all files here.
- */
-
-/* RAS_INCREASE_STEP should be (1UL << (inode->i_blkbits - PAGE_SHIFT)).
- * Temporarily set RAS_INCREASE_STEP to 1MB. After 4MB RPC is enabled
- * by default, this should be adjusted corresponding with max_read_ahead_mb
- * and max_read_ahead_per_file_mb otherwise the readahead budget can be used
- * up quickly which will affect read performance significantly. See LU-2816
- */
-#define RAS_INCREASE_STEP(inode) (ONE_MB_BRW_SIZE >> PAGE_SHIFT)
-
static inline int stride_io_mode(struct ll_readahead_state *ras)
{
return ras->ras_consecutive_stride_requests > 1;
length);
}
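+/* Round @index down to the start of its read-ahead RPC chunk
+ * (ras->ras_rpc_size pages). If @remainder is non-NULL, it is set to
+ * the offset of @index within that chunk.
+ */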
+static unsigned long ras_align(struct ll_readahead_state *ras,
+ unsigned long index,
+ unsigned long *remainder)
+{
+ unsigned long rem = index % ras->ras_rpc_size;
+
+ if (remainder)
+ *remainder = rem;
+ return index - rem;
+}
+
/* Check whether the index is in the defined ra-window */
static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
{
ria->ria_length < ria->ria_pages);
}
-static int ll_read_ahead_pages(const struct lu_env *env,
- struct cl_io *io, struct cl_page_list *queue,
- struct ra_io_arg *ria,
- unsigned long *reserved_pages,
- pgoff_t *ra_end)
+static unsigned long
+ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
+ struct cl_page_list *queue, struct ll_readahead_state *ras,
+ struct ra_io_arg *ria)
{
struct cl_read_ahead ra = { 0 };
- int rc, count = 0;
+ unsigned long ra_end = 0;
bool stride_ria;
pgoff_t page_idx;
+ int rc;
LASSERT(ria);
RIA_DEBUG(ria);
stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0;
for (page_idx = ria->ria_start;
- page_idx <= ria->ria_end && *reserved_pages > 0; page_idx++) {
+ page_idx <= ria->ria_end && ria->ria_reserved > 0; page_idx++) {
if (ras_inside_ra_window(page_idx, ria)) {
if (!ra.cra_end || ra.cra_end < page_idx) {
+ unsigned long end;
+
cl_read_ahead_release(env, &ra);
rc = cl_io_read_ahead(env, io, page_idx, &ra);
if (rc < 0)
break;
+ CDEBUG(D_READA, "idx: %lu, ra: %lu, rpc: %lu\n",
+ page_idx, ra.cra_end, ra.cra_rpc_size);
LASSERTF(ra.cra_end >= page_idx,
"object: %p, indcies %lu / %lu\n",
io->ci_obj, ra.cra_end, page_idx);
+ /*
+ * Update the read-ahead RPC size.
+ * NB: this is racy, but a stale value only affects how the
+ * window is aligned, not correctness.
+ */
+ if (ras->ras_rpc_size > ra.cra_rpc_size &&
+ ra.cra_rpc_size > 0)
+ ras->ras_rpc_size = ra.cra_rpc_size;
+ /* trim it to align with optimal RPC size */
+ end = ras_align(ras, ria->ria_end + 1, NULL);
+ if (end > 0 && !ria->ria_eof)
+ ria->ria_end = end - 1;
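+ /* ... but still cover at least the current read, and never
+ * go past the limit returned by cl_io_read_ahead() */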
+ if (ria->ria_end < ria->ria_end_min)
+ ria->ria_end = ria->ria_end_min;
+ if (ria->ria_end > ra.cra_end)
+ ria->ria_end = ra.cra_end;
}
- /* If the page is inside the read-ahead window*/
+ /* If the page is inside the read-ahead window */
rc = ll_read_ahead_page(env, io, queue, page_idx);
- if (!rc) {
- (*reserved_pages)--;
- count++;
- }
+ if (rc < 0)
+ break;
+
+ ra_end = page_idx;
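+ /* only pages that were actually queued (rc == 0) consume
+ * the reserved read-ahead quota */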
+ if (!rc)
+ ria->ria_reserved--;
} else if (stride_ria) {
/* If it is not in the read-ahead window, and it is
* read-ahead mode, then check whether it should skip
}
cl_read_ahead_release(env, &ra);
- *ra_end = page_idx;
- return count;
+ return ra_end;
}
static int ll_readahead(const struct lu_env *env, struct cl_io *io,
struct vvp_io *vio = vvp_env_io(env);
struct ll_thread_info *lti = ll_env_info(env);
struct cl_attr *attr = vvp_env_thread_attr(env);
- unsigned long len, mlen = 0, reserved;
+ unsigned long len, mlen = 0;
pgoff_t ra_end, start = 0, end = 0;
struct inode *inode;
struct ra_io_arg *ria = &lti->lti_ria;
end < vio->vui_ra_start + vio->vui_ra_count - 1)
end = vio->vui_ra_start + vio->vui_ra_count - 1;
- if (end != 0) {
- unsigned long rpc_boundary;
- /*
- * Align RA window to an optimal boundary.
- *
- * XXX This would be better to align to cl_max_pages_per_rpc
- * instead of PTLRPC_MAX_BRW_PAGES, because the RPC size may
- * be aligned to the RAID stripe size in the future and that
- * is more important than the RPC size.
- */
- /* Note: we only trim the RPC, instead of extending the RPC
- * to the boundary, so to avoid reading too much pages during
- * random reading.
- */
- rpc_boundary = (end + 1) & (~(PTLRPC_MAX_BRW_PAGES - 1));
- if (rpc_boundary > 0)
- rpc_boundary--;
-
- if (rpc_boundary > start)
- end = rpc_boundary;
+ if (end) {
+ unsigned long end_index;
/* Truncate RA window to end of file */
- end = min(end, (unsigned long)((kms - 1) >> PAGE_SHIFT));
+ end_index = (unsigned long)((kms - 1) >> PAGE_SHIFT);
+ if (end_index <= end) {
+ end = end_index;
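+ /* the window already ends at EOF; note it so the window
+ * end is not trimmed back to an RPC boundary later */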
+ ria->ria_eof = true;
+ }
ras->ras_next_readahead = max(end, end + 1);
RAS_CDEBUG(ras);
/* at least to extend the readahead window to cover current read */
if (!hit && vio->vui_ra_valid &&
vio->vui_ra_start + vio->vui_ra_count > ria->ria_start) {
+ unsigned long remainder;
+
/* to the end of current read window. */
mlen = vio->vui_ra_start + vio->vui_ra_count - ria->ria_start;
/* trim to RPC boundary */
- start = ria->ria_start & (PTLRPC_MAX_BRW_PAGES - 1);
- mlen = min(mlen, PTLRPC_MAX_BRW_PAGES - start);
+ ras_align(ras, ria->ria_start, &remainder);
+ mlen = min(mlen, ras->ras_rpc_size - remainder);
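+ /* ... but the trimmed window must still cover the current read */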
+ ria->ria_end_min = ria->ria_start + mlen;
}
- reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len, mlen);
- if (reserved < len)
+ ria->ria_reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len, mlen);
+ if (ria->ria_reserved < len)
ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT);
CDEBUG(D_READA, "reserved pages %lu/%lu/%lu, ra_cur %d, ra_max %lu\n",
- reserved, len, mlen,
+ ria->ria_reserved, len, mlen,
atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages),
ll_i2sbi(inode)->ll_ra_info.ra_max_pages);
- ret = ll_read_ahead_pages(env, io, queue, ria, &reserved, &ra_end);
+ ra_end = ll_read_ahead_pages(env, io, queue, ras, ria);
- if (reserved != 0)
- ll_ra_count_put(ll_i2sbi(inode), reserved);
+ if (ria->ria_reserved)
+ ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved);
- if (ra_end == end + 1 && ra_end == (kms >> PAGE_SHIFT))
+ if (ra_end == end && ra_end == (kms >> PAGE_SHIFT))
ll_ra_stats_inc(inode, RA_STAT_EOF);
/* if we didn't get to the end of the region we reserved from
CDEBUG(D_READA, "ra_end = %lu end = %lu stride end = %lu pages = %d\n",
ra_end, end, ria->ria_end, ret);
- if (ra_end != end + 1) {
+ if (ra_end > 0 && ra_end != end) {
ll_ra_stats_inc(inode, RA_STAT_FAILED_REACH_END);
spin_lock(&ras->ras_lock);
- if (ra_end < ras->ras_next_readahead &&
+ if (ra_end <= ras->ras_next_readahead &&
index_in_window(ra_end, ras->ras_window_start, 0,
ras->ras_window_len)) {
- ras->ras_next_readahead = ra_end;
+ ras->ras_next_readahead = ra_end + 1;
RAS_CDEBUG(ras);
}
spin_unlock(&ras->ras_lock);
static void ras_set_start(struct inode *inode, struct ll_readahead_state *ras,
unsigned long index)
{
- ras->ras_window_start = index & (~(RAS_INCREASE_STEP(inode) - 1));
+ ras->ras_window_start = ras_align(ras, index, NULL);
}
/* called with the ras_lock held or from places where it doesn't matter */
void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
{
spin_lock_init(&ras->ras_lock);
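+ /* start with the largest RPC size; ll_read_ahead_pages() lowers
+ * this once the actual RPC size is known from the lower layers */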
+ ras->ras_rpc_size = PTLRPC_MAX_BRW_PAGES;
ras_reset(inode, ras, 0);
ras->ras_requests = 0;
}
* but current clio architecture does not support retrieve such
* information from lower layer. FIXME later
*/
- if (stride_io_mode(ras))
- ras_stride_increase_window(ras, ra, RAS_INCREASE_STEP(inode));
- else
- ras->ras_window_len = min(ras->ras_window_len +
- RAS_INCREASE_STEP(inode),
- ra->ra_max_pages_per_file);
+ if (stride_io_mode(ras)) {
+ ras_stride_increase_window(ras, ra, ras->ras_rpc_size);
+ } else {
+ unsigned long wlen;
+
+ wlen = min(ras->ras_window_len + ras->ras_rpc_size,
+ ra->ra_max_pages_per_file);
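+ /* round the grown window down to a whole number of RPCs */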
+ ras->ras_window_len = ras_align(ras, wlen, NULL);
+ }
}
static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
* instead of ras_window_start, which is RPC aligned
*/
ras->ras_next_readahead = max(index, ras->ras_next_readahead);
+ ras->ras_window_start = max(ras->ras_stride_offset,
+ ras->ras_window_start);
} else {
if (ras->ras_next_readahead < ras->ras_window_start)
ras->ras_next_readahead = ras->ras_window_start;
*/
ras->ras_next_readahead = max(index, ras->ras_next_readahead);
ras->ras_stride_offset = index;
- ras->ras_window_len = RAS_INCREASE_STEP(inode);
+ ras->ras_window_start = max(index, ras->ras_window_start);
}
/* The initial ras_window_len is set to the request size. To avoid
struct cl_2queue *queue = &io->ci_queue;
struct ll_sb_info *sbi = ll_i2sbi(inode);
struct vvp_page *vpg;
+ bool uptodate;
int rc = 0;
vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page));
+ uptodate = vpg->vpg_defer_uptodate;
+
if (sbi->ll_ra_info.ra_max_pages_per_file > 0 &&
sbi->ll_ra_info.ra_max_pages > 0) {
struct vvp_io *vio = vvp_env_io(env);
enum ras_update_flags flags = 0;
- if (vpg->vpg_defer_uptodate)
+ if (uptodate)
flags |= LL_RAS_HIT;
if (!vio->vui_ra_valid)
flags |= LL_RAS_MMAP;
ras_update(sbi, inode, ras, vvp_index(vpg), flags);
}
- if (vpg->vpg_defer_uptodate) {
+ cl_2queue_init(queue);
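+ /* a read-ahead page that is already up to date only needs to be
+ * marked up to date for the VM and released; pages that still need
+ * data are queued for read below */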
+ if (uptodate) {
vpg->vpg_ra_used = 1;
cl_page_export(env, page, 1);
+ cl_page_disown(env, io, page);
+ } else {
+ cl_page_list_add(&queue->c2_qin, page);
}
- cl_2queue_init(queue);
- /*
- * Add page into the queue even when it is marked uptodate above.
- * this will unlock it automatically as part of cl_page_list_disown().
- */
- cl_page_list_add(&queue->c2_qin, page);
if (sbi->ll_ra_info.ra_max_pages_per_file > 0 &&
sbi->ll_ra_info.ra_max_pages > 0) {
int rc2;
rc2 = ll_readahead(env, io, &queue->c2_qin, ras,
- vpg->vpg_defer_uptodate);
+ uptodate);
CDEBUG(D_READA, DFID "%d pages read ahead at %lu\n",
PFID(ll_inode2fid(inode)), rc2, vvp_index(vpg));
}