ext4: fix suboptimal seek_{data,hole} extents traversal
author Dmitry Monakhov <dmonakhov@openvz.org>
Tue, 2 Dec 2014 23:08:53 +0000 (18:08 -0500)
committer Theodore Ts'o <tytso@mit.edu>
Tue, 2 Dec 2014 23:08:53 +0000 (18:08 -0500)
It is a ridiculous practice to scan an inode block by block; this technique
is applicable only to old indirect files. It takes a significant amount
of time for really large files. Let's reuse ext4_fiemap, which already
traverses the inode tree in the most optimal manner.

TESTCASE:
ftruncate64(fd, 0);
ftruncate64(fd, 1ULL << 40);
/* lseek will spin for a very long time */
lseek64(fd, 0, SEEK_DATA);
lseek64(fd, 0, SEEK_HOLE);

Original report: https://lkml.org/lkml/2014/10/16/620

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
fs/ext4/extents.c
fs/ext4/file.c

index bed43081720f718fc30dca204be8509ddcf5eac5..e5d3eadf47b1e7fb6251c590016044cf8b5c4c98 100644 (file)
@@ -5166,8 +5166,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 
        /* fallback to generic here if not in extents fmt */
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
-               return generic_block_fiemap(inode, fieinfo, start, len,
-                       ext4_get_block);
+               return __generic_block_fiemap(inode, fieinfo, start, len,
+                                             ext4_get_block);
 
        if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
                return -EBADR;
index 8131be8c0af3166aac865557baa9f0371564a397..513c12cf444c239f5c34bd4d73c653029bdaca96 100644 (file)
@@ -273,24 +273,19 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
  * we determine this extent as a data or a hole according to whether the
  * page cache has data or not.
  */
-static int ext4_find_unwritten_pgoff(struct inode *inode,
-                                    int whence,
-                                    struct ext4_map_blocks *map,
-                                    loff_t *offset)
+static int ext4_find_unwritten_pgoff(struct inode *inode, int whence,
+                                    loff_t endoff, loff_t *offset)
 {
        struct pagevec pvec;
-       unsigned int blkbits;
        pgoff_t index;
        pgoff_t end;
-       loff_t endoff;
        loff_t startoff;
        loff_t lastoff;
        int found = 0;
 
-       blkbits = inode->i_sb->s_blocksize_bits;
        startoff = *offset;
        lastoff = startoff;
-       endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
+
 
        index = startoff >> PAGE_CACHE_SHIFT;
        end = endoff >> PAGE_CACHE_SHIFT;
@@ -408,147 +403,144 @@ out:
 static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 {
        struct inode *inode = file->f_mapping->host;
-       struct ext4_map_blocks map;
-       struct extent_status es;
-       ext4_lblk_t start, last, end;
-       loff_t dataoff, isize;
-       int blkbits;
-       int ret = 0;
+       struct fiemap_extent_info fie;
+       struct fiemap_extent ext[2];
+       loff_t next;
+       int i, ret = 0;
 
        mutex_lock(&inode->i_mutex);
-
-       isize = i_size_read(inode);
-       if (offset >= isize) {
+       if (offset >= inode->i_size) {
                mutex_unlock(&inode->i_mutex);
                return -ENXIO;
        }
-
-       blkbits = inode->i_sb->s_blocksize_bits;
-       start = offset >> blkbits;
-       last = start;
-       end = isize >> blkbits;
-       dataoff = offset;
-
-       do {
-               map.m_lblk = last;
-               map.m_len = end - last + 1;
-               ret = ext4_map_blocks(NULL, inode, &map, 0);
-               if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
-                       if (last != start)
-                               dataoff = (loff_t)last << blkbits;
+       fie.fi_flags = 0;
+       fie.fi_extents_max = 2;
+       fie.fi_extents_start = (struct fiemap_extent __user *) &ext;
+       while (1) {
+               mm_segment_t old_fs = get_fs();
+
+               fie.fi_extents_mapped = 0;
+               memset(ext, 0, sizeof(*ext) * fie.fi_extents_max);
+
+               set_fs(get_ds());
+               ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
+               set_fs(old_fs);
+               if (ret)
                        break;
-               }
 
-               /*
-                * If there is a delay extent at this offset,
-                * it will be as a data.
-                */
-               ext4_es_find_delayed_extent_range(inode, last, last, &es);
-               if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
-                       if (last != start)
-                               dataoff = (loff_t)last << blkbits;
+               /* No extents found, EOF */
+               if (!fie.fi_extents_mapped) {
+                       ret = -ENXIO;
                        break;
                }
+               for (i = 0; i < fie.fi_extents_mapped; i++) {
+                       next = (loff_t)(ext[i].fe_length + ext[i].fe_logical);
 
-               /*
-                * If there is a unwritten extent at this offset,
-                * it will be as a data or a hole according to page
-                * cache that has data or not.
-                */
-               if (map.m_flags & EXT4_MAP_UNWRITTEN) {
-                       int unwritten;
-                       unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
-                                                             &map, &dataoff);
-                       if (unwritten)
-                               break;
-               }
+                       if (offset < (loff_t)ext[i].fe_logical)
+                               offset = (loff_t)ext[i].fe_logical;
+                       /*
+                        * If extent is not unwritten, then it contains valid
+                        * data, mapped or delayed.
+                        */
+                       if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN))
+                               goto out;
 
-               last++;
-               dataoff = (loff_t)last << blkbits;
-       } while (last <= end);
+                       /*
+                        * If there is a unwritten extent at this offset,
+                        * it will be as a data or a hole according to page
+                        * cache that has data or not.
+                        */
+                       if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
+                                                     next, &offset))
+                               goto out;
 
+                       if (ext[i].fe_flags & FIEMAP_EXTENT_LAST) {
+                               ret = -ENXIO;
+                               goto out;
+                       }
+                       offset = next;
+               }
+       }
+       if (offset > inode->i_size)
+               offset = inode->i_size;
+out:
        mutex_unlock(&inode->i_mutex);
+       if (ret)
+               return ret;
 
-       if (dataoff > isize)
-               return -ENXIO;
-
-       return vfs_setpos(file, dataoff, maxsize);
+       return vfs_setpos(file, offset, maxsize);
 }
 
 /*
- * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
+ * ext4_seek_hole() retrieves the offset for SEEK_HOLE
  */
 static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 {
        struct inode *inode = file->f_mapping->host;
-       struct ext4_map_blocks map;
-       struct extent_status es;
-       ext4_lblk_t start, last, end;
-       loff_t holeoff, isize;
-       int blkbits;
-       int ret = 0;
+       struct fiemap_extent_info fie;
+       struct fiemap_extent ext[2];
+       loff_t next;
+       int i, ret = 0;
 
        mutex_lock(&inode->i_mutex);
-
-       isize = i_size_read(inode);
-       if (offset >= isize) {
+       if (offset >= inode->i_size) {
                mutex_unlock(&inode->i_mutex);
                return -ENXIO;
        }
 
-       blkbits = inode->i_sb->s_blocksize_bits;
-       start = offset >> blkbits;
-       last = start;
-       end = isize >> blkbits;
-       holeoff = offset;
+       fie.fi_flags = 0;
+       fie.fi_extents_max = 2;
+       fie.fi_extents_start = (struct fiemap_extent __user *)&ext;
+       while (1) {
+               mm_segment_t old_fs = get_fs();
 
-       do {
-               map.m_lblk = last;
-               map.m_len = end - last + 1;
-               ret = ext4_map_blocks(NULL, inode, &map, 0);
-               if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
-                       last += ret;
-                       holeoff = (loff_t)last << blkbits;
-                       continue;
-               }
+               fie.fi_extents_mapped = 0;
+               memset(ext, 0, sizeof(*ext));
 
-               /*
-                * If there is a delay extent at this offset,
-                * we will skip this extent.
-                */
-               ext4_es_find_delayed_extent_range(inode, last, last, &es);
-               if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
-                       last = es.es_lblk + es.es_len;
-                       holeoff = (loff_t)last << blkbits;
-                       continue;
-               }
+               set_fs(get_ds());
+               ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
+               set_fs(old_fs);
+               if (ret)
+                       break;
 
-               /*
-                * If there is a unwritten extent at this offset,
-                * it will be as a data or a hole according to page
-                * cache that has data or not.
-                */
-               if (map.m_flags & EXT4_MAP_UNWRITTEN) {
-                       int unwritten;
-                       unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
-                                                             &map, &holeoff);
-                       if (!unwritten) {
-                               last += ret;
-                               holeoff = (loff_t)last << blkbits;
+               /* No extents found */
+               if (!fie.fi_extents_mapped)
+                       break;
+
+               for (i = 0; i < fie.fi_extents_mapped; i++) {
+                       next = (loff_t)(ext[i].fe_logical + ext[i].fe_length);
+                       /*
+                        * If extent is not unwritten, then it contains valid
+                        * data, mapped or delayed.
+                        */
+                       if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) {
+                               if (offset < (loff_t)ext[i].fe_logical)
+                                       goto out;
+                               offset = next;
                                continue;
                        }
-               }
-
-               /* find a hole */
-               break;
-       } while (last <= end);
+                       /*
+                        * If there is a unwritten extent at this offset,
+                        * it will be as a data or a hole according to page
+                        * cache that has data or not.
+                        */
+                       if (ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
+                                                     next, &offset))
+                               goto out;
 
+                       offset = next;
+                       if (ext[i].fe_flags & FIEMAP_EXTENT_LAST)
+                               goto out;
+               }
+       }
+       if (offset > inode->i_size)
+               offset = inode->i_size;
+out:
        mutex_unlock(&inode->i_mutex);
+       if (ret)
+               return ret;
 
-       if (holeoff > isize)
-               holeoff = isize;
-
-       return vfs_setpos(file, holeoff, maxsize);
+       return vfs_setpos(file, offset, maxsize);
 }
 
 /*