Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* |
2 | * linux/mm/madvise.c | |
3 | * | |
4 | * Copyright (C) 1999 Linus Torvalds | |
5 | * Copyright (C) 2002 Christoph Hellwig | |
6 | */ | |
7 | ||
8 | #include <linux/mman.h> | |
9 | #include <linux/pagemap.h> | |
10 | #include <linux/syscalls.h> | |
11 | #include <linux/hugetlb.h> | |
12 | ||
13 | /* | |
14 | * We can potentially split a vm area into separate | |
15 | * areas, each area with its own behavior. | |
16 | */ | |
17 | static long madvise_behavior(struct vm_area_struct * vma, unsigned long start, | |
18 | unsigned long end, int behavior) | |
19 | { | |
20 | struct mm_struct * mm = vma->vm_mm; | |
21 | int error = 0; | |
e798c6e8 PM |
22 | int new_flags = vma->vm_flags & ~VM_READHINTMASK; |
23 | ||
24 | switch (behavior) { | |
25 | case MADV_SEQUENTIAL: | |
26 | new_flags |= VM_SEQ_READ; | |
27 | break; | |
28 | case MADV_RANDOM: | |
29 | new_flags |= VM_RAND_READ; | |
30 | break; | |
31 | default: | |
32 | break; | |
33 | } | |
34 | ||
35 | if (new_flags == vma->vm_flags) | |
36 | goto out; | |
1da177e4 LT |
37 | |
38 | if (start != vma->vm_start) { | |
39 | error = split_vma(mm, vma, start, 1); | |
40 | if (error) | |
41 | goto out; | |
42 | } | |
43 | ||
44 | if (end != vma->vm_end) { | |
45 | error = split_vma(mm, vma, end, 0); | |
46 | if (error) | |
47 | goto out; | |
48 | } | |
49 | ||
50 | /* | |
51 | * vm_flags is protected by the mmap_sem held in write mode. | |
52 | */ | |
53 | VM_ClearReadHint(vma); | |
e798c6e8 | 54 | vma->vm_flags = new_flags; |
1da177e4 LT |
55 | |
56 | out: | |
57 | if (error == -ENOMEM) | |
58 | error = -EAGAIN; | |
59 | return error; | |
60 | } | |
61 | ||
62 | /* | |
63 | * Schedule all required I/O operations. Do not wait for completion. | |
64 | */ | |
65 | static long madvise_willneed(struct vm_area_struct * vma, | |
66 | unsigned long start, unsigned long end) | |
67 | { | |
68 | struct file *file = vma->vm_file; | |
69 | ||
70 | if (!file) | |
71 | return -EBADF; | |
72 | ||
73 | start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | |
74 | if (end > vma->vm_end) | |
75 | end = vma->vm_end; | |
76 | end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | |
77 | ||
78 | force_page_cache_readahead(file->f_mapping, | |
79 | file, start, max_sane_readahead(end - start)); | |
80 | return 0; | |
81 | } | |
82 | ||
83 | /* | |
84 | * Application no longer needs these pages. If the pages are dirty, | |
85 | * it's OK to just throw them away. The app will be more careful about | |
86 | * data it wants to keep. Be sure to free swap resources too. The | |
87 | * zap_page_range call sets things up for refill_inactive to actually free | |
88 | * these pages later if no one else has touched them in the meantime, | |
89 | * although we could add these pages to a global reuse list for | |
90 | * refill_inactive to pick up before reclaiming other pages. | |
91 | * | |
92 | * NB: This interface discards data rather than pushes it out to swap, | |
93 | * as some implementations do. This has performance implications for | |
94 | * applications like large transactional databases which want to discard | |
95 | * pages in anonymous maps after committing to backing store the data | |
96 | * that was kept in them. There is no reason to write this data out to | |
97 | * the swap area if the application is discarding it. | |
98 | * | |
99 | * An interface that causes the system to free clean pages and flush | |
100 | * dirty pages is already available as msync(MS_INVALIDATE). | |
101 | */ | |
102 | static long madvise_dontneed(struct vm_area_struct * vma, | |
103 | unsigned long start, unsigned long end) | |
104 | { | |
105 | if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) | |
106 | return -EINVAL; | |
107 | ||
108 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) { | |
109 | struct zap_details details = { | |
110 | .nonlinear_vma = vma, | |
111 | .last_index = ULONG_MAX, | |
112 | }; | |
113 | zap_page_range(vma, start, end - start, &details); | |
114 | } else | |
115 | zap_page_range(vma, start, end - start, NULL); | |
116 | return 0; | |
117 | } | |
118 | ||
119 | static long madvise_vma(struct vm_area_struct * vma, unsigned long start, | |
120 | unsigned long end, int behavior) | |
121 | { | |
122 | long error = -EBADF; | |
123 | ||
124 | switch (behavior) { | |
125 | case MADV_NORMAL: | |
126 | case MADV_SEQUENTIAL: | |
127 | case MADV_RANDOM: | |
128 | error = madvise_behavior(vma, start, end, behavior); | |
129 | break; | |
130 | ||
131 | case MADV_WILLNEED: | |
132 | error = madvise_willneed(vma, start, end); | |
133 | break; | |
134 | ||
135 | case MADV_DONTNEED: | |
136 | error = madvise_dontneed(vma, start, end); | |
137 | break; | |
138 | ||
139 | default: | |
140 | error = -EINVAL; | |
141 | break; | |
142 | } | |
143 | ||
144 | return error; | |
145 | } | |
146 | ||
147 | /* | |
148 | * The madvise(2) system call. | |
149 | * | |
150 | * Applications can use madvise() to advise the kernel how it should | |
151 | * handle paging I/O in this VM area. The idea is to help the kernel | |
152 | * use appropriate read-ahead and caching techniques. The information | |
153 | * provided is advisory only, and can be safely disregarded by the | |
154 | * kernel without affecting the correct operation of the application. | |
155 | * | |
156 | * behavior values: | |
157 | * MADV_NORMAL - the default behavior is to read clusters. This | |
158 | * results in some read-ahead and read-behind. | |
159 | * MADV_RANDOM - the system should read the minimum amount of data | |
160 | * on any access, since it is unlikely that the appli- | |
161 | * cation will need more than what it asks for. | |
162 | * MADV_SEQUENTIAL - pages in the given range will probably be accessed | |
163 | * once, so they can be aggressively read ahead, and | |
164 | * can be freed soon after they are accessed. | |
165 | * MADV_WILLNEED - the application is notifying the system to read | |
166 | * some pages ahead. | |
167 | * MADV_DONTNEED - the application is finished with the given range, | |
168 | * so the kernel can free resources associated with it. | |
169 | * | |
170 | * return values: | |
171 | * zero - success | |
172 | * -EINVAL - start + len < 0, start is not page-aligned, | |
173 | * "behavior" is not a valid value, or application | |
174 | * is attempting to release locked or shared pages. | |
175 | * -ENOMEM - addresses in the specified range are not currently | |
176 | * mapped, or are outside the AS of the process. | |
177 | * -EIO - an I/O error occurred while paging in data. | |
178 | * -EBADF - map exists, but area maps something that isn't a file. | |
179 | * -EAGAIN - a kernel resource was temporarily unavailable. | |
180 | */ | |
181 | asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) | |
182 | { | |
183 | unsigned long end; | |
184 | struct vm_area_struct * vma; | |
185 | int unmapped_error = 0; | |
186 | int error = -EINVAL; | |
187 | size_t len; | |
188 | ||
189 | down_write(¤t->mm->mmap_sem); | |
190 | ||
191 | if (start & ~PAGE_MASK) | |
192 | goto out; | |
193 | len = (len_in + ~PAGE_MASK) & PAGE_MASK; | |
194 | ||
195 | /* Check to see whether len was rounded up from small -ve to zero */ | |
196 | if (len_in && !len) | |
197 | goto out; | |
198 | ||
199 | end = start + len; | |
200 | if (end < start) | |
201 | goto out; | |
202 | ||
203 | error = 0; | |
204 | if (end == start) | |
205 | goto out; | |
206 | ||
207 | /* | |
208 | * If the interval [start,end) covers some unmapped address | |
209 | * ranges, just ignore them, but return -ENOMEM at the end. | |
210 | */ | |
211 | vma = find_vma(current->mm, start); | |
212 | for (;;) { | |
213 | /* Still start < end. */ | |
214 | error = -ENOMEM; | |
215 | if (!vma) | |
216 | goto out; | |
217 | ||
218 | /* Here start < vma->vm_end. */ | |
219 | if (start < vma->vm_start) { | |
220 | unmapped_error = -ENOMEM; | |
221 | start = vma->vm_start; | |
222 | } | |
223 | ||
224 | /* Here vma->vm_start <= start < vma->vm_end. */ | |
225 | if (end <= vma->vm_end) { | |
226 | if (start < end) { | |
227 | error = madvise_vma(vma, start, end, | |
228 | behavior); | |
229 | if (error) | |
230 | goto out; | |
231 | } | |
232 | error = unmapped_error; | |
233 | goto out; | |
234 | } | |
235 | ||
236 | /* Here vma->vm_start <= start < vma->vm_end < end. */ | |
237 | error = madvise_vma(vma, start, vma->vm_end, behavior); | |
238 | if (error) | |
239 | goto out; | |
240 | start = vma->vm_end; | |
241 | vma = vma->vm_next; | |
242 | } | |
243 | ||
244 | out: | |
245 | up_write(¤t->mm->mmap_sem); | |
246 | return error; | |
247 | } |