FreeBSD/Linux Kernel Cross Reference
sys/mm/filemap.c
1 /*
2 * linux/mm/filemap.c
3 *
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
6
7 /*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
11 */
12 #include <linux/module.h>
13 #include <linux/slab.h>
14 #include <linux/shm.h>
15 #include <linux/mman.h>
16 #include <linux/locks.h>
17 #include <linux/pagemap.h>
18 #include <linux/swap.h>
19 #include <linux/smp_lock.h>
20 #include <linux/blkdev.h>
21 #include <linux/file.h>
22 #include <linux/swapctl.h>
23 #include <linux/init.h>
24 #include <linux/mm.h>
25 #include <linux/iobuf.h>
26
27 #include <asm/pgalloc.h>
28 #include <asm/uaccess.h>
29 #include <asm/mman.h>
30
31 #include <linux/highmem.h>
32
33 /*
34 * Shared mappings implemented 30.11.1994. It's not fully working yet,
35 * though.
36 *
37 * Shared mappings now work. 15.8.1995 Bruno.
38 *
39 * finished 'unifying' the page and buffer cache and SMP-threaded the
40 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
41 *
42 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
43 */
44
45 atomic_t page_cache_size = ATOMIC_INIT(0);
46 unsigned int page_hash_bits;
47 struct page **page_hash_table;
48
49 int vm_max_readahead = 31;
50 int vm_min_readahead = 3;
51 EXPORT_SYMBOL(vm_max_readahead);
52 EXPORT_SYMBOL(vm_min_readahead);
53
54
55 spinlock_cacheline_t pagecache_lock_cacheline = {SPIN_LOCK_UNLOCKED};
56 /*
57 * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock
58 * with the pagecache_lock held.
59 *
60 * Ordering:
61 * swap_lock ->
62 * pagemap_lru_lock ->
63 * pagecache_lock
64 */
65 spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED};
66
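/*
 * Illustrative sketch: a hypothetical helper that follows the lock
 * ordering documented above -- pagemap_lru_lock is taken before
 * pagecache_lock and the two are released in reverse order, the same
 * pattern invalidate_inode_pages() uses below.
 */
#if 0	/* illustration only, never compiled */
static void example_walk_mapping(struct address_space *mapping)
{
	spin_lock(&pagemap_lru_lock);
	spin_lock(&pagecache_lock);

	/* ... inspect mapping->clean_pages / dirty_pages / locked_pages ... */

	spin_unlock(&pagecache_lock);
	spin_unlock(&pagemap_lru_lock);
}
#endif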
67 #define CLUSTER_PAGES (1 << page_cluster)
68 #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster)
69
70 static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p));
71 static void add_page_to_hash_queue(struct page * page, struct page **p)
72 {
73 struct page *next = *p;
74
75 *p = page;
76 page->next_hash = next;
77 page->pprev_hash = p;
78 if (next)
79 next->pprev_hash = &page->next_hash;
80 if (page->buffers)
81 PAGE_BUG(page);
82 atomic_inc(&page_cache_size);
83 }
84
85 static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
86 {
87 struct list_head *head = &mapping->clean_pages;
88
89 mapping->nrpages++;
90 list_add(&page->list, head);
91 page->mapping = mapping;
92 }
93
94 static inline void remove_page_from_inode_queue(struct page * page)
95 {
96 struct address_space * mapping = page->mapping;
97
98 if (mapping->a_ops->removepage)
99 mapping->a_ops->removepage(page);
100
101 list_del(&page->list);
102 page->mapping = NULL;
103 wmb();
104 mapping->nrpages--;
105 }
106
107 static inline void remove_page_from_hash_queue(struct page * page)
108 {
109 struct page *next = page->next_hash;
110 struct page **pprev = page->pprev_hash;
111
112 if (next)
113 next->pprev_hash = pprev;
114 *pprev = next;
115 page->pprev_hash = NULL;
116 atomic_dec(&page_cache_size);
117 }
118
119 /*
120 * Remove a page from the page cache and free it. Caller has to make
121 * sure the page is locked and that nobody else uses it - or that usage
122 * is safe.
123 */
124 void __remove_inode_page(struct page *page)
125 {
126 remove_page_from_inode_queue(page);
127 remove_page_from_hash_queue(page);
128 }
129
130 void remove_inode_page(struct page *page)
131 {
132 if (!PageLocked(page))
133 PAGE_BUG(page);
134
135 spin_lock(&pagecache_lock);
136 __remove_inode_page(page);
137 spin_unlock(&pagecache_lock);
138 }
139
140 static inline int sync_page(struct page *page)
141 {
142 struct address_space *mapping = page->mapping;
143
144 if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
145 return mapping->a_ops->sync_page(page);
146 return 0;
147 }
148
149 /*
150 * Add a page to the dirty page list.
151 */
152 void set_page_dirty(struct page *page)
153 {
154 if (!test_and_set_bit(PG_dirty, &page->flags)) {
155 struct address_space *mapping = page->mapping;
156
157 if (mapping) {
158 spin_lock(&pagecache_lock);
159 mapping = page->mapping;
160 if (mapping) { /* may have been truncated */
161 list_del(&page->list);
162 list_add(&page->list, &mapping->dirty_pages);
163 }
164 spin_unlock(&pagecache_lock);
165
166 if (mapping && mapping->host)
167 mark_inode_dirty_pages(mapping->host);
168 }
169 }
170 }
171
172 /**
173 * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
174 * @inode: the inode whose pages we want to invalidate
175 *
176 * This function only removes the unlocked pages; if you want to
177 * remove all the pages of one inode, you must call truncate_inode_pages.
178 */
179
180 void invalidate_inode_pages(struct inode * inode)
181 {
182 struct list_head *head, *curr;
183 struct page * page;
184
185 head = &inode->i_mapping->clean_pages;
186
187 spin_lock(&pagemap_lru_lock);
188 spin_lock(&pagecache_lock);
189 curr = head->next;
190
191 while (curr != head) {
192 page = list_entry(curr, struct page, list);
193 curr = curr->next;
194
195 /* We cannot invalidate something that is dirty.. */
196 if (PageDirty(page))
197 continue;
198
199 /* ..or locked */
200 if (TryLockPage(page))
201 continue;
202
203 if (page->buffers && !try_to_free_buffers(page, 0))
204 goto unlock;
205
206 if (page_count(page) != 1)
207 goto unlock;
208
209 __lru_cache_del(page);
210 __remove_inode_page(page);
211 UnlockPage(page);
212 page_cache_release(page);
213 continue;
214 unlock:
215 UnlockPage(page);
216 continue;
217 }
218
219 spin_unlock(&pagecache_lock);
220 spin_unlock(&pagemap_lru_lock);
221 }
222
223 static int do_flushpage(struct page *page, unsigned long offset)
224 {
225 int (*flushpage) (struct page *, unsigned long);
226 flushpage = page->mapping->a_ops->flushpage;
227 if (flushpage)
228 return (*flushpage)(page, offset);
229 return block_flushpage(page, offset);
230 }
231
232 static inline void truncate_partial_page(struct page *page, unsigned partial)
233 {
234 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
235 if (page->buffers)
236 do_flushpage(page, partial);
237 }
238
239 static void truncate_complete_page(struct page *page)
240 {
241 /* Leave it on the LRU if it gets converted into anonymous buffers */
242 if (!page->buffers || do_flushpage(page, 0))
243 lru_cache_del(page);
244
245 /*
246 * We remove the page from the page cache _after_ we have
247 * destroyed all buffer-cache references to it. Otherwise some
248 * other process might think this inode page is not in the
249 * page cache and creates a buffer-cache alias to it causing
250 * all sorts of fun problems ...
251 */
252 ClearPageDirty(page);
253 ClearPageUptodate(page);
254 remove_inode_page(page);
255 page_cache_release(page);
256 }
257
258 static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
259 static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
260 {
261 struct list_head *curr;
262 struct page * page;
263 int unlocked = 0;
264
265 restart:
266 curr = head->prev;
267 while (curr != head) {
268 unsigned long offset;
269
270 page = list_entry(curr, struct page, list);
271 offset = page->index;
272
273 /* Is this one of the pages to truncate? */
274 if ((offset >= start) || (*partial && (offset + 1) == start)) {
275 int failed;
276
277 page_cache_get(page);
278 failed = TryLockPage(page);
279
280 list_del(head);
281 if (!failed)
282 /* Restart after this page */
283 list_add_tail(head, curr);
284 else
285 /* Restart on this page */
286 list_add(head, curr);
287
288 spin_unlock(&pagecache_lock);
289 unlocked = 1;
290
291 if (!failed) {
292 if (*partial && (offset + 1) == start) {
293 truncate_partial_page(page, *partial);
294 *partial = 0;
295 } else
296 truncate_complete_page(page);
297
298 UnlockPage(page);
299 } else
300 wait_on_page(page);
301
302 page_cache_release(page);
303
304 if (current->need_resched) {
305 __set_current_state(TASK_RUNNING);
306 schedule();
307 }
308
309 spin_lock(&pagecache_lock);
310 goto restart;
311 }
312 curr = curr->prev;
313 }
314 return unlocked;
315 }
316
317
318 /**
319 * truncate_inode_pages - truncate *all* the pages from an offset
320 * @mapping: mapping to truncate
321 * @lstart: offset from which to truncate
322 *
323 * Truncate the page cache at a set offset, removing the pages
324 * that are beyond that offset (and zeroing out partial pages).
325 * If any page is locked we wait for it to become unlocked.
326 */
327 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
328 {
329 unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
330 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
331 int unlocked;
332
333 spin_lock(&pagecache_lock);
334 do {
335 unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial);
336 unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial);
337 unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial);
338 } while (unlocked);
339 /* Traversed all three lists without dropping the lock */
340 spin_unlock(&pagecache_lock);
341 }
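/*
 * Usage sketch: truncation is normally driven from the VM/truncate path.
 * A hypothetical caller that shrinks an inode to "newsize" updates i_size
 * first and then drops the now-stale page-cache pages beyond it:
 */
#if 0	/* illustration only */
static void example_shrink_inode(struct inode *inode, loff_t newsize)
{
	inode->i_size = newsize;
	truncate_inode_pages(inode->i_mapping, newsize);
	/* the filesystem's own truncate method then frees the disk blocks */
}
#endif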
342
343 static inline int invalidate_this_page2(struct page * page,
344 struct list_head * curr,
345 struct list_head * head)
346 {
347 int unlocked = 1;
348
349 /*
350 * The page is locked and we hold the pagecache_lock as well
351 * so both page_count(page) and page->buffers stay constant here.
352 */
353 if (page_count(page) == 1 + !!page->buffers) {
354 /* Restart after this page */
355 list_del(head);
356 list_add_tail(head, curr);
357
358 page_cache_get(page);
359 spin_unlock(&pagecache_lock);
360 truncate_complete_page(page);
361 } else {
362 if (page->buffers) {
363 /* Restart after this page */
364 list_del(head);
365 list_add_tail(head, curr);
366
367 page_cache_get(page);
368 spin_unlock(&pagecache_lock);
369 block_invalidate_page(page);
370 } else
371 unlocked = 0;
372
373 ClearPageDirty(page);
374 ClearPageUptodate(page);
375 }
376
377 return unlocked;
378 }
379
380 static int FASTCALL(invalidate_list_pages2(struct list_head *));
381 static int invalidate_list_pages2(struct list_head *head)
382 {
383 struct list_head *curr;
384 struct page * page;
385 int unlocked = 0;
386
387 restart:
388 curr = head->prev;
389 while (curr != head) {
390 page = list_entry(curr, struct page, list);
391
392 if (!TryLockPage(page)) {
393 int __unlocked;
394
395 __unlocked = invalidate_this_page2(page, curr, head);
396 UnlockPage(page);
397 unlocked |= __unlocked;
398 if (!__unlocked) {
399 curr = curr->prev;
400 continue;
401 }
402 } else {
403 /* Restart on this page */
404 list_del(head);
405 list_add(head, curr);
406
407 page_cache_get(page);
408 spin_unlock(&pagecache_lock);
409 unlocked = 1;
410 wait_on_page(page);
411 }
412
413 page_cache_release(page);
414 if (current->need_resched) {
415 __set_current_state(TASK_RUNNING);
416 schedule();
417 }
418
419 spin_lock(&pagecache_lock);
420 goto restart;
421 }
422 return unlocked;
423 }
424
425 /**
426 * invalidate_inode_pages2 - Clear the dirty bits of any pages it can't
427 * free because they're mapped.
428 * @mapping: the address_space whose pages we want to invalidate
429 */
430 void invalidate_inode_pages2(struct address_space * mapping)
431 {
432 int unlocked;
433
434 spin_lock(&pagecache_lock);
435 do {
436 unlocked = invalidate_list_pages2(&mapping->clean_pages);
437 unlocked |= invalidate_list_pages2(&mapping->dirty_pages);
438 unlocked |= invalidate_list_pages2(&mapping->locked_pages);
439 } while (unlocked);
440 spin_unlock(&pagecache_lock);
441 }
442
443 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
444 {
445 goto inside;
446
447 for (;;) {
448 page = page->next_hash;
449 inside:
450 if (!page)
451 goto not_found;
452 if (page->mapping != mapping)
453 continue;
454 if (page->index == offset)
455 break;
456 }
457
458 not_found:
459 return page;
460 }
461
462 static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *))
463 {
464 struct list_head *curr;
465 struct page *page;
466 int retval = 0;
467
468 spin_lock(&pagecache_lock);
469 curr = head->next;
470 while (curr != head) {
471 page = list_entry(curr, struct page, list);
472 curr = curr->next;
473 if (!page->buffers)
474 continue;
475 if (page->index >= end)
476 continue;
477 if (page->index < start)
478 continue;
479
480 page_cache_get(page);
481 spin_unlock(&pagecache_lock);
482 lock_page(page);
483
484 /* The buffers could have been free'd while we waited for the page lock */
485 if (page->buffers)
486 retval |= fn(page);
487
488 UnlockPage(page);
489 spin_lock(&pagecache_lock);
490 curr = page->list.next;
491 page_cache_release(page);
492 }
493 spin_unlock(&pagecache_lock);
494
495 return retval;
496 }
497
498 /*
499 * Two-stage data sync: first start the IO, then go back and
500 * collect the information..
501 */
502 int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
503 {
504 int retval;
505
506 /* writeout dirty buffers on pages from both clean and dirty lists */
507 retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
508 retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page);
509 retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page);
510
511 /* now wait for locked buffers on pages from both clean and dirty lists */
512 retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page);
513 retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page);
514 retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page);
515
516 return retval;
517 }
518
519 /*
520 * In-memory filesystems have to fail their
521 * writepage function - and this has to be
522 * worked around in the VM layer..
523 *
524 * We
525 * - mark the page dirty again (but do NOT
526 * add it back to the inode dirty list, as
527 * that would livelock in fdatasync)
528 * - activate the page so that the page stealer
529 * doesn't try to write it out over and over
530 * again.
531 */
532 int fail_writepage(struct page *page)
533 {
534 /* Only activate on memory-pressure, not fsync.. */
535 if (PageLaunder(page)) {
536 activate_page(page);
537 SetPageReferenced(page);
538 }
539
540 /* Set the page dirty again, unlock */
541 SetPageDirty(page);
542 UnlockPage(page);
543 return 0;
544 }
545
546 EXPORT_SYMBOL(fail_writepage);
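/*
 * A sketch of how an in-memory filesystem might wire fail_writepage() into
 * its address_space_operations so that fdatasync and the page stealer
 * behave as described above.  "example_aops" and "example_readpage" are
 * hypothetical names, not symbols defined in this file.
 */
#if 0	/* illustration only */
static struct address_space_operations example_aops = {
	readpage:	example_readpage,
	writepage:	fail_writepage,
};
#endif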
547
548 /**
549 * filemap_fdatasync - walk the list of dirty pages of the given address space
550 * and writepage() all of them.
551 *
552 * @mapping: address space structure to write
553 *
554 */
555 int filemap_fdatasync(struct address_space * mapping)
556 {
557 int ret = 0;
558 int (*writepage)(struct page *) = mapping->a_ops->writepage;
559
560 spin_lock(&pagecache_lock);
561
562 while (!list_empty(&mapping->dirty_pages)) {
563 struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list);
564
565 list_del(&page->list);
566 list_add(&page->list, &mapping->locked_pages);
567
568 if (!PageDirty(page))
569 continue;
570
571 page_cache_get(page);
572 spin_unlock(&pagecache_lock);
573
574 lock_page(page);
575
576 if (PageDirty(page)) {
577 int err;
578 ClearPageDirty(page);
579 err = writepage(page);
580 if (err && !ret)
581 ret = err;
582 } else
583 UnlockPage(page);
584
585 page_cache_release(page);
586 spin_lock(&pagecache_lock);
587 }
588 spin_unlock(&pagecache_lock);
589 return ret;
590 }
591
592 /**
593 * filemap_fdatawait - walk the list of locked pages of the given address space
594 * and wait for all of them.
595 *
596 * @mapping: address space structure to wait for
597 *
598 */
599 int filemap_fdatawait(struct address_space * mapping)
600 {
601 int ret = 0;
602
603 spin_lock(&pagecache_lock);
604
605 while (!list_empty(&mapping->locked_pages)) {
606 struct page *page = list_entry(mapping->locked_pages.next, struct page, list);
607
608 list_del(&page->list);
609 list_add(&page->list, &mapping->clean_pages);
610
611 if (!PageLocked(page))
612 continue;
613
614 page_cache_get(page);
615 spin_unlock(&pagecache_lock);
616
617 ___wait_on_page(page);
618 if (PageError(page))
619 ret = -EIO;
620
621 page_cache_release(page);
622 spin_lock(&pagecache_lock);
623 }
624 spin_unlock(&pagecache_lock);
625 return ret;
626 }
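/*
 * A sketch of the usual pairing of the two helpers above -- start writeback
 * of the dirty pages, flush the data buffers, then wait for the IO.  This
 * mirrors the sequence generic_file_direct_IO() uses later in this file
 * before issuing O_DIRECT IO.
 */
#if 0	/* illustration only */
static int example_flush_data(struct address_space *mapping)
{
	int err;

	err = filemap_fdatasync(mapping);
	if (err == 0)
		err = fsync_inode_data_buffers(mapping->host);
	if (err == 0)
		err = filemap_fdatawait(mapping);
	return err;
}
#endif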
627
628 /*
629 * Add a page to the inode page cache.
630 *
631 * The caller must have locked the page and
632 * set all the page flags correctly..
633 */
634 void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
635 {
636 if (!PageLocked(page))
637 BUG();
638
639 page->index = index;
640 page_cache_get(page);
641 spin_lock(&pagecache_lock);
642 add_page_to_inode_queue(mapping, page);
643 add_page_to_hash_queue(page, page_hash(mapping, index));
644 spin_unlock(&pagecache_lock);
645
646 lru_cache_add(page);
647 }
648
649 /*
650 * This adds a page to the page cache, starting out as locked,
651 * owned by us, but unreferenced, not uptodate and with no errors.
652 */
653 static inline void __add_to_page_cache(struct page * page,
654 struct address_space *mapping, unsigned long offset,
655 struct page **hash)
656 {
657 unsigned long flags;
658
659 flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked);
660 page->flags = flags | (1 << PG_locked);
661 page_cache_get(page);
662 page->index = offset;
663 add_page_to_inode_queue(mapping, page);
664 add_page_to_hash_queue(page, hash);
665 }
666
667 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
668 {
669 spin_lock(&pagecache_lock);
670 __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
671 spin_unlock(&pagecache_lock);
672 lru_cache_add(page);
673 }
674
675 int add_to_page_cache_unique(struct page * page,
676 struct address_space *mapping, unsigned long offset,
677 struct page **hash)
678 {
679 int err;
680 struct page *alias;
681
682 spin_lock(&pagecache_lock);
683 alias = __find_page_nolock(mapping, offset, *hash);
684
685 err = 1;
686 if (!alias) {
687 __add_to_page_cache(page,mapping,offset,hash);
688 err = 0;
689 }
690
691 spin_unlock(&pagecache_lock);
692 if (!err)
693 lru_cache_add(page);
694 return err;
695 }
696
697 /*
698 * This adds the requested page to the page cache if it isn't already there,
699 * and schedules an I/O to read in its contents from disk.
700 */
701 static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
702 static int page_cache_read(struct file * file, unsigned long offset)
703 {
704 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
705 struct page **hash = page_hash(mapping, offset);
706 struct page *page;
707
708 spin_lock(&pagecache_lock);
709 page = __find_page_nolock(mapping, offset, *hash);
710 spin_unlock(&pagecache_lock);
711 if (page)
712 return 0;
713
714 page = page_cache_alloc(mapping);
715 if (!page)
716 return -ENOMEM;
717
718 if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
719 int error = mapping->a_ops->readpage(file, page);
720 page_cache_release(page);
721 return error;
722 }
723 /*
724 * We arrive here in the unlikely event that someone
725 * raced with us and added our page to the cache first.
726 */
727 page_cache_release(page);
728 return 0;
729 }
730
731 /*
732 * Read in an entire cluster at once. A cluster is usually a 64k-
733 * aligned block that includes the page requested in "offset."
734 */
735 static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset,
736 unsigned long filesize));
737 static int read_cluster_nonblocking(struct file * file, unsigned long offset,
738 unsigned long filesize)
739 {
740 unsigned long pages = CLUSTER_PAGES;
741
742 offset = CLUSTER_OFFSET(offset);
743 while ((pages-- > 0) && (offset < filesize)) {
744 int error = page_cache_read(file, offset);
745 if (error < 0)
746 return error;
747 offset ++;
748 }
749
750 return 0;
751 }
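/*
 * Worked example of the cluster macros (illustrative numbers): if
 * page_cluster were 4, CLUSTER_PAGES would be 1 << 4 = 16 pages and
 * CLUSTER_OFFSET() would round an index down to a 16-page boundary,
 * e.g. CLUSTER_OFFSET(37) = (37 >> 4) << 4 = 32.  With 4K pages that is
 * the 64k-aligned block mentioned in the comment above.
 */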
752
753 /*
754 * Knuth recommends primes in approximately golden ratio to the maximum
755 * integer representable by a machine word for multiplicative hashing.
756 * Chuck Lever verified the effectiveness of this technique:
757 * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
758 *
759 * These primes are chosen to be bit-sparse, that is operations on
760 * them can use shifts and additions instead of multiplications for
761 * machines where multiplications are slow.
762 */
763 #if BITS_PER_LONG == 32
764 /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
765 #define GOLDEN_RATIO_PRIME 0x9e370001UL
766 #elif BITS_PER_LONG == 64
767 /* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
768 #define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
769 #else
770 #error Define GOLDEN_RATIO_PRIME for your wordsize.
771 #endif
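/*
 * A sketch of the multiplicative-hash technique described above: multiply
 * the key by GOLDEN_RATIO_PRIME and keep the top bits.  "table_bits" is a
 * hypothetical table-size parameter; page_waitqueue() below applies the
 * same idea with the zone's wait-table shift.
 */
#if 0	/* illustration only */
static inline unsigned long example_hash_long(unsigned long key, unsigned int table_bits)
{
	return (key * GOLDEN_RATIO_PRIME) >> (BITS_PER_LONG - table_bits);
}
#endif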
772
773 /*
774 * In order to wait for pages to become available there must be
775 * waitqueues associated with pages. By using a hash table of
776 * waitqueues where the bucket discipline is to maintain all
777 * waiters on the same queue and wake all when any of the pages
778 * become available, and for the woken contexts to check to be
779 * sure the appropriate page became available, this saves space
780 * at a cost of "thundering herd" phenomena during rare hash
781 * collisions.
782 */
783 static inline wait_queue_head_t *page_waitqueue(struct page *page)
784 {
785 const zone_t *zone = page_zone(page);
786 wait_queue_head_t *wait = zone->wait_table;
787 unsigned long hash = (unsigned long)page;
788
789 #if BITS_PER_LONG == 64
790 /* Sigh, gcc can't optimise this alone like it does for 32 bits. */
791 unsigned long n = hash;
792 n <<= 18;
793 hash -= n;
794 n <<= 33;
795 hash -= n;
796 n <<= 3;
797 hash += n;
798 n <<= 3;
799 hash -= n;
800 n <<= 4;
801 hash += n;
802 n <<= 2;
803 hash += n;
804 #else
805 /* On some cpus multiply is faster, on others gcc will do shifts */
806 hash *= GOLDEN_RATIO_PRIME;
807 #endif
808 hash >>= zone->wait_table_shift;
809
810 return &wait[hash];
811 }
812
813 /*
814 * This must be called after every submit_bh with end_io
815 * callbacks that would result in the blkdev layer waking
816 * up the page after a queue unplug.
817 */
818 void wakeup_page_waiters(struct page * page)
819 {
820 wait_queue_head_t * head;
821
822 head = page_waitqueue(page);
823 if (waitqueue_active(head))
824 wake_up(head);
825 }
826
827 /*
828 * Wait for a page to get unlocked.
829 *
830 * This must be called with the caller "holding" the page,
831 * ie with increased "page->count" so that the page won't
832 * go away during the wait..
833 *
834 * The waiting strategy is to get on a waitqueue determined
835 * by hashing. Waiters will then collide, and the newly woken
836 * task must then determine whether it was woken for the page
837 * it really wanted, and go back to sleep on the waitqueue if
838 * that wasn't it. With the waitqueue semantics, it never leaves
839 * the waitqueue until it removes itself, so the loop moves forward one
840 * iteration every time there is
841 * (1) a collision
842 * and
843 * (2) one of the colliding pages is woken
844 *
845 * This is the thundering herd problem, but it is expected to
846 * be very rare due to the few pages that are actually being
847 * waited on at any given time and the quality of the hash function.
848 */
849 void ___wait_on_page(struct page *page)
850 {
851 wait_queue_head_t *waitqueue = page_waitqueue(page);
852 struct task_struct *tsk = current;
853 DECLARE_WAITQUEUE(wait, tsk);
854
855 add_wait_queue(waitqueue, &wait);
856 do {
857 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
858 if (!PageLocked(page))
859 break;
860 sync_page(page);
861 schedule();
862 } while (PageLocked(page));
863 __set_task_state(tsk, TASK_RUNNING);
864 remove_wait_queue(waitqueue, &wait);
865 }
866
867 /*
868 * unlock_page() is the other half of the story just above
869 * ___wait_on_page(). Here a couple of quick checks are done
870 * and a couple of flags are set on the page, and then all
871 * of the waiters for all of the pages in the appropriate
872 * wait queue are woken.
873 */
874 void unlock_page(struct page *page)
875 {
876 wait_queue_head_t *waitqueue = page_waitqueue(page);
877 ClearPageLaunder(page);
878 smp_mb__before_clear_bit();
879 if (!test_and_clear_bit(PG_locked, &(page)->flags))
880 BUG();
881 smp_mb__after_clear_bit();
882
883 /*
884 * Although the default semantics of wake_up() are
885 * to wake all, here the specific function is used
886 * to make it even more explicit that a number of
887 * pages are being waited on here.
888 */
889 if (waitqueue_active(waitqueue))
890 wake_up_all(waitqueue);
891 }
892
893 /*
894 * Get a lock on the page, assuming we need to sleep
895 * to get it..
896 */
897 static void __lock_page(struct page *page)
898 {
899 wait_queue_head_t *waitqueue = page_waitqueue(page);
900 struct task_struct *tsk = current;
901 DECLARE_WAITQUEUE(wait, tsk);
902
903 add_wait_queue_exclusive(waitqueue, &wait);
904 for (;;) {
905 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
906 if (PageLocked(page)) {
907 sync_page(page);
908 schedule();
909 }
910 if (!TryLockPage(page))
911 break;
912 }
913 __set_task_state(tsk, TASK_RUNNING);
914 remove_wait_queue(waitqueue, &wait);
915 }
916
917 /*
918 * Get an exclusive lock on the page, optimistically
919 * assuming it's not locked..
920 */
921 void lock_page(struct page *page)
922 {
923 if (TryLockPage(page))
924 __lock_page(page);
925 }
926
927 /*
928 * a rather lightweight function, finding and getting a reference to a
929 * hashed page atomically.
930 */
931 struct page * __find_get_page(struct address_space *mapping,
932 unsigned long offset, struct page **hash)
933 {
934 struct page *page;
935
936 /*
937 * We scan the hash list read-only. Addition to and removal from
938 * the hash-list needs a held write-lock.
939 */
940 spin_lock(&pagecache_lock);
941 page = __find_page_nolock(mapping, offset, *hash);
942 if (page)
943 page_cache_get(page);
944 spin_unlock(&pagecache_lock);
945 return page;
946 }
947
948 /*
949 * Same as above, but trylock it instead of incrementing the count.
950 */
951 struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
952 {
953 struct page *page;
954 struct page **hash = page_hash(mapping, offset);
955
956 spin_lock(&pagecache_lock);
957 page = __find_page_nolock(mapping, offset, *hash);
958 if (page) {
959 if (TryLockPage(page))
960 page = NULL;
961 }
962 spin_unlock(&pagecache_lock);
963 return page;
964 }
965
966 /*
967 * Must be called with the pagecache lock held,
968 * will return with it held (but it may be dropped
969 * during blocking operations)..
970 */
971 static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *));
972 static struct page * __find_lock_page_helper(struct address_space *mapping,
973 unsigned long offset, struct page *hash)
974 {
975 struct page *page;
976
977 /*
978 * We scan the hash list read-only. Addition to and removal from
979 * the hash-list needs a held write-lock.
980 */
981 repeat:
982 page = __find_page_nolock(mapping, offset, hash);
983 if (page) {
984 page_cache_get(page);
985 if (TryLockPage(page)) {
986 spin_unlock(&pagecache_lock);
987 lock_page(page);
988 spin_lock(&pagecache_lock);
989
990 /* Has the page been re-allocated while we slept? */
991 if (page->mapping != mapping || page->index != offset) {
992 UnlockPage(page);
993 page_cache_release(page);
994 goto repeat;
995 }
996 }
997 }
998 return page;
999 }
1000
1001 /*
1002 * Same as the above, but lock the page too, verifying that
1003 * it's still valid once we own it.
1004 */
1005 struct page * __find_lock_page (struct address_space *mapping,
1006 unsigned long offset, struct page **hash)
1007 {
1008 struct page *page;
1009
1010 spin_lock(&pagecache_lock);
1011 page = __find_lock_page_helper(mapping, offset, *hash);
1012 spin_unlock(&pagecache_lock);
1013 return page;
1014 }
1015
1016 /*
1017 * Same as above, but create the page if required..
1018 */
1019 struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask)
1020 {
1021 struct page *page;
1022 struct page **hash = page_hash(mapping, index);
1023
1024 spin_lock(&pagecache_lock);
1025 page = __find_lock_page_helper(mapping, index, *hash);
1026 spin_unlock(&pagecache_lock);
1027 if (!page) {
1028 struct page *newpage = alloc_page(gfp_mask);
1029 if (newpage) {
1030 spin_lock(&pagecache_lock);
1031 page = __find_lock_page_helper(mapping, index, *hash);
1032 if (likely(!page)) {
1033 page = newpage;
1034 __add_to_page_cache(page, mapping, index, hash);
1035 newpage = NULL;
1036 }
1037 spin_unlock(&pagecache_lock);
1038 if (newpage == NULL)
1039 lru_cache_add(page);
1040 else
1041 page_cache_release(newpage);
1042 }
1043 }
1044 return page;
1045 }
1046
1047 /*
1048 * Same as grab_cache_page, but do not wait if the page is unavailable.
1049 * This is intended for speculative data generators, where the data can
1050 * be regenerated if the page couldn't be grabbed. This routine should
1051 * be safe to call while holding the lock for another page.
1052 */
1053 struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
1054 {
1055 struct page *page, **hash;
1056
1057 hash = page_hash(mapping, index);
1058 page = __find_get_page(mapping, index, hash);
1059
1060 if ( page ) {
1061 if ( !TryLockPage(page) ) {
1062 /* Page found and locked */
1063 /* This test is overly paranoid, but what the heck... */
1064 if ( unlikely(page->mapping != mapping || page->index != index) ) {
1065 /* Someone reallocated this page under us. */
1066 UnlockPage(page);
1067 page_cache_release(page);
1068 return NULL;
1069 } else {
1070 return page;
1071 }
1072 } else {
1073 /* Page locked by someone else */
1074 page_cache_release(page);
1075 return NULL;
1076 }
1077 }
1078
1079 page = page_cache_alloc(mapping);
1080 if ( unlikely(!page) )
1081 return NULL; /* Failed to allocate a page */
1082
1083 if ( unlikely(add_to_page_cache_unique(page, mapping, index, hash)) ) {
1084 /* Someone else grabbed the page already. */
1085 page_cache_release(page);
1086 return NULL;
1087 }
1088
1089 return page;
1090 }
1091
1092 #if 0
1093 #define PROFILE_READAHEAD
1094 #define DEBUG_READAHEAD
1095 #endif
1096
1097 /*
1098 * Read-ahead profiling information
1099 * --------------------------------
1100 * Every PROFILE_MAXREADCOUNT, the following information is written
1101 * to the syslog:
1102 * Percentage of asynchronous read-ahead.
1103 * Average values of the read-ahead context fields.
1104 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
1105 * to the syslog.
1106 */
1107
1108 #ifdef PROFILE_READAHEAD
1109
1110 #define PROFILE_MAXREADCOUNT 1000
1111
1112 static unsigned long total_reada;
1113 static unsigned long total_async;
1114 static unsigned long total_ramax;
1115 static unsigned long total_ralen;
1116 static unsigned long total_rawin;
1117
1118 static void profile_readahead(int async, struct file *filp)
1119 {
1120 unsigned long flags;
1121
1122 ++total_reada;
1123 if (async)
1124 ++total_async;
1125
1126 total_ramax += filp->f_ramax;
1127 total_ralen += filp->f_ralen;
1128 total_rawin += filp->f_rawin;
1129
1130 if (total_reada > PROFILE_MAXREADCOUNT) {
1131 save_flags(flags);
1132 cli();
1133 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
1134 restore_flags(flags);
1135 return;
1136 }
1137
1138 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
1139 total_ramax/total_reada,
1140 total_ralen/total_reada,
1141 total_rawin/total_reada,
1142 (total_async*100)/total_reada);
1143 #ifdef DEBUG_READAHEAD
1144 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
1145 filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
1146 #endif
1147
1148 total_reada = 0;
1149 total_async = 0;
1150 total_ramax = 0;
1151 total_ralen = 0;
1152 total_rawin = 0;
1153
1154 restore_flags(flags);
1155 }
1156 }
1157 #endif /* defined PROFILE_READAHEAD */
1158
1159 /*
1160 * Read-ahead context:
1161 * -------------------
1162 * The read ahead context fields of the "struct file" are the following:
1163 * - f_raend : position of the first byte after the last page we tried to
1164 * read ahead.
1165 * - f_ramax : current read-ahead maximum size.
1166 * - f_ralen : length of the current IO read block we tried to read-ahead.
1167 * - f_rawin : length of the current read-ahead window.
1168 * if last read-ahead was synchronous then
1169 * f_rawin = f_ralen
1170 * otherwise (was asynchronous)
1171 * f_rawin = previous value of f_ralen + f_ralen
1172 *
1173 * Read-ahead limits:
1174 * ------------------
1175 * MIN_READAHEAD : minimum read-ahead size when reading ahead.
1176 * MAX_READAHEAD : maximum read-ahead size when reading ahead.
1177 *
1178 * Synchronous read-ahead benefits:
1179 * --------------------------------
1180 * Using a reasonable IO xfer length from peripheral devices increases
1181 * system performance.
1182 * Reasonable means, in this context, not too large but not too small.
1183 * The actual maximum value is:
1184 * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
1185 * and 32K if defined (4K page size assumed).
1186 *
1187 * Asynchronous read-ahead benefits:
1188 * ---------------------------------
1189 * Overlapping the next read request and user process execution increases system
1190 * performance.
1191 *
1192 * Read-ahead risks:
1193 * -----------------
1194 * We have to guess which further data are needed by the user process.
1195 * If these data are often not really needed, it's bad for system
1196 * performance.
1197 * However, we know that files are often accessed sequentially by
1198 * application programs, and it seems possible to have a reasonably good
1199 * strategy for that guessing.
1200 * We only try to read-ahead files that seem to be read sequentially.
1201 *
1202 * Asynchronous read-ahead risks:
1203 * ------------------------------
1204 * In order to maximize overlapping, we must start some asynchronous read
1205 * request from the device, as soon as possible.
1206 * We must be very careful about:
1207 * - The number of effective pending IO read requests.
1208 * ONE seems to be the only reasonable value.
1209 * - The total memory pool usage for the file access stream.
1210 * This maximum memory usage is implicitly 2 IO read chunks:
1211 * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
1212 * 64k if defined (4K page size assumed).
1213 */
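/*
 * Worked example of the fields above (illustrative numbers, in page-index
 * units as the code below uses them): after a synchronous read-ahead of
 * pages 10..13, f_ralen = 4, f_raend = 14 and f_rawin = f_ralen = 4.
 * If an asynchronous read-ahead then covers pages 14..17, f_ralen is again
 * 4 and the window becomes f_rawin = previous f_ralen + f_ralen = 8,
 * i.e. pages 10..17.
 */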
1214
1215 static inline int get_max_readahead(struct inode * inode)
1216 {
1217 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
1218 return vm_max_readahead;
1219 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
1220 }
1221
1222 static void generic_file_readahead(int reada_ok,
1223 struct file * filp, struct inode * inode,
1224 struct page * page)
1225 {
1226 unsigned long end_index;
1227 unsigned long index = page->index;
1228 unsigned long max_ahead, ahead;
1229 unsigned long raend;
1230 int max_readahead = get_max_readahead(inode);
1231
1232 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1233
1234 raend = filp->f_raend;
1235 max_ahead = 0;
1236
1237 /*
1238 * The current page is locked.
1239 * If the current position is inside the previous read IO request, do not
1240 * try to reread previously read ahead pages.
1241 * Otherwise, decide whether or not to read ahead some pages synchronously.
1242 * If we are not going to read ahead, set the read ahead context for this
1243 * page only.
1244 */
1245 if (PageLocked(page)) {
1246 if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {
1247 raend = index;
1248 if (raend < end_index)
1249 max_ahead = filp->f_ramax;
1250 filp->f_rawin = 0;
1251 filp->f_ralen = 1;
1252 if (!max_ahead) {
1253 filp->f_raend = index + filp->f_ralen;
1254 filp->f_rawin += filp->f_ralen;
1255 }
1256 }
1257 }
1258 /*
1259 * The current page is not locked.
1260 * If we were reading ahead and,
1261 * if the current max read ahead size is not zero and,
1262 * if the current position is inside the last read-ahead IO request,
1263 * it is the moment to try to read ahead asynchronously.
1264 * We will later force an unplug of the device in order to force asynchronous read IO.
1265 */
1266 else if (reada_ok && filp->f_ramax && raend >= 1 &&
1267 index <= raend && index + filp->f_ralen >= raend) {
1268 /*
1269 * Add ONE page to max_ahead in order to try to have about the same IO max size
1270 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
1271 * Compute the position of the last page we have tried to read in order to
1272 * begin to read ahead just at the next page.
1273 */
1274 raend -= 1;
1275 if (raend < end_index)
1276 max_ahead = filp->f_ramax + 1;
1277
1278 if (max_ahead) {
1279 filp->f_rawin = filp->f_ralen;
1280 filp->f_ralen = 0;
1281 reada_ok = 2;
1282 }
1283 }
1284 /*
1285 * Try to read ahead pages.
1286 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and the
1287 * scheduler will work well enough for us to avoid overly bad actual IO requests.
1288 */
1289 ahead = 0;
1290 while (ahead < max_ahead) {
1291 ahead ++;
1292 if ((raend + ahead) >= end_index)
1293 break;
1294 if (page_cache_read(filp, raend + ahead) < 0)
1295 break;
1296 }
1297 /*
1298 * If we tried to read ahead some pages,
1299 * If we tried to read ahead asynchronously,
1300 * Try to force unplug of the device in order to start an asynchronous
1301 * read IO request.
1302 * Update the read-ahead context.
1303 * Store the length of the current read-ahead window.
1304 * Double the current max read ahead size.
1305 * That heuristic avoids doing large IO for files that are not really
1306 * accessed sequentially.
1307 */
1308 if (ahead) {
1309 filp->f_ralen += ahead;
1310 filp->f_rawin += filp->f_ralen;
1311 filp->f_raend = raend + ahead + 1;
1312
1313 filp->f_ramax += filp->f_ramax;
1314
1315 if (filp->f_ramax > max_readahead)
1316 filp->f_ramax = max_readahead;
1317
1318 #ifdef PROFILE_READAHEAD
1319 profile_readahead((reada_ok == 2), filp);
1320 #endif
1321 }
1322
1323 return;
1324 }
1325
1326 /*
1327 * Mark a page as having seen activity.
1328 *
1329 * If it was already so marked, move it to the active queue and drop
1330 * the referenced bit. Otherwise, just mark it for future action..
1331 */
1332 void mark_page_accessed(struct page *page)
1333 {
1334 if (!PageActive(page) && PageReferenced(page)) {
1335 activate_page(page);
1336 ClearPageReferenced(page);
1337 } else
1338 SetPageReferenced(page);
1339 }
1340
1341 /*
1342 * This is a generic file read routine, and uses the
1343 * inode->i_op->readpage() function for the actual low-level
1344 * stuff.
1345 *
1346 * This is really ugly. But the goto's actually try to clarify some
1347 * of the logic when it comes to error handling etc.
1348 */
1349 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
1350 {
1351 struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
1352 struct inode *inode = mapping->host;
1353 unsigned long index, offset;
1354 struct page *cached_page;
1355 int reada_ok;
1356 int error;
1357 int max_readahead = get_max_readahead(inode);
1358
1359 cached_page = NULL;
1360 index = *ppos >> PAGE_CACHE_SHIFT;
1361 offset = *ppos & ~PAGE_CACHE_MASK;
1362
1363 /*
1364 * If the current position is outside the previous read-ahead window,
1365 * we reset the current read-ahead context and set read ahead max to zero
1366 * (will be set to just the needed value later),
1367 * otherwise, we assume that the file accesses are sequential enough to
1368 * continue read-ahead.
1369 */
1370 if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
1371 reada_ok = 0;
1372 filp->f_raend = 0;
1373 filp->f_ralen = 0;
1374 filp->f_ramax = 0;
1375 filp->f_rawin = 0;
1376 } else {
1377 reada_ok = 1;
1378 }
1379 /*
1380 * Adjust the current value of read-ahead max.
1381 * If the read operation stays within the first half of the first page, force no readahead.
1382 * Otherwise try to increase read ahead max just enough to do the read request.
1383 * Then, at least MIN_READAHEAD if read ahead is ok,
1384 * and at most MAX_READAHEAD in all cases.
1385 */
1386 if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
1387 filp->f_ramax = 0;
1388 } else {
1389 unsigned long needed;
1390
1391 needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
1392
1393 if (filp->f_ramax < needed)
1394 filp->f_ramax = needed;
1395
1396 if (reada_ok && filp->f_ramax < vm_min_readahead)
1397 filp->f_ramax = vm_min_readahead;
1398 if (filp->f_ramax > max_readahead)
1399 filp->f_ramax = max_readahead;
1400 }
1401
1402 for (;;) {
1403 struct page *page, **hash;
1404 unsigned long end_index, nr, ret;
1405
1406 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1407
1408 if (index > end_index)
1409 break;
1410 nr = PAGE_CACHE_SIZE;
1411 if (index == end_index) {
1412 nr = inode->i_size & ~PAGE_CACHE_MASK;
1413 if (nr <= offset)
1414 break;
1415 }
1416
1417 nr = nr - offset;
1418
1419 /*
1420 * Try to find the data in the page cache..
1421 */
1422 hash = page_hash(mapping, index);
1423
1424 spin_lock(&pagecache_lock);
1425 page = __find_page_nolock(mapping, index, *hash);
1426 if (!page)
1427 goto no_cached_page;
1428 found_page:
1429 page_cache_get(page);
1430 spin_unlock(&pagecache_lock);
1431
1432 if (!Page_Uptodate(page))
1433 goto page_not_up_to_date;
1434 generic_file_readahead(reada_ok, filp, inode, page);
1435 page_ok:
1436 /* If users can be writing to this page using arbitrary
1437 * virtual addresses, take care about potential aliasing
1438 * before reading the page on the kernel side.
1439 */
1440 if (mapping->i_mmap_shared != NULL)
1441 flush_dcache_page(page);
1442
1443 /*
1444 * Mark the page accessed if we read the
1445 * beginning or we just did an lseek.
1446 */
1447 if (!offset || !filp->f_reada)
1448 mark_page_accessed(page);
1449
1450 /*
1451 * Ok, we have the page, and it's up-to-date, so
1452 * now we can copy it to user space...
1453 *
1454 * The actor routine returns how many bytes were actually used..
1455 * NOTE! This may not be the same as how much of a user buffer
1456 * we filled up (we may be padding etc), so we can only update
1457 * "pos" here (the actor routine has to update the user buffer
1458 * pointers and the remaining count).
1459 */
1460 ret = actor(desc, page, offset, nr);
1461 offset += ret;
1462 index += offset >> PAGE_CACHE_SHIFT;
1463 offset &= ~PAGE_CACHE_MASK;
1464
1465 page_cache_release(page);
1466 if (ret == nr && desc->count)
1467 continue;
1468 break;
1469
1470 /*
1471 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1472 */
1473 page_not_up_to_date:
1474 generic_file_readahead(reada_ok, filp, inode, page);
1475
1476 if (Page_Uptodate(page))
1477 goto page_ok;
1478
1479 /* Get exclusive access to the page ... */
1480 lock_page(page);
1481
1482 /* Did it get unhashed before we got the lock? */
1483 if (!page->mapping) {
1484 UnlockPage(page);
1485 page_cache_release(page);
1486 continue;
1487 }
1488
1489 /* Did somebody else fill it already? */
1490 if (Page_Uptodate(page)) {
1491 UnlockPage(page);
1492 goto page_ok;
1493 }
1494
1495 readpage:
1496 /* ... and start the actual read. The read will unlock the page. */
1497 error = mapping->a_ops->readpage(filp, page);
1498
1499 if (!error) {
1500 if (Page_Uptodate(page))
1501 goto page_ok;
1502
1503 /* Again, try some read-ahead while waiting for the page to finish.. */
1504 generic_file_readahead(reada_ok, filp, inode, page);
1505 wait_on_page(page);
1506 if (Page_Uptodate(page))
1507 goto page_ok;
1508 error = -EIO;
1509 }
1510
1511 /* UHHUH! A synchronous read error occurred. Report it */
1512 desc->error = error;
1513 page_cache_release(page);
1514 break;
1515
1516 no_cached_page:
1517 /*
1518 * Ok, it wasn't cached, so we need to create a new
1519 * page..
1520 *
1521 * We get here with the page cache lock held.
1522 */
1523 if (!cached_page) {
1524 spin_unlock(&pagecache_lock);
1525 cached_page = page_cache_alloc(mapping);
1526 if (!cached_page) {
1527 desc->error = -ENOMEM;
1528 break;
1529 }
1530
1531 /*
1532 * Somebody may have added the page while we
1533 * dropped the page cache lock. Check for that.
1534 */
1535 spin_lock(&pagecache_lock);
1536 page = __find_page_nolock(mapping, index, *hash);
1537 if (page)
1538 goto found_page;
1539 }
1540
1541 /*
1542 * Ok, add the new page to the hash-queues...
1543 */
1544 page = cached_page;
1545 __add_to_page_cache(page, mapping, index, hash);
1546 spin_unlock(&pagecache_lock);
1547 lru_cache_add(page);
1548 cached_page = NULL;
1549
1550 goto readpage;
1551 }
1552
1553 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1554 filp->f_reada = 1;
1555 if (cached_page)
1556 page_cache_release(cached_page);
1557 UPDATE_ATIME(inode);
1558 }
1559
1560 static inline int have_mapping_directIO(struct address_space * mapping)
1561 {
1562 return mapping->a_ops->direct_IO || mapping->a_ops->direct_fileIO;
1563 }
1564
1565 /* Switch between old and new directIO formats */
1566 static inline int do_call_directIO(int rw, struct file *filp, struct kiobuf *iobuf, unsigned long offset, int blocksize)
1567 {
1568 struct address_space * mapping = filp->f_dentry->d_inode->i_mapping;
1569
1570 if (mapping->a_ops->direct_fileIO)
1571 return mapping->a_ops->direct_fileIO(rw, filp, iobuf, offset, blocksize);
1572 return mapping->a_ops->direct_IO(rw, mapping->host, iobuf, offset, blocksize);
1573 }
1574
1575 /*
1576 * i_sem and i_alloc_sem should be held already. i_sem may be dropped
1577 * later once we've mapped the new IO. i_alloc_sem is kept until the IO
1578 * completes.
1579 */
1580
1581 static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
1582 {
1583 ssize_t retval;
1584 int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress;
1585 struct kiobuf * iobuf;
1586 struct address_space * mapping = filp->f_dentry->d_inode->i_mapping;
1587 struct inode * inode = mapping->host;
1588 loff_t size = inode->i_size;
1589
1590 new_iobuf = 0;
1591 iobuf = filp->f_iobuf;
1592 if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
1593 /*
1594 * A parallel read/write is using the preallocated iobuf
1595 * so just run slow and allocate a new one.
1596 */
1597 retval = alloc_kiovec(1, &iobuf);
1598 if (retval)
1599 goto out;
1600 new_iobuf = 1;
1601 }
1602
1603 blocksize = 1 << inode->i_blkbits;
1604 blocksize_bits = inode->i_blkbits;
1605 blocksize_mask = blocksize - 1;
1606 chunk_size = KIO_MAX_ATOMIC_IO << 10;
1607
1608 retval = -EINVAL;
1609 if ((offset & blocksize_mask) || (count & blocksize_mask) || ((unsigned long) buf & blocksize_mask))
1610 goto out_free;
1611 if (!have_mapping_directIO(mapping))
1612 goto out_free;
1613
1614 if ((rw == READ) && (offset + count > size))
1615 count = size - offset;
1616
1617 /*
1618 * Flush to disk exclusively the _data_; metadata must remain
1619 * completely asynchronous or performance will go to /dev/null.
1620 */
1621 retval = filemap_fdatasync(mapping);
1622 if (retval == 0)
1623 retval = fsync_inode_data_buffers(inode);
1624 if (retval == 0)
1625 retval = filemap_fdatawait(mapping);
1626 if (retval < 0)
1627 goto out_free;
1628
1629 progress = retval = 0;
1630 while (count > 0) {
1631 iosize = count;
1632 if (iosize > chunk_size)
1633 iosize = chunk_size;
1634
1635 retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
1636 if (retval)
1637 break;
1638
1639 retval = do_call_directIO(rw, filp, iobuf, (offset+progress) >> blocksize_bits, blocksize);
1640
1641 if (rw == READ && retval > 0)
1642 mark_dirty_kiobuf(iobuf, retval);
1643
1644 if (retval >= 0) {
1645 count -= retval;
1646 buf += retval;
1647 /* warning: weird semantics here, we're reporting a read beyond the end of the file */
1648 progress += retval;
1649 }
1650
1651 unmap_kiobuf(iobuf);
1652
1653 if (retval != iosize)
1654 break;
1655 }
1656
1657 if (progress)
1658 retval = progress;
1659
1660 out_free:
1661 if (!new_iobuf)
1662 clear_bit(0, &filp->f_iobuf_lock);
1663 else
1664 free_kiovec(1, &iobuf);
1665 out:
1666 return retval;
1667 }
1668
1669 int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1670 {
1671 char *kaddr;
1672 unsigned long left, count = desc->count;
1673
1674 if (size > count)
1675 size = count;
1676
1677 kaddr = kmap(page);
1678 left = __copy_to_user(desc->buf, kaddr + offset, size);
1679 kunmap(page);
1680
1681 if (left) {
1682 size -= left;
1683 desc->error = -EFAULT;
1684 }
1685 desc->count = count - size;
1686 desc->written += size;
1687 desc->buf += size;
1688 return size;
1689 }
1690
1691 /*
1692 * This is the "read()" routine for all filesystems
1693 * that can use the page cache directly.
1694 */
1695 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1696 {
1697 ssize_t retval;
1698
1699 if ((ssize_t) count < 0)
1700 return -EINVAL;
1701
1702 if (filp->f_flags & O_DIRECT)
1703 goto o_direct;
1704
1705 retval = -EFAULT;
1706 if (access_ok(VERIFY_WRITE, buf, count)) {
1707 retval = 0;
1708
1709 if (count) {
1710 read_descriptor_t desc;
1711
1712 desc.written = 0;
1713 desc.count = count;
1714 desc.buf = buf;
1715 desc.error = 0;
1716 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1717
1718 retval = desc.written;
1719 if (!retval)
1720 retval = desc.error;
1721 }
1722 }
1723 out:
1724 return retval;
1725
1726 o_direct:
1727 {
1728 loff_t pos = *ppos, size;
1729 struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
1730 struct inode *inode = mapping->host;
1731
1732 retval = 0;
1733 if (!count)
1734 goto out; /* skip atime */
1735 down_read(&inode->i_alloc_sem);
1736 down(&inode->i_sem);
1737 size = inode->i_size;
1738 if (pos < size) {
1739 retval = generic_file_direct_IO(READ, filp, buf, count, pos);
1740 if (retval > 0)
1741 *ppos = pos + retval;
1742 }
1743 up(&inode->i_sem);
1744 up_read(&inode->i_alloc_sem);
1745 UPDATE_ATIME(filp->f_dentry->d_inode);
1746 goto out;
1747 }
1748 }
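/*
 * A sketch: a page-cache based filesystem plugs generic_file_read() into
 * its file_operations as its read method (ext2 does this in 2.4).
 * "example_fops" is a hypothetical name; generic_file_mmap() is defined
 * later in this file.
 */
#if 0	/* illustration only */
static struct file_operations example_fops = {
	read:		generic_file_read,
	mmap:		generic_file_mmap,
};
#endif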
1749
1750 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1751 {
1752 ssize_t written;
1753 unsigned long count = desc->count;
1754 struct file *file = (struct file *) desc->buf;
1755
1756 if (size > count)
1757 size = count;
1758
1759 if (file->f_op->sendpage) {
1760 written = file->f_op->sendpage(file, page, offset,
1761 size, &file->f_pos, size<count);
1762 } else {
1763 char *kaddr;
1764 mm_segment_t old_fs;
1765
1766 old_fs = get_fs();
1767 set_fs(KERNEL_DS);
1768
1769 kaddr = kmap(page);
1770 written = file->f_op->write(file, kaddr + offset, size, &file->f_pos);
1771 kunmap(page);
1772
1773 set_fs(old_fs);
1774 }
1775 if (written < 0) {
1776 desc->error = written;
1777 written = 0;
1778 }
1779 desc->count = count - written;
1780 desc->written += written;
1781 return written;
1782 }
1783
1784 static ssize_t common_sendfile(int out_fd, int in_fd, loff_t *offset, size_t count)
1785 {
1786 ssize_t retval;
1787 struct file * in_file, * out_file;
1788 struct inode * in_inode, * out_inode;
1789
1790 /*
1791 * Get input file, and verify that it is ok..
1792 */
1793 retval = -EBADF;
1794 in_file = fget(in_fd);
1795 if (!in_file)
1796 goto out;
1797 if (!(in_file->f_mode & FMODE_READ))
1798 goto fput_in;
1799 retval = -EINVAL;
1800 in_inode = in_file->f_dentry->d_inode;
1801 if (!in_inode)
1802 goto fput_in;
1803 if (!in_inode->i_mapping->a_ops->readpage)
1804 goto fput_in;
1805 retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1806 if (retval)
1807 goto fput_in;
1808
1809 /*
1810 * Get output file, and verify that it is ok..
1811 */
1812 retval = -EBADF;
1813 out_file = fget(out_fd);
1814 if (!out_file)
1815 goto fput_in;
1816 if (!(out_file->f_mode & FMODE_WRITE))
1817 goto fput_out;
1818 retval = -EINVAL;
1819 if (!out_file->f_op || !out_file->f_op->write)
1820 goto fput_out;
1821 out_inode = out_file->f_dentry->d_inode;
1822 retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1823 if (retval)
1824 goto fput_out;
1825
1826 retval = 0;
1827 if (count) {
1828 read_descriptor_t desc;
1829
1830 if (!offset)
1831 offset = &in_file->f_pos;
1832
1833 desc.written = 0;
1834 desc.count = count;
1835 desc.buf = (char *) out_file;
1836 desc.error = 0;
1837 do_generic_file_read(in_file, offset, &desc, file_send_actor);
1838
1839 retval = desc.written;
1840 if (!retval)
1841 retval = desc.error;
1842 }
1843
1844 fput_out:
1845 fput(out_file);
1846 fput_in:
1847 fput(in_file);
1848 out:
1849 return retval;
1850 }
1851
1852 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1853 {
1854 loff_t pos, *ppos = NULL;
1855 ssize_t ret;
1856 if (offset) {
1857 off_t off;
1858 if (unlikely(get_user(off, offset)))
1859 return -EFAULT;
1860 pos = off;
1861 ppos = &pos;
1862 }
1863 ret = common_sendfile(out_fd, in_fd, ppos, count);
1864 if (offset)
1865 put_user((off_t)pos, offset);
1866 return ret;
1867 }
1868
1869 asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, loff_t *offset, size_t count)
1870 {
1871 loff_t pos, *ppos = NULL;
1872 ssize_t ret;
1873 if (offset) {
1874 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1875 return -EFAULT;
1876 ppos = &pos;
1877 }
1878 ret = common_sendfile(out_fd, in_fd, ppos, count);
1879 if (offset)
1880 put_user(pos, offset);
1881 return ret;
1882 }
1883
1884 static ssize_t do_readahead(struct file *file, unsigned long index, unsigned long nr)
1885 {
1886 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
1887 unsigned long max;
1888
1889 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1890 return -EINVAL;
1891
1892 /* Limit it to the size of the file.. */
1893 max = (mapping->host->i_size + ~PAGE_CACHE_MASK) >> PAGE_CACHE_SHIFT;
1894 if (index > max)
1895 return 0;
1896 max -= index;
1897 if (nr > max)
1898 nr = max;
1899
1900 /* And limit it to a sane percentage of the inactive list.. */
1901 max = nr_inactive_pages / 2;
1902 if (nr > max)
1903 nr = max;
1904
1905 while (nr) {
1906 page_cache_read(file, index);
1907 index++;
1908 nr--;
1909 }
1910 return 0;
1911 }
1912
1913 asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1914 {
1915 ssize_t ret;
1916 struct file *file;
1917
1918 ret = -EBADF;
1919 file = fget(fd);
1920 if (file) {
1921 if (file->f_mode & FMODE_READ) {
1922 unsigned long start = offset >> PAGE_CACHE_SHIFT;
1923 unsigned long len = (count + ((long)offset & ~PAGE_CACHE_MASK)) >> PAGE_CACHE_SHIFT;
1924 ret = do_readahead(file, start, len);
1925 }
1926 fput(file);
1927 }
1928 return ret;
1929 }
1930
1931 /*
1932 * Read-ahead and flush behind for MADV_SEQUENTIAL areas. Since we are
1933 * sure this is sequential access, we don't need a flexible read-ahead
1934 * window size -- we can always use a large fixed size window.
1935 */
1936 static void nopage_sequential_readahead(struct vm_area_struct * vma,
1937 unsigned long pgoff, unsigned long filesize)
1938 {
1939 unsigned long ra_window;
1940
1941 ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
1942 ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
1943
1944 /* vm_raend is zero if we haven't read ahead in this area yet. */
1945 if (vma->vm_raend == 0)
1946 vma->vm_raend = vma->vm_pgoff + ra_window;
1947
1948 /*
1949 * If we've just faulted the page half-way through our window,
1950 * then schedule reads for the next window, and release the
1951 * pages in the previous window.
1952 */
1953 if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
1954 unsigned long start = vma->vm_pgoff + vma->vm_raend;
1955 unsigned long end = start + ra_window;
1956
1957 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
1958 end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
1959 if (start > end)
1960 return;
1961
1962 while ((start < end) && (start < filesize)) {
1963 if (read_cluster_nonblocking(vma->vm_file,
1964 start, filesize) < 0)
1965 break;
1966 start += CLUSTER_PAGES;
1967 }
1968 run_task_queue(&tq_disk);
1969
1970 /* if we're far enough past the beginning of this area,
1971 recycle pages that are in the previous window. */
1972 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
1973 unsigned long window = ra_window << PAGE_SHIFT;
1974
1975 end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
1976 end -= window + window;
1977 filemap_sync(vma, end - window, window, MS_INVALIDATE);
1978 }
1979
1980 vma->vm_raend += ra_window;
1981 }
1982
1983 return;
1984 }
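To make the window arithmetic concrete (illustrative numbers only): with page_cluster = 4, CLUSTER_PAGES is 16, and a get_max_readahead() of 31 pages rounds up to ra_window = CLUSTER_OFFSET(31 + 15) = 32 pages, i.e. 128 KiB with 4 KiB pages. When a fault lands exactly half a window (16 pages) before vm_raend, non-blocking reads are scheduled for the next 32-page window, and once vm_raend has advanced more than two windows past vm_pgoff a window well behind the faulting offset is handed to filemap_sync() with MS_INVALIDATE so its pages can be reclaimed.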
1985
1986 /*
1987 * filemap_nopage() is invoked via the vma operations vector for a
1988 * mapped memory region to read in file data during a page fault.
1989 *
1990 * The goto's are kind of ugly, but this streamlines the normal case of having
1991 * it in the page cache, and handles the special cases reasonably without
1992 * having a lot of duplicated code.
1993 */
1994 struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused)
1995 {
1996 int error;
1997 struct file *file = area->vm_file;
1998 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
1999 struct inode *inode = mapping->host;
2000 struct page *page, **hash;
2001 unsigned long size, pgoff, endoff;
2002
2003 pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
2004 endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
2005
2006 retry_all:
2007 /*
2008 * An external ptracer can access pages that normally aren't
2009 * accessible..
2010 */
2011 size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2012 if ((pgoff >= size) && (area->vm_mm == current->mm))
2013 return NULL;
2014
2015 /* The "size" of the file, as far as mmap is concerned, isn't bigger than the mapping */
2016 if (size > endoff)
2017 size = endoff;
2018
2019 /*
2020 * Do we have something in the page cache already?
2021 */
2022 hash = page_hash(mapping, pgoff);
2023 retry_find:
2024 page = __find_get_page(mapping, pgoff, hash);
2025 if (!page)
2026 goto no_cached_page;
2027
2028 /*
2029 * Ok, found a page in the page cache, now we need to check
2030 * that it's up-to-date.
2031 */
2032 if (!Page_Uptodate(page))
2033 goto page_not_uptodate;
2034
2035 success:
2036 /*
2037 * Try read-ahead for sequential areas.
2038 */
2039 if (VM_SequentialReadHint(area))
2040 nopage_sequential_readahead(area, pgoff, size);
2041
2042 /*
2043 * Found the page and have a reference on it, need to check sharing
2044 * and possibly copy it over to another page..
2045 */
2046 mark_page_accessed(page);
2047 flush_page_to_ram(page);
2048 return page;
2049
2050 no_cached_page:
2051 /*
2052 * If the requested offset is within our file, try to read a whole
2053 * cluster of pages at once.
2054 *
2055 * Otherwise, we're off the end of a privately mapped file,
2056 * so we need to map a zero page.
2057 */
2058 if ((pgoff < size) && !VM_RandomReadHint(area))
2059 error = read_cluster_nonblocking(file, pgoff, size);
2060 else
2061 error = page_cache_read(file, pgoff);
2062
2063 /*
2064 * The page we want has now been added to the page cache.
2065 * In the unlikely event that someone removed it in the
2066 * meantime, we'll just come back here and read it again.
2067 */
2068 if (error >= 0)
2069 goto retry_find;
2070
2071 /*
2072 * An error return from page_cache_read can result if the
2073 * system is low on memory, or a problem occurs while trying
2074 * to schedule I/O.
2075 */
2076 if (error == -ENOMEM)
2077 return NOPAGE_OOM;
2078 return NULL;
2079
2080 page_not_uptodate:
2081 lock_page(page);
2082
2083 /* Did it get unhashed while we waited for it? */
2084 if (!page->mapping) {
2085 UnlockPage(page);
2086 page_cache_release(page);
2087 goto retry_all;
2088 }
2089
2090 /* Did somebody else get it up-to-date? */
2091 if (Page_Uptodate(page)) {
2092 UnlockPage(page);
2093 goto success;
2094 }
2095
2096 if (!mapping->a_ops->readpage(file, page)) {
2097 wait_on_page(page);
2098 if (Page_Uptodate(page))
2099 goto success;
2100 }
2101
2102 /*
2103 * Umm, take care of errors if the page isn't up-to-date.
2104 * Try to re-read it _once_. We do this synchronously,
2105 * because there really aren't any performance issues here
2106 * and we need to check for errors.
2107 */
2108 lock_page(page);
2109
2110 /* Somebody truncated the page on us? */
2111 if (!page->mapping) {
2112 UnlockPage(page);
2113 page_cache_release(page);
2114 goto retry_all;
2115 }
2116
2117 /* Somebody else successfully read it in? */
2118 if (Page_Uptodate(page)) {
2119 UnlockPage(page);
2120 goto success;
2121 }
2122 ClearPageError(page);
2123 if (!mapping->a_ops->readpage(file, page)) {
2124 wait_on_page(page);
2125 if (Page_Uptodate(page))
2126 goto success;
2127 }
2128
2129 /*
2130 * Things didn't work out. Return zero to tell the
2131 * mm layer so, possibly freeing the page cache page first.
2132 */
2133 page_cache_release(page);
2134 return NULL;
2135 }
2136
2137 /* Called with mm->page_table_lock held to protect against other
2138 * threads/the swapper from ripping pte's out from under us.
2139 */
2140 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
2141 unsigned long address, unsigned int flags)
2142 {
2143 pte_t pte = *ptep;
2144
2145 if (pte_present(pte)) {
2146 struct page *page = pte_page(pte);
2147 if (VALID_PAGE(page) && !PageReserved(page) && ptep_test_and_clear_dirty(ptep)) {
2148 flush_tlb_page(vma, address);
2149 set_page_dirty(page);
2150 }
2151 }
2152 return 0;
2153 }
2154
2155 static inline int filemap_sync_pte_range(pmd_t * pmd,
2156 unsigned long address, unsigned long size,
2157 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
2158 {
2159 pte_t * pte;
2160 unsigned long end;
2161 int error;
2162
2163 if (pmd_none(*pmd))
2164 return 0;
2165 if (pmd_bad(*pmd)) {
2166 pmd_ERROR(*pmd);
2167 pmd_clear(pmd);
2168 return 0;
2169 }
2170 pte = pte_offset(pmd, address);
2171 offset += address & PMD_MASK;
2172 address &= ~PMD_MASK;
2173 end = address + size;
2174 if (end > PMD_SIZE)
2175 end = PMD_SIZE;
2176 error = 0;
2177 do {
2178 error |= filemap_sync_pte(pte, vma, address + offset, flags);
2179 address += PAGE_SIZE;
2180 pte++;
2181 } while (address && (address < end));
2182 return error;
2183 }
2184
2185 static inline int filemap_sync_pmd_range(pgd_t * pgd,
2186 unsigned long address, unsigned long size,
2187 struct vm_area_struct *vma, unsigned int flags)
2188 {
2189 pmd_t * pmd;
2190 unsigned long offset, end;
2191 int error;
2192
2193 if (pgd_none(*pgd))
2194 return 0;
2195 if (pgd_bad(*pgd)) {
2196 pgd_ERROR(*pgd);
2197 pgd_clear(pgd);
2198 return 0;
2199 }
2200 pmd = pmd_offset(pgd, address);
2201 offset = address & PGDIR_MASK;
2202 address &= ~PGDIR_MASK;
2203 end = address + size;
2204 if (end > PGDIR_SIZE)
2205 end = PGDIR_SIZE;
2206 error = 0;
2207 do {
2208 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
2209 address = (address + PMD_SIZE) & PMD_MASK;
2210 pmd++;
2211 } while (address && (address < end));
2212 return error;
2213 }
2214
2215 int filemap_sync(struct vm_area_struct * vma, unsigned long address,
2216 size_t size, unsigned int flags)
2217 {
2218 pgd_t * dir;
2219 unsigned long end = address + size;
2220 int error = 0;
2221
2222 /* Acquire the lock early; it may be possible to avoid dropping
2223 * and reacquiring it repeatedly.
2224 */
2225 spin_lock(&vma->vm_mm->page_table_lock);
2226
2227 dir = pgd_offset(vma->vm_mm, address);
2228 flush_cache_range(vma->vm_mm, end - size, end);
2229 if (address >= end)
2230 BUG();
2231 do {
2232 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
2233 address = (address + PGDIR_SIZE) & PGDIR_MASK;
2234 dir++;
2235 } while (address && (address < end));
2236 flush_tlb_range(vma->vm_mm, end - size, end);
2237
2238 spin_unlock(&vma->vm_mm->page_table_lock);
2239
2240 return error;
2241 }
2242
2243 static struct vm_operations_struct generic_file_vm_ops = {
2244 nopage: filemap_nopage,
2245 };
2246
2247 /* This is used for a general mmap of a disk file */
2248
2249 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
2250 {
2251 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
2252 struct inode *inode = mapping->host;
2253
2254 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
2255 if (!mapping->a_ops->writepage)
2256 return -EINVAL;
2257 }
2258 if (!mapping->a_ops->readpage)
2259 return -ENOEXEC;
2260 UPDATE_ATIME(inode);
2261 vma->vm_ops = &generic_file_vm_ops;
2262 return 0;
2263 }
2264
2265 /*
2266 * The msync() system call.
2267 */
2268
2269 /*
2270 * MS_SYNC syncs the entire file - including mappings.
2271 *
2272 * MS_ASYNC initiates writeout of just the dirty mapped data.
2273 * This provides no guarantee of file integrity - things like indirect
2274 * blocks may not have started writeout. MS_ASYNC is primarily useful
2275 * where the application knows that it has finished with the data and
2276 * wishes to intelligently schedule its own I/O traffic.
2277 */
2278 static int msync_interval(struct vm_area_struct * vma,
2279 unsigned long start, unsigned long end, int flags)
2280 {
2281 int ret = 0;
2282 struct file * file = vma->vm_file;
2283
2284 if ( (flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED) )
2285 return -EBUSY;
2286
2287 if (file && (vma->vm_flags & VM_SHARED)) {
2288 ret = filemap_sync(vma, start, end-start, flags);
2289
2290 if (!ret && (flags & (MS_SYNC|MS_ASYNC))) {
2291 struct inode * inode = file->f_dentry->d_inode;
2292
2293 down(&inode->i_sem);
2294 ret = filemap_fdatasync(inode->i_mapping);
2295 if (flags & MS_SYNC) {
2296 int err;
2297
2298 if (file->f_op && file->f_op->fsync) {
2299 err = file->f_op->fsync(file, file->f_dentry, 1);
2300 if (err && !ret)
2301 ret = err;
2302 }
2303 err = filemap_fdatawait(inode->i_mapping);
2304 if (err && !ret)
2305 ret = err;
2306 }
2307 up(&inode->i_sem);
2308 }
2309 }
2310 return ret;
2311 }
2312
2313 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
2314 {
2315 unsigned long end;
2316 struct vm_area_struct * vma;
2317 int unmapped_error, error = -EINVAL;
2318
2319 down_read(&current->mm->mmap_sem);
2320 if (start & ~PAGE_MASK)
2321 goto out;
2322 len = (len + ~PAGE_MASK) & PAGE_MASK;
2323 end = start + len;
2324 if (end < start)
2325 goto out;
2326 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
2327 goto out;
2328 if ((flags & MS_ASYNC) && (flags & MS_SYNC))
2329 goto out;
2330
2331 error = 0;
2332 if (end == start)
2333 goto out;
2334 /*
2335 * If the interval [start,end) covers some unmapped address ranges,
2336 * just ignore them, but return -ENOMEM at the end.
2337 */
2338 vma = find_vma(current->mm, start);
2339 unmapped_error = 0;
2340 for (;;) {
2341 /* Still start < end. */
2342 error = -ENOMEM;
2343 if (!vma)
2344 goto out;
2345 /* Here start < vma->vm_end. */
2346 if (start < vma->vm_start) {
2347 unmapped_error = -ENOMEM;
2348 start = vma->vm_start;
2349 }
2350 /* Here vma->vm_start <= start < vma->vm_end. */
2351 if (end <= vma->vm_end) {
2352 if (start < end) {
2353 error = msync_interval(vma, start, end, flags);
2354 if (error)
2355 goto out;
2356 }
2357 error = unmapped_error;
2358 goto out;
2359 }
2360 /* Here vma->vm_start <= start < vma->vm_end < end. */
2361 error = msync_interval(vma, start, vma->vm_end, flags);
2362 if (error)
2363 goto out;
2364 start = vma->vm_end;
2365 vma = vma->vm_next;
2366 }
2367 out:
2368 up_read(&current->mm->mmap_sem);
2369 return error;
2370 }
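A hedged user-space sketch of the MS_ASYNC/MS_SYNC distinction described above; the record layout and helper name are hypothetical, and the mapping is assumed to come from mmap() with MAP_SHARED so msync() sees a page-aligned address.

/* Hedged example: write through a shared file mapping, start writeout
 * early with MS_ASYNC, then wait with MS_SYNC before relying on the
 * data being on disk. */
#include <sys/mman.h>
#include <string.h>

static int update_record(void *map, size_t maplen, size_t off,
                         const void *rec, size_t reclen)
{
        memcpy((char *)map + off, rec, reclen);

        /* Kick off writeout of the dirty pages without waiting. */
        if (msync(map, maplen, MS_ASYNC) < 0)
                return -1;
        /* ... other work ... */
        /* Now wait until the mapped data has actually been written. */
        return msync(map, maplen, MS_SYNC);
}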
2371
2372 static inline void setup_read_behavior(struct vm_area_struct * vma,
2373 int behavior)
2374 {
2375 VM_ClearReadHint(vma);
2376 switch(behavior) {
2377 case MADV_SEQUENTIAL:
2378 vma->vm_flags |= VM_SEQ_READ;
2379 break;
2380 case MADV_RANDOM:
2381 vma->vm_flags |= VM_RAND_READ;
2382 break;
2383 default:
2384 break;
2385 }
2386 return;
2387 }
2388
2389 static long madvise_fixup_start(struct vm_area_struct * vma,
2390 unsigned long end, int behavior)
2391 {
2392 struct vm_area_struct * n;
2393 struct mm_struct * mm = vma->vm_mm;
2394
2395 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2396 if (!n)
2397 return -EAGAIN;
2398 *n = *vma;
2399 n->vm_end = end;
2400 setup_read_behavior(n, behavior);
2401 n->vm_raend = 0;
2402 if (n->vm_file)
2403 get_file(n->vm_file);
2404 if (n->vm_ops && n->vm_ops->open)
2405 n->vm_ops->open(n);
2406 vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
2407 lock_vma_mappings(vma);
2408 spin_lock(&mm->page_table_lock);
2409 vma->vm_start = end;
2410 __insert_vm_struct(mm, n);
2411 spin_unlock(&mm->page_table_lock);
2412 unlock_vma_mappings(vma);
2413 return 0;
2414 }
2415
2416 static long madvise_fixup_end(struct vm_area_struct * vma,
2417 unsigned long start, int behavior)
2418 {
2419 struct vm_area_struct * n;
2420 struct mm_struct * mm = vma->vm_mm;
2421
2422 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2423 if (!n)
2424 return -EAGAIN;
2425 *n = *vma;
2426 n->vm_start = start;
2427 n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
2428 setup_read_behavior(n, behavior);
2429 n->vm_raend = 0;
2430 if (n->vm_file)
2431 get_file(n->vm_file);
2432 if (n->vm_ops && n->vm_ops->open)
2433 n->vm_ops->open(n);
2434 lock_vma_mappings(vma);
2435 spin_lock(&mm->page_table_lock);
2436 vma->vm_end = start;
2437 __insert_vm_struct(mm, n);
2438 spin_unlock(&mm->page_table_lock);
2439 unlock_vma_mappings(vma);
2440 return 0;
2441 }
2442
2443 static long madvise_fixup_middle(struct vm_area_struct * vma,
2444 unsigned long start, unsigned long end, int behavior)
2445 {
2446 struct vm_area_struct * left, * right;
2447 struct mm_struct * mm = vma->vm_mm;
2448
2449 left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2450 if (!left)
2451 return -EAGAIN;
2452 right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2453 if (!right) {
2454 kmem_cache_free(vm_area_cachep, left);
2455 return -EAGAIN;
2456 }
2457 *left = *vma;
2458 *right = *vma;
2459 left->vm_end = start;
2460 right->vm_start = end;
2461 right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
2462 left->vm_raend = 0;
2463 right->vm_raend = 0;
2464 if (vma->vm_file)
2465 atomic_add(2, &vma->vm_file->f_count);
2466
2467 if (vma->vm_ops && vma->vm_ops->open) {
2468 vma->vm_ops->open(left);
2469 vma->vm_ops->open(right);
2470 }
2471 vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
2472 vma->vm_raend = 0;
2473 lock_vma_mappings(vma);
2474 spin_lock(&mm->page_table_lock);
2475 vma->vm_start = start;
2476 vma->vm_end = end;
2477 setup_read_behavior(vma, behavior);
2478 __insert_vm_struct(mm, left);
2479 __insert_vm_struct(mm, right);
2480 spin_unlock(&mm->page_table_lock);
2481 unlock_vma_mappings(vma);
2482 return 0;
2483 }
2484
2485 /*
2486 * We can potentially split a vm area into separate
2487 * areas, each area with its own behavior.
2488 */
2489 static long madvise_behavior(struct vm_area_struct * vma,
2490 unsigned long start, unsigned long end, int behavior)
2491 {
2492 int error = 0;
2493
2494 /* This caps the number of vma's this process can own */
2495 if (vma->vm_mm->map_count > max_map_count)
2496 return -ENOMEM;
2497
2498 if (start == vma->vm_start) {
2499 if (end == vma->vm_end) {
2500 setup_read_behavior(vma, behavior);
2501 vma->vm_raend = 0;
2502 } else
2503 error = madvise_fixup_start(vma, end, behavior);
2504 } else {
2505 if (end == vma->vm_end)
2506 error = madvise_fixup_end(vma, start, behavior);
2507 else
2508 error = madvise_fixup_middle(vma, start, end, behavior);
2509 }
2510
2511 return error;
2512 }
2513
2514 /*
2515 * Schedule all required I/O operations, then run the disk queue
2516 * to make sure they are started. Do not wait for completion.
2517 */
2518 static long madvise_willneed(struct vm_area_struct * vma,
2519 unsigned long start, unsigned long end)
2520 {
2521 long error = -EBADF;
2522 struct file * file;
2523 struct inode * inode;
2524 unsigned long size, rlim_rss;
2525
2526 /* Doesn't work if there's no mapped file. */
2527 if (!vma->vm_file)
2528 return error;
2529 file = vma->vm_file;
2530 inode = file->f_dentry->d_inode;
2531 if (!inode->i_mapping->a_ops->readpage)
2532 return error;
2533 size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2534
2535 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2536 if (end > vma->vm_end)
2537 end = vma->vm_end;
2538 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2539
2540 /* Make sure this doesn't exceed the process's max rss. */
2541 error = -EIO;
2542 rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur :
2543 LONG_MAX; /* default: see resource.h */
2544 if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
2545 return error;
2546
2547 /* round to cluster boundaries if this isn't a "random" area. */
2548 if (!VM_RandomReadHint(vma)) {
2549 start = CLUSTER_OFFSET(start);
2550 end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
2551
2552 while ((start < end) && (start < size)) {
2553 error = read_cluster_nonblocking(file, start, size);
2554 start += CLUSTER_PAGES;
2555 if (error < 0)
2556 break;
2557 }
2558 } else {
2559 while ((start < end) && (start < size)) {
2560 error = page_cache_read(file, start);
2561 start++;
2562 if (error < 0)
2563 break;
2564 }
2565 }
2566
2567 /* Don't wait for someone else to push these requests. */
2568 run_task_queue(&tq_disk);
2569
2570 return error;
2571 }
2572
2573 /*
2574 * Application no longer needs these pages. If the pages are dirty,
2575 * it's OK to just throw them away. The app will be more careful about
2576 * data it wants to keep. Be sure to free swap resources too. The
2577 * zap_page_range call sets things up for refill_inactive to actually free
2578 * these pages later if no one else has touched them in the meantime,
2579 * although we could add these pages to a global reuse list for
2580 * refill_inactive to pick up before reclaiming other pages.
2581 *
2582 * NB: This interface discards data rather than pushes it out to swap,
2583 * as some implementations do. This has performance implications for
2584 * applications like large transactional databases which want to discard
2585 * pages in anonymous maps after committing to backing store the data
2586 * that was kept in them. There is no reason to write this data out to
2587 * the swap area if the application is discarding it.
2588 *
2589 * An interface that causes the system to free clean pages and flush
2590 * dirty pages is already available as msync(MS_INVALIDATE).
2591 */
2592 static long madvise_dontneed(struct vm_area_struct * vma,
2593 unsigned long start, unsigned long end)
2594 {
2595 if (vma->vm_flags & VM_LOCKED)
2596 return -EINVAL;
2597
2598 zap_page_range(vma->vm_mm, start, end - start);
2599 return 0;
2600 }
2601
2602 static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
2603 unsigned long end, int behavior)
2604 {
2605 long error = -EBADF;
2606
2607 switch (behavior) {
2608 case MADV_NORMAL:
2609 case MADV_SEQUENTIAL:
2610 case MADV_RANDOM:
2611 error = madvise_behavior(vma, start, end, behavior);
2612 break;
2613
2614 case MADV_WILLNEED:
2615 error = madvise_willneed(vma, start, end);
2616 break;
2617
2618 case MADV_DONTNEED:
2619 error = madvise_dontneed(vma, start, end);
2620 break;
2621
2622 default:
2623 error = -EINVAL;
2624 break;
2625 }
2626
2627 return error;
2628 }
2629
2630 /*
2631 * The madvise(2) system call.
2632 *
2633 * Applications can use madvise() to advise the kernel how it should
2634 * handle paging I/O in this VM area. The idea is to help the kernel
2635 * use appropriate read-ahead and caching techniques. The information
2636 * provided is advisory only, and can be safely disregarded by the
2637 * kernel without affecting the correct operation of the application.
2638 *
2639 * behavior values:
2640 * MADV_NORMAL - the default behavior is to read clusters. This
2641 * results in some read-ahead and read-behind.
2642 * MADV_RANDOM - the system should read the minimum amount of data
2643 * on any access, since it is unlikely that the appli-
2644 * cation will need more than what it asks for.
2645 * MADV_SEQUENTIAL - pages in the given range will probably be accessed
2646 * once, so they can be aggressively read ahead, and
2647 * can be freed soon after they are accessed.
2648 * MADV_WILLNEED - the application is notifying the system to read
2649 * some pages ahead.
2650 * MADV_DONTNEED - the application is finished with the given range,
2651 * so the kernel can free resources associated with it.
2652 *
2653 * return values:
2654 * zero - success
2655 * -EINVAL - start + len < 0, start is not page-aligned,
2656 * "behavior" is not a valid value, or application
2657 * is attempting to release locked or shared pages.
2658 * -ENOMEM - addresses in the specified range are not currently
2659 * mapped, or are outside the AS of the process.
2660 * -EIO - an I/O error occurred while paging in data.
2661 * -EBADF - map exists, but area maps something that isn't a file.
2662 * -EAGAIN - a kernel resource was temporarily unavailable.
2663 */
2664 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2665 {
2666 unsigned long end;
2667 struct vm_area_struct * vma;
2668 int unmapped_error = 0;
2669 int error = -EINVAL;
2670
2671 down_write(&current->mm->mmap_sem);
2672
2673 if (start & ~PAGE_MASK)
2674 goto out;
2675 len = (len + ~PAGE_MASK) & PAGE_MASK;
2676 end = start + len;
2677 if (end < start)
2678 goto out;
2679
2680 error = 0;
2681 if (end == start)
2682 goto out;
2683
2684 /*
2685 * If the interval [start,end) covers some unmapped address
2686 * ranges, just ignore them, but return -ENOMEM at the end.
2687 */
2688 vma = find_vma(current->mm, start);
2689 for (;;) {
2690 /* Still start < end. */
2691 error = -ENOMEM;
2692 if (!vma)
2693 goto out;
2694
2695 /* Here start < vma->vm_end. */
2696 if (start < vma->vm_start) {
2697 unmapped_error = -ENOMEM;
2698 start = vma->vm_start;
2699 }
2700
2701 /* Here vma->vm_start <= start < vma->vm_end. */
2702 if (end <= vma->vm_end) {
2703 if (start < end) {
2704 error = madvise_vma(vma, start, end,
2705 behavior);
2706 if (error)
2707 goto out;
2708 }
2709 error = unmapped_error;
2710 goto out;
2711 }
2712
2713 /* Here vma->vm_start <= start < vma->vm_end < end. */
2714 error = madvise_vma(vma, start, vma->vm_end, behavior);
2715 if (error)
2716 goto out;
2717 start = vma->vm_end;
2718 vma = vma->vm_next;
2719 }
2720
2721 out:
2722 up_write(&current->mm->mmap_sem);
2723 return error;
2724 }
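A hedged user-space sketch of the behavior values documented above, applied to a single sequential pass over a file; the helper is hypothetical, and since the hints are purely advisory the code stays correct even if the kernel ignores them.

/* Hedged example: one pass over a large read-only mapping, asking for
 * aggressive readahead up front and dropping the pages afterwards. */
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

static long sum_file(const char *path)
{
        struct stat st;
        unsigned char *p;
        long sum = 0;
        off_t i;
        int fd = open(path, O_RDONLY);

        if (fd < 0 || fstat(fd, &st) < 0) {
                if (fd >= 0)
                        close(fd);
                return -1;
        }
        p = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                close(fd);
                return -1;
        }
        madvise(p, st.st_size, MADV_SEQUENTIAL);  /* aggressive readahead */
        for (i = 0; i < st.st_size; i++)
                sum += p[i];
        madvise(p, st.st_size, MADV_DONTNEED);    /* drop the cached pages */
        munmap(p, st.st_size);
        close(fd);
        return sum;
}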
2725
2726 /*
2727 * Later we can get more picky about what "in core" means precisely.
2728 * For now, simply check to see if the page is in the page cache,
2729 * and is up to date; i.e. that no page-in operation would be required
2730 * at this time if an application were to map and access this page.
2731 */
2732 static unsigned char mincore_page(struct vm_area_struct * vma,
2733 unsigned long pgoff)
2734 {
2735 unsigned char present = 0;
2736 struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping;
2737 struct page * page, ** hash = page_hash(as, pgoff);
2738
2739 spin_lock(&pagecache_lock);
2740 page = __find_page_nolock(as, pgoff, *hash);
2741 if ((page) && (Page_Uptodate(page)))
2742 present = 1;
2743 spin_unlock(&pagecache_lock);
2744
2745 return present;
2746 }
2747
2748 static long mincore_vma(struct vm_area_struct * vma,
2749 unsigned long start, unsigned long end, unsigned char * vec)
2750 {
2751 long error, i, remaining;
2752 unsigned char * tmp;
2753
2754 error = -ENOMEM;
2755 if (!vma->vm_file)
2756 return error;
2757
2758 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2759 if (end > vma->vm_end)
2760 end = vma->vm_end;
2761 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2762
2763 error = -EAGAIN;
2764 tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
2765 if (!tmp)
2766 return error;
2767
2768 /* (end - start) is # of pages, and also # of bytes in "vec" */
2769 remaining = (end - start);
2770
2771 error = 0;
2772 for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
2773 int j = 0;
2774 long thispiece = (remaining < PAGE_SIZE) ?
2775 remaining : PAGE_SIZE;
2776
2777 while (j < thispiece)
2778 tmp[j++] = mincore_page(vma, start++);
2779
2780 if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
2781 error = -EFAULT;
2782 break;
2783 }
2784 }
2785
2786 free_page((unsigned long) tmp);
2787 return error;
2788 }
2789
2790 /*
2791 * The mincore(2) system call.
2792 *
2793 * mincore() returns the memory residency status of the pages in the
2794 * current process's address space specified by [addr, addr + len).
2795 * The status is returned in a vector of bytes. The least significant
2796 * bit of each byte is 1 if the referenced page is in memory, otherwise
2797 * it is zero.
2798 *
2799 * Because the status of a page can change after mincore() checks it
2800 * but before it returns to the application, the returned vector may
2801 * contain stale information. Only locked pages are guaranteed to
2802 * remain in memory.
2803 *
2804 * return values:
2805 * zero - success
2806 * -EFAULT - vec points to an illegal address
2807 * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
2808 * or len has a nonpositive value
2809 * -ENOMEM - Addresses in the range [addr, addr + len] are
2810 * invalid for the address space of this process, or
2811 * specify one or more pages which are not currently
2812 * mapped
2813 * -EAGAIN - A kernel resource was temporarily unavailable.
2814 */
2815 asmlinkage long sys_mincore(unsigned long start, size_t len,
2816 unsigned char * vec)
2817 {
2818 int index = 0;
2819 unsigned long end;
2820 struct vm_area_struct * vma;
2821 int unmapped_error = 0;
2822 long error = -EINVAL;
2823
2824 down_read(&current->mm->mmap_sem);
2825
2826 if (start & ~PAGE_CACHE_MASK)
2827 goto out;
2828 len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
2829 end = start + len;
2830 if (end < start)
2831 goto out;
2832
2833 error = 0;
2834 if (end == start)
2835 goto out;
2836
2837 /*
2838 * If the interval [start,end) covers some unmapped address
2839 * ranges, just ignore them, but return -ENOMEM at the end.
2840 */
2841 vma = find_vma(current->mm, start);
2842 for (;;) {
2843 /* Still start < end. */
2844 error = -ENOMEM;
2845 if (!vma)
2846 goto out;
2847
2848 /* Here start < vma->vm_end. */
2849 if (start < vma->vm_start) {
2850 unmapped_error = -ENOMEM;
2851 start = vma->vm_start;
2852 }
2853
2854 /* Here vma->vm_start <= start < vma->vm_end. */
2855 if (end <= vma->vm_end) {
2856 if (start < end) {
2857 error = mincore_vma(vma, start, end,
2858 &vec[index]);
2859 if (error)
2860 goto out;
2861 }
2862 error = unmapped_error;
2863 goto out;
2864 }
2865
2866 /* Here vma->vm_start <= start < vma->vm_end < end. */
2867 error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
2868 if (error)
2869 goto out;
2870 index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
2871 start = vma->vm_end;
2872 vma = vma->vm_next;
2873 }
2874
2875 out:
2876 up_read(&current->mm->mmap_sem);
2877 return error;
2878 }
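A hedged user-space sketch of sys_mincore(): count the resident pages of an existing mapping. The helper name is hypothetical; only bit 0 of each vector byte is meaningful, as the comment above notes.

/* Hedged example: report how much of a mapped region is resident. */
#include <sys/mman.h>
#include <stdlib.h>
#include <unistd.h>

static long resident_pages(void *map, size_t maplen)
{
        long page = sysconf(_SC_PAGESIZE);
        size_t npages = (maplen + page - 1) / page;
        unsigned char *vec = malloc(npages);
        long resident = 0;
        size_t i;

        if (!vec)
                return -1;
        if (mincore(map, maplen, vec) < 0) {
                free(vec);
                return -1;
        }
        for (i = 0; i < npages; i++)
                resident += vec[i] & 1;         /* bit 0 = in core */
        free(vec);
        return resident;
}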
2879
2880 static inline
2881 struct page *__read_cache_page(struct address_space *mapping,
2882 unsigned long index,
2883 int (*filler)(void *,struct page*),
2884 void *data)
2885 {
2886 struct page **hash = page_hash(mapping, index);
2887 struct page *page, *cached_page = NULL;
2888 int err;
2889 repeat:
2890 page = __find_get_page(mapping, index, hash);
2891 if (!page) {
2892 if (!cached_page) {
2893 cached_page = page_cache_alloc(mapping);
2894 if (!cached_page)
2895 return ERR_PTR(-ENOMEM);
2896 }
2897 page = cached_page;
2898 if (add_to_page_cache_unique(page, mapping, index, hash))
2899 goto repeat;
2900 cached_page = NULL;
2901 err = filler(data, page);
2902 if (err < 0) {
2903 page_cache_release(page);
2904 page = ERR_PTR(err);
2905 }
2906 }
2907 if (cached_page)
2908 page_cache_release(cached_page);
2909 return page;
2910 }
2911
2912 /*
2913 * Read into the page cache. If a page already exists,
2914 * and Page_Uptodate() is not set, try to fill the page.
2915 */
2916 struct page *read_cache_page(struct address_space *mapping,
2917 unsigned long index,
2918 int (*filler)(void *,struct page*),
2919 void *data)
2920 {
2921 struct page *page;
2922 int err;
2923
2924 retry:
2925 page = __read_cache_page(mapping, index, filler, data);
2926 if (IS_ERR(page))
2927 goto out;
2928 mark_page_accessed(page);
2929 if (Page_Uptodate(page))
2930 goto out;
2931
2932 lock_page(page);
2933 if (!page->mapping) {
2934 UnlockPage(page);
2935 page_cache_release(page);
2936 goto retry;
2937 }
2938 if (Page_Uptodate(page)) {
2939 UnlockPage(page);
2940 goto out;
2941 }
2942 err = filler(data, page);
2943 if (err < 0) {
2944 page_cache_release(page);
2945 page = ERR_PTR(err);
2946 }
2947 out:
2948 return page;
2949 }
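A hedged in-kernel sketch (not taken from this file) of how a 2.4-era filesystem might use read_cache_page(), passing the address space's own readpage as the filler; the helper name is hypothetical and error handling is kept to the minimum.

/* Hedged sketch: pull page n of a file through the page cache and
 * hand back a reference, or an ERR_PTR on failure. */
static struct page *example_get_page(struct file *file, unsigned long n)
{
        struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
        struct page *page;

        page = read_cache_page(mapping, n,
                        (int (*)(void *, struct page *))mapping->a_ops->readpage,
                        file);
        if (IS_ERR(page))
                return page;
        wait_on_page(page);
        if (!Page_Uptodate(page)) {
                page_cache_release(page);
                return ERR_PTR(-EIO);
        }
        return page;
}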
2950
2951 static inline struct page * __grab_cache_page(struct address_space *mapping,
2952 unsigned long index, struct page **cached_page)
2953 {
2954 struct page *page, **hash = page_hash(mapping, index);
2955 repeat:
2956 page = __find_lock_page(mapping, index, hash);
2957 if (!page) {
2958 if (!*cached_page) {
2959 *cached_page = page_cache_alloc(mapping);
2960 if (!*cached_page)
2961 return NULL;
2962 }
2963 page = *cached_page;
2964 if (add_to_page_cache_unique(page, mapping, index, hash))
2965 goto repeat;
2966 *cached_page = NULL;
2967 }
2968 return page;
2969 }
2970
2971 inline void remove_suid(struct inode *inode)
2972 {
2973 unsigned int mode;
2974
2975 /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
2976 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
2977
2978 /* was any of the uid bits set? */
2979 mode &= inode->i_mode;
2980 if (mode && !capable(CAP_FSETID)) {
2981 inode->i_mode &= ~mode;
2982 mark_inode_dirty(inode);
2983 }
2984 }
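The arithmetic in remove_suid() is worth unpacking: because S_ISGID is exactly S_IXGRP shifted up (S_ISGID/S_IXGRP is a power of two, 0200 with the usual octal values), multiplying (i_mode & S_IXGRP) by that quotient turns a set group-execute bit into S_ISGID, and S_ISUID is always ORed in. For a file with mode 02755, for instance, the candidate mask becomes S_ISUID|S_ISGID, the AND with i_mode leaves just S_ISGID, and an unprivileged write then strips the setgid bit.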
2985
2986 /*
2987 * precheck_file_write():
2988 * Check the conditions on a file descriptor prior to beginning a write
2989 * on it. Contains the common precheck code for both buffered and direct
2990 * IO.
2991 */
2992 int precheck_file_write(struct file *file, struct inode *inode,
2993 size_t *count, loff_t *ppos)
2994 {
2995 ssize_t err;
2996 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2997 loff_t pos = *ppos;
2998
2999 err = -EINVAL;
3000 if (pos < 0)
3001 goto out;
3002
3003 err = file->f_error;
3004 if (err) {
3005 file->f_error = 0;
3006 goto out;
3007 }
3008
3009 /* FIXME: this is for backwards compatibility with 2.4 */
3010 if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND)
3011 *ppos = pos = inode->i_size;
3012
3013 /*
3014 * Check whether we've reached the file size limit.
3015 */
3016 err = -EFBIG;
3017
3018 if (!S_ISBLK(inode->i_mode) && limit != RLIM_INFINITY) {
3019 if (pos >= limit) {
3020 send_sig(SIGXFSZ, current, 0);
3021 goto out;
3022 }
3023 if (pos > 0xFFFFFFFFULL || *count > limit - (u32)pos) {
3024 /* send_sig(SIGXFSZ, current, 0); */
3025 *count = limit - (u32)pos;
3026 }
3027 }
3028
3029 /*
3030 * LFS rule
3031 */
3032 if ( pos + *count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
3033 if (pos >= MAX_NON_LFS) {
3034 send_sig(SIGXFSZ, current, 0);
3035 goto out;
3036 }
3037 if (*count > MAX_NON_LFS - (u32)pos) {
3038 /* send_sig(SIGXFSZ, current, 0); */
3039 *count = MAX_NON_LFS - (u32)pos;
3040 }
3041 }
3042
3043 /*
3044 * Are we about to exceed the fs block limit ?
3045 *
3046 * If we have written data it becomes a short write
3047 * If we have exceeded without writing data we send
3048 * a signal and give them an EFBIG.
3049 *
3050 * Linus' frestrict idea will clean these up nicely..
3051 */
3052
3053 if (!S_ISBLK(inode->i_mode)) {
3054 if (pos >= inode->i_sb->s_maxbytes)
3055 {
3056 if (*count || pos > inode->i_sb->s_maxbytes) {
3057 send_sig(SIGXFSZ, current, 0);
3058 err = -EFBIG;
3059 goto out;
3060 }
3061 /* zero-length writes at ->s_maxbytes are OK */
3062 }
3063
3064 if (pos + *count > inode->i_sb->s_maxbytes)
3065 *count = inode->i_sb->s_maxbytes - pos;
3066 } else {
3067 if (is_read_only(inode->i_rdev)) {
3068 err = -EPERM;
3069 goto out;
3070 }
3071 if (pos >= inode->i_size) {
3072 if (*count || pos > inode->i_size) {
3073 err = -ENOSPC;
3074 goto out;
3075 }
3076 }
3077
3078 if (pos + *count > inode->i_size)
3079 *count = inode->i_size - pos;
3080 }
3081
3082 err = 0;
3083 out:
3084 return err;
3085 }
3086
3087 /*
3088 * Write to a file through the page cache.
3089 *
3090 * We currently put everything into the page cache prior to writing it.
3091 * This is not a problem when writing full pages. With partial pages,
3092 * however, we first have to read the data into the cache, then
3093 * dirty the page, and finally schedule it for writing. Alternatively, we
3094 * could write-through just the portion of data that would go into that
3095 * page, but that would kill performance for applications that write data
3096 * line by line, and it's prone to race conditions.
3097 *
3098 * Note that this routine doesn't try to keep track of dirty pages. Each
3099 * file system has to do this all by itself, unfortunately.
3100 * okir@monad.swb.de
3101 */
3102 ssize_t
3103 do_generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
3104 {
3105 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
3106 struct inode *inode = mapping->host;
3107 loff_t pos;
3108 struct page *page, *cached_page;
3109 ssize_t written;
3110 long status = 0;
3111 ssize_t err;
3112 unsigned bytes;
3113
3114 cached_page = NULL;
3115 pos = *ppos;
3116 written = 0;
3117
3118 err = precheck_file_write(file, inode, &count, &pos);
3119 if (err != 0 || count == 0)
3120 goto out;
3121
3122 remove_suid(inode);
3123 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
3124 mark_inode_dirty_sync(inode);
3125
3126 do {
3127 unsigned long index, offset;
3128 long page_fault;
3129 char *kaddr;
3130
3131 /*
3132 * Try to find the page in the cache. If it isn't there,
3133 * allocate a free page.
3134 */
3135 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
3136 index = pos >> PAGE_CACHE_SHIFT;
3137 bytes = PAGE_CACHE_SIZE - offset;
3138 if (bytes > count)
3139 bytes = count;
3140
3141 /*
3142 * Bring in the user page that we will copy from _first_.
3143 * Otherwise there's a nasty deadlock on copying from the
3144 * same page as we're writing to, without it being marked
3145 * up-to-date.
3146 */
3147 { volatile unsigned char dummy;
3148 __get_user(dummy, buf);
3149 __get_user(dummy, buf+bytes-1);
3150 }
3151
3152 status = -ENOMEM; /* we'll assign it later anyway */
3153 page = __grab_cache_page(mapping, index, &cached_page);
3154 if (!page)
3155 break;
3156
3157 /* We have exclusive IO access to the page.. */
3158 if (!PageLocked(page)) {
3159 PAGE_BUG(page);
3160 }
3161
3162 kaddr = kmap(page);
3163 status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
3164 if (status)
3165 goto sync_failure;
3166 page_fault = __copy_from_user(kaddr+offset, buf, bytes);
3167 flush_dcache_page(page);
3168 status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
3169 if (page_fault)
3170 goto fail_write;
3171 if (!status)
3172 status = bytes;
3173
3174 if (status >= 0) {
3175 written += status;
3176 count -= status;
3177 pos += status;
3178 buf += status;
3179 }
3180 unlock:
3181 kunmap(page);
3182 /* Mark it unlocked again and drop the page.. */
3183 SetPageReferenced(page);
3184 UnlockPage(page);
3185 page_cache_release(page);
3186
3187 if (status < 0)
3188 break;
3189 } while (count);
3190 done:
3191 *ppos = pos;
3192
3193 if (cached_page)
3194 page_cache_release(cached_page);
3195
3196 /* For now, when the user asks for O_SYNC, we'll actually
3197 * provide O_DSYNC. */
3198 if (status >= 0) {
3199 if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
3200 status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
3201 }
3202
3203 err = written ? written : status;
3204 out:
3205
3206 return err;
3207 fail_write:
3208 status = -EFAULT;
3209 goto unlock;
3210
3211 sync_failure:
3212 /*
3213 * If blocksize < pagesize, prepare_write() may have instantiated a
3214 * few blocks outside i_size. Trim these off again.
3215 */
3216 kunmap(page);
3217 UnlockPage(page);
3218 page_cache_release(page);
3219 if (pos + bytes > inode->i_size)
3220 vmtruncate(inode, inode->i_size);
3221 goto done;
3222 }
3223
3224 ssize_t
3225 do_generic_direct_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
3226 {
3227 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
3228 struct inode *inode = mapping->host;
3229 loff_t pos;
3230 ssize_t written;
3231 long status = 0;
3232 ssize_t err;
3233
3234 pos = *ppos;
3235 written = 0;
3236
3237 err = precheck_file_write(file, inode, &count, &pos);
3238 if (err != 0 || count == 0)
3239 goto out;
3240
3241 if (!(file->f_flags & O_DIRECT))
3242 BUG();
3243
3244 remove_suid(inode);
3245 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
3246 mark_inode_dirty_sync(inode);
3247
3248 written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
3249 if (written > 0) {
3250 loff_t end = pos + written;
3251 if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
3252 inode->i_size = end;
3253 mark_inode_dirty(inode);
3254 }
3255 *ppos = end;
3256 invalidate_inode_pages2(mapping);
3257 }
3258 /*
3259 * Sync the fs metadata but not the minor inode changes and
3260 * of course not the data as we did direct DMA for the IO.
3261 */
3262 if (written >= 0 && file->f_flags & O_SYNC)
3263 status = generic_osync_inode(inode, OSYNC_METADATA);
3264
3265 err = written ? written : status;
3266 out:
3267 return err;
3268 }
3269
3270 static int do_odirect_fallback(struct file *file, struct inode *inode,
3271 const char *buf, size_t count, loff_t *ppos)
3272 {
3273 ssize_t ret;
3274 int err;
3275
3276 down(&inode->i_sem);
3277 ret = do_generic_file_write(file, buf, count, ppos);
3278 if (ret > 0) {
3279 err = do_fdatasync(file);
3280 if (err)
3281 ret = err;
3282 }
3283 up(&inode->i_sem);
3284 return ret;
3285 }
3286
3287 ssize_t
3288 generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
3289 {
3290 struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
3291 ssize_t err;
3292
3293 if ((ssize_t) count < 0)
3294 return -EINVAL;
3295
3296 if (!access_ok(VERIFY_READ, buf, count))
3297 return -EFAULT;
3298
3299 if (file->f_flags & O_DIRECT) {
3300 /* do_generic_direct_write may drop i_sem during the
3301 actual IO */
3302 down_read(&inode->i_alloc_sem);
3303 down(&inode->i_sem);
3304 err = do_generic_direct_write(file, buf, count, ppos);
3305 up(&inode->i_sem);
3306 up_read(&inode->i_alloc_sem);
3307 if (unlikely(err == -ENOTBLK))
3308 err = do_odirect_fallback(file, inode, buf, count, ppos);
3309 } else {
3310 down(&inode->i_sem);
3311 err = do_generic_file_write(file, buf, count, ppos);
3312 up(&inode->i_sem);
3313 }
3314
3315 return err;
3316 }
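A hedged user-space sketch of an O_DIRECT write feeding the do_generic_direct_write() path above; the 4096-byte alignment and transfer size are assumptions (the real requirement depends on the filesystem and device), and the helper name is made up for the example.

/* Hedged example: write one aligned block, bypassing the page cache. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static int write_block_direct(const char *path, const char *msg)
{
        enum { BLK = 4096 };            /* assumed alignment and size */
        void *buf;
        int fd, ret = -1;

        if (posix_memalign(&buf, BLK, BLK) != 0)
                return -1;
        memset(buf, 0, BLK);
        strncpy(buf, msg, BLK - 1);

        fd = open(path, O_WRONLY | O_CREAT | O_DIRECT, 0644);
        if (fd >= 0) {
                if (write(fd, buf, BLK) == BLK)
                        ret = 0;        /* data went straight to the device */
                close(fd);
        }
        free(buf);
        return ret;
}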
3317
3318 void __init page_cache_init(unsigned long mempages)
3319 {
3320 unsigned long htable_size, order;
3321
3322 htable_size = mempages;
3323 htable_size *= sizeof(struct page *);
3324 for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
3325 ;
3326
3327 do {
3328 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
3329
3330 page_hash_bits = 0;
3331 while((tmp >>= 1UL) != 0UL)
3332 page_hash_bits++;
3333
3334 page_hash_table = (struct page **)
3335 __get_free_pages(GFP_ATOMIC, order);
3336 } while(page_hash_table == NULL && --order > 0);
3337
3338 printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
3339 (1 << page_hash_bits), order, (PAGE_SIZE << order));
3340 if (!page_hash_table)
3341 panic("Failed to allocate page hash table\n");
3342 memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
3343 }
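To make the sizing loop concrete (illustrative numbers only): on a hypothetical 32-bit machine with 128 MiB of RAM (mempages = 32768 with 4 KiB pages) and 4-byte pointers, htable_size is 128 KiB, the first loop picks order 5 (PAGE_SIZE << 5 = 128 KiB), and the second loop derives page_hash_bits = 15, i.e. 32768 hash buckets, one per page of memory. If __get_free_pages() cannot satisfy that order, the do-while retries with progressively smaller orders.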