


FreeBSD/Linux Kernel Cross Reference
sys/mm/madvise.c


/*
 *      linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>

/*
 * Any behaviour that results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing.  Others, which simply traverse vmas, need
 * only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
        switch (behavior) {
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
                return 0;
        default:
                /* be safe, default to 1. list exceptions explicitly */
                return 1;
        }
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
                     struct vm_area_struct **prev,
                     unsigned long start, unsigned long end, int behavior)
{
        struct mm_struct *mm = vma->vm_mm;
        int error = 0;
        pgoff_t pgoff;
        unsigned long new_flags = vma->vm_flags;

        switch (behavior) {
        case MADV_NORMAL:
                new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
                break;
        case MADV_SEQUENTIAL:
                new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
                break;
        case MADV_RANDOM:
                new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
                break;
        case MADV_DONTFORK:
                new_flags |= VM_DONTCOPY;
                break;
        case MADV_DOFORK:
                if (vma->vm_flags & VM_IO) {
                        error = -EINVAL;
                        goto out;
                }
                new_flags &= ~VM_DONTCOPY;
                break;
        case MADV_DONTDUMP:
                new_flags |= VM_DONTDUMP;
                break;
        case MADV_DODUMP:
                if (new_flags & VM_SPECIAL) {
                        error = -EINVAL;
                        goto out;
                }
                new_flags &= ~VM_DONTDUMP;
                break;
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
                error = ksm_madvise(vma, start, end, behavior, &new_flags);
                if (error)
                        goto out;
                break;
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
                error = hugepage_madvise(vma, &new_flags, behavior);
                if (error)
                        goto out;
                break;
        }

        if (new_flags == vma->vm_flags) {
                *prev = vma;
                goto out;
        }

        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
                                vma->vm_file, pgoff, vma_policy(vma));
        if (*prev) {
                vma = *prev;
                goto success;
        }

        *prev = vma;

        if (start != vma->vm_start) {
                error = split_vma(mm, vma, start, 1);
                if (error)
                        goto out;
        }

        if (end != vma->vm_end) {
                error = split_vma(mm, vma, end, 0);
                if (error)
                        goto out;
        }

success:
        /*
         * vm_flags is protected by the mmap_sem held in write mode.
         */
        vma->vm_flags = new_flags;

out:
        if (error == -ENOMEM)
                error = -EAGAIN;
        return error;
}

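A userspace sketch of the splitting behaviour above (illustrative only, not part of this kernel file): flag-changing advice applied to the middle of an anonymous mapping triggers split_vma() at both edges, so one vma becomes three.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char cmd[64];
        char *p = mmap(NULL, 3 * psz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }

        /* VM_DONTCOPY is set on the middle page only, so the kernel
           must split the original vma at p + psz and at p + 2 * psz. */
        if (madvise(p + psz, psz, MADV_DONTFORK)) { perror("madvise"); return 1; }

        /* The range around p now shows up as three adjacent entries. */
        snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
        return system(cmd) ? 1 : 0;
}
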
/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end)
{
        struct file *file = vma->vm_file;

        if (!file)
                return -EBADF;

        if (file->f_mapping->a_ops->get_xip_mem) {
                /* no bad return value, but ignore advice */
                return 0;
        }

        *prev = vma;
        start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
        if (end > vma->vm_end)
                end = vma->vm_end;
        end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

        force_page_cache_readahead(file->f_mapping, file, start, end - start);
        return 0;
}

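For illustration, a minimal userspace use of MADV_WILLNEED (the file path is hypothetical): readahead is kicked off asynchronously, so the call returns before the pages arrive.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        struct stat st;
        int fd = open("/var/tmp/big.dat", O_RDONLY);    /* hypothetical file */
        if (fd < 0 || fstat(fd, &st) < 0) { perror("open/fstat"); return 1; }

        char *p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }

        /* Ends up in force_page_cache_readahead(); no wait for completion. */
        if (madvise(p, st.st_size, MADV_WILLNEED))
                perror("madvise(MADV_WILLNEED)");

        /* ... subsequent reads of p[] are likely to hit the page cache. */
        munmap(p, st.st_size);
        close(fd);
        return 0;
}
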
/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end)
{
        *prev = vma;
        if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
                return -EINVAL;

        if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
                struct zap_details details = {
                        .nonlinear_vma = vma,
                        .last_index = ULONG_MAX,
                };
                zap_page_range(vma, start, end - start, &details);
        } else
                zap_page_range(vma, start, end - start, NULL);
        return 0;
}

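A small demonstration of the discard semantics described above: dirty anonymous pages are simply thrown away, and the next touch faults in fresh zero-filled memory.

#include <assert.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) return 1;

        memset(p, 0xaa, psz);                   /* dirty the page */
        if (madvise(p, psz, MADV_DONTNEED))     /* zap_page_range() discards it */
                return 1;

        /* The 0xaa data is gone; the refault maps a zero page, which is
           why databases must commit before issuing this advice. */
        assert(p[0] == 0);
        return 0;
}
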
/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 *
 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
 * Other filesystems return -ENOSYS.
 */
static long madvise_remove(struct vm_area_struct *vma,
                                struct vm_area_struct **prev,
                                unsigned long start, unsigned long end)
{
        loff_t offset;
        int error;
        struct file *f;

        *prev = NULL;   /* tell sys_madvise we drop mmap_sem */

        if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
                return -EINVAL;

        f = vma->vm_file;

        if (!f || !f->f_mapping || !f->f_mapping->host)
                return -EINVAL;

        if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
                return -EACCES;

        offset = (loff_t)(start - vma->vm_start)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

        /*
         * Filesystem's fallocate may need to take i_mutex.  We need to
         * explicitly grab a reference because the vma (and hence the
         * vma's reference to the file) can go away as soon as we drop
         * mmap_sem.
         */
        get_file(f);
        up_read(&current->mm->mmap_sem);
        error = do_fallocate(f,
                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                                offset, end - start);
        fput(f);
        down_read(&current->mm->mmap_sem);
        return error;
}

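Illustrative only (the tmpfs path is hypothetical): punching a hole with MADV_REMOVE frees both the pages and the shmfs backing store; on most other filesystems of this era the call fails with ENOSYS.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        int fd = open("/dev/shm/demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
        if (fd < 0 || ftruncate(fd, 4 * psz) < 0) { perror("setup"); return 1; }

        /* The mapping must be shared and writable, or madvise_remove()
           returns -EACCES. */
        char *p = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }
        memset(p, 'x', 4 * psz);

        if (madvise(p + psz, 2 * psz, MADV_REMOVE))
                perror("madvise(MADV_REMOVE)");
        printf("byte in hole after punch: %d\n", p[psz]);   /* prints 0 */

        unlink("/dev/shm/demo");
        return 0;
}
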
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
{
        int ret = 0;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        for (; start < end; start += PAGE_SIZE) {
                struct page *p;

                ret = get_user_pages_fast(start, 1, 0, &p);
                if (ret != 1)
                        return ret;
                ret = 0;
                if (bhv == MADV_SOFT_OFFLINE) {
                        printk(KERN_INFO "Soft offlining page %lx at %lx\n",
                                page_to_pfn(p), start);
                        ret = soft_offline_page(p, MF_COUNT_INCREASED);
                        if (ret)
                                break;
                        continue;
                }
                printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
                       page_to_pfn(p), start);
                /* Ignore return value for now */
                memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
        }
        return ret;
}
#endif

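A hedged sketch of the injection interface (requires CAP_SYS_ADMIN and a kernel built with CONFIG_MEMORY_FAILURE; the fallback constant is taken from asm-generic/mman-common.h in case the libc headers predate it):

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101   /* asm-generic/mman-common.h */
#endif

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) return 1;
        p[0] = 1;       /* populate the page so there is a frame to offline */

        /* Unlike MADV_HWPOISON, soft offlining migrates the contents away
           and retires the page frame without harming the process. */
        if (madvise(p, psz, MADV_SOFT_OFFLINE))
                perror("madvise(MADV_SOFT_OFFLINE)");   /* EPERM if unprivileged */
        return 0;
}
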
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
                unsigned long start, unsigned long end, int behavior)
{
        switch (behavior) {
        case MADV_REMOVE:
                return madvise_remove(vma, prev, start, end);
        case MADV_WILLNEED:
                return madvise_willneed(vma, prev, start, end);
        case MADV_DONTNEED:
                return madvise_dontneed(vma, prev, start, end);
        default:
                return madvise_behavior(vma, prev, start, end, behavior);
        }
}

static int
madvise_behavior_valid(int behavior)
{
        switch (behavior) {
        case MADV_DOFORK:
        case MADV_DONTFORK:
        case MADV_NORMAL:
        case MADV_SEQUENTIAL:
        case MADV_RANDOM:
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
#ifdef CONFIG_KSM
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
#endif
        case MADV_DONTDUMP:
        case MADV_DODUMP:
                return 1;

        default:
                return 0;
        }
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *              results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *              on any access, since it is unlikely that the
 *              application will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *              once, so they can be aggressively read ahead, and
 *              can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *              some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *              so the kernel can free resources associated with it.
 *  MADV_REMOVE - the application wants to free up the given range of
 *              pages and associated backing store.
 *  MADV_DONTFORK - omit this area from the child's address space when forking:
 *              typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *              this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants the given range backed by
 *              transparent huge pages where possible.
 *  MADV_NOHUGEPAGE - mark the given range as not worth backing with
 *              transparent huge pages.
 *  MADV_DONTDUMP - exclude the given range from core dumps.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: include the range in core dumps again.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *              "behavior" is not a valid value, or application
 *              is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *              mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
        unsigned long end, tmp;
        struct vm_area_struct *vma, *prev;
        int unmapped_error = 0;
        int error = -EINVAL;
        int write;
        size_t len;

#ifdef CONFIG_MEMORY_FAILURE
        if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
                return madvise_hwpoison(behavior, start, start+len_in);
#endif
        if (!madvise_behavior_valid(behavior))
                return error;

        write = madvise_need_mmap_write(behavior);
        if (write)
                down_write(&current->mm->mmap_sem);
        else
                down_read(&current->mm->mmap_sem);

        if (start & ~PAGE_MASK)
                goto out;
        len = (len_in + ~PAGE_MASK) & PAGE_MASK;

        /* Check to see whether len was rounded up from small -ve to zero */
        if (len_in && !len)
                goto out;

        end = start + len;
        if (end < start)
                goto out;

        error = 0;
        if (end == start)
                goto out;

        /*
         * If the interval [start,end) covers some unmapped address
         * ranges, just ignore them, but return -ENOMEM at the end.
         * - different from the way of handling in mlock etc.
         */
        vma = find_vma_prev(current->mm, start, &prev);
        if (vma && start > vma->vm_start)
                prev = vma;

        for (;;) {
                /* Still start < end. */
                error = -ENOMEM;
                if (!vma)
                        goto out;

                /* Here start < (end|vma->vm_end). */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;
                        if (start >= end)
                                goto out;
                }

                /* Here vma->vm_start <= start < (end|vma->vm_end) */
                tmp = vma->vm_end;
                if (end < tmp)
                        tmp = end;

                /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
                error = madvise_vma(vma, &prev, start, tmp, behavior);
                if (error)
                        goto out;
                start = tmp;
                if (prev && start < prev->vm_end)
                        start = prev->vm_end;
                error = unmapped_error;
                if (start >= end)
                        goto out;
                if (prev)
                        vma = prev->vm_next;
                else    /* madvise_remove dropped mmap_sem */
                        vma = find_vma(current->mm, start);
        }
out:
        if (write)
                up_write(&current->mm->mmap_sem);
        else
                up_read(&current->mm->mmap_sem);

        return error;
}
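To close, an illustrative caller exercising the loop above, including its distinctive hole handling: advice is still applied to every mapped piece of the range, but an unmapped gap is reported as ENOMEM at the end.

#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, 3 * psz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) return 1;

        /* Ordinary success: mark the whole range for aggressive readahead. */
        if (madvise(p, 3 * psz, MADV_SEQUENTIAL))
                perror("MADV_SEQUENTIAL");

        /* Unmap the middle page, leaving a hole in [start, end). */
        munmap(p + psz, psz);

        /* The advice is applied to both surviving vmas, yet the return
           value is -1/ENOMEM because the interval covered a gap. */
        if (madvise(p, 3 * psz, MADV_NORMAL) && errno == ENOMEM)
                printf("advice applied; gap reported as ENOMEM\n");
        return 0;
}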





This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.