X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=include%2Flinux%2Fmm.h;h=a17b147c61e74853e51268c3eaf144aee8c37006;hb=3cec556a84be02bcd8755422eec61f1b9bee4e2f;hp=7d20b25c58fcb055f1644759af6552dd394f3636;hpb=725d704ecaca4a43f067092c140d4f3271cf2856;p=safe%2Fjmp%2Flinux-2.6 diff --git a/include/linux/mm.h b/include/linux/mm.h index 7d20b25..a17b147 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -15,6 +15,8 @@ #include #include #include +#include +#include struct mempolicy; struct anon_vma; @@ -112,6 +114,8 @@ struct vm_area_struct { #endif }; +extern struct kmem_cache *vm_area_cachep; + /* * This struct defines the per-mm list of VMAs for uClinux. If CONFIG_MMU is * disabled, then there's a single shared list of VMAs maintained by the @@ -197,6 +201,7 @@ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type); + unsigned long (*nopfn)(struct vm_area_struct * area, unsigned long address); int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); /* notification that a previously read-only page is about to become @@ -214,61 +219,6 @@ struct vm_operations_struct { struct mmu_gather; struct inode; -/* - * Each physical page in the system has a struct page associated with - * it to keep track of whatever it is we are using the page for at the - * moment. Note that we have no way to track which tasks are using - * a page. - */ -struct page { - unsigned long flags; /* Atomic flags, some possibly - * updated asynchronously */ - atomic_t _count; /* Usage count, see below. */ - atomic_t _mapcount; /* Count of ptes mapped in mms, - * to show when page is mapped - * & limit reverse map searches. - */ - union { - struct { - unsigned long private; /* Mapping-private opaque data: - * usually used for buffer_heads - * if PagePrivate set; used for - * swp_entry_t if PageSwapCache; - * indicates order in the buddy - * system if PG_buddy is set. - */ - struct address_space *mapping; /* If low bit clear, points to - * inode address_space, or NULL. - * If page mapped as anonymous - * memory, low bit is set, and - * it points to anon_vma object: - * see PAGE_MAPPING_ANON below. - */ - }; -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS - spinlock_t ptl; -#endif - }; - pgoff_t index; /* Our offset within mapping. */ - struct list_head lru; /* Pageout list, eg. active_list - * protected by zone->lru_lock ! - */ - /* - * On machines where all RAM is mapped into kernel address space, - * we can simply calculate the virtual address. On machines with - * highmem some memory is mapped into kernel virtual memory - * dynamically, so we need a place to store that address. - * Note that this field could be 16 bits on x86 ... ;) - * - * Architectures with slow multiplication can define - * WANT_PAGE_VIRTUAL in asm/page.h - */ -#if defined(WANT_PAGE_VIRTUAL) - void *virtual; /* Kernel virtual address (NULL if - not kmapped, ie. 
highmem) */ -#endif /* WANT_PAGE_VIRTUAL */ -}; - #define page_private(page) ((page)->private) #define set_page_private(page, v) ((page)->private = (v)) @@ -298,8 +248,7 @@ struct page { */ /* - * Drop a ref, return true if the logical refcount fell to zero (the page has - * no users) + * Drop a ref, return true if the refcount fell to zero (the page has no users) */ static inline int put_page_testzero(struct page *page) { @@ -317,8 +266,6 @@ static inline int get_page_unless_zero(struct page *page) return atomic_inc_not_zero(&page->_count); } -extern void FASTCALL(__page_cache_release(struct page *)); - static inline int page_count(struct page *page) { if (unlikely(PageCompound(page))) @@ -349,6 +296,24 @@ void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); /* + * Compound pages have a destructor function. Provide a + * prototype for that function and accessor functions. + * These are _only_ valid on the head of a PG_compound page. + */ +typedef void compound_page_dtor(struct page *); + +static inline void set_compound_page_dtor(struct page *page, + compound_page_dtor *dtor) +{ + page[1].lru.next = (void *)dtor; +} + +static inline compound_page_dtor *get_compound_page_dtor(struct page *page) +{ + return (compound_page_dtor *)page[1].lru.next; +} + +/* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of * zeroes, and text pages of executables and shared libraries have @@ -357,43 +322,55 @@ void split_page(struct page *page, unsigned int order); * For the non-reserved pages, page_count(page) denotes a reference count. * page_count() == 0 means the page is free. page->lru is then used for * freelist management in the buddy allocator. - * page_count() == 1 means the page is used for exactly one purpose - * (e.g. a private data page of one process). + * page_count() > 0 means the page has been allocated. * - * A page may be used for kmalloc() or anyone else who does a - * __get_free_page(). In this case the page_count() is at least 1, and - * all other fields are unused but should be 0 or NULL. The - * management of this page is the responsibility of the one who uses - * it. + * Pages are allocated by the slab allocator in order to provide memory + * to kmalloc and kmem_cache_alloc. In this case, the management of the + * page, and the fields in 'struct page' are the responsibility of mm/slab.c + * unless a particular usage is carefully commented. (the responsibility of + * freeing the kmalloc memory is the caller's, of course). * - * The other pages (we may call them "process pages") are completely + * A page may be used by anyone else who does a __get_free_page(). + * In this case, page_count still tracks the references, and should only + * be used through the normal accessor functions. The top bits of page->flags + * and page->virtual store page management information, but all other fields + * are unused and could be used privately, carefully. The management of this + * page is the responsibility of the one who allocated it, and those who have + * subsequently been given references to it. + * + * The other pages (we may call them "pagecache pages") are completely * managed by the Linux memory manager: I/O, buffers, swapping etc. * The following discussion applies only to them. * - * A page may belong to an inode's memory mapping. 
In this case, - * page->mapping is the pointer to the inode, and page->index is the - * file offset of the page, in units of PAGE_CACHE_SIZE. + * A pagecache page contains an opaque `private' member, which belongs to the + * page's address_space. Usually, this is the address of a circular list of + * the page's disk buffers. PG_private must be set to tell the VM to call + * into the filesystem to release these pages. * - * A page contains an opaque `private' member, which belongs to the - * page's address_space. Usually, this is the address of a circular - * list of the page's disk buffers. + * A page may belong to an inode's memory mapping. In this case, page->mapping + * is the pointer to the inode, and page->index is the file offset of the page, + * in units of PAGE_CACHE_SIZE. * - * For pages belonging to inodes, the page_count() is the number of - * attaches, plus 1 if `private' contains something, plus one for - * the page cache itself. + * If pagecache pages are not associated with an inode, they are said to be + * anonymous pages. These may become associated with the swapcache, and in that + * case PG_swapcache is set, and page->private is an offset into the swapcache. * - * Instead of keeping dirty/clean pages in per address-space lists, we instead - * now tag pages as dirty/under writeback in the radix tree. + * In either case (swapcache or inode backed), the pagecache itself holds one + * reference to the page. Setting PG_private should also increment the + * refcount. The each user mapping also has a reference to the page. * - * There is also a per-mapping radix tree mapping index to the page - * in memory if present. The tree is rooted at mapping->root. + * The pagecache pages are stored in a per-mapping radix tree, which is + * rooted at mapping->page_tree, and indexed by offset. + * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space + * lists, we instead now tag pages as dirty/writeback in the radix tree. * - * All process pages can do I/O: + * All pagecache pages may be subject to I/O: * - inode pages may need to be read from disk, * - inode pages which have been modified and are MAP_SHARED may need - * to be written to disk, - * - private pages which have been modified may need to be swapped out - * to swap space and (later) to be read back into memory. + * to be written back to the inode on disk, + * - anonymous pages (including MAP_PRIVATE file mappings) which have been + * modified may need to be swapped out to swap space and (later) to be read + * back into memory. */ /* @@ -439,7 +416,9 @@ void split_page(struct page *page, unsigned int order); * We are going to use the flags for the page to node mapping if its in * there. This includes the case where there is no node, so it is implicit. */ -#define FLAGS_HAS_NODE (NODES_WIDTH > 0 || NODES_SHIFT == 0) +#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0) +#define NODE_NOT_IN_PAGE_FLAGS +#endif #ifndef PFN_SECTION_SHIFT #define PFN_SECTION_SHIFT 0 @@ -454,13 +433,18 @@ void split_page(struct page *page, unsigned int order); #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -/* NODE:ZONE or SECTION:ZONE is used to lookup the zone from a page. 
*/ -#if FLAGS_HAS_NODE -#define ZONETABLE_SHIFT (NODES_SHIFT + ZONES_SHIFT) +/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allcator */ +#ifdef NODE_NOT_IN_PAGEFLAGS +#define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) #else -#define ZONETABLE_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) +#define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) +#endif + +#if ZONES_WIDTH > 0 +#define ZONEID_PGSHIFT ZONES_PGSHIFT +#else +#define ZONEID_PGSHIFT NODES_PGOFF #endif -#define ZONETABLE_PGSHIFT ZONES_PGSHIFT #if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED #error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED @@ -469,54 +453,74 @@ void split_page(struct page *page, unsigned int order); #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -#define ZONETABLE_MASK ((1UL << ZONETABLE_SHIFT) - 1) +#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) -static inline unsigned long page_zonenum(struct page *page) +static inline enum zone_type page_zonenum(struct page *page) { return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } -struct zone; -extern struct zone *zone_table[]; - +/* + * The identification function is only used by the buddy allocator for + * determining if two pages could be buddies. We are not really + * identifying a zone since we could be using a the section number + * id if we have not node id available in page flags. + * We guarantee only that it will return the same value for two + * combinable pages in a zone. + */ static inline int page_zone_id(struct page *page) { - return (page->flags >> ZONETABLE_PGSHIFT) & ZONETABLE_MASK; + BUILD_BUG_ON(ZONEID_PGSHIFT == 0 && ZONEID_MASK); + return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; } -static inline struct zone *page_zone(struct page *page) + +static inline int zone_to_nid(struct zone *zone) +{ +#ifdef CONFIG_NUMA + return zone->node; +#else + return 0; +#endif +} + +#ifdef NODE_NOT_IN_PAGE_FLAGS +extern int page_to_nid(struct page *page); +#else +static inline int page_to_nid(struct page *page) { - return zone_table[page_zone_id(page)]; + return (page->flags >> NODES_PGSHIFT) & NODES_MASK; } +#endif -static inline unsigned long page_to_nid(struct page *page) +static inline struct zone *page_zone(struct page *page) { - if (FLAGS_HAS_NODE) - return (page->flags >> NODES_PGSHIFT) & NODES_MASK; - else - return page_zone(page)->zone_pgdat->node_id; + return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; } + static inline unsigned long page_to_section(struct page *page) { return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } -static inline void set_page_zone(struct page *page, unsigned long zone) +static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; } + static inline void set_page_node(struct page *page, unsigned long node) { page->flags &= ~(NODES_MASK << NODES_PGSHIFT); page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } + static inline void set_page_section(struct page *page, unsigned long section) { page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; } -static inline void set_page_links(struct page *page, unsigned long zone, +static inline void set_page_links(struct page *page, enum zone_type zone, unsigned long node, unsigned long pfn) { set_page_zone(page, zone); @@ -529,11 +533,6 @@ static inline void 
set_page_links(struct page *page, unsigned long zone, */ #include -#ifndef CONFIG_DISCONTIGMEM -/* The array of struct pages - for discontigmem use pgdat->lmem_map */ -extern struct page *mem_map; -#endif - static __always_inline void *lowmem_page_address(struct page *page) { return __va(page_to_pfn(page) << PAGE_SHIFT); @@ -631,6 +630,13 @@ static inline int page_mapped(struct page *page) */ #define NOPAGE_SIGBUS (NULL) #define NOPAGE_OOM ((struct page *) (-1)) +#define NOPAGE_REFAULT ((struct page *) (-2)) /* Return to userspace, rerun */ + +/* + * Error return values for the *_nopfn functions + */ +#define NOPFN_SIGBUS ((unsigned long) -1) +#define NOPFN_OOM ((unsigned long) -2) /* * Different kinds of faults, as returned by handle_mm_fault(). @@ -775,7 +781,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long); -int __set_page_dirty_buffers(struct page *page); +extern int try_to_release_page(struct page * page, gfp_t gfp_mask); +extern void do_invalidatepage(struct page *page, unsigned long offset); + int __set_page_dirty_nobuffers(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); @@ -810,6 +818,39 @@ struct shrinker; extern struct shrinker *set_shrinker(int, shrinker_t); extern void remove_shrinker(struct shrinker *shrinker); +/* + * Some shared mappigns will want the pages marked read-only + * to track write events. If so, we'll downgrade vm_page_prot + * to the private version (using protection_map[] without the + * VM_SHARED bit). + */ +static inline int vma_wants_writenotify(struct vm_area_struct *vma) +{ + unsigned int vm_flags = vma->vm_flags; + + /* If it was private or non-writable, the write bit is already clear */ + if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) + return 0; + + /* The backer wishes to know when pages are first written to? */ + if (vma->vm_ops && vma->vm_ops->page_mkwrite) + return 1; + + /* The open routine did something to the protections already? */ + if (pgprot_val(vma->vm_page_prot) != + pgprot_val(protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)])) + return 0; + + /* Specialty mapping? */ + if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) + return 0; + + /* Can the mapping track the dirty pages? */ + return vma->vm_file && vma->vm_file->f_mapping && + mapping_cap_account_dirty(vma->vm_file->f_mapping); +} + extern pte_t *FASTCALL(get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)); int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); @@ -887,6 +928,56 @@ extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, pg_data_t *pgdat, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +/* + * With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its + * zones, allocate the backing mem_map and account for memory holes in a more + * architecture independent manner. This is a substitute for creating the + * zone_sizes[] and zholes_size[] arrays and passing them to + * free_area_init_node() + * + * An architecture is expected to register range of page frames backed by + * physical memory with add_active_range() before calling + * free_area_init_nodes() passing in the PFN each zone ends at. 
At a basic + * usage, an architecture is expected to do something like + * + * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, + * max_highmem_pfn}; + * for_each_valid_physical_page_range() + * add_active_range(node_id, start_pfn, end_pfn) + * free_area_init_nodes(max_zone_pfns); + * + * If the architecture guarantees that there are no holes in the ranges + * registered with add_active_range(), free_bootmem_active_regions() + * will call free_bootmem_node() for each registered physical page range. + * Similarly sparse_memory_present_with_active_regions() calls + * memory_present() for each range when SPARSEMEM is enabled. + * + * See mm/page_alloc.c for more information on each function exposed by + * CONFIG_ARCH_POPULATES_NODE_MAP + */ +extern void free_area_init_nodes(unsigned long *max_zone_pfn); +extern void add_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn); +extern void shrink_active_range(unsigned int nid, unsigned long old_end_pfn, + unsigned long new_end_pfn); +extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn); +extern void remove_all_active_ranges(void); +extern unsigned long absent_pages_in_range(unsigned long start_pfn, + unsigned long end_pfn); +extern void get_pfn_range_for_nid(unsigned int nid, + unsigned long *start_pfn, unsigned long *end_pfn); +extern unsigned long find_min_pfn_with_active_regions(void); +extern unsigned long find_max_pfn_with_active_regions(void); +extern void free_bootmem_with_active_regions(int nid, + unsigned long max_low_pfn); +extern void sparse_memory_present_with_active_regions(int nid); +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +extern int early_pfn_to_nid(unsigned long pfn); +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long); extern void setup_per_zone_pages_min(void); extern void mem_init(void); @@ -1047,12 +1138,7 @@ static inline void vm_stat_account(struct mm_struct *mm, #ifndef CONFIG_DEBUG_PAGEALLOC static inline void -kernel_map_pages(struct page *page, int numpages, int enable) -{ - if (!PageHighMem(page) && !enable) - debug_check_no_locks_freed(page_address(page), - numpages * PAGE_SIZE); -} +kernel_map_pages(struct page *page, int numpages, int enable) {} #endif extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk); @@ -1064,9 +1150,6 @@ int in_gate_area_no_task(unsigned long addr); #define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);}) #endif /* __HAVE_ARCH_GATE_AREA */ -/* /proc//oom_adj set to -17 protects from the oom-killer */ -#define OOM_DISABLE -17 - int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, @@ -1080,7 +1163,7 @@ void drop_slab(void); extern int randomize_va_space; #endif -const char *arch_vma_name(struct vm_area_struct *vma); +__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */
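
Notes on the new interfaces introduced above (illustrative C sketches, not part of the diff):

1) The new vm_operations_struct->nopfn hook, together with the NOPFN_SIGBUS/NOPFN_OOM return values, lets a VM_PFNMAP-style mapping with no struct page backing resolve faults by handing a raw page frame number back to the core fault path. A minimal sketch of a driver-side handler follows; the device structure and its fields (my_dev, aperture_base_pfn, aperture_pages) are hypothetical, and the assumption that the generic fault code installs the returned pfn itself comes from my reading of the companion mm/memory.c change, not from this header.

#include <linux/mm.h>

/* Hypothetical device state -- not defined anywhere in this patch. */
struct my_dev {
	unsigned long aperture_base_pfn;	/* first pfn of the device aperture */
	unsigned long aperture_pages;		/* aperture size in pages */
};

static unsigned long my_vma_nopfn(struct vm_area_struct *vma,
				  unsigned long address)
{
	struct my_dev *dev = vma->vm_private_data;
	unsigned long pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
				+ vma->vm_pgoff;

	if (pgoff >= dev->aperture_pages)
		return NOPFN_SIGBUS;	/* fault outside the aperture */

	/* Return the raw pfn; the core fault path is expected to map it. */
	return dev->aperture_base_pfn + pgoff;
}

static struct vm_operations_struct my_vm_ops = {
	.nopfn	= my_vma_nopfn,
};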
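
2) set_compound_page_dtor()/get_compound_page_dtor() stash a destructor pointer in the otherwise-unused page[1].lru.next of a compound page, so whoever assembled the compound page controls how it is torn down; as the comment says, they are only valid on the head of a PG_compound page. A purely illustrative sketch, with a hypothetical my_pool_release() standing in for whatever the real owner (the hugetlb or page allocator paths in-tree) would do:

#include <linux/mm.h>

extern void my_pool_release(struct page *head);	/* hypothetical helper */

/* Destructor run when the compound page is finally released. */
static void pool_compound_dtor(struct page *head)
{
	my_pool_release(head);
}

static void pool_prepare_compound(struct page *head)
{
	/* Valid only on the head of a PG_compound page. */
	set_compound_page_dtor(head, pool_compound_dtor);
}

/* Roughly mirrors what the release path does once the refcount hits zero. */
static void pool_destroy_compound(struct page *head)
{
	compound_page_dtor *dtor = get_compound_page_dtor(head);

	dtor(head);
}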
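
3) The CONFIG_ARCH_POPULATES_NODE_MAP comment above already sketches the expected calling sequence; below is a slightly fuller version of that same sketch. The early_region[]/nr_early_regions table stands in for whatever firmware memory map the architecture parses, and the zone-limit parameters are placeholders; none of these names are part of the interface itself.

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>

/* Stand-in for the architecture's parsed firmware memory map. */
struct early_region {
	int nid;
	unsigned long start_pfn;
	unsigned long end_pfn;
};
extern struct early_region early_region[];
extern int nr_early_regions;

void __init example_paging_init(unsigned long max_dma_pfn,
				unsigned long max_low_pfn,
				unsigned long max_pfn)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };
	int i;

	/* Register every range of pfns that is backed by real memory. */
	for (i = 0; i < nr_early_regions; i++)
		add_active_range(early_region[i].nid,
				 early_region[i].start_pfn,
				 early_region[i].end_pfn);

	/*
	 * Hand over only the highest pfn of each zone; the core derives
	 * per-node zone sizes and memory holes from the registered ranges.
	 */
	max_zone_pfns[ZONE_DMA]		= max_dma_pfn;
	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn;
#ifdef CONFIG_HIGHMEM
	max_zone_pfns[ZONE_HIGHMEM]	= max_pfn;
#endif
	free_area_init_nodes(max_zone_pfns);
}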