diff --git a/Documentation/ABI/stable/thermal-notification b/Documentation/ABI/stable/thermal-notification new file mode 100644 index 0000000..9723e8b --- /dev/null +++ b/Documentation/ABI/stable/thermal-notification @@ -0,0 +1,4 @@ +What: A notification mechanism for thermal related events +Description: + This interface enables notification for thermal related events. + The notification is in the form of a netlink event. diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt index 69dd29e..b2bea15 100644 --- a/Documentation/IPMI.txt +++ b/Documentation/IPMI.txt @@ -533,6 +533,33 @@ completion during sending a panic event. Other Pieces ------------ +Get the detailed info related with the IPMI device +-------------------------------------------------- + +Some users need more detailed information about a device, like where +the address came from or the raw base device for the IPMI interface. +You can use the IPMI smi_watcher to catch the IPMI interfaces as they +come or go, and to grab the information, you can use the function +ipmi_get_smi_info(), which returns the following structure: + +struct ipmi_smi_info { + enum ipmi_addr_src addr_src; + struct device *dev; + union { + struct { + void *acpi_handle; + } acpi_info; + } addr_info; +}; + +Currently special info for only for SI_ACPI address sources is +returned. Others may be added as necessary. + +Note that the dev pointer is included in the above structure, and +assuming ipmi_smi_get_info returns success, you must call put_device +on the dev pointer. + + Watchdog -------- diff --git a/Documentation/acpi/apei/output_format.txt b/Documentation/acpi/apei/output_format.txt new file mode 100644 index 0000000..9146952 --- /dev/null +++ b/Documentation/acpi/apei/output_format.txt @@ -0,0 +1,122 @@ + APEI output format + ~~~~~~~~~~~~~~~~~~ + +APEI uses printk as hardware error reporting interface, the output +format is as follow. + + := +APEI generic hardware error status +severity: , +section: , severity: , +flags: +
+fru_id: +fru_text: +section_type:
+
+ +* := recoverable | fatal | corrected | info + +
# := +[primary][, containment warning][, reset][, threshold exceeded]\ +[, resource not accessible][, latent error] + +
:= generic processor error | memory error | \ +PCIe error | unknown, + +
:= + | | \ + | + + := +[processor_type: , ] +[processor_isa: , ] +[error_type: +] +[operation: , ] +[flags: +] +[level: ] +[version_info: ] +[processor_id: ] +[target_address: ] +[requestor_id: ] +[responder_id: ] +[IP: ] + +* := IA32/X64 | IA64 + +* := IA32 | IA64 | X64 + +# := +[cache error][, TLB error][, bus error][, micro-architectural error] + +* := unknown or generic | data read | data write | \ +instruction execution + +# := +[restartable][, precise IP][, overflow][, corrected] + + := +[error_status: ] +[physical_address: ] +[physical_address_mask: ] +[node: ] +[card: ] +[module: ] +[bank: ] +[device: ] +[row: ] +[column: ] +[bit_position: ] +[requestor_id: ] +[responder_id: ] +[target_id: ] +[error_type: , ] + +* := +unknown | no error | single-bit ECC | multi-bit ECC | \ +single-symbol chipkill ECC | multi-symbol chipkill ECC | master abort | \ +target abort | parity error | watchdog timeout | invalid address | \ +mirror Broken | memory sparing | scrub corrected error | \ +scrub uncorrected error + + := +[port_type: , ] +[version: .] +[command: , status: ] +[device_id: ::. +slot: +secondary_bus: +vendor_id: , device_id: +class_code: ] +[serial number: , ] +[bridge: secondary_status: , control: ] + +* := PCIe end point | legacy PCI end point | \ +unknown | unknown | root port | upstream switch port | \ +downstream switch port | PCIe to PCI/PCI-X bridge | \ +PCI/PCI-X to PCIe bridge | root complex integrated endpoint device | \ +root complex event collector + +Where, [] designate corresponding content is optional + +All description with * has the following format: + +field: , + +Where value of should be the position of "string" in description. Otherwise, will be "unknown". + +All description with # has the following format: + +field: + + +Where each string in corresponding to one set bit of +. The bit position is the position of "string" in description. + +For more detailed explanation of every field, please refer to UEFI +specification version 2.3 or later, section Appendix N: Common +Platform Error Record. diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 7781857..bac328c 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -385,6 +385,10 @@ mapped_file - # of bytes of mapped file (includes tmpfs/shmem) pgpgin - # of pages paged in (equivalent to # of charging events). pgpgout - # of pages paged out (equivalent to # of uncharging events). swap - # of bytes of swap usage +dirty - # of bytes that are waiting to get written back to the disk. +writeback - # of bytes that are actively being written back to the disk. +nfs_unstable - # of bytes sent to the NFS server, but not yet committed to + the actual storage. inactive_anon - # of bytes of anonymous memory and swap cache memory on LRU list. active_anon - # of bytes of anonymous and swap cache memory on active @@ -406,6 +410,9 @@ total_mapped_file - sum of all children's "cache" total_pgpgin - sum of all children's "pgpgin" total_pgpgout - sum of all children's "pgpgout" total_swap - sum of all children's "swap" +total_dirty - sum of all children's "dirty" +total_writeback - sum of all children's "writeback" +total_nfs_unstable - sum of all children's "nfs_unstable" total_inactive_anon - sum of all children's "inactive_anon" total_active_anon - sum of all children's "active_anon" total_inactive_file - sum of all children's "inactive_file" @@ -453,6 +460,73 @@ memory under it will be reclaimed. You can reset failcnt by writing 0 to failcnt file. # echo 0 > .../memory.failcnt +5.5 dirty memory + +Control the maximum amount of dirty pages a cgroup can have at any given time. + +Limiting dirty memory is like fixing the max amount of dirty (hard to reclaim) +page cache used by a cgroup. So, in case of multiple cgroup writers, they will +not be able to consume more than their designated share of dirty pages and will +be forced to perform write-out if they cross that limit. + +The interface is equivalent to the procfs interface: /proc/sys/vm/dirty_*. It +is possible to configure a limit to trigger both a direct writeback or a +background writeback performed by per-bdi flusher threads. The root cgroup +memory.dirty_* control files are read-only and match the contents of +the /proc/sys/vm/dirty_* files. + +Per-cgroup dirty limits can be set using the following files in the cgroupfs: + +- memory.dirty_ratio: the amount of dirty memory (expressed as a percentage of + cgroup memory) at which a process generating dirty pages will itself start + writing out dirty data. + +- memory.dirty_limit_in_bytes: the amount of dirty memory (expressed in bytes) + in the cgroup at which a process generating dirty pages will start itself + writing out dirty data. Suffix (k, K, m, M, g, or G) can be used to indicate + that value is kilo, mega or gigabytes. + + Note: memory.dirty_limit_in_bytes is the counterpart of memory.dirty_ratio. + Only one of them may be specified at a time. When one is written it is + immediately taken into account to evaluate the dirty memory limits and the + other appears as 0 when read. + +- memory.dirty_background_ratio: the amount of dirty memory of the cgroup + (expressed as a percentage of cgroup memory) at which background writeback + kernel threads will start writing out dirty data. + +- memory.dirty_background_limit_in_bytes: the amount of dirty memory (expressed + in bytes) in the cgroup at which background writeback kernel threads will + start writing out dirty data. Suffix (k, K, m, M, g, or G) can be used to + indicate that value is kilo, mega or gigabytes. + + Note: memory.dirty_background_limit_in_bytes is the counterpart of + memory.dirty_background_ratio. Only one of them may be specified at a time. + When one is written it is immediately taken into account to evaluate the dirty + memory limits and the other appears as 0 when read. + +A cgroup may contain more dirty memory than its dirty limit. This is possible +because of the principle that the first cgroup to touch a page is charged for +it. Subsequent page counting events (dirty, writeback, nfs_unstable) are also +counted to the originally charged cgroup. + +Example: If page is allocated by a cgroup A task, then the page is charged to +cgroup A. If the page is later dirtied by a task in cgroup B, then the cgroup A +dirty count will be incremented. If cgroup A is over its dirty limit but cgroup +B is not, then dirtying a cgroup A page from a cgroup B task may push cgroup A +over its dirty limit without throttling the dirtying cgroup B task. + +When use_hierarchy=0, each cgroup has dirty memory usage and limits. +System-wide dirty limits are also consulted. Dirty memory consumption is +checked against both system-wide and per-cgroup dirty limits. + +The current implementation does not enforce per-cgroup dirty limits when +use_hierarchy=1. System-wide dirty limits are used for processes in such +cgroups. Attempts to read memory.dirty_* files return the system-wide +values. Writes to the memory.dirty_* files return error. An enhanced +implementation is needed to check the chain of parents to ensure that no +dirty limit is exceeded. + 6. Hierarchy support The memory controller supports a deep hierarchy and hierarchical accounting. diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt index 524de92..59293ac 100644 --- a/Documentation/device-mapper/dm-crypt.txt +++ b/Documentation/device-mapper/dm-crypt.txt @@ -8,7 +8,7 @@ Parameters: Encryption cipher and an optional IV generation mode. - (In format cipher-chainmode-ivopts:ivmode). + (In format cipher[:keycount]-chainmode-ivopts:ivmode). Examples: des aes-cbc-essiv:sha256 @@ -20,6 +20,11 @@ Parameters: Key used for encryption. It is encoded as a hexadecimal number. You can only use key sizes that are valid for the selected cipher. + + Multi-key compatibility mode. You can define keys and + then sectors are encrypted according to their offsets (sector 0 uses key0; + sector 1 uses key1 etc.). must be a power of two. + The IV offset is a sector count that is added to the sector number before creating the IV. diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt new file mode 100644 index 0000000..33b6b70 --- /dev/null +++ b/Documentation/device-mapper/dm-raid.txt @@ -0,0 +1,70 @@ +Device-mapper RAID (dm-raid) is a bridge from DM to MD. It +provides a way to use device-mapper interfaces to access the MD RAID +drivers. + +As with all device-mapper targets, the nominal public interfaces are the +constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO +and STATUSTYPE_TABLE). The CTR table looks like the following: + +1: raid \ +2: <#raid_params> \ +3: <#raid_devs> .. + +Line 1 contains the standard first three arguments to any device-mapper +target - the start, length, and target type fields. The target type in +this case is "raid". + +Line 2 contains the arguments that define the particular raid +type/personality/level, the required arguments for that raid type, and +any optional arguments. Possible raid types include: raid4, raid5_la, +raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc. (raid1 is +planned for the future.) The list of required and optional parameters +is the same for all the current raid types. The required parameters are +positional, while the optional parameters are given as key/value pairs. +The possible parameters are as follows: + Chunk size in sectors. + [[no]sync] Force/Prevent RAID initialization + [rebuild ] Rebuild the drive indicated by the index + [daemon_sleep ] Time between bitmap daemon work to clear bits + [min_recovery_rate ] Throttle RAID initialization + [max_recovery_rate ] Throttle RAID initialization + [max_write_behind ] See '-write-behind=' (man mdadm) + [stripe_cache ] Stripe cache size for higher RAIDs + +Line 3 contains the list of devices that compose the array in +metadata/data device pairs. If the metadata is stored separately, a '-' +is given for the metadata device position. If a drive has failed or is +missing at creation time, a '-' can be given for both the metadata and +data drives for a given position. + +NB. Currently all metadata devices must be specified as '-'. + +Examples: +# RAID4 - 4 data drives, 1 parity +# No metadata devices specified to hold superblock/bitmap info +# Chunk size of 1MiB +# (Lines separated for easy reading) +0 1960893648 raid \ + raid4 1 2048 \ + 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 + +# RAID4 - 4 data drives, 1 parity (no metadata devices) +# Chunk size of 1MiB, force RAID initialization, +# min recovery rate at 20 kiB/sec/disk +0 1960893648 raid \ + raid4 4 2048 min_recovery_rate 20 sync\ + 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 + +Performing a 'dmsetup table' should display the CTR table used to +construct the mapping (with possible reordering of optional +parameters). + +Performing a 'dmsetup status' will yield information on the state and +health of the array. The output is as follows: +1: raid \ +2: <#devices> <1 health char for each dev> + +Line 1 is standard DM output. Line 2 is best shown by example: + 0 1960893648 raid raid4 5 AAAAA 2/490221568 +Here we can see the RAID type is raid4, there are 5 devices - all of +which are 'A'live, and the array is 2/490221568 complete with recovery. diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 6cbbd20..8c594c4 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -248,6 +248,17 @@ Who: Zhang Rui --------------------------- +What: CONFIG_ACPI_PROCFS_POWER +When: 2.6.39 +Why: sysfs I/F for ACPI power devices, including AC and Battery, + has been working in upstream kenrel since 2.6.24, Sep 2007. + In 2.6.37, we make the sysfs I/F always built in and this option + disabled by default. + Remove this option and the ACPI power procfs interface in 2.6.39. +Who: Zhang Rui + +--------------------------- + What: /proc/acpi/button When: August 2007 Why: /proc/acpi/button has been replaced by events to the input layer diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 266d205..dfbcd1b 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -365,8 +365,8 @@ must be done in the RCU callback. [recommended] vfs now tries to do path walking in "rcu-walk mode", which avoids atomic operations and scalability hazards on dentries and inodes (see -Documentation/filesystems/path-walk.txt). d_hash and d_compare changes (above) -are examples of the changes required to support this. For more complex +Documentation/filesystems/path-lookup.txt). d_hash and d_compare changes +(above) are examples of the changes required to support this. For more complex filesystem callbacks, the vfs drops out of rcu-walk mode before the fs call, so no changes are required to the filesystem. However, this is costly and loses the benefits of rcu-walk mode. We will begin to add filesystem callbacks that @@ -383,8 +383,8 @@ Documentation/filesystems/vfs.txt for more details. permission and check_acl are inode permission checks that are called on many or all directory inodes on the way down a path walk (to check for -exec permission). These must now be rcu-walk aware (flags & IPERM_RCU). See -Documentation/filesystems/vfs.txt for more details. +exec permission). These must now be rcu-walk aware (flags & IPERM_FLAG_RCU). +See Documentation/filesystems/vfs.txt for more details. -- [mandatory] diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 9471225..23cae65 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -375,6 +375,7 @@ Anonymous: 0 kB Swap: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB +Locked: 374 kB The first of these lines shows the same information as is displayed for the mapping in /proc/PID/maps. The remaining lines show the size of the mapping @@ -670,6 +671,8 @@ varies by architecture and compile options. The following is from a > cat /proc/meminfo +The "Locked" indicates whether the mapping is locked in memory or not. + MemTotal: 16344972 kB MemFree: 13634064 kB @@ -1320,6 +1323,10 @@ scaled linearly with /proc//oom_score_adj. Writing to /proc//oom_score_adj or /proc//oom_adj will change the other with its scaled value. +The value of /proc//oom_score_adj may be reduced no lower than the last +value set by a CAP_SYS_RESOURCE process. To reduce the value any lower +requires CAP_SYS_RESOURCE. + NOTICE: /proc//oom_adj is deprecated and will be removed, please see Documentation/feature-removal-schedule.txt. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index fbb324e..cae6d27 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -415,8 +415,8 @@ otherwise noted. permission: called by the VFS to check for access rights on a POSIX-like filesystem. - May be called in rcu-walk mode (flags & IPERM_RCU). If in rcu-walk - mode, the filesystem must check the permission without blocking or + May be called in rcu-walk mode (flags & IPERM_FLAG_RCU). If in rcu-walk + mode, the filesystem must check the permission without blocking or storing to the inode. If a situation is encountered that rcu-walk cannot handle, return diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt index a492d92..792faa3 100644 --- a/Documentation/gpio.txt +++ b/Documentation/gpio.txt @@ -135,7 +135,7 @@ setting up a platform_device using the GPIO, is mark its direction: int gpio_direction_input(unsigned gpio); int gpio_direction_output(unsigned gpio, int value); -The return value is zero for success, else a negative errno. It must +The return value is zero for success, else a negative errno. It should be checked, since the get/set calls don't have error returns and since misconfiguration is possible. You should normally issue these calls from a task context. However, for spinlock-safe GPIOs it's OK to use them diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 55fe759..b72e071 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -199,11 +199,6 @@ and is between 256 and 4096 characters. It is defined in the file unusable. The "log_buf_len" parameter may be useful if you need to capture more output. - acpi_display_output= [HW,ACPI] - acpi_display_output=vendor - acpi_display_output=video - See above. - acpi_irq_balance [HW,ACPI] ACPI will balance active IRQs default in APIC mode diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt index cb3d15b..b61e46f 100644 --- a/Documentation/thermal/sysfs-api.txt +++ b/Documentation/thermal/sysfs-api.txt @@ -278,3 +278,15 @@ method, the sys I/F structure will be built like this: |---name: acpitz |---temp1_input: 37000 |---temp1_crit: 100000 + +4. Event Notification + +The framework includes a simple notification mechanism, in the form of a +netlink event. Netlink socket initialization is done during the _init_ +of the framework. Drivers which intend to use the notification mechanism +just need to call generate_netlink_event() with two arguments viz +(originator, event). Typically the originator will be an integer assigned +to a thermal_zone_device when it registers itself with the framework. The +event will be one of:{THERMAL_AUX0, THERMAL_AUX1, THERMAL_CRITICAL, +THERMAL_DEV_FAULT}. Notification can be sent when the current temperature +crosses any of the configured thresholds. diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt new file mode 100644 index 0000000..0924aac --- /dev/null +++ b/Documentation/vm/transhuge.txt @@ -0,0 +1,298 @@ += Transparent Hugepage Support = + +== Objective == + +Performance critical computing applications dealing with large memory +working sets are already running on top of libhugetlbfs and in turn +hugetlbfs. Transparent Hugepage Support is an alternative means of +using huge pages for the backing of virtual memory with huge pages +that supports the automatic promotion and demotion of page sizes and +without the shortcomings of hugetlbfs. + +Currently it only works for anonymous memory mappings but in the +future it can expand over the pagecache layer starting with tmpfs. + +The reason applications are running faster is because of two +factors. The first factor is almost completely irrelevant and it's not +of significant interest because it'll also have the downside of +requiring larger clear-page copy-page in page faults which is a +potentially negative effect. The first factor consists in taking a +single page fault for each 2M virtual region touched by userland (so +reducing the enter/exit kernel frequency by a 512 times factor). This +only matters the first time the memory is accessed for the lifetime of +a memory mapping. The second long lasting and much more important +factor will affect all subsequent accesses to the memory for the whole +runtime of the application. The second factor consist of two +components: 1) the TLB miss will run faster (especially with +virtualization using nested pagetables but almost always also on bare +metal without virtualization) and 2) a single TLB entry will be +mapping a much larger amount of virtual memory in turn reducing the +number of TLB misses. With virtualization and nested pagetables the +TLB can be mapped of larger size only if both KVM and the Linux guest +are using hugepages but a significant speedup already happens if only +one of the two is using hugepages just because of the fact the TLB +miss is going to run faster. + +== Design == + +- "graceful fallback": mm components which don't have transparent + hugepage knowledge fall back to breaking a transparent hugepage and + working on the regular pages and their respective regular pmd/pte + mappings + +- if a hugepage allocation fails because of memory fragmentation, + regular pages should be gracefully allocated instead and mixed in + the same vma without any failure or significant delay and without + userland noticing + +- if some task quits and more hugepages become available (either + immediately in the buddy or through the VM), guest physical memory + backed by regular pages should be relocated on hugepages + automatically (with khugepaged) + +- it doesn't require memory reservation and in turn it uses hugepages + whenever possible (the only possible reservation here is kernelcore= + to avoid unmovable pages to fragment all the memory but such a tweak + is not specific to transparent hugepage support and it's a generic + feature that applies to all dynamic high order allocations in the + kernel) + +- this initial support only offers the feature in the anonymous memory + regions but it'd be ideal to move it to tmpfs and the pagecache + later + +Transparent Hugepage Support maximizes the usefulness of free memory +if compared to the reservation approach of hugetlbfs by allowing all +unused memory to be used as cache or other movable (or even unmovable +entities). It doesn't require reservation to prevent hugepage +allocation failures to be noticeable from userland. It allows paging +and all other advanced VM features to be available on the +hugepages. It requires no modifications for applications to take +advantage of it. + +Applications however can be further optimized to take advantage of +this feature, like for example they've been optimized before to avoid +a flood of mmap system calls for every malloc(4k). Optimizing userland +is by far not mandatory and khugepaged already can take care of long +lived page allocations even for hugepage unaware applications that +deals with large amounts of memory. + +In certain cases when hugepages are enabled system wide, application +may end up allocating more memory resources. An application may mmap a +large region but only touch 1 byte of it, in that case a 2M page might +be allocated instead of a 4k page for no good. This is why it's +possible to disable hugepages system-wide and to only have them inside +MADV_HUGEPAGE madvise regions. + +Embedded systems should enable hugepages only inside madvise regions +to eliminate any risk of wasting any precious byte of memory and to +only run faster. + +Applications that gets a lot of benefit from hugepages and that don't +risk to lose memory by using hugepages, should use +madvise(MADV_HUGEPAGE) on their critical mmapped regions. + +== sysfs == + +Transparent Hugepage Support can be entirely disabled (mostly for +debugging purposes) or only enabled inside MADV_HUGEPAGE regions (to +avoid the risk of consuming more memory resources) or enabled system +wide. This can be achieved with one of: + +echo always >/sys/kernel/mm/transparent_hugepage/enabled +echo madvise >/sys/kernel/mm/transparent_hugepage/enabled +echo never >/sys/kernel/mm/transparent_hugepage/enabled + +It's also possible to limit defrag efforts in the VM to generate +hugepages in case they're not immediately free to madvise regions or +to never try to defrag memory and simply fallback to regular pages +unless hugepages are immediately available. Clearly if we spend CPU +time to defrag memory, we would expect to gain even more by the fact +we use hugepages later instead of regular pages. This isn't always +guaranteed, but it may be more likely in case the allocation is for a +MADV_HUGEPAGE region. + +echo always >/sys/kernel/mm/transparent_hugepage/defrag +echo madvise >/sys/kernel/mm/transparent_hugepage/defrag +echo never >/sys/kernel/mm/transparent_hugepage/defrag + +khugepaged will be automatically started when +transparent_hugepage/enabled is set to "always" or "madvise, and it'll +be automatically shutdown if it's set to "never". + +khugepaged runs usually at low frequency so while one may not want to +invoke defrag algorithms synchronously during the page faults, it +should be worth invoking defrag at least in khugepaged. However it's +also possible to disable defrag in khugepaged: + +echo yes >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag +echo no >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag + +You can also control how many pages khugepaged should scan at each +pass: + +/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan + +and how many milliseconds to wait in khugepaged between each pass (you +can set this to 0 to run khugepaged at 100% utilization of one core): + +/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs + +and how many milliseconds to wait in khugepaged if there's an hugepage +allocation failure to throttle the next allocation attempt. + +/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs + +The khugepaged progress can be seen in the number of pages collapsed: + +/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed + +for each pass: + +/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans + +== Boot parameter == + +You can change the sysfs boot time defaults of Transparent Hugepage +Support by passing the parameter "transparent_hugepage=always" or +"transparent_hugepage=madvise" or "transparent_hugepage=never" +(without "") to the kernel command line. + +== Need of application restart == + +The transparent_hugepage/enabled values only affect future +behavior. So to make them effective you need to restart any +application that could have been using hugepages. This also applies to +the regions registered in khugepaged. + +== get_user_pages and follow_page == + +get_user_pages and follow_page if run on a hugepage, will return the +head or tail pages as usual (exactly as they would do on +hugetlbfs). Most gup users will only care about the actual physical +address of the page and its temporary pinning to release after the I/O +is complete, so they won't ever notice the fact the page is huge. But +if any driver is going to mangle over the page structure of the tail +page (like for checking page->mapping or other bits that are relevant +for the head page and not the tail page), it should be updated to jump +to check head page instead (while serializing properly against +split_huge_page() to avoid the head and tail pages to disappear from +under it, see the futex code to see an example of that, hugetlbfs also +needed special handling in futex code for similar reasons). + +NOTE: these aren't new constraints to the GUP API, and they match the +same constrains that applies to hugetlbfs too, so any driver capable +of handling GUP on hugetlbfs will also work fine on transparent +hugepage backed mappings. + +In case you can't handle compound pages if they're returned by +follow_page, the FOLL_SPLIT bit can be specified as parameter to +follow_page, so that it will split the hugepages before returning +them. Migration for example passes FOLL_SPLIT as parameter to +follow_page because it's not hugepage aware and in fact it can't work +at all on hugetlbfs (but it instead works fine on transparent +hugepages thanks to FOLL_SPLIT). migration simply can't deal with +hugepages being returned (as it's not only checking the pfn of the +page and pinning it during the copy but it pretends to migrate the +memory in regular page sizes and with regular pte/pmd mappings). + +== Optimizing the applications == + +To be guaranteed that the kernel will map a 2M page immediately in any +memory region, the mmap region has to be hugepage naturally +aligned. posix_memalign() can provide that guarantee. + +== Hugetlbfs == + +You can use hugetlbfs on a kernel that has transparent hugepage +support enabled just fine as always. No difference can be noted in +hugetlbfs other than there will be less overall fragmentation. All +usual features belonging to hugetlbfs are preserved and +unaffected. libhugetlbfs will also work fine as usual. + +== Graceful fallback == + +Code walking pagetables but unware about huge pmds can simply call +split_huge_page_pmd(mm, pmd) where the pmd is the one returned by +pmd_offset. It's trivial to make the code transparent hugepage aware +by just grepping for "pmd_offset" and adding split_huge_page_pmd where +missing after pmd_offset returns the pmd. Thanks to the graceful +fallback design, with a one liner change, you can avoid to write +hundred if not thousand of lines of complex code to make your code +hugepage aware. + +If you're not walking pagetables but you run into a physical hugepage +but you can't handle it natively in your code, you can split it by +calling split_huge_page(page). This is what the Linux VM does before +it tries to swapout the hugepage for example. + +Example to make mremap.c transparent hugepage aware with a one liner +change: + +diff --git a/mm/mremap.c b/mm/mremap.c +--- a/mm/mremap.c ++++ b/mm/mremap.c +@@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru + return NULL; + + pmd = pmd_offset(pud, addr); ++ split_huge_page_pmd(mm, pmd); + if (pmd_none_or_clear_bad(pmd)) + return NULL; + +== Locking in hugepage aware code == + +We want as much code as possible hugepage aware, as calling +split_huge_page() or split_huge_page_pmd() has a cost. + +To make pagetable walks huge pmd aware, all you need to do is to call +pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the +mmap_sem in read (or write) mode to be sure an huge pmd cannot be +created from under you by khugepaged (khugepaged collapse_huge_page +takes the mmap_sem in write mode in addition to the anon_vma lock). If +pmd_trans_huge returns false, you just fallback in the old code +paths. If instead pmd_trans_huge returns true, you have to take the +mm->page_table_lock and re-run pmd_trans_huge. Taking the +page_table_lock will prevent the huge pmd to be converted into a +regular pmd from under you (split_huge_page can run in parallel to the +pagetable walk). If the second pmd_trans_huge returns false, you +should just drop the page_table_lock and fallback to the old code as +before. Otherwise you should run pmd_trans_splitting on the pmd. In +case pmd_trans_splitting returns true, it means split_huge_page is +already in the middle of splitting the page. So if pmd_trans_splitting +returns true it's enough to drop the page_table_lock and call +wait_split_huge_page and then fallback the old code paths. You are +guaranteed by the time wait_split_huge_page returns, the pmd isn't +huge anymore. If pmd_trans_splitting returns false, you can proceed to +process the huge pmd and the hugepage natively. Once finished you can +drop the page_table_lock. + +== compound_lock, get_user_pages and put_page == + +split_huge_page internally has to distribute the refcounts in the head +page to the tail pages before clearing all PG_head/tail bits from the +page structures. It can do that easily for refcounts taken by huge pmd +mappings. But the GUI API as created by hugetlbfs (that returns head +and tail pages if running get_user_pages on an address backed by any +hugepage), requires the refcount to be accounted on the tail pages and +not only in the head pages, if we want to be able to run +split_huge_page while there are gup pins established on any tail +page. Failure to be able to run split_huge_page if there's any gup pin +on any tail page, would mean having to split all hugepages upfront in +get_user_pages which is unacceptable as too many gup users are +performance critical and they must work natively on hugepages like +they work natively on hugetlbfs already (hugetlbfs is simpler because +hugetlbfs pages cannot be splitted so there wouldn't be requirement of +accounting the pins on the tail pages for hugetlbfs). If we wouldn't +account the gup refcounts on the tail pages during gup, we won't know +anymore which tail page is pinned by gup and which is not while we run +split_huge_page. But we still have to add the gup pin to the head page +too, to know when we can free the compound page in case it's never +splitted during its lifetime. That requires changing not just +get_page, but put_page as well so that when put_page runs on a tail +page (and only on a tail page) it will find its respective head page, +and then it will decrease the head page refcount in addition to the +tail page refcount. To obtain a head page reliably and to decrease its +refcount without race conditions, put_page has to serialize against +__split_huge_page_refcount using a special per-page lock called +compound_lock. diff --git a/MAINTAINERS b/MAINTAINERS index 3dd5c6f..af656de 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6592,13 +6592,12 @@ F: Documentation/i2c/busses/i2c-viapro F: drivers/i2c/busses/i2c-viapro.c VIA SD/MMC CARD CONTROLLER DRIVER -M: Joseph Chan +M: Bruce Chang M: Harald Welte S: Maintained F: drivers/mmc/host/via-sdmmc.c VIA UNICHROME(PRO)/CHROME9 FRAMEBUFFER DRIVER -M: Joseph Chan M: Florian Tobias Schandinat L: linux-fbdev@vger.kernel.org S: Maintained diff --git a/arch/alpha/include/asm/mman.h b/arch/alpha/include/asm/mman.h index 99c56d4..72db984 100644 --- a/arch/alpha/include/asm/mman.h +++ b/arch/alpha/include/asm/mman.h @@ -53,6 +53,9 @@ #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ +#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index 0c1bb68..2cfe816 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -38,17 +38,9 @@ #ifdef CONFIG_MMU void *module_alloc(unsigned long size) { - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, -1, + __builtin_return_address(0)); } #else /* CONFIG_MMU */ void *module_alloc(unsigned long size) diff --git a/arch/arm/mach-omap2/cpuidle34xx.c b/arch/arm/mach-omap2/cpuidle34xx.c index 11b89e9..f7b22a1 100644 --- a/arch/arm/mach-omap2/cpuidle34xx.c +++ b/arch/arm/mach-omap2/cpuidle34xx.c @@ -47,6 +47,8 @@ #define OMAP3_STATE_MAX OMAP3_STATE_C7 +#define CPUIDLE_FLAG_CHECK_BM 0x10000 /* use omap3_enter_idle_bm() */ + struct omap3_processor_cx { u8 valid; u8 type; diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index 93292a1..709244c 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -50,7 +50,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) if (!new_pmd) goto no_pmd; - new_pte = pte_alloc_map(mm, new_pmd, 0); + new_pte = pte_alloc_map(mm, NULL, new_pmd, 0); if (!new_pte) goto no_pte; diff --git a/arch/avr32/boards/atngw100/setup.c b/arch/avr32/boards/atngw100/setup.c index 8c6a244..659d119 100644 --- a/arch/avr32/boards/atngw100/setup.c +++ b/arch/avr32/boards/atngw100/setup.c @@ -188,7 +188,7 @@ static void __init set_hw_addr(struct platform_device *pdev) */ regs = (void __iomem __force *)res->start; pclk = clk_get(&pdev->dev, "pclk"); - if (!pclk) + if (IS_ERR(pclk)) return; clk_enable(pclk); diff --git a/arch/avr32/boards/atstk1000/atstk1002.c b/arch/avr32/boards/atstk1000/atstk1002.c index 2adc261..6ce30fb 100644 --- a/arch/avr32/boards/atstk1000/atstk1002.c +++ b/arch/avr32/boards/atstk1000/atstk1002.c @@ -203,7 +203,7 @@ static void __init set_hw_addr(struct platform_device *pdev) */ regs = (void __iomem __force *)res->start; pclk = clk_get(&pdev->dev, "pclk"); - if (!pclk) + if (IS_ERR(pclk)) return; clk_enable(pclk); diff --git a/arch/avr32/boards/favr-32/setup.c b/arch/avr32/boards/favr-32/setup.c index 75f19f4..86fab77 100644 --- a/arch/avr32/boards/favr-32/setup.c +++ b/arch/avr32/boards/favr-32/setup.c @@ -206,7 +206,7 @@ static void __init set_hw_addr(struct platform_device *pdev) */ regs = (void __iomem __force *)res->start; pclk = clk_get(&pdev->dev, "pclk"); - if (!pclk) + if (IS_ERR(pclk)) return; clk_enable(pclk); diff --git a/arch/avr32/boards/hammerhead/setup.c b/arch/avr32/boards/hammerhead/setup.c index dd00987..da14fbd 100644 --- a/arch/avr32/boards/hammerhead/setup.c +++ b/arch/avr32/boards/hammerhead/setup.c @@ -150,7 +150,7 @@ static void __init set_hw_addr(struct platform_device *pdev) regs = (void __iomem __force *)res->start; pclk = clk_get(&pdev->dev, "pclk"); - if (!pclk) + if (IS_ERR(pclk)) return; clk_enable(pclk); diff --git a/arch/avr32/boards/merisc/setup.c b/arch/avr32/boards/merisc/setup.c index 623b077..e61bc94 100644 --- a/arch/avr32/boards/merisc/setup.c +++ b/arch/avr32/boards/merisc/setup.c @@ -134,7 +134,7 @@ static void __init set_hw_addr(struct platform_device *pdev) regs = (void __iomem __force *)res->start; pclk = clk_get(&pdev->dev, "pclk"); - if (!pclk) + if (IS_ERR(pclk)) return; clk_enable(pclk); diff --git a/arch/avr32/boards/mimc200/setup.c b/arch/avr32/boards/mimc200/setup.c index 523d8e1..c4da5cb 100644 --- a/arch/avr32/boards/mimc200/setup.c +++ b/arch/avr32/boards/mimc200/setup.c @@ -162,7 +162,7 @@ static void __init set_hw_addr(struct platform_device *pdev) */ regs = (void __iomem __force *)res->start; pclk = clk_get(&pdev->dev, "pclk"); - if (!pclk) + if (IS_ERR(pclk)) return; clk_enable(pclk); diff --git a/arch/avr32/configs/atngw100_defconfig b/arch/avr32/configs/atngw100_defconfig index 9854013..6f9ca56 100644 --- a/arch/avr32/configs/atngw100_defconfig +++ b/arch/avr32/configs/atngw100_defconfig @@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_NO_HZ=y @@ -29,6 +26,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -72,8 +70,8 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_TCLIB=y -CONFIG_EEPROM_AT24=m CONFIG_NETDEVICES=y CONFIG_TUN=m CONFIG_NET_ETHERNET=y @@ -106,6 +104,7 @@ CONFIG_GPIO_SYSFS=y CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y CONFIG_USB_GADGET=y +CONFIG_USB_GADGET_VBUS_DRAW=350 CONFIG_USB_ZERO=m CONFIG_USB_ETH=m CONFIG_USB_GADGETFS=m @@ -115,14 +114,12 @@ CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y -CONFIG_MMC_SPI=m CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y CONFIG_LEDS_GPIO=y CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=y CONFIG_LEDS_TRIGGER_HEARTBEAT=y -CONFIG_LEDS_TRIGGER_DEFAULT_ON=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_AT32AP700X=y CONFIG_DMADEVICES=y @@ -130,21 +127,23 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_PROC_KCORE=y CONFIG_TMPFS=y -CONFIG_CONFIGFS_FS=m +CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y -CONFIG_UFS_FS=y +CONFIG_UBIFS_FS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SMB_FS=m CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -155,5 +154,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set -CONFIG_CRYPTO_PCBC=m diff --git a/arch/avr32/configs/atngw100_evklcd100_defconfig b/arch/avr32/configs/atngw100_evklcd100_defconfig index 7ceda35..7eece0a 100644 --- a/arch/avr32/configs/atngw100_evklcd100_defconfig +++ b/arch/avr32/configs/atngw100_evklcd100_defconfig @@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_NO_HZ=y @@ -31,6 +28,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -74,8 +72,10 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_TCLIB=y CONFIG_NETDEVICES=y +CONFIG_TUN=m CONFIG_NET_ETHERNET=y CONFIG_MACB=y # CONFIG_NETDEV_1000 is not set @@ -104,6 +104,7 @@ CONFIG_I2C_GPIO=m CONFIG_SPI=y CONFIG_SPI_ATMEL=y CONFIG_SPI_SPIDEV=m +CONFIG_GPIO_SYSFS=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y @@ -127,6 +128,7 @@ CONFIG_USB_FILE_STORAGE=m CONFIG_USB_G_SERIAL=m CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y +CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y @@ -141,11 +143,14 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y @@ -155,7 +160,6 @@ CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SMB_FS=m CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -166,4 +170,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set diff --git a/arch/avr32/configs/atngw100_evklcd101_defconfig b/arch/avr32/configs/atngw100_evklcd101_defconfig index 7bc5b2c..387eb9d 100644 --- a/arch/avr32/configs/atngw100_evklcd101_defconfig +++ b/arch/avr32/configs/atngw100_evklcd101_defconfig @@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_NO_HZ=y @@ -30,6 +27,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -73,8 +71,10 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_TCLIB=y CONFIG_NETDEVICES=y +CONFIG_TUN=m CONFIG_NET_ETHERNET=y CONFIG_MACB=y # CONFIG_NETDEV_1000 is not set @@ -103,6 +103,7 @@ CONFIG_I2C_GPIO=m CONFIG_SPI=y CONFIG_SPI_ATMEL=y CONFIG_SPI_SPIDEV=m +CONFIG_GPIO_SYSFS=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y @@ -126,6 +127,7 @@ CONFIG_USB_FILE_STORAGE=m CONFIG_USB_G_SERIAL=m CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y +CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y @@ -140,11 +142,14 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y @@ -154,7 +159,6 @@ CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SMB_FS=m CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -165,4 +169,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set diff --git a/arch/avr32/configs/atngw100mkii_defconfig b/arch/avr32/configs/atngw100mkii_defconfig index 4bd3682..f0fe237 100644 --- a/arch/avr32/configs/atngw100mkii_defconfig +++ b/arch/avr32/configs/atngw100mkii_defconfig @@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_NO_HZ=y @@ -29,6 +26,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -74,6 +72,7 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_TCLIB=y CONFIG_NETDEVICES=y CONFIG_TUN=m @@ -107,6 +106,7 @@ CONFIG_GPIO_SYSFS=y CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y CONFIG_USB_GADGET=y +CONFIG_USB_GADGET_VBUS_DRAW=350 CONFIG_USB_ZERO=m CONFIG_USB_ETH=m CONFIG_USB_GADGETFS=m @@ -116,14 +116,12 @@ CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y -CONFIG_MMC_SPI=m CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y CONFIG_LEDS_GPIO=y CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=y CONFIG_LEDS_TRIGGER_HEARTBEAT=y -CONFIG_LEDS_TRIGGER_DEFAULT_ON=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_AT32AP700X=y CONFIG_DMADEVICES=y @@ -131,21 +129,23 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_PROC_KCORE=y CONFIG_TMPFS=y -CONFIG_CONFIGFS_FS=m +CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y -CONFIG_UFS_FS=y +CONFIG_UBIFS_FS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SMB_FS=m CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -156,5 +156,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set -CONFIG_CRYPTO_PCBC=m diff --git a/arch/avr32/configs/atngw100mkii_evklcd100_defconfig b/arch/avr32/configs/atngw100mkii_evklcd100_defconfig index f8437ef..e4a7c1d 100644 --- a/arch/avr32/configs/atngw100mkii_evklcd100_defconfig +++ b/arch/avr32/configs/atngw100mkii_evklcd100_defconfig @@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_NO_HZ=y @@ -32,6 +29,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -77,8 +75,10 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_TCLIB=y CONFIG_NETDEVICES=y +CONFIG_TUN=m CONFIG_NET_ETHERNET=y CONFIG_MACB=y # CONFIG_NETDEV_1000 is not set @@ -107,6 +107,7 @@ CONFIG_I2C_GPIO=m CONFIG_SPI=y CONFIG_SPI_ATMEL=y CONFIG_SPI_SPIDEV=m +CONFIG_GPIO_SYSFS=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y @@ -130,6 +131,7 @@ CONFIG_USB_FILE_STORAGE=m CONFIG_USB_G_SERIAL=m CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y +CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y @@ -144,11 +146,14 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y @@ -158,7 +163,6 @@ CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SMB_FS=m CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -169,4 +173,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set diff --git a/arch/avr32/configs/atngw100mkii_evklcd101_defconfig b/arch/avr32/configs/atngw100mkii_evklcd101_defconfig index 7f58f99..6f37f70 100644 --- a/arch/avr32/configs/atngw100mkii_evklcd101_defconfig +++ b/arch/avr32/configs/atngw100mkii_evklcd101_defconfig @@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_NO_HZ=y @@ -31,6 +28,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -76,8 +74,10 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_TCLIB=y CONFIG_NETDEVICES=y +CONFIG_TUN=m CONFIG_NET_ETHERNET=y CONFIG_MACB=y # CONFIG_NETDEV_1000 is not set @@ -106,6 +106,7 @@ CONFIG_I2C_GPIO=m CONFIG_SPI=y CONFIG_SPI_ATMEL=y CONFIG_SPI_SPIDEV=m +CONFIG_GPIO_SYSFS=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y @@ -129,6 +130,7 @@ CONFIG_USB_FILE_STORAGE=m CONFIG_USB_G_SERIAL=m CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y +CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y @@ -143,11 +145,14 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y @@ -157,7 +162,6 @@ CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SMB_FS=m CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -168,4 +172,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set diff --git a/arch/avr32/configs/atstk1002_defconfig b/arch/avr32/configs/atstk1002_defconfig index aec4c43..4fb01f5 100644 --- a/arch/avr32/configs/atstk1002_defconfig +++ b/arch/avr32/configs/atstk1002_defconfig @@ -3,7 +3,6 @@ CONFIG_EXPERIMENTAL=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set @@ -11,7 +10,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set @@ -26,6 +25,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -35,6 +35,7 @@ CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m CONFIG_NET_IPGRE=m CONFIG_INET_AH=m CONFIG_INET_ESP=m @@ -58,16 +59,14 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y -CONFIG_MTD_DATAFLASH=m -CONFIG_MTD_M25P80=m CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_PWM=m CONFIG_ATMEL_TCLIB=y CONFIG_ATMEL_SSC=m -CONFIG_EEPROM_AT24=m # CONFIG_SCSI_PROC_FS is not set CONFIG_BLK_DEV_SD=m CONFIG_BLK_DEV_SR=m @@ -120,7 +119,6 @@ CONFIG_SND_MIXER_OSS=m CONFIG_SND_PCM_OSS=m # CONFIG_SND_SUPPORT_OLD_API is not set # CONFIG_SND_VERBOSE_PROCFS is not set -# CONFIG_SND_DRIVERS is not set CONFIG_SND_AT73C213=m # CONFIG_HID_SUPPORT is not set CONFIG_USB_GADGET=y @@ -131,16 +129,15 @@ CONFIG_USB_FILE_STORAGE=m CONFIG_USB_G_SERIAL=m CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y +CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y -CONFIG_MMC_SPI=m CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=m +CONFIG_LEDS_CLASS=y CONFIG_LEDS_ATMEL_PWM=m CONFIG_LEDS_GPIO=m CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=m CONFIG_LEDS_TRIGGER_HEARTBEAT=m -CONFIG_LEDS_TRIGGER_DEFAULT_ON=m CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_AT32AP700X=y CONFIG_DMADEVICES=y @@ -149,20 +146,23 @@ CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=850 CONFIG_PROC_KCORE=y CONFIG_TMPFS=y +CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y -# CONFIG_JFFS2_FS_WRITEBUFFER is not set CONFIG_UBIFS_FS=y -CONFIG_MINIX_FS=m CONFIG_NFS_FS=y CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y +CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_850=m CONFIG_NLS_ISO8859_1=m CONFIG_NLS_UTF8=m CONFIG_MAGIC_SYSRQ=y @@ -170,6 +170,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set -# CONFIG_CRYPTO_HW is not set -CONFIG_CRC_T10DIF=m diff --git a/arch/avr32/configs/atstk1003_defconfig b/arch/avr32/configs/atstk1003_defconfig index 50ba3db..9faaf9b 100644 --- a/arch/avr32/configs/atstk1003_defconfig +++ b/arch/avr32/configs/atstk1003_defconfig @@ -2,22 +2,15 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_AUDIT=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set -# CONFIG_SLUB_DEBUG is not set # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set @@ -33,6 +26,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -54,18 +48,18 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y -CONFIG_MTD_DATAFLASH=m -CONFIG_MTD_M25P80=m +CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_PWM=m CONFIG_ATMEL_TCLIB=y CONFIG_ATMEL_SSC=m -CONFIG_EEPROM_AT24=m # CONFIG_SCSI_PROC_FS is not set CONFIG_BLK_DEV_SD=m CONFIG_BLK_DEV_SR=m +# CONFIG_SCSI_LOWLEVEL is not set CONFIG_ATA=m # CONFIG_SATA_PMP is not set CONFIG_PATA_AT32=m @@ -77,6 +71,7 @@ CONFIG_PPP_ASYNC=m CONFIG_PPP_DEFLATE=m CONFIG_PPP_BSDCOMP=m CONFIG_INPUT=m +CONFIG_INPUT_EVDEV=m # CONFIG_KEYBOARD_ATKBD is not set CONFIG_KEYBOARD_GPIO=m # CONFIG_MOUSE_PS2 is not set @@ -106,7 +101,6 @@ CONFIG_SND_PCM_OSS=m CONFIG_SND_AT73C213=m # CONFIG_HID_SUPPORT is not set CONFIG_USB_GADGET=y -CONFIG_USB_GADGET_DEBUG_FS=y CONFIG_USB_ZERO=m CONFIG_USB_ETH=m CONFIG_USB_GADGETFS=m @@ -116,36 +110,39 @@ CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y -CONFIG_MMC_SPI=m CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y CONFIG_LEDS_ATMEL_PWM=m -CONFIG_LEDS_GPIO=y +CONFIG_LEDS_GPIO=m CONFIG_LEDS_TRIGGERS=y -CONFIG_LEDS_TRIGGER_TIMER=y -CONFIG_LEDS_TRIGGER_HEARTBEAT=y -CONFIG_LEDS_TRIGGER_DEFAULT_ON=y +CONFIG_LEDS_TRIGGER_TIMER=m +CONFIG_LEDS_TRIGGER_HEARTBEAT=m CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_AT32AP700X=y CONFIG_DMADEVICES=y -CONFIG_DW_DMAC=y -CONFIG_EXT2_FS=m -CONFIG_EXT3_FS=m +CONFIG_EXT2_FS=y +CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=850 CONFIG_PROC_KCORE=y CONFIG_TMPFS=y -CONFIG_CONFIGFS_FS=m +CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y +CONFIG_UBIFS_FS=y # CONFIG_NETWORK_FILESYSTEMS is not set CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_850=m CONFIG_NLS_ISO8859_1=m CONFIG_NLS_UTF8=m CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y +CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -CONFIG_CRC_T10DIF=m diff --git a/arch/avr32/configs/atstk1004_defconfig b/arch/avr32/configs/atstk1004_defconfig index 329e10b..3d2a5d8 100644 --- a/arch/avr32/configs/atstk1004_defconfig +++ b/arch/avr32/configs/atstk1004_defconfig @@ -1,19 +1,32 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set -# CONFIG_FUTEX is not set -# CONFIG_EPOLL is not set -# CONFIG_SIGNALFD is not set -# CONFIG_TIMERFD is not set -# CONFIG_EVENTFD is not set # CONFIG_COMPAT_BRK is not set -CONFIG_SLOB=y -# CONFIG_BLOCK is not set +CONFIG_PROFILING=y +CONFIG_OPROFILE=m +# CONFIG_KPROBES is not set +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_BLK_DEV_BSG is not set +# CONFIG_IOSCHED_DEADLINE is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y CONFIG_BOARD_ATSTK1004=y # CONFIG_OWNERSHIP_TRACE is not set +CONFIG_NMI_DEBUGGING=y +CONFIG_PM=y +CONFIG_CPU_FREQ=y +# CONFIG_CPU_FREQ_STAT is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -31,40 +44,104 @@ CONFIG_MTD=y CONFIG_MTD_PARTITIONS=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_CHAR=y +CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y -# CONFIG_MISC_DEVICES is not set -# CONFIG_INPUT is not set +CONFIG_MTD_UBI=y +CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y +CONFIG_ATMEL_PWM=m +CONFIG_ATMEL_TCLIB=y +CONFIG_ATMEL_SSC=m +# CONFIG_SCSI_PROC_FS is not set +CONFIG_BLK_DEV_SD=m +CONFIG_BLK_DEV_SR=m +# CONFIG_SCSI_LOWLEVEL is not set +CONFIG_ATA=m +# CONFIG_SATA_PMP is not set +CONFIG_PATA_AT32=m +CONFIG_NETDEVICES=y +# CONFIG_NETDEV_1000 is not set +# CONFIG_NETDEV_10000 is not set +CONFIG_PPP=m +CONFIG_PPP_ASYNC=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_BSDCOMP=m +CONFIG_INPUT=m +CONFIG_INPUT_EVDEV=m +# CONFIG_KEYBOARD_ATKBD is not set +CONFIG_KEYBOARD_GPIO=m +# CONFIG_MOUSE_PS2 is not set +CONFIG_MOUSE_GPIO=m # CONFIG_SERIO is not set # CONFIG_VT is not set # CONFIG_DEVKMEM is not set CONFIG_SERIAL_ATMEL=y CONFIG_SERIAL_ATMEL_CONSOLE=y -# CONFIG_SERIAL_ATMEL_PDC is not set # CONFIG_LEGACY_PTYS is not set # CONFIG_HW_RANDOM is not set +CONFIG_I2C=m +CONFIG_I2C_CHARDEV=m +CONFIG_I2C_GPIO=m CONFIG_SPI=y CONFIG_SPI_ATMEL=y +CONFIG_SPI_SPIDEV=m +CONFIG_GPIO_SYSFS=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y CONFIG_FB=y CONFIG_FB_ATMEL=y CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_LCD_CLASS_DEVICE=y CONFIG_LCD_LTV350QV=y # CONFIG_BACKLIGHT_CLASS_DEVICE is not set CONFIG_USB_GADGET=y -CONFIG_USB_ETH=y -# CONFIG_USB_ETH_RNDIS is not set +CONFIG_USB_ZERO=m +CONFIG_USB_ETH=m +CONFIG_USB_GADGETFS=m +CONFIG_USB_FILE_STORAGE=m +CONFIG_USB_G_SERIAL=m +CONFIG_USB_CDC_COMPOSITE=m +CONFIG_MMC=y +CONFIG_MMC_TEST=m +CONFIG_MMC_ATMELMCI=y +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y +CONFIG_LEDS_ATMEL_PWM=m +CONFIG_LEDS_GPIO=m +CONFIG_LEDS_TRIGGERS=y +CONFIG_LEDS_TRIGGER_TIMER=m +CONFIG_LEDS_TRIGGER_HEARTBEAT=m CONFIG_RTC_CLASS=y -# CONFIG_RTC_INTF_PROC is not set CONFIG_RTC_DRV_AT32AP700X=y +CONFIG_DMADEVICES=y +CONFIG_EXT2_FS=y +CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set +# CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set +CONFIG_FUSE_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=850 CONFIG_PROC_KCORE=y -# CONFIG_PROC_PAGE_MONITOR is not set CONFIG_TMPFS=y +CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y -# CONFIG_JFFS2_FS_WRITEBUFFER is not set +CONFIG_UBIFS_FS=y # CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_UTF8=m CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_FS=y +CONFIG_DEBUG_KERNEL=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_FRAME_POINTER=y diff --git a/arch/avr32/configs/atstk1006_defconfig b/arch/avr32/configs/atstk1006_defconfig index dbcc1b5..1ed8f22 100644 --- a/arch/avr32/configs/atstk1006_defconfig +++ b/arch/avr32/configs/atstk1006_defconfig @@ -3,7 +3,6 @@ CONFIG_EXPERIMENTAL=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set @@ -11,7 +10,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set @@ -37,6 +36,7 @@ CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m CONFIG_NET_IPGRE=m CONFIG_INET_AH=m CONFIG_INET_ESP=m @@ -60,15 +60,13 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y -CONFIG_MTD_DATAFLASH=m -CONFIG_MTD_DATAFLASH_OTP=y -CONFIG_MTD_M25P80=m CONFIG_MTD_NAND=y CONFIG_MTD_NAND_ATMEL=y CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_PWM=m CONFIG_ATMEL_TCLIB=y CONFIG_ATMEL_SSC=m @@ -132,17 +130,17 @@ CONFIG_USB_ETH=m CONFIG_USB_GADGETFS=m CONFIG_USB_FILE_STORAGE=m CONFIG_USB_G_SERIAL=m +CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y +CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y -CONFIG_MMC_SPI=m CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=m +CONFIG_LEDS_CLASS=y CONFIG_LEDS_ATMEL_PWM=m CONFIG_LEDS_GPIO=m CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=m CONFIG_LEDS_TRIGGER_HEARTBEAT=m -CONFIG_LEDS_TRIGGER_DEFAULT_ON=m CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_AT32AP700X=y CONFIG_DMADEVICES=y @@ -156,15 +154,18 @@ CONFIG_EXT4_FS=y CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=850 CONFIG_PROC_KCORE=y CONFIG_TMPFS=y +CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y CONFIG_UBIFS_FS=y -CONFIG_MINIX_FS=m CONFIG_NFS_FS=y CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y +CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_850=m CONFIG_NLS_ISO8859_1=m CONFIG_NLS_UTF8=m CONFIG_MAGIC_SYSRQ=y @@ -172,7 +173,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set -CONFIG_CRYPTO_FIPS=y -# CONFIG_CRYPTO_HW is not set -CONFIG_CRC_T10DIF=m diff --git a/arch/avr32/configs/favr-32_defconfig b/arch/avr32/configs/favr-32_defconfig index 0c813b6..aeadc95 100644 --- a/arch/avr32/configs/favr-32_defconfig +++ b/arch/avr32/configs/favr-32_defconfig @@ -11,7 +11,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/avr32/configs/hammerhead_defconfig b/arch/avr32/configs/hammerhead_defconfig index dcc01f0..1692bee 100644 --- a/arch/avr32/configs/hammerhead_defconfig +++ b/arch/avr32/configs/hammerhead_defconfig @@ -12,7 +12,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y diff --git a/arch/avr32/include/asm/syscalls.h b/arch/avr32/include/asm/syscalls.h index ab608b7..244f2ac 100644 --- a/arch/avr32/include/asm/syscalls.h +++ b/arch/avr32/include/asm/syscalls.h @@ -15,20 +15,6 @@ #include #include -/* kernel/process.c */ -asmlinkage int sys_fork(struct pt_regs *); -asmlinkage int sys_clone(unsigned long, unsigned long, - unsigned long, unsigned long, - struct pt_regs *); -asmlinkage int sys_vfork(struct pt_regs *); -asmlinkage int sys_execve(const char __user *, char __user *__user *, - char __user *__user *, struct pt_regs *); - -/* kernel/signal.c */ -asmlinkage int sys_sigaltstack(const stack_t __user *, stack_t __user *, - struct pt_regs *); -asmlinkage int sys_rt_sigreturn(struct pt_regs *); - /* mm/cache.c */ asmlinkage int sys_cacheflush(int, void __user *, size_t); diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 9c46aaa..ef5a2a0 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -367,14 +367,13 @@ asmlinkage int sys_fork(struct pt_regs *regs) } asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp, - unsigned long parent_tidptr, - unsigned long child_tidptr, struct pt_regs *regs) + void __user *parent_tidptr, void __user *child_tidptr, + struct pt_regs *regs) { if (!newsp) newsp = regs->sp; - return do_fork(clone_flags, newsp, regs, 0, - (int __user *)parent_tidptr, - (int __user *)child_tidptr); + return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, + child_tidptr); } asmlinkage int sys_vfork(struct pt_regs *regs) diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c index 668ed28..05ad291 100644 --- a/arch/avr32/kernel/time.c +++ b/arch/avr32/kernel/time.c @@ -35,7 +35,6 @@ static struct clocksource counter = { .rating = 50, .read = read_cycle_count, .mask = CLOCKSOURCE_MASK(32), - .shift = 16, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -123,9 +122,7 @@ void __init time_init(void) /* figure rate for counter */ counter_hz = clk_get_rate(boot_cpu_data.clk); - counter.mult = clocksource_hz2mult(counter_hz, counter.shift); - - ret = clocksource_register(&counter); + ret = clocksource_register_hz(&counter, counter_hz); if (ret) pr_debug("timer: could not register clocksource: %d\n", ret); diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h index cc8335e..e5a6c35 100644 --- a/arch/ia64/include/asm/io.h +++ b/arch/ia64/include/asm/io.h @@ -426,6 +426,11 @@ extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size) extern void iounmap (volatile void __iomem *addr); extern void __iomem * early_ioremap (unsigned long phys_addr, unsigned long size); extern void early_iounmap (volatile void __iomem *addr, unsigned long size); +static inline void __iomem * ioremap_cache (unsigned long phys_addr, unsigned long size) +{ + return ioremap(phys_addr, size); +} + /* * String version of IO memory access ops: diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index 348e44d..03afe79 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -717,8 +717,9 @@ prefetchw (const void *x) #define spin_lock_prefetch(x) prefetchw(x) extern unsigned long boot_option_idle_override; -extern unsigned long idle_halt; -extern unsigned long idle_nomwait; + +enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_FORCE_MWAIT, + IDLE_NOMWAIT, IDLE_POLL}; #endif /* !__ASSEMBLY__ */ diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index ac76da0..89accc6 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -618,7 +618,7 @@ pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, } /* forward declaration */ -static static const struct dentry_operations pfmfs_dentry_operations; +static const struct dentry_operations pfmfs_dentry_operations; static struct dentry * pfmfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 16f1c7b..6d33c5c 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -53,12 +53,8 @@ void (*ia64_mark_idle)(int); -unsigned long boot_option_idle_override = 0; +unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; EXPORT_SYMBOL(boot_option_idle_override); -unsigned long idle_halt; -EXPORT_SYMBOL(idle_halt); -unsigned long idle_nomwait; -EXPORT_SYMBOL(idle_nomwait); void (*pm_idle) (void); EXPORT_SYMBOL(pm_idle); void (*pm_power_off) (void); diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 1841ee7..5ca674b 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -38,7 +38,7 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) if (pud) { pmd = pmd_alloc(mm, pud, taddr); if (pmd) - pte = pte_alloc_map(mm, pmd, taddr); + pte = pte_alloc_map(mm, NULL, pmd, taddr); } return pte; } diff --git a/arch/mips/include/asm/mman.h b/arch/mips/include/asm/mman.h index c892bfb..785b4ea 100644 --- a/arch/mips/include/asm/mman.h +++ b/arch/mips/include/asm/mman.h @@ -77,6 +77,9 @@ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ #define MADV_HWPOISON 100 /* poison a page for testing */ +#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 6f51dda..d87a72e 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -46,17 +46,9 @@ static DEFINE_SPINLOCK(dbe_lock); void *module_alloc(unsigned long size) { #ifdef MODULE_START - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULE_START, MODULE_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END, + GFP_KERNEL, PAGE_KERNEL, -1, + __builtin_return_address(0)); #else if (size == 0) return NULL; diff --git a/arch/parisc/include/asm/mman.h b/arch/parisc/include/asm/mman.h index 9749c8a..f5b7bf5 100644 --- a/arch/parisc/include/asm/mman.h +++ b/arch/parisc/include/asm/mman.h @@ -59,6 +59,9 @@ #define MADV_MERGEABLE 65 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */ +#define MADV_HUGEPAGE 67 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 68 /* Not worth backing with hugepages */ + /* compatibility flags */ #define MAP_FILE 0 #define MAP_VARIABLE 0 diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index d7efdbf..fec1320 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -16,6 +16,16 @@ #ifdef __HAVE_ARCH_PTE_SPECIAL +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(atomic_read(&page->_count) < 0); + atomic_inc(&page->_count); +} + /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much @@ -47,6 +57,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, put_page(page); return 0; } + if (PageTail(page)) + get_huge_page_tail(page); pages[*nr] = page; (*nr)++; diff --git a/arch/sh/kernel/cpu/shmobile/cpuidle.c b/arch/sh/kernel/cpu/shmobile/cpuidle.c index 83972aa..c19e2a9 100644 --- a/arch/sh/kernel/cpu/shmobile/cpuidle.c +++ b/arch/sh/kernel/cpu/shmobile/cpuidle.c @@ -81,7 +81,6 @@ void sh_mobile_setup_cpuidle(void) state->target_residency = 1 * 2; state->power_usage = 3; state->flags = 0; - state->flags |= CPUIDLE_FLAG_SHALLOW; state->flags |= CPUIDLE_FLAG_TIME_VALID; state->enter = cpuidle_sleep_enter; diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 9163db3..d776234 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -35,7 +35,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, if (pud) { pmd = pmd_alloc(mm, pud, addr); if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, NULL, pmd, addr); } } diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c index ee3c7dd..8d348c4 100644 --- a/arch/sparc/kernel/module.c +++ b/arch/sparc/kernel/module.c @@ -23,17 +23,11 @@ static void *module_map(unsigned long size) { - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size || size > MODULES_LEN) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) + if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL, PAGE_KERNEL, -1, + __builtin_return_address(0)); } static char *dot2underscore(char *name) diff --git a/arch/sparc/mm/generic_32.c b/arch/sparc/mm/generic_32.c index 5edcac1..e6067b7 100644 --- a/arch/sparc/mm/generic_32.c +++ b/arch/sparc/mm/generic_32.c @@ -50,7 +50,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned end = PGDIR_SIZE; offset -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, NULL, pmd, address); if (!pte) return -ENOMEM; io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space); diff --git a/arch/sparc/mm/generic_64.c b/arch/sparc/mm/generic_64.c index 04f2bf4..3cb00df 100644 --- a/arch/sparc/mm/generic_64.c +++ b/arch/sparc/mm/generic_64.c @@ -92,7 +92,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned end = PGDIR_SIZE; offset -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, NULL, pmd, address); if (!pte) return -ENOMEM; io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space); diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 5fdddf1..f4e9764 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -214,7 +214,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, if (pud) { pmd = pmd_alloc(mm, pud, addr); if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, NULL, pmd, addr); } return pte; } diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 3d099f9..1aee587 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -31,7 +31,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, if (!pmd) goto out_pmd; - pte = pte_alloc_map(mm, pmd, proc); + pte = pte_alloc_map(mm, NULL, pmd, proc); if (!pte) goto out_pte; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index aa75f21..ffd7f8d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -822,6 +822,7 @@ extern bool kvm_rebooting; #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 7709c12..2071a8b 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -435,6 +435,11 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr, { PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); } +static inline void pmd_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp); +} static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -442,6 +447,12 @@ static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); } +static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp); +} + static inline pte_t __pte(pteval_t val) { pteval_t ret; @@ -543,6 +554,20 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ +#if PAGETABLE_LEVELS >= 3 + if (sizeof(pmdval_t) > sizeof(long)) + /* 5 arg words */ + pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd); + else + PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, pmd.pmd); +#endif +} +#endif + static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { pmdval_t val = native_pmd_val(pmd); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index b82bac9..8288509 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -265,10 +265,16 @@ struct pv_mmu_ops { void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); + void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmdval); void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void (*pte_update_defer)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); + void (*pmd_update)(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp); + void (*pmd_update_defer)(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp); pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 2334982..98391db 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) +{ + return __pmd(xchg((pmdval_t *)xp, 0)); +} +#else +#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) +#endif + /* * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, * split up the 29 bits of offset into this range: diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 177b016..94b979d 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -104,6 +104,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep) #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +union split_pmd { + struct { + u32 pmd_low; + u32 pmd_high; + }; + pmd_t pmd; +}; +static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) +{ + union split_pmd res, *orig = (union split_pmd *)pmdp; + + /* xchg acts as a barrier before setting of the high bits */ + res.pmd_low = xchg(&orig->pmd_low, 0); + res.pmd_high = orig->pmd_high; + orig->pmd_high = 0; + + return res.pmd; +} +#else +#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) +#endif + /* * Bits 0, 6 and 7 are taken in the low part of the pte, * put the 32 bits of offset into the high part. diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index ada823a..18601c8 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -35,6 +35,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #else /* !CONFIG_PARAVIRT */ #define set_pte(ptep, pte) native_set_pte(ptep, pte) #define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) +#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd) #define set_pte_atomic(ptep, pte) \ native_set_pte_atomic(ptep, pte) @@ -59,6 +60,8 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #define pte_update(mm, addr, ptep) do { } while (0) #define pte_update_defer(mm, addr, ptep) do { } while (0) +#define pmd_update(mm, addr, ptep) do { } while (0) +#define pmd_update_defer(mm, addr, ptep) do { } while (0) #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) @@ -94,6 +97,11 @@ static inline int pte_young(pte_t pte) return pte_flags(pte) & _PAGE_ACCESSED; } +static inline int pmd_young(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_ACCESSED; +} + static inline int pte_write(pte_t pte) { return pte_flags(pte) & _PAGE_RW; @@ -142,6 +150,23 @@ static inline int pmd_large(pmd_t pte) (_PAGE_PSE | _PAGE_PRESENT); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_trans_splitting(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_SPLITTING; +} + +static inline int pmd_trans_huge(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_PSE; +} + +static inline int has_transparent_hugepage(void) +{ + return cpu_has_pse; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + static inline pte_t pte_set_flags(pte_t pte, pteval_t set) { pteval_t v = native_pte_val(pte); @@ -216,6 +241,55 @@ static inline pte_t pte_mkspecial(pte_t pte) return pte_set_flags(pte, _PAGE_SPECIAL); } +static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +{ + pmdval_t v = native_pmd_val(pmd); + + return __pmd(v | set); +} + +static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +{ + pmdval_t v = native_pmd_val(pmd); + + return __pmd(v & ~clear); +} + +static inline pmd_t pmd_mkold(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_wrprotect(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mkdirty(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_DIRTY); +} + +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_PSE); +} + +static inline pmd_t pmd_mkyoung(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_mkwrite(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_PRESENT); +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. @@ -256,6 +330,16 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) return __pte(val); } +static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + pmdval_t val = pmd_val(pmd); + + val &= _HPAGE_CHG_MASK; + val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK; + + return __pmd(val); +} + /* mprotect needs to preserve PAT bits when updating vm_page_prot */ #define pgprot_modify pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) @@ -350,7 +434,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) +#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT) /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] @@ -524,12 +608,26 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) return res; } +static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp) +{ + pmd_t res = *pmdp; + + native_pmd_clear(pmdp); + return res; +} + static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep , pte_t pte) { native_set_pte(ptep, pte); } +static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp , pmd_t pmd) +{ + native_set_pmd(pmdp, pmd); +} + #ifndef CONFIG_PARAVIRT /* * Rules for using pte_update - it must be called after any PTE update which @@ -607,6 +705,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, #define flush_tlb_fix_spurious_fault(vma, address) +#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) + +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +extern int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty); + +#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +extern int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); + + +#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH +extern void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMD_WRITE +static inline int pmd_write(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_RW; +} + +#define __HAVE_ARCH_PMDP_GET_AND_CLEAR +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + pmd_t pmd = native_pmdp_get_and_clear(pmdp); + pmd_update(mm, addr, pmdp); + return pmd; +} + +#define __HAVE_ARCH_PMDP_SET_WRPROTECT +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); + pmd_update(mm, addr, pmdp); +} + /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index f86da20..975f709 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -59,6 +59,16 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) native_set_pte(ptep, pte); } +static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + *pmdp = pmd; +} + +static inline void native_pmd_clear(pmd_t *pmd) +{ + native_set_pmd(pmd, native_make_pmd(0)); +} + static inline pte_t native_ptep_get_and_clear(pte_t *xp) { #ifdef CONFIG_SMP @@ -72,14 +82,17 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) #endif } -static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) +static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) { - *pmdp = pmd; -} - -static inline void native_pmd_clear(pmd_t *pmd) -{ - native_set_pmd(pmd, native_make_pmd(0)); +#ifdef CONFIG_SMP + return native_make_pmd(xchg(&xp->pmd, 0)); +#else + /* native_local_pmdp_get_and_clear, + but duplicated because of cyclic dependency */ + pmd_t ret = *xp; + native_pmd_clear(xp); + return ret; +#endif } static inline void native_set_pud(pud_t *pudp, pud_t pud) @@ -168,6 +181,7 @@ extern void cleanup_highmap(void); #define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) #define __HAVE_ARCH_PTE_SAME + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index d1f4a76..7db7723 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -22,6 +22,7 @@ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 +#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ /* If _PAGE_BIT_PRESENT is clear, we use these: */ @@ -45,6 +46,7 @@ #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) +#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING) #define __HAVE_ARCH_PTE_SPECIAL #ifdef CONFIG_KMEMCHECK @@ -70,6 +72,7 @@ /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) +#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) #define _PAGE_CACHE_WB (0) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 53fd1d5..45636ce 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -761,10 +761,11 @@ extern void select_idle_routine(const struct cpuinfo_x86 *c); extern void init_c1e_mask(void); extern unsigned long boot_option_idle_override; -extern unsigned long idle_halt; -extern unsigned long idle_nomwait; extern bool c1e_detected; +enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT, + IDLE_POLL, IDLE_FORCE_MWAIT}; + extern void enable_sep_cpu(void); extern int sysenter_setup(void); diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 8760cc6..f25bdf2 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -42,6 +42,11 @@ extern unsigned int machine_to_phys_order; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); +extern int m2p_add_override(unsigned long mfn, struct page *page); +extern int m2p_remove_override(struct page *page); +extern struct page *m2p_find_override(unsigned long mfn); +extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); + static inline unsigned long pfn_to_mfn(unsigned long pfn) { unsigned long mfn; @@ -72,9 +77,6 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; - if (unlikely((mfn >> machine_to_phys_order) != 0)) - return ~0; - pfn = 0; /* * The array access can fail (e.g., device space beyond end of RAM). @@ -83,6 +85,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) */ __get_user(pfn, &machine_to_phys_mapping[mfn]); + /* + * If this appears to be a foreign mfn (because the pfn + * doesn't map back to the mfn), then check the local override + * table to see if there's a better pfn to use. + */ + if (get_phys_to_machine(pfn) != mfn) + pfn = m2p_find_override_pfn(mfn, pfn); + return pfn; } diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index ec881c6..b3a7113 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -509,6 +509,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) return 0; } +EXPORT_SYMBOL_GPL(acpi_gsi_to_irq); int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) { diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index d6fb146..df20723 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -234,6 +234,7 @@ unsigned __kprobes long oops_begin(void) bust_spinlocks(1); return flags; } +EXPORT_SYMBOL_GPL(oops_begin); void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) { diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 0c2b7ef..294f26d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 8f29560..ab23f1a 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -37,20 +37,11 @@ void *module_alloc(unsigned long size) { - struct vm_struct *area; - - if (!size) - return NULL; - size = PAGE_ALIGN(size); - if (size > MODULES_LEN) + if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, - PAGE_KERNEL_EXEC); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, + -1, __builtin_return_address(0)); } /* Free memory returned from module_alloc */ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c5b2500..869e1ae 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -421,8 +421,11 @@ struct pv_mmu_ops pv_mmu_ops = { .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, .set_pmd = native_set_pmd, + .set_pmd_at = native_set_pmd_at, .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, + .pmd_update = paravirt_nop, + .pmd_update_defer = paravirt_nop, .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 09c08a1..d8286ed 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -22,11 +22,6 @@ #include #include -unsigned long idle_halt; -EXPORT_SYMBOL(idle_halt); -unsigned long idle_nomwait; -EXPORT_SYMBOL(idle_nomwait); - struct kmem_cache *task_xstate_cachep; EXPORT_SYMBOL_GPL(task_xstate_cachep); @@ -327,7 +322,7 @@ long sys_execve(const char __user *name, /* * Idle related variables and functions */ -unsigned long boot_option_idle_override = 0; +unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; EXPORT_SYMBOL(boot_option_idle_override); /* @@ -386,6 +381,8 @@ void default_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; + trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } else { local_irq_enable(); /* loop is done by the caller */ @@ -443,8 +440,6 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); - trace_cpu_idle((ax>>4)+1, smp_processor_id()); if (!need_resched()) { if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -471,6 +466,8 @@ static void mwait_idle(void) __sti_mwait(0, 0); else local_irq_enable(); + trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } else local_irq_enable(); } @@ -503,7 +500,6 @@ static void poll_idle(void) * * idle=mwait overrides this decision and forces the usage of mwait. */ -static int __cpuinitdata force_mwait; #define MWAIT_INFO 0x05 #define MWAIT_ECX_EXTENDED_INFO 0x01 @@ -513,7 +509,7 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; - if (force_mwait) + if (boot_option_idle_override == IDLE_FORCE_MWAIT) return 1; if (c->cpuid_level < MWAIT_INFO) @@ -633,9 +629,10 @@ static int __init idle_setup(char *str) if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); pm_idle = poll_idle; - } else if (!strcmp(str, "mwait")) - force_mwait = 1; - else if (!strcmp(str, "halt")) { + boot_option_idle_override = IDLE_POLL; + } else if (!strcmp(str, "mwait")) { + boot_option_idle_override = IDLE_FORCE_MWAIT; + } else if (!strcmp(str, "halt")) { /* * When the boot option of idle=halt is added, halt is * forced to be used for CPU idle. In such case CPU C2/C3 @@ -644,8 +641,7 @@ static int __init idle_setup(char *str) * the boot_option_idle_override. */ pm_idle = default_idle; - idle_halt = 1; - return 0; + boot_option_idle_override = IDLE_HALT; } else if (!strcmp(str, "nomwait")) { /* * If the boot option of "idle=nomwait" is added, @@ -653,12 +649,10 @@ static int __init idle_setup(char *str) * states. In such case it won't touch the variable * of boot_option_idle_override. */ - idle_nomwait = 1; - return 0; + boot_option_idle_override = IDLE_NOMWAIT; } else return -1; - boot_option_idle_override = 1; return 0; } early_param("idle", idle_setup); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4b9befa..8d12878 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -57,8 +57,6 @@ #include #include -#include - asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); /* @@ -113,8 +111,6 @@ void cpu_idle(void) stop_critical_timings(); pm_idle(); start_critical_timings(); - trace_power_end(smp_processor_id()); - trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 4c818a7..bd387e8 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -51,8 +51,6 @@ #include #include -#include - asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); @@ -141,10 +139,6 @@ void cpu_idle(void) pm_idle(); start_critical_timings(); - trace_power_end(smp_processor_id()); - trace_cpu_idle(PWR_EVENT_EXIT, - smp_processor_id()); - /* In many cases the interrupt that ended idle has already called exit_idle. But some idle loops can be woken up without interrupt. */ diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index c2f1b26..998e972 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -133,7 +133,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pmd = pmd_alloc(&tboot_mm, pud, vaddr); if (!pmd) return -1; - pte = pte_alloc_map(&tboot_mm, pmd, vaddr); + pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr); if (!pte) return -1; set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 61fb985..863f875 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); + split_huge_page_pmd(mm, pmd); if (pmd_none_or_clear_bad(pmd)) goto out; pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9cafbb4..f02b8ed 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -554,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) { struct kvm_memory_slot *slot; - int host_level, level, max_level; - slot = gfn_to_memslot(vcpu->kvm, large_gfn); if (slot && slot->dirty_bitmap) - return PT_PAGE_TABLE_LEVEL; + return true; + return false; +} + +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +{ + int host_level, level, max_level; host_level = host_mapping_level(vcpu->kvm, large_gfn); @@ -941,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, return young; } +static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long data) +{ + u64 *spte; + int young = 0; + + /* + * If there's no access bit in the secondary pte set by the + * hardware it's up to gup-fast/gup to set the access bit in + * the primary pte or in the page structure. + */ + if (!shadow_accessed_mask) + goto out; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + u64 _spte = *spte; + BUG_ON(!(_spte & PT_PRESENT_MASK)); + young = _spte & PT_ACCESSED_MASK; + if (young) { + young = 1; + break; + } + spte = rmap_next(kvm, rmapp, spte); + } +out: + return young; +} + #define RMAP_RECYCLE_THRESHOLD 1000 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) @@ -961,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva) return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); } +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); +} + #ifdef MMU_DEBUG static int is_empty_shadow_page(u64 *spt) { @@ -2281,6 +2319,48 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) return 1; } +static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, + gfn_t *gfnp, pfn_t *pfnp, int *levelp) +{ + pfn_t pfn = *pfnp; + gfn_t gfn = *gfnp; + int level = *levelp; + + /* + * Check if it's a transparent hugepage. If this would be an + * hugetlbfs page, level wouldn't be set to + * PT_PAGE_TABLE_LEVEL and there would be no adjustment done + * here. + */ + if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && + level == PT_PAGE_TABLE_LEVEL && + PageTransCompound(pfn_to_page(pfn)) && + !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { + unsigned long mask; + /* + * mmu_notifier_retry was successful and we hold the + * mmu_lock here, so the pmd can't become splitting + * from under us, and in turn + * __split_huge_page_refcount() can't run from under + * us and we can safely transfer the refcount from + * PG_tail to PG_head as we switch the pfn to tail to + * head. + */ + *levelp = level = PT_DIRECTORY_LEVEL; + mask = KVM_PAGES_PER_HPAGE(level) - 1; + VM_BUG_ON((gfn & mask) != (pfn & mask)); + if (pfn & mask) { + gfn &= ~mask; + *gfnp = gfn; + kvm_release_pfn_clean(pfn); + pfn &= ~mask; + if (!get_page_unless_zero(pfn_to_page(pfn))) + BUG(); + *pfnp = pfn; + } + } +} + static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable); @@ -2289,20 +2369,25 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, { int r; int level; + int force_pt_level; pfn_t pfn; unsigned long mmu_seq; bool map_writable; - level = mapping_level(vcpu, gfn); - - /* - * This path builds a PAE pagetable - so we can map 2mb pages at - * maximum. Therefore check if the level is larger than that. - */ - if (level > PT_DIRECTORY_LEVEL) - level = PT_DIRECTORY_LEVEL; + force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + if (likely(!force_pt_level)) { + level = mapping_level(vcpu, gfn); + /* + * This path builds a PAE pagetable - so we can map + * 2mb pages at maximum. Therefore check if the level + * is larger than that. + */ + if (level > PT_DIRECTORY_LEVEL) + level = PT_DIRECTORY_LEVEL; - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + } else + level = PT_PAGE_TABLE_LEVEL; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -2318,6 +2403,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, prefault); spin_unlock(&vcpu->kvm->mmu_lock); @@ -2655,6 +2742,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, pfn_t pfn; int r; int level; + int force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; @@ -2667,9 +2755,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - level = mapping_level(vcpu, gfn); - - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + if (likely(!force_pt_level)) { + level = mapping_level(vcpu, gfn); + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + } else + level = PT_PAGE_TABLE_LEVEL; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -2684,6 +2775,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, gpa, write, map_writable, level, gfn, pfn, prefault); spin_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 53210f1..6bccc24 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -550,6 +550,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; + int force_pt_level; unsigned long mmu_seq; bool map_writable; @@ -577,7 +578,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } - if (walker.level >= PT_DIRECTORY_LEVEL) { + if (walker.level >= PT_DIRECTORY_LEVEL) + force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); + else + force_pt_level = 1; + if (!force_pt_level) { level = min(walker.level, mapping_level(vcpu, walker.gfn)); walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); } @@ -599,6 +604,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); + if (!force_pt_level) + transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, level, &write_pt, pfn, map_writable, prefault); (void)sptep; diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 738e659..dbe34b9 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); get_page(page); + SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr) VM_BUG_ON(page != compound_head(page)); VM_BUG_ON(page_count(page) == 0); atomic_add(nr, &page->_count); + SetPageReferenced(page); +} + +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(atomic_read(&page->_count) < 0); + atomic_inc(&page->_count); } static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, @@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; + if (PageTail(page)) + get_huge_page_tail(page); (*nr)++; page++; refs++; @@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = *pmdp; next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) + /* + * The pmd_trans_splitting() check below explains why + * pmdp_splitting_flush has to flush the tlb, to stop + * this gup-fast code from running while we set the + * splitting bit in the pmd. Returning zero will take + * the slow path that will call wait_split_huge_page() + * if the pmd is still in splitting state. gup-fast + * can't because it has irq disabled and + * wait_split_huge_page() would never return as the + * tlb flush IPI wouldn't run. + */ + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; if (unlikely(pmd_large(pmd))) { if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8be8c7d..500242d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma, return changed; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + int changed = !pmd_same(*pmdp, entry); + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + if (changed && dirty) { + *pmdp = entry; + pmd_update_defer(vma->vm_mm, address, pmdp); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } + + return changed; +} +#endif + int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, return ret; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + int ret = 0; + + if (pmd_young(*pmdp)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *)pmdp); + + if (ret) + pmd_update(vma->vm_mm, addr, pmdp); + + return ret; +} +#endif + int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { @@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma, return young; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int young; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + young = pmdp_test_and_clear_young(vma, address, pmdp); + if (young) + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + + return young; +} + +void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int set; + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + set = !test_and_set_bit(_PAGE_BIT_SPLITTING, + (unsigned long *)pmdp); + if (set) { + pmd_update(vma->vm_mm, address, pmdp); + /* need tlb flush only to serialize against gup-fast */ + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } +} +#endif + /** * reserve_top_address - reserves a hole in the top of kernel address space * @reserve - size of hole to reserve diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 7793851..17c565d 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -12,7 +12,8 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ - grant-table.o suspend.o platform-pci-unplug.o + grant-table.o suspend.o platform-pci-unplug.o \ + p2m.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 44924e5..7575e55 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -173,371 +173,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ */ #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) -/* - * Xen leaves the responsibility for maintaining p2m mappings to the - * guests themselves, but it must also access and update the p2m array - * during suspend/resume when all the pages are reallocated. - * - * The p2m table is logically a flat array, but we implement it as a - * three-level tree to allow the address space to be sparse. - * - * Xen - * | - * p2m_top p2m_top_mfn - * / \ / \ - * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn - * / \ / \ / / - * p2m p2m p2m p2m p2m p2m p2m ... - * - * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. - * - * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the - * maximum representable pseudo-physical address space is: - * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages - * - * P2M_PER_PAGE depends on the architecture, as a mfn is always - * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to - * 512 and 1024 entries respectively. - */ - -unsigned long xen_max_p2m_pfn __read_mostly; - -#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) -#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) - -#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) - -/* Placeholders for holes in the address space */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); - -static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); - -RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); -RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); - -static inline unsigned p2m_top_index(unsigned long pfn) -{ - BUG_ON(pfn >= MAX_P2M_PFN); - return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); -} - -static inline unsigned p2m_mid_index(unsigned long pfn) -{ - return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; -} - -static inline unsigned p2m_index(unsigned long pfn) -{ - return pfn % P2M_PER_PAGE; -} - -static void p2m_top_init(unsigned long ***top) -{ - unsigned i; - - for (i = 0; i < P2M_TOP_PER_PAGE; i++) - top[i] = p2m_mid_missing; -} - -static void p2m_top_mfn_init(unsigned long *top) -{ - unsigned i; - - for (i = 0; i < P2M_TOP_PER_PAGE; i++) - top[i] = virt_to_mfn(p2m_mid_missing_mfn); -} - -static void p2m_top_mfn_p_init(unsigned long **top) -{ - unsigned i; - - for (i = 0; i < P2M_TOP_PER_PAGE; i++) - top[i] = p2m_mid_missing_mfn; -} - -static void p2m_mid_init(unsigned long **mid) -{ - unsigned i; - - for (i = 0; i < P2M_MID_PER_PAGE; i++) - mid[i] = p2m_missing; -} - -static void p2m_mid_mfn_init(unsigned long *mid) -{ - unsigned i; - - for (i = 0; i < P2M_MID_PER_PAGE; i++) - mid[i] = virt_to_mfn(p2m_missing); -} - -static void p2m_init(unsigned long *p2m) -{ - unsigned i; - - for (i = 0; i < P2M_MID_PER_PAGE; i++) - p2m[i] = INVALID_P2M_ENTRY; -} - -/* - * Build the parallel p2m_top_mfn and p2m_mid_mfn structures - * - * This is called both at boot time, and after resuming from suspend: - * - At boot time we're called very early, and must use extend_brk() - * to allocate memory. - * - * - After resume we're called from within stop_machine, but the mfn - * tree should alreay be completely allocated. - */ -void xen_build_mfn_list_list(void) -{ - unsigned long pfn; - - /* Pre-initialize p2m_top_mfn to be completely missing */ - if (p2m_top_mfn == NULL) { - p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(p2m_mid_missing_mfn); - - p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_top_mfn_p_init(p2m_top_mfn_p); - - p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_top_mfn_init(p2m_top_mfn); - } else { - /* Reinitialise, mfn's all change after migration */ - p2m_mid_mfn_init(p2m_mid_missing_mfn); - } - - for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); - unsigned mididx = p2m_mid_index(pfn); - unsigned long **mid; - unsigned long *mid_mfn_p; - - mid = p2m_top[topidx]; - mid_mfn_p = p2m_top_mfn_p[topidx]; - - /* Don't bother allocating any mfn mid levels if - * they're just missing, just update the stored mfn, - * since all could have changed over a migrate. - */ - if (mid == p2m_mid_missing) { - BUG_ON(mididx); - BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); - p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); - pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; - continue; - } - - if (mid_mfn_p == p2m_mid_missing_mfn) { - /* - * XXX boot-time only! We should never find - * missing parts of the mfn tree after - * runtime. extend_brk() will BUG if we call - * it too late. - */ - mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(mid_mfn_p); - - p2m_top_mfn_p[topidx] = mid_mfn_p; - } - - p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); - mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); - } -} - -void xen_setup_mfn_list_list(void) -{ - BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); - - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - virt_to_mfn(p2m_top_mfn); - HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; -} - -/* Set up p2m_top to point to the domain-builder provided p2m pages */ -void __init xen_build_dynamic_phys_to_machine(void) -{ - unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; - unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); - unsigned long pfn; - - xen_max_p2m_pfn = max_pfn; - - p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_init(p2m_missing); - - p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(p2m_mid_missing); - - p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_top_init(p2m_top); - - /* - * The domain builder gives us a pre-constructed p2m array in - * mfn_list for all the pages initially given to us, so we just - * need to graft that into our tree structure. - */ - for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); - unsigned mididx = p2m_mid_index(pfn); - - if (p2m_top[topidx] == p2m_mid_missing) { - unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(mid); - - p2m_top[topidx] = mid; - } - - p2m_top[topidx][mididx] = &mfn_list[pfn]; - } -} - -unsigned long get_phys_to_machine(unsigned long pfn) -{ - unsigned topidx, mididx, idx; - - if (unlikely(pfn >= MAX_P2M_PFN)) - return INVALID_P2M_ENTRY; - - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - idx = p2m_index(pfn); - - return p2m_top[topidx][mididx][idx]; -} -EXPORT_SYMBOL_GPL(get_phys_to_machine); - -static void *alloc_p2m_page(void) -{ - return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); -} - -static void free_p2m_page(void *p) -{ - free_page((unsigned long)p); -} - -/* - * Fully allocate the p2m structure for a given pfn. We need to check - * that both the top and mid levels are allocated, and make sure the - * parallel mfn tree is kept in sync. We may race with other cpus, so - * the new pages are installed with cmpxchg; if we lose the race then - * simply free the page we allocated and use the one that's there. - */ -static bool alloc_p2m(unsigned long pfn) -{ - unsigned topidx, mididx; - unsigned long ***top_p, **mid; - unsigned long *top_mfn_p, *mid_mfn; - - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - - top_p = &p2m_top[topidx]; - mid = *top_p; - - if (mid == p2m_mid_missing) { - /* Mid level is missing, allocate a new one */ - mid = alloc_p2m_page(); - if (!mid) - return false; - - p2m_mid_init(mid); - - if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) - free_p2m_page(mid); - } - - top_mfn_p = &p2m_top_mfn[topidx]; - mid_mfn = p2m_top_mfn_p[topidx]; - - BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); - - if (mid_mfn == p2m_mid_missing_mfn) { - /* Separately check the mid mfn level */ - unsigned long missing_mfn; - unsigned long mid_mfn_mfn; - - mid_mfn = alloc_p2m_page(); - if (!mid_mfn) - return false; - - p2m_mid_mfn_init(mid_mfn); - - missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); - mid_mfn_mfn = virt_to_mfn(mid_mfn); - if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) - free_p2m_page(mid_mfn); - else - p2m_top_mfn_p[topidx] = mid_mfn; - } - - if (p2m_top[topidx][mididx] == p2m_missing) { - /* p2m leaf page is missing */ - unsigned long *p2m; - - p2m = alloc_p2m_page(); - if (!p2m) - return false; - - p2m_init(p2m); - - if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) - free_p2m_page(p2m); - else - mid_mfn[mididx] = virt_to_mfn(p2m); - } - - return true; -} - -/* Try to install p2m mapping; fail if intermediate bits missing */ -bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ - unsigned topidx, mididx, idx; - - if (unlikely(pfn >= MAX_P2M_PFN)) { - BUG_ON(mfn != INVALID_P2M_ENTRY); - return true; - } - - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - idx = p2m_index(pfn); - - if (p2m_top[topidx][mididx] == p2m_missing) - return mfn == INVALID_P2M_ENTRY; - - p2m_top[topidx][mididx][idx] = mfn; - - return true; -} - -bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ - if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { - BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); - return true; - } - - if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - if (!alloc_p2m(pfn)) - return false; - - if (!__set_phys_to_machine(pfn, mfn)) - return false; - } - - return true; -} - unsigned long arbitrary_virt_to_mfn(void *vaddr) { xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c new file mode 100644 index 0000000..8f2251d --- /dev/null +++ b/arch/x86/xen/p2m.c @@ -0,0 +1,510 @@ +/* + * Xen leaves the responsibility for maintaining p2m mappings to the + * guests themselves, but it must also access and update the p2m array + * during suspend/resume when all the pages are reallocated. + * + * The p2m table is logically a flat array, but we implement it as a + * three-level tree to allow the address space to be sparse. + * + * Xen + * | + * p2m_top p2m_top_mfn + * / \ / \ + * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn + * / \ / \ / / + * p2m p2m p2m p2m p2m p2m p2m ... + * + * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. + * + * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the + * maximum representable pseudo-physical address space is: + * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages + * + * P2M_PER_PAGE depends on the architecture, as a mfn is always + * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to + * 512 and 1024 entries respectively. + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "xen-ops.h" + +static void __init m2p_override_init(void); + +unsigned long xen_max_p2m_pfn __read_mostly; + +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) + +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) + +/* Placeholders for holes in the address space */ +static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); + +static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); + +RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); +RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); + +static inline unsigned p2m_top_index(unsigned long pfn) +{ + BUG_ON(pfn >= MAX_P2M_PFN); + return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); +} + +static inline unsigned p2m_mid_index(unsigned long pfn) +{ + return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; +} + +static inline unsigned p2m_index(unsigned long pfn) +{ + return pfn % P2M_PER_PAGE; +} + +static void p2m_top_init(unsigned long ***top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing; +} + +static void p2m_top_mfn_init(unsigned long *top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = virt_to_mfn(p2m_mid_missing_mfn); +} + +static void p2m_top_mfn_p_init(unsigned long **top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing_mfn; +} + +static void p2m_mid_init(unsigned long **mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = p2m_missing; +} + +static void p2m_mid_mfn_init(unsigned long *mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = virt_to_mfn(p2m_missing); +} + +static void p2m_init(unsigned long *p2m) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + p2m[i] = INVALID_P2M_ENTRY; +} + +/* + * Build the parallel p2m_top_mfn and p2m_mid_mfn structures + * + * This is called both at boot time, and after resuming from suspend: + * - At boot time we're called very early, and must use extend_brk() + * to allocate memory. + * + * - After resume we're called from within stop_machine, but the mfn + * tree should alreay be completely allocated. + */ +void xen_build_mfn_list_list(void) +{ + unsigned long pfn; + + /* Pre-initialize p2m_top_mfn to be completely missing */ + if (p2m_top_mfn == NULL) { + p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(p2m_mid_missing_mfn); + + p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_p_init(p2m_top_mfn_p); + + p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_init(p2m_top_mfn); + } else { + /* Reinitialise, mfn's all change after migration */ + p2m_mid_mfn_init(p2m_mid_missing_mfn); + } + + for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + unsigned long **mid; + unsigned long *mid_mfn_p; + + mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; + + /* Don't bother allocating any mfn mid levels if + * they're just missing, just update the stored mfn, + * since all could have changed over a migrate. + */ + if (mid == p2m_mid_missing) { + BUG_ON(mididx); + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); + pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; + continue; + } + + if (mid_mfn_p == p2m_mid_missing_mfn) { + /* + * XXX boot-time only! We should never find + * missing parts of the mfn tree after + * runtime. extend_brk() will BUG if we call + * it too late. + */ + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p); + + p2m_top_mfn_p[topidx] = mid_mfn_p; + } + + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); + mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); + } +} + +void xen_setup_mfn_list_list(void) +{ + BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); + + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + virt_to_mfn(p2m_top_mfn); + HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; +} + +/* Set up p2m_top to point to the domain-builder provided p2m pages */ +void __init xen_build_dynamic_phys_to_machine(void) +{ + unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; + unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); + unsigned long pfn; + + xen_max_p2m_pfn = max_pfn; + + p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_missing); + + p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(p2m_mid_missing); + + p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_init(p2m_top); + + /* + * The domain builder gives us a pre-constructed p2m array in + * mfn_list for all the pages initially given to us, so we just + * need to graft that into our tree structure. + */ + for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + + if (p2m_top[topidx] == p2m_mid_missing) { + unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(mid); + + p2m_top[topidx] = mid; + } + + p2m_top[topidx][mididx] = &mfn_list[pfn]; + } + + m2p_override_init(); +} + +unsigned long get_phys_to_machine(unsigned long pfn) +{ + unsigned topidx, mididx, idx; + + if (unlikely(pfn >= MAX_P2M_PFN)) + return INVALID_P2M_ENTRY; + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + return p2m_top[topidx][mididx][idx]; +} +EXPORT_SYMBOL_GPL(get_phys_to_machine); + +static void *alloc_p2m_page(void) +{ + return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); +} + +static void free_p2m_page(void *p) +{ + free_page((unsigned long)p); +} + +/* + * Fully allocate the p2m structure for a given pfn. We need to check + * that both the top and mid levels are allocated, and make sure the + * parallel mfn tree is kept in sync. We may race with other cpus, so + * the new pages are installed with cmpxchg; if we lose the race then + * simply free the page we allocated and use the one that's there. + */ +static bool alloc_p2m(unsigned long pfn) +{ + unsigned topidx, mididx; + unsigned long ***top_p, **mid; + unsigned long *top_mfn_p, *mid_mfn; + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + + top_p = &p2m_top[topidx]; + mid = *top_p; + + if (mid == p2m_mid_missing) { + /* Mid level is missing, allocate a new one */ + mid = alloc_p2m_page(); + if (!mid) + return false; + + p2m_mid_init(mid); + + if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) + free_p2m_page(mid); + } + + top_mfn_p = &p2m_top_mfn[topidx]; + mid_mfn = p2m_top_mfn_p[topidx]; + + BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); + + if (mid_mfn == p2m_mid_missing_mfn) { + /* Separately check the mid mfn level */ + unsigned long missing_mfn; + unsigned long mid_mfn_mfn; + + mid_mfn = alloc_p2m_page(); + if (!mid_mfn) + return false; + + p2m_mid_mfn_init(mid_mfn); + + missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); + mid_mfn_mfn = virt_to_mfn(mid_mfn); + if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) + free_p2m_page(mid_mfn); + else + p2m_top_mfn_p[topidx] = mid_mfn; + } + + if (p2m_top[topidx][mididx] == p2m_missing) { + /* p2m leaf page is missing */ + unsigned long *p2m; + + p2m = alloc_p2m_page(); + if (!p2m) + return false; + + p2m_init(p2m); + + if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) + free_p2m_page(p2m); + else + mid_mfn[mididx] = virt_to_mfn(p2m); + } + + return true; +} + +/* Try to install p2m mapping; fail if intermediate bits missing */ +bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + unsigned topidx, mididx, idx; + + if (unlikely(pfn >= MAX_P2M_PFN)) { + BUG_ON(mfn != INVALID_P2M_ENTRY); + return true; + } + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + if (p2m_top[topidx][mididx] == p2m_missing) + return mfn == INVALID_P2M_ENTRY; + + p2m_top[topidx][mididx][idx] = mfn; + + return true; +} + +bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return true; + } + + if (unlikely(!__set_phys_to_machine(pfn, mfn))) { + if (!alloc_p2m(pfn)) + return false; + + if (!__set_phys_to_machine(pfn, mfn)) + return false; + } + + return true; +} + +#define M2P_OVERRIDE_HASH_SHIFT 10 +#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT) + +static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH); +static DEFINE_SPINLOCK(m2p_override_lock); + +static void __init m2p_override_init(void) +{ + unsigned i; + + m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, + sizeof(unsigned long)); + + for (i = 0; i < M2P_OVERRIDE_HASH; i++) + INIT_LIST_HEAD(&m2p_overrides[i]); +} + +static unsigned long mfn_hash(unsigned long mfn) +{ + return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); +} + +/* Add an MFN override for a particular page */ +int m2p_add_override(unsigned long mfn, struct page *page) +{ + unsigned long flags; + unsigned long pfn; + unsigned long address; + unsigned level; + pte_t *ptep = NULL; + + pfn = page_to_pfn(page); + if (!PageHighMem(page)) { + address = (unsigned long)__va(pfn << PAGE_SHIFT); + ptep = lookup_address(address, &level); + + if (WARN(ptep == NULL || level != PG_LEVEL_4K, + "m2p_add_override: pfn %lx not mapped", pfn)) + return -EINVAL; + } + + page->private = mfn; + page->index = pfn_to_mfn(pfn); + + __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); + if (!PageHighMem(page)) + /* Just zap old mapping for now */ + pte_clear(&init_mm, address, ptep); + + spin_lock_irqsave(&m2p_override_lock, flags); + list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); + spin_unlock_irqrestore(&m2p_override_lock, flags); + + return 0; +} + +int m2p_remove_override(struct page *page) +{ + unsigned long flags; + unsigned long mfn; + unsigned long pfn; + unsigned long address; + unsigned level; + pte_t *ptep = NULL; + + pfn = page_to_pfn(page); + mfn = get_phys_to_machine(pfn); + if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) + return -EINVAL; + + if (!PageHighMem(page)) { + address = (unsigned long)__va(pfn << PAGE_SHIFT); + ptep = lookup_address(address, &level); + + if (WARN(ptep == NULL || level != PG_LEVEL_4K, + "m2p_remove_override: pfn %lx not mapped", pfn)) + return -EINVAL; + } + + spin_lock_irqsave(&m2p_override_lock, flags); + list_del(&page->lru); + spin_unlock_irqrestore(&m2p_override_lock, flags); + __set_phys_to_machine(pfn, page->index); + + if (!PageHighMem(page)) + set_pte_at(&init_mm, address, ptep, + pfn_pte(pfn, PAGE_KERNEL)); + /* No tlb flush necessary because the caller already + * left the pte unmapped. */ + + return 0; +} + +struct page *m2p_find_override(unsigned long mfn) +{ + unsigned long flags; + struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)]; + struct page *p, *ret; + + ret = NULL; + + spin_lock_irqsave(&m2p_override_lock, flags); + + list_for_each_entry(p, bucket, lru) { + if (p->private == mfn) { + ret = p; + break; + } + } + + spin_unlock_irqrestore(&m2p_override_lock, flags); + + return ret; +} + +unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) +{ + struct page *p = m2p_find_override(mfn); + unsigned long ret = pfn; + + if (p) + ret = page_to_pfn(p); + + return ret; +} +EXPORT_SYMBOL_GPL(m2p_find_override_pfn); diff --git a/arch/xtensa/include/asm/mman.h b/arch/xtensa/include/asm/mman.h index fca4db4..3078901 100644 --- a/arch/xtensa/include/asm/mman.h +++ b/arch/xtensa/include/asm/mman.h @@ -83,6 +83,9 @@ #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ +#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 3f3489c..10c7ad5 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -51,12 +51,7 @@ config ACPI_PROCFS For backwards compatibility, this option allows deprecated /proc/acpi/ files to exist, even when they have been replaced by functions in /sys. - The deprecated files (and their replacements) include: - /proc/acpi/processor/*/throttling (/sys/class/thermal/ - cooling_device*/*) - /proc/acpi/video/*/brightness (/sys/class/backlight/) - /proc/acpi/thermal_zone/*/* (/sys/class/thermal/) This option has no effect on /proc/acpi/ files and functions which do not yet exist in /sys. @@ -74,6 +69,8 @@ config ACPI_PROCFS_POWER /proc/acpi/ac_adapter/* (sys/class/power_supply/*) This option has no effect on /proc/acpi/ directories and functions, which do not yet exist in /sys + This option, together with the proc directories, will be + deleted in 2.6.39. Say N to delete power /proc/acpi/ directories that have moved to /sys/ @@ -209,6 +206,17 @@ config ACPI_PROCESSOR To compile this driver as a module, choose M here: the module will be called processor. +config ACPI_IPMI + tristate "IPMI" + depends on EXPERIMENTAL && IPMI_SI && IPMI_HANDLER + default n + help + This driver enables the ACPI to access the BMC controller. And it + uses the IPMI request/response message to communicate with BMC + controller, which can be found on on the server. + + To compile this driver as a module, choose M here: + the module will be called as acpi_ipmi. config ACPI_HOTPLUG_CPU bool diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 3d031d0..d113fa5 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -24,7 +24,7 @@ acpi-y += atomicio.o # sleep related files acpi-y += wakeup.o acpi-y += sleep.o -acpi-$(CONFIG_ACPI_SLEEP) += proc.o +acpi-$(CONFIG_ACPI_SLEEP) += proc.o nvs.o # @@ -69,5 +69,6 @@ processor-y += processor_idle.o processor_thermal.o processor-$(CONFIG_CPU_FREQ) += processor_perflib.o obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o +obj-$(CONFIG_ACPI_IPMI) += acpi_ipmi.o obj-$(CONFIG_ACPI_APEI) += apei/ diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c index 25d3aae..58c3f74 100644 --- a/drivers/acpi/ac.c +++ b/drivers/acpi/ac.c @@ -197,7 +197,8 @@ static int acpi_ac_add_fs(struct acpi_device *device) { struct proc_dir_entry *entry = NULL; - + printk(KERN_WARNING PREFIX "Deprecated procfs I/F for AC is loaded," + " please retry with CONFIG_ACPI_PROCFS_POWER cleared\n"); if (!acpi_device_dir(device)) { acpi_device_dir(device) = proc_mkdir(acpi_device_bid(device), acpi_ac_dir); diff --git a/drivers/acpi/acpi_ipmi.c b/drivers/acpi/acpi_ipmi.c new file mode 100644 index 0000000..f40acef --- /dev/null +++ b/drivers/acpi/acpi_ipmi.c @@ -0,0 +1,525 @@ +/* + * acpi_ipmi.c - ACPI IPMI opregion + * + * Copyright (C) 2010 Intel Corporation + * Copyright (C) 2010 Zhao Yakui + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Zhao Yakui"); +MODULE_DESCRIPTION("ACPI IPMI Opregion driver"); +MODULE_LICENSE("GPL"); + +#define IPMI_FLAGS_HANDLER_INSTALL 0 + +#define ACPI_IPMI_OK 0 +#define ACPI_IPMI_TIMEOUT 0x10 +#define ACPI_IPMI_UNKNOWN 0x07 +/* the IPMI timeout is 5s */ +#define IPMI_TIMEOUT (5 * HZ) + +struct acpi_ipmi_device { + /* the device list attached to driver_data.ipmi_devices */ + struct list_head head; + /* the IPMI request message list */ + struct list_head tx_msg_list; + struct mutex tx_msg_lock; + acpi_handle handle; + struct pnp_dev *pnp_dev; + ipmi_user_t user_interface; + int ipmi_ifnum; /* IPMI interface number */ + long curr_msgid; + unsigned long flags; + struct ipmi_smi_info smi_data; +}; + +struct ipmi_driver_data { + struct list_head ipmi_devices; + struct ipmi_smi_watcher bmc_events; + struct ipmi_user_hndl ipmi_hndlrs; + struct mutex ipmi_lock; +}; + +struct acpi_ipmi_msg { + struct list_head head; + /* + * General speaking the addr type should be SI_ADDR_TYPE. And + * the addr channel should be BMC. + * In fact it can also be IPMB type. But we will have to + * parse it from the Netfn command buffer. It is so complex + * that it is skipped. + */ + struct ipmi_addr addr; + long tx_msgid; + /* it is used to track whether the IPMI message is finished */ + struct completion tx_complete; + struct kernel_ipmi_msg tx_message; + int msg_done; + /* tx data . And copy it from ACPI object buffer */ + u8 tx_data[64]; + int tx_len; + u8 rx_data[64]; + int rx_len; + struct acpi_ipmi_device *device; +}; + +/* IPMI request/response buffer per ACPI 4.0, sec 5.5.2.4.3.2 */ +struct acpi_ipmi_buffer { + u8 status; + u8 length; + u8 data[64]; +}; + +static void ipmi_register_bmc(int iface, struct device *dev); +static void ipmi_bmc_gone(int iface); +static void ipmi_msg_handler(struct ipmi_recv_msg *msg, void *user_msg_data); +static void acpi_add_ipmi_device(struct acpi_ipmi_device *ipmi_device); +static void acpi_remove_ipmi_device(struct acpi_ipmi_device *ipmi_device); + +static struct ipmi_driver_data driver_data = { + .ipmi_devices = LIST_HEAD_INIT(driver_data.ipmi_devices), + .bmc_events = { + .owner = THIS_MODULE, + .new_smi = ipmi_register_bmc, + .smi_gone = ipmi_bmc_gone, + }, + .ipmi_hndlrs = { + .ipmi_recv_hndl = ipmi_msg_handler, + }, +}; + +static struct acpi_ipmi_msg *acpi_alloc_ipmi_msg(struct acpi_ipmi_device *ipmi) +{ + struct acpi_ipmi_msg *ipmi_msg; + struct pnp_dev *pnp_dev = ipmi->pnp_dev; + + ipmi_msg = kzalloc(sizeof(struct acpi_ipmi_msg), GFP_KERNEL); + if (!ipmi_msg) { + dev_warn(&pnp_dev->dev, "Can't allocate memory for ipmi_msg\n"); + return NULL; + } + init_completion(&ipmi_msg->tx_complete); + INIT_LIST_HEAD(&ipmi_msg->head); + ipmi_msg->device = ipmi; + return ipmi_msg; +} + +#define IPMI_OP_RGN_NETFN(offset) ((offset >> 8) & 0xff) +#define IPMI_OP_RGN_CMD(offset) (offset & 0xff) +static void acpi_format_ipmi_msg(struct acpi_ipmi_msg *tx_msg, + acpi_physical_address address, + acpi_integer *value) +{ + struct kernel_ipmi_msg *msg; + struct acpi_ipmi_buffer *buffer; + struct acpi_ipmi_device *device; + + msg = &tx_msg->tx_message; + /* + * IPMI network function and command are encoded in the address + * within the IPMI OpRegion; see ACPI 4.0, sec 5.5.2.4.3. + */ + msg->netfn = IPMI_OP_RGN_NETFN(address); + msg->cmd = IPMI_OP_RGN_CMD(address); + msg->data = tx_msg->tx_data; + /* + * value is the parameter passed by the IPMI opregion space handler. + * It points to the IPMI request message buffer + */ + buffer = (struct acpi_ipmi_buffer *)value; + /* copy the tx message data */ + msg->data_len = buffer->length; + memcpy(tx_msg->tx_data, buffer->data, msg->data_len); + /* + * now the default type is SYSTEM_INTERFACE and channel type is BMC. + * If the netfn is APP_REQUEST and the cmd is SEND_MESSAGE, + * the addr type should be changed to IPMB. Then we will have to parse + * the IPMI request message buffer to get the IPMB address. + * If so, please fix me. + */ + tx_msg->addr.addr_type = IPMI_SYSTEM_INTERFACE_ADDR_TYPE; + tx_msg->addr.channel = IPMI_BMC_CHANNEL; + tx_msg->addr.data[0] = 0; + + /* Get the msgid */ + device = tx_msg->device; + mutex_lock(&device->tx_msg_lock); + device->curr_msgid++; + tx_msg->tx_msgid = device->curr_msgid; + mutex_unlock(&device->tx_msg_lock); +} + +static void acpi_format_ipmi_response(struct acpi_ipmi_msg *msg, + acpi_integer *value, int rem_time) +{ + struct acpi_ipmi_buffer *buffer; + + /* + * value is also used as output parameter. It represents the response + * IPMI message returned by IPMI command. + */ + buffer = (struct acpi_ipmi_buffer *)value; + if (!rem_time && !msg->msg_done) { + buffer->status = ACPI_IPMI_TIMEOUT; + return; + } + /* + * If the flag of msg_done is not set or the recv length is zero, it + * means that the IPMI command is not executed correctly. + * The status code will be ACPI_IPMI_UNKNOWN. + */ + if (!msg->msg_done || !msg->rx_len) { + buffer->status = ACPI_IPMI_UNKNOWN; + return; + } + /* + * If the IPMI response message is obtained correctly, the status code + * will be ACPI_IPMI_OK + */ + buffer->status = ACPI_IPMI_OK; + buffer->length = msg->rx_len; + memcpy(buffer->data, msg->rx_data, msg->rx_len); +} + +static void ipmi_flush_tx_msg(struct acpi_ipmi_device *ipmi) +{ + struct acpi_ipmi_msg *tx_msg, *temp; + int count = HZ / 10; + struct pnp_dev *pnp_dev = ipmi->pnp_dev; + + list_for_each_entry_safe(tx_msg, temp, &ipmi->tx_msg_list, head) { + /* wake up the sleep thread on the Tx msg */ + complete(&tx_msg->tx_complete); + } + + /* wait for about 100ms to flush the tx message list */ + while (count--) { + if (list_empty(&ipmi->tx_msg_list)) + break; + schedule_timeout(1); + } + if (!list_empty(&ipmi->tx_msg_list)) + dev_warn(&pnp_dev->dev, "tx msg list is not NULL\n"); +} + +static void ipmi_msg_handler(struct ipmi_recv_msg *msg, void *user_msg_data) +{ + struct acpi_ipmi_device *ipmi_device = user_msg_data; + int msg_found = 0; + struct acpi_ipmi_msg *tx_msg; + struct pnp_dev *pnp_dev = ipmi_device->pnp_dev; + + if (msg->user != ipmi_device->user_interface) { + dev_warn(&pnp_dev->dev, "Unexpected response is returned. " + "returned user %p, expected user %p\n", + msg->user, ipmi_device->user_interface); + ipmi_free_recv_msg(msg); + return; + } + mutex_lock(&ipmi_device->tx_msg_lock); + list_for_each_entry(tx_msg, &ipmi_device->tx_msg_list, head) { + if (msg->msgid == tx_msg->tx_msgid) { + msg_found = 1; + break; + } + } + + mutex_unlock(&ipmi_device->tx_msg_lock); + if (!msg_found) { + dev_warn(&pnp_dev->dev, "Unexpected response (msg id %ld) is " + "returned.\n", msg->msgid); + ipmi_free_recv_msg(msg); + return; + } + + if (msg->msg.data_len) { + /* copy the response data to Rx_data buffer */ + memcpy(tx_msg->rx_data, msg->msg_data, msg->msg.data_len); + tx_msg->rx_len = msg->msg.data_len; + tx_msg->msg_done = 1; + } + complete(&tx_msg->tx_complete); + ipmi_free_recv_msg(msg); +}; + +static void ipmi_register_bmc(int iface, struct device *dev) +{ + struct acpi_ipmi_device *ipmi_device, *temp; + struct pnp_dev *pnp_dev; + ipmi_user_t user; + int err; + struct ipmi_smi_info smi_data; + acpi_handle handle; + + err = ipmi_get_smi_info(iface, &smi_data); + + if (err) + return; + + if (smi_data.addr_src != SI_ACPI) { + put_device(smi_data.dev); + return; + } + + handle = smi_data.addr_info.acpi_info.acpi_handle; + + mutex_lock(&driver_data.ipmi_lock); + list_for_each_entry(temp, &driver_data.ipmi_devices, head) { + /* + * if the corresponding ACPI handle is already added + * to the device list, don't add it again. + */ + if (temp->handle == handle) + goto out; + } + + ipmi_device = kzalloc(sizeof(*ipmi_device), GFP_KERNEL); + + if (!ipmi_device) + goto out; + + pnp_dev = to_pnp_dev(smi_data.dev); + ipmi_device->handle = handle; + ipmi_device->pnp_dev = pnp_dev; + + err = ipmi_create_user(iface, &driver_data.ipmi_hndlrs, + ipmi_device, &user); + if (err) { + dev_warn(&pnp_dev->dev, "Can't create IPMI user interface\n"); + kfree(ipmi_device); + goto out; + } + acpi_add_ipmi_device(ipmi_device); + ipmi_device->user_interface = user; + ipmi_device->ipmi_ifnum = iface; + mutex_unlock(&driver_data.ipmi_lock); + memcpy(&ipmi_device->smi_data, &smi_data, sizeof(struct ipmi_smi_info)); + return; + +out: + mutex_unlock(&driver_data.ipmi_lock); + put_device(smi_data.dev); + return; +} + +static void ipmi_bmc_gone(int iface) +{ + struct acpi_ipmi_device *ipmi_device, *temp; + + mutex_lock(&driver_data.ipmi_lock); + list_for_each_entry_safe(ipmi_device, temp, + &driver_data.ipmi_devices, head) { + if (ipmi_device->ipmi_ifnum != iface) + continue; + + acpi_remove_ipmi_device(ipmi_device); + put_device(ipmi_device->smi_data.dev); + kfree(ipmi_device); + break; + } + mutex_unlock(&driver_data.ipmi_lock); +} +/* -------------------------------------------------------------------------- + * Address Space Management + * -------------------------------------------------------------------------- */ +/* + * This is the IPMI opregion space handler. + * @function: indicates the read/write. In fact as the IPMI message is driven + * by command, only write is meaningful. + * @address: This contains the netfn/command of IPMI request message. + * @bits : not used. + * @value : it is an in/out parameter. It points to the IPMI message buffer. + * Before the IPMI message is sent, it represents the actual request + * IPMI message. After the IPMI message is finished, it represents + * the response IPMI message returned by IPMI command. + * @handler_context: IPMI device context. + */ + +static acpi_status +acpi_ipmi_space_handler(u32 function, acpi_physical_address address, + u32 bits, acpi_integer *value, + void *handler_context, void *region_context) +{ + struct acpi_ipmi_msg *tx_msg; + struct acpi_ipmi_device *ipmi_device = handler_context; + int err, rem_time; + acpi_status status; + /* + * IPMI opregion message. + * IPMI message is firstly written to the BMC and system software + * can get the respsonse. So it is unmeaningful for the read access + * of IPMI opregion. + */ + if ((function & ACPI_IO_MASK) == ACPI_READ) + return AE_TYPE; + + if (!ipmi_device->user_interface) + return AE_NOT_EXIST; + + tx_msg = acpi_alloc_ipmi_msg(ipmi_device); + if (!tx_msg) + return AE_NO_MEMORY; + + acpi_format_ipmi_msg(tx_msg, address, value); + mutex_lock(&ipmi_device->tx_msg_lock); + list_add_tail(&tx_msg->head, &ipmi_device->tx_msg_list); + mutex_unlock(&ipmi_device->tx_msg_lock); + err = ipmi_request_settime(ipmi_device->user_interface, + &tx_msg->addr, + tx_msg->tx_msgid, + &tx_msg->tx_message, + NULL, 0, 0, 0); + if (err) { + status = AE_ERROR; + goto end_label; + } + rem_time = wait_for_completion_timeout(&tx_msg->tx_complete, + IPMI_TIMEOUT); + acpi_format_ipmi_response(tx_msg, value, rem_time); + status = AE_OK; + +end_label: + mutex_lock(&ipmi_device->tx_msg_lock); + list_del(&tx_msg->head); + mutex_unlock(&ipmi_device->tx_msg_lock); + kfree(tx_msg); + return status; +} + +static void ipmi_remove_space_handler(struct acpi_ipmi_device *ipmi) +{ + if (!test_bit(IPMI_FLAGS_HANDLER_INSTALL, &ipmi->flags)) + return; + + acpi_remove_address_space_handler(ipmi->handle, + ACPI_ADR_SPACE_IPMI, &acpi_ipmi_space_handler); + + clear_bit(IPMI_FLAGS_HANDLER_INSTALL, &ipmi->flags); +} + +static int ipmi_install_space_handler(struct acpi_ipmi_device *ipmi) +{ + acpi_status status; + + if (test_bit(IPMI_FLAGS_HANDLER_INSTALL, &ipmi->flags)) + return 0; + + status = acpi_install_address_space_handler(ipmi->handle, + ACPI_ADR_SPACE_IPMI, + &acpi_ipmi_space_handler, + NULL, ipmi); + if (ACPI_FAILURE(status)) { + struct pnp_dev *pnp_dev = ipmi->pnp_dev; + dev_warn(&pnp_dev->dev, "Can't register IPMI opregion space " + "handle\n"); + return -EINVAL; + } + set_bit(IPMI_FLAGS_HANDLER_INSTALL, &ipmi->flags); + return 0; +} + +static void acpi_add_ipmi_device(struct acpi_ipmi_device *ipmi_device) +{ + + INIT_LIST_HEAD(&ipmi_device->head); + + mutex_init(&ipmi_device->tx_msg_lock); + INIT_LIST_HEAD(&ipmi_device->tx_msg_list); + ipmi_install_space_handler(ipmi_device); + + list_add_tail(&ipmi_device->head, &driver_data.ipmi_devices); +} + +static void acpi_remove_ipmi_device(struct acpi_ipmi_device *ipmi_device) +{ + /* + * If the IPMI user interface is created, it should be + * destroyed. + */ + if (ipmi_device->user_interface) { + ipmi_destroy_user(ipmi_device->user_interface); + ipmi_device->user_interface = NULL; + } + /* flush the Tx_msg list */ + if (!list_empty(&ipmi_device->tx_msg_list)) + ipmi_flush_tx_msg(ipmi_device); + + list_del(&ipmi_device->head); + ipmi_remove_space_handler(ipmi_device); +} + +static int __init acpi_ipmi_init(void) +{ + int result = 0; + + if (acpi_disabled) + return result; + + mutex_init(&driver_data.ipmi_lock); + + result = ipmi_smi_watcher_register(&driver_data.bmc_events); + + return result; +} + +static void __exit acpi_ipmi_exit(void) +{ + struct acpi_ipmi_device *ipmi_device, *temp; + + if (acpi_disabled) + return; + + ipmi_smi_watcher_unregister(&driver_data.bmc_events); + + /* + * When one smi_watcher is unregistered, it is only deleted + * from the smi_watcher list. But the smi_gone callback function + * is not called. So explicitly uninstall the ACPI IPMI oregion + * handler and free it. + */ + mutex_lock(&driver_data.ipmi_lock); + list_for_each_entry_safe(ipmi_device, temp, + &driver_data.ipmi_devices, head) { + acpi_remove_ipmi_device(ipmi_device); + put_device(ipmi_device->smi_data.dev); + kfree(ipmi_device); + } + mutex_unlock(&driver_data.ipmi_lock); +} + +module_init(acpi_ipmi_init); +module_exit(acpi_ipmi_exit); diff --git a/drivers/acpi/acpica/Makefile b/drivers/acpi/acpica/Makefile index a7e1d1a..eec2ead 100644 --- a/drivers/acpi/acpica/Makefile +++ b/drivers/acpi/acpica/Makefile @@ -14,7 +14,7 @@ acpi-y := dsfield.o dsmthdat.o dsopcode.o dswexec.o dswscope.o \ acpi-y += evevent.o evregion.o evsci.o evxfevnt.o \ evmisc.o evrgnini.o evxface.o evxfregn.o \ - evgpe.o evgpeblk.o evgpeinit.o evgpeutil.o + evgpe.o evgpeblk.o evgpeinit.o evgpeutil.o evxfgpe.o acpi-y += exconfig.o exfield.o exnames.o exoparg6.o exresolv.o exstorob.o\ exconvrt.o exfldio.o exoparg1.o exprep.o exresop.o exsystem.o\ diff --git a/drivers/acpi/acpica/acevents.h b/drivers/acpi/acpica/acevents.h index a6f99cc..70e0b28 100644 --- a/drivers/acpi/acpica/acevents.h +++ b/drivers/acpi/acpica/acevents.h @@ -51,8 +51,6 @@ acpi_status acpi_ev_initialize_events(void); acpi_status acpi_ev_install_xrupt_handlers(void); -acpi_status acpi_ev_install_fadt_gpes(void); - u32 acpi_ev_fixed_event_detect(void); /* @@ -82,9 +80,9 @@ acpi_ev_update_gpe_enable_mask(struct acpi_gpe_event_info *gpe_event_info); acpi_status acpi_ev_enable_gpe(struct acpi_gpe_event_info *gpe_event_info); -acpi_status acpi_raw_enable_gpe(struct acpi_gpe_event_info *gpe_event_info); +acpi_status acpi_ev_add_gpe_reference(struct acpi_gpe_event_info *gpe_event_info); -acpi_status acpi_raw_disable_gpe(struct acpi_gpe_event_info *gpe_event_info); +acpi_status acpi_ev_remove_gpe_reference(struct acpi_gpe_event_info *gpe_event_info); struct acpi_gpe_event_info *acpi_ev_get_gpe_event_info(acpi_handle gpe_device, u32 gpe_number); @@ -93,6 +91,8 @@ struct acpi_gpe_event_info *acpi_ev_low_get_gpe_info(u32 gpe_number, struct acpi_gpe_block_info *gpe_block); +acpi_status acpi_ev_finish_gpe(struct acpi_gpe_event_info *gpe_event_info); + /* * evgpeblk - Upper-level GPE block support */ @@ -107,12 +107,13 @@ acpi_ev_create_gpe_block(struct acpi_namespace_node *gpe_device, acpi_status acpi_ev_initialize_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, struct acpi_gpe_block_info *gpe_block, - void *ignored); + void *context); acpi_status acpi_ev_delete_gpe_block(struct acpi_gpe_block_info *gpe_block); u32 -acpi_ev_gpe_dispatch(struct acpi_gpe_event_info *gpe_event_info, +acpi_ev_gpe_dispatch(struct acpi_namespace_node *gpe_device, + struct acpi_gpe_event_info *gpe_event_info, u32 gpe_number); /* @@ -126,10 +127,6 @@ acpi_status acpi_ev_match_gpe_method(acpi_handle obj_handle, u32 level, void *context, void **return_value); -acpi_status -acpi_ev_match_prw_and_gpe(acpi_handle obj_handle, - u32 level, void *context, void **return_value); - /* * evgpeutil - GPE utilities */ @@ -138,6 +135,10 @@ acpi_ev_walk_gpe_list(acpi_gpe_callback gpe_walk_callback, void *context); u8 acpi_ev_valid_gpe_event(struct acpi_gpe_event_info *gpe_event_info); +acpi_status +acpi_ev_get_gpe_device(struct acpi_gpe_xrupt_info *gpe_xrupt_info, + struct acpi_gpe_block_info *gpe_block, void *context); + struct acpi_gpe_xrupt_info *acpi_ev_get_gpe_xrupt_block(u32 interrupt_number); acpi_status acpi_ev_delete_gpe_xrupt(struct acpi_gpe_xrupt_info *gpe_xrupt); diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h index ad88fca..9bb69c5 100644 --- a/drivers/acpi/acpica/acglobal.h +++ b/drivers/acpi/acpica/acglobal.h @@ -146,6 +146,9 @@ u8 acpi_gbl_system_awake_and_running; extern u32 acpi_gbl_nesting_level; +ACPI_EXTERN u32 acpi_gpe_count; +ACPI_EXTERN u32 acpi_fixed_event_count[ACPI_NUM_FIXED_EVENTS]; + /* Support for dynamic control method tracing mechanism */ ACPI_EXTERN u32 acpi_gbl_original_dbg_level; @@ -370,7 +373,9 @@ ACPI_EXTERN struct acpi_fixed_event_handler ACPI_EXTERN struct acpi_gpe_xrupt_info *acpi_gbl_gpe_xrupt_list_head; ACPI_EXTERN struct acpi_gpe_block_info *acpi_gbl_gpe_fadt_blocks[ACPI_MAX_GPE_BLOCKS]; -ACPI_EXTERN u8 acpi_all_gpes_initialized; +ACPI_EXTERN u8 acpi_gbl_all_gpes_initialized; +ACPI_EXTERN ACPI_GBL_EVENT_HANDLER acpi_gbl_global_event_handler; +ACPI_EXTERN void *acpi_gbl_global_event_handler_context; /***************************************************************************** * diff --git a/drivers/acpi/acpica/achware.h b/drivers/acpi/acpica/achware.h index 167470a..258d628 100644 --- a/drivers/acpi/acpica/achware.h +++ b/drivers/acpi/acpica/achware.h @@ -94,7 +94,7 @@ u32 acpi_hw_get_gpe_register_bit(struct acpi_gpe_event_info *gpe_event_info, struct acpi_gpe_register_info *gpe_register_info); acpi_status -acpi_hw_low_set_gpe(struct acpi_gpe_event_info *gpe_event_info, u8 action); +acpi_hw_low_set_gpe(struct acpi_gpe_event_info *gpe_event_info, u32 action); acpi_status acpi_hw_disable_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, diff --git a/drivers/acpi/acpica/aclocal.h b/drivers/acpi/acpica/aclocal.h index 2ceb0c0..74000f5 100644 --- a/drivers/acpi/acpica/aclocal.h +++ b/drivers/acpi/acpica/aclocal.h @@ -408,17 +408,18 @@ struct acpi_predefined_data { /* Dispatch info for each GPE -- either a method or handler, cannot be both */ -struct acpi_handler_info { - acpi_event_handler address; /* Address of handler, if any */ +struct acpi_gpe_handler_info { + acpi_gpe_handler address; /* Address of handler, if any */ void *context; /* Context to be passed to handler */ struct acpi_namespace_node *method_node; /* Method node for this GPE level (saved) */ - u8 orig_flags; /* Original misc info about this GPE */ - u8 orig_enabled; /* Set if the GPE was originally enabled */ + u8 original_flags; /* Original (pre-handler) GPE info */ + u8 originally_enabled; /* True if GPE was originally enabled */ }; union acpi_gpe_dispatch_info { struct acpi_namespace_node *method_node; /* Method node for this GPE level */ - struct acpi_handler_info *handler; + struct acpi_gpe_handler_info *handler; /* Installed GPE handler */ + struct acpi_namespace_node *device_node; /* Parent _PRW device for implicit notify */ }; /* @@ -458,7 +459,7 @@ struct acpi_gpe_block_info { u32 register_count; /* Number of register pairs in block */ u16 gpe_count; /* Number of individual GPEs in block */ u8 block_base_number; /* Base GPE number for this block */ - u8 initialized; /* If set, the GPE block has been initialized */ + u8 initialized; /* TRUE if this block is initialized */ }; /* Information about GPE interrupt handlers, one per each interrupt level used for GPEs */ diff --git a/drivers/acpi/acpica/evevent.c b/drivers/acpi/acpica/evevent.c index c61c303..e5e313c 100644 --- a/drivers/acpi/acpica/evevent.c +++ b/drivers/acpi/acpica/evevent.c @@ -217,9 +217,17 @@ u32 acpi_ev_fixed_event_detect(void) status_bit_mask) && (fixed_enable & acpi_gbl_fixed_event_info[i]. enable_bit_mask)) { + /* + * Found an active (signalled) event. Invoke global event + * handler if present. + */ + acpi_fixed_event_count[i]++; + if (acpi_gbl_global_event_handler) { + acpi_gbl_global_event_handler + (ACPI_EVENT_TYPE_FIXED, NULL, i, + acpi_gbl_global_event_handler_context); + } - /* Found an active (signalled) event */ - acpi_os_fixed_event_count(i); int_status |= acpi_ev_fixed_event_dispatch(i); } } diff --git a/drivers/acpi/acpica/evgpe.c b/drivers/acpi/acpica/evgpe.c index f226eac..7c339d3 100644 --- a/drivers/acpi/acpica/evgpe.c +++ b/drivers/acpi/acpica/evgpe.c @@ -52,6 +52,8 @@ ACPI_MODULE_NAME("evgpe") /* Local prototypes */ static void ACPI_SYSTEM_XFACE acpi_ev_asynch_execute_gpe_method(void *context); +static void ACPI_SYSTEM_XFACE acpi_ev_asynch_enable_gpe(void *context); + /******************************************************************************* * * FUNCTION: acpi_ev_update_gpe_enable_mask @@ -102,7 +104,7 @@ acpi_ev_update_gpe_enable_mask(struct acpi_gpe_event_info *gpe_event_info) * * RETURN: Status * - * DESCRIPTION: Clear the given GPE from stale events and enable it. + * DESCRIPTION: Clear a GPE of stale events and enable it. * ******************************************************************************/ acpi_status @@ -113,12 +115,13 @@ acpi_ev_enable_gpe(struct acpi_gpe_event_info *gpe_event_info) ACPI_FUNCTION_TRACE(ev_enable_gpe); /* - * We will only allow a GPE to be enabled if it has either an - * associated method (_Lxx/_Exx) or a handler. Otherwise, the - * GPE will be immediately disabled by acpi_ev_gpe_dispatch the - * first time it fires. + * We will only allow a GPE to be enabled if it has either an associated + * method (_Lxx/_Exx) or a handler, or is using the implicit notify + * feature. Otherwise, the GPE will be immediately disabled by + * acpi_ev_gpe_dispatch the first time it fires. */ - if (!(gpe_event_info->flags & ACPI_GPE_DISPATCH_MASK)) { + if ((gpe_event_info->flags & ACPI_GPE_DISPATCH_MASK) == + ACPI_GPE_DISPATCH_NONE) { return_ACPI_STATUS(AE_NO_HANDLER); } @@ -137,9 +140,9 @@ acpi_ev_enable_gpe(struct acpi_gpe_event_info *gpe_event_info) /******************************************************************************* * - * FUNCTION: acpi_raw_enable_gpe + * FUNCTION: acpi_ev_add_gpe_reference * - * PARAMETERS: gpe_event_info - GPE to enable + * PARAMETERS: gpe_event_info - Add a reference to this GPE * * RETURN: Status * @@ -148,16 +151,21 @@ acpi_ev_enable_gpe(struct acpi_gpe_event_info *gpe_event_info) * ******************************************************************************/ -acpi_status acpi_raw_enable_gpe(struct acpi_gpe_event_info *gpe_event_info) +acpi_status acpi_ev_add_gpe_reference(struct acpi_gpe_event_info *gpe_event_info) { acpi_status status = AE_OK; + ACPI_FUNCTION_TRACE(ev_add_gpe_reference); + if (gpe_event_info->runtime_count == ACPI_UINT8_MAX) { return_ACPI_STATUS(AE_LIMIT); } gpe_event_info->runtime_count++; if (gpe_event_info->runtime_count == 1) { + + /* Enable on first reference */ + status = acpi_ev_update_gpe_enable_mask(gpe_event_info); if (ACPI_SUCCESS(status)) { status = acpi_ev_enable_gpe(gpe_event_info); @@ -173,9 +181,9 @@ acpi_status acpi_raw_enable_gpe(struct acpi_gpe_event_info *gpe_event_info) /******************************************************************************* * - * FUNCTION: acpi_raw_disable_gpe + * FUNCTION: acpi_ev_remove_gpe_reference * - * PARAMETERS: gpe_event_info - GPE to disable + * PARAMETERS: gpe_event_info - Remove a reference to this GPE * * RETURN: Status * @@ -184,16 +192,21 @@ acpi_status acpi_raw_enable_gpe(struct acpi_gpe_event_info *gpe_event_info) * ******************************************************************************/ -acpi_status acpi_raw_disable_gpe(struct acpi_gpe_event_info *gpe_event_info) +acpi_status acpi_ev_remove_gpe_reference(struct acpi_gpe_event_info *gpe_event_info) { acpi_status status = AE_OK; + ACPI_FUNCTION_TRACE(ev_remove_gpe_reference); + if (!gpe_event_info->runtime_count) { return_ACPI_STATUS(AE_LIMIT); } gpe_event_info->runtime_count--; if (!gpe_event_info->runtime_count) { + + /* Disable on last reference */ + status = acpi_ev_update_gpe_enable_mask(gpe_event_info); if (ACPI_SUCCESS(status)) { status = acpi_hw_low_set_gpe(gpe_event_info, @@ -379,7 +392,7 @@ u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info * gpe_xrupt_list) } ACPI_DEBUG_PRINT((ACPI_DB_INTERRUPTS, - "Read GPE Register at GPE%X: Status=%02X, Enable=%02X\n", + "Read GPE Register at GPE%02X: Status=%02X, Enable=%02X\n", gpe_register_info->base_gpe_number, status_reg, enable_reg)); @@ -405,7 +418,9 @@ u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info * gpe_xrupt_list) * or method. */ int_status |= - acpi_ev_gpe_dispatch(&gpe_block-> + acpi_ev_gpe_dispatch(gpe_block-> + node, + &gpe_block-> event_info[((acpi_size) i * ACPI_GPE_REGISTER_WIDTH) + j], j + gpe_register_info->base_gpe_number); } } @@ -435,17 +450,25 @@ u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info * gpe_xrupt_list) * an interrupt handler. * ******************************************************************************/ -static void acpi_ev_asynch_enable_gpe(void *context); static void ACPI_SYSTEM_XFACE acpi_ev_asynch_execute_gpe_method(void *context) { - struct acpi_gpe_event_info *gpe_event_info = (void *)context; + struct acpi_gpe_event_info *gpe_event_info = context; acpi_status status; - struct acpi_gpe_event_info local_gpe_event_info; + struct acpi_gpe_event_info *local_gpe_event_info; struct acpi_evaluate_info *info; ACPI_FUNCTION_TRACE(ev_asynch_execute_gpe_method); + /* Allocate a local GPE block */ + + local_gpe_event_info = + ACPI_ALLOCATE_ZEROED(sizeof(struct acpi_gpe_event_info)); + if (!local_gpe_event_info) { + ACPI_EXCEPTION((AE_INFO, AE_NO_MEMORY, "while handling a GPE")); + return_VOID; + } + status = acpi_ut_acquire_mutex(ACPI_MTX_EVENTS); if (ACPI_FAILURE(status)) { return_VOID; @@ -462,7 +485,7 @@ static void ACPI_SYSTEM_XFACE acpi_ev_asynch_execute_gpe_method(void *context) * Take a snapshot of the GPE info for this level - we copy the info to * prevent a race condition with remove_handler/remove_block. */ - ACPI_MEMCPY(&local_gpe_event_info, gpe_event_info, + ACPI_MEMCPY(local_gpe_event_info, gpe_event_info, sizeof(struct acpi_gpe_event_info)); status = acpi_ut_release_mutex(ACPI_MTX_EVENTS); @@ -470,12 +493,26 @@ static void ACPI_SYSTEM_XFACE acpi_ev_asynch_execute_gpe_method(void *context) return_VOID; } - /* - * Must check for control method type dispatch one more time to avoid a - * race with ev_gpe_install_handler - */ - if ((local_gpe_event_info.flags & ACPI_GPE_DISPATCH_MASK) == - ACPI_GPE_DISPATCH_METHOD) { + /* Do the correct dispatch - normal method or implicit notify */ + + switch (local_gpe_event_info->flags & ACPI_GPE_DISPATCH_MASK) { + case ACPI_GPE_DISPATCH_NOTIFY: + + /* + * Implicit notify. + * Dispatch a DEVICE_WAKE notify to the appropriate handler. + * NOTE: the request is queued for execution after this method + * completes. The notify handlers are NOT invoked synchronously + * from this thread -- because handlers may in turn run other + * control methods. + */ + status = + acpi_ev_queue_notify_request(local_gpe_event_info->dispatch. + device_node, + ACPI_NOTIFY_DEVICE_WAKE); + break; + + case ACPI_GPE_DISPATCH_METHOD: /* Allocate the evaluation information block */ @@ -488,7 +525,7 @@ static void ACPI_SYSTEM_XFACE acpi_ev_asynch_execute_gpe_method(void *context) * control method that corresponds to this GPE */ info->prefix_node = - local_gpe_event_info.dispatch.method_node; + local_gpe_event_info->dispatch.method_node; info->flags = ACPI_IGNORE_RETURN_VALUE; status = acpi_ns_evaluate(info); @@ -499,46 +536,98 @@ static void ACPI_SYSTEM_XFACE acpi_ev_asynch_execute_gpe_method(void *context) ACPI_EXCEPTION((AE_INFO, status, "while evaluating GPE method [%4.4s]", acpi_ut_get_node_name - (local_gpe_event_info.dispatch. + (local_gpe_event_info->dispatch. method_node))); } + + break; + + default: + return_VOID; /* Should never happen */ } + /* Defer enabling of GPE until all notify handlers are done */ - acpi_os_execute(OSL_NOTIFY_HANDLER, acpi_ev_asynch_enable_gpe, - gpe_event_info); + + status = acpi_os_execute(OSL_NOTIFY_HANDLER, + acpi_ev_asynch_enable_gpe, + local_gpe_event_info); + if (ACPI_FAILURE(status)) { + ACPI_FREE(local_gpe_event_info); + } return_VOID; } -static void acpi_ev_asynch_enable_gpe(void *context) + +/******************************************************************************* + * + * FUNCTION: acpi_ev_asynch_enable_gpe + * + * PARAMETERS: Context (gpe_event_info) - Info for this GPE + * Callback from acpi_os_execute + * + * RETURN: None + * + * DESCRIPTION: Asynchronous clear/enable for GPE. This allows the GPE to + * complete (i.e., finish execution of Notify) + * + ******************************************************************************/ + +static void ACPI_SYSTEM_XFACE acpi_ev_asynch_enable_gpe(void *context) { struct acpi_gpe_event_info *gpe_event_info = context; + + (void)acpi_ev_finish_gpe(gpe_event_info); + + ACPI_FREE(gpe_event_info); + return; +} + + +/******************************************************************************* + * + * FUNCTION: acpi_ev_finish_gpe + * + * PARAMETERS: gpe_event_info - Info for this GPE + * + * RETURN: Status + * + * DESCRIPTION: Clear/Enable a GPE. Common code that is used after execution + * of a GPE method or a synchronous or asynchronous GPE handler. + * + ******************************************************************************/ + +acpi_status acpi_ev_finish_gpe(struct acpi_gpe_event_info *gpe_event_info) +{ acpi_status status; + if ((gpe_event_info->flags & ACPI_GPE_XRUPT_TYPE_MASK) == ACPI_GPE_LEVEL_TRIGGERED) { /* - * GPE is level-triggered, we clear the GPE status bit after handling - * the event. + * GPE is level-triggered, we clear the GPE status bit after + * handling the event. */ status = acpi_hw_clear_gpe(gpe_event_info); if (ACPI_FAILURE(status)) { - return_VOID; + return (status); } } /* - * Enable this GPE, conditionally. This means that the GPE will only be - * physically enabled if the enable_for_run bit is set in the event_info + * Enable this GPE, conditionally. This means that the GPE will + * only be physically enabled if the enable_for_run bit is set + * in the event_info. */ - (void)acpi_hw_low_set_gpe(gpe_event_info, ACPI_GPE_COND_ENABLE); - - return_VOID; + (void)acpi_hw_low_set_gpe(gpe_event_info, ACPI_GPE_CONDITIONAL_ENABLE); + return (AE_OK); } + /******************************************************************************* * * FUNCTION: acpi_ev_gpe_dispatch * - * PARAMETERS: gpe_event_info - Info for this GPE + * PARAMETERS: gpe_device - Device node. NULL for GPE0/GPE1 + * gpe_event_info - Info for this GPE * gpe_number - Number relative to the parent GPE block * * RETURN: INTERRUPT_HANDLED or INTERRUPT_NOT_HANDLED @@ -551,13 +640,22 @@ static void acpi_ev_asynch_enable_gpe(void *context) ******************************************************************************/ u32 -acpi_ev_gpe_dispatch(struct acpi_gpe_event_info *gpe_event_info, u32 gpe_number) +acpi_ev_gpe_dispatch(struct acpi_namespace_node *gpe_device, + struct acpi_gpe_event_info *gpe_event_info, u32 gpe_number) { acpi_status status; + u32 return_value; ACPI_FUNCTION_TRACE(ev_gpe_dispatch); - acpi_os_gpe_count(gpe_number); + /* Invoke global event handler if present */ + + acpi_gpe_count++; + if (acpi_gbl_global_event_handler) { + acpi_gbl_global_event_handler(ACPI_EVENT_TYPE_GPE, gpe_device, + gpe_number, + acpi_gbl_global_event_handler_context); + } /* * If edge-triggered, clear the GPE status bit now. Note that @@ -568,59 +666,55 @@ acpi_ev_gpe_dispatch(struct acpi_gpe_event_info *gpe_event_info, u32 gpe_number) status = acpi_hw_clear_gpe(gpe_event_info); if (ACPI_FAILURE(status)) { ACPI_EXCEPTION((AE_INFO, status, - "Unable to clear GPE[0x%2X]", - gpe_number)); + "Unable to clear GPE%02X", gpe_number)); return_UINT32(ACPI_INTERRUPT_NOT_HANDLED); } } /* - * Dispatch the GPE to either an installed handler, or the control method - * associated with this GPE (_Lxx or _Exx). If a handler exists, we invoke - * it and do not attempt to run the method. If there is neither a handler - * nor a method, we disable this GPE to prevent further such pointless - * events from firing. + * Always disable the GPE so that it does not keep firing before + * any asynchronous activity completes (either from the execution + * of a GPE method or an asynchronous GPE handler.) + * + * If there is no handler or method to run, just disable the + * GPE and leave it disabled permanently to prevent further such + * pointless events from firing. + */ + status = acpi_hw_low_set_gpe(gpe_event_info, ACPI_GPE_DISABLE); + if (ACPI_FAILURE(status)) { + ACPI_EXCEPTION((AE_INFO, status, + "Unable to disable GPE%02X", gpe_number)); + return_UINT32(ACPI_INTERRUPT_NOT_HANDLED); + } + + /* + * Dispatch the GPE to either an installed handler or the control + * method associated with this GPE (_Lxx or _Exx). If a handler + * exists, we invoke it and do not attempt to run the method. + * If there is neither a handler nor a method, leave the GPE + * disabled. */ switch (gpe_event_info->flags & ACPI_GPE_DISPATCH_MASK) { case ACPI_GPE_DISPATCH_HANDLER: - /* - * Invoke the installed handler (at interrupt level) - * Ignore return status for now. - * TBD: leave GPE disabled on error? - */ - (void)gpe_event_info->dispatch.handler->address(gpe_event_info-> - dispatch. - handler-> - context); + /* Invoke the installed handler (at interrupt level) */ - /* It is now safe to clear level-triggered events. */ + return_value = + gpe_event_info->dispatch.handler->address(gpe_device, + gpe_number, + gpe_event_info-> + dispatch.handler-> + context); - if ((gpe_event_info->flags & ACPI_GPE_XRUPT_TYPE_MASK) == - ACPI_GPE_LEVEL_TRIGGERED) { - status = acpi_hw_clear_gpe(gpe_event_info); - if (ACPI_FAILURE(status)) { - ACPI_EXCEPTION((AE_INFO, status, - "Unable to clear GPE[0x%2X]", - gpe_number)); - return_UINT32(ACPI_INTERRUPT_NOT_HANDLED); - } + /* If requested, clear (if level-triggered) and reenable the GPE */ + + if (return_value & ACPI_REENABLE_GPE) { + (void)acpi_ev_finish_gpe(gpe_event_info); } break; case ACPI_GPE_DISPATCH_METHOD: - - /* - * Disable the GPE, so it doesn't keep firing before the method has a - * chance to run (it runs asynchronously with interrupts enabled). - */ - status = acpi_hw_low_set_gpe(gpe_event_info, ACPI_GPE_DISABLE); - if (ACPI_FAILURE(status)) { - ACPI_EXCEPTION((AE_INFO, status, - "Unable to disable GPE[0x%2X]", - gpe_number)); - return_UINT32(ACPI_INTERRUPT_NOT_HANDLED); - } + case ACPI_GPE_DISPATCH_NOTIFY: /* * Execute the method associated with the GPE @@ -631,7 +725,7 @@ acpi_ev_gpe_dispatch(struct acpi_gpe_event_info *gpe_event_info, u32 gpe_number) gpe_event_info); if (ACPI_FAILURE(status)) { ACPI_EXCEPTION((AE_INFO, status, - "Unable to queue handler for GPE[0x%2X] - event disabled", + "Unable to queue handler for GPE%2X - event disabled", gpe_number)); } break; @@ -644,20 +738,9 @@ acpi_ev_gpe_dispatch(struct acpi_gpe_event_info *gpe_event_info, u32 gpe_number) * a GPE to be enabled if it has no handler or method. */ ACPI_ERROR((AE_INFO, - "No handler or method for GPE[0x%2X], disabling event", + "No handler or method for GPE%02X, disabling event", gpe_number)); - /* - * Disable the GPE. The GPE will remain disabled a handler - * is installed or ACPICA is restarted. - */ - status = acpi_hw_low_set_gpe(gpe_event_info, ACPI_GPE_DISABLE); - if (ACPI_FAILURE(status)) { - ACPI_EXCEPTION((AE_INFO, status, - "Unable to disable GPE[0x%2X]", - gpe_number)); - return_UINT32(ACPI_INTERRUPT_NOT_HANDLED); - } break; } diff --git a/drivers/acpi/acpica/evgpeblk.c b/drivers/acpi/acpica/evgpeblk.c index 020add3..9acb869 100644 --- a/drivers/acpi/acpica/evgpeblk.c +++ b/drivers/acpi/acpica/evgpeblk.c @@ -361,9 +361,9 @@ acpi_ev_create_gpe_block(struct acpi_namespace_node *gpe_device, gpe_block->node = gpe_device; gpe_block->gpe_count = (u16)(register_count * ACPI_GPE_REGISTER_WIDTH); + gpe_block->initialized = FALSE; gpe_block->register_count = register_count; gpe_block->block_base_number = gpe_block_base_number; - gpe_block->initialized = FALSE; ACPI_MEMCPY(&gpe_block->block_address, gpe_block_address, sizeof(struct acpi_generic_address)); @@ -386,7 +386,7 @@ acpi_ev_create_gpe_block(struct acpi_namespace_node *gpe_device, return_ACPI_STATUS(status); } - acpi_all_gpes_initialized = FALSE; + acpi_gbl_all_gpes_initialized = FALSE; /* Find all GPE methods (_Lxx or_Exx) for this block */ @@ -423,14 +423,12 @@ acpi_ev_create_gpe_block(struct acpi_namespace_node *gpe_device, * * FUNCTION: acpi_ev_initialize_gpe_block * - * PARAMETERS: gpe_device - Handle to the parent GPE block - * gpe_block - Gpe Block info + * PARAMETERS: acpi_gpe_callback * * RETURN: Status * - * DESCRIPTION: Initialize and enable a GPE block. First find and run any - * _PRT methods associated with the block, then enable the - * appropriate GPEs. + * DESCRIPTION: Initialize and enable a GPE block. Enable GPEs that have + * associated methods. * Note: Assumes namespace is locked. * ******************************************************************************/ @@ -450,8 +448,8 @@ acpi_ev_initialize_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, ACPI_FUNCTION_TRACE(ev_initialize_gpe_block); /* - * Ignore a null GPE block (e.g., if no GPE block 1 exists) and - * GPE blocks that have been initialized already. + * Ignore a null GPE block (e.g., if no GPE block 1 exists), and + * any GPE blocks that have been initialized already. */ if (!gpe_block || gpe_block->initialized) { return_ACPI_STATUS(AE_OK); @@ -459,8 +457,8 @@ acpi_ev_initialize_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, /* * Enable all GPEs that have a corresponding method and have the - * ACPI_GPE_CAN_WAKE flag unset. Any other GPEs within this block must - * be enabled via the acpi_enable_gpe() interface. + * ACPI_GPE_CAN_WAKE flag unset. Any other GPEs within this block + * must be enabled via the acpi_enable_gpe() interface. */ gpe_enabled_count = 0; @@ -472,14 +470,19 @@ acpi_ev_initialize_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, gpe_index = (i * ACPI_GPE_REGISTER_WIDTH) + j; gpe_event_info = &gpe_block->event_info[gpe_index]; - /* Ignore GPEs that have no corresponding _Lxx/_Exx method */ - - if (!(gpe_event_info->flags & ACPI_GPE_DISPATCH_METHOD) + /* + * Ignore GPEs that have no corresponding _Lxx/_Exx method + * and GPEs that are used to wake the system + */ + if (((gpe_event_info->flags & ACPI_GPE_DISPATCH_MASK) == + ACPI_GPE_DISPATCH_NONE) + || ((gpe_event_info->flags & ACPI_GPE_DISPATCH_MASK) + == ACPI_GPE_DISPATCH_HANDLER) || (gpe_event_info->flags & ACPI_GPE_CAN_WAKE)) { continue; } - status = acpi_raw_enable_gpe(gpe_event_info); + status = acpi_ev_add_gpe_reference(gpe_event_info); if (ACPI_FAILURE(status)) { ACPI_EXCEPTION((AE_INFO, status, "Could not enable GPE 0x%02X", diff --git a/drivers/acpi/acpica/evgpeinit.c b/drivers/acpi/acpica/evgpeinit.c index 4c8dea5..c59dc23 100644 --- a/drivers/acpi/acpica/evgpeinit.c +++ b/drivers/acpi/acpica/evgpeinit.c @@ -45,11 +45,27 @@ #include "accommon.h" #include "acevents.h" #include "acnamesp.h" -#include "acinterp.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evgpeinit") +/* + * Note: History of _PRW support in ACPICA + * + * Originally (2000 - 2010), the GPE initialization code performed a walk of + * the entire namespace to execute the _PRW methods and detect all GPEs + * capable of waking the system. + * + * As of 10/2010, the _PRW method execution has been removed since it is + * actually unnecessary. The host OS must in fact execute all _PRW methods + * in order to identify the device/power-resource dependencies. We now put + * the onus on the host OS to identify the wake GPEs as part of this process + * and to inform ACPICA of these GPEs via the acpi_setup_gpe_for_wake interface. This + * not only reduces the complexity of the ACPICA initialization code, but in + * some cases (on systems with very large namespaces) it should reduce the + * kernel boot time as well. + */ + /******************************************************************************* * * FUNCTION: acpi_ev_gpe_initialize @@ -222,7 +238,7 @@ void acpi_ev_update_gpes(acpi_owner_id table_owner_id) acpi_status status = AE_OK; /* - * 2) Find any _Lxx/_Exx GPE methods that have just been loaded. + * Find any _Lxx/_Exx GPE methods that have just been loaded. * * Any GPEs that correspond to new _Lxx/_Exx methods are immediately * enabled. @@ -235,9 +251,9 @@ void acpi_ev_update_gpes(acpi_owner_id table_owner_id) return; } + walk_info.count = 0; walk_info.owner_id = table_owner_id; walk_info.execute_by_owner_id = TRUE; - walk_info.count = 0; /* Walk the interrupt level descriptor list */ @@ -298,7 +314,7 @@ void acpi_ev_update_gpes(acpi_owner_id table_owner_id) * xx - is the GPE number [in HEX] * * If walk_info->execute_by_owner_id is TRUE, we only execute examine GPE methods - * with that owner. + * with that owner. * ******************************************************************************/ @@ -415,6 +431,7 @@ acpi_ev_match_gpe_method(acpi_handle obj_handle, * Add the GPE information from above to the gpe_event_info block for * use during dispatch of this GPE. */ + gpe_event_info->flags &= ~(ACPI_GPE_DISPATCH_MASK); gpe_event_info->flags |= (u8)(type | ACPI_GPE_DISPATCH_METHOD); gpe_event_info->dispatch.method_node = method_node; diff --git a/drivers/acpi/acpica/evgpeutil.c b/drivers/acpi/acpica/evgpeutil.c index 19a0e51..10e4774 100644 --- a/drivers/acpi/acpica/evgpeutil.c +++ b/drivers/acpi/acpica/evgpeutil.c @@ -154,6 +154,45 @@ u8 acpi_ev_valid_gpe_event(struct acpi_gpe_event_info *gpe_event_info) /******************************************************************************* * + * FUNCTION: acpi_ev_get_gpe_device + * + * PARAMETERS: GPE_WALK_CALLBACK + * + * RETURN: Status + * + * DESCRIPTION: Matches the input GPE index (0-current_gpe_count) with a GPE + * block device. NULL if the GPE is one of the FADT-defined GPEs. + * + ******************************************************************************/ + +acpi_status +acpi_ev_get_gpe_device(struct acpi_gpe_xrupt_info *gpe_xrupt_info, + struct acpi_gpe_block_info *gpe_block, void *context) +{ + struct acpi_gpe_device_info *info = context; + + /* Increment Index by the number of GPEs in this block */ + + info->next_block_base_index += gpe_block->gpe_count; + + if (info->index < info->next_block_base_index) { + /* + * The GPE index is within this block, get the node. Leave the node + * NULL for the FADT-defined GPEs + */ + if ((gpe_block->node)->type == ACPI_TYPE_DEVICE) { + info->gpe_device = gpe_block->node; + } + + info->status = AE_OK; + return (AE_CTRL_END); + } + + return (AE_OK); +} + +/******************************************************************************* + * * FUNCTION: acpi_ev_get_gpe_xrupt_block * * PARAMETERS: interrupt_number - Interrupt for a GPE block diff --git a/drivers/acpi/acpica/evmisc.c b/drivers/acpi/acpica/evmisc.c index fcaed9f..8e31bb5 100644 --- a/drivers/acpi/acpica/evmisc.c +++ b/drivers/acpi/acpica/evmisc.c @@ -284,41 +284,41 @@ static void ACPI_SYSTEM_XFACE acpi_ev_notify_dispatch(void *context) * RETURN: ACPI_INTERRUPT_HANDLED * * DESCRIPTION: Invoked directly from the SCI handler when a global lock - * release interrupt occurs. Attempt to acquire the global lock, - * if successful, signal the thread waiting for the lock. + * release interrupt occurs. If there's a thread waiting for + * the global lock, signal it. * * NOTE: Assumes that the semaphore can be signaled from interrupt level. If * this is not possible for some reason, a separate thread will have to be * scheduled to do this. * ******************************************************************************/ +static u8 acpi_ev_global_lock_pending; +static spinlock_t _acpi_ev_global_lock_pending_lock; +#define acpi_ev_global_lock_pending_lock &_acpi_ev_global_lock_pending_lock static u32 acpi_ev_global_lock_handler(void *context) { - u8 acquired = FALSE; + acpi_status status; + acpi_cpu_flags flags; - /* - * Attempt to get the lock. - * - * If we don't get it now, it will be marked pending and we will - * take another interrupt when it becomes free. - */ - ACPI_ACQUIRE_GLOBAL_LOCK(acpi_gbl_FACS, acquired); - if (acquired) { + flags = acpi_os_acquire_lock(acpi_ev_global_lock_pending_lock); - /* Got the lock, now wake all threads waiting for it */ + if (!acpi_ev_global_lock_pending) { + goto out; + } - acpi_gbl_global_lock_acquired = TRUE; - /* Send a unit to the semaphore */ + /* Send a unit to the semaphore */ - if (ACPI_FAILURE - (acpi_os_signal_semaphore - (acpi_gbl_global_lock_semaphore, 1))) { - ACPI_ERROR((AE_INFO, - "Could not signal Global Lock semaphore")); - } + status = acpi_os_signal_semaphore(acpi_gbl_global_lock_semaphore, 1); + if (ACPI_FAILURE(status)) { + ACPI_ERROR((AE_INFO, "Could not signal Global Lock semaphore")); } + acpi_ev_global_lock_pending = FALSE; + + out: + acpi_os_release_lock(acpi_ev_global_lock_pending_lock, flags); + return (ACPI_INTERRUPT_HANDLED); } @@ -415,6 +415,7 @@ static int acpi_ev_global_lock_acquired; acpi_status acpi_ev_acquire_global_lock(u16 timeout) { + acpi_cpu_flags flags; acpi_status status = AE_OK; u8 acquired = FALSE; @@ -467,32 +468,47 @@ acpi_status acpi_ev_acquire_global_lock(u16 timeout) return_ACPI_STATUS(AE_OK); } - /* Attempt to acquire the actual hardware lock */ + flags = acpi_os_acquire_lock(acpi_ev_global_lock_pending_lock); + + do { + + /* Attempt to acquire the actual hardware lock */ + + ACPI_ACQUIRE_GLOBAL_LOCK(acpi_gbl_FACS, acquired); + if (acquired) { + acpi_gbl_global_lock_acquired = TRUE; + + ACPI_DEBUG_PRINT((ACPI_DB_EXEC, + "Acquired hardware Global Lock\n")); + break; + } - ACPI_ACQUIRE_GLOBAL_LOCK(acpi_gbl_FACS, acquired); - if (acquired) { + acpi_ev_global_lock_pending = TRUE; - /* We got the lock */ + acpi_os_release_lock(acpi_ev_global_lock_pending_lock, flags); + /* + * Did not get the lock. The pending bit was set above, and we + * must wait until we get the global lock released interrupt. + */ ACPI_DEBUG_PRINT((ACPI_DB_EXEC, - "Acquired hardware Global Lock\n")); + "Waiting for hardware Global Lock\n")); - acpi_gbl_global_lock_acquired = TRUE; - return_ACPI_STATUS(AE_OK); - } + /* + * Wait for handshake with the global lock interrupt handler. + * This interface releases the interpreter if we must wait. + */ + status = acpi_ex_system_wait_semaphore( + acpi_gbl_global_lock_semaphore, + ACPI_WAIT_FOREVER); - /* - * Did not get the lock. The pending bit was set above, and we must now - * wait until we get the global lock released interrupt. - */ - ACPI_DEBUG_PRINT((ACPI_DB_EXEC, "Waiting for hardware Global Lock\n")); + flags = acpi_os_acquire_lock(acpi_ev_global_lock_pending_lock); - /* - * Wait for handshake with the global lock interrupt handler. - * This interface releases the interpreter if we must wait. - */ - status = acpi_ex_system_wait_semaphore(acpi_gbl_global_lock_semaphore, - ACPI_WAIT_FOREVER); + } while (ACPI_SUCCESS(status)); + + acpi_ev_global_lock_pending = FALSE; + + acpi_os_release_lock(acpi_ev_global_lock_pending_lock, flags); return_ACPI_STATUS(status); } diff --git a/drivers/acpi/acpica/evxface.c b/drivers/acpi/acpica/evxface.c index 36af222..1226689 100644 --- a/drivers/acpi/acpica/evxface.c +++ b/drivers/acpi/acpica/evxface.c @@ -92,6 +92,57 @@ acpi_status acpi_install_exception_handler(acpi_exception_handler handler) ACPI_EXPORT_SYMBOL(acpi_install_exception_handler) #endif /* ACPI_FUTURE_USAGE */ + +/******************************************************************************* + * + * FUNCTION: acpi_install_global_event_handler + * + * PARAMETERS: Handler - Pointer to the global event handler function + * Context - Value passed to the handler on each event + * + * RETURN: Status + * + * DESCRIPTION: Saves the pointer to the handler function. The global handler + * is invoked upon each incoming GPE and Fixed Event. It is + * invoked at interrupt level at the time of the event dispatch. + * Can be used to update event counters, etc. + * + ******************************************************************************/ +acpi_status +acpi_install_global_event_handler(ACPI_GBL_EVENT_HANDLER handler, void *context) +{ + acpi_status status; + + ACPI_FUNCTION_TRACE(acpi_install_global_event_handler); + + /* Parameter validation */ + + if (!handler) { + return_ACPI_STATUS(AE_BAD_PARAMETER); + } + + status = acpi_ut_acquire_mutex(ACPI_MTX_EVENTS); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + /* Don't allow two handlers. */ + + if (acpi_gbl_global_event_handler) { + status = AE_ALREADY_EXISTS; + goto cleanup; + } + + acpi_gbl_global_event_handler = handler; + acpi_gbl_global_event_handler_context = context; + + cleanup: + (void)acpi_ut_release_mutex(ACPI_MTX_EVENTS); + return_ACPI_STATUS(status); +} + +ACPI_EXPORT_SYMBOL(acpi_install_global_event_handler) + /******************************************************************************* * * FUNCTION: acpi_install_fixed_event_handler @@ -671,10 +722,10 @@ ACPI_EXPORT_SYMBOL(acpi_remove_notify_handler) acpi_status acpi_install_gpe_handler(acpi_handle gpe_device, u32 gpe_number, - u32 type, acpi_event_handler address, void *context) + u32 type, acpi_gpe_handler address, void *context) { struct acpi_gpe_event_info *gpe_event_info; - struct acpi_handler_info *handler; + struct acpi_gpe_handler_info *handler; acpi_status status; acpi_cpu_flags flags; @@ -693,7 +744,7 @@ acpi_install_gpe_handler(acpi_handle gpe_device, /* Allocate memory for the handler object */ - handler = ACPI_ALLOCATE_ZEROED(sizeof(struct acpi_handler_info)); + handler = ACPI_ALLOCATE_ZEROED(sizeof(struct acpi_gpe_handler_info)); if (!handler) { status = AE_NO_MEMORY; goto unlock_and_exit; @@ -722,7 +773,7 @@ acpi_install_gpe_handler(acpi_handle gpe_device, handler->address = address; handler->context = context; handler->method_node = gpe_event_info->dispatch.method_node; - handler->orig_flags = gpe_event_info->flags & + handler->original_flags = gpe_event_info->flags & (ACPI_GPE_XRUPT_TYPE_MASK | ACPI_GPE_DISPATCH_MASK); /* @@ -731,10 +782,10 @@ acpi_install_gpe_handler(acpi_handle gpe_device, * disabled now to avoid spurious execution of the handler. */ - if ((handler->orig_flags & ACPI_GPE_DISPATCH_METHOD) + if ((handler->original_flags & ACPI_GPE_DISPATCH_METHOD) && gpe_event_info->runtime_count) { - handler->orig_enabled = 1; - (void)acpi_raw_disable_gpe(gpe_event_info); + handler->originally_enabled = 1; + (void)acpi_ev_remove_gpe_reference(gpe_event_info); } /* Install the handler */ @@ -777,10 +828,10 @@ ACPI_EXPORT_SYMBOL(acpi_install_gpe_handler) ******************************************************************************/ acpi_status acpi_remove_gpe_handler(acpi_handle gpe_device, - u32 gpe_number, acpi_event_handler address) + u32 gpe_number, acpi_gpe_handler address) { struct acpi_gpe_event_info *gpe_event_info; - struct acpi_handler_info *handler; + struct acpi_gpe_handler_info *handler; acpi_status status; acpi_cpu_flags flags; @@ -835,7 +886,7 @@ acpi_remove_gpe_handler(acpi_handle gpe_device, gpe_event_info->dispatch.method_node = handler->method_node; gpe_event_info->flags &= ~(ACPI_GPE_XRUPT_TYPE_MASK | ACPI_GPE_DISPATCH_MASK); - gpe_event_info->flags |= handler->orig_flags; + gpe_event_info->flags |= handler->original_flags; /* * If the GPE was previously associated with a method and it was @@ -843,9 +894,9 @@ acpi_remove_gpe_handler(acpi_handle gpe_device, * post-initialization configuration. */ - if ((handler->orig_flags & ACPI_GPE_DISPATCH_METHOD) - && handler->orig_enabled) - (void)acpi_raw_enable_gpe(gpe_event_info); + if ((handler->original_flags & ACPI_GPE_DISPATCH_METHOD) + && handler->originally_enabled) + (void)acpi_ev_add_gpe_reference(gpe_event_info); /* Now we can free the handler object */ diff --git a/drivers/acpi/acpica/evxfevnt.c b/drivers/acpi/acpica/evxfevnt.c index a1dabe3..90488c1 100644 --- a/drivers/acpi/acpica/evxfevnt.c +++ b/drivers/acpi/acpica/evxfevnt.c @@ -43,18 +43,11 @@ #include #include "accommon.h" -#include "acevents.h" -#include "acnamesp.h" #include "actables.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evxfevnt") -/* Local prototypes */ -static acpi_status -acpi_ev_get_gpe_device(struct acpi_gpe_xrupt_info *gpe_xrupt_info, - struct acpi_gpe_block_info *gpe_block, void *context); - /******************************************************************************* * * FUNCTION: acpi_enable @@ -213,185 +206,6 @@ ACPI_EXPORT_SYMBOL(acpi_enable_event) /******************************************************************************* * - * FUNCTION: acpi_gpe_wakeup - * - * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 - * gpe_number - GPE level within the GPE block - * Action - Enable or Disable - * - * RETURN: Status - * - * DESCRIPTION: Set or clear the GPE's wakeup enable mask bit. - * - ******************************************************************************/ -acpi_status acpi_gpe_wakeup(acpi_handle gpe_device, u32 gpe_number, u8 action) -{ - acpi_status status = AE_OK; - struct acpi_gpe_event_info *gpe_event_info; - struct acpi_gpe_register_info *gpe_register_info; - acpi_cpu_flags flags; - u32 register_bit; - - ACPI_FUNCTION_TRACE(acpi_gpe_wakeup); - - flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); - - /* Ensure that we have a valid GPE number */ - - gpe_event_info = acpi_ev_get_gpe_event_info(gpe_device, gpe_number); - if (!gpe_event_info || !(gpe_event_info->flags & ACPI_GPE_CAN_WAKE)) { - status = AE_BAD_PARAMETER; - goto unlock_and_exit; - } - - gpe_register_info = gpe_event_info->register_info; - if (!gpe_register_info) { - status = AE_NOT_EXIST; - goto unlock_and_exit; - } - - register_bit = - acpi_hw_get_gpe_register_bit(gpe_event_info, gpe_register_info); - - /* Perform the action */ - - switch (action) { - case ACPI_GPE_ENABLE: - ACPI_SET_BIT(gpe_register_info->enable_for_wake, - (u8)register_bit); - break; - - case ACPI_GPE_DISABLE: - ACPI_CLEAR_BIT(gpe_register_info->enable_for_wake, - (u8)register_bit); - break; - - default: - ACPI_ERROR((AE_INFO, "%u, Invalid action", action)); - status = AE_BAD_PARAMETER; - break; - } - -unlock_and_exit: - acpi_os_release_lock(acpi_gbl_gpe_lock, flags); - return_ACPI_STATUS(status); -} - -ACPI_EXPORT_SYMBOL(acpi_gpe_wakeup) - -/******************************************************************************* - * - * FUNCTION: acpi_enable_gpe - * - * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 - * gpe_number - GPE level within the GPE block - * - * RETURN: Status - * - * DESCRIPTION: Add a reference to a GPE. On the first reference, the GPE is - * hardware-enabled. - * - ******************************************************************************/ -acpi_status acpi_enable_gpe(acpi_handle gpe_device, u32 gpe_number) -{ - acpi_status status = AE_BAD_PARAMETER; - struct acpi_gpe_event_info *gpe_event_info; - acpi_cpu_flags flags; - - ACPI_FUNCTION_TRACE(acpi_enable_gpe); - - flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); - - /* Ensure that we have a valid GPE number */ - - gpe_event_info = acpi_ev_get_gpe_event_info(gpe_device, gpe_number); - if (gpe_event_info) { - status = acpi_raw_enable_gpe(gpe_event_info); - } - - acpi_os_release_lock(acpi_gbl_gpe_lock, flags); - return_ACPI_STATUS(status); -} -ACPI_EXPORT_SYMBOL(acpi_enable_gpe) - -/******************************************************************************* - * - * FUNCTION: acpi_disable_gpe - * - * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 - * gpe_number - GPE level within the GPE block - * - * RETURN: Status - * - * DESCRIPTION: Remove a reference to a GPE. When the last reference is - * removed, only then is the GPE disabled (for runtime GPEs), or - * the GPE mask bit disabled (for wake GPEs) - * - ******************************************************************************/ -acpi_status acpi_disable_gpe(acpi_handle gpe_device, u32 gpe_number) -{ - acpi_status status = AE_BAD_PARAMETER; - struct acpi_gpe_event_info *gpe_event_info; - acpi_cpu_flags flags; - - ACPI_FUNCTION_TRACE(acpi_disable_gpe); - - flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); - - /* Ensure that we have a valid GPE number */ - - gpe_event_info = acpi_ev_get_gpe_event_info(gpe_device, gpe_number); - if (gpe_event_info) { - status = acpi_raw_disable_gpe(gpe_event_info) ; - } - - acpi_os_release_lock(acpi_gbl_gpe_lock, flags); - return_ACPI_STATUS(status); -} -ACPI_EXPORT_SYMBOL(acpi_disable_gpe) - -/******************************************************************************* - * - * FUNCTION: acpi_gpe_can_wake - * - * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 - * gpe_number - GPE level within the GPE block - * - * RETURN: Status - * - * DESCRIPTION: Set the ACPI_GPE_CAN_WAKE flag for the given GPE. If the GPE - * has a corresponding method and is currently enabled, disable it - * (GPEs with corresponding methods are enabled unconditionally - * during initialization, but GPEs that can wake up are expected - * to be initially disabled). - * - ******************************************************************************/ -acpi_status acpi_gpe_can_wake(acpi_handle gpe_device, u32 gpe_number) -{ - acpi_status status = AE_OK; - struct acpi_gpe_event_info *gpe_event_info; - acpi_cpu_flags flags; - - ACPI_FUNCTION_TRACE(acpi_gpe_can_wake); - - flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); - - /* Ensure that we have a valid GPE number */ - - gpe_event_info = acpi_ev_get_gpe_event_info(gpe_device, gpe_number); - if (gpe_event_info) { - gpe_event_info->flags |= ACPI_GPE_CAN_WAKE; - } else { - status = AE_BAD_PARAMETER; - } - - acpi_os_release_lock(acpi_gbl_gpe_lock, flags); - return_ACPI_STATUS(status); -} -ACPI_EXPORT_SYMBOL(acpi_gpe_can_wake) - -/******************************************************************************* - * * FUNCTION: acpi_disable_event * * PARAMETERS: Event - The fixed eventto be enabled @@ -483,44 +297,6 @@ ACPI_EXPORT_SYMBOL(acpi_clear_event) /******************************************************************************* * - * FUNCTION: acpi_clear_gpe - * - * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 - * gpe_number - GPE level within the GPE block - * - * RETURN: Status - * - * DESCRIPTION: Clear an ACPI event (general purpose) - * - ******************************************************************************/ -acpi_status acpi_clear_gpe(acpi_handle gpe_device, u32 gpe_number) -{ - acpi_status status = AE_OK; - struct acpi_gpe_event_info *gpe_event_info; - acpi_cpu_flags flags; - - ACPI_FUNCTION_TRACE(acpi_clear_gpe); - - flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); - - /* Ensure that we have a valid GPE number */ - - gpe_event_info = acpi_ev_get_gpe_event_info(gpe_device, gpe_number); - if (!gpe_event_info) { - status = AE_BAD_PARAMETER; - goto unlock_and_exit; - } - - status = acpi_hw_clear_gpe(gpe_event_info); - - unlock_and_exit: - acpi_os_release_lock(acpi_gbl_gpe_lock, flags); - return_ACPI_STATUS(status); -} - -ACPI_EXPORT_SYMBOL(acpi_clear_gpe) -/******************************************************************************* - * * FUNCTION: acpi_get_event_status * * PARAMETERS: Event - The fixed event @@ -575,379 +351,3 @@ acpi_status acpi_get_event_status(u32 event, acpi_event_status * event_status) } ACPI_EXPORT_SYMBOL(acpi_get_event_status) - -/******************************************************************************* - * - * FUNCTION: acpi_get_gpe_status - * - * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 - * gpe_number - GPE level within the GPE block - * event_status - Where the current status of the event will - * be returned - * - * RETURN: Status - * - * DESCRIPTION: Get status of an event (general purpose) - * - ******************************************************************************/ -acpi_status -acpi_get_gpe_status(acpi_handle gpe_device, - u32 gpe_number, acpi_event_status *event_status) -{ - acpi_status status = AE_OK; - struct acpi_gpe_event_info *gpe_event_info; - acpi_cpu_flags flags; - - ACPI_FUNCTION_TRACE(acpi_get_gpe_status); - - flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); - - /* Ensure that we have a valid GPE number */ - - gpe_event_info = acpi_ev_get_gpe_event_info(gpe_device, gpe_number); - if (!gpe_event_info) { - status = AE_BAD_PARAMETER; - goto unlock_and_exit; - } - - /* Obtain status on the requested GPE number */ - - status = acpi_hw_get_gpe_status(gpe_event_info, event_status); - - if (gpe_event_info->flags & ACPI_GPE_DISPATCH_MASK) - *event_status |= ACPI_EVENT_FLAG_HANDLE; - - unlock_and_exit: - acpi_os_release_lock(acpi_gbl_gpe_lock, flags); - return_ACPI_STATUS(status); -} - -ACPI_EXPORT_SYMBOL(acpi_get_gpe_status) -/******************************************************************************* - * - * FUNCTION: acpi_install_gpe_block - * - * PARAMETERS: gpe_device - Handle to the parent GPE Block Device - * gpe_block_address - Address and space_iD - * register_count - Number of GPE register pairs in the block - * interrupt_number - H/W interrupt for the block - * - * RETURN: Status - * - * DESCRIPTION: Create and Install a block of GPE registers - * - ******************************************************************************/ -acpi_status -acpi_install_gpe_block(acpi_handle gpe_device, - struct acpi_generic_address *gpe_block_address, - u32 register_count, u32 interrupt_number) -{ - acpi_status status = AE_OK; - union acpi_operand_object *obj_desc; - struct acpi_namespace_node *node; - struct acpi_gpe_block_info *gpe_block; - - ACPI_FUNCTION_TRACE(acpi_install_gpe_block); - - if ((!gpe_device) || (!gpe_block_address) || (!register_count)) { - return_ACPI_STATUS(AE_BAD_PARAMETER); - } - - status = acpi_ut_acquire_mutex(ACPI_MTX_NAMESPACE); - if (ACPI_FAILURE(status)) { - return (status); - } - - node = acpi_ns_validate_handle(gpe_device); - if (!node) { - status = AE_BAD_PARAMETER; - goto unlock_and_exit; - } - - /* - * For user-installed GPE Block Devices, the gpe_block_base_number - * is always zero - */ - status = - acpi_ev_create_gpe_block(node, gpe_block_address, register_count, 0, - interrupt_number, &gpe_block); - if (ACPI_FAILURE(status)) { - goto unlock_and_exit; - } - - /* Install block in the device_object attached to the node */ - - obj_desc = acpi_ns_get_attached_object(node); - if (!obj_desc) { - - /* - * No object, create a new one (Device nodes do not always have - * an attached object) - */ - obj_desc = acpi_ut_create_internal_object(ACPI_TYPE_DEVICE); - if (!obj_desc) { - status = AE_NO_MEMORY; - goto unlock_and_exit; - } - - status = - acpi_ns_attach_object(node, obj_desc, ACPI_TYPE_DEVICE); - - /* Remove local reference to the object */ - - acpi_ut_remove_reference(obj_desc); - - if (ACPI_FAILURE(status)) { - goto unlock_and_exit; - } - } - - /* Now install the GPE block in the device_object */ - - obj_desc->device.gpe_block = gpe_block; - - unlock_and_exit: - (void)acpi_ut_release_mutex(ACPI_MTX_NAMESPACE); - return_ACPI_STATUS(status); -} - -ACPI_EXPORT_SYMBOL(acpi_install_gpe_block) - -/******************************************************************************* - * - * FUNCTION: acpi_remove_gpe_block - * - * PARAMETERS: gpe_device - Handle to the parent GPE Block Device - * - * RETURN: Status - * - * DESCRIPTION: Remove a previously installed block of GPE registers - * - ******************************************************************************/ -acpi_status acpi_remove_gpe_block(acpi_handle gpe_device) -{ - union acpi_operand_object *obj_desc; - acpi_status status; - struct acpi_namespace_node *node; - - ACPI_FUNCTION_TRACE(acpi_remove_gpe_block); - - if (!gpe_device) { - return_ACPI_STATUS(AE_BAD_PARAMETER); - } - - status = acpi_ut_acquire_mutex(ACPI_MTX_NAMESPACE); - if (ACPI_FAILURE(status)) { - return (status); - } - - node = acpi_ns_validate_handle(gpe_device); - if (!node) { - status = AE_BAD_PARAMETER; - goto unlock_and_exit; - } - - /* Get the device_object attached to the node */ - - obj_desc = acpi_ns_get_attached_object(node); - if (!obj_desc || !obj_desc->device.gpe_block) { - return_ACPI_STATUS(AE_NULL_OBJECT); - } - - /* Delete the GPE block (but not the device_object) */ - - status = acpi_ev_delete_gpe_block(obj_desc->device.gpe_block); - if (ACPI_SUCCESS(status)) { - obj_desc->device.gpe_block = NULL; - } - - unlock_and_exit: - (void)acpi_ut_release_mutex(ACPI_MTX_NAMESPACE); - return_ACPI_STATUS(status); -} - -ACPI_EXPORT_SYMBOL(acpi_remove_gpe_block) - -/******************************************************************************* - * - * FUNCTION: acpi_get_gpe_device - * - * PARAMETERS: Index - System GPE index (0-current_gpe_count) - * gpe_device - Where the parent GPE Device is returned - * - * RETURN: Status - * - * DESCRIPTION: Obtain the GPE device associated with the input index. A NULL - * gpe device indicates that the gpe number is contained in one of - * the FADT-defined gpe blocks. Otherwise, the GPE block device. - * - ******************************************************************************/ -acpi_status -acpi_get_gpe_device(u32 index, acpi_handle *gpe_device) -{ - struct acpi_gpe_device_info info; - acpi_status status; - - ACPI_FUNCTION_TRACE(acpi_get_gpe_device); - - if (!gpe_device) { - return_ACPI_STATUS(AE_BAD_PARAMETER); - } - - if (index >= acpi_current_gpe_count) { - return_ACPI_STATUS(AE_NOT_EXIST); - } - - /* Setup and walk the GPE list */ - - info.index = index; - info.status = AE_NOT_EXIST; - info.gpe_device = NULL; - info.next_block_base_index = 0; - - status = acpi_ev_walk_gpe_list(acpi_ev_get_gpe_device, &info); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); - } - - *gpe_device = info.gpe_device; - return_ACPI_STATUS(info.status); -} - -ACPI_EXPORT_SYMBOL(acpi_get_gpe_device) - -/******************************************************************************* - * - * FUNCTION: acpi_ev_get_gpe_device - * - * PARAMETERS: GPE_WALK_CALLBACK - * - * RETURN: Status - * - * DESCRIPTION: Matches the input GPE index (0-current_gpe_count) with a GPE - * block device. NULL if the GPE is one of the FADT-defined GPEs. - * - ******************************************************************************/ -static acpi_status -acpi_ev_get_gpe_device(struct acpi_gpe_xrupt_info *gpe_xrupt_info, - struct acpi_gpe_block_info *gpe_block, void *context) -{ - struct acpi_gpe_device_info *info = context; - - /* Increment Index by the number of GPEs in this block */ - - info->next_block_base_index += gpe_block->gpe_count; - - if (info->index < info->next_block_base_index) { - /* - * The GPE index is within this block, get the node. Leave the node - * NULL for the FADT-defined GPEs - */ - if ((gpe_block->node)->type == ACPI_TYPE_DEVICE) { - info->gpe_device = gpe_block->node; - } - - info->status = AE_OK; - return (AE_CTRL_END); - } - - return (AE_OK); -} - -/****************************************************************************** - * - * FUNCTION: acpi_disable_all_gpes - * - * PARAMETERS: None - * - * RETURN: Status - * - * DESCRIPTION: Disable and clear all GPEs in all GPE blocks - * - ******************************************************************************/ - -acpi_status acpi_disable_all_gpes(void) -{ - acpi_status status; - - ACPI_FUNCTION_TRACE(acpi_disable_all_gpes); - - status = acpi_ut_acquire_mutex(ACPI_MTX_EVENTS); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); - } - - status = acpi_hw_disable_all_gpes(); - (void)acpi_ut_release_mutex(ACPI_MTX_EVENTS); - - return_ACPI_STATUS(status); -} - -/****************************************************************************** - * - * FUNCTION: acpi_enable_all_runtime_gpes - * - * PARAMETERS: None - * - * RETURN: Status - * - * DESCRIPTION: Enable all "runtime" GPEs, in all GPE blocks - * - ******************************************************************************/ - -acpi_status acpi_enable_all_runtime_gpes(void) -{ - acpi_status status; - - ACPI_FUNCTION_TRACE(acpi_enable_all_runtime_gpes); - - status = acpi_ut_acquire_mutex(ACPI_MTX_EVENTS); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); - } - - status = acpi_hw_enable_all_runtime_gpes(); - (void)acpi_ut_release_mutex(ACPI_MTX_EVENTS); - - return_ACPI_STATUS(status); -} - -/****************************************************************************** - * - * FUNCTION: acpi_update_gpes - * - * PARAMETERS: None - * - * RETURN: None - * - * DESCRIPTION: Enable all GPEs that have associated _Lxx or _Exx methods and - * are not pointed to by any device _PRW methods indicating that - * these GPEs are generally intended for system or device wakeup - * (such GPEs have to be enabled directly when the devices whose - * _PRW methods point to them are set up for wakeup signaling). - * - ******************************************************************************/ - -acpi_status acpi_update_gpes(void) -{ - acpi_status status; - - ACPI_FUNCTION_TRACE(acpi_update_gpes); - - status = acpi_ut_acquire_mutex(ACPI_MTX_EVENTS); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); - } else if (acpi_all_gpes_initialized) { - goto unlock; - } - - status = acpi_ev_walk_gpe_list(acpi_ev_initialize_gpe_block, NULL); - if (ACPI_SUCCESS(status)) { - acpi_all_gpes_initialized = TRUE; - } - -unlock: - (void)acpi_ut_release_mutex(ACPI_MTX_EVENTS); - - return_ACPI_STATUS(status); -} diff --git a/drivers/acpi/acpica/evxfgpe.c b/drivers/acpi/acpica/evxfgpe.c new file mode 100644 index 0000000..416845b --- /dev/null +++ b/drivers/acpi/acpica/evxfgpe.c @@ -0,0 +1,669 @@ +/****************************************************************************** + * + * Module Name: evxfgpe - External Interfaces for General Purpose Events (GPEs) + * + *****************************************************************************/ + +/* + * Copyright (C) 2000 - 2010, Intel Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + * of any contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + */ + +#include +#include "accommon.h" +#include "acevents.h" +#include "acnamesp.h" + +#define _COMPONENT ACPI_EVENTS +ACPI_MODULE_NAME("evxfgpe") + +/****************************************************************************** + * + * FUNCTION: acpi_update_all_gpes + * + * PARAMETERS: None + * + * RETURN: Status + * + * DESCRIPTION: Complete GPE initialization and enable all GPEs that have + * associated _Lxx or _Exx methods and are not pointed to by any + * device _PRW methods (this indicates that these GPEs are + * generally intended for system or device wakeup. Such GPEs + * have to be enabled directly when the devices whose _PRW + * methods point to them are set up for wakeup signaling.) + * + * NOTE: Should be called after any GPEs are added to the system. Primarily, + * after the system _PRW methods have been run, but also after a GPE Block + * Device has been added or if any new GPE methods have been added via a + * dynamic table load. + * + ******************************************************************************/ + +acpi_status acpi_update_all_gpes(void) +{ + acpi_status status; + + ACPI_FUNCTION_TRACE(acpi_update_all_gpes); + + status = acpi_ut_acquire_mutex(ACPI_MTX_EVENTS); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + if (acpi_gbl_all_gpes_initialized) { + goto unlock_and_exit; + } + + status = acpi_ev_walk_gpe_list(acpi_ev_initialize_gpe_block, NULL); + if (ACPI_SUCCESS(status)) { + acpi_gbl_all_gpes_initialized = TRUE; + } + +unlock_and_exit: + (void)acpi_ut_release_mutex(ACPI_MTX_EVENTS); + + return_ACPI_STATUS(status); +} + +ACPI_EXPORT_SYMBOL(acpi_update_all_gpes) + +/******************************************************************************* + * + * FUNCTION: acpi_enable_gpe + * + * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 + * gpe_number - GPE level within the GPE block + * + * RETURN: Status + * + * DESCRIPTION: Add a reference to a GPE. On the first reference, the GPE is + * hardware-enabled. + * + ******************************************************************************/ + +acpi_status acpi_enable_gpe(acpi_handle gpe_device, u32 gpe_number) +{ + acpi_status status = AE_BAD_PARAMETER; + struct acpi_gpe_event_info *gpe_event_info; + acpi_cpu_flags flags; + + ACPI_FUNCTION_TRACE(acpi_enable_gpe); + + flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); + + /* Ensure that we have a valid GPE number */ + + gpe_event_info = acpi_ev_get_gpe_event_info(gpe_device, gpe_number); + if (gpe_event_info) { + status = acpi_ev_add_gpe_reference(gpe_event_info); + } + + acpi_os_release_lock(acpi_gbl_gpe_lock, flags); + return_ACPI_STATUS(status); +} +ACPI_EXPORT_SYMBOL(acpi_enable_gpe) + +/******************************************************************************* + * + * FUNCTION: acpi_disable_gpe + * + * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 + * gpe_number - GPE level within the GPE block + * + * RETURN: Status + * + * DESCRIPTION: Remove a reference to a GPE. When the last reference is + * removed, only then is the GPE disabled (for runtime GPEs), or + * the GPE mask bit disabled (for wake GPEs) + * + ******************************************************************************/ + +acpi_status acpi_disable_gpe(acpi_handle gpe_device, u32 gpe_number) +{ + acpi_status status = AE_BAD_PARAMETER; + struct acpi_gpe_event_info *gpe_event_info; + acpi_cpu_flags flags; + + ACPI_FUNCTION_TRACE(acpi_disable_gpe); + + flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); + + /* Ensure that we have a valid GPE number */ + + gpe_event_info = acpi_ev_get_gpe_event_info(gpe_device, gpe_number); + if (gpe_event_info) { + status = acpi_ev_remove_gpe_reference(gpe_event_info) ; + } + + acpi_os_release_lock(acpi_gbl_gpe_lock, flags); + return_ACPI_STATUS(status); +} +ACPI_EXPORT_SYMBOL(acpi_disable_gpe) + + +/******************************************************************************* + * + * FUNCTION: acpi_setup_gpe_for_wake + * + * PARAMETERS: wake_device - Device associated with the GPE (via _PRW) + * gpe_device - Parent GPE Device. NULL for GPE0/GPE1 + * gpe_number - GPE level within the GPE block + * + * RETURN: Status + * + * DESCRIPTION: Mark a GPE as having the ability to wake the system. This + * interface is intended to be used as the host executes the + * _PRW methods (Power Resources for Wake) in the system tables. + * Each _PRW appears under a Device Object (The wake_device), and + * contains the info for the wake GPE associated with the + * wake_device. + * + ******************************************************************************/ +acpi_status +acpi_setup_gpe_for_wake(acpi_handle wake_device, + acpi_handle gpe_device, u32 gpe_number) +{ + acpi_status status = AE_BAD_PARAMETER; + struct acpi_gpe_event_info *gpe_event_info; + struct acpi_namespace_node *device_node; + acpi_cpu_flags flags; + + ACPI_FUNCTION_TRACE(acpi_setup_gpe_for_wake); + + /* Parameter Validation */ + + if (!wake_device) { + /* + * By forcing wake_device to be valid, we automatically enable the + * implicit notify feature on all hosts. + */ + return_ACPI_STATUS(AE_BAD_PARAMETER); + } + + /* Validate wake_device is of type Device */ + + device_node = ACPI_CAST_PTR(struct acpi_namespace_node, wake_device); + if (device_node->type != ACPI_TYPE_DEVICE) { + return_ACPI_STATUS(AE_BAD_PARAMETER); + } + + flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); + + /* Ensure that we have a valid GPE number */ + + gpe_event_info = acpi_ev_get_gpe_event_info(gpe_device, gpe_number); + if (gpe_event_info) { + /* + * If there is no method or handler for this GPE, then the + * wake_device will be notified whenever this GPE fires (aka + * "implicit notify") Note: The GPE is assumed to be + * level-triggered (for windows compatibility). + */ + if ((gpe_event_info->flags & ACPI_GPE_DISPATCH_MASK) == + ACPI_GPE_DISPATCH_NONE) { + gpe_event_info->flags = + (ACPI_GPE_DISPATCH_NOTIFY | + ACPI_GPE_LEVEL_TRIGGERED); + gpe_event_info->dispatch.device_node = device_node; + } + + gpe_event_info->flags |= ACPI_GPE_CAN_WAKE; + status = AE_OK; + } + + acpi_os_release_lock(acpi_gbl_gpe_lock, flags); + return_ACPI_STATUS(status); +} +ACPI_EXPORT_SYMBOL(acpi_setup_gpe_for_wake) + +/******************************************************************************* + * + * FUNCTION: acpi_set_gpe_wake_mask + * + * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 + * gpe_number - GPE level within the GPE block + * Action - Enable or Disable + * + * RETURN: Status + * + * DESCRIPTION: Set or clear the GPE's wakeup enable mask bit. The GPE must + * already be marked as a WAKE GPE. + * + ******************************************************************************/ + +acpi_status acpi_set_gpe_wake_mask(acpi_handle gpe_device, u32 gpe_number, u8 action) +{ + acpi_status status = AE_OK; + struct acpi_gpe_event_info *gpe_event_info; + struct acpi_gpe_register_info *gpe_register_info; + acpi_cpu_flags flags; + u32 register_bit; + + ACPI_FUNCTION_TRACE(acpi_set_gpe_wake_mask); + + flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); + + /* + * Ensure that we have a valid GPE number and that this GPE is in + * fact a wake GPE + */ + gpe_event_info = acpi_ev_get_gpe_event_info(gpe_device, gpe_number); + if (!gpe_event_info) { + status = AE_BAD_PARAMETER; + goto unlock_and_exit; + } + + if (!(gpe_event_info->flags & ACPI_GPE_CAN_WAKE)) { + status = AE_TYPE; + goto unlock_and_exit; + } + + gpe_register_info = gpe_event_info->register_info; + if (!gpe_register_info) { + status = AE_NOT_EXIST; + goto unlock_and_exit; + } + + register_bit = + acpi_hw_get_gpe_register_bit(gpe_event_info, gpe_register_info); + + /* Perform the action */ + + switch (action) { + case ACPI_GPE_ENABLE: + ACPI_SET_BIT(gpe_register_info->enable_for_wake, + (u8)register_bit); + break; + + case ACPI_GPE_DISABLE: + ACPI_CLEAR_BIT(gpe_register_info->enable_for_wake, + (u8)register_bit); + break; + + default: + ACPI_ERROR((AE_INFO, "%u, Invalid action", action)); + status = AE_BAD_PARAMETER; + break; + } + +unlock_and_exit: + acpi_os_release_lock(acpi_gbl_gpe_lock, flags); + return_ACPI_STATUS(status); +} + +ACPI_EXPORT_SYMBOL(acpi_set_gpe_wake_mask) + +/******************************************************************************* + * + * FUNCTION: acpi_clear_gpe + * + * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 + * gpe_number - GPE level within the GPE block + * + * RETURN: Status + * + * DESCRIPTION: Clear an ACPI event (general purpose) + * + ******************************************************************************/ +acpi_status acpi_clear_gpe(acpi_handle gpe_device, u32 gpe_number) +{ + acpi_status status = AE_OK; + struct acpi_gpe_event_info *gpe_event_info; + acpi_cpu_flags flags; + + ACPI_FUNCTION_TRACE(acpi_clear_gpe); + + flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); + + /* Ensure that we have a valid GPE number */ + + gpe_event_info = acpi_ev_get_gpe_event_info(gpe_device, gpe_number); + if (!gpe_event_info) { + status = AE_BAD_PARAMETER; + goto unlock_and_exit; + } + + status = acpi_hw_clear_gpe(gpe_event_info); + + unlock_and_exit: + acpi_os_release_lock(acpi_gbl_gpe_lock, flags); + return_ACPI_STATUS(status); +} + +ACPI_EXPORT_SYMBOL(acpi_clear_gpe) + +/******************************************************************************* + * + * FUNCTION: acpi_get_gpe_status + * + * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 + * gpe_number - GPE level within the GPE block + * event_status - Where the current status of the event will + * be returned + * + * RETURN: Status + * + * DESCRIPTION: Get the current status of a GPE (signalled/not_signalled) + * + ******************************************************************************/ +acpi_status +acpi_get_gpe_status(acpi_handle gpe_device, + u32 gpe_number, acpi_event_status *event_status) +{ + acpi_status status = AE_OK; + struct acpi_gpe_event_info *gpe_event_info; + acpi_cpu_flags flags; + + ACPI_FUNCTION_TRACE(acpi_get_gpe_status); + + flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); + + /* Ensure that we have a valid GPE number */ + + gpe_event_info = acpi_ev_get_gpe_event_info(gpe_device, gpe_number); + if (!gpe_event_info) { + status = AE_BAD_PARAMETER; + goto unlock_and_exit; + } + + /* Obtain status on the requested GPE number */ + + status = acpi_hw_get_gpe_status(gpe_event_info, event_status); + + if (gpe_event_info->flags & ACPI_GPE_DISPATCH_MASK) + *event_status |= ACPI_EVENT_FLAG_HANDLE; + + unlock_and_exit: + acpi_os_release_lock(acpi_gbl_gpe_lock, flags); + return_ACPI_STATUS(status); +} + +ACPI_EXPORT_SYMBOL(acpi_get_gpe_status) + +/****************************************************************************** + * + * FUNCTION: acpi_disable_all_gpes + * + * PARAMETERS: None + * + * RETURN: Status + * + * DESCRIPTION: Disable and clear all GPEs in all GPE blocks + * + ******************************************************************************/ + +acpi_status acpi_disable_all_gpes(void) +{ + acpi_status status; + + ACPI_FUNCTION_TRACE(acpi_disable_all_gpes); + + status = acpi_ut_acquire_mutex(ACPI_MTX_EVENTS); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + status = acpi_hw_disable_all_gpes(); + (void)acpi_ut_release_mutex(ACPI_MTX_EVENTS); + + return_ACPI_STATUS(status); +} + +ACPI_EXPORT_SYMBOL(acpi_disable_all_gpes) + +/****************************************************************************** + * + * FUNCTION: acpi_enable_all_runtime_gpes + * + * PARAMETERS: None + * + * RETURN: Status + * + * DESCRIPTION: Enable all "runtime" GPEs, in all GPE blocks + * + ******************************************************************************/ + +acpi_status acpi_enable_all_runtime_gpes(void) +{ + acpi_status status; + + ACPI_FUNCTION_TRACE(acpi_enable_all_runtime_gpes); + + status = acpi_ut_acquire_mutex(ACPI_MTX_EVENTS); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + status = acpi_hw_enable_all_runtime_gpes(); + (void)acpi_ut_release_mutex(ACPI_MTX_EVENTS); + + return_ACPI_STATUS(status); +} + +ACPI_EXPORT_SYMBOL(acpi_enable_all_runtime_gpes) + +/******************************************************************************* + * + * FUNCTION: acpi_install_gpe_block + * + * PARAMETERS: gpe_device - Handle to the parent GPE Block Device + * gpe_block_address - Address and space_iD + * register_count - Number of GPE register pairs in the block + * interrupt_number - H/W interrupt for the block + * + * RETURN: Status + * + * DESCRIPTION: Create and Install a block of GPE registers. The GPEs are not + * enabled here. + * + ******************************************************************************/ +acpi_status +acpi_install_gpe_block(acpi_handle gpe_device, + struct acpi_generic_address *gpe_block_address, + u32 register_count, u32 interrupt_number) +{ + acpi_status status; + union acpi_operand_object *obj_desc; + struct acpi_namespace_node *node; + struct acpi_gpe_block_info *gpe_block; + + ACPI_FUNCTION_TRACE(acpi_install_gpe_block); + + if ((!gpe_device) || (!gpe_block_address) || (!register_count)) { + return_ACPI_STATUS(AE_BAD_PARAMETER); + } + + status = acpi_ut_acquire_mutex(ACPI_MTX_NAMESPACE); + if (ACPI_FAILURE(status)) { + return (status); + } + + node = acpi_ns_validate_handle(gpe_device); + if (!node) { + status = AE_BAD_PARAMETER; + goto unlock_and_exit; + } + + /* + * For user-installed GPE Block Devices, the gpe_block_base_number + * is always zero + */ + status = + acpi_ev_create_gpe_block(node, gpe_block_address, register_count, 0, + interrupt_number, &gpe_block); + if (ACPI_FAILURE(status)) { + goto unlock_and_exit; + } + + /* Install block in the device_object attached to the node */ + + obj_desc = acpi_ns_get_attached_object(node); + if (!obj_desc) { + + /* + * No object, create a new one (Device nodes do not always have + * an attached object) + */ + obj_desc = acpi_ut_create_internal_object(ACPI_TYPE_DEVICE); + if (!obj_desc) { + status = AE_NO_MEMORY; + goto unlock_and_exit; + } + + status = + acpi_ns_attach_object(node, obj_desc, ACPI_TYPE_DEVICE); + + /* Remove local reference to the object */ + + acpi_ut_remove_reference(obj_desc); + + if (ACPI_FAILURE(status)) { + goto unlock_and_exit; + } + } + + /* Now install the GPE block in the device_object */ + + obj_desc->device.gpe_block = gpe_block; + + unlock_and_exit: + (void)acpi_ut_release_mutex(ACPI_MTX_NAMESPACE); + return_ACPI_STATUS(status); +} + +ACPI_EXPORT_SYMBOL(acpi_install_gpe_block) + +/******************************************************************************* + * + * FUNCTION: acpi_remove_gpe_block + * + * PARAMETERS: gpe_device - Handle to the parent GPE Block Device + * + * RETURN: Status + * + * DESCRIPTION: Remove a previously installed block of GPE registers + * + ******************************************************************************/ +acpi_status acpi_remove_gpe_block(acpi_handle gpe_device) +{ + union acpi_operand_object *obj_desc; + acpi_status status; + struct acpi_namespace_node *node; + + ACPI_FUNCTION_TRACE(acpi_remove_gpe_block); + + if (!gpe_device) { + return_ACPI_STATUS(AE_BAD_PARAMETER); + } + + status = acpi_ut_acquire_mutex(ACPI_MTX_NAMESPACE); + if (ACPI_FAILURE(status)) { + return (status); + } + + node = acpi_ns_validate_handle(gpe_device); + if (!node) { + status = AE_BAD_PARAMETER; + goto unlock_and_exit; + } + + /* Get the device_object attached to the node */ + + obj_desc = acpi_ns_get_attached_object(node); + if (!obj_desc || !obj_desc->device.gpe_block) { + return_ACPI_STATUS(AE_NULL_OBJECT); + } + + /* Delete the GPE block (but not the device_object) */ + + status = acpi_ev_delete_gpe_block(obj_desc->device.gpe_block); + if (ACPI_SUCCESS(status)) { + obj_desc->device.gpe_block = NULL; + } + + unlock_and_exit: + (void)acpi_ut_release_mutex(ACPI_MTX_NAMESPACE); + return_ACPI_STATUS(status); +} + +ACPI_EXPORT_SYMBOL(acpi_remove_gpe_block) + +/******************************************************************************* + * + * FUNCTION: acpi_get_gpe_device + * + * PARAMETERS: Index - System GPE index (0-current_gpe_count) + * gpe_device - Where the parent GPE Device is returned + * + * RETURN: Status + * + * DESCRIPTION: Obtain the GPE device associated with the input index. A NULL + * gpe device indicates that the gpe number is contained in one of + * the FADT-defined gpe blocks. Otherwise, the GPE block device. + * + ******************************************************************************/ +acpi_status +acpi_get_gpe_device(u32 index, acpi_handle *gpe_device) +{ + struct acpi_gpe_device_info info; + acpi_status status; + + ACPI_FUNCTION_TRACE(acpi_get_gpe_device); + + if (!gpe_device) { + return_ACPI_STATUS(AE_BAD_PARAMETER); + } + + if (index >= acpi_current_gpe_count) { + return_ACPI_STATUS(AE_NOT_EXIST); + } + + /* Setup and walk the GPE list */ + + info.index = index; + info.status = AE_NOT_EXIST; + info.gpe_device = NULL; + info.next_block_base_index = 0; + + status = acpi_ev_walk_gpe_list(acpi_ev_get_gpe_device, &info); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + *gpe_device = ACPI_CAST_PTR(acpi_handle, info.gpe_device); + return_ACPI_STATUS(info.status); +} + +ACPI_EXPORT_SYMBOL(acpi_get_gpe_device) diff --git a/drivers/acpi/acpica/hwgpe.c b/drivers/acpi/acpica/hwgpe.c index 14750db..85c3cbd 100644 --- a/drivers/acpi/acpica/hwgpe.c +++ b/drivers/acpi/acpica/hwgpe.c @@ -62,10 +62,10 @@ acpi_hw_enable_wakeup_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, * PARAMETERS: gpe_event_info - Info block for the GPE * gpe_register_info - Info block for the GPE register * - * RETURN: Status + * RETURN: Register mask with a one in the GPE bit position * - * DESCRIPTION: Compute GPE enable mask with one bit corresponding to the given - * GPE set. + * DESCRIPTION: Compute the register mask for this GPE. One bit is set in the + * correct position for the input GPE. * ******************************************************************************/ @@ -85,12 +85,12 @@ u32 acpi_hw_get_gpe_register_bit(struct acpi_gpe_event_info *gpe_event_info, * * RETURN: Status * - * DESCRIPTION: Enable or disable a single GPE in its enable register. + * DESCRIPTION: Enable or disable a single GPE in the parent enable register. * ******************************************************************************/ acpi_status -acpi_hw_low_set_gpe(struct acpi_gpe_event_info *gpe_event_info, u8 action) +acpi_hw_low_set_gpe(struct acpi_gpe_event_info *gpe_event_info, u32 action) { struct acpi_gpe_register_info *gpe_register_info; acpi_status status; @@ -113,14 +113,20 @@ acpi_hw_low_set_gpe(struct acpi_gpe_event_info *gpe_event_info, u8 action) return (status); } - /* Set ot clear just the bit that corresponds to this GPE */ + /* Set or clear just the bit that corresponds to this GPE */ register_bit = acpi_hw_get_gpe_register_bit(gpe_event_info, gpe_register_info); switch (action) { - case ACPI_GPE_COND_ENABLE: - if (!(register_bit & gpe_register_info->enable_for_run)) + case ACPI_GPE_CONDITIONAL_ENABLE: + + /* Only enable if the enable_for_run bit is set */ + + if (!(register_bit & gpe_register_info->enable_for_run)) { return (AE_BAD_PARAMETER); + } + + /*lint -fallthrough */ case ACPI_GPE_ENABLE: ACPI_SET_BIT(enable_mask, register_bit); @@ -131,7 +137,7 @@ acpi_hw_low_set_gpe(struct acpi_gpe_event_info *gpe_event_info, u8 action) break; default: - ACPI_ERROR((AE_INFO, "Invalid action\n")); + ACPI_ERROR((AE_INFO, "Invalid GPE Action, %u\n", action)); return (AE_BAD_PARAMETER); } @@ -168,13 +174,13 @@ acpi_status acpi_hw_clear_gpe(struct acpi_gpe_event_info * gpe_event_info) return (AE_NOT_EXIST); } - register_bit = acpi_hw_get_gpe_register_bit(gpe_event_info, - gpe_register_info); - /* * Write a one to the appropriate bit in the status register to * clear this GPE. */ + register_bit = + acpi_hw_get_gpe_register_bit(gpe_event_info, gpe_register_info); + status = acpi_hw_write(register_bit, &gpe_register_info->status_address); @@ -201,8 +207,8 @@ acpi_hw_get_gpe_status(struct acpi_gpe_event_info * gpe_event_info, u32 in_byte; u32 register_bit; struct acpi_gpe_register_info *gpe_register_info; - acpi_status status; acpi_event_status local_event_status = 0; + acpi_status status; ACPI_FUNCTION_ENTRY(); diff --git a/drivers/acpi/acpica/utglobal.c b/drivers/acpi/acpica/utglobal.c index e87bc67..508537f 100644 --- a/drivers/acpi/acpica/utglobal.c +++ b/drivers/acpi/acpica/utglobal.c @@ -768,7 +768,7 @@ acpi_status acpi_ut_init_globals(void) acpi_gbl_gpe_fadt_blocks[0] = NULL; acpi_gbl_gpe_fadt_blocks[1] = NULL; acpi_current_gpe_count = 0; - acpi_all_gpes_initialized = FALSE; + acpi_gbl_all_gpes_initialized = FALSE; /* Global handlers */ @@ -778,6 +778,7 @@ acpi_status acpi_ut_init_globals(void) acpi_gbl_init_handler = NULL; acpi_gbl_table_handler = NULL; acpi_gbl_interface_handler = NULL; + acpi_gbl_global_event_handler = NULL; /* Global Lock support */ diff --git a/drivers/acpi/apei/apei-internal.h b/drivers/acpi/apei/apei-internal.h index 18df1e9..ef0581f 100644 --- a/drivers/acpi/apei/apei-internal.h +++ b/drivers/acpi/apei/apei-internal.h @@ -109,6 +109,8 @@ static inline u32 apei_estatus_len(struct acpi_hest_generic_status *estatus) return sizeof(*estatus) + estatus->data_length; } +void apei_estatus_print(const char *pfx, + const struct acpi_hest_generic_status *estatus); int apei_estatus_check_header(const struct acpi_hest_generic_status *estatus); int apei_estatus_check(const struct acpi_hest_generic_status *estatus); #endif diff --git a/drivers/acpi/apei/cper.c b/drivers/acpi/apei/cper.c index f4cf2fc..31464a0 100644 --- a/drivers/acpi/apei/cper.c +++ b/drivers/acpi/apei/cper.c @@ -46,6 +46,317 @@ u64 cper_next_record_id(void) } EXPORT_SYMBOL_GPL(cper_next_record_id); +static const char *cper_severity_strs[] = { + "recoverable", + "fatal", + "corrected", + "info", +}; + +static const char *cper_severity_str(unsigned int severity) +{ + return severity < ARRAY_SIZE(cper_severity_strs) ? + cper_severity_strs[severity] : "unknown"; +} + +/* + * cper_print_bits - print strings for set bits + * @pfx: prefix for each line, including log level and prefix string + * @bits: bit mask + * @strs: string array, indexed by bit position + * @strs_size: size of the string array: @strs + * + * For each set bit in @bits, print the corresponding string in @strs. + * If the output length is longer than 80, multiple line will be + * printed, with @pfx is printed at the beginning of each line. + */ +static void cper_print_bits(const char *pfx, unsigned int bits, + const char *strs[], unsigned int strs_size) +{ + int i, len = 0; + const char *str; + char buf[84]; + + for (i = 0; i < strs_size; i++) { + if (!(bits & (1U << i))) + continue; + str = strs[i]; + if (len && len + strlen(str) + 2 > 80) { + printk("%s\n", buf); + len = 0; + } + if (!len) + len = snprintf(buf, sizeof(buf), "%s%s", pfx, str); + else + len += snprintf(buf+len, sizeof(buf)-len, ", %s", str); + } + if (len) + printk("%s\n", buf); +} + +static const char *cper_proc_type_strs[] = { + "IA32/X64", + "IA64", +}; + +static const char *cper_proc_isa_strs[] = { + "IA32", + "IA64", + "X64", +}; + +static const char *cper_proc_error_type_strs[] = { + "cache error", + "TLB error", + "bus error", + "micro-architectural error", +}; + +static const char *cper_proc_op_strs[] = { + "unknown or generic", + "data read", + "data write", + "instruction execution", +}; + +static const char *cper_proc_flag_strs[] = { + "restartable", + "precise IP", + "overflow", + "corrected", +}; + +static void cper_print_proc_generic(const char *pfx, + const struct cper_sec_proc_generic *proc) +{ + if (proc->validation_bits & CPER_PROC_VALID_TYPE) + printk("%s""processor_type: %d, %s\n", pfx, proc->proc_type, + proc->proc_type < ARRAY_SIZE(cper_proc_type_strs) ? + cper_proc_type_strs[proc->proc_type] : "unknown"); + if (proc->validation_bits & CPER_PROC_VALID_ISA) + printk("%s""processor_isa: %d, %s\n", pfx, proc->proc_isa, + proc->proc_isa < ARRAY_SIZE(cper_proc_isa_strs) ? + cper_proc_isa_strs[proc->proc_isa] : "unknown"); + if (proc->validation_bits & CPER_PROC_VALID_ERROR_TYPE) { + printk("%s""error_type: 0x%02x\n", pfx, proc->proc_error_type); + cper_print_bits(pfx, proc->proc_error_type, + cper_proc_error_type_strs, + ARRAY_SIZE(cper_proc_error_type_strs)); + } + if (proc->validation_bits & CPER_PROC_VALID_OPERATION) + printk("%s""operation: %d, %s\n", pfx, proc->operation, + proc->operation < ARRAY_SIZE(cper_proc_op_strs) ? + cper_proc_op_strs[proc->operation] : "unknown"); + if (proc->validation_bits & CPER_PROC_VALID_FLAGS) { + printk("%s""flags: 0x%02x\n", pfx, proc->flags); + cper_print_bits(pfx, proc->flags, cper_proc_flag_strs, + ARRAY_SIZE(cper_proc_flag_strs)); + } + if (proc->validation_bits & CPER_PROC_VALID_LEVEL) + printk("%s""level: %d\n", pfx, proc->level); + if (proc->validation_bits & CPER_PROC_VALID_VERSION) + printk("%s""version_info: 0x%016llx\n", pfx, proc->cpu_version); + if (proc->validation_bits & CPER_PROC_VALID_ID) + printk("%s""processor_id: 0x%016llx\n", pfx, proc->proc_id); + if (proc->validation_bits & CPER_PROC_VALID_TARGET_ADDRESS) + printk("%s""target_address: 0x%016llx\n", + pfx, proc->target_addr); + if (proc->validation_bits & CPER_PROC_VALID_REQUESTOR_ID) + printk("%s""requestor_id: 0x%016llx\n", + pfx, proc->requestor_id); + if (proc->validation_bits & CPER_PROC_VALID_RESPONDER_ID) + printk("%s""responder_id: 0x%016llx\n", + pfx, proc->responder_id); + if (proc->validation_bits & CPER_PROC_VALID_IP) + printk("%s""IP: 0x%016llx\n", pfx, proc->ip); +} + +static const char *cper_mem_err_type_strs[] = { + "unknown", + "no error", + "single-bit ECC", + "multi-bit ECC", + "single-symbol chipkill ECC", + "multi-symbol chipkill ECC", + "master abort", + "target abort", + "parity error", + "watchdog timeout", + "invalid address", + "mirror Broken", + "memory sparing", + "scrub corrected error", + "scrub uncorrected error", +}; + +static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem) +{ + if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS) + printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status); + if (mem->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) + printk("%s""physical_address: 0x%016llx\n", + pfx, mem->physical_addr); + if (mem->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS_MASK) + printk("%s""physical_address_mask: 0x%016llx\n", + pfx, mem->physical_addr_mask); + if (mem->validation_bits & CPER_MEM_VALID_NODE) + printk("%s""node: %d\n", pfx, mem->node); + if (mem->validation_bits & CPER_MEM_VALID_CARD) + printk("%s""card: %d\n", pfx, mem->card); + if (mem->validation_bits & CPER_MEM_VALID_MODULE) + printk("%s""module: %d\n", pfx, mem->module); + if (mem->validation_bits & CPER_MEM_VALID_BANK) + printk("%s""bank: %d\n", pfx, mem->bank); + if (mem->validation_bits & CPER_MEM_VALID_DEVICE) + printk("%s""device: %d\n", pfx, mem->device); + if (mem->validation_bits & CPER_MEM_VALID_ROW) + printk("%s""row: %d\n", pfx, mem->row); + if (mem->validation_bits & CPER_MEM_VALID_COLUMN) + printk("%s""column: %d\n", pfx, mem->column); + if (mem->validation_bits & CPER_MEM_VALID_BIT_POSITION) + printk("%s""bit_position: %d\n", pfx, mem->bit_pos); + if (mem->validation_bits & CPER_MEM_VALID_REQUESTOR_ID) + printk("%s""requestor_id: 0x%016llx\n", pfx, mem->requestor_id); + if (mem->validation_bits & CPER_MEM_VALID_RESPONDER_ID) + printk("%s""responder_id: 0x%016llx\n", pfx, mem->responder_id); + if (mem->validation_bits & CPER_MEM_VALID_TARGET_ID) + printk("%s""target_id: 0x%016llx\n", pfx, mem->target_id); + if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) { + u8 etype = mem->error_type; + printk("%s""error_type: %d, %s\n", pfx, etype, + etype < ARRAY_SIZE(cper_mem_err_type_strs) ? + cper_mem_err_type_strs[etype] : "unknown"); + } +} + +static const char *cper_pcie_port_type_strs[] = { + "PCIe end point", + "legacy PCI end point", + "unknown", + "unknown", + "root port", + "upstream switch port", + "downstream switch port", + "PCIe to PCI/PCI-X bridge", + "PCI/PCI-X to PCIe bridge", + "root complex integrated endpoint device", + "root complex event collector", +}; + +static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie) +{ + if (pcie->validation_bits & CPER_PCIE_VALID_PORT_TYPE) + printk("%s""port_type: %d, %s\n", pfx, pcie->port_type, + pcie->port_type < ARRAY_SIZE(cper_pcie_port_type_strs) ? + cper_pcie_port_type_strs[pcie->port_type] : "unknown"); + if (pcie->validation_bits & CPER_PCIE_VALID_VERSION) + printk("%s""version: %d.%d\n", pfx, + pcie->version.major, pcie->version.minor); + if (pcie->validation_bits & CPER_PCIE_VALID_COMMAND_STATUS) + printk("%s""command: 0x%04x, status: 0x%04x\n", pfx, + pcie->command, pcie->status); + if (pcie->validation_bits & CPER_PCIE_VALID_DEVICE_ID) { + const __u8 *p; + printk("%s""device_id: %04x:%02x:%02x.%x\n", pfx, + pcie->device_id.segment, pcie->device_id.bus, + pcie->device_id.device, pcie->device_id.function); + printk("%s""slot: %d\n", pfx, + pcie->device_id.slot >> CPER_PCIE_SLOT_SHIFT); + printk("%s""secondary_bus: 0x%02x\n", pfx, + pcie->device_id.secondary_bus); + printk("%s""vendor_id: 0x%04x, device_id: 0x%04x\n", pfx, + pcie->device_id.vendor_id, pcie->device_id.device_id); + p = pcie->device_id.class_code; + printk("%s""class_code: %02x%02x%02x\n", pfx, p[0], p[1], p[2]); + } + if (pcie->validation_bits & CPER_PCIE_VALID_SERIAL_NUMBER) + printk("%s""serial number: 0x%04x, 0x%04x\n", pfx, + pcie->serial_number.lower, pcie->serial_number.upper); + if (pcie->validation_bits & CPER_PCIE_VALID_BRIDGE_CONTROL_STATUS) + printk( + "%s""bridge: secondary_status: 0x%04x, control: 0x%04x\n", + pfx, pcie->bridge.secondary_status, pcie->bridge.control); +} + +static const char *apei_estatus_section_flag_strs[] = { + "primary", + "containment warning", + "reset", + "threshold exceeded", + "resource not accessible", + "latent error", +}; + +static void apei_estatus_print_section( + const char *pfx, const struct acpi_hest_generic_data *gdata, int sec_no) +{ + uuid_le *sec_type = (uuid_le *)gdata->section_type; + __u16 severity; + + severity = gdata->error_severity; + printk("%s""section: %d, severity: %d, %s\n", pfx, sec_no, severity, + cper_severity_str(severity)); + printk("%s""flags: 0x%02x\n", pfx, gdata->flags); + cper_print_bits(pfx, gdata->flags, apei_estatus_section_flag_strs, + ARRAY_SIZE(apei_estatus_section_flag_strs)); + if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID) + printk("%s""fru_id: %pUl\n", pfx, (uuid_le *)gdata->fru_id); + if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT) + printk("%s""fru_text: %.20s\n", pfx, gdata->fru_text); + + if (!uuid_le_cmp(*sec_type, CPER_SEC_PROC_GENERIC)) { + struct cper_sec_proc_generic *proc_err = (void *)(gdata + 1); + printk("%s""section_type: general processor error\n", pfx); + if (gdata->error_data_length >= sizeof(*proc_err)) + cper_print_proc_generic(pfx, proc_err); + else + goto err_section_too_small; + } else if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) { + struct cper_sec_mem_err *mem_err = (void *)(gdata + 1); + printk("%s""section_type: memory error\n", pfx); + if (gdata->error_data_length >= sizeof(*mem_err)) + cper_print_mem(pfx, mem_err); + else + goto err_section_too_small; + } else if (!uuid_le_cmp(*sec_type, CPER_SEC_PCIE)) { + struct cper_sec_pcie *pcie = (void *)(gdata + 1); + printk("%s""section_type: PCIe error\n", pfx); + if (gdata->error_data_length >= sizeof(*pcie)) + cper_print_pcie(pfx, pcie); + else + goto err_section_too_small; + } else + printk("%s""section type: unknown, %pUl\n", pfx, sec_type); + + return; + +err_section_too_small: + pr_err(FW_WARN "error section length is too small\n"); +} + +void apei_estatus_print(const char *pfx, + const struct acpi_hest_generic_status *estatus) +{ + struct acpi_hest_generic_data *gdata; + unsigned int data_len, gedata_len; + int sec_no = 0; + __u16 severity; + + printk("%s""APEI generic hardware error status\n", pfx); + severity = estatus->error_severity; + printk("%s""severity: %d, %s\n", pfx, severity, + cper_severity_str(severity)); + data_len = estatus->data_length; + gdata = (struct acpi_hest_generic_data *)(estatus + 1); + while (data_len > sizeof(*gdata)) { + gedata_len = gdata->error_data_length; + apei_estatus_print_section(pfx, gdata, sec_no); + data_len -= gedata_len + sizeof(*gdata); + sec_no++; + } +} +EXPORT_SYMBOL_GPL(apei_estatus_print); + int apei_estatus_check_header(const struct acpi_hest_generic_status *estatus) { if (estatus->data_length && diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 0d505e5..d1d484d 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -12,10 +12,6 @@ * For more information about Generic Hardware Error Source, please * refer to ACPI Specification version 4.0, section 17.3.2.6 * - * Now, only SCI notification type and memory errors are - * supported. More notification type and hardware error type will be - * added later. - * * Copyright 2010 Intel Corp. * Author: Huang Ying * @@ -39,14 +35,18 @@ #include #include #include +#include #include #include #include #include +#include +#include #include #include #include #include +#include #include "apei-internal.h" @@ -55,42 +55,131 @@ #define GHES_ESTATUS_MAX_SIZE 65536 /* - * One struct ghes is created for each generic hardware error - * source. - * + * One struct ghes is created for each generic hardware error source. * It provides the context for APEI hardware error timer/IRQ/SCI/NMI - * handler. Handler for one generic hardware error source is only - * triggered after the previous one is done. So handler can uses - * struct ghes without locking. + * handler. * * estatus: memory buffer for error status block, allocated during * HEST parsing. */ #define GHES_TO_CLEAR 0x0001 +#define GHES_EXITING 0x0002 struct ghes { struct acpi_hest_generic *generic; struct acpi_hest_generic_status *estatus; - struct list_head list; u64 buffer_paddr; unsigned long flags; + union { + struct list_head list; + struct timer_list timer; + unsigned int irq; + }; }; +static int ghes_panic_timeout __read_mostly = 30; + /* - * Error source lists, one list for each notification method. The - * members in lists are struct ghes. + * All error sources notified with SCI shares one notifier function, + * so they need to be linked and checked one by one. This is applied + * to NMI too. * - * The list members are only added in HEST parsing and deleted during - * module_exit, that is, single-threaded. So no lock is needed for - * that. - * - * But the mutual exclusion is needed between members adding/deleting - * and timer/IRQ/SCI/NMI handler, which may traverse the list. RCU is - * used for that. + * RCU is used for these lists, so ghes_list_mutex is only used for + * list changing, not for traversing. */ static LIST_HEAD(ghes_sci); +static LIST_HEAD(ghes_nmi); static DEFINE_MUTEX(ghes_list_mutex); +/* + * NMI may be triggered on any CPU, so ghes_nmi_lock is used for + * mutual exclusion. + */ +static DEFINE_RAW_SPINLOCK(ghes_nmi_lock); + +/* + * Because the memory area used to transfer hardware error information + * from BIOS to Linux can be determined only in NMI, IRQ or timer + * handler, but general ioremap can not be used in atomic context, so + * a special version of atomic ioremap is implemented for that. + */ + +/* + * Two virtual pages are used, one for NMI context, the other for + * IRQ/PROCESS context + */ +#define GHES_IOREMAP_PAGES 2 +#define GHES_IOREMAP_NMI_PAGE(base) (base) +#define GHES_IOREMAP_IRQ_PAGE(base) ((base) + PAGE_SIZE) + +/* virtual memory area for atomic ioremap */ +static struct vm_struct *ghes_ioremap_area; +/* + * These 2 spinlock is used to prevent atomic ioremap virtual memory + * area from being mapped simultaneously. + */ +static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi); +static DEFINE_SPINLOCK(ghes_ioremap_lock_irq); + +static int ghes_ioremap_init(void) +{ + ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES, + VM_IOREMAP, VMALLOC_START, VMALLOC_END); + if (!ghes_ioremap_area) { + pr_err(GHES_PFX "Failed to allocate virtual memory area for atomic ioremap.\n"); + return -ENOMEM; + } + + return 0; +} + +static void ghes_ioremap_exit(void) +{ + free_vm_area(ghes_ioremap_area); +} + +static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn) +{ + unsigned long vaddr; + + vaddr = (unsigned long)GHES_IOREMAP_NMI_PAGE(ghes_ioremap_area->addr); + ioremap_page_range(vaddr, vaddr + PAGE_SIZE, + pfn << PAGE_SHIFT, PAGE_KERNEL); + + return (void __iomem *)vaddr; +} + +static void __iomem *ghes_ioremap_pfn_irq(u64 pfn) +{ + unsigned long vaddr; + + vaddr = (unsigned long)GHES_IOREMAP_IRQ_PAGE(ghes_ioremap_area->addr); + ioremap_page_range(vaddr, vaddr + PAGE_SIZE, + pfn << PAGE_SHIFT, PAGE_KERNEL); + + return (void __iomem *)vaddr; +} + +static void ghes_iounmap_nmi(void __iomem *vaddr_ptr) +{ + unsigned long vaddr = (unsigned long __force)vaddr_ptr; + void *base = ghes_ioremap_area->addr; + + BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_NMI_PAGE(base)); + unmap_kernel_range_noflush(vaddr, PAGE_SIZE); + __flush_tlb_one(vaddr); +} + +static void ghes_iounmap_irq(void __iomem *vaddr_ptr) +{ + unsigned long vaddr = (unsigned long __force)vaddr_ptr; + void *base = ghes_ioremap_area->addr; + + BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_IRQ_PAGE(base)); + unmap_kernel_range_noflush(vaddr, PAGE_SIZE); + __flush_tlb_one(vaddr); +} + static struct ghes *ghes_new(struct acpi_hest_generic *generic) { struct ghes *ghes; @@ -101,7 +190,6 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic) if (!ghes) return ERR_PTR(-ENOMEM); ghes->generic = generic; - INIT_LIST_HEAD(&ghes->list); rc = acpi_pre_map_gar(&generic->error_status_address); if (rc) goto err_free; @@ -158,22 +246,41 @@ static inline int ghes_severity(int severity) } } -/* SCI handler run in work queue, so ioremap can be used here */ -static int ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len, - int from_phys) +static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len, + int from_phys) { - void *vaddr; - - vaddr = ioremap_cache(paddr, len); - if (!vaddr) - return -ENOMEM; - if (from_phys) - memcpy(buffer, vaddr, len); - else - memcpy(vaddr, buffer, len); - iounmap(vaddr); - - return 0; + void __iomem *vaddr; + unsigned long flags = 0; + int in_nmi = in_nmi(); + u64 offset; + u32 trunk; + + while (len > 0) { + offset = paddr - (paddr & PAGE_MASK); + if (in_nmi) { + raw_spin_lock(&ghes_ioremap_lock_nmi); + vaddr = ghes_ioremap_pfn_nmi(paddr >> PAGE_SHIFT); + } else { + spin_lock_irqsave(&ghes_ioremap_lock_irq, flags); + vaddr = ghes_ioremap_pfn_irq(paddr >> PAGE_SHIFT); + } + trunk = PAGE_SIZE - offset; + trunk = min(trunk, len); + if (from_phys) + memcpy_fromio(buffer, vaddr + offset, trunk); + else + memcpy_toio(vaddr + offset, buffer, trunk); + len -= trunk; + paddr += trunk; + buffer += trunk; + if (in_nmi) { + ghes_iounmap_nmi(vaddr); + raw_spin_unlock(&ghes_ioremap_lock_nmi); + } else { + ghes_iounmap_irq(vaddr); + spin_unlock_irqrestore(&ghes_ioremap_lock_irq, flags); + } + } } static int ghes_read_estatus(struct ghes *ghes, int silent) @@ -194,10 +301,8 @@ static int ghes_read_estatus(struct ghes *ghes, int silent) if (!buf_paddr) return -ENOENT; - rc = ghes_copy_tofrom_phys(ghes->estatus, buf_paddr, - sizeof(*ghes->estatus), 1); - if (rc) - return rc; + ghes_copy_tofrom_phys(ghes->estatus, buf_paddr, + sizeof(*ghes->estatus), 1); if (!ghes->estatus->block_status) return -ENOENT; @@ -212,17 +317,15 @@ static int ghes_read_estatus(struct ghes *ghes, int silent) goto err_read_block; if (apei_estatus_check_header(ghes->estatus)) goto err_read_block; - rc = ghes_copy_tofrom_phys(ghes->estatus + 1, - buf_paddr + sizeof(*ghes->estatus), - len - sizeof(*ghes->estatus), 1); - if (rc) - return rc; + ghes_copy_tofrom_phys(ghes->estatus + 1, + buf_paddr + sizeof(*ghes->estatus), + len - sizeof(*ghes->estatus), 1); if (apei_estatus_check(ghes->estatus)) goto err_read_block; rc = 0; err_read_block: - if (rc && !silent) + if (rc && !silent && printk_ratelimit()) pr_warning(FW_WARN GHES_PFX "Failed to read error status block!\n"); return rc; @@ -255,11 +358,26 @@ static void ghes_do_proc(struct ghes *ghes) } #endif } +} - if (!processed && printk_ratelimit()) - pr_warning(GHES_PFX - "Unknown error record from generic hardware error source: %d\n", - ghes->generic->header.source_id); +static void ghes_print_estatus(const char *pfx, struct ghes *ghes) +{ + /* Not more than 2 messages every 5 seconds */ + static DEFINE_RATELIMIT_STATE(ratelimit, 5*HZ, 2); + + if (pfx == NULL) { + if (ghes_severity(ghes->estatus->error_severity) <= + GHES_SEV_CORRECTED) + pfx = KERN_WARNING HW_ERR; + else + pfx = KERN_ERR HW_ERR; + } + if (__ratelimit(&ratelimit)) { + printk( + "%s""Hardware error from APEI Generic Hardware Error Source: %d\n", + pfx, ghes->generic->header.source_id); + apei_estatus_print(pfx, ghes->estatus); + } } static int ghes_proc(struct ghes *ghes) @@ -269,6 +387,7 @@ static int ghes_proc(struct ghes *ghes) rc = ghes_read_estatus(ghes, 0); if (rc) goto out; + ghes_print_estatus(NULL, ghes); ghes_do_proc(ghes); out: @@ -276,6 +395,42 @@ out: return 0; } +static void ghes_add_timer(struct ghes *ghes) +{ + struct acpi_hest_generic *g = ghes->generic; + unsigned long expire; + + if (!g->notify.poll_interval) { + pr_warning(FW_WARN GHES_PFX "Poll interval is 0 for generic hardware error source: %d, disabled.\n", + g->header.source_id); + return; + } + expire = jiffies + msecs_to_jiffies(g->notify.poll_interval); + ghes->timer.expires = round_jiffies_relative(expire); + add_timer(&ghes->timer); +} + +static void ghes_poll_func(unsigned long data) +{ + struct ghes *ghes = (void *)data; + + ghes_proc(ghes); + if (!(ghes->flags & GHES_EXITING)) + ghes_add_timer(ghes); +} + +static irqreturn_t ghes_irq_func(int irq, void *data) +{ + struct ghes *ghes = data; + int rc; + + rc = ghes_proc(ghes); + if (rc) + return IRQ_NONE; + + return IRQ_HANDLED; +} + static int ghes_notify_sci(struct notifier_block *this, unsigned long event, void *data) { @@ -292,10 +447,63 @@ static int ghes_notify_sci(struct notifier_block *this, return ret; } +static int ghes_notify_nmi(struct notifier_block *this, + unsigned long cmd, void *data) +{ + struct ghes *ghes, *ghes_global = NULL; + int sev, sev_global = -1; + int ret = NOTIFY_DONE; + + if (cmd != DIE_NMI) + return ret; + + raw_spin_lock(&ghes_nmi_lock); + list_for_each_entry_rcu(ghes, &ghes_nmi, list) { + if (ghes_read_estatus(ghes, 1)) { + ghes_clear_estatus(ghes); + continue; + } + sev = ghes_severity(ghes->estatus->error_severity); + if (sev > sev_global) { + sev_global = sev; + ghes_global = ghes; + } + ret = NOTIFY_STOP; + } + + if (ret == NOTIFY_DONE) + goto out; + + if (sev_global >= GHES_SEV_PANIC) { + oops_begin(); + ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global); + /* reboot to log the error! */ + if (panic_timeout == 0) + panic_timeout = ghes_panic_timeout; + panic("Fatal hardware error!"); + } + + list_for_each_entry_rcu(ghes, &ghes_nmi, list) { + if (!(ghes->flags & GHES_TO_CLEAR)) + continue; + /* Do not print estatus because printk is not NMI safe */ + ghes_do_proc(ghes); + ghes_clear_estatus(ghes); + } + +out: + raw_spin_unlock(&ghes_nmi_lock); + return ret; +} + static struct notifier_block ghes_notifier_sci = { .notifier_call = ghes_notify_sci, }; +static struct notifier_block ghes_notifier_nmi = { + .notifier_call = ghes_notify_nmi, +}; + static int __devinit ghes_probe(struct platform_device *ghes_dev) { struct acpi_hest_generic *generic; @@ -306,18 +514,27 @@ static int __devinit ghes_probe(struct platform_device *ghes_dev) if (!generic->enabled) return -ENODEV; - if (generic->error_block_length < - sizeof(struct acpi_hest_generic_status)) { - pr_warning(FW_BUG GHES_PFX -"Invalid error block length: %u for generic hardware error source: %d\n", - generic->error_block_length, + switch (generic->notify.type) { + case ACPI_HEST_NOTIFY_POLLED: + case ACPI_HEST_NOTIFY_EXTERNAL: + case ACPI_HEST_NOTIFY_SCI: + case ACPI_HEST_NOTIFY_NMI: + break; + case ACPI_HEST_NOTIFY_LOCAL: + pr_warning(GHES_PFX "Generic hardware error source: %d notified via local interrupt is not supported!\n", generic->header.source_id); goto err; + default: + pr_warning(FW_WARN GHES_PFX "Unknown notification type: %u for generic hardware error source: %d\n", + generic->notify.type, generic->header.source_id); + goto err; } - if (generic->records_to_preallocate == 0) { - pr_warning(FW_BUG GHES_PFX -"Invalid records to preallocate: %u for generic hardware error source: %d\n", - generic->records_to_preallocate, + + rc = -EIO; + if (generic->error_block_length < + sizeof(struct acpi_hest_generic_status)) { + pr_warning(FW_BUG GHES_PFX "Invalid error block length: %u for generic hardware error source: %d\n", + generic->error_block_length, generic->header.source_id); goto err; } @@ -327,38 +544,43 @@ static int __devinit ghes_probe(struct platform_device *ghes_dev) ghes = NULL; goto err; } - if (generic->notify.type == ACPI_HEST_NOTIFY_SCI) { + switch (generic->notify.type) { + case ACPI_HEST_NOTIFY_POLLED: + ghes->timer.function = ghes_poll_func; + ghes->timer.data = (unsigned long)ghes; + init_timer_deferrable(&ghes->timer); + ghes_add_timer(ghes); + break; + case ACPI_HEST_NOTIFY_EXTERNAL: + /* External interrupt vector is GSI */ + if (acpi_gsi_to_irq(generic->notify.vector, &ghes->irq)) { + pr_err(GHES_PFX "Failed to map GSI to IRQ for generic hardware error source: %d\n", + generic->header.source_id); + goto err; + } + if (request_irq(ghes->irq, ghes_irq_func, + 0, "GHES IRQ", ghes)) { + pr_err(GHES_PFX "Failed to register IRQ for generic hardware error source: %d\n", + generic->header.source_id); + goto err; + } + break; + case ACPI_HEST_NOTIFY_SCI: mutex_lock(&ghes_list_mutex); if (list_empty(&ghes_sci)) register_acpi_hed_notifier(&ghes_notifier_sci); list_add_rcu(&ghes->list, &ghes_sci); mutex_unlock(&ghes_list_mutex); - } else { - unsigned char *notify = NULL; - - switch (generic->notify.type) { - case ACPI_HEST_NOTIFY_POLLED: - notify = "POLL"; - break; - case ACPI_HEST_NOTIFY_EXTERNAL: - case ACPI_HEST_NOTIFY_LOCAL: - notify = "IRQ"; - break; - case ACPI_HEST_NOTIFY_NMI: - notify = "NMI"; - break; - } - if (notify) { - pr_warning(GHES_PFX -"Generic hardware error source: %d notified via %s is not supported!\n", - generic->header.source_id, notify); - } else { - pr_warning(FW_WARN GHES_PFX -"Unknown notification type: %u for generic hardware error source: %d\n", - generic->notify.type, generic->header.source_id); - } - rc = -ENODEV; - goto err; + break; + case ACPI_HEST_NOTIFY_NMI: + mutex_lock(&ghes_list_mutex); + if (list_empty(&ghes_nmi)) + register_die_notifier(&ghes_notifier_nmi); + list_add_rcu(&ghes->list, &ghes_nmi); + mutex_unlock(&ghes_list_mutex); + break; + default: + BUG(); } platform_set_drvdata(ghes_dev, ghes); @@ -379,7 +601,14 @@ static int __devexit ghes_remove(struct platform_device *ghes_dev) ghes = platform_get_drvdata(ghes_dev); generic = ghes->generic; + ghes->flags |= GHES_EXITING; switch (generic->notify.type) { + case ACPI_HEST_NOTIFY_POLLED: + del_timer_sync(&ghes->timer); + break; + case ACPI_HEST_NOTIFY_EXTERNAL: + free_irq(ghes->irq, ghes); + break; case ACPI_HEST_NOTIFY_SCI: mutex_lock(&ghes_list_mutex); list_del_rcu(&ghes->list); @@ -387,12 +616,23 @@ static int __devexit ghes_remove(struct platform_device *ghes_dev) unregister_acpi_hed_notifier(&ghes_notifier_sci); mutex_unlock(&ghes_list_mutex); break; + case ACPI_HEST_NOTIFY_NMI: + mutex_lock(&ghes_list_mutex); + list_del_rcu(&ghes->list); + if (list_empty(&ghes_nmi)) + unregister_die_notifier(&ghes_notifier_nmi); + mutex_unlock(&ghes_list_mutex); + /* + * To synchronize with NMI handler, ghes can only be + * freed after NMI handler finishes. + */ + synchronize_rcu(); + break; default: BUG(); break; } - synchronize_rcu(); ghes_fini(ghes); kfree(ghes); @@ -412,6 +652,8 @@ static struct platform_driver ghes_platform_driver = { static int __init ghes_init(void) { + int rc; + if (acpi_disabled) return -ENODEV; @@ -420,12 +662,25 @@ static int __init ghes_init(void) return -EINVAL; } - return platform_driver_register(&ghes_platform_driver); + rc = ghes_ioremap_init(); + if (rc) + goto err; + + rc = platform_driver_register(&ghes_platform_driver); + if (rc) + goto err_ioremap_exit; + + return 0; +err_ioremap_exit: + ghes_ioremap_exit(); +err: + return rc; } static void __exit ghes_exit(void) { platform_driver_unregister(&ghes_platform_driver); + ghes_ioremap_exit(); } module_init(ghes_init); diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 95649d3..68bc227 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -631,6 +631,17 @@ static int acpi_battery_update(struct acpi_battery *battery) return result; } +static void acpi_battery_refresh(struct acpi_battery *battery) +{ + if (!battery->bat.dev) + return; + + acpi_battery_get_info(battery); + /* The battery may have changed its reporting units. */ + sysfs_remove_battery(battery); + sysfs_add_battery(battery); +} + /* -------------------------------------------------------------------------- FS Interface (/proc) -------------------------------------------------------------------------- */ @@ -868,6 +879,8 @@ static int acpi_battery_add_fs(struct acpi_device *device) struct proc_dir_entry *entry = NULL; int i; + printk(KERN_WARNING PREFIX "Deprecated procfs I/F for battery is loaded," + " please retry with CONFIG_ACPI_PROCFS_POWER cleared\n"); if (!acpi_device_dir(device)) { acpi_device_dir(device) = proc_mkdir(acpi_device_bid(device), acpi_battery_dir); @@ -914,6 +927,8 @@ static void acpi_battery_notify(struct acpi_device *device, u32 event) if (!battery) return; old = battery->bat.dev; + if (event == ACPI_BATTERY_NOTIFY_INFO) + acpi_battery_refresh(battery); acpi_battery_update(battery); acpi_bus_generate_proc_event(device, event, acpi_battery_present(battery)); @@ -983,6 +998,7 @@ static int acpi_battery_resume(struct acpi_device *device) if (!device) return -EINVAL; battery = acpi_driver_data(device); + acpi_battery_refresh(battery); battery->update_time = 0; acpi_battery_update(battery); return 0; diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index d68bd61..7ced61f 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -52,22 +52,6 @@ EXPORT_SYMBOL(acpi_root_dir); #define STRUCT_TO_INT(s) (*((int*)&s)) -static int set_power_nocheck(const struct dmi_system_id *id) -{ - printk(KERN_NOTICE PREFIX "%s detected - " - "disable power check in power transition\n", id->ident); - acpi_power_nocheck = 1; - return 0; -} -static struct dmi_system_id __cpuinitdata power_nocheck_dmi_table[] = { - { - set_power_nocheck, "HP Pavilion 05", { - DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_SYS_VENDOR, "HP Pavilion 05"), - DMI_MATCH(DMI_PRODUCT_VERSION, "2001211RE101GLEND") }, NULL}, - {}, -}; - #ifdef CONFIG_X86 static int set_copy_dsdt(const struct dmi_system_id *id) @@ -196,33 +180,24 @@ EXPORT_SYMBOL(acpi_bus_get_private_data); Power Management -------------------------------------------------------------------------- */ -int acpi_bus_get_power(acpi_handle handle, int *state) +static int __acpi_bus_get_power(struct acpi_device *device, int *state) { int result = 0; acpi_status status = 0; - struct acpi_device *device = NULL; unsigned long long psc = 0; - - result = acpi_bus_get_device(handle, &device); - if (result) - return result; + if (!device || !state) + return -EINVAL; *state = ACPI_STATE_UNKNOWN; - if (!device->flags.power_manageable) { - /* TBD: Non-recursive algorithm for walking up hierarchy */ - if (device->parent) - *state = device->parent->power.state; - else - *state = ACPI_STATE_D0; - } else { + if (device->flags.power_manageable) { /* * Get the device's power state either directly (via _PSC) or * indirectly (via power resources). */ if (device->power.flags.power_resources) { - result = acpi_power_get_inferred_state(device); + result = acpi_power_get_inferred_state(device, state); if (result) return result; } else if (device->power.flags.explicit_get) { @@ -230,59 +205,33 @@ int acpi_bus_get_power(acpi_handle handle, int *state) NULL, &psc); if (ACPI_FAILURE(status)) return -ENODEV; - device->power.state = (int)psc; + *state = (int)psc; } - - *state = device->power.state; + } else { + /* TBD: Non-recursive algorithm for walking up hierarchy. */ + *state = device->parent ? + device->parent->power.state : ACPI_STATE_D0; } ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Device [%s] power state is D%d\n", - device->pnp.bus_id, device->power.state)); + device->pnp.bus_id, *state)); return 0; } -EXPORT_SYMBOL(acpi_bus_get_power); -int acpi_bus_set_power(acpi_handle handle, int state) +static int __acpi_bus_set_power(struct acpi_device *device, int state) { int result = 0; acpi_status status = AE_OK; - struct acpi_device *device = NULL; char object_name[5] = { '_', 'P', 'S', '0' + state, '\0' }; - - result = acpi_bus_get_device(handle, &device); - if (result) - return result; - - if ((state < ACPI_STATE_D0) || (state > ACPI_STATE_D3)) + if (!device || (state < ACPI_STATE_D0) || (state > ACPI_STATE_D3)) return -EINVAL; /* Make sure this is a valid target state */ - if (!device->flags.power_manageable) { - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Device `[%s]' is not power manageable\n", - kobject_name(&device->dev.kobj))); - return -ENODEV; - } - /* - * Get device's current power state - */ - if (!acpi_power_nocheck) { - /* - * Maybe the incorrect power state is returned on the bogus - * bios, which is different with the real power state. - * For example: the bios returns D0 state and the real power - * state is D3. OS expects to set the device to D0 state. In - * such case if OS uses the power state returned by the BIOS, - * the device can't be transisted to the correct power state. - * So if the acpi_power_nocheck is set, it is unnecessary to - * get the power state by calling acpi_bus_get_power. - */ - acpi_bus_get_power(device->handle, &device->power.state); - } - if ((state == device->power.state) && !device->flags.force_power_state) { + if (state == device->power.state) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Device is already at D%d\n", state)); return 0; @@ -351,8 +300,75 @@ int acpi_bus_set_power(acpi_handle handle, int state) return result; } + +int acpi_bus_set_power(acpi_handle handle, int state) +{ + struct acpi_device *device; + int result; + + result = acpi_bus_get_device(handle, &device); + if (result) + return result; + + if (!device->flags.power_manageable) { + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Device [%s] is not power manageable\n", + dev_name(&device->dev))); + return -ENODEV; + } + + return __acpi_bus_set_power(device, state); +} EXPORT_SYMBOL(acpi_bus_set_power); + +int acpi_bus_init_power(struct acpi_device *device) +{ + int state; + int result; + + if (!device) + return -EINVAL; + + device->power.state = ACPI_STATE_UNKNOWN; + + result = __acpi_bus_get_power(device, &state); + if (result) + return result; + + if (device->power.flags.power_resources) + result = acpi_power_on_resources(device, state); + + if (!result) + device->power.state = state; + + return result; +} + + +int acpi_bus_update_power(acpi_handle handle, int *state_p) +{ + struct acpi_device *device; + int state; + int result; + + result = acpi_bus_get_device(handle, &device); + if (result) + return result; + + result = __acpi_bus_get_power(device, &state); + if (result) + return result; + + result = __acpi_bus_set_power(device, state); + if (!result && state_p) + *state_p = state; + + return result; +} +EXPORT_SYMBOL_GPL(acpi_bus_update_power); + + bool acpi_bus_power_manageable(acpi_handle handle) { struct acpi_device *device; @@ -1023,15 +1039,8 @@ static int __init acpi_init(void) if (acpi_disabled) return result; - /* - * If the laptop falls into the DMI check table, the power state check - * will be disabled in the course of device power transition. - */ - dmi_check_system(power_nocheck_dmi_table); - acpi_scan_init(); acpi_ec_init(); - acpi_power_init(); acpi_debugfs_init(); acpi_sleep_proc_init(); acpi_wakeup_device_init(); diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index 71ef9cd..76bbb78 100644 --- a/drivers/acpi/button.c +++ b/drivers/acpi/button.c @@ -279,6 +279,9 @@ static int acpi_lid_send_state(struct acpi_device *device) input_report_switch(button->input, SW_LID, !state); input_sync(button->input); + if (state) + pm_wakeup_event(&device->dev, 0); + ret = blocking_notifier_call_chain(&acpi_lid_notifier, state, device); if (ret == NOTIFY_DONE) ret = blocking_notifier_call_chain(&acpi_lid_notifier, state, @@ -314,6 +317,8 @@ static void acpi_button_notify(struct acpi_device *device, u32 event) input_sync(input); input_report_key(input, keycode, 0); input_sync(input); + + pm_wakeup_event(&device->dev, 0); } acpi_bus_generate_proc_event(device, event, ++button->pushed); @@ -426,7 +431,7 @@ static int acpi_button_add(struct acpi_device *device) acpi_enable_gpe(device->wakeup.gpe_device, device->wakeup.gpe_number); device->wakeup.run_wake_count++; - device->wakeup.state.enabled = 1; + device_set_wakeup_enable(&device->dev, true); } printk(KERN_INFO PREFIX "%s [%s]\n", name, acpi_device_bid(device)); @@ -449,7 +454,7 @@ static int acpi_button_remove(struct acpi_device *device, int type) acpi_disable_gpe(device->wakeup.gpe_device, device->wakeup.gpe_number); device->wakeup.run_wake_count--; - device->wakeup.state.enabled = 0; + device_set_wakeup_enable(&device->dev, false); } acpi_button_remove_fs(device); diff --git a/drivers/acpi/dock.c b/drivers/acpi/dock.c index 81514a4..1864ad3 100644 --- a/drivers/acpi/dock.c +++ b/drivers/acpi/dock.c @@ -725,7 +725,7 @@ static void dock_notify(acpi_handle handle, u32 event, void *data) complete_dock(ds); dock_event(ds, event, DOCK_EVENT); dock_lock(ds, 1); - acpi_update_gpes(); + acpi_update_all_gpes(); break; } if (dock_present(ds) || dock_in_progress(ds)) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 302b31e..fa848c4 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -606,7 +606,8 @@ static int ec_check_sci(struct acpi_ec *ec, u8 state) return 0; } -static u32 acpi_ec_gpe_handler(void *data) +static u32 acpi_ec_gpe_handler(acpi_handle gpe_device, + u32 gpe_number, void *data) { struct acpi_ec *ec = data; @@ -618,7 +619,7 @@ static u32 acpi_ec_gpe_handler(void *data) wake_up(&ec->wait); ec_check_sci(ec, acpi_ec_read_status(ec)); } - return ACPI_INTERRUPT_HANDLED; + return ACPI_INTERRUPT_HANDLED | ACPI_REENABLE_GPE; } /* -------------------------------------------------------------------------- diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index 6004908..467479f 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -86,7 +86,7 @@ static int fan_get_cur_state(struct thermal_cooling_device *cdev, unsigned long if (!device) return -EINVAL; - result = acpi_bus_get_power(device->handle, &acpi_state); + result = acpi_bus_update_power(device->handle, &acpi_state); if (result) return result; @@ -123,7 +123,6 @@ static struct thermal_cooling_device_ops fan_cooling_ops = { static int acpi_fan_add(struct acpi_device *device) { int result = 0; - int state = 0; struct thermal_cooling_device *cdev; if (!device) @@ -132,16 +131,12 @@ static int acpi_fan_add(struct acpi_device *device) strcpy(acpi_device_name(device), "Fan"); strcpy(acpi_device_class(device), ACPI_FAN_CLASS); - result = acpi_bus_get_power(device->handle, &state); + result = acpi_bus_update_power(device->handle, NULL); if (result) { - printk(KERN_ERR PREFIX "Reading power state\n"); + printk(KERN_ERR PREFIX "Setting initial power state\n"); goto end; } - device->flags.force_power_state = 1; - acpi_bus_set_power(device->handle, state); - device->flags.force_power_state = 0; - cdev = thermal_cooling_device_register("Fan", device, &fan_cooling_ops); if (IS_ERR(cdev)) { @@ -200,22 +195,14 @@ static int acpi_fan_suspend(struct acpi_device *device, pm_message_t state) static int acpi_fan_resume(struct acpi_device *device) { - int result = 0; - int power_state = 0; + int result; if (!device) return -EINVAL; - result = acpi_bus_get_power(device->handle, &power_state); - if (result) { - printk(KERN_ERR PREFIX - "Error reading fan power state\n"); - return result; - } - - device->flags.force_power_state = 1; - acpi_bus_set_power(device->handle, power_state); - device->flags.force_power_state = 0; + result = acpi_bus_update_power(device->handle, NULL); + if (result) + printk(KERN_ERR PREFIX "Error updating fan power state\n"); return result; } diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c index 78b0164..7c47ed5 100644 --- a/drivers/acpi/glue.c +++ b/drivers/acpi/glue.c @@ -167,11 +167,8 @@ static int acpi_bind_one(struct device *dev, acpi_handle handle) "firmware_node"); ret = sysfs_create_link(&acpi_dev->dev.kobj, &dev->kobj, "physical_node"); - if (acpi_dev->wakeup.flags.valid) { + if (acpi_dev->wakeup.flags.valid) device_set_wakeup_capable(dev, true); - device_set_wakeup_enable(dev, - acpi_dev->wakeup.state.enabled); - } } return 0; diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index a212bfe..b1cc81a 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -41,9 +41,10 @@ static inline int acpi_debugfs_init(void) { return 0; } int acpi_power_init(void); int acpi_device_sleep_wake(struct acpi_device *dev, int enable, int sleep_state, int dev_state); -int acpi_power_get_inferred_state(struct acpi_device *device); +int acpi_power_get_inferred_state(struct acpi_device *device, int *state); +int acpi_power_on_resources(struct acpi_device *device, int state); int acpi_power_transition(struct acpi_device *device, int state); -extern int acpi_power_nocheck; +int acpi_bus_init_power(struct acpi_device *device); int acpi_wakeup_device_init(void); void acpi_early_processor_set_pdc(void); @@ -82,8 +83,16 @@ extern int acpi_sleep_init(void); #ifdef CONFIG_ACPI_SLEEP int acpi_sleep_proc_init(void); +int suspend_nvs_alloc(void); +void suspend_nvs_free(void); +int suspend_nvs_save(void); +void suspend_nvs_restore(void); #else static inline int acpi_sleep_proc_init(void) { return 0; } +static inline int suspend_nvs_alloc(void) { return 0; } +static inline void suspend_nvs_free(void) {} +static inline int suspend_nvs_save(void) { return 0; } +static inline void suspend_nvs_restore(void) {} #endif #endif /* _ACPI_INTERNAL_H_ */ diff --git a/drivers/acpi/nvs.c b/drivers/acpi/nvs.c new file mode 100644 index 0000000..54b6ab8 --- /dev/null +++ b/drivers/acpi/nvs.c @@ -0,0 +1,144 @@ +/* + * nvs.c - Routines for saving and restoring ACPI NVS memory region + * + * Copyright (C) 2008-2011 Rafael J. Wysocki , Novell Inc. + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Platforms, like ACPI, may want us to save some memory used by them during + * suspend and to restore the contents of this memory during the subsequent + * resume. The code below implements a mechanism allowing us to do that. + */ + +struct nvs_page { + unsigned long phys_start; + unsigned int size; + void *kaddr; + void *data; + struct list_head node; +}; + +static LIST_HEAD(nvs_list); + +/** + * suspend_nvs_register - register platform NVS memory region to save + * @start - physical address of the region + * @size - size of the region + * + * The NVS region need not be page-aligned (both ends) and we arrange + * things so that the data from page-aligned addresses in this region will + * be copied into separate RAM pages. + */ +int suspend_nvs_register(unsigned long start, unsigned long size) +{ + struct nvs_page *entry, *next; + + while (size > 0) { + unsigned int nr_bytes; + + entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); + if (!entry) + goto Error; + + list_add_tail(&entry->node, &nvs_list); + entry->phys_start = start; + nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); + entry->size = (size < nr_bytes) ? size : nr_bytes; + + start += entry->size; + size -= entry->size; + } + return 0; + + Error: + list_for_each_entry_safe(entry, next, &nvs_list, node) { + list_del(&entry->node); + kfree(entry); + } + return -ENOMEM; +} + +/** + * suspend_nvs_free - free data pages allocated for saving NVS regions + */ +void suspend_nvs_free(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + free_page((unsigned long)entry->data); + entry->data = NULL; + if (entry->kaddr) { + acpi_os_unmap_memory(entry->kaddr, entry->size); + entry->kaddr = NULL; + } + } +} + +/** + * suspend_nvs_alloc - allocate memory necessary for saving NVS regions + */ +int suspend_nvs_alloc(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) { + entry->data = (void *)__get_free_page(GFP_KERNEL); + if (!entry->data) { + suspend_nvs_free(); + return -ENOMEM; + } + } + return 0; +} + +/** + * suspend_nvs_save - save NVS memory regions + */ +int suspend_nvs_save(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Saving platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + entry->kaddr = acpi_os_map_memory(entry->phys_start, + entry->size); + if (!entry->kaddr) { + suspend_nvs_free(); + return -ENOMEM; + } + memcpy(entry->data, entry->kaddr, entry->size); + } + + return 0; +} + +/** + * suspend_nvs_restore - restore NVS memory regions + * + * This function is going to be called with interrupts disabled, so it + * cannot iounmap the virtual addresses used to access the NVS region. + */ +void suspend_nvs_restore(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Restoring platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) + memcpy(entry->kaddr, entry->data, entry->size); +} diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 055d7b7..e2dd6de 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -320,7 +320,7 @@ acpi_os_map_memory(acpi_physical_address phys, acpi_size size) pg_off = round_down(phys, PAGE_SIZE); pg_sz = round_up(phys + size, PAGE_SIZE) - pg_off; - virt = ioremap(pg_off, pg_sz); + virt = ioremap_cache(pg_off, pg_sz); if (!virt) { kfree(map); return NULL; @@ -642,7 +642,7 @@ acpi_os_read_memory(acpi_physical_address phys_addr, u32 * value, u32 width) virt_addr = acpi_map_vaddr_lookup(phys_addr, size); rcu_read_unlock(); if (!virt_addr) { - virt_addr = ioremap(phys_addr, size); + virt_addr = ioremap_cache(phys_addr, size); unmap = 1; } if (!value) @@ -678,7 +678,7 @@ acpi_os_write_memory(acpi_physical_address phys_addr, u32 value, u32 width) virt_addr = acpi_map_vaddr_lookup(phys_addr, size); rcu_read_unlock(); if (!virt_addr) { - virt_addr = ioremap(phys_addr, size); + virt_addr = ioremap_cache(phys_addr, size); unmap = 1; } @@ -1233,8 +1233,7 @@ __setup("acpi_enforce_resources=", acpi_enforce_resources_setup); int acpi_check_resource_conflict(const struct resource *res) { struct acpi_res_list *res_list_elem; - int ioport; - int clash = 0; + int ioport = 0, clash = 0; if (acpi_enforce_resources == ENFORCE_RESOURCES_NO) return 0; @@ -1264,9 +1263,13 @@ int acpi_check_resource_conflict(const struct resource *res) if (clash) { if (acpi_enforce_resources != ENFORCE_RESOURCES_NO) { printk(KERN_WARNING "ACPI: resource %s %pR" - " conflicts with ACPI region %s %pR\n", + " conflicts with ACPI region %s " + "[%s 0x%zx-0x%zx]\n", res->name, res, res_list_elem->name, - res_list_elem); + (res_list_elem->resource_type == + ACPI_ADR_SPACE_SYSTEM_IO) ? "io" : "mem", + (size_t) res_list_elem->start, + (size_t) res_list_elem->end); if (acpi_enforce_resources == ENFORCE_RESOURCES_LAX) printk(KERN_NOTICE "ACPI: This conflict may" " cause random problems and system" diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c index 4c9c2fb..9ac2a9f 100644 --- a/drivers/acpi/power.c +++ b/drivers/acpi/power.c @@ -56,9 +56,6 @@ ACPI_MODULE_NAME("power"); #define ACPI_POWER_RESOURCE_STATE_ON 0x01 #define ACPI_POWER_RESOURCE_STATE_UNKNOWN 0xFF -int acpi_power_nocheck; -module_param_named(power_nocheck, acpi_power_nocheck, bool, 000); - static int acpi_power_add(struct acpi_device *device); static int acpi_power_remove(struct acpi_device *device, int type); static int acpi_power_resume(struct acpi_device *device); @@ -148,9 +145,8 @@ static int acpi_power_get_state(acpi_handle handle, int *state) static int acpi_power_get_list_state(struct acpi_handle_list *list, int *state) { - int result = 0, state1; - u32 i = 0; - + int cur_state; + int i = 0; if (!list || !state) return -EINVAL; @@ -158,25 +154,33 @@ static int acpi_power_get_list_state(struct acpi_handle_list *list, int *state) /* The state of the list is 'on' IFF all resources are 'on'. */ for (i = 0; i < list->count; i++) { - /* - * The state of the power resource can be obtained by - * using the ACPI handle. In such case it is unnecessary to - * get the Power resource first and then get its state again. - */ - result = acpi_power_get_state(list->handles[i], &state1); + struct acpi_power_resource *resource; + acpi_handle handle = list->handles[i]; + int result; + + result = acpi_power_get_context(handle, &resource); if (result) return result; - *state = state1; + mutex_lock(&resource->resource_lock); - if (*state != ACPI_POWER_RESOURCE_STATE_ON) + result = acpi_power_get_state(handle, &cur_state); + + mutex_unlock(&resource->resource_lock); + + if (result) + return result; + + if (cur_state != ACPI_POWER_RESOURCE_STATE_ON) break; } ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Resource list is %s\n", - *state ? "on" : "off")); + cur_state ? "on" : "off")); - return result; + *state = cur_state; + + return 0; } static int __acpi_power_on(struct acpi_power_resource *resource) @@ -222,7 +226,7 @@ static int acpi_power_on(acpi_handle handle) return result; } -static int acpi_power_off_device(acpi_handle handle) +static int acpi_power_off(acpi_handle handle) { int result = 0; acpi_status status = AE_OK; @@ -266,6 +270,35 @@ static int acpi_power_off_device(acpi_handle handle) return result; } +static void __acpi_power_off_list(struct acpi_handle_list *list, int num_res) +{ + int i; + + for (i = num_res - 1; i >= 0 ; i--) + acpi_power_off(list->handles[i]); +} + +static void acpi_power_off_list(struct acpi_handle_list *list) +{ + __acpi_power_off_list(list, list->count); +} + +static int acpi_power_on_list(struct acpi_handle_list *list) +{ + int result = 0; + int i; + + for (i = 0; i < list->count; i++) { + result = acpi_power_on(list->handles[i]); + if (result) { + __acpi_power_off_list(list, i); + break; + } + } + + return result; +} + /** * acpi_device_sleep_wake - execute _DSW (Device Sleep Wake) or (deprecated in * ACPI 3.0) _PSW (Power State Wake) @@ -404,8 +437,7 @@ int acpi_disable_wakeup_device_power(struct acpi_device *dev) /* Close power resource */ for (i = 0; i < dev->wakeup.resources.count; i++) { - int ret = acpi_power_off_device( - dev->wakeup.resources.handles[i]); + int ret = acpi_power_off(dev->wakeup.resources.handles[i]); if (ret) { printk(KERN_ERR PREFIX "Transition power state\n"); dev->wakeup.flags.valid = 0; @@ -423,19 +455,16 @@ int acpi_disable_wakeup_device_power(struct acpi_device *dev) Device Power Management -------------------------------------------------------------------------- */ -int acpi_power_get_inferred_state(struct acpi_device *device) +int acpi_power_get_inferred_state(struct acpi_device *device, int *state) { int result = 0; struct acpi_handle_list *list = NULL; int list_state = 0; int i = 0; - - if (!device) + if (!device || !state) return -EINVAL; - device->power.state = ACPI_STATE_UNKNOWN; - /* * We know a device's inferred power state when all the resources * required for a given D-state are 'on'. @@ -450,22 +479,26 @@ int acpi_power_get_inferred_state(struct acpi_device *device) return result; if (list_state == ACPI_POWER_RESOURCE_STATE_ON) { - device->power.state = i; + *state = i; return 0; } } - device->power.state = ACPI_STATE_D3; - + *state = ACPI_STATE_D3; return 0; } +int acpi_power_on_resources(struct acpi_device *device, int state) +{ + if (!device || state < ACPI_STATE_D0 || state > ACPI_STATE_D3) + return -EINVAL; + + return acpi_power_on_list(&device->power.states[state].resources); +} + int acpi_power_transition(struct acpi_device *device, int state) { - int result = 0; - struct acpi_handle_list *cl = NULL; /* Current Resources */ - struct acpi_handle_list *tl = NULL; /* Target Resources */ - int i = 0; + int result; if (!device || (state < ACPI_STATE_D0) || (state > ACPI_STATE_D3)) return -EINVAL; @@ -477,37 +510,20 @@ int acpi_power_transition(struct acpi_device *device, int state) || (device->power.state > ACPI_STATE_D3)) return -ENODEV; - cl = &device->power.states[device->power.state].resources; - tl = &device->power.states[state].resources; - /* TBD: Resources must be ordered. */ /* * First we reference all power resources required in the target list - * (e.g. so the device doesn't lose power while transitioning). + * (e.g. so the device doesn't lose power while transitioning). Then, + * we dereference all power resources used in the current list. */ - for (i = 0; i < tl->count; i++) { - result = acpi_power_on(tl->handles[i]); - if (result) - goto end; - } + result = acpi_power_on_list(&device->power.states[state].resources); + if (!result) + acpi_power_off_list( + &device->power.states[device->power.state].resources); - /* - * Then we dereference all power resources used in the current list. - */ - for (i = 0; i < cl->count; i++) { - result = acpi_power_off_device(cl->handles[i]); - if (result) - goto end; - } - - end: - if (result) - device->power.state = ACPI_STATE_UNKNOWN; - else { - /* We shouldn't change the state till all above operations succeed */ - device->power.state = state; - } + /* We shouldn't change the state unless the above operations succeed. */ + device->power.state = result ? ACPI_STATE_UNKNOWN : state; return result; } diff --git a/drivers/acpi/proc.c b/drivers/acpi/proc.c index afad677..f5f9869 100644 --- a/drivers/acpi/proc.c +++ b/drivers/acpi/proc.c @@ -311,7 +311,9 @@ acpi_system_wakeup_device_seq_show(struct seq_file *seq, void *offset) dev->pnp.bus_id, (u32) dev->wakeup.sleep_state, dev->wakeup.flags.run_wake ? '*' : ' ', - dev->wakeup.state.enabled ? "enabled" : "disabled"); + (device_may_wakeup(&dev->dev) + || (ldev && device_may_wakeup(ldev))) ? + "enabled" : "disabled"); if (ldev) seq_printf(seq, "%s:%s", ldev->bus ? ldev->bus->name : "no-bus", @@ -328,8 +330,10 @@ static void physical_device_enable_wakeup(struct acpi_device *adev) { struct device *dev = acpi_get_physical_device(adev->handle); - if (dev && device_can_wakeup(dev)) - device_set_wakeup_enable(dev, adev->wakeup.state.enabled); + if (dev && device_can_wakeup(dev)) { + bool enable = !device_may_wakeup(dev); + device_set_wakeup_enable(dev, enable); + } } static ssize_t @@ -341,7 +345,6 @@ acpi_system_write_wakeup_device(struct file *file, char strbuf[5]; char str[5] = ""; unsigned int len = count; - struct acpi_device *found_dev = NULL; if (len > 4) len = 4; @@ -361,33 +364,13 @@ acpi_system_write_wakeup_device(struct file *file, continue; if (!strncmp(dev->pnp.bus_id, str, 4)) { - dev->wakeup.state.enabled = - dev->wakeup.state.enabled ? 0 : 1; - found_dev = dev; - break; - } - } - if (found_dev) { - physical_device_enable_wakeup(found_dev); - list_for_each_safe(node, next, &acpi_wakeup_device_list) { - struct acpi_device *dev = container_of(node, - struct - acpi_device, - wakeup_list); - - if ((dev != found_dev) && - (dev->wakeup.gpe_number == - found_dev->wakeup.gpe_number) - && (dev->wakeup.gpe_device == - found_dev->wakeup.gpe_device)) { - printk(KERN_WARNING - "ACPI: '%s' and '%s' have the same GPE, " - "can't disable/enable one separately\n", - dev->pnp.bus_id, found_dev->pnp.bus_id); - dev->wakeup.state.enabled = - found_dev->wakeup.state.enabled; + if (device_can_wakeup(&dev->dev)) { + bool enable = !device_may_wakeup(&dev->dev); + device_set_wakeup_enable(&dev->dev, enable); + } else { physical_device_enable_wakeup(dev); } + break; } } mutex_unlock(&acpi_device_lock); diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index bec561c..3c1a2fe 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -23,7 +23,7 @@ static int set_no_mwait(const struct dmi_system_id *id) { printk(KERN_NOTICE PREFIX "%s detected - " "disabling mwait for CPU C-states\n", id->ident); - idle_nomwait = 1; + boot_option_idle_override = IDLE_NOMWAIT; return 0; } @@ -283,7 +283,7 @@ acpi_processor_eval_pdc(acpi_handle handle, struct acpi_object_list *pdc_in) { acpi_status status = AE_OK; - if (idle_nomwait) { + if (boot_option_idle_override == IDLE_NOMWAIT) { /* * If mwait is disabled for CPU C-states, the C2C3_FFH access * mode will be disabled in the parameter of _PDC object. diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index 85e4804..360a74e 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -40,10 +40,6 @@ #include #include #include -#ifdef CONFIG_ACPI_PROCFS -#include -#include -#endif #include #include #include @@ -246,53 +242,6 @@ static int acpi_processor_errata(struct acpi_processor *pr) return result; } -#ifdef CONFIG_ACPI_PROCFS -static struct proc_dir_entry *acpi_processor_dir = NULL; - -static int __cpuinit acpi_processor_add_fs(struct acpi_device *device) -{ - struct proc_dir_entry *entry = NULL; - - - if (!acpi_device_dir(device)) { - acpi_device_dir(device) = proc_mkdir(acpi_device_bid(device), - acpi_processor_dir); - if (!acpi_device_dir(device)) - return -ENODEV; - } - - /* 'throttling' [R/W] */ - entry = proc_create_data(ACPI_PROCESSOR_FILE_THROTTLING, - S_IFREG | S_IRUGO | S_IWUSR, - acpi_device_dir(device), - &acpi_processor_throttling_fops, - acpi_driver_data(device)); - if (!entry) - return -EIO; - return 0; -} -static int acpi_processor_remove_fs(struct acpi_device *device) -{ - - if (acpi_device_dir(device)) { - remove_proc_entry(ACPI_PROCESSOR_FILE_THROTTLING, - acpi_device_dir(device)); - remove_proc_entry(acpi_device_bid(device), acpi_processor_dir); - acpi_device_dir(device) = NULL; - } - - return 0; -} -#else -static inline int acpi_processor_add_fs(struct acpi_device *device) -{ - return 0; -} -static inline int acpi_processor_remove_fs(struct acpi_device *device) -{ - return 0; -} -#endif /* -------------------------------------------------------------------------- Driver Interface -------------------------------------------------------------------------- */ @@ -478,8 +427,13 @@ static int acpi_cpu_soft_notify(struct notifier_block *nfb, if (action == CPU_ONLINE && pr) { acpi_processor_ppc_has_changed(pr, 0); acpi_processor_cst_has_changed(pr); + acpi_processor_reevaluate_tstate(pr, action); acpi_processor_tstate_has_changed(pr); } + if (action == CPU_DEAD && pr) { + /* invalidate the flag.throttling after one CPU is offline */ + acpi_processor_reevaluate_tstate(pr, action); + } return NOTIFY_OK; } @@ -537,14 +491,10 @@ static int __cpuinit acpi_processor_add(struct acpi_device *device) per_cpu(processors, pr->id) = pr; - result = acpi_processor_add_fs(device); - if (result) - goto err_free_cpumask; - sysdev = get_cpu_sysdev(pr->id); if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev")) { result = -EFAULT; - goto err_remove_fs; + goto err_free_cpumask; } #ifdef CONFIG_CPU_FREQ @@ -590,8 +540,6 @@ err_thermal_unregister: thermal_cooling_device_unregister(pr->cdev); err_power_exit: acpi_processor_power_exit(pr, device); -err_remove_fs: - acpi_processor_remove_fs(device); err_free_cpumask: free_cpumask_var(pr->throttling.shared_cpu_map); @@ -620,8 +568,6 @@ static int acpi_processor_remove(struct acpi_device *device, int type) sysfs_remove_link(&device->dev.kobj, "sysdev"); - acpi_processor_remove_fs(device); - if (pr->cdev) { sysfs_remove_link(&device->dev.kobj, "thermal_cooling"); sysfs_remove_link(&pr->cdev->device.kobj, "device"); @@ -854,12 +800,6 @@ static int __init acpi_processor_init(void) memset(&errata, 0, sizeof(errata)); -#ifdef CONFIG_ACPI_PROCFS - acpi_processor_dir = proc_mkdir(ACPI_PROCESSOR_CLASS, acpi_root_dir); - if (!acpi_processor_dir) - return -ENOMEM; -#endif - if (!cpuidle_register_driver(&acpi_idle_driver)) { printk(KERN_DEBUG "ACPI: %s registered with cpuidle\n", acpi_idle_driver.name); @@ -885,10 +825,6 @@ static int __init acpi_processor_init(void) out_cpuidle: cpuidle_unregister_driver(&acpi_idle_driver); -#ifdef CONFIG_ACPI_PROCFS - remove_proc_entry(ACPI_PROCESSOR_CLASS, acpi_root_dir); -#endif - return result; } @@ -907,10 +843,6 @@ static void __exit acpi_processor_exit(void) cpuidle_unregister_driver(&acpi_idle_driver); -#ifdef CONFIG_ACPI_PROCFS - remove_proc_entry(ACPI_PROCESSOR_CLASS, acpi_root_dir); -#endif - return; } diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index a765b82..d615b7d 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -79,6 +79,13 @@ module_param(bm_check_disable, uint, 0000); static unsigned int latency_factor __read_mostly = 2; module_param(latency_factor, uint, 0644); +static int disabled_by_idle_boot_param(void) +{ + return boot_option_idle_override == IDLE_POLL || + boot_option_idle_override == IDLE_FORCE_MWAIT || + boot_option_idle_override == IDLE_HALT; +} + /* * IBM ThinkPad R40e crashes mysteriously when going into C2 or C3. * For now disable this. Probably a bug somewhere else. @@ -455,7 +462,7 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) continue; } if (cx.type == ACPI_STATE_C1 && - (idle_halt || idle_nomwait)) { + (boot_option_idle_override == IDLE_NOMWAIT)) { /* * In most cases the C1 space_id obtained from * _CST object is FIXED_HARDWARE access mode. @@ -1016,7 +1023,6 @@ static int acpi_processor_setup_cpuidle(struct acpi_processor *pr) state->flags = 0; switch (cx->type) { case ACPI_STATE_C1: - state->flags |= CPUIDLE_FLAG_SHALLOW; if (cx->entry_method == ACPI_CSTATE_FFH) state->flags |= CPUIDLE_FLAG_TIME_VALID; @@ -1025,16 +1031,13 @@ static int acpi_processor_setup_cpuidle(struct acpi_processor *pr) break; case ACPI_STATE_C2: - state->flags |= CPUIDLE_FLAG_BALANCED; state->flags |= CPUIDLE_FLAG_TIME_VALID; state->enter = acpi_idle_enter_simple; dev->safe_state = state; break; case ACPI_STATE_C3: - state->flags |= CPUIDLE_FLAG_DEEP; state->flags |= CPUIDLE_FLAG_TIME_VALID; - state->flags |= CPUIDLE_FLAG_CHECK_BM; state->enter = pr->flags.bm_check ? acpi_idle_enter_bm : acpi_idle_enter_simple; @@ -1058,7 +1061,7 @@ int acpi_processor_cst_has_changed(struct acpi_processor *pr) { int ret = 0; - if (boot_option_idle_override) + if (disabled_by_idle_boot_param()) return 0; if (!pr) @@ -1089,19 +1092,10 @@ int __cpuinit acpi_processor_power_init(struct acpi_processor *pr, acpi_status status = 0; static int first_run; - if (boot_option_idle_override) + if (disabled_by_idle_boot_param()) return 0; if (!first_run) { - if (idle_halt) { - /* - * When the boot option of "idle=halt" is added, halt - * is used for CPU IDLE. - * In such case C2/C3 is meaningless. So the max_cstate - * is set to one. - */ - max_cstate = 1; - } dmi_check_system(processor_power_dmi_table); max_cstate = acpi_processor_cstate_check(max_cstate); if (max_cstate < ACPI_C_STATES_MAX) @@ -1142,7 +1136,7 @@ int __cpuinit acpi_processor_power_init(struct acpi_processor *pr, int acpi_processor_power_exit(struct acpi_processor *pr, struct acpi_device *device) { - if (boot_option_idle_override) + if (disabled_by_idle_boot_param()) return 0; cpuidle_unregister_device(&pr->power.dev); diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index ff36327..fa84e97 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -32,10 +32,6 @@ #include #include #include -#ifdef CONFIG_ACPI_PROCFS -#include -#include -#endif #include #include @@ -370,6 +366,58 @@ int acpi_processor_tstate_has_changed(struct acpi_processor *pr) } /* + * This function is used to reevaluate whether the T-state is valid + * after one CPU is onlined/offlined. + * It is noted that it won't reevaluate the following properties for + * the T-state. + * 1. Control method. + * 2. the number of supported T-state + * 3. TSD domain + */ +void acpi_processor_reevaluate_tstate(struct acpi_processor *pr, + unsigned long action) +{ + int result = 0; + + if (action == CPU_DEAD) { + /* When one CPU is offline, the T-state throttling + * will be invalidated. + */ + pr->flags.throttling = 0; + return; + } + /* the following is to recheck whether the T-state is valid for + * the online CPU + */ + if (!pr->throttling.state_count) { + /* If the number of T-state is invalid, it is + * invalidated. + */ + pr->flags.throttling = 0; + return; + } + pr->flags.throttling = 1; + + /* Disable throttling (if enabled). We'll let subsequent + * policy (e.g.thermal) decide to lower performance if it + * so chooses, but for now we'll crank up the speed. + */ + + result = acpi_processor_get_throttling(pr); + if (result) + goto end; + + if (pr->throttling.state) { + result = acpi_processor_set_throttling(pr, 0, false); + if (result) + goto end; + } + +end: + if (result) + pr->flags.throttling = 0; +} +/* * _PTC - Processor Throttling Control (and status) register location */ static int acpi_processor_get_throttling_control(struct acpi_processor *pr) @@ -876,7 +924,11 @@ static int acpi_processor_get_throttling(struct acpi_processor *pr) */ cpumask_copy(saved_mask, ¤t->cpus_allowed); /* FIXME: use work_on_cpu() */ - set_cpus_allowed_ptr(current, cpumask_of(pr->id)); + if (set_cpus_allowed_ptr(current, cpumask_of(pr->id))) { + /* Can't migrate to the target pr->id CPU. Exit */ + free_cpumask_var(saved_mask); + return -ENODEV; + } ret = pr->throttling.acpi_processor_get_throttling(pr); /* restore the previous state */ set_cpus_allowed_ptr(current, saved_mask); @@ -1051,6 +1103,14 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, return -ENOMEM; } + if (cpu_is_offline(pr->id)) { + /* + * the cpu pointed by pr->id is offline. Unnecessary to change + * the throttling state any more. + */ + return -ENODEV; + } + cpumask_copy(saved_mask, ¤t->cpus_allowed); t_state.target_state = state; p_throttling = &(pr->throttling); @@ -1074,7 +1134,11 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, */ if (p_throttling->shared_type == DOMAIN_COORD_TYPE_SW_ANY) { /* FIXME: use work_on_cpu() */ - set_cpus_allowed_ptr(current, cpumask_of(pr->id)); + if (set_cpus_allowed_ptr(current, cpumask_of(pr->id))) { + /* Can't migrate to the pr->id CPU. Exit */ + ret = -ENODEV; + goto exit; + } ret = p_throttling->acpi_processor_set_throttling(pr, t_state.target_state, force); } else { @@ -1106,7 +1170,8 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, } t_state.cpu = i; /* FIXME: use work_on_cpu() */ - set_cpus_allowed_ptr(current, cpumask_of(i)); + if (set_cpus_allowed_ptr(current, cpumask_of(i))) + continue; ret = match_pr->throttling. acpi_processor_set_throttling( match_pr, t_state.target_state, force); @@ -1126,6 +1191,7 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, /* restore the previous state */ /* FIXME: use work_on_cpu() */ set_cpus_allowed_ptr(current, saved_mask); +exit: free_cpumask_var(online_throttling_cpus); free_cpumask_var(saved_mask); return ret; @@ -1216,113 +1282,3 @@ int acpi_processor_get_throttling_info(struct acpi_processor *pr) return result; } -#ifdef CONFIG_ACPI_PROCFS -/* proc interface */ -static int acpi_processor_throttling_seq_show(struct seq_file *seq, - void *offset) -{ - struct acpi_processor *pr = seq->private; - int i = 0; - int result = 0; - - if (!pr) - goto end; - - if (!(pr->throttling.state_count > 0)) { - seq_puts(seq, "\n"); - goto end; - } - - result = acpi_processor_get_throttling(pr); - - if (result) { - seq_puts(seq, - "Could not determine current throttling state.\n"); - goto end; - } - - seq_printf(seq, "state count: %d\n" - "active state: T%d\n" - "state available: T%d to T%d\n", - pr->throttling.state_count, pr->throttling.state, - pr->throttling_platform_limit, - pr->throttling.state_count - 1); - - seq_puts(seq, "states:\n"); - if (pr->throttling.acpi_processor_get_throttling == - acpi_processor_get_throttling_fadt) { - for (i = 0; i < pr->throttling.state_count; i++) - seq_printf(seq, " %cT%d: %02d%%\n", - (i == pr->throttling.state ? '*' : ' '), i, - (pr->throttling.states[i].performance ? pr-> - throttling.states[i].performance / 10 : 0)); - } else { - for (i = 0; i < pr->throttling.state_count; i++) - seq_printf(seq, " %cT%d: %02d%%\n", - (i == pr->throttling.state ? '*' : ' '), i, - (int)pr->throttling.states_tss[i]. - freqpercentage); - } - - end: - return 0; -} - -static int acpi_processor_throttling_open_fs(struct inode *inode, - struct file *file) -{ - return single_open(file, acpi_processor_throttling_seq_show, - PDE(inode)->data); -} - -static ssize_t acpi_processor_write_throttling(struct file *file, - const char __user * buffer, - size_t count, loff_t * data) -{ - int result = 0; - struct seq_file *m = file->private_data; - struct acpi_processor *pr = m->private; - char state_string[5] = ""; - char *charp = NULL; - size_t state_val = 0; - char tmpbuf[5] = ""; - - if (!pr || (count > sizeof(state_string) - 1)) - return -EINVAL; - - if (copy_from_user(state_string, buffer, count)) - return -EFAULT; - - state_string[count] = '\0'; - if ((count > 0) && (state_string[count-1] == '\n')) - state_string[count-1] = '\0'; - - charp = state_string; - if ((state_string[0] == 't') || (state_string[0] == 'T')) - charp++; - - state_val = simple_strtoul(charp, NULL, 0); - if (state_val >= pr->throttling.state_count) - return -EINVAL; - - snprintf(tmpbuf, 5, "%zu", state_val); - - if (strcmp(tmpbuf, charp) != 0) - return -EINVAL; - - result = acpi_processor_set_throttling(pr, state_val, false); - if (result) - return result; - - return count; -} - -const struct file_operations acpi_processor_throttling_fops = { - .owner = THIS_MODULE, - .open = acpi_processor_throttling_open_fs, - .read = seq_read, - .write = acpi_processor_write_throttling, - .llseek = seq_lseek, - .release = single_release, -}; -#endif diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index e5dbedb..51ae379 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -484,6 +484,8 @@ acpi_sbs_add_fs(struct proc_dir_entry **dir, const struct file_operations *state_fops, const struct file_operations *alarm_fops, void *data) { + printk(KERN_WARNING PREFIX "Deprecated procfs I/F for SBS is loaded," + " please retry with CONFIG_ACPI_PROCFS_POWER cleared\n"); if (!*dir) { *dir = proc_mkdir(dir_name, parent_dir); if (!*dir) { diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 29ef505..b99e624 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -778,7 +778,7 @@ acpi_bus_extract_wakeup_device_power_package(acpi_handle handle, wakeup->resources.handles[i] = element->reference.handle; } - acpi_gpe_can_wake(wakeup->gpe_device, wakeup->gpe_number); + acpi_setup_gpe_for_wake(handle, wakeup->gpe_device, wakeup->gpe_number); out: kfree(buffer.pointer); @@ -803,7 +803,7 @@ static void acpi_bus_set_run_wake_flags(struct acpi_device *device) /* Power button, Lid switch always enable wakeup */ if (!acpi_match_device_ids(device, button_device_ids)) { device->wakeup.flags.run_wake = 1; - device->wakeup.flags.always_enabled = 1; + device_set_wakeup_capable(&device->dev, true); return; } @@ -815,16 +815,22 @@ static void acpi_bus_set_run_wake_flags(struct acpi_device *device) !!(event_status & ACPI_EVENT_FLAG_HANDLE); } -static int acpi_bus_get_wakeup_device_flags(struct acpi_device *device) +static void acpi_bus_get_wakeup_device_flags(struct acpi_device *device) { + acpi_handle temp; acpi_status status = 0; int psw_error; + /* Presence of _PRW indicates wake capable */ + status = acpi_get_handle(device->handle, "_PRW", &temp); + if (ACPI_FAILURE(status)) + return; + status = acpi_bus_extract_wakeup_device_power_package(device->handle, &device->wakeup); if (ACPI_FAILURE(status)) { ACPI_EXCEPTION((AE_INFO, status, "Extracting _PRW package")); - goto end; + return; } device->wakeup.flags.valid = 1; @@ -840,13 +846,10 @@ static int acpi_bus_get_wakeup_device_flags(struct acpi_device *device) if (psw_error) ACPI_DEBUG_PRINT((ACPI_DB_INFO, "error in _DSW or _PSW evaluation\n")); - -end: - if (ACPI_FAILURE(status)) - device->flags.wake_capable = 0; - return 0; } +static void acpi_bus_add_power_resource(acpi_handle handle); + static int acpi_bus_get_power_flags(struct acpi_device *device) { acpi_status status = 0; @@ -875,8 +878,12 @@ static int acpi_bus_get_power_flags(struct acpi_device *device) acpi_evaluate_reference(device->handle, object_name, NULL, &ps->resources); if (ps->resources.count) { + int j; + device->power.flags.power_resources = 1; ps->flags.valid = 1; + for (j = 0; j < ps->resources.count; j++) + acpi_bus_add_power_resource(ps->resources.handles[j]); } /* Evaluate "_PSx" to see if we can do explicit sets */ @@ -901,10 +908,7 @@ static int acpi_bus_get_power_flags(struct acpi_device *device) device->power.states[ACPI_STATE_D3].flags.valid = 1; device->power.states[ACPI_STATE_D3].power = 0; - /* TBD: System wake support and resource requirements. */ - - device->power.state = ACPI_STATE_UNKNOWN; - acpi_bus_get_power(device->handle, &(device->power.state)); + acpi_bus_init_power(device); return 0; } @@ -947,11 +951,6 @@ static int acpi_bus_get_flags(struct acpi_device *device) if (ACPI_SUCCESS(status)) device->flags.power_manageable = 1; - /* Presence of _PRW indicates wake capable */ - status = acpi_get_handle(device->handle, "_PRW", &temp); - if (ACPI_SUCCESS(status)) - device->flags.wake_capable = 1; - /* TBD: Performance management */ return 0; @@ -1278,11 +1277,7 @@ static int acpi_add_single_object(struct acpi_device **child, * Wakeup device management *----------------------- */ - if (device->flags.wake_capable) { - result = acpi_bus_get_wakeup_device_flags(device); - if (result) - goto end; - } + acpi_bus_get_wakeup_device_flags(device); /* * Performance Management @@ -1326,6 +1321,20 @@ end: #define ACPI_STA_DEFAULT (ACPI_STA_DEVICE_PRESENT | ACPI_STA_DEVICE_ENABLED | \ ACPI_STA_DEVICE_UI | ACPI_STA_DEVICE_FUNCTIONING) +static void acpi_bus_add_power_resource(acpi_handle handle) +{ + struct acpi_bus_ops ops = { + .acpi_op_add = 1, + .acpi_op_start = 1, + }; + struct acpi_device *device = NULL; + + acpi_bus_get_device(handle, &device); + if (!device) + acpi_add_single_object(&device, handle, ACPI_BUS_TYPE_POWER, + ACPI_STA_DEFAULT, &ops); +} + static int acpi_bus_type_and_status(acpi_handle handle, int *type, unsigned long long *sta) { @@ -1371,7 +1380,6 @@ static acpi_status acpi_bus_check_add(acpi_handle handle, u32 lvl, struct acpi_bus_ops *ops = context; int type; unsigned long long sta; - struct acpi_device_wakeup wakeup; struct acpi_device *device; acpi_status status; int result; @@ -1382,7 +1390,13 @@ static acpi_status acpi_bus_check_add(acpi_handle handle, u32 lvl, if (!(sta & ACPI_STA_DEVICE_PRESENT) && !(sta & ACPI_STA_DEVICE_FUNCTIONING)) { - acpi_bus_extract_wakeup_device_power_package(handle, &wakeup); + struct acpi_device_wakeup wakeup; + acpi_handle temp; + + status = acpi_get_handle(handle, "_PRW", &temp); + if (ACPI_SUCCESS(status)) + acpi_bus_extract_wakeup_device_power_package(handle, + &wakeup); return AE_CTRL_DEPTH; } @@ -1467,7 +1481,7 @@ int acpi_bus_start(struct acpi_device *device) result = acpi_bus_scan(device->handle, &ops, NULL); - acpi_update_gpes(); + acpi_update_all_gpes(); return result; } @@ -1573,6 +1587,8 @@ int __init acpi_scan_init(void) printk(KERN_ERR PREFIX "Could not register bus type\n"); } + acpi_power_init(); + /* * Enumerate devices in the ACPI namespace. */ @@ -1584,7 +1600,7 @@ int __init acpi_scan_init(void) if (result) acpi_device_unregister(acpi_root, ACPI_BUS_REMOVAL_NORMAL); else - acpi_update_gpes(); + acpi_update_all_gpes(); return result; } diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index c423231..fdd3aee 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -124,8 +124,7 @@ static int acpi_pm_freeze(void) static int acpi_pm_pre_suspend(void) { acpi_pm_freeze(); - suspend_nvs_save(); - return 0; + return suspend_nvs_save(); } /** @@ -151,7 +150,7 @@ static int acpi_pm_prepare(void) { int error = __acpi_pm_prepare(); if (!error) - acpi_pm_pre_suspend(); + error = acpi_pm_pre_suspend(); return error; } @@ -435,6 +434,14 @@ static struct dmi_system_id __initdata acpisleep_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "VGN-NW130D"), }, }, + { + .callback = init_nvs_nosave, + .ident = "Averatec AV1020-ED2", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "AVERATEC"), + DMI_MATCH(DMI_PRODUCT_NAME, "1000 Series"), + }, + }, {}, }; #endif /* CONFIG_SUSPEND */ diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index f8588f8..61891e7 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -438,7 +438,7 @@ static void delete_gpe_attr_array(void) return; } -void acpi_os_gpe_count(u32 gpe_number) +static void gpe_count(u32 gpe_number) { acpi_gpe_count++; @@ -454,7 +454,7 @@ void acpi_os_gpe_count(u32 gpe_number) return; } -void acpi_os_fixed_event_count(u32 event_number) +static void fixed_event_count(u32 event_number) { if (!all_counters) return; @@ -468,6 +468,16 @@ void acpi_os_fixed_event_count(u32 event_number) return; } +static void acpi_gbl_event_handler(u32 event_type, acpi_handle device, + u32 event_number, void *context) +{ + if (event_type == ACPI_EVENT_TYPE_GPE) + gpe_count(event_number); + + if (event_type == ACPI_EVENT_TYPE_FIXED) + fixed_event_count(event_number); +} + static int get_status(u32 index, acpi_event_status *status, acpi_handle *handle) { @@ -601,6 +611,7 @@ end: void acpi_irq_stats_init(void) { + acpi_status status; int i; if (all_counters) @@ -619,6 +630,10 @@ void acpi_irq_stats_init(void) if (all_counters == NULL) goto fail; + status = acpi_install_global_event_handler(acpi_gbl_event_handler, NULL); + if (ACPI_FAILURE(status)) + goto fail; + counter_attrs = kzalloc(sizeof(struct kobj_attribute) * (num_counters), GFP_KERNEL); if (counter_attrs == NULL) diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 5a27b0a..2607e17 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -1059,8 +1059,9 @@ static int acpi_thermal_resume(struct acpi_device *device) break; tz->trips.active[i].flags.enabled = 1; for (j = 0; j < tz->trips.active[i].devices.count; j++) { - result = acpi_bus_get_power(tz->trips.active[i].devices. - handles[j], &power_state); + result = acpi_bus_update_power( + tz->trips.active[i].devices.handles[j], + &power_state); if (result || (power_state != ACPI_STATE_D0)) { tz->trips.active[i].flags.enabled = 0; break; diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index 15a0fde..90f8f76 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -81,6 +80,13 @@ module_param(brightness_switch_enabled, bool, 0644); static int allow_duplicates; module_param(allow_duplicates, bool, 0644); +/* + * Some BIOSes claim they use minimum backlight at boot, + * and this may bring dimming screen after boot + */ +static int use_bios_initial_backlight = 1; +module_param(use_bios_initial_backlight, bool, 0644); + static int register_count = 0; static int acpi_video_bus_add(struct acpi_device *device); static int acpi_video_bus_remove(struct acpi_device *device, int type); @@ -172,9 +178,6 @@ struct acpi_video_device_cap { u8 _BQC:1; /* Get current brightness level */ u8 _BCQ:1; /* Some buggy BIOS uses _BCQ instead of _BQC */ u8 _DDC:1; /*Return the EDID for this device */ - u8 _DCS:1; /*Return status of output device */ - u8 _DGS:1; /*Query graphics state */ - u8 _DSS:1; /*Device state set */ }; struct acpi_video_brightness_flags { @@ -202,7 +205,6 @@ struct acpi_video_device { struct acpi_video_device_brightness *brightness; struct backlight_device *backlight; struct thermal_cooling_device *cooling_dev; - struct output_device *output_dev; }; static const char device_decode[][30] = { @@ -226,10 +228,6 @@ static int acpi_video_get_next_level(struct acpi_video_device *device, u32 level_current, u32 event); static int acpi_video_switch_brightness(struct acpi_video_device *device, int event); -static int acpi_video_device_get_state(struct acpi_video_device *device, - unsigned long long *state); -static int acpi_video_output_get(struct output_device *od); -static int acpi_video_device_set_state(struct acpi_video_device *device, int state); /*backlight device sysfs support*/ static int acpi_video_get_brightness(struct backlight_device *bd) @@ -265,30 +263,6 @@ static const struct backlight_ops acpi_backlight_ops = { .update_status = acpi_video_set_brightness, }; -/*video output device sysfs support*/ -static int acpi_video_output_get(struct output_device *od) -{ - unsigned long long state; - struct acpi_video_device *vd = - (struct acpi_video_device *)dev_get_drvdata(&od->dev); - acpi_video_device_get_state(vd, &state); - return (int)state; -} - -static int acpi_video_output_set(struct output_device *od) -{ - unsigned long state = od->request_state; - struct acpi_video_device *vd= - (struct acpi_video_device *)dev_get_drvdata(&od->dev); - return acpi_video_device_set_state(vd, state); -} - -static struct output_properties acpi_output_properties = { - .set_state = acpi_video_output_set, - .get_status = acpi_video_output_get, -}; - - /* thermal cooling device callbacks */ static int video_get_max_state(struct thermal_cooling_device *cooling_dev, unsigned long *state) @@ -344,34 +318,6 @@ static struct thermal_cooling_device_ops video_cooling_ops = { Video Management -------------------------------------------------------------------------- */ -/* device */ - -static int -acpi_video_device_get_state(struct acpi_video_device *device, - unsigned long long *state) -{ - int status; - - status = acpi_evaluate_integer(device->dev->handle, "_DCS", NULL, state); - - return status; -} - -static int -acpi_video_device_set_state(struct acpi_video_device *device, int state) -{ - int status; - union acpi_object arg0 = { ACPI_TYPE_INTEGER }; - struct acpi_object_list args = { 1, &arg0 }; - unsigned long long ret; - - - arg0.integer.value = state; - status = acpi_evaluate_integer(device->dev->handle, "_DSS", &args, &ret); - - return status; -} - static int acpi_video_device_lcd_query_levels(struct acpi_video_device *device, union acpi_object **levels) @@ -766,9 +712,11 @@ acpi_video_init_brightness(struct acpi_video_device *device) * when invoked for the first time, i.e. level_old is invalid. * set the backlight to max_level in this case */ - for (i = 2; i < br->count; i++) - if (level_old == br->levels[i]) - level = level_old; + if (use_bios_initial_backlight) { + for (i = 2; i < br->count; i++) + if (level_old == br->levels[i]) + level = level_old; + } goto set_level; } @@ -831,15 +779,6 @@ static void acpi_video_device_find_cap(struct acpi_video_device *device) if (ACPI_SUCCESS(acpi_get_handle(device->dev->handle, "_DDC", &h_dummy1))) { device->cap._DDC = 1; } - if (ACPI_SUCCESS(acpi_get_handle(device->dev->handle, "_DCS", &h_dummy1))) { - device->cap._DCS = 1; - } - if (ACPI_SUCCESS(acpi_get_handle(device->dev->handle, "_DGS", &h_dummy1))) { - device->cap._DGS = 1; - } - if (ACPI_SUCCESS(acpi_get_handle(device->dev->handle, "_DSS", &h_dummy1))) { - device->cap._DSS = 1; - } if (acpi_video_backlight_support()) { struct backlight_properties props; @@ -904,21 +843,6 @@ static void acpi_video_device_find_cap(struct acpi_video_device *device) printk(KERN_ERR PREFIX "Create sysfs link\n"); } - - if (acpi_video_display_switch_support()) { - - if (device->cap._DCS && device->cap._DSS) { - static int count; - char *name; - name = kasprintf(GFP_KERNEL, "acpi_video%d", count); - if (!name) - return; - count++; - device->output_dev = video_output_register(name, - NULL, device, &acpi_output_properties); - kfree(name); - } - } } /* @@ -1360,6 +1284,9 @@ int acpi_video_get_edid(struct acpi_device *device, int type, int device_id, if (!video_device) continue; + if (!video_device->cap._DDC) + continue; + if (type) { switch (type) { case ACPI_VIDEO_DISPLAY_CRT: @@ -1452,7 +1379,6 @@ static int acpi_video_bus_put_one_device(struct acpi_video_device *device) thermal_cooling_device_unregister(device->cooling_dev); device->cooling_dev = NULL; } - video_output_unregister(device->output_dev); return 0; } diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c index b836761..42d3d72 100644 --- a/drivers/acpi/video_detect.c +++ b/drivers/acpi/video_detect.c @@ -17,15 +17,14 @@ * capabilities the graphics cards plugged in support. The check for general * video capabilities will be triggered by the first caller of * acpi_video_get_capabilities(NULL); which will happen when the first - * backlight (or display output) switching supporting driver calls: + * backlight switching supporting driver calls: * acpi_video_backlight_support(); * * Depending on whether ACPI graphics extensions (cmp. ACPI spec Appendix B) * are available, video.ko should be used to handle the device. * * Otherwise vendor specific drivers like thinkpad_acpi, asus_acpi, - * sony_acpi,... can take care about backlight brightness and display output - * switching. + * sony_acpi,... can take care about backlight brightness. * * If CONFIG_ACPI_VIDEO is neither set as "compiled in" (y) nor as a module (m) * this file will not be compiled, acpi_video_get_capabilities() and @@ -83,11 +82,6 @@ long acpi_is_video_device(struct acpi_device *device) if (!device) return 0; - /* Is this device able to support video switching ? */ - if (ACPI_SUCCESS(acpi_get_handle(device->handle, "_DOD", &h_dummy)) || - ACPI_SUCCESS(acpi_get_handle(device->handle, "_DOS", &h_dummy))) - video_caps |= ACPI_VIDEO_OUTPUT_SWITCHING; - /* Is this device able to retrieve a video ROM ? */ if (ACPI_SUCCESS(acpi_get_handle(device->handle, "_ROM", &h_dummy))) video_caps |= ACPI_VIDEO_ROM_AVAILABLE; @@ -161,8 +155,6 @@ long acpi_video_get_capabilities(acpi_handle graphics_handle) * * if (dmi_name_in_vendors("XY")) { * acpi_video_support |= - * ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VENDOR; - * acpi_video_support |= * ACPI_VIDEO_BACKLIGHT_DMI_VENDOR; *} */ @@ -212,33 +204,8 @@ int acpi_video_backlight_support(void) EXPORT_SYMBOL(acpi_video_backlight_support); /* - * Returns true if video.ko can do display output switching. - * This does not work well/at all with binary graphics drivers - * which disable system io ranges and do it on their own. - */ -int acpi_video_display_switch_support(void) -{ - if (!acpi_video_caps_checked) - acpi_video_get_capabilities(NULL); - - if (acpi_video_support & ACPI_VIDEO_OUTPUT_SWITCHING_FORCE_VENDOR) - return 0; - else if (acpi_video_support & ACPI_VIDEO_OUTPUT_SWITCHING_FORCE_VIDEO) - return 1; - - if (acpi_video_support & ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VENDOR) - return 0; - else if (acpi_video_support & ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VIDEO) - return 1; - - return acpi_video_support & ACPI_VIDEO_OUTPUT_SWITCHING; -} -EXPORT_SYMBOL(acpi_video_display_switch_support); - -/* - * Use acpi_display_output=vendor/video or acpi_backlight=vendor/video - * To force that backlight or display output switching is processed by vendor - * specific acpi drivers or video.ko driver. + * Use acpi_backlight=vendor/video to force that backlight switching + * is processed by vendor specific acpi drivers or video.ko driver. */ static int __init acpi_backlight(char *str) { @@ -255,19 +222,3 @@ static int __init acpi_backlight(char *str) return 1; } __setup("acpi_backlight=", acpi_backlight); - -static int __init acpi_display_output(char *str) -{ - if (str == NULL || *str == '\0') - return 1; - else { - if (!strcmp("vendor", str)) - acpi_video_support |= - ACPI_VIDEO_OUTPUT_SWITCHING_FORCE_VENDOR; - if (!strcmp("video", str)) - acpi_video_support |= - ACPI_VIDEO_OUTPUT_SWITCHING_FORCE_VIDEO; - } - return 1; -} -__setup("acpi_display_output=", acpi_display_output); diff --git a/drivers/acpi/wakeup.c b/drivers/acpi/wakeup.c index f62a50c..ed65014 100644 --- a/drivers/acpi/wakeup.c +++ b/drivers/acpi/wakeup.c @@ -37,15 +37,16 @@ void acpi_enable_wakeup_devices(u8 sleep_state) container_of(node, struct acpi_device, wakeup_list); if (!dev->wakeup.flags.valid - || !(dev->wakeup.state.enabled || dev->wakeup.prepare_count) - || sleep_state > (u32) dev->wakeup.sleep_state) + || sleep_state > (u32) dev->wakeup.sleep_state + || !(device_may_wakeup(&dev->dev) + || dev->wakeup.prepare_count)) continue; - if (dev->wakeup.state.enabled) + if (device_may_wakeup(&dev->dev)) acpi_enable_wakeup_device_power(dev, sleep_state); /* The wake-up power should have been enabled already. */ - acpi_gpe_wakeup(dev->wakeup.gpe_device, dev->wakeup.gpe_number, + acpi_set_gpe_wake_mask(dev->wakeup.gpe_device, dev->wakeup.gpe_number, ACPI_GPE_ENABLE); } } @@ -63,14 +64,15 @@ void acpi_disable_wakeup_devices(u8 sleep_state) container_of(node, struct acpi_device, wakeup_list); if (!dev->wakeup.flags.valid - || !(dev->wakeup.state.enabled || dev->wakeup.prepare_count) - || (sleep_state > (u32) dev->wakeup.sleep_state)) + || sleep_state > (u32) dev->wakeup.sleep_state + || !(device_may_wakeup(&dev->dev) + || dev->wakeup.prepare_count)) continue; - acpi_gpe_wakeup(dev->wakeup.gpe_device, dev->wakeup.gpe_number, + acpi_set_gpe_wake_mask(dev->wakeup.gpe_device, dev->wakeup.gpe_number, ACPI_GPE_DISABLE); - if (dev->wakeup.state.enabled) + if (device_may_wakeup(&dev->dev)) acpi_disable_wakeup_device_power(dev); } } @@ -84,8 +86,8 @@ int __init acpi_wakeup_device_init(void) struct acpi_device *dev = container_of(node, struct acpi_device, wakeup_list); - if (dev->wakeup.flags.always_enabled) - dev->wakeup.state.enabled = 1; + if (device_can_wakeup(&dev->dev)) + device_set_wakeup_enable(&dev->dev, true); } mutex_unlock(&acpi_device_lock); return 0; diff --git a/drivers/base/node.c b/drivers/base/node.c index ce012a9..36b4305 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -117,12 +117,21 @@ static ssize_t node_read_meminfo(struct sys_device * dev, "Node %d WritebackTmp: %8lu kB\n" "Node %d Slab: %8lu kB\n" "Node %d SReclaimable: %8lu kB\n" - "Node %d SUnreclaim: %8lu kB\n", + "Node %d SUnreclaim: %8lu kB\n" +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "Node %d AnonHugePages: %8lu kB\n" +#endif + , nid, K(node_page_state(nid, NR_FILE_DIRTY)), nid, K(node_page_state(nid, NR_WRITEBACK)), nid, K(node_page_state(nid, NR_FILE_PAGES)), nid, K(node_page_state(nid, NR_FILE_MAPPED)), - nid, K(node_page_state(nid, NR_ANON_PAGES)), + nid, K(node_page_state(nid, NR_ANON_PAGES) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + + node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR +#endif + ), nid, K(node_page_state(nid, NR_SHMEM)), nid, node_page_state(nid, NR_KERNEL_STACK) * THREAD_SIZE / 1024, @@ -133,7 +142,13 @@ static ssize_t node_read_meminfo(struct sys_device * dev, nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) + node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)), - nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE))); + nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + , nid, + K(node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR) +#endif + ); n += hugetlb_report_node_meminfo(nid, buf + n); return n; } diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 2fe72f8..38223e9 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -970,6 +970,33 @@ out_kfree: } EXPORT_SYMBOL(ipmi_create_user); +int ipmi_get_smi_info(int if_num, struct ipmi_smi_info *data) +{ + int rv = 0; + ipmi_smi_t intf; + struct ipmi_smi_handlers *handlers; + + mutex_lock(&ipmi_interfaces_mutex); + list_for_each_entry_rcu(intf, &ipmi_interfaces, link) { + if (intf->intf_num == if_num) + goto found; + } + /* Not found, return an error */ + rv = -EINVAL; + mutex_unlock(&ipmi_interfaces_mutex); + return rv; + +found: + handlers = intf->handlers; + rv = -ENOSYS; + if (handlers->get_smi_info) + rv = handlers->get_smi_info(intf->send_info, data); + mutex_unlock(&ipmi_interfaces_mutex); + + return rv; +} +EXPORT_SYMBOL(ipmi_get_smi_info); + static void free_user(struct kref *ref) { ipmi_user_t user = container_of(ref, struct ipmi_user, refcount); diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index f27c04e..b6ae6e9 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include "ipmi_si_sm.h" @@ -109,10 +110,6 @@ enum si_type { }; static char *si_to_str[] = { "kcs", "smic", "bt" }; -enum ipmi_addr_src { - SI_INVALID = 0, SI_HOTMOD, SI_HARDCODED, SI_SPMI, SI_ACPI, SI_SMBIOS, - SI_PCI, SI_DEVICETREE, SI_DEFAULT -}; static char *ipmi_addr_src_to_str[] = { NULL, "hotmod", "hardcoded", "SPMI", "ACPI", "SMBIOS", "PCI", "device-tree", "default" }; @@ -293,6 +290,7 @@ struct smi_info { struct task_struct *thread; struct list_head link; + union ipmi_smi_info_union addr_info; }; #define smi_inc_stat(smi, stat) \ @@ -1188,6 +1186,18 @@ static int smi_start_processing(void *send_info, return 0; } +static int get_smi_info(void *send_info, struct ipmi_smi_info *data) +{ + struct smi_info *smi = send_info; + + data->addr_src = smi->addr_source; + data->dev = smi->dev; + data->addr_info = smi->addr_info; + get_device(smi->dev); + + return 0; +} + static void set_maintenance_mode(void *send_info, int enable) { struct smi_info *smi_info = send_info; @@ -1199,6 +1209,7 @@ static void set_maintenance_mode(void *send_info, int enable) static struct ipmi_smi_handlers handlers = { .owner = THIS_MODULE, .start_processing = smi_start_processing, + .get_smi_info = get_smi_info, .sender = sender, .request_events = request_events, .set_maintenance_mode = set_maintenance_mode, @@ -1930,7 +1941,8 @@ static void __devinit hardcode_find_bmc(void) static int acpi_failure; /* For GPE-type interrupts. */ -static u32 ipmi_acpi_gpe(void *context) +static u32 ipmi_acpi_gpe(acpi_handle gpe_device, + u32 gpe_number, void *context) { struct smi_info *smi_info = context; unsigned long flags; @@ -2158,6 +2170,7 @@ static int __devinit ipmi_pnp_probe(struct pnp_dev *dev, printk(KERN_INFO PFX "probing via ACPI\n"); handle = acpi_dev->handle; + info->addr_info.acpi_info.acpi_handle = handle; /* _IFT tells us the interface type: KCS, BT, etc */ status = acpi_evaluate_integer(handle, "_IFT", NULL, &tmp); diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 386888f..bf50924 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -96,7 +96,15 @@ static void cpuidle_idle_call(void) /* enter the state and update stats */ dev->last_state = target_state; + + trace_power_start(POWER_CSTATE, next_state, dev->cpu); + trace_cpu_idle(next_state, dev->cpu); + dev->last_residency = target_state->enter(dev, target_state); + + trace_power_end(dev->cpu); + trace_cpu_idle(PWR_EVENT_EXIT, dev->cpu); + if (dev->last_state) target_state = dev->last_state; @@ -106,8 +114,6 @@ static void cpuidle_idle_call(void) /* give the governor an opportunity to reflect on the outcome */ if (cpuidle_curr_governor->reflect) cpuidle_curr_governor->reflect(dev); - trace_power_end(smp_processor_id()); - trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } /** @@ -155,6 +161,45 @@ void cpuidle_resume_and_unlock(void) EXPORT_SYMBOL_GPL(cpuidle_resume_and_unlock); +#ifdef CONFIG_ARCH_HAS_CPU_RELAX +static int poll_idle(struct cpuidle_device *dev, struct cpuidle_state *st) +{ + ktime_t t1, t2; + s64 diff; + int ret; + + t1 = ktime_get(); + local_irq_enable(); + while (!need_resched()) + cpu_relax(); + + t2 = ktime_get(); + diff = ktime_to_us(ktime_sub(t2, t1)); + if (diff > INT_MAX) + diff = INT_MAX; + + ret = (int) diff; + return ret; +} + +static void poll_idle_init(struct cpuidle_device *dev) +{ + struct cpuidle_state *state = &dev->states[0]; + + cpuidle_set_statedata(state, NULL); + + snprintf(state->name, CPUIDLE_NAME_LEN, "POLL"); + snprintf(state->desc, CPUIDLE_DESC_LEN, "CPUIDLE CORE POLL IDLE"); + state->exit_latency = 0; + state->target_residency = 0; + state->power_usage = -1; + state->flags = 0; + state->enter = poll_idle; +} +#else +static void poll_idle_init(struct cpuidle_device *dev) {} +#endif /* CONFIG_ARCH_HAS_CPU_RELAX */ + /** * cpuidle_enable_device - enables idle PM for a CPU * @dev: the CPU @@ -179,6 +224,8 @@ int cpuidle_enable_device(struct cpuidle_device *dev) return ret; } + poll_idle_init(dev); + if ((ret = cpuidle_add_state_sysfs(dev))) return ret; @@ -233,45 +280,6 @@ void cpuidle_disable_device(struct cpuidle_device *dev) EXPORT_SYMBOL_GPL(cpuidle_disable_device); -#ifdef CONFIG_ARCH_HAS_CPU_RELAX -static int poll_idle(struct cpuidle_device *dev, struct cpuidle_state *st) -{ - ktime_t t1, t2; - s64 diff; - int ret; - - t1 = ktime_get(); - local_irq_enable(); - while (!need_resched()) - cpu_relax(); - - t2 = ktime_get(); - diff = ktime_to_us(ktime_sub(t2, t1)); - if (diff > INT_MAX) - diff = INT_MAX; - - ret = (int) diff; - return ret; -} - -static void poll_idle_init(struct cpuidle_device *dev) -{ - struct cpuidle_state *state = &dev->states[0]; - - cpuidle_set_statedata(state, NULL); - - snprintf(state->name, CPUIDLE_NAME_LEN, "C0"); - snprintf(state->desc, CPUIDLE_DESC_LEN, "CPUIDLE CORE POLL IDLE"); - state->exit_latency = 0; - state->target_residency = 0; - state->power_usage = -1; - state->flags = CPUIDLE_FLAG_POLL; - state->enter = poll_idle; -} -#else -static void poll_idle_init(struct cpuidle_device *dev) {} -#endif /* CONFIG_ARCH_HAS_CPU_RELAX */ - /** * __cpuidle_register_device - internal register function called before register * and enable routines @@ -292,8 +300,6 @@ static int __cpuidle_register_device(struct cpuidle_device *dev) init_completion(&dev->kobj_unregister); - poll_idle_init(dev); - /* * cpuidle driver should set the dev->power_specified bit * before registering the device if the driver provides diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 7af4436..64828a7 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -107,7 +107,6 @@ config DRM_I915 select FB_CFB_IMAGEBLIT # i915 depends on ACPI_VIDEO when ACPI is enabled # but for select to work, need to select ACPI_VIDEO's dependencies, ick - select VIDEO_OUTPUT_CONTROL if ACPI select BACKLIGHT_CLASS_DEVICE if ACPI select INPUT if ACPI select ACPI_VIDEO if ACPI diff --git a/drivers/gpu/stub/Kconfig b/drivers/gpu/stub/Kconfig index 0e1edd7..09aea5f 100644 --- a/drivers/gpu/stub/Kconfig +++ b/drivers/gpu/stub/Kconfig @@ -3,7 +3,6 @@ config STUB_POULSBO depends on PCI # Poulsbo stub depends on ACPI_VIDEO when ACPI is enabled # but for select to work, need to select ACPI_VIDEO's dependencies, ick - select VIDEO_OUTPUT_CONTROL if ACPI select BACKLIGHT_CLASS_DEVICE if ACPI select INPUT if ACPI select ACPI_VIDEO if ACPI diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 56ac09d..7acb32e 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -59,6 +59,8 @@ #include /* ktime_get_real() */ #include #include +#include +#include #include #define INTEL_IDLE_VERSION "0.4" @@ -73,6 +75,7 @@ static int max_cstate = MWAIT_MAX_NUM_CSTATES - 1; static unsigned int mwait_substates; +#define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF /* Reliable LAPIC Timer States, bit 1 for C1 etc. */ static unsigned int lapic_timer_reliable_states = (1 << 1); /* Default to only C1 */ @@ -82,6 +85,14 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state); static struct cpuidle_state *cpuidle_state_table; /* + * Set this flag for states where the HW flushes the TLB for us + * and so we don't need cross-calls to keep it consistent. + * If this flag is set, SW flushes the TLB, so even if the + * HW doesn't do the flushing, this flag is safe to use. + */ +#define CPUIDLE_FLAG_TLB_FLUSHED 0x10000 + +/* * States are indexed by the cstate number, * which is also the index into the MWAIT hint array. * Thus C0 is a dummy. @@ -122,7 +133,7 @@ static struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = { .driver_data = (void *) 0x00, .flags = CPUIDLE_FLAG_TIME_VALID, .exit_latency = 1, - .target_residency = 4, + .target_residency = 1, .enter = &intel_idle }, { /* MWAIT C2 */ .name = "SNB-C3", @@ -130,7 +141,7 @@ static struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = { .driver_data = (void *) 0x10, .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 80, - .target_residency = 160, + .target_residency = 211, .enter = &intel_idle }, { /* MWAIT C3 */ .name = "SNB-C6", @@ -138,7 +149,7 @@ static struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = { .driver_data = (void *) 0x20, .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 104, - .target_residency = 208, + .target_residency = 345, .enter = &intel_idle }, { /* MWAIT C4 */ .name = "SNB-C7", @@ -146,7 +157,7 @@ static struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = { .driver_data = (void *) 0x30, .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 109, - .target_residency = 300, + .target_residency = 345, .enter = &intel_idle }, }; @@ -220,8 +231,6 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state) kt_before = ktime_get_real(); stop_critical_timings(); - trace_power_start(POWER_CSTATE, (eax >> 4) + 1, cpu); - trace_cpu_idle((eax >> 4) + 1, cpu); if (!need_resched()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); @@ -243,6 +252,39 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state) return usec_delta; } +static void __setup_broadcast_timer(void *arg) +{ + unsigned long reason = (unsigned long)arg; + int cpu = smp_processor_id(); + + reason = reason ? + CLOCK_EVT_NOTIFY_BROADCAST_ON : CLOCK_EVT_NOTIFY_BROADCAST_OFF; + + clockevents_notify(reason, &cpu); +} + +static int __cpuinit setup_broadcast_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + + switch (action & 0xf) { + case CPU_ONLINE: + smp_call_function_single(hotcpu, __setup_broadcast_timer, + (void *)true, 1); + break; + case CPU_DOWN_PREPARE: + smp_call_function_single(hotcpu, __setup_broadcast_timer, + (void *)false, 1); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata setup_broadcast_notifier = { + .notifier_call = setup_broadcast_cpuhp_notify, +}; + /* * intel_idle_probe() */ @@ -305,7 +347,11 @@ static int intel_idle_probe(void) } if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ - lapic_timer_reliable_states = 0xFFFFFFFF; + lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; + else { + smp_call_function(__setup_broadcast_timer, (void *)true, 1); + register_cpu_notifier(&setup_broadcast_notifier); + } pr_debug(PREFIX "v" INTEL_IDLE_VERSION " model 0x%X\n", boot_cpu_data.x86_model); @@ -403,6 +449,10 @@ static int __init intel_idle_init(void) { int retval; + /* Do not load intel_idle at all for now if idle= is passed */ + if (boot_option_idle_override != IDLE_NO_OVERRIDE) + return -ENODEV; + retval = intel_idle_probe(); if (retval) return retval; @@ -428,6 +478,11 @@ static void __exit intel_idle_exit(void) intel_idle_cpuidle_devices_uninit(); cpuidle_unregister_driver(&intel_idle_driver); + if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) { + smp_call_function(__setup_broadcast_timer, (void *)false, 1); + unregister_cpu_notifier(&setup_broadcast_notifier); + } + return; } diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index bf1a95e..98d9ec8 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -240,6 +240,30 @@ config DM_MIRROR Allow volume managers to mirror logical volumes, also needed for live data migration tools such as 'pvmove'. +config DM_RAID + tristate "RAID 4/5/6 target (EXPERIMENTAL)" + depends on BLK_DEV_DM && EXPERIMENTAL + select MD_RAID456 + select BLK_DEV_MD + ---help--- + A dm target that supports RAID4, RAID5 and RAID6 mappings + + A RAID-5 set of N drives with a capacity of C MB per drive provides + the capacity of C * (N - 1) MB, and protects against a failure + of a single drive. For a given sector (row) number, (N - 1) drives + contain data sectors, and one drive contains the parity protection. + For a RAID-4 set, the parity blocks are present on a single drive, + while a RAID-5 set distributes the parity across the drives in one + of the available parity distribution methods. + + A RAID-6 set of N drives with a capacity of C MB per drive + provides the capacity of C * (N - 2) MB, and protects + against a failure of any two drives. For a given sector + (row) number, (N - 2) drives contain data sectors, and two + drives contains two independent redundancy syndromes. Like + RAID-5, RAID-6 distributes the syndromes across the drives + in one of the available parity distribution methods. + config DM_LOG_USERSPACE tristate "Mirror userspace logging (EXPERIMENTAL)" depends on DM_MIRROR && EXPERIMENTAL && NET diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 5e3aac4..d013860 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o obj-$(CONFIG_DM_ZERO) += dm-zero.o +obj-$(CONFIG_DM_RAID) += dm-raid.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 5a1ffe3..9a35320 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -210,11 +210,11 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset, || test_bit(Faulty, &rdev->flags)) continue; - target = rdev->sb_start + offset + index * (PAGE_SIZE/512); + target = offset + index * (PAGE_SIZE/512); if (sync_page_io(rdev, target, roundup(size, bdev_logical_block_size(rdev->bdev)), - page, READ)) { + page, READ, true)) { page->index = index; attach_page_buffers(page, NULL); /* so that free_buffer will * quietly no-op */ @@ -264,14 +264,18 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev) static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) { mdk_rdev_t *rdev = NULL; + struct block_device *bdev; mddev_t *mddev = bitmap->mddev; while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { int size = PAGE_SIZE; loff_t offset = mddev->bitmap_info.offset; + + bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; + if (page->index == bitmap->file_pages-1) size = roundup(bitmap->last_page_size, - bdev_logical_block_size(rdev->bdev)); + bdev_logical_block_size(bdev)); /* Just make sure we aren't corrupting data or * metadata */ @@ -1542,7 +1546,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) wait_event(bitmap->mddev->recovery_wait, atomic_read(&bitmap->mddev->recovery_active) == 0); - bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; + bitmap->mddev->curr_resync_completed = sector; set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); s = 0; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index d5b0e4c..4e054bd 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -18,10 +18,14 @@ #include #include #include +#include #include #include #include #include +#include +#include +#include #include @@ -63,6 +67,7 @@ struct dm_crypt_request { struct convert_context *ctx; struct scatterlist sg_in; struct scatterlist sg_out; + sector_t iv_sector; }; struct crypt_config; @@ -73,11 +78,13 @@ struct crypt_iv_operations { void (*dtr)(struct crypt_config *cc); int (*init)(struct crypt_config *cc); int (*wipe)(struct crypt_config *cc); - int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); + int (*generator)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); + int (*post)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); }; struct iv_essiv_private { - struct crypto_cipher *tfm; struct crypto_hash *hash_tfm; u8 *salt; }; @@ -86,11 +93,32 @@ struct iv_benbi_private { int shift; }; +#define LMK_SEED_SIZE 64 /* hash + 0 */ +struct iv_lmk_private { + struct crypto_shash *hash_tfm; + u8 *seed; +}; + /* * Crypt: maps a linear range of a block device * and encrypts / decrypts at the same time. */ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; + +/* + * Duplicated per-CPU state for cipher. + */ +struct crypt_cpu { + struct ablkcipher_request *req; + /* ESSIV: struct crypto_cipher *essiv_tfm */ + void *iv_private; + struct crypto_ablkcipher *tfms[0]; +}; + +/* + * The fields in here must be read only after initialization, + * changing state should be in crypt_cpu. + */ struct crypt_config { struct dm_dev *dev; sector_t start; @@ -108,17 +136,25 @@ struct crypt_config { struct workqueue_struct *crypt_queue; char *cipher; - char *cipher_mode; + char *cipher_string; struct crypt_iv_operations *iv_gen_ops; union { struct iv_essiv_private essiv; struct iv_benbi_private benbi; + struct iv_lmk_private lmk; } iv_gen_private; sector_t iv_offset; unsigned int iv_size; /* + * Duplicated per cpu state. Access through + * per_cpu_ptr() only. + */ + struct crypt_cpu __percpu *cpu; + unsigned tfms_count; + + /* * Layout of each crypto request: * * struct ablkcipher_request @@ -132,11 +168,10 @@ struct crypt_config { * correctly aligned. */ unsigned int dmreq_start; - struct ablkcipher_request *req; - struct crypto_ablkcipher *tfm; unsigned long flags; unsigned int key_size; + unsigned int key_parts; u8 key[0]; }; @@ -148,6 +183,20 @@ static struct kmem_cache *_crypt_io_pool; static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); +static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); + +static struct crypt_cpu *this_crypt_config(struct crypt_config *cc) +{ + return this_cpu_ptr(cc->cpu); +} + +/* + * Use this to access cipher attributes that are the same for each CPU. + */ +static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) +{ + return __this_cpu_ptr(cc->cpu)->tfms[0]; +} /* * Different IV generation algorithms: @@ -168,23 +217,38 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io); * null: the initial vector is always zero. Provides compatibility with * obsolete loop_fish2 devices. Do not use for new devices. * + * lmk: Compatible implementation of the block chaining mode used + * by the Loop-AES block device encryption system + * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/ + * It operates on full 512 byte sectors and uses CBC + * with an IV derived from the sector number, the data and + * optionally extra IV seed. + * This means that after decryption the first block + * of sector must be tweaked according to decrypted data. + * Loop-AES can use three encryption schemes: + * version 1: is plain aes-cbc mode + * version 2: uses 64 multikey scheme with lmk IV generator + * version 3: the same as version 2 with additional IV seed + * (it uses 65 keys, last key is used as IV seed) + * * plumb: unimplemented, see: * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 */ -static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); - *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); + *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); return 0; } static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, - sector_t sector) + struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); - *(u64 *)iv = cpu_to_le64(sector); + *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); return 0; } @@ -195,7 +259,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; struct hash_desc desc; struct scatterlist sg; - int err; + struct crypto_cipher *essiv_tfm; + int err, cpu; sg_init_one(&sg, cc->key, cc->key_size); desc.tfm = essiv->hash_tfm; @@ -205,8 +270,16 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) if (err) return err; - return crypto_cipher_setkey(essiv->tfm, essiv->salt, + for_each_possible_cpu(cpu) { + essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private, + + err = crypto_cipher_setkey(essiv_tfm, essiv->salt, crypto_hash_digestsize(essiv->hash_tfm)); + if (err) + return err; + } + + return 0; } /* Wipe salt and reset key derived from volume key */ @@ -214,24 +287,76 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc) { struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); + struct crypto_cipher *essiv_tfm; + int cpu, r, err = 0; memset(essiv->salt, 0, salt_size); - return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size); + for_each_possible_cpu(cpu) { + essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private; + r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size); + if (r) + err = r; + } + + return err; +} + +/* Set up per cpu cipher state */ +static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, + struct dm_target *ti, + u8 *salt, unsigned saltsize) +{ + struct crypto_cipher *essiv_tfm; + int err; + + /* Setup the essiv_tfm with the given salt */ + essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(essiv_tfm)) { + ti->error = "Error allocating crypto tfm for ESSIV"; + return essiv_tfm; + } + + if (crypto_cipher_blocksize(essiv_tfm) != + crypto_ablkcipher_ivsize(any_tfm(cc))) { + ti->error = "Block size of ESSIV cipher does " + "not match IV size of block cipher"; + crypto_free_cipher(essiv_tfm); + return ERR_PTR(-EINVAL); + } + + err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); + if (err) { + ti->error = "Failed to set key for ESSIV cipher"; + crypto_free_cipher(essiv_tfm); + return ERR_PTR(err); + } + + return essiv_tfm; } static void crypt_iv_essiv_dtr(struct crypt_config *cc) { + int cpu; + struct crypt_cpu *cpu_cc; + struct crypto_cipher *essiv_tfm; struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - crypto_free_cipher(essiv->tfm); - essiv->tfm = NULL; - crypto_free_hash(essiv->hash_tfm); essiv->hash_tfm = NULL; kzfree(essiv->salt); essiv->salt = NULL; + + for_each_possible_cpu(cpu) { + cpu_cc = per_cpu_ptr(cc->cpu, cpu); + essiv_tfm = cpu_cc->iv_private; + + if (essiv_tfm) + crypto_free_cipher(essiv_tfm); + + cpu_cc->iv_private = NULL; + } } static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, @@ -240,7 +365,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, struct crypto_cipher *essiv_tfm = NULL; struct crypto_hash *hash_tfm = NULL; u8 *salt = NULL; - int err; + int err, cpu; if (!opts) { ti->error = "Digest algorithm missing for ESSIV mode"; @@ -262,48 +387,44 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, goto bad; } - /* Allocate essiv_tfm */ - essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(essiv_tfm)) { - ti->error = "Error allocating crypto tfm for ESSIV"; - err = PTR_ERR(essiv_tfm); - goto bad; - } - if (crypto_cipher_blocksize(essiv_tfm) != - crypto_ablkcipher_ivsize(cc->tfm)) { - ti->error = "Block size of ESSIV cipher does " - "not match IV size of block cipher"; - err = -EINVAL; - goto bad; - } - cc->iv_gen_private.essiv.salt = salt; - cc->iv_gen_private.essiv.tfm = essiv_tfm; cc->iv_gen_private.essiv.hash_tfm = hash_tfm; + for_each_possible_cpu(cpu) { + essiv_tfm = setup_essiv_cpu(cc, ti, salt, + crypto_hash_digestsize(hash_tfm)); + if (IS_ERR(essiv_tfm)) { + crypt_iv_essiv_dtr(cc); + return PTR_ERR(essiv_tfm); + } + per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm; + } + return 0; bad: - if (essiv_tfm && !IS_ERR(essiv_tfm)) - crypto_free_cipher(essiv_tfm); if (hash_tfm && !IS_ERR(hash_tfm)) crypto_free_hash(hash_tfm); kfree(salt); return err; } -static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { + struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private; + memset(iv, 0, cc->iv_size); - *(u64 *)iv = cpu_to_le64(sector); - crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv); + *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); + crypto_cipher_encrypt_one(essiv_tfm, iv, iv); + return 0; } static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { - unsigned bs = crypto_ablkcipher_blocksize(cc->tfm); + unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc)); int log = ilog2(bs); /* we need to calculate how far we must shift the sector count @@ -328,25 +449,177 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc) { } -static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { __be64 val; memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ - val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1); + val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1); put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); return 0; } -static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); return 0; } +static void crypt_iv_lmk_dtr(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm)) + crypto_free_shash(lmk->hash_tfm); + lmk->hash_tfm = NULL; + + kzfree(lmk->seed); + lmk->seed = NULL; +} + +static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0); + if (IS_ERR(lmk->hash_tfm)) { + ti->error = "Error initializing LMK hash"; + return PTR_ERR(lmk->hash_tfm); + } + + /* No seed in LMK version 2 */ + if (cc->key_parts == cc->tfms_count) { + lmk->seed = NULL; + return 0; + } + + lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL); + if (!lmk->seed) { + crypt_iv_lmk_dtr(cc); + ti->error = "Error kmallocing seed storage in LMK"; + return -ENOMEM; + } + + return 0; +} + +static int crypt_iv_lmk_init(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + int subkey_size = cc->key_size / cc->key_parts; + + /* LMK seed is on the position of LMK_KEYS + 1 key */ + if (lmk->seed) + memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size), + crypto_shash_digestsize(lmk->hash_tfm)); + + return 0; +} + +static int crypt_iv_lmk_wipe(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + if (lmk->seed) + memset(lmk->seed, 0, LMK_SEED_SIZE); + + return 0; +} + +static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq, + u8 *data) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + struct { + struct shash_desc desc; + char ctx[crypto_shash_descsize(lmk->hash_tfm)]; + } sdesc; + struct md5_state md5state; + u32 buf[4]; + int i, r; + + sdesc.desc.tfm = lmk->hash_tfm; + sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + r = crypto_shash_init(&sdesc.desc); + if (r) + return r; + + if (lmk->seed) { + r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE); + if (r) + return r; + } + + /* Sector is always 512B, block size 16, add data of blocks 1-31 */ + r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31); + if (r) + return r; + + /* Sector is cropped to 56 bits here */ + buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF); + buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000); + buf[2] = cpu_to_le32(4024); + buf[3] = 0; + r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf)); + if (r) + return r; + + /* No MD5 padding here */ + r = crypto_shash_export(&sdesc.desc, &md5state); + if (r) + return r; + + for (i = 0; i < MD5_HASH_WORDS; i++) + __cpu_to_le32s(&md5state.hash[i]); + memcpy(iv, &md5state.hash, cc->iv_size); + + return 0; +} + +static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + u8 *src; + int r = 0; + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { + src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0); + r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); + kunmap_atomic(src, KM_USER0); + } else + memset(iv, 0, cc->iv_size); + + return r; +} + +static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + u8 *dst; + int r; + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) + return 0; + + dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0); + r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); + + /* Tweak the first block of plaintext sector */ + if (!r) + crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); + + kunmap_atomic(dst, KM_USER0); + return r; +} + static struct crypt_iv_operations crypt_iv_plain_ops = { .generator = crypt_iv_plain_gen }; @@ -373,6 +646,15 @@ static struct crypt_iv_operations crypt_iv_null_ops = { .generator = crypt_iv_null_gen }; +static struct crypt_iv_operations crypt_iv_lmk_ops = { + .ctr = crypt_iv_lmk_ctr, + .dtr = crypt_iv_lmk_dtr, + .init = crypt_iv_lmk_init, + .wipe = crypt_iv_lmk_wipe, + .generator = crypt_iv_lmk_gen, + .post = crypt_iv_lmk_post +}; + static void crypt_convert_init(struct crypt_config *cc, struct convert_context *ctx, struct bio *bio_out, struct bio *bio_in, @@ -400,6 +682,13 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc, return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); } +static u8 *iv_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + return (u8 *)ALIGN((unsigned long)(dmreq + 1), + crypto_ablkcipher_alignmask(any_tfm(cc)) + 1); +} + static int crypt_convert_block(struct crypt_config *cc, struct convert_context *ctx, struct ablkcipher_request *req) @@ -411,9 +700,9 @@ static int crypt_convert_block(struct crypt_config *cc, int r = 0; dmreq = dmreq_of_req(cc, req); - iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), - crypto_ablkcipher_alignmask(cc->tfm) + 1); + iv = iv_of_dmreq(cc, dmreq); + dmreq->iv_sector = ctx->sector; dmreq->ctx = ctx; sg_init_table(&dmreq->sg_in, 1); sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, @@ -436,7 +725,7 @@ static int crypt_convert_block(struct crypt_config *cc, } if (cc->iv_gen_ops) { - r = cc->iv_gen_ops->generator(cc, iv, ctx->sector); + r = cc->iv_gen_ops->generator(cc, iv, dmreq); if (r < 0) return r; } @@ -449,21 +738,28 @@ static int crypt_convert_block(struct crypt_config *cc, else r = crypto_ablkcipher_decrypt(req); + if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) + r = cc->iv_gen_ops->post(cc, iv, dmreq); + return r; } static void kcryptd_async_done(struct crypto_async_request *async_req, int error); + static void crypt_alloc_req(struct crypt_config *cc, struct convert_context *ctx) { - if (!cc->req) - cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); - ablkcipher_request_set_tfm(cc->req, cc->tfm); - ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | - CRYPTO_TFM_REQ_MAY_SLEEP, - kcryptd_async_done, - dmreq_of_req(cc, cc->req)); + struct crypt_cpu *this_cc = this_crypt_config(cc); + unsigned key_index = ctx->sector & (cc->tfms_count - 1); + + if (!this_cc->req) + this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); + + ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]); + ablkcipher_request_set_callback(this_cc->req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + kcryptd_async_done, dmreq_of_req(cc, this_cc->req)); } /* @@ -472,6 +768,7 @@ static void crypt_alloc_req(struct crypt_config *cc, static int crypt_convert(struct crypt_config *cc, struct convert_context *ctx) { + struct crypt_cpu *this_cc = this_crypt_config(cc); int r; atomic_set(&ctx->pending, 1); @@ -483,7 +780,7 @@ static int crypt_convert(struct crypt_config *cc, atomic_inc(&ctx->pending); - r = crypt_convert_block(cc, ctx, cc->req); + r = crypt_convert_block(cc, ctx, this_cc->req); switch (r) { /* async */ @@ -492,7 +789,7 @@ static int crypt_convert(struct crypt_config *cc, INIT_COMPLETION(ctx->restart); /* fall through*/ case -EINPROGRESS: - cc->req = NULL; + this_cc->req = NULL; ctx->sector++; continue; @@ -651,6 +948,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io) * They must be separated as otherwise the final stages could be * starved by new requests which can block in the first stages due * to memory allocation. + * + * The work is done per CPU global for all dm-crypt instances. + * They should not depend on each other and do not block. */ static void crypt_endio(struct bio *clone, int error) { @@ -691,26 +991,30 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_destructor = dm_crypt_bio_destructor; } -static void kcryptd_io_read(struct dm_crypt_io *io) +static void kcryptd_unplug(struct crypt_config *cc) +{ + blk_unplug(bdev_get_queue(cc->dev->bdev)); +} + +static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) { struct crypt_config *cc = io->target->private; struct bio *base_bio = io->base_bio; struct bio *clone; - crypt_inc_pending(io); - /* * The block layer might modify the bvec array, so always * copy the required bvecs because we need the original * one in order to decrypt the whole bio data *afterwards*. */ - clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); - if (unlikely(!clone)) { - io->error = -ENOMEM; - crypt_dec_pending(io); - return; + clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs); + if (!clone) { + kcryptd_unplug(cc); + return 1; } + crypt_inc_pending(io); + clone_init(io, clone); clone->bi_idx = 0; clone->bi_vcnt = bio_segments(base_bio); @@ -720,6 +1024,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io) sizeof(struct bio_vec) * clone->bi_vcnt); generic_make_request(clone); + return 0; } static void kcryptd_io_write(struct dm_crypt_io *io) @@ -732,9 +1037,12 @@ static void kcryptd_io(struct work_struct *work) { struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); - if (bio_data_dir(io->base_bio) == READ) - kcryptd_io_read(io); - else + if (bio_data_dir(io->base_bio) == READ) { + crypt_inc_pending(io); + if (kcryptd_io_read(io, GFP_NOIO)) + io->error = -ENOMEM; + crypt_dec_pending(io); + } else kcryptd_io_write(io); } @@ -901,6 +1209,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, return; } + if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) + error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); + mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); if (!atomic_dec_and_test(&ctx->pending)) @@ -971,34 +1282,84 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size) } } -static int crypt_set_key(struct crypt_config *cc, char *key) +static void crypt_free_tfms(struct crypt_config *cc, int cpu) { - unsigned key_size = strlen(key) >> 1; + struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); + unsigned i; - if (cc->key_size && cc->key_size != key_size) + for (i = 0; i < cc->tfms_count; i++) + if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) { + crypto_free_ablkcipher(cpu_cc->tfms[i]); + cpu_cc->tfms[i] = NULL; + } +} + +static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode) +{ + struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); + unsigned i; + int err; + + for (i = 0; i < cc->tfms_count; i++) { + cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0); + if (IS_ERR(cpu_cc->tfms[i])) { + err = PTR_ERR(cpu_cc->tfms[i]); + crypt_free_tfms(cc, cpu); + return err; + } + } + + return 0; +} + +static int crypt_setkey_allcpus(struct crypt_config *cc) +{ + unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count); + int cpu, err = 0, i, r; + + for_each_possible_cpu(cpu) { + for (i = 0; i < cc->tfms_count; i++) { + r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i], + cc->key + (i * subkey_size), subkey_size); + if (r) + err = r; + } + } + + return err; +} + +static int crypt_set_key(struct crypt_config *cc, char *key) +{ + /* The key size may not be changed. */ + if (cc->key_size != (strlen(key) >> 1)) return -EINVAL; - cc->key_size = key_size; /* initial settings */ + /* Hyphen (which gives a key_size of zero) means there is no key. */ + if (!cc->key_size && strcmp(key, "-")) + return -EINVAL; - if ((!key_size && strcmp(key, "-")) || - (key_size && crypt_decode_key(cc->key, key, key_size) < 0)) + if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) return -EINVAL; set_bit(DM_CRYPT_KEY_VALID, &cc->flags); - return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); + return crypt_setkey_allcpus(cc); } static int crypt_wipe_key(struct crypt_config *cc) { clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); memset(&cc->key, 0, cc->key_size * sizeof(u8)); - return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); + + return crypt_setkey_allcpus(cc); } static void crypt_dtr(struct dm_target *ti) { struct crypt_config *cc = ti->private; + struct crypt_cpu *cpu_cc; + int cpu; ti->private = NULL; @@ -1010,6 +1371,14 @@ static void crypt_dtr(struct dm_target *ti) if (cc->crypt_queue) destroy_workqueue(cc->crypt_queue); + if (cc->cpu) + for_each_possible_cpu(cpu) { + cpu_cc = per_cpu_ptr(cc->cpu, cpu); + if (cpu_cc->req) + mempool_free(cpu_cc->req, cc->req_pool); + crypt_free_tfms(cc, cpu); + } + if (cc->bs) bioset_free(cc->bs); @@ -1023,14 +1392,14 @@ static void crypt_dtr(struct dm_target *ti) if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) cc->iv_gen_ops->dtr(cc); - if (cc->tfm && !IS_ERR(cc->tfm)) - crypto_free_ablkcipher(cc->tfm); - if (cc->dev) dm_put_device(ti, cc->dev); + if (cc->cpu) + free_percpu(cc->cpu); + kzfree(cc->cipher); - kzfree(cc->cipher_mode); + kzfree(cc->cipher_string); /* Must zero key material before freeing */ kzfree(cc); @@ -1040,9 +1409,9 @@ static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key) { struct crypt_config *cc = ti->private; - char *tmp, *cipher, *chainmode, *ivmode, *ivopts; + char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; char *cipher_api = NULL; - int ret = -EINVAL; + int cpu, ret = -EINVAL; /* Convert to crypto api definition? */ if (strchr(cipher_in, '(')) { @@ -1050,23 +1419,31 @@ static int crypt_ctr_cipher(struct dm_target *ti, return -EINVAL; } + cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL); + if (!cc->cipher_string) + goto bad_mem; + /* * Legacy dm-crypt cipher specification - * cipher-mode-iv:ivopts + * cipher[:keycount]-mode-iv:ivopts */ tmp = cipher_in; - cipher = strsep(&tmp, "-"); + keycount = strsep(&tmp, "-"); + cipher = strsep(&keycount, ":"); + + if (!keycount) + cc->tfms_count = 1; + else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 || + !is_power_of_2(cc->tfms_count)) { + ti->error = "Bad cipher key count specification"; + return -EINVAL; + } + cc->key_parts = cc->tfms_count; cc->cipher = kstrdup(cipher, GFP_KERNEL); if (!cc->cipher) goto bad_mem; - if (tmp) { - cc->cipher_mode = kstrdup(tmp, GFP_KERNEL); - if (!cc->cipher_mode) - goto bad_mem; - } - chainmode = strsep(&tmp, "-"); ivopts = strsep(&tmp, "-"); ivmode = strsep(&ivopts, ":"); @@ -1074,10 +1451,19 @@ static int crypt_ctr_cipher(struct dm_target *ti, if (tmp) DMWARN("Ignoring unexpected additional cipher options"); - /* Compatibility mode for old dm-crypt mappings */ + cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) + + cc->tfms_count * sizeof(*(cc->cpu->tfms)), + __alignof__(struct crypt_cpu)); + if (!cc->cpu) { + ti->error = "Cannot allocate per cpu state"; + goto bad_mem; + } + + /* + * For compatibility with the original dm-crypt mapping format, if + * only the cipher name is supplied, use cbc-plain. + */ if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { - kfree(cc->cipher_mode); - cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL); chainmode = "cbc"; ivmode = "plain"; } @@ -1099,11 +1485,12 @@ static int crypt_ctr_cipher(struct dm_target *ti, } /* Allocate cipher */ - cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); - if (IS_ERR(cc->tfm)) { - ret = PTR_ERR(cc->tfm); - ti->error = "Error allocating crypto tfm"; - goto bad; + for_each_possible_cpu(cpu) { + ret = crypt_alloc_tfms(cc, cpu, cipher_api); + if (ret < 0) { + ti->error = "Error allocating crypto tfm"; + goto bad; + } } /* Initialize and set key */ @@ -1114,7 +1501,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, } /* Initialize IV */ - cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm); + cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); if (cc->iv_size) /* at least a 64 bit sector number should fit in our buffer */ cc->iv_size = max(cc->iv_size, @@ -1137,7 +1524,15 @@ static int crypt_ctr_cipher(struct dm_target *ti, cc->iv_gen_ops = &crypt_iv_benbi_ops; else if (strcmp(ivmode, "null") == 0) cc->iv_gen_ops = &crypt_iv_null_ops; - else { + else if (strcmp(ivmode, "lmk") == 0) { + cc->iv_gen_ops = &crypt_iv_lmk_ops; + /* Version 2 and 3 is recognised according + * to length of provided multi-key string. + * If present (version 3), last key is used as IV seed. + */ + if (cc->key_size % cc->key_parts) + cc->key_parts++; + } else { ret = -EINVAL; ti->error = "Invalid IV mode"; goto bad; @@ -1194,6 +1589,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Cannot allocate encryption context"; return -ENOMEM; } + cc->key_size = key_size; ti->private = cc; ret = crypt_ctr_cipher(ti, argv[0], argv[1]); @@ -1208,9 +1604,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->dmreq_start = sizeof(struct ablkcipher_request); - cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm); + cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); - cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) & + cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) & ~(crypto_tfm_ctx_alignment() - 1); cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + @@ -1219,7 +1615,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Cannot allocate crypt request mempool"; goto bad; } - cc->req = NULL; cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); if (!cc->page_pool) { @@ -1252,13 +1647,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->start = tmpll; ret = -ENOMEM; - cc->io_queue = create_singlethread_workqueue("kcryptd_io"); + cc->io_queue = alloc_workqueue("kcryptd_io", + WQ_NON_REENTRANT| + WQ_MEM_RECLAIM, + 1); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; } - cc->crypt_queue = create_singlethread_workqueue("kcryptd"); + cc->crypt_queue = alloc_workqueue("kcryptd", + WQ_NON_REENTRANT| + WQ_CPU_INTENSIVE| + WQ_MEM_RECLAIM, + 1); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; goto bad; @@ -1286,9 +1688,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); - if (bio_data_dir(io->base_bio) == READ) - kcryptd_queue_io(io); - else + if (bio_data_dir(io->base_bio) == READ) { + if (kcryptd_io_read(io, GFP_NOWAIT)) + kcryptd_queue_io(io); + } else kcryptd_queue_crypt(io); return DM_MAPIO_SUBMITTED; @@ -1306,10 +1709,7 @@ static int crypt_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - if (cc->cipher_mode) - DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode); - else - DMEMIT("%s ", cc->cipher); + DMEMIT("%s ", cc->cipher_string); if (cc->key_size > 0) { if ((maxlen - sz) < ((cc->key_size << 1) + 1)) @@ -1421,7 +1821,7 @@ static int crypt_iterate_devices(struct dm_target *ti, static struct target_type crypt_target = { .name = "crypt", - .version = {1, 7, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index baa1191..f18375d 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -352,7 +352,7 @@ static int __init dm_delay_init(void) { int r = -ENOMEM; - kdelayd_wq = create_workqueue("kdelayd"); + kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); if (!kdelayd_wq) { DMERR("Couldn't start kdelayd"); goto bad_queue; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 4b54618..6d12775 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -295,19 +295,55 @@ retry: DMWARN("remove_all left %d open device(s)", dev_skipped); } +/* + * Set the uuid of a hash_cell that isn't already set. + */ +static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid) +{ + mutex_lock(&dm_hash_cells_mutex); + hc->uuid = new_uuid; + mutex_unlock(&dm_hash_cells_mutex); + + list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid)); +} + +/* + * Changes the name of a hash_cell and returns the old name for + * the caller to free. + */ +static char *__change_cell_name(struct hash_cell *hc, char *new_name) +{ + char *old_name; + + /* + * Rename and move the name cell. + */ + list_del(&hc->name_list); + old_name = hc->name; + + mutex_lock(&dm_hash_cells_mutex); + hc->name = new_name; + mutex_unlock(&dm_hash_cells_mutex); + + list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + + return old_name; +} + static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, const char *new) { - char *new_name, *old_name; + char *new_data, *old_name = NULL; struct hash_cell *hc; struct dm_table *table; struct mapped_device *md; + unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; /* * duplicate new. */ - new_name = kstrdup(new, GFP_KERNEL); - if (!new_name) + new_data = kstrdup(new, GFP_KERNEL); + if (!new_data) return ERR_PTR(-ENOMEM); down_write(&_hash_lock); @@ -315,13 +351,19 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, /* * Is new free ? */ - hc = __get_name_cell(new); + if (change_uuid) + hc = __get_uuid_cell(new); + else + hc = __get_name_cell(new); + if (hc) { - DMWARN("asked to rename to an already-existing name %s -> %s", + DMWARN("Unable to change %s on mapped device %s to one that " + "already exists: %s", + change_uuid ? "uuid" : "name", param->name, new); dm_put(hc->md); up_write(&_hash_lock); - kfree(new_name); + kfree(new_data); return ERR_PTR(-EBUSY); } @@ -330,22 +372,30 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, */ hc = __get_name_cell(param->name); if (!hc) { - DMWARN("asked to rename a non-existent device %s -> %s", - param->name, new); + DMWARN("Unable to rename non-existent device, %s to %s%s", + param->name, change_uuid ? "uuid " : "", new); up_write(&_hash_lock); - kfree(new_name); + kfree(new_data); return ERR_PTR(-ENXIO); } /* - * rename and move the name cell. + * Does this device already have a uuid? */ - list_del(&hc->name_list); - old_name = hc->name; - mutex_lock(&dm_hash_cells_mutex); - hc->name = new_name; - mutex_unlock(&dm_hash_cells_mutex); - list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + if (change_uuid && hc->uuid) { + DMWARN("Unable to change uuid of mapped device %s to %s " + "because uuid is already set to %s", + param->name, new, hc->uuid); + dm_put(hc->md); + up_write(&_hash_lock); + kfree(new_data); + return ERR_PTR(-EINVAL); + } + + if (change_uuid) + __set_cell_uuid(hc, new_data); + else + old_name = __change_cell_name(hc, new_data); /* * Wake up any dm event waiters. @@ -729,7 +779,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) hc = __find_device_hash_cell(param); if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); return -ENXIO; } @@ -741,7 +791,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) */ r = dm_lock_for_deletion(md); if (r) { - DMWARN("unable to remove open device %s", hc->name); + DMDEBUG_LIMIT("unable to remove open device %s", hc->name); up_write(&_hash_lock); dm_put(md); return r; @@ -774,21 +824,24 @@ static int invalid_str(char *str, void *end) static int dev_rename(struct dm_ioctl *param, size_t param_size) { int r; - char *new_name = (char *) param + param->data_start; + char *new_data = (char *) param + param->data_start; struct mapped_device *md; + unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; - if (new_name < param->data || - invalid_str(new_name, (void *) param + param_size) || - strlen(new_name) > DM_NAME_LEN - 1) { - DMWARN("Invalid new logical volume name supplied."); + if (new_data < param->data || + invalid_str(new_data, (void *) param + param_size) || + strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) { + DMWARN("Invalid new mapped device name or uuid string supplied."); return -EINVAL; } - r = check_name(new_name); - if (r) - return r; + if (!change_uuid) { + r = check_name(new_data); + if (r) + return r; + } - md = dm_hash_rename(param, new_name); + md = dm_hash_rename(param, new_data); if (IS_ERR(md)) return PTR_ERR(md); @@ -885,7 +938,7 @@ static int do_resume(struct dm_ioctl *param) hc = __find_device_hash_cell(param); if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); return -ENXIO; } @@ -1212,7 +1265,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) hc = __find_device_hash_cell(param); if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); return -ENXIO; } diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index d8587ba..924f5f0 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -37,6 +37,13 @@ struct dm_kcopyd_client { unsigned int nr_pages; unsigned int nr_free_pages; + /* + * Block devices to unplug. + * Non-NULL pointer means that a block device has some pending requests + * and needs to be unplugged. + */ + struct block_device *unplug[2]; + struct dm_io_client *io_client; wait_queue_head_t destroyq; @@ -308,6 +315,31 @@ static int run_complete_job(struct kcopyd_job *job) return 0; } +/* + * Unplug the block device at the specified index. + */ +static void unplug(struct dm_kcopyd_client *kc, int rw) +{ + if (kc->unplug[rw] != NULL) { + blk_unplug(bdev_get_queue(kc->unplug[rw])); + kc->unplug[rw] = NULL; + } +} + +/* + * Prepare block device unplug. If there's another device + * to be unplugged at the same array index, we unplug that + * device first. + */ +static void prepare_unplug(struct dm_kcopyd_client *kc, int rw, + struct block_device *bdev) +{ + if (likely(kc->unplug[rw] == bdev)) + return; + unplug(kc, rw); + kc->unplug[rw] = bdev; +} + static void complete_io(unsigned long error, void *context) { struct kcopyd_job *job = (struct kcopyd_job *) context; @@ -345,7 +377,7 @@ static int run_io_job(struct kcopyd_job *job) { int r; struct dm_io_request io_req = { - .bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG, + .bi_rw = job->rw, .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, .mem.offset = job->offset, @@ -354,10 +386,16 @@ static int run_io_job(struct kcopyd_job *job) .client = job->kc->io_client, }; - if (job->rw == READ) + if (job->rw == READ) { r = dm_io(&io_req, 1, &job->source, NULL); - else + prepare_unplug(job->kc, READ, job->source.bdev); + } else { + if (job->num_dests > 1) + io_req.bi_rw |= REQ_UNPLUG; r = dm_io(&io_req, job->num_dests, job->dests, NULL); + if (!(io_req.bi_rw & REQ_UNPLUG)) + prepare_unplug(job->kc, WRITE, job->dests[0].bdev); + } return r; } @@ -435,10 +473,18 @@ static void do_work(struct work_struct *work) * Pages jobs when successful will jump onto the io jobs * list. io jobs call wake when they complete and it all * starts again. + * + * Note that io_jobs add block devices to the unplug array, + * this array is cleared with "unplug" calls. It is thus + * forbidden to run complete_jobs after io_jobs and before + * unplug because the block device could be destroyed in + * job completion callback. */ process_jobs(&kc->complete_jobs, kc, run_complete_job); process_jobs(&kc->pages_jobs, kc, run_pages_job); process_jobs(&kc->io_jobs, kc, run_io_job); + unplug(kc, READ); + unplug(kc, WRITE); } /* @@ -619,12 +665,15 @@ int dm_kcopyd_client_create(unsigned int nr_pages, INIT_LIST_HEAD(&kc->io_jobs); INIT_LIST_HEAD(&kc->pages_jobs); + memset(kc->unplug, 0, sizeof(kc->unplug)); + kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); if (!kc->job_pool) goto bad_slab; INIT_WORK(&kc->kcopyd_work, do_work); - kc->kcopyd_wq = create_singlethread_workqueue("kcopyd"); + kc->kcopyd_wq = alloc_workqueue("kcopyd", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!kc->kcopyd_wq) goto bad_workqueue; diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 1ed0094..aa2e0c3 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -12,12 +12,22 @@ #include "dm-log-userspace-transfer.h" +#define DM_LOG_USERSPACE_VSN "1.1.0" + struct flush_entry { int type; region_t region; struct list_head list; }; +/* + * This limit on the number of mark and clear request is, to a degree, + * arbitrary. However, there is some basis for the choice in the limits + * imposed on the size of data payload by dm-log-userspace-transfer.c: + * dm_consult_userspace(). + */ +#define MAX_FLUSH_GROUP_COUNT 32 + struct log_c { struct dm_target *ti; uint32_t region_size; @@ -37,8 +47,15 @@ struct log_c { */ uint64_t in_sync_hint; + /* + * Mark and clear requests are held until a flush is issued + * so that we can group, and thereby limit, the amount of + * network traffic between kernel and userspace. The 'flush_lock' + * is used to protect these lists. + */ spinlock_t flush_lock; - struct list_head flush_list; /* only for clear and mark requests */ + struct list_head mark_list; + struct list_head clear_list; }; static mempool_t *flush_entry_pool; @@ -169,7 +186,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, strncpy(lc->uuid, argv[0], DM_UUID_LEN); spin_lock_init(&lc->flush_lock); - INIT_LIST_HEAD(&lc->flush_list); + INIT_LIST_HEAD(&lc->mark_list); + INIT_LIST_HEAD(&lc->clear_list); str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); if (str_size < 0) { @@ -181,8 +199,11 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, ctr_str, str_size, NULL, NULL); - if (r == -ESRCH) { - DMERR("Userspace log server not found"); + if (r < 0) { + if (r == -ESRCH) + DMERR("Userspace log server not found"); + else + DMERR("Userspace log server failed to create log"); goto out; } @@ -214,10 +235,9 @@ out: static void userspace_dtr(struct dm_dirty_log *log) { - int r; struct log_c *lc = log->context; - r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, + (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, NULL, 0, NULL, NULL); @@ -338,6 +358,71 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region, return (r) ? 0 : (int)in_sync; } +static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) +{ + int r = 0; + struct flush_entry *fe; + + list_for_each_entry(fe, flush_list, list) { + r = userspace_do_request(lc, lc->uuid, fe->type, + (char *)&fe->region, + sizeof(fe->region), + NULL, NULL); + if (r) + break; + } + + return r; +} + +static int flush_by_group(struct log_c *lc, struct list_head *flush_list) +{ + int r = 0; + int count; + uint32_t type = 0; + struct flush_entry *fe, *tmp_fe; + LIST_HEAD(tmp_list); + uint64_t group[MAX_FLUSH_GROUP_COUNT]; + + /* + * Group process the requests + */ + while (!list_empty(flush_list)) { + count = 0; + + list_for_each_entry_safe(fe, tmp_fe, flush_list, list) { + group[count] = fe->region; + count++; + + list_del(&fe->list); + list_add(&fe->list, &tmp_list); + + type = fe->type; + if (count >= MAX_FLUSH_GROUP_COUNT) + break; + } + + r = userspace_do_request(lc, lc->uuid, type, + (char *)(group), + count * sizeof(uint64_t), + NULL, NULL); + if (r) { + /* Group send failed. Attempt one-by-one. */ + list_splice_init(&tmp_list, flush_list); + r = flush_one_by_one(lc, flush_list); + break; + } + } + + /* + * Must collect flush_entrys that were successfully processed + * as a group so that they will be free'd by the caller. + */ + list_splice_init(&tmp_list, flush_list); + + return r; +} + /* * userspace_flush * @@ -360,31 +445,25 @@ static int userspace_flush(struct dm_dirty_log *log) int r = 0; unsigned long flags; struct log_c *lc = log->context; - LIST_HEAD(flush_list); + LIST_HEAD(mark_list); + LIST_HEAD(clear_list); struct flush_entry *fe, *tmp_fe; spin_lock_irqsave(&lc->flush_lock, flags); - list_splice_init(&lc->flush_list, &flush_list); + list_splice_init(&lc->mark_list, &mark_list); + list_splice_init(&lc->clear_list, &clear_list); spin_unlock_irqrestore(&lc->flush_lock, flags); - if (list_empty(&flush_list)) + if (list_empty(&mark_list) && list_empty(&clear_list)) return 0; - /* - * FIXME: Count up requests, group request types, - * allocate memory to stick all requests in and - * send to server in one go. Failing the allocation, - * do it one by one. - */ + r = flush_by_group(lc, &mark_list); + if (r) + goto fail; - list_for_each_entry(fe, &flush_list, list) { - r = userspace_do_request(lc, lc->uuid, fe->type, - (char *)&fe->region, - sizeof(fe->region), - NULL, NULL); - if (r) - goto fail; - } + r = flush_by_group(lc, &clear_list); + if (r) + goto fail; r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, NULL, 0, NULL, NULL); @@ -395,7 +474,11 @@ fail: * Calling code will receive an error and will know that * the log facility has failed. */ - list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { + list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) { + list_del(&fe->list); + mempool_free(fe, flush_entry_pool); + } + list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) { list_del(&fe->list); mempool_free(fe, flush_entry_pool); } @@ -425,7 +508,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region) spin_lock_irqsave(&lc->flush_lock, flags); fe->type = DM_ULOG_MARK_REGION; fe->region = region; - list_add(&fe->list, &lc->flush_list); + list_add(&fe->list, &lc->mark_list); spin_unlock_irqrestore(&lc->flush_lock, flags); return; @@ -462,7 +545,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region) spin_lock_irqsave(&lc->flush_lock, flags); fe->type = DM_ULOG_CLEAR_REGION; fe->region = region; - list_add(&fe->list, &lc->flush_list); + list_add(&fe->list, &lc->clear_list); spin_unlock_irqrestore(&lc->flush_lock, flags); return; @@ -684,7 +767,7 @@ static int __init userspace_dirty_log_init(void) return r; } - DMINFO("version 1.0.0 loaded"); + DMINFO("version " DM_LOG_USERSPACE_VSN " loaded"); return 0; } @@ -694,7 +777,7 @@ static void __exit userspace_dirty_log_exit(void) dm_ulog_tfr_exit(); mempool_destroy(flush_entry_pool); - DMINFO("version 1.0.0 unloaded"); + DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); return; } diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index 075cbcf..049eaf1 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ b/drivers/md/dm-log-userspace-transfer.c @@ -198,6 +198,7 @@ resend: memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); memcpy(tfr->uuid, uuid, DM_UUID_LEN); + tfr->version = DM_ULOG_REQUEST_VERSION; tfr->luid = luid; tfr->seq = dm_ulog_seq++; diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 33420e6..6951536 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -455,7 +455,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, r = PTR_ERR(lc->io_req.client); DMWARN("couldn't allocate disk io client"); kfree(lc); - return -ENOMEM; + return r; } lc->disk_header = vmalloc(buf_size); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 487ecda..b82d288 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -23,6 +23,8 @@ #define DM_MSG_PREFIX "multipath" #define MESG_STR(x) x, sizeof(x) +#define DM_PG_INIT_DELAY_MSECS 2000 +#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) /* Path properties */ struct pgpath { @@ -33,8 +35,7 @@ struct pgpath { unsigned fail_count; /* Cumulative failure count */ struct dm_path path; - struct work_struct deactivate_path; - struct work_struct activate_path; + struct delayed_work activate_path; }; #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) @@ -65,11 +66,15 @@ struct multipath { const char *hw_handler_name; char *hw_handler_params; + unsigned nr_priority_groups; struct list_head priority_groups; + + wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + unsigned pg_init_required; /* pg_init needs calling? */ unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ - wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + unsigned pg_init_delay_retry; /* Delay pg_init retry? */ unsigned nr_valid_paths; /* Total number of usable paths */ struct pgpath *current_pgpath; @@ -82,6 +87,7 @@ struct multipath { unsigned saved_queue_if_no_path;/* Saved state during suspension */ unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_count; /* Number of times pg_init called */ + unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ struct work_struct process_queued_ios; struct list_head queued_ios; @@ -116,7 +122,6 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd; static void process_queued_ios(struct work_struct *work); static void trigger_event(struct work_struct *work); static void activate_path(struct work_struct *work); -static void deactivate_path(struct work_struct *work); /*----------------------------------------------- @@ -129,8 +134,7 @@ static struct pgpath *alloc_pgpath(void) if (pgpath) { pgpath->is_active = 1; - INIT_WORK(&pgpath->deactivate_path, deactivate_path); - INIT_WORK(&pgpath->activate_path, activate_path); + INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); } return pgpath; @@ -141,14 +145,6 @@ static void free_pgpath(struct pgpath *pgpath) kfree(pgpath); } -static void deactivate_path(struct work_struct *work) -{ - struct pgpath *pgpath = - container_of(work, struct pgpath, deactivate_path); - - blk_abort_queue(pgpath->path.dev->bdev->bd_disk->queue); -} - static struct priority_group *alloc_priority_group(void) { struct priority_group *pg; @@ -199,6 +195,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti) INIT_LIST_HEAD(&m->queued_ios); spin_lock_init(&m->lock); m->queue_io = 1; + m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; INIT_WORK(&m->process_queued_ios, process_queued_ios); INIT_WORK(&m->trigger_event, trigger_event); init_waitqueue_head(&m->pg_init_wait); @@ -238,14 +235,19 @@ static void free_multipath(struct multipath *m) static void __pg_init_all_paths(struct multipath *m) { struct pgpath *pgpath; + unsigned long pg_init_delay = 0; m->pg_init_count++; m->pg_init_required = 0; + if (m->pg_init_delay_retry) + pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? + m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { /* Skip failed paths */ if (!pgpath->is_active) continue; - if (queue_work(kmpath_handlerd, &pgpath->activate_path)) + if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path, + pg_init_delay)) m->pg_init_in_progress++; } } @@ -793,8 +795,9 @@ static int parse_features(struct arg_set *as, struct multipath *m) const char *param_name; static struct param _params[] = { - {0, 3, "invalid number of feature args"}, + {0, 5, "invalid number of feature args"}, {1, 50, "pg_init_retries must be between 1 and 50"}, + {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, }; r = read_param(_params, shift(as), &argc, &ti->error); @@ -821,6 +824,14 @@ static int parse_features(struct arg_set *as, struct multipath *m) continue; } + if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) && + (argc >= 1)) { + r = read_param(_params + 2, shift(as), + &m->pg_init_delay_msecs, &ti->error); + argc--; + continue; + } + ti->error = "Unrecognised multipath feature request"; r = -EINVAL; } while (argc && !r); @@ -931,7 +942,7 @@ static void flush_multipath_work(struct multipath *m) flush_workqueue(kmpath_handlerd); multipath_wait_for_pg_init_completion(m); flush_workqueue(kmultipathd); - flush_scheduled_work(); + flush_work_sync(&m->trigger_event); } static void multipath_dtr(struct dm_target *ti) @@ -995,7 +1006,6 @@ static int fail_path(struct pgpath *pgpath) pgpath->path.dev->name, m->nr_valid_paths); schedule_work(&m->trigger_event); - queue_work(kmultipathd, &pgpath->deactivate_path); out: spin_unlock_irqrestore(&m->lock, flags); @@ -1034,7 +1044,7 @@ static int reinstate_path(struct pgpath *pgpath) m->current_pgpath = NULL; queue_work(kmultipathd, &m->process_queued_ios); } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { - if (queue_work(kmpath_handlerd, &pgpath->activate_path)) + if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) m->pg_init_in_progress++; } @@ -1169,6 +1179,7 @@ static void pg_init_done(void *data, int errors) struct priority_group *pg = pgpath->pg; struct multipath *m = pg->m; unsigned long flags; + unsigned delay_retry = 0; /* device or driver problems */ switch (errors) { @@ -1193,8 +1204,9 @@ static void pg_init_done(void *data, int errors) */ bypass_pg(m, pg, 1); break; - /* TODO: For SCSI_DH_RETRY we should wait a couple seconds */ case SCSI_DH_RETRY: + /* Wait before retrying. */ + delay_retry = 1; case SCSI_DH_IMM_RETRY: case SCSI_DH_RES_TEMP_UNAVAIL: if (pg_init_limit_reached(m, pgpath)) @@ -1227,6 +1239,7 @@ static void pg_init_done(void *data, int errors) if (!m->pg_init_required) m->queue_io = 0; + m->pg_init_delay_retry = delay_retry; queue_work(kmultipathd, &m->process_queued_ios); /* @@ -1241,7 +1254,7 @@ out: static void activate_path(struct work_struct *work) { struct pgpath *pgpath = - container_of(work, struct pgpath, activate_path); + container_of(work, struct pgpath, activate_path.work); scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), pg_init_done, pgpath); @@ -1382,11 +1395,14 @@ static int multipath_status(struct dm_target *ti, status_type_t type, DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); else { DMEMIT("%u ", m->queue_if_no_path + - (m->pg_init_retries > 0) * 2); + (m->pg_init_retries > 0) * 2 + + (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2); if (m->queue_if_no_path) DMEMIT("queue_if_no_path "); if (m->pg_init_retries) DMEMIT("pg_init_retries %u ", m->pg_init_retries); + if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) + DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); } if (!m->hw_handler_name || type == STATUSTYPE_INFO) @@ -1655,7 +1671,7 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 1, 1}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, @@ -1687,7 +1703,7 @@ static int __init dm_multipath_init(void) return -EINVAL; } - kmultipathd = create_workqueue("kmpathd"); + kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0); if (!kmultipathd) { DMERR("failed to create workqueue kmpathd"); dm_unregister_target(&multipath_target); @@ -1701,7 +1717,8 @@ static int __init dm_multipath_init(void) * old workqueue would also create a bottleneck in the * path of the storage hardware device activation. */ - kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd"); + kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd", + WQ_MEM_RECLAIM); if (!kmpath_handlerd) { DMERR("failed to create workqueue kmpath_handlerd"); destroy_workqueue(kmultipathd); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c new file mode 100644 index 0000000..b9e1e15 --- /dev/null +++ b/drivers/md/dm-raid.c @@ -0,0 +1,697 @@ +/* + * Copyright (C) 2010-2011 Neil Brown + * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include + +#include "md.h" +#include "raid5.h" +#include "dm.h" +#include "bitmap.h" + +#define DM_MSG_PREFIX "raid" + +/* + * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then + * make it so the flag doesn't set anything. + */ +#ifndef MD_SYNC_STATE_FORCED +#define MD_SYNC_STATE_FORCED 0 +#endif + +struct raid_dev { + /* + * Two DM devices, one to hold metadata and one to hold the + * actual data/parity. The reason for this is to not confuse + * ti->len and give more flexibility in altering size and + * characteristics. + * + * While it is possible for this device to be associated + * with a different physical device than the data_dev, it + * is intended for it to be the same. + * |--------- Physical Device ---------| + * |- meta_dev -|------ data_dev ------| + */ + struct dm_dev *meta_dev; + struct dm_dev *data_dev; + struct mdk_rdev_s rdev; +}; + +/* + * Flags for rs->print_flags field. + */ +#define DMPF_DAEMON_SLEEP 0x1 +#define DMPF_MAX_WRITE_BEHIND 0x2 +#define DMPF_SYNC 0x4 +#define DMPF_NOSYNC 0x8 +#define DMPF_STRIPE_CACHE 0x10 +#define DMPF_MIN_RECOVERY_RATE 0x20 +#define DMPF_MAX_RECOVERY_RATE 0x40 + +struct raid_set { + struct dm_target *ti; + + uint64_t print_flags; + + struct mddev_s md; + struct raid_type *raid_type; + struct dm_target_callbacks callbacks; + + struct raid_dev dev[0]; +}; + +/* Supported raid types and properties. */ +static struct raid_type { + const char *name; /* RAID algorithm. */ + const char *descr; /* Descriptor text for logging. */ + const unsigned parity_devs; /* # of parity devices. */ + const unsigned minimal_devs; /* minimal # of devices in set. */ + const unsigned level; /* RAID level. */ + const unsigned algorithm; /* RAID algorithm. */ +} raid_types[] = { + {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, + {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, + {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, + {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC}, + {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC}, + {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART}, + {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART}, + {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} +}; + +static struct raid_type *get_raid_type(char *name) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(raid_types); i++) + if (!strcmp(raid_types[i].name, name)) + return &raid_types[i]; + + return NULL; +} + +static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs) +{ + unsigned i; + struct raid_set *rs; + sector_t sectors_per_dev; + + if (raid_devs <= raid_type->parity_devs) { + ti->error = "Insufficient number of devices"; + return ERR_PTR(-EINVAL); + } + + sectors_per_dev = ti->len; + if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { + ti->error = "Target length not divisible by number of data devices"; + return ERR_PTR(-EINVAL); + } + + rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL); + if (!rs) { + ti->error = "Cannot allocate raid context"; + return ERR_PTR(-ENOMEM); + } + + mddev_init(&rs->md); + + rs->ti = ti; + rs->raid_type = raid_type; + rs->md.raid_disks = raid_devs; + rs->md.level = raid_type->level; + rs->md.new_level = rs->md.level; + rs->md.dev_sectors = sectors_per_dev; + rs->md.layout = raid_type->algorithm; + rs->md.new_layout = rs->md.layout; + rs->md.delta_disks = 0; + rs->md.recovery_cp = 0; + + for (i = 0; i < raid_devs; i++) + md_rdev_init(&rs->dev[i].rdev); + + /* + * Remaining items to be initialized by further RAID params: + * rs->md.persistent + * rs->md.external + * rs->md.chunk_sectors + * rs->md.new_chunk_sectors + */ + + return rs; +} + +static void context_free(struct raid_set *rs) +{ + int i; + + for (i = 0; i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev) + dm_put_device(rs->ti, rs->dev[i].data_dev); + + kfree(rs); +} + +/* + * For every device we have two words + * : meta device name or '-' if missing + * : data device name or '-' if missing + * + * This code parses those words. + */ +static int dev_parms(struct raid_set *rs, char **argv) +{ + int i; + int rebuild = 0; + int metadata_available = 0; + int ret = 0; + + for (i = 0; i < rs->md.raid_disks; i++, argv += 2) { + rs->dev[i].rdev.raid_disk = i; + + rs->dev[i].meta_dev = NULL; + rs->dev[i].data_dev = NULL; + + /* + * There are no offsets, since there is a separate device + * for data and metadata. + */ + rs->dev[i].rdev.data_offset = 0; + rs->dev[i].rdev.mddev = &rs->md; + + if (strcmp(argv[0], "-")) { + rs->ti->error = "Metadata devices not supported"; + return -EINVAL; + } + + if (!strcmp(argv[1], "-")) { + if (!test_bit(In_sync, &rs->dev[i].rdev.flags) && + (!rs->dev[i].rdev.recovery_offset)) { + rs->ti->error = "Drive designated for rebuild not specified"; + return -EINVAL; + } + + continue; + } + + ret = dm_get_device(rs->ti, argv[1], + dm_table_get_mode(rs->ti->table), + &rs->dev[i].data_dev); + if (ret) { + rs->ti->error = "RAID device lookup failure"; + return ret; + } + + rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; + list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); + if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) + rebuild++; + } + + if (metadata_available) { + rs->md.external = 0; + rs->md.persistent = 1; + rs->md.major_version = 2; + } else if (rebuild && !rs->md.recovery_cp) { + /* + * Without metadata, we will not be able to tell if the array + * is in-sync or not - we must assume it is not. Therefore, + * it is impossible to rebuild a drive. + * + * Even if there is metadata, the on-disk information may + * indicate that the array is not in-sync and it will then + * fail at that time. + * + * User could specify 'nosync' option if desperate. + */ + DMERR("Unable to rebuild drive while array is not in-sync"); + rs->ti->error = "RAID device lookup failure"; + return -EINVAL; + } + + return 0; +} + +/* + * Possible arguments are... + * RAID456: + * [optional_args] + * + * Optional args: + * [[no]sync] Force or prevent recovery of the entire array + * [rebuild ] Rebuild the drive indicated by the index + * [daemon_sleep ] Time between bitmap daemon work to clear bits + * [min_recovery_rate ] Throttle RAID initialization + * [max_recovery_rate ] Throttle RAID initialization + * [max_write_behind ] See '-write-behind=' (man mdadm) + * [stripe_cache ] Stripe cache size for higher RAIDs + */ +static int parse_raid_params(struct raid_set *rs, char **argv, + unsigned num_raid_params) +{ + unsigned i, rebuild_cnt = 0; + unsigned long value; + char *key; + + /* + * First, parse the in-order required arguments + */ + if ((strict_strtoul(argv[0], 10, &value) < 0) || + !is_power_of_2(value) || (value < 8)) { + rs->ti->error = "Bad chunk size"; + return -EINVAL; + } + + rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; + argv++; + num_raid_params--; + + /* + * Second, parse the unordered optional arguments + */ + for (i = 0; i < rs->md.raid_disks; i++) + set_bit(In_sync, &rs->dev[i].rdev.flags); + + for (i = 0; i < num_raid_params; i++) { + if (!strcmp(argv[i], "nosync")) { + rs->md.recovery_cp = MaxSector; + rs->print_flags |= DMPF_NOSYNC; + rs->md.flags |= MD_SYNC_STATE_FORCED; + continue; + } + if (!strcmp(argv[i], "sync")) { + rs->md.recovery_cp = 0; + rs->print_flags |= DMPF_SYNC; + rs->md.flags |= MD_SYNC_STATE_FORCED; + continue; + } + + /* The rest of the optional arguments come in key/value pairs */ + if ((i + 1) >= num_raid_params) { + rs->ti->error = "Wrong number of raid parameters given"; + return -EINVAL; + } + + key = argv[i++]; + if (strict_strtoul(argv[i], 10, &value) < 0) { + rs->ti->error = "Bad numerical argument given in raid params"; + return -EINVAL; + } + + if (!strcmp(key, "rebuild")) { + if (++rebuild_cnt > rs->raid_type->parity_devs) { + rs->ti->error = "Too many rebuild drives given"; + return -EINVAL; + } + if (value > rs->md.raid_disks) { + rs->ti->error = "Invalid rebuild index given"; + return -EINVAL; + } + clear_bit(In_sync, &rs->dev[value].rdev.flags); + rs->dev[value].rdev.recovery_offset = 0; + } else if (!strcmp(key, "max_write_behind")) { + rs->print_flags |= DMPF_MAX_WRITE_BEHIND; + + /* + * In device-mapper, we specify things in sectors, but + * MD records this value in kB + */ + value /= 2; + if (value > COUNTER_MAX) { + rs->ti->error = "Max write-behind limit out of range"; + return -EINVAL; + } + rs->md.bitmap_info.max_write_behind = value; + } else if (!strcmp(key, "daemon_sleep")) { + rs->print_flags |= DMPF_DAEMON_SLEEP; + if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { + rs->ti->error = "daemon sleep period out of range"; + return -EINVAL; + } + rs->md.bitmap_info.daemon_sleep = value; + } else if (!strcmp(key, "stripe_cache")) { + rs->print_flags |= DMPF_STRIPE_CACHE; + + /* + * In device-mapper, we specify things in sectors, but + * MD records this value in kB + */ + value /= 2; + + if (rs->raid_type->level < 5) { + rs->ti->error = "Inappropriate argument: stripe_cache"; + return -EINVAL; + } + if (raid5_set_cache_size(&rs->md, (int)value)) { + rs->ti->error = "Bad stripe_cache size"; + return -EINVAL; + } + } else if (!strcmp(key, "min_recovery_rate")) { + rs->print_flags |= DMPF_MIN_RECOVERY_RATE; + if (value > INT_MAX) { + rs->ti->error = "min_recovery_rate out of range"; + return -EINVAL; + } + rs->md.sync_speed_min = (int)value; + } else if (!strcmp(key, "max_recovery_rate")) { + rs->print_flags |= DMPF_MAX_RECOVERY_RATE; + if (value > INT_MAX) { + rs->ti->error = "max_recovery_rate out of range"; + return -EINVAL; + } + rs->md.sync_speed_max = (int)value; + } else { + DMERR("Unable to parse RAID parameter: %s", key); + rs->ti->error = "Unable to parse RAID parameters"; + return -EINVAL; + } + } + + /* Assume there are no metadata devices until the drives are parsed */ + rs->md.persistent = 0; + rs->md.external = 1; + + return 0; +} + +static void do_table_event(struct work_struct *ws) +{ + struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); + + dm_table_event(rs->ti->table); +} + +static int raid_is_congested(struct dm_target_callbacks *cb, int bits) +{ + struct raid_set *rs = container_of(cb, struct raid_set, callbacks); + + return md_raid5_congested(&rs->md, bits); +} + +static void raid_unplug(struct dm_target_callbacks *cb) +{ + struct raid_set *rs = container_of(cb, struct raid_set, callbacks); + + md_raid5_unplug_device(rs->md.private); +} + +/* + * Construct a RAID4/5/6 mapping: + * Args: + * <#raid_params> \ + * <#raid_devs> { .. } + * + * ** metadata devices are not supported yet, use '-' instead ** + * + * varies by . See 'parse_raid_params' for + * details on possible . + */ +static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + int ret; + struct raid_type *rt; + unsigned long num_raid_params, num_raid_devs; + struct raid_set *rs = NULL; + + /* Must have at least <#raid_params> */ + if (argc < 2) { + ti->error = "Too few arguments"; + return -EINVAL; + } + + /* raid type */ + rt = get_raid_type(argv[0]); + if (!rt) { + ti->error = "Unrecognised raid_type"; + return -EINVAL; + } + argc--; + argv++; + + /* number of RAID parameters */ + if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) { + ti->error = "Cannot understand number of RAID parameters"; + return -EINVAL; + } + argc--; + argv++; + + /* Skip over RAID params for now and find out # of devices */ + if (num_raid_params + 1 > argc) { + ti->error = "Arguments do not agree with counts given"; + return -EINVAL; + } + + if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) || + (num_raid_devs >= INT_MAX)) { + ti->error = "Cannot understand number of raid devices"; + return -EINVAL; + } + + rs = context_alloc(ti, rt, (unsigned)num_raid_devs); + if (IS_ERR(rs)) + return PTR_ERR(rs); + + ret = parse_raid_params(rs, argv, (unsigned)num_raid_params); + if (ret) + goto bad; + + ret = -EINVAL; + + argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */ + argv += num_raid_params + 1; + + if (argc != (num_raid_devs * 2)) { + ti->error = "Supplied RAID devices does not match the count given"; + goto bad; + } + + ret = dev_parms(rs, argv); + if (ret) + goto bad; + + INIT_WORK(&rs->md.event_work, do_table_event); + ti->split_io = rs->md.chunk_sectors; + ti->private = rs; + + mutex_lock(&rs->md.reconfig_mutex); + ret = md_run(&rs->md); + rs->md.in_sync = 0; /* Assume already marked dirty */ + mutex_unlock(&rs->md.reconfig_mutex); + + if (ret) { + ti->error = "Fail to run raid array"; + goto bad; + } + + rs->callbacks.congested_fn = raid_is_congested; + rs->callbacks.unplug_fn = raid_unplug; + dm_table_add_target_callbacks(ti->table, &rs->callbacks); + + return 0; + +bad: + context_free(rs); + + return ret; +} + +static void raid_dtr(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + list_del_init(&rs->callbacks.list); + md_stop(&rs->md); + context_free(rs); +} + +static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) +{ + struct raid_set *rs = ti->private; + mddev_t *mddev = &rs->md; + + mddev->pers->make_request(mddev, bio); + + return DM_MAPIO_SUBMITTED; +} + +static int raid_status(struct dm_target *ti, status_type_t type, + char *result, unsigned maxlen) +{ + struct raid_set *rs = ti->private; + unsigned raid_param_cnt = 1; /* at least 1 for chunksize */ + unsigned sz = 0; + int i; + sector_t sync; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks); + + for (i = 0; i < rs->md.raid_disks; i++) { + if (test_bit(Faulty, &rs->dev[i].rdev.flags)) + DMEMIT("D"); + else if (test_bit(In_sync, &rs->dev[i].rdev.flags)) + DMEMIT("A"); + else + DMEMIT("a"); + } + + if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery)) + sync = rs->md.curr_resync_completed; + else + sync = rs->md.recovery_cp; + + if (sync > rs->md.resync_max_sectors) + sync = rs->md.resync_max_sectors; + + DMEMIT(" %llu/%llu", + (unsigned long long) sync, + (unsigned long long) rs->md.resync_max_sectors); + + break; + case STATUSTYPE_TABLE: + /* The string you would use to construct this array */ + for (i = 0; i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev && + !test_bit(In_sync, &rs->dev[i].rdev.flags)) + raid_param_cnt++; /* for rebuilds */ + + raid_param_cnt += (hweight64(rs->print_flags) * 2); + if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) + raid_param_cnt--; + + DMEMIT("%s %u %u", rs->raid_type->name, + raid_param_cnt, rs->md.chunk_sectors); + + if ((rs->print_flags & DMPF_SYNC) && + (rs->md.recovery_cp == MaxSector)) + DMEMIT(" sync"); + if (rs->print_flags & DMPF_NOSYNC) + DMEMIT(" nosync"); + + for (i = 0; i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev && + !test_bit(In_sync, &rs->dev[i].rdev.flags)) + DMEMIT(" rebuild %u", i); + + if (rs->print_flags & DMPF_DAEMON_SLEEP) + DMEMIT(" daemon_sleep %lu", + rs->md.bitmap_info.daemon_sleep); + + if (rs->print_flags & DMPF_MIN_RECOVERY_RATE) + DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min); + + if (rs->print_flags & DMPF_MAX_RECOVERY_RATE) + DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); + + if (rs->print_flags & DMPF_MAX_WRITE_BEHIND) + DMEMIT(" max_write_behind %lu", + rs->md.bitmap_info.max_write_behind); + + if (rs->print_flags & DMPF_STRIPE_CACHE) { + raid5_conf_t *conf = rs->md.private; + + /* convert from kiB to sectors */ + DMEMIT(" stripe_cache %d", + conf ? conf->max_nr_stripes * 2 : 0); + } + + DMEMIT(" %d", rs->md.raid_disks); + for (i = 0; i < rs->md.raid_disks; i++) { + DMEMIT(" -"); /* metadata device */ + + if (rs->dev[i].data_dev) + DMEMIT(" %s", rs->dev[i].data_dev->name); + else + DMEMIT(" -"); + } + } + + return 0; +} + +static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) +{ + struct raid_set *rs = ti->private; + unsigned i; + int ret = 0; + + for (i = 0; !ret && i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev) + ret = fn(ti, + rs->dev[i].data_dev, + 0, /* No offset on data devs */ + rs->md.dev_sectors, + data); + + return ret; +} + +static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct raid_set *rs = ti->private; + unsigned chunk_size = rs->md.chunk_sectors << 9; + raid5_conf_t *conf = rs->md.private; + + blk_limits_io_min(limits, chunk_size); + blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded)); +} + +static void raid_presuspend(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + md_stop_writes(&rs->md); +} + +static void raid_postsuspend(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + mddev_suspend(&rs->md); +} + +static void raid_resume(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + mddev_resume(&rs->md); +} + +static struct target_type raid_target = { + .name = "raid", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = raid_ctr, + .dtr = raid_dtr, + .map = raid_map, + .status = raid_status, + .iterate_devices = raid_iterate_devices, + .io_hints = raid_io_hints, + .presuspend = raid_presuspend, + .postsuspend = raid_postsuspend, + .resume = raid_resume, +}; + +static int __init dm_raid_init(void) +{ + return dm_register_target(&raid_target); +} + +static void __exit dm_raid_exit(void) +{ + dm_unregister_target(&raid_target); +} + +module_init(dm_raid_init); +module_exit(dm_raid_exit); + +MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); +MODULE_ALIAS("dm-raid4"); +MODULE_ALIAS("dm-raid5"); +MODULE_ALIAS("dm-raid6"); +MODULE_AUTHOR("Neil Brown "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 19a59b0..dee3267 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -261,7 +261,7 @@ static int mirror_flush(struct dm_target *ti) struct dm_io_request io_req = { .bi_rw = WRITE_FLUSH, .mem.type = DM_IO_KMEM, - .mem.ptr.bvec = NULL, + .mem.ptr.addr = NULL, .client = ms->io_client, }; @@ -637,6 +637,12 @@ static void do_write(struct mirror_set *ms, struct bio *bio) .client = ms->io_client, }; + if (bio->bi_rw & REQ_DISCARD) { + io_req.bi_rw |= REQ_DISCARD; + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = NULL; + } + for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) map_region(dest++, m, bio); @@ -670,7 +676,8 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - if (bio->bi_rw & REQ_FLUSH) { + if ((bio->bi_rw & REQ_FLUSH) || + (bio->bi_rw & REQ_DISCARD)) { bio_list_add(&sync, bio); continue; } @@ -1076,8 +1083,10 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = ms; ti->split_io = dm_rh_get_region_size(ms->rh); ti->num_flush_requests = 1; + ti->num_discard_requests = 1; - ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); + ms->kmirrord_wq = alloc_workqueue("kmirrord", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!ms->kmirrord_wq) { DMERR("couldn't start kmirrord"); r = -ENOMEM; @@ -1130,7 +1139,7 @@ static void mirror_dtr(struct dm_target *ti) del_timer_sync(&ms->timer); flush_workqueue(ms->kmirrord_wq); - flush_scheduled_work(); + flush_work_sync(&ms->trigger_event); dm_kcopyd_client_destroy(ms->kcopyd_client); destroy_workqueue(ms->kmirrord_wq); free_context(ms, ti, ms->nr_mirrors); @@ -1406,7 +1415,7 @@ static int mirror_iterate_devices(struct dm_target *ti, static struct target_type mirror_target = { .name = "mirror", - .version = {1, 12, 0}, + .version = {1, 12, 1}, .module = THIS_MODULE, .ctr = mirror_ctr, .dtr = mirror_dtr, diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 2129cdb..95891df 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -256,7 +256,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, */ INIT_WORK_ONSTACK(&req.work, do_metadata); queue_work(ps->metadata_wq, &req.work); - flush_workqueue(ps->metadata_wq); + flush_work(&req.work); return req.result; } @@ -818,7 +818,7 @@ static int persistent_ctr(struct dm_exception_store *store, atomic_set(&ps->pending_count, 0); ps->callbacks = NULL; - ps->metadata_wq = create_singlethread_workqueue("ksnaphd"); + ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0); if (!ps->metadata_wq) { kfree(ps); DMERR("couldn't start header metadata update thread"); diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 53cf79d..fdde53c 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -19,7 +19,6 @@ #include #include #include -#include #include "dm-exception-store.h" @@ -80,9 +79,6 @@ struct dm_snapshot { /* Origin writes don't trigger exceptions until this is set */ int active; - /* Whether or not owning mapped_device is suspended */ - int suspended; - atomic_t pending_exceptions_count; mempool_t *pending_pool; @@ -106,10 +102,6 @@ struct dm_snapshot { struct dm_kcopyd_client *kcopyd_client; - /* Queue of snapshot writes for ksnapd to flush */ - struct bio_list queued_bios; - struct work_struct queued_bios_work; - /* Wait for events based on state_bits */ unsigned long state_bits; @@ -160,9 +152,6 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *s) } EXPORT_SYMBOL(dm_snap_cow); -static struct workqueue_struct *ksnapd; -static void flush_queued_bios(struct work_struct *work); - static sector_t chunk_to_sector(struct dm_exception_store *store, chunk_t chunk) { @@ -1110,7 +1099,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->ti = ti; s->valid = 1; s->active = 0; - s->suspended = 0; atomic_set(&s->pending_exceptions_count, 0); init_rwsem(&s->lock); INIT_LIST_HEAD(&s->list); @@ -1153,9 +1141,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) spin_lock_init(&s->tracked_chunk_lock); - bio_list_init(&s->queued_bios); - INIT_WORK(&s->queued_bios_work, flush_queued_bios); - ti->private = s; ti->num_flush_requests = num_flush_requests; @@ -1279,8 +1264,6 @@ static void snapshot_dtr(struct dm_target *ti) struct dm_snapshot *s = ti->private; struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; - flush_workqueue(ksnapd); - down_read(&_origins_lock); /* Check whether exception handover must be cancelled */ (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); @@ -1342,20 +1325,6 @@ static void flush_bios(struct bio *bio) } } -static void flush_queued_bios(struct work_struct *work) -{ - struct dm_snapshot *s = - container_of(work, struct dm_snapshot, queued_bios_work); - struct bio *queued_bios; - unsigned long flags; - - spin_lock_irqsave(&s->pe_lock, flags); - queued_bios = bio_list_get(&s->queued_bios); - spin_unlock_irqrestore(&s->pe_lock, flags); - - flush_bios(queued_bios); -} - static int do_origin(struct dm_dev *origin, struct bio *bio); /* @@ -1760,15 +1729,6 @@ static void snapshot_merge_presuspend(struct dm_target *ti) stop_merge(s); } -static void snapshot_postsuspend(struct dm_target *ti) -{ - struct dm_snapshot *s = ti->private; - - down_write(&s->lock); - s->suspended = 1; - up_write(&s->lock); -} - static int snapshot_preresume(struct dm_target *ti) { int r = 0; @@ -1783,7 +1743,7 @@ static int snapshot_preresume(struct dm_target *ti) DMERR("Unable to resume snapshot source until " "handover completes."); r = -EINVAL; - } else if (!snap_src->suspended) { + } else if (!dm_suspended(snap_src->ti)) { DMERR("Unable to perform snapshot handover until " "source is suspended."); r = -EINVAL; @@ -1816,7 +1776,6 @@ static void snapshot_resume(struct dm_target *ti) down_write(&s->lock); s->active = 1; - s->suspended = 0; up_write(&s->lock); } @@ -2194,7 +2153,7 @@ static int origin_iterate_devices(struct dm_target *ti, static struct target_type origin_target = { .name = "snapshot-origin", - .version = {1, 7, 0}, + .version = {1, 7, 1}, .module = THIS_MODULE, .ctr = origin_ctr, .dtr = origin_dtr, @@ -2207,13 +2166,12 @@ static struct target_type origin_target = { static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 9, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, .map = snapshot_map, .end_io = snapshot_end_io, - .postsuspend = snapshot_postsuspend, .preresume = snapshot_preresume, .resume = snapshot_resume, .status = snapshot_status, @@ -2222,14 +2180,13 @@ static struct target_type snapshot_target = { static struct target_type merge_target = { .name = dm_snapshot_merge_target_name, - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, .map = snapshot_merge_map, .end_io = snapshot_end_io, .presuspend = snapshot_merge_presuspend, - .postsuspend = snapshot_postsuspend, .preresume = snapshot_preresume, .resume = snapshot_merge_resume, .status = snapshot_status, @@ -2291,17 +2248,8 @@ static int __init dm_snapshot_init(void) goto bad_tracked_chunk_cache; } - ksnapd = create_singlethread_workqueue("ksnapd"); - if (!ksnapd) { - DMERR("Failed to create ksnapd workqueue."); - r = -ENOMEM; - goto bad_pending_pool; - } - return 0; -bad_pending_pool: - kmem_cache_destroy(tracked_chunk_cache); bad_tracked_chunk_cache: kmem_cache_destroy(pending_cache); bad_pending_cache: @@ -2322,8 +2270,6 @@ bad_register_snapshot_target: static void __exit dm_snapshot_exit(void) { - destroy_workqueue(ksnapd); - dm_unregister_target(&snapshot_target); dm_unregister_target(&origin_target); dm_unregister_target(&merge_target); diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index f0371b4..dddfa14 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -39,23 +39,20 @@ struct stripe_c { struct dm_target *ti; /* Work struct used for triggering events*/ - struct work_struct kstriped_ws; + struct work_struct trigger_event; struct stripe stripe[0]; }; -static struct workqueue_struct *kstriped; - /* * An event is triggered whenever a drive * drops out of a stripe volume. */ static void trigger_event(struct work_struct *work) { - struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws); - + struct stripe_c *sc = container_of(work, struct stripe_c, + trigger_event); dm_table_event(sc->ti->table); - } static inline struct stripe_c *alloc_context(unsigned int stripes) @@ -160,7 +157,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -ENOMEM; } - INIT_WORK(&sc->kstriped_ws, trigger_event); + INIT_WORK(&sc->trigger_event, trigger_event); /* Set pointer to dm target; used in trigger_event */ sc->ti = ti; @@ -211,7 +208,7 @@ static void stripe_dtr(struct dm_target *ti) for (i = 0; i < sc->stripes; i++) dm_put_device(ti, sc->stripe[i].dev); - flush_workqueue(kstriped); + flush_work_sync(&sc->trigger_event); kfree(sc); } @@ -367,7 +364,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, atomic_inc(&(sc->stripe[i].error_count)); if (atomic_read(&(sc->stripe[i].error_count)) < DM_IO_ERROR_THRESHOLD) - queue_work(kstriped, &sc->kstriped_ws); + schedule_work(&sc->trigger_event); } return error; @@ -401,7 +398,7 @@ static void stripe_io_hints(struct dm_target *ti, static struct target_type stripe_target = { .name = "striped", - .version = {1, 3, 0}, + .version = {1, 3, 1}, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, @@ -422,20 +419,10 @@ int __init dm_stripe_init(void) return r; } - kstriped = create_singlethread_workqueue("kstriped"); - if (!kstriped) { - DMERR("failed to create workqueue kstriped"); - dm_unregister_target(&stripe_target); - return -ENOMEM; - } - return r; } void dm_stripe_exit(void) { dm_unregister_target(&stripe_target); - destroy_workqueue(kstriped); - - return; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 985c20a..dffa0ac 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -71,6 +71,8 @@ struct dm_table { void *event_context; struct dm_md_mempools *mempools; + + struct list_head target_callbacks; }; /* @@ -204,6 +206,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode, return -ENOMEM; INIT_LIST_HEAD(&t->devices); + INIT_LIST_HEAD(&t->target_callbacks); atomic_set(&t->holders, 0); t->discards_supported = 1; @@ -1225,10 +1228,17 @@ int dm_table_resume_targets(struct dm_table *t) return 0; } +void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb) +{ + list_add(&cb->list, &t->target_callbacks); +} +EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks); + int dm_table_any_congested(struct dm_table *t, int bdi_bits) { struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); + struct dm_target_callbacks *cb; int r = 0; list_for_each_entry(dd, devices, list) { @@ -1243,6 +1253,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) bdevname(dd->dm_dev.bdev, b)); } + list_for_each_entry(cb, &t->target_callbacks, list) + if (cb->congested_fn) + r |= cb->congested_fn(cb, bdi_bits); + return r; } @@ -1264,6 +1278,7 @@ void dm_table_unplug_all(struct dm_table *t) { struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); + struct dm_target_callbacks *cb; list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); @@ -1276,6 +1291,10 @@ void dm_table_unplug_all(struct dm_table *t) dm_device_name(t->md), bdevname(dd->dm_dev.bdev, b)); } + + list_for_each_entry(cb, &t->target_callbacks, list) + if (cb->unplug_fn) + cb->unplug_fn(cb); } struct mapped_device *dm_table_get_md(struct dm_table *t) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f48a2f3..eaa3af0 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -32,7 +32,6 @@ #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" #define DM_COOKIE_LENGTH 24 -static DEFINE_MUTEX(dm_mutex); static const char *_name = DM_NAME; static unsigned int major = 0; @@ -328,7 +327,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) { struct mapped_device *md; - mutex_lock(&dm_mutex); spin_lock(&_minor_lock); md = bdev->bd_disk->private_data; @@ -346,7 +344,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) out: spin_unlock(&_minor_lock); - mutex_unlock(&dm_mutex); return md ? 0 : -ENXIO; } @@ -355,10 +352,12 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode) { struct mapped_device *md = disk->private_data; - mutex_lock(&dm_mutex); + spin_lock(&_minor_lock); + atomic_dec(&md->open_count); dm_put(md); - mutex_unlock(&dm_mutex); + + spin_unlock(&_minor_lock); return 0; } @@ -1638,13 +1637,15 @@ static void dm_request_fn(struct request_queue *q) if (map_request(ti, clone, md)) goto requeued; - spin_lock_irq(q->queue_lock); + BUG_ON(!irqs_disabled()); + spin_lock(q->queue_lock); } goto out; requeued: - spin_lock_irq(q->queue_lock); + BUG_ON(!irqs_disabled()); + spin_lock(q->queue_lock); plug_and_out: if (!elv_queue_empty(q)) @@ -1884,7 +1885,8 @@ static struct mapped_device *alloc_dev(int minor) add_disk(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); - md->wq = create_singlethread_workqueue("kdmflush"); + md->wq = alloc_workqueue("kdmflush", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!md->wq) goto bad_thread; @@ -1992,13 +1994,14 @@ static void event_callback(void *context) wake_up(&md->eventq); } +/* + * Protected by md->suspend_lock obtained by dm_swap_table(). + */ static void __set_size(struct mapped_device *md, sector_t size) { set_capacity(md->disk, size); - mutex_lock(&md->bdev->bd_inode->i_mutex); i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); - mutex_unlock(&md->bdev->bd_inode->i_mutex); } /* diff --git a/drivers/md/md.c b/drivers/md/md.c index 7fc090a..cf8594c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -288,10 +288,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio) int rv; int cpu; - if (mddev == NULL || mddev->pers == NULL) { + if (mddev == NULL || mddev->pers == NULL + || !mddev->ready) { bio_io_error(bio); return 0; } + smp_rmb(); /* Ensure implications of 'active' are visible */ rcu_read_lock(); if (mddev->suspended) { DEFINE_WAIT(__wait); @@ -703,9 +705,9 @@ static struct mdk_personality *find_pers(int level, char *clevel) } /* return the offset of the super block in 512byte sectors */ -static inline sector_t calc_dev_sboffset(struct block_device *bdev) +static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev) { - sector_t num_sectors = i_size_read(bdev->bd_inode) / 512; + sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; return MD_NEW_SIZE_SECTORS(num_sectors); } @@ -763,7 +765,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, */ struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); - bio->bi_bdev = rdev->bdev; + bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; bio->bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; @@ -793,7 +795,7 @@ static void bi_complete(struct bio *bio, int error) } int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, - struct page *page, int rw) + struct page *page, int rw, bool metadata_op) { struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); struct completion event; @@ -801,8 +803,12 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, rw |= REQ_SYNC | REQ_UNPLUG; - bio->bi_bdev = rdev->bdev; - bio->bi_sector = sector; + bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? + rdev->meta_bdev : rdev->bdev; + if (metadata_op) + bio->bi_sector = sector + rdev->sb_start; + else + bio->bi_sector = sector + rdev->data_offset; bio_add_page(bio, page, size, 0); init_completion(&event); bio->bi_private = &event; @@ -827,7 +833,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size) return 0; - if (!sync_page_io(rdev, rdev->sb_start, size, rdev->sb_page, READ)) + if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true)) goto fail; rdev->sb_loaded = 1; return 0; @@ -989,7 +995,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version * * It also happens to be a multiple of 4Kb. */ - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); ret = read_disk_sb(rdev, MD_SB_BYTES); if (ret) return ret; @@ -1330,7 +1336,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) return 0; /* component must fit device */ if (rdev->mddev->bitmap_info.offset) return 0; /* can't move bitmap */ - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); if (!num_sectors || num_sectors > rdev->sb_start) num_sectors = rdev->sb_start; md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, @@ -2465,6 +2471,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (rdev2->raid_disk == slot) return -EEXIST; + if (slot >= rdev->mddev->raid_disks && + slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) + return -ENOSPC; + rdev->raid_disk = slot; if (test_bit(In_sync, &rdev->flags)) rdev->saved_raid_disk = slot; @@ -2482,7 +2492,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) /* failure here is OK */; /* don't wakeup anyone, leave that to userspace. */ } else { - if (slot >= rdev->mddev->raid_disks) + if (slot >= rdev->mddev->raid_disks && + slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) return -ENOSPC; rdev->raid_disk = slot; /* assume it is working */ @@ -3107,7 +3118,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) char nm[20]; if (rdev->raid_disk < 0) continue; - if (rdev->new_raid_disk > mddev->raid_disks) + if (rdev->new_raid_disk >= mddev->raid_disks) rdev->new_raid_disk = -1; if (rdev->new_raid_disk == rdev->raid_disk) continue; @@ -3736,6 +3747,8 @@ action_show(mddev_t *mddev, char *page) return sprintf(page, "%s\n", type); } +static void reap_sync_thread(mddev_t *mddev); + static ssize_t action_store(mddev_t *mddev, const char *page, size_t len) { @@ -3750,9 +3763,7 @@ action_store(mddev_t *mddev, const char *page, size_t len) if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - mddev->recovery = 0; + reap_sync_thread(mddev); } } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) @@ -3904,7 +3915,7 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); static ssize_t sync_completed_show(mddev_t *mddev, char *page) { - unsigned long max_sectors, resync; + unsigned long long max_sectors, resync; if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); @@ -3915,7 +3926,7 @@ sync_completed_show(mddev_t *mddev, char *page) max_sectors = mddev->dev_sectors; resync = mddev->curr_resync_completed; - return sprintf(page, "%lu / %lu\n", resync, max_sectors); + return sprintf(page, "%llu / %llu\n", resync, max_sectors); } static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); @@ -4002,19 +4013,24 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); + unsigned long long old = mddev->suspend_lo; if (mddev->pers == NULL || mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; - if (new >= mddev->suspend_hi || - (new > mddev->suspend_lo && new < mddev->suspend_hi)) { - mddev->suspend_lo = new; + + mddev->suspend_lo = new; + if (new >= old) + /* Shrinking suspended region */ mddev->pers->quiesce(mddev, 2); - return len; - } else - return -EINVAL; + else { + /* Expanding suspended region - need to wait */ + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + return len; } static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); @@ -4031,20 +4047,24 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); + unsigned long long old = mddev->suspend_hi; if (mddev->pers == NULL || mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; - if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || - (new > mddev->suspend_lo && new > mddev->suspend_hi)) { - mddev->suspend_hi = new; + + mddev->suspend_hi = new; + if (new <= old) + /* Shrinking suspended region */ + mddev->pers->quiesce(mddev, 2); + else { + /* Expanding suspended region - need to wait */ mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); - return len; - } else - return -EINVAL; + } + return len; } static struct md_sysfs_entry md_suspend_hi = __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); @@ -4422,7 +4442,9 @@ int md_run(mddev_t *mddev) * We don't want the data to overlap the metadata, * Internal Bitmap issues have been handled elsewhere. */ - if (rdev->data_offset < rdev->sb_start) { + if (rdev->meta_bdev) { + /* Nothing to check */; + } else if (rdev->data_offset < rdev->sb_start) { if (mddev->dev_sectors && rdev->data_offset + mddev->dev_sectors > rdev->sb_start) { @@ -4556,7 +4578,8 @@ int md_run(mddev_t *mddev) mddev->safemode_timer.data = (unsigned long) mddev; mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; - + smp_wmb(); + mddev->ready = 1; list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0) { char nm[20]; @@ -4693,13 +4716,12 @@ static void md_clean(mddev_t *mddev) mddev->plug = NULL; } -void md_stop_writes(mddev_t *mddev) +static void __md_stop_writes(mddev_t *mddev) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; + reap_sync_thread(mddev); } del_timer_sync(&mddev->safemode_timer); @@ -4713,10 +4735,18 @@ void md_stop_writes(mddev_t *mddev) md_update_sb(mddev, 1); } } + +void md_stop_writes(mddev_t *mddev) +{ + mddev_lock(mddev); + __md_stop_writes(mddev); + mddev_unlock(mddev); +} EXPORT_SYMBOL_GPL(md_stop_writes); void md_stop(mddev_t *mddev) { + mddev->ready = 0; mddev->pers->stop(mddev); if (mddev->pers->sync_request && mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; @@ -4736,7 +4766,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open) goto out; } if (mddev->pers) { - md_stop_writes(mddev); + __md_stop_writes(mddev); err = -ENXIO; if (mddev->ro==1) @@ -4773,7 +4803,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) if (mddev->ro) set_disk_ro(disk, 0); - md_stop_writes(mddev); + __md_stop_writes(mddev); md_stop(mddev); mddev->queue->merge_bvec_fn = NULL; mddev->queue->unplug_fn = NULL; @@ -5151,9 +5181,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) /* set saved_raid_disk if appropriate */ if (!mddev->persistent) { if (info->state & (1<raid_disk < mddev->raid_disks) + info->raid_disk < mddev->raid_disks) { rdev->raid_disk = info->raid_disk; - else + set_bit(In_sync, &rdev->flags); + } else rdev->raid_disk = -1; } else super_types[mddev->major_version]. @@ -5230,7 +5261,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) printk(KERN_INFO "md: nonpersistent superblock ...\n"); rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; } else - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); rdev->sectors = rdev->sb_start; err = bind_rdev_to_array(rdev, mddev); @@ -5297,7 +5328,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) } if (mddev->persistent) - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); else rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; @@ -5510,7 +5541,6 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) * sb_start or, if that is sync_thread) return -EBUSY; @@ -6033,7 +6063,8 @@ static int md_thread(void * arg) || kthread_should_stop(), thread->timeout); - if (test_and_clear_bit(THREAD_WAKEUP, &thread->flags)) + clear_bit(THREAD_WAKEUP, &thread->flags); + if (!kthread_should_stop()) thread->run(thread->mddev); } @@ -6799,7 +6830,7 @@ void md_do_sync(mddev_t *mddev) desc, mdname(mddev)); mddev->curr_resync = j; } - mddev->curr_resync_completed = mddev->curr_resync; + mddev->curr_resync_completed = j; while (j < max_sectors) { sector_t sectors; @@ -6817,8 +6848,7 @@ void md_do_sync(mddev_t *mddev) md_unplug(mddev); wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active) == 0); - mddev->curr_resync_completed = - mddev->curr_resync; + mddev->curr_resync_completed = j; set_bit(MD_CHANGE_CLEAN, &mddev->flags); sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } @@ -7023,6 +7053,45 @@ static int remove_and_add_spares(mddev_t *mddev) } return spares; } + +static void reap_sync_thread(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + + /* resync has finished, collect result */ + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + /* success...*/ + /* activate any spares */ + if (mddev->pers->spare_active(mddev)) + sysfs_notify(&mddev->kobj, NULL, + "degraded"); + } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->pers->finish_reshape) + mddev->pers->finish_reshape(mddev); + md_update_sb(mddev, 1); + + /* if array is no-longer degraded, then any saved_raid_disk + * information must be scrapped + */ + if (!mddev->degraded) + list_for_each_entry(rdev, &mddev->disks, same_set) + rdev->saved_raid_disk = -1; + + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + /* flag recovery needed just to double check */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_action); + md_new_event(mddev); +} + /* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. @@ -7047,9 +7116,6 @@ static int remove_and_add_spares(mddev_t *mddev) */ void md_check_recovery(mddev_t *mddev) { - mdk_rdev_t *rdev; - - if (mddev->bitmap) bitmap_daemon_work(mddev); @@ -7117,34 +7183,7 @@ void md_check_recovery(mddev_t *mddev) goto unlock; } if (mddev->sync_thread) { - /* resync has finished, collect result */ - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - /* success...*/ - /* activate any spares */ - if (mddev->pers->spare_active(mddev)) - sysfs_notify(&mddev->kobj, NULL, - "degraded"); - } - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - mddev->pers->finish_reshape) - mddev->pers->finish_reshape(mddev); - md_update_sb(mddev, 1); - - /* if array is no-longer degraded, then any saved_raid_disk - * information must be scrapped - */ - if (!mddev->degraded) - list_for_each_entry(rdev, &mddev->disks, same_set) - rdev->saved_raid_disk = -1; - - mddev->recovery = 0; - /* flag recovery needed just to double check */ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - sysfs_notify_dirent_safe(mddev->sysfs_action); - md_new_event(mddev); + reap_sync_thread(mddev); goto unlock; } /* Set RUNNING before clearing NEEDED to avoid @@ -7202,7 +7241,11 @@ void md_check_recovery(mddev_t *mddev) " thread...\n", mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ - mddev->recovery = 0; + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); } else md_wakeup_thread(mddev->sync_thread); sysfs_notify_dirent_safe(mddev->sysfs_action); diff --git a/drivers/md/md.h b/drivers/md/md.h index d05bab5..eec517c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -60,6 +60,12 @@ struct mdk_rdev_s mddev_t *mddev; /* RAID array if running */ int last_events; /* IO event timestamp */ + /* + * If meta_bdev is non-NULL, it means that a separate device is + * being used to store the metadata (superblock/bitmap) which + * would otherwise be contained on the same device as the data (bdev). + */ + struct block_device *meta_bdev; struct block_device *bdev; /* block device handle */ struct page *sb_page; @@ -148,7 +154,8 @@ struct mddev_s * are happening, so run/ * takeover/stop are not safe */ - + int ready; /* See when safe to pass + * IO requests down */ struct gendisk *gendisk; struct kobject kobj; @@ -497,8 +504,8 @@ extern void md_flush_request(mddev_t *mddev, struct bio *bio); extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page); extern void md_super_wait(mddev_t *mddev); -extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, - struct page *page, int rw); +extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, + struct page *page, int rw, bool metadata_op); extern void md_do_sync(mddev_t *mddev); extern void md_new_event(mddev_t *mddev); extern int md_allow_write(mddev_t *mddev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 845cf95..a23ffa3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1027,8 +1027,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } else set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" - KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", + printk(KERN_ALERT + "md/raid1:%s: Disk failure on %s, disabling device.\n" + "md/raid1:%s: Operation continuing on %d devices.\n", mdname(mddev), bdevname(rdev->bdev, b), mdname(mddev), conf->raid_disks - mddev->degraded); } @@ -1364,10 +1365,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) */ rdev = conf->mirrors[d].rdev; if (sync_page_io(rdev, - sect + rdev->data_offset, + sect, s<<9, bio->bi_io_vec[idx].bv_page, - READ)) { + READ, false)) { success = 1; break; } @@ -1390,10 +1391,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) rdev = conf->mirrors[d].rdev; atomic_add(s, &rdev->corrected_errors); if (sync_page_io(rdev, - sect + rdev->data_offset, + sect, s<<9, bio->bi_io_vec[idx].bv_page, - WRITE) == 0) + WRITE, false) == 0) md_error(mddev, rdev); } d = start; @@ -1405,10 +1406,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) continue; rdev = conf->mirrors[d].rdev; if (sync_page_io(rdev, - sect + rdev->data_offset, + sect, s<<9, bio->bi_io_vec[idx].bv_page, - READ) == 0) + READ, false) == 0) md_error(mddev, rdev); } } else { @@ -1488,10 +1489,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags) && - sync_page_io(rdev, - sect + rdev->data_offset, - s<<9, - conf->tmppage, READ)) + sync_page_io(rdev, sect, s<<9, + conf->tmppage, READ, false)) success = 1; else { d++; @@ -1514,9 +1513,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) + if (sync_page_io(rdev, sect, s<<9, + conf->tmppage, WRITE, false) == 0) /* Well, this device is dead */ md_error(mddev, rdev); @@ -1531,9 +1529,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, READ) + if (sync_page_io(rdev, sect, s<<9, + conf->tmppage, READ, false) == 0) /* Well, this device is dead */ md_error(mddev, rdev); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0641674..69b6595 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1051,8 +1051,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" - KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n", + printk(KERN_ALERT + "md/raid10:%s: Disk failure on %s, disabling device.\n" + "md/raid10:%s: Operation continuing on %d devices.\n", mdname(mddev), bdevname(rdev->bdev, b), mdname(mddev), conf->raid_disks - mddev->degraded); } @@ -1559,9 +1560,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_unlock(); success = sync_page_io(rdev, r10_bio->devs[sl].addr + - sect + rdev->data_offset, + sect, s<<9, - conf->tmppage, READ); + conf->tmppage, READ, false); rdev_dec_pending(rdev, mddev); rcu_read_lock(); if (success) @@ -1598,8 +1599,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) atomic_add(s, &rdev->corrected_errors); if (sync_page_io(rdev, r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) + sect, + s<<9, conf->tmppage, WRITE, false) == 0) { /* Well, this device is dead */ printk(KERN_NOTICE @@ -1635,9 +1636,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_unlock(); if (sync_page_io(rdev, r10_bio->devs[sl].addr + - sect + rdev->data_offset, + sect, s<<9, conf->tmppage, - READ) == 0) { + READ, false) == 0) { /* Well, this device is dead */ printk(KERN_NOTICE "md/raid10:%s: unable to read back " diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index dc574f3..5044bab 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1721,7 +1721,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) set_bit(Faulty, &rdev->flags); printk(KERN_ALERT "md/raid:%s: Disk failure on %s, disabling device.\n" - KERN_ALERT "md/raid:%s: Operation continuing on %d devices.\n", mdname(mddev), bdevname(rdev->bdev, b), @@ -4237,7 +4236,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes)==0); mddev->reshape_position = conf->reshape_progress; - mddev->curr_resync_completed = mddev->curr_resync; + mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); @@ -4338,7 +4337,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes) == 0); mddev->reshape_position = conf->reshape_progress; - mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors; + mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); @@ -5339,7 +5338,7 @@ static int raid5_spare_active(mddev_t *mddev) && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { count++; - sysfs_notify_dirent(tmp->rdev->sysfs_state); + sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); } } spin_lock_irqsave(&conf->device_lock, flags); @@ -5528,8 +5527,8 @@ static int raid5_start_reshape(mddev_t *mddev) return -ENOSPC; list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk < 0 && - !test_bit(Faulty, &rdev->flags)) + if ((rdev->raid_disk < 0 || rdev->raid_disk >= conf->raid_disks) + && !test_bit(Faulty, &rdev->flags)) spares++; if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) @@ -5589,6 +5588,11 @@ static int raid5_start_reshape(mddev_t *mddev) /* Failure here is OK */; } else break; + } else if (rdev->raid_disk >= conf->previous_raid_disks + && !test_bit(Faulty, &rdev->flags)) { + /* This is a spare that was manually added */ + set_bit(In_sync, &rdev->flags); + added_devices++; } /* When a reshape changes the number of devices, ->degraded diff --git a/drivers/platform/x86/fujitsu-laptop.c b/drivers/platform/x86/fujitsu-laptop.c index 19e92b2..95e3b09 100644 --- a/drivers/platform/x86/fujitsu-laptop.c +++ b/drivers/platform/x86/fujitsu-laptop.c @@ -689,7 +689,7 @@ static int acpi_fujitsu_add(struct acpi_device *device) if (error) goto err_free_input_dev; - result = acpi_bus_get_power(fujitsu->acpi_handle, &state); + result = acpi_bus_update_power(fujitsu->acpi_handle, &state); if (result) { printk(KERN_ERR "Error reading power state\n"); goto err_unregister_input_dev; @@ -857,7 +857,7 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device) if (error) goto err_free_input_dev; - result = acpi_bus_get_power(fujitsu_hotkey->acpi_handle, &state); + result = acpi_bus_update_power(fujitsu_hotkey->acpi_handle, &state); if (result) { printk(KERN_ERR "Error reading power state\n"); goto err_unregister_input_dev; diff --git a/drivers/pnp/Makefile b/drivers/pnp/Makefile index 8de3775..bfba893 100644 --- a/drivers/pnp/Makefile +++ b/drivers/pnp/Makefile @@ -2,11 +2,13 @@ # Makefile for the Linux Plug-and-Play Support. # -obj-y := core.o card.o driver.o resource.o manager.o support.o interface.o quirks.o +obj-y := pnp.o + +pnp-y := core.o card.o driver.o resource.o manager.o support.o interface.o quirks.o obj-$(CONFIG_PNPACPI) += pnpacpi/ obj-$(CONFIG_PNPBIOS) += pnpbios/ obj-$(CONFIG_ISAPNP) += isapnp/ # pnp_system_init goes after pnpacpi/pnpbios init -obj-y += system.o +pnp-y += system.o diff --git a/drivers/pnp/core.c b/drivers/pnp/core.c index 0f34d96..cb6ce42 100644 --- a/drivers/pnp/core.c +++ b/drivers/pnp/core.c @@ -220,10 +220,5 @@ subsys_initcall(pnp_init); int pnp_debug; #if defined(CONFIG_PNP_DEBUG_MESSAGES) -static int __init pnp_debug_setup(char *__unused) -{ - pnp_debug = 1; - return 1; -} -__setup("pnp.debug", pnp_debug_setup); +module_param_named(debug, pnp_debug, int, 0644); #endif diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c index d1dbb9d..00e9403 100644 --- a/drivers/pnp/driver.c +++ b/drivers/pnp/driver.c @@ -189,8 +189,11 @@ static int pnp_bus_resume(struct device *dev) if (!pnp_drv) return 0; - if (pnp_dev->protocol->resume) - pnp_dev->protocol->resume(pnp_dev); + if (pnp_dev->protocol->resume) { + error = pnp_dev->protocol->resume(pnp_dev); + if (error) + return error; + } if (pnp_can_write(pnp_dev)) { error = pnp_start_dev(pnp_dev); diff --git a/drivers/pnp/isapnp/Makefile b/drivers/pnp/isapnp/Makefile index cac18bb..6e607aa 100644 --- a/drivers/pnp/isapnp/Makefile +++ b/drivers/pnp/isapnp/Makefile @@ -1,7 +1,7 @@ # # Makefile for the kernel ISAPNP driver. # +obj-y += pnp.o +pnp-y := core.o compat.o -isapnp-proc-$(CONFIG_PROC_FS) = proc.o - -obj-y := core.o compat.o $(isapnp-proc-y) +pnp-$(CONFIG_PROC_FS) += proc.o diff --git a/drivers/pnp/pnpacpi/Makefile b/drivers/pnp/pnpacpi/Makefile index 905326f..40c93da 100644 --- a/drivers/pnp/pnpacpi/Makefile +++ b/drivers/pnp/pnpacpi/Makefile @@ -1,5 +1,6 @@ # # Makefile for the kernel PNPACPI driver. # +obj-y += pnp.o -obj-y := core.o rsparser.o +pnp-y := core.o rsparser.o diff --git a/drivers/pnp/pnpacpi/core.c b/drivers/pnp/pnpacpi/core.c index 57313f4..ca84d50 100644 --- a/drivers/pnp/pnpacpi/core.c +++ b/drivers/pnp/pnpacpi/core.c @@ -81,12 +81,19 @@ static int pnpacpi_get_resources(struct pnp_dev *dev) static int pnpacpi_set_resources(struct pnp_dev *dev) { - struct acpi_device *acpi_dev = dev->data; - acpi_handle handle = acpi_dev->handle; + struct acpi_device *acpi_dev; + acpi_handle handle; struct acpi_buffer buffer; int ret; pnp_dbg(&dev->dev, "set resources\n"); + + handle = DEVICE_ACPI_HANDLE(&dev->dev); + if (!handle || ACPI_FAILURE(acpi_bus_get_device(handle, &acpi_dev))) { + dev_dbg(&dev->dev, "ACPI device not found in %s!\n", __func__); + return -ENODEV; + } + ret = pnpacpi_build_resource_template(dev, &buffer); if (ret) return ret; @@ -105,12 +112,18 @@ static int pnpacpi_set_resources(struct pnp_dev *dev) static int pnpacpi_disable_resources(struct pnp_dev *dev) { - struct acpi_device *acpi_dev = dev->data; - acpi_handle handle = acpi_dev->handle; + struct acpi_device *acpi_dev; + acpi_handle handle; int ret; dev_dbg(&dev->dev, "disable resources\n"); + handle = DEVICE_ACPI_HANDLE(&dev->dev); + if (!handle || ACPI_FAILURE(acpi_bus_get_device(handle, &acpi_dev))) { + dev_dbg(&dev->dev, "ACPI device not found in %s!\n", __func__); + return 0; + } + /* acpi_unregister_gsi(pnp_irq(dev, 0)); */ ret = 0; if (acpi_bus_power_manageable(handle)) @@ -124,46 +137,74 @@ static int pnpacpi_disable_resources(struct pnp_dev *dev) #ifdef CONFIG_ACPI_SLEEP static bool pnpacpi_can_wakeup(struct pnp_dev *dev) { - struct acpi_device *acpi_dev = dev->data; - acpi_handle handle = acpi_dev->handle; + struct acpi_device *acpi_dev; + acpi_handle handle; + + handle = DEVICE_ACPI_HANDLE(&dev->dev); + if (!handle || ACPI_FAILURE(acpi_bus_get_device(handle, &acpi_dev))) { + dev_dbg(&dev->dev, "ACPI device not found in %s!\n", __func__); + return false; + } return acpi_bus_can_wakeup(handle); } static int pnpacpi_suspend(struct pnp_dev *dev, pm_message_t state) { - struct acpi_device *acpi_dev = dev->data; - acpi_handle handle = acpi_dev->handle; - int power_state; + struct acpi_device *acpi_dev; + acpi_handle handle; + int error = 0; + + handle = DEVICE_ACPI_HANDLE(&dev->dev); + if (!handle || ACPI_FAILURE(acpi_bus_get_device(handle, &acpi_dev))) { + dev_dbg(&dev->dev, "ACPI device not found in %s!\n", __func__); + return 0; + } if (device_can_wakeup(&dev->dev)) { - int rc = acpi_pm_device_sleep_wake(&dev->dev, + error = acpi_pm_device_sleep_wake(&dev->dev, device_may_wakeup(&dev->dev)); + if (error) + return error; + } + + if (acpi_bus_power_manageable(handle)) { + int power_state = acpi_pm_device_sleep_state(&dev->dev, NULL); + + if (power_state < 0) + power_state = (state.event == PM_EVENT_ON) ? + ACPI_STATE_D0 : ACPI_STATE_D3; - if (rc) - return rc; + /* + * acpi_bus_set_power() often fails (keyboard port can't be + * powered-down?), and in any case, our return value is ignored + * by pnp_bus_suspend(). Hence we don't revert the wakeup + * setting if the set_power fails. + */ + error = acpi_bus_set_power(handle, power_state); } - power_state = acpi_pm_device_sleep_state(&dev->dev, NULL); - if (power_state < 0) - power_state = (state.event == PM_EVENT_ON) ? - ACPI_STATE_D0 : ACPI_STATE_D3; - - /* acpi_bus_set_power() often fails (keyboard port can't be - * powered-down?), and in any case, our return value is ignored - * by pnp_bus_suspend(). Hence we don't revert the wakeup - * setting if the set_power fails. - */ - return acpi_bus_set_power(handle, power_state); + + return error; } static int pnpacpi_resume(struct pnp_dev *dev) { - struct acpi_device *acpi_dev = dev->data; - acpi_handle handle = acpi_dev->handle; + struct acpi_device *acpi_dev; + acpi_handle handle = DEVICE_ACPI_HANDLE(&dev->dev); + int error = 0; + + if (!handle || ACPI_FAILURE(acpi_bus_get_device(handle, &acpi_dev))) { + dev_dbg(&dev->dev, "ACPI device not found in %s!\n", __func__); + return -ENODEV; + } if (device_may_wakeup(&dev->dev)) acpi_pm_device_sleep_wake(&dev->dev, false); - return acpi_bus_set_power(handle, ACPI_STATE_D0); + + if (acpi_bus_power_manageable(handle)) + error = acpi_bus_set_power(handle, ACPI_STATE_D0); + + return error; } #endif diff --git a/drivers/pnp/pnpbios/Makefile b/drivers/pnp/pnpbios/Makefile index 3cd3ed7..240b0ff 100644 --- a/drivers/pnp/pnpbios/Makefile +++ b/drivers/pnp/pnpbios/Makefile @@ -1,7 +1,8 @@ # # Makefile for the kernel PNPBIOS driver. # +obj-y := pnp.o -pnpbios-proc-$(CONFIG_PNPBIOS_PROC_FS) = proc.o +pnp-y := core.o bioscalls.o rsparser.o -obj-y := core.o bioscalls.o rsparser.o $(pnpbios-proc-y) +pnp-$(CONFIG_PNPBIOS_PROC_FS) += proc.o diff --git a/drivers/serial/atmel_serial.c b/drivers/serial/atmel_serial.c index 3892666..2a1d52f 100644 --- a/drivers/serial/atmel_serial.c +++ b/drivers/serial/atmel_serial.c @@ -1732,6 +1732,11 @@ static int __devinit atmel_serial_probe(struct platform_device *pdev) device_init_wakeup(&pdev->dev, 1); platform_set_drvdata(pdev, port); + if (port->rs485.flags & SER_RS485_ENABLED) { + UART_PUT_MR(&port->uart, ATMEL_US_USMODE_NORMAL); + UART_PUT_CR(&port->uart, ATMEL_US_RTSEN); + } + return 0; err_add_port: diff --git a/drivers/sfi/sfi_core.c b/drivers/sfi/sfi_core.c index ceba593..04113e5 100644 --- a/drivers/sfi/sfi_core.c +++ b/drivers/sfi/sfi_core.c @@ -101,7 +101,7 @@ static void __iomem * __ref sfi_map_memory(u64 phys, u32 size) return NULL; if (sfi_use_ioremap) - return ioremap(phys, size); + return ioremap_cache(phys, size); else return early_ioremap(phys, size); } diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index bf7c687..f7a5dba 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -4,6 +4,7 @@ menuconfig THERMAL tristate "Generic Thermal sysfs driver" + depends on NET help Generic Thermal Sysfs driver offers a generic mechanism for thermal management. Usually it's made up of one or more thermal diff --git a/drivers/thermal/thermal_sys.c b/drivers/thermal/thermal_sys.c index 13c72c6..7d0e63c 100644 --- a/drivers/thermal/thermal_sys.c +++ b/drivers/thermal/thermal_sys.c @@ -32,6 +32,8 @@ #include #include #include +#include +#include MODULE_AUTHOR("Zhang Rui"); MODULE_DESCRIPTION("Generic thermal management sysfs support"); @@ -58,6 +60,22 @@ static LIST_HEAD(thermal_tz_list); static LIST_HEAD(thermal_cdev_list); static DEFINE_MUTEX(thermal_list_lock); +static unsigned int thermal_event_seqnum; + +static struct genl_family thermal_event_genl_family = { + .id = GENL_ID_GENERATE, + .name = THERMAL_GENL_FAMILY_NAME, + .version = THERMAL_GENL_VERSION, + .maxattr = THERMAL_GENL_ATTR_MAX, +}; + +static struct genl_multicast_group thermal_event_mcgrp = { + .name = THERMAL_GENL_MCAST_GROUP_NAME, +}; + +static int genetlink_init(void); +static void genetlink_exit(void); + static int get_idr(struct idr *idr, struct mutex *lock, int *id) { int err; @@ -823,11 +841,8 @@ static struct class thermal_class = { * @devdata: device private data. * @ops: standard thermal cooling devices callbacks. */ -struct thermal_cooling_device *thermal_cooling_device_register(char *type, - void *devdata, - struct - thermal_cooling_device_ops - *ops) +struct thermal_cooling_device *thermal_cooling_device_register( + char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { struct thermal_cooling_device *cdev; struct thermal_zone_device *pos; @@ -1048,13 +1063,9 @@ EXPORT_SYMBOL(thermal_zone_device_update); * section 11.1.5.1 of the ACPI specification 3.0. */ struct thermal_zone_device *thermal_zone_device_register(char *type, - int trips, - void *devdata, struct - thermal_zone_device_ops - *ops, int tc1, int - tc2, - int passive_delay, - int polling_delay) + int trips, void *devdata, + const struct thermal_zone_device_ops *ops, + int tc1, int tc2, int passive_delay, int polling_delay) { struct thermal_zone_device *tz; struct thermal_cooling_device *pos; @@ -1214,6 +1225,82 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz) EXPORT_SYMBOL(thermal_zone_device_unregister); +int generate_netlink_event(u32 orig, enum events event) +{ + struct sk_buff *skb; + struct nlattr *attr; + struct thermal_genl_event *thermal_event; + void *msg_header; + int size; + int result; + + /* allocate memory */ + size = nla_total_size(sizeof(struct thermal_genl_event)) + \ + nla_total_size(0); + + skb = genlmsg_new(size, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + /* add the genetlink message header */ + msg_header = genlmsg_put(skb, 0, thermal_event_seqnum++, + &thermal_event_genl_family, 0, + THERMAL_GENL_CMD_EVENT); + if (!msg_header) { + nlmsg_free(skb); + return -ENOMEM; + } + + /* fill the data */ + attr = nla_reserve(skb, THERMAL_GENL_ATTR_EVENT, \ + sizeof(struct thermal_genl_event)); + + if (!attr) { + nlmsg_free(skb); + return -EINVAL; + } + + thermal_event = nla_data(attr); + if (!thermal_event) { + nlmsg_free(skb); + return -EINVAL; + } + + memset(thermal_event, 0, sizeof(struct thermal_genl_event)); + + thermal_event->orig = orig; + thermal_event->event = event; + + /* send multicast genetlink message */ + result = genlmsg_end(skb, msg_header); + if (result < 0) { + nlmsg_free(skb); + return result; + } + + result = genlmsg_multicast(skb, 0, thermal_event_mcgrp.id, GFP_ATOMIC); + if (result) + printk(KERN_INFO "failed to send netlink event:%d", result); + + return result; +} +EXPORT_SYMBOL(generate_netlink_event); + +static int genetlink_init(void) +{ + int result; + + result = genl_register_family(&thermal_event_genl_family); + if (result) + return result; + + result = genl_register_mc_group(&thermal_event_genl_family, + &thermal_event_mcgrp); + if (result) + genl_unregister_family(&thermal_event_genl_family); + return result; +} + static int __init thermal_init(void) { int result = 0; @@ -1225,9 +1312,15 @@ static int __init thermal_init(void) mutex_destroy(&thermal_idr_lock); mutex_destroy(&thermal_list_lock); } + result = genetlink_init(); return result; } +static void genetlink_exit(void) +{ + genl_unregister_family(&thermal_event_genl_family); +} + static void __exit thermal_exit(void) { class_unregister(&thermal_class); @@ -1235,7 +1328,8 @@ static void __exit thermal_exit(void) idr_destroy(&thermal_cdev_idr); mutex_destroy(&thermal_idr_lock); mutex_destroy(&thermal_list_lock); + genetlink_exit(); } -subsys_initcall(thermal_init); +fs_initcall(thermal_init); module_exit(thermal_exit); diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 5a48ce9..07bec09 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -71,11 +71,18 @@ config XEN_SYS_HYPERVISOR but will have no xen contents. config XEN_XENBUS_FRONTEND - tristate + tristate + +config XEN_GNTDEV + tristate "userspace grant access device driver" + depends on XEN + select MMU_NOTIFIER + help + Allows userspace processes to use grants. config XEN_PLATFORM_PCI tristate "xen platform pci device driver" - depends on XEN_PVHVM + depends on XEN_PVHVM && PCI default m help Driver for the Xen PCI Platform device: it is responsible for diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 533a199..5088cc2 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -9,11 +9,14 @@ obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += balloon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o +obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o obj-$(CONFIG_XENFS) += xenfs/ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o -obj-$(CONFIG_XEN_PLATFORM_PCI) += platform-pci.o +obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o obj-$(CONFIG_XEN_DOM0) += pci.o xen-evtchn-y := evtchn.o +xen-gntdev-y := gntdev.o +xen-platform-pci-y := platform-pci.o diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c new file mode 100644 index 0000000..1e31cdc --- /dev/null +++ b/drivers/xen/gntdev.c @@ -0,0 +1,665 @@ +/****************************************************************************** + * gntdev.c + * + * Device for accessing (in user-space) pages that have been granted by other + * domains. + * + * Copyright (c) 2006-2007, D G Murray. + * (c) 2009 Gerd Hoffmann + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Derek G. Murray , " + "Gerd Hoffmann "); +MODULE_DESCRIPTION("User-space granted page access driver"); + +static int limit = 1024; +module_param(limit, int, 0644); +MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped at " + "once by a gntdev instance"); + +struct gntdev_priv { + struct list_head maps; + uint32_t used; + uint32_t limit; + /* lock protects maps from concurrent changes */ + spinlock_t lock; + struct mm_struct *mm; + struct mmu_notifier mn; +}; + +struct grant_map { + struct list_head next; + struct gntdev_priv *priv; + struct vm_area_struct *vma; + int index; + int count; + int flags; + int is_mapped; + struct ioctl_gntdev_grant_ref *grants; + struct gnttab_map_grant_ref *map_ops; + struct gnttab_unmap_grant_ref *unmap_ops; + struct page **pages; +}; + +/* ------------------------------------------------------------------ */ + +static void gntdev_print_maps(struct gntdev_priv *priv, + char *text, int text_index) +{ +#ifdef DEBUG + struct grant_map *map; + + pr_debug("maps list (priv %p, usage %d/%d)\n", + priv, priv->used, priv->limit); + + list_for_each_entry(map, &priv->maps, next) + pr_debug(" index %2d, count %2d %s\n", + map->index, map->count, + map->index == text_index && text ? text : ""); +#endif +} + +static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) +{ + struct grant_map *add; + int i; + + add = kzalloc(sizeof(struct grant_map), GFP_KERNEL); + if (NULL == add) + return NULL; + + add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL); + add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL); + add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL); + add->pages = kzalloc(sizeof(add->pages[0]) * count, GFP_KERNEL); + if (NULL == add->grants || + NULL == add->map_ops || + NULL == add->unmap_ops || + NULL == add->pages) + goto err; + + for (i = 0; i < count; i++) { + add->pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (add->pages[i] == NULL) + goto err; + } + + add->index = 0; + add->count = count; + add->priv = priv; + + if (add->count + priv->used > priv->limit) + goto err; + + return add; + +err: + if (add->pages) + for (i = 0; i < count; i++) { + if (add->pages[i]) + __free_page(add->pages[i]); + } + kfree(add->pages); + kfree(add->grants); + kfree(add->map_ops); + kfree(add->unmap_ops); + kfree(add); + return NULL; +} + +static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add) +{ + struct grant_map *map; + + list_for_each_entry(map, &priv->maps, next) { + if (add->index + add->count < map->index) { + list_add_tail(&add->next, &map->next); + goto done; + } + add->index = map->index + map->count; + } + list_add_tail(&add->next, &priv->maps); + +done: + priv->used += add->count; + gntdev_print_maps(priv, "[new]", add->index); +} + +static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, + int index, int count) +{ + struct grant_map *map; + + list_for_each_entry(map, &priv->maps, next) { + if (map->index != index) + continue; + if (map->count != count) + continue; + return map; + } + return NULL; +} + +static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv, + unsigned long vaddr) +{ + struct grant_map *map; + + list_for_each_entry(map, &priv->maps, next) { + if (!map->vma) + continue; + if (vaddr < map->vma->vm_start) + continue; + if (vaddr >= map->vma->vm_end) + continue; + return map; + } + return NULL; +} + +static int gntdev_del_map(struct grant_map *map) +{ + int i; + + if (map->vma) + return -EBUSY; + for (i = 0; i < map->count; i++) + if (map->unmap_ops[i].handle) + return -EBUSY; + + map->priv->used -= map->count; + list_del(&map->next); + return 0; +} + +static void gntdev_free_map(struct grant_map *map) +{ + int i; + + if (!map) + return; + + if (map->pages) + for (i = 0; i < map->count; i++) { + if (map->pages[i]) + __free_page(map->pages[i]); + } + kfree(map->pages); + kfree(map->grants); + kfree(map->map_ops); + kfree(map->unmap_ops); + kfree(map); +} + +/* ------------------------------------------------------------------ */ + +static int find_grant_ptes(pte_t *pte, pgtable_t token, + unsigned long addr, void *data) +{ + struct grant_map *map = data; + unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT; + u64 pte_maddr; + + BUG_ON(pgnr >= map->count); + pte_maddr = arbitrary_virt_to_machine(pte).maddr; + + gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, + GNTMAP_contains_pte | map->flags, + map->grants[pgnr].ref, + map->grants[pgnr].domid); + gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, + GNTMAP_contains_pte | map->flags, + 0 /* handle */); + return 0; +} + +static int map_grant_pages(struct grant_map *map) +{ + int i, err = 0; + + pr_debug("map %d+%d\n", map->index, map->count); + err = gnttab_map_refs(map->map_ops, map->pages, map->count); + if (err) + return err; + + for (i = 0; i < map->count; i++) { + if (map->map_ops[i].status) + err = -EINVAL; + map->unmap_ops[i].handle = map->map_ops[i].handle; + } + return err; +} + +static int unmap_grant_pages(struct grant_map *map, int offset, int pages) +{ + int i, err = 0; + + pr_debug("map %d+%d [%d+%d]\n", map->index, map->count, offset, pages); + err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages, pages); + if (err) + return err; + + for (i = 0; i < pages; i++) { + if (map->unmap_ops[offset+i].status) + err = -EINVAL; + map->unmap_ops[offset+i].handle = 0; + } + return err; +} + +/* ------------------------------------------------------------------ */ + +static void gntdev_vma_close(struct vm_area_struct *vma) +{ + struct grant_map *map = vma->vm_private_data; + + pr_debug("close %p\n", vma); + map->is_mapped = 0; + map->vma = NULL; + vma->vm_private_data = NULL; +} + +static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + pr_debug("vaddr %p, pgoff %ld (shouldn't happen)\n", + vmf->virtual_address, vmf->pgoff); + vmf->flags = VM_FAULT_ERROR; + return 0; +} + +static struct vm_operations_struct gntdev_vmops = { + .close = gntdev_vma_close, + .fault = gntdev_vma_fault, +}; + +/* ------------------------------------------------------------------ */ + +static void mn_invl_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); + struct grant_map *map; + unsigned long mstart, mend; + int err; + + spin_lock(&priv->lock); + list_for_each_entry(map, &priv->maps, next) { + if (!map->vma) + continue; + if (!map->is_mapped) + continue; + if (map->vma->vm_start >= end) + continue; + if (map->vma->vm_end <= start) + continue; + mstart = max(start, map->vma->vm_start); + mend = min(end, map->vma->vm_end); + pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end, + start, end, mstart, mend); + err = unmap_grant_pages(map, + (mstart - map->vma->vm_start) >> PAGE_SHIFT, + (mend - mstart) >> PAGE_SHIFT); + WARN_ON(err); + } + spin_unlock(&priv->lock); +} + +static void mn_invl_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + mn_invl_range_start(mn, mm, address, address + PAGE_SIZE); +} + +static void mn_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); + struct grant_map *map; + int err; + + spin_lock(&priv->lock); + list_for_each_entry(map, &priv->maps, next) { + if (!map->vma) + continue; + pr_debug("map %d+%d (%lx %lx)\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end); + err = unmap_grant_pages(map, /* offset */ 0, map->count); + WARN_ON(err); + } + spin_unlock(&priv->lock); +} + +struct mmu_notifier_ops gntdev_mmu_ops = { + .release = mn_release, + .invalidate_page = mn_invl_page, + .invalidate_range_start = mn_invl_range_start, +}; + +/* ------------------------------------------------------------------ */ + +static int gntdev_open(struct inode *inode, struct file *flip) +{ + struct gntdev_priv *priv; + int ret = 0; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + INIT_LIST_HEAD(&priv->maps); + spin_lock_init(&priv->lock); + priv->limit = limit; + + priv->mm = get_task_mm(current); + if (!priv->mm) { + kfree(priv); + return -ENOMEM; + } + priv->mn.ops = &gntdev_mmu_ops; + ret = mmu_notifier_register(&priv->mn, priv->mm); + mmput(priv->mm); + + if (ret) { + kfree(priv); + return ret; + } + + flip->private_data = priv; + pr_debug("priv %p\n", priv); + + return 0; +} + +static int gntdev_release(struct inode *inode, struct file *flip) +{ + struct gntdev_priv *priv = flip->private_data; + struct grant_map *map; + int err; + + pr_debug("priv %p\n", priv); + + spin_lock(&priv->lock); + while (!list_empty(&priv->maps)) { + map = list_entry(priv->maps.next, struct grant_map, next); + err = gntdev_del_map(map); + if (WARN_ON(err)) + gntdev_free_map(map); + + } + spin_unlock(&priv->lock); + + mmu_notifier_unregister(&priv->mn, priv->mm); + kfree(priv); + return 0; +} + +static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, + struct ioctl_gntdev_map_grant_ref __user *u) +{ + struct ioctl_gntdev_map_grant_ref op; + struct grant_map *map; + int err; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + pr_debug("priv %p, add %d\n", priv, op.count); + if (unlikely(op.count <= 0)) + return -EINVAL; + if (unlikely(op.count > priv->limit)) + return -EINVAL; + + err = -ENOMEM; + map = gntdev_alloc_map(priv, op.count); + if (!map) + return err; + if (copy_from_user(map->grants, &u->refs, + sizeof(map->grants[0]) * op.count) != 0) { + gntdev_free_map(map); + return err; + } + + spin_lock(&priv->lock); + gntdev_add_map(priv, map); + op.index = map->index << PAGE_SHIFT; + spin_unlock(&priv->lock); + + if (copy_to_user(u, &op, sizeof(op)) != 0) { + spin_lock(&priv->lock); + gntdev_del_map(map); + spin_unlock(&priv->lock); + gntdev_free_map(map); + return err; + } + return 0; +} + +static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, + struct ioctl_gntdev_unmap_grant_ref __user *u) +{ + struct ioctl_gntdev_unmap_grant_ref op; + struct grant_map *map; + int err = -ENOENT; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count); + + spin_lock(&priv->lock); + map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); + if (map) + err = gntdev_del_map(map); + spin_unlock(&priv->lock); + if (!err) + gntdev_free_map(map); + return err; +} + +static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv, + struct ioctl_gntdev_get_offset_for_vaddr __user *u) +{ + struct ioctl_gntdev_get_offset_for_vaddr op; + struct grant_map *map; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr); + + spin_lock(&priv->lock); + map = gntdev_find_map_vaddr(priv, op.vaddr); + if (map == NULL || + map->vma->vm_start != op.vaddr) { + spin_unlock(&priv->lock); + return -EINVAL; + } + op.offset = map->index << PAGE_SHIFT; + op.count = map->count; + spin_unlock(&priv->lock); + + if (copy_to_user(u, &op, sizeof(op)) != 0) + return -EFAULT; + return 0; +} + +static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv, + struct ioctl_gntdev_set_max_grants __user *u) +{ + struct ioctl_gntdev_set_max_grants op; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + pr_debug("priv %p, limit %d\n", priv, op.count); + if (op.count > limit) + return -E2BIG; + + spin_lock(&priv->lock); + priv->limit = op.count; + spin_unlock(&priv->lock); + return 0; +} + +static long gntdev_ioctl(struct file *flip, + unsigned int cmd, unsigned long arg) +{ + struct gntdev_priv *priv = flip->private_data; + void __user *ptr = (void __user *)arg; + + switch (cmd) { + case IOCTL_GNTDEV_MAP_GRANT_REF: + return gntdev_ioctl_map_grant_ref(priv, ptr); + + case IOCTL_GNTDEV_UNMAP_GRANT_REF: + return gntdev_ioctl_unmap_grant_ref(priv, ptr); + + case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR: + return gntdev_ioctl_get_offset_for_vaddr(priv, ptr); + + case IOCTL_GNTDEV_SET_MAX_GRANTS: + return gntdev_ioctl_set_max_grants(priv, ptr); + + default: + pr_debug("priv %p, unknown cmd %x\n", priv, cmd); + return -ENOIOCTLCMD; + } + + return 0; +} + +static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) +{ + struct gntdev_priv *priv = flip->private_data; + int index = vma->vm_pgoff; + int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + struct grant_map *map; + int err = -EINVAL; + + if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + pr_debug("map %d+%d at %lx (pgoff %lx)\n", + index, count, vma->vm_start, vma->vm_pgoff); + + spin_lock(&priv->lock); + map = gntdev_find_map_index(priv, index, count); + if (!map) + goto unlock_out; + if (map->vma) + goto unlock_out; + if (priv->mm != vma->vm_mm) { + printk(KERN_WARNING "Huh? Other mm?\n"); + goto unlock_out; + } + + vma->vm_ops = &gntdev_vmops; + + vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND|VM_PFNMAP; + + vma->vm_private_data = map; + map->vma = vma; + + map->flags = GNTMAP_host_map | GNTMAP_application_map; + if (!(vma->vm_flags & VM_WRITE)) + map->flags |= GNTMAP_readonly; + + spin_unlock(&priv->lock); + + err = apply_to_page_range(vma->vm_mm, vma->vm_start, + vma->vm_end - vma->vm_start, + find_grant_ptes, map); + if (err) { + printk(KERN_WARNING "find_grant_ptes() failure.\n"); + return err; + } + + err = map_grant_pages(map); + if (err) { + printk(KERN_WARNING "map_grant_pages() failure.\n"); + return err; + } + + map->is_mapped = 1; + + return 0; + +unlock_out: + spin_unlock(&priv->lock); + return err; +} + +static const struct file_operations gntdev_fops = { + .owner = THIS_MODULE, + .open = gntdev_open, + .release = gntdev_release, + .mmap = gntdev_mmap, + .unlocked_ioctl = gntdev_ioctl +}; + +static struct miscdevice gntdev_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "xen/gntdev", + .fops = &gntdev_fops, +}; + +/* ------------------------------------------------------------------ */ + +static int __init gntdev_init(void) +{ + int err; + + if (!xen_domain()) + return -ENODEV; + + err = misc_register(&gntdev_miscdev); + if (err != 0) { + printk(KERN_ERR "Could not register gntdev device\n"); + return err; + } + return 0; +} + +static void __exit gntdev_exit(void) +{ + misc_deregister(&gntdev_miscdev); +} + +module_init(gntdev_init); +module_exit(gntdev_exit); + +/* ------------------------------------------------------------------ */ diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 6c45318..9ef54eb 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -447,6 +447,52 @@ unsigned int gnttab_max_grant_frames(void) } EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); +int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, + struct page **pages, unsigned int count) +{ + int i, ret; + pte_t *pte; + unsigned long mfn; + + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count); + if (ret) + return ret; + + for (i = 0; i < count; i++) { + /* m2p override only supported for GNTMAP_contains_pte mappings */ + if (!(map_ops[i].flags & GNTMAP_contains_pte)) + continue; + pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + + (map_ops[i].host_addr & ~PAGE_MASK)); + mfn = pte_mfn(*pte); + ret = m2p_add_override(mfn, pages[i]); + if (ret) + return ret; + } + + return ret; +} +EXPORT_SYMBOL_GPL(gnttab_map_refs); + +int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, + struct page **pages, unsigned int count) +{ + int i, ret; + + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); + if (ret) + return ret; + + for (i = 0; i < count; i++) { + ret = m2p_remove_override(pages[i]); + if (ret) + return ret; + } + + return ret; +} +EXPORT_SYMBOL_GPL(gnttab_unmap_refs); + static int gnttab_map(unsigned int start_idx, unsigned int end_idx) { struct gnttab_setup_table setup; diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index c01b5dd..afbe041 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -105,7 +105,7 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, const struct pci_device_id *ent) { int i, ret; - long ioaddr, iolen; + long ioaddr; long mmio_addr, mmio_len; unsigned int max_nr_gframes; @@ -114,7 +114,6 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, return i; ioaddr = pci_resource_start(pdev, 0); - iolen = pci_resource_len(pdev, 0); mmio_addr = pci_resource_start(pdev, 1); mmio_len = pci_resource_len(pdev, 1); @@ -125,19 +124,13 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, goto pci_out; } - if (request_mem_region(mmio_addr, mmio_len, DRV_NAME) == NULL) { - dev_err(&pdev->dev, "MEM I/O resource 0x%lx @ 0x%lx busy\n", - mmio_addr, mmio_len); - ret = -EBUSY; + ret = pci_request_region(pdev, 1, DRV_NAME); + if (ret < 0) goto pci_out; - } - if (request_region(ioaddr, iolen, DRV_NAME) == NULL) { - dev_err(&pdev->dev, "I/O resource 0x%lx @ 0x%lx busy\n", - iolen, ioaddr); - ret = -EBUSY; + ret = pci_request_region(pdev, 0, DRV_NAME); + if (ret < 0) goto mem_out; - } platform_mmio = mmio_addr; platform_mmiolen = mmio_len; @@ -169,9 +162,9 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, return 0; out: - release_region(ioaddr, iolen); + pci_release_region(pdev, 0); mem_out: - release_mem_region(mmio_addr, mmio_len); + pci_release_region(pdev, 1); pci_out: pci_disable_device(pdev); return ret; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 9ed4769..d3b28ab 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -141,13 +141,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) return rc; } -static inode *ecryptfs_get_inode(struct inode *lower_inode, +static struct inode *ecryptfs_get_inode(struct inode *lower_inode, struct super_block *sb) { struct inode *inode; int rc = 0; - lower_inode = lower_dentry->d_inode; if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) { rc = -EXDEV; goto out; @@ -202,7 +201,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, { struct inode *lower_inode = lower_dentry->d_inode; struct inode *inode = ecryptfs_get_inode(lower_inode, sb); - if (IS_ERR(inode) + if (IS_ERR(inode)) return PTR_ERR(inode); if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) d_add(dentry, inode); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3d06ccc..59c6e49 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -84,13 +84,9 @@ static inline struct inode *wb_inode(struct list_head *head) return list_entry(head, struct inode, i_wb_list); } -static void bdi_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_work *work) +/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ +static void bdi_wakeup_flusher(struct backing_dev_info *bdi) { - trace_writeback_queue(bdi, work); - - spin_lock_bh(&bdi->wb_lock); - list_add_tail(&work->list, &bdi->work_list); if (bdi->wb.task) { wake_up_process(bdi->wb.task); } else { @@ -98,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi, * The bdi thread isn't there, wake up the forker thread which * will create and run it. */ - trace_writeback_nothread(bdi, work); wake_up_process(default_backing_dev_info.wb.task); } +} + +static void bdi_queue_work(struct backing_dev_info *bdi, + struct wb_writeback_work *work) +{ + trace_writeback_queue(bdi, work); + + spin_lock_bh(&bdi->wb_lock); + list_add_tail(&work->list, &bdi->work_list); + if (!bdi->wb.task) + trace_writeback_nothread(bdi, work); + bdi_wakeup_flusher(bdi); spin_unlock_bh(&bdi->wb_lock); } static void __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - bool range_cyclic, bool for_background) + bool range_cyclic) { struct wb_writeback_work *work; @@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->sync_mode = WB_SYNC_NONE; work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; - work->for_background = for_background; bdi_queue_work(bdi, work); } @@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) { - __bdi_start_writeback(bdi, nr_pages, true, false); + __bdi_start_writeback(bdi, nr_pages, true); } /** @@ -152,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) * @bdi: the backing device to write from * * Description: - * This does WB_SYNC_NONE background writeback. The IO is only - * started when this function returns, we make no guarentees on - * completion. Caller need not hold sb s_umount semaphore. + * This makes sure WB_SYNC_NONE background writeback happens. When + * this function returns, it is only guaranteed that for given BDI + * some IO is happening if we are over background dirty threshold. + * Caller need not hold sb s_umount semaphore. */ void bdi_start_background_writeback(struct backing_dev_info *bdi) { - __bdi_start_writeback(bdi, LONG_MAX, true, true); + /* + * We just wake up the flusher thread. It will perform background + * writeback as soon as there is no other work to do. + */ + trace_writeback_wake_background(bdi); + spin_lock_bh(&bdi->wb_lock); + bdi_wakeup_flusher(bdi); + spin_unlock_bh(&bdi->wb_lock); } /* @@ -616,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb, }; unsigned long oldest_jif; long wrote = 0; + long write_chunk; struct inode *inode; if (wbc.for_kupdate) { @@ -628,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.range_end = LLONG_MAX; } + /* + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX + * here avoids calling into writeback_inodes_wb() more than once. + * + * The intended call sequence for WB_SYNC_ALL writeback is: + * + * wb_writeback() + * __writeback_inodes_sb() <== called only once + * write_cache_pages() <== called once for each inode + * (quickly) tag currently dirty pages + * (maybe slowly) sync all tagged pages + */ + if (wbc.sync_mode == WB_SYNC_NONE) + write_chunk = MAX_WRITEBACK_PAGES; + else + write_chunk = LONG_MAX; + wbc.wb_start = jiffies; /* livelock avoidance */ for (;;) { /* @@ -637,6 +670,16 @@ static long wb_writeback(struct bdi_writeback *wb, break; /* + * Background writeout and kupdate-style writeback may + * run forever. Stop them if there is other work to do + * so that e.g. sync can proceed. They'll be restarted + * after the other works are all done. + */ + if ((work->for_background || work->for_kupdate) && + !list_empty(&wb->bdi->work_list)) + break; + + /* * For background writeout, stop when we are below the * background dirty threshold */ @@ -644,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb, break; wbc.more_io = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; + wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; trace_wbc_writeback_start(&wbc, wb->bdi); @@ -654,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb, writeback_inodes_wb(wb, &wbc); trace_wbc_writeback_written(&wbc, wb->bdi); - work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; + work->nr_pages -= write_chunk - wbc.nr_to_write; + wrote += write_chunk - wbc.nr_to_write; /* * If we consumed everything, see if we have more @@ -670,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb, /* * Did we write something? Try for more */ - if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) + if (wbc.nr_to_write < write_chunk) continue; /* * Nothing written. Wait for some inode to @@ -718,6 +761,23 @@ static unsigned long get_nr_dirty_pages(void) get_nr_dirty_inodes(); } +static long wb_check_background_flush(struct bdi_writeback *wb) +{ + if (over_bground_thresh()) { + + struct wb_writeback_work work = { + .nr_pages = LONG_MAX, + .sync_mode = WB_SYNC_NONE, + .for_background = 1, + .range_cyclic = 1, + }; + + return wb_writeback(wb, &work); + } + + return 0; +} + static long wb_check_old_data_flush(struct bdi_writeback *wb) { unsigned long expired; @@ -787,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * Check for periodic writeback, kupdated() style */ wrote += wb_check_old_data_flush(wb); + wrote += wb_check_background_flush(wb); clear_bit(BDI_writeback_running, &wb->bdi->state); return wrote; @@ -873,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages) list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; - __bdi_start_writeback(bdi, nr_pages, false, false); + __bdi_start_writeback(bdi, nr_pages, false); } rcu_read_unlock(); } @@ -1164,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle); * @sb: the superblock * * This function writes and waits on any dirty inode belonging to this - * super_block. The number of pages synced is returned. + * super_block. */ void sync_inodes_sb(struct super_block *sb) { @@ -1242,11 +1303,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) EXPORT_SYMBOL(sync_inode); /** - * sync_inode - write an inode to disk + * sync_inode_metadata - write an inode to disk * @inode: the inode to sync * @wait: wait for I/O to complete. * - * Write an inode to disk and adjust it's dirty state after completion. + * Write an inode to disk and adjust its dirty state after completion. * * Note: only writes the actual inode, no associated data or other metadata. */ diff --git a/fs/mpage.c b/fs/mpage.c index fd56ca2..d78455a 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -40,7 +40,7 @@ * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ -static void mpage_end_io_read(struct bio *bio, int err) +static void mpage_end_io(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -50,44 +50,29 @@ static void mpage_end_io_read(struct bio *bio, int err) if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); - - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } while (bvec >= bio->bi_io_vec); - bio_put(bio); -} - -static void mpage_end_io_write(struct bio *bio, int err) -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - - do { - struct page *page = bvec->bv_page; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (!uptodate){ - SetPageError(page); - if (page->mapping) - set_bit(AS_EIO, &page->mapping->flags); + if (bio_data_dir(bio) == READ) { + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { /* bio_data_dir(bio) == WRITE */ + if (!uptodate) { + SetPageError(page); + if (page->mapping) + set_bit(AS_EIO, &page->mapping->flags); + } + end_page_writeback(page); } - end_page_writeback(page); } while (bvec >= bio->bi_io_vec); bio_put(bio); } static struct bio *mpage_bio_submit(int rw, struct bio *bio) { - bio->bi_end_io = mpage_end_io_read; - if (rw == WRITE) - bio->bi_end_io = mpage_end_io_write; + bio->bi_end_io = mpage_end_io; submit_bio(rw, bio); return NULL; } diff --git a/fs/namei.c b/fs/namei.c index 0b14f69..8664330 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -479,6 +479,14 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry struct fs_struct *fs = current->fs; struct dentry *parent = nd->path.dentry; + /* + * It can be possible to revalidate the dentry that we started + * the path walk with. force_reval_path may also revalidate the + * dentry already committed to the nameidata. + */ + if (unlikely(parent == dentry)) + return nameidata_drop_rcu(nd); + BUG_ON(!(nd->flags & LOOKUP_RCU)); if (nd->root.mnt) { spin_lock(&fs->lock); @@ -583,6 +591,13 @@ void release_open_intent(struct nameidata *nd) fput(nd->intent.open.file); } +/* + * Call d_revalidate and handle filesystems that request rcu-walk + * to be dropped. This may be called and return in rcu-walk mode, + * regardless of success or error. If -ECHILD is returned, the caller + * must return -ECHILD back up the path walk stack so path walk may + * be restarted in ref-walk mode. + */ static int d_revalidate(struct dentry *dentry, struct nameidata *nd) { int status; @@ -673,6 +688,9 @@ force_reval_path(struct path *path, struct nameidata *nd) return 0; if (!status) { + /* Don't d_invalidate in rcu-walk mode */ + if (nameidata_drop_rcu(nd)) + return -ECHILD; d_invalidate(dentry); status = -ESTALE; } @@ -2105,11 +2123,13 @@ static struct file *do_last(struct nameidata *nd, struct path *path, dir = nd->path.dentry; case LAST_DOT: if (need_reval_dot(dir)) { - error = d_revalidate(nd->path.dentry, nd); - if (!error) - error = -ESTALE; - if (error < 0) + int status = d_revalidate(nd->path.dentry, nd); + if (!status) + status = -ESTALE; + if (status < 0) { + error = status; goto exit; + } } /* fallthrough */ case LAST_ROOT: diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 95b081b..df8c03a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1406,11 +1406,15 @@ no_open: static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) { struct dentry *parent = NULL; - struct inode *inode = dentry->d_inode; + struct inode *inode; struct inode *dir; struct nfs_open_context *ctx; int openflags, ret = 0; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; if (!is_atomic_open(nd) || d_mountpoint(dentry)) goto no_open; @@ -1579,6 +1583,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, { struct iattr attr; int error; + int open_flags = 0; dfprintk(VFS, "NFS: create(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); @@ -1586,7 +1591,10 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; - error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL); + if ((nd->flags & LOOKUP_CREATE) != 0) + open_flags = nd->intent.open.flags; + + error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL); if (error != 0) goto out_err; return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index 93f1cdd..9d096e8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1151,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto err_task_lock; } - if (oom_score_adj < task->signal->oom_score_adj && + if (oom_score_adj < task->signal->oom_score_adj_min && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_sighand; @@ -1164,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, atomic_dec(&task->mm->oom_disable_count); } task->signal->oom_score_adj = oom_score_adj; + if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) + task->signal->oom_score_adj_min = oom_score_adj; /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index a65239c..ed257d1 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -101,6 +101,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_MEMORY_FAILURE "HardwareCorrupted: %5lu kB\n" #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "AnonHugePages: %8lu kB\n" +#endif , K(i.totalram), K(i.freeram), @@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeswap), K(global_page_state(NR_FILE_DIRTY)), K(global_page_state(NR_WRITEBACK)), - K(global_page_state(NR_ANON_PAGES)), + K(global_page_state(NR_ANON_PAGES) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR +#endif + ), K(global_page_state(NR_FILE_MAPPED)), K(global_page_state(NR_SHMEM)), K(global_page_state(NR_SLAB_RECLAIMABLE) + @@ -151,6 +159,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_MEMORY_FAILURE ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR) +#endif ); hugetlb_report_meminfo(m); diff --git a/fs/proc/page.c b/fs/proc/page.c index b06c674..6d8e6a9 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page) if (PageHuge(page)) u |= 1 << KPF_HUGE; - u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); - /* - * Caveats on high order pages: - * PG_buddy will only be set on the head page; SLUB/SLQB do the same - * for PG_slab; SLOB won't set PG_slab at all on compound pages. + * Caveats on high order pages: page->_count will only be set + * -1 on the head page; SLUB/SLQB do the same for PG_slab; + * SLOB won't set PG_slab at all on compound pages. */ + if (PageBuddy(page)) + u |= 1 << KPF_BUDDY; + + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); + u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); - u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy); u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c3755bd..60b9148 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -418,7 +418,8 @@ static int show_smap(struct seq_file *m, void *v) "Anonymous: %8lu kB\n" "Swap: %8lu kB\n" "KernelPageSize: %8lu kB\n" - "MMUPageSize: %8lu kB\n", + "MMUPageSize: %8lu kB\n" + "Locked: %8lu kB\n", (vma->vm_end - vma->vm_start) >> 10, mss.resident >> 10, (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), @@ -430,7 +431,9 @@ static int show_smap(struct seq_file *m, void *v) mss.anonymous >> 10, mss.swap >> 10, vma_kernel_pagesize(vma) >> 10, - vma_mmu_pagesize(vma) >> 10); + vma_mmu_pagesize(vma) >> 10, + (vma->vm_flags & VM_LOCKED) ? + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 359ef11..78ca429 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -148,9 +148,7 @@ struct acpi_device_flags { u32 suprise_removal_ok:1; u32 power_manageable:1; u32 performance_manageable:1; - u32 wake_capable:1; /* Wakeup(_PRW) supported? */ - u32 force_power_state:1; - u32 reserved:22; + u32 reserved:24; }; /* File System */ @@ -242,20 +240,14 @@ struct acpi_device_perf { struct acpi_device_wakeup_flags { u8 valid:1; /* Can successfully enable wakeup? */ u8 run_wake:1; /* Run-Wake GPE devices */ - u8 always_enabled:1; /* Run-wake devices that are always enabled */ u8 notifier_present:1; /* Wake-up notify handler has been installed */ }; -struct acpi_device_wakeup_state { - u8 enabled:1; -}; - struct acpi_device_wakeup { acpi_handle gpe_device; u64 gpe_number; u64 sleep_state; struct acpi_handle_list resources; - struct acpi_device_wakeup_state state; struct acpi_device_wakeup_flags flags; int prepare_count; int run_wake_count; @@ -328,8 +320,8 @@ void acpi_bus_data_handler(acpi_handle handle, void *context); acpi_status acpi_bus_get_status_handle(acpi_handle handle, unsigned long long *sta); int acpi_bus_get_status(struct acpi_device *device); -int acpi_bus_get_power(acpi_handle handle, int *state); int acpi_bus_set_power(acpi_handle handle, int state); +int acpi_bus_update_power(acpi_handle handle, int *state_p); bool acpi_bus_power_manageable(acpi_handle handle); bool acpi_bus_can_wakeup(acpi_handle handle); #ifdef CONFIG_ACPI_PROC_EVENT diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 53b7cfd..241b8a0 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -47,7 +47,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20101013 +#define ACPI_CA_VERSION 0x20101209 #include "actypes.h" #include "actbl.h" @@ -229,6 +229,10 @@ acpi_status acpi_install_initialization_handler(acpi_init_handler handler, u32 function); acpi_status +acpi_install_global_event_handler(ACPI_GBL_EVENT_HANDLER handler, + void *context); + +acpi_status acpi_install_fixed_event_handler(u32 acpi_event, acpi_event_handler handler, void *context); @@ -258,11 +262,11 @@ acpi_remove_address_space_handler(acpi_handle device, acpi_status acpi_install_gpe_handler(acpi_handle gpe_device, u32 gpe_number, - u32 type, acpi_event_handler address, void *context); + u32 type, acpi_gpe_handler address, void *context); acpi_status acpi_remove_gpe_handler(acpi_handle gpe_device, - u32 gpe_number, acpi_event_handler address); + u32 gpe_number, acpi_gpe_handler address); #ifdef ACPI_FUTURE_USAGE acpi_status acpi_install_exception_handler(acpi_exception_handler handler); @@ -292,11 +296,13 @@ acpi_status acpi_enable_gpe(acpi_handle gpe_device, u32 gpe_number); acpi_status acpi_disable_gpe(acpi_handle gpe_device, u32 gpe_number); -acpi_status acpi_gpe_can_wake(acpi_handle gpe_device, u32 gpe_number); - acpi_status acpi_clear_gpe(acpi_handle gpe_device, u32 gpe_number); -acpi_status acpi_gpe_wakeup(acpi_handle gpe_device, u32 gpe_number, u8 action); +acpi_status +acpi_setup_gpe_for_wake(acpi_handle parent_device, + acpi_handle gpe_device, u32 gpe_number); + +acpi_status acpi_set_gpe_wake_mask(acpi_handle gpe_device, u32 gpe_number, u8 action); acpi_status acpi_get_gpe_status(acpi_handle gpe_device, @@ -315,7 +321,7 @@ acpi_install_gpe_block(acpi_handle gpe_device, acpi_status acpi_remove_gpe_block(acpi_handle gpe_device); -acpi_status acpi_update_gpes(void); +acpi_status acpi_update_all_gpes(void); /* * Resource interfaces diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index 2b134b6..939a431 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -656,33 +656,34 @@ typedef u32 acpi_event_status; #define ACPI_GPE_MAX 0xFF #define ACPI_NUM_GPE 256 -/* Actions for acpi_gpe_wakeup, acpi_hw_low_set_gpe */ +/* Actions for acpi_set_gpe_wake_mask, acpi_hw_low_set_gpe */ #define ACPI_GPE_ENABLE 0 #define ACPI_GPE_DISABLE 1 -#define ACPI_GPE_COND_ENABLE 2 +#define ACPI_GPE_CONDITIONAL_ENABLE 2 /* * GPE info flags - Per GPE - * +-------+---+-+-+ - * | 7:4 |3:2|1|0| - * +-------+---+-+-+ - * | | | | - * | | | +--- Interrupt type: edge or level triggered - * | | +----- GPE can wake the system - * | +-------- Type of dispatch:to method, handler, or none - * +-------------- + * +-------+-+-+---+ + * | 7:4 |3|2|1:0| + * +-------+-+-+---+ + * | | | | + * | | | +-- Type of dispatch:to method, handler, notify, or none + * | | +----- Interrupt type: edge or level triggered + * | +------- Is a Wake GPE + * +------------ */ -#define ACPI_GPE_XRUPT_TYPE_MASK (u8) 0x01 -#define ACPI_GPE_LEVEL_TRIGGERED (u8) 0x01 -#define ACPI_GPE_EDGE_TRIGGERED (u8) 0x00 +#define ACPI_GPE_DISPATCH_NONE (u8) 0x00 +#define ACPI_GPE_DISPATCH_METHOD (u8) 0x01 +#define ACPI_GPE_DISPATCH_HANDLER (u8) 0x02 +#define ACPI_GPE_DISPATCH_NOTIFY (u8) 0x03 +#define ACPI_GPE_DISPATCH_MASK (u8) 0x03 -#define ACPI_GPE_CAN_WAKE (u8) 0x02 +#define ACPI_GPE_LEVEL_TRIGGERED (u8) 0x04 +#define ACPI_GPE_EDGE_TRIGGERED (u8) 0x00 +#define ACPI_GPE_XRUPT_TYPE_MASK (u8) 0x04 -#define ACPI_GPE_DISPATCH_MASK (u8) 0x0C -#define ACPI_GPE_DISPATCH_HANDLER (u8) 0x04 -#define ACPI_GPE_DISPATCH_METHOD (u8) 0x08 -#define ACPI_GPE_DISPATCH_NOT_USED (u8) 0x00 +#define ACPI_GPE_CAN_WAKE (u8) 0x08 /* * Flags for GPE and Lock interfaces @@ -894,9 +895,20 @@ typedef void /* * Various handlers and callback procedures */ +typedef +void (*ACPI_GBL_EVENT_HANDLER) (u32 event_type, + acpi_handle device, + u32 event_number, void *context); + +#define ACPI_EVENT_TYPE_GPE 0 +#define ACPI_EVENT_TYPE_FIXED 1 + typedef u32(*acpi_event_handler) (void *context); typedef +u32 (*acpi_gpe_handler) (acpi_handle gpe_device, u32 gpe_number, void *context); + +typedef void (*acpi_notify_handler) (acpi_handle device, u32 value, void *context); typedef @@ -951,6 +963,10 @@ u32 (*acpi_interface_handler) (acpi_string interface_name, u32 supported); #define ACPI_INTERRUPT_NOT_HANDLED 0x00 #define ACPI_INTERRUPT_HANDLED 0x01 +/* GPE handler return values */ + +#define ACPI_REENABLE_GPE 0x80 + /* Length of 32-bit EISAID values when converted back to a string */ #define ACPI_EISAID_STRING_SIZE 8 /* Includes null terminator */ diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 1b62102..55192ac 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -324,6 +324,12 @@ int acpi_processor_tstate_has_changed(struct acpi_processor *pr); int acpi_processor_get_throttling_info(struct acpi_processor *pr); extern int acpi_processor_set_throttling(struct acpi_processor *pr, int state, bool force); +/* + * Reevaluate whether the T-state is invalid after one cpu is + * onlined/offlined. In such case the flags.throttling will be updated. + */ +extern void acpi_processor_reevaluate_tstate(struct acpi_processor *pr, + unsigned long action); extern const struct file_operations acpi_processor_throttling_fops; extern void acpi_processor_throttling_init(void); /* in processor_idle.c */ diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h index 6098cae..ff5c660 100644 --- a/include/asm-generic/gpio.h +++ b/include/asm-generic/gpio.h @@ -147,11 +147,11 @@ extern struct gpio_chip *gpiochip_find(void *data, /* Always use the library code for GPIO management calls, * or when sleeping may be involved. */ -extern int __must_check gpio_request(unsigned gpio, const char *label); +extern int gpio_request(unsigned gpio, const char *label); extern void gpio_free(unsigned gpio); -extern int __must_check gpio_direction_input(unsigned gpio); -extern int __must_check gpio_direction_output(unsigned gpio, int value); +extern int gpio_direction_input(unsigned gpio); +extern int gpio_direction_output(unsigned gpio, int value); extern int gpio_set_debounce(unsigned gpio, unsigned debounce); @@ -192,8 +192,8 @@ struct gpio { const char *label; }; -extern int __must_check gpio_request_one(unsigned gpio, unsigned long flags, const char *label); -extern int __must_check gpio_request_array(struct gpio *array, size_t num); +extern int gpio_request_one(unsigned gpio, unsigned long flags, const char *label); +extern int gpio_request_array(struct gpio *array, size_t num); extern void gpio_free_array(struct gpio *array, size_t num); #ifdef CONFIG_GPIO_SYSFS diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h index 3da9e27..787abbb 100644 --- a/include/asm-generic/mman-common.h +++ b/include/asm-generic/mman-common.h @@ -45,6 +45,9 @@ #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ +#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 6f3c6ae..f1eddf7 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -5,67 +5,108 @@ #ifdef CONFIG_MMU #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -/* - * Largely same as above, but only sets the access flags (dirty, - * accessed, and writable). Furthermore, we know it always gets set - * to a "more permissive" setting, which allows most architectures - * to optimize this. We return whether the PTE actually changed, which - * in turn instructs the caller to do things like update__mmu_cache. - * This used to be done in the caller, but sparc needs minor faults to - * force that call on sun4c so we changed this macro slightly - */ -#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ -({ \ - int __changed = !pte_same(*(__ptep), __entry); \ - if (__changed) { \ - set_pte_at((__vma)->vm_mm, (__address), __ptep, __entry); \ - flush_tlb_page(__vma, __address); \ - } \ - __changed; \ -}) +extern int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty); +#endif + +#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +extern int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty); #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define ptep_test_and_clear_young(__vma, __address, __ptep) \ -({ \ - pte_t __pte = *(__ptep); \ - int r = 1; \ - if (!pte_young(__pte)) \ - r = 0; \ - else \ - set_pte_at((__vma)->vm_mm, (__address), \ - (__ptep), pte_mkold(__pte)); \ - r; \ -}) +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) +{ + pte_t pte = *ptep; + int r = 1; + if (!pte_young(pte)) + r = 0; + else + set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); + return r; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + int r = 1; + if (!pmd_young(pmd)) + r = 0; + else + set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); + return r; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +{ + BUG(); + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -#define ptep_clear_flush_young(__vma, __address, __ptep) \ -({ \ - int __young; \ - __young = ptep_test_and_clear_young(__vma, __address, __ptep); \ - if (__young) \ - flush_tlb_page(__vma, __address); \ - __young; \ -}) +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR -#define ptep_get_and_clear(__mm, __address, __ptep) \ -({ \ - pte_t __pte = *(__ptep); \ - pte_clear((__mm), (__address), (__ptep)); \ - __pte; \ +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long address, + pte_t *ptep) +{ + pte_t pte = *ptep; + pte_clear(mm, address, ptep); + return pte; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, + unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + pmd_clear(mm, address, pmdp); + return pmd; }) +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, + unsigned long address, + pmd_t *pmdp) +{ + BUG(); + return __pmd(0); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL -#define ptep_get_and_clear_full(__mm, __address, __ptep, __full) \ -({ \ - pte_t __pte; \ - __pte = ptep_get_and_clear((__mm), (__address), (__ptep)); \ - __pte; \ -}) +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, + unsigned long address, pte_t *ptep, + int full) +{ + pte_t pte; + pte = ptep_get_and_clear(mm, address, ptep); + return pte; +} #endif /* @@ -74,20 +115,25 @@ * not present, or in the process of an address space destruction. */ #ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL -#define pte_clear_not_present_full(__mm, __address, __ptep, __full) \ -do { \ - pte_clear((__mm), (__address), (__ptep)); \ -} while (0) +static inline void pte_clear_not_present_full(struct mm_struct *mm, + unsigned long address, + pte_t *ptep, + int full) +{ + pte_clear(mm, address, ptep); +} #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH -#define ptep_clear_flush(__vma, __address, __ptep) \ -({ \ - pte_t __pte; \ - __pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep); \ - flush_tlb_page(__vma, __address); \ - __pte; \ -}) +extern pte_t ptep_clear_flush(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep); +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH +extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT @@ -99,8 +145,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } #endif +#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) +{ + pmd_t old_pmd = *pmdp; + set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd)); +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) +{ + BUG(); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + +#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH +extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp); +#endif + #ifndef __HAVE_ARCH_PTE_SAME -#define pte_same(A,B) (pte_val(A) == pte_val(B)) +static inline int pte_same(pte_t pte_a, pte_t pte_b) +{ + return pte_val(pte_a) == pte_val(pte_b); +} +#endif + +#ifndef __HAVE_ARCH_PMD_SAME +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + return pmd_val(pmd_a) == pmd_val(pmd_b); +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + BUG(); + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PAGE_TEST_DIRTY @@ -348,6 +435,24 @@ extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, unsigned long size); #endif +#ifndef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_trans_huge(pmd_t pmd) +{ + return 0; +} +static inline int pmd_trans_splitting(pmd_t pmd) +{ + return 0; +} +#ifndef __HAVE_ARCH_PMD_WRITE +static inline int pmd_write(pmd_t pmd) +{ + BUG(); + return 0; +} +#endif /* __HAVE_ARCH_PMD_WRITE */ +#endif + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_GENERIC_PGTABLE_H */ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 67c91b4..eb176bb 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -352,4 +352,14 @@ static inline int acpi_table_parse(char *id, return -1; } #endif /* !CONFIG_ACPI */ + +#ifdef CONFIG_ACPI_SLEEP +int suspend_nvs_register(unsigned long start, unsigned long size); +#else +static inline int suspend_nvs_register(unsigned long a, unsigned long b) +{ + return 0; +} +#endif + #endif /*_LINUX_ACPI_H*/ diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 5ac5155..dfa2ed4 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -11,6 +11,9 @@ /* The full zone was compacted */ #define COMPACT_COMPLETE 3 +#define COMPACT_MODE_DIRECT_RECLAIM 0 +#define COMPACT_MODE_KSWAPD 1 + #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, @@ -21,7 +24,12 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *mask); + int order, gfp_t gfp_mask, nodemask_t *mask, + bool sync); +extern unsigned long compaction_suitable(struct zone *zone, int order); +extern unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask, bool sync, + int compact_mode); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -54,7 +62,20 @@ static inline bool compaction_deferred(struct zone *zone) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask) + int order, gfp_t gfp_mask, nodemask_t *nodemask, + bool sync) +{ + return COMPACT_CONTINUE; +} + +static inline unsigned long compaction_suitable(struct zone *zone, int order) +{ + return COMPACT_SKIPPED; +} + +static inline unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask, bool sync, + int compact_mode) { return COMPACT_CONTINUE; } diff --git a/include/linux/cper.h b/include/linux/cper.h index bf972f8..3104aaf 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -39,10 +39,12 @@ * Severity difinition for error_severity in struct cper_record_header * and section_severity in struct cper_section_descriptor */ -#define CPER_SEV_RECOVERABLE 0x0 -#define CPER_SEV_FATAL 0x1 -#define CPER_SEV_CORRECTED 0x2 -#define CPER_SEV_INFORMATIONAL 0x3 +enum { + CPER_SEV_RECOVERABLE, + CPER_SEV_FATAL, + CPER_SEV_CORRECTED, + CPER_SEV_INFORMATIONAL, +}; /* * Validation bits difinition for validation_bits in struct @@ -201,6 +203,47 @@ UUID_LE(0x036F84E1, 0x7F37, 0x428c, 0xA7, 0x9E, 0x57, 0x5F, \ 0xDF, 0xAA, 0x84, 0xEC) +#define CPER_PROC_VALID_TYPE 0x0001 +#define CPER_PROC_VALID_ISA 0x0002 +#define CPER_PROC_VALID_ERROR_TYPE 0x0004 +#define CPER_PROC_VALID_OPERATION 0x0008 +#define CPER_PROC_VALID_FLAGS 0x0010 +#define CPER_PROC_VALID_LEVEL 0x0020 +#define CPER_PROC_VALID_VERSION 0x0040 +#define CPER_PROC_VALID_BRAND_INFO 0x0080 +#define CPER_PROC_VALID_ID 0x0100 +#define CPER_PROC_VALID_TARGET_ADDRESS 0x0200 +#define CPER_PROC_VALID_REQUESTOR_ID 0x0400 +#define CPER_PROC_VALID_RESPONDER_ID 0x0800 +#define CPER_PROC_VALID_IP 0x1000 + +#define CPER_MEM_VALID_ERROR_STATUS 0x0001 +#define CPER_MEM_VALID_PHYSICAL_ADDRESS 0x0002 +#define CPER_MEM_VALID_PHYSICAL_ADDRESS_MASK 0x0004 +#define CPER_MEM_VALID_NODE 0x0008 +#define CPER_MEM_VALID_CARD 0x0010 +#define CPER_MEM_VALID_MODULE 0x0020 +#define CPER_MEM_VALID_BANK 0x0040 +#define CPER_MEM_VALID_DEVICE 0x0080 +#define CPER_MEM_VALID_ROW 0x0100 +#define CPER_MEM_VALID_COLUMN 0x0200 +#define CPER_MEM_VALID_BIT_POSITION 0x0400 +#define CPER_MEM_VALID_REQUESTOR_ID 0x0800 +#define CPER_MEM_VALID_RESPONDER_ID 0x1000 +#define CPER_MEM_VALID_TARGET_ID 0x2000 +#define CPER_MEM_VALID_ERROR_TYPE 0x4000 + +#define CPER_PCIE_VALID_PORT_TYPE 0x0001 +#define CPER_PCIE_VALID_VERSION 0x0002 +#define CPER_PCIE_VALID_COMMAND_STATUS 0x0004 +#define CPER_PCIE_VALID_DEVICE_ID 0x0008 +#define CPER_PCIE_VALID_SERIAL_NUMBER 0x0010 +#define CPER_PCIE_VALID_BRIDGE_CONTROL_STATUS 0x0020 +#define CPER_PCIE_VALID_CAPABILITY 0x0040 +#define CPER_PCIE_VALID_AER_INFO 0x0080 + +#define CPER_PCIE_SLOT_SHIFT 3 + /* * All tables and structs must be byte-packed to match CPER * specification, since the tables are provided by the system BIOS @@ -306,6 +349,41 @@ struct cper_sec_mem_err { __u8 error_type; }; +struct cper_sec_pcie { + __u64 validation_bits; + __u32 port_type; + struct { + __u8 minor; + __u8 major; + __u8 reserved[2]; + } version; + __u16 command; + __u16 status; + __u32 reserved; + struct { + __u16 vendor_id; + __u16 device_id; + __u8 class_code[3]; + __u8 function; + __u8 device; + __u16 segment; + __u8 bus; + __u8 secondary_bus; + __u16 slot; + __u8 reserved; + } device_id; + struct { + __u32 lower; + __u32 upper; + } serial_number; + struct { + __u16 secondary_status; + __u16 control; + } bridge; + __u8 capability[60]; + __u8 aer_info[96]; +}; + /* Reset to default packing */ #pragma pack() diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 1be416b..36719ea 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -47,13 +47,7 @@ struct cpuidle_state { /* Idle State Flags */ #define CPUIDLE_FLAG_TIME_VALID (0x01) /* is residency time measurable? */ -#define CPUIDLE_FLAG_CHECK_BM (0x02) /* BM activity will exit state */ -#define CPUIDLE_FLAG_POLL (0x10) /* no latency, no savings */ -#define CPUIDLE_FLAG_SHALLOW (0x20) /* low latency, minimal savings */ -#define CPUIDLE_FLAG_BALANCED (0x40) /* medium latency, moderate savings */ -#define CPUIDLE_FLAG_DEEP (0x80) /* high latency, large savings */ #define CPUIDLE_FLAG_IGNORE (0x100) /* ignore during this idle period */ -#define CPUIDLE_FLAG_TLB_FLUSHED (0x200) /* tlb will be flushed */ #define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 2970022..272496d 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -193,6 +193,13 @@ struct dm_target { char *error; }; +/* Each target can link one of these into the table */ +struct dm_target_callbacks { + struct list_head list; + int (*congested_fn) (struct dm_target_callbacks *, int); + void (*unplug_fn)(struct dm_target_callbacks *); +}; + int dm_register_target(struct target_type *t); void dm_unregister_target(struct target_type *t); @@ -269,6 +276,11 @@ int dm_table_add_target(struct dm_table *t, const char *type, sector_t start, sector_t len, char *params); /* + * Target_ctr should call this if it needs to add any callbacks. + */ +void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb); + +/* * Finally call this to make the table ready for use. */ int dm_table_complete(struct dm_table *t); diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 49eab36..78bbf47 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -44,7 +44,7 @@ * Remove a device, destroy any tables. * * DM_DEV_RENAME: - * Rename a device. + * Rename a device or set its uuid if none was previously supplied. * * DM_SUSPEND: * This performs both suspend and resume, depending which flag is @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 18 -#define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2010-06-29)" +#define DM_VERSION_MINOR 19 +#define DM_VERSION_PATCHLEVEL 1 +#define DM_VERSION_EXTRA "-ioctl (2011-01-07)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ @@ -322,4 +322,10 @@ enum { */ #define DM_UEVENT_GENERATED_FLAG (1 << 13) /* Out */ +/* + * If set, rename changes the uuid not the name. Only permitted + * if no uuid was previously supplied: an existing uuid cannot be changed. + */ +#define DM_UUID_FLAG (1 << 14) /* In */ + #endif /* _LINUX_DM_IOCTL_H */ diff --git a/include/linux/dm-log-userspace.h b/include/linux/dm-log-userspace.h index 0c3c3a2..eeace7d 100644 --- a/include/linux/dm-log-userspace.h +++ b/include/linux/dm-log-userspace.h @@ -370,6 +370,16 @@ #define DM_ULOG_REQUEST_TYPE(request_type) \ (DM_ULOG_REQUEST_MASK & (request_type)) +/* + * DM_ULOG_REQUEST_VERSION is incremented when there is a + * change to the way information is passed between kernel + * and userspace. This could be a structure change of + * dm_ulog_request or a change in the way requests are + * issued/handled. Changes are outlined here: + * version 1: Initial implementation + */ +#define DM_ULOG_REQUEST_VERSION 1 + struct dm_ulog_request { /* * The local unique identifier (luid) and the universally unique @@ -383,8 +393,9 @@ struct dm_ulog_request { */ uint64_t luid; char uuid[DM_UUID_LEN]; - char padding[7]; /* Padding because DM_UUID_LEN = 129 */ + char padding[3]; /* Padding because DM_UUID_LEN = 129 */ + uint32_t version; /* See DM_ULOG_REQUEST_VERSION */ int32_t error; /* Used to report back processing errors */ uint32_t seq; /* Sequence number for request */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f54adfc..a3b148a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -34,6 +34,7 @@ struct vm_area_struct; #else #define ___GFP_NOTRACK 0 #endif +#define ___GFP_NO_KSWAPD 0x400000u /* * GFP bitmasks.. @@ -81,13 +82,15 @@ struct vm_area_struct; #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ +#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) + /* * This may seem redundant, but it's a way of annotating false positives vs. * allocations that simply cannot be supported (e.g. page tables). */ #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 22 /* Room for 22 __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 23 /* Room for 23 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ @@ -106,6 +109,9 @@ struct vm_area_struct; __GFP_HARDWALL | __GFP_HIGHMEM | \ __GFP_MOVABLE) #define GFP_IOFS (__GFP_IO | __GFP_FS) +#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ + __GFP_NO_KSWAPD) #ifdef CONFIG_NUMA #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) @@ -325,14 +331,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_current(gfp_mask, order); } -extern struct page *alloc_page_vma(gfp_t gfp_mask, +extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0) +#define alloc_pages_vma(gfp_mask, order, vma, addr) \ + alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) +#define alloc_page_vma(gfp_mask, vma, addr) \ + alloc_pages_vma(gfp_mask, 0, vma, addr) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/include/linux/gpio.h b/include/linux/gpio.h index f79d67f..4b47ed9 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -30,7 +30,7 @@ static inline int gpio_is_valid(int number) return 0; } -static inline int __must_check gpio_request(unsigned gpio, const char *label) +static inline int gpio_request(unsigned gpio, const char *label) { return -ENOSYS; } @@ -62,12 +62,12 @@ static inline void gpio_free_array(struct gpio *array, size_t num) WARN_ON(1); } -static inline int __must_check gpio_direction_input(unsigned gpio) +static inline int gpio_direction_input(unsigned gpio) { return -ENOSYS; } -static inline int __must_check gpio_direction_output(unsigned gpio, int value) +static inline int gpio_direction_output(unsigned gpio, int value) { return -ENOSYS; } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h new file mode 100644 index 0000000..8e6c8c4 --- /dev/null +++ b/include/linux/huge_mm.h @@ -0,0 +1,179 @@ +#ifndef _LINUX_HUGE_MM_H +#define _LINUX_HUGE_MM_H + +extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + unsigned int flags); +extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *vma); +extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pmd_t orig_pmd); +extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm); +extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, + unsigned long addr, + pmd_t *pmd, + unsigned int flags); +extern int zap_huge_pmd(struct mmu_gather *tlb, + struct vm_area_struct *vma, + pmd_t *pmd); +extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned char *vec); +extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, pgprot_t newprot); + +enum transparent_hugepage_flag { + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, +#ifdef CONFIG_DEBUG_VM + TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, +#endif +}; + +enum page_check_address_pmd_flag { + PAGE_CHECK_ADDRESS_PMD_FLAG, + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, +}; +extern pmd_t *page_check_address_pmd(struct page *page, + struct mm_struct *mm, + unsigned long address, + enum page_check_address_pmd_flag flag); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define HPAGE_PMD_SHIFT HPAGE_SHIFT +#define HPAGE_PMD_MASK HPAGE_MASK +#define HPAGE_PMD_SIZE HPAGE_SIZE + +#define transparent_hugepage_enabled(__vma) \ + ((transparent_hugepage_flags & \ + (1<vm_flags & VM_HUGEPAGE))) && \ + !((__vma)->vm_flags & VM_NOHUGEPAGE)) +#define transparent_hugepage_defrag(__vma) \ + ((transparent_hugepage_flags & \ + (1<vm_flags & VM_HUGEPAGE)) +#ifdef CONFIG_DEBUG_VM +#define transparent_hugepage_debug_cow() \ + (transparent_hugepage_flags & \ + (1<root->lock); \ + /* \ + * spin_unlock_wait() is just a loop in C and so the \ + * CPU can reorder anything around it. \ + */ \ + smp_mb(); \ + BUG_ON(pmd_trans_splitting(*____pmd) || \ + pmd_trans_huge(*____pmd)); \ + } while (0) +#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) +#define HPAGE_PMD_NR (1< MAX_ORDER +#error "hugepages can't be allocated by the buddy allocator" +#endif +extern int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice); +extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next); +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + return; + __vma_adjust_trans_huge(vma, start, end, adjust_next); +} +static inline int hpage_nr_pages(struct page *page) +{ + if (unlikely(PageTransHuge(page))) + return HPAGE_PMD_NR; + return 1; +} +static inline struct page *compound_trans_head(struct page *page) +{ + if (PageTail(page)) { + struct page *head; + head = page->first_page; + smp_rmb(); + /* + * head may be a dangling pointer. + * __split_huge_page_refcount clears PageTail before + * overwriting first_page, so if PageTail is still + * there it means the head pointer isn't dangling. + */ + if (PageTail(page)) + return head; + } + return page; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +#define HPAGE_PMD_SHIFT ({ BUG(); 0; }) +#define HPAGE_PMD_MASK ({ BUG(); 0; }) +#define HPAGE_PMD_SIZE ({ BUG(); 0; }) + +#define hpage_nr_pages(x) 1 + +#define transparent_hugepage_enabled(__vma) 0 + +#define transparent_hugepage_flags 0UL +static inline int split_huge_page(struct page *page) +{ + return 0; +} +#define split_huge_page_pmd(__mm, __pmd) \ + do { } while (0) +#define wait_split_huge_page(__anon_vma, __pmd) \ + do { } while (0) +#define compound_trans_head(page) compound_head(page) +static inline int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) +{ + BUG(); + return 0; +} +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#endif /* _LINUX_HUGE_MM_H */ diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index 65aae34..045f2f2 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -454,6 +454,44 @@ unsigned int ipmi_addr_length(int addr_type); /* Validate that the given IPMI address is valid. */ int ipmi_validate_addr(struct ipmi_addr *addr, int len); +/* + * How did the IPMI driver find out about the device? + */ +enum ipmi_addr_src { + SI_INVALID = 0, SI_HOTMOD, SI_HARDCODED, SI_SPMI, SI_ACPI, SI_SMBIOS, + SI_PCI, SI_DEVICETREE, SI_DEFAULT +}; + +union ipmi_smi_info_union { + /* + * the acpi_info element is defined for the SI_ACPI + * address type + */ + struct { + void *acpi_handle; + } acpi_info; +}; + +struct ipmi_smi_info { + enum ipmi_addr_src addr_src; + + /* + * Base device for the interface. Don't forget to put this when + * you are done. + */ + struct device *dev; + + /* + * The addr_info provides more detailed info for some IPMI + * devices, depending on the addr_src. Currently only SI_ACPI + * info is provided. + */ + union ipmi_smi_info_union addr_info; +}; + +/* This is to get the private info of ipmi_smi_t */ +extern int ipmi_get_smi_info(int if_num, struct ipmi_smi_info *data); + #endif /* __KERNEL__ */ diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 4b48318..906590a 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -39,6 +39,7 @@ #include #include #include +#include /* This files describes the interface for IPMI system management interface drivers to bind into the IPMI message handler. */ @@ -86,6 +87,13 @@ struct ipmi_smi_handlers { int (*start_processing)(void *send_info, ipmi_smi_t new_intf); + /* + * Get the detailed private info of the low level interface and store + * it into the structure of ipmi_smi_data. For example: the + * ACPI device handle will be returned for the pnp_acpi IPMI device. + */ + int (*get_smi_info)(void *send_info, struct ipmi_smi_info *data); + /* Called to enqueue an SMI message to be sent. This operation is not allowed to fail. If an error occurs, it should report back the error in a received message. It may diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 979c68c..6a64c6f 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -57,7 +57,7 @@ struct irq_desc { #endif struct timer_rand_state *timer_rand_state; - unsigned int *kstat_irqs; + unsigned int __percpu *kstat_irqs; irq_flow_handler_t handle_irq; struct irqaction *action; /* IRQ action list */ unsigned int status; /* IRQ status */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 57dac70..5a9d905 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -600,6 +600,13 @@ struct sysinfo { #define NUMA_BUILD 0 #endif +/* This helps us avoid #ifdef CONFIG_COMPACTION */ +#ifdef CONFIG_COMPACTION +#define COMPACTION_BUILD 1 +#else +#define COMPACTION_BUILD 0 +#endif + /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ #ifdef CONFIG_FTRACE_MCOUNT_RECORD # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 44e83ba..0cce2db 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -46,16 +46,14 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); #ifndef CONFIG_GENERIC_HARDIRQS -#define kstat_irqs_this_cpu(irq) \ - (this_cpu_read(kstat.irqs[irq]) struct irq_desc; static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) { - kstat_this_cpu.irqs[irq]++; - kstat_this_cpu.irqs_sum++; + __this_cpu_inc(kstat.irqs[irq]); + __this_cpu_inc(kstat.irqs_sum); } static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) @@ -65,17 +63,18 @@ static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) #else #include extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); -#define kstat_irqs_this_cpu(DESC) \ - ((DESC)->kstat_irqs[smp_processor_id()]) -#define kstat_incr_irqs_this_cpu(irqno, DESC) do {\ - ((DESC)->kstat_irqs[smp_processor_id()]++);\ - kstat_this_cpu.irqs_sum++; } while (0) + +#define kstat_incr_irqs_this_cpu(irqno, DESC) \ +do { \ + __this_cpu_inc(*(DESC)->kstat_irqs); \ + __this_cpu_inc(kstat.irqs_sum); \ +} while (0) #endif static inline void kstat_incr_softirqs_this_cpu(unsigned int irq) { - kstat_this_cpu.softirqs[irq]++; + __this_cpu_inc(kstat.softirqs[irq]); } static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h new file mode 100644 index 0000000..6b394f0 --- /dev/null +++ b/include/linux/khugepaged.h @@ -0,0 +1,67 @@ +#ifndef _LINUX_KHUGEPAGED_H +#define _LINUX_KHUGEPAGED_H + +#include /* MMF_VM_HUGEPAGE */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern int __khugepaged_enter(struct mm_struct *mm); +extern void __khugepaged_exit(struct mm_struct *mm); +extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma); + +#define khugepaged_enabled() \ + (transparent_hugepage_flags & \ + ((1<flags)) + return __khugepaged_enter(mm); + return 0; +} + +static inline void khugepaged_exit(struct mm_struct *mm) +{ + if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) + __khugepaged_exit(mm); +} + +static inline int khugepaged_enter(struct vm_area_struct *vma) +{ + if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) + if ((khugepaged_always() || + (khugepaged_req_madv() && + vma->vm_flags & VM_HUGEPAGE)) && + !(vma->vm_flags & VM_NOHUGEPAGE)) + if (__khugepaged_enter(vma->vm_mm)) + return -ENOMEM; + return 0; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) +{ + return 0; +} +static inline void khugepaged_exit(struct mm_struct *mm) +{ +} +static inline int khugepaged_enter(struct vm_area_struct *vma) +{ + return 0; +} +static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +{ + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#endif /* _LINUX_KHUGEPAGED_H */ diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h index 9ee97e7..b2adbb4 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h @@ -16,7 +16,7 @@ * some fast and compact auxiliary data. */ -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +#if defined(CONFIG_SMP) #define LIST_BL_LOCKMASK 1UL #else #define LIST_BL_LOCKMASK 0UL @@ -62,7 +62,8 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); - LIST_BL_BUG_ON(!((unsigned long)h->first & LIST_BL_LOCKMASK)); + LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != + LIST_BL_LOCKMASK); h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK); } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 159a076..6a576f9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -25,6 +25,11 @@ struct page_cgroup; struct page; struct mm_struct; +/* Stats that can be updated by kernel. */ +enum mem_cgroup_page_stat_item { + MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */ +}; + extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct list_head *dst, unsigned long *scanned, int order, @@ -93,7 +98,7 @@ extern int mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **ptr); extern void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage); + struct page *oldpage, struct page *newpage, bool migration_ok); /* * For memory reclaim. @@ -121,7 +126,22 @@ static inline bool mem_cgroup_disabled(void) return false; } -void mem_cgroup_update_file_mapped(struct page *page, int val); +void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx, + int val); + +static inline void mem_cgroup_inc_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ + mem_cgroup_update_page_stat(page, idx, 1); +} + +static inline void mem_cgroup_dec_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ + mem_cgroup_update_page_stat(page, idx, -1); +} + unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask); u64 mem_cgroup_get_limit(struct mem_cgroup *mem); @@ -231,8 +251,7 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage, } static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, - struct page *newpage) + struct page *oldpage, struct page *newpage, bool migration_ok) { } @@ -293,8 +312,13 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline void mem_cgroup_update_file_mapped(struct page *page, - int val) +static inline void mem_cgroup_inc_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ +} + +static inline void mem_cgroup_dec_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) { } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 31c237a..24376fe 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -13,12 +13,16 @@ struct mem_section; #ifdef CONFIG_MEMORY_HOTPLUG /* - * Types for free bootmem. - * The normal smallest mapcount is -1. Here is smaller value than it. + * Types for free bootmem stored in page->lru.next. These have to be in + * some random range in unsigned long space for debugging purposes. */ -#define SECTION_INFO (-1 - 1) -#define MIX_SECTION_INFO (-1 - 2) -#define NODE_INFO (-1 - 3) +enum { + MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12, + SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE, + MIX_SECTION_INFO, + NODE_INFO, + MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, +}; /* * pgdat resizing functions diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 085527f..e39aeec 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -13,9 +13,11 @@ extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); extern int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining); + unsigned long private, bool offlining, + bool sync); extern int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining); + unsigned long private, bool offlining, + bool sync); extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -33,9 +35,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining) { return -ENOSYS; } + unsigned long private, bool offlining, + bool sync) { return -ENOSYS; } static inline int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining) { return -ENOSYS; } + unsigned long private, bool offlining, + bool sync) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 721f451..956a355 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -14,6 +14,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -82,6 +83,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_GROWSUP 0x00000200 #else #define VM_GROWSUP 0x00000000 +#define VM_NOHUGEPAGE 0x00000200 /* MADV_NOHUGEPAGE marked this vma */ #endif #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ @@ -101,7 +103,11 @@ extern unsigned int kobjsize(const void *objp); #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ +#ifndef CONFIG_TRANSPARENT_HUGEPAGE #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ +#else +#define VM_HUGEPAGE 0x01000000 /* MADV_HUGEPAGE marked this vma */ +#endif #define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ #define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ @@ -242,6 +248,7 @@ struct inode; * files which need it (119 of them) */ #include +#include /* * Methods to modify the page usage count. @@ -305,6 +312,39 @@ static inline int is_vmalloc_or_module_addr(const void *x) } #endif +static inline void compound_lock(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bit_spin_lock(PG_compound_lock, &page->flags); +#endif +} + +static inline void compound_unlock(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bit_spin_unlock(PG_compound_lock, &page->flags); +#endif +} + +static inline unsigned long compound_lock_irqsave(struct page *page) +{ + unsigned long uninitialized_var(flags); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + local_irq_save(flags); + compound_lock(page); +#endif + return flags; +} + +static inline void compound_unlock_irqrestore(struct page *page, + unsigned long flags) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + compound_unlock(page); + local_irq_restore(flags); +#endif +} + static inline struct page *compound_head(struct page *page) { if (unlikely(PageTail(page))) @@ -319,9 +359,29 @@ static inline int page_count(struct page *page) static inline void get_page(struct page *page) { - page = compound_head(page); - VM_BUG_ON(atomic_read(&page->_count) == 0); + /* + * Getting a normal page or the head of a compound page + * requires to already have an elevated page->_count. Only if + * we're getting a tail page, the elevated page->_count is + * required only in the head page, so for tail pages the + * bugcheck only verifies that the page->_count isn't + * negative. + */ + VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page)); atomic_inc(&page->_count); + /* + * Getting a tail page will elevate both the head and tail + * page->_count(s). + */ + if (unlikely(PageTail(page))) { + /* + * This is safe only because + * __split_huge_page_refcount can't run under + * get_page(). + */ + VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); + atomic_inc(&page->first_page->_count); + } } static inline struct page *virt_to_head_page(const void *x) @@ -339,6 +399,27 @@ static inline void init_page_count(struct page *page) atomic_set(&page->_count, 1); } +/* + * PageBuddy() indicate that the page is free and in the buddy system + * (see mm/page_alloc.c). + */ +static inline int PageBuddy(struct page *page) +{ + return atomic_read(&page->_mapcount) == -2; +} + +static inline void __SetPageBuddy(struct page *page) +{ + VM_BUG_ON(atomic_read(&page->_mapcount) != -1); + atomic_set(&page->_mapcount, -2); +} + +static inline void __ClearPageBuddy(struct page *page) +{ + VM_BUG_ON(!PageBuddy(page)); + atomic_set(&page->_mapcount, -1); +} + void put_page(struct page *page); void put_pages_list(struct list_head *pages); @@ -370,12 +451,39 @@ static inline int compound_order(struct page *page) return (unsigned long)page[1].lru.prev; } +static inline int compound_trans_order(struct page *page) +{ + int order; + unsigned long flags; + + if (!PageHead(page)) + return 0; + + flags = compound_lock_irqsave(page); + order = compound_order(page); + compound_unlock_irqrestore(page, flags); + return order; +} + static inline void set_compound_order(struct page *page, unsigned long order) { page[1].lru.prev = (void *)order; } /* + * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when + * servicing faults for write access. In the normal case, do always want + * pte_mkwrite. But get_user_pages can cause write faults for mappings + * that do not have writing enabled, when used by access_process_vm. + */ +static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pte = pte_mkwrite(pte); + return pte; +} + +/* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of * zeroes, and text pages of executables and shared libraries have @@ -657,7 +765,7 @@ static inline struct address_space *page_mapping(struct page *page) VM_BUG_ON(PageSlab(page)); if (unlikely(PageSwapCache(page))) mapping = &swapper_space; - else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON)) + else if ((unsigned long)mapping & PAGE_MAPPING_ANON) mapping = NULL; return mapping; } @@ -1064,7 +1172,8 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); #endif -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); +int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address); int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); /* @@ -1133,16 +1242,18 @@ static inline void pgtable_page_dtor(struct page *page) pte_unmap(pte); \ } while (0) -#define pte_alloc_map(mm, pmd, address) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ - NULL: pte_offset_map(pmd, address)) +#define pte_alloc_map(mm, vma, pmd, address) \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma, \ + pmd, address))? \ + NULL: pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL, \ + pmd, address))? \ NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ NULL: pte_offset_kernel(pmd, address)) extern void free_area_init(unsigned long * zones_size); @@ -1415,6 +1526,8 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_GET 0x04 /* do get_page on page */ #define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ +#define FOLL_MLOCK 0x40 /* mark page as mlocked */ +#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); @@ -1518,5 +1631,14 @@ static inline int is_hwpoison_address(unsigned long addr) extern void dump_page(struct page *page); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +extern void clear_huge_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page); +extern void copy_user_huge_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma, + unsigned int pages_per_huge_page); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8835b87..8f7d247 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -1,6 +1,8 @@ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H +#include + /** * page_is_file_cache - should the page be on a file LRU or anon LRU? * @page: the page to test @@ -20,18 +22,25 @@ static inline int page_is_file_cache(struct page *page) } static inline void -add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) +__add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, + struct list_head *head) { - list_add(&page->lru, &zone->lru[l].list); - __inc_zone_state(zone, NR_LRU_BASE + l); + list_add(&page->lru, head); + __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); mem_cgroup_add_lru_list(page, l); } static inline void +add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) +{ + __add_page_to_lru_list(zone, page, l, &zone->lru[l].list); +} + +static inline void del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) { list_del(&page->lru); - __dec_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); mem_cgroup_del_lru_list(page, l); } @@ -66,7 +75,7 @@ del_page_from_lru(struct zone *zone, struct page *page) l += LRU_ACTIVE; } } - __dec_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); mem_cgroup_del_lru_list(page, l); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bb7288a..26bc4e2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -310,6 +310,9 @@ struct mm_struct { #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_mm *mmu_notifier_mm; #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pgtable_t pmd_huge_pte; /* protected by page_table_lock */ +#endif /* How many tasks sharing this mm are OOM_DISABLE */ atomic_t oom_disable_count; }; diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 43dcfbd..cc2e7df 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -62,6 +62,16 @@ struct mmu_notifier_ops { unsigned long address); /* + * test_young is called to check the young/accessed bitflag in + * the secondary pte. This is used to know if the page is + * frequently used without actually clearing the flag or tearing + * down the secondary mapping on the page. + */ + int (*test_young)(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address); + + /* * change_pte is called in cases that pte mapping to page is changed: * for example, when ksm remaps pte to point to a new shared page. */ @@ -163,6 +173,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long address); +extern int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); extern void __mmu_notifier_invalidate_page(struct mm_struct *mm, @@ -186,6 +198,14 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline int mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + if (mm_has_notifiers(mm)) + return __mmu_notifier_test_young(mm, address); + return 0; +} + static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { @@ -243,6 +263,32 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) __pte; \ }) +#define pmdp_clear_flush_notify(__vma, __address, __pmdp) \ +({ \ + pmd_t __pmd; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \ + mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE);\ + __pmd = pmdp_clear_flush(___vma, ___address, __pmdp); \ + mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE); \ + __pmd; \ +}) + +#define pmdp_splitting_flush_notify(__vma, __address, __pmdp) \ +({ \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \ + mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE);\ + pmdp_splitting_flush(___vma, ___address, __pmdp); \ + mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE); \ +}) + #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ @@ -254,6 +300,17 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) __young; \ }) +#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \ +({ \ + int __young; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \ + __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ + ___address); \ + __young; \ +}) + #define set_pte_at_notify(__mm, __address, __ptep, __pte) \ ({ \ struct mm_struct *___mm = __mm; \ @@ -276,6 +333,12 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline int mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + return 0; +} + static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { @@ -305,7 +368,10 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) } #define ptep_clear_flush_young_notify ptep_clear_flush_young +#define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_flush_notify ptep_clear_flush +#define pmdp_clear_flush_notify pmdp_clear_flush +#define pmdp_splitting_flush_notify pmdp_splitting_flush #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 39c24eb..02ecb01 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -114,6 +114,7 @@ enum zone_stat_item { NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ #endif + NR_ANON_TRANSPARENT_HUGEPAGES, NR_VM_ZONE_STAT_ITEMS }; /* @@ -458,12 +459,6 @@ static inline int zone_is_oom_locked(const struct zone *zone) return test_bit(ZONE_OOM_LOCKED, &zone->flags); } -#ifdef CONFIG_SMP -unsigned long zone_nr_free_pages(struct zone *zone); -#else -#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES) -#endif /* CONFIG_SMP */ - /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the @@ -645,6 +640,7 @@ typedef struct pglist_data { wait_queue_head_t kswapd_wait; struct task_struct *kswapd; int kswapd_max_order; + enum zone_type classzone_idx; } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) @@ -660,8 +656,10 @@ typedef struct pglist_data { extern struct mutex zonelists_mutex; void build_all_zonelists(void *data); -void wakeup_kswapd(struct zone *zone, int order); -int zone_watermark_ok(struct zone *z, int order, unsigned long mark, +void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); +bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags); +bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); enum memmap_context { MEMMAP_EARLY, diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5f38c46..0db8037 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -48,9 +48,6 @@ * struct page (these bits with information) are always mapped into kernel * address space... * - * PG_buddy is set to indicate that the page is free and in the buddy system - * (see mm/page_alloc.c). - * * PG_hwpoison indicates that a page got corrupted in hardware and contains * data with incorrect ECC bits that triggered a machine check. Accessing is * not safe since it may cause another machine check. Don't touch! @@ -96,7 +93,6 @@ enum pageflags { PG_swapcache, /* Swap page: swp_entry_t in private */ PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ - PG_buddy, /* Page is free, on buddy lists */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ #ifdef CONFIG_MMU @@ -108,6 +104,9 @@ enum pageflags { #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + PG_compound_lock, +#endif __NR_PAGEFLAGS, /* Filesystems */ @@ -198,7 +197,7 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } struct page; /* forward declaration */ TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked) -PAGEFLAG(Error, error) +PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) @@ -230,7 +229,6 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) * risky: they bypass page accounting. */ TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) -__PAGEFLAG(Buddy, buddy) PAGEFLAG(MappedToDisk, mappedtodisk) /* PG_readahead is only used for file reads; PG_reclaim is only for writes */ @@ -344,7 +342,7 @@ static inline void set_page_writeback(struct page *page) * tests can be used in performance sensitive paths. PageCompound is * generally not used in hot code paths. */ -__PAGEFLAG(Head, head) +__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head) __PAGEFLAG(Tail, tail) static inline int PageCompound(struct page *page) @@ -352,6 +350,13 @@ static inline int PageCompound(struct page *page) return page->flags & ((1L << PG_head) | (1L << PG_tail)); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void ClearPageCompound(struct page *page) +{ + BUG_ON(!PageHead(page)); + ClearPageHead(page); +} +#endif #else /* * Reduce page flag use as much as possible by overlapping @@ -389,14 +394,61 @@ static inline void __ClearPageTail(struct page *page) page->flags &= ~PG_head_tail_mask; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void ClearPageCompound(struct page *page) +{ + BUG_ON((page->flags & PG_head_tail_mask) != (1 << PG_compound)); + clear_bit(PG_compound, &page->flags); +} +#endif + #endif /* !PAGEFLAGS_EXTENDED */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * PageHuge() only returns true for hugetlbfs pages, but not for + * normal or transparent huge pages. + * + * PageTransHuge() returns true for both transparent huge and + * hugetlbfs pages, but not normal pages. PageTransHuge() can only be + * called only in the core VM paths where hugetlbfs pages can't exist. + */ +static inline int PageTransHuge(struct page *page) +{ + VM_BUG_ON(PageTail(page)); + return PageHead(page); +} + +static inline int PageTransCompound(struct page *page) +{ + return PageCompound(page); +} + +#else + +static inline int PageTransHuge(struct page *page) +{ + return 0; +} + +static inline int PageTransCompound(struct page *page) +{ + return 0; +} +#endif + #ifdef CONFIG_MMU #define __PG_MLOCKED (1 << PG_mlocked) #else #define __PG_MLOCKED 0 #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define __PG_COMPOUND_LOCK (1 << PG_compound_lock) +#else +#define __PG_COMPOUND_LOCK 0 +#endif + /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. It they are, there is a problem. @@ -404,9 +456,10 @@ static inline void __ClearPageTail(struct page *page) #define PAGE_FLAGS_CHECK_AT_FREE \ (1 << PG_lru | 1 << PG_locked | \ 1 << PG_private | 1 << PG_private_2 | \ - 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ + 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON) + 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \ + __PG_COMPOUND_LOCK) /* * Flags checked when a page is prepped for return by the page allocator. diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index b02195d..5b0c971 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -35,12 +35,18 @@ struct page_cgroup *lookup_page_cgroup(struct page *page); enum { /* flags for mem_cgroup */ - PCG_LOCK, /* page cgroup is locked */ + PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ PCG_CACHE, /* charged as cache */ PCG_USED, /* this object is in use. */ - PCG_ACCT_LRU, /* page has been accounted for */ - PCG_FILE_MAPPED, /* page is accounted as "mapped" */ PCG_MIGRATION, /* under page migration */ + /* flags for mem_cgroup and file and I/O status */ + PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ + PCG_FILE_MAPPED, /* page is accounted as "mapped" */ + PCG_FILE_DIRTY, /* page is dirty */ + PCG_FILE_WRITEBACK, /* page is under writeback */ + PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */ + /* No lock in page_cgroup */ + PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ }; #define TESTPCGFLAG(uname, lname) \ @@ -59,6 +65,10 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \ { return test_and_clear_bit(PCG_##lname, &pc->flags); } +#define TESTSETPCGFLAG(uname, lname) \ +static inline int TestSetPageCgroup##uname(struct page_cgroup *pc) \ + { return test_and_set_bit(PCG_##lname, &pc->flags); } + /* Cache flag is set only once (at allocation) */ TESTPCGFLAG(Cache, CACHE) CLEARPCGFLAG(Cache, CACHE) @@ -78,6 +88,22 @@ SETPCGFLAG(FileMapped, FILE_MAPPED) CLEARPCGFLAG(FileMapped, FILE_MAPPED) TESTPCGFLAG(FileMapped, FILE_MAPPED) +SETPCGFLAG(FileDirty, FILE_DIRTY) +CLEARPCGFLAG(FileDirty, FILE_DIRTY) +TESTPCGFLAG(FileDirty, FILE_DIRTY) +TESTCLEARPCGFLAG(FileDirty, FILE_DIRTY) +TESTSETPCGFLAG(FileDirty, FILE_DIRTY) + +SETPCGFLAG(FileWriteback, FILE_WRITEBACK) +CLEARPCGFLAG(FileWriteback, FILE_WRITEBACK) +TESTPCGFLAG(FileWriteback, FILE_WRITEBACK) + +SETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +CLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +TESTPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +TESTCLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +TESTSETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) + SETPCGFLAG(Migration, MIGRATION) CLEARPCGFLAG(Migration, MIGRATION) TESTPCGFLAG(Migration, MIGRATION) @@ -94,6 +120,10 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) static inline void lock_page_cgroup(struct page_cgroup *pc) { + /* + * Don't take this lock in IRQ context. + * This lock is for pc->mem_cgroup, USED, CACHE, MIGRATION + */ bit_spin_lock(PCG_LOCK, &pc->flags); } @@ -107,6 +137,24 @@ static inline int page_is_cgroup_locked(struct page_cgroup *pc) return bit_spin_is_locked(PCG_LOCK, &pc->flags); } +static inline void move_lock_page_cgroup(struct page_cgroup *pc, + unsigned long *flags) +{ + /* + * We know updates to pc->flags of page cache's stats are from both of + * usual context or IRQ context. Disable IRQ to avoid deadlock. + */ + local_irq_save(*flags); + bit_spin_lock(PCG_MOVE_LOCK, &pc->flags); +} + +static inline void move_unlock_page_cgroup(struct page_cgroup *pc, + unsigned long *flags) +{ + bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags); + local_irq_restore(*flags); +} + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2d1ffe3..9c66e99 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -48,7 +48,7 @@ static inline void mapping_clear_unevictable(struct address_space *mapping) static inline int mapping_unevictable(struct address_space *mapping) { - if (likely(mapping)) + if (mapping) return test_bit(AS_UNEVICTABLE, &mapping->flags); return !!mapping; } diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index ab2baa5..23241c2 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -146,6 +146,22 @@ static inline void *radix_tree_deref_slot(void **pslot) } /** + * radix_tree_deref_slot_protected - dereference a slot without RCU lock but with tree lock held + * @pslot: pointer to slot, returned by radix_tree_lookup_slot + * Returns: item that was stored in that slot with any direct pointer flag + * removed. + * + * Similar to radix_tree_deref_slot but only used during migration when a pages + * mapping is being moved. The caller does not hold the RCU read lock but it + * must hold the tree lock to prevent parallel updates. + */ +static inline void *radix_tree_deref_slot_protected(void **pslot, + spinlock_t *treelock) +{ + return rcu_dereference_protected(*pslot, lockdep_is_held(treelock)); +} + +/** * radix_tree_deref_retry - check radix_tree_deref_slot * @arg: pointer returned by radix_tree_deref_slot * Returns: 0 if retry is not required, otherwise retry is required diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h index b872b49..cf1244f 100644 --- a/include/linux/rculist_bl.h +++ b/include/linux/rculist_bl.h @@ -11,7 +11,8 @@ static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); - LIST_BL_BUG_ON(!((unsigned long)h->first & LIST_BL_LOCKMASK)); + LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != + LIST_BL_LOCKMASK); rcu_assign_pointer(h->first, (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK)); } diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bb83c0d..e9fd04c 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -198,6 +198,8 @@ enum ttu_flags { }; #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) +bool is_vma_temporary_stack(struct vm_area_struct *vma); + int try_to_unmap(struct page *, enum ttu_flags flags); int try_to_unmap_one(struct page *, struct vm_area_struct *, unsigned long address, enum ttu_flags flags); diff --git a/include/linux/sched.h b/include/linux/sched.h index 96e2321..d747f94 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -21,7 +21,8 @@ #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ -#define CLONE_STOPPED 0x02000000 /* Start in stopped state */ +/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) + and is now available for re-use. */ #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ #define CLONE_NEWIPC 0x08000000 /* New ipcs */ #define CLONE_NEWUSER 0x10000000 /* New user namespace */ @@ -433,6 +434,7 @@ extern int get_dumpable(struct mm_struct *mm); #endif /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ +#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) @@ -633,6 +635,8 @@ struct signal_struct { int oom_adj; /* OOM kill score adjustment (bit shift) */ int oom_score_adj; /* OOM kill score adjustment */ + int oom_score_adj_min; /* OOM kill score adjustment minimum value. + * Only settable by CAP_SYS_RESOURCE. */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations diff --git a/include/linux/suspend.h b/include/linux/suspend.h index c1f4998..5a89e36 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -258,23 +258,6 @@ static inline int hibernate(void) { return -ENOSYS; } static inline bool system_entering_hibernation(void) { return false; } #endif /* CONFIG_HIBERNATION */ -#ifdef CONFIG_SUSPEND_NVS -extern int suspend_nvs_register(unsigned long start, unsigned long size); -extern int suspend_nvs_alloc(void); -extern void suspend_nvs_free(void); -extern void suspend_nvs_save(void); -extern void suspend_nvs_restore(void); -#else /* CONFIG_SUSPEND_NVS */ -static inline int suspend_nvs_register(unsigned long a, unsigned long b) -{ - return 0; -} -static inline int suspend_nvs_alloc(void) { return 0; } -static inline void suspend_nvs_free(void) {} -static inline void suspend_nvs_save(void) {} -static inline void suspend_nvs_restore(void) {} -#endif /* CONFIG_SUSPEND_NVS */ - #ifdef CONFIG_PM_SLEEP void save_processor_state(void); void restore_processor_state(void); diff --git a/include/linux/swap.h b/include/linux/swap.h index eba53e7..4d55932 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -208,6 +208,8 @@ extern unsigned int nr_free_pagecache_pages(void); /* linux/mm/swap.c */ extern void __lru_cache_add(struct page *, enum lru_list lru); extern void lru_cache_add_lru(struct page *, enum lru_list lru); +extern void lru_add_page_tail(struct zone* zone, + struct page *page, struct page *page_tail); extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 1de8b9e..8651556 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -77,7 +77,7 @@ struct thermal_cooling_device { char type[THERMAL_NAME_LENGTH]; struct device device; void *devdata; - struct thermal_cooling_device_ops *ops; + const struct thermal_cooling_device_ops *ops; struct list_head node; }; @@ -114,7 +114,7 @@ struct thermal_zone_device { int last_temperature; bool passive; unsigned int forced_passive; - struct thermal_zone_device_ops *ops; + const struct thermal_zone_device_ops *ops; struct list_head cooling_devices; struct idr idr; struct mutex lock; /* protect cooling devices list */ @@ -127,13 +127,41 @@ struct thermal_zone_device { struct thermal_hwmon_attr temp_crit; /* hwmon sys attr */ #endif }; +/* Adding event notification support elements */ +#define THERMAL_GENL_FAMILY_NAME "thermal_event" +#define THERMAL_GENL_VERSION 0x01 +#define THERMAL_GENL_MCAST_GROUP_NAME "thermal_mc_group" + +enum events { + THERMAL_AUX0, + THERMAL_AUX1, + THERMAL_CRITICAL, + THERMAL_DEV_FAULT, +}; + +struct thermal_genl_event { + u32 orig; + enum events event; +}; +/* attributes of thermal_genl_family */ +enum { + THERMAL_GENL_ATTR_UNSPEC, + THERMAL_GENL_ATTR_EVENT, + __THERMAL_GENL_ATTR_MAX, +}; +#define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1) + +/* commands supported by the thermal_genl_family */ +enum { + THERMAL_GENL_CMD_UNSPEC, + THERMAL_GENL_CMD_EVENT, + __THERMAL_GENL_CMD_MAX, +}; +#define THERMAL_GENL_CMD_MAX (__THERMAL_GENL_CMD_MAX - 1) struct thermal_zone_device *thermal_zone_device_register(char *, int, void *, - struct - thermal_zone_device_ops - *, int tc1, int tc2, - int passive_freq, - int polling_freq); + const struct thermal_zone_device_ops *, int tc1, int tc2, + int passive_freq, int polling_freq); void thermal_zone_device_unregister(struct thermal_zone_device *); int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int, @@ -142,9 +170,8 @@ int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *); void thermal_zone_device_update(struct thermal_zone_device *); struct thermal_cooling_device *thermal_cooling_device_register(char *, void *, - struct - thermal_cooling_device_ops - *); + const struct thermal_cooling_device_ops *); void thermal_cooling_device_unregister(struct thermal_cooling_device *); +extern int generate_netlink_event(u32 orig, enum events event); #endif /* __THERMAL_H__ */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 44b54f6..4ed6fcd 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -59,8 +59,9 @@ extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); -extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot); +extern void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, int node, void *caller); extern void vfree(const void *addr); extern void *vmap(struct page **pages, unsigned int count, @@ -90,9 +91,6 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, void *caller); -extern struct vm_struct *get_vm_area_node(unsigned long size, - unsigned long flags, int node, - gfp_t gfp_mask); extern struct vm_struct *remove_vm_area(const void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, @@ -120,7 +118,7 @@ extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); #ifdef CONFIG_SMP struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, - size_t align, gfp_t gfp_mask); + size_t align); void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms); #endif diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index eaaea37..833e676 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -254,6 +254,11 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); void refresh_cpu_vm_stats(int); + +int calculate_pressure_threshold(struct zone *zone); +int calculate_normal_threshold(struct zone *zone); +void set_pgdat_percpu_threshold(pg_data_t *pgdat, + int (*calculate_pressure)(struct zone *)); #else /* CONFIG_SMP */ /* @@ -298,6 +303,8 @@ static inline void __dec_zone_page_state(struct page *page, #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state +#define set_pgdat_percpu_threshold(pgdat, callback) { } + static inline void refresh_cpu_vm_stats(int cpu) { } #endif diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h new file mode 100644 index 0000000..388bcdd --- /dev/null +++ b/include/trace/events/compaction.h @@ -0,0 +1,74 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM compaction + +#if !defined(_TRACE_COMPACTION_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_COMPACTION_H + +#include +#include +#include "gfpflags.h" + +DECLARE_EVENT_CLASS(mm_compaction_isolate_template, + + TP_PROTO(unsigned long nr_scanned, + unsigned long nr_taken), + + TP_ARGS(nr_scanned, nr_taken), + + TP_STRUCT__entry( + __field(unsigned long, nr_scanned) + __field(unsigned long, nr_taken) + ), + + TP_fast_assign( + __entry->nr_scanned = nr_scanned; + __entry->nr_taken = nr_taken; + ), + + TP_printk("nr_scanned=%lu nr_taken=%lu", + __entry->nr_scanned, + __entry->nr_taken) +); + +DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages, + + TP_PROTO(unsigned long nr_scanned, + unsigned long nr_taken), + + TP_ARGS(nr_scanned, nr_taken) +); + +DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, + TP_PROTO(unsigned long nr_scanned, + unsigned long nr_taken), + + TP_ARGS(nr_scanned, nr_taken) +); + +TRACE_EVENT(mm_compaction_migratepages, + + TP_PROTO(unsigned long nr_migrated, + unsigned long nr_failed), + + TP_ARGS(nr_migrated, nr_failed), + + TP_STRUCT__entry( + __field(unsigned long, nr_migrated) + __field(unsigned long, nr_failed) + ), + + TP_fast_assign( + __entry->nr_migrated = nr_migrated; + __entry->nr_failed = nr_failed; + ), + + TP_printk("nr_migrated=%lu nr_failed=%lu", + __entry->nr_migrated, + __entry->nr_failed) +); + + +#endif /* _TRACE_COMPACTION_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index c255fcc..ea422aa 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -25,13 +25,13 @@ #define trace_reclaim_flags(page, sync) ( \ (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ - (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) #define trace_shrink_flags(file, sync) ( \ - (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \ + (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_MIXED : \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ - (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) TRACE_EVENT(mm_vmscan_kswapd_sleep, diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 89a2b2d..4e249b9 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -81,6 +81,7 @@ DEFINE_EVENT(writeback_class, name, \ TP_ARGS(bdi)) DEFINE_WRITEBACK_EVENT(writeback_nowork); +DEFINE_WRITEBACK_EVENT(writeback_wake_background); DEFINE_WRITEBACK_EVENT(writeback_wake_thread); DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread); DEFINE_WRITEBACK_EVENT(writeback_bdi_register); diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h new file mode 100644 index 0000000..eb23f41 --- /dev/null +++ b/include/xen/gntdev.h @@ -0,0 +1,119 @@ +/****************************************************************************** + * gntdev.h + * + * Interface to /dev/xen/gntdev. + * + * Copyright (c) 2007, D G Murray + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __LINUX_PUBLIC_GNTDEV_H__ +#define __LINUX_PUBLIC_GNTDEV_H__ + +struct ioctl_gntdev_grant_ref { + /* The domain ID of the grant to be mapped. */ + uint32_t domid; + /* The grant reference of the grant to be mapped. */ + uint32_t ref; +}; + +/* + * Inserts the grant references into the mapping table of an instance + * of gntdev. N.B. This does not perform the mapping, which is deferred + * until mmap() is called with @index as the offset. + */ +#define IOCTL_GNTDEV_MAP_GRANT_REF \ +_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref)) +struct ioctl_gntdev_map_grant_ref { + /* IN parameters */ + /* The number of grants to be mapped. */ + uint32_t count; + uint32_t pad; + /* OUT parameters */ + /* The offset to be used on a subsequent call to mmap(). */ + uint64_t index; + /* Variable IN parameter. */ + /* Array of grant references, of size @count. */ + struct ioctl_gntdev_grant_ref refs[1]; +}; + +/* + * Removes the grant references from the mapping table of an instance of + * of gntdev. N.B. munmap() must be called on the relevant virtual address(es) + * before this ioctl is called, or an error will result. + */ +#define IOCTL_GNTDEV_UNMAP_GRANT_REF \ +_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) +struct ioctl_gntdev_unmap_grant_ref { + /* IN parameters */ + /* The offset was returned by the corresponding map operation. */ + uint64_t index; + /* The number of pages to be unmapped. */ + uint32_t count; + uint32_t pad; +}; + +/* + * Returns the offset in the driver's address space that corresponds + * to @vaddr. This can be used to perform a munmap(), followed by an + * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by + * the caller. The number of pages that were allocated at the same time as + * @vaddr is returned in @count. + * + * N.B. Where more than one page has been mapped into a contiguous range, the + * supplied @vaddr must correspond to the start of the range; otherwise + * an error will result. It is only possible to munmap() the entire + * contiguously-allocated range at once, and not any subrange thereof. + */ +#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \ +_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr)) +struct ioctl_gntdev_get_offset_for_vaddr { + /* IN parameters */ + /* The virtual address of the first mapped page in a range. */ + uint64_t vaddr; + /* OUT parameters */ + /* The offset that was used in the initial mmap() operation. */ + uint64_t offset; + /* The number of pages mapped in the VM area that begins at @vaddr. */ + uint32_t count; + uint32_t pad; +}; + +/* + * Sets the maximum number of grants that may mapped at once by this gntdev + * instance. + * + * N.B. This must be called before any other ioctl is performed on the device. + */ +#define IOCTL_GNTDEV_SET_MAX_GRANTS \ +_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants)) +struct ioctl_gntdev_set_max_grants { + /* IN parameter */ + /* The maximum number of grants that may be mapped at once. */ + uint32_t count; +}; + +#endif /* __LINUX_PUBLIC_GNTDEV_H__ */ diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 9a73170..b1fab6b 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -37,10 +37,16 @@ #ifndef __ASM_GNTTAB_H__ #define __ASM_GNTTAB_H__ -#include +#include + +#include #include + +#include #include +#include + /* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */ #define NR_GRANT_FRAMES 4 @@ -107,6 +113,37 @@ void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, unsigned long pfn); +static inline void +gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr, + uint32_t flags, grant_ref_t ref, domid_t domid) +{ + if (flags & GNTMAP_contains_pte) + map->host_addr = addr; + else if (xen_feature(XENFEAT_auto_translated_physmap)) + map->host_addr = __pa(addr); + else + map->host_addr = addr; + + map->flags = flags; + map->ref = ref; + map->dom = domid; +} + +static inline void +gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr, + uint32_t flags, grant_handle_t handle) +{ + if (flags & GNTMAP_contains_pte) + unmap->host_addr = addr; + else if (xen_feature(XENFEAT_auto_translated_physmap)) + unmap->host_addr = __pa(addr); + else + unmap->host_addr = addr; + + unmap->handle = handle; + unmap->dev_bus_addr = 0; +} + int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, unsigned long max_nr_gframes, struct grant_entry **__shared); @@ -118,4 +155,9 @@ unsigned int gnttab_max_grant_frames(void); #define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr)) +int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, + struct page **pages, unsigned int count); +int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, + struct page **pages, unsigned int count); + #endif /* __ASM_GNTTAB_H__ */ diff --git a/kernel/fork.c b/kernel/fork.c index d9b44f2..25e4291 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -330,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) retval = ksm_fork(mm, oldmm); if (retval) goto out; + retval = khugepaged_fork(mm, oldmm); + if (retval) + goto out; prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { @@ -529,6 +533,9 @@ void __mmdrop(struct mm_struct *mm) mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(mm->pmd_huge_pte); +#endif free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -543,6 +550,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { @@ -669,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk) mm->token_priority = 0; mm->last_interval = 0; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + mm->pmd_huge_pte = NULL; +#endif + if (!mm_init(mm, tsk)) goto fail_nomem; @@ -910,6 +922,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; + sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); @@ -1410,23 +1423,6 @@ long do_fork(unsigned long clone_flags, } /* - * We hope to recycle these flags after 2.6.26 - */ - if (unlikely(clone_flags & CLONE_STOPPED)) { - static int __read_mostly count = 100; - - if (count > 0 && printk_ratelimit()) { - char comm[TASK_COMM_LEN]; - - count--; - printk(KERN_INFO "fork(): process `%s' used deprecated " - "clone flags 0x%lx\n", - get_task_comm(comm, current), - clone_flags & CLONE_STOPPED); - } - } - - /* * When called from kernel_thread, don't do user tracing stuff. */ if (likely(user_mode(regs))) @@ -1464,16 +1460,7 @@ long do_fork(unsigned long clone_flags, */ p->flags &= ~PF_STARTING; - if (unlikely(clone_flags & CLONE_STOPPED)) { - /* - * We'll start up with an immediate SIGSTOP. - */ - sigaddset(&p->pending.signal, SIGSTOP); - set_tsk_thread_flag(p, TIF_SIGPENDING); - __set_task_state(p, TASK_STOPPED); - } else { - wake_up_new_task(p, clone_flags); - } + wake_up_new_task(p, clone_flags); tracehook_report_clone_complete(trace, regs, clone_flags, nr, p); diff --git a/kernel/futex.c b/kernel/futex.c index 3019b92..5207563 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page; + struct page *page, *page_head; int err; /* @@ -265,11 +265,46 @@ again: if (err < 0) return err; - page = compound_head(page); - lock_page(page); - if (!page->mapping) { - unlock_page(page); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + page_head = page; + if (unlikely(PageTail(page))) { put_page(page); + /* serialize against __split_huge_page_splitting() */ + local_irq_disable(); + if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { + page_head = compound_head(page); + /* + * page_head is valid pointer but we must pin + * it before taking the PG_lock and/or + * PG_compound_lock. The moment we re-enable + * irqs __split_huge_page_splitting() can + * return and the head page can be freed from + * under us. We can't take the PG_lock and/or + * PG_compound_lock on a page that could be + * freed from under us. + */ + if (page != page_head) { + get_page(page_head); + put_page(page); + } + local_irq_enable(); + } else { + local_irq_enable(); + goto again; + } + } +#else + page_head = compound_head(page); + if (page != page_head) { + get_page(page_head); + put_page(page); + } +#endif + + lock_page(page_head); + if (!page_head->mapping) { + unlock_page(page_head); + put_page(page_head); goto again; } @@ -280,20 +315,20 @@ again: * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ - if (PageAnon(page)) { + if (PageAnon(page_head)) { key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; } else { key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = page->mapping->host; - key->shared.pgoff = page->index; + key->shared.inode = page_head->mapping->host; + key->shared.pgoff = page_head->index; } get_futex_key_refs(key); - unlock_page(page); - put_page(page); + unlock_page(page_head); + put_page(page_head); return 0; } diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9988d03..282f202 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; } static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) { + int cpu; + desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; @@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_count = 0; desc->irqs_unhandled = 0; desc->name = NULL; - memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); + for_each_possible_cpu(cpu) + *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; desc_smp_init(desc, node); } @@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node) if (!desc) return NULL; /* allocate based on nr_cpu_ids */ - desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), - gfp, node); + desc->kstat_irqs = alloc_percpu(unsigned int); if (!desc->kstat_irqs) goto err_desc; @@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node) return desc; err_kstat: - kfree(desc->kstat_irqs); + free_percpu(desc->kstat_irqs); err_desc: kfree(desc); return NULL; @@ -166,7 +168,7 @@ static void free_desc(unsigned int irq) mutex_unlock(&sparse_irq_lock); free_masks(desc); - kfree(desc->kstat_irqs); + free_percpu(desc->kstat_irqs); kfree(desc); } @@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { } }; -static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; int __init early_irq_init(void) { int count, i, node = first_online_node; @@ -250,7 +251,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; - desc[i].kstat_irqs = kstat_irqs_all[i]; + /* TODO : do this allocation on-demand ... */ + desc[i].kstat_irqs = alloc_percpu(unsigned int); alloc_masks(desc + i, GFP_KERNEL, node); desc_smp_init(desc + i, node); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); @@ -275,6 +277,22 @@ static void free_desc(unsigned int irq) static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) { +#if defined(CONFIG_KSTAT_IRQS_ONDEMAND) + struct irq_desc *desc; + unsigned int i; + + for (i = 0; i < cnt; i++) { + desc = irq_to_desc(start + i); + if (desc && !desc->kstat_irqs) { + unsigned int __percpu *stats = alloc_percpu(unsigned int); + + if (!stats) + return -1; + if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL) + free_percpu(stats); + } + } +#endif return start; } #endif /* !CONFIG_SPARSE_IRQ */ @@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq) unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); - return desc ? desc->kstat_irqs[cpu] : 0; + + return desc && desc->kstat_irqs ? + *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; } #ifdef CONFIG_GENERIC_HARDIRQS @@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq) int cpu; int sum = 0; - if (!desc) + if (!desc || !desc->kstat_irqs) return 0; for_each_possible_cpu(cpu) - sum += desc->kstat_irqs[cpu]; + sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; } #endif /* CONFIG_GENERIC_HARDIRQS */ diff --git a/kernel/panic.c b/kernel/panic.c index 4c13b1a..991bb87 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,6 +34,7 @@ static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); int panic_timeout; +EXPORT_SYMBOL_GPL(panic_timeout); ATOMIC_NOTIFIER_HEAD(panic_notifier_list); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a5aff3e..2657299 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG depends on PM_ADVANCED_DEBUG default n -config SUSPEND_NVS - bool - config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE - select SUSPEND_NVS if HAS_IOMEM default y ---help--- Allow the system to enter sleep states in which main memory is @@ -140,7 +136,6 @@ config HIBERNATION depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE select LZO_COMPRESS select LZO_DECOMPRESS - select SUSPEND_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the diff --git a/kernel/power/Makefile b/kernel/power/Makefile index b755972..c350e18 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -7,6 +7,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o -obj-$(CONFIG_SUSPEND_NVS) += nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c deleted file mode 100644 index 1836db6..0000000 --- a/kernel/power/nvs.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory - * - * Copyright (C) 2008,2009 Rafael J. Wysocki , Novell Inc. - * - * This file is released under the GPLv2. - */ - -#include -#include -#include -#include -#include -#include - -/* - * Platforms, like ACPI, may want us to save some memory used by them during - * suspend and to restore the contents of this memory during the subsequent - * resume. The code below implements a mechanism allowing us to do that. - */ - -struct nvs_page { - unsigned long phys_start; - unsigned int size; - void *kaddr; - void *data; - struct list_head node; -}; - -static LIST_HEAD(nvs_list); - -/** - * suspend_nvs_register - register platform NVS memory region to save - * @start - physical address of the region - * @size - size of the region - * - * The NVS region need not be page-aligned (both ends) and we arrange - * things so that the data from page-aligned addresses in this region will - * be copied into separate RAM pages. - */ -int suspend_nvs_register(unsigned long start, unsigned long size) -{ - struct nvs_page *entry, *next; - - while (size > 0) { - unsigned int nr_bytes; - - entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); - if (!entry) - goto Error; - - list_add_tail(&entry->node, &nvs_list); - entry->phys_start = start; - nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); - entry->size = (size < nr_bytes) ? size : nr_bytes; - - start += entry->size; - size -= entry->size; - } - return 0; - - Error: - list_for_each_entry_safe(entry, next, &nvs_list, node) { - list_del(&entry->node); - kfree(entry); - } - return -ENOMEM; -} - -/** - * suspend_nvs_free - free data pages allocated for saving NVS regions - */ -void suspend_nvs_free(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - free_page((unsigned long)entry->data); - entry->data = NULL; - if (entry->kaddr) { - iounmap(entry->kaddr); - entry->kaddr = NULL; - } - } -} - -/** - * suspend_nvs_alloc - allocate memory necessary for saving NVS regions - */ -int suspend_nvs_alloc(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) { - entry->data = (void *)__get_free_page(GFP_KERNEL); - if (!entry->data) { - suspend_nvs_free(); - return -ENOMEM; - } - } - return 0; -} - -/** - * suspend_nvs_save - save NVS memory regions - */ -void suspend_nvs_save(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Saving platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - entry->kaddr = ioremap(entry->phys_start, entry->size); - memcpy(entry->data, entry->kaddr, entry->size); - } -} - -/** - * suspend_nvs_restore - restore NVS memory regions - * - * This function is going to be called with interrupts disabled, so it - * cannot iounmap the virtual addresses used to access the NVS region. - */ -void suspend_nvs_restore(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Restoring platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) - memcpy(entry->kaddr, entry->data, entry->size); -} diff --git a/lib/ioremap.c b/lib/ioremap.c index 5730ecd..da4e2ad 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -90,3 +91,4 @@ int ioremap_page_range(unsigned long addr, return err; } +EXPORT_SYMBOL_GPL(ioremap_page_range); diff --git a/mm/Kconfig b/mm/Kconfig index c2c8a4a..3ad483b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -302,6 +302,44 @@ config NOMMU_INITIAL_TRIM_EXCESS See Documentation/nommu-mmap.txt for more information. +config TRANSPARENT_HUGEPAGE + bool "Transparent Hugepage Support" + depends on X86 && MMU + select COMPACTION + help + Transparent Hugepages allows the kernel to use huge pages and + huge tlb transparently to the applications whenever possible. + This feature can improve computing performance to certain + applications by speeding up page faults during memory + allocation, by reducing the number of tlb misses and by speeding + up the pagetable walking. + + If memory constrained on embedded, you may want to say N. + +choice + prompt "Transparent Hugepage Support sysfs defaults" + depends on TRANSPARENT_HUGEPAGE + default TRANSPARENT_HUGEPAGE_ALWAYS + help + Selects the sysfs defaults for Transparent Hugepage Support. + + config TRANSPARENT_HUGEPAGE_ALWAYS + bool "always" + help + Enabling Transparent Hugepage always, can increase the + memory footprint of applications without a guaranteed + benefit but it will work automatically for all applications. + + config TRANSPARENT_HUGEPAGE_MADVISE + bool "madvise" + help + Enabling Transparent Hugepage madvise, will only provide a + performance improvement benefit to the applications using + madvise(MADV_HUGEPAGE) but it won't risk to increase the + memory footprint of applications without a guaranteed + benefit. +endchoice + # # UP and nommu archs use km based percpu allocator # diff --git a/mm/Makefile b/mm/Makefile index f73f75a..2b1b575 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,7 +5,7 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o pagewalk.o + vmalloc.o pagewalk.o pgtable-generic.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ @@ -37,6 +37,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o diff --git a/mm/compaction.c b/mm/compaction.c index 1a8894e..6d592a0 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -16,6 +16,9 @@ #include #include "internal.h" +#define CREATE_TRACE_POINTS +#include + /* * compact_control is used to track pages being migrated and the free pages * they are being migrated to during memory compaction. The free_pfn starts @@ -30,6 +33,7 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ + bool sync; /* Synchronous migration */ /* Account for isolated anon and file pages */ unsigned long nr_anon; @@ -38,6 +42,8 @@ struct compact_control { unsigned int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; + + int compact_mode; }; static unsigned long release_freepages(struct list_head *freelist) @@ -60,7 +66,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, struct list_head *freelist) { unsigned long zone_end_pfn, end_pfn; - int total_isolated = 0; + int nr_scanned = 0, total_isolated = 0; struct page *cursor; /* Get the last PFN we should scan for free pages at */ @@ -81,6 +87,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, if (!pfn_valid_within(blockpfn)) continue; + nr_scanned++; if (!PageBuddy(page)) continue; @@ -100,6 +107,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, } } + trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); return total_isolated; } @@ -234,6 +242,8 @@ static unsigned long isolate_migratepages(struct zone *zone, struct compact_control *cc) { unsigned long low_pfn, end_pfn; + unsigned long last_pageblock_nr = 0, pageblock_nr; + unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; /* Do not scan outside zone boundaries */ @@ -266,20 +276,51 @@ static unsigned long isolate_migratepages(struct zone *zone, struct page *page; if (!pfn_valid_within(low_pfn)) continue; + nr_scanned++; /* Get the page and skip if free */ page = pfn_to_page(low_pfn); if (PageBuddy(page)) continue; + /* + * For async migration, also only scan in MOVABLE blocks. Async + * migration is optimistic to see if the minimum amount of work + * satisfies the allocation + */ + pageblock_nr = low_pfn >> pageblock_order; + if (!cc->sync && last_pageblock_nr != pageblock_nr && + get_pageblock_migratetype(page) != MIGRATE_MOVABLE) { + low_pfn += pageblock_nr_pages; + low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; + last_pageblock_nr = pageblock_nr; + continue; + } + + if (!PageLRU(page)) + continue; + + /* + * PageLRU is set, and lru_lock excludes isolation, + * splitting and collapsing (collapsing has already + * happened if PageLRU is set). + */ + if (PageTransHuge(page)) { + low_pfn += (1 << compound_order(page)) - 1; + continue; + } + /* Try isolate the page */ if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) continue; + VM_BUG_ON(PageTransCompound(page)); + /* Successfully isolated */ del_page_from_lru_list(zone, page, page_lru(page)); list_add(&page->lru, migratelist); cc->nr_migratepages++; + nr_isolated++; /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) @@ -291,6 +332,8 @@ static unsigned long isolate_migratepages(struct zone *zone, spin_unlock_irq(&zone->lru_lock); cc->migrate_pfn = low_pfn; + trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); + return cc->nr_migratepages; } @@ -341,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc) } static int compact_finished(struct zone *zone, - struct compact_control *cc) + struct compact_control *cc) { unsigned int order; - unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); + unsigned long watermark; if (fatal_signal_pending(current)) return COMPACT_PARTIAL; @@ -354,12 +397,27 @@ static int compact_finished(struct zone *zone, return COMPACT_COMPLETE; /* Compaction run is not finished if the watermark is not met */ + if (cc->compact_mode != COMPACT_MODE_KSWAPD) + watermark = low_wmark_pages(zone); + else + watermark = high_wmark_pages(zone); + watermark += (1 << cc->order); + if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) return COMPACT_CONTINUE; if (cc->order == -1) return COMPACT_CONTINUE; + /* + * Generating only one page of the right order is not enough + * for kswapd, we must continue until we're above the high + * watermark as a pool for high order GFP_ATOMIC allocations + * too. + */ + if (cc->compact_mode == COMPACT_MODE_KSWAPD) + return COMPACT_CONTINUE; + /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { /* Job done if page is free of the right migratetype */ @@ -374,10 +432,62 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; } +/* + * compaction_suitable: Is this suitable to run compaction on this zone now? + * Returns + * COMPACT_SKIPPED - If there are too few free pages for compaction + * COMPACT_PARTIAL - If the allocation would succeed without compaction + * COMPACT_CONTINUE - If compaction should run now + */ +unsigned long compaction_suitable(struct zone *zone, int order) +{ + int fragindex; + unsigned long watermark; + + /* + * Watermarks for order-0 must be met for compaction. Note the 2UL. + * This is because during migration, copies of pages need to be + * allocated and for a short time, the footprint is higher + */ + watermark = low_wmark_pages(zone) + (2UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + return COMPACT_SKIPPED; + + /* + * fragmentation index determines if allocation failures are due to + * low memory or external fragmentation + * + * index of -1 implies allocations might succeed dependingon watermarks + * index towards 0 implies failure is due to lack of memory + * index towards 1000 implies failure is due to fragmentation + * + * Only compact if a failure would be due to fragmentation. + */ + fragindex = fragmentation_index(zone, order); + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) + return COMPACT_SKIPPED; + + if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) + return COMPACT_PARTIAL; + + return COMPACT_CONTINUE; +} + static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; + ret = compaction_suitable(zone, cc->order); + switch (ret) { + case COMPACT_PARTIAL: + case COMPACT_SKIPPED: + /* Compaction is likely to fail */ + return ret; + case COMPACT_CONTINUE: + /* Fall through to compaction */ + ; + } + /* Setup to move all movable pages to the end of the zone */ cc->migrate_pfn = zone->zone_start_pfn; cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; @@ -393,7 +503,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, 0); + (unsigned long)cc, false, + cc->sync); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -401,6 +512,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); if (nr_remaining) count_vm_events(COMPACTPAGEFAILED, nr_remaining); + trace_mm_compaction_migratepages(nr_migrate - nr_remaining, + nr_remaining); /* Release LRU pages not migrated */ if (!list_empty(&cc->migratepages)) { @@ -417,8 +530,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) return ret; } -static unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask) +unsigned long compact_zone_order(struct zone *zone, + int order, gfp_t gfp_mask, + bool sync, + int compact_mode) { struct compact_control cc = { .nr_freepages = 0, @@ -426,6 +541,8 @@ static unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, + .sync = sync, + .compact_mode = compact_mode, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -441,16 +558,17 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from + * @sync: Whether migration is synchronous or not * * This is the main entry point for direct page compaction. */ unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask) + int order, gfp_t gfp_mask, nodemask_t *nodemask, + bool sync) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; - unsigned long watermark; struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; @@ -460,7 +578,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, * made because an assumption is made that the page allocator can satisfy * the "cheaper" orders without taking special steps */ - if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) + if (!order || !may_enter_fs || !may_perform_io) return rc; count_vm_event(COMPACTSTALL); @@ -468,43 +586,14 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { - int fragindex; int status; - /* - * Watermarks for order-0 must be met for compaction. Note - * the 2UL. This is because during migration, copies of - * pages need to be allocated and for a short time, the - * footprint is higher - */ - watermark = low_wmark_pages(zone) + (2UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) - continue; - - /* - * fragmentation index determines if allocation failures are - * due to low memory or external fragmentation - * - * index of -1 implies allocations might succeed depending - * on watermarks - * index towards 0 implies failure is due to lack of memory - * index towards 1000 implies failure is due to fragmentation - * - * Only compact if a failure would be due to fragmentation. - */ - fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - continue; - - if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { - rc = COMPACT_PARTIAL; - break; - } - - status = compact_zone_order(zone, order, gfp_mask); + status = compact_zone_order(zone, order, gfp_mask, sync, + COMPACT_MODE_DIRECT_RECLAIM); rc = max(status, rc); - if (zone_watermark_ok(zone, order, watermark, 0, 0)) + /* If a normal allocation would succeed, stop compacting */ + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) break; } @@ -531,6 +620,7 @@ static int compact_node(int nid) .nr_freepages = 0, .nr_migratepages = 0, .order = -1, + .compact_mode = COMPACT_MODE_DIRECT_RECLAIM, }; zone = &pgdat->node_zones[zoneid]; diff --git a/mm/dmapool.c b/mm/dmapool.c index 4df2de7..03bf3bb 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -324,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, if (mem_flags & __GFP_WAIT) { DECLARE_WAITQUEUE(wait, current); - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); __add_wait_queue(&pool->waitq, &wait); spin_unlock_irqrestore(&pool->lock, flags); @@ -355,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc); static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) { - unsigned long flags; struct dma_page *page; - spin_lock_irqsave(&pool->lock, flags); list_for_each_entry(page, &pool->page_list, page_list) { if (dma < page->dma) continue; if (dma < (page->dma + pool->allocation)) - goto done; + return page; } - page = NULL; - done: - spin_unlock_irqrestore(&pool->lock, flags); - return page; + return NULL; } /** @@ -386,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) unsigned long flags; unsigned int offset; + spin_lock_irqsave(&pool->lock, flags); page = pool_find_page(pool, dma); if (!page) { + spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, "dma_pool_free %s, %p/%lx (bad dma)\n", @@ -401,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) offset = vaddr - page->vaddr; #ifdef DMAPOOL_DEBUG if ((dma - page->dma) != offset) { + spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, "dma_pool_free %s, %p (bad vaddr)/%Lx\n", @@ -418,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) chain = *(int *)(page->vaddr + chain); continue; } + spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, "dma_pool_free %s, dma %Lx " "already free\n", pool->name, @@ -432,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) memset(vaddr, POOL_POISON_FREED, pool->size); #endif - spin_lock_irqsave(&pool->lock, flags); page->in_use--; *(int *)vaddr = page->offset; page->offset = offset; diff --git a/mm/filemap.c b/mm/filemap.c index ca38939..83a45d3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -298,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, continue; wait_on_page_writeback(page); - if (PageError(page)) + if (TestClearPageError(page)) ret = -EIO; } pagevec_release(&pvec); @@ -837,9 +837,6 @@ repeat: if (radix_tree_deref_retry(page)) goto restart; - if (page->mapping == NULL || page->index != index) - break; - if (!page_cache_get_speculative(page)) goto repeat; @@ -849,6 +846,16 @@ repeat: goto repeat; } + /* + * must check mapping and index after taking the ref. + * otherwise we can get both false positives and false + * negatives, which is just confusing to the caller. + */ + if (page->mapping == NULL || page->index != index) { + page_cache_release(page); + break; + } + pages[ret] = page; ret++; index++; @@ -2220,7 +2227,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, gfp_notmask = __GFP_FS; repeat: page = find_lock_page(mapping, index); - if (likely(page)) + if (page) return page; page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); diff --git a/mm/huge_memory.c b/mm/huge_memory.c new file mode 100644 index 0000000..004c9c2 --- /dev/null +++ b/mm/huge_memory.c @@ -0,0 +1,2346 @@ +/* + * Copyright (C) 2009 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * By default transparent hugepage support is enabled for all mappings + * and khugepaged scans all mappings. Defrag is only invoked by + * khugepaged hugepage allocations and by page faults inside + * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived + * allocations. + */ +unsigned long transparent_hugepage_flags __read_mostly = +#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS + (1< min_free_kbytes) + min_free_kbytes = recommended_min; + setup_per_zone_wmarks(); + return 0; +} +late_initcall(set_recommended_min_free_kbytes); + +static int start_khugepaged(void) +{ + int err = 0; + if (khugepaged_enabled()) { + int wakeup; + if (unlikely(!mm_slot_cache || !mm_slots_hash)) { + err = -ENOMEM; + goto out; + } + mutex_lock(&khugepaged_mutex); + if (!khugepaged_thread) + khugepaged_thread = kthread_run(khugepaged, NULL, + "khugepaged"); + if (unlikely(IS_ERR(khugepaged_thread))) { + printk(KERN_ERR + "khugepaged: kthread_run(khugepaged) failed\n"); + err = PTR_ERR(khugepaged_thread); + khugepaged_thread = NULL; + } + wakeup = !list_empty(&khugepaged_scan.mm_head); + mutex_unlock(&khugepaged_mutex); + if (wakeup) + wake_up_interruptible(&khugepaged_wait); + + set_recommended_min_free_kbytes(); + } else + /* wakeup to exit */ + wake_up_interruptible(&khugepaged_wait); +out: + return err; +} + +#ifdef CONFIG_SYSFS + +static ssize_t double_flag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf, + enum transparent_hugepage_flag enabled, + enum transparent_hugepage_flag req_madv) +{ + if (test_bit(enabled, &transparent_hugepage_flags)) { + VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags)); + return sprintf(buf, "[always] madvise never\n"); + } else if (test_bit(req_madv, &transparent_hugepage_flags)) + return sprintf(buf, "always [madvise] never\n"); + else + return sprintf(buf, "always madvise [never]\n"); +} +static ssize_t double_flag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count, + enum transparent_hugepage_flag enabled, + enum transparent_hugepage_flag req_madv) +{ + if (!memcmp("always", buf, + min(sizeof("always")-1, count))) { + set_bit(enabled, &transparent_hugepage_flags); + clear_bit(req_madv, &transparent_hugepage_flags); + } else if (!memcmp("madvise", buf, + min(sizeof("madvise")-1, count))) { + clear_bit(enabled, &transparent_hugepage_flags); + set_bit(req_madv, &transparent_hugepage_flags); + } else if (!memcmp("never", buf, + min(sizeof("never")-1, count))) { + clear_bit(enabled, &transparent_hugepage_flags); + clear_bit(req_madv, &transparent_hugepage_flags); + } else + return -EINVAL; + + return count; +} + +static ssize_t enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return double_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); +} +static ssize_t enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret; + + ret = double_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); + + if (ret > 0) { + int err = start_khugepaged(); + if (err) + ret = err; + } + + if (ret > 0 && + (test_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags) || + test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags))) + set_recommended_min_free_kbytes(); + + return ret; +} +static struct kobj_attribute enabled_attr = + __ATTR(enabled, 0644, enabled_show, enabled_store); + +static ssize_t single_flag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf, + enum transparent_hugepage_flag flag) +{ + if (test_bit(flag, &transparent_hugepage_flags)) + return sprintf(buf, "[yes] no\n"); + else + return sprintf(buf, "yes [no]\n"); +} +static ssize_t single_flag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count, + enum transparent_hugepage_flag flag) +{ + if (!memcmp("yes", buf, + min(sizeof("yes")-1, count))) { + set_bit(flag, &transparent_hugepage_flags); + } else if (!memcmp("no", buf, + min(sizeof("no")-1, count))) { + clear_bit(flag, &transparent_hugepage_flags); + } else + return -EINVAL; + + return count; +} + +/* + * Currently defrag only disables __GFP_NOWAIT for allocation. A blind + * __GFP_REPEAT is too aggressive, it's never worth swapping tons of + * memory just to allocate one more hugepage. + */ +static ssize_t defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return double_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); +} +static ssize_t defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return double_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); +} +static struct kobj_attribute defrag_attr = + __ATTR(defrag, 0644, defrag_show, defrag_store); + +#ifdef CONFIG_DEBUG_VM +static ssize_t debug_cow_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); +} +static ssize_t debug_cow_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); +} +static struct kobj_attribute debug_cow_attr = + __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store); +#endif /* CONFIG_DEBUG_VM */ + +static struct attribute *hugepage_attr[] = { + &enabled_attr.attr, + &defrag_attr.attr, +#ifdef CONFIG_DEBUG_VM + &debug_cow_attr.attr, +#endif + NULL, +}; + +static struct attribute_group hugepage_attr_group = { + .attrs = hugepage_attr, +}; + +static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); +} + +static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_scan_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute scan_sleep_millisecs_attr = + __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, + scan_sleep_millisecs_store); + +static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); +} + +static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_alloc_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute alloc_sleep_millisecs_attr = + __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, + alloc_sleep_millisecs_store); + +static ssize_t pages_to_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_to_scan); +} +static ssize_t pages_to_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long pages; + + err = strict_strtoul(buf, 10, &pages); + if (err || !pages || pages > UINT_MAX) + return -EINVAL; + + khugepaged_pages_to_scan = pages; + + return count; +} +static struct kobj_attribute pages_to_scan_attr = + __ATTR(pages_to_scan, 0644, pages_to_scan_show, + pages_to_scan_store); + +static ssize_t pages_collapsed_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_collapsed); +} +static struct kobj_attribute pages_collapsed_attr = + __ATTR_RO(pages_collapsed); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_full_scans); +} +static struct kobj_attribute full_scans_attr = + __ATTR_RO(full_scans); + +static ssize_t khugepaged_defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static ssize_t khugepaged_defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static struct kobj_attribute khugepaged_defrag_attr = + __ATTR(defrag, 0644, khugepaged_defrag_show, + khugepaged_defrag_store); + +/* + * max_ptes_none controls if khugepaged should collapse hugepages over + * any unmapped ptes in turn potentially increasing the memory + * footprint of the vmas. When max_ptes_none is 0 khugepaged will not + * reduce the available free memory in the system as it + * runs. Increasing max_ptes_none will instead potentially reduce the + * free memory in the system during the khugepaged scan. + */ +static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_none); +} +static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_none; + + err = strict_strtoul(buf, 10, &max_ptes_none); + if (err || max_ptes_none > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_none = max_ptes_none; + + return count; +} +static struct kobj_attribute khugepaged_max_ptes_none_attr = + __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, + khugepaged_max_ptes_none_store); + +static struct attribute *khugepaged_attr[] = { + &khugepaged_defrag_attr.attr, + &khugepaged_max_ptes_none_attr.attr, + &pages_to_scan_attr.attr, + &pages_collapsed_attr.attr, + &full_scans_attr.attr, + &scan_sleep_millisecs_attr.attr, + &alloc_sleep_millisecs_attr.attr, + NULL, +}; + +static struct attribute_group khugepaged_attr_group = { + .attrs = khugepaged_attr, + .name = "khugepaged", +}; +#endif /* CONFIG_SYSFS */ + +static int __init hugepage_init(void) +{ + int err; +#ifdef CONFIG_SYSFS + static struct kobject *hugepage_kobj; +#endif + + err = -EINVAL; + if (!has_transparent_hugepage()) { + transparent_hugepage_flags = 0; + goto out; + } + +#ifdef CONFIG_SYSFS + err = -ENOMEM; + hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); + if (unlikely(!hugepage_kobj)) { + printk(KERN_ERR "hugepage: failed kobject create\n"); + goto out; + } + + err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto out; + } + + err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto out; + } +#endif + + err = khugepaged_slab_init(); + if (err) + goto out; + + err = mm_slots_hash_init(); + if (err) { + khugepaged_slab_free(); + goto out; + } + + /* + * By default disable transparent hugepages on smaller systems, + * where the extra memory used could hurt more than TLB overhead + * is likely to save. The admin can still enable it through /sys. + */ + if (totalram_pages < (512 << (20 - PAGE_SHIFT))) + transparent_hugepage_flags = 0; + + start_khugepaged(); + + set_recommended_min_free_kbytes(); + +out: + return err; +} +module_init(hugepage_init) + +static int __init setup_transparent_hugepage(char *str) +{ + int ret = 0; + if (!str) + goto out; + if (!strcmp(str, "always")) { + set_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags); + ret = 1; + } else if (!strcmp(str, "madvise")) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags); + ret = 1; + } else if (!strcmp(str, "never")) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags); + ret = 1; + } +out: + if (!ret) + printk(KERN_WARNING + "transparent_hugepage= cannot parse, ignored\n"); + return ret; +} +__setup("transparent_hugepage=", setup_transparent_hugepage); + +static void prepare_pmd_huge_pte(pgtable_t pgtable, + struct mm_struct *mm) +{ + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + if (!mm->pmd_huge_pte) + INIT_LIST_HEAD(&pgtable->lru); + else + list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); + mm->pmd_huge_pte = pgtable; +} + +static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pmd = pmd_mkwrite(pmd); + return pmd; +} + +static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long haddr, pmd_t *pmd, + struct page *page) +{ + int ret = 0; + pgtable_t pgtable; + + VM_BUG_ON(!PageCompound(page)); + pgtable = pte_alloc_one(mm, haddr); + if (unlikely(!pgtable)) { + mem_cgroup_uncharge_page(page); + put_page(page); + return VM_FAULT_OOM; + } + + clear_huge_page(page, haddr, HPAGE_PMD_NR); + __SetPageUptodate(page); + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_none(*pmd))) { + spin_unlock(&mm->page_table_lock); + mem_cgroup_uncharge_page(page); + put_page(page); + pte_free(mm, pgtable); + } else { + pmd_t entry; + entry = mk_pmd(page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + /* + * The spinlocking to take the lru_lock inside + * page_add_new_anon_rmap() acts as a full memory + * barrier to be sure clear_huge_page writes become + * visible after the set_pmd_at() write. + */ + page_add_new_anon_rmap(page, vma, haddr); + set_pmd_at(mm, haddr, pmd, entry); + prepare_pmd_huge_pte(pgtable, mm); + add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); + spin_unlock(&mm->page_table_lock); + } + + return ret; +} + +static inline gfp_t alloc_hugepage_gfpmask(int defrag) +{ + return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT); +} + +static inline struct page *alloc_hugepage_vma(int defrag, + struct vm_area_struct *vma, + unsigned long haddr) +{ + return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), + HPAGE_PMD_ORDER, vma, haddr); +} + +#ifndef CONFIG_NUMA +static inline struct page *alloc_hugepage(int defrag) +{ + return alloc_pages(alloc_hugepage_gfpmask(defrag), + HPAGE_PMD_ORDER); +} +#endif + +int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + unsigned int flags) +{ + struct page *page; + unsigned long haddr = address & HPAGE_PMD_MASK; + pte_t *pte; + + if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + if (unlikely(khugepaged_enter(vma))) + return VM_FAULT_OOM; + page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr); + if (unlikely(!page)) + goto out; + if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { + put_page(page); + goto out; + } + + return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); + } +out: + /* + * Use __pte_alloc instead of pte_alloc_map, because we can't + * run pte_offset_map on the pmd, if an huge pmd could + * materialize from under us from a different thread. + */ + if (unlikely(__pte_alloc(mm, vma, pmd, address))) + return VM_FAULT_OOM; + /* if an huge pmd materialized from under us just retry later */ + if (unlikely(pmd_trans_huge(*pmd))) + return 0; + /* + * A regular pmd is established and it can't morph into a huge pmd + * from under us anymore at this point because we hold the mmap_sem + * read mode and khugepaged takes it in write mode. So now it's + * safe to run pte_offset_map(). + */ + pte = pte_offset_map(pmd, address); + return handle_pte_fault(mm, vma, address, pte, pmd, flags); +} + +int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *vma) +{ + struct page *src_page; + pmd_t pmd; + pgtable_t pgtable; + int ret; + + ret = -ENOMEM; + pgtable = pte_alloc_one(dst_mm, addr); + if (unlikely(!pgtable)) + goto out; + + spin_lock(&dst_mm->page_table_lock); + spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); + + ret = -EAGAIN; + pmd = *src_pmd; + if (unlikely(!pmd_trans_huge(pmd))) { + pte_free(dst_mm, pgtable); + goto out_unlock; + } + if (unlikely(pmd_trans_splitting(pmd))) { + /* split huge page running from under us */ + spin_unlock(&src_mm->page_table_lock); + spin_unlock(&dst_mm->page_table_lock); + pte_free(dst_mm, pgtable); + + wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ + goto out; + } + src_page = pmd_page(pmd); + VM_BUG_ON(!PageHead(src_page)); + get_page(src_page); + page_dup_rmap(src_page); + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + + pmdp_set_wrprotect(src_mm, addr, src_pmd); + pmd = pmd_mkold(pmd_wrprotect(pmd)); + set_pmd_at(dst_mm, addr, dst_pmd, pmd); + prepare_pmd_huge_pte(pgtable, dst_mm); + + ret = 0; +out_unlock: + spin_unlock(&src_mm->page_table_lock); + spin_unlock(&dst_mm->page_table_lock); +out: + return ret; +} + +/* no "address" argument so destroys page coloring of some arch */ +pgtable_t get_pmd_huge_pte(struct mm_struct *mm) +{ + pgtable_t pgtable; + + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + pgtable = mm->pmd_huge_pte; + if (list_empty(&pgtable->lru)) + mm->pmd_huge_pte = NULL; + else { + mm->pmd_huge_pte = list_entry(pgtable->lru.next, + struct page, lru); + list_del(&pgtable->lru); + } + return pgtable; +} + +static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + struct page *page, + unsigned long haddr) +{ + pgtable_t pgtable; + pmd_t _pmd; + int ret = 0, i; + struct page **pages; + + pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, + GFP_KERNEL); + if (unlikely(!pages)) { + ret |= VM_FAULT_OOM; + goto out; + } + + for (i = 0; i < HPAGE_PMD_NR; i++) { + pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, + vma, address); + if (unlikely(!pages[i] || + mem_cgroup_newpage_charge(pages[i], mm, + GFP_KERNEL))) { + if (pages[i]) + put_page(pages[i]); + mem_cgroup_uncharge_start(); + while (--i >= 0) { + mem_cgroup_uncharge_page(pages[i]); + put_page(pages[i]); + } + mem_cgroup_uncharge_end(); + kfree(pages); + ret |= VM_FAULT_OOM; + goto out; + } + } + + for (i = 0; i < HPAGE_PMD_NR; i++) { + copy_user_highpage(pages[i], page + i, + haddr + PAGE_SHIFT*i, vma); + __SetPageUptodate(pages[i]); + cond_resched(); + } + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto out_free_pages; + VM_BUG_ON(!PageHead(page)); + + pmdp_clear_flush_notify(vma, haddr, pmd); + /* leave pmd empty until pte is filled */ + + pgtable = get_pmd_huge_pte(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + entry = mk_pte(pages[i], vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + page_add_new_anon_rmap(pages[i], vma, haddr); + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + kfree(pages); + + mm->nr_ptes++; + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, pmd, pgtable); + page_remove_rmap(page); + spin_unlock(&mm->page_table_lock); + + ret |= VM_FAULT_WRITE; + put_page(page); + +out: + return ret; + +out_free_pages: + spin_unlock(&mm->page_table_lock); + mem_cgroup_uncharge_start(); + for (i = 0; i < HPAGE_PMD_NR; i++) { + mem_cgroup_uncharge_page(pages[i]); + put_page(pages[i]); + } + mem_cgroup_uncharge_end(); + kfree(pages); + goto out; +} + +int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, pmd_t orig_pmd) +{ + int ret = 0; + struct page *page, *new_page; + unsigned long haddr; + + VM_BUG_ON(!vma->anon_vma); + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto out_unlock; + + page = pmd_page(orig_pmd); + VM_BUG_ON(!PageCompound(page) || !PageHead(page)); + haddr = address & HPAGE_PMD_MASK; + if (page_mapcount(page) == 1) { + pmd_t entry; + entry = pmd_mkyoung(orig_pmd); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) + update_mmu_cache(vma, address, entry); + ret |= VM_FAULT_WRITE; + goto out_unlock; + } + get_page(page); + spin_unlock(&mm->page_table_lock); + + if (transparent_hugepage_enabled(vma) && + !transparent_hugepage_debug_cow()) + new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr); + else + new_page = NULL; + + if (unlikely(!new_page)) { + ret = do_huge_pmd_wp_page_fallback(mm, vma, address, + pmd, orig_pmd, page, haddr); + put_page(page); + goto out; + } + + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + put_page(new_page); + put_page(page); + ret |= VM_FAULT_OOM; + goto out; + } + + copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); + __SetPageUptodate(new_page); + + spin_lock(&mm->page_table_lock); + put_page(page); + if (unlikely(!pmd_same(*pmd, orig_pmd))) { + mem_cgroup_uncharge_page(new_page); + put_page(new_page); + } else { + pmd_t entry; + VM_BUG_ON(!PageHead(page)); + entry = mk_pmd(new_page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + pmdp_clear_flush_notify(vma, haddr, pmd); + page_add_new_anon_rmap(new_page, vma, haddr); + set_pmd_at(mm, haddr, pmd, entry); + update_mmu_cache(vma, address, entry); + page_remove_rmap(page); + put_page(page); + ret |= VM_FAULT_WRITE; + } +out_unlock: + spin_unlock(&mm->page_table_lock); +out: + return ret; +} + +struct page *follow_trans_huge_pmd(struct mm_struct *mm, + unsigned long addr, + pmd_t *pmd, + unsigned int flags) +{ + struct page *page = NULL; + + assert_spin_locked(&mm->page_table_lock); + + if (flags & FOLL_WRITE && !pmd_write(*pmd)) + goto out; + + page = pmd_page(*pmd); + VM_BUG_ON(!PageHead(page)); + if (flags & FOLL_TOUCH) { + pmd_t _pmd; + /* + * We should set the dirty bit only for FOLL_WRITE but + * for now the dirty bit in the pmd is meaningless. + * And if the dirty bit will become meaningful and + * we'll only set it with FOLL_WRITE, an atomic + * set_bit will be required on the pmd to set the + * young bit, instead of the current set_pmd_at. + */ + _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); + set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); + } + page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; + VM_BUG_ON(!PageCompound(page)); + if (flags & FOLL_GET) + get_page(page); + +out: + return page; +} + +int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd) +{ + int ret = 0; + + spin_lock(&tlb->mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&tlb->mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, + pmd); + } else { + struct page *page; + pgtable_t pgtable; + pgtable = get_pmd_huge_pte(tlb->mm); + page = pmd_page(*pmd); + pmd_clear(pmd); + page_remove_rmap(page); + VM_BUG_ON(page_mapcount(page) < 0); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + VM_BUG_ON(!PageHead(page)); + spin_unlock(&tlb->mm->page_table_lock); + tlb_remove_page(tlb, page); + pte_free(tlb->mm, pgtable); + ret = 1; + } + } else + spin_unlock(&tlb->mm->page_table_lock); + + return ret; +} + +int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + int ret = 0; + + spin_lock(&vma->vm_mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + ret = !pmd_trans_splitting(*pmd); + spin_unlock(&vma->vm_mm->page_table_lock); + if (unlikely(!ret)) + wait_split_huge_page(vma->anon_vma, pmd); + else { + /* + * All logical pages in the range are present + * if backed by a huge page. + */ + memset(vec, 1, (end - addr) >> PAGE_SHIFT); + } + } else + spin_unlock(&vma->vm_mm->page_table_lock); + + return ret; +} + +int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, pgprot_t newprot) +{ + struct mm_struct *mm = vma->vm_mm; + int ret = 0; + + spin_lock(&mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + pmd_t entry; + + entry = pmdp_get_and_clear(mm, addr, pmd); + entry = pmd_modify(entry, newprot); + set_pmd_at(mm, addr, pmd, entry); + spin_unlock(&vma->vm_mm->page_table_lock); + flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); + ret = 1; + } + } else + spin_unlock(&vma->vm_mm->page_table_lock); + + return ret; +} + +pmd_t *page_check_address_pmd(struct page *page, + struct mm_struct *mm, + unsigned long address, + enum page_check_address_pmd_flag flag) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, *ret = NULL; + + if (address & ~HPAGE_PMD_MASK) + goto out; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + goto out; + if (pmd_page(*pmd) != page) + goto out; + /* + * split_vma() may create temporary aliased mappings. There is + * no risk as long as all huge pmd are found and have their + * splitting bit set before __split_huge_page_refcount + * runs. Finding the same huge pmd more than once during the + * same rmap walk is not a problem. + */ + if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && + pmd_trans_splitting(*pmd)) + goto out; + if (pmd_trans_huge(*pmd)) { + VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && + !pmd_trans_splitting(*pmd)); + ret = pmd; + } +out: + return ret; +} + +static int __split_huge_page_splitting(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t *pmd; + int ret = 0; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); + if (pmd) { + /* + * We can't temporarily set the pmd to null in order + * to split it, the pmd must remain marked huge at all + * times or the VM won't take the pmd_trans_huge paths + * and it won't wait on the anon_vma->root->lock to + * serialize against split_huge_page*. + */ + pmdp_splitting_flush_notify(vma, address, pmd); + ret = 1; + } + spin_unlock(&mm->page_table_lock); + + return ret; +} + +static void __split_huge_page_refcount(struct page *page) +{ + int i; + unsigned long head_index = page->index; + struct zone *zone = page_zone(page); + int zonestat; + + /* prevent PageLRU to go away from under us, and freeze lru stats */ + spin_lock_irq(&zone->lru_lock); + compound_lock(page); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + struct page *page_tail = page + i; + + /* tail_page->_count cannot change */ + atomic_sub(atomic_read(&page_tail->_count), &page->_count); + BUG_ON(page_count(page) <= 0); + atomic_add(page_mapcount(page) + 1, &page_tail->_count); + BUG_ON(atomic_read(&page_tail->_count) <= 0); + + /* after clearing PageTail the gup refcount can be released */ + smp_mb(); + + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + page_tail->flags |= (page->flags & + ((1L << PG_referenced) | + (1L << PG_swapbacked) | + (1L << PG_mlocked) | + (1L << PG_uptodate))); + page_tail->flags |= (1L << PG_dirty); + + /* + * 1) clear PageTail before overwriting first_page + * 2) clear PageTail before clearing PageHead for VM_BUG_ON + */ + smp_wmb(); + + /* + * __split_huge_page_splitting() already set the + * splitting bit in all pmd that could map this + * hugepage, that will ensure no CPU can alter the + * mapcount on the head page. The mapcount is only + * accounted in the head page and it has to be + * transferred to all tail pages in the below code. So + * for this code to be safe, the split the mapcount + * can't change. But that doesn't mean userland can't + * keep changing and reading the page contents while + * we transfer the mapcount, so the pmd splitting + * status is achieved setting a reserved bit in the + * pmd, not by clearing the present bit. + */ + BUG_ON(page_mapcount(page_tail)); + page_tail->_mapcount = page->_mapcount; + + BUG_ON(page_tail->mapping); + page_tail->mapping = page->mapping; + + page_tail->index = ++head_index; + + BUG_ON(!PageAnon(page_tail)); + BUG_ON(!PageUptodate(page_tail)); + BUG_ON(!PageDirty(page_tail)); + BUG_ON(!PageSwapBacked(page_tail)); + + lru_add_page_tail(zone, page, page_tail); + } + + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); + + /* + * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics, + * so adjust those appropriately if this page is on the LRU. + */ + if (PageLRU(page)) { + zonestat = NR_LRU_BASE + page_lru(page); + __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1)); + } + + ClearPageCompound(page); + compound_unlock(page); + spin_unlock_irq(&zone->lru_lock); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + struct page *page_tail = page + i; + BUG_ON(page_count(page_tail) <= 0); + /* + * Tail pages may be freed if there wasn't any mapping + * like if add_to_swap() is running on a lru page that + * had its mapping zapped. And freeing these pages + * requires taking the lru_lock so we do the put_page + * of the tail pages after the split is complete. + */ + put_page(page_tail); + } + + /* + * Only the head page (now become a regular page) is required + * to be pinned by the caller. + */ + BUG_ON(page_count(page) <= 0); +} + +static int __split_huge_page_map(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t *pmd, _pmd; + int ret = 0, i; + pgtable_t pgtable; + unsigned long haddr; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); + if (pmd) { + pgtable = get_pmd_huge_pte(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0, haddr = address; i < HPAGE_PMD_NR; + i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + BUG_ON(PageCompound(page+i)); + entry = mk_pte(page + i, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (!pmd_write(*pmd)) + entry = pte_wrprotect(entry); + else + BUG_ON(page_mapcount(page) != 1); + if (!pmd_young(*pmd)) + entry = pte_mkold(entry); + pte = pte_offset_map(&_pmd, haddr); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + + mm->nr_ptes++; + smp_wmb(); /* make pte visible before pmd */ + /* + * Up to this point the pmd is present and huge and + * userland has the whole access to the hugepage + * during the split (which happens in place). If we + * overwrite the pmd with the not-huge version + * pointing to the pte here (which of course we could + * if all CPUs were bug free), userland could trigger + * a small page size TLB miss on the small sized TLB + * while the hugepage TLB entry is still established + * in the huge TLB. Some CPU doesn't like that. See + * http://support.amd.com/us/Processor_TechDocs/41322.pdf, + * Erratum 383 on page 93. Intel should be safe but is + * also warns that it's only safe if the permission + * and cache attributes of the two entries loaded in + * the two TLB is identical (which should be the case + * here). But it is generally safer to never allow + * small and huge TLB entries for the same virtual + * address to be loaded simultaneously. So instead of + * doing "pmd_populate(); flush_tlb_range();" we first + * mark the current pmd notpresent (atomically because + * here the pmd_trans_huge and pmd_trans_splitting + * must remain set at all times on the pmd until the + * split is complete for this pmd), then we flush the + * SMP TLB and finally we write the non-huge version + * of the pmd entry with pmd_populate. + */ + set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + pmd_populate(mm, pmd, pgtable); + ret = 1; + } + spin_unlock(&mm->page_table_lock); + + return ret; +} + +/* must be called with anon_vma->root->lock hold */ +static void __split_huge_page(struct page *page, + struct anon_vma *anon_vma) +{ + int mapcount, mapcount2; + struct anon_vma_chain *avc; + + BUG_ON(!PageHead(page)); + BUG_ON(PageTail(page)); + + mapcount = 0; + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long addr = vma_address(page, vma); + BUG_ON(is_vma_temporary_stack(vma)); + if (addr == -EFAULT) + continue; + mapcount += __split_huge_page_splitting(page, vma, addr); + } + /* + * It is critical that new vmas are added to the tail of the + * anon_vma list. This guarantes that if copy_huge_pmd() runs + * and establishes a child pmd before + * __split_huge_page_splitting() freezes the parent pmd (so if + * we fail to prevent copy_huge_pmd() from running until the + * whole __split_huge_page() is complete), we will still see + * the newly established pmd of the child later during the + * walk, to be able to set it as pmd_trans_splitting too. + */ + if (mapcount != page_mapcount(page)) + printk(KERN_ERR "mapcount %d page_mapcount %d\n", + mapcount, page_mapcount(page)); + BUG_ON(mapcount != page_mapcount(page)); + + __split_huge_page_refcount(page); + + mapcount2 = 0; + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long addr = vma_address(page, vma); + BUG_ON(is_vma_temporary_stack(vma)); + if (addr == -EFAULT) + continue; + mapcount2 += __split_huge_page_map(page, vma, addr); + } + if (mapcount != mapcount2) + printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", + mapcount, mapcount2, page_mapcount(page)); + BUG_ON(mapcount != mapcount2); +} + +int split_huge_page(struct page *page) +{ + struct anon_vma *anon_vma; + int ret = 1; + + BUG_ON(!PageAnon(page)); + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + goto out; + ret = 0; + if (!PageCompound(page)) + goto out_unlock; + + BUG_ON(!PageSwapBacked(page)); + __split_huge_page(page, anon_vma); + + BUG_ON(PageCompound(page)); +out_unlock: + page_unlock_anon_vma(anon_vma); +out: + return ret; +} + +int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) +{ + switch (advice) { + case MADV_HUGEPAGE: + /* + * Be somewhat over-protective like KSM for now! + */ + if (*vm_flags & (VM_HUGEPAGE | + VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return -EINVAL; + *vm_flags &= ~VM_NOHUGEPAGE; + *vm_flags |= VM_HUGEPAGE; + /* + * If the vma become good for khugepaged to scan, + * register it here without waiting a page fault that + * may not happen any time soon. + */ + if (unlikely(khugepaged_enter_vma_merge(vma))) + return -ENOMEM; + break; + case MADV_NOHUGEPAGE: + /* + * Be somewhat over-protective like KSM for now! + */ + if (*vm_flags & (VM_NOHUGEPAGE | + VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return -EINVAL; + *vm_flags &= ~VM_HUGEPAGE; + *vm_flags |= VM_NOHUGEPAGE; + /* + * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning + * this vma even if we leave the mm registered in khugepaged if + * it got registered before VM_NOHUGEPAGE was set. + */ + break; + } + + return 0; +} + +static int __init khugepaged_slab_init(void) +{ + mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", + sizeof(struct mm_slot), + __alignof__(struct mm_slot), 0, NULL); + if (!mm_slot_cache) + return -ENOMEM; + + return 0; +} + +static void __init khugepaged_slab_free(void) +{ + kmem_cache_destroy(mm_slot_cache); + mm_slot_cache = NULL; +} + +static inline struct mm_slot *alloc_mm_slot(void) +{ + if (!mm_slot_cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); +} + +static inline void free_mm_slot(struct mm_slot *mm_slot) +{ + kmem_cache_free(mm_slot_cache, mm_slot); +} + +static int __init mm_slots_hash_init(void) +{ + mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!mm_slots_hash) + return -ENOMEM; + return 0; +} + +#if 0 +static void __init mm_slots_hash_free(void) +{ + kfree(mm_slots_hash); + mm_slots_hash = NULL; +} +#endif + +static struct mm_slot *get_mm_slot(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + struct hlist_head *bucket; + struct hlist_node *node; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + hlist_for_each_entry(mm_slot, node, bucket, hash) { + if (mm == mm_slot->mm) + return mm_slot; + } + return NULL; +} + +static void insert_to_mm_slots_hash(struct mm_struct *mm, + struct mm_slot *mm_slot) +{ + struct hlist_head *bucket; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + mm_slot->mm = mm; + hlist_add_head(&mm_slot->hash, bucket); +} + +static inline int khugepaged_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} + +int __khugepaged_enter(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int wakeup; + + mm_slot = alloc_mm_slot(); + if (!mm_slot) + return -ENOMEM; + + /* __khugepaged_exit() must not run from under us */ + VM_BUG_ON(khugepaged_test_exit(mm)); + if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { + free_mm_slot(mm_slot); + return 0; + } + + spin_lock(&khugepaged_mm_lock); + insert_to_mm_slots_hash(mm, mm_slot); + /* + * Insert just behind the scanning cursor, to let the area settle + * down a little. + */ + wakeup = list_empty(&khugepaged_scan.mm_head); + list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); + spin_unlock(&khugepaged_mm_lock); + + atomic_inc(&mm->mm_count); + if (wakeup) + wake_up_interruptible(&khugepaged_wait); + + return 0; +} + +int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +{ + unsigned long hstart, hend; + if (!vma->anon_vma) + /* + * Not yet faulted in so we will register later in the + * page fault if needed. + */ + return 0; + if (vma->vm_file || vma->vm_ops) + /* khugepaged not yet working on file or special mappings */ + return 0; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart < hend) + return khugepaged_enter(vma); + return 0; +} + +void __khugepaged_exit(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int free = 0; + + spin_lock(&khugepaged_mm_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { + hlist_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + free = 1; + } + + if (free) { + spin_unlock(&khugepaged_mm_lock); + clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + free_mm_slot(mm_slot); + mmdrop(mm); + } else if (mm_slot) { + spin_unlock(&khugepaged_mm_lock); + /* + * This is required to serialize against + * khugepaged_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we + * return all pagetables will be destroyed) until + * khugepaged has finished working on the pagetables + * under the mmap_sem. + */ + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } else + spin_unlock(&khugepaged_mm_lock); +} + +static void release_pte_page(struct page *page) +{ + /* 0 stands for page_is_file_cache(page) == false */ + dec_zone_page_state(page, NR_ISOLATED_ANON + 0); + unlock_page(page); + putback_lru_page(page); +} + +static void release_pte_pages(pte_t *pte, pte_t *_pte) +{ + while (--_pte >= pte) { + pte_t pteval = *_pte; + if (!pte_none(pteval)) + release_pte_page(pte_page(pteval)); + } +} + +static void release_all_pte_pages(pte_t *pte) +{ + release_pte_pages(pte, pte + HPAGE_PMD_NR); +} + +static int __collapse_huge_page_isolate(struct vm_area_struct *vma, + unsigned long address, + pte_t *pte) +{ + struct page *page; + pte_t *_pte; + int referenced = 0, isolated = 0, none = 0; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval)) { + if (++none <= khugepaged_max_ptes_none) + continue; + else { + release_pte_pages(pte, _pte); + goto out; + } + } + if (!pte_present(pteval) || !pte_write(pteval)) { + release_pte_pages(pte, _pte); + goto out; + } + page = vm_normal_page(vma, address, pteval); + if (unlikely(!page)) { + release_pte_pages(pte, _pte); + goto out; + } + VM_BUG_ON(PageCompound(page)); + BUG_ON(!PageAnon(page)); + VM_BUG_ON(!PageSwapBacked(page)); + + /* cannot use mapcount: can't collapse if there's a gup pin */ + if (page_count(page) != 1) { + release_pte_pages(pte, _pte); + goto out; + } + /* + * We can do it before isolate_lru_page because the + * page can't be freed from under us. NOTE: PG_lock + * is needed to serialize against split_huge_page + * when invoked from the VM. + */ + if (!trylock_page(page)) { + release_pte_pages(pte, _pte); + goto out; + } + /* + * Isolate the page to avoid collapsing an hugepage + * currently in use by the VM. + */ + if (isolate_lru_page(page)) { + unlock_page(page); + release_pte_pages(pte, _pte); + goto out; + } + /* 0 stands for page_is_file_cache(page) == false */ + inc_zone_page_state(page, NR_ISOLATED_ANON + 0); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageLRU(page)); + + /* If there is no mapped pte young don't collapse the page */ + if (pte_young(pteval) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) + referenced = 1; + } + if (unlikely(!referenced)) + release_all_pte_pages(pte); + else + isolated = 1; +out: + return isolated; +} + +static void __collapse_huge_page_copy(pte_t *pte, struct page *page, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *ptl) +{ + pte_t *_pte; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { + pte_t pteval = *_pte; + struct page *src_page; + + if (pte_none(pteval)) { + clear_user_highpage(page, address); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + } else { + src_page = pte_page(pteval); + copy_user_highpage(page, src_page, address, vma); + VM_BUG_ON(page_mapcount(src_page) != 1); + VM_BUG_ON(page_count(src_page) != 2); + release_pte_page(src_page); + /* + * ptl mostly unnecessary, but preempt has to + * be disabled to update the per-cpu stats + * inside page_remove_rmap(). + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. + */ + pte_clear(vma->vm_mm, address, _pte); + page_remove_rmap(src_page); + spin_unlock(ptl); + free_page_and_swap_cache(src_page); + } + + address += PAGE_SIZE; + page++; + } +} + +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage, + struct vm_area_struct *vma) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + pgtable_t pgtable; + struct page *new_page; + spinlock_t *ptl; + int isolated; + unsigned long hstart, hend; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); +#ifndef CONFIG_NUMA + VM_BUG_ON(!*hpage); + new_page = *hpage; +#else + VM_BUG_ON(*hpage); + /* + * Allocate the page while the vma is still valid and under + * the mmap_sem read mode so there is no memory allocation + * later when we take the mmap_sem in write mode. This is more + * friendly behavior (OTOH it may actually hide bugs) to + * filesystems in userland with daemons allocating memory in + * the userland I/O paths. Allocating memory with the + * mmap_sem in read mode is good idea also to allow greater + * scalability. + */ + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + if (unlikely(!new_page)) { + up_read(&mm->mmap_sem); + *hpage = ERR_PTR(-ENOMEM); + return; + } +#endif + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + up_read(&mm->mmap_sem); + put_page(new_page); + return; + } + + /* after allocating the hugepage upgrade to mmap_sem write mode */ + up_read(&mm->mmap_sem); + + /* + * Prevent all access to pagetables with the exception of + * gup_fast later hanlded by the ptep_clear_flush and the VM + * handled by the anon_vma lock + PG_lock. + */ + down_write(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + goto out; + + vma = find_vma(mm, address); + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (address < hstart || address + HPAGE_PMD_SIZE > hend) + goto out; + + if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) + goto out; + + /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + goto out; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + /* pmd can't go away or become huge under us */ + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + goto out; + + anon_vma_lock(vma->anon_vma); + + pte = pte_offset_map(pmd, address); + ptl = pte_lockptr(mm, pmd); + + spin_lock(&mm->page_table_lock); /* probably unnecessary */ + /* + * After this gup_fast can't run anymore. This also removes + * any huge TLB entry from the CPU so we won't allow + * huge and small TLB entries for the same virtual address + * to avoid the risk of CPU bugs in that area. + */ + _pmd = pmdp_clear_flush_notify(vma, address, pmd); + spin_unlock(&mm->page_table_lock); + + spin_lock(ptl); + isolated = __collapse_huge_page_isolate(vma, address, pte); + spin_unlock(ptl); + pte_unmap(pte); + + if (unlikely(!isolated)) { + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + set_pmd_at(mm, address, pmd, _pmd); + spin_unlock(&mm->page_table_lock); + anon_vma_unlock(vma->anon_vma); + mem_cgroup_uncharge_page(new_page); + goto out; + } + + /* + * All pages are isolated and locked so anon_vma rmap + * can't run anymore. + */ + anon_vma_unlock(vma->anon_vma); + + __collapse_huge_page_copy(pte, new_page, vma, address, ptl); + __SetPageUptodate(new_page); + pgtable = pmd_pgtable(_pmd); + VM_BUG_ON(page_count(pgtable) != 1); + VM_BUG_ON(page_mapcount(pgtable) != 0); + + _pmd = mk_pmd(new_page, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); + _pmd = pmd_mkhuge(_pmd); + + /* + * spin_lock() below is not the equivalent of smp_wmb(), so + * this is needed to avoid the copy_huge_page writes to become + * visible after the set_pmd_at() write. + */ + smp_wmb(); + + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + page_add_new_anon_rmap(new_page, vma, address); + set_pmd_at(mm, address, pmd, _pmd); + update_mmu_cache(vma, address, entry); + prepare_pmd_huge_pte(pgtable, mm); + mm->nr_ptes--; + spin_unlock(&mm->page_table_lock); + +#ifndef CONFIG_NUMA + *hpage = NULL; +#endif + khugepaged_pages_collapsed++; +out_up_write: + up_write(&mm->mmap_sem); + return; + +out: +#ifdef CONFIG_NUMA + put_page(new_page); +#endif + goto out_up_write; +} + +static int khugepaged_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + struct page **hpage) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, *_pte; + int ret = 0, referenced = 0, none = 0; + struct page *page; + unsigned long _address; + spinlock_t *ptl; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + goto out; + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, _address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval)) { + if (++none <= khugepaged_max_ptes_none) + continue; + else + goto out_unmap; + } + if (!pte_present(pteval) || !pte_write(pteval)) + goto out_unmap; + page = vm_normal_page(vma, _address, pteval); + if (unlikely(!page)) + goto out_unmap; + VM_BUG_ON(PageCompound(page)); + if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) + goto out_unmap; + /* cannot use mapcount: can't collapse if there's a gup pin */ + if (page_count(page) != 1) + goto out_unmap; + if (pte_young(pteval) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) + referenced = 1; + } + if (referenced) + ret = 1; +out_unmap: + pte_unmap_unlock(pte, ptl); + if (ret) + /* collapse_huge_page will return with the mmap_sem released */ + collapse_huge_page(mm, address, hpage, vma); +out: + return ret; +} + +static void collect_mm_slot(struct mm_slot *mm_slot) +{ + struct mm_struct *mm = mm_slot->mm; + + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_test_exit(mm)) { + /* free mm_slot */ + hlist_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + + /* + * Not strictly needed because the mm exited already. + * + * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + */ + + /* khugepaged_mm_lock actually not necessary for the below */ + free_mm_slot(mm_slot); + mmdrop(mm); + } +} + +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, + struct page **hpage) +{ + struct mm_slot *mm_slot; + struct mm_struct *mm; + struct vm_area_struct *vma; + int progress = 0; + + VM_BUG_ON(!pages); + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_scan.mm_slot) + mm_slot = khugepaged_scan.mm_slot; + else { + mm_slot = list_entry(khugepaged_scan.mm_head.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + khugepaged_scan.mm_slot = mm_slot; + } + spin_unlock(&khugepaged_mm_lock); + + mm = mm_slot->mm; + down_read(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + vma = NULL; + else + vma = find_vma(mm, khugepaged_scan.address); + + progress++; + for (; vma; vma = vma->vm_next) { + unsigned long hstart, hend; + + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) { + progress++; + break; + } + + if ((!(vma->vm_flags & VM_HUGEPAGE) && + !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) { + progress++; + continue; + } + + /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { + khugepaged_scan.address = vma->vm_end; + progress++; + continue; + } + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart >= hend) { + progress++; + continue; + } + if (khugepaged_scan.address < hstart) + khugepaged_scan.address = hstart; + if (khugepaged_scan.address > hend) { + khugepaged_scan.address = hend + HPAGE_PMD_SIZE; + progress++; + continue; + } + BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + + while (khugepaged_scan.address < hend) { + int ret; + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) + goto breakouterloop; + + VM_BUG_ON(khugepaged_scan.address < hstart || + khugepaged_scan.address + HPAGE_PMD_SIZE > + hend); + ret = khugepaged_scan_pmd(mm, vma, + khugepaged_scan.address, + hpage); + /* move to next address */ + khugepaged_scan.address += HPAGE_PMD_SIZE; + progress += HPAGE_PMD_NR; + if (ret) + /* we released mmap_sem so break loop */ + goto breakouterloop_mmap_sem; + if (progress >= pages) + goto breakouterloop; + } + } +breakouterloop: + up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ +breakouterloop_mmap_sem: + + spin_lock(&khugepaged_mm_lock); + BUG_ON(khugepaged_scan.mm_slot != mm_slot); + /* + * Release the current mm_slot if this mm is about to die, or + * if we scanned all vmas of this mm. + */ + if (khugepaged_test_exit(mm) || !vma) { + /* + * Make sure that if mm_users is reaching zero while + * khugepaged runs here, khugepaged_exit will find + * mm_slot not pointing to the exiting mm. + */ + if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { + khugepaged_scan.mm_slot = list_entry( + mm_slot->mm_node.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + } else { + khugepaged_scan.mm_slot = NULL; + khugepaged_full_scans++; + } + + collect_mm_slot(mm_slot); + } + + return progress; +} + +static int khugepaged_has_work(void) +{ + return !list_empty(&khugepaged_scan.mm_head) && + khugepaged_enabled(); +} + +static int khugepaged_wait_event(void) +{ + return !list_empty(&khugepaged_scan.mm_head) || + !khugepaged_enabled(); +} + +static void khugepaged_do_scan(struct page **hpage) +{ + unsigned int progress = 0, pass_through_head = 0; + unsigned int pages = khugepaged_pages_to_scan; + + barrier(); /* write khugepaged_pages_to_scan to local stack */ + + while (progress < pages) { + cond_resched(); + +#ifndef CONFIG_NUMA + if (!*hpage) { + *hpage = alloc_hugepage(khugepaged_defrag()); + if (unlikely(!*hpage)) + break; + } +#else + if (IS_ERR(*hpage)) + break; +#endif + + if (unlikely(kthread_should_stop() || freezing(current))) + break; + + spin_lock(&khugepaged_mm_lock); + if (!khugepaged_scan.mm_slot) + pass_through_head++; + if (khugepaged_has_work() && + pass_through_head < 2) + progress += khugepaged_scan_mm_slot(pages - progress, + hpage); + else + progress = pages; + spin_unlock(&khugepaged_mm_lock); + } +} + +static void khugepaged_alloc_sleep(void) +{ + DEFINE_WAIT(wait); + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); +} + +#ifndef CONFIG_NUMA +static struct page *khugepaged_alloc_hugepage(void) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) + khugepaged_alloc_sleep(); + } while (unlikely(!hpage) && + likely(khugepaged_enabled())); + return hpage; +} +#endif + +static void khugepaged_loop(void) +{ + struct page *hpage; + +#ifdef CONFIG_NUMA + hpage = NULL; +#endif + while (likely(khugepaged_enabled())) { +#ifndef CONFIG_NUMA + hpage = khugepaged_alloc_hugepage(); + if (unlikely(!hpage)) + break; +#else + if (IS_ERR(hpage)) { + khugepaged_alloc_sleep(); + hpage = NULL; + } +#endif + + khugepaged_do_scan(&hpage); +#ifndef CONFIG_NUMA + if (hpage) + put_page(hpage); +#endif + try_to_freeze(); + if (unlikely(kthread_should_stop())) + break; + if (khugepaged_has_work()) { + DEFINE_WAIT(wait); + if (!khugepaged_scan_sleep_millisecs) + continue; + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_scan_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); + } else if (khugepaged_enabled()) + wait_event_freezable(khugepaged_wait, + khugepaged_wait_event()); + } +} + +static int khugepaged(void *none) +{ + struct mm_slot *mm_slot; + + set_freezable(); + set_user_nice(current, 19); + + /* serialize with start_khugepaged() */ + mutex_lock(&khugepaged_mutex); + + for (;;) { + mutex_unlock(&khugepaged_mutex); + BUG_ON(khugepaged_thread != current); + khugepaged_loop(); + BUG_ON(khugepaged_thread != current); + + mutex_lock(&khugepaged_mutex); + if (!khugepaged_enabled()) + break; + if (unlikely(kthread_should_stop())) + break; + } + + spin_lock(&khugepaged_mm_lock); + mm_slot = khugepaged_scan.mm_slot; + khugepaged_scan.mm_slot = NULL; + if (mm_slot) + collect_mm_slot(mm_slot); + spin_unlock(&khugepaged_mm_lock); + + khugepaged_thread = NULL; + mutex_unlock(&khugepaged_mutex); + + return 0; +} + +void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) +{ + struct page *page; + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_trans_huge(*pmd))) { + spin_unlock(&mm->page_table_lock); + return; + } + page = pmd_page(*pmd); + VM_BUG_ON(!page_count(page)); + get_page(page); + spin_unlock(&mm->page_table_lock); + + split_huge_page(page); + + put_page(page); + BUG_ON(pmd_trans_huge(*pmd)); +} + +static void split_huge_page_address(struct mm_struct *mm, + unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return; + /* + * Caller holds the mmap_sem write mode, so a huge pmd cannot + * materialize from under us. + */ + split_huge_page_pmd(mm, pmd); +} + +void __vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + /* + * If the new start address isn't hpage aligned and it could + * previously contain an hugepage: check if we need to split + * an huge pmd. + */ + if (start & ~HPAGE_PMD_MASK && + (start & HPAGE_PMD_MASK) >= vma->vm_start && + (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) + split_huge_page_address(vma->vm_mm, start); + + /* + * If the new end address isn't hpage aligned and it could + * previously contain an hugepage: check if we need to split + * an huge pmd. + */ + if (end & ~HPAGE_PMD_MASK && + (end & HPAGE_PMD_MASK) >= vma->vm_start && + (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) + split_huge_page_address(vma->vm_mm, end); + + /* + * If we're also updating the vma->vm_next->vm_start, if the new + * vm_next->vm_start isn't page aligned and it could previously + * contain an hugepage: check if we need to split an huge pmd. + */ + if (adjust_next > 0) { + struct vm_area_struct *next = vma->vm_next; + unsigned long nstart = next->vm_start; + nstart += adjust_next << PAGE_SHIFT; + if (nstart & ~HPAGE_PMD_MASK && + (nstart & HPAGE_PMD_MASK) >= next->vm_start && + (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) + split_huge_page_address(next->vm_mm, nstart); + } +} diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8585524..bb0b7c1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -394,71 +394,6 @@ static int vma_has_reserves(struct vm_area_struct *vma) return 0; } -static void clear_gigantic_page(struct page *page, - unsigned long addr, unsigned long sz) -{ - int i; - struct page *p = page; - - might_sleep(); - for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) { - cond_resched(); - clear_user_highpage(p, addr + i * PAGE_SIZE); - } -} -static void clear_huge_page(struct page *page, - unsigned long addr, unsigned long sz) -{ - int i; - - if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { - clear_gigantic_page(page, addr, sz); - return; - } - - might_sleep(); - for (i = 0; i < sz/PAGE_SIZE; i++) { - cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); - } -} - -static void copy_user_gigantic_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma) -{ - int i; - struct hstate *h = hstate_vma(vma); - struct page *dst_base = dst; - struct page *src_base = src; - - for (i = 0; i < pages_per_huge_page(h); ) { - cond_resched(); - copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); - } -} - -static void copy_user_huge_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma) -{ - int i; - struct hstate *h = hstate_vma(vma); - - if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { - copy_user_gigantic_page(dst, src, addr, vma); - return; - } - - might_sleep(); - for (i = 0; i < pages_per_huge_page(h); i++) { - cond_resched(); - copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); - } -} - static void copy_gigantic_page(struct page *dst, struct page *src) { int i; @@ -1428,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj, return sprintf(buf, "%lu\n", nr_huge_pages); } + static ssize_t nr_hugepages_store_common(bool obey_mempolicy, struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) @@ -1440,9 +1376,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, err = strict_strtoul(buf, 10, &count); if (err) - return 0; + goto out; h = kobj_to_hstate(kobj, &nid); + if (h->order >= MAX_ORDER) { + err = -EINVAL; + goto out; + } + if (nid == NUMA_NO_NODE) { /* * global hstate attribute @@ -1468,6 +1409,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, NODEMASK_FREE(nodes_allowed); return len; +out: + NODEMASK_FREE(nodes_allowed); + return err; } static ssize_t nr_hugepages_show(struct kobject *kobj, @@ -1510,6 +1454,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, struct hstate *h = kobj_to_hstate(kobj, NULL); return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); } + static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { @@ -1517,9 +1462,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, unsigned long input; struct hstate *h = kobj_to_hstate(kobj, NULL); + if (h->order >= MAX_ORDER) + return -EINVAL; + err = strict_strtoul(buf, 10, &input); if (err) - return 0; + return err; spin_lock(&hugetlb_lock); h->nr_overcommit_huge_pages = input; @@ -1922,13 +1870,19 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, { struct hstate *h = &default_hstate; unsigned long tmp; + int ret; if (!write) tmp = h->max_huge_pages; + if (write && h->order >= MAX_ORDER) + return -EINVAL; + table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret) + goto out; if (write) { NODEMASK_ALLOC(nodemask_t, nodes_allowed, @@ -1943,8 +1897,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, if (nodes_allowed != &node_states[N_HIGH_MEMORY]) NODEMASK_FREE(nodes_allowed); } - - return 0; +out: + return ret; } int hugetlb_sysctl_handler(struct ctl_table *table, int write, @@ -1982,21 +1936,27 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, { struct hstate *h = &default_hstate; unsigned long tmp; + int ret; if (!write) tmp = h->nr_overcommit_huge_pages; + if (write && h->order >= MAX_ORDER) + return -EINVAL; + table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret) + goto out; if (write) { spin_lock(&hugetlb_lock); h->nr_overcommit_huge_pages = tmp; spin_unlock(&hugetlb_lock); } - - return 0; +out: + return ret; } #endif /* CONFIG_SYSCTL */ @@ -2454,7 +2414,8 @@ retry_avoidcopy: return VM_FAULT_OOM; } - copy_user_huge_page(new_page, old_page, address, vma); + copy_user_huge_page(new_page, old_page, address, vma, + pages_per_huge_page(h)); __SetPageUptodate(new_page); /* @@ -2558,7 +2519,7 @@ retry: ret = -PTR_ERR(page); goto out; } - clear_huge_page(page, address, huge_page_size(h)); + clear_huge_page(page, address, pages_per_huge_page(h)); __SetPageUptodate(page); if (vma->vm_flags & VM_MAYSHARE) { diff --git a/mm/internal.h b/mm/internal.h index dedb0af..4c98630 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -39,6 +39,15 @@ static inline void __put_page(struct page *page) extern unsigned long highest_memmap_pfn; +#ifdef CONFIG_SMP +extern int putback_active_lru_page(struct zone *zone, struct page *page); +#else +static inline int putback_active_lru_page(struct zone *zone, struct page *page) +{ + return 0; +} +#endif + /* * in mm/vmscan.c: */ @@ -134,6 +143,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) } } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern unsigned long vma_address(struct page *page, + struct vm_area_struct *vma); +#endif #else /* !CONFIG_MMU */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { @@ -243,7 +256,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas); + struct page **pages, struct vm_area_struct **vmas, + int *nonblocking); #define ZONE_RECLAIM_NOSCAN -2 #define ZONE_RECLAIM_FULL -1 diff --git a/mm/ksm.c b/mm/ksm.c index 43bc893..c2b2a94 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include "internal.h" @@ -411,6 +412,20 @@ out: up_read(&mm->mmap_sem); } +static struct page *page_trans_compound_anon(struct page *page) +{ + if (PageTransCompound(page)) { + struct page *head = compound_trans_head(page); + /* + * head may actually be splitted and freed from under + * us but it's ok here. + */ + if (PageAnon(head)) + return head; + } + return NULL; +} + static struct page *get_mergeable_page(struct rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; @@ -430,7 +445,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) page = follow_page(vma, addr, FOLL_GET); if (IS_ERR_OR_NULL(page)) goto out; - if (PageAnon(page)) { + if (PageAnon(page) || page_trans_compound_anon(page)) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } else { @@ -708,6 +723,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (addr == -EFAULT) goto out; + BUG_ON(PageTransCompound(page)); ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) goto out; @@ -783,6 +799,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, goto out; pmd = pmd_offset(pud, addr); + BUG_ON(pmd_trans_huge(*pmd)); if (!pmd_present(*pmd)) goto out; @@ -800,6 +817,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); page_remove_rmap(page); + if (!page_mapped(page)) + try_to_free_swap(page); put_page(page); pte_unmap_unlock(ptep, ptl); @@ -808,6 +827,33 @@ out: return err; } +static int page_trans_compound_anon_split(struct page *page) +{ + int ret = 0; + struct page *transhuge_head = page_trans_compound_anon(page); + if (transhuge_head) { + /* Get the reference on the head to split it. */ + if (get_page_unless_zero(transhuge_head)) { + /* + * Recheck we got the reference while the head + * was still anonymous. + */ + if (PageAnon(transhuge_head)) + ret = split_huge_page(transhuge_head); + else + /* + * Retry later if split_huge_page run + * from under us. + */ + ret = 1; + put_page(transhuge_head); + } else + /* Retry later if split_huge_page run from under us. */ + ret = 1; + } + return ret; +} + /* * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page @@ -828,6 +874,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, if (!(vma->vm_flags & VM_MERGEABLE)) goto out; + if (PageTransCompound(page) && page_trans_compound_anon_split(page)) + goto out; + BUG_ON(PageTransCompound(page)); if (!PageAnon(page)) goto out; @@ -1247,6 +1296,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) slot = ksm_scan.mm_slot; if (slot == &ksm_mm_head) { + /* + * A number of pages can hang around indefinitely on per-cpu + * pagevecs, raised page count preventing write_protect_page + * from merging them. Though it doesn't really matter much, + * it is puzzling to see some stuck in pages_volatile until + * other activity jostles them out, and they also prevented + * LTP's KSM test from succeeding deterministically; so drain + * them here (here rather than on entry to ksm_do_scan(), + * so we don't IPI too often when pages_to_scan is set low). + */ + lru_add_drain_all(); + root_unstable_tree = RB_ROOT; spin_lock(&ksm_mmlist_lock); @@ -1277,7 +1338,13 @@ next_mm: if (ksm_test_exit(mm)) break; *page = follow_page(vma, ksm_scan.address, FOLL_GET); - if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { + if (IS_ERR_OR_NULL(*page)) { + ksm_scan.address += PAGE_SIZE; + cond_resched(); + continue; + } + if (PageAnon(*page) || + page_trans_compound_anon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); rmap_item = get_next_rmap_item(slot, @@ -1291,8 +1358,7 @@ next_mm: up_read(&mm->mmap_sem); return rmap_item; } - if (!IS_ERR_OR_NULL(*page)) - put_page(*page); + put_page(*page); ksm_scan.address += PAGE_SIZE; cond_resched(); } @@ -1352,7 +1418,7 @@ static void ksm_do_scan(unsigned int scan_npages) struct rmap_item *rmap_item; struct page *uninitialized_var(page); - while (scan_npages--) { + while (scan_npages-- && likely(!freezing(current))) { cond_resched(); rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) @@ -1370,6 +1436,7 @@ static int ksmd_should_run(void) static int ksm_scan_thread(void *nothing) { + set_freezable(); set_user_nice(current, 5); while (!kthread_should_stop()) { @@ -1378,11 +1445,13 @@ static int ksm_scan_thread(void *nothing) ksm_do_scan(ksm_thread_pages_to_scan); mutex_unlock(&ksm_thread_mutex); + try_to_freeze(); + if (ksmd_should_run()) { schedule_timeout_interruptible( msecs_to_jiffies(ksm_thread_sleep_millisecs)); } else { - wait_event_interruptible(ksm_thread_wait, + wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); } } diff --git a/mm/madvise.c b/mm/madvise.c index 319528b..2221491 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma, if (error) goto out; break; + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: + error = hugepage_madvise(vma, &new_flags, behavior); + if (error) + goto out; + break; } if (new_flags == vma->vm_flags) { @@ -283,6 +289,10 @@ madvise_behavior_valid(int behavior) case MADV_MERGEABLE: case MADV_UNMERGEABLE: #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: +#endif return 1; default: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 00bb8a6..8ab8410 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -292,7 +292,6 @@ static struct move_charge_struct { unsigned long moved_charge; unsigned long moved_swap; struct task_struct *moving_task; /* a task moving charges */ - struct mm_struct *mm; wait_queue_head_t waitq; /* a waitq for other context */ } mc = { .lock = __SPIN_LOCK_UNLOCKED(mc.lock), @@ -821,7 +820,6 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) return; VM_BUG_ON(list_empty(&pc->lru)); list_del_init(&pc->lru); - return; } void mem_cgroup_del_lru(struct page *page) @@ -1087,7 +1085,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: /* we don't affect global LRU but rotate in our LRU */ @@ -1312,8 +1310,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) u64 limit; u64 memsw; - limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + - total_swap_pages; + limit = res_counter_read_u64(&memcg->res, RES_LIMIT); + limit += total_swap_pages << PAGE_SHIFT; + memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); /* * If memsw is finite and limits the amount of swap space available @@ -1600,11 +1599,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) * possibility of race condition. If there is, we take a lock. */ -static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) +void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx, int val) { struct mem_cgroup *mem; struct page_cgroup *pc = lookup_page_cgroup(page); bool need_unlock = false; + unsigned long uninitialized_var(flags); if (unlikely(!pc)) return; @@ -1616,37 +1617,34 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) /* pc->mem_cgroup is unstable ? */ if (unlikely(mem_cgroup_stealed(mem))) { /* take a lock against to access pc->mem_cgroup */ - lock_page_cgroup(pc); + move_lock_page_cgroup(pc, &flags); need_unlock = true; mem = pc->mem_cgroup; if (!mem || !PageCgroupUsed(pc)) goto out; } - this_cpu_add(mem->stat->count[idx], val); - switch (idx) { - case MEM_CGROUP_STAT_FILE_MAPPED: + case MEMCG_NR_FILE_MAPPED: if (val > 0) SetPageCgroupFileMapped(pc); else if (!page_mapped(page)) ClearPageCgroupFileMapped(pc); + idx = MEM_CGROUP_STAT_FILE_MAPPED; break; default: BUG(); } + this_cpu_add(mem->stat->count[idx], val); + out: if (unlikely(need_unlock)) - unlock_page_cgroup(pc); + move_unlock_page_cgroup(pc, &flags); rcu_read_unlock(); return; } - -void mem_cgroup_update_file_mapped(struct page *page, int val) -{ - mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val); -} +EXPORT_SYMBOL(mem_cgroup_update_page_stat); /* * size of first charge trial. "32" comes from vmscan.c's magic value. @@ -1887,12 +1885,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, * oom-killer can be invoked. */ static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) + gfp_t gfp_mask, + struct mem_cgroup **memcg, bool oom, + int page_size) { int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem = NULL; int ret; - int csize = CHARGE_SIZE; + int csize = max(CHARGE_SIZE, (unsigned long) page_size); /* * Unlike gloval-vm's OOM-kill, we're not in memory shortage @@ -1917,7 +1917,7 @@ again: VM_BUG_ON(css_is_removed(&mem->css)); if (mem_cgroup_is_root(mem)) goto done; - if (consume_stock(mem)) + if (page_size == PAGE_SIZE && consume_stock(mem)) goto done; css_get(&mem->css); } else { @@ -1940,7 +1940,7 @@ again: rcu_read_unlock(); goto done; } - if (consume_stock(mem)) { + if (page_size == PAGE_SIZE && consume_stock(mem)) { /* * It seems dagerous to access memcg without css_get(). * But considering how consume_stok works, it's not @@ -1981,7 +1981,7 @@ again: case CHARGE_OK: break; case CHARGE_RETRY: /* not in OOM situation but retry */ - csize = PAGE_SIZE; + csize = page_size; css_put(&mem->css); mem = NULL; goto again; @@ -2002,8 +2002,8 @@ again: } } while (ret != CHARGE_OK); - if (csize > PAGE_SIZE) - refill_stock(mem, csize - PAGE_SIZE); + if (csize > page_size) + refill_stock(mem, csize - page_size); css_put(&mem->css); done: *memcg = mem; @@ -2031,9 +2031,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, } } -static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) +static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, + int page_size) { - __mem_cgroup_cancel_charge(mem, 1); + __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT); } /* @@ -2087,22 +2088,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be * USED state. If already USED, uncharge and return. */ - -static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype) +static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem, + struct page_cgroup *pc, + enum charge_type ctype) { - /* try_charge() can return NULL to *memcg, taking care of it. */ - if (!mem) - return; - - lock_page_cgroup(pc); - if (unlikely(PageCgroupUsed(pc))) { - unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem); - return; - } - pc->mem_cgroup = mem; /* * We access a page_cgroup asynchronously without lock_page_cgroup(). @@ -2127,6 +2116,33 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, } mem_cgroup_charge_statistics(mem, pc, true); +} + +static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, + struct page_cgroup *pc, + enum charge_type ctype, + int page_size) +{ + int i; + int count = page_size >> PAGE_SHIFT; + + /* try_charge() can return NULL to *memcg, taking care of it. */ + if (!mem) + return; + + lock_page_cgroup(pc); + if (unlikely(PageCgroupUsed(pc))) { + unlock_page_cgroup(pc); + mem_cgroup_cancel_charge(mem, page_size); + return; + } + + /* + * we don't need page_cgroup_lock about tail pages, becase they are not + * accessed by any other context at this point. + */ + for (i = 0; i < count; i++) + ____mem_cgroup_commit_charge(mem, pc + i, ctype); unlock_page_cgroup(pc); /* @@ -2173,7 +2189,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, mem_cgroup_charge_statistics(from, pc, false); if (uncharge) /* This is not "cancel", but cancel_charge does all we need. */ - mem_cgroup_cancel_charge(from); + mem_cgroup_cancel_charge(from, PAGE_SIZE); /* caller should have done css_get */ pc->mem_cgroup = to; @@ -2195,9 +2211,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) { int ret = -EINVAL; + unsigned long flags; + lock_page_cgroup(pc); if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { + move_lock_page_cgroup(pc, &flags); __mem_cgroup_move_account(pc, from, to, uncharge); + move_unlock_page_cgroup(pc, &flags); ret = 0; } unlock_page_cgroup(pc); @@ -2234,13 +2254,14 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, goto put; parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, + PAGE_SIZE); if (ret || !parent) goto put_back; ret = mem_cgroup_move_account(pc, child, parent, true); if (ret) - mem_cgroup_cancel_charge(parent); + mem_cgroup_cancel_charge(parent, PAGE_SIZE); put_back: putback_lru_page(page); put: @@ -2261,6 +2282,12 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, struct mem_cgroup *mem = NULL; struct page_cgroup *pc; int ret; + int page_size = PAGE_SIZE; + + if (PageTransHuge(page)) { + page_size <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + } pc = lookup_page_cgroup(page); /* can happen at boot */ @@ -2268,11 +2295,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, return 0; prefetchw(pc); - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size); if (ret || !mem) return ret; - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, pc, ctype, page_size); return 0; } @@ -2281,8 +2308,6 @@ int mem_cgroup_newpage_charge(struct page *page, { if (mem_cgroup_disabled()) return 0; - if (PageCompound(page)) - return 0; /* * If already mapped, we don't have to account. * If page cache, page->mapping has address_space. @@ -2388,13 +2413,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); + ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true); + return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); } static void @@ -2410,7 +2435,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); - __mem_cgroup_commit_charge(ptr, pc, ctype); + __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); mem_cgroup_lru_add_after_commit_swapcache(page); /* * Now swap is on-memory. This means this page may be @@ -2459,11 +2484,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) return; if (!mem) return; - mem_cgroup_cancel_charge(mem); + mem_cgroup_cancel_charge(mem, PAGE_SIZE); } static void -__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) +__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, + int page_size) { struct memcg_batch_info *batch = NULL; bool uncharge_memsw = true; @@ -2490,6 +2516,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) goto direct_uncharge; + if (page_size != PAGE_SIZE) + goto direct_uncharge; + /* * In typical case, batch->memcg == mem. This means we can * merge a series of uncharges to an uncharge of res_counter. @@ -2503,9 +2532,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) batch->memsw_bytes += PAGE_SIZE; return; direct_uncharge: - res_counter_uncharge(&mem->res, PAGE_SIZE); + res_counter_uncharge(&mem->res, page_size); if (uncharge_memsw) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + res_counter_uncharge(&mem->memsw, page_size); if (unlikely(batch->memcg != mem)) memcg_oom_recover(mem); return; @@ -2517,8 +2546,11 @@ direct_uncharge: static struct mem_cgroup * __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { + int i; + int count; struct page_cgroup *pc; struct mem_cgroup *mem = NULL; + int page_size = PAGE_SIZE; if (mem_cgroup_disabled()) return NULL; @@ -2526,6 +2558,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (PageSwapCache(page)) return NULL; + if (PageTransHuge(page)) { + page_size <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + } + + count = page_size >> PAGE_SHIFT; /* * Check if our page_cgroup is valid */ @@ -2558,7 +2596,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - mem_cgroup_charge_statistics(mem, pc, false); + for (i = 0; i < count; i++) + mem_cgroup_charge_statistics(mem, pc + i, false); ClearPageCgroupUsed(pc); /* @@ -2579,7 +2618,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) mem_cgroup_get(mem); } if (!mem_cgroup_is_root(mem)) - __do_uncharge(mem, ctype); + __do_uncharge(mem, ctype, page_size); return mem; @@ -2774,6 +2813,7 @@ int mem_cgroup_prepare_migration(struct page *page, enum charge_type ctype; int ret = 0; + VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return 0; @@ -2823,7 +2863,7 @@ int mem_cgroup_prepare_migration(struct page *page, return 0; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE); css_put(&mem->css);/* drop extra refcnt */ if (ret || *ptr == NULL) { if (PageAnon(page)) { @@ -2850,13 +2890,13 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); return ret; } /* remove redundant charge if migration failed*/ void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage) + struct page *oldpage, struct page *newpage, bool migration_ok) { struct page *used, *unused; struct page_cgroup *pc; @@ -2865,8 +2905,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, return; /* blocks rmdir() */ cgroup_exclude_rmdir(&mem->css); - /* at migration success, oldpage->mapping is NULL. */ - if (oldpage->mapping) { + if (!migration_ok) { used = oldpage; unused = newpage; } else { @@ -4176,13 +4215,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) */ if (!node_state(node, N_NORMAL_MEMORY)) tmp = -1; - pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); + pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); if (!pn) return 1; mem->info.nodeinfo[node] = pn; - memset(pn, 0, sizeof(*pn)); - for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; for_each_lru(l) @@ -4206,14 +4243,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void) /* Can be very big if MAX_NUMNODES is very big */ if (size < PAGE_SIZE) - mem = kmalloc(size, GFP_KERNEL); + mem = kzalloc(size, GFP_KERNEL); else - mem = vmalloc(size); + mem = vzalloc(size); if (!mem) return NULL; - memset(mem, 0, size); mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); if (!mem->stat) goto out_free; @@ -4461,7 +4497,8 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, + PAGE_SIZE); if (ret || !mem) /* mem_cgroup_clear_mc() will do uncharge later */ return -ENOMEM; @@ -4623,6 +4660,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) if (is_target_pte_for_mc(vma, addr, *pte, NULL)) @@ -4638,7 +4676,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; struct vm_area_struct *vma; - /* We've already held the mmap_sem */ + down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { struct mm_walk mem_cgroup_count_precharge_walk = { .pmd_entry = mem_cgroup_count_precharge_pte_range, @@ -4650,6 +4688,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) walk_page_range(vma->vm_start, vma->vm_end, &mem_cgroup_count_precharge_walk); } + up_read(&mm->mmap_sem); precharge = mc.precharge; mc.precharge = 0; @@ -4659,10 +4698,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) static int mem_cgroup_precharge_mc(struct mm_struct *mm) { - return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); + unsigned long precharge = mem_cgroup_count_precharge(mm); + + VM_BUG_ON(mc.moving_task); + mc.moving_task = current; + return mem_cgroup_do_precharge(precharge); } -static void mem_cgroup_clear_mc(void) +/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ +static void __mem_cgroup_clear_mc(void) { struct mem_cgroup *from = mc.from; struct mem_cgroup *to = mc.to; @@ -4697,23 +4741,28 @@ static void mem_cgroup_clear_mc(void) PAGE_SIZE * mc.moved_swap); } /* we've already done mem_cgroup_get(mc.to) */ - mc.moved_swap = 0; } - if (mc.mm) { - up_read(&mc.mm->mmap_sem); - mmput(mc.mm); - } + memcg_oom_recover(from); + memcg_oom_recover(to); + wake_up_all(&mc.waitq); +} + +static void mem_cgroup_clear_mc(void) +{ + struct mem_cgroup *from = mc.from; + + /* + * we must clear moving_task before waking up waiters at the end of + * task migration. + */ + mc.moving_task = NULL; + __mem_cgroup_clear_mc(); spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; spin_unlock(&mc.lock); - mc.moving_task = NULL; - mc.mm = NULL; mem_cgroup_end_move(from); - memcg_oom_recover(from); - memcg_oom_recover(to); - wake_up_all(&mc.waitq); } static int mem_cgroup_can_attach(struct cgroup_subsys *ss, @@ -4735,38 +4784,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, return 0; /* We move charges only when we move a owner of the mm */ if (mm->owner == p) { - /* - * We do all the move charge works under one mmap_sem to - * avoid deadlock with down_write(&mmap_sem) - * -> try_charge() -> if (mc.moving_task) -> sleep. - */ - down_read(&mm->mmap_sem); - VM_BUG_ON(mc.from); VM_BUG_ON(mc.to); VM_BUG_ON(mc.precharge); VM_BUG_ON(mc.moved_charge); VM_BUG_ON(mc.moved_swap); - VM_BUG_ON(mc.moving_task); - VM_BUG_ON(mc.mm); - mem_cgroup_start_move(from); spin_lock(&mc.lock); mc.from = from; mc.to = mem; - mc.precharge = 0; - mc.moved_charge = 0; - mc.moved_swap = 0; spin_unlock(&mc.lock); - mc.moving_task = current; - mc.mm = mm; + /* We set mc.moving_task later */ ret = mem_cgroup_precharge_mc(mm); if (ret) mem_cgroup_clear_mc(); - /* We call up_read() and mmput() in clear_mc(). */ - } else - mmput(mm); + } + mmput(mm); } return ret; } @@ -4789,6 +4823,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, spinlock_t *ptl; retry: + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { pte_t ptent = *(pte++); @@ -4854,7 +4889,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) struct vm_area_struct *vma; lru_add_drain_all(); - /* We've already held the mmap_sem */ +retry: + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + /* + * Someone who are holding the mmap_sem might be waiting in + * waitq. So we cancel all extra charges, wake up all waiters, + * and retry. Because we cancel precharges, we might not be able + * to move enough charges, but moving charge is a best-effort + * feature anyway, so it wouldn't be a big problem. + */ + __mem_cgroup_clear_mc(); + cond_resched(); + goto retry; + } for (vma = mm->mmap; vma; vma = vma->vm_next) { int ret; struct mm_walk mem_cgroup_move_charge_walk = { @@ -4873,6 +4920,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) */ break; } + up_read(&mm->mmap_sem); } static void mem_cgroup_move_task(struct cgroup_subsys *ss, @@ -4881,11 +4929,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct task_struct *p, bool threadgroup) { - if (!mc.mm) + struct mm_struct *mm; + + if (!mc.to) /* no need to move charge */ return; - mem_cgroup_move_charge(mc.mm); + mm = get_task_mm(p); + if (mm) { + mem_cgroup_move_charge(mm); + mmput(mm); + } mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 46ab2c0..548fbd7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -203,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; + si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; /* * Don't use force here, it's convenient if the signal * can be temporarily blocked. @@ -386,6 +386,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct task_struct *tsk; struct anon_vma *av; + if (!PageHuge(page) && unlikely(split_huge_page(page))) + return; read_lock(&tasklist_lock); av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ @@ -928,7 +930,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, static void set_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_order(hpage); + int nr_pages = 1 << compound_trans_order(hpage); for (i = 0; i < nr_pages; i++) SetPageHWPoison(hpage + i); } @@ -936,7 +938,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage) static void clear_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_order(hpage); + int nr_pages = 1 << compound_trans_order(hpage); for (i = 0; i < nr_pages; i++) ClearPageHWPoison(hpage + i); } @@ -966,7 +968,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) return 0; } - nr_pages = 1 << compound_order(hpage); + nr_pages = 1 << compound_trans_order(hpage); atomic_long_add(nr_pages, &mce_bad_pages); /* @@ -1164,7 +1166,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - nr_pages = 1 << compound_order(page); + nr_pages = 1 << compound_trans_order(page); if (!get_page_unless_zero(page)) { /* @@ -1290,9 +1292,10 @@ static int soft_offline_huge_page(struct page *page, int flags) /* Keep page count to indicate a given hugepage is isolated. */ list_add(&hpage->lru, &pagelist); - ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, + true); if (ret) { - putback_lru_pages(&pagelist); + putback_lru_pages(&pagelist); pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) @@ -1301,7 +1304,7 @@ static int soft_offline_huge_page(struct page *page, int flags) } done: if (!PageHWPoison(hpage)) - atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); + atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); /* keep elevated page count for bad page */ @@ -1413,7 +1416,8 @@ int soft_offline_page(struct page *page, int flags) LIST_HEAD(pagelist); list_add(&page->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + 0, true); if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); diff --git a/mm/memory.c b/mm/memory.c index 02e48aa..31250fa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address) { pgtable_t new = pte_alloc_one(mm, address); + int wait_split_huge_page; if (!new) return -ENOMEM; @@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ spin_lock(&mm->page_table_lock); - if (!pmd_present(*pmd)) { /* Has another populated it ? */ + wait_split_huge_page = 0; + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ mm->nr_ptes++; pmd_populate(mm, pmd, new); new = NULL; - } + } else if (unlikely(pmd_trans_splitting(*pmd))) + wait_split_huge_page = 1; spin_unlock(&mm->page_table_lock); if (new) pte_free(mm, new); + if (wait_split_huge_page) + wait_split_huge_page(vma->anon_vma, pmd); return 0; } @@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&init_mm.page_table_lock); - if (!pmd_present(*pmd)) { /* Has another populated it ? */ + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; - } + } else + VM_BUG_ON(pmd_trans_splitting(*pmd)); spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); @@ -719,9 +726,9 @@ out_set_pte: return 0; } -static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) +int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) { pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; @@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*src_pmd)) { + int err; + VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); + err = copy_huge_pmd(dst_mm, src_mm, + dst_pmd, src_pmd, addr, vma); + if (err == -ENOMEM) + return -ENOMEM; + if (!err) + continue; + /* fall through */ + } if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, @@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (next-addr != HPAGE_PMD_SIZE) { + VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); + split_huge_page_pmd(vma->vm_mm, pmd); + } else if (zap_huge_pmd(tlb, vma, pmd)) { + (*zap_work)--; + continue; + } + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) { (*zap_work)--; continue; @@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, pud = pud_offset(pgd, address); if (pud_none(*pud)) goto no_page_table; - if (pud_huge(*pud)) { + if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { BUG_ON(flags & FOLL_GET); page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); goto out; @@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) goto no_page_table; - if (pmd_huge(*pmd)) { + if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } + if (pmd_trans_huge(*pmd)) { + if (flags & FOLL_SPLIT) { + split_huge_page_pmd(mm, pmd); + goto split_fallthrough; + } + spin_lock(&mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + page = follow_trans_huge_pmd(mm, address, + pmd, flags); + spin_unlock(&mm->page_table_lock); + goto out; + } + } else + spin_unlock(&mm->page_table_lock); + /* fall through */ + } +split_fallthrough: if (unlikely(pmd_bad(*pmd))) goto no_page_table; @@ -1310,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, */ mark_page_accessed(page); } + if (flags & FOLL_MLOCK) { + /* + * The preliminary mapping check is mainly to avoid the + * pointless overhead of lock_page on the ZERO_PAGE + * which might bounce very badly if there is contention. + * + * If the page is already locked, we don't need to + * handle it now - vmscan will handle it later if and + * when it attempts to reclaim the page. + */ + if (page->mapping && trylock_page(page)) { + lru_add_drain(); /* push cached pages to LRU */ + /* + * Because we lock page here and migration is + * blocked by the pte's page reference, we need + * only check for file-cache page truncation. + */ + if (page->mapping) + mlock_vma_page(page); + unlock_page(page); + } + } unlock: pte_unmap_unlock(ptep, ptl); out: @@ -1341,7 +1412,8 @@ no_page_table: int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, unsigned int gup_flags, - struct page **pages, struct vm_area_struct **vmas) + struct page **pages, struct vm_area_struct **vmas, + int *nonblocking) { int i; unsigned long vm_flags; @@ -1386,6 +1458,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pmd = pmd_offset(pud, pg); if (pmd_none(*pmd)) return i ? : -EFAULT; + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, pg); if (pte_none(*pte)) { pte_unmap(pte); @@ -1441,10 +1514,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; + unsigned int fault_flags = 0; + + if (foll_flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (nonblocking) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; ret = handle_mm_fault(mm, vma, start, - (foll_flags & FOLL_WRITE) ? - FAULT_FLAG_WRITE : 0); + fault_flags); if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) @@ -1460,6 +1538,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, else tsk->min_flt++; + if (ret & VM_FAULT_RETRY) { + *nonblocking = 0; + return i; + } + /* * The VM_FAULT_WRITE bit tells us that * do_wp_page has broken COW when necessary, @@ -1559,7 +1642,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, + NULL); } EXPORT_SYMBOL(get_user_pages); @@ -1584,7 +1668,8 @@ struct page *get_dump_page(unsigned long addr) struct page *page; if (__get_user_pages(current, current->mm, addr, 1, - FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) + FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, + NULL) < 1) return NULL; flush_cache_page(vma, addr, page_to_pfn(page)); return page; @@ -1598,8 +1683,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, pud_t * pud = pud_alloc(mm, pgd, addr); if (pud) { pmd_t * pmd = pmd_alloc(mm, pud, addr); - if (pmd) + if (pmd) { + VM_BUG_ON(pmd_trans_huge(*pmd)); return pte_alloc_map_lock(mm, pmd, addr, ptl); + } } return NULL; } @@ -1818,6 +1905,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; + VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); if (remap_pte_range(mm, pmd, addr, next, @@ -2048,19 +2136,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, return same; } -/* - * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when - * servicing faults for write access. In the normal case, do always want - * pte_mkwrite. But get_user_pages can cause write faults for mappings - * that do not have writing enabled, when used by access_process_vm. - */ -static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) -{ - if (likely(vma->vm_flags & VM_WRITE)) - pte = pte_mkwrite(pte); - return pte; -} - static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) { /* @@ -2112,7 +2187,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *old_page, *new_page; pte_t entry; - int reuse = 0, ret = 0; + int ret = 0; int page_mkwrite = 0; struct page *dirty_page = NULL; @@ -2149,14 +2224,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } page_cache_release(old_page); } - reuse = reuse_swap_page(old_page); - if (reuse) + if (reuse_swap_page(old_page)) { /* * The page is all ours. Move it to our anon_vma so * the rmap code will not search our parent or siblings. * Protected against the rmap code by the page lock. */ page_move_anon_rmap(old_page, vma, address); + unlock_page(old_page); + goto reuse; + } unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { @@ -2220,18 +2297,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } dirty_page = old_page; get_page(dirty_page); - reuse = 1; - } - if (reuse) { reuse: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, address, page_table, entry,1)) update_mmu_cache(vma, address, page_table); + pte_unmap_unlock(page_table, ptl); ret |= VM_FAULT_WRITE; - goto unlock; + + if (!dirty_page) + return ret; + + /* + * Yes, Virginia, this is actually required to prevent a race + * with clear_page_dirty_for_io() from clearing the page dirty + * bit after it clear all dirty ptes, but before a racing + * do_wp_page installs a dirty pte. + * + * do_no_page is protected similarly. + */ + if (!page_mkwrite) { + wait_on_page_locked(dirty_page); + set_page_dirty_balance(dirty_page, page_mkwrite); + } + put_page(dirty_page); + if (page_mkwrite) { + struct address_space *mapping = dirty_page->mapping; + + set_page_dirty(dirty_page); + unlock_page(dirty_page); + page_cache_release(dirty_page); + if (mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + } + + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); + + return ret; } /* @@ -2337,39 +2448,6 @@ gotten: page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); - if (dirty_page) { - /* - * Yes, Virginia, this is actually required to prevent a race - * with clear_page_dirty_for_io() from clearing the page dirty - * bit after it clear all dirty ptes, but before a racing - * do_wp_page installs a dirty pte. - * - * do_no_page is protected similarly. - */ - if (!page_mkwrite) { - wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page, page_mkwrite); - } - put_page(dirty_page); - if (page_mkwrite) { - struct address_space *mapping = dirty_page->mapping; - - set_page_dirty(dirty_page); - unlock_page(dirty_page); - page_cache_release(dirty_page); - if (mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } - } - - /* file_update_time outside page_lock */ - if (vma->vm_file) - file_update_time(vma->vm_file); - } return ret; oom_free_new: page_cache_release(new_page); @@ -3147,9 +3225,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static inline int handle_pte_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pmd_t *pmd, unsigned int flags) +int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pmd_t *pmd, unsigned int flags) { pte_t entry; spinlock_t *ptl; @@ -3228,9 +3306,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd = pmd_alloc(mm, pud, address); if (!pmd) return VM_FAULT_OOM; - pte = pte_alloc_map(mm, pmd, address); - if (!pte) + if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { + if (!vma->vm_ops) + return do_huge_pmd_anonymous_page(mm, vma, address, + pmd, flags); + } else { + pmd_t orig_pmd = *pmd; + barrier(); + if (pmd_trans_huge(orig_pmd)) { + if (flags & FAULT_FLAG_WRITE && + !pmd_write(orig_pmd) && + !pmd_trans_splitting(orig_pmd)) + return do_huge_pmd_wp_page(mm, vma, address, + pmd, orig_pmd); + return 0; + } + } + + /* + * Use __pte_alloc instead of pte_alloc_map, because we can't + * run pte_offset_map on the pmd, if an huge pmd could + * materialize from under us from a different thread. + */ + if (unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; + /* if an huge pmd materialized from under us just retry later */ + if (unlikely(pmd_trans_huge(*pmd))) + return 0; + /* + * A regular pmd is established and it can't morph into a huge pmd + * from under us anymore at this point because we hold the mmap_sem + * read mode and khugepaged takes it in write mode. So now it's + * safe to run pte_offset_map(). + */ + pte = pte_offset_map(pmd, address); return handle_pte_fault(mm, vma, address, pte, pmd, flags); } @@ -3296,7 +3405,12 @@ int make_pages_present(unsigned long addr, unsigned long end) vma = find_vma(current->mm, addr); if (!vma) return -ENOMEM; - write = (vma->vm_flags & VM_WRITE) != 0; + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; BUG_ON(addr >= end); BUG_ON(end > vma->vm_end); len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; @@ -3368,6 +3482,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address, goto out; pmd = pmd_offset(pud, address); + VM_BUG_ON(pmd_trans_huge(*pmd)); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; @@ -3608,3 +3723,74 @@ void might_fault(void) } EXPORT_SYMBOL(might_fault); #endif + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +static void clear_gigantic_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page) +{ + int i; + struct page *p = page; + + might_sleep(); + for (i = 0; i < pages_per_huge_page; + i++, p = mem_map_next(p, page, i)) { + cond_resched(); + clear_user_highpage(p, addr + i * PAGE_SIZE); + } +} +void clear_huge_page(struct page *page, + unsigned long addr, unsigned int pages_per_huge_page) +{ + int i; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + clear_gigantic_page(page, addr, pages_per_huge_page); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } +} + +static void copy_user_gigantic_page(struct page *dst, struct page *src, + unsigned long addr, + struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < pages_per_huge_page; ) { + cond_resched(); + copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +void copy_user_huge_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + copy_user_gigantic_page(dst, src, addr, vma, + pages_per_huge_page); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + cond_resched(); + copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2c6523a..e92f047 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -82,9 +82,10 @@ static void release_memory_resource(struct resource *res) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE #ifndef CONFIG_SPARSEMEM_VMEMMAP -static void get_page_bootmem(unsigned long info, struct page *page, int type) +static void get_page_bootmem(unsigned long info, struct page *page, + unsigned long type) { - atomic_set(&page->_mapcount, type); + page->lru.next = (struct list_head *) type; SetPagePrivate(page); set_page_private(page, info); atomic_inc(&page->_count); @@ -94,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type) * so use __ref to tell modpost not to generate a warning */ void __ref put_page_bootmem(struct page *page) { - int type; + unsigned long type; - type = atomic_read(&page->_mapcount); - BUG_ON(type >= -1); + type = (unsigned long) page->lru.next; + BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || + type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); if (atomic_dec_return(&page->_count) == 1) { ClearPagePrivate(page); set_page_private(page, 0); - reset_page_mapcount(page); + INIT_LIST_HEAD(&page->lru); __free_pages_bootmem(page, 0); } @@ -733,7 +735,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) goto out; } /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); + ret = migrate_pages(&source, hotremove_migrate_alloc, 0, + true, true); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 11ff260..368fc9d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -514,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + split_huge_page_pmd(vma->vm_mm, pmd); if (pmd_none_or_clear_bad(pmd)) continue; if (check_pte_range(vma, pmd, addr, next, nodes, @@ -935,7 +936,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, return PTR_ERR(vma); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, 0); + err = migrate_pages(&pagelist, new_node_page, dest, + false, true); if (err) putback_lru_pages(&pagelist); } @@ -1155,7 +1157,8 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, 0); + (unsigned long)vma, + false, true); if (nr_failed) putback_lru_pages(&pagelist); } @@ -1308,16 +1311,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, /* Find the mm_struct */ rcu_read_lock(); - read_lock(&tasklist_lock); task = pid ? find_task_by_vpid(pid) : current; if (!task) { - read_unlock(&tasklist_lock); rcu_read_unlock(); err = -ESRCH; goto out; } mm = get_task_mm(task); - read_unlock(&tasklist_lock); rcu_read_unlock(); err = -EINVAL; @@ -1796,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, } /** - * alloc_page_vma - Allocate a page for a VMA. + * alloc_pages_vma - Allocate a page for a VMA. * * @gfp: * %GFP_USER user allocation. @@ -1805,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * %GFP_FS allocation should not call back into a file system. * %GFP_ATOMIC don't sleep. * + * @order:Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @@ -1818,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * Should be called with the mm_sem of the vma hold. */ struct page * -alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) +alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; @@ -1830,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); mpol_cond_put(pol); - page = alloc_page_interleave(gfp, 0, nid); + page = alloc_page_interleave(gfp, order, nid); put_mems_allowed(); return page; } @@ -1839,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * slow path: ref counted shared policy */ - struct page *page = __alloc_pages_nodemask(gfp, 0, + struct page *page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); put_mems_allowed(); @@ -1848,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * fast path: default or task policy */ - page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + page = __alloc_pages_nodemask(gfp, order, zl, + policy_nodemask(gfp, pol)); put_mems_allowed(); return page; } diff --git a/mm/migrate.c b/mm/migrate.c index 6ae8a66..46fe8cc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -113,6 +113,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, goto out; pmd = pmd_offset(pud, addr); + if (pmd_trans_huge(*pmd)) + goto out; if (!pmd_present(*pmd)) goto out; @@ -246,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - (struct page *)radix_tree_deref_slot(pslot) != page) { + radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -318,7 +320,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - (struct page *)radix_tree_deref_slot(pslot) != page) { + radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -614,13 +616,12 @@ static int move_to_new_page(struct page *newpage, struct page *page, * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, int offlining) + struct page *page, int force, bool offlining, bool sync) { int rc = 0; int *result = NULL; struct page *newpage = get_new_page(page, private, &result); int remap_swapcache = 1; - int rcu_locked = 0; int charge = 0; struct mem_cgroup *mem = NULL; struct anon_vma *anon_vma = NULL; @@ -632,6 +633,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, /* page was freed from under us. So we are done. */ goto move_newpage; } + if (unlikely(PageTransHuge(page))) + if (unlikely(split_huge_page(page))) + goto move_newpage; /* prepare cgroup just returns 0 or -ENOMEM */ rc = -EAGAIN; @@ -639,6 +643,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!trylock_page(page)) { if (!force) goto move_newpage; + + /* + * It's not safe for direct compaction to call lock_page. + * For example, during page readahead pages are added locked + * to the LRU. Later, when the IO completes the pages are + * marked uptodate and unlocked. However, the queueing + * could be merging multiple pages for one bio (e.g. + * mpage_readpages). If an allocation happens for the + * second or third page, the process can end up locking + * the same page twice and deadlocking. Rather than + * trying to be clever about what pages can be locked, + * avoid the use of lock_page for direct compaction + * altogether. + */ + if (current->flags & PF_MEMALLOC) + goto move_newpage; + lock_page(page); } @@ -665,27 +686,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, BUG_ON(charge); if (PageWriteback(page)) { - if (!force) + if (!force || !sync) goto uncharge; wait_on_page_writeback(page); } /* * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, * we cannot notice that anon_vma is freed while we migrates a page. - * This rcu_read_lock() delays freeing anon_vma pointer until the end + * This get_anon_vma() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. */ if (PageAnon(page)) { - rcu_read_lock(); - rcu_locked = 1; - - /* Determine how to safely use anon_vma */ - if (!page_mapped(page)) { - if (!PageSwapCache(page)) - goto rcu_unlock; - + /* + * Only page_lock_anon_vma() understands the subtleties of + * getting a hold on an anon_vma from outside one of its mms. + */ + anon_vma = page_lock_anon_vma(page); + if (anon_vma) { + /* + * Take a reference count on the anon_vma if the + * page is mapped so that it is guaranteed to + * exist when the page is remapped later + */ + get_anon_vma(anon_vma); + page_unlock_anon_vma(anon_vma); + } else if (PageSwapCache(page)) { /* * We cannot be sure that the anon_vma of an unmapped * swapcache page is safe to use because we don't @@ -700,13 +727,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, */ remap_swapcache = 0; } else { - /* - * Take a reference count on the anon_vma if the - * page is mapped so that it is guaranteed to - * exist when the page is remapped later - */ - anon_vma = page_anon_vma(page); - get_anon_vma(anon_vma); + goto uncharge; } } @@ -723,16 +744,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * free the metadata, so the page can be freed. */ if (!page->mapping) { - if (!PageAnon(page) && page_has_private(page)) { - /* - * Go direct to try_to_free_buffers() here because - * a) that's what try_to_release_page() would do anyway - * b) we may be under rcu_read_lock() here, so we can't - * use GFP_KERNEL which is what try_to_release_page() - * needs to be effective. - */ + VM_BUG_ON(PageAnon(page)); + if (page_has_private(page)) { try_to_free_buffers(page); - goto rcu_unlock; + goto uncharge; } goto skip_unmap; } @@ -746,17 +761,14 @@ skip_unmap: if (rc && remap_swapcache) remove_migration_ptes(page, page); -rcu_unlock: /* Drop an anon_vma reference if we took one */ if (anon_vma) drop_anon_vma(anon_vma); - if (rcu_locked) - rcu_read_unlock(); uncharge: if (!charge) - mem_cgroup_end_migration(mem, page, newpage); + mem_cgroup_end_migration(mem, page, newpage, rc == 0); unlock: unlock_page(page); @@ -810,12 +822,11 @@ move_newpage: */ static int unmap_and_move_huge_page(new_page_t get_new_page, unsigned long private, struct page *hpage, - int force, int offlining) + int force, bool offlining, bool sync) { int rc = 0; int *result = NULL; struct page *new_hpage = get_new_page(hpage, private, &result); - int rcu_locked = 0; struct anon_vma *anon_vma = NULL; if (!new_hpage) @@ -824,18 +835,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, rc = -EAGAIN; if (!trylock_page(hpage)) { - if (!force) + if (!force || !sync) goto out; lock_page(hpage); } if (PageAnon(hpage)) { - rcu_read_lock(); - rcu_locked = 1; - - if (page_mapped(hpage)) { - anon_vma = page_anon_vma(hpage); - atomic_inc(&anon_vma->external_refcount); + anon_vma = page_lock_anon_vma(hpage); + if (anon_vma) { + get_anon_vma(anon_vma); + page_unlock_anon_vma(anon_vma); } } @@ -847,16 +856,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (rc) remove_migration_ptes(hpage, hpage); - if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, - &anon_vma->lock)) { - int empty = list_empty(&anon_vma->head); - spin_unlock(&anon_vma->lock); - if (empty) - anon_vma_free(anon_vma); - } - - if (rcu_locked) - rcu_read_unlock(); + if (anon_vma) + drop_anon_vma(anon_vma); out: unlock_page(hpage); @@ -892,7 +893,8 @@ out: * Return: Number of pages not migrated or error code. */ int migrate_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining) + new_page_t get_new_page, unsigned long private, bool offlining, + bool sync) { int retry = 1; int nr_failed = 0; @@ -912,7 +914,8 @@ int migrate_pages(struct list_head *from, cond_resched(); rc = unmap_and_move(get_new_page, private, - page, pass > 2, offlining); + page, pass > 2, offlining, + sync); switch(rc) { case -ENOMEM: @@ -941,7 +944,8 @@ out: } int migrate_huge_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining) + new_page_t get_new_page, unsigned long private, bool offlining, + bool sync) { int retry = 1; int nr_failed = 0; @@ -957,7 +961,8 @@ int migrate_huge_pages(struct list_head *from, cond_resched(); rc = unmap_and_move_huge_page(get_new_page, - private, page, pass > 2, offlining); + private, page, pass > 2, offlining, + sync); switch(rc) { case -ENOMEM: @@ -1042,7 +1047,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) goto set_status; - page = follow_page(vma, pp->addr, FOLL_GET); + page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); err = PTR_ERR(page); if (IS_ERR(page)) @@ -1090,7 +1095,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0); + (unsigned long)pm, 0, true); if (err) putback_lru_pages(&pagelist); } diff --git a/mm/mincore.c b/mm/mincore.c index 9ac42dc..a4e6b9d 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -154,6 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (mincore_huge_pmd(vma, pmd, addr, next, vec)) { + vec += (next - addr) >> PAGE_SHIFT; + continue; + } + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) mincore_unmapped_range(vma, addr, next, vec); else diff --git a/mm/mlock.c b/mm/mlock.c index b70919c..13e81ee 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -155,13 +155,12 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add * vma->vm_mm->mmap_sem must be held for at least read. */ static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + int *nonblocking) { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; - struct page *pages[16]; /* 16 gives a reasonable batch */ int nr_pages = (end - start) / PAGE_SIZE; - int ret = 0; int gup_flags; VM_BUG_ON(start & ~PAGE_MASK); @@ -170,73 +169,26 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(end > vma->vm_end); VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - gup_flags = FOLL_TOUCH | FOLL_GET; - if (vma->vm_flags & VM_WRITE) + gup_flags = FOLL_TOUCH; + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; + if (vma->vm_flags & VM_LOCKED) + gup_flags |= FOLL_MLOCK; + /* We don't try to access the guard page of a stack vma */ if (stack_guard_page(vma, start)) { addr += PAGE_SIZE; nr_pages--; } - while (nr_pages > 0) { - int i; - - cond_resched(); - - /* - * get_user_pages makes pages present if we are - * setting mlock. and this extra reference count will - * disable migration of this page. However, page may - * still be truncated out from under us. - */ - ret = __get_user_pages(current, mm, addr, - min_t(int, nr_pages, ARRAY_SIZE(pages)), - gup_flags, pages, NULL); - /* - * This can happen for, e.g., VM_NONLINEAR regions before - * a page has been allocated and mapped at a given offset, - * or for addresses that map beyond end of a file. - * We'll mlock the pages if/when they get faulted in. - */ - if (ret < 0) - break; - - lru_add_drain(); /* push cached pages to LRU */ - - for (i = 0; i < ret; i++) { - struct page *page = pages[i]; - - if (page->mapping) { - /* - * That preliminary check is mainly to avoid - * the pointless overhead of lock_page on the - * ZERO_PAGE: which might bounce very badly if - * there is contention. However, we're still - * dirtying its cacheline with get/put_page: - * we'll add another __get_user_pages flag to - * avoid it if that case turns out to matter. - */ - lock_page(page); - /* - * Because we lock page here and migration is - * blocked by the elevated reference, we need - * only check for file-cache page truncation. - */ - if (page->mapping) - mlock_vma_page(page); - unlock_page(page); - } - put_page(page); /* ref from get_user_pages() */ - } - - addr += ret * PAGE_SIZE; - nr_pages -= ret; - ret = 0; - } - - return ret; /* 0 or negative error code */ + return __get_user_pages(current, mm, addr, nr_pages, gup_flags, + NULL, NULL, nonblocking); } /* @@ -280,7 +232,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))) { - __mlock_vma_pages_range(vma, start, end); + __mlock_vma_pages_range(vma, start, end, NULL); /* Hide errors from mmap() and other callers */ return 0; @@ -372,18 +324,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, int ret = 0; int lock = newflags & VM_LOCKED; - if (newflags == vma->vm_flags || - (vma->vm_flags & (VM_IO | VM_PFNMAP))) + if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || + is_vm_hugetlb_page(vma) || vma == get_gate_vma(current)) goto out; /* don't set VM_LOCKED, don't count */ - if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || - is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current)) { - if (lock) - make_pages_present(start, end); - goto out; /* don't set VM_LOCKED, don't count */ - } - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); @@ -419,14 +363,10 @@ success: * set VM_LOCKED, __mlock_vma_pages_range will bring it back. */ - if (lock) { + if (lock) vma->vm_flags = newflags; - ret = __mlock_vma_pages_range(vma, start, end); - if (ret < 0) - ret = __mlock_posix_error_return(ret); - } else { + else munlock_vma_pages_range(vma, start, end); - } out: *prev = vma; @@ -439,7 +379,8 @@ static int do_mlock(unsigned long start, size_t len, int on) struct vm_area_struct * vma, * prev; int error; - len = PAGE_ALIGN(len); + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; if (end < start) return -EINVAL; @@ -482,6 +423,62 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } +static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) +{ + struct mm_struct *mm = current->mm; + unsigned long end, nstart, nend; + struct vm_area_struct *vma = NULL; + int locked = 0; + int ret = 0; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(len != PAGE_ALIGN(len)); + end = start + len; + + for (nstart = start; nstart < end; nstart = nend) { + /* + * We want to fault in pages for [nstart; end) address range. + * Find first corresponding VMA. + */ + if (!locked) { + locked = 1; + down_read(&mm->mmap_sem); + vma = find_vma(mm, nstart); + } else if (nstart >= vma->vm_end) + vma = vma->vm_next; + if (!vma || vma->vm_start >= end) + break; + /* + * Set [nstart; nend) to intersection of desired address + * range with the first VMA. Also, skip undesirable VMA types. + */ + nend = min(end, vma->vm_end); + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + continue; + if (nstart < vma->vm_start) + nstart = vma->vm_start; + /* + * Now fault in a range of pages. __mlock_vma_pages_range() + * double checks the vma flags, so that it won't mlock pages + * if the vma was already munlocked. + */ + ret = __mlock_vma_pages_range(vma, nstart, nend, &locked); + if (ret < 0) { + if (ignore_errors) { + ret = 0; + continue; /* continue at next VMA */ + } + ret = __mlock_posix_error_return(ret); + break; + } + nend = nstart + ret * PAGE_SIZE; + ret = 0; + } + if (locked) + up_read(&mm->mmap_sem); + return ret; /* 0 or negative error code */ +} + SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { unsigned long locked; @@ -507,6 +504,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = do_mlock(start, len, 1); up_write(¤t->mm->mmap_sem); + if (!error) + error = do_mlock_pages(start, len, 0); return error; } @@ -571,6 +570,10 @@ SYSCALL_DEFINE1(mlockall, int, flags) capable(CAP_IPC_LOCK)) ret = do_mlockall(flags); up_write(¤t->mm->mmap_sem); + if (!ret && (flags & MCL_CURRENT)) { + /* Ignore errors */ + do_mlock_pages(0, TASK_SIZE, 1); + } out: return ret; } diff --git a/mm/mmap.c b/mm/mmap.c index 50a4aa0..2ec8eb5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -253,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) down_write(&mm->mmap_sem); #ifdef CONFIG_COMPAT_BRK - min_brk = mm->end_code; + /* + * CONFIG_COMPAT_BRK can still be overridden by setting + * randomize_va_space to 2, which will still cause mm->start_brk + * to be arbitrarily shifted + */ + if (mm->start_brk > PAGE_ALIGN(mm->end_data)) + min_brk = mm->start_brk; + else + min_brk = mm->end_data; #else min_brk = mm->start_brk; #endif @@ -588,6 +597,8 @@ again: remove_next = 1 + (end > next->vm_end); } } + vma_adjust_trans_huge(vma, start, end, adjust_next); + /* * When changing only vma->vm_end, we don't really need anon_vma * lock. This is a fairly rare case by itself, but the anon_vma @@ -815,6 +826,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, end, prev->vm_pgoff, NULL); if (err) return NULL; + khugepaged_enter_vma_merge(prev); return prev; } @@ -833,6 +845,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, next->vm_pgoff - pglen, NULL); if (err) return NULL; + khugepaged_enter_vma_merge(area); return area; } @@ -1761,6 +1774,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } vma_unlock_anon_vma(vma); + khugepaged_enter_vma_merge(vma); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -1808,6 +1822,7 @@ static int expand_downwards(struct vm_area_struct *vma, } } vma_unlock_anon_vma(vma); + khugepaged_enter_vma_merge(vma); return error; } diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 438951d..8d032de 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, return young; } +int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + int young = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->test_young) { + young = mn->ops->test_young(mn, mm, address); + if (young) + break; + } + } + rcu_read_unlock(); + + return young; +} + void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { diff --git a/mm/mmzone.c b/mm/mmzone.c index e35bfb8..f5b7d17 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn, return 1; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ - -#ifdef CONFIG_SMP -/* Called when a more accurate view of NR_FREE_PAGES is needed */ -unsigned long zone_nr_free_pages(struct zone *zone) -{ - unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); - - /* - * While kswapd is awake, it is considered the zone is under some - * memory pressure. Under pressure, there is a risk that - * per-cpu-counter-drift will allow the min watermark to be breached - * potentially causing a live-lock. While kswapd is awake and - * free pages are low, get a better estimate for free pages - */ - if (nr_free_pages < zone->percpu_drift_mark && - !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) - return zone_page_state_snapshot(zone, NR_FREE_PAGES); - - return nr_free_pages; -} -#endif /* CONFIG_SMP */ diff --git a/mm/mprotect.c b/mm/mprotect.c index 4c51338..5a688a2 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, pte_unmap_unlock(pte - 1, ptl); } -static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, +static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { @@ -88,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (next - addr != HPAGE_PMD_SIZE) + split_huge_page_pmd(vma->vm_mm, pmd); + else if (change_huge_pmd(vma, pmd, addr, newprot)) + continue; + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) continue; - change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); + change_pte_range(vma->vm_mm, pmd, addr, next, newprot, + dirty_accountable); } while (pmd++, addr = next, addr != end); } -static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, +static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { @@ -106,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); + change_pmd_range(vma, pud, addr, next, newprot, + dirty_accountable); } while (pud++, addr = next, addr != end); } @@ -126,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); + change_pud_range(vma, pgd, addr, next, newprot, + dirty_accountable); } while (pgd++, addr = next, addr != end); flush_tlb_range(vma, start, end); } diff --git a/mm/mremap.c b/mm/mremap.c index 563fbdd..9925b63 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -41,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) return NULL; pmd = pmd_offset(pud, addr); + split_huge_page_pmd(mm, pmd); if (pmd_none_or_clear_bad(pmd)) return NULL; return pmd; } -static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) +static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr) { pgd_t *pgd; pud_t *pud; @@ -62,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) if (!pmd) return NULL; - if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) + VM_BUG_ON(pmd_trans_huge(*pmd)); + if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr)) return NULL; return pmd; @@ -147,7 +150,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, old_pmd = get_old_pmd(vma->vm_mm, old_addr); if (!old_pmd) continue; - new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); + new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; next = (new_addr + PMD_SIZE) & PMD_MASK; diff --git a/mm/nommu.c b/mm/nommu.c index ef4045d..f59e142 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -127,7 +127,8 @@ unsigned int kobjsize(const void *objp) int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas) + struct page **pages, struct vm_area_struct **vmas, + int *retry) { struct vm_area_struct *vma; unsigned long vm_flags; @@ -185,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, + NULL); } EXPORT_SYMBOL(get_user_pages); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b5d8a1f..2cb01f6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -410,9 +410,12 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { unsigned long background; unsigned long dirty; - unsigned long available_memory = determine_dirtyable_memory(); + unsigned long uninitialized_var(available_memory); struct task_struct *tsk; + if (!vm_dirty_bytes || !dirty_background_bytes) + available_memory = determine_dirtyable_memory(); + if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); else @@ -1103,7 +1106,7 @@ EXPORT_SYMBOL(write_one_page); int __set_page_dirty_no_writeback(struct page *page) { if (!PageDirty(page)) - SetPageDirty(page); + return !TestSetPageDirty(page); return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 826ba69..90c1439 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -357,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order) } } +/* update __split_huge_page_refcount if you change this function */ static int destroy_compound_page(struct page *page, unsigned long order) { int i; @@ -426,18 +427,10 @@ static inline void rmv_page_order(struct page *page) * * Assumption: *_mem_map is contiguous at least up to MAX_ORDER */ -static inline struct page * -__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) -{ - unsigned long buddy_idx = page_idx ^ (1 << order); - - return page + (buddy_idx - page_idx); -} - static inline unsigned long -__find_combined_index(unsigned long page_idx, unsigned int order) +__find_buddy_index(unsigned long page_idx, unsigned int order) { - return (page_idx & ~(1 << order)); + return page_idx ^ (1 << order); } /* @@ -448,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * - * For recording whether a page is in the buddy system, we use PG_buddy. - * Setting, clearing, and testing PG_buddy is serialized by zone->lock. + * For recording whether a page is in the buddy system, we set ->_mapcount -2. + * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ @@ -482,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with PG_buddy. Page's + * free pages of length of (1 << order) and marked with _mapcount -2. Page's * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were @@ -499,6 +492,7 @@ static inline void __free_one_page(struct page *page, { unsigned long page_idx; unsigned long combined_idx; + unsigned long uninitialized_var(buddy_idx); struct page *buddy; if (unlikely(PageCompound(page))) @@ -513,7 +507,8 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON(bad_range(zone, page)); while (order < MAX_ORDER-1) { - buddy = __page_find_buddy(page, page_idx, order); + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) break; @@ -521,7 +516,7 @@ static inline void __free_one_page(struct page *page, list_del(&buddy->lru); zone->free_area[order].nr_free--; rmv_page_order(buddy); - combined_idx = __find_combined_index(page_idx, order); + combined_idx = buddy_idx & page_idx; page = page + (combined_idx - page_idx); page_idx = combined_idx; order++; @@ -538,9 +533,10 @@ static inline void __free_one_page(struct page *page, */ if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { struct page *higher_page, *higher_buddy; - combined_idx = __find_combined_index(page_idx, order); - higher_page = page + combined_idx - page_idx; - higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); + combined_idx = buddy_idx & page_idx; + higher_page = page + (combined_idx - page_idx); + buddy_idx = __find_buddy_index(combined_idx, order + 1); + higher_buddy = page + (buddy_idx - combined_idx); if (page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); @@ -651,13 +647,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order) trace_mm_page_free_direct(page, order); kmemcheck_free_shadow(page, order); - for (i = 0; i < (1 << order); i++) { - struct page *pg = page + i; - - if (PageAnon(pg)) - pg->mapping = NULL; - bad += free_pages_check(pg); - } + if (PageAnon(page)) + page->mapping = NULL; + for (i = 0; i < (1 << order); i++) + bad += free_pages_check(page + i); if (bad) return false; @@ -1460,24 +1453,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) #endif /* CONFIG_FAIL_PAGE_ALLOC */ /* - * Return 1 if free pages are above 'mark'. This takes into account the order + * Return true if free pages are above 'mark'. This takes into account the order * of the allocation. */ -int zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags) +static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags, long free_pages) { /* free_pages my go negative - that's OK */ long min = mark; - long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; int o; + free_pages -= (1 << order) + 1; if (alloc_flags & ALLOC_HIGH) min -= min / 2; if (alloc_flags & ALLOC_HARDER) min -= min / 4; if (free_pages <= min + z->lowmem_reserve[classzone_idx]) - return 0; + return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ free_pages -= z->free_area[o].nr_free << o; @@ -1486,9 +1479,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, min >>= 1; if (free_pages <= min) - return 0; + return false; } - return 1; + return true; +} + +bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + zone_page_state(z, NR_FREE_PAGES)); +} + +bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + long free_pages = zone_page_state(z, NR_FREE_PAGES); + + if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) + free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); + + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + free_pages); } #ifdef CONFIG_NUMA @@ -1793,15 +1805,18 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int migratetype, unsigned long *did_some_progress, + bool sync_migration) { struct page *page; if (!order || compaction_deferred(preferred_zone)) return NULL; + current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask); + nodemask, sync_migration); + current->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { /* Page migration frees to the PCP lists but we want merging */ @@ -1837,7 +1852,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int migratetype, unsigned long *did_some_progress, + bool sync_migration) { return NULL; } @@ -1852,23 +1868,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, { struct page *page = NULL; struct reclaim_state reclaim_state; - struct task_struct *p = current; bool drained = false; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - p->flags |= PF_MEMALLOC; + current->flags |= PF_MEMALLOC; lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; + current->reclaim_state = &reclaim_state; *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); - p->reclaim_state = NULL; + current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); - p->flags &= ~PF_MEMALLOC; + current->flags &= ~PF_MEMALLOC; cond_resched(); @@ -1920,19 +1935,19 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, static inline void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, - enum zone_type high_zoneidx) + enum zone_type high_zoneidx, + enum zone_type classzone_idx) { struct zoneref *z; struct zone *zone; for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - wakeup_kswapd(zone, order); + wakeup_kswapd(zone, order, classzone_idx); } static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { - struct task_struct *p = current; int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; const gfp_t wait = gfp_mask & __GFP_WAIT; @@ -1948,18 +1963,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); if (!wait) { - alloc_flags |= ALLOC_HARDER; + /* + * Not worth trying to allocate harder for + * __GFP_NOMEMALLOC even if it can't schedule. + */ + if (!(gfp_mask & __GFP_NOMEMALLOC)) + alloc_flags |= ALLOC_HARDER; /* * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ alloc_flags &= ~ALLOC_CPUSET; - } else if (unlikely(rt_task(p)) && !in_interrupt()) + } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { if (!in_interrupt() && - ((p->flags & PF_MEMALLOC) || + ((current->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))) alloc_flags |= ALLOC_NO_WATERMARKS; } @@ -1978,7 +1998,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; - struct task_struct *p = current; + bool sync_migration = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2003,7 +2023,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - wake_all_kswapd(order, zonelist, high_zoneidx); + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're below the kswapd watermark and have kicked background @@ -2034,21 +2056,26 @@ rebalance: goto nopage; /* Avoid recursion of direct reclaim */ - if (p->flags & PF_MEMALLOC) + if (current->flags & PF_MEMALLOC) goto nopage; /* Avoid allocations with no watermarks from looping endlessly */ if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) goto nopage; - /* Try direct compaction */ + /* + * Try direct compaction. The first pass is asynchronous. Subsequent + * attempts after direct reclaim are synchronous + */ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + migratetype, &did_some_progress, + sync_migration); if (page) goto got_pg; + sync_migration = true; /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, @@ -2102,13 +2129,27 @@ rebalance: /* Wait for some write requests to complete then retry */ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; + } else { + /* + * High-order allocations do not necessarily loop after + * direct reclaim and reclaim/compaction depends on compaction + * being called after reclaim so call directly if necessary + */ + page = __alloc_pages_direct_compact(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, + alloc_flags, preferred_zone, + migratetype, &did_some_progress, + sync_migration); + if (page) + goto got_pg; } nopage: if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { printk(KERN_WARNING "%s: page allocation failure." " order:%d, mode:0x%x\n", - p->comm, order, gfp_mask); + current->comm, order, gfp_mask); dump_stack(); show_mem(); } @@ -2442,7 +2483,7 @@ void show_free_areas(void) " all_unreclaimable? %s" "\n", zone->name, - K(zone_nr_free_pages(zone)), + K(zone_page_state(zone, NR_FREE_PAGES)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), @@ -2585,9 +2626,16 @@ static int __parse_numa_zonelist_order(char *s) static __init int setup_numa_zonelist_order(char *s) { - if (s) - return __parse_numa_zonelist_order(s); - return 0; + int ret; + + if (!s) + return 0; + + ret = __parse_numa_zonelist_order(s); + if (ret == 0) + strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); + + return ret; } early_param("numa_zonelist_order", setup_numa_zonelist_order); @@ -5517,7 +5565,6 @@ static struct trace_print_flags pageflag_names[] = { {1UL << PG_swapcache, "swapcache" }, {1UL << PG_mappedtodisk, "mappedtodisk" }, {1UL << PG_reclaim, "reclaim" }, - {1UL << PG_buddy, "buddy" }, {1UL << PG_swapbacked, "swapbacked" }, {1UL << PG_unevictable, "unevictable" }, #ifdef CONFIG_MMU @@ -5565,7 +5612,7 @@ void dump_page(struct page *page) { printk(KERN_ALERT "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", - page, page_count(page), page_mapcount(page), + page, atomic_read(&page->_count), page_mapcount(page), page->mapping, page->index); dump_page_flags(page->flags); } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 38cc58b..7cfa6ae 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -34,6 +34,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + split_huge_page_pmd(walk->mm, pmd); if (pmd_none_or_clear_bad(pmd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 7d9c1d0..ea53496 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) return NULL; vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, - pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); + pcpu_nr_groups, pcpu_atom_size); if (!vms) { pcpu_free_chunk(chunk); return NULL; diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c new file mode 100644 index 0000000..d030548 --- /dev/null +++ b/mm/pgtable-generic.c @@ -0,0 +1,123 @@ +/* + * mm/pgtable-generic.c + * + * Generic pgtable methods declared in asm-generic/pgtable.h + * + * Copyright (C) 2010 Linus Torvalds + */ + +#include +#include + +#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +/* + * Only sets the access flags (dirty, accessed, and + * writable). Furthermore, we know it always gets set to a "more + * permissive" setting, which allows most architectures to optimize + * this. We return whether the PTE actually changed, which in turn + * instructs the caller to do things like update__mmu_cache. This + * used to be done in the caller, but sparc needs minor faults to + * force that call on sun4c so we changed this macro slightly + */ +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int changed = !pte_same(*ptep, entry); + if (changed) { + set_pte_at(vma->vm_mm, address, ptep, entry); + flush_tlb_page(vma, address); + } + return changed; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + int changed = !pmd_same(*pmdp, entry); + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + if (changed) { + set_pmd_at(vma->vm_mm, address, pmdp, entry); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } + return changed; +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ + BUG(); + return 0; +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} +#endif + +#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + int young; + young = ptep_test_and_clear_young(vma, address, ptep); + if (young) + flush_tlb_page(vma, address); + return young; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int young; +#ifndef CONFIG_TRANSPARENT_HUGEPAGE + BUG(); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + young = pmdp_test_and_clear_young(vma, address, pmdp); + if (young) + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return young; +} +#endif + +#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH +pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ + pte_t pte; + pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); + flush_tlb_page(vma, address); + return pte; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH +pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; +#ifndef CONFIG_TRANSPARENT_HUGEPAGE + BUG(); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return pmd; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH +pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pmd_t pmd = pmd_mksplitting(*pmdp); + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + set_pmd_at(vma->vm_mm, address, pmdp, pmd); + /* tlb flush only to serialize against gup-fast */ + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ + BUG(); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} +#endif diff --git a/mm/rmap.c b/mm/rmap.c index c95d2ba..f21f4a1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -177,6 +177,10 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, list_add(&avc->same_vma, &vma->anon_vma_chain); anon_vma_lock(anon_vma); + /* + * It's critical to add new vmas to the tail of the anon_vma, + * see comment in huge_memory.c:__split_huge_page(). + */ list_add_tail(&avc->same_anon_vma, &anon_vma->head); anon_vma_unlock(anon_vma); } @@ -360,7 +364,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) * Returns virtual address or -EFAULT if page's index/offset is not * within the range mapped the @vma. */ -static inline unsigned long +inline unsigned long vma_address(struct page *page, struct vm_area_struct *vma) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -435,6 +439,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return NULL; + if (pmd_trans_huge(*pmd)) + return NULL; pte = pte_offset_map(pmd, address); /* Make a quick check before getting the lock */ @@ -489,35 +495,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; - pte_t *pte; - spinlock_t *ptl; int referenced = 0; - pte = page_check_address(page, mm, address, &ptl, 0); - if (!pte) - goto out; - /* * Don't want to elevate referenced for mlocked page that gets this far, * in order that it progresses to try_to_unmap and is moved to the * unevictable list. */ if (vma->vm_flags & VM_LOCKED) { - *mapcount = 1; /* break early from loop */ + *mapcount = 0; /* break early from loop */ *vm_flags |= VM_LOCKED; - goto out_unmap; - } - - if (ptep_clear_flush_young_notify(vma, address, pte)) { - /* - * Don't treat a reference through a sequentially read - * mapping as such. If the page has been used in - * another mapping, we will catch it; if this other - * mapping is already gone, the unmap path will have - * set PG_referenced or activated the page. - */ - if (likely(!VM_SequentialReadHint(vma))) - referenced++; + goto out; } /* Pretend the page is referenced if the task has the @@ -526,9 +514,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, rwsem_is_locked(&mm->mmap_sem)) referenced++; -out_unmap: + if (unlikely(PageTransHuge(page))) { + pmd_t *pmd; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_FLAG); + if (pmd && !pmd_trans_splitting(*pmd) && + pmdp_clear_flush_young_notify(vma, address, pmd)) + referenced++; + spin_unlock(&mm->page_table_lock); + } else { + pte_t *pte; + spinlock_t *ptl; + + pte = page_check_address(page, mm, address, &ptl, 0); + if (!pte) + goto out; + + if (ptep_clear_flush_young_notify(vma, address, pte)) { + /* + * Don't treat a reference through a sequentially read + * mapping as such. If the page has been used in + * another mapping, we will catch it; if this other + * mapping is already gone, the unmap path will have + * set PG_referenced or activated the page. + */ + if (likely(!VM_SequentialReadHint(vma))) + referenced++; + } + pte_unmap_unlock(pte, ptl); + } + (*mapcount)--; - pte_unmap_unlock(pte, ptl); if (referenced) *vm_flags |= vma->vm_flags; @@ -864,8 +882,13 @@ void do_page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, int exclusive) { int first = atomic_inc_and_test(&page->_mapcount); - if (first) - __inc_zone_page_state(page, NR_ANON_PAGES); + if (first) { + if (!PageTransHuge(page)) + __inc_zone_page_state(page, NR_ANON_PAGES); + else + __inc_zone_page_state(page, + NR_ANON_TRANSPARENT_HUGEPAGES); + } if (unlikely(PageKsm(page))) return; @@ -893,7 +916,10 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); SetPageSwapBacked(page); atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ - __inc_zone_page_state(page, NR_ANON_PAGES); + if (!PageTransHuge(page)) + __inc_zone_page_state(page, NR_ANON_PAGES); + else + __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __page_set_anon_rmap(page, vma, address, 1); if (page_evictable(page, vma)) lru_cache_add_lru(page, LRU_ACTIVE_ANON); @@ -911,7 +937,7 @@ void page_add_file_rmap(struct page *page) { if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_file_mapped(page, 1); + mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); } } @@ -946,10 +972,14 @@ void page_remove_rmap(struct page *page) return; if (PageAnon(page)) { mem_cgroup_uncharge_page(page); - __dec_zone_page_state(page, NR_ANON_PAGES); + if (!PageTransHuge(page)) + __dec_zone_page_state(page, NR_ANON_PAGES); + else + __dec_zone_page_state(page, + NR_ANON_TRANSPARENT_HUGEPAGES); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_file_mapped(page, -1); + mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); } /* * It would be tidy to reset the PageAnon mapping here, @@ -1202,7 +1232,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; } -static bool is_vma_temporary_stack(struct vm_area_struct *vma) +bool is_vma_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); @@ -1400,6 +1430,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) int ret; BUG_ON(!PageLocked(page)); + VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); if (unlikely(PageKsm(page))) ret = try_to_unmap_ksm(page, flags); diff --git a/mm/slub.c b/mm/slub.c index 008cd74..c7ef007 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3636,7 +3636,7 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf + len, "%7ld ", l->count); if (l->addr) - len += sprint_symbol(buf + len, (unsigned long)l->addr); + len += sprintf(buf + len, "%pS", (void *)l->addr); else len += sprintf(buf + len, ""); @@ -3946,12 +3946,9 @@ SLAB_ATTR(min_partial); static ssize_t ctor_show(struct kmem_cache *s, char *buf) { - if (s->ctor) { - int n = sprint_symbol(buf, (unsigned long)s->ctor); - - return n + sprintf(buf + n, "\n"); - } - return 0; + if (!s->ctor) + return 0; + return sprintf(buf, "%pS\n", s->ctor); } SLAB_ATTR_RO(ctor); diff --git a/mm/sparse.c b/mm/sparse.c index 95ac219..9325020 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) static void free_map_bootmem(struct page *page, unsigned long nr_pages) { unsigned long maps_section_nr, removing_section_nr, i; - int magic; + unsigned long magic; for (i = 0; i < nr_pages; i++, page++) { - magic = atomic_read(&page->_mapcount); + magic = (unsigned long) page->lru.next; BUG_ON(magic == NODE_INFO); diff --git a/mm/swap.c b/mm/swap.c index 3f48542..bbc1ce9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -56,17 +56,97 @@ static void __page_cache_release(struct page *page) del_page_from_lru(zone, page); spin_unlock_irqrestore(&zone->lru_lock, flags); } +} + +static void __put_single_page(struct page *page) +{ + __page_cache_release(page); free_hot_cold_page(page, 0); } -static void put_compound_page(struct page *page) +static void __put_compound_page(struct page *page) { - page = compound_head(page); - if (put_page_testzero(page)) { - compound_page_dtor *dtor; + compound_page_dtor *dtor; - dtor = get_compound_page_dtor(page); - (*dtor)(page); + __page_cache_release(page); + dtor = get_compound_page_dtor(page); + (*dtor)(page); +} + +static void put_compound_page(struct page *page) +{ + if (unlikely(PageTail(page))) { + /* __split_huge_page_refcount can run under us */ + struct page *page_head = page->first_page; + smp_rmb(); + /* + * If PageTail is still set after smp_rmb() we can be sure + * that the page->first_page we read wasn't a dangling pointer. + * See __split_huge_page_refcount() smp_wmb(). + */ + if (likely(PageTail(page) && get_page_unless_zero(page_head))) { + unsigned long flags; + /* + * Verify that our page_head wasn't converted + * to a a regular page before we got a + * reference on it. + */ + if (unlikely(!PageHead(page_head))) { + /* PageHead is cleared after PageTail */ + smp_rmb(); + VM_BUG_ON(PageTail(page)); + goto out_put_head; + } + /* + * Only run compound_lock on a valid PageHead, + * after having it pinned with + * get_page_unless_zero() above. + */ + smp_mb(); + /* page_head wasn't a dangling pointer */ + flags = compound_lock_irqsave(page_head); + if (unlikely(!PageTail(page))) { + /* __split_huge_page_refcount run before us */ + compound_unlock_irqrestore(page_head, flags); + VM_BUG_ON(PageHead(page_head)); + out_put_head: + if (put_page_testzero(page_head)) + __put_single_page(page_head); + out_put_single: + if (put_page_testzero(page)) + __put_single_page(page); + return; + } + VM_BUG_ON(page_head != page->first_page); + /* + * We can release the refcount taken by + * get_page_unless_zero now that + * split_huge_page_refcount is blocked on the + * compound_lock. + */ + if (put_page_testzero(page_head)) + VM_BUG_ON(1); + /* __split_huge_page_refcount will wait now */ + VM_BUG_ON(atomic_read(&page->_count) <= 0); + atomic_dec(&page->_count); + VM_BUG_ON(atomic_read(&page_head->_count) <= 0); + compound_unlock_irqrestore(page_head, flags); + if (put_page_testzero(page_head)) { + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } + } else { + /* page_head is a dangling pointer */ + VM_BUG_ON(PageTail(page)); + goto out_put_single; + } + } else if (put_page_testzero(page)) { + if (PageHead(page)) + __put_compound_page(page); + else + __put_single_page(page); } } @@ -75,7 +155,7 @@ void put_page(struct page *page) if (unlikely(PageCompound(page))) put_compound_page(page); else if (put_page_testzero(page)) - __page_cache_release(page); + __put_single_page(page); } EXPORT_SYMBOL(put_page); @@ -98,15 +178,13 @@ void put_pages_list(struct list_head *pages) } EXPORT_SYMBOL(put_pages_list); -/* - * pagevec_move_tail() must be called with IRQ disabled. - * Otherwise this may cause nasty races. - */ -static void pagevec_move_tail(struct pagevec *pvec) +static void pagevec_lru_move_fn(struct pagevec *pvec, + void (*move_fn)(struct page *page, void *arg), + void *arg) { int i; - int pgmoved = 0; struct zone *zone = NULL; + unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -114,29 +192,49 @@ static void pagevec_move_tail(struct pagevec *pvec) if (pagezone != zone) { if (zone) - spin_unlock(&zone->lru_lock); + spin_unlock_irqrestore(&zone->lru_lock, flags); zone = pagezone; - spin_lock(&zone->lru_lock); - } - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int lru = page_lru_base_type(page); - list_move_tail(&page->lru, &zone->lru[lru].list); - pgmoved++; + spin_lock_irqsave(&zone->lru_lock, flags); } + + (*move_fn)(page, arg); } if (zone) - spin_unlock(&zone->lru_lock); - __count_vm_events(PGROTATED, pgmoved); - release_pages(pvec->pages, pvec->nr, pvec->cold); + spin_unlock_irqrestore(&zone->lru_lock, flags); + release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); pagevec_reinit(pvec); } +static void pagevec_move_tail_fn(struct page *page, void *arg) +{ + int *pgmoved = arg; + struct zone *zone = page_zone(page); + + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + int lru = page_lru_base_type(page); + list_move_tail(&page->lru, &zone->lru[lru].list); + (*pgmoved)++; + } +} + +/* + * pagevec_move_tail() must be called with IRQ disabled. + * Otherwise this may cause nasty races. + */ +static void pagevec_move_tail(struct pagevec *pvec) +{ + int pgmoved = 0; + + pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved); + __count_vm_events(PGROTATED, pgmoved); +} + /* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the * inactive list. */ -void rotate_reclaimable_page(struct page *page) +void rotate_reclaimable_page(struct page *page) { if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && !PageUnevictable(page) && PageLRU(page)) { @@ -173,27 +271,94 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page, } /* - * FIXME: speed this up? + * A page will go to active list either by activate_page or putback_lru_page. + * In the activate_page case, the page hasn't active bit set. The page might + * not in LRU list because it's isolated before it gets a chance to be moved to + * active list. The window is small because pagevec just stores several pages. + * For such case, we do nothing for such page. + * In the putback_lru_page case, the page isn't in lru list but has active + * bit set */ -void activate_page(struct page *page) +static void __activate_page(struct page *page, void *arg) { struct zone *zone = page_zone(page); + int file = page_is_file_cache(page); + int lru = page_lru_base_type(page); + bool putback = !PageLRU(page); - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_cache(page); - int lru = page_lru_base_type(page); + /* The page is isolated before it's moved to active list */ + if (!PageLRU(page) && !PageActive(page)) + return; + if ((PageLRU(page) && PageActive(page)) || PageUnevictable(page)) + return; + + if (!putback) del_page_from_lru_list(zone, page, lru); + else + SetPageLRU(page); - SetPageActive(page); - lru += LRU_ACTIVE; - add_page_to_lru_list(zone, page, lru); - __count_vm_event(PGACTIVATE); + SetPageActive(page); + lru += LRU_ACTIVE; + add_page_to_lru_list(zone, page, lru); - update_page_reclaim_stat(zone, page, file, 1); + if (putback) + return; + __count_vm_event(PGACTIVATE); + update_page_reclaim_stat(zone, page, file, 1); +} + +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); + +static void activate_page_drain(int cpu) +{ + struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); + + if (pagevec_count(pvec)) + pagevec_lru_move_fn(pvec, __activate_page, NULL); +} + +void activate_page(struct page *page) +{ + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + pagevec_lru_move_fn(pvec, __activate_page, NULL); + put_cpu_var(activate_page_pvecs); + } +} + +/* Caller should hold zone->lru_lock */ +int putback_active_lru_page(struct zone *zone, struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); + + if (!pagevec_add(pvec, page)) { + spin_unlock_irq(&zone->lru_lock); + pagevec_lru_move_fn(pvec, __activate_page, NULL); + spin_lock_irq(&zone->lru_lock); } + put_cpu_var(activate_page_pvecs); + return 1; +} + +#else +static inline void activate_page_drain(int cpu) +{ +} + +void activate_page(struct page *page) +{ + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) + __activate_page(page, NULL); spin_unlock_irq(&zone->lru_lock); } +#endif /* * Mark a page as having seen activity. @@ -292,6 +457,7 @@ static void drain_cpu_pagevecs(int cpu) pagevec_move_tail(pvec); local_irq_restore(flags); } + activate_page_drain(cpu); } void lru_add_drain(void) @@ -399,44 +565,70 @@ void __pagevec_release(struct pagevec *pvec) EXPORT_SYMBOL(__pagevec_release); +/* used by __split_huge_page_refcount() */ +void lru_add_page_tail(struct zone* zone, + struct page *page, struct page *page_tail) +{ + int active; + enum lru_list lru; + const int file = 0; + struct list_head *head; + + VM_BUG_ON(!PageHead(page)); + VM_BUG_ON(PageCompound(page_tail)); + VM_BUG_ON(PageLRU(page_tail)); + VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); + + SetPageLRU(page_tail); + + if (page_evictable(page_tail, NULL)) { + if (PageActive(page)) { + SetPageActive(page_tail); + active = 1; + lru = LRU_ACTIVE_ANON; + } else { + active = 0; + lru = LRU_INACTIVE_ANON; + } + update_page_reclaim_stat(zone, page_tail, file, active); + if (likely(PageLRU(page))) + head = page->lru.prev; + else + head = &zone->lru[lru].list; + __add_page_to_lru_list(zone, page_tail, lru, head); + } else { + SetPageUnevictable(page_tail); + add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); + } +} + +static void ____pagevec_lru_add_fn(struct page *page, void *arg) +{ + enum lru_list lru = (enum lru_list)arg; + struct zone *zone = page_zone(page); + int file = is_file_lru(lru); + int active = is_active_lru(lru); + + VM_BUG_ON(PageActive(page)); + VM_BUG_ON(PageUnevictable(page)); + VM_BUG_ON(PageLRU(page)); + + SetPageLRU(page); + if (active) + SetPageActive(page); + update_page_reclaim_stat(zone, page, file, active); + add_page_to_lru_list(zone, page, lru); +} + /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { - int i; - struct zone *zone = NULL; - VM_BUG_ON(is_unevictable_lru(lru)); - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - struct zone *pagezone = page_zone(page); - int file; - int active; - - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } - VM_BUG_ON(PageActive(page)); - VM_BUG_ON(PageUnevictable(page)); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - active = is_active_lru(lru); - file = is_file_lru(lru); - if (active) - SetPageActive(page); - update_page_reclaim_stat(zone, page, file, active); - add_page_to_lru_list(zone, page, lru); - } - if (zone) - spin_unlock_irq(&zone->lru_lock); - release_pages(pvec->pages, pvec->nr, pvec->cold); - pagevec_reinit(pvec); + pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); } EXPORT_SYMBOL(____pagevec_lru_add); diff --git a/mm/swap_state.c b/mm/swap_state.c index e10f583..5c8cfab 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -157,6 +157,12 @@ int add_to_swap(struct page *page) if (!entry.val) return 0; + if (unlikely(PageTransHuge(page))) + if (unlikely(split_huge_page(page))) { + swapcache_free(entry, NULL); + return 0; + } + /* * Radix-tree node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. __GFP_NOMEMALLOC diff --git a/mm/swapfile.c b/mm/swapfile.c index b6adcfb..07a458d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -964,6 +964,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (unlikely(pmd_trans_huge(*pmd))) + continue; if (pmd_none_or_clear_bad(pmd)) continue; ret = unuse_pte_range(vma, pmd, addr, next, entry, page); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index eb5cc7d..f9b1667 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -748,7 +748,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, VMALLOC_START, VMALLOC_END, node, gfp_mask); - if (unlikely(IS_ERR(va))) { + if (IS_ERR(va)) { kfree(vb); return ERR_CAST(va); } @@ -1175,6 +1175,7 @@ void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) { vunmap_page_range(addr, addr + size); } +EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush); /** * unmap_kernel_range - unmap kernel VM area and flush cache and TLB @@ -1315,13 +1316,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, -1, GFP_KERNEL, caller); } -struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, - int node, gfp_t gfp_mask) -{ - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, - node, gfp_mask, __builtin_return_address(0)); -} - static struct vm_struct *find_vm_area(const void *addr) { struct vmap_area *va; @@ -1537,25 +1531,12 @@ fail: return NULL; } -void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) -{ - void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1, - __builtin_return_address(0)); - - /* - * A ref_count = 3 is needed because the vm_struct and vmap_area - * structures allocated in the __get_vm_area_node() function contain - * references to the virtual address of the vmalloc'ed block. - */ - kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask); - - return addr; -} - /** - * __vmalloc_node - allocate virtually contiguous memory + * __vmalloc_node_range - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment + * @start: vm area range start + * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @node: node to use for allocation or -1 @@ -1565,9 +1546,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. */ -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, void *caller) +void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, int node, void *caller) { struct vm_struct *area; void *addr; @@ -1577,8 +1558,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) return NULL; - area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, - VMALLOC_END, node, gfp_mask, caller); + area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, + gfp_mask, caller); if (!area) return NULL; @@ -1595,6 +1576,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, return addr; } +/** + * __vmalloc_node - allocate virtually contiguous memory + * @size: allocation size + * @align: desired alignment + * @gfp_mask: flags for the page level allocator + * @prot: protection mask for the allocated pages + * @node: node to use for allocation or -1 + * @caller: caller's return address + * + * Allocate enough pages to cover @size from the page level + * allocator with @gfp_mask flags. Map them into contiguous + * kernel virtual space, using a pagetable protection of @prot. + */ +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, + int node, void *caller) +{ + return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + gfp_mask, prot, node, caller); +} + void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { return __vmalloc_node(size, 1, gfp_mask, prot, -1, @@ -2203,17 +2205,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, * @sizes: array containing size of each area * @nr_vms: the number of areas to allocate * @align: alignment, all entries in @offsets and @sizes must be aligned to this - * @gfp_mask: allocation mask * * Returns: kmalloc'd vm_struct pointer array pointing to allocated * vm_structs on success, %NULL on failure * * Percpu allocator wants to use congruent vm areas so that it can * maintain the offsets among percpu areas. This function allocates - * congruent vmalloc areas for it. These areas tend to be scattered - * pretty far, distance between two areas easily going up to - * gigabytes. To avoid interacting with regular vmallocs, these areas - * are allocated from top. + * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to + * be scattered pretty far, distance between two areas easily going up + * to gigabytes. To avoid interacting with regular vmallocs, these + * areas are allocated from top. * * Despite its complicated look, this allocator is rather simple. It * does everything top-down and scans areas from the end looking for @@ -2224,7 +2225,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, */ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, - size_t align, gfp_t gfp_mask) + size_t align) { const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); @@ -2234,8 +2235,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, unsigned long base, start, end, last_end; bool purged = false; - gfp_mask &= GFP_RECLAIM_MASK; - /* verify parameters and allocate data structures */ BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); for (last_area = 0, area = 0; area < nr_vms; area++) { @@ -2268,14 +2267,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, return NULL; } - vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); - vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); + vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); + vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); if (!vas || !vms) goto err_free; for (area = 0; area < nr_vms; area++) { - vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); - vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); + vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); + vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); if (!vas[area] || !vms[area]) goto err_free; } @@ -2456,13 +2455,8 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, "0x%p-0x%p %7ld", v->addr, v->addr + v->size, v->size); - if (v->caller) { - char buff[KSYM_SYMBOL_LEN]; - - seq_putc(m, ' '); - sprint_symbol(buff, (unsigned long)v->caller); - seq_puts(m, buff); - } + if (v->caller) + seq_printf(m, " %pS", v->caller); if (v->nr_pages) seq_printf(m, " pages=%d", v->nr_pages); diff --git a/mm/vmscan.c b/mm/vmscan.c index 9ca587c..99999a9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,7 @@ #include #include #include +#include #include #include @@ -51,11 +53,23 @@ #define CREATE_TRACE_POINTS #include -enum lumpy_mode { - LUMPY_MODE_NONE, - LUMPY_MODE_ASYNC, - LUMPY_MODE_SYNC, -}; +/* + * reclaim_mode determines how the inactive list is shrunk + * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages + * RECLAIM_MODE_ASYNC: Do not block + * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback + * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference + * page from the LRU and reclaim all pages within a + * naturally aligned range + * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of + * order-0 pages and then compact the zone + */ +typedef unsigned __bitwise__ reclaim_mode_t; +#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) +#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) +#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) +#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) +#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) struct scan_control { /* Incremented by the number of inactive pages that were scanned */ @@ -88,7 +102,7 @@ struct scan_control { * Intend to reclaim enough continuous memory rather than reclaim * enough amount of memory. i.e, mode for high order allocation. */ - enum lumpy_mode lumpy_reclaim_mode; + reclaim_mode_t reclaim_mode; /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -271,34 +285,37 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, return ret; } -static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, +static void set_reclaim_mode(int priority, struct scan_control *sc, bool sync) { - enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; + reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; /* - * Some reclaim have alredy been failed. No worth to try synchronous - * lumpy reclaim. + * Initially assume we are entering either lumpy reclaim or + * reclaim/compaction.Depending on the order, we will either set the + * sync mode or just reclaim order-0 pages later. */ - if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) - return; + if (COMPACTION_BUILD) + sc->reclaim_mode = RECLAIM_MODE_COMPACTION; + else + sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. + * Avoid using lumpy reclaim or reclaim/compaction if possible by + * restricting when its set to either costly allocations or when + * under memory pressure */ if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - sc->lumpy_reclaim_mode = mode; + sc->reclaim_mode |= syncmode; else if (sc->order && priority < DEF_PRIORITY - 2) - sc->lumpy_reclaim_mode = mode; + sc->reclaim_mode |= syncmode; else - sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; + sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; } -static void disable_lumpy_reclaim_mode(struct scan_control *sc) +static void reset_reclaim_mode(struct scan_control *sc) { - sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; + sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; } static inline int is_page_cache_freeable(struct page *page) @@ -429,7 +446,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * first attempt to free a range of pages fails. */ if (PageWriteback(page) && - sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) + (sc->reclaim_mode & RECLAIM_MODE_SYNC)) wait_on_page_writeback(page); if (!PageWriteback(page)) { @@ -437,7 +454,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, ClearPageReclaim(page); } trace_mm_vmscan_writepage(page, - trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); + trace_reclaim_flags(page, sc->reclaim_mode)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -622,7 +639,7 @@ static enum page_references page_check_references(struct page *page, referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ - if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) + if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) return PAGEREF_RECLAIM; /* @@ -739,7 +756,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * for any page for which writeback has already * started. */ - if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && + if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && may_enter_fs) wait_on_page_writeback(page); else { @@ -895,7 +912,7 @@ cull_mlocked: try_to_free_swap(page); unlock_page(page); putback_lru_page(page); - disable_lumpy_reclaim_mode(sc); + reset_reclaim_mode(sc); continue; activate_locked: @@ -908,7 +925,7 @@ activate_locked: keep_locked: unlock_page(page); keep: - disable_lumpy_reclaim_mode(sc); + reset_reclaim_mode(sc); keep_lumpy: list_add(&page->lru, &ret_pages); VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); @@ -1028,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: @@ -1086,7 +1103,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); mem_cgroup_del_lru(cursor_page); - nr_taken++; + nr_taken += hpage_nr_pages(page); nr_lumpy_taken++; if (PageDirty(cursor_page)) nr_lumpy_dirty++; @@ -1141,14 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list, struct page *page; list_for_each_entry(page, page_list, lru) { + int numpages = hpage_nr_pages(page); lru = page_lru_base_type(page); if (PageActive(page)) { lru += LRU_ACTIVE; ClearPageActive(page); - nr_active++; + nr_active += numpages; } if (count) - count[lru]++; + count[lru] += numpages; } return nr_active; @@ -1253,13 +1271,16 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, spin_lock_irq(&zone->lru_lock); continue; } - SetPageLRU(page); lru = page_lru(page); - add_page_to_lru_list(zone, page, lru); if (is_active_lru(lru)) { int file = is_file_lru(lru); - reclaim_stat->recent_rotated[file]++; + int numpages = hpage_nr_pages(page); + reclaim_stat->recent_rotated[file] += numpages; + if (putback_active_lru_page(zone, page)) + continue; } + SetPageLRU(page); + add_page_to_lru_list(zone, page, lru); if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); @@ -1324,7 +1345,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, return false; /* Only stall on lumpy reclaim */ - if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) + if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) return false; /* If we have relaimed everything on the isolated list, no stall */ @@ -1368,15 +1389,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, return SWAP_CLUSTER_MAX; } - set_lumpy_reclaim_mode(priority, sc, false); + set_reclaim_mode(priority, sc, false); lru_add_drain(); spin_lock_irq(&zone->lru_lock); if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? - ISOLATE_INACTIVE : ISOLATE_BOTH, + sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? + ISOLATE_BOTH : ISOLATE_INACTIVE, zone, 0, file); zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1388,8 +1409,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } else { nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? - ISOLATE_INACTIVE : ISOLATE_BOTH, + sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? + ISOLATE_BOTH : ISOLATE_INACTIVE, zone, sc->mem_cgroup, 0, file); /* @@ -1411,7 +1432,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { - set_lumpy_reclaim_mode(priority, sc, true); + set_reclaim_mode(priority, sc, true); nr_reclaimed += shrink_page_list(&page_list, zone, sc); } @@ -1426,7 +1447,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, zone_idx(zone), nr_scanned, nr_reclaimed, priority, - trace_shrink_flags(file, sc->lumpy_reclaim_mode)); + trace_shrink_flags(file, sc->reclaim_mode)); return nr_reclaimed; } @@ -1466,7 +1487,7 @@ static void move_active_pages_to_lru(struct zone *zone, list_move(&page->lru, &zone->lru[lru].list); mem_cgroup_add_lru_list(page, lru); - pgmoved++; + pgmoved += hpage_nr_pages(page); if (!pagevec_add(&pvec, page) || list_empty(list)) { spin_unlock_irq(&zone->lru_lock); @@ -1534,7 +1555,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { - nr_rotated++; + nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So @@ -1805,6 +1826,57 @@ out: } /* + * Reclaim/compaction depends on a number of pages being freed. To avoid + * disruption to the system, a small number of order-0 pages continue to be + * rotated and reclaimed in the normal fashion. However, by the time we get + * back to the allocator and call try_to_compact_zone(), we ensure that + * there are enough free pages for it to be likely successful + */ +static inline bool should_continue_reclaim(struct zone *zone, + unsigned long nr_reclaimed, + unsigned long nr_scanned, + struct scan_control *sc) +{ + unsigned long pages_for_compaction; + unsigned long inactive_lru_pages; + + /* If not in reclaim/compaction mode, stop */ + if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) + return false; + + /* + * If we failed to reclaim and have scanned the full list, stop. + * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far + * faster but obviously would be less likely to succeed + * allocation. If this is desirable, use GFP_REPEAT to decide + * if both reclaimed and scanned should be checked or just + * reclaimed + */ + if (!nr_reclaimed && !nr_scanned) + return false; + + /* + * If we have not reclaimed enough pages for compaction and the + * inactive lists are large enough, continue reclaiming + */ + pages_for_compaction = (2UL << sc->order); + inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + if (sc->nr_reclaimed < pages_for_compaction && + inactive_lru_pages > pages_for_compaction) + return true; + + /* If compaction would go ahead or the allocation would succeed, stop */ + switch (compaction_suitable(zone, sc->order)) { + case COMPACT_PARTIAL: + case COMPACT_CONTINUE: + return false; + default: + return true; + } +} + +/* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ static void shrink_zone(int priority, struct zone *zone, @@ -1813,9 +1885,12 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list l; - unsigned long nr_reclaimed = sc->nr_reclaimed; + unsigned long nr_reclaimed; unsigned long nr_to_reclaim = sc->nr_to_reclaim; + unsigned long nr_scanned = sc->nr_scanned; +restart: + nr_reclaimed = 0; get_scan_count(zone, sc, nr, priority); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -1841,8 +1916,7 @@ static void shrink_zone(int priority, struct zone *zone, if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) break; } - - sc->nr_reclaimed = nr_reclaimed; + sc->nr_reclaimed += nr_reclaimed; /* * Even if we did not try to evict anon pages at all, we want to @@ -1851,6 +1925,11 @@ static void shrink_zone(int priority, struct zone *zone, if (inactive_anon_is_low(zone, sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + /* reclaim/compaction might need reclaim to continue */ + if (should_continue_reclaim(zone, nr_reclaimed, + sc->nr_scanned - nr_scanned, sc)) + goto restart; + throttle_vm_writeout(sc->gfp_mask); } @@ -2124,38 +2203,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, } #endif +/* + * pgdat_balanced is used when checking if a node is balanced for high-order + * allocations. Only zones that meet watermarks and are in a zone allowed + * by the callers classzone_idx are added to balanced_pages. The total of + * balanced pages must be at least 25% of the zones allowed by classzone_idx + * for the node to be considered balanced. Forcing all zones to be balanced + * for high orders can cause excessive reclaim when there are imbalanced zones. + * The choice of 25% is due to + * o a 16M DMA zone that is balanced will not balance a zone on any + * reasonable sized machine + * o On all other machines, the top zone must be at least a reasonable + * precentage of the middle zones. For example, on 32-bit x86, highmem + * would need to be at least 256M for it to be balance a whole node. + * Similarly, on x86-64 the Normal zone would need to be at least 1G + * to balance a node on its own. These seemed like reasonable ratios. + */ +static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, + int classzone_idx) +{ + unsigned long present_pages = 0; + int i; + + for (i = 0; i <= classzone_idx; i++) + present_pages += pgdat->node_zones[i].present_pages; + + return balanced_pages > (present_pages >> 2); +} + /* is kswapd sleeping prematurely? */ -static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) +static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, + int classzone_idx) { int i; + unsigned long balanced = 0; + bool all_zones_ok = true; /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ if (remaining) - return 1; + return true; - /* If after HZ/10, a zone is below the high mark, it's premature */ + /* Check the watermark levels */ for (i = 0; i < pgdat->nr_zones; i++) { struct zone *zone = pgdat->node_zones + i; if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable) + /* + * balance_pgdat() skips over all_unreclaimable after + * DEF_PRIORITY. Effectively, it considers them balanced so + * they must be considered balanced here as well if kswapd + * is to sleep + */ + if (zone->all_unreclaimable) { + balanced += zone->present_pages; continue; + } - if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), - 0, 0)) - return 1; + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), + classzone_idx, 0)) + all_zones_ok = false; + else + balanced += zone->present_pages; } - return 0; + /* + * For high-order requests, the balanced zones must contain at least + * 25% of the nodes pages for kswapd to sleep. For order-0, all zones + * must be balanced + */ + if (order) + return pgdat_balanced(pgdat, balanced, classzone_idx); + else + return !all_zones_ok; } /* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). * - * Returns the number of pages which were actually freed. + * Returns the final order kswapd was reclaiming at * * There is special handling here for zones which are full of pinned pages. * This can happen if the pages are all mlocked, or if they are all used by @@ -2172,11 +2300,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) * interoperates with the page allocator fallback scheme to ensure that aging * of pages is balanced across the zones. */ -static unsigned long balance_pgdat(pg_data_t *pgdat, int order) +static unsigned long balance_pgdat(pg_data_t *pgdat, int order, + int *classzone_idx) { int all_zones_ok; + unsigned long balanced; int priority; int i; + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long total_scanned; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { @@ -2199,7 +2330,6 @@ loop_again: count_vm_event(PAGEOUTRUN); for (priority = DEF_PRIORITY; priority >= 0; priority--) { - int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; int has_under_min_watermark_zone = 0; @@ -2208,6 +2338,7 @@ loop_again: disable_swap_token(); all_zones_ok = 1; + balanced = 0; /* * Scan in the highmem->dma direction for the highest @@ -2230,9 +2361,10 @@ loop_again: shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { end_zone = i; + *classzone_idx = i; break; } } @@ -2255,6 +2387,7 @@ loop_again: * cause too much scanning of the lower zones. */ for (i = 0; i <= end_zone; i++) { + int compaction; struct zone *zone = pgdat->node_zones + i; int nr_slab; @@ -2276,7 +2409,7 @@ loop_again: * We put equal pressure on every zone, unless one * zone has way too many pages free already. */ - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, 8*high_wmark_pages(zone), end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; @@ -2284,9 +2417,26 @@ loop_again: lru_pages); sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; + + compaction = 0; + if (order && + zone_watermark_ok(zone, 0, + high_wmark_pages(zone), + end_zone, 0) && + !zone_watermark_ok(zone, order, + high_wmark_pages(zone), + end_zone, 0)) { + compact_zone_order(zone, + order, + sc.gfp_mask, false, + COMPACT_MODE_KSWAPD); + compaction = 1; + } + if (zone->all_unreclaimable) continue; - if (nr_slab == 0 && !zone_reclaimable(zone)) + if (!compaction && nr_slab == 0 && + !zone_reclaimable(zone)) zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and @@ -2297,7 +2447,7 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; /* @@ -2305,7 +2455,7 @@ loop_again: * means that we have a GFP_ATOMIC allocation * failure risk. Hurry up! */ - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, min_wmark_pages(zone), end_zone, 0)) has_under_min_watermark_zone = 1; } else { @@ -2317,10 +2467,12 @@ loop_again: * spectulatively avoid congestion waits */ zone_clear_flag(zone, ZONE_CONGESTED); + if (i <= *classzone_idx) + balanced += zone->present_pages; } } - if (all_zones_ok) + if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) break; /* kswapd: all done */ /* * OK, kswapd is getting into trouble. Take a nap, then take @@ -2343,7 +2495,13 @@ loop_again: break; } out: - if (!all_zones_ok) { + + /* + * order-0: All zones must meet high watermark for a balanced node + * high-order: Balanced zones must make up at least 25% of the node + * for the node to be balanced + */ + if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { cond_resched(); try_to_freeze(); @@ -2368,7 +2526,88 @@ out: goto loop_again; } - return sc.nr_reclaimed; + /* + * If kswapd was reclaiming at a higher order, it has the option of + * sleeping without all zones being balanced. Before it does, it must + * ensure that the watermarks for order-0 on *all* zones are met and + * that the congestion flags are cleared. The congestion flag must + * be cleared as kswapd is the only mechanism that clears the flag + * and it is potentially going to sleep here. + */ + if (order) { + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + if (zone->all_unreclaimable && priority != DEF_PRIORITY) + continue; + + /* Confirm the zone is balanced for order-0 */ + if (!zone_watermark_ok(zone, 0, + high_wmark_pages(zone), 0, 0)) { + order = sc.order = 0; + goto loop_again; + } + + /* If balanced, clear the congested flag */ + zone_clear_flag(zone, ZONE_CONGESTED); + } + } + + /* + * Return the order we were reclaiming at so sleeping_prematurely() + * makes a decision on the order we were last reclaiming at. However, + * if another caller entered the allocator slow path while kswapd + * was awake, order will remain at the higher level + */ + *classzone_idx = end_zone; + return order; +} + +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) +{ + long remaining = 0; + DEFINE_WAIT(wait); + + if (freezing(current) || kthread_should_stop()) + return; + + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + + /* Try to sleep for a short interval */ + if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { + remaining = schedule_timeout(HZ/10); + finish_wait(&pgdat->kswapd_wait, &wait); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + } + + /* + * After a short sleep, check if it was a premature sleep. If not, then + * go fully to sleep until explicitly woken up. + */ + if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { + trace_mm_vmscan_kswapd_sleep(pgdat->node_id); + + /* + * vmstat counters are not perfectly accurate and the estimated + * value for counters such as NR_FREE_PAGES can deviate from the + * true value by nr_online_cpus * threshold. To avoid the zone + * watermarks being breached while under pressure, we reduce the + * per-cpu vmstat threshold while kswapd is awake and restore + * them before going back to sleep. + */ + set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); + schedule(); + set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); + } else { + if (remaining) + count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); + else + count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); + } + finish_wait(&pgdat->kswapd_wait, &wait); } /* @@ -2387,9 +2626,10 @@ out: static int kswapd(void *p) { unsigned long order; + int classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; - DEFINE_WAIT(wait); + struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; @@ -2417,49 +2657,30 @@ static int kswapd(void *p) set_freezable(); order = 0; + classzone_idx = MAX_NR_ZONES - 1; for ( ; ; ) { unsigned long new_order; + int new_classzone_idx; int ret; - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); new_order = pgdat->kswapd_max_order; + new_classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; - if (order < new_order) { + pgdat->classzone_idx = MAX_NR_ZONES - 1; + if (order < new_order || classzone_idx > new_classzone_idx) { /* * Don't sleep if someone wants a larger 'order' - * allocation + * allocation or has tigher zone constraints */ order = new_order; + classzone_idx = new_classzone_idx; } else { - if (!freezing(current) && !kthread_should_stop()) { - long remaining = 0; - - /* Try to sleep for a short interval */ - if (!sleeping_prematurely(pgdat, order, remaining)) { - remaining = schedule_timeout(HZ/10); - finish_wait(&pgdat->kswapd_wait, &wait); - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); - } - - /* - * After a short sleep, check if it was a - * premature sleep. If not, then go fully - * to sleep until explicitly woken up - */ - if (!sleeping_prematurely(pgdat, order, remaining)) { - trace_mm_vmscan_kswapd_sleep(pgdat->node_id); - schedule(); - } else { - if (remaining) - count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); - else - count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); - } - } - + kswapd_try_to_sleep(pgdat, order, classzone_idx); order = pgdat->kswapd_max_order; + classzone_idx = pgdat->classzone_idx; + pgdat->kswapd_max_order = 0; + pgdat->classzone_idx = MAX_NR_ZONES - 1; } - finish_wait(&pgdat->kswapd_wait, &wait); ret = try_to_freeze(); if (kthread_should_stop()) @@ -2471,7 +2692,7 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balance_pgdat(pgdat, order); + order = balance_pgdat(pgdat, order, &classzone_idx); } } return 0; @@ -2480,23 +2701,26 @@ static int kswapd(void *p) /* * A zone is low on free memory, so wake its kswapd task to service it. */ -void wakeup_kswapd(struct zone *zone, int order) +void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; if (!populated_zone(zone)) return; - pgdat = zone->zone_pgdat; - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) - return; - if (pgdat->kswapd_max_order < order) - pgdat->kswapd_max_order = order; - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; + pgdat = zone->zone_pgdat; + if (pgdat->kswapd_max_order < order) { + pgdat->kswapd_max_order = order; + pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); + } if (!waitqueue_active(&pgdat->kswapd_wait)) return; + if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) + return; + + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); wake_up_interruptible(&pgdat->kswapd_wait); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 312d728..0c3b504 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -83,7 +83,31 @@ EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP -static int calculate_threshold(struct zone *zone) +int calculate_pressure_threshold(struct zone *zone) +{ + int threshold; + int watermark_distance; + + /* + * As vmstats are not up to date, there is drift between the estimated + * and real values. For high thresholds and a high number of CPUs, it + * is possible for the min watermark to be breached while the estimated + * value looks fine. The pressure threshold is a reduced value such + * that even the maximum amount of drift will not accidentally breach + * the min watermark + */ + watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone); + threshold = max(1, (int)(watermark_distance / num_online_cpus())); + + /* + * Maximum threshold is 125 + */ + threshold = min(125, threshold); + + return threshold; +} + +int calculate_normal_threshold(struct zone *zone) { int threshold; int mem; /* memory in 128 MB units */ @@ -142,7 +166,7 @@ static void refresh_zone_stat_thresholds(void) for_each_populated_zone(zone) { unsigned long max_drift, tolerate_drift; - threshold = calculate_threshold(zone); + threshold = calculate_normal_threshold(zone); for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold @@ -161,6 +185,26 @@ static void refresh_zone_stat_thresholds(void) } } +void set_pgdat_percpu_threshold(pg_data_t *pgdat, + int (*calculate_pressure)(struct zone *)) +{ + struct zone *zone; + int cpu; + int threshold; + int i; + + for (i = 0; i < pgdat->nr_zones; i++) { + zone = &pgdat->node_zones[i]; + if (!zone->percpu_drift_mark) + continue; + + threshold = (*calculate_pressure)(zone); + for_each_possible_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; + } +} + /* * For use when we know that interrupts are disabled. */ @@ -836,6 +880,7 @@ static const char * const vmstat_text[] = { "numa_local", "numa_other", #endif + "nr_anon_transparent_hugepages", "nr_dirty_threshold", "nr_dirty_background_threshold", @@ -911,7 +956,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n scanned %lu" "\n spanned %lu" "\n present %lu", - zone_nr_free_pages(zone), + zone_page_state(zone, NR_FREE_PAGES), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7f68625..f29abeb 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -104,8 +104,26 @@ static pfn_t fault_pfn; inline int kvm_is_mmio_pfn(pfn_t pfn) { if (pfn_valid(pfn)) { - struct page *page = compound_head(pfn_to_page(pfn)); - return PageReserved(page); + int reserved; + struct page *tail = pfn_to_page(pfn); + struct page *head = compound_trans_head(tail); + reserved = PageReserved(head); + if (head != tail) { + /* + * "head" is not a dangling pointer + * (compound_trans_head takes care of that) + * but the hugepage may have been splitted + * from under us (and we may not hold a + * reference count on the head page so it can + * be reused before we run PageReferenced), so + * we've to check PageTail before returning + * what we just read. + */ + smp_rmb(); + if (PageTail(tail)) + return reserved; + } + return PageReserved(tail); } return true; @@ -352,6 +370,22 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, return young; } +static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int young, idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + young = kvm_test_age_hva(kvm, address); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + + return young; +} + static void kvm_mmu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { @@ -368,6 +402,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, + .test_young = kvm_mmu_notifier_test_young, .change_pte = kvm_mmu_notifier_change_pte, .release = kvm_mmu_notifier_release, };