LCOV - code coverage report
Current view: top level - kernel - futex.c (source / functions)
Test: landlock.info
Date: 2021-04-22 12:43:58

                   Hit   Total   Coverage
      Lines:       268    1241     21.6 %
      Functions:    29      75     38.7 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-or-later
       2             : /*
       3             :  *  Fast Userspace Mutexes (which I call "Futexes!").
       4             :  *  (C) Rusty Russell, IBM 2002
       5             :  *
       6             :  *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
       7             :  *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
       8             :  *
       9             :  *  Removed page pinning, fix privately mapped COW pages and other cleanups
      10             :  *  (C) Copyright 2003, 2004 Jamie Lokier
      11             :  *
      12             :  *  Robust futex support started by Ingo Molnar
      13             :  *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
      14             :  *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
      15             :  *
      16             :  *  PI-futex support started by Ingo Molnar and Thomas Gleixner
      17             :  *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
      18             :  *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
      19             :  *
      20             :  *  PRIVATE futexes by Eric Dumazet
      21             :  *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
      22             :  *
      23             :  *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
      24             :  *  Copyright (C) IBM Corporation, 2009
      25             :  *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
      26             :  *
      27             :  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
      28             :  *  enough at me, Linus for the original (flawed) idea, Matthew
      29             :  *  Kirkwood for proof-of-concept implementation.
      30             :  *
      31             :  *  "The futexes are also cursed."
      32             :  *  "But they come in a choice of three flavours!"
      33             :  */
      34             : #include <linux/compat.h>
      35             : #include <linux/jhash.h>
      36             : #include <linux/pagemap.h>
      37             : #include <linux/syscalls.h>
      38             : #include <linux/hugetlb.h>
      39             : #include <linux/freezer.h>
      40             : #include <linux/memblock.h>
      41             : #include <linux/fault-inject.h>
      42             : #include <linux/time_namespace.h>
      43             : 
      44             : #include <asm/futex.h>
      45             : 
      46             : #include "locking/rtmutex_common.h"
      47             : 
      48             : /*
      49             :  * READ this before attempting to hack on futexes!
      50             :  *
      51             :  * Basic futex operation and ordering guarantees
      52             :  * =============================================
      53             :  *
      54             :  * The waiter reads the futex value in user space and calls
      55             :  * futex_wait(). This function computes the hash bucket and acquires
      56             :  * the hash bucket lock. After that it reads the futex user space value
      57             :  * again and verifies that the data has not changed. If it has not changed
      58             :  * it enqueues itself into the hash bucket, releases the hash bucket lock
      59             :  * and schedules.
      60             :  *
      61             :  * The waker side modifies the user space value of the futex and calls
      62             :  * futex_wake(). This function computes the hash bucket and acquires the
      63             :  * hash bucket lock. Then it looks for waiters on that futex in the hash
      64             :  * bucket and wakes them.
      65             :  *
       66             :  * In futex wakeup scenarios where no tasks are blocked on the futex, taking
       67             :  * the hb spinlock can be avoided and the syscall can simply return. For this
       68             :  * optimization to work, ordering guarantees must ensure that a waiter being
       69             :  * added to the list is observed by a waker that concurrently checks the
       70             :  * list, avoiding scenarios like the following:
      71             :  *
      72             :  * CPU 0                               CPU 1
      73             :  * val = *futex;
      74             :  * sys_futex(WAIT, futex, val);
      75             :  *   futex_wait(futex, val);
      76             :  *   uval = *futex;
      77             :  *                                     *futex = newval;
      78             :  *                                     sys_futex(WAKE, futex);
      79             :  *                                       futex_wake(futex);
      80             :  *                                       if (queue_empty())
      81             :  *                                         return;
      82             :  *   if (uval == val)
      83             :  *      lock(hash_bucket(futex));
      84             :  *      queue();
      85             :  *     unlock(hash_bucket(futex));
      86             :  *     schedule();
      87             :  *
      88             :  * This would cause the waiter on CPU 0 to wait forever because it
      89             :  * missed the transition of the user space value from val to newval
      90             :  * and the waker did not find the waiter in the hash bucket queue.
      91             :  *
      92             :  * The correct serialization ensures that a waiter either observes
      93             :  * the changed user space value before blocking or is woken by a
      94             :  * concurrent waker:
      95             :  *
      96             :  * CPU 0                                 CPU 1
      97             :  * val = *futex;
      98             :  * sys_futex(WAIT, futex, val);
      99             :  *   futex_wait(futex, val);
     100             :  *
     101             :  *   waiters++; (a)
     102             :  *   smp_mb(); (A) <-- paired with -.
     103             :  *                                  |
     104             :  *   lock(hash_bucket(futex));      |
     105             :  *                                  |
     106             :  *   uval = *futex;                 |
     107             :  *                                  |        *futex = newval;
     108             :  *                                  |        sys_futex(WAKE, futex);
     109             :  *                                  |          futex_wake(futex);
     110             :  *                                  |
     111             :  *                                  `--------> smp_mb(); (B)
     112             :  *   if (uval == val)
     113             :  *     queue();
     114             :  *     unlock(hash_bucket(futex));
     115             :  *     schedule();                         if (waiters)
     116             :  *                                           lock(hash_bucket(futex));
     117             :  *   else                                    wake_waiters(futex);
     118             :  *     waiters--; (b)                        unlock(hash_bucket(futex));
     119             :  *
     120             :  * Where (A) orders the waiters increment and the futex value read through
      121             :  * atomic operations (see hb_waiters_inc()) and where (B) orders the write
     122             :  * to futex and the waiters read (see hb_waiters_pending()).
     123             :  *
     124             :  * This yields the following case (where X:=waiters, Y:=futex):
     125             :  *
     126             :  *      X = Y = 0
     127             :  *
     128             :  *      w[X]=1          w[Y]=1
     129             :  *      MB              MB
     130             :  *      r[Y]=y          r[X]=x
     131             :  *
     132             :  * Which guarantees that x==0 && y==0 is impossible; which translates back into
     133             :  * the guarantee that we cannot both miss the futex variable change and the
     134             :  * enqueue.
     135             :  *
      136             :  * Note that a new waiter is accounted for in (a) even if the wait call later
      137             :  * returns an error, in which case we backtrack from it in (b).
      138             :  * Refer to the comment in queue_lock().
     139             :  *
      140             :  * Similarly, in order to account for waiters being requeued to another
      141             :  * address, we always increment the waiter count of the destination bucket
      142             :  * before acquiring its lock, and decrement it again after releasing it -
      143             :  * the code that actually moves the futex(es) between hash buckets (requeue_futex)
      144             :  * does the additional required waiter count housekeeping. This is done in
      145             :  * double_lock_hb() and double_unlock_hb(), respectively.
     146             :  */
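                      :
                      : /*
                      :  * [Editor's note] A minimal user-space sketch of the wait/wake protocol
                      :  * described above, using the raw futex(2) syscall. The helper names are
                      :  * illustrative, not part of futex.c:
                      :  *
                      :  *      static void example_wait(atomic_int *uaddr, int val)
                      :  *      {
                      :  *              // the kernel re-checks *uaddr == val under the hb
                      :  *              // lock - the revalidation step documented above
                      :  *              syscall(SYS_futex, uaddr, FUTEX_WAIT, val,
                      :  *                      NULL, NULL, 0);
                      :  *      }
                      :  *
                      :  *      static void example_wake(atomic_int *uaddr)
                      :  *      {
                      :  *              // change the value first; otherwise a waiter could
                      :  *              // revalidate, see the old value and sleep forever
                      :  *              atomic_fetch_add(uaddr, 1);
                      :  *              syscall(SYS_futex, uaddr, FUTEX_WAKE, 1,
                      :  *                      NULL, NULL, 0);
                      :  *      }
                      :  *
                      :  * (needs <linux/futex.h>, <stdatomic.h>, <sys/syscall.h>, <unistd.h>)
                      :  */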
     147             : 
     148             : #ifdef CONFIG_HAVE_FUTEX_CMPXCHG
     149             : #define futex_cmpxchg_enabled 1
     150             : #else
     151             : static int  __read_mostly futex_cmpxchg_enabled;
     152             : #endif
     153             : 
     154             : /*
     155             :  * Futex flags used to encode options to functions and preserve them across
     156             :  * restarts.
     157             :  */
     158             : #ifdef CONFIG_MMU
     159             : # define FLAGS_SHARED           0x01
     160             : #else
     161             : /*
      162             :  * NOMMU has no per-process address space. Let the compiler optimize the
      163             :  * code away.
     164             :  */
     165             : # define FLAGS_SHARED           0x00
     166             : #endif
     167             : #define FLAGS_CLOCKRT           0x02
     168             : #define FLAGS_HAS_TIMEOUT       0x04
     169             : 
     170             : /*
     171             :  * Priority Inheritance state:
     172             :  */
     173             : struct futex_pi_state {
     174             :         /*
     175             :          * list of 'owned' pi_state instances - these have to be
     176             :          * cleaned up in do_exit() if the task exits prematurely:
     177             :          */
     178             :         struct list_head list;
     179             : 
     180             :         /*
     181             :          * The PI object:
     182             :          */
     183             :         struct rt_mutex pi_mutex;
     184             : 
     185             :         struct task_struct *owner;
     186             :         refcount_t refcount;
     187             : 
     188             :         union futex_key key;
     189             : } __randomize_layout;
     190             : 
     191             : /**
     192             :  * struct futex_q - The hashed futex queue entry, one per waiting task
     193             :  * @list:               priority-sorted list of tasks waiting on this futex
     194             :  * @task:               the task waiting on the futex
     195             :  * @lock_ptr:           the hash bucket lock
     196             :  * @key:                the key the futex is hashed on
     197             :  * @pi_state:           optional priority inheritance state
     198             :  * @rt_waiter:          rt_waiter storage for use with requeue_pi
     199             :  * @requeue_pi_key:     the requeue_pi target futex key
     200             :  * @bitset:             bitset for the optional bitmasked wakeup
     201             :  *
     202             :  * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
     203             :  * we can wake only the relevant ones (hashed queues may be shared).
     204             :  *
     205             :  * A futex_q has a woken state, just like tasks have TASK_RUNNING.
     206             :  * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
     207             :  * The order of wakeup is always to make the first condition true, then
     208             :  * the second.
     209             :  *
     210             :  * PI futexes are typically woken before they are removed from the hash list via
     211             :  * the rt_mutex code. See unqueue_me_pi().
     212             :  */
     213             : struct futex_q {
     214             :         struct plist_node list;
     215             : 
     216             :         struct task_struct *task;
     217             :         spinlock_t *lock_ptr;
     218             :         union futex_key key;
     219             :         struct futex_pi_state *pi_state;
     220             :         struct rt_mutex_waiter *rt_waiter;
     221             :         union futex_key *requeue_pi_key;
     222             :         u32 bitset;
     223             : } __randomize_layout;
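                      :
                      : /*
                      :  * [Editor's note] The "woken" test described above, as the wait path
                      :  * performs it before going to sleep (abridged sketch, assuming the
                      :  * futex_wait_queue_me() flow):
                      :  *
                      :  *      // if we were already removed from the hash list we have
                      :  *      // been woken; only go to sleep while still queued
                      :  *      if (likely(!plist_node_empty(&q->list)))
                      :  *              freezable_schedule();
                      :  */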
     224             : 
     225             : static const struct futex_q futex_q_init = {
      226             :         /* list gets initialized in queue_me() */
     227             :         .key = FUTEX_KEY_INIT,
     228             :         .bitset = FUTEX_BITSET_MATCH_ANY
     229             : };
     230             : 
     231             : /*
     232             :  * Hash buckets are shared by all the futex_keys that hash to the same
     233             :  * location.  Each key may have multiple futex_q structures, one for each task
     234             :  * waiting on a futex.
     235             :  */
     236             : struct futex_hash_bucket {
     237             :         atomic_t waiters;
     238             :         spinlock_t lock;
     239             :         struct plist_head chain;
     240             : } ____cacheline_aligned_in_smp;
     241             : 
     242             : /*
     243             :  * The base of the bucket array and its size are always used together
     244             :  * (after initialization only in hash_futex()), so ensure that they
     245             :  * reside in the same cacheline.
     246             :  */
     247             : static struct {
     248             :         struct futex_hash_bucket *queues;
     249             :         unsigned long            hashsize;
     250             : } __futex_data __read_mostly __aligned(2*sizeof(long));
     251             : #define futex_queues   (__futex_data.queues)
     252             : #define futex_hashsize (__futex_data.hashsize)
     253             : 
     254             : 
     255             : /*
     256             :  * Fault injections for futexes.
     257             :  */
     258             : #ifdef CONFIG_FAIL_FUTEX
     259             : 
     260             : static struct {
     261             :         struct fault_attr attr;
     262             : 
     263             :         bool ignore_private;
     264             : } fail_futex = {
     265             :         .attr = FAULT_ATTR_INITIALIZER,
     266             :         .ignore_private = false,
     267             : };
     268             : 
     269             : static int __init setup_fail_futex(char *str)
     270             : {
     271             :         return setup_fault_attr(&fail_futex.attr, str);
     272             : }
     273             : __setup("fail_futex=", setup_fail_futex);
     274             : 
     275             : static bool should_fail_futex(bool fshared)
     276             : {
     277             :         if (fail_futex.ignore_private && !fshared)
     278             :                 return false;
     279             : 
     280             :         return should_fail(&fail_futex.attr, 1);
     281             : }
     282             : 
     283             : #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
     284             : 
     285             : static int __init fail_futex_debugfs(void)
     286             : {
     287             :         umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
     288             :         struct dentry *dir;
     289             : 
     290             :         dir = fault_create_debugfs_attr("fail_futex", NULL,
     291             :                                         &fail_futex.attr);
     292             :         if (IS_ERR(dir))
     293             :                 return PTR_ERR(dir);
     294             : 
     295             :         debugfs_create_bool("ignore-private", mode, dir,
     296             :                             &fail_futex.ignore_private);
     297             :         return 0;
     298             : }
     299             : 
     300             : late_initcall(fail_futex_debugfs);
     301             : 
     302             : #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
     303             : 
     304             : #else
     305         766 : static inline bool should_fail_futex(bool fshared)
     306             : {
     307         766 :         return false;
     308             : }
     309             : #endif /* CONFIG_FAIL_FUTEX */
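                      :
                      : /*
                      :  * [Editor's note] Usage sketch for the fault injection hooks above. The
                      :  * boot parameter takes the generic fault_attr string
                      :  * "<interval>,<probability>,<space>,<times>", and the debugfs knobs
                      :  * created by fail_futex_debugfs() live under
                      :  * /sys/kernel/debug/fail_futex/ (assuming debugfs is mounted there):
                      :  *
                      :  *      fail_futex=1,10,0,-1      (boot: 10% failure rate, no count limit)
                      :  *
                      :  *      echo 10 > /sys/kernel/debug/fail_futex/probability
                      :  *      echo -1 > /sys/kernel/debug/fail_futex/times
                      :  *      echo 1  > /sys/kernel/debug/fail_futex/ignore-private
                      :  *
                      :  * The last knob exempts PROCESS_PRIVATE futexes, matching the
                      :  * ignore_private check in should_fail_futex().
                      :  */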
     310             : 
     311             : #ifdef CONFIG_COMPAT
     312             : static void compat_exit_robust_list(struct task_struct *curr);
     313             : #endif
     314             : 
     315             : /*
     316             :  * Reflects a new waiter being added to the waitqueue.
     317             :  */
     318         168 : static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
     319             : {
     320             : #ifdef CONFIG_SMP
     321         168 :         atomic_inc(&hb->waiters);
     322             :         /*
     323             :          * Full barrier (A), see the ordering comment above.
     324             :          */
     325         168 :         smp_mb__after_atomic();
     326             : #endif
     327         168 : }
     328             : 
     329             : /*
     330             :  * Reflects a waiter being removed from the waitqueue by wakeup
     331             :  * paths.
     332             :  */
     333         167 : static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
     334             : {
     335             : #ifdef CONFIG_SMP
     336         167 :         atomic_dec(&hb->waiters);
     337             : #endif
     338         167 : }
     339             : 
     340         594 : static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
     341             : {
     342             : #ifdef CONFIG_SMP
     343             :         /*
     344             :          * Full barrier (B), see the ordering comment above.
     345             :          */
     346         594 :         smp_mb();
     347         594 :         return atomic_read(&hb->waiters);
     348             : #else
     349             :         return 1;
     350             : #endif
     351             : }
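                      :
                      : /*
                      :  * [Editor's note] How a waker uses the helper above to avoid taking
                      :  * hb->lock when nothing is queued (abridged sketch of the futex_wake()
                      :  * fast path):
                      :  *
                      :  *      hb = hash_futex(&key);
                      :  *      if (!hb_waiters_pending(hb))    // barrier (B), pairs with (A)
                      :  *              return ret;             // in hb_waiters_inc()
                      :  *      spin_lock(&hb->lock);
                      :  *      ...walk hb->chain and wake matching waiters...
                      :  */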
     352             : 
     353             : /**
     354             :  * hash_futex - Return the hash bucket in the global hash
     355             :  * @key:        Pointer to the futex key for which the hash is calculated
     356             :  *
     357             :  * We hash on the keys returned from get_futex_key (see below) and return the
     358             :  * corresponding hash bucket in the global hash.
     359             :  */
     360         762 : static struct futex_hash_bucket *hash_futex(union futex_key *key)
     361             : {
     362         762 :         u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
     363             :                           key->both.offset);
     364             : 
     365         762 :         return &futex_queues[hash & (futex_hashsize - 1)];
     366             : }
     367             : 
     368             : 
     369             : /**
     370             :  * match_futex - Check whether two futex keys are equal
     371             :  * @key1:       Pointer to key1
     372             :  * @key2:       Pointer to key2
     373             :  *
     374             :  * Return 1 if two futex_keys are equal, 0 otherwise.
     375             :  */
     376         167 : static inline int match_futex(union futex_key *key1, union futex_key *key2)
     377             : {
     378         167 :         return (key1 && key2
     379         167 :                 && key1->both.word == key2->both.word
     380         166 :                 && key1->both.ptr == key2->both.ptr
     381         333 :                 && key1->both.offset == key2->both.offset);
     382             : }
     383             : 
     384             : enum futex_access {
     385             :         FUTEX_READ,
     386             :         FUTEX_WRITE
     387             : };
     388             : 
     389             : /**
     390             :  * futex_setup_timer - set up the sleeping hrtimer.
     391             :  * @time:       ptr to the given timeout value
     392             :  * @timeout:    the hrtimer_sleeper structure to be set up
     393             :  * @flags:      futex flags
     394             :  * @range_ns:   optional range in ns
     395             :  *
     396             :  * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
     397             :  *         value given
     398             :  */
     399             : static inline struct hrtimer_sleeper *
     400         168 : futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
     401             :                   int flags, u64 range_ns)
     402             : {
     403         168 :         if (!time)
     404             :                 return NULL;
     405             : 
     406           0 :         hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
     407             :                                       CLOCK_REALTIME : CLOCK_MONOTONIC,
     408             :                                       HRTIMER_MODE_ABS);
     409             :         /*
     410             :          * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
     411             :          * effectively the same as calling hrtimer_set_expires().
     412             :          */
     413           0 :         hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
     414             : 
     415           0 :         return timeout;
     416             : }
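                      :
                      : /*
                      :  * [Editor's note] Typical caller pattern (abridged sketch of the wait
                      :  * path): the returned pointer doubles as the "is there a timeout" flag,
                      :  * and the on-stack timer must be destroyed when done:
                      :  *
                      :  *      struct hrtimer_sleeper timeout, *to;
                      :  *
                      :  *      to = futex_setup_timer(abs_time, &timeout, flags,
                      :  *                             current->timer_slack_ns);
                      :  *      ...block, treating to == NULL as "wait forever"...
                      :  *      if (to) {
                      :  *              hrtimer_cancel(&to->timer);
                      :  *              destroy_hrtimer_on_stack(&to->timer);
                      :  *      }
                      :  */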
     417             : 
     418             : /*
     419             :  * Generate a machine wide unique identifier for this inode.
     420             :  *
      421             :  * This relies on the u64 not wrapping within the lifetime of the machine,
      422             :  * which with 1ns resolution means almost 585 years.
      423             :  *
      424             :  * This further relies on the fact that a well-formed program will not unmap
      425             :  * the file while it has a (shared) futex waiting on it. This mapping will have
      426             :  * a file reference which pins the mount and inode.
      427             :  *
      428             :  * If for some reason an inode gets evicted and read back in again, it will get
      429             :  * a new sequence number and will _NOT_ match, even though it is the exact same
      430             :  * file.
      431             :  *
      432             :  * It is important that match_futex() never has a false positive, especially
      433             :  * for PI futexes, where a false positive can corrupt the state. The above
      434             :  * argues that false negatives are only possible for malformed programs.
     435             :  */
     436           0 : static u64 get_inode_sequence_number(struct inode *inode)
     437             : {
     438           0 :         static atomic64_t i_seq;
     439           0 :         u64 old;
     440             : 
     441             :         /* Does the inode already have a sequence number? */
     442           0 :         old = atomic64_read(&inode->i_sequence);
     443           0 :         if (likely(old))
     444             :                 return old;
     445             : 
     446           0 :         for (;;) {
     447           0 :                 u64 new = atomic64_add_return(1, &i_seq);
     448           0 :                 if (WARN_ON_ONCE(!new))
     449           0 :                         continue;
     450             : 
     451           0 :                 old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
     452           0 :                 if (old)
     453           0 :                         return old;
     454             :                 return new;
     455             :         }
     456             : }
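                      :
                      : /*
                      :  * [Editor's note] The "almost 585 years" figure above, worked out:
                      :  *
                      :  *      2^64 ns / (10^9 ns/s * 31,557,600 s/yr) ~= 584.5 years
                      :  *
                      :  * using 31,557,600 seconds per 365.25-day Julian year.
                      :  */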
     457             : 
     458             : /**
     459             :  * get_futex_key() - Get parameters which are the keys for a futex
     460             :  * @uaddr:      virtual address of the futex
     461             :  * @fshared:    false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
     462             :  * @key:        address where result is stored.
     463             :  * @rw:         mapping needs to be read/write (values: FUTEX_READ,
     464             :  *              FUTEX_WRITE)
     465             :  *
     466             :  * Return: a negative error code or 0
     467             :  *
     468             :  * The key words are stored in @key on success.
     469             :  *
     470             :  * For shared mappings (when @fshared), the key is:
     471             :  *
     472             :  *   ( inode->i_sequence, page->index, offset_within_page )
     473             :  *
     474             :  * [ also see get_inode_sequence_number() ]
     475             :  *
     476             :  * For private mappings (or when !@fshared), the key is:
     477             :  *
     478             :  *   ( current->mm, address, 0 )
     479             :  *
     480             :  * This allows (cross process, where applicable) identification of the futex
     481             :  * without keeping the page pinned for the duration of the FUTEX_WAIT.
     482             :  *
      483             :  * lock_page() might sleep; the caller should not hold a spinlock.
     484             :  */
     485         762 : static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
     486             :                          enum futex_access rw)
     487             : {
     488         762 :         unsigned long address = (unsigned long)uaddr;
     489         762 :         struct mm_struct *mm = current->mm;
     490         762 :         struct page *page, *tail;
     491         762 :         struct address_space *mapping;
     492         762 :         int err, ro = 0;
     493             : 
     494             :         /*
     495             :          * The futex address must be "naturally" aligned.
     496             :          */
     497         762 :         key->both.offset = address % PAGE_SIZE;
     498         762 :         if (unlikely((address % sizeof(u32)) != 0))
     499             :                 return -EINVAL;
     500         762 :         address -= key->both.offset;
     501             : 
     502         762 :         if (unlikely(!access_ok(uaddr, sizeof(u32))))
     503             :                 return -EFAULT;
     504             : 
     505         762 :         if (unlikely(should_fail_futex(fshared)))
     506             :                 return -EFAULT;
     507             : 
     508             :         /*
     509             :          * PROCESS_PRIVATE futexes are fast.
      510             :          * As the mm cannot disappear under us and the 'key' only needs the
      511             :          * virtual address, we don't even have to find the underlying vma.
      512             :          * Note: We do have to check that 'uaddr' is a valid user address,
      513             :          *       but access_ok() should be faster than find_vma().
     514             :          */
     515         762 :         if (!fshared) {
     516         760 :                 key->private.mm = mm;
     517         760 :                 key->private.address = address;
     518         760 :                 return 0;
     519             :         }
     520             : 
     521           2 : again:
     522             :         /* Ignore any VERIFY_READ mapping (futex common case) */
     523           2 :         if (unlikely(should_fail_futex(true)))
     524             :                 return -EFAULT;
     525             : 
     526           2 :         err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
     527             :         /*
      528             :          * If write access is not required (e.g. FUTEX_WAIT), try to
      529             :          * get read-only access.
     530             :          */
     531           2 :         if (err == -EFAULT && rw == FUTEX_READ) {
     532           0 :                 err = get_user_pages_fast(address, 1, 0, &page);
     533           0 :                 ro = 1;
     534             :         }
     535           2 :         if (err < 0)
     536           0 :                 return err;
     537             :         else
     538           2 :                 err = 0;
     539             : 
     540             :         /*
     541             :          * The treatment of mapping from this point on is critical. The page
     542             :          * lock protects many things but in this context the page lock
     543             :          * stabilizes mapping, prevents inode freeing in the shared
     544             :          * file-backed region case and guards against movement to swap cache.
     545             :          *
     546             :          * Strictly speaking the page lock is not needed in all cases being
      547             :          * Strictly speaking, the page lock is not needed in all cases being
      548             :          * considered here, and taking it forces unnecessary serialization.
      549             :          * From this point on, mapping will be re-verified if necessary and
      550             :          * the page lock will be acquired only if it is unavoidable.
     551             :          * Mapping checks require the head page for any compound page so the
      552             :          * Mapping checks require the head page for any compound page, so the
      553             :          * head page and mapping are looked up now. For anonymous pages, it
     554             :          * based on the address. For filesystem-backed pages, the tail is
     555             :          * required as the index of the page determines the key. For
     556             :          * base pages, there is no tail page and tail == page.
     557             :          */
     558           2 :         tail = page;
     559           2 :         page = compound_head(page);
     560           2 :         mapping = READ_ONCE(page->mapping);
     561             : 
     562             :         /*
     563             :          * If page->mapping is NULL, then it cannot be a PageAnon
     564             :          * page; but it might be the ZERO_PAGE or in the gate area or
     565             :          * in a special mapping (all cases which we are happy to fail);
     566             :          * or it may have been a good file page when get_user_pages_fast
     567             :          * found it, but truncated or holepunched or subjected to
     568             :          * invalidate_complete_page2 before we got the page lock (also
     569             :          * cases which we are happy to fail).  And we hold a reference,
     570             :          * so refcount care in invalidate_complete_page's remove_mapping
     571             :          * prevents drop_caches from setting mapping to NULL beneath us.
     572             :          *
     573             :          * The case we do have to guard against is when memory pressure made
     574             :          * shmem_writepage move it from filecache to swapcache beneath us:
     575             :          * an unlikely race, but we do need to retry for page->mapping.
     576             :          */
     577           2 :         if (unlikely(!mapping)) {
     578           0 :                 int shmem_swizzled;
     579             : 
     580             :                 /*
     581             :                  * Page lock is required to identify which special case above
     582             :                  * applies. If this is really a shmem page then the page lock
     583             :                  * will prevent unexpected transitions.
     584             :                  */
     585           0 :                 lock_page(page);
     586           0 :                 shmem_swizzled = PageSwapCache(page) || page->mapping;
     587           0 :                 unlock_page(page);
     588           0 :                 put_page(page);
     589             : 
     590           0 :                 if (shmem_swizzled)
     591           0 :                         goto again;
     592             : 
     593             :                 return -EFAULT;
     594             :         }
     595             : 
     596             :         /*
     597             :          * Private mappings are handled in a simple way.
     598             :          *
     599             :          * If the futex key is stored on an anonymous page, then the associated
     600             :          * object is the mm which is implicitly pinned by the calling process.
     601             :          *
     602             :          * NOTE: When userspace waits on a MAP_SHARED mapping, even if
     603             :          * it's a read-only handle, it's expected that futexes attach to
      604             :          * the object, not the particular process.
     605             :          */
     606           2 :         if (PageAnon(page)) {
     607             :                 /*
     608             :                  * A RO anonymous page will never change and thus doesn't make
     609             :                  * sense for futex operations.
     610             :                  */
     611           2 :                 if (unlikely(should_fail_futex(true)) || ro) {
     612           0 :                         err = -EFAULT;
     613           0 :                         goto out;
     614             :                 }
     615             : 
     616           2 :                 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
     617           2 :                 key->private.mm = mm;
     618           2 :                 key->private.address = address;
     619             : 
     620             :         } else {
     621           0 :                 struct inode *inode;
     622             : 
     623             :                 /*
     624             :                  * The associated futex object in this case is the inode and
     625             :                  * the page->mapping must be traversed. Ordinarily this should
     626             :                  * be stabilised under page lock but it's not strictly
     627             :                  * necessary in this case as we just want to pin the inode, not
     628             :                  * update the radix tree or anything like that.
     629             :                  *
     630             :                  * The RCU read lock is taken as the inode is finally freed
     631             :                  * under RCU. If the mapping still matches expectations then the
     632             :                  * mapping->host can be safely accessed as being a valid inode.
     633             :                  */
     634           0 :                 rcu_read_lock();
     635             : 
     636           0 :                 if (READ_ONCE(page->mapping) != mapping) {
     637           0 :                         rcu_read_unlock();
     638           0 :                         put_page(page);
     639             : 
     640           0 :                         goto again;
     641             :                 }
     642             : 
     643           0 :                 inode = READ_ONCE(mapping->host);
     644           0 :                 if (!inode) {
     645           0 :                         rcu_read_unlock();
     646           0 :                         put_page(page);
     647             : 
     648           0 :                         goto again;
     649             :                 }
     650             : 
     651           0 :                 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
     652           0 :                 key->shared.i_seq = get_inode_sequence_number(inode);
     653           0 :                 key->shared.pgoff = basepage_index(tail);
     654           0 :                 rcu_read_unlock();
     655             :         }
     656             : 
     657           2 : out:
     658           2 :         put_page(page);
     659           2 :         return err;
     660             : }
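                      :
                      : /*
                      :  * [Editor's note] Typical caller pattern (abridged sketch of the wait
                      :  * path): the key is computed once, then used for hashing and matching:
                      :  *
                      :  *      ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key,
                      :  *                          FUTEX_READ);
                      :  *      if (unlikely(ret != 0))
                      :  *              return ret;
                      :  *      hb = hash_futex(&q->key);
                      :  */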
     661             : 
     662             : /**
     663             :  * fault_in_user_writeable() - Fault in user address and verify RW access
     664             :  * @uaddr:      pointer to faulting user space address
     665             :  *
      666             :  * Slow path to fix up the fault we just took in the atomic write
     667             :  * access to @uaddr.
     668             :  *
     669             :  * We have no generic implementation of a non-destructive write to the
     670             :  * user address. We know that we faulted in the atomic pagefault
     671             :  * disabled section so we can as well avoid the #PF overhead by
     672             :  * calling get_user_pages() right away.
     673             :  */
     674           0 : static int fault_in_user_writeable(u32 __user *uaddr)
     675             : {
     676           0 :         struct mm_struct *mm = current->mm;
     677           0 :         int ret;
     678             : 
     679           0 :         mmap_read_lock(mm);
     680           0 :         ret = fixup_user_fault(mm, (unsigned long)uaddr,
     681             :                                FAULT_FLAG_WRITE, NULL);
     682           0 :         mmap_read_unlock(mm);
     683             : 
     684           0 :         return ret < 0 ? ret : 0;
     685             : }
     686             : 
     687             : /**
     688             :  * futex_top_waiter() - Return the highest priority waiter on a futex
     689             :  * @hb:         the hash bucket the futex_q's reside in
      690             :  * @key:        the futex key (to distinguish it from futex_q's of other futexes)
     691             :  *
     692             :  * Must be called with the hb lock held.
     693             :  */
     694           0 : static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
     695             :                                         union futex_key *key)
     696             : {
     697           0 :         struct futex_q *this;
     698             : 
     699           0 :         plist_for_each_entry(this, &hb->chain, list) {
     700           0 :                 if (match_futex(&this->key, key))
     701           0 :                         return this;
     702             :         }
     703             :         return NULL;
     704             : }
     705             : 
     706           1 : static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
     707             :                                       u32 uval, u32 newval)
     708             : {
     709           1 :         int ret;
     710             : 
     711           1 :         pagefault_disable();
     712           1 :         ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
     713           1 :         pagefault_enable();
     714             : 
     715           1 :         return ret;
     716             : }
     717             : 
     718         168 : static int get_futex_value_locked(u32 *dest, u32 __user *from)
     719             : {
     720         168 :         int ret;
     721             : 
     722         168 :         pagefault_disable();
     723         168 :         ret = __get_user(*dest, from);
     724         168 :         pagefault_enable();
     725             : 
     726         168 :         return ret ? -EFAULT : 0;
     727             : }
     728             : 
     729             : 
     730             : /*
     731             :  * PI code:
     732             :  */
     733           0 : static int refill_pi_state_cache(void)
     734             : {
     735           0 :         struct futex_pi_state *pi_state;
     736             : 
     737           0 :         if (likely(current->pi_state_cache))
     738             :                 return 0;
     739             : 
     740           0 :         pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
     741             : 
     742           0 :         if (!pi_state)
     743             :                 return -ENOMEM;
     744             : 
     745           0 :         INIT_LIST_HEAD(&pi_state->list);
     746             :         /* pi_mutex gets initialized later */
     747           0 :         pi_state->owner = NULL;
     748           0 :         refcount_set(&pi_state->refcount, 1);
     749           0 :         pi_state->key = FUTEX_KEY_INIT;
     750             : 
     751           0 :         current->pi_state_cache = pi_state;
     752             : 
     753           0 :         return 0;
     754             : }
     755             : 
     756           0 : static struct futex_pi_state *alloc_pi_state(void)
     757             : {
     758           0 :         struct futex_pi_state *pi_state = current->pi_state_cache;
     759             : 
     760           0 :         WARN_ON(!pi_state);
     761           0 :         current->pi_state_cache = NULL;
     762             : 
     763           0 :         return pi_state;
     764             : }
     765             : 
     766           0 : static void pi_state_update_owner(struct futex_pi_state *pi_state,
     767             :                                   struct task_struct *new_owner)
     768             : {
     769           0 :         struct task_struct *old_owner = pi_state->owner;
     770             : 
     771           0 :         lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
     772             : 
     773           0 :         if (old_owner) {
     774           0 :                 raw_spin_lock(&old_owner->pi_lock);
     775           0 :                 WARN_ON(list_empty(&pi_state->list));
     776           0 :                 list_del_init(&pi_state->list);
     777           0 :                 raw_spin_unlock(&old_owner->pi_lock);
     778             :         }
     779             : 
     780           0 :         if (new_owner) {
     781           0 :                 raw_spin_lock(&new_owner->pi_lock);
     782           0 :                 WARN_ON(!list_empty(&pi_state->list));
     783           0 :                 list_add(&pi_state->list, &new_owner->pi_state_list);
     784           0 :                 pi_state->owner = new_owner;
     785           0 :                 raw_spin_unlock(&new_owner->pi_lock);
     786             :         }
     787           0 : }
     788             : 
     789           0 : static void get_pi_state(struct futex_pi_state *pi_state)
     790             : {
     791           0 :         WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
     792           0 : }
     793             : 
     794             : /*
     795             :  * Drops a reference to the pi_state object and frees or caches it
     796             :  * when the last reference is gone.
     797             :  */
     798           0 : static void put_pi_state(struct futex_pi_state *pi_state)
     799             : {
     800           0 :         if (!pi_state)
     801             :                 return;
     802             : 
     803           0 :         if (!refcount_dec_and_test(&pi_state->refcount))
     804             :                 return;
     805             : 
     806             :         /*
     807             :          * If pi_state->owner is NULL, the owner is most probably dying
     808             :          * and has cleaned up the pi_state already
     809             :          */
     810           0 :         if (pi_state->owner) {
     811           0 :                 unsigned long flags;
     812             : 
     813           0 :                 raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
     814           0 :                 pi_state_update_owner(pi_state, NULL);
     815           0 :                 rt_mutex_proxy_unlock(&pi_state->pi_mutex);
     816           0 :                 raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
     817             :         }
     818             : 
     819           0 :         if (current->pi_state_cache) {
     820           0 :                 kfree(pi_state);
     821             :         } else {
     822             :                 /*
     823             :                  * pi_state->list is already empty.
     824             :                  * clear pi_state->owner.
     825             :                  * refcount is at 0 - put it back to 1.
     826             :                  */
     827           0 :                 pi_state->owner = NULL;
     828           0 :                 refcount_set(&pi_state->refcount, 1);
     829           0 :                 current->pi_state_cache = pi_state;
     830             :         }
     831             : }
     832             : 
     833             : #ifdef CONFIG_FUTEX_PI
     834             : 
     835             : /*
     836             :  * This task is holding PI mutexes at exit time => bad.
     837             :  * Kernel cleans up PI-state, but userspace is likely hosed.
     838             :  * (Robust-futex cleanup is separate and might save the day for userspace.)
     839             :  */
     840           0 : static void exit_pi_state_list(struct task_struct *curr)
     841             : {
     842           0 :         struct list_head *next, *head = &curr->pi_state_list;
     843           0 :         struct futex_pi_state *pi_state;
     844           0 :         struct futex_hash_bucket *hb;
     845           0 :         union futex_key key = FUTEX_KEY_INIT;
     846             : 
     847           0 :         if (!futex_cmpxchg_enabled)
     848           0 :                 return;
     849             :         /*
      850             :          * We are a ZOMBIE and nobody can enqueue itself on
      851             :          * pi_state_list anymore, but we have to be careful
      852             :          * about waiters unqueueing themselves:
     853             :          */
     854           0 :         raw_spin_lock_irq(&curr->pi_lock);
     855           0 :         while (!list_empty(head)) {
     856           0 :                 next = head->next;
     857           0 :                 pi_state = list_entry(next, struct futex_pi_state, list);
     858           0 :                 key = pi_state->key;
     859           0 :                 hb = hash_futex(&key);
     860             : 
     861             :                 /*
     862             :                  * We can race against put_pi_state() removing itself from the
     863             :                  * list (a waiter going away). put_pi_state() will first
     864             :                  * decrement the reference count and then modify the list, so
      865             :                  * it's possible to see the list entry but fail to acquire
      866             :                  * this reference.
     867             :                  *
     868             :                  * In that case; drop the locks to let put_pi_state() make
     869             :                  * progress and retry the loop.
     870             :                  */
     871           0 :                 if (!refcount_inc_not_zero(&pi_state->refcount)) {
     872           0 :                         raw_spin_unlock_irq(&curr->pi_lock);
     873           0 :                         cpu_relax();
     874           0 :                         raw_spin_lock_irq(&curr->pi_lock);
     875           0 :                         continue;
     876             :                 }
     877           0 :                 raw_spin_unlock_irq(&curr->pi_lock);
     878             : 
     879           0 :                 spin_lock(&hb->lock);
     880           0 :                 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
     881           0 :                 raw_spin_lock(&curr->pi_lock);
     882             :                 /*
     883             :                  * We dropped the pi-lock, so re-check whether this
     884             :                  * task still owns the PI-state:
     885             :                  */
     886           0 :                 if (head->next != next) {
     887             :                         /* retain curr->pi_lock for the loop invariant */
     888           0 :                         raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
     889           0 :                         spin_unlock(&hb->lock);
     890           0 :                         put_pi_state(pi_state);
     891           0 :                         continue;
     892             :                 }
     893             : 
     894           0 :                 WARN_ON(pi_state->owner != curr);
     895           0 :                 WARN_ON(list_empty(&pi_state->list));
     896           0 :                 list_del_init(&pi_state->list);
     897           0 :                 pi_state->owner = NULL;
     898             : 
     899           0 :                 raw_spin_unlock(&curr->pi_lock);
     900           0 :                 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
     901           0 :                 spin_unlock(&hb->lock);
     902             : 
     903           0 :                 rt_mutex_futex_unlock(&pi_state->pi_mutex);
     904           0 :                 put_pi_state(pi_state);
     905             : 
     906           0 :                 raw_spin_lock_irq(&curr->pi_lock);
     907             :         }
     908           0 :         raw_spin_unlock_irq(&curr->pi_lock);
     909             : }
     910             : #else
     911             : static inline void exit_pi_state_list(struct task_struct *curr) { }
     912             : #endif
     913             : 
     914             : /*
     915             :  * We need to check the following states:
     916             :  *
     917             :  *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
     918             :  *
     919             :  * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
     920             :  * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
     921             :  *
     922             :  * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
     923             :  *
     924             :  * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
     925             :  * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
     926             :  *
     927             :  * [6]  Found  | Found    | task      | 0         | 1      | Valid
     928             :  *
     929             :  * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
     930             :  *
     931             :  * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
     932             :  * [9]  Found  | Found    | task      | 0         | 0      | Invalid
     933             :  * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
     934             :  *
     935             :  * [1]  Indicates that the kernel can acquire the futex atomically. We
     936             :  *      came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
     937             :  *
     938             :  * [2]  Valid, if TID does not belong to a kernel thread. If no matching
     939             :  *      thread is found then it indicates that the owner TID has died.
     940             :  *
      941             :  * [3]  Invalid. The waiter is queued on a non-PI futex.
     942             :  *
     943             :  * [4]  Valid state after exit_robust_list(), which sets the user space
     944             :  *      value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
     945             :  *
     946             :  * [5]  The user space value got manipulated between exit_robust_list()
     947             :  *      and exit_pi_state_list()
     948             :  *
     949             :  * [6]  Valid state after exit_pi_state_list() which sets the new owner in
     950             :  *      the pi_state but cannot access the user space value.
     951             :  *
     952             :  * [7]  pi_state->owner can only be NULL when the OWNER_DIED bit is set.
     953             :  *
     954             :  * [8]  Owner and user space value match
     955             :  *
     956             :  * [9]  There is no transient state which sets the user space TID to 0
     957             :  *      except exit_robust_list(), but this is indicated by the
      958             :  *      FUTEX_OWNER_DIED bit. See [4].
     959             :  *
     960             :  * [10] There is no transient state which leaves owner and user space
     961             :  *      TID out of sync. Except one error case where the kernel is denied
     962             :  *      write access to the user address, see fixup_pi_state_owner().
     963             :  *
     964             :  *
     965             :  * Serialization and lifetime rules:
     966             :  *
     967             :  * hb->lock:
     968             :  *
     969             :  *      hb -> futex_q, relation
     970             :  *      futex_q -> pi_state, relation
     971             :  *
      972             :  *      (cannot be raw because an hb can contain an arbitrary number
      973             :  *       of futex_q's)
     974             :  *
     975             :  * pi_mutex->wait_lock:
     976             :  *
     977             :  *      {uval, pi_state}
     978             :  *
     979             :  *      (and pi_mutex 'obviously')
     980             :  *
     981             :  * p->pi_lock:
     982             :  *
     983             :  *      p->pi_state_list -> pi_state->list, relation
     984             :  *
     985             :  * pi_state->refcount:
     986             :  *
     987             :  *      pi_state lifetime
     988             :  *
     989             :  *
     990             :  * Lock order:
     991             :  *
     992             :  *   hb->lock
     993             :  *     pi_mutex->wait_lock
     994             :  *       p->pi_lock
     995             :  *
     996             :  */
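                      :
                      : /*
                      :  * [Editor's note] The lock order above in code form, as taken for
                      :  * example by exit_pi_state_list() earlier in this file:
                      :  *
                      :  *      spin_lock(&hb->lock);
                      :  *      raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
                      :  *      raw_spin_lock(&curr->pi_lock);
                      :  *      ...
                      :  *      raw_spin_unlock(&curr->pi_lock);
                      :  *      raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
                      :  *      spin_unlock(&hb->lock);
                      :  */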
     997             : 
     998             : /*
     999             :  * Validate that the existing waiter has a pi_state and sanity check
    1000             :  * the pi_state against the user space value. If correct, attach to
    1001             :  * it.
    1002             :  */
    1003           0 : static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
    1004             :                               struct futex_pi_state *pi_state,
    1005             :                               struct futex_pi_state **ps)
    1006             : {
    1007           0 :         pid_t pid = uval & FUTEX_TID_MASK;
    1008           0 :         u32 uval2;
    1009           0 :         int ret;
    1010             : 
    1011             :         /*
    1012             :          * Userspace might have messed up non-PI and PI futexes [3]
    1013             :          */
    1014           0 :         if (unlikely(!pi_state))
    1015             :                 return -EINVAL;
    1016             : 
    1017             :         /*
    1018             :          * We get here with hb->lock held, and having found a
    1019             :          * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
    1020             :          * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
    1021             :          * which in turn means that futex_lock_pi() still has a reference on
    1022             :          * our pi_state.
    1023             :          *
    1024             :          * The waiter holding a reference on @pi_state also protects against
    1025             :          * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
    1026             :          * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
    1027             :          * free pi_state before we can take a reference ourselves.
    1028             :          */
    1029           0 :         WARN_ON(!refcount_read(&pi_state->refcount));
    1030             : 
    1031             :         /*
    1032             :          * Now that we have a pi_state, we can acquire wait_lock
    1033             :          * and do the state validation.
    1034             :          */
    1035           0 :         raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
    1036             : 
    1037             :         /*
    1038             :          * Since {uval, pi_state} is serialized by wait_lock, and our current
    1039             :          * uval was read without holding it, it can have changed. Verify it
    1040             :          * still is what we expect it to be, otherwise retry the entire
    1041             :          * operation.
    1042             :          */
    1043           0 :         if (get_futex_value_locked(&uval2, uaddr))
    1044           0 :                 goto out_efault;
    1045             : 
    1046           0 :         if (uval != uval2)
    1047           0 :                 goto out_eagain;
    1048             : 
    1049             :         /*
    1050             :          * Handle the owner died case:
    1051             :          */
    1052           0 :         if (uval & FUTEX_OWNER_DIED) {
    1053             :                 /*
    1054             :                  * exit_pi_state_list sets owner to NULL and wakes the
    1055             :                  * topmost waiter. The task which acquires the
    1056             :                  * pi_state->rt_mutex will fixup owner.
    1057             :                  */
    1058           0 :                 if (!pi_state->owner) {
    1059             :                         /*
    1060             :                          * No pi state owner, but the user space TID
    1061             :                          * is not 0. Inconsistent state. [5]
    1062             :                          */
    1063           0 :                         if (pid)
    1064           0 :                                 goto out_einval;
    1065             :                         /*
    1066             :                          * Take a ref on the state and return success. [4]
    1067             :                          */
    1068           0 :                         goto out_attach;
    1069             :                 }
    1070             : 
    1071             :                 /*
    1072             :                  * If TID is 0, then either the dying owner has not
    1073             :                  * yet executed exit_pi_state_list() or some waiter
    1074             :                  * acquired the rtmutex in the pi state, but did not
    1075             :                  * yet fixup the TID in user space.
    1076             :                  *
    1077             :                  * Take a ref on the state and return success. [6]
    1078             :                  */
    1079           0 :                 if (!pid)
    1080           0 :                         goto out_attach;
    1081             :         } else {
    1082             :                 /*
    1083             :                  * If the owner died bit is not set, then the pi_state
    1084             :                  * must have an owner. [7]
    1085             :                  */
    1086           0 :                 if (!pi_state->owner)
    1087           0 :                         goto out_einval;
    1088             :         }
    1089             : 
    1090             :         /*
    1091             :          * Bail out if user space manipulated the futex value. If pi
    1092             :          * state exists then the owner TID must be the same as the
    1093             :          * user space TID. [9/10]
    1094             :          */
    1095           0 :         if (pid != task_pid_vnr(pi_state->owner))
    1096           0 :                 goto out_einval;
    1097             : 
    1098           0 : out_attach:
    1099           0 :         get_pi_state(pi_state);
    1100           0 :         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    1101           0 :         *ps = pi_state;
    1102           0 :         return 0;
    1103             : 
    1104           0 : out_einval:
    1105           0 :         ret = -EINVAL;
    1106           0 :         goto out_error;
    1107             : 
    1108           0 : out_eagain:
    1109           0 :         ret = -EAGAIN;
    1110           0 :         goto out_error;
    1111             : 
    1112           0 : out_efault:
    1113           0 :         ret = -EFAULT;
    1114           0 :         goto out_error;
    1115             : 
    1116           0 : out_error:
    1117           0 :         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    1118           0 :         return ret;
    1119             : }
    1120             : 
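
attach_to_pi_state() above is an instance of a generic pattern: read a value optimistically, then re-read it under the lock that serializes writers, and retry (-EAGAIN) if it changed in between. A self-contained user-space sketch of the same pattern with C11 atomics and a pthread mutex; all names (serializer, val, attach_if_unchanged) are invented for illustration:

#include <pthread.h>
#include <stdatomic.h>

static pthread_mutex_t serializer = PTHREAD_MUTEX_INITIALIZER;
static _Atomic unsigned int val;

/* Returns 0 on success, -1 if 'val' changed and the caller must retry. */
static int attach_if_unchanged(unsigned int expected)
{
        int ret = 0;

        pthread_mutex_lock(&serializer);
        /* Re-read under the lock that serializes all writers. */
        if (atomic_load(&val) != expected) {
                ret = -1;       /* like -EAGAIN: retry the whole operation */
        } else {
                /* ... attach: 'val' cannot change while we hold the lock ... */
        }
        pthread_mutex_unlock(&serializer);
        return ret;
}
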
    1121             : /**
    1122             :  * wait_for_owner_exiting - Block until the owner has exited
    1123             :  * @ret: owner's current futex lock status
    1124             :  * @exiting:    Pointer to the exiting task
    1125             :  *
    1126             :  * Caller must hold a refcount on @exiting.
    1127             :  */
    1128           0 : static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
    1129             : {
    1130           0 :         if (ret != -EBUSY) {
    1131           0 :                 WARN_ON_ONCE(exiting);
    1132             :                 return;
    1133             :         }
    1134             : 
    1135           0 :         if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
    1136             :                 return;
    1137             : 
    1138           0 :         mutex_lock(&exiting->futex_exit_mutex);
    1139             :         /*
    1140             :          * No point in doing state checking here. If the waiter got here
    1141             :          * while the task was in exec()->exec_futex_release() then it can
     1142             :          * have any FUTEX_STATE_* value when the waiter has acquired the
     1143             :          * mutex: FUTEX_STATE_OK if it is running, EXITING or DEAD if it
     1144             :          * has reached exit() already. Highly unlikely and not a problem;
     1145             :          * just one more round through the futex maze.
    1146             :          */
    1147           0 :         mutex_unlock(&exiting->futex_exit_mutex);
    1148             : 
    1149           0 :         put_task_struct(exiting);
    1150             : }
    1151             : 
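
The lock/unlock pair around the empty critical section above is a handshake, not mutual exclusion: acquiring futex_exit_mutex cannot succeed before the exiting task has dropped it, so the acquisition itself is the wait. The same idiom in portable code, as a minimal sketch (wait_for_gate is an invented name):

#include <pthread.h>

/*
 * Wait until the owner of 'gate' has left the region it protects.
 * The critical section is intentionally empty: the acquisition
 * itself is the wait, just like futex_exit_mutex above.
 */
static void wait_for_gate(pthread_mutex_t *gate)
{
        pthread_mutex_lock(gate);
        pthread_mutex_unlock(gate);
}
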
    1152           0 : static int handle_exit_race(u32 __user *uaddr, u32 uval,
    1153             :                             struct task_struct *tsk)
    1154             : {
    1155           0 :         u32 uval2;
    1156             : 
    1157             :         /*
    1158             :          * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
    1159             :          * caller that the alleged owner is busy.
    1160             :          */
    1161           0 :         if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
    1162             :                 return -EBUSY;
    1163             : 
    1164             :         /*
    1165             :          * Reread the user space value to handle the following situation:
    1166             :          *
    1167             :          * CPU0                         CPU1
    1168             :          *
    1169             :          * sys_exit()                   sys_futex()
    1170             :          *  do_exit()                    futex_lock_pi()
    1171             :          *                                futex_lock_pi_atomic()
    1172             :          *   exit_signals(tsk)              No waiters:
    1173             :          *    tsk->flags |= PF_EXITING;          *uaddr == 0x00000PID
    1174             :          *  mm_release(tsk)                 Set waiter bit
    1175             :          *   exit_robust_list(tsk) {        *uaddr = 0x80000PID;
    1176             :          *      Set owner died              attach_to_pi_owner() {
    1177             :          *    *uaddr = 0xC0000000;           tsk = get_task(PID);
     1178             :          *   }                               if (!(tsk->flags & PF_EXITING)) {
    1179             :          *  ...                                attach();
    1180             :          *  tsk->futex_state =               } else {
    1181             :          *      FUTEX_STATE_DEAD;              if (tsk->futex_state !=
    1182             :          *                                        FUTEX_STATE_DEAD)
    1183             :          *                                       return -EAGAIN;
    1184             :          *                                     return -ESRCH; <--- FAIL
    1185             :          *                                   }
    1186             :          *
    1187             :          * Returning ESRCH unconditionally is wrong here because the
    1188             :          * user space value has been changed by the exiting task.
    1189             :          *
    1190             :          * The same logic applies to the case where the exiting task is
    1191             :          * already gone.
    1192             :          */
    1193           0 :         if (get_futex_value_locked(&uval2, uaddr))
    1194             :                 return -EFAULT;
    1195             : 
    1196             :         /* If the user space value has changed, try again. */
    1197           0 :         if (uval2 != uval)
    1198           0 :                 return -EAGAIN;
    1199             : 
    1200             :         /*
    1201             :          * The exiting task did not have a robust list, the robust list was
    1202             :          * corrupted or the user space value in *uaddr is simply bogus.
    1203             :          * Give up and tell user space.
    1204             :          */
    1205             :         return -ESRCH;
    1206             : }
    1207             : 
    1208             : /*
     1209             :  * Look up the task for the TID provided from user space and attach to
    1210             :  * it after doing proper sanity checks.
    1211             :  */
    1212           0 : static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
    1213             :                               struct futex_pi_state **ps,
    1214             :                               struct task_struct **exiting)
    1215             : {
    1216           0 :         pid_t pid = uval & FUTEX_TID_MASK;
    1217           0 :         struct futex_pi_state *pi_state;
    1218           0 :         struct task_struct *p;
    1219             : 
    1220             :         /*
    1221             :          * We are the first waiter - try to look up the real owner and attach
    1222             :          * the new pi_state to it, but bail out when TID = 0 [1]
    1223             :          *
    1224             :          * The !pid check is paranoid. None of the call sites should end up
    1225             :          * with pid == 0, but better safe than sorry. Let the caller retry
     1226             :          * with pid == 0, but better safe than sorry. Let the caller retry.
    1227           0 :         if (!pid)
    1228             :                 return -EAGAIN;
    1229           0 :         p = find_get_task_by_vpid(pid);
    1230           0 :         if (!p)
    1231           0 :                 return handle_exit_race(uaddr, uval, NULL);
    1232             : 
    1233           0 :         if (unlikely(p->flags & PF_KTHREAD)) {
    1234           0 :                 put_task_struct(p);
    1235           0 :                 return -EPERM;
    1236             :         }
    1237             : 
    1238             :         /*
     1239             :          * We need to look at the task state to figure out whether the
    1240             :          * task is exiting. To protect against the change of the task state
    1241             :          * in futex_exit_release(), we do this protected by p->pi_lock:
    1242             :          */
    1243           0 :         raw_spin_lock_irq(&p->pi_lock);
    1244           0 :         if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
    1245             :                 /*
    1246             :                  * The task is on the way out. When the futex state is
    1247             :                  * FUTEX_STATE_DEAD, we know that the task has finished
    1248             :                  * the cleanup:
    1249             :                  */
    1250           0 :                 int ret = handle_exit_race(uaddr, uval, p);
    1251             : 
    1252           0 :                 raw_spin_unlock_irq(&p->pi_lock);
    1253             :                 /*
    1254             :                  * If the owner task is between FUTEX_STATE_EXITING and
    1255             :                  * FUTEX_STATE_DEAD then store the task pointer and keep
    1256             :                  * the reference on the task struct. The calling code will
    1257             :                  * drop all locks, wait for the task to reach
    1258             :                  * FUTEX_STATE_DEAD and then drop the refcount. This is
    1259             :                  * required to prevent a live lock when the current task
    1260             :                  * preempted the exiting task between the two states.
    1261             :                  */
    1262           0 :                 if (ret == -EBUSY)
    1263           0 :                         *exiting = p;
    1264             :                 else
    1265           0 :                         put_task_struct(p);
    1266           0 :                 return ret;
    1267             :         }
    1268             : 
    1269             :         /*
    1270             :          * No existing pi state. First waiter. [2]
    1271             :          *
    1272             :          * This creates pi_state, we have hb->lock held, this means nothing can
    1273             :          * observe this state, wait_lock is irrelevant.
    1274             :          */
    1275           0 :         pi_state = alloc_pi_state();
    1276             : 
    1277             :         /*
    1278             :          * Initialize the pi_mutex in locked state and make @p
    1279             :          * the owner of it:
    1280             :          */
    1281           0 :         rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
    1282             : 
    1283             :         /* Store the key for possible exit cleanups: */
    1284           0 :         pi_state->key = *key;
    1285             : 
    1286           0 :         WARN_ON(!list_empty(&pi_state->list));
    1287           0 :         list_add(&pi_state->list, &p->pi_state_list);
    1288             :         /*
    1289             :          * Assignment without holding pi_state->pi_mutex.wait_lock is safe
    1290             :          * because there is no concurrency as the object is not published yet.
    1291             :          */
    1292           0 :         pi_state->owner = p;
    1293           0 :         raw_spin_unlock_irq(&p->pi_lock);
    1294             : 
    1295           0 :         put_task_struct(p);
    1296             : 
    1297           0 :         *ps = pi_state;
    1298             : 
    1299           0 :         return 0;
    1300             : }
    1301             : 
    1302           0 : static int lookup_pi_state(u32 __user *uaddr, u32 uval,
    1303             :                            struct futex_hash_bucket *hb,
    1304             :                            union futex_key *key, struct futex_pi_state **ps,
    1305             :                            struct task_struct **exiting)
    1306             : {
    1307           0 :         struct futex_q *top_waiter = futex_top_waiter(hb, key);
    1308             : 
    1309             :         /*
    1310             :          * If there is a waiter on that futex, validate it and
    1311             :          * attach to the pi_state when the validation succeeds.
    1312             :          */
    1313           0 :         if (top_waiter)
    1314           0 :                 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
    1315             : 
    1316             :         /*
    1317             :          * We are the first waiter - try to look up the owner based on
    1318             :          * @uval and attach to it.
    1319             :          */
    1320           0 :         return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
    1321             : }
    1322             : 
    1323           0 : static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
    1324             : {
    1325           0 :         int err;
    1326           0 :         u32 curval;
    1327             : 
    1328           0 :         if (unlikely(should_fail_futex(true)))
    1329             :                 return -EFAULT;
    1330             : 
    1331           0 :         err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
    1332           0 :         if (unlikely(err))
    1333             :                 return err;
    1334             : 
    1335             :         /* If user space value changed, let the caller retry */
    1336           0 :         return curval != uval ? -EAGAIN : 0;
    1337             : }
    1338             : 
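
lock_pi_update_atomic() is the kernel half of the PI locking protocol described in futex(2): user space first tries to claim an unowned futex by atomically swapping 0 for its own TID, and only enters the kernel on contention. A minimal sketch of that fast path, assuming glibc's gettid() (needs _GNU_SOURCE); sys_futex and pi_lock are invented names, and error handling and robust-list linkage are omitted:

#define _GNU_SOURCE
#include <linux/futex.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

static long sys_futex(_Atomic unsigned int *uaddr, int op, unsigned int val)
{
        return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void pi_lock(_Atomic unsigned int *futex_word)
{
        unsigned int expected = 0;

        /* Fast path: CAS 0 -> TID claims an unowned futex, no syscall. */
        if (atomic_compare_exchange_strong(futex_word, &expected, gettid()))
                return;

        /* Contended: the kernel queues us and priority-boosts the owner. */
        while (sys_futex(futex_word, FUTEX_LOCK_PI, 0) == -1)
                ;       /* sketch only; real code must check errno */
}
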
    1339             : /**
    1340             :  * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
    1341             :  * @uaddr:              the pi futex user address
    1342             :  * @hb:                 the pi futex hash bucket
    1343             :  * @key:                the futex key associated with uaddr and hb
    1344             :  * @ps:                 the pi_state pointer where we store the result of the
    1345             :  *                      lookup
    1346             :  * @task:               the task to perform the atomic lock work for.  This will
    1347             :  *                      be "current" except in the case of requeue pi.
    1348             :  * @exiting:            Pointer to store the task pointer of the owner task
    1349             :  *                      which is in the middle of exiting
    1350             :  * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
    1351             :  *
    1352             :  * Return:
    1353             :  *  -  0 - ready to wait;
    1354             :  *  -  1 - acquired the lock;
    1355             :  *  - <0 - error
    1356             :  *
    1357             :  * The hb->lock and futex_key refs shall be held by the caller.
    1358             :  *
    1359             :  * @exiting is only set when the return value is -EBUSY. If so, this holds
    1360             :  * a refcount on the exiting task on return and the caller needs to drop it
    1361             :  * after waiting for the exit to complete.
    1362             :  */
    1363           0 : static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
    1364             :                                 union futex_key *key,
    1365             :                                 struct futex_pi_state **ps,
    1366             :                                 struct task_struct *task,
    1367             :                                 struct task_struct **exiting,
    1368             :                                 int set_waiters)
    1369             : {
    1370           0 :         u32 uval, newval, vpid = task_pid_vnr(task);
    1371           0 :         struct futex_q *top_waiter;
    1372           0 :         int ret;
    1373             : 
    1374             :         /*
    1375             :          * Read the user space value first so we can validate a few
    1376             :          * things before proceeding further.
    1377             :          */
    1378           0 :         if (get_futex_value_locked(&uval, uaddr))
    1379             :                 return -EFAULT;
    1380             : 
    1381           0 :         if (unlikely(should_fail_futex(true)))
    1382             :                 return -EFAULT;
    1383             : 
    1384             :         /*
    1385             :          * Detect deadlocks.
    1386             :          */
    1387           0 :         if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
    1388             :                 return -EDEADLK;
    1389             : 
    1390           0 :         if ((unlikely(should_fail_futex(true))))
    1391             :                 return -EDEADLK;
    1392             : 
    1393             :         /*
     1394             :          * Look up existing state first. If it exists, try to attach to
    1395             :          * its pi_state.
    1396             :          */
    1397           0 :         top_waiter = futex_top_waiter(hb, key);
    1398           0 :         if (top_waiter)
    1399           0 :                 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
    1400             : 
    1401             :         /*
     1402             :          * No waiter found. If the user space TID is 0, the futex is not
     1403             :          * owned: either the waiters bit or the owner died bit is set, we
     1404             :          * were called from the requeue_pi path, or user space raced the
     1405             :          * syscall.
    1406             :          */
    1407           0 :         if (!(uval & FUTEX_TID_MASK)) {
    1408             :                 /*
    1409             :                  * We take over the futex. No other waiters and the user space
    1410             :                  * TID is 0. We preserve the owner died bit.
    1411             :                  */
    1412           0 :                 newval = uval & FUTEX_OWNER_DIED;
    1413           0 :                 newval |= vpid;
    1414             : 
    1415             :                 /* The futex requeue_pi code can enforce the waiters bit */
    1416           0 :                 if (set_waiters)
    1417           0 :                         newval |= FUTEX_WAITERS;
    1418             : 
    1419           0 :                 ret = lock_pi_update_atomic(uaddr, uval, newval);
    1420             :                 /* If the take over worked, return 1 */
    1421           0 :                 return ret < 0 ? ret : 1;
    1422             :         }
    1423             : 
    1424             :         /*
    1425             :          * First waiter. Set the waiters bit before attaching ourself to
    1426             :          * the owner. If owner tries to unlock, it will be forced into
    1427             :          * the kernel and blocked on hb->lock.
    1428             :          */
    1429           0 :         newval = uval | FUTEX_WAITERS;
    1430           0 :         ret = lock_pi_update_atomic(uaddr, uval, newval);
    1431           0 :         if (ret)
    1432             :                 return ret;
    1433             :         /*
    1434             :          * If the update of the user space value succeeded, we try to
    1435             :          * attach to the owner. If that fails, no harm done, we only
    1436             :          * set the FUTEX_WAITERS bit in the user space variable.
    1437             :          */
    1438           0 :         return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
    1439             : }
    1440             : 
    1441             : /**
    1442             :  * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
    1443             :  * @q:  The futex_q to unqueue
    1444             :  *
    1445             :  * The q->lock_ptr must not be NULL and must be held by the caller.
    1446             :  */
    1447         166 : static void __unqueue_futex(struct futex_q *q)
    1448             : {
    1449         166 :         struct futex_hash_bucket *hb;
    1450             : 
    1451         166 :         if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
    1452             :                 return;
    1453         332 :         lockdep_assert_held(q->lock_ptr);
    1454             : 
    1455         166 :         hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
    1456         166 :         plist_del(&q->list, &hb->chain);
    1457         166 :         hb_waiters_dec(hb);
    1458             : }
    1459             : 
    1460             : /*
    1461             :  * The hash bucket lock must be held when this is called.
    1462             :  * Afterwards, the futex_q must not be accessed. Callers
     1463             :  * must ensure that wake_up_q() is called later for the actual
     1464             :  * wakeups to occur.
    1465             :  */
    1466         166 : static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
    1467             : {
    1468         166 :         struct task_struct *p = q->task;
    1469             : 
    1470         332 :         if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
    1471             :                 return;
    1472             : 
    1473         166 :         get_task_struct(p);
    1474         166 :         __unqueue_futex(q);
    1475             :         /*
    1476             :          * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
    1477             :          * is written, without taking any locks. This is possible in the event
    1478             :          * of a spurious wakeup, for example. A memory barrier is required here
    1479             :          * to prevent the following store to lock_ptr from getting ahead of the
    1480             :          * plist_del in __unqueue_futex().
    1481             :          */
    1482         166 :         smp_store_release(&q->lock_ptr, NULL);
    1483             : 
    1484             :         /*
     1485             :          * Queue the task for later wakeup, after we've released
    1486             :          * the hb->lock.
    1487             :          */
    1488         166 :         wake_q_add_safe(wake_q, p);
    1489             : }
    1490             : 
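
The smp_store_release() above is the classic publication pattern: everything written before it (here the plist_del() in __unqueue_futex()) must be visible to the waiter before the NULL store that releases it. The equivalent in portable C11, as a sketch with invented names:

#include <stdatomic.h>
#include <stddef.h>

struct waiter {
        _Atomic(void *) lock_ptr;       /* NULL means: dequeued, go ahead */
        /* ... queue linkage ... */
};

static void publish_dequeue(struct waiter *w)
{
        /* ... unlink w from the queue with plain stores ... */

        /*
         * Release store: orders the unlink above before the NULL that
         * the waiter's acquire load of lock_ptr will observe.
         */
        atomic_store_explicit(&w->lock_ptr, NULL, memory_order_release);
}
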
    1491             : /*
    1492             :  * Caller must hold a reference on @pi_state.
    1493             :  */
    1494           0 : static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
    1495             : {
    1496           0 :         u32 curval, newval;
    1497           0 :         struct task_struct *new_owner;
    1498           0 :         bool postunlock = false;
    1499           0 :         DEFINE_WAKE_Q(wake_q);
    1500           0 :         int ret = 0;
    1501             : 
    1502           0 :         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
    1503           0 :         if (WARN_ON_ONCE(!new_owner)) {
    1504             :                 /*
    1505             :                  * As per the comment in futex_unlock_pi() this should not happen.
    1506             :                  *
    1507             :                  * When this happens, give up our locks and try again, giving
    1508             :                  * the futex_lock_pi() instance time to complete, either by
    1509             :                  * waiting on the rtmutex or removing itself from the futex
    1510             :                  * queue.
    1511             :                  */
    1512           0 :                 ret = -EAGAIN;
    1513           0 :                 goto out_unlock;
    1514             :         }
    1515             : 
    1516             :         /*
    1517             :          * We pass it to the next owner. The WAITERS bit is always kept
    1518             :          * enabled while there is PI state around. We cleanup the owner
    1519             :          * died bit, because we are the owner.
    1520             :          */
    1521           0 :         newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
    1522             : 
    1523           0 :         if (unlikely(should_fail_futex(true))) {
    1524             :                 ret = -EFAULT;
    1525             :                 goto out_unlock;
    1526             :         }
    1527             : 
    1528           0 :         ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
    1529           0 :         if (!ret && (curval != uval)) {
    1530             :                 /*
     1531             :                  * If an unconditional UNLOCK_PI operation (user space did not
    1532             :                  * try the TID->0 transition) raced with a waiter setting the
    1533             :                  * FUTEX_WAITERS flag between get_user() and locking the hash
    1534             :                  * bucket lock, retry the operation.
    1535             :                  */
    1536           0 :                 if ((FUTEX_TID_MASK & curval) == uval)
    1537             :                         ret = -EAGAIN;
    1538             :                 else
    1539           0 :                         ret = -EINVAL;
    1540             :         }
    1541             : 
    1542           0 :         if (!ret) {
    1543             :                 /*
    1544             :                  * This is a point of no return; once we modified the uval
    1545             :                  * there is no going back and subsequent operations must
    1546             :                  * not fail.
    1547             :                  */
    1548           0 :                 pi_state_update_owner(pi_state, new_owner);
    1549           0 :                 postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
    1550             :         }
    1551             : 
    1552           0 : out_unlock:
    1553           0 :         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    1554             : 
    1555           0 :         if (postunlock)
    1556           0 :                 rt_mutex_postunlock(&wake_q);
    1557             : 
    1558           0 :         return ret;
    1559             : }
    1560             : 
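
wake_futex_pi() is the kernel half of FUTEX_UNLOCK_PI. The user-space side is the mirror image of the locking fast path: try to swap the TID back to 0, and enter the kernel only when that fails because waiter bits are set. A hedged sketch, reusing sys_futex() and the conventions from the locking sketch earlier (pi_unlock is an invented name):

static void pi_unlock(_Atomic unsigned int *futex_word)
{
        unsigned int expected = gettid();

        /* Fast path: TID -> 0, possible only while FUTEX_WAITERS is clear. */
        if (atomic_compare_exchange_strong(futex_word, &expected, 0))
                return;

        /* Waiters exist: let the kernel hand the lock to the top waiter. */
        sys_futex(futex_word, FUTEX_UNLOCK_PI, 0);
}
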
    1561             : /*
    1562             :  * Express the locking dependencies for lockdep:
    1563             :  */
    1564             : static inline void
    1565           0 : double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
    1566             : {
    1567           0 :         if (hb1 <= hb2) {
    1568           0 :                 spin_lock(&hb1->lock);
    1569           0 :                 if (hb1 < hb2)
    1570           0 :                         spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
    1571             :         } else { /* hb1 > hb2 */
    1572           0 :                 spin_lock(&hb2->lock);
    1573           0 :                 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
    1574             :         }
    1575           0 : }
    1576             : 
    1577             : static inline void
    1578           0 : double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
    1579             : {
    1580           0 :         spin_unlock(&hb1->lock);
    1581           0 :         if (hb1 != hb2)
    1582           0 :                 spin_unlock(&hb2->lock);
    1583           0 : }
    1584             : 
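
double_lock_hb() prevents ABBA deadlock by imposing a global order: whichever caller arrives first, the lower-addressed bucket lock is always taken first. The same technique with pthread mutexes, as a self-contained sketch (the uintptr_t casts sidestep the strict-C rule about comparing unrelated pointers):

#include <pthread.h>
#include <stdint.h>

/* Take two locks in a globally consistent (address) order. */
static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);          /* same bucket: one lock */
        } else if ((uintptr_t)a < (uintptr_t)b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}
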
    1585             : /*
    1586             :  * Wake up waiters matching bitset queued on this futex (uaddr).
    1587             :  */
    1588             : static int
    1589         594 : futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
    1590             : {
    1591         594 :         struct futex_hash_bucket *hb;
    1592         594 :         struct futex_q *this, *next;
    1593         594 :         union futex_key key = FUTEX_KEY_INIT;
    1594         594 :         int ret;
    1595         594 :         DEFINE_WAKE_Q(wake_q);
    1596             : 
    1597         594 :         if (!bitset)
    1598             :                 return -EINVAL;
    1599             : 
    1600         594 :         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
    1601         594 :         if (unlikely(ret != 0))
    1602             :                 return ret;
    1603             : 
    1604         594 :         hb = hash_futex(&key);
    1605             : 
     1606             :         /* Make sure we really have tasks to wake up */
    1607        1188 :         if (!hb_waiters_pending(hb))
    1608             :                 return ret;
    1609             : 
    1610         167 :         spin_lock(&hb->lock);
    1611             : 
    1612         169 :         plist_for_each_entry_safe(this, next, &hb->chain, list) {
     1613         167 :                 if (match_futex(&this->key, &key)) {
    1614         166 :                         if (this->pi_state || this->rt_waiter) {
    1615             :                                 ret = -EINVAL;
    1616             :                                 break;
    1617             :                         }
    1618             : 
    1619             :                         /* Check if one of the bits is set in both bitsets */
    1620         166 :                         if (!(this->bitset & bitset))
    1621           0 :                                 continue;
    1622             : 
    1623         166 :                         mark_wake_futex(&wake_q, this);
    1624         166 :                         if (++ret >= nr_wake)
    1625             :                                 break;
    1626             :                 }
    1627             :         }
    1628             : 
    1629         167 :         spin_unlock(&hb->lock);
    1630         167 :         wake_up_q(&wake_q);
    1631         167 :         return ret;
    1632             : }
    1633             : 
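
futex_wake() above is the kernel side of FUTEX_WAKE. For reference, the canonical user-space pairing per futex(2) looks like the sketch below; the plain FUTEX_WAIT/FUTEX_WAKE ops behave like the bitset variants with a bitset of FUTEX_BITSET_MATCH_ANY. Function names are invented:

#include <errno.h>
#include <limits.h>
#include <linux/futex.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Block while *word == expected; the kernel re-checks under hb->lock. */
static void futex_wait_on(_Atomic unsigned int *word, unsigned int expected)
{
        while (atomic_load(word) == expected) {
                if (syscall(SYS_futex, word, FUTEX_WAIT, expected,
                            NULL, NULL, 0) == -1 &&
                    errno != EAGAIN && errno != EINTR)
                        break;          /* sketch: real code handles errors */
        }
}

/* Wake up to INT_MAX waiters queued on 'word'. */
static void futex_wake_all(_Atomic unsigned int *word)
{
        syscall(SYS_futex, word, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
}
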
    1634           0 : static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
    1635             : {
    1636           0 :         unsigned int op =         (encoded_op & 0x70000000) >> 28;
    1637           0 :         unsigned int cmp =        (encoded_op & 0x0f000000) >> 24;
    1638           0 :         int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
    1639           0 :         int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
    1640           0 :         int oldval, ret;
    1641             : 
    1642           0 :         if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
    1643           0 :                 if (oparg < 0 || oparg > 31) {
    1644           0 :                         char comm[sizeof(current->comm)];
    1645             :                         /*
    1646             :                          * kill this print and return -EINVAL when userspace
    1647             :                          * is sane again
    1648             :                          */
    1649           0 :                         pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
    1650             :                                         get_task_comm(comm, current), oparg);
    1651           0 :                         oparg &= 31;
    1652             :                 }
    1653           0 :                 oparg = 1 << oparg;
    1654             :         }
    1655             : 
    1656           0 :         pagefault_disable();
    1657           0 :         ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
    1658           0 :         pagefault_enable();
    1659           0 :         if (ret)
    1660             :                 return ret;
    1661             : 
    1662           0 :         switch (cmp) {
    1663           0 :         case FUTEX_OP_CMP_EQ:
    1664           0 :                 return oldval == cmparg;
    1665           0 :         case FUTEX_OP_CMP_NE:
    1666           0 :                 return oldval != cmparg;
    1667           0 :         case FUTEX_OP_CMP_LT:
    1668           0 :                 return oldval < cmparg;
    1669           0 :         case FUTEX_OP_CMP_GE:
    1670           0 :                 return oldval >= cmparg;
    1671           0 :         case FUTEX_OP_CMP_LE:
    1672           0 :                 return oldval <= cmparg;
    1673           0 :         case FUTEX_OP_CMP_GT:
    1674           0 :                 return oldval > cmparg;
    1675             :         default:
    1676             :                 return -ENOSYS;
    1677             :         }
    1678             : }
    1679             : 
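
The shifts above mirror the FUTEX_OP() encoding from the uapi header: 4 bits of op (including the OPARG_SHIFT flag), 4 bits of comparison, then 12-bit oparg and cmparg fields. A sketch of the matching encoder as a plain function (encode_futex_op is an invented name):

#include <linux/futex.h>

/* Pack a FUTEX_WAKE_OP operation the way futex_atomic_op_inuser() unpacks it. */
static unsigned int encode_futex_op(unsigned int op, unsigned int oparg,
                                    unsigned int cmp, unsigned int cmparg)
{
        return ((op & 0xf) << 28) | ((cmp & 0xf) << 24) |
               ((oparg & 0xfff) << 12) | (cmparg & 0xfff);
}

/*
 * Example: FUTEX_OP_SET writes oparg (1) to *uaddr2; waiters on uaddr2
 * are then woken only if the old value compared equal to 0:
 *
 *      encode_futex_op(FUTEX_OP_SET, 1, FUTEX_OP_CMP_EQ, 0);
 */
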
    1680             : /*
    1681             :  * Wake up all waiters hashed on the physical page that is mapped
    1682             :  * to this virtual address:
    1683             :  */
    1684             : static int
    1685           0 : futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
    1686             :               int nr_wake, int nr_wake2, int op)
    1687             : {
    1688           0 :         union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
    1689           0 :         struct futex_hash_bucket *hb1, *hb2;
    1690           0 :         struct futex_q *this, *next;
    1691           0 :         int ret, op_ret;
    1692           0 :         DEFINE_WAKE_Q(wake_q);
    1693             : 
    1694           0 : retry:
    1695           0 :         ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
    1696           0 :         if (unlikely(ret != 0))
    1697           0 :                 return ret;
    1698           0 :         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
    1699           0 :         if (unlikely(ret != 0))
    1700           0 :                 return ret;
    1701             : 
    1702           0 :         hb1 = hash_futex(&key1);
    1703           0 :         hb2 = hash_futex(&key2);
    1704             : 
    1705           0 : retry_private:
    1706           0 :         double_lock_hb(hb1, hb2);
    1707           0 :         op_ret = futex_atomic_op_inuser(op, uaddr2);
    1708           0 :         if (unlikely(op_ret < 0)) {
    1709           0 :                 double_unlock_hb(hb1, hb2);
    1710             : 
    1711           0 :                 if (!IS_ENABLED(CONFIG_MMU) ||
    1712           0 :                     unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
    1713             :                         /*
    1714             :                          * we don't get EFAULT from MMU faults if we don't have
    1715             :                          * an MMU, but we might get them from range checking
    1716             :                          */
    1717           0 :                         ret = op_ret;
    1718           0 :                         return ret;
    1719             :                 }
    1720             : 
    1721           0 :                 if (op_ret == -EFAULT) {
    1722           0 :                         ret = fault_in_user_writeable(uaddr2);
    1723           0 :                         if (ret)
    1724           0 :                                 return ret;
    1725             :                 }
    1726             : 
    1727           0 :                 if (!(flags & FLAGS_SHARED)) {
    1728           0 :                         cond_resched();
    1729           0 :                         goto retry_private;
    1730             :                 }
    1731             : 
    1732           0 :                 cond_resched();
    1733           0 :                 goto retry;
    1734             :         }
    1735             : 
    1736           0 :         plist_for_each_entry_safe(this, next, &hb1->chain, list) {
     1737           0 :                 if (match_futex(&this->key, &key1)) {
    1738           0 :                         if (this->pi_state || this->rt_waiter) {
    1739           0 :                                 ret = -EINVAL;
    1740           0 :                                 goto out_unlock;
    1741             :                         }
    1742           0 :                         mark_wake_futex(&wake_q, this);
    1743           0 :                         if (++ret >= nr_wake)
    1744             :                                 break;
    1745             :                 }
    1746             :         }
    1747             : 
    1748           0 :         if (op_ret > 0) {
    1749           0 :                 op_ret = 0;
    1750           0 :                 plist_for_each_entry_safe(this, next, &hb2->chain, list) {
     1751           0 :                         if (match_futex(&this->key, &key2)) {
    1752           0 :                                 if (this->pi_state || this->rt_waiter) {
    1753           0 :                                         ret = -EINVAL;
    1754           0 :                                         goto out_unlock;
    1755             :                                 }
    1756           0 :                                 mark_wake_futex(&wake_q, this);
    1757           0 :                                 if (++op_ret >= nr_wake2)
    1758             :                                         break;
    1759             :                         }
    1760             :                 }
    1761           0 :                 ret += op_ret;
    1762             :         }
    1763             : 
    1764           0 : out_unlock:
    1765           0 :         double_unlock_hb(hb1, hb2);
    1766           0 :         wake_up_q(&wake_q);
    1767           0 :         return ret;
    1768             : }
    1769             : 
    1770             : /**
    1771             :  * requeue_futex() - Requeue a futex_q from one hb to another
    1772             :  * @q:          the futex_q to requeue
    1773             :  * @hb1:        the source hash_bucket
    1774             :  * @hb2:        the target hash_bucket
    1775             :  * @key2:       the new key for the requeued futex_q
    1776             :  */
    1777             : static inline
    1778           0 : void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
    1779             :                    struct futex_hash_bucket *hb2, union futex_key *key2)
    1780             : {
    1781             : 
    1782             :         /*
    1783             :          * If key1 and key2 hash to the same bucket, no need to
    1784             :          * requeue.
    1785             :          */
    1786           0 :         if (likely(&hb1->chain != &hb2->chain)) {
    1787           0 :                 plist_del(&q->list, &hb1->chain);
    1788           0 :                 hb_waiters_dec(hb1);
    1789           0 :                 hb_waiters_inc(hb2);
    1790           0 :                 plist_add(&q->list, &hb2->chain);
    1791           0 :                 q->lock_ptr = &hb2->lock;
    1792             :         }
    1793           0 :         q->key = *key2;
    1794           0 : }
    1795             : 
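
requeue_futex() is what makes a broadcast wakeup cheap: wake one waiter and move the rest to the target futex instead of letting them all stampede for hb->lock. From user space this is FUTEX_CMP_REQUEUE; per futex(2), nr_requeue travels in the timeout argument slot and val3 is the expected value of *uaddr1. A hedged sketch with invented names:

#include <limits.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Wake one waiter on 'cond' and requeue the rest onto 'mutex', provided
 * *cond still equals 'expected' (otherwise the kernel returns EAGAIN and
 * the caller should re-evaluate, as the cmpval check in futex_requeue()
 * below shows).
 */
static long cond_broadcast_requeue(unsigned int *cond, unsigned int *mutex,
                                   unsigned int expected)
{
        return syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE,
                       1 /* nr_wake */, INT_MAX /* nr_requeue */,
                       mutex, expected);
}
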
    1796             : /**
    1797             :  * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
    1798             :  * @q:          the futex_q
    1799             :  * @key:        the key of the requeue target futex
    1800             :  * @hb:         the hash_bucket of the requeue target futex
    1801             :  *
    1802             :  * During futex_requeue, with requeue_pi=1, it is possible to acquire the
    1803             :  * target futex if it is uncontended or via a lock steal.  Set the futex_q key
    1804             :  * to the requeue target futex so the waiter can detect the wakeup on the right
    1805             :  * futex, but remove it from the hb and NULL the rt_waiter so it can detect
    1806             :  * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
    1807             :  * to protect access to the pi_state to fixup the owner later.  Must be called
    1808             :  * with both q->lock_ptr and hb->lock held.
    1809             :  */
    1810             : static inline
    1811           0 : void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
    1812             :                            struct futex_hash_bucket *hb)
    1813             : {
    1814           0 :         q->key = *key;
    1815             : 
    1816           0 :         __unqueue_futex(q);
    1817             : 
    1818           0 :         WARN_ON(!q->rt_waiter);
    1819           0 :         q->rt_waiter = NULL;
    1820             : 
    1821           0 :         q->lock_ptr = &hb->lock;
    1822             : 
    1823           0 :         wake_up_state(q->task, TASK_NORMAL);
    1824           0 : }
    1825             : 
    1826             : /**
    1827             :  * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
    1828             :  * @pifutex:            the user address of the to futex
    1829             :  * @hb1:                the from futex hash bucket, must be locked by the caller
    1830             :  * @hb2:                the to futex hash bucket, must be locked by the caller
    1831             :  * @key1:               the from futex key
    1832             :  * @key2:               the to futex key
    1833             :  * @ps:                 address to store the pi_state pointer
    1834             :  * @exiting:            Pointer to store the task pointer of the owner task
    1835             :  *                      which is in the middle of exiting
    1836             :  * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
    1837             :  *
    1838             :  * Try and get the lock on behalf of the top waiter if we can do it atomically.
    1839             :  * Wake the top waiter if we succeed.  If the caller specified set_waiters,
    1840             :  * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
    1841             :  * hb1 and hb2 must be held by the caller.
    1842             :  *
    1843             :  * @exiting is only set when the return value is -EBUSY. If so, this holds
    1844             :  * a refcount on the exiting task on return and the caller needs to drop it
    1845             :  * after waiting for the exit to complete.
    1846             :  *
    1847             :  * Return:
    1848             :  *  -  0 - failed to acquire the lock atomically;
    1849             :  *  - >0 - acquired the lock, return value is vpid of the top_waiter
    1850             :  *  - <0 - error
    1851             :  */
    1852             : static int
    1853           0 : futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
    1854             :                            struct futex_hash_bucket *hb2, union futex_key *key1,
    1855             :                            union futex_key *key2, struct futex_pi_state **ps,
    1856             :                            struct task_struct **exiting, int set_waiters)
    1857             : {
    1858           0 :         struct futex_q *top_waiter = NULL;
    1859           0 :         u32 curval;
    1860           0 :         int ret, vpid;
    1861             : 
    1862           0 :         if (get_futex_value_locked(&curval, pifutex))
    1863             :                 return -EFAULT;
    1864             : 
    1865           0 :         if (unlikely(should_fail_futex(true)))
    1866             :                 return -EFAULT;
    1867             : 
    1868             :         /*
    1869             :          * Find the top_waiter and determine if there are additional waiters.
    1870             :          * If the caller intends to requeue more than 1 waiter to pifutex,
    1871             :          * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
    1872             :          * as we have means to handle the possible fault.  If not, don't set
     1873             :          * the bit unnecessarily as it will force the subsequent unlock to enter
    1874             :          * the kernel.
    1875             :          */
    1876           0 :         top_waiter = futex_top_waiter(hb1, key1);
    1877             : 
    1878             :         /* There are no waiters, nothing for us to do. */
    1879           0 :         if (!top_waiter)
    1880             :                 return 0;
    1881             : 
    1882             :         /* Ensure we requeue to the expected futex. */
    1883           0 :         if (!match_futex(top_waiter->requeue_pi_key, key2))
    1884             :                 return -EINVAL;
    1885             : 
    1886             :         /*
    1887             :          * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
    1888             :          * the contended case or if set_waiters is 1.  The pi_state is returned
    1889             :          * in ps in contended cases.
    1890             :          */
    1891           0 :         vpid = task_pid_vnr(top_waiter->task);
    1892           0 :         ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
    1893             :                                    exiting, set_waiters);
    1894           0 :         if (ret == 1) {
    1895           0 :                 requeue_pi_wake_futex(top_waiter, key2, hb2);
    1896           0 :                 return vpid;
    1897             :         }
    1898             :         return ret;
    1899             : }
    1900             : 
    1901             : /**
    1902             :  * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
    1903             :  * @uaddr1:     source futex user address
    1904             :  * @flags:      futex flags (FLAGS_SHARED, etc.)
    1905             :  * @uaddr2:     target futex user address
    1906             :  * @nr_wake:    number of waiters to wake (must be 1 for requeue_pi)
    1907             :  * @nr_requeue: number of waiters to requeue (0-INT_MAX)
    1908             :  * @cmpval:     @uaddr1 expected value (or %NULL)
    1909             :  * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
    1910             :  *              pi futex (pi to pi requeue is not supported)
    1911             :  *
    1912             :  * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
    1913             :  * uaddr2 atomically on behalf of the top waiter.
    1914             :  *
    1915             :  * Return:
    1916             :  *  - >=0 - on success, the number of tasks requeued or woken;
    1917             :  *  -  <0 - on error
    1918             :  */
    1919           0 : static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
    1920             :                          u32 __user *uaddr2, int nr_wake, int nr_requeue,
    1921             :                          u32 *cmpval, int requeue_pi)
    1922             : {
    1923           0 :         union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
    1924           0 :         int task_count = 0, ret;
    1925           0 :         struct futex_pi_state *pi_state = NULL;
    1926           0 :         struct futex_hash_bucket *hb1, *hb2;
    1927           0 :         struct futex_q *this, *next;
    1928           0 :         DEFINE_WAKE_Q(wake_q);
    1929             : 
    1930           0 :         if (nr_wake < 0 || nr_requeue < 0)
    1931             :                 return -EINVAL;
    1932             : 
    1933             :         /*
    1934             :          * When PI not supported: return -ENOSYS if requeue_pi is true,
    1935             :          * consequently the compiler knows requeue_pi is always false past
    1936             :          * this point which will optimize away all the conditional code
    1937             :          * further down.
    1938             :          */
    1939           0 :         if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
    1940             :                 return -ENOSYS;
    1941             : 
    1942           0 :         if (requeue_pi) {
    1943             :                 /*
    1944             :                  * Requeue PI only works on two distinct uaddrs. This
    1945             :                  * check is only valid for private futexes. See below.
    1946             :                  */
    1947           0 :                 if (uaddr1 == uaddr2)
    1948             :                         return -EINVAL;
    1949             : 
    1950             :                 /*
    1951             :                  * requeue_pi requires a pi_state, try to allocate it now
    1952             :                  * without any locks in case it fails.
    1953             :                  */
    1954           0 :                 if (refill_pi_state_cache())
    1955             :                         return -ENOMEM;
    1956             :                 /*
    1957             :                  * requeue_pi must wake as many tasks as it can, up to nr_wake
    1958             :                  * + nr_requeue, since it acquires the rt_mutex prior to
     1959             :                  * returning to userspace, so as not to leave the rt_mutex with
    1960             :                  * waiters and no owner.  However, second and third wake-ups
    1961             :                  * cannot be predicted as they involve race conditions with the
    1962             :                  * first wake and a fault while looking up the pi_state.  Both
    1963             :                  * pthread_cond_signal() and pthread_cond_broadcast() should
    1964             :                  * use nr_wake=1.
    1965             :                  */
    1966           0 :                 if (nr_wake != 1)
    1967             :                         return -EINVAL;
    1968             :         }
    1969             : 
    1970           0 : retry:
    1971           0 :         ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
    1972           0 :         if (unlikely(ret != 0))
    1973           0 :                 return ret;
    1974           0 :         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
    1975             :                             requeue_pi ? FUTEX_WRITE : FUTEX_READ);
    1976           0 :         if (unlikely(ret != 0))
    1977           0 :                 return ret;
    1978             : 
    1979             :         /*
    1980             :          * The check above which compares uaddrs is not sufficient for
    1981             :          * shared futexes. We need to compare the keys:
    1982             :          */
    1983           0 :         if (requeue_pi && match_futex(&key1, &key2))
    1984             :                 return -EINVAL;
    1985             : 
    1986           0 :         hb1 = hash_futex(&key1);
    1987           0 :         hb2 = hash_futex(&key2);
    1988             : 
    1989           0 : retry_private:
    1990           0 :         hb_waiters_inc(hb2);
    1991           0 :         double_lock_hb(hb1, hb2);
    1992             : 
    1993           0 :         if (likely(cmpval != NULL)) {
    1994           0 :                 u32 curval;
    1995             : 
    1996           0 :                 ret = get_futex_value_locked(&curval, uaddr1);
    1997             : 
    1998           0 :                 if (unlikely(ret)) {
    1999           0 :                         double_unlock_hb(hb1, hb2);
    2000           0 :                         hb_waiters_dec(hb2);
    2001             : 
    2002           0 :                         ret = get_user(curval, uaddr1);
    2003           0 :                         if (ret)
    2004           0 :                                 return ret;
    2005             : 
    2006           0 :                         if (!(flags & FLAGS_SHARED))
    2007           0 :                                 goto retry_private;
    2008             : 
    2009           0 :                         goto retry;
    2010             :                 }
    2011           0 :                 if (curval != *cmpval) {
    2012           0 :                         ret = -EAGAIN;
    2013           0 :                         goto out_unlock;
    2014             :                 }
    2015             :         }
    2016             : 
    2017           0 :         if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
    2018           0 :                 struct task_struct *exiting = NULL;
    2019             : 
    2020             :                 /*
    2021             :                  * Attempt to acquire uaddr2 and wake the top waiter. If we
    2022             :                  * intend to requeue waiters, force setting the FUTEX_WAITERS
    2023             :                  * bit.  We force this here where we are able to easily handle
    2024             :          * faults rather than in the requeue loop below.
    2025             :                  */
    2026           0 :                 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
    2027             :                                                  &key2, &pi_state,
    2028             :                                                  &exiting, nr_requeue);
    2029             : 
    2030             :                 /*
    2031             :                  * At this point the top_waiter has either taken uaddr2 or is
    2032             :                  * waiting on it.  If the former, then the pi_state will not
    2033             :                  * exist yet, look it up one more time to ensure we have a
    2034             :                  * reference to it. If the lock was taken, ret contains the
    2035             :                  * vpid of the top waiter task.
    2036             :                  * If the lock was not taken, we have pi_state and an initial
    2037             :                  * refcount on it. In case of an error we have nothing.
    2038             :                  */
    2039           0 :                 if (ret > 0) {
    2040           0 :                         WARN_ON(pi_state);
    2041           0 :                         task_count++;
    2042             :                         /*
    2043             :                          * If we acquired the lock, then the user space value
    2044             :                          * of uaddr2 should be vpid. It cannot be changed by
    2045             :                          * the top waiter as it is blocked on hb2 lock if it
    2046             :                          * tries to do so. If something fiddled with it behind
    2047             :                          * our back the pi state lookup might unearth it. So
    2048             :                          * we would rather use the known value than reread it
    2049             :                          * and hand potential crap to lookup_pi_state.
    2050             :                          *
    2051             :                          * If that call succeeds then we have pi_state and an
    2052             :                          * initial refcount on it.
    2053             :                          */
    2054           0 :                         ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
    2055             :                                               &pi_state, &exiting);
    2056             :                 }
    2057             : 
    2058           0 :                 switch (ret) {
    2059             :                 case 0:
    2060             :                         /* We hold a reference on the pi state. */
    2061           0 :                         break;
    2062             : 
    2063             :                         /* If the above failed, then pi_state is NULL */
    2064           0 :                 case -EFAULT:
    2065           0 :                         double_unlock_hb(hb1, hb2);
    2066           0 :                         hb_waiters_dec(hb2);
    2067           0 :                         ret = fault_in_user_writeable(uaddr2);
    2068           0 :                         if (!ret)
    2069           0 :                                 goto retry;
    2070           0 :                         return ret;
    2071           0 :                 case -EBUSY:
    2072             :                 case -EAGAIN:
    2073             :                         /*
    2074             :                          * Two reasons for this:
    2075             :                          * - EBUSY: Owner is exiting and we just wait for the
    2076             :                          *   exit to complete.
    2077             :                          * - EAGAIN: The user space value changed.
    2078             :                          */
    2079           0 :                         double_unlock_hb(hb1, hb2);
    2080           0 :                         hb_waiters_dec(hb2);
    2081             :                         /*
    2082             :                          * Handle the case where the owner is in the middle of
    2083             :                          * exiting. Wait for the exit to complete otherwise
    2084             :                          * this task might loop forever, a.k.a. live lock.
    2085             :                          */
    2086           0 :                         wait_for_owner_exiting(ret, exiting);
    2087           0 :                         cond_resched();
    2088           0 :                         goto retry;
    2089           0 :                 default:
    2090           0 :                         goto out_unlock;
    2091             :                 }
    2092             :         }
    2093             : 
    2094           0 :         plist_for_each_entry_safe(this, next, &hb1->chain, list) {
    2095           0 :                 if (task_count - nr_wake >= nr_requeue)
    2096             :                         break;
    2097             : 
    2098           0 :                 if (!match_futex(&this->key, &key1))
    2099           0 :                         continue;
    2100             : 
    2101             :                 /*
    2102             :                  * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
    2103             :                  * be paired with each other and no other futex ops.
    2104             :                  *
    2105             :                  * We should never be requeueing a futex_q with a pi_state,
    2106             :                  * which is awaiting a futex_unlock_pi().
    2107             :                  */
    2108           0 :                 if ((requeue_pi && !this->rt_waiter) ||
    2109           0 :                     (!requeue_pi && this->rt_waiter) ||
    2110           0 :                     this->pi_state) {
    2111             :                         ret = -EINVAL;
    2112             :                         break;
    2113             :                 }
    2114             : 
    2115             :                 /*
    2116             :                  * Wake nr_wake waiters.  For requeue_pi, if we acquired the
    2117             :                  * lock, we already woke the top_waiter.  If not, it will be
    2118             :                  * woken by futex_unlock_pi().
    2119             :                  */
    2120           0 :                 if (++task_count <= nr_wake && !requeue_pi) {
    2121           0 :                         mark_wake_futex(&wake_q, this);
    2122           0 :                         continue;
    2123             :                 }
    2124             : 
    2125             :                 /* Ensure we requeue to the expected futex for requeue_pi. */
    2126           0 :                 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
    2127             :                         ret = -EINVAL;
    2128             :                         break;
    2129             :                 }
    2130             : 
    2131             :                 /*
    2132             :                  * Requeue nr_requeue waiters and possibly one more in the case
    2133             :                  * of requeue_pi if we couldn't acquire the lock atomically.
    2134             :                  */
    2135           0 :                 if (requeue_pi) {
    2136             :                         /*
    2137             :                          * Prepare the waiter to take the rt_mutex. Take a
    2138             :                          * refcount on the pi_state and store the pointer in
    2139             :                          * the futex_q object of the waiter.
    2140             :                          */
    2141           0 :                         get_pi_state(pi_state);
    2142           0 :                         this->pi_state = pi_state;
    2143           0 :                         ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
    2144             :                                                         this->rt_waiter,
    2145             :                                                         this->task);
    2146           0 :                         if (ret == 1) {
    2147             :                                 /*
    2148             :                                  * We got the lock. We do neither drop the
    2149             :                                  * refcount on pi_state nor clear
    2150             :                                  * this->pi_state because the waiter needs the
    2151             :                                  * pi_state for cleaning up the user space
    2152             :                                  * value. It will drop the refcount after
    2153             :                                  * doing so.
    2154             :                                  */
    2155           0 :                                 requeue_pi_wake_futex(this, &key2, hb2);
    2156           0 :                                 continue;
    2157           0 :                         } else if (ret) {
    2158             :                                 /*
    2159             :                                  * rt_mutex_start_proxy_lock() detected a
    2160             :                                  * potential deadlock when we tried to queue
    2161             :                                  * that waiter. Drop the pi_state reference
    2162             :                                  * which we took above and remove the pointer
    2163             :                                  * to the state from the waiters futex_q
    2164             :                                  * object.
    2165             :                                  */
    2166           0 :                                 this->pi_state = NULL;
    2167           0 :                                 put_pi_state(pi_state);
    2168             :                                 /*
    2169             :                                  * We stop queueing more waiters and let user
    2170             :                                  * space deal with the mess.
    2171             :                                  */
    2172           0 :                                 break;
    2173             :                         }
    2174             :                 }
    2175           0 :                 requeue_futex(this, hb1, hb2, &key2);
    2176             :         }
    2177             : 
    2178             :         /*
    2179             :          * We took an extra initial reference to the pi_state either
    2180             :          * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
    2181             :          * need to drop it here again.
    2182             :          */
    2183           0 :         put_pi_state(pi_state);
    2184             : 
    2185           0 : out_unlock:
    2186           0 :         double_unlock_hb(hb1, hb2);
    2187           0 :         wake_up_q(&wake_q);
    2188           0 :         hb_waiters_dec(hb2);
    2189           0 :         return ret ? ret : task_count;
    2190             : }
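/*
 * Editorial sketch (not part of futex.c): how userspace typically reaches
 * futex_requeue(). A pthread_cond_broadcast()-style wakeup wakes one waiter
 * and requeues the rest onto the mutex word, so woken threads contend on the
 * mutex instead of stampeding back to the kernel. cond_word/mutex_word are
 * hypothetical names; the plain read is left unfenced for brevity.
 */
#include <limits.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static long cond_broadcast_sketch(uint32_t *cond_word, uint32_t *mutex_word)
{
	uint32_t seen = *cond_word;	/* becomes cmpval: -EAGAIN if it changed */

	/* nr_wake = 1, nr_requeue = INT_MAX: wake one, requeue the rest. */
	return syscall(SYS_futex, cond_word, FUTEX_CMP_REQUEUE, 1,
		       (unsigned long)INT_MAX, mutex_word, seen);
}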
    2191             : 
    2192             : /* The key must already be stored in q->key. */
    2193         168 : static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
    2194             :         __acquires(&hb->lock)
    2195             : {
    2196         168 :         struct futex_hash_bucket *hb;
    2197             : 
    2198         168 :         hb = hash_futex(&q->key);
    2199             : 
    2200             :         /*
    2201             :          * Increment the counter before taking the lock so that
    2202             :          * a potential waker won't miss a task that is about to sleep
    2203             :          * but is still waiting for the spinlock. This is safe as all queue_lock()
    2204             :          * users end up calling queue_me(). Similarly, for housekeeping,
    2205             :          * decrement the counter at queue_unlock() when some error has
    2206             :          * occurred and we don't end up adding the task to the list.
    2207             :          */
    2208         168 :         hb_waiters_inc(hb); /* implies smp_mb(); (A) */
    2209             : 
    2210         168 :         q->lock_ptr = &hb->lock;
    2211             : 
    2212         168 :         spin_lock(&hb->lock);
    2213         168 :         return hb;
    2214             : }
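/*
 * Editorial sketch: the handshake queue_lock() sets up, reduced to plain C11
 * atomics. The waiter advertises itself before taking the bucket lock; a
 * waker that observes a zero count may skip locking the bucket entirely.
 * These names are hypothetical, not kernel API.
 */
#include <stdatomic.h>

static atomic_int waiters;

static void waiter_side(void)
{
	atomic_fetch_add(&waiters, 1);	/* like hb_waiters_inc(); seq_cst, fully ordered */
	/* ... take the bucket lock, re-check the futex value, queue ... */
}

static int waker_may_skip(void)
{
	return atomic_load(&waiters) == 0;	/* like a hb_waiters_pending() check */
}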
    2215             : 
    2216             : static inline void
    2217           1 : queue_unlock(struct futex_hash_bucket *hb)
    2218             :         __releases(&hb->lock)
    2219             : {
    2220           2 :         spin_unlock(&hb->lock);
    2221           0 :         hb_waiters_dec(hb);
    2222           0 : }
    2223             : 
    2224         167 : static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
    2225             : {
    2226         167 :         int prio;
    2227             : 
    2228             :         /*
    2229             :          * The priority used to register this element is
    2230             :          * - either the real thread-priority for the real-time threads
    2231             :          * (i.e. threads with a priority lower than MAX_RT_PRIO)
    2232             :          * - or MAX_RT_PRIO for non-RT threads.
    2233             :          * Thus, all RT-threads are woken first in priority order, and
    2234             :          * the others are woken last, in FIFO order.
    2235             :          */
    2236         167 :         prio = min(current->normal_prio, MAX_RT_PRIO);
    2237             : 
    2238         167 :         plist_node_init(&q->list, prio);
    2239         167 :         plist_add(&q->list, &hb->chain);
    2240         167 :         q->task = current;
    2241         167 : }
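/*
 * Editorial sketch: the effect of the plist priority computed above. Lower
 * numeric prio sorts first, so RT waiters are woken in priority order before
 * all SCHED_OTHER waiters, which share one FIFO bucket at MAX_RT_PRIO.
 * Illustrative values, assuming MAX_RT_PRIO == 100:
 *
 *   SCHED_FIFO rt_priority 90:  normal_prio ==   9  ->  plist prio   9
 *   SCHED_FIFO rt_priority 10:  normal_prio ==  89  ->  plist prio  89
 *   SCHED_OTHER nice 0:         normal_prio == 120  ->  plist prio 100
 */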
    2242             : 
    2243             : /**
    2244             :  * queue_me() - Enqueue the futex_q on the futex_hash_bucket
    2245             :  * @q:  The futex_q to enqueue
    2246             :  * @hb: The destination hash bucket
    2247             :  *
    2248             :  * The hb->lock must be held by the caller, and is released here. A call to
    2249             :  * queue_me() is typically paired with exactly one call to unqueue_me().  The
    2250             :  * exceptions involve the PI related operations, which may use unqueue_me_pi()
    2251             :  * or nothing if the unqueue is done as part of the wake process and the unqueue
    2252             :  * state is implicit in the state of the woken task (see futex_wait_requeue_pi() for
    2253             :  * an example).
    2254             :  */
    2255         167 : static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
    2256             :         __releases(&hb->lock)
    2257             : {
    2258         167 :         __queue_me(q, hb);
    2259         167 :         spin_unlock(&hb->lock);
    2260             : }
    2261             : 
    2262             : /**
    2263             :  * unqueue_me() - Remove the futex_q from its futex_hash_bucket
    2264             :  * @q:  The futex_q to unqueue
    2265             :  *
    2266             :  * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
    2267             :  * be paired with exactly one earlier call to queue_me().
    2268             :  *
    2269             :  * Return:
    2270             :  *  - 1 - if the futex_q was still queued (and we removed it);
    2271             :  *  - 0 - if the futex_q was already removed by the waking thread
    2272             :  */
    2273         166 : static int unqueue_me(struct futex_q *q)
    2274             : {
    2275         166 :         spinlock_t *lock_ptr;
    2276         166 :         int ret = 0;
    2277             : 
    2278             :         /* In the common case we don't take the spinlock, which is nice. */
    2279         166 : retry:
    2280             :         /*
    2281             :          * q->lock_ptr can change between this read and the following spin_lock.
    2282             :          * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
    2283             :          * optimizing lock_ptr out of the logic below.
    2284             :          */
    2285         166 :         lock_ptr = READ_ONCE(q->lock_ptr);
    2286         166 :         if (lock_ptr != NULL) {
    2287           0 :                 spin_lock(lock_ptr);
    2288             :                 /*
    2289             :                  * q->lock_ptr can change between reading it and
    2290             :                  * spin_lock(), causing us to take the wrong lock.  This
    2291             :                  * corrects the race condition.
    2292             :                  *
    2293             :                  * Reasoning goes like this: if we have the wrong lock,
    2294             :                  * q->lock_ptr must have changed (maybe several times)
    2295             :                  * between reading it and the spin_lock().  It can
    2296             :                  * change again after the spin_lock() but only if it was
    2297             :                  * already changed before the spin_lock().  It cannot,
    2298             :                  * however, change back to the original value.  Therefore
    2299             :                  * we can detect whether we acquired the correct lock.
    2300             :                  */
    2301           0 :                 if (unlikely(lock_ptr != q->lock_ptr)) {
    2302           0 :                         spin_unlock(lock_ptr);
    2303           0 :                         goto retry;
    2304             :                 }
    2305           0 :                 __unqueue_futex(q);
    2306             : 
    2307           0 :                 BUG_ON(q->pi_state);
    2308             : 
    2309           0 :                 spin_unlock(lock_ptr);
    2310           0 :                 ret = 1;
    2311             :         }
    2312             : 
    2313         166 :         return ret;
    2314             : }
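/*
 * Editorial sketch: the lock_ptr dance in unqueue_me() as a generic pattern.
 * A lock reached through a published pointer must be revalidated after it is
 * acquired, because the pointer may have been redirected while we waited for
 * it. Plain C11/pthreads rendition; all names are hypothetical.
 */
#include <pthread.h>
#include <stdatomic.h>

struct qnode {
	_Atomic(pthread_mutex_t *) lock_ptr;
};

static pthread_mutex_t *lock_via_pointer(struct qnode *q)
{
	pthread_mutex_t *l;

	for (;;) {
		l = atomic_load(&q->lock_ptr);	/* READ_ONCE() equivalent */
		if (!l)
			return NULL;		/* already unqueued by a waker */
		pthread_mutex_lock(l);
		if (l == atomic_load(&q->lock_ptr))
			return l;		/* still the right lock: held */
		pthread_mutex_unlock(l);	/* redirected underneath us: retry */
	}
}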
    2315             : 
    2316             : /*
    2317             :  * PI futexes cannot be requeued and must remove themselves from the
    2318             :  * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
    2319             :  * and dropped here.
    2320             :  */
    2321           0 : static void unqueue_me_pi(struct futex_q *q)
    2322             :         __releases(q->lock_ptr)
    2323             : {
    2324           0 :         __unqueue_futex(q);
    2325             : 
    2326           0 :         BUG_ON(!q->pi_state);
    2327           0 :         put_pi_state(q->pi_state);
    2328           0 :         q->pi_state = NULL;
    2329             : 
    2330           0 :         spin_unlock(q->lock_ptr);
    2331           0 : }
    2332             : 
    2333           0 : static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
    2334             :                                   struct task_struct *argowner)
    2335             : {
    2336           0 :         struct futex_pi_state *pi_state = q->pi_state;
    2337           0 :         struct task_struct *oldowner, *newowner;
    2338           0 :         u32 uval, curval, newval, newtid;
    2339           0 :         int err = 0;
    2340             : 
    2341           0 :         oldowner = pi_state->owner;
    2342             : 
    2343             :         /*
    2344             :          * We are here because either:
    2345             :          *
    2346             :          *  - we stole the lock and pi_state->owner needs updating to reflect
    2347             :          *    that (@argowner == current),
    2348             :          *
    2349             :          * or:
    2350             :          *
    2351             :          *  - someone stole our lock and we need to fix things to point to the
    2352             :          *    new owner (@argowner == NULL).
    2353             :          *
    2354             :          * Either way, we have to replace the TID in the user space variable.
    2355             :          * This must be atomic as we have to preserve the owner died bit here.
    2356             :          *
    2357             :          * Note: We write the user space value _before_ changing the pi_state
    2358             :          * because we can fault here. Imagine swapped-out pages or a fork
    2359             :          * that marked all the anonymous memory read-only for COW.
    2360             :          *
    2361             :          * Modifying pi_state _before_ the user space value would leave the
    2362             :          * pi_state in an inconsistent state when we fault here, because we
    2363             :          * need to drop the locks to handle the fault. This might be observed
    2364             :          * in the PID check in lookup_pi_state.
    2365             :          */
    2366           0 : retry:
    2367           0 :         if (!argowner) {
    2368           0 :                 if (oldowner != current) {
    2369             :                         /*
    2370             :                          * We raced against a concurrent self; things are
    2371             :                          * already fixed up. Nothing to do.
    2372             :                          */
    2373             :                         return 0;
    2374             :                 }
    2375             : 
    2376           0 :                 if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
    2377             :                         /* We got the lock. pi_state is correct. Tell caller. */
    2378             :                         return 1;
    2379             :                 }
    2380             : 
    2381             :                 /*
    2382             :                  * The trylock just failed, so either there is an owner or
    2383             :                  * there is a higher priority waiter than this one.
    2384             :                  */
    2385           0 :                 newowner = rt_mutex_owner(&pi_state->pi_mutex);
    2386             :                 /*
    2387             :                  * If the higher priority waiter has not yet taken over the
    2388             :                  * rtmutex then newowner is NULL. We can't return here with
    2389             :                  * that state because it's inconsistent vs. the user space
    2390             :                  * state. So drop the locks and try again. It's a valid
    2391             :                  * situation and not any different from the other retry
    2392             :                  * conditions.
    2393             :                  */
    2394           0 :                 if (unlikely(!newowner)) {
    2395           0 :                         err = -EAGAIN;
    2396           0 :                         goto handle_err;
    2397             :                 }
    2398             :         } else {
    2399           0 :                 WARN_ON_ONCE(argowner != current);
    2400           0 :                 if (oldowner == current) {
    2401             :                         /*
    2402             :                          * We raced against a concurrent self; things are
    2403             :                          * already fixed up. Nothing to do.
    2404             :                          */
    2405             :                         return 1;
    2406             :                 }
    2407             :                 newowner = argowner;
    2408             :         }
    2409             : 
    2410           0 :         newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
    2411             :         /* Owner died? */
    2412           0 :         if (!pi_state->owner)
    2413           0 :                 newtid |= FUTEX_OWNER_DIED;
    2414             : 
    2415           0 :         err = get_futex_value_locked(&uval, uaddr);
    2416           0 :         if (err)
    2417           0 :                 goto handle_err;
    2418             : 
    2419           0 :         for (;;) {
    2420           0 :                 newval = (uval & FUTEX_OWNER_DIED) | newtid;
    2421             : 
    2422           0 :                 err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
    2423           0 :                 if (err)
    2424           0 :                         goto handle_err;
    2425             : 
    2426           0 :                 if (curval == uval)
    2427             :                         break;
    2428           0 :                 uval = curval;
    2429             :         }
    2430             : 
    2431             :         /*
    2432             :          * We fixed up user space. Now we need to fix the pi_state
    2433             :          * itself.
    2434             :          */
    2435           0 :         pi_state_update_owner(pi_state, newowner);
    2436             : 
    2437           0 :         return argowner == current;
    2438             : 
    2439             :         /*
    2440             :          * In order to reschedule or handle a page fault, we need to drop the
    2441             :          * locks here. In the case of a fault, this gives the other task
    2442             :          * (either the highest priority waiter itself or the task which stole
    2443             :          * the rtmutex) the chance to try the fixup of the pi_state. So once we
    2444             :          * are back from handling the fault we need to check the pi_state after
    2445             :          * reacquiring the locks and before trying to do another fixup. When
    2446             :          * the fixup has been done already we simply return.
    2447             :          *
    2448             :          * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
    2449             :          * drop hb->lock since the caller owns the hb -> futex_q relation.
    2450             :          * Dropping the pi_mutex->wait_lock requires the state revalidate.
    2451             :          */
    2452           0 : handle_err:
    2453           0 :         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    2454           0 :         spin_unlock(q->lock_ptr);
    2455             : 
    2456           0 :         switch (err) {
    2457           0 :         case -EFAULT:
    2458           0 :                 err = fault_in_user_writeable(uaddr);
    2459           0 :                 break;
    2460             : 
    2461           0 :         case -EAGAIN:
    2462           0 :                 cond_resched();
    2463           0 :                 err = 0;
    2464           0 :                 break;
    2465             : 
    2466             :         default:
    2467           0 :                 WARN_ON_ONCE(1);
    2468           0 :                 break;
    2469             :         }
    2470             : 
    2471           0 :         spin_lock(q->lock_ptr);
    2472           0 :         raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
    2473             : 
    2474             :         /*
    2475             :          * Check if someone else fixed it for us:
    2476             :          */
    2477           0 :         if (pi_state->owner != oldowner)
    2478           0 :                 return argowner == current;
    2479             : 
    2480             :         /* Retry if err was -EAGAIN or the fault-in succeeded */
    2481           0 :         if (!err)
    2482           0 :                 goto retry;
    2483             : 
    2484             :         /*
    2485             :          * fault_in_user_writeable() failed so user state is immutable. At
    2486             :          * best we can make the kernel state consistent but user state will
    2487             :          * be most likely hosed and any subsequent unlock operation will be
    2488             :          * rejected due to PI futex rule [10].
    2489             :          *
    2490             :          * Ensure that the rtmutex owner is also the pi_state owner despite
    2491             :          * the user space value claiming something different. There is no
    2492             :          * point in unlocking the rtmutex if current is the owner as it
    2493             :          * would need to wait until the next waiter has taken the rtmutex
    2494             :          * to guarantee consistent state. Keep it simple. Userspace asked
    2495             :          * for this wrecked state.
    2496             :          *
    2497             :          * The rtmutex has an owner - either current or some other
    2498             :          * task. See the EAGAIN loop above.
    2499             :          */
    2500           0 :         pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
    2501             : 
    2502           0 :         return err;
    2503             : }
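/*
 * Editorial sketch: the update loop above in isolation. Rewriting the TID
 * field must preserve the FUTEX_OWNER_DIED bit, so the store is a classic
 * read/modify/compare-exchange loop rather than a blind write. Userspace
 * C11 rendition; the SK_* constants mirror the uapi FUTEX_* bits and
 * rewrite_owner_tid() is a hypothetical name.
 */
#include <stdatomic.h>
#include <stdint.h>

#define SK_FUTEX_WAITERS	0x80000000u	/* mirrors FUTEX_WAITERS */
#define SK_FUTEX_OWNER_DIED	0x40000000u	/* mirrors FUTEX_OWNER_DIED */

static void rewrite_owner_tid(_Atomic uint32_t *futex_word, uint32_t new_tid)
{
	uint32_t old = atomic_load(futex_word);
	uint32_t new;

	do {
		/* Keep the owner-died bit, install the new TID + waiters bit. */
		new = (old & SK_FUTEX_OWNER_DIED) | new_tid | SK_FUTEX_WAITERS;
	} while (!atomic_compare_exchange_weak(futex_word, &old, new));
}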
    2504             : 
    2505           0 : static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
    2506             :                                 struct task_struct *argowner)
    2507             : {
    2508           0 :         struct futex_pi_state *pi_state = q->pi_state;
    2509           0 :         int ret;
    2510             : 
    2511           0 :         lockdep_assert_held(q->lock_ptr);
    2512             : 
    2513           0 :         raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
    2514           0 :         ret = __fixup_pi_state_owner(uaddr, q, argowner);
    2515           0 :         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    2516           0 :         return ret;
    2517             : }
    2518             : 
    2519             : static long futex_wait_restart(struct restart_block *restart);
    2520             : 
    2521             : /**
    2522             :  * fixup_owner() - Post lock pi_state and corner case management
    2523             :  * @uaddr:      user address of the futex
    2524             :  * @q:          futex_q (contains pi_state and access to the rt_mutex)
    2525             :  * @locked:     if the attempt to take the rt_mutex succeeded (1) or not (0)
    2526             :  *
    2527             :  * After attempting to lock an rt_mutex, this function is called to clean up
    2528             :  * the pi_state owner as well as handle race conditions that may allow us to
    2529             :  * acquire the lock. Must be called with the hb lock held.
    2530             :  *
    2531             :  * Return:
    2532             :  *  -  1 - success, lock taken;
    2533             :  *  -  0 - success, lock not taken;
    2534             :  *  - <0 - on error (-EFAULT)
    2535             :  */
    2536           0 : static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
    2537             : {
    2538           0 :         if (locked) {
    2539             :                 /*
    2540             :                  * Got the lock. We might not be the anticipated owner if we
    2541             :                  * did a lock-steal - fix up the PI-state in that case:
    2542             :                  *
    2543             :                  * Speculative pi_state->owner read (we don't hold wait_lock);
    2544             :                  * since we own the lock pi_state->owner == current is the
    2545             :                  * stable state, anything else needs more attention.
    2546             :                  */
    2547           0 :                 if (q->pi_state->owner != current)
    2548           0 :                         return fixup_pi_state_owner(uaddr, q, current);
    2549             :                 return 1;
    2550             :         }
    2551             : 
    2552             :         /*
    2553             :          * If we didn't get the lock; check if anybody stole it from us. In
    2554             :          * that case, we need to fix up the uval to point to them instead of
    2555             :          * us, otherwise bad things happen. [10]
    2556             :          *
    2557             :          * Another speculative read; pi_state->owner == current is unstable
    2558             :          * but needs our attention.
    2559             :          */
    2560           0 :         if (q->pi_state->owner == current)
    2561           0 :                 return fixup_pi_state_owner(uaddr, q, NULL);
    2562             : 
    2563             :         /*
    2564             :          * Paranoia check. If we did not take the lock, then we should not be
    2565             :          * the owner of the rt_mutex. Warn and establish consistent state.
    2566             :          */
    2567           0 :         if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
    2568           0 :                 return fixup_pi_state_owner(uaddr, q, current);
    2569             : 
    2570             :         return 0;
    2571             : }
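/*
 * Editorial summary, derived from the code above: fixup_owner() outcomes at
 * a glance.
 *
 *   locked  pi_state->owner   action                                 return
 *   ------  ---------------   ------------------------------------   ---------
 *     1       == current      nothing to fix                         1
 *     1       != current      lock-steal: fix owner to current       1 or <0
 *     0       == current      someone stole it: fix owner to thief   0, 1 or <0
 *     0       != current      nothing to do (barring the WARN case)  0
 */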
    2572             : 
    2573             : /**
    2574             :  * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
    2575             :  * @hb:         the futex hash bucket, must be locked by the caller
    2576             :  * @q:          the futex_q to queue up on
    2577             :  * @timeout:    the prepared hrtimer_sleeper, or null for no timeout
    2578             :  */
    2579         167 : static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
    2580             :                                 struct hrtimer_sleeper *timeout)
    2581             : {
    2582             :         /*
    2583             :          * The task state is guaranteed to be set before another task can
    2584             :          * wake it. set_current_state() is implemented using smp_store_mb() and
    2585             :          * queue_me() calls spin_unlock() upon completion, both serializing
    2586             :          * access to the hash list and forcing another memory barrier.
    2587             :          */
    2588         167 :         set_current_state(TASK_INTERRUPTIBLE);
    2589         167 :         queue_me(q, hb);
    2590             : 
    2591             :         /* Arm the timer */
    2592         167 :         if (timeout)
    2593           0 :                 hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
    2594             : 
    2595             :         /*
    2596             :          * If we have been removed from the hash list, then another task
    2597             :          * has tried to wake us, and we can skip the call to schedule().
    2598             :          */
    2599         167 :         if (likely(!plist_node_empty(&q->list))) {
    2600             :                 /*
    2601             :                  * If the timer has already expired, current will already be
    2602             :                  * flagged for rescheduling. Only call schedule if there
    2603             :                  * is no timeout, or if it has yet to expire.
    2604             :                  */
    2605         167 :                 if (!timeout || timeout->task)
    2606         167 :                         freezable_schedule();
    2607             :         }
    2608         166 :         __set_current_state(TASK_RUNNING);
    2609         166 : }
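/*
 * Editorial sketch: the lost wakeup the ordering above prevents. If the task
 * state were set only after queueing and the emptiness check (hypothetical,
 * reordered code), this interleaving would sleep forever:
 *
 *   waiter                                  waker (holds hb->lock)
 *   ------                                  ----------------------
 *   queue_me(q, hb)
 *   plist_node_empty(&q->list)? no
 *                                           __unqueue_futex(q)
 *                                           wake_up: task still TASK_RUNNING,
 *                                           so the wakeup is a no-op
 *   set_current_state(TASK_INTERRUPTIBLE)
 *   schedule()                              <- sleeps; nobody is left to wake us
 *
 * Setting the state first (with the barriers noted above) guarantees the
 * waker either sees a sleeping task or the waiter sees the unqueue.
 */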
    2610             : 
    2611             : /**
    2612             :  * futex_wait_setup() - Prepare to wait on a futex
    2613             :  * @uaddr:      the futex userspace address
    2614             :  * @val:        the expected value
    2615             :  * @flags:      futex flags (FLAGS_SHARED, etc.)
    2616             :  * @q:          the associated futex_q
    2617             :  * @hb:         storage for hash_bucket pointer to be returned to caller
    2618             :  *
    2619             :  * Set up the futex_q and locate the hash_bucket.  Get the futex value and
    2620             :  * compare it with the expected value.  Handle atomic faults internally.
    2621             :  * Return with the hb lock held and a q.key reference on success, and unlocked
    2622             :  * with no q.key reference on failure.
    2623             :  *
    2624             :  * Return:
    2625             :  *  -  0 - uaddr contains val and hb has been locked;
    2626             :  *  - <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
    2627             :  */
    2628         168 : static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
    2629             :                            struct futex_q *q, struct futex_hash_bucket **hb)
    2630             : {
    2631         168 :         u32 uval;
    2632         168 :         int ret;
    2633             : 
    2634             :         /*
    2635             :          * Access the page AFTER the hash-bucket is locked.
    2636             :          * Order is important:
    2637             :          *
    2638             :          *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
    2639             :          *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
    2640             :          *
    2641             :          * The basic logical guarantee of a futex is that it blocks ONLY
    2642             :          * if cond(var) is known to be true at the time of blocking, for
    2643             :          * any cond.  If we locked the hash-bucket after testing *uaddr, that
    2644             :          * would open a race condition where we could block indefinitely with
    2645             :          * cond(var) false, which would violate the guarantee.
    2646             :          *
    2647             :          * On the other hand, we insert q and release the hash-bucket only
    2648             :          * after testing *uaddr.  This guarantees that futex_wait() will NOT
    2649             :          * absorb a wakeup if *uaddr does not match the desired value
    2650             :          * while the syscall executes.
    2651             :          */
    2652         168 : retry:
    2653         168 :         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
    2654         168 :         if (unlikely(ret != 0))
    2655           0 :                 return ret;
    2656             : 
    2657         168 : retry_private:
    2658         168 :         *hb = queue_lock(q);
    2659             : 
    2660         168 :         ret = get_futex_value_locked(&uval, uaddr);
    2661             : 
    2662         168 :         if (ret) {
    2663           0 :                 queue_unlock(*hb);
    2664             : 
    2665           0 :                 ret = get_user(uval, uaddr);
    2666           0 :                 if (ret)
    2667           0 :                         return ret;
    2668             : 
    2669           0 :                 if (!(flags & FLAGS_SHARED))
    2670           0 :                         goto retry_private;
    2671             : 
    2672           0 :                 goto retry;
    2673             :         }
    2674             : 
    2675         168 :         if (uval != val) {
    2676           1 :                 queue_unlock(*hb);
    2677           1 :                 ret = -EWOULDBLOCK;
    2678             :         }
    2679             : 
    2680             :         return ret;
    2681             : }
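/*
 * Editorial sketch (not part of futex.c): the two userspace halves that the
 * ordering comment above abbreviates, spelled out with the raw syscall. If
 * *futex_word changed after the waiter sampled it, FUTEX_WAIT returns
 * EAGAIN/EWOULDBLOCK instead of sleeping -- the uval != val path in
 * futex_wait_setup(). futex_word is hypothetical; production code would use
 * atomic accesses for the plain reads and stores.
 */
#include <errno.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static void wait_until_changed(uint32_t *futex_word, uint32_t seen)
{
	while (*futex_word == seen) {
		/* Sleeps only while *futex_word still equals seen. */
		if (syscall(SYS_futex, futex_word, FUTEX_WAIT, seen,
			    NULL, NULL, 0) == -1 &&
		    errno != EAGAIN && errno != EINTR)
			break;
	}
}

static void store_and_wake(uint32_t *futex_word, uint32_t newval)
{
	*futex_word = newval;
	syscall(SYS_futex, futex_word, FUTEX_WAKE, 1, NULL, NULL, 0);
}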
    2682             : 
    2683         168 : static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
    2684             :                       ktime_t *abs_time, u32 bitset)
    2685             : {
    2686         168 :         struct hrtimer_sleeper timeout, *to;
    2687         168 :         struct restart_block *restart;
    2688         168 :         struct futex_hash_bucket *hb;
    2689         168 :         struct futex_q q = futex_q_init;
    2690         168 :         int ret;
    2691             : 
    2692         168 :         if (!bitset)
    2693             :                 return -EINVAL;
    2694         168 :         q.bitset = bitset;
    2695             : 
    2696         168 :         to = futex_setup_timer(abs_time, &timeout, flags,
    2697         168 :                                current->timer_slack_ns);
    2698         168 : retry:
    2699             :         /*
    2700             :          * Prepare to wait on uaddr. On success, holds hb lock and increments
    2701             :          * q.key refs.
    2702             :          */
    2703         168 :         ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
    2704         168 :         if (ret)
    2705           1 :                 goto out;
    2706             : 
    2707             :         /* queue_me and wait for wakeup, timeout, or a signal. */
    2708         167 :         futex_wait_queue_me(hb, &q, to);
    2709             : 
    2710             :         /* If we were woken (and unqueued), we succeeded, whatever. */
    2711         166 :         ret = 0;
    2712             :         /* unqueue_me() drops q.key ref */
    2713         166 :         if (!unqueue_me(&q))
    2714         166 :                 goto out;
    2715           0 :         ret = -ETIMEDOUT;
    2716           0 :         if (to && !to->task)
    2717           0 :                 goto out;
    2718             : 
    2719             :         /*
    2720             :          * We expect signal_pending(current), but we might be the
    2721             :          * victim of a spurious wakeup as well.
    2722             :          */
    2723           0 :         if (!signal_pending(current))
    2724           0 :                 goto retry;
    2725             : 
    2726           0 :         ret = -ERESTARTSYS;
    2727           0 :         if (!abs_time)
    2728           0 :                 goto out;
    2729             : 
    2730           0 :         restart = &current->restart_block;
    2731           0 :         restart->fn = futex_wait_restart;
    2732           0 :         restart->futex.uaddr = uaddr;
    2733           0 :         restart->futex.val = val;
    2734           0 :         restart->futex.time = *abs_time;
    2735           0 :         restart->futex.bitset = bitset;
    2736           0 :         restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
    2737             : 
    2738           0 :         ret = -ERESTART_RESTARTBLOCK;
    2739             : 
    2740         167 : out:
    2741         167 :         if (to) {
    2742           0 :                 hrtimer_cancel(&to->timer);
    2743           0 :                 destroy_hrtimer_on_stack(&to->timer);
    2744             :         }
    2745             :         return ret;
    2746             : }
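/*
 * Editorial note on the restart block above, assuming this reading of the
 * flow: FUTEX_WAIT takes a relative timeout, which the syscall entry code
 * converts to an absolute expiry before futex_wait() runs, and that absolute
 * value is what restart->futex.time preserves. So when a signal interrupts
 * the wait and it is resumed via ERESTART_RESTARTBLOCK, the deadline does
 * not slide:
 *
 *   futex(FUTEX_WAIT, val, 100ms)   ->  deadline = now + 100ms
 *   60ms pass, a signal arrives, the handler returns
 *   restart re-enters futex_wait() with the same absolute deadline
 *   ->  at most ~40ms of waiting remain, not a fresh 100ms
 */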
    2747             : 
    2748             : 
    2749           0 : static long futex_wait_restart(struct restart_block *restart)
    2750             : {
    2751           0 :         u32 __user *uaddr = restart->futex.uaddr;
    2752           0 :         ktime_t t, *tp = NULL;
    2753             : 
    2754           0 :         if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
    2755           0 :                 t = restart->futex.time;
    2756           0 :                 tp = &t;
    2757             :         }
    2758           0 :         restart->fn = do_no_restart_syscall;
    2759             : 
    2760           0 :         return (long)futex_wait(uaddr, restart->futex.flags,
    2761             :                                 restart->futex.val, tp, restart->futex.bitset);
    2762             : }
    2763             : 
    2764             : 
    2765             : /*
    2766             :  * Userspace tried a 0 -> TID atomic transition of the futex value
    2767             :  * and failed. The kernel side here does the whole locking operation:
    2768             :  * if there are waiters then it will block as a consequence of relying
    2769             :  * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
    2770             :  * a 0 value of the futex too.)
    2771             :  *
    2772             :  * Also serves as the futex trylock_pi() slow path, with matching semantics.
    2773             :  */
    2774           0 : static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
    2775             :                          ktime_t *time, int trylock)
    2776             : {
    2777           0 :         struct hrtimer_sleeper timeout, *to;
    2778           0 :         struct task_struct *exiting = NULL;
    2779           0 :         struct rt_mutex_waiter rt_waiter;
    2780           0 :         struct futex_hash_bucket *hb;
    2781           0 :         struct futex_q q = futex_q_init;
    2782           0 :         int res, ret;
    2783             : 
    2784           0 :         if (!IS_ENABLED(CONFIG_FUTEX_PI))
    2785             :                 return -ENOSYS;
    2786             : 
    2787           0 :         if (refill_pi_state_cache())
    2788             :                 return -ENOMEM;
    2789             : 
    2790           0 :         to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0);
    2791             : 
    2792             : retry:
    2793           0 :         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
    2794           0 :         if (unlikely(ret != 0))
    2795           0 :                 goto out;
    2796             : 
    2797           0 : retry_private:
    2798           0 :         hb = queue_lock(&q);
    2799             : 
    2800           0 :         ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
    2801             :                                    &exiting, 0);
    2802           0 :         if (unlikely(ret)) {
    2803             :                 /*
    2804             :                  * Atomic work succeeded and we got the lock,
    2805             :                  * or failed. Either way, we do _not_ block.
    2806             :                  */
    2807           0 :                 switch (ret) {
    2808           0 :                 case 1:
    2809             :                         /* We got the lock. */
    2810           0 :                         ret = 0;
    2811           0 :                         goto out_unlock_put_key;
    2812           0 :                 case -EFAULT:
    2813           0 :                         goto uaddr_faulted;
    2814             :                 case -EBUSY:
    2815             :                 case -EAGAIN:
    2816             :                         /*
    2817             :                          * Two reasons for this:
    2818             :                          * - EBUSY: Task is exiting and we just wait for the
    2819             :                          *   exit to complete.
    2820             :                          * - EAGAIN: The user space value changed.
    2821             :                          */
    2822           0 :                         queue_unlock(hb);
    2823             :                         /*
    2824             :                          * Handle the case where the owner is in the middle of
    2825             :                          * exiting. Wait for the exit to complete otherwise
    2826             :                          * this task might loop forever, a.k.a. live lock.
    2827             :                          */
    2828           0 :                         wait_for_owner_exiting(ret, exiting);
    2829           0 :                         cond_resched();
    2830           0 :                         goto retry;
    2831           0 :                 default:
    2832           0 :                         goto out_unlock_put_key;
    2833             :                 }
    2834             :         }
    2835             : 
    2836           0 :         WARN_ON(!q.pi_state);
    2837             : 
    2838             :         /*
    2839             :          * Only actually queue now that the atomic ops are done:
    2840             :          */
    2841           0 :         __queue_me(&q, hb);
    2842             : 
    2843           0 :         if (trylock) {
    2844           0 :                 ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
    2845             :                 /* Fixup the trylock return value: */
    2846           0 :                 ret = ret ? 0 : -EWOULDBLOCK;
    2847           0 :                 goto no_block;
    2848             :         }
    2849             : 
    2850           0 :         rt_mutex_init_waiter(&rt_waiter);
    2851             : 
    2852             :         /*
    2853             :          * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
    2854             :          * hold it while doing rt_mutex_start_proxy(), because then it will
    2855             :          * include hb->lock in the blocking chain, even though we'll not in
    2856             :          * fact hold it while blocking. This will lead it to report -EDEADLK
    2857             :          * and BUG when futex_unlock_pi() interleaves with this.
    2858             :          *
    2859             :          * Therefore acquire wait_lock while holding hb->lock, but drop the
    2860             :          * latter before calling __rt_mutex_start_proxy_lock(). This
    2861             :          * interleaves with futex_unlock_pi() -- which does a similar lock
    2862             :          * handoff -- such that the latter can observe the futex_q::pi_state
    2863             :          * before __rt_mutex_start_proxy_lock() is done.
    2864             :          */
    2865           0 :         raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
    2866           0 :         spin_unlock(q.lock_ptr);
    2867             :         /*
    2868             :          * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
    2869             :          * such that futex_unlock_pi() is guaranteed to observe the waiter when
    2870             :          * it sees the futex_q::pi_state.
    2871             :          */
    2872           0 :         ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
    2873           0 :         raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
    2874             : 
    2875           0 :         if (ret) {
    2876           0 :                 if (ret == 1)
    2877           0 :                         ret = 0;
    2878           0 :                 goto cleanup;
    2879             :         }
    2880             : 
    2881           0 :         if (unlikely(to))
    2882           0 :                 hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
    2883             : 
    2884           0 :         ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
    2885             : 
    2886           0 : cleanup:
    2887           0 :         spin_lock(q.lock_ptr);
    2888             :         /*
    2889             :          * If we failed to acquire the lock (deadlock/signal/timeout), we must
    2890             :          * first acquire the hb->lock before removing the lock from the
    2891             :          * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
    2892             :          * lists consistent.
    2893             :          *
    2894             :          * In particular; it is important that futex_unlock_pi() can not
    2895             :          * observe this inconsistency.
    2896             :          */
    2897           0 :         if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
    2898           0 :                 ret = 0;
    2899             : 
    2900           0 : no_block:
    2901             :         /*
    2902             :          * Fixup the pi_state owner and possibly acquire the lock if we
    2903             :          * haven't already.
    2904             :          */
    2905           0 :         res = fixup_owner(uaddr, &q, !ret);
    2906             :         /*
    2907             :          * If fixup_owner() returned an error, propagate that.  If it acquired
    2908             :          * the lock, clear our -ETIMEDOUT or -EINTR.
    2909             :          */
    2910           0 :         if (res)
    2911           0 :                 ret = (res < 0) ? res : 0;
    2912             : 
    2913             :         /* Unqueue and drop the lock */
    2914           0 :         unqueue_me_pi(&q);
    2915           0 :         goto out;
    2916             : 
    2917           0 : out_unlock_put_key:
    2918           0 :         queue_unlock(hb);
    2919             : 
    2920           0 : out:
    2921           0 :         if (to) {
    2922           0 :                 hrtimer_cancel(&to->timer);
    2923           0 :                 destroy_hrtimer_on_stack(&to->timer);
    2924             :         }
    2925           0 :         return ret != -EINTR ? ret : -ERESTARTNOINTR;
    2926             : 
    2927           0 : uaddr_faulted:
    2928           0 :         queue_unlock(hb);
    2929             : 
    2930           0 :         ret = fault_in_user_writeable(uaddr);
    2931           0 :         if (ret)
    2932           0 :                 goto out;
    2933             : 
    2934           0 :         if (!(flags & FLAGS_SHARED))
    2935           0 :                 goto retry_private;
    2936             : 
    2937           0 :         goto retry;
    2938             : }
    2939             : 
    2940             : /*
    2941             :  * Userspace attempted a TID -> 0 atomic transition, and failed.
    2942             :  * This is the in-kernel slowpath: we look up the PI state (if any),
    2943             :  * and do the rt-mutex unlock.
    2944             :  */
    2945           0 : static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
    2946             : {
    2947           0 :         u32 curval, uval, vpid = task_pid_vnr(current);
    2948           0 :         union futex_key key = FUTEX_KEY_INIT;
    2949           0 :         struct futex_hash_bucket *hb;
    2950           0 :         struct futex_q *top_waiter;
    2951           0 :         int ret;
    2952             : 
    2953           0 :         if (!IS_ENABLED(CONFIG_FUTEX_PI))
    2954             :                 return -ENOSYS;
    2955             : 
    2956           0 : retry:
    2957           0 :         if (get_user(uval, uaddr))
    2958             :                 return -EFAULT;
    2959             :         /*
    2960             :          * We release only a lock we actually own:
    2961             :          */
    2962           0 :         if ((uval & FUTEX_TID_MASK) != vpid)
    2963             :                 return -EPERM;
    2964             : 
    2965           0 :         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
    2966           0 :         if (ret)
    2967           0 :                 return ret;
    2968             : 
    2969           0 :         hb = hash_futex(&key);
    2970           0 :         spin_lock(&hb->lock);
    2971             : 
    2972             :         /*
    2973             :          * Check waiters first. We do not trust user space values at
    2974             :          * all and we at least want to know if user space fiddled
    2975             :          * with the futex value instead of blindly unlocking.
    2976             :          */
    2977           0 :         top_waiter = futex_top_waiter(hb, &key);
    2978           0 :         if (top_waiter) {
    2979           0 :                 struct futex_pi_state *pi_state = top_waiter->pi_state;
    2980             : 
    2981           0 :                 ret = -EINVAL;
    2982           0 :                 if (!pi_state)
    2983           0 :                         goto out_unlock;
    2984             : 
    2985             :                 /*
    2986             :                  * If current does not own the pi_state then the futex is
    2987             :                  * inconsistent and user space fiddled with the futex value.
    2988             :                  */
    2989           0 :                 if (pi_state->owner != current)
    2990           0 :                         goto out_unlock;
    2991             : 
    2992           0 :                 get_pi_state(pi_state);
    2993             :                 /*
    2994             :                  * By taking wait_lock while still holding hb->lock, we ensure
    2995             :                  * there is no point where we hold neither; and therefore
    2996             :                  * wake_futex_pi() must observe a state consistent with what we
    2997             :                  * observed.
    2998             :                  *
    2999             :                  * In particular, this forces __rt_mutex_start_proxy_lock() to
    3000             :                  * complete such that we're guaranteed to observe the
    3001             :                  * rt_waiter. Also see the WARN in wake_futex_pi().
    3002             :                  */
    3003           0 :                 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
    3004           0 :                 spin_unlock(&hb->lock);
    3005             : 
    3006             :                 /* drops pi_state->pi_mutex.wait_lock */
    3007           0 :                 ret = wake_futex_pi(uaddr, uval, pi_state);
    3008             : 
    3009           0 :                 put_pi_state(pi_state);
    3010             : 
    3011             :                 /*
    3012             :                  * Success, we're done! No tricky corner cases.
    3013             :                  */
    3014           0 :                 if (!ret)
    3015             :                         return ret;
    3016             :                 /*
    3017             :                  * The atomic access to the futex value generated a
    3018             :                  * pagefault, so retry the user-access and the wakeup:
    3019             :                  */
    3020           0 :                 if (ret == -EFAULT)
    3021           0 :                         goto pi_faulted;
    3022             :                 /*
    3023             :                  * An unconditional UNLOCK_PI op raced against a waiter
    3024             :                  * setting the FUTEX_WAITERS bit. Try again.
    3025             :                  */
    3026           0 :                 if (ret == -EAGAIN)
    3027           0 :                         goto pi_retry;
    3028             :                 /*
    3029             :                  * wake_futex_pi has detected invalid state. Tell user
    3030             :                  * space.
    3031             :                  */
    3032           0 :                 return ret;
    3033             :         }
    3034             : 
    3035             :         /*
    3036             :          * We have no kernel internal state, i.e. no waiters in the
    3037             :          * kernel. Waiters which are about to queue themselves are stuck
    3038             :          * on hb->lock. So we can safely ignore them. We preserve
    3039             :          * neither the WAITERS bit nor the OWNER_DIED one. We are the
    3040             :          * owner.
    3041             :          */
    3042           0 :         if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
    3043           0 :                 spin_unlock(&hb->lock);
    3044           0 :                 switch (ret) {
    3045           0 :                 case -EFAULT:
    3046           0 :                         goto pi_faulted;
    3047             : 
    3048           0 :                 case -EAGAIN:
    3049           0 :                         goto pi_retry;
    3050             : 
    3051             :                 default:
    3052           0 :                         WARN_ON_ONCE(1);
    3053           0 :                         return ret;
    3054             :                 }
    3055             :         }
    3056             : 
    3057             :         /*
    3058             :          * If uval has changed, let user space handle it.
    3059             :          */
    3060           0 :         ret = (curval == uval) ? 0 : -EAGAIN;
    3061             : 
    3062           0 : out_unlock:
    3063           0 :         spin_unlock(&hb->lock);
    3064           0 :         return ret;
    3065             : 
    3066           0 : pi_retry:
    3067           0 :         cond_resched();
    3068           0 :         goto retry;
    3069             : 
    3070           0 : pi_faulted:
    3071             : 
    3072           0 :         ret = fault_in_user_writeable(uaddr);
    3073           0 :         if (!ret)
    3074           0 :                 goto retry;
    3075             : 
    3076             :         return ret;
    3077             : }
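The fast path that this slowpath backs runs entirely in user space: the owner
attempts the TID -> 0 transition with an atomic compare-and-swap and only
enters the kernel when that fails. A minimal sketch of such a caller, assuming
illustrative names (pi_unlock, futex_word); the constants and FUTEX_UNLOCK_PI
come from <linux/futex.h>:

    #include <linux/futex.h>
    #include <stdatomic.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Sketch only; not the glibc implementation. */
    static void pi_unlock(_Atomic __u32 *futex_word, __u32 my_tid)
    {
            __u32 expected = my_tid;

            /* Uncontended fast path: TID -> 0 succeeds purely in user space. */
            if (atomic_compare_exchange_strong(futex_word, &expected, 0))
                    return;

            /*
             * FUTEX_WAITERS is set (or the value was fiddled with): take the
             * in-kernel slowpath above, which revalidates TID ownership.
             */
            syscall(SYS_futex, futex_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
    }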
    3078             : 
    3079             : /**
    3080             :  * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
    3081             :  * @hb:         the hash_bucket futex_q was originally enqueued on
    3082             :  * @q:          the futex_q woken while waiting to be requeued
    3083             :  * @key2:       the futex_key of the requeue target futex
    3084             :  * @timeout:    the timeout associated with the wait (NULL if none)
    3085             :  *
    3086             :  * Detect if the task was woken on the initial futex as opposed to the requeue
    3087             :  * target futex.  If so, determine if it was a timeout or a signal that caused
    3088             :  * the wakeup and return the appropriate error code to the caller.  Must be
    3089             :  * called with the hb lock held.
    3090             :  *
    3091             :  * Return:
    3092             :  *  -  0 = no early wakeup detected;
    3093             :  *  - <0 = -ETIMEDOUT, -ERESTARTNOINTR or -EWOULDBLOCK
    3094             :  */
    3095             : static inline
    3096           0 : int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
    3097             :                                    struct futex_q *q, union futex_key *key2,
    3098             :                                    struct hrtimer_sleeper *timeout)
    3099             : {
    3100           0 :         int ret = 0;
    3101             : 
    3102             :         /*
    3103             :          * With the hb lock held, we avoid races while we process the wakeup.
    3104             :          * We only need to hold hb (and not hb2) to ensure atomicity as the
    3105             :          * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
    3106             :          * It can't be requeued from uaddr2 to something else since we don't
    3107             :          * support a PI aware source futex for requeue.
    3108             :          */
    3109           0 :         if (!match_futex(&q->key, key2)) {
    3110           0 :                 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
    3111             :                 /*
    3112             :                  * We were woken prior to requeue by a timeout or a signal.
    3113             :                  * Unqueue the futex_q and determine which it was.
    3114             :                  */
    3115           0 :                 plist_del(&q->list, &hb->chain);
    3116           0 :                 hb_waiters_dec(hb);
    3117             : 
    3118             :                 /* Handle spurious wakeups gracefully */
    3119           0 :                 ret = -EWOULDBLOCK;
    3120           0 :                 if (timeout && !timeout->task)
    3121             :                         ret = -ETIMEDOUT;
    3122           0 :                 else if (signal_pending(current))
    3123           0 :                         ret = -ERESTARTNOINTR;
    3124             :         }
    3125           0 :         return ret;
    3126             : }
    3127             : 
    3128             : /**
    3129             :  * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
    3130             :  * @uaddr:      the futex we initially wait on (non-pi)
    3131             :  * @flags:      futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
    3132             :  *              the same type, no requeueing from private to shared, etc.
    3133             :  * @val:        the expected value of uaddr
    3134             :  * @abs_time:   absolute timeout
    3135             :  * @bitset:     32 bit wakeup bitset set by userspace, defaults to all
    3136             :  * @uaddr2:     the pi futex we will take prior to returning to user-space
    3137             :  *
    3138             :  * The caller will wait on uaddr and will be requeued by futex_requeue() to
    3139             :  * uaddr2 which must be PI aware and distinct from uaddr.  Normal wakeup will wake
    3140             :  * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
    3141             :  * userspace.  This ensures the rt_mutex maintains an owner when it has waiters;
    3142             :  * without one, the pi logic would not know which task to boost/deboost, if
    3143             :  * there was a need to.
    3144             :  *
    3145             :  * We call schedule() in futex_wait_queue_me() when we enqueue and return there
    3146             :  * via the following:
    3147             :  * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
    3148             :  * 2) wakeup on uaddr2 after a requeue
    3149             :  * 3) signal
    3150             :  * 4) timeout
    3151             :  *
    3152             :  * If 3, cleanup and return -ERESTARTNOINTR.
    3153             :  *
    3154             :  * If 2, we may then block on trying to take the rt_mutex and return via:
    3155             :  * 5) successful lock
    3156             :  * 6) signal
    3157             :  * 7) timeout
    3158             :  * 8) other lock acquisition failure
    3159             :  *
    3160             :  * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
    3161             :  *
    3162             :  * If 4 or 7, we cleanup and return with -ETIMEDOUT.
    3163             :  *
    3164             :  * Return:
    3165             :  *  -  0 - On success;
    3166             :  *  - <0 - On error
    3167             :  */
    3168           0 : static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
    3169             :                                  u32 val, ktime_t *abs_time, u32 bitset,
    3170             :                                  u32 __user *uaddr2)
    3171             : {
    3172           0 :         struct hrtimer_sleeper timeout, *to;
    3173           0 :         struct rt_mutex_waiter rt_waiter;
    3174           0 :         struct futex_hash_bucket *hb;
    3175           0 :         union futex_key key2 = FUTEX_KEY_INIT;
    3176           0 :         struct futex_q q = futex_q_init;
    3177           0 :         int res, ret;
    3178             : 
    3179           0 :         if (!IS_ENABLED(CONFIG_FUTEX_PI))
    3180             :                 return -ENOSYS;
    3181             : 
    3182           0 :         if (uaddr == uaddr2)
    3183             :                 return -EINVAL;
    3184             : 
    3185           0 :         if (!bitset)
    3186             :                 return -EINVAL;
    3187             : 
    3188           0 :         to = futex_setup_timer(abs_time, &timeout, flags,
    3189           0 :                                current->timer_slack_ns);
    3190             : 
    3191             :         /*
    3192             :          * The waiter is allocated on our stack, manipulated by the requeue
    3193             :          * code while we sleep on uaddr.
    3194             :          */
    3195           0 :         rt_mutex_init_waiter(&rt_waiter);
    3196             : 
    3197           0 :         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
    3198           0 :         if (unlikely(ret != 0))
    3199           0 :                 goto out;
    3200             : 
    3201           0 :         q.bitset = bitset;
    3202           0 :         q.rt_waiter = &rt_waiter;
    3203           0 :         q.requeue_pi_key = &key2;
    3204             : 
    3205             :         /*
    3206             :          * Prepare to wait on uaddr. On success, increments q.key (key1) ref
    3207             :          * count.
    3208             :          */
    3209           0 :         ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
    3210           0 :         if (ret)
    3211           0 :                 goto out;
    3212             : 
    3213             :         /*
    3214             :          * The check above which compares uaddrs is not sufficient for
    3215             :          * shared futexes. We need to compare the keys:
    3216             :          */
    3217           0 :         if (match_futex(&q.key, &key2)) {
    3218           0 :                 queue_unlock(hb);
    3219           0 :                 ret = -EINVAL;
    3220           0 :                 goto out;
    3221             :         }
    3222             : 
    3223             :         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
    3224           0 :         futex_wait_queue_me(hb, &q, to);
    3225             : 
    3226           0 :         spin_lock(&hb->lock);
    3227           0 :         ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
    3228           0 :         spin_unlock(&hb->lock);
    3229           0 :         if (ret)
    3230           0 :                 goto out;
    3231             : 
    3232             :         /*
    3233             :          * In order for us to be here, we know our q.key == key2, and since
    3234             :          * we took the hb->lock above, we also know that futex_requeue() has
    3235             :          * completed and we no longer have to concern ourselves with a wakeup
    3236             :          * race with the atomic proxy lock acquisition by the requeue code. The
    3237             :          * futex_requeue dropped our key1 reference and incremented our key2
    3238             :          * reference count.
    3239             :          */
    3240             : 
    3241             :         /* Check if the requeue code acquired the second futex for us. */
    3242           0 :         if (!q.rt_waiter) {
    3243             :                 /*
    3244             :                  * Got the lock. We might not be the anticipated owner if we
    3245             :                  * did a lock-steal - fix up the PI-state in that case.
    3246             :                  */
    3247           0 :                 if (q.pi_state && (q.pi_state->owner != current)) {
    3248           0 :                         spin_lock(q.lock_ptr);
    3249           0 :                         ret = fixup_pi_state_owner(uaddr2, &q, current);
    3250             :                         /*
    3251             :                          * Drop the reference to the pi state which
    3252             :                          * the requeue_pi() code acquired for us.
    3253             :                          */
    3254           0 :                         put_pi_state(q.pi_state);
    3255           0 :                         spin_unlock(q.lock_ptr);
    3256             :                         /*
    3257             :                          * Adjust the return value. It's either -EFAULT or
    3258             :                          * success (1) but the caller expects 0 for success.
    3259             :                          */
    3260           0 :                         ret = ret < 0 ? ret : 0;
    3261             :                 }
    3262             :         } else {
    3263           0 :                 struct rt_mutex *pi_mutex;
    3264             : 
    3265             :                 /*
    3266             :                  * We have been woken up by futex_unlock_pi(), a timeout, or a
    3267             :                  * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
    3268             :                  * the pi_state.
    3269             :                  */
    3270           0 :                 WARN_ON(!q.pi_state);
    3271           0 :                 pi_mutex = &q.pi_state->pi_mutex;
    3272           0 :                 ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
    3273             : 
    3274           0 :                 spin_lock(q.lock_ptr);
    3275           0 :                 if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
    3276           0 :                         ret = 0;
    3277             : 
    3278           0 :                 debug_rt_mutex_free_waiter(&rt_waiter);
    3279             :                 /*
    3280             :                  * Fixup the pi_state owner and possibly acquire the lock if we
    3281             :                  * haven't already.
    3282             :                  */
    3283           0 :                 res = fixup_owner(uaddr2, &q, !ret);
    3284             :                 /*
    3285             :                  * If fixup_owner() returned an error, propagate that.  If it
    3286             :                  * acquired the lock, clear -ETIMEDOUT or -EINTR.
    3287             :                  */
    3288           0 :                 if (res)
    3289           0 :                         ret = (res < 0) ? res : 0;
    3290             : 
    3291             :                 /* Unqueue and drop the lock. */
    3292           0 :                 unqueue_me_pi(&q);
    3293             :         }
    3294             : 
    3295           0 :         if (ret == -EINTR) {
    3296             :                 /*
    3297             :                  * We've already been requeued, but cannot restart by calling
    3298             :                  * futex_lock_pi() directly. We could restart this syscall, but
    3299             :                  * it would detect that the user space "val" changed and return
    3300             :                  * -EWOULDBLOCK.  Save the overhead of the restart and return
    3301             :                  * -EWOULDBLOCK directly.
    3302             :                  */
    3303           0 :                 ret = -EWOULDBLOCK;
    3304             :         }
    3305             : 
    3306           0 : out:
    3307           0 :         if (to) {
    3308           0 :                 hrtimer_cancel(&to->timer);
    3309           0 :                 destroy_hrtimer_on_stack(&to->timer);
    3310             :         }
    3311             :         return ret;
    3312             : }
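For reference, a hedged sketch of how a user-space condition variable could
pair this with FUTEX_CMP_REQUEUE_PI on the waker side. The helper names are
hypothetical; note that nr_requeue travels in the timeout argument slot and
val3 carries the expected condvar value:

    #include <limits.h>
    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Waiter: sleep on @cond; on a normal wakeup we own the PI futex @mutex. */
    static long cond_wait_pi(__u32 *cond, __u32 seen, __u32 *mutex)
    {
            /* The caller must have dropped @mutex first, as with pthreads. */
            return syscall(SYS_futex, cond, FUTEX_WAIT_REQUEUE_PI, seen,
                           NULL /* no timeout */, mutex, 0);
    }

    /* Waker: wake one waiter, requeue the rest onto the PI futex @mutex. */
    static long cond_broadcast_pi(__u32 *cond, __u32 seen, __u32 *mutex)
    {
            return syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE_PI,
                           1 /* nr_wake must be 1 */,
                           (void *)(unsigned long)INT_MAX /* nr_requeue */,
                           mutex, seen);
    }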
    3313             : 
    3314             : /*
    3315             :  * Support for robust futexes: the kernel cleans up held futexes at
    3316             :  * thread exit time.
    3317             :  *
    3318             :  * Implementation: user-space maintains a per-thread list of locks it
    3319             :  * is holding. Upon do_exit(), the kernel carefully walks this list,
    3320             :  * and marks all locks that are owned by this thread with the
    3321             :  * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
    3322             :  * always manipulated with the lock held, so the list is private and
    3323             :  * per-thread. Userspace also maintains a per-thread 'list_op_pending'
    3324             :  * field, to allow the kernel to clean up if the thread dies after
    3325             :  * acquiring the lock, but just before it could have added itself to
    3326             :  * the list. There can only be one such pending lock.
    3327             :  */
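From user space, registration amounts to publishing a robust_list_head once
per thread. A sketch, assuming a hypothetical per-lock record (struct
robust_mutex); the head layout itself is the <linux/futex.h> UAPI one:

    #include <linux/futex.h>
    #include <stddef.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* One record per robust mutex; the futex word sits at a fixed offset. */
    struct robust_mutex {
            struct robust_list list;   /* linkage the kernel walks on exit */
            __u32 futex;               /* owner TID, 0 if unlocked */
    };

    static __thread struct robust_list_head robust_head;

    static void robust_init(void)
    {
            robust_head.list.next = &robust_head.list;  /* empty circular list */
            robust_head.futex_offset = offsetof(struct robust_mutex, futex);
            robust_head.list_op_pending = NULL;
            syscall(SYS_set_robust_list, &robust_head, sizeof(robust_head));
    }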
    3328             : 
    3329             : /**
    3330             :  * sys_set_robust_list() - Set the robust-futex list head of a task
    3331             :  * @head:       pointer to the list-head
    3332             :  * @len:        length of the list-head, as userspace expects
    3333             :  */
    3334        1428 : SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
    3335             :                 size_t, len)
    3336             : {
    3337         714 :         if (!futex_cmpxchg_enabled)
    3338             :                 return -ENOSYS;
    3339             :         /*
    3340             :          * The kernel knows only one size for now:
    3341             :          */
    3342         714 :         if (unlikely(len != sizeof(*head)))
    3343             :                 return -EINVAL;
    3344             : 
    3345         714 :         current->robust_list = head;
    3346             : 
    3347         714 :         return 0;
    3348             : }
    3349             : 
    3350             : /**
    3351             :  * sys_get_robust_list() - Get the robust-futex list head of a task
    3352             :  * @pid:        pid of the process [zero for current task]
    3353             :  * @head_ptr:   pointer to a list-head pointer, the kernel fills it in
    3354             :  * @len_ptr:    pointer to a length field, the kernel fills in the header size
    3355             :  */
    3356           0 : SYSCALL_DEFINE3(get_robust_list, int, pid,
    3357             :                 struct robust_list_head __user * __user *, head_ptr,
    3358             :                 size_t __user *, len_ptr)
    3359             : {
    3360           0 :         struct robust_list_head __user *head;
    3361           0 :         unsigned long ret;
    3362           0 :         struct task_struct *p;
    3363             : 
    3364           0 :         if (!futex_cmpxchg_enabled)
    3365             :                 return -ENOSYS;
    3366             : 
    3367           0 :         rcu_read_lock();
    3368             : 
    3369           0 :         ret = -ESRCH;
    3370           0 :         if (!pid)
    3371           0 :                 p = current;
    3372             :         else {
    3373           0 :                 p = find_task_by_vpid(pid);
    3374           0 :                 if (!p)
    3375           0 :                         goto err_unlock;
    3376             :         }
    3377             : 
    3378           0 :         ret = -EPERM;
    3379           0 :         if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
    3380           0 :                 goto err_unlock;
    3381             : 
    3382           0 :         head = p->robust_list;
    3383           0 :         rcu_read_unlock();
    3384             : 
    3385           0 :         if (put_user(sizeof(*head), len_ptr))
    3386             :                 return -EFAULT;
    3387           0 :         return put_user(head, head_ptr);
    3388             : 
    3389           0 : err_unlock:
    3390           0 :         rcu_read_unlock();
    3391             : 
    3392           0 :         return ret;
    3393             : }
    3394             : 
    3395             : /* Constants for the pending_op argument of handle_futex_death */
    3396             : #define HANDLE_DEATH_PENDING    true
    3397             : #define HANDLE_DEATH_LIST       false
    3398             : 
    3399             : /*
    3400             :  * Process a futex-list entry, check whether it's owned by the
    3401             :  * dying task, and do notification if so:
    3402             :  */
    3403           0 : static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
    3404             :                               bool pi, bool pending_op)
    3405             : {
    3406           0 :         u32 uval, nval, mval;
    3407           0 :         int err;
    3408             : 
    3409             :         /* Futex address must be 32-bit aligned */
    3410           0 :         if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
    3411             :                 return -1;
    3412             : 
    3413           0 : retry:
    3414           0 :         if (get_user(uval, uaddr))
    3415             :                 return -1;
    3416             : 
    3417             :         /*
    3418             :          * Special case for regular (non PI) futexes. The unlock path in
    3419             :          * user space has two race scenarios:
    3420             :          *
    3421             :          * 1. The unlock path releases the user space futex value and
    3422             :          *    before it can execute the futex() syscall to wake up
    3423             :          *    waiters it is killed.
    3424             :          *
    3425             :          * 2. A woken up waiter is killed before it can acquire the
    3426             :          *    futex in user space.
    3427             :          *
    3428             :          * In both cases the TID validation below prevents a wakeup of
    3429             :          * potential waiters which can cause these waiters to block
    3430             :          * forever.
    3431             :          *
    3432             :          * In both cases the following conditions are met:
    3433             :          *
    3434             :          *      1) task->robust_list->list_op_pending != NULL
    3435             :          *         @pending_op == true
    3436             :          *      2) User space futex value == 0
    3437             :          *      3) Regular futex: @pi == false
    3438             :          *
    3439             :          * If these conditions are met, it is safe to attempt waking up a
    3440             :          * potential waiter without touching the user space futex value and
    3441             :          * trying to set the OWNER_DIED bit. The user space futex value is
    3442             :          * uncontended and the rest of the user space mutex state is
    3443             :          * consistent, so a woken waiter will just take over the
    3444             :          * uncontended futex. Setting the OWNER_DIED bit would create
    3445             :          * inconsistent state and malfunction of the user space owner died
    3446             :          * handling.
    3447             :          */
    3448           0 :         if (pending_op && !pi && !uval) {
    3449           0 :                 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
    3450           0 :                 return 0;
    3451             :         }
    3452             : 
    3453           0 :         if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
    3454             :                 return 0;
    3455             : 
    3456             :         /*
    3457             :          * Ok, this dying thread is truly holding a futex
    3458             :          * of interest. Set the OWNER_DIED bit atomically
    3459             :          * via cmpxchg, and if the value had FUTEX_WAITERS
    3460             :          * set, wake up a waiter (if any). (We have to do a
    3461             :          * futex_wake() even if OWNER_DIED is already set -
    3462             :          * to handle the rare but possible case of recursive
    3463             :          * thread-death.) The rest of the cleanup is done in
    3464             :          * userspace.
    3465             :          */
    3466           0 :         mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
    3467             : 
    3468             :         /*
    3469             :          * We are not holding a lock here, but we want to have
    3470             :          * the pagefault_disable/enable() protection because
    3471             :          * we want to handle the fault gracefully. If the
    3472             :          * access fails we try to fault in the futex with R/W
    3473             :          * verification via get_user_pages. get_user() above
    3474             :          * does not guarantee R/W access. If that fails we
    3475             :          * give up and leave the futex locked.
    3476             :          */
    3477           0 :         if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
    3478           0 :                 switch (err) {
    3479           0 :                 case -EFAULT:
    3480           0 :                         if (fault_in_user_writeable(uaddr))
    3481             :                                 return -1;
    3482           0 :                         goto retry;
    3483             : 
    3484           0 :                 case -EAGAIN:
    3485           0 :                         cond_resched();
    3486           0 :                         goto retry;
    3487             : 
    3488             :                 default:
    3489           0 :                         WARN_ON_ONCE(1);
    3490           0 :                         return err;
    3491             :                 }
    3492             :         }
    3493             : 
    3494           0 :         if (nval != uval)
    3495           0 :                 goto retry;
    3496             : 
    3497             :         /*
    3498             :          * Wake robust non-PI futexes here. The wakeup of
    3499             :          * PI futexes happens in exit_pi_state():
    3500             :          */
    3501           0 :         if (!pi && (uval & FUTEX_WAITERS))
    3502           0 :                 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
    3503             : 
    3504             :         return 0;
    3505             : }
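The waiter-side counterpart, sketched below: a woken waiter finds the TID
cleared and FUTEX_OWNER_DIED set, takes the lock over, and reports EOWNERDEAD
so the caller can repair the protected state. robust_trylock() is an
illustrative name (glibc's robust pthread_mutex_lock() plays this role), and
the sketch ignores FUTEX_WAITERS for brevity:

    #include <errno.h>
    #include <linux/futex.h>
    #include <stdatomic.h>

    static int robust_trylock(_Atomic __u32 *futex_word, __u32 my_tid)
    {
            __u32 expected = 0;

            if (atomic_compare_exchange_strong(futex_word, &expected, my_tid))
                    return 0;               /* clean acquisition */

            /* Previous owner died: TID cleared, FUTEX_OWNER_DIED set. */
            expected = FUTEX_OWNER_DIED;
            if (atomic_compare_exchange_strong(futex_word, &expected, my_tid))
                    return EOWNERDEAD;      /* caller must repair the state */

            return EBUSY;                   /* a live owner still holds it */
    }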
    3506             : 
    3507             : /*
    3508             :  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
    3509             :  */
    3510        1386 : static inline int fetch_robust_entry(struct robust_list __user **entry,
    3511             :                                      struct robust_list __user * __user *head,
    3512             :                                      unsigned int *pi)
    3513             : {
    3514        1386 :         unsigned long uentry;
    3515             : 
    3516        1386 :         if (get_user(uentry, (unsigned long __user *)head))
    3517             :                 return -EFAULT;
    3518             : 
    3519        1386 :         *entry = (void __user *)(uentry & ~1UL);
    3520        1386 :         *pi = uentry & 1;
    3521             : 
    3522        1386 :         return 0;
    3523             : }
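The matching encoder on the user-space side is trivial; a PI lock publishes
its entry with bit 0 set (illustrative helper):

    #include <linux/futex.h>

    static struct robust_list *encode_robust_entry(struct robust_list *entry,
                                                   int pi)
    {
            /* Bit 0 of the published pointer marks a PI futex. */
            return (struct robust_list *)((unsigned long)entry | (pi ? 1UL : 0UL));
    }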
    3524             : 
    3525             : /*
    3526             :  * Walk curr->robust_list (very carefully, it's a userspace list!)
    3527             :  * and mark any locks found there dead, and notify any waiters.
    3528             :  *
    3529             :  * We silently return on any sign of list-walking problem.
    3530             :  */
    3531         693 : static void exit_robust_list(struct task_struct *curr)
    3532             : {
    3533         693 :         struct robust_list_head __user *head = curr->robust_list;
    3534         693 :         struct robust_list __user *entry, *next_entry, *pending;
    3535         693 :         unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
    3536         693 :         unsigned int next_pi;
    3537         693 :         unsigned long futex_offset;
    3538         693 :         int rc;
    3539             : 
    3540         693 :         if (!futex_cmpxchg_enabled)
    3541           0 :                 return;
    3542             : 
    3543             :         /*
    3544             :          * Fetch the list head (which was registered earlier, via
    3545             :          * sys_set_robust_list()):
    3546             :          */
    3547         693 :         if (fetch_robust_entry(&entry, &head->list.next, &pi))
    3548             :                 return;
    3549             :         /*
    3550             :          * Fetch the relative futex offset:
    3551             :          */
    3552         693 :         if (get_user(futex_offset, &head->futex_offset))
    3553             :                 return;
    3554             :         /*
    3555             :          * Fetch any possibly pending lock-add first, and handle it
    3556             :          * if it exists:
    3557             :          */
    3558         693 :         if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
    3559             :                 return;
    3560             : 
    3561         693 :         next_entry = NULL;      /* avoid warning with gcc */
    3562         693 :         while (entry != &head->list) {
    3563             :                 /*
    3564             :                  * Fetch the next entry in the list before calling
    3565             :                  * handle_futex_death:
    3566             :                  */
    3567           0 :                 rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
    3568             :                 /*
    3569             :                  * A pending lock might already be on the list, so
    3570             :                  * don't process it twice:
    3571             :                  */
    3572           0 :                 if (entry != pending) {
    3573           0 :                         if (handle_futex_death((void __user *)entry + futex_offset,
    3574             :                                                 curr, pi, HANDLE_DEATH_LIST))
    3575             :                                 return;
    3576             :                 }
    3577           0 :                 if (rc)
    3578             :                         return;
    3579           0 :                 entry = next_entry;
    3580           0 :                 pi = next_pi;
    3581             :                 /*
    3582             :                  * Avoid excessively long or circular lists:
    3583             :                  */
    3584           0 :                 if (!--limit)
    3585             :                         break;
    3586             : 
    3587           0 :                 cond_resched();
    3588             :         }
    3589             : 
    3590         693 :         if (pending) {
    3591           0 :                 handle_futex_death((void __user *)pending + futex_offset,
    3592             :                                    curr, pip, HANDLE_DEATH_PENDING);
    3593             :         }
    3594             : }
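The ordering this walk relies on looks, from user space, roughly like the
sketch below, reusing robust_head and struct robust_mutex from the earlier
sketch; cas() and futex_wait() are hypothetical stand-ins for the real atomic
and FUTEX_WAIT plumbing. The point is that list_op_pending is published before
the acquisition attempt, so a death in the window before the list insertion is
still cleaned up:

    static void robust_lock(struct robust_mutex *m, __u32 my_tid)
    {
            /* Publish intent first: a death between the CAS below and the
             * list insertion is then handled via list_op_pending. */
            robust_head.list_op_pending = &m->list;

            while (!cas(&m->futex, 0, my_tid))  /* hypothetical helper */
                    futex_wait(&m->futex);      /* hypothetical helper */

            /* Link the held lock into the per-thread list, then clear the
             * pending slot. */
            m->list.next = robust_head.list.next;
            robust_head.list.next = &m->list;
            robust_head.list_op_pending = NULL;
    }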
    3595             : 
    3596        1865 : static void futex_cleanup(struct task_struct *tsk)
    3597             : {
    3598        1865 :         if (unlikely(tsk->robust_list)) {
    3599         693 :                 exit_robust_list(tsk);
    3600         693 :                 tsk->robust_list = NULL;
    3601             :         }
    3602             : 
    3603             : #ifdef CONFIG_COMPAT
    3604        1865 :         if (unlikely(tsk->compat_robust_list)) {
    3605           0 :                 compat_exit_robust_list(tsk);
    3606           0 :                 tsk->compat_robust_list = NULL;
    3607             :         }
    3608             : #endif
    3609             : 
    3610        1865 :         if (unlikely(!list_empty(&tsk->pi_state_list)))
    3611           0 :                 exit_pi_state_list(tsk);
    3612        1865 : }
    3613             : 
    3614             : /**
    3615             :  * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
    3616             :  * @tsk:        task to set the state on
    3617             :  *
    3618             :  * Set the futex exit state of the task lockless. The futex waiter code
    3619             :  * observes that state when a task is exiting and loops until the task has
    3620             :  * actually finished the futex cleanup. The worst case for this is that the
    3621             :  * waiter runs through the wait loop until the state becomes visible.
    3622             :  *
    3623             :  * This is called from the recursive fault handling path in do_exit().
    3624             :  *
    3625             :  * This is best effort. Either the futex exit code has run already or
    3626             :  * not. If the OWNER_DIED bit has been set on the futex then the waiter can
    3627             :  * take it over. If not, the problem is pushed back to user space. If the
    3628             :  * futex exit code did not run yet, then an already queued waiter might
    3629             :  * block forever, but there is nothing which can be done about that.
    3630             :  */
    3631           0 : void futex_exit_recursive(struct task_struct *tsk)
    3632             : {
    3633             :         /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
    3634           0 :         if (tsk->futex_state == FUTEX_STATE_EXITING)
    3635           0 :                 mutex_unlock(&tsk->futex_exit_mutex);
    3636           0 :         tsk->futex_state = FUTEX_STATE_DEAD;
    3637           0 : }
    3638             : 
    3639        1865 : static void futex_cleanup_begin(struct task_struct *tsk)
    3640             : {
    3641             :         /*
    3642             :          * Prevent various race issues against a concurrent incoming waiter
    3643             :          * including live locks by forcing the waiter to block on
    3644             :          * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
    3645             :          * attach_to_pi_owner().
    3646             :          */
    3647        1865 :         mutex_lock(&tsk->futex_exit_mutex);
    3648             : 
    3649             :         /*
    3650             :          * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
    3651             :          *
    3652             :          * This ensures that all subsequent checks of tsk->futex_state in
    3653             :          * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
    3654             :          * tsk->pi_lock held.
    3655             :          *
    3656             :          * It guarantees also that a pi_state which was queued right before
    3657             :          * the state change under tsk->pi_lock by a concurrent waiter must
    3658             :          * be observed in exit_pi_state_list().
    3659             :          */
    3660        1865 :         raw_spin_lock_irq(&tsk->pi_lock);
    3661        1865 :         tsk->futex_state = FUTEX_STATE_EXITING;
    3662        1865 :         raw_spin_unlock_irq(&tsk->pi_lock);
    3663        1865 : }
    3664             : 
    3665        1865 : static void futex_cleanup_end(struct task_struct *tsk, int state)
    3666             : {
    3667             :         /*
    3668             :          * Lockless store. The only side effect is that an observer might
    3669             :          * take another loop until it becomes visible.
    3670             :          */
    3671        1865 :         tsk->futex_state = state;
    3672             :         /*
    3673             :          * Drop the exit protection. This unblocks waiters which observed
    3674             :          * FUTEX_STATE_EXITING to reevaluate the state.
    3675             :          */
    3676        1865 :         mutex_unlock(&tsk->futex_exit_mutex);
    3677             : }
    3678             : 
    3679         790 : void futex_exec_release(struct task_struct *tsk)
    3680             : {
    3681             :         /*
    3682             :          * The state handling is done for consistency, but in the case of
    3683             :          * exec() there is no way to prevent further damage as the PID stays
    3684             :          * the same. But for the unlikely and arguably buggy case that a
    3685             :          * futex is held on exec(), this provides at least as much state
    3686             :          * consistency protection as is possible.
    3687             :          */
    3688         790 :         futex_cleanup_begin(tsk);
    3689         790 :         futex_cleanup(tsk);
    3690             :         /*
    3691             :          * Reset the state to FUTEX_STATE_OK. The task is alive and about
    3692             :          * to exec a new binary.
    3693             :          */
    3694         790 :         futex_cleanup_end(tsk, FUTEX_STATE_OK);
    3695         790 : }
    3696             : 
    3697        1075 : void futex_exit_release(struct task_struct *tsk)
    3698             : {
    3699        1075 :         futex_cleanup_begin(tsk);
    3700        1075 :         futex_cleanup(tsk);
    3701        1075 :         futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
    3702        1075 : }
    3703             : 
    3704         762 : long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
    3705             :                 u32 __user *uaddr2, u32 val2, u32 val3)
    3706             : {
    3707         762 :         int cmd = op & FUTEX_CMD_MASK;
    3708         762 :         unsigned int flags = 0;
    3709             : 
    3710         762 :         if (!(op & FUTEX_PRIVATE_FLAG))
    3711           2 :                 flags |= FLAGS_SHARED;
    3712             : 
    3713         762 :         if (op & FUTEX_CLOCK_REALTIME) {
    3714           0 :                 flags |= FLAGS_CLOCKRT;
    3715           0 :                 if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET &&
    3716             :                     cmd != FUTEX_WAIT_REQUEUE_PI)
    3717             :                         return -ENOSYS;
    3718             :         }
    3719             : 
    3720         762 :         switch (cmd) {
    3721           0 :         case FUTEX_LOCK_PI:
    3722             :         case FUTEX_UNLOCK_PI:
    3723             :         case FUTEX_TRYLOCK_PI:
    3724             :         case FUTEX_WAIT_REQUEUE_PI:
    3725             :         case FUTEX_CMP_REQUEUE_PI:
    3726           0 :                 if (!futex_cmpxchg_enabled)
    3727             :                         return -ENOSYS;
    3728             :         }
    3729             : 
    3730         762 :         switch (cmd) {
    3731         168 :         case FUTEX_WAIT:
    3732         168 :                 val3 = FUTEX_BITSET_MATCH_ANY;
    3733         168 :                 fallthrough;
    3734         168 :         case FUTEX_WAIT_BITSET:
    3735         168 :                 return futex_wait(uaddr, flags, val, timeout, val3);
    3736         594 :         case FUTEX_WAKE:
    3737         594 :                 val3 = FUTEX_BITSET_MATCH_ANY;
    3738         594 :                 fallthrough;
    3739         594 :         case FUTEX_WAKE_BITSET:
    3740         594 :                 return futex_wake(uaddr, flags, val, val3);
    3741           0 :         case FUTEX_REQUEUE:
    3742           0 :                 return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
    3743           0 :         case FUTEX_CMP_REQUEUE:
    3744           0 :                 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
    3745           0 :         case FUTEX_WAKE_OP:
    3746           0 :                 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
    3747           0 :         case FUTEX_LOCK_PI:
    3748           0 :                 return futex_lock_pi(uaddr, flags, timeout, 0);
    3749           0 :         case FUTEX_UNLOCK_PI:
    3750           0 :                 return futex_unlock_pi(uaddr, flags);
    3751           0 :         case FUTEX_TRYLOCK_PI:
    3752           0 :                 return futex_lock_pi(uaddr, flags, NULL, 1);
    3753           0 :         case FUTEX_WAIT_REQUEUE_PI:
    3754           0 :                 val3 = FUTEX_BITSET_MATCH_ANY;
    3755           0 :                 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
    3756             :                                              uaddr2);
    3757           0 :         case FUTEX_CMP_REQUEUE_PI:
    3758           0 :                 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
    3759             :         }
    3760             :         return -ENOSYS;
    3761             : }
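glibc ships no futex() wrapper (see futex(2)), so user space reaches this
dispatcher through syscall(2). A thin wrapper, for illustration:

    #include <linux/futex.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    static long futex(uint32_t *uaddr, int op, uint32_t val,
                      const struct timespec *timeout,
                      uint32_t *uaddr2, uint32_t val3)
    {
            return syscall(SYS_futex, uaddr, op, val, timeout, uaddr2, val3);
    }

    /* Typical pairing: sleep while *addr == 0, wake one sleeper later:
     *     futex(addr, FUTEX_WAIT_PRIVATE, 0, NULL, NULL, 0);
     *     futex(addr, FUTEX_WAKE_PRIVATE, 1, NULL, NULL, 0);
     */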
    3762             : 
    3763             : 
    3764        1520 : SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
    3765             :                 const struct __kernel_timespec __user *, utime,
    3766             :                 u32 __user *, uaddr2, u32, val3)
    3767             : {
    3768         760 :         struct timespec64 ts;
    3769         760 :         ktime_t t, *tp = NULL;
    3770         760 :         u32 val2 = 0;
    3771         760 :         int cmd = op & FUTEX_CMD_MASK;
    3772             : 
    3773         760 :         if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
    3774           4 :                       cmd == FUTEX_WAIT_BITSET ||
    3775           4 :                       cmd == FUTEX_WAIT_REQUEUE_PI)) {
    3776           0 :                 if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
    3777             :                         return -EFAULT;
    3778           0 :                 if (get_timespec64(&ts, utime))
    3779             :                         return -EFAULT;
    3780           0 :                 if (!timespec64_valid(&ts))
    3781             :                         return -EINVAL;
    3782             : 
    3783           0 :                 t = timespec64_to_ktime(ts);
    3784           0 :                 if (cmd == FUTEX_WAIT)
    3785           0 :                         t = ktime_add_safe(ktime_get(), t);
    3786             :                 else if (!(op & FUTEX_CLOCK_REALTIME))
    3787         760 :                         t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
    3788             :                 tp = &t;
    3789             :         }
    3790             :         /*
    3791             :          * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
    3792             :          * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
    3793             :          */
    3794         760 :         if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
    3795         760 :             cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
    3796           0 :                 val2 = (u32) (unsigned long) utime;
    3797             : 
    3798         760 :         return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
    3799             : }
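Note the type punning above: for the requeue and wake-op commands the utime
slot carries a plain integer (val2), not a timespec pointer. A sketch using
the futex() wrapper from the previous example (cond, mutex and cond_val are
illustrative):

    #include <limits.h>

    static void requeue_waiters(uint32_t *cond, uint32_t *mutex,
                                uint32_t cond_val)
    {
            /* Wake one waiter on @cond, move up to INT_MAX more onto @mutex;
             * the "timeout" argument is really nr_requeue here. */
            futex(cond, FUTEX_CMP_REQUEUE, 1,
                  (const struct timespec *)(unsigned long)INT_MAX,
                  mutex, cond_val);
    }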
    3800             : 
    3801             : #ifdef CONFIG_COMPAT
    3802             : /*
    3803             :  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
    3804             :  */
    3805             : static inline int
    3806           0 : compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
    3807             :                    compat_uptr_t __user *head, unsigned int *pi)
    3808             : {
    3809           0 :         if (get_user(*uentry, head))
    3810             :                 return -EFAULT;
    3811             : 
    3812           0 :         *entry = compat_ptr((*uentry) & ~1);
    3813           0 :         *pi = (unsigned int)(*uentry) & 1;
    3814             : 
    3815           0 :         return 0;
    3816             : }
    3817             : 
    3818           0 : static void __user *futex_uaddr(struct robust_list __user *entry,
    3819             :                                 compat_long_t futex_offset)
    3820             : {
    3821           0 :         compat_uptr_t base = ptr_to_compat(entry);
    3822           0 :         void __user *uaddr = compat_ptr(base + futex_offset);
    3823             : 
    3824           0 :         return uaddr;
    3825             : }
    3826             : 
    3827             : /*
    3828             :  * Walk curr->robust_list (very carefully, it's a userspace list!)
    3829             :  * and mark any locks found there dead, and notify any waiters.
    3830             :  *
    3831             :  * We silently return on any sign of list-walking problem.
    3832             :  */
    3833           0 : static void compat_exit_robust_list(struct task_struct *curr)
    3834             : {
    3835           0 :         struct compat_robust_list_head __user *head = curr->compat_robust_list;
    3836           0 :         struct robust_list __user *entry, *next_entry, *pending;
    3837           0 :         unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
    3838           0 :         unsigned int next_pi;
    3839           0 :         compat_uptr_t uentry, next_uentry, upending;
    3840           0 :         compat_long_t futex_offset;
    3841           0 :         int rc;
    3842             : 
    3843           0 :         if (!futex_cmpxchg_enabled)
    3844           0 :                 return;
    3845             : 
    3846             :         /*
    3847             :          * Fetch the list head (which was registered earlier, via
    3848             :          * sys_set_robust_list()):
    3849             :          */
    3850           0 :         if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
    3851             :                 return;
    3852             :         /*
    3853             :          * Fetch the relative futex offset:
    3854             :          */
    3855           0 :         if (get_user(futex_offset, &head->futex_offset))
    3856             :                 return;
    3857             :         /*
    3858             :          * Fetch any possibly pending lock-add first, and handle it
    3859             :          * if it exists:
    3860             :          */
    3861           0 :         if (compat_fetch_robust_entry(&upending, &pending,
    3862             :                                &head->list_op_pending, &pip))
    3863             :                 return;
    3864             : 
    3865           0 :         next_entry = NULL;      /* avoid warning with gcc */
    3866           0 :         while (entry != (struct robust_list __user *) &head->list) {
    3867             :                 /*
    3868             :                  * Fetch the next entry in the list before calling
    3869             :                  * handle_futex_death:
    3870             :                  */
    3871           0 :                 rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
    3872           0 :                         (compat_uptr_t __user *)&entry->next, &next_pi);
    3873             :                 /*
    3874             :                  * A pending lock might already be on the list, so
    3875             :                  * don't process it twice:
    3876             :                  */
    3877           0 :                 if (entry != pending) {
    3878           0 :                         void __user *uaddr = futex_uaddr(entry, futex_offset);
    3879             : 
    3880           0 :                         if (handle_futex_death(uaddr, curr, pi,
    3881             :                                                HANDLE_DEATH_LIST))
    3882             :                                 return;
    3883             :                 }
    3884           0 :                 if (rc)
    3885             :                         return;
    3886           0 :                 uentry = next_uentry;
    3887           0 :                 entry = next_entry;
    3888           0 :                 pi = next_pi;
    3889             :                 /*
    3890             :                  * Avoid excessively long or circular lists:
    3891             :                  */
    3892           0 :                 if (!--limit)
    3893             :                         break;
    3894             : 
    3895           0 :                 cond_resched();
    3896             :         }
    3897           0 :         if (pending) {
    3898           0 :                 void __user *uaddr = futex_uaddr(pending, futex_offset);
    3899             : 
    3900           0 :                 handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
    3901             :         }
    3902             : }
    3903             : 
    3904           0 : COMPAT_SYSCALL_DEFINE2(set_robust_list,
    3905             :                 struct compat_robust_list_head __user *, head,
    3906             :                 compat_size_t, len)
    3907             : {
    3908           0 :         if (!futex_cmpxchg_enabled)
    3909             :                 return -ENOSYS;
    3910             : 
    3911           0 :         if (unlikely(len != sizeof(*head)))
    3912             :                 return -EINVAL;
    3913             : 
    3914           0 :         current->compat_robust_list = head;
    3915             : 
    3916           0 :         return 0;
    3917             : }
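
There is no glibc wrapper for this syscall (glibc registers its own list
internally at thread start), so a minimal hand-rolled registration from
userspace looks roughly like the following sketch:

        #include <linux/futex.h>        /* struct robust_list_head */
        #include <sys/syscall.h>
        #include <unistd.h>

        static struct robust_list_head head = {
                .list            = { &head.list },  /* empty circular list */
                .futex_offset    = 0,
                .list_op_pending = NULL,
        };

        static long register_robust_list(void)
        {
                /* len must be exactly sizeof(*head), as checked above */
                return syscall(SYS_set_robust_list, &head, sizeof(head));
        }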
    3918             : 
    3919           0 : COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
    3920             :                         compat_uptr_t __user *, head_ptr,
    3921             :                         compat_size_t __user *, len_ptr)
    3922             : {
    3923           0 :         struct compat_robust_list_head __user *head;
    3924           0 :         unsigned long ret;
    3925           0 :         struct task_struct *p;
    3926             : 
    3927           0 :         if (!futex_cmpxchg_enabled)
    3928             :                 return -ENOSYS;
    3929             : 
    3930           0 :         rcu_read_lock();
    3931             : 
    3932           0 :         ret = -ESRCH;
    3933           0 :         if (!pid)
    3934           0 :                 p = current;
    3935             :         else {
    3936           0 :                 p = find_task_by_vpid(pid);
    3937           0 :                 if (!p)
    3938           0 :                         goto err_unlock;
    3939             :         }
    3940             : 
    3941           0 :         ret = -EPERM;
    3942           0 :         if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
    3943           0 :                 goto err_unlock;
    3944             : 
    3945           0 :         head = p->compat_robust_list;
    3946           0 :         rcu_read_unlock();
    3947             : 
    3948           0 :         if (put_user(sizeof(*head), len_ptr))
    3949             :                 return -EFAULT;
    3950           0 :         return put_user(ptr_to_compat(head), head_ptr);
    3951             : 
    3952           0 : err_unlock:
    3953           0 :         rcu_read_unlock();
    3954             : 
    3955           0 :         return ret;
    3956             : }
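
The read side is symmetric: with ptrace-level permission (or pid == 0 for the
calling thread), the registered head pointer can be fetched back. A short
sketch, where target_pid is a placeholder thread ID:

        #include <linux/futex.h>
        #include <stdio.h>
        #include <sys/syscall.h>
        #include <sys/types.h>
        #include <unistd.h>

        static void dump_robust_list_head(pid_t target_pid)
        {
                struct robust_list_head *head;
                size_t len;

                if (syscall(SYS_get_robust_list, target_pid, &head, &len) == 0)
                        printf("robust list head %p, len %zu\n",
                               (void *)head, len);
        }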
    3957             : #endif /* CONFIG_COMPAT */
    3958             : 
    3959             : #ifdef CONFIG_COMPAT_32BIT_TIME
    3960             : SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
    3961             :                 const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
    3962             :                 u32, val3)
    3963             : {
    3964             :         struct timespec64 ts;
    3965             :         ktime_t t, *tp = NULL;
    3966             :         int val2 = 0;
    3967             :         int cmd = op & FUTEX_CMD_MASK;
    3968             : 
    3969             :         if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
    3970             :                       cmd == FUTEX_WAIT_BITSET ||
    3971             :                       cmd == FUTEX_WAIT_REQUEUE_PI)) {
    3972             :                 if (get_old_timespec32(&ts, utime))
    3973             :                         return -EFAULT;
    3974             :                 if (!timespec64_valid(&ts))
    3975             :                         return -EINVAL;
    3976             : 
    3977             :                 t = timespec64_to_ktime(ts);
    3978             :                 if (cmd == FUTEX_WAIT)
    3979             :                         t = ktime_add_safe(ktime_get(), t);
    3980             :                 else if (!(op & FUTEX_CLOCK_REALTIME))
    3981             :                         t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
    3982             :                 tp = &t;
    3983             :         }
    3984             :         if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
    3985             :             cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
    3986             :                 val2 = (int) (unsigned long) utime;
    3987             : 
    3988             :         return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
    3989             : }
    3990             : #endif /* CONFIG_COMPAT_32BIT_TIME */
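
Note the two roles of utime above: for the waiting commands it is a 32-bit
timespec that is validated and converted, and for plain FUTEX_WAIT the
relative timeout becomes an absolute monotonic expiry; for the requeue and
wake-op commands the same argument slot instead carries the integer val2. A
worked kernel-style fragment of the FUTEX_WAIT conversion, with illustrative
values:

        struct timespec64 ts = { .tv_sec = 1, .tv_nsec = 500 * NSEC_PER_MSEC };
        ktime_t t = timespec64_to_ktime(ts);    /* 1,500,000,000 ns, relative */

        /* FUTEX_WAIT: make it absolute against the current monotonic clock;
         * ktime_add_safe() saturates at KTIME_MAX instead of overflowing */
        t = ktime_add_safe(ktime_get(), t);     /* "now" + 1.5 s */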
    3991             : 
    3992           1 : static void __init futex_detect_cmpxchg(void)
    3993             : {
    3994             : #ifndef CONFIG_HAVE_FUTEX_CMPXCHG
    3995           1 :         u32 curval;
    3996             : 
    3997             :         /*
    3998             :          * This call is expected to fail, and that is the point: some
    3999             :          * arch implementations do runtime detection of the
    4000             :          * futex_atomic_cmpxchg_inatomic() functionality. We want to
    4001             :          * know that before we call into any of the complex code
    4002             :          * paths, and to prevent registration of robust lists in that
    4003             :          * case. NULL is guaranteed to fault; a functional
    4004             :          * implementation returns -EFAULT, while the non-functional
    4005             :          * ones return -ENOSYS.
    4006             :          */
    4007           1 :         if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
    4008           1 :                 futex_cmpxchg_enabled = 1;
    4009             : #endif
    4010           1 : }
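
The probe relies on two distinct failure modes of the arch helper: a
functional implementation really attempts the access and faults on the NULL
pointer (-EFAULT), whereas an architecture without a usable atomic cmpxchg
bails out before touching memory (-ENOSYS), leaving futex_cmpxchg_enabled
clear so that the robust-list syscalls above return -ENOSYS. A schematic of
the non-functional case (illustrative only, not any particular arch's
asm/futex.h):

        static inline int
        futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
                                      u32 oldval, u32 newval)
        {
                /* no atomic cmpxchg on user memory on this hardware: return
                 * before any access, so the NULL probe sees -ENOSYS */
                return -ENOSYS;
        }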
    4011             : 
    4012           1 : static int __init futex_init(void)
    4013             : {
    4014           1 :         unsigned int futex_shift;
    4015           1 :         unsigned long i;
    4016             : 
    4017             : #if CONFIG_BASE_SMALL
    4018             :         futex_hashsize = 16;
    4019             : #else
    4020           1 :         futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
    4021             : #endif
    4022             : 
    4023           2 :         futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
    4024             :                                                futex_hashsize, 0,
    4025             :                                                futex_hashsize < 256 ? HASH_SMALL : 0,
    4026             :                                                &futex_shift, NULL,
    4027             :                                                futex_hashsize, futex_hashsize);
    4028           1 :         futex_hashsize = 1UL << futex_shift;
    4029             : 
    4030           1 :         futex_detect_cmpxchg();
    4031             : 
    4032        1026 :         for (i = 0; i < futex_hashsize; i++) {
    4033        1024 :                 atomic_set(&futex_queues[i].waiters, 0);
    4034        1024 :                 plist_head_init(&futex_queues[i].chain);
    4035        1024 :                 spin_lock_init(&futex_queues[i].lock);
    4036             :         }
    4037             : 
    4038           1 :         return 0;
    4039             : }
    4040             : core_initcall(futex_init);
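
For scale: the bucket-initialization loop above executed 1024 times in this
run, i.e. futex_hashsize came out as 1024, which is consistent with, e.g.,
4 possible CPUs. Illustrative arithmetic:

        /* 4 possible CPUs: roundup_pow_of_two(256 * 4) = 1024 buckets
         * 6 possible CPUs: roundup_pow_of_two(256 * 6) = 2048 buckets
         * CONFIG_BASE_SMALL: fixed 16 buckets */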

Generated by: LCOV version 1.14