Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-or-later
2 : /*
3 : * Fast Userspace Mutexes (which I call "Futexes!").
4 : * (C) Rusty Russell, IBM 2002
5 : *
6 : * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
7 : * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
8 : *
9 : * Removed page pinning, fix privately mapped COW pages and other cleanups
10 : * (C) Copyright 2003, 2004 Jamie Lokier
11 : *
12 : * Robust futex support started by Ingo Molnar
13 : * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
14 : * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
15 : *
16 : * PI-futex support started by Ingo Molnar and Thomas Gleixner
17 : * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
18 : * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
19 : *
20 : * PRIVATE futexes by Eric Dumazet
21 : * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
22 : *
23 : * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
24 : * Copyright (C) IBM Corporation, 2009
25 : * Thanks to Thomas Gleixner for conceptual design and careful reviews.
26 : *
27 : * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
28 : * enough at me, Linus for the original (flawed) idea, Matthew
29 : * Kirkwood for proof-of-concept implementation.
30 : *
31 : * "The futexes are also cursed."
32 : * "But they come in a choice of three flavours!"
33 : */
34 : #include <linux/compat.h>
35 : #include <linux/jhash.h>
36 : #include <linux/pagemap.h>
37 : #include <linux/syscalls.h>
38 : #include <linux/hugetlb.h>
39 : #include <linux/freezer.h>
40 : #include <linux/memblock.h>
41 : #include <linux/fault-inject.h>
42 : #include <linux/time_namespace.h>
43 :
44 : #include <asm/futex.h>
45 :
46 : #include "locking/rtmutex_common.h"
47 :
48 : /*
49 : * READ this before attempting to hack on futexes!
50 : *
51 : * Basic futex operation and ordering guarantees
52 : * =============================================
53 : *
54 : * The waiter reads the futex value in user space and calls
55 : * futex_wait(). This function computes the hash bucket and acquires
56 : * the hash bucket lock. After that it reads the futex user space value
57 : * again and verifies that the data has not changed. If it has not changed
58 : * it enqueues itself into the hash bucket, releases the hash bucket lock
59 : * and schedules.
60 : *
61 : * The waker side modifies the user space value of the futex and calls
62 : * futex_wake(). This function computes the hash bucket and acquires the
63 : * hash bucket lock. Then it looks for waiters on that futex in the hash
64 : * bucket and wakes them.
65 : *
66 : * In futex wake up scenarios where no tasks are blocked on a futex, taking
67 : * the hb spinlock can be avoided and the syscall can simply return. For this
68 : * optimization to work, ordering guarantees must ensure that a waiter
69 : * being added to the list is observed when the list is concurrently
70 : * checked by the waker, avoiding scenarios like the following:
71 : *
72 : * CPU 0                               CPU 1
73 : * val = *futex;
74 : * sys_futex(WAIT, futex, val);
75 : *   futex_wait(futex, val);
76 : *   uval = *futex;
77 : *                                     *futex = newval;
78 : *                                     sys_futex(WAKE, futex);
79 : *                                       futex_wake(futex);
80 : *                                       if (queue_empty())
81 : *                                         return;
82 : *   if (uval == val)
83 : *     lock(hash_bucket(futex));
84 : *     queue();
85 : *     unlock(hash_bucket(futex));
86 : *   schedule();
87 : *
88 : * This would cause the waiter on CPU 0 to wait forever because it
89 : * missed the transition of the user space value from val to newval
90 : * and the waker did not find the waiter in the hash bucket queue.
91 : *
92 : * The correct serialization ensures that a waiter either observes
93 : * the changed user space value before blocking or is woken by a
94 : * concurrent waker:
95 : *
96 : * CPU 0                               CPU 1
97 : * val = *futex;
98 : * sys_futex(WAIT, futex, val);
99 : *   futex_wait(futex, val);
100 : *
101 : *   waiters++; (a)
102 : *   smp_mb(); (A) <-- paired with -.
103 : *                                    |
104 : *   lock(hash_bucket(futex));        |
105 : *                                    |
106 : *   uval = *futex;                   |
107 : *                                    |        *futex = newval;
108 : *                                    |        sys_futex(WAKE, futex);
109 : *                                    |          futex_wake(futex);
110 : *                                    |
111 : *                                    `--------> smp_mb(); (B)
112 : *   if (uval == val)
113 : *     queue();
114 : *     unlock(hash_bucket(futex));
115 : *   schedule();                         if (waiters)
116 : *                                         lock(hash_bucket(futex));
117 : *   else                                  wake_waiters(futex);
118 : *     waiters--; (b)                      unlock(hash_bucket(futex));
119 : *
120 : * Where (A) orders the waiters increment and the futex value read through
121 : * atomic operations (see hb_waiters_inc) and where (B) orders the write
122 : * to futex and the waiters read (see hb_waiters_pending()).
123 : *
124 : * This yields the following case (where X:=waiters, Y:=futex):
125 : *
126 : * X = Y = 0
127 : *
128 : * w[X]=1 w[Y]=1
129 : * MB MB
130 : * r[Y]=y r[X]=x
131 : *
132 : * This guarantees that x==0 && y==0 is impossible, which translates back
133 : * into the guarantee that the waiter cannot miss the futex value change
134 : * while the waker simultaneously misses the enqueued waiter.
135 : *
136 : * Note that a new waiter is accounted for in (a) even though the wait call
137 : * may later return an error, in which case the increment is undone in (b).
138 : * Refer to the comment in queue_lock().
139 : *
140 : * Similarly, in order to account for waiters being requeued to another
141 : * address, the waiter count of the destination bucket is always incremented
142 : * before acquiring the lock and decremented again after releasing it; this
143 : * is done in double_lock_hb() and double_unlock_hb(), respectively. The code
144 : * that actually moves futexes between hash buckets (requeue_futex()) performs
145 : * the remaining waiter count housekeeping.
146 : */
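/*
 * For illustration only -- a minimal user space sketch of the protocol
 * described above, shaped as a simple futex based mutex. Names and the
 * exact atomics are hypothetical (assumes uint32_t, the GCC __sync
 * builtins and syscall(SYS_futex, ...)); this is not part of this file:
 *
 *	void lock(uint32_t *futex)
 *	{
 *		// Fast path: 0 -> 1 transition, no kernel entry.
 *		while (__sync_val_compare_and_swap(futex, 0, 1) != 0) {
 *			// Sleeps only if *futex is still 1; the kernel
 *			// re-reads it under the hash bucket lock.
 *			syscall(SYS_futex, futex, FUTEX_WAIT_PRIVATE, 1,
 *				NULL, NULL, 0);
 *		}
 *	}
 *
 *	void unlock(uint32_t *futex)
 *	{
 *		__sync_lock_release(futex);	// *futex = 0
 *		// Cheap when nobody waits: futex_wake() returns early
 *		// via hb_waiters_pending() without taking the hb lock.
 *		syscall(SYS_futex, futex, FUTEX_WAKE_PRIVATE, 1,
 *			NULL, NULL, 0);
 *	}
 */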
147 :
148 : #ifdef CONFIG_HAVE_FUTEX_CMPXCHG
149 : #define futex_cmpxchg_enabled 1
150 : #else
151 : static int __read_mostly futex_cmpxchg_enabled;
152 : #endif
153 :
154 : /*
155 : * Futex flags used to encode options to functions and preserve them across
156 : * restarts.
157 : */
158 : #ifdef CONFIG_MMU
159 : # define FLAGS_SHARED 0x01
160 : #else
161 : /*
162 : * NOMMU does not have a per-process address space. Let the compiler optimize
163 : * the code away.
164 : */
165 : # define FLAGS_SHARED 0x00
166 : #endif
167 : #define FLAGS_CLOCKRT 0x02
168 : #define FLAGS_HAS_TIMEOUT 0x04
169 :
170 : /*
171 : * Priority Inheritance state:
172 : */
173 : struct futex_pi_state {
174 : /*
175 : * list of 'owned' pi_state instances - these have to be
176 : * cleaned up in do_exit() if the task exits prematurely:
177 : */
178 : struct list_head list;
179 :
180 : /*
181 : * The PI object:
182 : */
183 : struct rt_mutex pi_mutex;
184 :
185 : struct task_struct *owner;
186 : refcount_t refcount;
187 :
188 : union futex_key key;
189 : } __randomize_layout;
190 :
191 : /**
192 : * struct futex_q - The hashed futex queue entry, one per waiting task
193 : * @list: priority-sorted list of tasks waiting on this futex
194 : * @task: the task waiting on the futex
195 : * @lock_ptr: the hash bucket lock
196 : * @key: the key the futex is hashed on
197 : * @pi_state: optional priority inheritance state
198 : * @rt_waiter: rt_waiter storage for use with requeue_pi
199 : * @requeue_pi_key: the requeue_pi target futex key
200 : * @bitset: bitset for the optional bitmasked wakeup
201 : *
202 : * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
203 : * we can wake only the relevant ones (hashed queues may be shared).
204 : *
205 : * A futex_q has a woken state, just like tasks have TASK_RUNNING.
206 : * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
207 : * The order of wakeup is always to make the first condition true, then
208 : * the second.
209 : *
210 : * PI futexes are typically woken before they are removed from the hash list via
211 : * the rt_mutex code. See unqueue_me_pi().
212 : */
213 : struct futex_q {
214 : struct plist_node list;
215 :
216 : struct task_struct *task;
217 : spinlock_t *lock_ptr;
218 : union futex_key key;
219 : struct futex_pi_state *pi_state;
220 : struct rt_mutex_waiter *rt_waiter;
221 : union futex_key *requeue_pi_key;
222 : u32 bitset;
223 : } __randomize_layout;
224 :
225 : static const struct futex_q futex_q_init = {
226 : /* list gets initialized in queue_me() */
227 : .key = FUTEX_KEY_INIT,
228 : .bitset = FUTEX_BITSET_MATCH_ANY
229 : };
230 :
231 : /*
232 : * Hash buckets are shared by all the futex_keys that hash to the same
233 : * location. Each key may have multiple futex_q structures, one for each task
234 : * waiting on a futex.
235 : */
236 : struct futex_hash_bucket {
237 : atomic_t waiters;
238 : spinlock_t lock;
239 : struct plist_head chain;
240 : } ____cacheline_aligned_in_smp;
241 :
242 : /*
243 : * The base of the bucket array and its size are always used together
244 : * (after initialization only in hash_futex()), so ensure that they
245 : * reside in the same cacheline.
246 : */
247 : static struct {
248 : struct futex_hash_bucket *queues;
249 : unsigned long hashsize;
250 : } __futex_data __read_mostly __aligned(2*sizeof(long));
251 : #define futex_queues (__futex_data.queues)
252 : #define futex_hashsize (__futex_data.hashsize)
253 :
254 :
255 : /*
256 : * Fault injections for futexes.
257 : */
258 : #ifdef CONFIG_FAIL_FUTEX
259 :
260 : static struct {
261 : struct fault_attr attr;
262 :
263 : bool ignore_private;
264 : } fail_futex = {
265 : .attr = FAULT_ATTR_INITIALIZER,
266 : .ignore_private = false,
267 : };
268 :
269 : static int __init setup_fail_futex(char *str)
270 : {
271 : return setup_fault_attr(&fail_futex.attr, str);
272 : }
273 : __setup("fail_futex=", setup_fail_futex);
274 :
275 : static bool should_fail_futex(bool fshared)
276 : {
277 : if (fail_futex.ignore_private && !fshared)
278 : return false;
279 :
280 : return should_fail(&fail_futex.attr, 1);
281 : }
282 :
283 : #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
284 :
285 : static int __init fail_futex_debugfs(void)
286 : {
287 : umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
288 : struct dentry *dir;
289 :
290 : dir = fault_create_debugfs_attr("fail_futex", NULL,
291 : &fail_futex.attr);
292 : if (IS_ERR(dir))
293 : return PTR_ERR(dir);
294 :
295 : debugfs_create_bool("ignore-private", mode, dir,
296 : &fail_futex.ignore_private);
297 : return 0;
298 : }
299 :
300 : late_initcall(fail_futex_debugfs);
301 :
302 : #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
303 :
304 : #else
305 766 : static inline bool should_fail_futex(bool fshared)
306 : {
307 766 : return false;
308 : }
309 : #endif /* CONFIG_FAIL_FUTEX */
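/*
 * Example usage of the fault injection knobs above (assuming
 * CONFIG_FAIL_FUTEX=y and debugfs mounted): fail roughly 10% of futex key
 * lookups, shared futexes only, with no limit on the number of failures:
 *
 *	echo 10 > /sys/kernel/debug/fail_futex/probability
 *	echo -1 > /sys/kernel/debug/fail_futex/times
 *	echo 1  > /sys/kernel/debug/fail_futex/ignore-private
 *
 * The generic attributes can also be preset at boot via the "fail_futex="
 * parameter parsed by setup_fail_futex() above; see
 * Documentation/fault-injection/ for the common attribute format.
 */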
310 :
311 : #ifdef CONFIG_COMPAT
312 : static void compat_exit_robust_list(struct task_struct *curr);
313 : #endif
314 :
315 : /*
316 : * Reflects a new waiter being added to the waitqueue.
317 : */
318 168 : static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
319 : {
320 : #ifdef CONFIG_SMP
321 168 : atomic_inc(&hb->waiters);
322 : /*
323 : * Full barrier (A), see the ordering comment above.
324 : */
325 168 : smp_mb__after_atomic();
326 : #endif
327 168 : }
328 :
329 : /*
330 : * Reflects a waiter being removed from the waitqueue by wakeup
331 : * paths.
332 : */
333 167 : static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
334 : {
335 : #ifdef CONFIG_SMP
336 167 : atomic_dec(&hb->waiters);
337 : #endif
338 167 : }
339 :
340 594 : static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
341 : {
342 : #ifdef CONFIG_SMP
343 : /*
344 : * Full barrier (B), see the ordering comment above.
345 : */
346 594 : smp_mb();
347 594 : return atomic_read(&hb->waiters);
348 : #else
349 : return 1;
350 : #endif
351 : }
352 :
353 : /**
354 : * hash_futex - Return the hash bucket in the global hash
355 : * @key: Pointer to the futex key for which the hash is calculated
356 : *
357 : * We hash on the keys returned from get_futex_key (see below) and return the
358 : * corresponding hash bucket in the global hash.
359 : */
360 762 : static struct futex_hash_bucket *hash_futex(union futex_key *key)
361 : {
362 762 : u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
363 : key->both.offset);
364 :
365 762 : return &futex_queues[hash & (futex_hashsize - 1)];
366 : }
367 :
368 :
369 : /**
370 : * match_futex - Check whether two futex keys are equal
371 : * @key1: Pointer to key1
372 : * @key2: Pointer to key2
373 : *
374 : * Return 1 if two futex_keys are equal, 0 otherwise.
375 : */
376 167 : static inline int match_futex(union futex_key *key1, union futex_key *key2)
377 : {
378 167 : return (key1 && key2
379 167 : && key1->both.word == key2->both.word
380 166 : && key1->both.ptr == key2->both.ptr
381 333 : && key1->both.offset == key2->both.offset);
382 : }
383 :
384 : enum futex_access {
385 : FUTEX_READ,
386 : FUTEX_WRITE
387 : };
388 :
389 : /**
390 : * futex_setup_timer - set up the sleeping hrtimer.
391 : * @time: ptr to the given timeout value
392 : * @timeout: the hrtimer_sleeper structure to be set up
393 : * @flags: futex flags
394 : * @range_ns: optional range in ns
395 : *
396 : * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
397 : * value given
398 : */
399 : static inline struct hrtimer_sleeper *
400 168 : futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
401 : int flags, u64 range_ns)
402 : {
403 168 : if (!time)
404 : return NULL;
405 :
406 0 : hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
407 : CLOCK_REALTIME : CLOCK_MONOTONIC,
408 : HRTIMER_MODE_ABS);
409 : /*
410 : * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
411 : * effectively the same as calling hrtimer_set_expires().
412 : */
413 0 : hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
414 :
415 0 : return timeout;
416 : }
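/*
 * Typical call pattern in the wait paths further down (sketch only): the
 * returned timer, if any, is started before blocking and must be canceled
 * and destroyed again afterwards:
 *
 *	struct hrtimer_sleeper timeout, *to;
 *
 *	to = futex_setup_timer(abs_time, &timeout, flags,
 *			       current->timer_slack_ns);
 *	...
 *	if (to) {
 *		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
 *		...
 *		hrtimer_cancel(&to->timer);
 *		destroy_hrtimer_on_stack(&to->timer);
 *	}
 */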
417 :
418 : /*
419 : * Generate a machine wide unique identifier for this inode.
420 : *
421 : * This relies on u64 not wrapping in the life-time of the machine; which with
422 : * 1ns resolution means almost 585 years.
423 : *
424 : * This further relies on the fact that a well formed program will not unmap
425 : * the file while it has a (shared) futex waiting on it. This mapping will have
426 : * a file reference which pins the mount and inode.
427 : *
428 : * If for some reason an inode gets evicted and read back in again, it will get
429 : * a new sequence number and will _NOT_ match, even though it is the exact same
430 : * file.
431 : *
432 : * It is important that match_futex() never has a false positive, especially
433 : * for PI futexes, where a false positive can corrupt the state. The above
434 : * argues that false negatives are only possible for malformed programs.
435 : */
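/*
 * (Checking the arithmetic behind the 585 years claim: 2^64 ns ~= 1.84e19 ns
 * ~= 1.84e10 s; at 31536000 seconds per 365-day year that is ~584.9 years.)
 */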
436 0 : static u64 get_inode_sequence_number(struct inode *inode)
437 : {
438 0 : static atomic64_t i_seq;
439 0 : u64 old;
440 :
441 : /* Does the inode already have a sequence number? */
442 0 : old = atomic64_read(&inode->i_sequence);
443 0 : if (likely(old))
444 : return old;
445 :
446 0 : for (;;) {
447 0 : u64 new = atomic64_add_return(1, &i_seq);
448 0 : if (WARN_ON_ONCE(!new))
449 0 : continue;
450 :
451 0 : old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
452 0 : if (old)
453 0 : return old;
454 : return new;
455 : }
456 : }
457 :
458 : /**
459 : * get_futex_key() - Get parameters which are the keys for a futex
460 : * @uaddr: virtual address of the futex
461 : * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
462 : * @key: address where result is stored.
463 : * @rw: mapping needs to be read/write (values: FUTEX_READ,
464 : * FUTEX_WRITE)
465 : *
466 : * Return: a negative error code or 0
467 : *
468 : * The key words are stored in @key on success.
469 : *
470 : * For shared mappings (when @fshared), the key is:
471 : *
472 : * ( inode->i_sequence, page->index, offset_within_page )
473 : *
474 : * [ also see get_inode_sequence_number() ]
475 : *
476 : * For private mappings (or when !@fshared), the key is:
477 : *
478 : * ( current->mm, address, 0 )
479 : *
480 : * This allows (cross process, where applicable) identification of the futex
481 : * without keeping the page pinned for the duration of the FUTEX_WAIT.
482 : *
483 : * lock_page() might sleep, the caller should not hold a spinlock.
484 : */
485 762 : static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
486 : enum futex_access rw)
487 : {
488 762 : unsigned long address = (unsigned long)uaddr;
489 762 : struct mm_struct *mm = current->mm;
490 762 : struct page *page, *tail;
491 762 : struct address_space *mapping;
492 762 : int err, ro = 0;
493 :
494 : /*
495 : * The futex address must be "naturally" aligned.
496 : */
497 762 : key->both.offset = address % PAGE_SIZE;
498 762 : if (unlikely((address % sizeof(u32)) != 0))
499 : return -EINVAL;
500 762 : address -= key->both.offset;
501 :
502 762 : if (unlikely(!access_ok(uaddr, sizeof(u32))))
503 : return -EFAULT;
504 :
505 762 : if (unlikely(should_fail_futex(fshared)))
506 : return -EFAULT;
507 :
508 : /*
509 : * PROCESS_PRIVATE futexes are fast.
510 : * As the mm cannot disappear under us and the 'key' only needs the
511 : * virtual address, we don't even have to find the underlying vma.
512 : * Note: We do have to check that 'uaddr' is a valid user address,
513 : * but access_ok() should be faster than find_vma().
514 : */
515 762 : if (!fshared) {
516 760 : key->private.mm = mm;
517 760 : key->private.address = address;
518 760 : return 0;
519 : }
520 :
521 2 : again:
522 : /* Ignore any VERIFY_READ mapping (futex common case) */
523 2 : if (unlikely(should_fail_futex(true)))
524 : return -EFAULT;
525 :
526 2 : err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
527 : /*
528 : * If write access is not required (eg. FUTEX_WAIT), try
529 : * and get read-only access.
530 : */
531 2 : if (err == -EFAULT && rw == FUTEX_READ) {
532 0 : err = get_user_pages_fast(address, 1, 0, &page);
533 0 : ro = 1;
534 : }
535 2 : if (err < 0)
536 0 : return err;
537 : else
538 2 : err = 0;
539 :
540 : /*
541 : * The treatment of mapping from this point on is critical. The page
542 : * lock protects many things but in this context the page lock
543 : * stabilizes mapping, prevents inode freeing in the shared
544 : * file-backed region case and guards against movement to swap cache.
545 : *
546 : * Strictly speaking the page lock is not needed in all cases being
547 : * considered here and the page lock forces unnecessary serialization.
548 : * From this point on, the mapping will be re-verified if necessary and
549 : * the page lock will be acquired only if it is unavoidable.
550 : *
551 : * Mapping checks require the head page for any compound page so the
552 : * head page and mapping are looked up now. For anonymous pages, it
553 : * does not matter if the page splits in the future as the key is
554 : * based on the address. For filesystem-backed pages, the tail is
555 : * required as the index of the page determines the key. For
556 : * base pages, there is no tail page and tail == page.
557 : */
558 2 : tail = page;
559 2 : page = compound_head(page);
560 2 : mapping = READ_ONCE(page->mapping);
561 :
562 : /*
563 : * If page->mapping is NULL, then it cannot be a PageAnon
564 : * page; but it might be the ZERO_PAGE or in the gate area or
565 : * in a special mapping (all cases which we are happy to fail);
566 : * or it may have been a good file page when get_user_pages_fast
567 : * found it, but truncated or holepunched or subjected to
568 : * invalidate_complete_page2 before we got the page lock (also
569 : * cases which we are happy to fail). And we hold a reference,
570 : * so refcount care in invalidate_complete_page's remove_mapping
571 : * prevents drop_caches from setting mapping to NULL beneath us.
572 : *
573 : * The case we do have to guard against is when memory pressure made
574 : * shmem_writepage move it from filecache to swapcache beneath us:
575 : * an unlikely race, but we do need to retry for page->mapping.
576 : */
577 2 : if (unlikely(!mapping)) {
578 0 : int shmem_swizzled;
579 :
580 : /*
581 : * Page lock is required to identify which special case above
582 : * applies. If this is really a shmem page then the page lock
583 : * will prevent unexpected transitions.
584 : */
585 0 : lock_page(page);
586 0 : shmem_swizzled = PageSwapCache(page) || page->mapping;
587 0 : unlock_page(page);
588 0 : put_page(page);
589 :
590 0 : if (shmem_swizzled)
591 0 : goto again;
592 :
593 : return -EFAULT;
594 : }
595 :
596 : /*
597 : * Private mappings are handled in a simple way.
598 : *
599 : * If the futex key is stored on an anonymous page, then the associated
600 : * object is the mm which is implicitly pinned by the calling process.
601 : *
602 : * NOTE: When userspace waits on a MAP_SHARED mapping, even if
603 : * it's a read-only handle, it's expected that futexes attach to
604 : * the object not the particular process.
605 : */
606 2 : if (PageAnon(page)) {
607 : /*
608 : * A RO anonymous page will never change and thus doesn't make
609 : * sense for futex operations.
610 : */
611 2 : if (unlikely(should_fail_futex(true)) || ro) {
612 0 : err = -EFAULT;
613 0 : goto out;
614 : }
615 :
616 2 : key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
617 2 : key->private.mm = mm;
618 2 : key->private.address = address;
619 :
620 : } else {
621 0 : struct inode *inode;
622 :
623 : /*
624 : * The associated futex object in this case is the inode and
625 : * the page->mapping must be traversed. Ordinarily this should
626 : * be stabilised under page lock but it's not strictly
627 : * necessary in this case as we just want to pin the inode, not
628 : * update the radix tree or anything like that.
629 : *
630 : * The RCU read lock is taken as the inode is finally freed
631 : * under RCU. If the mapping still matches expectations then the
632 : * mapping->host can be safely accessed as being a valid inode.
633 : */
634 0 : rcu_read_lock();
635 :
636 0 : if (READ_ONCE(page->mapping) != mapping) {
637 0 : rcu_read_unlock();
638 0 : put_page(page);
639 :
640 0 : goto again;
641 : }
642 :
643 0 : inode = READ_ONCE(mapping->host);
644 0 : if (!inode) {
645 0 : rcu_read_unlock();
646 0 : put_page(page);
647 :
648 0 : goto again;
649 : }
650 :
651 0 : key->both.offset |= FUT_OFF_INODE; /* inode-based key */
652 0 : key->shared.i_seq = get_inode_sequence_number(inode);
653 0 : key->shared.pgoff = basepage_index(tail);
654 0 : rcu_read_unlock();
655 : }
656 :
657 2 : out:
658 2 : put_page(page);
659 2 : return err;
660 : }
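/*
 * Illustrative key contents (addresses made up): a PROCESS_PRIVATE futex at
 * uaddr 0x7f1200041004 yields key = { current->mm, 0x7f1200041000,
 * offset 0x004 }, while a shared file-backed futex at the same page offset
 * yields key = { inode->i_sequence, page->index, offset 0x004 | FUT_OFF_INODE }.
 */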
661 :
662 : /**
663 : * fault_in_user_writeable() - Fault in user address and verify RW access
664 : * @uaddr: pointer to faulting user space address
665 : *
666 : * Slow path to fixup the fault we just took in the atomic write
667 : * access to @uaddr.
668 : *
669 : * We have no generic implementation of a non-destructive write to the
670 : * user address. We know that we faulted in the atomic pagefault
671 : * disabled section so we can as well avoid the #PF overhead by
672 : * calling get_user_pages() right away.
673 : */
674 0 : static int fault_in_user_writeable(u32 __user *uaddr)
675 : {
676 0 : struct mm_struct *mm = current->mm;
677 0 : int ret;
678 :
679 0 : mmap_read_lock(mm);
680 0 : ret = fixup_user_fault(mm, (unsigned long)uaddr,
681 : FAULT_FLAG_WRITE, NULL);
682 0 : mmap_read_unlock(mm);
683 :
684 0 : return ret < 0 ? ret : 0;
685 : }
686 :
687 : /**
688 : * futex_top_waiter() - Return the highest priority waiter on a futex
689 : * @hb: the hash bucket the futex_q's reside in
690 : * @key: the futex key (to distinguish it from other futex futex_q's)
691 : *
692 : * Must be called with the hb lock held.
693 : */
694 0 : static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
695 : union futex_key *key)
696 : {
697 0 : struct futex_q *this;
698 :
699 0 : plist_for_each_entry(this, &hb->chain, list) {
700 0 : if (match_futex(&this->key, key))
701 0 : return this;
702 : }
703 : return NULL;
704 : }
705 :
706 1 : static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
707 : u32 uval, u32 newval)
708 : {
709 1 : int ret;
710 :
711 1 : pagefault_disable();
712 1 : ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
713 1 : pagefault_enable();
714 :
715 1 : return ret;
716 : }
717 :
718 168 : static int get_futex_value_locked(u32 *dest, u32 __user *from)
719 : {
720 168 : int ret;
721 :
722 168 : pagefault_disable();
723 168 : ret = __get_user(*dest, from);
724 168 : pagefault_enable();
725 :
726 168 : return ret ? -EFAULT : 0;
727 : }
728 :
729 :
730 : /*
731 : * PI code:
732 : */
733 0 : static int refill_pi_state_cache(void)
734 : {
735 0 : struct futex_pi_state *pi_state;
736 :
737 0 : if (likely(current->pi_state_cache))
738 : return 0;
739 :
740 0 : pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
741 :
742 0 : if (!pi_state)
743 : return -ENOMEM;
744 :
745 0 : INIT_LIST_HEAD(&pi_state->list);
746 : /* pi_mutex gets initialized later */
747 0 : pi_state->owner = NULL;
748 0 : refcount_set(&pi_state->refcount, 1);
749 0 : pi_state->key = FUTEX_KEY_INIT;
750 :
751 0 : current->pi_state_cache = pi_state;
752 :
753 0 : return 0;
754 : }
755 :
756 0 : static struct futex_pi_state *alloc_pi_state(void)
757 : {
758 0 : struct futex_pi_state *pi_state = current->pi_state_cache;
759 :
760 0 : WARN_ON(!pi_state);
761 0 : current->pi_state_cache = NULL;
762 :
763 0 : return pi_state;
764 : }
765 :
766 0 : static void pi_state_update_owner(struct futex_pi_state *pi_state,
767 : struct task_struct *new_owner)
768 : {
769 0 : struct task_struct *old_owner = pi_state->owner;
770 :
771 0 : lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
772 :
773 0 : if (old_owner) {
774 0 : raw_spin_lock(&old_owner->pi_lock);
775 0 : WARN_ON(list_empty(&pi_state->list));
776 0 : list_del_init(&pi_state->list);
777 0 : raw_spin_unlock(&old_owner->pi_lock);
778 : }
779 :
780 0 : if (new_owner) {
781 0 : raw_spin_lock(&new_owner->pi_lock);
782 0 : WARN_ON(!list_empty(&pi_state->list));
783 0 : list_add(&pi_state->list, &new_owner->pi_state_list);
784 0 : pi_state->owner = new_owner;
785 0 : raw_spin_unlock(&new_owner->pi_lock);
786 : }
787 0 : }
788 :
789 0 : static void get_pi_state(struct futex_pi_state *pi_state)
790 : {
791 0 : WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
792 0 : }
793 :
794 : /*
795 : * Drops a reference to the pi_state object and frees or caches it
796 : * when the last reference is gone.
797 : */
798 0 : static void put_pi_state(struct futex_pi_state *pi_state)
799 : {
800 0 : if (!pi_state)
801 : return;
802 :
803 0 : if (!refcount_dec_and_test(&pi_state->refcount))
804 : return;
805 :
806 : /*
807 : * If pi_state->owner is NULL, the owner is most probably dying
808 : * and has cleaned up the pi_state already
809 : */
810 0 : if (pi_state->owner) {
811 0 : unsigned long flags;
812 :
813 0 : raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
814 0 : pi_state_update_owner(pi_state, NULL);
815 0 : rt_mutex_proxy_unlock(&pi_state->pi_mutex);
816 0 : raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
817 : }
818 :
819 0 : if (current->pi_state_cache) {
820 0 : kfree(pi_state);
821 : } else {
822 : /*
823 : * pi_state->list is already empty.
824 : * clear pi_state->owner.
825 : * refcount is at 0 - put it back to 1.
826 : */
827 0 : pi_state->owner = NULL;
828 0 : refcount_set(&pi_state->refcount, 1);
829 0 : current->pi_state_cache = pi_state;
830 : }
831 : }
832 :
833 : #ifdef CONFIG_FUTEX_PI
834 :
835 : /*
836 : * This task is holding PI mutexes at exit time => bad.
837 : * Kernel cleans up PI-state, but userspace is likely hosed.
838 : * (Robust-futex cleanup is separate and might save the day for userspace.)
839 : */
840 0 : static void exit_pi_state_list(struct task_struct *curr)
841 : {
842 0 : struct list_head *next, *head = &curr->pi_state_list;
843 0 : struct futex_pi_state *pi_state;
844 0 : struct futex_hash_bucket *hb;
845 0 : union futex_key key = FUTEX_KEY_INIT;
846 :
847 0 : if (!futex_cmpxchg_enabled)
848 0 : return;
849 : /*
850 : * We are a ZOMBIE and nobody can enqueue itself on
851 : * pi_state_list anymore, but we have to be careful
852 : * versus waiters unqueueing themselves:
853 : */
854 0 : raw_spin_lock_irq(&curr->pi_lock);
855 0 : while (!list_empty(head)) {
856 0 : next = head->next;
857 0 : pi_state = list_entry(next, struct futex_pi_state, list);
858 0 : key = pi_state->key;
859 0 : hb = hash_futex(&key);
860 :
861 : /*
862 : * We can race against put_pi_state() removing itself from the
863 : * list (a waiter going away). put_pi_state() will first
864 : * decrement the reference count and then modify the list, so
865 : * it's possible to see the list entry but fail this reference
866 : * acquire.
867 : *
868 : * In that case; drop the locks to let put_pi_state() make
869 : * progress and retry the loop.
870 : */
871 0 : if (!refcount_inc_not_zero(&pi_state->refcount)) {
872 0 : raw_spin_unlock_irq(&curr->pi_lock);
873 0 : cpu_relax();
874 0 : raw_spin_lock_irq(&curr->pi_lock);
875 0 : continue;
876 : }
877 0 : raw_spin_unlock_irq(&curr->pi_lock);
878 :
879 0 : spin_lock(&hb->lock);
880 0 : raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
881 0 : raw_spin_lock(&curr->pi_lock);
882 : /*
883 : * We dropped the pi-lock, so re-check whether this
884 : * task still owns the PI-state:
885 : */
886 0 : if (head->next != next) {
887 : /* retain curr->pi_lock for the loop invariant */
888 0 : raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
889 0 : spin_unlock(&hb->lock);
890 0 : put_pi_state(pi_state);
891 0 : continue;
892 : }
893 :
894 0 : WARN_ON(pi_state->owner != curr);
895 0 : WARN_ON(list_empty(&pi_state->list));
896 0 : list_del_init(&pi_state->list);
897 0 : pi_state->owner = NULL;
898 :
899 0 : raw_spin_unlock(&curr->pi_lock);
900 0 : raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
901 0 : spin_unlock(&hb->lock);
902 :
903 0 : rt_mutex_futex_unlock(&pi_state->pi_mutex);
904 0 : put_pi_state(pi_state);
905 :
906 0 : raw_spin_lock_irq(&curr->pi_lock);
907 : }
908 0 : raw_spin_unlock_irq(&curr->pi_lock);
909 : }
910 : #else
911 : static inline void exit_pi_state_list(struct task_struct *curr) { }
912 : #endif
913 :
914 : /*
915 : * We need to check the following states:
916 : *
917 : *  Waiter | pi_state | pi->owner | uTID      | uODIED | ?
918 : *
919 : * [1]  NULL  | ---      | ---       | 0         | 0/1    | Valid
920 : * [2]  NULL  | ---      | ---       | >0        | 0/1    | Valid
921 : *
922 : * [3]  Found | NULL     | --        | Any       | 0/1    | Invalid
923 : *
924 : * [4]  Found | Found    | NULL      | 0         | 1      | Valid
925 : * [5]  Found | Found    | NULL      | >0        | 1      | Invalid
926 : *
927 : * [6]  Found | Found    | task      | 0         | 1      | Valid
928 : *
929 : * [7]  Found | Found    | NULL      | Any       | 0      | Invalid
930 : *
931 : * [8]  Found | Found    | task      | ==taskTID | 0/1    | Valid
932 : * [9]  Found | Found    | task      | 0         | 0      | Invalid
933 : * [10] Found | Found    | task      | !=taskTID | 0/1    | Invalid
934 : *
935 : * [1] Indicates that the kernel can acquire the futex atomically. We
936 : * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
937 : *
938 : * [2] Valid, if TID does not belong to a kernel thread. If no matching
939 : * thread is found then it indicates that the owner TID has died.
940 : *
941 : * [3] Invalid. The waiter is queued on a non PI futex
942 : *
943 : * [4] Valid state after exit_robust_list(), which sets the user space
944 : * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
945 : *
946 : * [5] The user space value got manipulated between exit_robust_list()
947 : * and exit_pi_state_list()
948 : *
949 : * [6] Valid state after exit_pi_state_list() which sets the new owner in
950 : * the pi_state but cannot access the user space value.
951 : *
952 : * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
953 : *
954 : * [8] Owner and user space value match
955 : *
956 : * [9] There is no transient state which sets the user space TID to 0
957 : * except exit_robust_list(), but this is indicated by the
958 : * FUTEX_OWNER_DIED bit. See [4]
959 : *
960 : * [10] There is no transient state which leaves owner and user space
961 : * TID out of sync. Except one error case where the kernel is denied
962 : * write access to the user address, see fixup_pi_state_owner().
963 : *
964 : *
965 : * Serialization and lifetime rules:
966 : *
967 : * hb->lock:
968 : *
969 : * hb -> futex_q, relation
970 : * futex_q -> pi_state, relation
971 : *
972 : * (cannot be raw because hb can contain arbitrary amount
973 : * of futex_q's)
974 : *
975 : * pi_mutex->wait_lock:
976 : *
977 : * {uval, pi_state}
978 : *
979 : * (and pi_mutex 'obviously')
980 : *
981 : * p->pi_lock:
982 : *
983 : * p->pi_state_list -> pi_state->list, relation
984 : *
985 : * pi_state->refcount:
986 : *
987 : * pi_state lifetime
988 : *
989 : *
990 : * Lock order:
991 : *
992 : * hb->lock
993 : * pi_mutex->wait_lock
994 : * p->pi_lock
995 : *
996 : */
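/*
 * In code, the lock order above looks like this (sketch only; see
 * exit_pi_state_list() above for a real instance):
 *
 *	spin_lock(&hb->lock);
 *	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 *	raw_spin_lock(&curr->pi_lock);
 *	...
 *	raw_spin_unlock(&curr->pi_lock);
 *	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 *	spin_unlock(&hb->lock);
 */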
997 :
998 : /*
999 : * Validate that the existing waiter has a pi_state and sanity check
1000 : * the pi_state against the user space value. If correct, attach to
1001 : * it.
1002 : */
1003 0 : static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
1004 : struct futex_pi_state *pi_state,
1005 : struct futex_pi_state **ps)
1006 : {
1007 0 : pid_t pid = uval & FUTEX_TID_MASK;
1008 0 : u32 uval2;
1009 0 : int ret;
1010 :
1011 : /*
1012 : * Userspace might have messed up non-PI and PI futexes [3]
1013 : */
1014 0 : if (unlikely(!pi_state))
1015 : return -EINVAL;
1016 :
1017 : /*
1018 : * We get here with hb->lock held, and having found a
1019 : * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
1020 : * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
1021 : * which in turn means that futex_lock_pi() still has a reference on
1022 : * our pi_state.
1023 : *
1024 : * The waiter holding a reference on @pi_state also protects against
1025 : * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
1026 : * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
1027 : * free pi_state before we can take a reference ourselves.
1028 : */
1029 0 : WARN_ON(!refcount_read(&pi_state->refcount));
1030 :
1031 : /*
1032 : * Now that we have a pi_state, we can acquire wait_lock
1033 : * and do the state validation.
1034 : */
1035 0 : raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
1036 :
1037 : /*
1038 : * Since {uval, pi_state} is serialized by wait_lock, and our current
1039 : * uval was read without holding it, it can have changed. Verify it
1040 : * still is what we expect it to be, otherwise retry the entire
1041 : * operation.
1042 : */
1043 0 : if (get_futex_value_locked(&uval2, uaddr))
1044 0 : goto out_efault;
1045 :
1046 0 : if (uval != uval2)
1047 0 : goto out_eagain;
1048 :
1049 : /*
1050 : * Handle the owner died case:
1051 : */
1052 0 : if (uval & FUTEX_OWNER_DIED) {
1053 : /*
1054 : * exit_pi_state_list sets owner to NULL and wakes the
1055 : * topmost waiter. The task which acquires the
1056 : * pi_state->rt_mutex will fixup owner.
1057 : */
1058 0 : if (!pi_state->owner) {
1059 : /*
1060 : * No pi state owner, but the user space TID
1061 : * is not 0. Inconsistent state. [5]
1062 : */
1063 0 : if (pid)
1064 0 : goto out_einval;
1065 : /*
1066 : * Take a ref on the state and return success. [4]
1067 : */
1068 0 : goto out_attach;
1069 : }
1070 :
1071 : /*
1072 : * If TID is 0, then either the dying owner has not
1073 : * yet executed exit_pi_state_list() or some waiter
1074 : * acquired the rtmutex in the pi state, but did not
1075 : * yet fixup the TID in user space.
1076 : *
1077 : * Take a ref on the state and return success. [6]
1078 : */
1079 0 : if (!pid)
1080 0 : goto out_attach;
1081 : } else {
1082 : /*
1083 : * If the owner died bit is not set, then the pi_state
1084 : * must have an owner. [7]
1085 : */
1086 0 : if (!pi_state->owner)
1087 0 : goto out_einval;
1088 : }
1089 :
1090 : /*
1091 : * Bail out if user space manipulated the futex value. If pi
1092 : * state exists then the owner TID must be the same as the
1093 : * user space TID. [9/10]
1094 : */
1095 0 : if (pid != task_pid_vnr(pi_state->owner))
1096 0 : goto out_einval;
1097 :
1098 0 : out_attach:
1099 0 : get_pi_state(pi_state);
1100 0 : raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1101 0 : *ps = pi_state;
1102 0 : return 0;
1103 :
1104 0 : out_einval:
1105 0 : ret = -EINVAL;
1106 0 : goto out_error;
1107 :
1108 0 : out_eagain:
1109 0 : ret = -EAGAIN;
1110 0 : goto out_error;
1111 :
1112 0 : out_efault:
1113 0 : ret = -EFAULT;
1114 0 : goto out_error;
1115 :
1116 0 : out_error:
1117 0 : raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1118 0 : return ret;
1119 : }
1120 :
1121 : /**
1122 : * wait_for_owner_exiting - Block until the owner has exited
1123 : * @ret: owner's current futex lock status
1124 : * @exiting: Pointer to the exiting task
1125 : *
1126 : * Caller must hold a refcount on @exiting.
1127 : */
1128 0 : static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
1129 : {
1130 0 : if (ret != -EBUSY) {
1131 0 : WARN_ON_ONCE(exiting);
1132 : return;
1133 : }
1134 :
1135 0 : if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
1136 : return;
1137 :
1138 0 : mutex_lock(&exiting->futex_exit_mutex);
1139 : /*
1140 : * No point in doing state checking here. If the waiter got here
1141 : * while the task was in exec()->exec_futex_release() then it can
1142 : * have any FUTEX_STATE_* value when the waiter has acquired the
1143 : * mutex: FUTEX_STATE_OK if it is still running, EXITING or DEAD if it
1144 : * reached exit() already. Highly unlikely and not a problem. Just
1145 : * one more round through the futex maze.
1146 : */
1147 0 : mutex_unlock(&exiting->futex_exit_mutex);
1148 :
1149 0 : put_task_struct(exiting);
1150 : }
1151 :
1152 0 : static int handle_exit_race(u32 __user *uaddr, u32 uval,
1153 : struct task_struct *tsk)
1154 : {
1155 0 : u32 uval2;
1156 :
1157 : /*
1158 : * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
1159 : * caller that the alleged owner is busy.
1160 : */
1161 0 : if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
1162 : return -EBUSY;
1163 :
1164 : /*
1165 : * Reread the user space value to handle the following situation:
1166 : *
1167 : * CPU0 CPU1
1168 : *
1169 : * sys_exit() sys_futex()
1170 : * do_exit() futex_lock_pi()
1171 : * futex_lock_pi_atomic()
1172 : * exit_signals(tsk) No waiters:
1173 : * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
1174 : * mm_release(tsk) Set waiter bit
1175 : * exit_robust_list(tsk) { *uaddr = 0x80000PID;
1176 : * Set owner died attach_to_pi_owner() {
1177 : * *uaddr = 0xC0000000; tsk = get_task(PID);
1178 : * } if (!tsk->flags & PF_EXITING) {
1179 : * ... attach();
1180 : * tsk->futex_state = } else {
1181 : * FUTEX_STATE_DEAD; if (tsk->futex_state !=
1182 : * FUTEX_STATE_DEAD)
1183 : * return -EAGAIN;
1184 : * return -ESRCH; <--- FAIL
1185 : * }
1186 : *
1187 : * Returning ESRCH unconditionally is wrong here because the
1188 : * user space value has been changed by the exiting task.
1189 : *
1190 : * The same logic applies to the case where the exiting task is
1191 : * already gone.
1192 : */
1193 0 : if (get_futex_value_locked(&uval2, uaddr))
1194 : return -EFAULT;
1195 :
1196 : /* If the user space value has changed, try again. */
1197 0 : if (uval2 != uval)
1198 0 : return -EAGAIN;
1199 :
1200 : /*
1201 : * The exiting task did not have a robust list, the robust list was
1202 : * corrupted or the user space value in *uaddr is simply bogus.
1203 : * Give up and tell user space.
1204 : */
1205 : return -ESRCH;
1206 : }
1207 :
1208 : /*
1209 : * Lookup the task for the TID provided from user space and attach to
1210 : * it after doing proper sanity checks.
1211 : */
1212 0 : static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
1213 : struct futex_pi_state **ps,
1214 : struct task_struct **exiting)
1215 : {
1216 0 : pid_t pid = uval & FUTEX_TID_MASK;
1217 0 : struct futex_pi_state *pi_state;
1218 0 : struct task_struct *p;
1219 :
1220 : /*
1221 : * We are the first waiter - try to look up the real owner and attach
1222 : * the new pi_state to it, but bail out when TID = 0 [1]
1223 : *
1224 : * The !pid check is paranoid. None of the call sites should end up
1225 : * with pid == 0, but better safe than sorry. Let the caller retry.
1226 : */
1227 0 : if (!pid)
1228 : return -EAGAIN;
1229 0 : p = find_get_task_by_vpid(pid);
1230 0 : if (!p)
1231 0 : return handle_exit_race(uaddr, uval, NULL);
1232 :
1233 0 : if (unlikely(p->flags & PF_KTHREAD)) {
1234 0 : put_task_struct(p);
1235 0 : return -EPERM;
1236 : }
1237 :
1238 : /*
1239 : * We need to look at the task state to figure out whether the
1240 : * task is exiting. To protect against the change of the task state
1241 : * in futex_exit_release(), we do this protected by p->pi_lock:
1242 : */
1243 0 : raw_spin_lock_irq(&p->pi_lock);
1244 0 : if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
1245 : /*
1246 : * The task is on the way out. When the futex state is
1247 : * FUTEX_STATE_DEAD, we know that the task has finished
1248 : * the cleanup:
1249 : */
1250 0 : int ret = handle_exit_race(uaddr, uval, p);
1251 :
1252 0 : raw_spin_unlock_irq(&p->pi_lock);
1253 : /*
1254 : * If the owner task is between FUTEX_STATE_EXITING and
1255 : * FUTEX_STATE_DEAD then store the task pointer and keep
1256 : * the reference on the task struct. The calling code will
1257 : * drop all locks, wait for the task to reach
1258 : * FUTEX_STATE_DEAD and then drop the refcount. This is
1259 : * required to prevent a live lock when the current task
1260 : * preempted the exiting task between the two states.
1261 : */
1262 0 : if (ret == -EBUSY)
1263 0 : *exiting = p;
1264 : else
1265 0 : put_task_struct(p);
1266 0 : return ret;
1267 : }
1268 :
1269 : /*
1270 : * No existing pi state. First waiter. [2]
1271 : *
1272 : * This creates pi_state, we have hb->lock held, this means nothing can
1273 : * observe this state, wait_lock is irrelevant.
1274 : */
1275 0 : pi_state = alloc_pi_state();
1276 :
1277 : /*
1278 : * Initialize the pi_mutex in locked state and make @p
1279 : * the owner of it:
1280 : */
1281 0 : rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
1282 :
1283 : /* Store the key for possible exit cleanups: */
1284 0 : pi_state->key = *key;
1285 :
1286 0 : WARN_ON(!list_empty(&pi_state->list));
1287 0 : list_add(&pi_state->list, &p->pi_state_list);
1288 : /*
1289 : * Assignment without holding pi_state->pi_mutex.wait_lock is safe
1290 : * because there is no concurrency as the object is not published yet.
1291 : */
1292 0 : pi_state->owner = p;
1293 0 : raw_spin_unlock_irq(&p->pi_lock);
1294 :
1295 0 : put_task_struct(p);
1296 :
1297 0 : *ps = pi_state;
1298 :
1299 0 : return 0;
1300 : }
1301 :
1302 0 : static int lookup_pi_state(u32 __user *uaddr, u32 uval,
1303 : struct futex_hash_bucket *hb,
1304 : union futex_key *key, struct futex_pi_state **ps,
1305 : struct task_struct **exiting)
1306 : {
1307 0 : struct futex_q *top_waiter = futex_top_waiter(hb, key);
1308 :
1309 : /*
1310 : * If there is a waiter on that futex, validate it and
1311 : * attach to the pi_state when the validation succeeds.
1312 : */
1313 0 : if (top_waiter)
1314 0 : return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
1315 :
1316 : /*
1317 : * We are the first waiter - try to look up the owner based on
1318 : * @uval and attach to it.
1319 : */
1320 0 : return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
1321 : }
1322 :
1323 0 : static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
1324 : {
1325 0 : int err;
1326 0 : u32 curval;
1327 :
1328 0 : if (unlikely(should_fail_futex(true)))
1329 : return -EFAULT;
1330 :
1331 0 : err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
1332 0 : if (unlikely(err))
1333 : return err;
1334 :
1335 : /* If user space value changed, let the caller retry */
1336 0 : return curval != uval ? -EAGAIN : 0;
1337 : }
1338 :
1339 : /**
1340 : * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
1341 : * @uaddr: the pi futex user address
1342 : * @hb: the pi futex hash bucket
1343 : * @key: the futex key associated with uaddr and hb
1344 : * @ps: the pi_state pointer where we store the result of the
1345 : * lookup
1346 : * @task: the task to perform the atomic lock work for. This will
1347 : * be "current" except in the case of requeue pi.
1348 : * @exiting: Pointer to store the task pointer of the owner task
1349 : * which is in the middle of exiting
1350 : * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
1351 : *
1352 : * Return:
1353 : * - 0 - ready to wait;
1354 : * - 1 - acquired the lock;
1355 : * - <0 - error
1356 : *
1357 : * The hb->lock and futex_key refs shall be held by the caller.
1358 : *
1359 : * @exiting is only set when the return value is -EBUSY. If so, this holds
1360 : * a refcount on the exiting task on return and the caller needs to drop it
1361 : * after waiting for the exit to complete.
1362 : */
1363 0 : static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
1364 : union futex_key *key,
1365 : struct futex_pi_state **ps,
1366 : struct task_struct *task,
1367 : struct task_struct **exiting,
1368 : int set_waiters)
1369 : {
1370 0 : u32 uval, newval, vpid = task_pid_vnr(task);
1371 0 : struct futex_q *top_waiter;
1372 0 : int ret;
1373 :
1374 : /*
1375 : * Read the user space value first so we can validate a few
1376 : * things before proceeding further.
1377 : */
1378 0 : if (get_futex_value_locked(&uval, uaddr))
1379 : return -EFAULT;
1380 :
1381 0 : if (unlikely(should_fail_futex(true)))
1382 : return -EFAULT;
1383 :
1384 : /*
1385 : * Detect deadlocks.
1386 : */
1387 0 : if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
1388 : return -EDEADLK;
1389 :
1390 0 : if ((unlikely(should_fail_futex(true))))
1391 : return -EDEADLK;
1392 :
1393 : /*
1394 : * Lookup existing state first. If it exists, try to attach to
1395 : * its pi_state.
1396 : */
1397 0 : top_waiter = futex_top_waiter(hb, key);
1398 0 : if (top_waiter)
1399 0 : return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
1400 :
1401 : /*
1402 : * No waiter and the user TID is 0. We are here because either the
1403 : * waiters bit or the owner died bit is set, or we were called from
1404 : * requeue_cmp_pi, or for whatever other reason something took the
1405 : * syscall.
1406 : */
1407 0 : if (!(uval & FUTEX_TID_MASK)) {
1408 : /*
1409 : * We take over the futex. No other waiters and the user space
1410 : * TID is 0. We preserve the owner died bit.
1411 : */
1412 0 : newval = uval & FUTEX_OWNER_DIED;
1413 0 : newval |= vpid;
1414 :
1415 : /* The futex requeue_pi code can enforce the waiters bit */
1416 0 : if (set_waiters)
1417 0 : newval |= FUTEX_WAITERS;
1418 :
1419 0 : ret = lock_pi_update_atomic(uaddr, uval, newval);
1420 : /* If the take over worked, return 1 */
1421 0 : return ret < 0 ? ret : 1;
1422 : }
1423 :
1424 : /*
1425 : * First waiter. Set the waiters bit before attaching ourselves to
1426 : * the owner. If owner tries to unlock, it will be forced into
1427 : * the kernel and blocked on hb->lock.
1428 : */
1429 0 : newval = uval | FUTEX_WAITERS;
1430 0 : ret = lock_pi_update_atomic(uaddr, uval, newval);
1431 0 : if (ret)
1432 : return ret;
1433 : /*
1434 : * If the update of the user space value succeeded, we try to
1435 : * attach to the owner. If that fails, no harm done, we only
1436 : * set the FUTEX_WAITERS bit in the user space variable.
1437 : */
1438 0 : return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
1439 : }
1440 :
1441 : /**
1442 : * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
1443 : * @q: The futex_q to unqueue
1444 : *
1445 : * The q->lock_ptr must not be NULL and must be held by the caller.
1446 : */
1447 166 : static void __unqueue_futex(struct futex_q *q)
1448 : {
1449 166 : struct futex_hash_bucket *hb;
1450 :
1451 166 : if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
1452 : return;
1453 332 : lockdep_assert_held(q->lock_ptr);
1454 :
1455 166 : hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
1456 166 : plist_del(&q->list, &hb->chain);
1457 166 : hb_waiters_dec(hb);
1458 : }
1459 :
1460 : /*
1461 : * The hash bucket lock must be held when this is called.
1462 : * Afterwards, the futex_q must not be accessed. Callers
1463 : * must ensure to later call wake_up_q() for the actual
1464 : * wakeups to occur.
1465 : */
1466 166 : static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
1467 : {
1468 166 : struct task_struct *p = q->task;
1469 :
1470 332 : if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
1471 : return;
1472 :
1473 166 : get_task_struct(p);
1474 166 : __unqueue_futex(q);
1475 : /*
1476 : * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
1477 : * is written, without taking any locks. This is possible in the event
1478 : * of a spurious wakeup, for example. A memory barrier is required here
1479 : * to prevent the following store to lock_ptr from getting ahead of the
1480 : * plist_del in __unqueue_futex().
1481 : */
1482 166 : smp_store_release(&q->lock_ptr, NULL);
1483 :
1484 : /*
1485 : * Queue the task for later wakeup for after we've released
1486 : * the hb->lock.
1487 : */
1488 166 : wake_q_add_safe(wake_q, p);
1489 : }
1490 :
1491 : /*
1492 : * Caller must hold a reference on @pi_state.
1493 : */
1494 0 : static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
1495 : {
1496 0 : u32 curval, newval;
1497 0 : struct task_struct *new_owner;
1498 0 : bool postunlock = false;
1499 0 : DEFINE_WAKE_Q(wake_q);
1500 0 : int ret = 0;
1501 :
1502 0 : new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
1503 0 : if (WARN_ON_ONCE(!new_owner)) {
1504 : /*
1505 : * As per the comment in futex_unlock_pi() this should not happen.
1506 : *
1507 : * When this happens, give up our locks and try again, giving
1508 : * the futex_lock_pi() instance time to complete, either by
1509 : * waiting on the rtmutex or removing itself from the futex
1510 : * queue.
1511 : */
1512 0 : ret = -EAGAIN;
1513 0 : goto out_unlock;
1514 : }
1515 :
1516 : /*
1517 : * We pass it to the next owner. The WAITERS bit is always kept
1518 : * enabled while there is PI state around. We cleanup the owner
1519 : * died bit, because we are the owner.
1520 : */
1521 0 : newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1522 :
1523 0 : if (unlikely(should_fail_futex(true))) {
1524 : ret = -EFAULT;
1525 : goto out_unlock;
1526 : }
1527 :
1528 0 : ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
1529 0 : if (!ret && (curval != uval)) {
1530 : /*
1531 : * If an unconditional UNLOCK_PI operation (user space did not
1532 : * try the TID->0 transition) raced with a waiter setting the
1533 : * FUTEX_WAITERS flag between get_user() and locking the hash
1534 : * bucket lock, retry the operation.
1535 : */
1536 0 : if ((FUTEX_TID_MASK & curval) == uval)
1537 : ret = -EAGAIN;
1538 : else
1539 0 : ret = -EINVAL;
1540 : }
1541 :
1542 0 : if (!ret) {
1543 : /*
1544 : * This is a point of no return; once we modified the uval
1545 : * there is no going back and subsequent operations must
1546 : * not fail.
1547 : */
1548 0 : pi_state_update_owner(pi_state, new_owner);
1549 0 : postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
1550 : }
1551 :
1552 0 : out_unlock:
1553 0 : raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1554 :
1555 0 : if (postunlock)
1556 0 : rt_mutex_postunlock(&wake_q);
1557 :
1558 0 : return ret;
1559 : }
1560 :
1561 : /*
1562 : * Express the locking dependencies for lockdep:
1563 : */
1564 : static inline void
1565 0 : double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1566 : {
1567 0 : if (hb1 <= hb2) {
1568 0 : spin_lock(&hb1->lock);
1569 0 : if (hb1 < hb2)
1570 0 : spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
1571 : } else { /* hb1 > hb2 */
1572 0 : spin_lock(&hb2->lock);
1573 0 : spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
1574 : }
1575 0 : }
1576 :
1577 : static inline void
1578 0 : double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1579 : {
1580 0 : spin_unlock(&hb1->lock);
1581 0 : if (hb1 != hb2)
1582 0 : spin_unlock(&hb2->lock);
1583 0 : }
1584 :
1585 : /*
1586 : * Wake up waiters matching bitset queued on this futex (uaddr).
1587 : */
1588 : static int
1589 594 : futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1590 : {
1591 594 : struct futex_hash_bucket *hb;
1592 594 : struct futex_q *this, *next;
1593 594 : union futex_key key = FUTEX_KEY_INIT;
1594 594 : int ret;
1595 594 : DEFINE_WAKE_Q(wake_q);
1596 :
1597 594 : if (!bitset)
1598 : return -EINVAL;
1599 :
1600 594 : ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
1601 594 : if (unlikely(ret != 0))
1602 : return ret;
1603 :
1604 594 : hb = hash_futex(&key);
1605 :
1606 : /* Make sure we really have tasks to wake up */
1607 1188 : if (!hb_waiters_pending(hb))
1608 : return ret;
1609 :
1610 167 : spin_lock(&hb->lock);
1611 :
1612 169 : plist_for_each_entry_safe(this, next, &hb->chain, list) {
1613 167 : if (match_futex (&this->key, &key)) {
1614 166 : if (this->pi_state || this->rt_waiter) {
1615 : ret = -EINVAL;
1616 : break;
1617 : }
1618 :
1619 : /* Check if one of the bits is set in both bitsets */
1620 166 : if (!(this->bitset & bitset))
1621 0 : continue;
1622 :
1623 166 : mark_wake_futex(&wake_q, this);
1624 166 : if (++ret >= nr_wake)
1625 : break;
1626 : }
1627 : }
1628 :
1629 167 : spin_unlock(&hb->lock);
1630 167 : wake_up_q(&wake_q);
1631 167 : return ret;
1632 : }
1633 :
1634 0 : static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
1635 : {
1636 0 : unsigned int op = (encoded_op & 0x70000000) >> 28;
1637 0 : unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
1638 0 : int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
1639 0 : int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
1640 0 : int oldval, ret;
1641 :
1642 0 : if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
1643 0 : if (oparg < 0 || oparg > 31) {
1644 0 : char comm[sizeof(current->comm)];
1645 : /*
1646 : * kill this print and return -EINVAL when userspace
1647 : * is sane again
1648 : */
1649 0 : pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
1650 : get_task_comm(comm, current), oparg);
1651 0 : oparg &= 31;
1652 : }
1653 0 : oparg = 1 << oparg;
1654 : }
1655 :
1656 0 : pagefault_disable();
1657 0 : ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
1658 0 : pagefault_enable();
1659 0 : if (ret)
1660 : return ret;
1661 :
1662 0 : switch (cmp) {
1663 0 : case FUTEX_OP_CMP_EQ:
1664 0 : return oldval == cmparg;
1665 0 : case FUTEX_OP_CMP_NE:
1666 0 : return oldval != cmparg;
1667 0 : case FUTEX_OP_CMP_LT:
1668 0 : return oldval < cmparg;
1669 0 : case FUTEX_OP_CMP_GE:
1670 0 : return oldval >= cmparg;
1671 0 : case FUTEX_OP_CMP_LE:
1672 0 : return oldval <= cmparg;
1673 0 : case FUTEX_OP_CMP_GT:
1674 0 : return oldval > cmparg;
1675 : default:
1676 : return -ENOSYS;
1677 : }
1678 : }
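/*
 * Worked example of the encoding decoded above, using the uapi FUTEX_OP()
 * macro: FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_GT, 0) encodes to
 * (1 << 28) | (4 << 24) | (1 << 12) | 0 == 0x14001000. futex_wake_op()
 * then atomically does "oldval = *uaddr2; *uaddr2 += 1;" and treats the
 * comparison as successful (waking waiters on uaddr2) iff oldval > 0.
 */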
1679 :
1680 : /*
1681 : * Wake up all waiters hashed on the physical page that is mapped
1682 : * to this virtual address:
1683 : */
1684 : static int
1685 0 : futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1686 : int nr_wake, int nr_wake2, int op)
1687 : {
1688 0 : union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1689 0 : struct futex_hash_bucket *hb1, *hb2;
1690 0 : struct futex_q *this, *next;
1691 0 : int ret, op_ret;
1692 0 : DEFINE_WAKE_Q(wake_q);
1693 :
1694 0 : retry:
1695 0 : ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
1696 0 : if (unlikely(ret != 0))
1697 0 : return ret;
1698 0 : ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
1699 0 : if (unlikely(ret != 0))
1700 0 : return ret;
1701 :
1702 0 : hb1 = hash_futex(&key1);
1703 0 : hb2 = hash_futex(&key2);
1704 :
1705 0 : retry_private:
1706 0 : double_lock_hb(hb1, hb2);
1707 0 : op_ret = futex_atomic_op_inuser(op, uaddr2);
1708 0 : if (unlikely(op_ret < 0)) {
1709 0 : double_unlock_hb(hb1, hb2);
1710 :
1711 0 : if (!IS_ENABLED(CONFIG_MMU) ||
1712 0 : unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
1713 : /*
1714 : * we don't get EFAULT from MMU faults if we don't have
1715 : * an MMU, but we might get them from range checking
1716 : */
1717 0 : ret = op_ret;
1718 0 : return ret;
1719 : }
1720 :
1721 0 : if (op_ret == -EFAULT) {
1722 0 : ret = fault_in_user_writeable(uaddr2);
1723 0 : if (ret)
1724 0 : return ret;
1725 : }
1726 :
1727 0 : if (!(flags & FLAGS_SHARED)) {
1728 0 : cond_resched();
1729 0 : goto retry_private;
1730 : }
1731 :
1732 0 : cond_resched();
1733 0 : goto retry;
1734 : }
1735 :
1736 0 : plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1737 0 : if (match_futex (&this->key, &key1)) {
1738 0 : if (this->pi_state || this->rt_waiter) {
1739 0 : ret = -EINVAL;
1740 0 : goto out_unlock;
1741 : }
1742 0 : mark_wake_futex(&wake_q, this);
1743 0 : if (++ret >= nr_wake)
1744 : break;
1745 : }
1746 : }
1747 :
1748 0 : if (op_ret > 0) {
1749 0 : op_ret = 0;
1750 0 : plist_for_each_entry_safe(this, next, &hb2->chain, list) {
1751 0 : if (match_futex(&this->key, &key2)) {
1752 0 : if (this->pi_state || this->rt_waiter) {
1753 0 : ret = -EINVAL;
1754 0 : goto out_unlock;
1755 : }
1756 0 : mark_wake_futex(&wake_q, this);
1757 0 : if (++op_ret >= nr_wake2)
1758 : break;
1759 : }
1760 : }
1761 0 : ret += op_ret;
1762 : }
1763 :
1764 0 : out_unlock:
1765 0 : double_unlock_hb(hb1, hb2);
1766 0 : wake_up_q(&wake_q);
1767 0 : return ret;
1768 : }
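
/*
 * Userspace sketch (not part of this file): a minimal FUTEX_WAKE_OP
 * call of the kind a condvar/mutex implementation might issue to wake
 * one waiter on each side with a single syscall. Note that nr_wake2
 * travels in the timeout argument slot. The helper name and futex
 * layout are hypothetical; the constants and argument order are the
 * uapi ones.
 */
#if 0	/* userspace example, not kernel code */
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static long wake_both(uint32_t *w1, uint32_t *w2)
{
	/*
	 * Wake up to one waiter on w1; atomically set *w2 = 1 and,
	 * iff its old value was 0, wake up to one waiter on w2 too.
	 */
	return syscall(SYS_futex, w1, FUTEX_WAKE_OP, 1,
		       (unsigned long)1 /* nr_wake2 */, w2,
		       FUTEX_OP(FUTEX_OP_SET, 1, FUTEX_OP_CMP_EQ, 0));
}
#endif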
1769 :
1770 : /**
1771 : * requeue_futex() - Requeue a futex_q from one hb to another
1772 : * @q: the futex_q to requeue
1773 : * @hb1: the source hash_bucket
1774 : * @hb2: the target hash_bucket
1775 : * @key2: the new key for the requeued futex_q
1776 : */
1777 : static inline
1778 0 : void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1779 : struct futex_hash_bucket *hb2, union futex_key *key2)
1780 : {
1781 :
1782 : /*
1783 : * If key1 and key2 hash to the same bucket, no need to
1784 : * requeue.
1785 : */
1786 0 : if (likely(&hb1->chain != &hb2->chain)) {
1787 0 : plist_del(&q->list, &hb1->chain);
1788 0 : hb_waiters_dec(hb1);
1789 0 : hb_waiters_inc(hb2);
1790 0 : plist_add(&q->list, &hb2->chain);
1791 0 : q->lock_ptr = &hb2->lock;
1792 : }
1793 0 : q->key = *key2;
1794 0 : }
1795 :
1796 : /**
1797 : * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1798 : * @q: the futex_q
1799 : * @key: the key of the requeue target futex
1800 : * @hb: the hash_bucket of the requeue target futex
1801 : *
1802 : * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1803 : * target futex if it is uncontended or via a lock steal. Set the futex_q key
1804 : * to the requeue target futex so the waiter can detect the wakeup on the right
1805 : * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1806 : * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
1807 : * to protect access to the pi_state to fixup the owner later. Must be called
1808 : * with both q->lock_ptr and hb->lock held.
1809 : */
1810 : static inline
1811 0 : void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1812 : struct futex_hash_bucket *hb)
1813 : {
1814 0 : q->key = *key;
1815 :
1816 0 : __unqueue_futex(q);
1817 :
1818 0 : WARN_ON(!q->rt_waiter);
1819 0 : q->rt_waiter = NULL;
1820 :
1821 0 : q->lock_ptr = &hb->lock;
1822 :
1823 0 : wake_up_state(q->task, TASK_NORMAL);
1824 0 : }
1825 :
1826 : /**
1827 : * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1828 : * @pifutex: the user address of the to futex
1829 : * @hb1: the from futex hash bucket, must be locked by the caller
1830 : * @hb2: the to futex hash bucket, must be locked by the caller
1831 : * @key1: the from futex key
1832 : * @key2: the to futex key
1833 : * @ps: address to store the pi_state pointer
1834 : * @exiting: Pointer to store the task pointer of the owner task
1835 : * which is in the middle of exiting
1836 : * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
1837 : *
1838 : * Try and get the lock on behalf of the top waiter if we can do it atomically.
1839 : * Wake the top waiter if we succeed. If the caller specified set_waiters,
1840 : * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1841 : * hb1 and hb2 must be held by the caller.
1842 : *
1843 : * @exiting is only set when the return value is -EBUSY. If so, this holds
1844 : * a refcount on the exiting task on return and the caller needs to drop it
1845 : * after waiting for the exit to complete.
1846 : *
1847 : * Return:
1848 : * - 0 - failed to acquire the lock atomically;
1849 : * - >0 - acquired the lock, return value is vpid of the top_waiter;
1850 : * - <0 - error
1851 : */
1852 : static int
1853 0 : futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
1854 : struct futex_hash_bucket *hb2, union futex_key *key1,
1855 : union futex_key *key2, struct futex_pi_state **ps,
1856 : struct task_struct **exiting, int set_waiters)
1857 : {
1858 0 : struct futex_q *top_waiter = NULL;
1859 0 : u32 curval;
1860 0 : int ret, vpid;
1861 :
1862 0 : if (get_futex_value_locked(&curval, pifutex))
1863 : return -EFAULT;
1864 :
1865 0 : if (unlikely(should_fail_futex(true)))
1866 : return -EFAULT;
1867 :
1868 : /*
1869 : * Find the top_waiter and determine if there are additional waiters.
1870 : * If the caller intends to requeue more than 1 waiter to pifutex,
1871 : * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1872 : * as we have means to handle the possible fault. If not, don't set
1873 : * the bit unnecessarily as it will force the subsequent unlock to enter
1874 : * the kernel.
1875 : */
1876 0 : top_waiter = futex_top_waiter(hb1, key1);
1877 :
1878 : /* There are no waiters, nothing for us to do. */
1879 0 : if (!top_waiter)
1880 : return 0;
1881 :
1882 : /* Ensure we requeue to the expected futex. */
1883 0 : if (!match_futex(top_waiter->requeue_pi_key, key2))
1884 : return -EINVAL;
1885 :
1886 : /*
1887 : * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
1888 : * the contended case or if set_waiters is 1. The pi_state is returned
1889 : * in ps in contended cases.
1890 : */
1891 0 : vpid = task_pid_vnr(top_waiter->task);
1892 0 : ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1893 : exiting, set_waiters);
1894 0 : if (ret == 1) {
1895 0 : requeue_pi_wake_futex(top_waiter, key2, hb2);
1896 0 : return vpid;
1897 : }
1898 : return ret;
1899 : }
1900 :
1901 : /**
1902 : * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1903 : * @uaddr1: source futex user address
1904 : * @flags: futex flags (FLAGS_SHARED, etc.)
1905 : * @uaddr2: target futex user address
1906 : * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1907 : * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1908 : * @cmpval: @uaddr1 expected value (or %NULL)
1909 : * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1910 : * pi futex (pi to pi requeue is not supported)
1911 : *
1912 : * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1913 : * uaddr2 atomically on behalf of the top waiter.
1914 : *
1915 : * Return:
1916 : * - >=0 - on success, the number of tasks requeued or woken;
1917 : * - <0 - on error
1918 : */
1919 0 : static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1920 : u32 __user *uaddr2, int nr_wake, int nr_requeue,
1921 : u32 *cmpval, int requeue_pi)
1922 : {
1923 0 : union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1924 0 : int task_count = 0, ret;
1925 0 : struct futex_pi_state *pi_state = NULL;
1926 0 : struct futex_hash_bucket *hb1, *hb2;
1927 0 : struct futex_q *this, *next;
1928 0 : DEFINE_WAKE_Q(wake_q);
1929 :
1930 0 : if (nr_wake < 0 || nr_requeue < 0)
1931 : return -EINVAL;
1932 :
1933 : /*
1934 : * When PI not supported: return -ENOSYS if requeue_pi is true,
1935 : * consequently the compiler knows requeue_pi is always false past
1936 : * this point which will optimize away all the conditional code
1937 : * further down.
1938 : */
1939 0 : if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
1940 : return -ENOSYS;
1941 :
1942 0 : if (requeue_pi) {
1943 : /*
1944 : * Requeue PI only works on two distinct uaddrs. This
1945 : * check is only valid for private futexes. See below.
1946 : */
1947 0 : if (uaddr1 == uaddr2)
1948 : return -EINVAL;
1949 :
1950 : /*
1951 : * requeue_pi requires a pi_state, try to allocate it now
1952 : * without any locks in case it fails.
1953 : */
1954 0 : if (refill_pi_state_cache())
1955 : return -ENOMEM;
1956 : /*
1957 : * requeue_pi must wake as many tasks as it can, up to nr_wake
1958 : * + nr_requeue, since it acquires the rt_mutex prior to
1959 : * returning to userspace, so as to not leave the rt_mutex with
1960 : * waiters and no owner. However, second and third wake-ups
1961 : * cannot be predicted as they involve race conditions with the
1962 : * first wake and a fault while looking up the pi_state. Both
1963 : * pthread_cond_signal() and pthread_cond_broadcast() should
1964 : * use nr_wake=1.
1965 : */
1966 0 : if (nr_wake != 1)
1967 : return -EINVAL;
1968 : }
1969 :
1970 0 : retry:
1971 0 : ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
1972 0 : if (unlikely(ret != 0))
1973 0 : return ret;
1974 0 : ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
1975 : requeue_pi ? FUTEX_WRITE : FUTEX_READ);
1976 0 : if (unlikely(ret != 0))
1977 0 : return ret;
1978 :
1979 : /*
1980 : * The check above which compares uaddrs is not sufficient for
1981 : * shared futexes. We need to compare the keys:
1982 : */
1983 0 : if (requeue_pi && match_futex(&key1, &key2))
1984 : return -EINVAL;
1985 :
1986 0 : hb1 = hash_futex(&key1);
1987 0 : hb2 = hash_futex(&key2);
1988 :
1989 0 : retry_private:
1990 0 : hb_waiters_inc(hb2);
1991 0 : double_lock_hb(hb1, hb2);
1992 :
1993 0 : if (likely(cmpval != NULL)) {
1994 0 : u32 curval;
1995 :
1996 0 : ret = get_futex_value_locked(&curval, uaddr1);
1997 :
1998 0 : if (unlikely(ret)) {
1999 0 : double_unlock_hb(hb1, hb2);
2000 0 : hb_waiters_dec(hb2);
2001 :
2002 0 : ret = get_user(curval, uaddr1);
2003 0 : if (ret)
2004 0 : return ret;
2005 :
2006 0 : if (!(flags & FLAGS_SHARED))
2007 0 : goto retry_private;
2008 :
2009 0 : goto retry;
2010 : }
2011 0 : if (curval != *cmpval) {
2012 0 : ret = -EAGAIN;
2013 0 : goto out_unlock;
2014 : }
2015 : }
2016 :
2017 0 : if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
2018 0 : struct task_struct *exiting = NULL;
2019 :
2020 : /*
2021 : * Attempt to acquire uaddr2 and wake the top waiter. If we
2022 : * intend to requeue waiters, force setting the FUTEX_WAITERS
2023 : * bit. We force this here where we are able to easily handle
2024 : * faults rather than in the requeue loop below.
2025 : */
2026 0 : ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
2027 : &key2, &pi_state,
2028 : &exiting, nr_requeue);
2029 :
2030 : /*
2031 : * At this point the top_waiter has either taken uaddr2 or is
2032 : * waiting on it. If the former, then the pi_state will not
2033 : * exist yet, look it up one more time to ensure we have a
2034 : * reference to it. If the lock was taken, ret contains the
2035 : * vpid of the top waiter task.
2036 : * If the lock was not taken, we have pi_state and an initial
2037 : * refcount on it. In case of an error we have nothing.
2038 : */
2039 0 : if (ret > 0) {
2040 0 : WARN_ON(pi_state);
2041 0 : task_count++;
2042 : /*
2043 : * If we acquired the lock, then the user space value
2044 : * of uaddr2 should be vpid. It cannot be changed by
2045 : * the top waiter as it is blocked on hb2 lock if it
2046 : * tries to do so. If something fiddled with it behind
2047 : * our back the pi state lookup might unearth it. So
2048 : * we rather use the known value than rereading and
2049 : * handing potential crap to lookup_pi_state.
2050 : *
2051 : * If that call succeeds then we have pi_state and an
2052 : * initial refcount on it.
2053 : */
2054 0 : ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
2055 : &pi_state, &exiting);
2056 : }
2057 :
2058 0 : switch (ret) {
2059 : case 0:
2060 : /* We hold a reference on the pi state. */
2061 0 : break;
2062 :
2063 : /* If the above failed, then pi_state is NULL */
2064 0 : case -EFAULT:
2065 0 : double_unlock_hb(hb1, hb2);
2066 0 : hb_waiters_dec(hb2);
2067 0 : ret = fault_in_user_writeable(uaddr2);
2068 0 : if (!ret)
2069 0 : goto retry;
2070 0 : return ret;
2071 0 : case -EBUSY:
2072 : case -EAGAIN:
2073 : /*
2074 : * Two reasons for this:
2075 : * - EBUSY: Owner is exiting and we just wait for the
2076 : * exit to complete.
2077 : * - EAGAIN: The user space value changed.
2078 : */
2079 0 : double_unlock_hb(hb1, hb2);
2080 0 : hb_waiters_dec(hb2);
2081 : /*
2082 : * Handle the case where the owner is in the middle of
2083 : * exiting. Wait for the exit to complete otherwise
2084 : * this task might loop forever, aka. live lock.
2085 : */
2086 0 : wait_for_owner_exiting(ret, exiting);
2087 0 : cond_resched();
2088 0 : goto retry;
2089 0 : default:
2090 0 : goto out_unlock;
2091 : }
2092 : }
2093 :
2094 0 : plist_for_each_entry_safe(this, next, &hb1->chain, list) {
2095 0 : if (task_count - nr_wake >= nr_requeue)
2096 : break;
2097 :
2098 0 : if (!match_futex(&this->key, &key1))
2099 0 : continue;
2100 :
2101 : /*
2102 : * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
2103 : * be paired with each other and no other futex ops.
2104 : *
2105 : * We should never be requeueing a futex_q with a pi_state,
2106 : * which is awaiting a futex_unlock_pi().
2107 : */
2108 0 : if ((requeue_pi && !this->rt_waiter) ||
2109 0 : (!requeue_pi && this->rt_waiter) ||
2110 0 : this->pi_state) {
2111 : ret = -EINVAL;
2112 : break;
2113 : }
2114 :
2115 : /*
2116 : * Wake nr_wake waiters. For requeue_pi, if we acquired the
2117 : * lock, we already woke the top_waiter. If not, it will be
2118 : * woken by futex_unlock_pi().
2119 : */
2120 0 : if (++task_count <= nr_wake && !requeue_pi) {
2121 0 : mark_wake_futex(&wake_q, this);
2122 0 : continue;
2123 : }
2124 :
2125 : /* Ensure we requeue to the expected futex for requeue_pi. */
2126 0 : if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
2127 : ret = -EINVAL;
2128 : break;
2129 : }
2130 :
2131 : /*
2132 : * Requeue nr_requeue waiters and possibly one more in the case
2133 : * of requeue_pi if we couldn't acquire the lock atomically.
2134 : */
2135 0 : if (requeue_pi) {
2136 : /*
2137 : * Prepare the waiter to take the rt_mutex. Take a
2138 : * refcount on the pi_state and store the pointer in
2139 : * the futex_q object of the waiter.
2140 : */
2141 0 : get_pi_state(pi_state);
2142 0 : this->pi_state = pi_state;
2143 0 : ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
2144 : this->rt_waiter,
2145 : this->task);
2146 0 : if (ret == 1) {
2147 : /*
2148 : * We got the lock. We do neither drop the
2149 : * refcount on pi_state nor clear
2150 : * this->pi_state because the waiter needs the
2151 : * pi_state for cleaning up the user space
2152 : * value. It will drop the refcount after
2153 : * doing so.
2154 : */
2155 0 : requeue_pi_wake_futex(this, &key2, hb2);
2156 0 : continue;
2157 0 : } else if (ret) {
2158 : /*
2159 : * rt_mutex_start_proxy_lock() detected a
2160 : * potential deadlock when we tried to queue
2161 : * that waiter. Drop the pi_state reference
2162 : * which we took above and remove the pointer
2163 : * to the state from the waiter's futex_q
2164 : * object.
2165 : */
2166 0 : this->pi_state = NULL;
2167 0 : put_pi_state(pi_state);
2168 : /*
2169 : * We stop queueing more waiters and let user
2170 : * space deal with the mess.
2171 : */
2172 0 : break;
2173 : }
2174 : }
2175 0 : requeue_futex(this, hb1, hb2, &key2);
2176 : }
2177 :
2178 : /*
2179 : * We took an extra initial reference to the pi_state either
2180 : * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
2181 : * need to drop it here again.
2182 : */
2183 0 : put_pi_state(pi_state);
2184 :
2185 0 : out_unlock:
2186 0 : double_unlock_hb(hb1, hb2);
2187 0 : wake_up_q(&wake_q);
2188 0 : hb_waiters_dec(hb2);
2189 0 : return ret ? ret : task_count;
2190 : }
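
/*
 * Userspace sketch (not part of this file): the classic consumer of
 * the non-PI requeue path is a pthread_cond_broadcast()-style
 * operation -- wake one waiter and requeue the rest onto the mutex so
 * they trickle out one by one instead of stampeding. The condvar and
 * mutex layout are hypothetical; op and argument order are the uapi
 * ones (nr_requeue travels in the timeout slot).
 */
#if 0	/* userspace example, not kernel code */
#include <limits.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static long cond_broadcast(uint32_t *cond_seq, uint32_t *mutex_word)
{
	uint32_t seq = __atomic_load_n(cond_seq, __ATOMIC_RELAXED);

	/* cmpval = seq: the kernel returns -EAGAIN if *cond_seq
	 * changed between our load and taking the bucket locks. */
	return syscall(SYS_futex, cond_seq, FUTEX_CMP_REQUEUE, 1,
		       (unsigned long)INT_MAX, mutex_word, seq);
}
#endif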
2191 :
2192 : /* The key must be already stored in q->key. */
2193 168 : static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
2194 : __acquires(&hb->lock)
2195 : {
2196 168 : struct futex_hash_bucket *hb;
2197 :
2198 168 : hb = hash_futex(&q->key);
2199 :
2200 : /*
2201 : * Increment the counter before taking the lock so that
2202 : * a potential waker won't miss a to-be-slept task that is
2203 : * waiting for the spinlock. This is safe as all queue_lock()
2204 : * users end up calling queue_me(). Similarly, for housekeeping,
2205 : * decrement the counter at queue_unlock() when some error has
2206 : * occurred and we don't end up adding the task to the list.
2207 : */
2208 168 : hb_waiters_inc(hb); /* implies smp_mb(); (A) */
2209 :
2210 168 : q->lock_ptr = &hb->lock;
2211 :
2212 168 : spin_lock(&hb->lock);
2213 168 : return hb;
2214 : }
2215 :
2216 : static inline void
2217 1 : queue_unlock(struct futex_hash_bucket *hb)
2218 : __releases(&hb->lock)
2219 : {
2220 2 : spin_unlock(&hb->lock);
2221 0 : hb_waiters_dec(hb);
2222 0 : }
2223 :
2224 167 : static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2225 : {
2226 167 : int prio;
2227 :
2228 : /*
2229 : * The priority used to register this element is
2230 : * - either the real thread-priority for the real-time threads
2231 : * (i.e. threads with a priority lower than MAX_RT_PRIO)
2232 : * - or MAX_RT_PRIO for non-RT threads.
2233 : * Thus, all RT-threads are woken first in priority order, and
2234 : * the others are woken last, in FIFO order.
2235 : */
2236 167 : prio = min(current->normal_prio, MAX_RT_PRIO);
2237 :
2238 167 : plist_node_init(&q->list, prio);
2239 167 : plist_add(&q->list, &hb->chain);
2240 167 : q->task = current;
2241 167 : }
2242 :
2243 : /**
2244 : * queue_me() - Enqueue the futex_q on the futex_hash_bucket
2245 : * @q: The futex_q to enqueue
2246 : * @hb: The destination hash bucket
2247 : *
2248 : * The hb->lock must be held by the caller, and is released here. A call to
2249 : * queue_me() is typically paired with exactly one call to unqueue_me(). The
2250 : * exceptions involve the PI related operations, which may use unqueue_me_pi()
2251 : * or nothing if the unqueue is done as part of the wake process and the unqueue
2252 : * state is implicit in the state of the woken task (see futex_wait_requeue_pi() for
2253 : * an example).
2254 : */
2255 167 : static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2256 : __releases(&hb->lock)
2257 : {
2258 167 : __queue_me(q, hb);
2259 167 : spin_unlock(&hb->lock);
2260 : }
2261 :
2262 : /**
2263 : * unqueue_me() - Remove the futex_q from its futex_hash_bucket
2264 : * @q: The futex_q to unqueue
2265 : *
2266 : * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
2267 : * be paired with exactly one earlier call to queue_me().
2268 : *
2269 : * Return:
2270 : * - 1 - if the futex_q was still queued (and we unqueued it);
2271 : * - 0 - if the futex_q was already removed by the waking thread
2272 : */
2273 166 : static int unqueue_me(struct futex_q *q)
2274 : {
2275 166 : spinlock_t *lock_ptr;
2276 166 : int ret = 0;
2277 :
2278 : /* In the common case we don't take the spinlock, which is nice. */
2279 166 : retry:
2280 : /*
2281 : * q->lock_ptr can change between this read and the following spin_lock.
2282 : * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
2283 : * optimizing lock_ptr out of the logic below.
2284 : */
2285 166 : lock_ptr = READ_ONCE(q->lock_ptr);
2286 166 : if (lock_ptr != NULL) {
2287 0 : spin_lock(lock_ptr);
2288 : /*
2289 : * q->lock_ptr can change between reading it and
2290 : * spin_lock(), causing us to take the wrong lock. This
2291 : * corrects the race condition.
2292 : *
2293 : * Reasoning goes like this: if we have the wrong lock,
2294 : * q->lock_ptr must have changed (maybe several times)
2295 : * between reading it and the spin_lock(). It can
2296 : * change again after the spin_lock() but only if it was
2297 : * already changed before the spin_lock(). It cannot,
2298 : * however, change back to the original value. Therefore
2299 : * we can detect whether we acquired the correct lock.
2300 : */
2301 0 : if (unlikely(lock_ptr != q->lock_ptr)) {
2302 0 : spin_unlock(lock_ptr);
2303 0 : goto retry;
2304 : }
2305 0 : __unqueue_futex(q);
2306 :
2307 0 : BUG_ON(q->pi_state);
2308 :
2309 0 : spin_unlock(lock_ptr);
2310 0 : ret = 1;
2311 : }
2312 :
2313 166 : return ret;
2314 : }
2315 :
2316 : /*
2317 : * PI futexes cannot be requeued and must remove themselves from the
2318 : * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
2319 : * and dropped here.
2320 : */
2321 0 : static void unqueue_me_pi(struct futex_q *q)
2322 : __releases(q->lock_ptr)
2323 : {
2324 0 : __unqueue_futex(q);
2325 :
2326 0 : BUG_ON(!q->pi_state);
2327 0 : put_pi_state(q->pi_state);
2328 0 : q->pi_state = NULL;
2329 :
2330 0 : spin_unlock(q->lock_ptr);
2331 0 : }
2332 :
2333 0 : static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2334 : struct task_struct *argowner)
2335 : {
2336 0 : struct futex_pi_state *pi_state = q->pi_state;
2337 0 : struct task_struct *oldowner, *newowner;
2338 0 : u32 uval, curval, newval, newtid;
2339 0 : int err = 0;
2340 :
2341 0 : oldowner = pi_state->owner;
2342 :
2343 : /*
2344 : * We are here because either:
2345 : *
2346 : * - we stole the lock and pi_state->owner needs updating to reflect
2347 : * that (@argowner == current),
2348 : *
2349 : * or:
2350 : *
2351 : * - someone stole our lock and we need to fix things to point to the
2352 : * new owner (@argowner == NULL).
2353 : *
2354 : * Either way, we have to replace the TID in the user space variable.
2355 : * This must be atomic as we have to preserve the owner died bit here.
2356 : *
2357 : * Note: We write the user space value _before_ changing the pi_state
2358 : * because we can fault here. Imagine swapped out pages or a fork
2359 : * that marked all the anonymous memory readonly for cow.
2360 : *
2361 : * Modifying pi_state _before_ the user space value would leave the
2362 : * pi_state in an inconsistent state when we fault here, because we
2363 : * need to drop the locks to handle the fault. This might be observed
2364 : * in the PID check in lookup_pi_state.
2365 : */
2366 0 : retry:
2367 0 : if (!argowner) {
2368 0 : if (oldowner != current) {
2369 : /*
2370 : * We raced against a concurrent self; things are
2371 : * already fixed up. Nothing to do.
2372 : */
2373 : return 0;
2374 : }
2375 :
2376 0 : if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
2377 : /* We got the lock. pi_state is correct. Tell caller. */
2378 : return 1;
2379 : }
2380 :
2381 : /*
2382 : * The trylock just failed, so either there is an owner or
2383 : * there is a higher priority waiter than this one.
2384 : */
2385 0 : newowner = rt_mutex_owner(&pi_state->pi_mutex);
2386 : /*
2387 : * If the higher priority waiter has not yet taken over the
2388 : * rtmutex then newowner is NULL. We can't return here with
2389 : * that state because it's inconsistent vs. the user space
2390 : * state. So drop the locks and try again. It's a valid
2391 : * situation and not any different from the other retry
2392 : * conditions.
2393 : */
2394 0 : if (unlikely(!newowner)) {
2395 0 : err = -EAGAIN;
2396 0 : goto handle_err;
2397 : }
2398 : } else {
2399 0 : WARN_ON_ONCE(argowner != current);
2400 0 : if (oldowner == current) {
2401 : /*
2402 : * We raced against a concurrent self; things are
2403 : * already fixed up. Nothing to do.
2404 : */
2405 : return 1;
2406 : }
2407 : newowner = argowner;
2408 : }
2409 :
2410 0 : newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
2411 : /* Owner died? */
2412 0 : if (!pi_state->owner)
2413 0 : newtid |= FUTEX_OWNER_DIED;
2414 :
2415 0 : err = get_futex_value_locked(&uval, uaddr);
2416 0 : if (err)
2417 0 : goto handle_err;
2418 :
2419 0 : for (;;) {
2420 0 : newval = (uval & FUTEX_OWNER_DIED) | newtid;
2421 :
2422 0 : err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
2423 0 : if (err)
2424 0 : goto handle_err;
2425 :
2426 0 : if (curval == uval)
2427 : break;
2428 0 : uval = curval;
2429 : }
2430 :
2431 : /*
2432 : * We fixed up user space. Now we need to fix the pi_state
2433 : * itself.
2434 : */
2435 0 : pi_state_update_owner(pi_state, newowner);
2436 :
2437 0 : return argowner == current;
2438 :
2439 : /*
2440 : * In order to reschedule or handle a page fault, we need to drop the
2441 : * locks here. In the case of a fault, this gives the other task
2442 : * (either the highest priority waiter itself or the task which stole
2443 : * the rtmutex) the chance to try the fixup of the pi_state. So once we
2444 : * are back from handling the fault we need to check the pi_state after
2445 : * reacquiring the locks and before trying to do another fixup. When
2446 : * the fixup has been done already we simply return.
2447 : *
2448 : * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
2449 : * drop hb->lock since the caller owns the hb -> futex_q relation.
2450 : * Dropping the pi_mutex->wait_lock requires the state revalidate.
2451 : */
2452 0 : handle_err:
2453 0 : raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2454 0 : spin_unlock(q->lock_ptr);
2455 :
2456 0 : switch (err) {
2457 0 : case -EFAULT:
2458 0 : err = fault_in_user_writeable(uaddr);
2459 0 : break;
2460 :
2461 0 : case -EAGAIN:
2462 0 : cond_resched();
2463 0 : err = 0;
2464 0 : break;
2465 :
2466 : default:
2467 0 : WARN_ON_ONCE(1);
2468 0 : break;
2469 : }
2470 :
2471 0 : spin_lock(q->lock_ptr);
2472 0 : raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2473 :
2474 : /*
2475 : * Check if someone else fixed it for us:
2476 : */
2477 0 : if (pi_state->owner != oldowner)
2478 0 : return argowner == current;
2479 :
2480 : /* Retry if err was -EAGAIN or the fault-in succeeded */
2481 0 : if (!err)
2482 0 : goto retry;
2483 :
2484 : /*
2485 : * fault_in_user_writeable() failed so user state is immutable. At
2486 : * best we can make the kernel state consistent but user state will
2487 : * be most likely hosed and any subsequent unlock operation will be
2488 : * rejected due to PI futex rule [10].
2489 : *
2490 : * Ensure that the rtmutex owner is also the pi_state owner despite
2491 : * the user space value claiming something different. There is no
2492 : * point in unlocking the rtmutex if current is the owner as it
2493 : * would need to wait until the next waiter has taken the rtmutex
2494 : * to guarantee consistent state. Keep it simple. Userspace asked
2495 : * for this wrecked state.
2496 : *
2497 : * The rtmutex has an owner - either current or some other
2498 : * task. See the EAGAIN loop above.
2499 : */
2500 0 : pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
2501 :
2502 0 : return err;
2503 : }
2504 :
2505 0 : static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2506 : struct task_struct *argowner)
2507 : {
2508 0 : struct futex_pi_state *pi_state = q->pi_state;
2509 0 : int ret;
2510 :
2511 0 : lockdep_assert_held(q->lock_ptr);
2512 :
2513 0 : raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2514 0 : ret = __fixup_pi_state_owner(uaddr, q, argowner);
2515 0 : raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2516 0 : return ret;
2517 : }
2518 :
2519 : static long futex_wait_restart(struct restart_block *restart);
2520 :
2521 : /**
2522 : * fixup_owner() - Post lock pi_state and corner case management
2523 : * @uaddr: user address of the futex
2524 : * @q: futex_q (contains pi_state and access to the rt_mutex)
2525 : * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
2526 : *
2527 : * After attempting to lock an rt_mutex, this function is called to cleanup
2528 : * the pi_state owner as well as handle race conditions that may allow us to
2529 : * acquire the lock. Must be called with the hb lock held.
2530 : *
2531 : * Return:
2532 : * - 1 - success, lock taken;
2533 : * - 0 - success, lock not taken;
2534 : * - <0 - on error (-EFAULT)
2535 : */
2536 0 : static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2537 : {
2538 0 : if (locked) {
2539 : /*
2540 : * Got the lock. We might not be the anticipated owner if we
2541 : * did a lock-steal - fix up the PI-state in that case:
2542 : *
2543 : * Speculative pi_state->owner read (we don't hold wait_lock);
2544 : * since we own the lock pi_state->owner == current is the
2545 : * stable state, anything else needs more attention.
2546 : */
2547 0 : if (q->pi_state->owner != current)
2548 0 : return fixup_pi_state_owner(uaddr, q, current);
2549 : return 1;
2550 : }
2551 :
2552 : /*
2553 : * If we didn't get the lock; check if anybody stole it from us. In
2554 : * that case, we need to fix up the uval to point to them instead of
2555 : * us, otherwise bad things happen. [10]
2556 : *
2557 : * Another speculative read; pi_state->owner == current is unstable
2558 : * but needs our attention.
2559 : */
2560 0 : if (q->pi_state->owner == current)
2561 0 : return fixup_pi_state_owner(uaddr, q, NULL);
2562 :
2563 : /*
2564 : * Paranoia check. If we did not take the lock, then we should not be
2565 : * the owner of the rt_mutex. Warn and establish consistent state.
2566 : */
2567 0 : if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
2568 0 : return fixup_pi_state_owner(uaddr, q, current);
2569 :
2570 : return 0;
2571 : }
2572 :
2573 : /**
2574 : * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
2575 : * @hb: the futex hash bucket, must be locked by the caller
2576 : * @q: the futex_q to queue up on
2577 : * @timeout: the prepared hrtimer_sleeper, or null for no timeout
2578 : */
2579 167 : static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
2580 : struct hrtimer_sleeper *timeout)
2581 : {
2582 : /*
2583 : * The task state is guaranteed to be set before another task can
2584 : * wake it. set_current_state() is implemented using smp_store_mb() and
2585 : * queue_me() calls spin_unlock() upon completion, both serializing
2586 : * access to the hash list and forcing another memory barrier.
2587 : */
2588 167 : set_current_state(TASK_INTERRUPTIBLE);
2589 167 : queue_me(q, hb);
2590 :
2591 : /* Arm the timer */
2592 167 : if (timeout)
2593 0 : hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
2594 :
2595 : /*
2596 : * If we have been removed from the hash list, then another task
2597 : * has tried to wake us, and we can skip the call to schedule().
2598 : */
2599 167 : if (likely(!plist_node_empty(&q->list))) {
2600 : /*
2601 : * If the timer has already expired, current will already be
2602 : * flagged for rescheduling. Only call schedule if there
2603 : * is no timeout, or if it has yet to expire.
2604 : */
2605 167 : if (!timeout || timeout->task)
2606 167 : freezable_schedule();
2607 : }
2608 166 : __set_current_state(TASK_RUNNING);
2609 166 : }
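
/*
 * The ordering used above is the canonical kernel sleep/wake pattern,
 * in schematic form (a sketch, not futex-specific code):
 *
 *	// sleeper				// waker
 *	set_current_state(TASK_INTERRUPTIBLE);	condition = true;
 *	if (!condition)				wake_up_process(task);
 *		schedule();
 *	__set_current_state(TASK_RUNNING);
 *
 * set_current_state() is a store with a full barrier (smp_store_mb()),
 * so either the sleeper sees the condition and skips schedule(), or
 * the waker sees a sleeping task state and wakes it; the wakeup
 * cannot fall between the condition test and going to sleep.
 */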
2610 :
2611 : /**
2612 : * futex_wait_setup() - Prepare to wait on a futex
2613 : * @uaddr: the futex userspace address
2614 : * @val: the expected value
2615 : * @flags: futex flags (FLAGS_SHARED, etc.)
2616 : * @q: the associated futex_q
2617 : * @hb: storage for hash_bucket pointer to be returned to caller
2618 : *
2619 : * Setup the futex_q and locate the hash_bucket. Get the futex value and
2620 : * compare it with the expected value. Handle atomic faults internally.
2621 : * Return with the hb lock held and a q.key reference on success, and unlocked
2622 : * with no q.key reference on failure.
2623 : *
2624 : * Return:
2625 : * - 0 - uaddr contains val and hb has been locked;
2626 : * - <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
2627 : */
2628 168 : static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
2629 : struct futex_q *q, struct futex_hash_bucket **hb)
2630 : {
2631 168 : u32 uval;
2632 168 : int ret;
2633 :
2634 : /*
2635 : * Access the page AFTER the hash-bucket is locked.
2636 : * Order is important:
2637 : *
2638 : * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
2639 : * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
2640 : *
2641 : * The basic logical guarantee of a futex is that it blocks ONLY
2642 : * if cond(var) is known to be true at the time of blocking, for
2643 : * any cond. If we locked the hash-bucket after testing *uaddr, that
2644 : * would open a race condition where we could block indefinitely with
2645 : * cond(var) false, which would violate the guarantee.
2646 : *
2647 : * On the other hand, we insert q and release the hash-bucket only
2648 : * after testing *uaddr. This guarantees that futex_wait() will NOT
2649 : * absorb a wakeup if *uaddr does not match the desired values
2650 : * while the syscall executes.
2651 : */
2652 168 : retry:
2653 168 : ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
2654 168 : if (unlikely(ret != 0))
2655 0 : return ret;
2656 :
2657 168 : retry_private:
2658 168 : *hb = queue_lock(q);
2659 :
2660 168 : ret = get_futex_value_locked(&uval, uaddr);
2661 :
2662 168 : if (ret) {
2663 0 : queue_unlock(*hb);
2664 :
2665 0 : ret = get_user(uval, uaddr);
2666 0 : if (ret)
2667 0 : return ret;
2668 :
2669 0 : if (!(flags & FLAGS_SHARED))
2670 0 : goto retry_private;
2671 :
2672 0 : goto retry;
2673 : }
2674 :
2675 168 : if (uval != val) {
2676 1 : queue_unlock(*hb);
2677 1 : ret = -EWOULDBLOCK;
2678 : }
2679 :
2680 : return ret;
2681 : }
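
/*
 * Userspace sketch (not part of this file) of the waiter/waker pair
 * the ordering comment above reasons about; names are hypothetical:
 */
#if 0	/* userspace example, not kernel code */
#include <limits.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static void wait_for_flag(uint32_t *f)
{
	while (__atomic_load_n(f, __ATOMIC_ACQUIRE) == 0) {
		/* Sleeps only if the locked re-check in
		 * futex_wait_setup() still sees *f == 0; otherwise the
		 * call returns -EWOULDBLOCK. Spurious wakeups are
		 * absorbed by the loop. */
		syscall(SYS_futex, f, FUTEX_WAIT, 0, NULL, NULL, 0);
	}
}

static void set_flag(uint32_t *f)
{
	__atomic_store_n(f, 1, __ATOMIC_RELEASE);
	syscall(SYS_futex, f, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
}
#endif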
2682 :
2683 168 : static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
2684 : ktime_t *abs_time, u32 bitset)
2685 : {
2686 168 : struct hrtimer_sleeper timeout, *to;
2687 168 : struct restart_block *restart;
2688 168 : struct futex_hash_bucket *hb;
2689 168 : struct futex_q q = futex_q_init;
2690 168 : int ret;
2691 :
2692 168 : if (!bitset)
2693 : return -EINVAL;
2694 168 : q.bitset = bitset;
2695 :
2696 168 : to = futex_setup_timer(abs_time, &timeout, flags,
2697 168 : current->timer_slack_ns);
2698 168 : retry:
2699 : /*
2700 : * Prepare to wait on uaddr. On success, holds hb lock and increments
2701 : * q.key refs.
2702 : */
2703 168 : ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2704 168 : if (ret)
2705 1 : goto out;
2706 :
2707 : /* queue_me and wait for wakeup, timeout, or a signal. */
2708 167 : futex_wait_queue_me(hb, &q, to);
2709 :
2710 : /* If we were woken (and unqueued), we succeeded, whatever. */
2711 166 : ret = 0;
2712 : /* unqueue_me() drops q.key ref */
2713 166 : if (!unqueue_me(&q))
2714 166 : goto out;
2715 0 : ret = -ETIMEDOUT;
2716 0 : if (to && !to->task)
2717 0 : goto out;
2718 :
2719 : /*
2720 : * We expect signal_pending(current), but we might be the
2721 : * victim of a spurious wakeup as well.
2722 : */
2723 0 : if (!signal_pending(current))
2724 0 : goto retry;
2725 :
2726 0 : ret = -ERESTARTSYS;
2727 0 : if (!abs_time)
2728 0 : goto out;
2729 :
2730 0 : restart = &current->restart_block;
2731 0 : restart->fn = futex_wait_restart;
2732 0 : restart->futex.uaddr = uaddr;
2733 0 : restart->futex.val = val;
2734 0 : restart->futex.time = *abs_time;
2735 0 : restart->futex.bitset = bitset;
2736 0 : restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
2737 :
2738 0 : ret = -ERESTART_RESTARTBLOCK;
2739 :
2740 167 : out:
2741 167 : if (to) {
2742 0 : hrtimer_cancel(&to->timer);
2743 0 : destroy_hrtimer_on_stack(&to->timer);
2744 : }
2745 : return ret;
2746 : }
2747 :
2748 :
2749 0 : static long futex_wait_restart(struct restart_block *restart)
2750 : {
2751 0 : u32 __user *uaddr = restart->futex.uaddr;
2752 0 : ktime_t t, *tp = NULL;
2753 :
2754 0 : if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
2755 0 : t = restart->futex.time;
2756 0 : tp = &t;
2757 : }
2758 0 : restart->fn = do_no_restart_syscall;
2759 :
2760 0 : return (long)futex_wait(uaddr, restart->futex.flags,
2761 : restart->futex.val, tp, restart->futex.bitset);
2762 : }
2763 :
2764 :
2765 : /*
2766 : * Userspace tried a 0 -> TID atomic transition of the futex value
2767 : * and failed. The kernel side here does the whole locking operation:
2768 : * if there are waiters then it will block as a consequence of relying
2769 : * on rt-mutexes; it does PI, etc. (Due to races the kernel might see
2770 : * a 0 value of the futex too.)
2771 : *
2772 : * Also serves as the futex trylock_pi() operation, with matching semantics.
2773 : */
2774 0 : static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
2775 : ktime_t *time, int trylock)
2776 : {
2777 0 : struct hrtimer_sleeper timeout, *to;
2778 0 : struct task_struct *exiting = NULL;
2779 0 : struct rt_mutex_waiter rt_waiter;
2780 0 : struct futex_hash_bucket *hb;
2781 0 : struct futex_q q = futex_q_init;
2782 0 : int res, ret;
2783 :
2784 0 : if (!IS_ENABLED(CONFIG_FUTEX_PI))
2785 : return -ENOSYS;
2786 :
2787 0 : if (refill_pi_state_cache())
2788 : return -ENOMEM;
2789 :
2790 0 : to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0);
2791 :
2792 : retry:
2793 0 : ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
2794 0 : if (unlikely(ret != 0))
2795 0 : goto out;
2796 :
2797 0 : retry_private:
2798 0 : hb = queue_lock(&q);
2799 :
2800 0 : ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
2801 : &exiting, 0);
2802 0 : if (unlikely(ret)) {
2803 : /*
2804 : * Atomic work succeeded and we got the lock,
2805 : * or failed. Either way, we do _not_ block.
2806 : */
2807 0 : switch (ret) {
2808 0 : case 1:
2809 : /* We got the lock. */
2810 0 : ret = 0;
2811 0 : goto out_unlock_put_key;
2812 0 : case -EFAULT:
2813 0 : goto uaddr_faulted;
2814 : case -EBUSY:
2815 : case -EAGAIN:
2816 : /*
2817 : * Two reasons for this:
2818 : * - EBUSY: Task is exiting and we just wait for the
2819 : * exit to complete.
2820 : * - EAGAIN: The user space value changed.
2821 : */
2822 0 : queue_unlock(hb);
2823 : /*
2824 : * Handle the case where the owner is in the middle of
2825 : * exiting. Wait for the exit to complete otherwise
2826 : * this task might loop forever, aka. live lock.
2827 : */
2828 0 : wait_for_owner_exiting(ret, exiting);
2829 0 : cond_resched();
2830 0 : goto retry;
2831 0 : default:
2832 0 : goto out_unlock_put_key;
2833 : }
2834 : }
2835 :
2836 0 : WARN_ON(!q.pi_state);
2837 :
2838 : /*
2839 : * Only actually queue now that the atomic ops are done:
2840 : */
2841 0 : __queue_me(&q, hb);
2842 :
2843 0 : if (trylock) {
2844 0 : ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
2845 : /* Fixup the trylock return value: */
2846 0 : ret = ret ? 0 : -EWOULDBLOCK;
2847 0 : goto no_block;
2848 : }
2849 :
2850 0 : rt_mutex_init_waiter(&rt_waiter);
2851 :
2852 : /*
2853 : * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
2854 : * hold it while doing rt_mutex_start_proxy(), because then it will
2855 : * include hb->lock in the blocking chain, even though we'll not in
2856 : * fact hold it while blocking. This will lead it to report -EDEADLK
2857 : * and BUG when futex_unlock_pi() interleaves with this.
2858 : *
2859 : * Therefore acquire wait_lock while holding hb->lock, but drop the
2860 : * latter before calling __rt_mutex_start_proxy_lock(). This
2861 : * interleaves with futex_unlock_pi() -- which does a similar lock
2862 : * handoff -- such that the latter can observe the futex_q::pi_state
2863 : * before __rt_mutex_start_proxy_lock() is done.
2864 : */
2865 0 : raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
2866 0 : spin_unlock(q.lock_ptr);
2867 : /*
2868 : * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
2869 : * such that futex_unlock_pi() is guaranteed to observe the waiter when
2870 : * it sees the futex_q::pi_state.
2871 : */
2872 0 : ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
2873 0 : raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
2874 :
2875 0 : if (ret) {
2876 0 : if (ret == 1)
2877 0 : ret = 0;
2878 0 : goto cleanup;
2879 : }
2880 :
2881 0 : if (unlikely(to))
2882 0 : hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
2883 :
2884 0 : ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
2885 :
2886 0 : cleanup:
2887 0 : spin_lock(q.lock_ptr);
2888 : /*
2889 : * If we failed to acquire the lock (deadlock/signal/timeout), we must
2890 : * first acquire the hb->lock before removing the lock from the
2891 : * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
2892 : * lists consistent.
2893 : *
2894 : * In particular; it is important that futex_unlock_pi() can not
2895 : * observe this inconsistency.
2896 : */
2897 0 : if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
2898 0 : ret = 0;
2899 :
2900 0 : no_block:
2901 : /*
2902 : * Fixup the pi_state owner and possibly acquire the lock if we
2903 : * haven't already.
2904 : */
2905 0 : res = fixup_owner(uaddr, &q, !ret);
2906 : /*
2907 : * If fixup_owner() returned an error, propagate that. If it acquired
2908 : * the lock, clear our -ETIMEDOUT or -EINTR.
2909 : */
2910 0 : if (res)
2911 0 : ret = (res < 0) ? res : 0;
2912 :
2913 : /* Unqueue and drop the lock */
2914 0 : unqueue_me_pi(&q);
2915 0 : goto out;
2916 :
2917 0 : out_unlock_put_key:
2918 0 : queue_unlock(hb);
2919 :
2920 0 : out:
2921 0 : if (to) {
2922 0 : hrtimer_cancel(&to->timer);
2923 0 : destroy_hrtimer_on_stack(&to->timer);
2924 : }
2925 0 : return ret != -EINTR ? ret : -ERESTARTNOINTR;
2926 :
2927 0 : uaddr_faulted:
2928 0 : queue_unlock(hb);
2929 :
2930 0 : ret = fault_in_user_writeable(uaddr);
2931 0 : if (ret)
2932 0 : goto out;
2933 :
2934 0 : if (!(flags & FLAGS_SHARED))
2935 0 : goto retry_private;
2936 :
2937 0 : goto retry;
2938 : }
2939 :
2940 : /*
2941 : * Userspace attempted a TID -> 0 atomic transition, and failed.
2942 : * This is the in-kernel slowpath: we look up the PI state (if any),
2943 : * and do the rt-mutex unlock.
2944 : */
2945 0 : static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2946 : {
2947 0 : u32 curval, uval, vpid = task_pid_vnr(current);
2948 0 : union futex_key key = FUTEX_KEY_INIT;
2949 0 : struct futex_hash_bucket *hb;
2950 0 : struct futex_q *top_waiter;
2951 0 : int ret;
2952 :
2953 0 : if (!IS_ENABLED(CONFIG_FUTEX_PI))
2954 : return -ENOSYS;
2955 :
2956 0 : retry:
2957 0 : if (get_user(uval, uaddr))
2958 : return -EFAULT;
2959 : /*
2960 : * We release only a lock we actually own:
2961 : */
2962 0 : if ((uval & FUTEX_TID_MASK) != vpid)
2963 : return -EPERM;
2964 :
2965 0 : ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
2966 0 : if (ret)
2967 0 : return ret;
2968 :
2969 0 : hb = hash_futex(&key);
2970 0 : spin_lock(&hb->lock);
2971 :
2972 : /*
2973 : * Check waiters first. We do not trust user space values at
2974 : * all and we at least want to know if user space fiddled
2975 : * with the futex value instead of blindly unlocking.
2976 : */
2977 0 : top_waiter = futex_top_waiter(hb, &key);
2978 0 : if (top_waiter) {
2979 0 : struct futex_pi_state *pi_state = top_waiter->pi_state;
2980 :
2981 0 : ret = -EINVAL;
2982 0 : if (!pi_state)
2983 0 : goto out_unlock;
2984 :
2985 : /*
2986 : * If current does not own the pi_state then the futex is
2987 : * inconsistent and user space fiddled with the futex value.
2988 : */
2989 0 : if (pi_state->owner != current)
2990 0 : goto out_unlock;
2991 :
2992 0 : get_pi_state(pi_state);
2993 : /*
2994 : * By taking wait_lock while still holding hb->lock, we ensure
2995 : * there is no point where we hold neither; and therefore
2996 : * wake_futex_pi() must observe a state consistent with what we
2997 : * observed.
2998 : *
2999 : * In particular; this forces __rt_mutex_start_proxy() to
3000 : * complete such that we're guaranteed to observe the
3001 : * rt_waiter. Also see the WARN in wake_futex_pi().
3002 : */
3003 0 : raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
3004 0 : spin_unlock(&hb->lock);
3005 :
3006 : /* drops pi_state->pi_mutex.wait_lock */
3007 0 : ret = wake_futex_pi(uaddr, uval, pi_state);
3008 :
3009 0 : put_pi_state(pi_state);
3010 :
3011 : /*
3012 : * Success, we're done! No tricky corner cases.
3013 : */
3014 0 : if (!ret)
3015 : return ret;
3016 : /*
3017 : * The atomic access to the futex value generated a
3018 : * pagefault, so retry the user-access and the wakeup:
3019 : */
3020 0 : if (ret == -EFAULT)
3021 0 : goto pi_faulted;
3022 : /*
3023 : * An unconditional UNLOCK_PI op raced against a waiter
3024 : * setting the FUTEX_WAITERS bit. Try again.
3025 : */
3026 0 : if (ret == -EAGAIN)
3027 0 : goto pi_retry;
3028 : /*
3029 : * wake_futex_pi has detected invalid state. Tell user
3030 : * space.
3031 : */
3032 0 : return ret;
3033 : }
3034 :
3035 : /*
3036 : * We have no kernel internal state, i.e. no waiters in the
3037 : * kernel. Waiters which are about to queue themselves are stuck
3038 : * on hb->lock. So we can safely ignore them. We do neither
3039 : * preserve the WAITERS bit nor the OWNER_DIED one. We are the
3040 : * owner.
3041 : */
3042 0 : if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
3043 0 : spin_unlock(&hb->lock);
3044 0 : switch (ret) {
3045 0 : case -EFAULT:
3046 0 : goto pi_faulted;
3047 :
3048 0 : case -EAGAIN:
3049 0 : goto pi_retry;
3050 :
3051 : default:
3052 0 : WARN_ON_ONCE(1);
3053 0 : return ret;
3054 : }
3055 : }
3056 :
3057 : /*
3058 : * If uval has changed, let user space handle it.
3059 : */
3060 0 : ret = (curval == uval) ? 0 : -EAGAIN;
3061 :
3062 0 : out_unlock:
3063 0 : spin_unlock(&hb->lock);
3064 0 : return ret;
3065 :
3066 0 : pi_retry:
3067 0 : cond_resched();
3068 0 : goto retry;
3069 :
3070 0 : pi_faulted:
3071 :
3072 0 : ret = fault_in_user_writeable(uaddr);
3073 0 : if (!ret)
3074 0 : goto retry;
3075 :
3076 : return ret;
3077 : }
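
/*
 * Userspace sketch (not part of this file) of the PI futex protocol
 * these two functions back up: the fast paths are plain cmpxchg and
 * the kernel is entered only on contention. Helper names are
 * hypothetical; the constants and transitions are the documented ones.
 */
#if 0	/* userspace example, not kernel code */
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static void pi_lock(uint32_t *futex, uint32_t tid)
{
	uint32_t zero = 0;

	/* Fast path: 0 -> TID. */
	if (__atomic_compare_exchange_n(futex, &zero, tid, 0,
					__ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
		return;
	/* Contended: futex_lock_pi() queues us on the rt_mutex, sets
	 * FUTEX_WAITERS and priority-boosts the owner if needed. */
	syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

static void pi_unlock(uint32_t *futex, uint32_t tid)
{
	uint32_t val = tid;

	/* Fast path: TID -> 0, valid only with no waiters. */
	if (__atomic_compare_exchange_n(futex, &val, 0, 0,
					__ATOMIC_RELEASE, __ATOMIC_RELAXED))
		return;
	/* FUTEX_WAITERS (or FUTEX_OWNER_DIED) is set: have
	 * futex_unlock_pi() hand the lock to the top waiter. */
	syscall(SYS_futex, futex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}
#endif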
3078 :
3079 : /**
3080 : * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
3081 : * @hb: the hash_bucket the futex_q was originally enqueued on
3082 : * @q: the futex_q woken while waiting to be requeued
3083 : * @key2: the futex_key of the requeue target futex
3084 : * @timeout: the timeout associated with the wait (NULL if none)
3085 : *
3086 : * Detect if the task was woken on the initial futex as opposed to the requeue
3087 : * target futex. If so, determine if it was a timeout or a signal that caused
3088 : * the wakeup and return the appropriate error code to the caller. Must be
3089 : * called with the hb lock held.
3090 : *
3091 : * Return:
3092 : * - 0 = no early wakeup detected;
3093 : * - <0 = -ETIMEDOUT or -ERESTARTNOINTR
3094 : */
3095 : static inline
3096 0 : int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
3097 : struct futex_q *q, union futex_key *key2,
3098 : struct hrtimer_sleeper *timeout)
3099 : {
3100 0 : int ret = 0;
3101 :
3102 : /*
3103 : * With the hb lock held, we avoid races while we process the wakeup.
3104 : * We only need to hold hb (and not hb2) to ensure atomicity as the
3105 : * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
3106 : * It can't be requeued from uaddr2 to something else since we don't
3107 : * support a PI aware source futex for requeue.
3108 : */
3109 0 : if (!match_futex(&q->key, key2)) {
3110 0 : WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
3111 : /*
3112 : * We were woken prior to requeue by a timeout or a signal.
3113 : * Unqueue the futex_q and determine which it was.
3114 : */
3115 0 : plist_del(&q->list, &hb->chain);
3116 0 : hb_waiters_dec(hb);
3117 :
3118 : /* Handle spurious wakeups gracefully */
3119 0 : ret = -EWOULDBLOCK;
3120 0 : if (timeout && !timeout->task)
3121 : ret = -ETIMEDOUT;
3122 0 : else if (signal_pending(current))
3123 0 : ret = -ERESTARTNOINTR;
3124 : }
3125 0 : return ret;
3126 : }
3127 :
3128 : /**
3129 : * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
3130 : * @uaddr: the futex we initially wait on (non-pi)
3131 : * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
3132 : * the same type, no requeueing from private to shared, etc.
3133 : * @val: the expected value of uaddr
3134 : * @abs_time: absolute timeout
3135 : * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
3136 : * @uaddr2: the pi futex we will take prior to returning to user-space
3137 : *
3138 : * The caller will wait on uaddr and will be requeued by futex_requeue() to
3139 : * uaddr2 which must be PI aware and distinct from uaddr. Normal wakeup will wake
3140 : * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
3141 : * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
3142 : * without one, the pi logic would not know which task to boost/deboost, if
3143 : * there was a need to.
3144 : *
3145 : * We call schedule in futex_wait_queue_me() when we enqueue and return there
3146 : * via the following--
3147 : * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
3148 : * 2) wakeup on uaddr2 after a requeue
3149 : * 3) signal
3150 : * 4) timeout
3151 : *
3152 : * If 3, cleanup and return -ERESTARTNOINTR.
3153 : *
3154 : * If 2, we may then block on trying to take the rt_mutex and return via:
3155 : * 5) successful lock
3156 : * 6) signal
3157 : * 7) timeout
3158 : * 8) other lock acquisition failure
3159 : *
3160 : * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
3161 : *
3162 : * If 4 or 7, we cleanup and return with -ETIMEDOUT.
3163 : *
3164 : * Return:
3165 : * - 0 - On success;
3166 : * - <0 - On error
3167 : */
3168 0 : static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
3169 : u32 val, ktime_t *abs_time, u32 bitset,
3170 : u32 __user *uaddr2)
3171 : {
3172 0 : struct hrtimer_sleeper timeout, *to;
3173 0 : struct rt_mutex_waiter rt_waiter;
3174 0 : struct futex_hash_bucket *hb;
3175 0 : union futex_key key2 = FUTEX_KEY_INIT;
3176 0 : struct futex_q q = futex_q_init;
3177 0 : int res, ret;
3178 :
3179 0 : if (!IS_ENABLED(CONFIG_FUTEX_PI))
3180 : return -ENOSYS;
3181 :
3182 0 : if (uaddr == uaddr2)
3183 : return -EINVAL;
3184 :
3185 0 : if (!bitset)
3186 : return -EINVAL;
3187 :
3188 0 : to = futex_setup_timer(abs_time, &timeout, flags,
3189 0 : current->timer_slack_ns);
3190 :
3191 : /*
3192 : * The waiter is allocated on our stack, manipulated by the requeue
3193 : * code while we sleep on uaddr.
3194 : */
3195 0 : rt_mutex_init_waiter(&rt_waiter);
3196 :
3197 0 : ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
3198 0 : if (unlikely(ret != 0))
3199 0 : goto out;
3200 :
3201 0 : q.bitset = bitset;
3202 0 : q.rt_waiter = &rt_waiter;
3203 0 : q.requeue_pi_key = &key2;
3204 :
3205 : /*
3206 : * Prepare to wait on uaddr. On success, increments q.key (key1) ref
3207 : * count.
3208 : */
3209 0 : ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
3210 0 : if (ret)
3211 0 : goto out;
3212 :
3213 : /*
3214 : * The check above which compares uaddrs is not sufficient for
3215 : * shared futexes. We need to compare the keys:
3216 : */
3217 0 : if (match_futex(&q.key, &key2)) {
3218 0 : queue_unlock(hb);
3219 0 : ret = -EINVAL;
3220 0 : goto out;
3221 : }
3222 :
3223 : /* Queue the futex_q, drop the hb lock, wait for wakeup. */
3224 0 : futex_wait_queue_me(hb, &q, to);
3225 :
3226 0 : spin_lock(&hb->lock);
3227 0 : ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
3228 0 : spin_unlock(&hb->lock);
3229 0 : if (ret)
3230 0 : goto out;
3231 :
3232 : /*
3233 : * In order for us to be here, we know our q.key == key2, and since
3234 : * we took the hb->lock above, we also know that futex_requeue() has
3235 : * completed and we no longer have to concern ourselves with a wakeup
3236 : * race with the atomic proxy lock acquisition by the requeue code. The
3237 : * futex_requeue dropped our key1 reference and incremented our key2
3238 : * reference count.
3239 : */
3240 :
3241 : /* Check if the requeue code acquired the second futex for us. */
3242 0 : if (!q.rt_waiter) {
3243 : /*
3244 : * Got the lock. We might not be the anticipated owner if we
3245 : * did a lock-steal - fix up the PI-state in that case.
3246 : */
3247 0 : if (q.pi_state && (q.pi_state->owner != current)) {
3248 0 : spin_lock(q.lock_ptr);
3249 0 : ret = fixup_pi_state_owner(uaddr2, &q, current);
3250 : /*
3251 : * Drop the reference to the pi state which
3252 : * the requeue_pi() code acquired for us.
3253 : */
3254 0 : put_pi_state(q.pi_state);
3255 0 : spin_unlock(q.lock_ptr);
3256 : /*
3257 : * Adjust the return value. It's either -EFAULT or
3258 : * success (1) but the caller expects 0 for success.
3259 : */
3260 0 : ret = ret < 0 ? ret : 0;
3261 : }
3262 : } else {
3263 0 : struct rt_mutex *pi_mutex;
3264 :
3265 : /*
3266 : * We have been woken up by futex_unlock_pi(), a timeout, or a
3267 : * signal. futex_unlock_pi() will not destroy the lock_ptr nor
3268 : * the pi_state.
3269 : */
3270 0 : WARN_ON(!q.pi_state);
3271 0 : pi_mutex = &q.pi_state->pi_mutex;
3272 0 : ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
3273 :
3274 0 : spin_lock(q.lock_ptr);
3275 0 : if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
3276 0 : ret = 0;
3277 :
3278 0 : debug_rt_mutex_free_waiter(&rt_waiter);
3279 : /*
3280 : * Fixup the pi_state owner and possibly acquire the lock if we
3281 : * haven't already.
3282 : */
3283 0 : res = fixup_owner(uaddr2, &q, !ret);
3284 : /*
3285 : * If fixup_owner() returned an error, propagate that. If it
3286 : * acquired the lock, clear -ETIMEDOUT or -EINTR.
3287 : */
3288 0 : if (res)
3289 0 : ret = (res < 0) ? res : 0;
3290 :
3291 : /* Unqueue and drop the lock. */
3292 0 : unqueue_me_pi(&q);
3293 : }
3294 :
3295 0 : if (ret == -EINTR) {
3296 : /*
3297 : * We've already been requeued, but cannot restart by calling
3298 : * futex_lock_pi() directly. We could restart this syscall, but
3299 : * it would detect that the user space "val" changed and return
3300 : * -EWOULDBLOCK. Save the overhead of the restart and return
3301 : * -EWOULDBLOCK directly.
3302 : */
3303 0 : ret = -EWOULDBLOCK;
3304 : }
3305 :
3306 0 : out:
3307 0 : if (to) {
3308 0 : hrtimer_cancel(&to->timer);
3309 0 : destroy_hrtimer_on_stack(&to->timer);
3310 : }
3311 : return ret;
3312 : }
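
/*
 * Userspace sketch (not part of this file): how the two requeue-PI
 * ops pair up in a pthread_cond_wait()-style implementation over a PI
 * mutex. The condvar layout and the pi_mutex_unlock() helper are
 * hypothetical; the ops and their pairing rule are the real ones.
 */
#if 0	/* userspace example, not kernel code */
#include <limits.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static void cond_wait_pi(uint32_t *cond_seq, uint32_t *pi_mutex)
{
	uint32_t seq = __atomic_load_n(cond_seq, __ATOMIC_RELAXED);

	pi_mutex_unlock(pi_mutex);
	/* Wait on cond_seq; on wakeup futex_requeue() has moved us to
	 * pi_mutex and possibly taken it on our behalf already. The
	 * last argument (val3/bitset) is ignored for this op. */
	syscall(SYS_futex, cond_seq, FUTEX_WAIT_REQUEUE_PI, seq,
		NULL /* timeout */, pi_mutex, 0);
}

static void cond_broadcast_pi(uint32_t *cond_seq, uint32_t *pi_mutex)
{
	uint32_t seq = __atomic_load_n(cond_seq, __ATOMIC_RELAXED);

	/* nr_wake must be 1 (see futex_requeue() above); the rest are
	 * requeued onto the PI mutex. */
	syscall(SYS_futex, cond_seq, FUTEX_CMP_REQUEUE_PI, 1,
		(unsigned long)INT_MAX, pi_mutex, seq);
}
#endif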
3313 :
3314 : /*
3315 : * Support for robust futexes: the kernel cleans up held futexes at
3316 : * thread exit time.
3317 : *
3318 : * Implementation: user-space maintains a per-thread list of locks it
3319 : * is holding. Upon do_exit(), the kernel carefully walks this list,
3320 : * and marks all locks that are owned by this thread with the
3321 : * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
3322 : * always manipulated with the lock held, so the list is private and
3323 : * per-thread. Userspace also maintains a per-thread 'list_op_pending'
3324 : * field, to allow the kernel to clean up if the thread dies after
3325 : * acquiring the lock, but just before it could have added itself to
3326 : * the list. There can only be one such pending lock.
3327 : */
3328 :
3329 : /**
3330 : * sys_set_robust_list() - Set the robust-futex list head of a task
3331 : * @head: pointer to the list-head
3332 : * @len: length of the list-head, as userspace expects
3333 : */
3334 1428 : SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
3335 : size_t, len)
3336 : {
3337 714 : if (!futex_cmpxchg_enabled)
3338 : return -ENOSYS;
3339 : /*
3340 : * The kernel knows only one size for now:
3341 : */
3342 714 : if (unlikely(len != sizeof(*head)))
3343 : return -EINVAL;
3344 :
3345 714 : current->robust_list = head;
3346 :
3347 714 : return 0;
3348 : }
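
/*
 * Userspace sketch (not part of this file): registering the robust
 * list. The C library normally does this once per thread at start-up;
 * the lock implementation then links each held lock in front of
 * head.list and points list_op_pending at a lock around the window
 * where it is held but not yet enqueued. The my_mutex layout is
 * hypothetical; robust_list_head and the syscall are the uapi ones.
 */
#if 0	/* userspace example, not kernel code */
#include <linux/futex.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

struct my_mutex {
	struct robust_list list;	/* per-thread chain of held locks */
	uint32_t futex;			/* TID | FUTEX_WAITERS | ... */
};

/* @head must be per-thread storage, e.g. a __thread variable. */
static long register_robust_list(struct robust_list_head *head)
{
	head->list.next = &head->list;		/* empty, circular */
	/* entry address + futex_offset == address of the futex word */
	head->futex_offset = offsetof(struct my_mutex, futex)
			     - offsetof(struct my_mutex, list);
	head->list_op_pending = NULL;
	return syscall(SYS_set_robust_list, head, sizeof(*head));
}
#endif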
3349 :
3350 : /**
3351 : * sys_get_robust_list() - Get the robust-futex list head of a task
3352 : * @pid: pid of the process [zero for current task]
3353 : * @head_ptr: pointer to a list-head pointer, the kernel fills it in
3354 : * @len_ptr: pointer to a length field, the kernel fills in the header size
3355 : */
3356 0 : SYSCALL_DEFINE3(get_robust_list, int, pid,
3357 : struct robust_list_head __user * __user *, head_ptr,
3358 : size_t __user *, len_ptr)
3359 : {
3360 0 : struct robust_list_head __user *head;
3361 0 : unsigned long ret;
3362 0 : struct task_struct *p;
3363 :
3364 0 : if (!futex_cmpxchg_enabled)
3365 : return -ENOSYS;
3366 :
3367 0 : rcu_read_lock();
3368 :
3369 0 : ret = -ESRCH;
3370 0 : if (!pid)
3371 0 : p = current;
3372 : else {
3373 0 : p = find_task_by_vpid(pid);
3374 0 : if (!p)
3375 0 : goto err_unlock;
3376 : }
3377 :
3378 0 : ret = -EPERM;
3379 0 : if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
3380 0 : goto err_unlock;
3381 :
3382 0 : head = p->robust_list;
3383 0 : rcu_read_unlock();
3384 :
3385 0 : if (put_user(sizeof(*head), len_ptr))
3386 : return -EFAULT;
3387 0 : return put_user(head, head_ptr);
3388 :
3389 0 : err_unlock:
3390 0 : rcu_read_unlock();
3391 :
3392 0 : return ret;
3393 : }
3394 :
3395 : /* Constants for the pending_op argument of handle_futex_death */
3396 : #define HANDLE_DEATH_PENDING true
3397 : #define HANDLE_DEATH_LIST false
3398 :
3399 : /*
3400 : * Process a futex-list entry, check whether it's owned by the
3401 : * dying task, and do notification if so:
3402 : */
3403 0 : static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
3404 : bool pi, bool pending_op)
3405 : {
3406 0 : u32 uval, nval, mval;
3407 0 : int err;
3408 :
3409 : /* Futex address must be 32bit aligned */
3410 0 : if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
3411 : return -1;
3412 :
3413 0 : retry:
3414 0 : if (get_user(uval, uaddr))
3415 : return -1;
3416 :
3417 : /*
3418 : * Special case for regular (non-PI) futexes. The unlock path in
3419 : * user space has two race scenarios:
3420 : *
3421 : * 1. The unlock path releases the user space futex value and
3422 : * before it can execute the futex() syscall to wake up
3423 : * waiters it is killed.
3424 : *
3425 : * 2. A woken up waiter is killed before it can acquire the
3426 : * futex in user space.
3427 : *
3428 : * In both cases the TID validation below would prevent a wakeup
3429 : * of potential waiters, which would leave those waiters blocked
3430 : * forever.
3431 : *
3432 : * In both cases the following conditions are met:
3433 : *
3434 : * 1) task->robust_list->list_op_pending != NULL
3435 : * @pending_op == true
3436 : * 2) User space futex value == 0
3437 : * 3) Regular futex: @pi == false
3438 : *
3439 : * If these conditions are met, it is safe to attempt waking up a
3440 : * potential waiter without touching the user space futex value and
3441 : * trying to set the OWNER_DIED bit. The user space futex value is
3442 : * uncontended and the rest of the user space mutex state is
3443 : * consistent, so a woken waiter will just take over the
3444 : * uncontended futex. Setting the OWNER_DIED bit would create
3445 : * inconsistent state and malfunction of the user space owner died
3446 : * handling.
3447 : */
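/*
 * Race 1 above as a timeline (illustrative; 'lock' and field names are
 * hypothetical user-space state):
 *
 *	Exiting owner				Waiter
 *	head->list_op_pending = &lock->list;
 *	lock->futex = 0;	// released
 *	<killed before sys_futex(FUTEX_WAKE)>
 *						<still blocked in futex_wait()>
 *	exit_robust_list()
 *	  handle_futex_death(..., pending_op=true)
 *	    uval == 0 && !pi
 *	    -> futex_wake() only; *uaddr stays 0
 *						woken, takes the uncontended futex
 */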
3448 0 : if (pending_op && !pi && !uval) {
3449 0 : futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
3450 0 : return 0;
3451 : }
3452 :
3453 0 : if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
3454 : return 0;
3455 :
3456 : /*
3457 : * Ok, this dying thread is truly holding a futex
3458 : * of interest. Set the OWNER_DIED bit atomically
3459 : * via cmpxchg, and if the value had FUTEX_WAITERS
3460 : * set, wake up a waiter (if any). (We have to do a
3461 : * futex_wake() even if OWNER_DIED is already set -
3462 : * to handle the rare but possible case of recursive
3463 : * thread-death.) The rest of the cleanup is done in
3464 : * userspace.
3465 : */
3466 0 : mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
3467 :
3468 : /*
3469 : * We are not holding a lock here, but we want to have
3470 : * the pagefault_disable/enable() protection because
3471 : * we want to handle the fault gracefully. If the
3472 : * access fails we try to fault in the futex with R/W
3473 : * verification via get_user_pages. get_user() above
3474 : * does not guarantee R/W access. If that fails we
3475 : * give up and leave the futex locked.
3476 : */
3477 0 : if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
3478 0 : switch (err) {
3479 0 : case -EFAULT:
3480 0 : if (fault_in_user_writeable(uaddr))
3481 : return -1;
3482 0 : goto retry;
3483 :
3484 0 : case -EAGAIN:
3485 0 : cond_resched();
3486 0 : goto retry;
3487 :
3488 : default:
3489 0 : WARN_ON_ONCE(1);
3490 0 : return err;
3491 : }
3492 : }
3493 :
3494 0 : if (nval != uval)
3495 0 : goto retry;
3496 :
3497 : /*
3498 : * Wake robust non-PI futexes here. The wakeup of
3499 : * PI futexes happens in exit_pi_state():
3500 : */
3501 0 : if (!pi && (uval & FUTEX_WAITERS))
3502 0 : futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
3503 :
3504 : return 0;
3505 : }
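/*
 * How user space consumes FUTEX_OWNER_DIED, sketched with the POSIX
 * robust-mutex API (glibc builds it on top of this mechanism):
 *
 *	pthread_mutexattr_t attr;
 *	pthread_mutex_t m;
 *
 *	pthread_mutexattr_init(&attr);
 *	pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST);
 *	pthread_mutex_init(&m, &attr);
 *
 *	if (pthread_mutex_lock(&m) == EOWNERDEAD) {
 *		// Previous owner died holding m: repair the protected
 *		// state, then mark the mutex usable again.
 *		pthread_mutex_consistent(&m);
 *	}
 *	pthread_mutex_unlock(&m);
 */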
3506 :
3507 : /*
3508 : * Fetch a robust-list pointer. Bit 0 signals PI futexes:
3509 : */
3510 1386 : static inline int fetch_robust_entry(struct robust_list __user **entry,
3511 : struct robust_list __user * __user *head,
3512 : unsigned int *pi)
3513 : {
3514 1386 : unsigned long uentry;
3515 :
3516 1386 : if (get_user(uentry, (unsigned long __user *)head))
3517 : return -EFAULT;
3518 :
3519 1386 : *entry = (void __user *)(uentry & ~1UL);
3520 1386 : *pi = uentry & 1;
3521 :
3522 1386 : return 0;
3523 : }
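/*
 * User space creates these tagged pointers when linking a PI lock into
 * the list; a hypothetical enqueue would publish something like:
 *
 *	head->list_op_pending =
 *		(struct robust_list *)((unsigned long)&lock->list | 1UL); // bit 0 = PI
 */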
3524 :
3525 : /*
3526 : * Walk curr->robust_list (very carefully, it's a userspace list!)
3527 : * and mark any locks found there dead, and notify any waiters.
3528 : *
3529 : * We silently return on any sign of a list-walking problem.
3530 : */
3531 693 : static void exit_robust_list(struct task_struct *curr)
3532 : {
3533 693 : struct robust_list_head __user *head = curr->robust_list;
3534 693 : struct robust_list __user *entry, *next_entry, *pending;
3535 693 : unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
3536 693 : unsigned int next_pi;
3537 693 : unsigned long futex_offset;
3538 693 : int rc;
3539 :
3540 693 : if (!futex_cmpxchg_enabled)
3541 0 : return;
3542 :
3543 : /*
3544 : * Fetch the list head (which was registered earlier, via
3545 : * sys_set_robust_list()):
3546 : */
3547 693 : if (fetch_robust_entry(&entry, &head->list.next, &pi))
3548 : return;
3549 : /*
3550 : * Fetch the relative futex offset:
3551 : */
3552 693 : if (get_user(futex_offset, &head->futex_offset))
3553 : return;
3554 : /*
3555 : * Fetch any possibly pending lock-add first, and handle it
3556 : * if it exists:
3557 : */
3558 693 : if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
3559 : return;
3560 :
3561 693 : next_entry = NULL; /* avoid warning with gcc */
3562 693 : while (entry != &head->list) {
3563 : /*
3564 : * Fetch the next entry in the list before calling
3565 : * handle_futex_death:
3566 : */
3567 0 : rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
3568 : /*
3569 : * A pending lock might already be on the list, so
3570 : * don't process it twice:
3571 : */
3572 0 : if (entry != pending) {
3573 0 : if (handle_futex_death((void __user *)entry + futex_offset,
3574 : curr, pi, HANDLE_DEATH_LIST))
3575 : return;
3576 : }
3577 0 : if (rc)
3578 : return;
3579 0 : entry = next_entry;
3580 0 : pi = next_pi;
3581 : /*
3582 : * Avoid excessively long or circular lists:
3583 : */
3584 0 : if (!--limit)
3585 : break;
3586 :
3587 0 : cond_resched();
3588 : }
3589 :
3590 693 : if (pending) {
3591 0 : handle_futex_death((void __user *)pending + futex_offset,
3592 : curr, pip, HANDLE_DEATH_PENDING);
3593 : }
3594 : }
3595 :
3596 1865 : static void futex_cleanup(struct task_struct *tsk)
3597 : {
3598 1865 : if (unlikely(tsk->robust_list)) {
3599 693 : exit_robust_list(tsk);
3600 693 : tsk->robust_list = NULL;
3601 : }
3602 :
3603 : #ifdef CONFIG_COMPAT
3604 1865 : if (unlikely(tsk->compat_robust_list)) {
3605 0 : compat_exit_robust_list(tsk);
3606 0 : tsk->compat_robust_list = NULL;
3607 : }
3608 : #endif
3609 :
3610 1865 : if (unlikely(!list_empty(&tsk->pi_state_list)))
3611 0 : exit_pi_state_list(tsk);
3612 1865 : }
3613 :
3614 : /**
3615 : * futex_exit_recursive - Set the task's futex state to FUTEX_STATE_DEAD
3616 : * @tsk: task to set the state on
3617 : *
3618 : * Set the futex exit state of the task locklessly. The futex waiter code
3619 : * observes that state when a task is exiting and loops until the task has
3620 : * actually finished the futex cleanup. The worst case for this is that the
3621 : * waiter runs through the wait loop until the state becomes visible.
3622 : *
3623 : * This is called from the recursive fault handling path in do_exit().
3624 : *
3625 : * This is best effort. Either the futex exit code has run already or
3626 : * not. If the OWNER_DIED bit has been set on the futex then the waiter can
3627 : * take it over. If not, the problem is pushed back to user space. If the
3628 : * futex exit code did not run yet, then an already queued waiter might
3629 : * block forever, but there is nothing which can be done about that.
3630 : */
3631 0 : void futex_exit_recursive(struct task_struct *tsk)
3632 : {
3633 : /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
3634 0 : if (tsk->futex_state == FUTEX_STATE_EXITING)
3635 0 : mutex_unlock(&tsk->futex_exit_mutex);
3636 0 : tsk->futex_state = FUTEX_STATE_DEAD;
3637 0 : }
3638 :
3639 1865 : static void futex_cleanup_begin(struct task_struct *tsk)
3640 : {
3641 : /*
3642 : * Prevent various race issues, including live locks, against a
3643 : * concurrent incoming waiter by forcing the waiter to block on
3644 : * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
3645 : * attach_to_pi_owner().
3646 : */
3647 1865 : mutex_lock(&tsk->futex_exit_mutex);
3648 :
3649 : /*
3650 : * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
3651 : *
3652 : * This ensures that all subsequent checks of tsk->futex_state in
3653 : * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
3654 : * tsk->pi_lock held.
3655 : *
3656 : * It also guarantees that a pi_state which was queued right before
3657 : * the state change under tsk->pi_lock by a concurrent waiter must
3658 : * be observed in exit_pi_state_list().
3659 : */
3660 1865 : raw_spin_lock_irq(&tsk->pi_lock);
3661 1865 : tsk->futex_state = FUTEX_STATE_EXITING;
3662 1865 : raw_spin_unlock_irq(&tsk->pi_lock);
3663 1865 : }
3664 :
3665 1865 : static void futex_cleanup_end(struct task_struct *tsk, int state)
3666 : {
3667 : /*
3668 : * Lockless store. The only side effect is that an observer might
3669 : * take another loop until it becomes visible.
3670 : */
3671 1865 : tsk->futex_state = state;
3672 : /*
3673 : * Drop the exit protection. This unblocks waiters which observed
3674 : * FUTEX_STATE_EXITING so that they can reevaluate the state.
3675 : */
3676 1865 : mutex_unlock(&tsk->futex_exit_mutex);
3677 : }
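/*
 * The exit state machine, sketched from the helpers above and their
 * callers below (states live in tsk->futex_state):
 *
 *	FUTEX_STATE_OK
 *	     | futex_cleanup_begin()	(takes futex_exit_mutex, state
 *	     v				 change under tsk->pi_lock)
 *	FUTEX_STATE_EXITING		-- waiters in attach_to_pi_owner()
 *	     |				   block on futex_exit_mutex
 *	     | futex_cleanup_end(state)
 *	     v
 *	FUTEX_STATE_DEAD  on exit  (futex_exit_release)
 *	FUTEX_STATE_OK	  on exec  (futex_exec_release)
 */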
3678 :
3679 790 : void futex_exec_release(struct task_struct *tsk)
3680 : {
3681 : /*
3682 : * The state handling is done for consistency, but in the case of
3683 : * exec() there is no way to prevent further damage as the PID stays
3684 : * the same. But for the unlikely and arguably buggy case that a
3685 : * futex is held on exec(), this provides at least as much state
3686 : * consistency protection as is possible.
3687 : */
3688 790 : futex_cleanup_begin(tsk);
3689 790 : futex_cleanup(tsk);
3690 : /*
3691 : * Reset the state to FUTEX_STATE_OK. The task is alive and about
3692 : * to exec a new binary.
3693 : */
3694 790 : futex_cleanup_end(tsk, FUTEX_STATE_OK);
3695 790 : }
3696 :
3697 1075 : void futex_exit_release(struct task_struct *tsk)
3698 : {
3699 1075 : futex_cleanup_begin(tsk);
3700 1075 : futex_cleanup(tsk);
3701 1075 : futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
3702 1075 : }
3703 :
3704 762 : long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
3705 : u32 __user *uaddr2, u32 val2, u32 val3)
3706 : {
3707 762 : int cmd = op & FUTEX_CMD_MASK;
3708 762 : unsigned int flags = 0;
3709 :
3710 762 : if (!(op & FUTEX_PRIVATE_FLAG))
3711 2 : flags |= FLAGS_SHARED;
3712 :
3713 762 : if (op & FUTEX_CLOCK_REALTIME) {
3714 0 : flags |= FLAGS_CLOCKRT;
3715 0 : if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET &&
3716 : cmd != FUTEX_WAIT_REQUEUE_PI)
3717 : return -ENOSYS;
3718 : }
3719 :
3720 762 : switch (cmd) {
3721 0 : case FUTEX_LOCK_PI:
3722 : case FUTEX_UNLOCK_PI:
3723 : case FUTEX_TRYLOCK_PI:
3724 : case FUTEX_WAIT_REQUEUE_PI:
3725 : case FUTEX_CMP_REQUEUE_PI:
3726 0 : if (!futex_cmpxchg_enabled)
3727 : return -ENOSYS;
3728 : }
3729 :
3730 762 : switch (cmd) {
3731 168 : case FUTEX_WAIT:
3732 168 : val3 = FUTEX_BITSET_MATCH_ANY;
3733 168 : fallthrough;
3734 168 : case FUTEX_WAIT_BITSET:
3735 168 : return futex_wait(uaddr, flags, val, timeout, val3);
3736 594 : case FUTEX_WAKE:
3737 594 : val3 = FUTEX_BITSET_MATCH_ANY;
3738 594 : fallthrough;
3739 594 : case FUTEX_WAKE_BITSET:
3740 594 : return futex_wake(uaddr, flags, val, val3);
3741 0 : case FUTEX_REQUEUE:
3742 0 : return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
3743 0 : case FUTEX_CMP_REQUEUE:
3744 0 : return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
3745 0 : case FUTEX_WAKE_OP:
3746 0 : return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
3747 0 : case FUTEX_LOCK_PI:
3748 0 : return futex_lock_pi(uaddr, flags, timeout, 0);
3749 0 : case FUTEX_UNLOCK_PI:
3750 0 : return futex_unlock_pi(uaddr, flags);
3751 0 : case FUTEX_TRYLOCK_PI:
3752 0 : return futex_lock_pi(uaddr, flags, NULL, 1);
3753 0 : case FUTEX_WAIT_REQUEUE_PI:
3754 0 : val3 = FUTEX_BITSET_MATCH_ANY;
3755 0 : return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
3756 : uaddr2);
3757 0 : case FUTEX_CMP_REQUEUE_PI:
3758 0 : return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
3759 : }
3760 : return -ENOSYS;
3761 : }
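/*
 * The canonical pairing dispatched above, as a minimal raw-syscall
 * sketch from user space (error handling elided; see futex(2)):
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	// Waiter: sleep only while *uaddr still contains val
 *	syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE, val, NULL, NULL, 0);
 *
 *	// Waker: store the new value first, then wake one waiter
 *	syscall(SYS_futex, uaddr, FUTEX_WAKE_PRIVATE, 1, NULL, NULL, 0);
 */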
3762 :
3763 :
3764 1520 : SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
3765 : const struct __kernel_timespec __user *, utime,
3766 : u32 __user *, uaddr2, u32, val3)
3767 : {
3768 760 : struct timespec64 ts;
3769 760 : ktime_t t, *tp = NULL;
3770 760 : u32 val2 = 0;
3771 760 : int cmd = op & FUTEX_CMD_MASK;
3772 :
3773 760 : if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
3774 4 : cmd == FUTEX_WAIT_BITSET ||
3775 4 : cmd == FUTEX_WAIT_REQUEUE_PI)) {
3776 0 : if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
3777 : return -EFAULT;
3778 0 : if (get_timespec64(&ts, utime))
3779 : return -EFAULT;
3780 0 : if (!timespec64_valid(&ts))
3781 : return -EINVAL;
3782 :
3783 0 : t = timespec64_to_ktime(ts);
3784 0 : if (cmd == FUTEX_WAIT)
3785 0 : t = ktime_add_safe(ktime_get(), t);
3786 : else if (!(op & FUTEX_CLOCK_REALTIME))
3787 760 : t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
3788 : tp = &t;
3789 : }
3790 : /*
3791 : * 'utime' carries the requeue parameter if cmd == FUTEX_*_REQUEUE_*,
3792 : * or the number of waiters to wake if cmd == FUTEX_WAKE_OP.
3793 : */
3794 760 : if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
3795 760 : cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
3796 0 : val2 = (u32) (unsigned long) utime;
3797 :
3798 760 : return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
3799 : }
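/*
 * Note the conversion above: FUTEX_WAIT takes a relative timeout and is
 * turned into an absolute expiry here, while FUTEX_WAIT_BITSET and
 * FUTEX_WAIT_REQUEUE_PI pass 'utime' through as an absolute time. A
 * 1-second relative wait from user space (sketch):
 *
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *	syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE, val, &ts, NULL, 0);
 */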
3800 :
3801 : #ifdef CONFIG_COMPAT
3802 : /*
3803 : * Fetch a robust-list pointer. Bit 0 signals PI futexes:
3804 : */
3805 : static inline int
3806 0 : compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
3807 : compat_uptr_t __user *head, unsigned int *pi)
3808 : {
3809 0 : if (get_user(*uentry, head))
3810 : return -EFAULT;
3811 :
3812 0 : *entry = compat_ptr((*uentry) & ~1);
3813 0 : *pi = (unsigned int)(*uentry) & 1;
3814 :
3815 0 : return 0;
3816 : }
3817 :
3818 0 : static void __user *futex_uaddr(struct robust_list __user *entry,
3819 : compat_long_t futex_offset)
3820 : {
3821 0 : compat_uptr_t base = ptr_to_compat(entry);
3822 0 : void __user *uaddr = compat_ptr(base + futex_offset);
3823 :
3824 0 : return uaddr;
3825 : }
3826 :
3827 : /*
3828 : * Walk curr->robust_list (very carefully, it's a userspace list!)
3829 : * and mark any locks found there dead, and notify any waiters.
3830 : *
3831 : * We silently return on any sign of a list-walking problem.
3832 : */
3833 0 : static void compat_exit_robust_list(struct task_struct *curr)
3834 : {
3835 0 : struct compat_robust_list_head __user *head = curr->compat_robust_list;
3836 0 : struct robust_list __user *entry, *next_entry, *pending;
3837 0 : unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
3838 0 : unsigned int next_pi;
3839 0 : compat_uptr_t uentry, next_uentry, upending;
3840 0 : compat_long_t futex_offset;
3841 0 : int rc;
3842 :
3843 0 : if (!futex_cmpxchg_enabled)
3844 0 : return;
3845 :
3846 : /*
3847 : * Fetch the list head (which was registered earlier, via
3848 : * sys_set_robust_list()):
3849 : */
3850 0 : if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
3851 : return;
3852 : /*
3853 : * Fetch the relative futex offset:
3854 : */
3855 0 : if (get_user(futex_offset, &head->futex_offset))
3856 : return;
3857 : /*
3858 : * Fetch any possibly pending lock-add first, and handle it
3859 : * if it exists:
3860 : */
3861 0 : if (compat_fetch_robust_entry(&upending, &pending,
3862 : &head->list_op_pending, &pip))
3863 : return;
3864 :
3865 0 : next_entry = NULL; /* avoid warning with gcc */
3866 0 : while (entry != (struct robust_list __user *) &head->list) {
3867 : /*
3868 : * Fetch the next entry in the list before calling
3869 : * handle_futex_death:
3870 : */
3871 0 : rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
3872 0 : (compat_uptr_t __user *)&entry->next, &next_pi);
3873 : /*
3874 : * A pending lock might already be on the list, so
3875 : * don't process it twice:
3876 : */
3877 0 : if (entry != pending) {
3878 0 : void __user *uaddr = futex_uaddr(entry, futex_offset);
3879 :
3880 0 : if (handle_futex_death(uaddr, curr, pi,
3881 : HANDLE_DEATH_LIST))
3882 : return;
3883 : }
3884 0 : if (rc)
3885 : return;
3886 0 : uentry = next_uentry;
3887 0 : entry = next_entry;
3888 0 : pi = next_pi;
3889 : /*
3890 : * Avoid excessively long or circular lists:
3891 : */
3892 0 : if (!--limit)
3893 : break;
3894 :
3895 0 : cond_resched();
3896 : }
3897 0 : if (pending) {
3898 0 : void __user *uaddr = futex_uaddr(pending, futex_offset);
3899 :
3900 0 : handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
3901 : }
3902 : }
3903 :
3904 0 : COMPAT_SYSCALL_DEFINE2(set_robust_list,
3905 : struct compat_robust_list_head __user *, head,
3906 : compat_size_t, len)
3907 : {
3908 0 : if (!futex_cmpxchg_enabled)
3909 : return -ENOSYS;
3910 :
3911 0 : if (unlikely(len != sizeof(*head)))
3912 : return -EINVAL;
3913 :
3914 0 : current->compat_robust_list = head;
3915 :
3916 0 : return 0;
3917 : }
3918 :
3919 0 : COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
3920 : compat_uptr_t __user *, head_ptr,
3921 : compat_size_t __user *, len_ptr)
3922 : {
3923 0 : struct compat_robust_list_head __user *head;
3924 0 : unsigned long ret;
3925 0 : struct task_struct *p;
3926 :
3927 0 : if (!futex_cmpxchg_enabled)
3928 : return -ENOSYS;
3929 :
3930 0 : rcu_read_lock();
3931 :
3932 0 : ret = -ESRCH;
3933 0 : if (!pid)
3934 0 : p = current;
3935 : else {
3936 0 : p = find_task_by_vpid(pid);
3937 0 : if (!p)
3938 0 : goto err_unlock;
3939 : }
3940 :
3941 0 : ret = -EPERM;
3942 0 : if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
3943 0 : goto err_unlock;
3944 :
3945 0 : head = p->compat_robust_list;
3946 0 : rcu_read_unlock();
3947 :
3948 0 : if (put_user(sizeof(*head), len_ptr))
3949 : return -EFAULT;
3950 0 : return put_user(ptr_to_compat(head), head_ptr);
3951 :
3952 0 : err_unlock:
3953 0 : rcu_read_unlock();
3954 :
3955 0 : return ret;
3956 : }
3957 : #endif /* CONFIG_COMPAT */
3958 :
3959 : #ifdef CONFIG_COMPAT_32BIT_TIME
3960 : SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
3961 : const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
3962 : u32, val3)
3963 : {
3964 : struct timespec64 ts;
3965 : ktime_t t, *tp = NULL;
3966 : int val2 = 0;
3967 : int cmd = op & FUTEX_CMD_MASK;
3968 :
3969 : if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
3970 : cmd == FUTEX_WAIT_BITSET ||
3971 : cmd == FUTEX_WAIT_REQUEUE_PI)) {
3972 : if (get_old_timespec32(&ts, utime))
3973 : return -EFAULT;
3974 : if (!timespec64_valid(&ts))
3975 : return -EINVAL;
3976 :
3977 : t = timespec64_to_ktime(ts);
3978 : if (cmd == FUTEX_WAIT)
3979 : t = ktime_add_safe(ktime_get(), t);
3980 : else if (!(op & FUTEX_CLOCK_REALTIME))
3981 : t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
3982 : tp = &t;
3983 : }
3984 : if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
3985 : cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
3986 : val2 = (int) (unsigned long) utime;
3987 :
3988 : return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
3989 : }
3990 : #endif /* CONFIG_COMPAT_32BIT_TIME */
3991 :
3992 1 : static void __init futex_detect_cmpxchg(void)
3993 : {
3994 : #ifndef CONFIG_HAVE_FUTEX_CMPXCHG
3995 1 : u32 curval;
3996 :
3997 : /*
3998 : * This will fail and we want it to. Some arch implementations do
3999 : * runtime detection of the futex_atomic_cmpxchg_inatomic()
4000 : * functionality. We want to know that before we call in any
4001 : * of the complex code paths. Also we want to prevent
4002 : * registration of robust lists in that case. NULL is
4003 : * guaranteed to fault and we get -EFAULT on a functional
4004 : * implementation; the non-functional ones will return
4005 : * -ENOSYS.
4006 : */
4007 1 : if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
4008 1 : futex_cmpxchg_enabled = 1;
4009 : #endif
4010 1 : }
4011 :
4012 1 : static int __init futex_init(void)
4013 : {
4014 1 : unsigned int futex_shift;
4015 1 : unsigned long i;
4016 :
4017 : #if CONFIG_BASE_SMALL
4018 : futex_hashsize = 16;
4019 : #else
4020 1 : futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
4021 : #endif
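/*
 * Example sizing: with 4 possible CPUs this yields
 * roundup_pow_of_two(256 * 4) = 1024 hash buckets, which matches the
 * 1024 bucket initializations recorded in the loop below.
 */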
4022 :
4023 2 : futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
4024 : futex_hashsize, 0,
4025 : futex_hashsize < 256 ? HASH_SMALL : 0,
4026 : &futex_shift, NULL,
4027 : futex_hashsize, futex_hashsize);
4028 1 : futex_hashsize = 1UL << futex_shift;
4029 :
4030 1 : futex_detect_cmpxchg();
4031 :
4032 1026 : for (i = 0; i < futex_hashsize; i++) {
4033 1024 : atomic_set(&futex_queues[i].waiters, 0);
4034 1024 : plist_head_init(&futex_queues[i].chain);
4035 1024 : spin_lock_init(&futex_queues[i].lock);
4036 : }
4037 :
4038 1 : return 0;
4039 : }
4040 : core_initcall(futex_init);