LCOV - code coverage report
Current view: top level - fs - eventpoll.c (source / functions)
Test: landlock.info
Date: 2021-04-22 12:43:58
                  Hit      Total    Coverage
Lines:            639        817      78.2 %
Functions:         46         68      67.6 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-or-later
       2             : /*
       3             :  *  fs/eventpoll.c (Efficient event retrieval implementation)
       4             :  *  Copyright (C) 2001,...,2009  Davide Libenzi
       5             :  *
       6             :  *  Davide Libenzi <davidel@xmailserver.org>
       7             :  */
       8             : 
       9             : #include <linux/init.h>
      10             : #include <linux/kernel.h>
      11             : #include <linux/sched/signal.h>
      12             : #include <linux/fs.h>
      13             : #include <linux/file.h>
      14             : #include <linux/signal.h>
      15             : #include <linux/errno.h>
      16             : #include <linux/mm.h>
      17             : #include <linux/slab.h>
      18             : #include <linux/poll.h>
      19             : #include <linux/string.h>
      20             : #include <linux/list.h>
      21             : #include <linux/hash.h>
      22             : #include <linux/spinlock.h>
      23             : #include <linux/syscalls.h>
      24             : #include <linux/rbtree.h>
      25             : #include <linux/wait.h>
      26             : #include <linux/eventpoll.h>
      27             : #include <linux/mount.h>
      28             : #include <linux/bitops.h>
      29             : #include <linux/mutex.h>
      30             : #include <linux/anon_inodes.h>
      31             : #include <linux/device.h>
      32             : #include <linux/uaccess.h>
      33             : #include <asm/io.h>
      34             : #include <asm/mman.h>
      35             : #include <linux/atomic.h>
      36             : #include <linux/proc_fs.h>
      37             : #include <linux/seq_file.h>
      38             : #include <linux/compat.h>
      39             : #include <linux/rculist.h>
      40             : #include <net/busy_poll.h>
      41             : 
      42             : /*
      43             :  * LOCKING:
       44             :  * There are three levels of locking required by epoll:
      45             :  *
      46             :  * 1) epmutex (mutex)
      47             :  * 2) ep->mtx (mutex)
      48             :  * 3) ep->lock (rwlock)
      49             :  *
      50             :  * The acquire order is the one listed above, from 1 to 3.
      51             :  * We need a rwlock (ep->lock) because we manipulate objects
       52             :  * from inside the poll callback, which might be triggered from
      53             :  * a wake_up() that in turn might be called from IRQ context.
      54             :  * So we can't sleep inside the poll callback and hence we need
      55             :  * a spinlock. During the event transfer loop (from kernel to
       56             :  * user space) we could end up sleeping due to a copy_to_user(), so
      57             :  * we need a lock that will allow us to sleep. This lock is a
      58             :  * mutex (ep->mtx). It is acquired during the event transfer loop,
      59             :  * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
      60             :  * Then we also need a global mutex to serialize eventpoll_release_file()
      61             :  * and ep_free().
      62             :  * This mutex is acquired by ep_free() during the epoll file
      63             :  * cleanup path and it is also acquired by eventpoll_release_file()
      64             :  * if a file has been pushed inside an epoll set and it is then
      65             :  * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
      66             :  * It is also acquired when inserting an epoll fd onto another epoll
      67             :  * fd. We do this so that we walk the epoll tree and ensure that this
      68             :  * insertion does not create a cycle of epoll file descriptors, which
      69             :  * could lead to deadlock. We need a global mutex to prevent two
      70             :  * simultaneous inserts (A into B and B into A) from racing and
      71             :  * constructing a cycle without either insert observing that it is
      72             :  * going to.
      73             :  * It is necessary to acquire multiple "ep->mtx"es at once in the
      74             :  * case when one epoll fd is added to another. In this case, we
      75             :  * always acquire the locks in the order of nesting (i.e. after
      76             :  * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
      77             :  * before e2->mtx). Since we disallow cycles of epoll file
      78             :  * descriptors, this ensures that the mutexes are well-ordered. In
      79             :  * order to communicate this nesting to lockdep, when walking a tree
      80             :  * of epoll file descriptors, we use the current recursion depth as
      81             :  * the lockdep subkey.
      82             :  * It is possible to drop the "ep->mtx" and to use the global
      83             :  * mutex "epmutex" (together with "ep->lock") to have it working,
      84             :  * but having "ep->mtx" will make the interface more scalable.
      85             :  * Events that require holding "epmutex" are very rare, while for
      86             :  * normal operations the epoll private "ep->mtx" will guarantee
      87             :  * a better scalability.
      88             :  */
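
           Example (illustrative, not part of the instrumented source): the cycle
           check described above is visible from user space. Nesting one epoll fd
           inside another is allowed, but an insert that would close a loop fails;
           epoll_ctl(2) documents ELOOP for this case. A minimal, hedged sketch:

               #include <sys/epoll.h>
               #include <errno.h>
               #include <stdio.h>
               #include <unistd.h>

               int main(void)
               {
                       int e1 = epoll_create1(0);
                       int e2 = epoll_create1(0);
                       struct epoll_event ev = { .events = EPOLLIN };

                       ev.data.fd = e2;
                       if (epoll_ctl(e1, EPOLL_CTL_ADD, e2, &ev))  /* nest e2 in e1: OK */
                               perror("add e2 to e1");

                       ev.data.fd = e1;
                       if (epoll_ctl(e2, EPOLL_CTL_ADD, e1, &ev))  /* would close a cycle */
                               perror("add e1 to e2");             /* expected: ELOOP */

                       close(e1);
                       close(e2);
                       return 0;
               }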
      89             : 
      90             : /* Epoll private bits inside the event mask */
      91             : #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
      92             : 
      93             : #define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)
      94             : 
      95             : #define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
      96             :                                 EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
      97             : 
       98             : /* Maximum number of nesting levels allowed inside epoll sets */
      99             : #define EP_MAX_NESTS 4
     100             : 
     101             : #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
     102             : 
     103             : #define EP_UNACTIVE_PTR ((void *) -1L)
     104             : 
     105             : #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
     106             : 
     107             : struct epoll_filefd {
     108             :         struct file *file;
     109             :         int fd;
     110             : } __packed;
     111             : 
     112             : /* Wait structure used by the poll hooks */
     113             : struct eppoll_entry {
     114             :         /* List header used to link this structure to the "struct epitem" */
     115             :         struct eppoll_entry *next;
     116             : 
     117             :         /* The "base" pointer is set to the container "struct epitem" */
     118             :         struct epitem *base;
     119             : 
     120             :         /*
     121             :          * Wait queue item that will be linked to the target file wait
     122             :          * queue head.
     123             :          */
     124             :         wait_queue_entry_t wait;
     125             : 
      126             :         /* The wait queue head to which the "wait" wait queue item is linked */
     127             :         wait_queue_head_t *whead;
     128             : };
     129             : 
     130             : /*
     131             :  * Each file descriptor added to the eventpoll interface will
     132             :  * have an entry of this type linked to the "rbr" RB tree.
      133             :  * Avoid increasing the size of this struct; there can be many thousands
     134             :  * of these on a server and we do not want this to take another cache line.
     135             :  */
     136             : struct epitem {
     137             :         union {
     138             :                 /* RB tree node links this structure to the eventpoll RB tree */
     139             :                 struct rb_node rbn;
     140             :                 /* Used to free the struct epitem */
     141             :                 struct rcu_head rcu;
     142             :         };
     143             : 
     144             :         /* List header used to link this structure to the eventpoll ready list */
     145             :         struct list_head rdllink;
     146             : 
     147             :         /*
      148             :          * Works together with "struct eventpoll"->ovflist in keeping the
      149             :          * singly linked chain of items.
     150             :          */
     151             :         struct epitem *next;
     152             : 
     153             :         /* The file descriptor information this item refers to */
     154             :         struct epoll_filefd ffd;
     155             : 
     156             :         /* List containing poll wait queues */
     157             :         struct eppoll_entry *pwqlist;
     158             : 
     159             :         /* The "container" of this item */
     160             :         struct eventpoll *ep;
     161             : 
     162             :         /* List header used to link this item to the "struct file" items list */
     163             :         struct hlist_node fllink;
     164             : 
     165             :         /* wakeup_source used when EPOLLWAKEUP is set */
     166             :         struct wakeup_source __rcu *ws;
     167             : 
      168             :         /* The structure that describes the interested events and the source fd */
     169             :         struct epoll_event event;
     170             : };
     171             : 
     172             : /*
     173             :  * This structure is stored inside the "private_data" member of the file
     174             :  * structure and represents the main data structure for the eventpoll
     175             :  * interface.
     176             :  */
     177             : struct eventpoll {
     178             :         /*
     179             :          * This mutex is used to ensure that files are not removed
     180             :          * while epoll is using them. This is held during the event
     181             :          * collection loop, the file cleanup path, the epoll file exit
     182             :          * code and the ctl operations.
     183             :          */
     184             :         struct mutex mtx;
     185             : 
     186             :         /* Wait queue used by sys_epoll_wait() */
     187             :         wait_queue_head_t wq;
     188             : 
     189             :         /* Wait queue used by file->poll() */
     190             :         wait_queue_head_t poll_wait;
     191             : 
     192             :         /* List of ready file descriptors */
     193             :         struct list_head rdllist;
     194             : 
     195             :         /* Lock which protects rdllist and ovflist */
     196             :         rwlock_t lock;
     197             : 
     198             :         /* RB tree root used to store monitored fd structs */
     199             :         struct rb_root_cached rbr;
     200             : 
     201             :         /*
      202             :          * This is a singly linked list that chains all the "struct epitem" that
      203             :          * had events queued while ready events were being transferred to
      204             :          * userspace without holding ->lock.
     205             :          */
     206             :         struct epitem *ovflist;
     207             : 
     208             :         /* wakeup_source used when ep_scan_ready_list is running */
     209             :         struct wakeup_source *ws;
     210             : 
     211             :         /* The user that created the eventpoll descriptor */
     212             :         struct user_struct *user;
     213             : 
     214             :         struct file *file;
     215             : 
     216             :         /* used to optimize loop detection check */
     217             :         u64 gen;
     218             :         struct hlist_head refs;
     219             : 
     220             : #ifdef CONFIG_NET_RX_BUSY_POLL
     221             :         /* used to track busy poll napi_id */
     222             :         unsigned int napi_id;
     223             : #endif
     224             : 
     225             : #ifdef CONFIG_DEBUG_LOCK_ALLOC
     226             :         /* tracks wakeup nests for lockdep validation */
     227             :         u8 nests;
     228             : #endif
     229             : };
     230             : 
     231             : /* Wrapper struct used by poll queueing */
     232             : struct ep_pqueue {
     233             :         poll_table pt;
     234             :         struct epitem *epi;
     235             : };
     236             : 
     237             : /*
     238             :  * Configuration options available inside /proc/sys/fs/epoll/
     239             :  */
     240             : /* Maximum number of epoll watched descriptors, per user */
     241             : static long max_user_watches __read_mostly;
     242             : 
     243             : /*
     244             :  * This mutex is used to serialize ep_free() and eventpoll_release_file().
     245             :  */
     246             : static DEFINE_MUTEX(epmutex);
     247             : 
     248             : static u64 loop_check_gen = 0;
     249             : 
     250             : /* Used to check for epoll file descriptor inclusion loops */
     251             : static struct eventpoll *inserting_into;
     252             : 
     253             : /* Slab cache used to allocate "struct epitem" */
     254             : static struct kmem_cache *epi_cache __read_mostly;
     255             : 
     256             : /* Slab cache used to allocate "struct eppoll_entry" */
     257             : static struct kmem_cache *pwq_cache __read_mostly;
     258             : 
     259             : /*
     260             :  * List of files with newly added links, where we may need to limit the number
     261             :  * of emanating paths. Protected by the epmutex.
     262             :  */
     263             : struct epitems_head {
     264             :         struct hlist_head epitems;
     265             :         struct epitems_head *next;
     266             : };
     267             : static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
     268             : 
     269             : static struct kmem_cache *ephead_cache __read_mostly;
     270             : 
     271         527 : static inline void free_ephead(struct epitems_head *head)
     272             : {
     273         527 :         if (head)
     274         183 :                 kmem_cache_free(ephead_cache, head);
     275         527 : }
     276             : 
     277           5 : static void list_file(struct file *file)
     278             : {
     279           5 :         struct epitems_head *head;
     280             : 
     281           5 :         head = container_of(file->f_ep, struct epitems_head, epitems);
     282           5 :         if (!head->next) {
     283           5 :                 head->next = tfile_check_list;
     284           5 :                 tfile_check_list = head;
     285             :         }
     286             : }
     287             : 
     288           5 : static void unlist_file(struct epitems_head *head)
     289             : {
     290           5 :         struct epitems_head *to_free = head;
     291           5 :         struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));
     292           5 :         if (p) {
     293           5 :                 struct epitem *epi= container_of(p, struct epitem, fllink);
     294           5 :                 spin_lock(&epi->ffd.file->f_lock);
     295           5 :                 if (!hlist_empty(&head->epitems))
     296           5 :                         to_free = NULL;
     297           5 :                 head->next = NULL;
     298           5 :                 spin_unlock(&epi->ffd.file->f_lock);
     299             :         }
     300           5 :         free_ephead(to_free);
     301           5 : }
     302             : 
     303             : #ifdef CONFIG_SYSCTL
     304             : 
     305             : #include <linux/sysctl.h>
     306             : 
     307             : static long long_zero;
     308             : static long long_max = LONG_MAX;
     309             : 
     310             : struct ctl_table epoll_table[] = {
     311             :         {
     312             :                 .procname       = "max_user_watches",
     313             :                 .data           = &max_user_watches,
     314             :                 .maxlen         = sizeof(max_user_watches),
     315             :                 .mode           = 0644,
     316             :                 .proc_handler   = proc_doulongvec_minmax,
     317             :                 .extra1         = &long_zero,
     318             :                 .extra2         = &long_max,
     319             :         },
     320             :         { }
     321             : };
     322             : #endif /* CONFIG_SYSCTL */
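
           Example (illustrative, not part of the instrumented source): with
           CONFIG_SYSCTL enabled, the limit registered above is exposed as
           /proc/sys/fs/epoll/max_user_watches. A minimal userspace sketch,
           assuming procfs is mounted at /proc:

               #include <stdio.h>

               int main(void)
               {
                       long watches = 0;
                       FILE *f = fopen("/proc/sys/fs/epoll/max_user_watches", "r");

                       if (f && fscanf(f, "%ld", &watches) == 1)
                               printf("max_user_watches = %ld\n", watches);
                       if (f)
                               fclose(f);
                       return 0;
               }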
     323             : 
     324             : static const struct file_operations eventpoll_fops;
     325             : 
     326       11992 : static inline int is_file_epoll(struct file *f)
     327             : {
     328       11992 :         return f->f_op == &eventpoll_fops;
     329             : }
     330             : 
     331             : /* Setup the structure that is used as key for the RB tree */
     332        1021 : static inline void ep_set_ffd(struct epoll_filefd *ffd,
     333             :                               struct file *file, int fd)
     334             : {
     335        1021 :         ffd->file = file;
     336        1021 :         ffd->fd = fd;
     337             : }
     338             : 
     339             : /* Compare RB tree keys */
     340        3780 : static inline int ep_cmp_ffd(struct epoll_filefd *p1,
     341             :                              struct epoll_filefd *p2)
     342             : {
     343        3780 :         return (p1->file > p2->file ? +1:
     344        1953 :                 (p1->file < p2->file ? -1 : p1->fd - p2->fd));
     345             : }
     346             : 
     347             : /* Tells us if the item is currently linked */
     348        3686 : static inline int ep_is_linked(struct epitem *epi)
     349             : {
     350          97 :         return !list_empty(&epi->rdllink);
     351             : }
     352             : 
     353           0 : static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
     354             : {
     355           0 :         return container_of(p, struct eppoll_entry, wait);
     356             : }
     357             : 
     358             : /* Get the "struct epitem" from a wait queue pointer */
     359        5222 : static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
     360             : {
     361        5222 :         return container_of(p, struct eppoll_entry, wait)->base;
     362             : }
     363             : 
     364             : /**
     365             :  * ep_events_available - Checks if ready events might be available.
     366             :  *
     367             :  * @ep: Pointer to the eventpoll context.
     368             :  *
      369             :  * Returns: a value different from zero if ready events are available,
     370             :  *          or zero otherwise.
     371             :  */
     372        5025 : static inline int ep_events_available(struct eventpoll *ep)
     373             : {
     374       15130 :         return !list_empty_careful(&ep->rdllist) ||
     375        2010 :                 READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
     376             : }
     377             : 
     378             : #ifdef CONFIG_NET_RX_BUSY_POLL
     379           0 : static bool ep_busy_loop_end(void *p, unsigned long start_time)
     380             : {
     381           0 :         struct eventpoll *ep = p;
     382             : 
     383           0 :         return ep_events_available(ep) || busy_loop_timeout(start_time);
     384             : }
     385             : 
     386             : /*
      387             :  * Busy poll if globally enabled, a supporting socket was found and no events
      388             :  * are available; the loop returns on need_resched() or ep_events_available().
      389             :  *
      390             :  * We must do our busy polling with irqs enabled.
     391             :  */
     392        1535 : static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
     393             : {
     394        1535 :         unsigned int napi_id = READ_ONCE(ep->napi_id);
     395             : 
     396        1535 :         if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
     397           0 :                 napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
     398             :                                BUSY_POLL_BUDGET);
     399           0 :                 if (ep_events_available(ep))
     400             :                         return true;
     401             :                 /*
     402             :                  * Busy poll timed out.  Drop NAPI ID for now, we can add
     403             :                  * it back in when we have moved a socket with a valid NAPI
     404             :                  * ID onto the ready list.
     405             :                  */
     406           0 :                 ep->napi_id = 0;
     407           0 :                 return false;
     408             :         }
     409             :         return false;
     410             : }
     411             : 
     412             : /*
     413             :  * Set epoll busy poll NAPI ID from sk.
     414             :  */
     415        5532 : static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
     416             : {
     417        5532 :         struct eventpoll *ep;
     418        5532 :         unsigned int napi_id;
     419        5532 :         struct socket *sock;
     420        5532 :         struct sock *sk;
     421             : 
     422        5532 :         if (!net_busy_loop_on())
     423             :                 return;
     424             : 
     425           0 :         sock = sock_from_file(epi->ffd.file);
     426           0 :         if (!sock)
     427             :                 return;
     428             : 
     429           0 :         sk = sock->sk;
     430           0 :         if (!sk)
     431             :                 return;
     432             : 
     433           0 :         napi_id = READ_ONCE(sk->sk_napi_id);
     434           0 :         ep = epi->ep;
     435             : 
      436             :         /*
      437             :          * Reject non-NAPI IDs, and there is nothing to do if we
      438             :          * already track this ID.
     439             :          */
     440           0 :         if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
     441             :                 return;
     442             : 
     443             :         /* record NAPI ID for use in next busy poll */
     444           0 :         ep->napi_id = napi_id;
     445             : }
     446             : 
     447             : #else
     448             : 
     449             : static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
     450             : {
     451             :         return false;
     452             : }
     453             : 
     454             : static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
     455             : {
     456             : }
     457             : 
     458             : #endif /* CONFIG_NET_RX_BUSY_POLL */
     459             : 
     460             : /*
      461             :  * As described in commit 0ccf831cb ("lockdep: annotate epoll"),
      462             :  * the use of wait queues by epoll is done in a very controlled
     463             :  * manner. Wake ups can nest inside each other, but are never done
     464             :  * with the same locking. For example:
     465             :  *
     466             :  *   dfd = socket(...);
     467             :  *   efd1 = epoll_create();
     468             :  *   efd2 = epoll_create();
     469             :  *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
     470             :  *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
     471             :  *
     472             :  * When a packet arrives to the device underneath "dfd", the net code will
     473             :  * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
     474             :  * callback wakeup entry on that queue, and the wake_up() performed by the
     475             :  * "dfd" net code will end up in ep_poll_callback(). At this point epoll
     476             :  * (efd1) notices that it may have some event ready, so it needs to wake up
     477             :  * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
      478             :  * that ends up in another wake_up(), after having checked the
      479             :  * recursion constraints. That is, no more than EP_MAX_POLLWAKE_NESTS, to
     480             :  * avoid stack blasting.
     481             :  *
     482             :  * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
     483             :  * this special case of epoll.
     484             :  */
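
           Example (illustrative, not part of the instrumented source): a
           compilable, hedged variant of the scenario in the comment above, with
           an eventfd standing in for the socket "dfd". Writing to dfd fires the
           callback registered by efd1, which in turn wakes the waiter on efd2:

               #include <sys/epoll.h>
               #include <sys/eventfd.h>
               #include <stdint.h>
               #include <stdio.h>
               #include <unistd.h>

               int main(void)
               {
                       int dfd  = eventfd(0, 0);
                       int efd1 = epoll_create1(0);
                       int efd2 = epoll_create1(0);
                       struct epoll_event ev = { .events = EPOLLIN }, out;
                       uint64_t one = 1;

                       epoll_ctl(efd1, EPOLL_CTL_ADD, dfd,  &ev);  /* dfd  watched by efd1 */
                       epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, &ev);  /* efd1 watched by efd2 */

                       write(dfd, &one, sizeof(one));              /* "a packet arrives"   */
                       printf("efd2 ready: %d\n", epoll_wait(efd2, &out, 1, 0));

                       close(dfd);
                       close(efd1);
                       close(efd2);
                       return 0;
               }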
     485             : #ifdef CONFIG_DEBUG_LOCK_ALLOC
     486             : 
     487         193 : static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
     488             : {
     489         193 :         struct eventpoll *ep_src;
     490         193 :         unsigned long flags;
     491         193 :         u8 nests = 0;
     492             : 
     493             :         /*
     494             :          * To set the subclass or nesting level for spin_lock_irqsave_nested()
     495             :          * it might be natural to create a per-cpu nest count. However, since
     496             :          * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
      497             :          * schedule() in the -rt kernel, the per-cpu variables are no longer
     498             :          * protected. Thus, we are introducing a per eventpoll nest field.
      499             :          * If we are not being called from ep_poll_callback(), epi is NULL and
     500             :          * we are at the first level of nesting, 0. Otherwise, we are being
     501             :          * called from ep_poll_callback() and if a previous wakeup source is
     502             :          * not an epoll file itself, we are at depth 1 since the wakeup source
     503             :          * is depth 0. If the wakeup source is a previous epoll file in the
     504             :          * wakeup chain then we use its nests value and record ours as
      505             :          * nests + 1. The previous epoll file's nests value is stable since it is
     506             :          * already holding its own poll_wait.lock.
     507             :          */
     508         193 :         if (epi) {
     509         193 :                 if ((is_file_epoll(epi->ffd.file))) {
     510           0 :                         ep_src = epi->ffd.file->private_data;
     511           0 :                         nests = ep_src->nests;
     512             :                 } else {
     513             :                         nests = 1;
     514             :                 }
     515             :         }
     516         193 :         spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
     517         193 :         ep->nests = nests + 1;
     518         193 :         wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
     519         193 :         ep->nests = 0;
     520         193 :         spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
     521         193 : }
     522             : 
     523             : #else
     524             : 
     525             : static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
     526             : {
     527             :         wake_up_poll(&ep->poll_wait, EPOLLIN);
     528             : }
     529             : 
     530             : #endif
     531             : 
     532         213 : static void ep_remove_wait_queue(struct eppoll_entry *pwq)
     533             : {
     534         213 :         wait_queue_head_t *whead;
     535             : 
     536         213 :         rcu_read_lock();
     537             :         /*
     538             :          * If it is cleared by POLLFREE, it should be rcu-safe.
     539             :          * If we read NULL we need a barrier paired with
     540             :          * smp_store_release() in ep_poll_callback(), otherwise
     541             :          * we rely on whead->lock.
     542             :          */
     543         213 :         whead = smp_load_acquire(&pwq->whead);
     544         213 :         if (whead)
     545         213 :                 remove_wait_queue(whead, &pwq->wait);
     546         213 :         rcu_read_unlock();
     547         213 : }
     548             : 
     549             : /*
     550             :  * This function unregisters poll callbacks from the associated file
     551             :  * descriptor.  Must be called with "mtx" held (or "epmutex" if called from
     552             :  * ep_free).
     553             :  */
     554         217 : static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
     555             : {
     556         217 :         struct eppoll_entry **p = &epi->pwqlist;
     557         217 :         struct eppoll_entry *pwq;
     558             : 
     559         430 :         while ((pwq = *p) != NULL) {
     560         213 :                 *p = pwq->next;
     561         213 :                 ep_remove_wait_queue(pwq);
     562         213 :                 kmem_cache_free(pwq_cache, pwq);
     563             :         }
     564         217 : }
     565             : 
     566             : /* call only when ep->mtx is held */
     567       10179 : static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
     568             : {
     569       10179 :         return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
     570             : }
     571             : 
     572             : /* call only when ep->mtx is held */
     573        4133 : static inline void ep_pm_stay_awake(struct epitem *epi)
     574             : {
     575        4267 :         struct wakeup_source *ws = ep_wakeup_source(epi);
     576             : 
     577        4133 :         if (ws)
     578        4133 :                 __pm_stay_awake(ws);
     579        3997 : }
     580             : 
     581         218 : static inline bool ep_has_wakeup_source(struct epitem *epi)
     582             : {
     583         218 :         return rcu_access_pointer(epi->ws) ? true : false;
     584             : }
     585             : 
     586             : /* call when ep->mtx cannot be held (ep_poll_callback) */
     587        1892 : static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
     588             : {
     589        1892 :         struct wakeup_source *ws;
     590             : 
     591        1892 :         rcu_read_lock();
     592        1892 :         ws = rcu_dereference(epi->ws);
     593        1892 :         if (ws)
     594        1892 :                 __pm_stay_awake(ws);
     595        1892 :         rcu_read_unlock();
     596        1892 : }
     597             : 
     598             : 
     599             : /*
      600             :  * ep->mtx needs to be held because we could be hit by
     601             :  * eventpoll_release_file() and epoll_ctl().
     602             :  */
     603        4738 : static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
     604             : {
     605             :         /*
     606             :          * Steal the ready list, and re-init the original one to the
     607             :          * empty list. Also, set ep->ovflist to NULL so that events
      608             :          * that happen while looping without locks are not lost. We cannot
      609             :          * have the poll callback queue directly on ep->rdllist,
     610             :          * because we want the "sproc" callback to be able to do it
     611             :          * in a lockless way.
     612             :          */
     613        9476 :         lockdep_assert_irqs_enabled();
     614        4738 :         write_lock_irq(&ep->lock);
     615        4737 :         list_splice_init(&ep->rdllist, txlist);
     616        4737 :         WRITE_ONCE(ep->ovflist, NULL);
     617        4737 :         write_unlock_irq(&ep->lock);
     618        4738 : }
     619             : 
     620        4738 : static void ep_done_scan(struct eventpoll *ep,
     621             :                          struct list_head *txlist)
     622             : {
     623        4738 :         struct epitem *epi, *nepi;
     624             : 
     625        4738 :         write_lock_irq(&ep->lock);
     626             :         /*
     627             :          * During the time we spent inside the "sproc" callback, some
     628             :          * other events might have been queued by the poll callback.
     629             :          * We re-insert them inside the main ready-list here.
     630             :          */
     631        4743 :         for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
     632           5 :              nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
     633             :                 /*
     634             :                  * We need to check if the item is already in the list.
     635             :                  * During the "sproc" callback execution time, items are
     636             :                  * queued into ->ovflist but the "txlist" might already
     637             :                  * contain them, and the list_splice() below takes care of them.
     638             :                  */
     639           6 :                 if (!ep_is_linked(epi)) {
     640             :                         /*
     641             :                          * ->ovflist is LIFO, so we have to reverse it in order
      642             :                  * to keep it in FIFO order.
     643             :                          */
     644           1 :                         list_add(&epi->rdllink, &ep->rdllist);
     645           6 :                         ep_pm_stay_awake(epi);
     646             :                 }
     647             :         }
     648             :         /*
     649             :          * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
     650             :          * releasing the lock, events will be queued in the normal way inside
     651             :          * ep->rdllist.
     652             :          */
     653        4737 :         WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
     654             : 
     655             :         /*
     656             :          * Quickly re-inject items left on "txlist".
     657             :          */
     658        4737 :         list_splice(txlist, &ep->rdllist);
     659        4737 :         __pm_relax(ep->ws);
     660        4737 :         write_unlock_irq(&ep->lock);
     661        4737 : }
     662             : 
     663         213 : static void epi_rcu_free(struct rcu_head *head)
     664             : {
     665         213 :         struct epitem *epi = container_of(head, struct epitem, rcu);
     666         213 :         kmem_cache_free(epi_cache, epi);
     667         213 : }
     668             : 
     669             : /*
     670             :  * Removes a "struct epitem" from the eventpoll RB tree and deallocates
     671             :  * all the associated resources. Must be called with "mtx" held.
     672             :  */
     673         213 : static int ep_remove(struct eventpoll *ep, struct epitem *epi)
     674             : {
     675         213 :         struct file *file = epi->ffd.file;
     676         213 :         struct epitems_head *to_free;
     677         213 :         struct hlist_head *head;
     678             : 
     679         426 :         lockdep_assert_irqs_enabled();
     680             : 
     681             :         /*
     682             :          * Removes poll wait queue hooks.
     683             :          */
     684         213 :         ep_unregister_pollwait(ep, epi);
     685             : 
     686             :         /* Remove the current item from the list of epoll hooks */
     687         213 :         spin_lock(&file->f_lock);
     688         213 :         to_free = NULL;
     689         213 :         head = file->f_ep;
     690         213 :         if (head->first == &epi->fllink && !epi->fllink.next) {
     691         183 :                 file->f_ep = NULL;
     692         183 :                 if (!is_file_epoll(file)) {
     693         183 :                         struct epitems_head *v;
     694         183 :                         v = container_of(head, struct epitems_head, epitems);
     695         183 :                         if (!smp_load_acquire(&v->next))
     696         183 :                                 to_free = v;
     697             :                 }
     698             :         }
     699         213 :         hlist_del_rcu(&epi->fllink);
     700         213 :         spin_unlock(&file->f_lock);
     701         213 :         free_ephead(to_free);
     702             : 
     703         213 :         rb_erase_cached(&epi->rbn, &ep->rbr);
     704             : 
     705         213 :         write_lock_irq(&ep->lock);
     706         213 :         if (ep_is_linked(epi))
     707         186 :                 list_del_init(&epi->rdllink);
     708         213 :         write_unlock_irq(&ep->lock);
     709             : 
     710         213 :         wakeup_source_unregister(ep_wakeup_source(epi));
     711             :         /*
     712             :          * At this point it is safe to free the eventpoll item. Use the union
     713             :          * field epi->rcu, since we are trying to minimize the size of
     714             :          * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
     715             :          * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
     716             :          * use of the rbn field.
     717             :          */
     718         213 :         call_rcu(&epi->rcu, epi_rcu_free);
     719             : 
     720         213 :         atomic_long_dec(&ep->user->epoll_watches);
     721             : 
     722         213 :         return 0;
     723             : }
     724             : 
     725          15 : static void ep_free(struct eventpoll *ep)
     726             : {
     727          15 :         struct rb_node *rbp;
     728          15 :         struct epitem *epi;
     729             : 
      730             :         /* We need to release all tasks waiting for this file */
     731          15 :         if (waitqueue_active(&ep->poll_wait))
     732           0 :                 ep_poll_safewake(ep, NULL);
     733             : 
     734             :         /*
     735             :          * We need to lock this because we could be hit by
     736             :          * eventpoll_release_file() while we're freeing the "struct eventpoll".
     737             :          * We do not need to hold "ep->mtx" here because the epoll file
     738             :          * is on the way to be removed and no one has references to it
     739             :          * anymore. The only hit might come from eventpoll_release_file() but
     740             :          * holding "epmutex" is sufficient here.
     741             :          */
     742          15 :         mutex_lock(&epmutex);
     743             : 
     744             :         /*
     745             :          * Walks through the whole tree by unregistering poll callbacks.
     746             :          */
     747          19 :         for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
     748           4 :                 epi = rb_entry(rbp, struct epitem, rbn);
     749             : 
     750           4 :                 ep_unregister_pollwait(ep, epi);
     751           4 :                 cond_resched();
     752             :         }
     753             : 
     754             :         /*
     755             :          * Walks through the whole tree by freeing each "struct epitem". At this
     756             :          * point we are sure no poll callbacks will be lingering around, and also by
     757             :          * holding "epmutex" we can be sure that no file cleanup code will hit
     758             :          * us during this operation. So we can avoid the lock on "ep->lock".
      759             :  * We do not need to lock ep->mtx either; we only do it to prevent
     760             :          * a lockdep warning.
     761             :          */
     762          15 :         mutex_lock(&ep->mtx);
     763          19 :         while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
     764           4 :                 epi = rb_entry(rbp, struct epitem, rbn);
     765           4 :                 ep_remove(ep, epi);
     766           4 :                 cond_resched();
     767             :         }
     768          15 :         mutex_unlock(&ep->mtx);
     769             : 
     770          15 :         mutex_unlock(&epmutex);
     771          15 :         mutex_destroy(&ep->mtx);
     772          15 :         free_uid(ep->user);
     773          15 :         wakeup_source_unregister(ep->ws);
     774          15 :         kfree(ep);
     775          15 : }
     776             : 
     777          15 : static int ep_eventpoll_release(struct inode *inode, struct file *file)
     778             : {
     779          15 :         struct eventpoll *ep = file->private_data;
     780             : 
     781          15 :         if (ep)
     782          15 :                 ep_free(ep);
     783             : 
     784          15 :         return 0;
     785             : }
     786             : 
     787             : static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth);
     788             : 
     789         196 : static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
     790             : {
     791         196 :         struct eventpoll *ep = file->private_data;
     792         196 :         LIST_HEAD(txlist);
     793         196 :         struct epitem *epi, *tmp;
     794         196 :         poll_table pt;
     795         196 :         __poll_t res = 0;
     796             : 
     797         196 :         init_poll_funcptr(&pt, NULL);
     798             : 
     799             :         /* Insert inside our poll wait queue */
     800         196 :         poll_wait(file, &ep->poll_wait, wait);
     801             : 
     802             :         /*
     803             :          * Proceed to find out if wanted events are really available inside
     804             :          * the ready list.
     805             :          */
     806         196 :         mutex_lock_nested(&ep->mtx, depth);
     807         197 :         ep_start_scan(ep, &txlist);
     808         196 :         list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
     809         123 :                 if (ep_item_poll(epi, &pt, depth + 1)) {
     810             :                         res = EPOLLIN | EPOLLRDNORM;
     811             :                         break;
     812             :                 } else {
     813             :                         /*
     814             :                          * Item has been dropped into the ready list by the poll
      815             :                          * callback, but it's not actually ready, as far as the
      816             :                          * caller-requested events go. We can remove it here.
     817             :                          */
     818           0 :                         __pm_relax(ep_wakeup_source(epi));
     819           0 :                         list_del_init(&epi->rdllink);
     820             :                 }
     821             :         }
     822         197 :         ep_done_scan(ep, &txlist);
     823         196 :         mutex_unlock(&ep->mtx);
     824         196 :         return res;
     825             : }
     826             : 
     827             : /*
     828             :  * Differs from ep_eventpoll_poll() in that internal callers already have
     829             :  * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
     830             :  * is correctly annotated.
     831             :  */
     832        6481 : static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
     833             :                                  int depth)
     834             : {
     835        6481 :         struct file *file = epi->ffd.file;
     836        6481 :         __poll_t res;
     837             : 
     838        6481 :         pt->_key = epi->event.events;
     839        6481 :         if (!is_file_epoll(file))
     840        6284 :                 res = vfs_poll(file, pt);
     841             :         else
     842         197 :                 res = __ep_eventpoll_poll(file, pt, depth);
     843        6483 :         return res & epi->event.events;
     844             : }
     845             : 
     846           0 : static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
     847             : {
     848           0 :         return __ep_eventpoll_poll(file, wait, 0);
     849             : }
     850             : 
     851             : #ifdef CONFIG_PROC_FS
     852           0 : static void ep_show_fdinfo(struct seq_file *m, struct file *f)
     853             : {
     854           0 :         struct eventpoll *ep = f->private_data;
     855           0 :         struct rb_node *rbp;
     856             : 
     857           0 :         mutex_lock(&ep->mtx);
     858           0 :         for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
     859           0 :                 struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
     860           0 :                 struct inode *inode = file_inode(epi->ffd.file);
     861             : 
     862           0 :                 seq_printf(m, "tfd: %8d events: %8x data: %16llx "
     863             :                            " pos:%lli ino:%lx sdev:%x\n",
     864             :                            epi->ffd.fd, epi->event.events,
     865           0 :                            (long long)epi->event.data,
     866           0 :                            (long long)epi->ffd.file->f_pos,
     867           0 :                            inode->i_ino, inode->i_sb->s_dev);
     868           0 :                 if (seq_has_overflowed(m))
     869             :                         break;
     870             :         }
     871           0 :         mutex_unlock(&ep->mtx);
     872           0 : }
     873             : #endif
     874             : 
     875             : /* File callbacks that implement the eventpoll file behaviour */
     876             : static const struct file_operations eventpoll_fops = {
     877             : #ifdef CONFIG_PROC_FS
     878             :         .show_fdinfo    = ep_show_fdinfo,
     879             : #endif
     880             :         .release        = ep_eventpoll_release,
     881             :         .poll           = ep_eventpoll_poll,
     882             :         .llseek         = noop_llseek,
     883             : };
     884             : 
     885             : /*
     886             :  * This is called from eventpoll_release() to unlink files from the eventpoll
      887             :  * interface. We need this facility to correctly clean up files that are
     888             :  * closed without being removed from the eventpoll interface.
     889             :  */
     890          24 : void eventpoll_release_file(struct file *file)
     891             : {
     892          24 :         struct eventpoll *ep;
     893          24 :         struct epitem *epi;
     894          24 :         struct hlist_node *next;
     895             : 
     896             :         /*
     897             :          * We don't want to get "file->f_lock" because it is not
     898             :          * necessary. It is not necessary because we're in the "struct file"
     899             :          * cleanup path, and this means that no one is using this file anymore.
     900             :          * So, for example, epoll_ctl() cannot hit here since if we reach this
     901             :          * point, the file counter already went to zero and fget() would fail.
      902             :  * The only hit might come from ep_free(), but holding the mutex
     903             :          * will correctly serialize the operation. We do need to acquire
     904             :          * "ep->mtx" after "epmutex" because ep_remove() requires it when called
     905             :          * from anywhere but ep_free().
     906             :          *
     907             :          * Besides, ep_remove() acquires the lock, so we can't hold it here.
     908             :          */
     909          24 :         mutex_lock(&epmutex);
     910          24 :         if (unlikely(!file->f_ep)) {
     911           0 :                 mutex_unlock(&epmutex);
     912           0 :                 return;
     913             :         }
     914          72 :         hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) {
     915          24 :                 ep = epi->ep;
     916          24 :                 mutex_lock_nested(&ep->mtx, 0);
     917          24 :                 ep_remove(ep, epi);
     918          24 :                 mutex_unlock(&ep->mtx);
     919             :         }
     920          24 :         mutex_unlock(&epmutex);
     921             : }
     922             : 
     923          24 : static int ep_alloc(struct eventpoll **pep)
     924             : {
     925          24 :         int error;
     926          24 :         struct user_struct *user;
     927          24 :         struct eventpoll *ep;
     928             : 
     929          24 :         user = get_current_user();
     930          24 :         error = -ENOMEM;
     931          24 :         ep = kzalloc(sizeof(*ep), GFP_KERNEL);
     932          24 :         if (unlikely(!ep))
     933           0 :                 goto free_uid;
     934             : 
     935          24 :         mutex_init(&ep->mtx);
     936          24 :         rwlock_init(&ep->lock);
     937          24 :         init_waitqueue_head(&ep->wq);
     938          24 :         init_waitqueue_head(&ep->poll_wait);
     939          24 :         INIT_LIST_HEAD(&ep->rdllist);
     940          24 :         ep->rbr = RB_ROOT_CACHED;
     941          24 :         ep->ovflist = EP_UNACTIVE_PTR;
     942          24 :         ep->user = user;
     943             : 
     944          24 :         *pep = ep;
     945             : 
     946          24 :         return 0;
     947             : 
     948           0 : free_uid:
     949           0 :         free_uid(user);
     950           0 :         return error;
     951             : }
     952             : 
     953             : /*
     954             :  * Search the file inside the eventpoll tree. The RB tree operations
     955             :  * are protected by the "mtx" mutex, and ep_find() must be called with
     956             :  * "mtx" held.
     957             :  */
     958         712 : static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
     959             : {
     960         712 :         int kcmp;
     961         712 :         struct rb_node *rbp;
     962         712 :         struct epitem *epi, *epir = NULL;
     963         712 :         struct epoll_filefd ffd;
     964             : 
     965         712 :         ep_set_ffd(&ffd, file, fd);
     966        3026 :         for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
     967        2717 :                 epi = rb_entry(rbp, struct epitem, rbn);
     968        2717 :                 kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
     969         403 :                 if (kcmp > 0)
     970        1257 :                         rbp = rbp->rb_right;
     971        1460 :                 else if (kcmp < 0)
     972        1057 :                         rbp = rbp->rb_left;
     973             :                 else {
     974             :                         epir = epi;
     975             :                         break;
     976             :                 }
     977             :         }
     978             : 
     979         712 :         return epir;
     980             : }
     981             : 
     982             : #ifdef CONFIG_KCMP
     983             : static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
     984             : {
     985             :         struct rb_node *rbp;
     986             :         struct epitem *epi;
     987             : 
     988             :         for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
     989             :                 epi = rb_entry(rbp, struct epitem, rbn);
     990             :                 if (epi->ffd.fd == tfd) {
     991             :                         if (toff == 0)
     992             :                                 return epi;
     993             :                         else
     994             :                                 toff--;
     995             :                 }
     996             :                 cond_resched();
     997             :         }
     998             : 
     999             :         return NULL;
    1000             : }
    1001             : 
    1002             : struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
    1003             :                                      unsigned long toff)
    1004             : {
    1005             :         struct file *file_raw;
    1006             :         struct eventpoll *ep;
    1007             :         struct epitem *epi;
    1008             : 
    1009             :         if (!is_file_epoll(file))
    1010             :                 return ERR_PTR(-EINVAL);
    1011             : 
    1012             :         ep = file->private_data;
    1013             : 
    1014             :         mutex_lock(&ep->mtx);
    1015             :         epi = ep_find_tfd(ep, tfd, toff);
    1016             :         if (epi)
    1017             :                 file_raw = epi->ffd.file;
    1018             :         else
    1019             :                 file_raw = ERR_PTR(-ENOENT);
    1020             :         mutex_unlock(&ep->mtx);
    1021             : 
    1022             :         return file_raw;
    1023             : }
    1024             : #endif /* CONFIG_KCMP */
    1025             : 
    1026             : /**
    1027             :  * Adds a new entry to the tail of the list in a lockless way, i.e.
    1028             :  * multiple CPUs are allowed to call this function concurrently.
    1029             :  *
    1030             :  * Beware: it is necessary to prevent any other modifications of the
    1031             :  *         existing list until all changes are completed; in other words,
    1032             :  *         concurrent list_add_tail_lockless() calls should be protected
    1033             :  *         with a read lock, where the write lock acts as a barrier which
    1034             :  *         makes sure all list_add_tail_lockless() calls are fully
    1035             :  *         completed.
    1036             :  *
    1037             :  *         Also, an element can be locklessly added to the list only in one
    1038             :  *         direction, i.e. either to the tail or to the head; otherwise
    1039             :  *         concurrent access will corrupt the list.
    1040             :  *
    1041             :  * Returns %false if the element has already been added to the list, %true
    1042             :  * otherwise.
    1043             :  */
    1044        1887 : static inline bool list_add_tail_lockless(struct list_head *new,
    1045             :                                           struct list_head *head)
    1046             : {
    1047        1887 :         struct list_head *prev;
    1048             : 
    1049             :         /*
    1050             :          * This is a simple 'new->next = head' operation, but cmpxchg()
    1051             :          * is used in order to detect that the same element has just been
    1052             :          * added to the list from another CPU: the winner observes
    1053             :          * new->next == new.
    1054             :          */
    1055        1887 :         if (cmpxchg(&new->next, new, head) != new)
    1056             :                 return false;
    1057             : 
    1058             :         /*
    1059             :          * Initially ->next of a new element must be updated with the head
    1060             :          * (we are inserting to the tail) and only then pointers are atomically
    1061             :          * exchanged.  XCHG guarantees memory ordering, thus ->next should be
    1062             :          * updated before pointers are actually swapped and pointers are
    1063             :          * swapped before prev->next is updated.
    1064             :          */
    1065             : 
    1066        1887 :         prev = xchg(&head->prev, new);
    1067             : 
    1068             :         /*
    1069             :          * It is safe to modify prev->next and new->prev, because a new element
    1070             :          * is added only to the tail and new->next is updated before XCHG.
    1071             :          */
    1072             : 
    1073        1887 :         prev->next = new;
    1074        1887 :         new->prev = prev;
    1075             : 
    1076        1887 :         return true;
    1077             : }
    1078             : 
    1079             : /**
    1080             :  * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
    1081             :  * i.e. multiple CPUs are allowed to call this function concurrently.
    1082             :  *
    1083             :  * Returns %false if the epi element has already been chained, %true otherwise.
    1084             :  */
    1085           5 : static inline bool chain_epi_lockless(struct epitem *epi)
    1086             : {
    1087           5 :         struct eventpoll *ep = epi->ep;
    1088             : 
    1089             :         /* Fast preliminary check */
    1090           5 :         if (epi->next != EP_UNACTIVE_PTR)
    1091             :                 return false;
    1092             : 
    1093             :         /* Check that the same epi has not been just chained from another CPU */
    1094           5 :         if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
    1095             :                 return false;
    1096             : 
    1097             :         /* Atomically exchange tail */
    1098           5 :         epi->next = xchg(&ep->ovflist, epi);
    1099             : 
    1100           5 :         return true;
    1101             : }
    1102             : 
    1103             : /*
    1104             :  * This is the callback that is passed to the wait queue wakeup
    1105             :  * mechanism. It is called by the stored file descriptors when they
    1106             :  * have events to report.
    1107             :  *
    1108             :  * This callback takes a read lock in order not to contend with concurrent
    1109             :  * events from other file descriptors, thus all modifications to ->rdllist
    1110             :  * or ->ovflist are lockless.  The read lock is paired with the write lock
    1111             :  * from ep_scan_ready_list(), which stops all list modifications and
    1112             :  * guarantees that the lists' state is seen correctly.
    1113             :  *
    1114             :  * Another thing worth mentioning is that ep_poll_callback() can be called
    1115             :  * concurrently for the same @epi from different CPUs if the poll table was
    1116             :  * initialized with several wait queue entries.  Multiple wakeups from
    1117             :  * different CPUs on a single wait queue are serialized by wq.lock, but the
    1118             :  * case where multiple wait queues are used must be detected as well.  This
    1119             :  * is done using a cmpxchg() operation.
    1120             :  */
    1121        5222 : static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
    1122             : {
    1123        5222 :         int pwake = 0;
    1124        5222 :         struct epitem *epi = ep_item_from_wait(wait);
    1125        5222 :         struct eventpoll *ep = epi->ep;
    1126        5222 :         __poll_t pollflags = key_to_poll(key);
    1127        5222 :         unsigned long flags;
    1128        5222 :         int ewake = 0;
    1129             : 
    1130        5222 :         read_lock_irqsave(&ep->lock, flags);
    1131             : 
    1132        5223 :         ep_set_busy_poll_napi_id(epi);
    1133             : 
    1134             :         /*
    1135             :          * If the event mask does not contain any poll(2) event, we consider the
    1136             :          * descriptor to be disabled. This condition is likely the effect of the
    1137             :          * EPOLLONESHOT bit that disables the descriptor when an event is received,
    1138             :          * until the next EPOLL_CTL_MOD is issued.
    1139             :          */
    1140        5223 :         if (!(epi->event.events & ~EP_PRIVATE_BITS))
    1141           0 :                 goto out_unlock;
    1142             : 
    1143             :         /*
    1144             :          * Check the events coming with the callback. At this stage, not
    1145             :          * every device reports the events in the "key" parameter of the
    1146             :          * callback. We need to be able to handle both cases here, hence the
    1147             :          * test for "key" != NULL before the event match test.
    1148             :          */
    1149        5223 :         if (pollflags && !(pollflags & epi->event.events))
    1150        1961 :                 goto out_unlock;
    1151             : 
    1152             :         /*
    1153             :          * If we are transferring events to userspace, we can hold no locks
    1154             :          * (because we're accessing user memory, and because of linux f_op->poll()
    1155             :          * semantics). All the events that happen during that period of time are
    1156             :          * chained in ep->ovflist and requeued later on.
    1157             :          */
    1158        3262 :         if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
    1159           5 :                 if (chain_epi_lockless(epi))
    1160           5 :                         ep_pm_stay_awake_rcu(epi);
    1161        3257 :         } else if (!ep_is_linked(epi)) {
    1162             :                 /* In the usual case, add event to ready list. */
    1163        1887 :                 if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
    1164        1887 :                         ep_pm_stay_awake_rcu(epi);
    1165             :         }
    1166             : 
    1167             :         /*
    1168             :          * Wake up ( if active ) both the eventpoll wait list and the ->poll()
    1169             :          * wait list.
    1170             :          */
    1171        3262 :         if (waitqueue_active(&ep->wq)) {
    1172        1526 :                 if ((epi->event.events & EPOLLEXCLUSIVE) &&
    1173           0 :                                         !(pollflags & POLLFREE)) {
    1174           0 :                         switch (pollflags & EPOLLINOUT_BITS) {
    1175           0 :                         case EPOLLIN:
    1176           0 :                                 if (epi->event.events & EPOLLIN)
    1177             :                                         ewake = 1;
    1178             :                                 break;
    1179           0 :                         case EPOLLOUT:
    1180           0 :                                 if (epi->event.events & EPOLLOUT)
    1181           0 :                                         ewake = 1;
    1182             :                                 break;
    1183           0 :                         case 0:
    1184           0 :                                 ewake = 1;
    1185           0 :                                 break;
    1186             :                         }
    1187        1526 :                 }
    1188        1526 :                 wake_up(&ep->wq);
    1189             :         }
    1190        3262 :         if (waitqueue_active(&ep->poll_wait))
    1191         193 :                 pwake++;
    1192             : 
    1193        3069 : out_unlock:
    1194        5223 :         read_unlock_irqrestore(&ep->lock, flags);
    1195             : 
    1196             :         /* We have to call this outside the lock */
    1197        5222 :         if (pwake)
    1198         193 :                 ep_poll_safewake(ep, epi);
    1199             : 
    1200        5222 :         if (!(epi->event.events & EPOLLEXCLUSIVE))
    1201        5222 :                 ewake = 1;
    1202             : 
    1203        5222 :         if (pollflags & POLLFREE) {
    1204             :                 /*
    1205             :                  * If we race with ep_remove_wait_queue() it can miss
    1206             :                  * ->whead = NULL and do another remove_wait_queue() after
    1207             :                  * us, so we can't use __remove_wait_queue().
    1208             :                  */
    1209           0 :                 list_del_init(&wait->entry);
    1210             :                 /*
    1211             :                  * ->whead != NULL protects us from the race with ep_free()
    1212             :                  * or ep_remove(), ep_remove_wait_queue() takes whead->lock
    1213             :                  * held by the caller. Once we nullify it, nothing protects
    1214             :                  * ep/epi or even wait.
    1215             :                  */
    1216           0 :                 smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
    1217             :         }
    1218             : 
    1219        5222 :         return ewake;
    1220             : }
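
The EPOLLEXCLUSIVE handling above decides which exclusive waiters report a successful wakeup. From userspace the flag is only accepted at EPOLL_CTL_ADD time; the hedged sketch below (identifiers are illustrative) shows the typical thundering-herd mitigation it enables.

/*
 * Userspace sketch: register a shared listening socket with EPOLLEXCLUSIVE
 * so that, of several processes each holding their own epoll instance, only
 * one (or a few) is woken per incoming connection.
 */
#include <sys/epoll.h>

static int add_exclusive(int epfd, int listen_fd)
{
	struct epoll_event ev = {
		.events  = EPOLLIN | EPOLLEXCLUSIVE,
		.data.fd = listen_fd,
	};

	return epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev);
}
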
    1221             : 
    1222             : /*
    1223             :  * This is the callback that is used to add our wait queue to the
    1224             :  * target file wakeup lists.
    1225             :  */
    1226         310 : static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
    1227             :                                  poll_table *pt)
    1228             : {
    1229         310 :         struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);
    1230         310 :         struct epitem *epi = epq->epi;
    1231         310 :         struct eppoll_entry *pwq;
    1232             : 
    1233         310 :         if (unlikely(!epi))     // an earlier allocation has failed
    1234             :                 return;
    1235             : 
    1236         310 :         pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
    1237         310 :         if (unlikely(!pwq)) {
    1238           0 :                 epq->epi = NULL;
    1239           0 :                 return;
    1240             :         }
    1241             : 
    1242         310 :         init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
    1243         310 :         pwq->whead = whead;
    1244         310 :         pwq->base = epi;
    1245         310 :         if (epi->event.events & EPOLLEXCLUSIVE)
    1246           0 :                 add_wait_queue_exclusive(whead, &pwq->wait);
    1247             :         else
    1248         310 :                 add_wait_queue(whead, &pwq->wait);
    1249         310 :         pwq->next = epi->pwqlist;
    1250         310 :         epi->pwqlist = pwq;
    1251             : }
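
ep_ptable_queue_proc() is never called directly by drivers; it is reached through poll_wait() when ep_item_poll() invokes the target file's ->poll() method with epq.pt as the poll table. Below is a minimal kernel-side sketch of such a ->poll() method; the driver name, wait queue, and readiness flag are hypothetical.

/*
 * Kernel-side sketch (hypothetical driver, not part of eventpoll.c): a
 * file's ->poll() method calls poll_wait(), and for epoll callers that
 * lands in ep_ptable_queue_proc() above, which hooks ep_poll_callback()
 * onto the driver's wait queue.
 */
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(mydrv_waitq);	/* hypothetical */
static bool mydrv_has_data;			/* hypothetical readiness state */

static __poll_t mydrv_poll(struct file *file, poll_table *wait)
{
	__poll_t mask = 0;

	/* For epoll callers this ends up in ep_ptable_queue_proc(). */
	poll_wait(file, &mydrv_waitq, wait);

	if (READ_ONCE(mydrv_has_data))
		mask |= EPOLLIN | EPOLLRDNORM;

	return mask;
}
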
    1252             : 
    1253         309 : static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
    1254             : {
    1255         309 :         int kcmp;
    1256         309 :         struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
    1257         309 :         struct epitem *epic;
    1258         309 :         bool leftmost = true;
    1259             : 
    1260        1372 :         while (*p) {
    1261        1063 :                 parent = *p;
    1262        1063 :                 epic = rb_entry(parent, struct epitem, rbn);
    1263        1063 :                 kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
    1264           0 :                 if (kcmp > 0) {
    1265         570 :                         p = &parent->rb_right;
    1266         570 :                         leftmost = false;
    1267             :                 } else
    1268         493 :                         p = &parent->rb_left;
    1269             :         }
    1270         309 :         rb_link_node(&epi->rbn, parent, p);
    1271         309 :         rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
    1272         309 : }
    1273             : 
    1274             : 
    1275             : 
    1276             : #define PATH_ARR_SIZE 5
    1277             : /*
    1278             :  * These are the numbers of paths of length 1 to 5 that we allow to emanate
    1279             :  * from a single file of interest. For example, we allow 1000 paths of length
    1280             :  * 1 to emanate from each file of interest. This essentially represents the
    1281             :  * potential wakeup paths, which need to be limited in order to avoid massive
    1282             :  * uncontrolled wakeup storms. The common use case should be a single ep which
    1283             :  * is connected to n file sources. In this case each file source has 1 path
    1284             :  * of length 1. Thus, the numbers below should be more than sufficient. These
    1285             :  * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
    1286             :  * and delete can't add additional paths. Protected by the epmutex.
    1287             :  */
    1288             : static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
    1289             : static int path_count[PATH_ARR_SIZE];
    1290             : 
    1291           5 : static int path_count_inc(int nests)
    1292             : {
    1293             :         /* Allow an arbitrary number of depth 1 paths */
    1294           5 :         if (nests == 0)
    1295             :                 return 0;
    1296             : 
    1297           4 :         if (++path_count[nests] > path_limits[nests])
    1298           0 :                 return -1;
    1299             :         return 0;
    1300             : }
    1301             : 
    1302             : static void path_count_init(void)
    1303             : {
    1304             :         int i;
    1305             : 
    1306          30 :         for (i = 0; i < PATH_ARR_SIZE; i++)
    1307          25 :                 path_count[i] = 0;
    1308             : }
    1309             : 
    1310           9 : static int reverse_path_check_proc(struct hlist_head *refs, int depth)
    1311             : {
    1312           9 :         int error = 0;
    1313           9 :         struct epitem *epi;
    1314             : 
    1315           9 :         if (depth > EP_MAX_NESTS) /* too deep nesting */
    1316             :                 return -1;
    1317             : 
    1318             :         /* CTL_DEL can remove links here, but that can't increase our count */
    1319          36 :         hlist_for_each_entry_rcu(epi, refs, fllink) {
    1320           9 :                 struct hlist_head *refs = &epi->ep->refs;
    1321           9 :                 if (hlist_empty(refs))
    1322           5 :                         error = path_count_inc(depth);
    1323             :                 else
    1324           4 :                         error = reverse_path_check_proc(refs, depth + 1);
    1325           9 :                 if (error != 0)
    1326             :                         break;
    1327             :         }
    1328             :         return error;
    1329             : }
    1330             : 
    1331             : /**
    1332             :  * reverse_path_check - The tfile_check_list is a list of epitems_head entries
    1333             :  *                      whose links are proposed to be newly added. We need to
    1334             :  *                      make sure that those added links don't add too many
    1335             :  *                      paths such that we will spend all our time waking up
    1336             :  *                      eventpoll objects.
    1337             :  *
    1338             :  * Returns: Zero if the proposed links don't create too many paths,
    1339             :  *          -1 otherwise.
    1340             :  */
    1341           3 : static int reverse_path_check(void)
    1342             : {
    1343           3 :         struct epitems_head *p;
    1344             : 
    1345           8 :         for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
    1346             :                 int error;
    1347           5 :                 path_count_init();
    1348           5 :                 rcu_read_lock();
    1349           5 :                 error = reverse_path_check_proc(&p->epitems, 0);
    1350           5 :                 rcu_read_unlock();
    1351           5 :                 if (error)
    1352           0 :                         return error;
    1353             :         }
    1354             :         return 0;
    1355             : }
    1356             : 
    1357           0 : static int ep_create_wakeup_source(struct epitem *epi)
    1358             : {
    1359           0 :         struct name_snapshot n;
    1360           0 :         struct wakeup_source *ws;
    1361             : 
    1362           0 :         if (!epi->ep->ws) {
    1363           0 :                 epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
    1364           0 :                 if (!epi->ep->ws)
    1365             :                         return -ENOMEM;
    1366             :         }
    1367             : 
    1368           0 :         take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
    1369           0 :         ws = wakeup_source_register(NULL, n.name.name);
    1370           0 :         release_dentry_name_snapshot(&n);
    1371             : 
    1372           0 :         if (!ws)
    1373           0 :                 return -ENOMEM;
    1374             :         rcu_assign_pointer(epi->ws, ws);
    1375             : 
    1376             :         return 0;
    1377             : }
    1378             : 
    1379             : /* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
    1380           0 : static noinline void ep_destroy_wakeup_source(struct epitem *epi)
    1381             : {
    1382           0 :         struct wakeup_source *ws = ep_wakeup_source(epi);
    1383             : 
    1384           0 :         RCU_INIT_POINTER(epi->ws, NULL);
    1385             : 
    1386             :         /*
    1387             :          * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
    1388             :          * used internally by wakeup_source_remove, too (called by
    1389             :          * wakeup_source_unregister), so we cannot use call_rcu
    1390             :          */
    1391           0 :         synchronize_rcu();
    1392           0 :         wakeup_source_unregister(ws);
    1393           0 : }
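
The wakeup source machinery above is only engaged when a registration carries EPOLLWAKEUP, and ep_take_care_of_epollwakeup() (see do_epoll_ctl() below) silently clears the flag for callers lacking CAP_BLOCK_SUSPEND. A hedged userspace sketch, with an illustrative function name:

/*
 * Userspace sketch: request a wakeup source for an event by setting
 * EPOLLWAKEUP.  Assumption: the caller holds CAP_BLOCK_SUSPEND; without it
 * the kernel drops the flag instead of failing.
 */
#include <sys/epoll.h>

static int add_with_wakeup(int epfd, int fd)
{
	struct epoll_event ev = {
		.events  = EPOLLIN | EPOLLWAKEUP,
		.data.fd = fd,
	};

	/* The system is kept awake from event readiness until the event is
	 * consumed by the next epoll_wait() on epfd. */
	return epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
}
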
    1394             : 
    1395         309 : static int attach_epitem(struct file *file, struct epitem *epi)
    1396             : {
    1397         309 :         struct epitems_head *to_free = NULL;
    1398         309 :         struct hlist_head *head = NULL;
    1399         309 :         struct eventpoll *ep = NULL;
    1400             : 
    1401         309 :         if (is_file_epoll(file))
    1402           2 :                 ep = file->private_data;
    1403             : 
    1404           2 :         if (ep) {
    1405           2 :                 head = &ep->refs;
    1406         307 :         } else if (!READ_ONCE(file->f_ep)) {
    1407         268 : allocate:
    1408         268 :                 to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);
    1409         268 :                 if (!to_free)
    1410             :                         return -ENOMEM;
    1411         268 :                 head = &to_free->epitems;
    1412             :         }
    1413         309 :         spin_lock(&file->f_lock);
    1414         309 :         if (!file->f_ep) {
    1415         270 :                 if (unlikely(!head)) {
    1416           0 :                         spin_unlock(&file->f_lock);
    1417           0 :                         goto allocate;
    1418             :                 }
    1419         270 :                 file->f_ep = head;
    1420         270 :                 to_free = NULL;
    1421             :         }
    1422         309 :         hlist_add_head_rcu(&epi->fllink, file->f_ep);
    1423         309 :         spin_unlock(&file->f_lock);
    1424         309 :         free_ephead(to_free);
    1425         309 :         return 0;
    1426             : }
    1427             : 
    1428             : /*
    1429             :  * Must be called with "mtx" held.
    1430             :  */
    1431         309 : static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
    1432             :                      struct file *tfile, int fd, int full_check)
    1433             : {
    1434         309 :         int error, pwake = 0;
    1435         309 :         __poll_t revents;
    1436         309 :         long user_watches;
    1437         309 :         struct epitem *epi;
    1438         309 :         struct ep_pqueue epq;
    1439         309 :         struct eventpoll *tep = NULL;
    1440             : 
    1441         309 :         if (is_file_epoll(tfile))
    1442           2 :                 tep = tfile->private_data;
    1443             : 
    1444         618 :         lockdep_assert_irqs_enabled();
    1445             : 
    1446         309 :         user_watches = atomic_long_read(&ep->user->epoll_watches);
    1447         309 :         if (unlikely(user_watches >= max_user_watches))
    1448             :                 return -ENOSPC;
    1449         309 :         if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL)))
    1450             :                 return -ENOMEM;
    1451             : 
    1452             :         /* Item initialization follow here ... */
    1453         309 :         INIT_LIST_HEAD(&epi->rdllink);
    1454         309 :         epi->ep = ep;
    1455         309 :         ep_set_ffd(&epi->ffd, tfile, fd);
    1456         309 :         epi->event = *event;
    1457         309 :         epi->next = EP_UNACTIVE_PTR;
    1458             : 
    1459         309 :         if (tep)
    1460           2 :                 mutex_lock_nested(&tep->mtx, 1);
    1461             :         /* Add the current item to the list of active epoll hooks for this file */
    1462         309 :         if (unlikely(attach_epitem(tfile, epi) < 0)) {
    1463           0 :                 kmem_cache_free(epi_cache, epi);
    1464           0 :                 if (tep)
    1465           0 :                         mutex_unlock(&tep->mtx);
    1466           0 :                 return -ENOMEM;
    1467             :         }
    1468             : 
    1469         309 :         if (full_check && !tep)
    1470           1 :                 list_file(tfile);
    1471             : 
    1472         309 :         atomic_long_inc(&ep->user->epoll_watches);
    1473             : 
    1474             :         /*
    1475             :          * Add the current item to the RB tree. All RB tree operations are
    1476             :          * protected by "mtx", and ep_insert() is called with "mtx" held.
    1477             :          */
    1478         309 :         ep_rbtree_insert(ep, epi);
    1479         309 :         if (tep)
    1480           2 :                 mutex_unlock(&tep->mtx);
    1481             : 
    1482             :         /* now check if we've created too many backpaths */
    1483         309 :         if (unlikely(full_check && reverse_path_check())) {
    1484           0 :                 ep_remove(ep, epi);
    1485           0 :                 return -EINVAL;
    1486             :         }
    1487             : 
    1488         309 :         if (epi->event.events & EPOLLWAKEUP) {
    1489           0 :                 error = ep_create_wakeup_source(epi);
    1490           0 :                 if (error) {
    1491           0 :                         ep_remove(ep, epi);
    1492           0 :                         return error;
    1493             :                 }
    1494             :         }
    1495             : 
    1496             :         /* Initialize the poll table using the queue callback */
    1497         309 :         epq.epi = epi;
    1498         309 :         init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
    1499             : 
    1500             :         /*
    1501             :          * Attach the item to the poll hooks and get current event bits.
    1502             :          * We can safely use the file* here because its usage count has
    1503             :          * been increased by the caller of this function. Note that after
    1504             :          * this operation completes, the poll callback can start hitting
    1505             :          * the new item.
    1506             :          */
    1507         309 :         revents = ep_item_poll(epi, &epq.pt, 1);
    1508             : 
    1509             :         /*
    1510             :          * We have to check if something went wrong during the poll wait queue
    1511             :          * install process. Namely, an allocation for a wait queue could have
    1512             :          * failed due to high memory pressure.
    1513             :          */
    1514         309 :         if (unlikely(!epq.epi)) {
    1515           0 :                 ep_remove(ep, epi);
    1516           0 :                 return -ENOMEM;
    1517             :         }
    1518             : 
    1519             :         /* We have to drop the new item inside our item list to keep track of it */
    1520         309 :         write_lock_irq(&ep->lock);
    1521             : 
    1522             :         /* record NAPI ID of new item if present */
    1523         309 :         ep_set_busy_poll_napi_id(epi);
    1524             : 
    1525             :         /* If the file is already "ready" we drop it inside the ready list */
    1526         309 :         if (revents && !ep_is_linked(epi)) {
    1527          97 :                 list_add_tail(&epi->rdllink, &ep->rdllist);
    1528          97 :                 ep_pm_stay_awake(epi);
    1529             : 
    1530             :                 /* Notify waiting tasks that events are available */
    1531          97 :                 if (waitqueue_active(&ep->wq))
    1532           0 :                         wake_up(&ep->wq);
    1533          97 :                 if (waitqueue_active(&ep->poll_wait))
    1534           0 :                         pwake++;
    1535             :         }
    1536             : 
    1537         309 :         write_unlock_irq(&ep->lock);
    1538             : 
    1539             :         /* We have to call this outside the lock */
    1540         309 :         if (pwake)
    1541           0 :                 ep_poll_safewake(ep, NULL);
    1542             : 
    1543             :         return 0;
    1544             : }
    1545             : 
    1546             : /*
    1547             :  * Modify the interest event mask by dropping an event if the new mask
    1548             :  * has a match in the current file status. Must be called with "mtx" held.
    1549             :  */
    1550         218 : static int ep_modify(struct eventpoll *ep, struct epitem *epi,
    1551             :                      const struct epoll_event *event)
    1552             : {
    1553         218 :         int pwake = 0;
    1554         218 :         poll_table pt;
    1555             : 
    1556         436 :         lockdep_assert_irqs_enabled();
    1557             : 
    1558         218 :         init_poll_funcptr(&pt, NULL);
    1559             : 
    1560             :         /*
    1561             :          * Set the new event interest mask before calling f_op->poll();
    1562             :          * otherwise we might miss an event that happens between the
    1563             :          * f_op->poll() call and the new event set registering.
    1564             :          */
    1565         218 :         epi->event.events = event->events; /* need barrier below */
    1566         218 :         epi->event.data = event->data; /* protected by mtx */
    1567         218 :         if (epi->event.events & EPOLLWAKEUP) {
    1568           0 :                 if (!ep_has_wakeup_source(epi))
    1569           0 :                         ep_create_wakeup_source(epi);
    1570         218 :         } else if (ep_has_wakeup_source(epi)) {
    1571           0 :                 ep_destroy_wakeup_source(epi);
    1572             :         }
    1573             : 
    1574             :         /*
    1575             :          * The following barrier has two effects:
    1576             :          *
    1577             :          * 1) Flush epi changes above to other CPUs.  This ensures
    1578             :          *    we do not miss events from ep_poll_callback if an
    1579             :          *    event occurs immediately after we call f_op->poll().
    1580             :          *    We need this because we did not take ep->lock while
    1581             :          *    changing epi above (but ep_poll_callback does take
    1582             :          *    ep->lock).
    1583             :          *
    1584             :          * 2) We also need to ensure we do not miss _past_ events
    1585             :          *    when calling f_op->poll().  This barrier also
    1586             :          *    pairs with the barrier in wq_has_sleeper (see
    1587             :          *    comments for wq_has_sleeper).
    1588             :          *
    1589             :          * This barrier will now guarantee ep_poll_callback or f_op->poll
    1590             :          * (or both) will notice the readiness of an item.
    1591             :          */
    1592         218 :         smp_mb();
    1593             : 
    1594             :         /*
    1595             :          * Get current event bits. We can safely use the file* here because
    1596             :          * its usage count has been increased by the caller of this function.
    1597             :          * If the item is "hot" and it is not registered inside the ready
    1598             :          * list, push it inside.
    1599             :          */
    1600         218 :         if (ep_item_poll(epi, &pt, 1)) {
    1601         113 :                 write_lock_irq(&ep->lock);
    1602         113 :                 if (!ep_is_linked(epi)) {
    1603          37 :                         list_add_tail(&epi->rdllink, &ep->rdllist);
    1604          37 :                         ep_pm_stay_awake(epi);
    1605             : 
    1606             :                         /* Notify waiting tasks that events are available */
    1607          37 :                         if (waitqueue_active(&ep->wq))
    1608           0 :                                 wake_up(&ep->wq);
    1609          37 :                         if (waitqueue_active(&ep->poll_wait))
    1610           0 :                                 pwake++;
    1611             :                 }
    1612         113 :                 write_unlock_irq(&ep->lock);
    1613             :         }
    1614             : 
    1615             :         /* We have to call this outside the lock */
    1616         113 :         if (pwake)
    1617           0 :                 ep_poll_safewake(ep, NULL);
    1618             : 
    1619         218 :         return 0;
    1620             : }
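
ep_modify() implements EPOLL_CTL_MOD; a common caller-side pattern is re-arming a descriptor registered with EPOLLONESHOT after its event has been consumed, since delivery stripped its mask down to EP_PRIVATE_BITS (see ep_send_events() below). A hedged sketch with an illustrative function name:

/*
 * Userspace sketch: re-arm a one-shot registration after handling its
 * event.  EPOLL_CTL_MOD goes through ep_modify() above, which re-polls the
 * file and queues it again if it is already ready.
 */
#include <sys/epoll.h>

static int rearm_oneshot(int epfd, int fd)
{
	struct epoll_event ev = {
		.events  = EPOLLIN | EPOLLONESHOT,
		.data.fd = fd,
	};

	return epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);
}
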
    1621             : 
    1622        4540 : static int ep_send_events(struct eventpoll *ep,
    1623             :                           struct epoll_event __user *events, int maxevents)
    1624             : {
    1625        4540 :         struct epitem *epi, *tmp;
    1626        4540 :         LIST_HEAD(txlist);
    1627        4540 :         poll_table pt;
    1628        4540 :         int res = 0;
    1629             : 
    1630             :         /*
    1631             :          * Always short-circuit for fatal signals to allow threads to make a
    1632             :          * timely exit without the chance of finding more events available and
    1633             :          * fetching repeatedly.
    1634             :          */
    1635        4540 :         if (fatal_signal_pending(current))
    1636             :                 return -EINTR;
    1637             : 
    1638        4540 :         init_poll_funcptr(&pt, NULL);
    1639             : 
    1640        4540 :         mutex_lock(&ep->mtx);
    1641        4542 :         ep_start_scan(ep, &txlist);
    1642             : 
    1643             :         /*
    1644             :          * We can loop without a lock because we are passed a task private list.
    1645             :          * Items cannot vanish during the loop because we are holding ep->mtx.
    1646             :          */
    1647       10375 :         list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
    1648        5836 :                 struct wakeup_source *ws;
    1649        5836 :                 __poll_t revents;
    1650             : 
    1651        5836 :                 if (res >= maxevents)
    1652             :                         break;
    1653             : 
    1654             :                 /*
    1655             :                  * Activate ep->ws before deactivating epi->ws to prevent
    1656             :                  * triggering auto-suspend here (in case we reactivate epi->ws
    1657             :                  * below).
    1658             :                  *
    1659             :                  * This could be rearranged to delay the deactivation of epi->ws
    1660             :                  * instead, but then epi->ws would temporarily be out of sync
    1661             :                  * with ep_is_linked().
    1662             :                  */
    1663        5834 :                 ws = ep_wakeup_source(epi);
    1664        5833 :                 if (ws) {
    1665             :                         if (ws->active)
    1666        5833 :                                 __pm_stay_awake(ep->ws);
    1667        5833 :                         __pm_relax(ws);
    1668             :                 }
    1669             : 
    1670        5833 :                 list_del_init(&epi->rdllink);
    1671             : 
    1672             :                 /*
    1673             :                  * If the event mask intersects the caller-requested one,
    1674             :                  * deliver the event to userspace. Again, we are holding ep->mtx,
    1675             :                  * so no operations coming from userspace can change the item.
    1676             :                  */
    1677        5833 :                 revents = ep_item_poll(epi, &pt, 1);
    1678        5833 :                 if (!revents)
    1679        1710 :                         continue;
    1680             : 
    1681        4123 :                 if (__put_user(revents, &events->events) ||
    1682        4124 :                     __put_user(epi->event.data, &events->data)) {
    1683           0 :                         list_add(&epi->rdllink, &txlist);
    1684           0 :                         ep_pm_stay_awake(epi);
    1685           0 :                         if (!res)
    1686           0 :                                 res = -EFAULT;
    1687             :                         break;
    1688             :                 }
    1689        4124 :                 res++;
    1690        4124 :                 events++;
    1691        4124 :                 if (epi->event.events & EPOLLONESHOT)
    1692           0 :                         epi->event.events &= EP_PRIVATE_BITS;
    1693        4124 :                 else if (!(epi->event.events & EPOLLET)) {
    1694             :                         /*
    1695             :                          * If this file has been added in level-triggered
    1696             :                          * mode, we need to insert it back into the ready
    1697             :                          * list, so that the next call to epoll_wait() will
    1698             :                          * check the events' availability again. At this
    1699             :                          * point, no one can insert
    1700             :                          * into ep->rdllist besides us. The epoll_ctl()
    1701             :                          * callers are locked out by
    1702             :                          * ep_scan_ready_list() holding "mtx" and the
    1703             :                          * poll callback will queue them in ep->ovflist.
    1704             :                          */
    1705        3998 :                         list_add_tail(&epi->rdllink, &ep->rdllist);
    1706        9831 :                         ep_pm_stay_awake(epi);
    1707             :                 }
    1708             :         }
    1709        4541 :         ep_done_scan(ep, &txlist);
    1710        4540 :         mutex_unlock(&ep->mtx);
    1711             : 
    1712        4540 :         return res;
    1713             : }
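
The tail of the transfer loop above is what distinguishes level-triggered from edge-triggered behaviour as seen from userspace: a level-triggered item is re-queued on ->rdllist, while an EPOLLET one is not. A hedged sketch of the two registration styles (helper names are illustrative):

/*
 * Userspace sketch contrasting the two re-queueing behaviours at the end of
 * ep_send_events(): with the default level-triggered mode a readable fd is
 * returned by every epoll_wait() call until drained, while with EPOLLET it
 * is reported once per new readiness transition.
 */
#include <sys/epoll.h>

static int add_level_triggered(int epfd, int fd)
{
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = fd };

	return epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);	/* re-queued each time */
}

static int add_edge_triggered(int epfd, int fd)
{
	struct epoll_event ev = { .events = EPOLLIN | EPOLLET, .data.fd = fd };

	return epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);	/* reported on transitions */
}
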
    1714             : 
    1715        3488 : static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
    1716             : {
    1717        3488 :         struct timespec64 now;
    1718             : 
    1719        3488 :         if (ms < 0)
    1720             :                 return NULL;
    1721             : 
    1722        1433 :         if (!ms) {
    1723        1355 :                 to->tv_sec = 0;
    1724        1355 :                 to->tv_nsec = 0;
    1725        1355 :                 return to;
    1726             :         }
    1727             : 
    1728          78 :         to->tv_sec = ms / MSEC_PER_SEC;
    1729          78 :         to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
    1730             : 
    1731          78 :         ktime_get_ts64(&now);
    1732          78 :         *to = timespec64_add_safe(now, *to);
    1733          78 :         return to;
    1734             : }
    1735             : 
    1736             : /**
    1737             :  * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
    1738             :  *           event buffer.
    1739             :  *
    1740             :  * @ep: Pointer to the eventpoll context.
    1741             :  * @events: Pointer to the userspace buffer where the ready events should be
    1742             :  *          stored.
    1743             :  * @maxevents: Size (in terms of number of events) of the caller event buffer.
    1744             :  * @timeout: Maximum timeout for the ready events fetch operation, in
    1745             :  *           timespec. If the timeout is zero, the function will not block,
    1746             :  *           while if the @timeout ptr is NULL, the function will block
    1747             :  *           until at least one event has been retrieved (or an error
    1748             :  *           occurred).
    1749             :  *
    1750             :  * Returns: The number of ready events which have been fetched, or an
    1751             :  *          error code in case of error.
    1752             :  */
    1753        3489 : static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
    1754             :                    int maxevents, struct timespec64 *timeout)
    1755             : {
    1756        3489 :         int res, eavail, timed_out = 0;
    1757        3489 :         u64 slack = 0;
    1758        3489 :         wait_queue_entry_t wait;
    1759        3489 :         ktime_t expires, *to = NULL;
    1760             : 
    1761        6978 :         lockdep_assert_irqs_enabled();
    1762             : 
    1763        3490 :         if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
    1764          78 :                 slack = select_estimate_accuracy(timeout);
    1765          78 :                 to = &expires;
    1766         156 :                 *to = timespec64_to_ktime(*timeout);
    1767        3412 :         } else if (timeout) {
    1768             :                 /*
    1769             :                  * Avoid the unnecessary trip to the wait queue loop if the
    1770             :                  * caller specified a non-blocking operation.
    1771             :                  */
    1772        1357 :                 timed_out = 1;
    1773             :         }
    1774             : 
    1775             :         /*
    1776             :          * This call is racy: We may or may not see events that are being added
    1777             :          * to the ready list under the lock (e.g., in IRQ callbacks). For cases
    1778             :          * with a non-zero timeout, this thread will check the ready list under
    1779             :          * the lock and will be added to the wait queue.  For cases with a zero
    1780             :          * timeout, the user by definition should not care and will have to
    1781             :          * recheck again.
    1782             :          */
    1783        3490 :         eavail = ep_events_available(ep);
    1784             : 
    1785        3491 :         while (1) {
    1786        3491 :                 if (eavail) {
    1787             :                         /*
    1788             :                          * Try to transfer events to user space. In case we get
    1789             :                          * 0 events and there's still timeout left over, we go
    1790             :                          * trying again in search of more luck.
    1791             :                          */
    1792        9081 :                         res = ep_send_events(ep, events, maxevents);
    1793        4539 :                         if (res)
    1794        3109 :                                 return res;
    1795             :                 }
    1796             : 
    1797        1906 :                 if (timed_out)
    1798             :                         return 0;
    1799             : 
    1800        1534 :                 eavail = ep_busy_loop(ep, timed_out);
    1801        1535 :                 if (eavail)
    1802           0 :                         continue;
    1803             : 
    1804        1535 :                 if (signal_pending(current))
    1805             :                         return -EINTR;
    1806             : 
    1807             :                 /*
    1808             :                  * Internally init_wait() uses autoremove_wake_function(),
    1809             :                  * thus the wait entry is removed from the wait queue on
    1810             :                  * each wakeup. Why is this important? In the case of several
    1811             :                  * waiters, each new wakeup will hit the next waiter, giving
    1812             :                  * it the chance to harvest new events; otherwise a wakeup
    1813             :                  * can be lost. This is also good performance-wise, because
    1814             :                  * on the normal wakeup path there is no need to call
    1815             :                  * __remove_wait_queue() explicitly, so ep->lock, whose
    1816             :                  * acquisition would stall event delivery, is not taken.
    1817             :                  */
    1818        1535 :                 init_wait(&wait);
    1819             : 
    1820        1535 :                 write_lock_irq(&ep->lock);
    1821             :                 /*
    1822             :                  * Barrierless variant, waitqueue_active() is called under
    1823             :                  * the same lock on the wakeup (ep_poll_callback()) side, so it
    1824             :                  * is safe to avoid an explicit barrier.
    1825             :                  */
    1826        1535 :                 __set_current_state(TASK_INTERRUPTIBLE);
    1827             : 
    1828             :                 /*
    1829             :                  * Do the final check under the lock. ep_scan_ready_list()
    1830             :                  * plays with two lists (->rdllist and ->ovflist) and there
    1831             :                  * is always a race when both lists are empty for a short
    1832             :                  * period of time although events are pending, so the lock
    1833             :                  * is important.
    1834             :                  */
    1835        1535 :                 eavail = ep_events_available(ep);
    1836        1534 :                 if (!eavail)
    1837        1534 :                         __add_wait_queue_exclusive(&ep->wq, &wait);
    1838             : 
    1839        1535 :                 write_unlock_irq(&ep->lock);
    1840             : 
    1841        1535 :                 if (!eavail)
    1842        1534 :                         timed_out = !schedule_hrtimeout_range(to, slack,
    1843             :                                                               HRTIMER_MODE_ABS);
    1844        1526 :                 __set_current_state(TASK_RUNNING);
    1845             : 
    1846             :                 /*
    1847             :                  * We were woken up, thus go and try to harvest some events.
    1848             :                  * If timed out and still on the wait queue, recheck eavail
    1849             :                  * carefully under lock, below.
    1850             :                  */
    1851        1526 :                 eavail = 1;
    1852             : 
    1853        6068 :                 if (!list_empty_careful(&wait.entry)) {
    1854           1 :                         write_lock_irq(&ep->lock);
    1855             :                         /*
    1856             :                          * If the thread timed out and is not on the wait queue,
    1857             :                          * it means that the thread was woken up after its
    1858             :                          * timeout expired before it could reacquire the lock.
    1859             :                          * Thus, when wait.entry is empty, it needs to harvest
    1860             :                          * events.
    1861             :                          */
    1862           1 :                         if (timed_out)
    1863           1 :                                 eavail = list_empty(&wait.entry);
    1864           1 :                         __remove_wait_queue(&ep->wq, &wait);
    1865           1 :                         write_unlock_irq(&ep->lock);
    1866             :                 }
    1867             :         }
    1868             : }
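
Seen from userspace, the three @timeout cases handled by ep_poll() map onto the epoll_wait() timeout argument: zero never blocks, a negative value blocks until an event or a signal arrives, and a positive value is turned into an absolute expiry by ep_timeout_to_timespec(). A hedged sketch (the function name is illustrative):

/*
 * Userspace sketch of the three timeout cases ep_poll() distinguishes:
 * 0 = poll and return immediately, N > 0 = block for at most N ms,
 * -1 = block until at least one event (or a signal).
 */
#include <sys/epoll.h>

#define MAXEVENTS 16

static int wait_demo(int epfd)
{
	struct epoll_event events[MAXEVENTS];
	int n;

	n = epoll_wait(epfd, events, MAXEVENTS, 0);	/* non-blocking check */
	if (n)
		return n;

	n = epoll_wait(epfd, events, MAXEVENTS, 100);	/* up to 100 ms */
	if (n)
		return n;

	return epoll_wait(epfd, events, MAXEVENTS, -1);	/* block until ready */
}
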
    1869             : 
    1870             : /**
    1871             :  * ep_loop_check_proc - verify that adding an epoll file inside another
    1872             :  *                      epoll structure does not violate the constraints, in
    1873             :  *                      terms of closed loops, or too deep chains (which can
    1874             :  *                      result in excessive stack usage).
    1875             :  *
    1876             :  * @ep: Pointer to the epoll structure currently being checked.
    1877             :  * @depth: Current depth of the path being checked.
    1878             :  *
    1879             :  * Returns: Zero if adding the epoll file inside the current epoll
    1880             :  *          structure @ep does not violate the constraints, or -1 otherwise.
    1881             :  */
    1882           2 : static int ep_loop_check_proc(struct eventpoll *ep, int depth)
    1883             : {
    1884           2 :         int error = 0;
    1885           2 :         struct rb_node *rbp;
    1886           2 :         struct epitem *epi;
    1887             : 
    1888           2 :         mutex_lock_nested(&ep->mtx, depth + 1);
    1889           2 :         ep->gen = loop_check_gen;
    1890           6 :         for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
    1891           4 :                 epi = rb_entry(rbp, struct epitem, rbn);
    1892           4 :                 if (unlikely(is_file_epoll(epi->ffd.file))) {
    1893           0 :                         struct eventpoll *ep_tovisit;
    1894           0 :                         ep_tovisit = epi->ffd.file->private_data;
    1895           0 :                         if (ep_tovisit->gen == loop_check_gen)
    1896           0 :                                 continue;
    1897           0 :                         if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
    1898             :                                 error = -1;
    1899             :                         else
    1900           0 :                                 error = ep_loop_check_proc(ep_tovisit, depth + 1);
    1901           0 :                         if (error != 0)
    1902             :                                 break;
    1903             :                 } else {
    1904             :                         /*
    1905             :                          * If we've reached a file that is not associated with
    1906             :                          * an ep, then we need to check if the newly added
    1907             :                          * links are going to add too many wakeup paths. We do
    1908             :                          * this by adding it to the tfile_check_list, if it's
    1909             :                          * not already there, and calling reverse_path_check()
    1910             :                          * during ep_insert().
    1911             :                          */
    1912           8 :                         list_file(epi->ffd.file);
    1913             :                 }
    1914             :         }
    1915           2 :         mutex_unlock(&ep->mtx);
    1916             : 
    1917           2 :         return error;
    1918             : }
    1919             : 
    1920             : /**
    1921             :  * ep_loop_check - Performs a check to verify that adding an epoll file (@to)
    1922             :  *                 into another epoll file (represented by @ep) does not create
    1923             :  *                 closed loops or too deep chains.
    1924             :  *
    1925             :  * @ep: Pointer to the epoll we are inserting into.
    1926             :  * @to: Pointer to the epoll to be inserted.
    1927             :  *
    1928             :  * Returns: Zero if adding the epoll @to inside the epoll @ep
    1929             :  * does not violate the constraints, or -1 otherwise.
    1930             :  */
    1931           2 : static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
    1932             : {
    1933           2 :         inserting_into = ep;
    1934           2 :         return ep_loop_check_proc(to, 0);
    1935             : }
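
These checks are what reject degenerate nestings at the syscall boundary: do_epoll_ctl() refuses to add an epoll fd to itself, and ep_loop_check() rejects cycles of epoll instances. A hedged userspace sketch follows; the error codes shown are what kernels of this vintage are expected to return (EINVAL for self-insertion, ELOOP once the loop check fires).

/*
 * Userspace sketch of the conditions the loop check rejects.
 */
#include <assert.h>
#include <errno.h>
#include <unistd.h>
#include <sys/epoll.h>

static void loop_demo(void)
{
	int a = epoll_create1(0);
	int b = epoll_create1(0);
	struct epoll_event ev = { .events = EPOLLIN };

	ev.data.fd = a;
	assert(epoll_ctl(a, EPOLL_CTL_ADD, a, &ev) == -1 && errno == EINVAL);

	ev.data.fd = b;
	assert(epoll_ctl(a, EPOLL_CTL_ADD, b, &ev) == 0);	/* a watches b */

	ev.data.fd = a;						/* b watches a: cycle */
	assert(epoll_ctl(b, EPOLL_CTL_ADD, a, &ev) == -1 && errno == ELOOP);

	close(a);
	close(b);
}
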
    1936             : 
    1937           3 : static void clear_tfile_check_list(void)
    1938             : {
    1939           3 :         rcu_read_lock();
    1940           8 :         while (tfile_check_list != EP_UNACTIVE_PTR) {
    1941           5 :                 struct epitems_head *head = tfile_check_list;
    1942           5 :                 tfile_check_list = head->next;
    1943           5 :                 unlist_file(head);
    1944             :         }
    1945           3 :         rcu_read_unlock();
    1946           3 : }
    1947             : 
    1948             : /*
    1949             :  * Open an eventpoll file descriptor.
    1950             :  */
    1951          24 : static int do_epoll_create(int flags)
    1952             : {
    1953          24 :         int error, fd;
    1954          24 :         struct eventpoll *ep = NULL;
    1955          24 :         struct file *file;
    1956             : 
    1957             :         /* Check the EPOLL_* constant for consistency.  */
    1958          24 :         BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
    1959             : 
    1960          24 :         if (flags & ~EPOLL_CLOEXEC)
    1961             :                 return -EINVAL;
    1962             :         /*
    1963             :          * Create the internal data structure ("struct eventpoll").
    1964             :          */
    1965          24 :         error = ep_alloc(&ep);
    1966          24 :         if (error < 0)
    1967             :                 return error;
    1968             :         /*
    1969             :          * Create all the items needed to set up an eventpoll file. That is,
    1970             :          * a file structure and a free file descriptor.
    1971             :          */
    1972          24 :         fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
    1973          24 :         if (fd < 0) {
    1974           0 :                 error = fd;
    1975           0 :                 goto out_free_ep;
    1976             :         }
    1977          24 :         file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
    1978             :                                  O_RDWR | (flags & O_CLOEXEC));
    1979          24 :         if (IS_ERR(file)) {
    1980           0 :                 error = PTR_ERR(file);
    1981           0 :                 goto out_free_fd;
    1982             :         }
    1983          24 :         ep->file = file;
    1984          24 :         fd_install(fd, file);
    1985          24 :         return fd;
    1986             : 
    1987           0 : out_free_fd:
    1988           0 :         put_unused_fd(fd);
    1989           0 : out_free_ep:
    1990           0 :         ep_free(ep);
    1991           0 :         return error;
    1992             : }
    1993             : 
    1994          48 : SYSCALL_DEFINE1(epoll_create1, int, flags)
    1995             : {
    1996          24 :         return do_epoll_create(flags);
    1997             : }
    1998             : 
    1999           0 : SYSCALL_DEFINE1(epoll_create, int, size)
    2000             : {
    2001           0 :         if (size <= 0)
    2002             :                 return -EINVAL;
    2003             : 
    2004           0 :         return do_epoll_create(0);
    2005             : }
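
Seen from user space, the two entry points above differ only in their
arguments: epoll_create1(2) takes flags (currently just EPOLL_CLOEXEC), while
the legacy epoll_create(2) takes a size hint that do_epoll_create() never
uses but that must still be strictly positive. A minimal sketch:

    #include <stdio.h>
    #include <sys/epoll.h>

    int main(void)
    {
            /* Preferred interface: flags, here requesting close-on-exec. */
            int epfd = epoll_create1(EPOLL_CLOEXEC);

            /* Legacy interface: the size hint is ignored, but passing 0 or a
             * negative value still fails with EINVAL (see the check above). */
            int legacy = epoll_create(1);

            if (epfd == -1 || legacy == -1)
                    perror("epoll_create");
            return 0;
    }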
    2006             : 
    2007         718 : static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
    2008             :                                    bool nonblock)
    2009             : {
    2010         718 :         if (!nonblock) {
    2011         718 :                 mutex_lock_nested(mutex, depth);
    2012         718 :                 return 0;
    2013             :         }
    2014           0 :         if (mutex_trylock(mutex))
    2015           0 :                 return 0;
    2016             :         return -EAGAIN;
    2017             : }
    2018             : 
    2019         712 : int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
    2020             :                  bool nonblock)
    2021             : {
    2022         712 :         int error;
    2023         712 :         int full_check = 0;
    2024         712 :         struct fd f, tf;
    2025         712 :         struct eventpoll *ep;
    2026         712 :         struct epitem *epi;
    2027         712 :         struct eventpoll *tep = NULL;
    2028             : 
    2029         712 :         error = -EBADF;
    2030         712 :         f = fdget(epfd);
    2031         712 :         if (!f.file)
    2032           0 :                 goto error_return;
    2033             : 
    2034             :         /* Get the "struct file *" for the target file */
    2035         712 :         tf = fdget(fd);
    2036         712 :         if (!tf.file)
    2037           0 :                 goto error_fput;
    2038             : 
    2039             :         /* The target file descriptor must support poll */
    2040         712 :         error = -EPERM;
    2041         712 :         if (!file_can_poll(tf.file))
    2042           0 :                 goto error_tgt_fput;
    2043             : 
    2044             :         /* Check if EPOLLWAKEUP is allowed */
    2045         712 :         if (ep_op_has_event(op))
    2046         527 :                 ep_take_care_of_epollwakeup(epds);
    2047             : 
    2048             :         /*
    2049             :          * We have to check that the file structure underneath the file descriptor
    2050             :          * the user passed to us _is_ an eventpoll file. And also we do not permit
    2051             :          * adding an epoll file descriptor inside itself.
    2052             :          */
    2053         712 :         error = -EINVAL;
    2054         712 :         if (f.file == tf.file || !is_file_epoll(f.file))
    2055           0 :                 goto error_tgt_fput;
    2056             : 
    2057             :         /*
    2058             :          * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
    2059             :          * so EPOLLEXCLUSIVE is not allowed for an EPOLL_CTL_MOD operation.
    2060             :          * Also, we do not currently support nested exclusive wakeups.
    2061             :          */
    2062         712 :         if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
    2063           0 :                 if (op == EPOLL_CTL_MOD)
    2064           0 :                         goto error_tgt_fput;
    2065           0 :                 if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
    2066           0 :                                 (epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
    2067           0 :                         goto error_tgt_fput;
    2068             :         }
    2069             : 
    2070             :         /*
    2071             :          * At this point it is safe to assume that the "private_data" contains
    2072             :          * our own data structure.
    2073             :          */
    2074         712 :         ep = f.file->private_data;
    2075             : 
    2076             :         /*
    2077             :          * When we insert an epoll file descriptor inside another epoll file
    2078             :          * descriptor, there is the chance of creating closed loops, which are
    2079             :          * better handled here than in more critical paths. While we are
    2080             :          * checking for loops we also determine the list of files reachable
    2081             :          * and hang them on the tfile_check_list, so we can check that we
    2082             :          * haven't created too many possible wakeup paths.
    2083             :          *
    2084             :          * We do not need to take the global 'epmutex' on EPOLL_CTL_ADD when
    2085             :          * the epoll file descriptor is attaching directly to a wakeup source,
    2086             :          * unless the epoll file descriptor is nested. The purpose of taking the
    2087             :          * 'epmutex' on add is to prevent complex topologies such as loops and
    2088             :          * deep wakeup paths from forming in parallel through multiple
    2089             :          * EPOLL_CTL_ADD operations.
    2090             :          */
    2091         712 :         error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
    2092         712 :         if (error)
    2093           0 :                 goto error_tgt_fput;
    2094         712 :         if (op == EPOLL_CTL_ADD) {
    2095         309 :                 if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
    2096         308 :                     is_file_epoll(tf.file)) {
    2097           3 :                         mutex_unlock(&ep->mtx);
    2098           3 :                         error = epoll_mutex_lock(&epmutex, 0, nonblock);
    2099           3 :                         if (error)
    2100           0 :                                 goto error_tgt_fput;
    2101           3 :                         loop_check_gen++;
    2102           3 :                         full_check = 1;
    2103           3 :                         if (is_file_epoll(tf.file)) {
    2104           2 :                                 tep = tf.file->private_data;
    2105           2 :                                 error = -ELOOP;
    2106           2 :                                 if (ep_loop_check(ep, tep) != 0)
    2107           0 :                                         goto error_tgt_fput;
    2108             :                         }
    2109           3 :                         error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
    2110           3 :                         if (error)
    2111           0 :                                 goto error_tgt_fput;
    2112             :                 }
    2113             :         }
    2114             : 
    2115             :         /*
    2116             :          * Try to look up the file inside our RB tree. Since we grabbed "mtx"
    2117             :          * above, we can be sure to be able to use the item looked up by
    2118             :          * ep_find() till we release the mutex.
    2119             :          */
    2120         712 :         epi = ep_find(ep, tf.file, fd);
    2121             : 
    2122         712 :         error = -EINVAL;
    2123         712 :         switch (op) {
    2124         309 :         case EPOLL_CTL_ADD:
    2125         309 :                 if (!epi) {
    2126         309 :                         epds->events |= EPOLLERR | EPOLLHUP;
    2127         309 :                         error = ep_insert(ep, epds, tf.file, fd, full_check);
    2128             :                 } else
    2129             :                         error = -EEXIST;
    2130             :                 break;
    2131         185 :         case EPOLL_CTL_DEL:
    2132         185 :                 if (epi)
    2133         185 :                         error = ep_remove(ep, epi);
    2134             :                 else
    2135             :                         error = -ENOENT;
    2136             :                 break;
    2137         218 :         case EPOLL_CTL_MOD:
    2138         218 :                 if (epi) {
    2139         218 :                         if (!(epi->event.events & EPOLLEXCLUSIVE)) {
    2140         218 :                                 epds->events |= EPOLLERR | EPOLLHUP;
    2141         218 :                                 error = ep_modify(ep, epi, epds);
    2142             :                         }
    2143             :                 } else
    2144             :                         error = -ENOENT;
    2145             :                 break;
    2146             :         }
    2147         712 :         mutex_unlock(&ep->mtx);
    2148             : 
    2149         712 : error_tgt_fput:
    2150         712 :         if (full_check) {
    2151           3 :                 clear_tfile_check_list();
    2152           3 :                 loop_check_gen++;
    2153           3 :                 mutex_unlock(&epmutex);
    2154             :         }
    2155             : 
    2156         720 :         fdput(tf);
    2157         712 : error_fput:
    2158         720 :         fdput(f);
    2159         712 : error_return:
    2160             : 
    2161         712 :         return error;
    2162             : }
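
The EPOLLEXCLUSIVE restriction enforced above is visible from user space: the
flag is honoured only at EPOLL_CTL_ADD time, and passing it to EPOLL_CTL_MOD
fails with EINVAL. A short sketch, using an eventfd as the wakeup source:

    #include <errno.h>
    #include <stdio.h>
    #include <sys/epoll.h>
    #include <sys/eventfd.h>

    int main(void)
    {
            int epfd = epoll_create1(0);
            int efd = eventfd(0, 0);
            struct epoll_event ev = { .events = EPOLLIN | EPOLLEXCLUSIVE,
                                      .data.fd = efd };

            /* EPOLLEXCLUSIVE is only acted on when the fd is added ... */
            epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &ev);

            /* ... so trying to change the mask with the flag set is rejected. */
            ev.events = EPOLLOUT | EPOLLEXCLUSIVE;
            if (epoll_ctl(epfd, EPOLL_CTL_MOD, efd, &ev) == -1 && errno == EINVAL)
                    printf("EPOLL_CTL_MOD with EPOLLEXCLUSIVE -> EINVAL\n");

            return 0;
    }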
    2163             : 
    2164             : /*
    2165             :  * The following function implements the controller interface for
    2166             :  * the eventpoll file that enables the insertion/removal/change of
    2167             :  * file descriptors inside the interest set.
    2168             :  */
    2169        1424 : SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
    2170             :                 struct epoll_event __user *, event)
    2171             : {
    2172         712 :         struct epoll_event epds;
    2173             : 
    2174         712 :         if (ep_op_has_event(op) &&
    2175         527 :             copy_from_user(&epds, event, sizeof(struct epoll_event)))
    2176             :                 return -EFAULT;
    2177             : 
    2178         712 :         return do_epoll_ctl(epfd, op, fd, &epds, false);
    2179             : }
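
A user-space sketch of the three operations dispatched by do_epoll_ctl(),
including the EEXIST/ENOENT error cases and the fact that EPOLLERR and
EPOLLHUP are reported even when not requested (they are ORed into the
interest mask before ep_insert()/ep_modify()):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/epoll.h>
    #include <unistd.h>

    int main(void)
    {
            int epfd = epoll_create1(0);
            int pipefd[2];
            struct epoll_event ev = { .events = EPOLLIN };

            if (pipe(pipefd) == -1)
                    return 1;
            ev.data.fd = pipefd[0];

            /* ADD registers the fd; EPOLLERR/EPOLLHUP are implied. */
            epoll_ctl(epfd, EPOLL_CTL_ADD, pipefd[0], &ev);

            /* A second ADD of the same <file, fd> pair fails. */
            if (epoll_ctl(epfd, EPOLL_CTL_ADD, pipefd[0], &ev) == -1)
                    printf("second add: %s\n", strerror(errno));   /* EEXIST */

            /* MOD rewrites the interest mask, DEL drops the registration. */
            ev.events = EPOLLIN | EPOLLET;
            epoll_ctl(epfd, EPOLL_CTL_MOD, pipefd[0], &ev);
            epoll_ctl(epfd, EPOLL_CTL_DEL, pipefd[0], NULL);

            /* DEL of an fd that is no longer registered fails. */
            if (epoll_ctl(epfd, EPOLL_CTL_DEL, pipefd[0], NULL) == -1)
                    printf("second del: %s\n", strerror(errno));   /* ENOENT */

            return 0;
    }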
    2180             : 
    2181             : /*
    2182             :  * Implement the event wait interface for the eventpoll file. It is the kernel
    2183             :  * part of the user space epoll_wait(2).
    2184             :  */
    2185        3487 : static int do_epoll_wait(int epfd, struct epoll_event __user *events,
    2186             :                          int maxevents, struct timespec64 *to)
    2187             : {
    2188        3487 :         int error;
    2189        3487 :         struct fd f;
    2190        3487 :         struct eventpoll *ep;
    2191             : 
    2192             :         /* The maximum number of events must be greater than zero */
    2193        3487 :         if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
    2194             :                 return -EINVAL;
    2195             : 
    2196             :         /* Verify that the area passed by the user is writeable */
    2197        6974 :         if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
    2198             :                 return -EFAULT;
    2199             : 
    2200             :         /* Get the "struct file *" for the eventpoll file */
    2201        3487 :         f = fdget(epfd);
    2202        3490 :         if (!f.file)
    2203             :                 return -EBADF;
    2204             : 
    2205             :         /*
    2206             :          * We have to check that the file structure underneath the fd
    2207             :          * the user passed to us _is_ an eventpoll file.
    2208             :          */
    2209        3490 :         error = -EINVAL;
    2210        3490 :         if (!is_file_epoll(f.file))
    2211           0 :                 goto error_fput;
    2212             : 
    2213             :         /*
    2214             :          * At this point it is safe to assume that the "private_data" contains
    2215             :          * our own data structure.
    2216             :          */
    2217        3490 :         ep = f.file->private_data;
    2218             : 
    2219             :         /* Time to fish for events ... */
    2220        3490 :         error = ep_poll(ep, events, maxevents, to);
    2221             : 
    2222        3480 : error_fput:
    2223        3480 :         fdput(f);
    2224        3480 :         return error;
    2225             : }
    2226             : 
    2227        6978 : SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
    2228             :                 int, maxevents, int, timeout)
    2229             : {
    2230        3489 :         struct timespec64 to;
    2231             : 
    2232        3489 :         return do_epoll_wait(epfd, events, maxevents,
    2233             :                              ep_timeout_to_timespec(&to, timeout));
    2234             : }
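
A typical wait loop against this interface; maxevents must be positive and no
larger than EP_MAX_EVENTS, and the millisecond timeout is converted to a
timespec64 by ep_timeout_to_timespec() before do_epoll_wait() runs:

    #include <stdio.h>
    #include <sys/epoll.h>

    #define MAX_EVENTS 16       /* must satisfy 0 < maxevents <= EP_MAX_EVENTS */

    int main(void)
    {
            int epfd = epoll_create1(0);
            struct epoll_event events[MAX_EVENTS];

            /* ... descriptors would be registered with epoll_ctl() here ... */

            /* Block for at most one second (timeout is in milliseconds;
             * -1 would mean wait forever, 0 would poll and return at once). */
            int n = epoll_wait(epfd, events, MAX_EVENTS, 1000);
            for (int i = 0; i < n; i++)
                    printf("fd %d ready, events 0x%x\n",
                           events[i].data.fd, events[i].events);
            return 0;
    }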
    2235             : 
    2236             : /*
    2237             :  * Implement the event wait interface for the eventpoll file. It is the kernel
    2238             :  * part of the user space epoll_pwait(2).
    2239             :  */
    2240           0 : static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
    2241             :                           int maxevents, struct timespec64 *to,
    2242             :                           const sigset_t __user *sigmask, size_t sigsetsize)
    2243             : {
    2244           0 :         int error;
    2245             : 
    2246             :         /*
    2247             :          * If the caller wants a certain signal mask to be set during the wait,
    2248             :          * we apply it here.
    2249             :          */
    2250           0 :         error = set_user_sigmask(sigmask, sigsetsize);
    2251           0 :         if (error)
    2252             :                 return error;
    2253             : 
    2254           0 :         error = do_epoll_wait(epfd, events, maxevents, to);
    2255             : 
    2256           0 :         restore_saved_sigmask_unless(error == -EINTR);
    2257             : 
    2258           0 :         return error;
    2259             : }
    2260             : 
    2261           0 : SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
    2262             :                 int, maxevents, int, timeout, const sigset_t __user *, sigmask,
    2263             :                 size_t, sigsetsize)
    2264             : {
    2265           0 :         struct timespec64 to;
    2266             : 
    2267           0 :         return do_epoll_pwait(epfd, events, maxevents,
    2268             :                               ep_timeout_to_timespec(&to, timeout),
    2269             :                               sigmask, sigsetsize);
    2270             : }
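
The sigmask handling above is what makes epoll_pwait(2) race-free with
respect to signal delivery: the caller's mask is swapped in before sleeping
and restored afterwards, with special handling when the wait is interrupted.
A sketch that blocks SIGINT for the duration of the wait (the glibc wrapper
supplies the sigsetsize argument):

    #include <signal.h>
    #include <sys/epoll.h>

    int main(void)
    {
            int epfd = epoll_create1(0);
            struct epoll_event events[8];
            sigset_t block;

            /* Block SIGINT only while we sleep in epoll_pwait(); the previous
             * mask is back in force by the time the call returns normally. */
            sigemptyset(&block);
            sigaddset(&block, SIGINT);

            return epoll_pwait(epfd, events, 8, 1000, &block) < 0;
    }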
    2271             : 
    2272           0 : SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
    2273             :                 int, maxevents, const struct __kernel_timespec __user *, timeout,
    2274             :                 const sigset_t __user *, sigmask, size_t, sigsetsize)
    2275             : {
    2276           0 :         struct timespec64 ts, *to = NULL;
    2277             : 
    2278           0 :         if (timeout) {
    2279           0 :                 if (get_timespec64(&ts, timeout))
    2280             :                         return -EFAULT;
    2281           0 :                 to = &ts;
    2282           0 :                 if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
    2283             :                         return -EINVAL;
    2284             :         }
    2285             : 
    2286           0 :         return do_epoll_pwait(epfd, events, maxevents, to,
    2287             :                               sigmask, sigsetsize);
    2288             : }
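
epoll_pwait2(2) takes a struct timespec instead of a millisecond count, and a
NULL timeout blocks indefinitely, matching the to = NULL path above. A sketch
assuming a libc that exposes the wrapper (glibc 2.35 or later); older systems
would have to go through syscall(2):

    #define _GNU_SOURCE
    #include <sys/epoll.h>
    #include <time.h>

    int main(void)
    {
            int epfd = epoll_create1(0);
            struct epoll_event events[8];

            /* Nanosecond-resolution timeout of 250 ms; no sigmask this time. */
            struct timespec to = { .tv_sec = 0, .tv_nsec = 250 * 1000 * 1000 };

            return epoll_pwait2(epfd, events, 8, &to, NULL) < 0;
    }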
    2289             : 
    2290             : #ifdef CONFIG_COMPAT
    2291           0 : static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
    2292             :                                  int maxevents, struct timespec64 *timeout,
    2293             :                                  const compat_sigset_t __user *sigmask,
    2294             :                                  compat_size_t sigsetsize)
    2295             : {
    2296           0 :         long err;
    2297             : 
    2298             :         /*
    2299             :          * If the caller wants a certain signal mask to be set during the wait,
    2300             :          * we apply it here.
    2301             :          */
    2302           0 :         err = set_compat_user_sigmask(sigmask, sigsetsize);
    2303           0 :         if (err)
    2304             :                 return err;
    2305             : 
    2306           0 :         err = do_epoll_wait(epfd, events, maxevents, timeout);
    2307             : 
    2308           0 :         restore_saved_sigmask_unless(err == -EINTR);
    2309             : 
    2310           0 :         return err;
    2311             : }
    2312             : 
    2313           0 : COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
    2314             :                        struct epoll_event __user *, events,
    2315             :                        int, maxevents, int, timeout,
    2316             :                        const compat_sigset_t __user *, sigmask,
    2317             :                        compat_size_t, sigsetsize)
    2318             : {
    2319           0 :         struct timespec64 to;
    2320             : 
    2321           0 :         return do_compat_epoll_pwait(epfd, events, maxevents,
    2322             :                                      ep_timeout_to_timespec(&to, timeout),
    2323             :                                      sigmask, sigsetsize);
    2324             : }
    2325             : 
    2326           0 : COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
    2327             :                        struct epoll_event __user *, events,
    2328             :                        int, maxevents,
    2329             :                        const struct __kernel_timespec __user *, timeout,
    2330             :                        const compat_sigset_t __user *, sigmask,
    2331             :                        compat_size_t, sigsetsize)
    2332             : {
    2333           0 :         struct timespec64 ts, *to = NULL;
    2334             : 
    2335           0 :         if (timeout) {
    2336           0 :                 if (get_timespec64(&ts, timeout))
    2337             :                         return -EFAULT;
    2338           0 :                 to = &ts;
    2339           0 :                 if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
    2340             :                         return -EINVAL;
    2341             :         }
    2342             : 
    2343           0 :         return do_compat_epoll_pwait(epfd, events, maxevents, to,
    2344             :                                      sigmask, sigsetsize);
    2345             : }
    2346             : 
    2347             : #endif
    2348             : 
    2349           1 : static int __init eventpoll_init(void)
    2350             : {
    2351           1 :         struct sysinfo si;
    2352             : 
    2353           1 :         si_meminfo(&si);
    2354             :         /*
    2355             :          * Allow the top 4% of lowmem to be allocated for epoll watches (per user).
    2356             :          */
    2357           1 :         max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
    2358             :                 EP_ITEM_COST;
    2359           1 :         BUG_ON(max_user_watches < 0);
    2360             : 
    2361             :         /*
    2362             :          * We can have many thousands of epitems, so prevent this from
    2363             :          * using an extra cache line on 64-bit (and smaller) CPUs
    2364             :          */
    2365           1 :         BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
    2366             : 
    2367             :         /* Allocates slab cache used to allocate "struct epitem" items */
    2368           1 :         epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
    2369             :                         0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
    2370             : 
    2371             :         /* Allocates slab cache used to allocate "struct eppoll_entry" */
    2372           1 :         pwq_cache = kmem_cache_create("eventpoll_pwq",
    2373             :                 sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
    2374             : 
    2375           1 :         ephead_cache = kmem_cache_create("ep_head",
    2376             :                 sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
    2377             : 
    2378           1 :         return 0;
    2379             : }
    2380             : fs_initcall(eventpoll_init);
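
For a sense of scale, the 4% budget works out roughly as follows on a
hypothetical machine with 4 GiB of low memory and 4 KiB pages, assuming a
per-watch cost (EP_ITEM_COST, essentially an epitem plus its poll-queue
entry) of about 200 bytes on a 64-bit build; the real value depends on the
kernel configuration:

    #include <stdio.h>

    int main(void)
    {
            unsigned long lowmem_pages = 1UL << 20;   /* 4 GiB of 4 KiB pages */
            unsigned long page_shift   = 12;
            unsigned long ep_item_cost = 200;         /* assumed, see above   */

            /* Mirror the eventpoll_init() formula: 1/25 of the pages is 4% of
             * low memory; convert to bytes, then divide by the per-watch cost. */
            unsigned long max_user_watches =
                    ((lowmem_pages / 25) << page_shift) / ep_item_cost;

            printf("~%lu watches per user\n", max_user_watches);  /* ~859,000 */
            return 0;
    }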

Generated by: LCOV version 1.14