Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * fs/eventfd.c
4 : *
5 : * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
6 : *
7 : */
8 :
9 : #include <linux/file.h>
10 : #include <linux/poll.h>
11 : #include <linux/init.h>
12 : #include <linux/fs.h>
13 : #include <linux/sched/signal.h>
14 : #include <linux/kernel.h>
15 : #include <linux/slab.h>
16 : #include <linux/list.h>
17 : #include <linux/spinlock.h>
18 : #include <linux/anon_inodes.h>
19 : #include <linux/syscalls.h>
20 : #include <linux/export.h>
21 : #include <linux/kref.h>
22 : #include <linux/eventfd.h>
23 : #include <linux/proc_fs.h>
24 : #include <linux/seq_file.h>
25 : #include <linux/idr.h>
26 : #include <linux/uio.h>
27 :
28 : DEFINE_PER_CPU(int, eventfd_wake_count);
29 :
30 : static DEFINE_IDA(eventfd_ida);
31 :
32 : struct eventfd_ctx {
33 : struct kref kref;
34 : wait_queue_head_t wqh;
35 : /*
36 : * Every time that a write(2) is performed on an eventfd, the
37 : * value of the __u64 being written is added to "count" and a
38 : * wakeup is performed on "wqh". A read(2) will return the "count"
39 : * value to userspace, and will reset "count" to zero. The kernel
40 : * side eventfd_signal() also, adds to the "count" counter and
41 : * issue a wakeup.
42 : */
43 : __u64 count;
44 : unsigned int flags;
45 : int id;
46 : };
47 :
48 : /**
49 : * eventfd_signal - Adds @n to the eventfd counter.
50 : * @ctx: [in] Pointer to the eventfd context.
51 : * @n: [in] Value of the counter to be added to the eventfd internal counter.
52 : * The value cannot be negative.
53 : *
54 : * This function is supposed to be called by the kernel in paths that do not
55 : * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
56 : * value, and we signal this as overflow condition by returning a EPOLLERR
57 : * to poll(2).
58 : *
59 : * Returns the amount by which the counter was incremented. This will be less
60 : * than @n if the counter has overflowed.
61 : */
62 0 : __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
63 : {
64 0 : unsigned long flags;
65 :
66 : /*
67 : * Deadlock or stack overflow issues can happen if we recurse here
68 : * through waitqueue wakeup handlers. If the caller users potentially
69 : * nested waitqueues with custom wakeup handlers, then it should
70 : * check eventfd_signal_count() before calling this function. If
71 : * it returns true, the eventfd_signal() call should be deferred to a
72 : * safe context.
73 : */
74 0 : if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
75 : return 0;
76 :
77 0 : spin_lock_irqsave(&ctx->wqh.lock, flags);
78 0 : this_cpu_inc(eventfd_wake_count);
79 0 : if (ULLONG_MAX - ctx->count < n)
80 : n = ULLONG_MAX - ctx->count;
81 0 : ctx->count += n;
82 0 : if (waitqueue_active(&ctx->wqh))
83 0 : wake_up_locked_poll(&ctx->wqh, EPOLLIN);
84 0 : this_cpu_dec(eventfd_wake_count);
85 0 : spin_unlock_irqrestore(&ctx->wqh.lock, flags);
86 :
87 0 : return n;
88 : }
89 : EXPORT_SYMBOL_GPL(eventfd_signal);
90 :
91 0 : static void eventfd_free_ctx(struct eventfd_ctx *ctx)
92 : {
93 0 : if (ctx->id >= 0)
94 0 : ida_simple_remove(&eventfd_ida, ctx->id);
95 0 : kfree(ctx);
96 0 : }
97 :
98 0 : static void eventfd_free(struct kref *kref)
99 : {
100 0 : struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
101 :
102 0 : eventfd_free_ctx(ctx);
103 0 : }
104 :
105 : /**
106 : * eventfd_ctx_put - Releases a reference to the internal eventfd context.
107 : * @ctx: [in] Pointer to eventfd context.
108 : *
109 : * The eventfd context reference must have been previously acquired either
110 : * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
111 : */
112 0 : void eventfd_ctx_put(struct eventfd_ctx *ctx)
113 : {
114 0 : kref_put(&ctx->kref, eventfd_free);
115 0 : }
116 : EXPORT_SYMBOL_GPL(eventfd_ctx_put);
117 :
118 0 : static int eventfd_release(struct inode *inode, struct file *file)
119 : {
120 0 : struct eventfd_ctx *ctx = file->private_data;
121 :
122 0 : wake_up_poll(&ctx->wqh, EPOLLHUP);
123 0 : eventfd_ctx_put(ctx);
124 0 : return 0;
125 : }
126 :
127 2 : static __poll_t eventfd_poll(struct file *file, poll_table *wait)
128 : {
129 2 : struct eventfd_ctx *ctx = file->private_data;
130 2 : __poll_t events = 0;
131 2 : u64 count;
132 :
133 2 : poll_wait(file, &ctx->wqh, wait);
134 :
135 : /*
136 : * All writes to ctx->count occur within ctx->wqh.lock. This read
137 : * can be done outside ctx->wqh.lock because we know that poll_wait
138 : * takes that lock (through add_wait_queue) if our caller will sleep.
139 : *
140 : * The read _can_ therefore seep into add_wait_queue's critical
141 : * section, but cannot move above it! add_wait_queue's spin_lock acts
142 : * as an acquire barrier and ensures that the read be ordered properly
143 : * against the writes. The following CAN happen and is safe:
144 : *
145 : * poll write
146 : * ----------------- ------------
147 : * lock ctx->wqh.lock (in poll_wait)
148 : * count = ctx->count
149 : * __add_wait_queue
150 : * unlock ctx->wqh.lock
151 : * lock ctx->qwh.lock
152 : * ctx->count += n
153 : * if (waitqueue_active)
154 : * wake_up_locked_poll
155 : * unlock ctx->qwh.lock
156 : * eventfd_poll returns 0
157 : *
158 : * but the following, which would miss a wakeup, cannot happen:
159 : *
160 : * poll write
161 : * ----------------- ------------
162 : * count = ctx->count (INVALID!)
163 : * lock ctx->qwh.lock
164 : * ctx->count += n
165 : * **waitqueue_active is false**
166 : * **no wake_up_locked_poll!**
167 : * unlock ctx->qwh.lock
168 : * lock ctx->wqh.lock (in poll_wait)
169 : * __add_wait_queue
170 : * unlock ctx->wqh.lock
171 : * eventfd_poll returns 0
172 : */
173 2 : count = READ_ONCE(ctx->count);
174 :
175 2 : if (count > 0)
176 1 : events |= EPOLLIN;
177 2 : if (count == ULLONG_MAX)
178 0 : events |= EPOLLERR;
179 2 : if (ULLONG_MAX - 1 > count)
180 2 : events |= EPOLLOUT;
181 :
182 2 : return events;
183 : }
184 :
185 1 : void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
186 : {
187 2 : lockdep_assert_held(&ctx->wqh.lock);
188 :
189 1 : *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
190 1 : ctx->count -= *cnt;
191 1 : }
192 : EXPORT_SYMBOL_GPL(eventfd_ctx_do_read);
193 :
194 : /**
195 : * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
196 : * @ctx: [in] Pointer to eventfd context.
197 : * @wait: [in] Wait queue to be removed.
198 : * @cnt: [out] Pointer to the 64-bit counter value.
199 : *
200 : * Returns %0 if successful, or the following error codes:
201 : *
202 : * -EAGAIN : The operation would have blocked.
203 : *
204 : * This is used to atomically remove a wait queue entry from the eventfd wait
205 : * queue head, and read/reset the counter value.
206 : */
207 0 : int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
208 : __u64 *cnt)
209 : {
210 0 : unsigned long flags;
211 :
212 0 : spin_lock_irqsave(&ctx->wqh.lock, flags);
213 0 : eventfd_ctx_do_read(ctx, cnt);
214 0 : __remove_wait_queue(&ctx->wqh, wait);
215 0 : if (*cnt != 0 && waitqueue_active(&ctx->wqh))
216 0 : wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
217 0 : spin_unlock_irqrestore(&ctx->wqh.lock, flags);
218 :
219 0 : return *cnt != 0 ? 0 : -EAGAIN;
220 : }
221 : EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
222 :
223 1 : static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
224 : {
225 1 : struct file *file = iocb->ki_filp;
226 1 : struct eventfd_ctx *ctx = file->private_data;
227 1 : __u64 ucnt = 0;
228 1 : DECLARE_WAITQUEUE(wait, current);
229 :
230 1 : if (iov_iter_count(to) < sizeof(ucnt))
231 : return -EINVAL;
232 1 : spin_lock_irq(&ctx->wqh.lock);
233 1 : if (!ctx->count) {
234 0 : if ((file->f_flags & O_NONBLOCK) ||
235 0 : (iocb->ki_flags & IOCB_NOWAIT)) {
236 0 : spin_unlock_irq(&ctx->wqh.lock);
237 0 : return -EAGAIN;
238 : }
239 0 : __add_wait_queue(&ctx->wqh, &wait);
240 0 : for (;;) {
241 0 : set_current_state(TASK_INTERRUPTIBLE);
242 0 : if (ctx->count)
243 : break;
244 0 : if (signal_pending(current)) {
245 0 : __remove_wait_queue(&ctx->wqh, &wait);
246 0 : __set_current_state(TASK_RUNNING);
247 0 : spin_unlock_irq(&ctx->wqh.lock);
248 0 : return -ERESTARTSYS;
249 : }
250 0 : spin_unlock_irq(&ctx->wqh.lock);
251 0 : schedule();
252 0 : spin_lock_irq(&ctx->wqh.lock);
253 : }
254 0 : __remove_wait_queue(&ctx->wqh, &wait);
255 0 : __set_current_state(TASK_RUNNING);
256 : }
257 1 : eventfd_ctx_do_read(ctx, &ucnt);
258 1 : if (waitqueue_active(&ctx->wqh))
259 0 : wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
260 1 : spin_unlock_irq(&ctx->wqh.lock);
261 1 : if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
262 0 : return -EFAULT;
263 :
264 : return sizeof(ucnt);
265 : }
266 :
267 2 : static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
268 : loff_t *ppos)
269 : {
270 2 : struct eventfd_ctx *ctx = file->private_data;
271 2 : ssize_t res;
272 2 : __u64 ucnt;
273 2 : DECLARE_WAITQUEUE(wait, current);
274 :
275 2 : if (count < sizeof(ucnt))
276 : return -EINVAL;
277 2 : if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
278 : return -EFAULT;
279 2 : if (ucnt == ULLONG_MAX)
280 : return -EINVAL;
281 2 : spin_lock_irq(&ctx->wqh.lock);
282 2 : res = -EAGAIN;
283 2 : if (ULLONG_MAX - ctx->count > ucnt)
284 : res = sizeof(ucnt);
285 0 : else if (!(file->f_flags & O_NONBLOCK)) {
286 0 : __add_wait_queue(&ctx->wqh, &wait);
287 0 : for (res = 0;;) {
288 0 : set_current_state(TASK_INTERRUPTIBLE);
289 0 : if (ULLONG_MAX - ctx->count > ucnt) {
290 : res = sizeof(ucnt);
291 : break;
292 : }
293 0 : if (signal_pending(current)) {
294 : res = -ERESTARTSYS;
295 : break;
296 : }
297 0 : spin_unlock_irq(&ctx->wqh.lock);
298 0 : schedule();
299 0 : spin_lock_irq(&ctx->wqh.lock);
300 : }
301 0 : __remove_wait_queue(&ctx->wqh, &wait);
302 0 : __set_current_state(TASK_RUNNING);
303 : }
304 2 : if (likely(res > 0)) {
305 2 : ctx->count += ucnt;
306 2 : if (waitqueue_active(&ctx->wqh))
307 1 : wake_up_locked_poll(&ctx->wqh, EPOLLIN);
308 : }
309 2 : spin_unlock_irq(&ctx->wqh.lock);
310 :
311 2 : return res;
312 : }
313 :
#ifdef CONFIG_PROC_FS
/*
 * ->show_fdinfo() handler: emits the current counter (hexadecimal) and the
 * context id. The counter is sampled under the wait-queue lock.
 */
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventfd_ctx *ctx = f->private_data;
	unsigned long long snapshot;

	spin_lock_irq(&ctx->wqh.lock);
	snapshot = (unsigned long long)ctx->count;
	spin_unlock_irq(&ctx->wqh.lock);

	seq_printf(m, "eventfd-count: %16llx\n", snapshot);
	seq_printf(m, "eventfd-id: %d\n", ctx->id);
}
#endif
326 :
327 : static const struct file_operations eventfd_fops = {
328 : #ifdef CONFIG_PROC_FS
329 : .show_fdinfo = eventfd_show_fdinfo,
330 : #endif
331 : .release = eventfd_release,
332 : .poll = eventfd_poll,
333 : .read_iter = eventfd_read,
334 : .write = eventfd_write,
335 : .llseek = noop_llseek,
336 : };
337 :
338 : /**
339 : * eventfd_fget - Acquire a reference of an eventfd file descriptor.
340 : * @fd: [in] Eventfd file descriptor.
341 : *
342 : * Returns a pointer to the eventfd file structure in case of success, or the
343 : * following error pointer:
344 : *
345 : * -EBADF : Invalid @fd file descriptor.
346 : * -EINVAL : The @fd file descriptor is not an eventfd file.
347 : */
348 0 : struct file *eventfd_fget(int fd)
349 : {
350 0 : struct file *file;
351 :
352 0 : file = fget(fd);
353 0 : if (!file)
354 0 : return ERR_PTR(-EBADF);
355 0 : if (file->f_op != &eventfd_fops) {
356 0 : fput(file);
357 0 : return ERR_PTR(-EINVAL);
358 : }
359 :
360 : return file;
361 : }
362 : EXPORT_SYMBOL_GPL(eventfd_fget);
363 :
364 : /**
365 : * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
366 : * @fd: [in] Eventfd file descriptor.
367 : *
368 : * Returns a pointer to the internal eventfd context, otherwise the error
369 : * pointers returned by the following functions:
370 : *
371 : * eventfd_fget
372 : */
373 0 : struct eventfd_ctx *eventfd_ctx_fdget(int fd)
374 : {
375 0 : struct eventfd_ctx *ctx;
376 0 : struct fd f = fdget(fd);
377 0 : if (!f.file)
378 0 : return ERR_PTR(-EBADF);
379 0 : ctx = eventfd_ctx_fileget(f.file);
380 0 : fdput(f);
381 0 : return ctx;
382 : }
383 : EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
384 :
385 : /**
386 : * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
387 : * @file: [in] Eventfd file pointer.
388 : *
389 : * Returns a pointer to the internal eventfd context, otherwise the error
390 : * pointer:
391 : *
392 : * -EINVAL : The @fd file descriptor is not an eventfd file.
393 : */
394 0 : struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
395 : {
396 0 : struct eventfd_ctx *ctx;
397 :
398 0 : if (file->f_op != &eventfd_fops)
399 0 : return ERR_PTR(-EINVAL);
400 :
401 0 : ctx = file->private_data;
402 0 : kref_get(&ctx->kref);
403 0 : return ctx;
404 : }
405 : EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
406 :
407 2 : static int do_eventfd(unsigned int count, int flags)
408 : {
409 2 : struct eventfd_ctx *ctx;
410 2 : struct file *file;
411 2 : int fd;
412 :
413 : /* Check the EFD_* constants for consistency. */
414 2 : BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
415 2 : BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
416 :
417 2 : if (flags & ~EFD_FLAGS_SET)
418 : return -EINVAL;
419 :
420 2 : ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
421 2 : if (!ctx)
422 : return -ENOMEM;
423 :
424 2 : kref_init(&ctx->kref);
425 2 : init_waitqueue_head(&ctx->wqh);
426 2 : ctx->count = count;
427 2 : ctx->flags = flags;
428 2 : ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
429 :
430 2 : flags &= EFD_SHARED_FCNTL_FLAGS;
431 2 : flags |= O_RDWR;
432 2 : fd = get_unused_fd_flags(flags);
433 2 : if (fd < 0)
434 0 : goto err;
435 :
436 2 : file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
437 2 : if (IS_ERR(file)) {
438 0 : put_unused_fd(fd);
439 0 : fd = PTR_ERR(file);
440 0 : goto err;
441 : }
442 :
443 2 : file->f_mode |= FMODE_NOWAIT;
444 2 : fd_install(fd, file);
445 2 : return fd;
446 0 : err:
447 0 : eventfd_free_ctx(ctx);
448 0 : return fd;
449 : }
450 :
451 4 : SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
452 : {
453 2 : return do_eventfd(count, flags);
454 : }
455 :
456 0 : SYSCALL_DEFINE1(eventfd, unsigned int, count)
457 : {
458 0 : return do_eventfd(count, 0);
459 : }
460 :
|