Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Performance events ring-buffer code:
4 : *
5 : * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
6 : * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
7 : * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
8 : * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
9 : */
10 :
11 : #include <linux/perf_event.h>
12 : #include <linux/vmalloc.h>
13 : #include <linux/slab.h>
14 : #include <linux/circ_buf.h>
15 : #include <linux/poll.h>
16 : #include <linux/nospec.h>
17 :
18 : #include "internal.h"
19 :
20 0 : static void perf_output_wakeup(struct perf_output_handle *handle)
21 : {
22 0 : atomic_set(&handle->rb->poll, EPOLLIN);
23 :
24 0 : handle->event->pending_wakeup = 1;
25 0 : irq_work_queue(&handle->event->pending);
26 0 : }
27 :
28 : /*
29 : * We need to ensure a later event_id doesn't publish a head when a former
30 : * event isn't done writing. However, since we need to deal with NMIs,
31 : * we cannot fully serialize things.
32 : *
33 : * We only publish the head (and generate a wakeup) when the outer-most
34 : * event completes.
35 : */
36 0 : static void perf_output_get_handle(struct perf_output_handle *handle)
37 : {
38 0 : struct perf_buffer *rb = handle->rb;
39 :
40 0 : preempt_disable();
41 :
42 : /*
43 : * Avoid an explicit LOAD/STORE such that architectures with memops
44 : * can use them.
45 : */
46 0 : (*(volatile unsigned int *)&rb->nest)++;
47 0 : handle->wakeup = local_read(&rb->wakeup);
48 0 : }
49 :
50 0 : static void perf_output_put_handle(struct perf_output_handle *handle)
51 : {
52 0 : struct perf_buffer *rb = handle->rb;
53 0 : unsigned long head;
54 0 : unsigned int nest;
55 :
56 : /*
57 : * If this isn't the outermost nesting, we don't have to update
58 : * @rb->user_page->data_head.
59 : */
60 0 : nest = READ_ONCE(rb->nest);
61 0 : if (nest > 1) {
62 0 : WRITE_ONCE(rb->nest, nest - 1);
63 0 : goto out;
64 : }
65 :
66 0 : again:
67 : /*
68 : * In order to avoid publishing a head value that goes backwards,
69 : * we must ensure the load of @rb->head happens after we've
70 : * incremented @rb->nest.
71 : *
72 : * Otherwise we can observe a @rb->head value before one published
73 : * by an IRQ/NMI happening between the load and the increment.
74 : */
75 0 : barrier();
76 0 : head = local_read(&rb->head);
77 :
78 : /*
79 : * IRQ/NMI can happen here and advance @rb->head, causing our
80 : * load above to be stale.
81 : */
82 :
83 : /*
84 : * Since the mmap() consumer (userspace) can run on a different CPU:
85 : *
86 : * kernel user
87 : *
88 : * if (LOAD ->data_tail) { LOAD ->data_head
89 : * (A) smp_rmb() (C)
90 : * STORE $data LOAD $data
91 : * smp_wmb() (B) smp_mb() (D)
92 : * STORE ->data_head STORE ->data_tail
93 : * }
94 : *
95 : * Where A pairs with D, and B pairs with C.
96 : *
97 : * In our case (A) is a control dependency that separates the load of
98 : * the ->data_tail and the stores of $data. In case ->data_tail
99 : * indicates there is no room in the buffer, we do not store $data.
100 : *
101 : * D needs to be a full barrier since it separates the data READ
102 : * from the tail WRITE.
103 : *
104 : * For B a WMB is sufficient since it separates two WRITEs, and for C
105 : * an RMB is sufficient since it separates two READs.
106 : *
107 : * See perf_output_begin().
108 : */
109 0 : smp_wmb(); /* B, matches C */
110 0 : WRITE_ONCE(rb->user_page->data_head, head);
111 :
112 : /*
113 : * We must publish the head before decrementing the nest count,
114 : * otherwise an IRQ/NMI can publish a more recent head value and our
115 : * write will (temporarily) publish a stale value.
116 : */
117 0 : barrier();
118 0 : WRITE_ONCE(rb->nest, 0);
119 :
120 : /*
121 : * Ensure we decrement @rb->nest before we validate the @rb->head.
122 : * Otherwise we cannot be sure we caught the 'last' nested update.
123 : */
124 0 : barrier();
125 0 : if (unlikely(head != local_read(&rb->head))) {
126 0 : WRITE_ONCE(rb->nest, 1);
127 0 : goto again;
128 : }
129 :
130 0 : if (handle->wakeup != local_read(&rb->wakeup))
131 0 : perf_output_wakeup(handle);
132 :
133 0 : out:
134 0 : preempt_enable();
135 0 : }
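
/*
 * Userspace counterpart of the pairing above -- an illustrative sketch, not
 * part of this file.  It assumes the uapi <linux/perf_event.h> layout of
 * struct perf_event_mmap_page and struct perf_event_header; the
 * example_consume() helper is hypothetical, and records that wrap past the
 * end of the data area are not handled here.  The acquire load stands in
 * for (C) and the release store for (D), which is also what the tools/perf
 * ring-buffer helpers do.
 */
void example_read_ring(struct perf_event_mmap_page *pg,
		       unsigned char *data, __u64 mask)
{
	__u64 head = __atomic_load_n(&pg->data_head, __ATOMIC_ACQUIRE); /* C */
	__u64 tail = pg->data_tail;

	while (tail != head) {
		struct perf_event_header *hdr =
			(struct perf_event_header *)(data + (tail & mask));

		example_consume(hdr);	/* hypothetical; may straddle the end */
		tail += hdr->size;
	}

	/* D: order the record reads above before publishing the new tail. */
	__atomic_store_n(&pg->data_tail, tail, __ATOMIC_RELEASE);
}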
136 :
137 : static __always_inline bool
138 0 : ring_buffer_has_space(unsigned long head, unsigned long tail,
139 : unsigned long data_size, unsigned int size,
140 : bool backward)
141 : {
142 0 : if (!backward)
143 0 : return CIRC_SPACE(head, tail, data_size) >= size;
144 : else
145 0 : return CIRC_SPACE(tail, head, data_size) >= size;
146 : }
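
/*
 * Worked example of the check above (illustrative numbers only): with a
 * 4096-byte data area, head == 4000 and tail == 128 give
 * CIRC_SPACE(4000, 128, 4096) == ((128 - 4001) & 4095) == 223, so a
 * 192-byte forward record fits while a 256-byte one does not.  A backward
 * writer swaps the arguments because it moves head towards tail from the
 * other direction.
 */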
147 :
148 : static __always_inline int
149 0 : __perf_output_begin(struct perf_output_handle *handle,
150 : struct perf_sample_data *data,
151 : struct perf_event *event, unsigned int size,
152 : bool backward)
153 : {
154 0 : struct perf_buffer *rb;
155 0 : unsigned long tail, offset, head;
156 0 : int have_lost, page_shift;
157 0 : struct {
158 : struct perf_event_header header;
159 : u64 id;
160 : u64 lost;
161 : } lost_event;
162 :
163 0 : rcu_read_lock();
164 : /*
165 : * For inherited events we send all the output towards the parent.
166 : */
167 0 : if (event->parent)
168 0 : event = event->parent;
169 :
170 0 : rb = rcu_dereference(event->rb);
171 0 : if (unlikely(!rb))
172 0 : goto out;
173 :
174 0 : if (unlikely(rb->paused)) {
175 0 : if (rb->nr_pages)
176 0 : local_inc(&rb->lost);
177 0 : goto out;
178 : }
179 :
180 0 : handle->rb = rb;
181 0 : handle->event = event;
182 :
183 0 : have_lost = local_read(&rb->lost);
184 0 : if (unlikely(have_lost)) {
185 0 : size += sizeof(lost_event);
186 0 : if (event->attr.sample_id_all)
187 0 : size += event->id_header_size;
188 : }
189 :
190 0 : perf_output_get_handle(handle);
191 :
192 0 : do {
193 0 : tail = READ_ONCE(rb->user_page->data_tail);
194 0 : offset = head = local_read(&rb->head);
195 0 : if (!rb->overwrite) {
196 0 : if (unlikely(!ring_buffer_has_space(head, tail,
197 : perf_data_size(rb),
198 : size, backward)))
199 0 : goto fail;
200 : }
201 :
202 : /*
203 : * The above forms a control dependency barrier separating the
204 : * @tail load above from the data stores below, since the @tail
205 : * load is required to compute the branch to fail below.
206 : *
207 : * A, matches D; the full memory barrier userspace SHOULD issue
208 : * after reading the data and before storing the new tail
209 : * position.
210 : *
211 : * See perf_output_put_handle().
212 : */
213 :
214 0 : if (!backward)
215 0 : head += size;
216 : else
217 0 : head -= size;
218 0 : } while (local_cmpxchg(&rb->head, offset, head) != offset);
219 :
220 0 : if (backward) {
221 0 : offset = head;
222 0 : head = (u64)(-head);
223 : }
224 :
225 : /*
226 : * We rely on the implied barrier() by local_cmpxchg() to ensure
227 : * none of the data stores below can be lifted up by the compiler.
228 : */
229 :
230 0 : if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
231 0 : local_add(rb->watermark, &rb->wakeup);
232 :
233 0 : page_shift = PAGE_SHIFT + page_order(rb);
234 :
235 0 : handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
236 0 : offset &= (1UL << page_shift) - 1;
237 0 : handle->addr = rb->data_pages[handle->page] + offset;
238 0 : handle->size = (1UL << page_shift) - offset;
239 :
240 0 : if (unlikely(have_lost)) {
241 0 : lost_event.header.size = sizeof(lost_event);
242 0 : lost_event.header.type = PERF_RECORD_LOST;
243 0 : lost_event.header.misc = 0;
244 0 : lost_event.id = event->id;
245 0 : lost_event.lost = local_xchg(&rb->lost, 0);
246 :
247 : /* XXX mostly redundant; @data is already fully initialized */
248 0 : perf_event_header__init_id(&lost_event.header, data, event);
249 0 : perf_output_put(handle, lost_event);
250 0 : perf_event__output_id_sample(event, handle, data);
251 : }
252 :
253 : return 0;
254 :
255 0 : fail:
256 0 : local_inc(&rb->lost);
257 0 : perf_output_put_handle(handle);
258 0 : out:
259 0 : rcu_read_unlock();
260 :
261 0 : return -ENOSPC;
262 : }
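
/*
 * Condensed sketch of a typical in-kernel producer, modelled on what
 * perf_event_output() in kernel/events/core.c does around this function;
 * it is not a verbatim copy, and the helper signatures shown match this
 * kernel version (they have changed in later kernels).
 */
static void example_emit_sample(struct perf_event *event,
				struct perf_sample_data *data,
				struct pt_regs *regs)
{
	struct perf_output_handle handle;
	struct perf_event_header header;

	perf_prepare_sample(&header, data, event, regs);

	/* Reserves header.size bytes, or bumps rb->lost and returns -ENOSPC. */
	if (perf_output_begin(&handle, data, event, header.size))
		return;

	perf_output_sample(&handle, &header, data, event);
	perf_output_end(&handle);
}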
263 :
264 0 : int perf_output_begin_forward(struct perf_output_handle *handle,
265 : struct perf_sample_data *data,
266 : struct perf_event *event, unsigned int size)
267 : {
268 0 : return __perf_output_begin(handle, data, event, size, false);
269 : }
270 :
271 0 : int perf_output_begin_backward(struct perf_output_handle *handle,
272 : struct perf_sample_data *data,
273 : struct perf_event *event, unsigned int size)
274 : {
275 0 : return __perf_output_begin(handle, data, event, size, true);
276 : }
277 :
278 0 : int perf_output_begin(struct perf_output_handle *handle,
279 : struct perf_sample_data *data,
280 : struct perf_event *event, unsigned int size)
281 : {
282 :
283 0 : return __perf_output_begin(handle, data, event, size,
284 0 : unlikely(is_write_backward(event)));
285 : }
286 :
287 0 : unsigned int perf_output_copy(struct perf_output_handle *handle,
288 : const void *buf, unsigned int len)
289 : {
290 0 : return __output_copy(handle, buf, len);
291 : }
292 :
293 0 : unsigned int perf_output_skip(struct perf_output_handle *handle,
294 : unsigned int len)
295 : {
296 0 : return __output_skip(handle, NULL, len);
297 : }
298 :
299 0 : void perf_output_end(struct perf_output_handle *handle)
300 : {
301 0 : perf_output_put_handle(handle);
302 0 : rcu_read_unlock();
303 0 : }
304 :
305 : static void
306 0 : ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
307 : {
308 0 : long max_size = perf_data_size(rb);
309 :
310 0 : if (watermark)
311 0 : rb->watermark = min(max_size, watermark);
312 :
313 0 : if (!rb->watermark)
314 0 : rb->watermark = max_size / 2;
315 :
316 0 : if (flags & RING_BUFFER_WRITABLE)
317 0 : rb->overwrite = 0;
318 : else
319 0 : rb->overwrite = 1;
320 :
321 0 : refcount_set(&rb->refcount, 1);
322 :
323 0 : INIT_LIST_HEAD(&rb->event_list);
324 0 : spin_lock_init(&rb->event_lock);
325 :
326 : /*
327 : * perf_output_begin() only checks rb->paused, therefore
328 : * rb->paused must be true if we have no pages for output.
329 : */
330 0 : if (!rb->nr_pages)
331 0 : rb->paused = 1;
332 0 : }
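
/*
 * Illustration (assuming 4KiB pages): a 16-page data buffer with no
 * explicit attr.wakeup_watermark ends up with rb->watermark == 32KiB,
 * i.e. half of max_size, so __perf_output_begin() arms a wakeup roughly
 * every 32KiB of output; an explicit watermark is simply clamped to the
 * buffer size.
 */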
333 :
334 0 : void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
335 : {
336 : /*
337 : * OVERWRITE is determined by perf_aux_output_end() and can't
338 : * be passed in directly.
339 : */
340 0 : if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE))
341 : return;
342 :
343 0 : handle->aux_flags |= flags;
344 : }
345 : EXPORT_SYMBOL_GPL(perf_aux_output_flag);
346 :
347 : /*
348 : * This is called before hardware starts writing to the AUX area to
349 : * obtain an output handle and make sure there's room in the buffer.
350 : * When the capture completes, call perf_aux_output_end() to commit
351 : * the recorded data to the buffer.
352 : *
353 : * The ordering is similar to that of perf_output_{begin,end}, with
354 : * the exception of (B), which should be taken care of by the pmu
355 : * driver, since ordering rules will differ depending on hardware.
356 : *
357 : * Call this from pmu::start(); see the comment in perf_aux_output_end()
358 : * about its use in pmu callbacks. Both can also be called from the PMI
359 : * handler if needed.
360 : */
361 0 : void *perf_aux_output_begin(struct perf_output_handle *handle,
362 : struct perf_event *event)
363 : {
364 0 : struct perf_event *output_event = event;
365 0 : unsigned long aux_head, aux_tail;
366 0 : struct perf_buffer *rb;
367 0 : unsigned int nest;
368 :
369 0 : if (output_event->parent)
370 0 : output_event = output_event->parent;
371 :
372 : /*
373 : * Since this will typically be open across pmu::add/pmu::del, we
374 : * grab ring_buffer's refcount instead of holding rcu read lock
375 : * to make sure it doesn't disappear under us.
376 : */
377 0 : rb = ring_buffer_get(output_event);
378 0 : if (!rb)
379 : return NULL;
380 :
381 0 : if (!rb_has_aux(rb))
382 0 : goto err;
383 :
384 : /*
385 : * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(),
386 : * about to get freed, so we leave immediately.
387 : *
388 : * Checking rb::aux_mmap_count and rb::refcount has to be done in
389 : * the same order, see perf_mmap_close. Otherwise we end up freeing
390 : * aux pages in this path, which is a bug, because in_atomic().
391 : */
392 0 : if (!atomic_read(&rb->aux_mmap_count))
393 0 : goto err;
394 :
395 0 : if (!refcount_inc_not_zero(&rb->aux_refcount))
396 0 : goto err;
397 :
398 0 : nest = READ_ONCE(rb->aux_nest);
399 : /*
400 : * Nesting is not supported for AUX area, make sure nested
401 : * writers are caught early
402 : */
403 0 : if (WARN_ON_ONCE(nest))
404 0 : goto err_put;
405 :
406 0 : WRITE_ONCE(rb->aux_nest, nest + 1);
407 :
408 0 : aux_head = rb->aux_head;
409 :
410 0 : handle->rb = rb;
411 0 : handle->event = event;
412 0 : handle->head = aux_head;
413 0 : handle->size = 0;
414 0 : handle->aux_flags = 0;
415 :
416 : /*
417 : * In overwrite mode, AUX data stores do not depend on aux_tail,
418 : * therefore (A) control dependency barrier does not exist. The
419 : * (B) <-> (C) ordering is still observed by the pmu driver.
420 : */
421 0 : if (!rb->aux_overwrite) {
422 0 : aux_tail = READ_ONCE(rb->user_page->aux_tail);
423 0 : handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
424 0 : if (aux_head - aux_tail < perf_aux_size(rb))
425 0 : handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
426 :
427 : /*
428 : * The handle->size computation depends on the aux_tail load; this forms
429 : * a control dependency barrier separating the aux_tail load from the
430 : * aux data stores that are enabled on successful return.
431 : */
432 0 : if (!handle->size) { /* A, matches D */
433 0 : event->pending_disable = smp_processor_id();
434 0 : perf_output_wakeup(handle);
435 0 : WRITE_ONCE(rb->aux_nest, 0);
436 0 : goto err_put;
437 : }
438 : }
439 :
440 0 : return handle->rb->aux_priv;
441 :
442 0 : err_put:
443 : /* can't be last */
444 0 : rb_free_aux(rb);
445 :
446 0 : err:
447 0 : ring_buffer_put(rb);
448 0 : handle->event = NULL;
449 :
450 0 : return NULL;
451 : }
452 : EXPORT_SYMBOL_GPL(perf_aux_output_begin);
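
/*
 * Sketch of a pmu::start() path using the handle, loosely modelled on
 * existing AUX-capable drivers; everything prefixed example_ is
 * hypothetical, including the per-CPU handle and the hardware helper.
 */
static DEFINE_PER_CPU(struct perf_output_handle, example_handle);

static void example_pmu_start(struct perf_event *event, int flags)
{
	struct perf_output_handle *handle = this_cpu_ptr(&example_handle);
	void *buf;

	buf = perf_aux_output_begin(handle, event);
	if (!buf)
		return;	/* no AUX buffer, no space, or the buffer is going away */

	/*
	 * buf is the pmu::setup_aux() private data; handle->head and
	 * handle->size bound where the hardware may write next.
	 */
	example_hw_enable(buf, handle->head, handle->size);	/* hypothetical */
}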
453 :
454 0 : static __always_inline bool rb_need_aux_wakeup(struct perf_buffer *rb)
455 : {
456 0 : if (rb->aux_overwrite)
457 : return false;
458 :
459 0 : if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
460 0 : rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
461 0 : return true;
462 : }
463 :
464 : return false;
465 : }
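
/*
 * Example with illustrative numbers: aux_watermark == 64KiB and
 * aux_wakeup == 128KiB.  Once aux_head reaches 200KiB, head - wakeup ==
 * 72KiB >= watermark, so the caller wakes userspace and aux_wakeup is
 * rounded down to 192KiB; the next wakeup is then due at 256KiB.
 */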
466 :
467 : /*
468 : * Commit the data written by hardware into the ring buffer by adjusting
469 : * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
470 : * pmu driver's responsibility to observe ordering rules of the hardware,
471 : * so that all the data is externally visible before this is called.
472 : *
473 : * Note: this has to be called from pmu::stop() callback, as the assumption
474 : * of the AUX buffer management code is that after pmu::stop(), the AUX
475 : * transaction must be stopped and therefore drop the AUX reference count.
476 : */
477 0 : void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
478 : {
479 0 : bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
480 0 : struct perf_buffer *rb = handle->rb;
481 0 : unsigned long aux_head;
482 :
483 : /* in overwrite mode, driver provides aux_head via handle */
484 0 : if (rb->aux_overwrite) {
485 0 : handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
486 :
487 0 : aux_head = handle->head;
488 0 : rb->aux_head = aux_head;
489 : } else {
490 0 : handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
491 :
492 0 : aux_head = rb->aux_head;
493 0 : rb->aux_head += size;
494 : }
495 :
496 : /*
497 : * Only send RECORD_AUX if we have something useful to communicate
498 : *
499 : * Note: the OVERWRITE records by themselves are not considered
500 : * useful, as they don't communicate any *new* information,
501 : * aside from the short-lived offset, that becomes history at
502 : * the next event sched-in and therefore isn't useful.
503 : * The userspace that needs to copy out AUX data in overwrite
504 : * mode should know to use user_page::aux_head for the actual
505 : * offset. So, from now on we don't output AUX records that
506 : * have *only* OVERWRITE flag set.
507 : */
508 0 : if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE))
509 0 : perf_event_aux_event(handle->event, aux_head, size,
510 : handle->aux_flags);
511 :
512 0 : WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
513 0 : if (rb_need_aux_wakeup(rb))
514 0 : wakeup = true;
515 :
516 0 : if (wakeup) {
517 0 : if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
518 0 : handle->event->pending_disable = smp_processor_id();
519 0 : perf_output_wakeup(handle);
520 : }
521 :
522 0 : handle->event = NULL;
523 :
524 0 : WRITE_ONCE(rb->aux_nest, 0);
525 : /* can't be last */
526 0 : rb_free_aux(rb);
527 0 : ring_buffer_put(rb);
528 0 : }
529 : EXPORT_SYMBOL_GPL(perf_aux_output_end);
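
/*
 * Matching pmu::stop() sketch for the example above (example_* names are
 * hypothetical): the driver reads back how much the hardware produced,
 * flags truncation if it exhausted the space handed out by
 * perf_aux_output_begin(), and commits the data.
 */
static void example_pmu_stop(struct perf_event *event, int flags)
{
	struct perf_output_handle *handle = this_cpu_ptr(&example_handle);
	unsigned long written;

	if (!handle->event)	/* pmu::start() failed to get a handle */
		return;

	written = example_hw_disable();			/* hypothetical */
	if (written >= handle->size)
		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);

	/* Advances aux_head, may emit PERF_RECORD_AUX, drops aux_refcount. */
	perf_aux_output_end(handle, written);
}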
530 :
531 : /*
532 : * Skip over a given number of bytes in the AUX buffer, due to, for example,
533 : * hardware's alignment constraints.
534 : */
535 0 : int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
536 : {
537 0 : struct perf_buffer *rb = handle->rb;
538 :
539 0 : if (size > handle->size)
540 : return -ENOSPC;
541 :
542 0 : rb->aux_head += size;
543 :
544 0 : WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
545 0 : if (rb_need_aux_wakeup(rb)) {
546 0 : perf_output_wakeup(handle);
547 0 : handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
548 : }
549 :
550 0 : handle->head = rb->aux_head;
551 0 : handle->size -= size;
552 :
553 0 : return 0;
554 : }
555 : EXPORT_SYMBOL_GPL(perf_aux_output_skip);
556 :
557 0 : void *perf_get_aux(struct perf_output_handle *handle)
558 : {
559 : /* this is only valid between perf_aux_output_begin and *_end */
560 0 : if (!handle->event)
561 : return NULL;
562 :
563 0 : return handle->rb->aux_priv;
564 : }
565 : EXPORT_SYMBOL_GPL(perf_get_aux);
566 :
567 : /*
568 : * Copy out AUX data from an AUX handle.
569 : */
570 0 : long perf_output_copy_aux(struct perf_output_handle *aux_handle,
571 : struct perf_output_handle *handle,
572 : unsigned long from, unsigned long to)
573 : {
574 0 : struct perf_buffer *rb = aux_handle->rb;
575 0 : unsigned long tocopy, remainder, len = 0;
576 0 : void *addr;
577 :
578 0 : from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
579 0 : to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
580 :
581 0 : do {
582 0 : tocopy = PAGE_SIZE - offset_in_page(from);
583 0 : if (to > from)
584 0 : tocopy = min(tocopy, to - from);
585 0 : if (!tocopy)
586 : break;
587 :
588 0 : addr = rb->aux_pages[from >> PAGE_SHIFT];
589 0 : addr += offset_in_page(from);
590 :
591 0 : remainder = perf_output_copy(handle, addr, tocopy);
592 0 : if (remainder)
593 : return -EFAULT;
594 :
595 0 : len += tocopy;
596 0 : from += tocopy;
597 0 : from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
598 0 : } while (to != from);
599 :
600 0 : return len;
601 : }
602 :
603 : #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
604 :
605 0 : static struct page *rb_alloc_aux_page(int node, int order)
606 : {
607 0 : struct page *page;
608 :
609 0 : if (order > MAX_ORDER)
610 : order = MAX_ORDER;
611 :
612 0 : do {
613 0 : page = alloc_pages_node(node, PERF_AUX_GFP, order);
614 0 : } while (!page && order--);
615 :
616 0 : if (page && order) {
617 : /*
618 : * Communicate the allocation size to the driver:
619 : * if we managed to secure a high-order allocation,
620 : * set its first page's private to this order;
621 : * !PagePrivate(page) means it's just a normal page.
622 : */
623 0 : split_page(page, order);
624 0 : SetPagePrivate(page);
625 0 : set_page_private(page, order);
626 : }
627 :
628 0 : return page;
629 : }
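
/*
 * Consumer side of the PagePrivate() convention above -- a sketch of what
 * an SG-capable driver might do while walking rb->aux_pages (the
 * example_chunk_pages() name is hypothetical):
 */
static unsigned int example_chunk_pages(struct page *page)
{
	/* A high-order chunk advertises its size; a plain page is order 0. */
	return 1U << (PagePrivate(page) ? page_private(page) : 0);
}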
630 :
631 0 : static void rb_free_aux_page(struct perf_buffer *rb, int idx)
632 : {
633 0 : struct page *page = virt_to_page(rb->aux_pages[idx]);
634 :
635 0 : ClearPagePrivate(page);
636 0 : page->mapping = NULL;
637 0 : __free_page(page);
638 0 : }
639 :
640 0 : static void __rb_free_aux(struct perf_buffer *rb)
641 : {
642 0 : int pg;
643 :
644 : /*
645 : * This should never happen; the last reference should be dropped from
646 : * the perf_mmap_close() path, which first stops aux transactions (which
647 : * in turn are the atomic holders of aux_refcount) and then does the
648 : * last rb_free_aux().
649 : */
650 0 : WARN_ON_ONCE(in_atomic());
651 :
652 0 : if (rb->aux_priv) {
653 0 : rb->free_aux(rb->aux_priv);
654 0 : rb->free_aux = NULL;
655 0 : rb->aux_priv = NULL;
656 : }
657 :
658 0 : if (rb->aux_nr_pages) {
659 0 : for (pg = 0; pg < rb->aux_nr_pages; pg++)
660 0 : rb_free_aux_page(rb, pg);
661 :
662 0 : kfree(rb->aux_pages);
663 0 : rb->aux_nr_pages = 0;
664 : }
665 0 : }
666 :
667 0 : int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
668 : pgoff_t pgoff, int nr_pages, long watermark, int flags)
669 : {
670 0 : bool overwrite = !(flags & RING_BUFFER_WRITABLE);
671 0 : int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
672 0 : int ret = -ENOMEM, max_order;
673 :
674 0 : if (!has_aux(event))
675 : return -EOPNOTSUPP;
676 :
677 : /*
678 : * We need to start with the max_order that fits in nr_pages,
679 : * not the other way around, hence ilog2() and not get_order.
680 : */
681 0 : max_order = ilog2(nr_pages);
682 :
683 : /*
684 : * The PMU requests more than one contiguous chunk of memory
685 : * for SW double buffering.
686 : */
687 0 : if (!overwrite) {
688 0 : if (!max_order)
689 : return -EINVAL;
690 :
691 0 : max_order--;
692 : }
693 :
694 0 : rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
695 : node);
696 0 : if (!rb->aux_pages)
697 : return -ENOMEM;
698 :
699 0 : rb->free_aux = event->pmu->free_aux;
700 0 : for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
701 0 : struct page *page;
702 0 : int last, order;
703 :
704 0 : order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
705 0 : page = rb_alloc_aux_page(node, order);
706 0 : if (!page)
707 0 : goto out;
708 :
709 0 : for (last = rb->aux_nr_pages + (1 << page_private(page));
710 0 : last > rb->aux_nr_pages; rb->aux_nr_pages++)
711 0 : rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
712 : }
713 :
714 : /*
715 : * In overwrite mode, PMUs that don't support SG may not handle more
716 : * than one contiguous allocation, since they rely on PMI to do double
717 : * buffering. In this case, the entire buffer has to be one contiguous
718 : * chunk.
719 : */
720 0 : if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) &&
721 : overwrite) {
722 0 : struct page *page = virt_to_page(rb->aux_pages[0]);
723 :
724 0 : if (page_private(page) != max_order)
725 0 : goto out;
726 : }
727 :
728 0 : rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages,
729 : overwrite);
730 0 : if (!rb->aux_priv)
731 0 : goto out;
732 :
733 0 : ret = 0;
734 :
735 : /*
736 : * aux_pages (and the pmu driver's private data, aux_priv) will be
737 : * referenced in both the producer's and the consumer's contexts, so
738 : * we keep a refcount here to make sure either of the two can
739 : * reference them safely.
740 : */
741 0 : refcount_set(&rb->aux_refcount, 1);
742 :
743 0 : rb->aux_overwrite = overwrite;
744 0 : rb->aux_watermark = watermark;
745 :
746 0 : if (!rb->aux_watermark && !rb->aux_overwrite)
747 0 : rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1);
748 :
749 0 : out:
750 0 : if (!ret)
751 0 : rb->aux_pgoff = pgoff;
752 : else
753 0 : __rb_free_aux(rb);
754 :
755 : return ret;
756 : }
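
/*
 * Worked example (illustrative only, assuming 4KiB pages): nr_pages == 16
 * with a writable (non-overwrite) mapping gives max_order == ilog2(16) - 1
 * == 3, so the 64KiB area is built from chunks of at most 8 pages; with no
 * explicit watermark, aux_watermark defaults to
 * nr_pages << (PAGE_SHIFT - 1), i.e. half the AUX buffer.
 */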
757 :
758 0 : void rb_free_aux(struct perf_buffer *rb)
759 : {
760 0 : if (refcount_dec_and_test(&rb->aux_refcount))
761 0 : __rb_free_aux(rb);
762 0 : }
763 :
764 : #ifndef CONFIG_PERF_USE_VMALLOC
765 :
766 : /*
767 : * Back perf_mmap() with regular GFP_KERNEL-0 pages.
768 : */
769 :
770 : static struct page *
771 0 : __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
772 : {
773 0 : if (pgoff > rb->nr_pages)
774 : return NULL;
775 :
776 0 : if (pgoff == 0)
777 0 : return virt_to_page(rb->user_page);
778 :
779 0 : return virt_to_page(rb->data_pages[pgoff - 1]);
780 : }
781 :
782 0 : static void *perf_mmap_alloc_page(int cpu)
783 : {
784 0 : struct page *page;
785 0 : int node;
786 :
787 0 : node = (cpu == -1) ? cpu : cpu_to_node(cpu);
788 0 : page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
789 0 : if (!page)
790 : return NULL;
791 :
792 0 : return page_address(page);
793 : }
794 :
795 0 : static void perf_mmap_free_page(void *addr)
796 : {
797 0 : struct page *page = virt_to_page(addr);
798 :
799 0 : page->mapping = NULL;
800 0 : __free_page(page);
801 0 : }
802 :
803 0 : struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
804 : {
805 0 : struct perf_buffer *rb;
806 0 : unsigned long size;
807 0 : int i;
808 :
809 0 : size = sizeof(struct perf_buffer);
810 0 : size += nr_pages * sizeof(void *);
811 :
812 0 : if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
813 0 : goto fail;
814 :
815 0 : rb = kzalloc(size, GFP_KERNEL);
816 0 : if (!rb)
817 0 : goto fail;
818 :
819 0 : rb->user_page = perf_mmap_alloc_page(cpu);
820 0 : if (!rb->user_page)
821 0 : goto fail_user_page;
822 :
823 0 : for (i = 0; i < nr_pages; i++) {
824 0 : rb->data_pages[i] = perf_mmap_alloc_page(cpu);
825 0 : if (!rb->data_pages[i])
826 0 : goto fail_data_pages;
827 : }
828 :
829 0 : rb->nr_pages = nr_pages;
830 :
831 0 : ring_buffer_init(rb, watermark, flags);
832 :
833 0 : return rb;
834 :
835 0 : fail_data_pages:
836 0 : for (i--; i >= 0; i--)
837 0 : perf_mmap_free_page(rb->data_pages[i]);
838 :
839 0 : perf_mmap_free_page(rb->user_page);
840 :
841 0 : fail_user_page:
842 0 : kfree(rb);
843 :
844 : fail:
845 : return NULL;
846 : }
847 :
848 0 : void rb_free(struct perf_buffer *rb)
849 : {
850 0 : int i;
851 :
852 0 : perf_mmap_free_page(rb->user_page);
853 0 : for (i = 0; i < rb->nr_pages; i++)
854 0 : perf_mmap_free_page(rb->data_pages[i]);
855 0 : kfree(rb);
856 0 : }
857 :
858 : #else
859 : static int data_page_nr(struct perf_buffer *rb)
860 : {
861 : return rb->nr_pages << page_order(rb);
862 : }
863 :
864 : static struct page *
865 : __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
866 : {
867 : /* The '>' counts in the user page. */
868 : if (pgoff > data_page_nr(rb))
869 : return NULL;
870 :
871 : return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
872 : }
873 :
874 : static void perf_mmap_unmark_page(void *addr)
875 : {
876 : struct page *page = vmalloc_to_page(addr);
877 :
878 : page->mapping = NULL;
879 : }
880 :
881 : static void rb_free_work(struct work_struct *work)
882 : {
883 : struct perf_buffer *rb;
884 : void *base;
885 : int i, nr;
886 :
887 : rb = container_of(work, struct perf_buffer, work);
888 : nr = data_page_nr(rb);
889 :
890 : base = rb->user_page;
891 : /* The '<=' counts in the user page. */
892 : for (i = 0; i <= nr; i++)
893 : perf_mmap_unmark_page(base + (i * PAGE_SIZE));
894 :
895 : vfree(base);
896 : kfree(rb);
897 : }
898 :
899 : void rb_free(struct perf_buffer *rb)
900 : {
901 : schedule_work(&rb->work);
902 : }
903 :
904 : struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
905 : {
906 : struct perf_buffer *rb;
907 : unsigned long size;
908 : void *all_buf;
909 :
910 : size = sizeof(struct perf_buffer);
911 : size += sizeof(void *);
912 :
913 : rb = kzalloc(size, GFP_KERNEL);
914 : if (!rb)
915 : goto fail;
916 :
917 : INIT_WORK(&rb->work, rb_free_work);
918 :
919 : all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
920 : if (!all_buf)
921 : goto fail_all_buf;
922 :
923 : rb->user_page = all_buf;
924 : rb->data_pages[0] = all_buf + PAGE_SIZE;
925 : if (nr_pages) {
926 : rb->nr_pages = 1;
927 : rb->page_order = ilog2(nr_pages);
928 : }
929 :
930 : ring_buffer_init(rb, watermark, flags);
931 :
932 : return rb;
933 :
934 : fail_all_buf:
935 : kfree(rb);
936 :
937 : fail:
938 : return NULL;
939 : }
940 :
941 : #endif
942 :
943 : struct page *
944 0 : perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
945 : {
946 0 : if (rb->aux_nr_pages) {
947 : /* above AUX space */
948 0 : if (pgoff > rb->aux_pgoff + rb->aux_nr_pages)
949 : return NULL;
950 :
951 : /* AUX space */
952 0 : if (pgoff >= rb->aux_pgoff) {
953 0 : int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages);
954 0 : return virt_to_page(rb->aux_pages[aux_pgoff]);
955 : }
956 : }
957 :
958 0 : return __perf_mmap_to_page(rb, pgoff);
959 : }
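
/*
 * Layout recap for the pgoff arithmetic above, assuming the usual mapping
 * established by perf_mmap(): page 0 is the user/control page, the next
 * rb->nr_pages pages are the data area, and the AUX area occupies
 * rb->aux_nr_pages pages starting at rb->aux_pgoff.  array_index_nospec()
 * clamps the AUX index so it cannot be used speculatively out of bounds.
 */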