LCOV - code coverage report
Current view: kernel/trace/ring_buffer.c
Test: landlock.info
Date: 2021-04-22 12:43:58
                Hit    Total    Coverage
Lines:          157     1848       8.5 %
Functions:       10      110       9.1 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Generic ring buffer
       4             :  *
       5             :  * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
       6             :  */
       7             : #include <linux/trace_recursion.h>
       8             : #include <linux/trace_events.h>
       9             : #include <linux/ring_buffer.h>
      10             : #include <linux/trace_clock.h>
      11             : #include <linux/sched/clock.h>
      12             : #include <linux/trace_seq.h>
      13             : #include <linux/spinlock.h>
      14             : #include <linux/irq_work.h>
      15             : #include <linux/security.h>
      16             : #include <linux/uaccess.h>
      17             : #include <linux/hardirq.h>
      18             : #include <linux/kthread.h>        /* for self test */
      19             : #include <linux/module.h>
      20             : #include <linux/percpu.h>
      21             : #include <linux/mutex.h>
      22             : #include <linux/delay.h>
      23             : #include <linux/slab.h>
      24             : #include <linux/init.h>
      25             : #include <linux/hash.h>
      26             : #include <linux/list.h>
      27             : #include <linux/cpu.h>
      28             : #include <linux/oom.h>
      29             : 
      30             : #include <asm/local.h>
      31             : 
      32             : static void update_pages_handler(struct work_struct *work);
      33             : 
      34             : /*
       35             :  * The ring buffer header is special. We must manually keep it up to date.
      36             :  */
      37           0 : int ring_buffer_print_entry_header(struct trace_seq *s)
      38             : {
      39           0 :         trace_seq_puts(s, "# compressed entry header\n");
      40           0 :         trace_seq_puts(s, "\ttype_len    :    5 bits\n");
      41           0 :         trace_seq_puts(s, "\ttime_delta  :   27 bits\n");
      42           0 :         trace_seq_puts(s, "\tarray       :   32 bits\n");
      43           0 :         trace_seq_putc(s, '\n');
      44           0 :         trace_seq_printf(s, "\tpadding     : type == %d\n",
      45             :                          RINGBUF_TYPE_PADDING);
      46           0 :         trace_seq_printf(s, "\ttime_extend : type == %d\n",
      47             :                          RINGBUF_TYPE_TIME_EXTEND);
      48           0 :         trace_seq_printf(s, "\ttime_stamp : type == %d\n",
      49             :                          RINGBUF_TYPE_TIME_STAMP);
      50           0 :         trace_seq_printf(s, "\tdata max type_len  == %d\n",
      51             :                          RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
      52             : 
      53           0 :         return !trace_seq_has_overflowed(s);
      54             : }
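
/*
 * A standalone sketch of the compressed entry header printed above
 * (illustration only, assuming the layout it describes: a 5 bit type_len
 * and a 27 bit time_delta packed into one 32 bit word, followed by
 * 32 bit array words). struct compressed_header and its field values
 * are made up for the example.
 */
#include <stdint.h>
#include <stdio.h>

struct compressed_header {
	uint32_t type_len   : 5;	/* 0: length stored in array[0]; else payload = type_len * 4 bytes */
	uint32_t time_delta : 27;	/* time delta from the previous event */
};

int main(void)
{
	struct compressed_header h = { .type_len = 3, .time_delta = 1000 };

	/* a non-zero type_len up to the data max encodes the payload size directly */
	printf("payload bytes = %u, delta = %u\n", h.type_len * 4u, h.time_delta);
	return 0;
}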
      55             : 
      56             : /*
      57             :  * The ring buffer is made up of a list of pages. A separate list of pages is
      58             :  * allocated for each CPU. A writer may only write to a buffer that is
      59             :  * associated with the CPU it is currently executing on.  A reader may read
      60             :  * from any per cpu buffer.
      61             :  *
      62             :  * The reader is special. For each per cpu buffer, the reader has its own
      63             :  * reader page. When a reader has read the entire reader page, this reader
      64             :  * page is swapped with another page in the ring buffer.
      65             :  *
       66             :  * Now, as long as the writer is off the reader page, the reader can do
       67             :  * whatever it wants with that page. The writer will never write to that page
      68             :  * again (as long as it is out of the ring buffer).
      69             :  *
      70             :  * Here's some silly ASCII art.
      71             :  *
      72             :  *   +------+
      73             :  *   |reader|          RING BUFFER
      74             :  *   |page  |
      75             :  *   +------+        +---+   +---+   +---+
      76             :  *                   |   |-->|   |-->|   |
      77             :  *                   +---+   +---+   +---+
      78             :  *                     ^               |
      79             :  *                     |               |
      80             :  *                     +---------------+
      81             :  *
      82             :  *
      83             :  *   +------+
      84             :  *   |reader|          RING BUFFER
      85             :  *   |page  |------------------v
      86             :  *   +------+        +---+   +---+   +---+
      87             :  *                   |   |-->|   |-->|   |
      88             :  *                   +---+   +---+   +---+
      89             :  *                     ^               |
      90             :  *                     |               |
      91             :  *                     +---------------+
      92             :  *
      93             :  *
      94             :  *   +------+
      95             :  *   |reader|          RING BUFFER
      96             :  *   |page  |------------------v
      97             :  *   +------+        +---+   +---+   +---+
      98             :  *      ^            |   |-->|   |-->|   |
      99             :  *      |            +---+   +---+   +---+
     100             :  *      |                              |
     101             :  *      |                              |
     102             :  *      +------------------------------+
     103             :  *
     104             :  *
     105             :  *   +------+
     106             :  *   |buffer|          RING BUFFER
     107             :  *   |page  |------------------v
     108             :  *   +------+        +---+   +---+   +---+
     109             :  *      ^            |   |   |   |-->|   |
     110             :  *      |   New      +---+   +---+   +---+
     111             :  *      |  Reader------^               |
     112             :  *      |   page                       |
     113             :  *      +------------------------------+
     114             :  *
     115             :  *
     116             :  * After we make this swap, the reader can hand this page off to the splice
     117             :  * code and be done with it. It can even allocate a new page if it needs to
     118             :  * and swap that into the ring buffer.
     119             :  *
     120             :  * We will be using cmpxchg soon to make all this lockless.
     121             :  *
     122             :  */
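
/*
 * A minimal user-space model of the swap shown in the ASCII art above
 * (an illustration only, not the kernel's implementation; all names
 * below are invented). A detached reader page is spliced into the ring
 * in place of the current head page, and the old head page becomes the
 * reader's private page.
 */
#include <stdio.h>

struct page {
	struct page *next;
	const char *name;
};

/* Splice @reader in where the head page sits and hand the old head back. */
static struct page *swap_reader_page(struct page **head, struct page *reader)
{
	struct page *old_head = *head;
	struct page *prev = old_head;

	/* find the page whose ->next points at the head */
	while (prev->next != old_head)
		prev = prev->next;

	reader->next = old_head->next;	/* reader takes over the head's link */
	prev->next = reader;		/* ring now bypasses the old head */

	old_head->next = NULL;		/* old head is private to the reader now */
	*head = reader->next;		/* reading continues after the swapped-in page */
	return old_head;
}

int main(void)
{
	struct page a = { .name = "A" }, b = { .name = "B" }, c = { .name = "C" };
	struct page reader = { .name = "R" };
	struct page *head = &a;

	a.next = &b; b.next = &c; c.next = &a;	/* a three page ring */

	struct page *mine = swap_reader_page(&head, &reader);
	printf("reader owns page %s, new head is %s\n", mine->name, head->name);
	return 0;
}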
     123             : 
     124             : /* Used for individual buffers (after the counter) */
     125             : #define RB_BUFFER_OFF           (1 << 20)
     126             : 
     127             : #define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
     128             : 
     129             : #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
     130             : #define RB_ALIGNMENT            4U
     131             : #define RB_MAX_SMALL_DATA       (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
     132             : #define RB_EVNT_MIN_SIZE        8U      /* two 32bit words */
     133             : 
     134             : #ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
     135             : # define RB_FORCE_8BYTE_ALIGNMENT       0
     136             : # define RB_ARCH_ALIGNMENT              RB_ALIGNMENT
     137             : #else
     138             : # define RB_FORCE_8BYTE_ALIGNMENT       1
     139             : # define RB_ARCH_ALIGNMENT              8U
     140             : #endif
     141             : 
     142             : #define RB_ALIGN_DATA           __aligned(RB_ARCH_ALIGNMENT)
     143             : 
     144             : /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
     145             : #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
     146             : 
     147             : enum {
     148             :         RB_LEN_TIME_EXTEND = 8,
     149             :         RB_LEN_TIME_STAMP =  8,
     150             : };
     151             : 
     152             : #define skip_time_extend(event) \
     153             :         ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
     154             : 
     155             : #define extended_time(event) \
     156             :         (event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
     157             : 
     158           0 : static inline int rb_null_event(struct ring_buffer_event *event)
     159             : {
     160           0 :         return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
     161             : }
     162             : 
     163           0 : static void rb_event_set_padding(struct ring_buffer_event *event)
     164             : {
     165             :         /* padding has a NULL time_delta */
     166           0 :         event->type_len = RINGBUF_TYPE_PADDING;
     167           0 :         event->time_delta = 0;
     168             : }
     169             : 
     170             : static unsigned
     171           0 : rb_event_data_length(struct ring_buffer_event *event)
     172             : {
     173           0 :         unsigned length;
     174             : 
     175           0 :         if (event->type_len)
     176           0 :                 length = event->type_len * RB_ALIGNMENT;
     177             :         else
     178           0 :                 length = event->array[0];
     179           0 :         return length + RB_EVNT_HDR_SIZE;
     180             : }
     181             : 
     182             : /*
     183             :  * Return the length of the given event. Will return
     184             :  * the length of the time extend if the event is a
     185             :  * time extend.
     186             :  */
     187             : static inline unsigned
     188           0 : rb_event_length(struct ring_buffer_event *event)
     189             : {
     190           0 :         switch (event->type_len) {
     191             :         case RINGBUF_TYPE_PADDING:
     192           0 :                 if (rb_null_event(event))
     193             :                         /* undefined */
     194             :                         return -1;
     195           0 :                 return  event->array[0] + RB_EVNT_HDR_SIZE;
     196             : 
     197             :         case RINGBUF_TYPE_TIME_EXTEND:
     198             :                 return RB_LEN_TIME_EXTEND;
     199             : 
     200             :         case RINGBUF_TYPE_TIME_STAMP:
     201             :                 return RB_LEN_TIME_STAMP;
     202             : 
     203             :         case RINGBUF_TYPE_DATA:
     204           0 :                 return rb_event_data_length(event);
     205             :         default:
     206           0 :                 WARN_ON_ONCE(1);
     207             :         }
     208             :         /* not hit */
     209           0 :         return 0;
     210             : }
     211             : 
     212             : /*
     213             :  * Return total length of time extend and data,
     214             :  *   or just the event length for all other events.
     215             :  */
     216             : static inline unsigned
     217           0 : rb_event_ts_length(struct ring_buffer_event *event)
     218             : {
     219           0 :         unsigned len = 0;
     220             : 
     221           0 :         if (extended_time(event)) {
     222             :                 /* time extends include the data event after it */
     223           0 :                 len = RB_LEN_TIME_EXTEND;
     224           0 :                 event = skip_time_extend(event);
     225             :         }
     226           0 :         return len + rb_event_length(event);
     227             : }
     228             : 
     229             : /**
     230             :  * ring_buffer_event_length - return the length of the event
     231             :  * @event: the event to get the length of
     232             :  *
      233             :  * Returns the size of the data payload of a data event.
      234             :  * If the event is something other than a data event, it
      235             :  * returns the size of the event itself. The exception is a
      236             :  * TIME_EXTEND event, for which it still returns the size of the
      237             :  * data payload of the data event that follows it.
     238             :  */
     239           0 : unsigned ring_buffer_event_length(struct ring_buffer_event *event)
     240             : {
     241           0 :         unsigned length;
     242             : 
     243           0 :         if (extended_time(event))
     244           0 :                 event = skip_time_extend(event);
     245             : 
     246           0 :         length = rb_event_length(event);
     247           0 :         if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
     248             :                 return length;
     249           0 :         length -= RB_EVNT_HDR_SIZE;
     250           0 :         if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
     251           0 :                 length -= sizeof(event->array[0]);
     252             :         return length;
     253             : }
     254             : EXPORT_SYMBOL_GPL(ring_buffer_event_length);
     255             : 
     256             : /* inline for ring buffer fast paths */
     257             : static __always_inline void *
     258           0 : rb_event_data(struct ring_buffer_event *event)
     259             : {
     260           0 :         if (extended_time(event))
     261           0 :                 event = skip_time_extend(event);
     262           0 :         WARN_ON_ONCE(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
     263             :         /* If length is in len field, then array[0] has the data */
     264           0 :         if (event->type_len)
     265           0 :                 return (void *)&event->array[0];
     266             :         /* Otherwise length is in array[0] and array[1] has the data */
     267           0 :         return (void *)&event->array[1];
     268             : }
     269             : 
     270             : /**
     271             :  * ring_buffer_event_data - return the data of the event
     272             :  * @event: the event to get the data from
     273             :  */
     274           0 : void *ring_buffer_event_data(struct ring_buffer_event *event)
     275             : {
     276           0 :         return rb_event_data(event);
     277             : }
     278             : EXPORT_SYMBOL_GPL(ring_buffer_event_data);
     279             : 
     280             : #define for_each_buffer_cpu(buffer, cpu)                \
     281             :         for_each_cpu(cpu, buffer->cpumask)
     282             : 
     283             : #define for_each_online_buffer_cpu(buffer, cpu)         \
     284             :         for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask)
     285             : 
     286             : #define TS_SHIFT        27
     287             : #define TS_MASK         ((1ULL << TS_SHIFT) - 1)
     288             : #define TS_DELTA_TEST   (~TS_MASK)
     289             : 
     290             : /**
     291             :  * ring_buffer_event_time_stamp - return the event's extended timestamp
     292             :  * @event: the event to get the timestamp of
     293             :  *
     294             :  * Returns the extended timestamp associated with a data event.
     295             :  * An extended time_stamp is a 64-bit timestamp represented
     296             :  * internally in a special way that makes the best use of space
     297             :  * contained within a ring buffer event.  This function decodes
     298             :  * it and maps it to a straight u64 value.
     299             :  */
     300           0 : u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
     301             : {
     302           0 :         u64 ts;
     303             : 
     304           0 :         ts = event->array[0];
     305           0 :         ts <<= TS_SHIFT;
     306           0 :         ts += event->time_delta;
     307             : 
     308           0 :         return ts;
     309             : }
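
/*
 * A worked round trip of the encoding that ring_buffer_event_time_stamp()
 * decodes above: the low TS_SHIFT bits of the stamp travel in time_delta
 * and the upper bits in array[0]. Standalone illustration only; the
 * names below are made up.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_TS_SHIFT 27
#define EX_TS_MASK  ((1ULL << EX_TS_SHIFT) - 1)

int main(void)
{
	uint64_t ts = 123456789012ULL;			/* some 64 bit time stamp */

	uint32_t array0 = (uint32_t)(ts >> EX_TS_SHIFT);	/* upper bits */
	uint32_t delta  = (uint32_t)(ts & EX_TS_MASK);		/* low 27 bits */

	/* decode exactly as the function above does */
	uint64_t out = ((uint64_t)array0 << EX_TS_SHIFT) + delta;

	printf("in=%llu out=%llu match=%d\n",
	       (unsigned long long)ts, (unsigned long long)out, ts == out);
	return 0;
}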
     310             : 
     311             : /* Flag when events were overwritten */
     312             : #define RB_MISSED_EVENTS        (1 << 31)
     313             : /* Missed count stored at end */
     314             : #define RB_MISSED_STORED        (1 << 30)
     315             : 
     316             : struct buffer_data_page {
     317             :         u64              time_stamp;    /* page time stamp */
     318             :         local_t          commit;        /* write committed index */
     319             :         unsigned char    data[] RB_ALIGN_DATA;  /* data of buffer page */
     320             : };
     321             : 
     322             : /*
     323             :  * Note, the buffer_page list must be first. The buffer pages
     324             :  * are allocated in cache lines, which means that each buffer
     325             :  * page will be at the beginning of a cache line, and thus
     326             :  * the least significant bits will be zero. We use this to
     327             :  * add flags in the list struct pointers, to make the ring buffer
     328             :  * lockless.
     329             :  */
     330             : struct buffer_page {
     331             :         struct list_head list;          /* list of buffer pages */
     332             :         local_t          write;         /* index for next write */
     333             :         unsigned         read;          /* index for next read */
     334             :         local_t          entries;       /* entries on this page */
     335             :         unsigned long    real_end;      /* real end of data */
     336             :         struct buffer_data_page *page;  /* Actual data page */
     337             : };
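
/*
 * The comment above struct buffer_page relies on a generic trick: a
 * pointer to an aligned object has zero low bits, so those bits can
 * carry flags. A standalone sketch of that trick (not the kernel's own
 * list-pointer helpers; names are made up):
 */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define PTR_FLAG_MASK 0x3UL		/* the two low bits are free on aligned pointers */

static void *tag_ptr(void *p, unsigned long flags)
{
	return (void *)((uintptr_t)p | (flags & PTR_FLAG_MASK));
}

static void *untag_ptr(void *p)
{
	return (void *)((uintptr_t)p & ~PTR_FLAG_MASK);
}

static unsigned long ptr_flags(void *p)
{
	return (uintptr_t)p & PTR_FLAG_MASK;
}

int main(void)
{
	int *obj = malloc(sizeof(*obj));	/* malloc returns suitably aligned memory */
	void *tagged = tag_ptr(obj, 0x1);	/* e.g. "this link points to the head page" */

	assert(untag_ptr(tagged) == obj);
	assert(ptr_flags(tagged) == 0x1);
	free(obj);
	return 0;
}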
     338             : 
     339             : /*
     340             :  * The buffer page counters, write and entries, must be reset
     341             :  * atomically when crossing page boundaries. To synchronize this
      342             :  * update, two counters are packed into one number. One is
      343             :  * the actual counter for the write position or count on the page.
      344             :  *
      345             :  * The other is a counter of updaters. Before an update happens,
      346             :  * the updater part of the counter is incremented. This allows
      347             :  * the updater to update the counter atomically.
      348             :  *
      349             :  * The write counter is 20 bits, and the updater count takes the remaining 12.
     350             :  */
     351             : #define RB_WRITE_MASK           0xfffff
     352             : #define RB_WRITE_INTCNT         (1 << 20)
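
/*
 * How a single counter value splits under the masks above: the low 20
 * bits are the write index and everything above counts nested updaters.
 * The concrete value below is made up for illustration.
 */
#include <stdio.h>

#define EX_RB_WRITE_MASK   0xfffffUL	/* low 20 bits: write index */
#define EX_RB_WRITE_INTCNT (1UL << 20)	/* each nested updater adds this */

int main(void)
{
	unsigned long val = 3UL * EX_RB_WRITE_INTCNT + 0x123;	/* 3 updaters, index 0x123 */

	printf("write index = 0x%lx, nested updaters = %lu\n",
	       val & EX_RB_WRITE_MASK, val / EX_RB_WRITE_INTCNT);
	return 0;
}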
     353             : 
     354          24 : static void rb_init_page(struct buffer_data_page *bpage)
     355             : {
     356          48 :         local_set(&bpage->commit, 0);
     357             : }
     358             : 
     359             : /*
     360             :  * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
     361             :  * this issue out.
     362             :  */
     363           0 : static void free_buffer_page(struct buffer_page *bpage)
     364             : {
     365           0 :         free_page((unsigned long)bpage->page);
     366           0 :         kfree(bpage);
     367           0 : }
     368             : 
     369             : /*
     370             :  * We need to fit the time_stamp delta into 27 bits.
     371             :  */
     372           0 : static inline int test_time_stamp(u64 delta)
     373             : {
     374           0 :         if (delta & TS_DELTA_TEST)
     375           0 :                 return 1;
     376             :         return 0;
     377             : }
     378             : 
     379             : #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
     380             : 
     381             : /* Max payload is BUF_PAGE_SIZE - header (8bytes) */
     382             : #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
     383             : 
     384           0 : int ring_buffer_print_page_header(struct trace_seq *s)
     385             : {
     386           0 :         struct buffer_data_page field;
     387             : 
     388           0 :         trace_seq_printf(s, "\tfield: u64 timestamp;\t"
     389             :                          "offset:0;\tsize:%u;\tsigned:%u;\n",
     390             :                          (unsigned int)sizeof(field.time_stamp),
     391             :                          (unsigned int)is_signed_type(u64));
     392             : 
     393           0 :         trace_seq_printf(s, "\tfield: local_t commit;\t"
     394             :                          "offset:%u;\tsize:%u;\tsigned:%u;\n",
     395             :                          (unsigned int)offsetof(typeof(field), commit),
     396             :                          (unsigned int)sizeof(field.commit),
     397             :                          (unsigned int)is_signed_type(long));
     398             : 
     399           0 :         trace_seq_printf(s, "\tfield: int overwrite;\t"
     400             :                          "offset:%u;\tsize:%u;\tsigned:%u;\n",
     401             :                          (unsigned int)offsetof(typeof(field), commit),
     402             :                          1,
     403             :                          (unsigned int)is_signed_type(long));
     404             : 
     405           0 :         trace_seq_printf(s, "\tfield: char data;\t"
     406             :                          "offset:%u;\tsize:%u;\tsigned:%u;\n",
     407             :                          (unsigned int)offsetof(typeof(field), data),
     408             :                          (unsigned int)BUF_PAGE_SIZE,
     409             :                          (unsigned int)is_signed_type(char));
     410             : 
     411           0 :         return !trace_seq_has_overflowed(s);
     412             : }
     413             : 
     414             : struct rb_irq_work {
     415             :         struct irq_work                 work;
     416             :         wait_queue_head_t               waiters;
     417             :         wait_queue_head_t               full_waiters;
     418             :         bool                            waiters_pending;
     419             :         bool                            full_waiters_pending;
     420             :         bool                            wakeup_full;
     421             : };
     422             : 
     423             : /*
     424             :  * Structure to hold event state and handle nested events.
     425             :  */
     426             : struct rb_event_info {
     427             :         u64                     ts;
     428             :         u64                     delta;
     429             :         u64                     before;
     430             :         u64                     after;
     431             :         unsigned long           length;
     432             :         struct buffer_page      *tail_page;
     433             :         int                     add_timestamp;
     434             : };
     435             : 
     436             : /*
      437             :  * Flags for the add_timestamp field of struct rb_event_info above:
     438             :  *  NONE
     439             :  *  EXTEND - wants a time extend
     440             :  *  ABSOLUTE - the buffer requests all events to have absolute time stamps
     441             :  *  FORCE - force a full time stamp.
     442             :  */
     443             : enum {
     444             :         RB_ADD_STAMP_NONE               = 0,
     445             :         RB_ADD_STAMP_EXTEND             = BIT(1),
     446             :         RB_ADD_STAMP_ABSOLUTE           = BIT(2),
     447             :         RB_ADD_STAMP_FORCE              = BIT(3)
     448             : };
     449             : /*
     450             :  * Used for which event context the event is in.
     451             :  *  TRANSITION = 0
     452             :  *  NMI     = 1
     453             :  *  IRQ     = 2
     454             :  *  SOFTIRQ = 3
     455             :  *  NORMAL  = 4
     456             :  *
     457             :  * See trace_recursive_lock() comment below for more details.
     458             :  */
     459             : enum {
     460             :         RB_CTX_TRANSITION,
     461             :         RB_CTX_NMI,
     462             :         RB_CTX_IRQ,
     463             :         RB_CTX_SOFTIRQ,
     464             :         RB_CTX_NORMAL,
     465             :         RB_CTX_MAX
     466             : };
     467             : 
     468             : #if BITS_PER_LONG == 32
     469             : #define RB_TIME_32
     470             : #endif
     471             : 
     472             : /* To test on 64 bit machines */
     473             : //#define RB_TIME_32
     474             : 
     475             : #ifdef RB_TIME_32
     476             : 
     477             : struct rb_time_struct {
     478             :         local_t         cnt;
     479             :         local_t         top;
     480             :         local_t         bottom;
     481             : };
     482             : #else
     483             : #include <asm/local64.h>
     484             : struct rb_time_struct {
     485             :         local64_t       time;
     486             : };
     487             : #endif
     488             : typedef struct rb_time_struct rb_time_t;
     489             : 
     490             : /*
     491             :  * head_page == tail_page && head == tail then buffer is empty.
     492             :  */
     493             : struct ring_buffer_per_cpu {
     494             :         int                             cpu;
     495             :         atomic_t                        record_disabled;
     496             :         atomic_t                        resize_disabled;
     497             :         struct trace_buffer     *buffer;
     498             :         raw_spinlock_t                  reader_lock;    /* serialize readers */
     499             :         arch_spinlock_t                 lock;
     500             :         struct lock_class_key           lock_key;
     501             :         struct buffer_data_page         *free_page;
     502             :         unsigned long                   nr_pages;
     503             :         unsigned int                    current_context;
     504             :         struct list_head                *pages;
     505             :         struct buffer_page              *head_page;     /* read from head */
     506             :         struct buffer_page              *tail_page;     /* write to tail */
     507             :         struct buffer_page              *commit_page;   /* committed pages */
     508             :         struct buffer_page              *reader_page;
     509             :         unsigned long                   lost_events;
     510             :         unsigned long                   last_overrun;
     511             :         unsigned long                   nest;
     512             :         local_t                         entries_bytes;
     513             :         local_t                         entries;
     514             :         local_t                         overrun;
     515             :         local_t                         commit_overrun;
     516             :         local_t                         dropped_events;
     517             :         local_t                         committing;
     518             :         local_t                         commits;
     519             :         local_t                         pages_touched;
     520             :         local_t                         pages_read;
     521             :         long                            last_pages_touch;
     522             :         size_t                          shortest_full;
     523             :         unsigned long                   read;
     524             :         unsigned long                   read_bytes;
     525             :         rb_time_t                       write_stamp;
     526             :         rb_time_t                       before_stamp;
     527             :         u64                             read_stamp;
     528             :         /* ring buffer pages to update, > 0 to add, < 0 to remove */
     529             :         long                            nr_pages_to_update;
     530             :         struct list_head                new_pages; /* new pages to add */
     531             :         struct work_struct              update_pages_work;
     532             :         struct completion               update_done;
     533             : 
     534             :         struct rb_irq_work              irq_work;
     535             : };
     536             : 
     537             : struct trace_buffer {
     538             :         unsigned                        flags;
     539             :         int                             cpus;
     540             :         atomic_t                        record_disabled;
     541             :         cpumask_var_t                   cpumask;
     542             : 
     543             :         struct lock_class_key           *reader_lock_key;
     544             : 
     545             :         struct mutex                    mutex;
     546             : 
     547             :         struct ring_buffer_per_cpu      **buffers;
     548             : 
     549             :         struct hlist_node               node;
     550             :         u64                             (*clock)(void);
     551             : 
     552             :         struct rb_irq_work              irq_work;
     553             :         bool                            time_stamp_abs;
     554             : };
     555             : 
     556             : struct ring_buffer_iter {
     557             :         struct ring_buffer_per_cpu      *cpu_buffer;
     558             :         unsigned long                   head;
     559             :         unsigned long                   next_event;
     560             :         struct buffer_page              *head_page;
     561             :         struct buffer_page              *cache_reader_page;
     562             :         unsigned long                   cache_read;
     563             :         u64                             read_stamp;
     564             :         u64                             page_stamp;
     565             :         struct ring_buffer_event        *event;
     566             :         int                             missed_events;
     567             : };
     568             : 
     569             : #ifdef RB_TIME_32
     570             : 
     571             : /*
     572             :  * On 32 bit machines, local64_t is very expensive. As the ring
     573             :  * buffer doesn't need all the features of a true 64 bit atomic,
      574             :  * on 32 bit it uses these functions (64 bit still uses local64_t).
      575             :  *
      576             :  * For the ring buffer, the required 64 bit operations on the time
      577             :  * stamp are the following:
     578             :  *
     579             :  *  - Only need 59 bits (uses 60 to make it even).
      580             :  *  - A read may fail if it interrupted a modification of the time stamp.
     581             :  *      It will succeed if it did not interrupt another write even if
     582             :  *      the read itself is interrupted by a write.
     583             :  *      It returns whether it was successful or not.
     584             :  *
     585             :  *  - Writes always succeed and will overwrite other writes and writes
     586             :  *      that were done by events interrupting the current write.
     587             :  *
     588             :  *  - A write followed by a read of the same time stamp will always succeed,
      589             :  *      but the read may not return the value that was just written.
     590             :  *
     591             :  *  - A cmpxchg will fail if it interrupted another write or cmpxchg.
     592             :  *      Other than that, it acts like a normal cmpxchg.
     593             :  *
      594             :  * The 60 bit time stamp is broken up into a top half and a bottom half of
      595             :  *  30 bits each (the bottom being the least significant 30 bits of the stamp).
      596             :  *
      597             :  * The two most significant bits of each half hold a 2 bit counter (0-3).
     598             :  * Each update will increment this counter by one.
     599             :  * When reading the top and bottom, if the two counter bits match then the
     600             :  *  top and bottom together make a valid 60 bit number.
     601             :  */
     602             : #define RB_TIME_SHIFT   30
     603             : #define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
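
/*
 * A concrete check of the scheme described above: one 60 bit value is
 * split into 30 bit top and bottom halves, each tagged with the same
 * 2 bit counter, and the halves only reassemble when the counters agree.
 * Standalone illustration only, not the helpers that follow.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_SHIFT    30
#define EX_VAL_MASK ((1UL << EX_SHIFT) - 1)

/* pack 30 value bits together with a 2 bit update counter */
static unsigned long pack(unsigned long val, unsigned long cnt)
{
	return (val & EX_VAL_MASK) | ((cnt & 3) << EX_SHIFT);
}

int main(void)
{
	uint64_t ts = 0x0123456789abcdeULL;	/* a sample (< 60 bit) time stamp */
	unsigned long cnt = 2;			/* current update count */

	unsigned long top = pack((unsigned long)(ts >> EX_SHIFT), cnt);
	unsigned long bottom = pack((unsigned long)(ts & EX_VAL_MASK), cnt);

	/* a read is only valid when both halves carry the same counter */
	if ((top >> EX_SHIFT) == (bottom >> EX_SHIFT)) {
		uint64_t out = ((uint64_t)(top & EX_VAL_MASK) << EX_SHIFT) |
			       (bottom & EX_VAL_MASK);
		printf("reassembled %#llx, match=%d\n",
		       (unsigned long long)out, out == ts);
	}
	return 0;
}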
     604             : 
     605             : static inline int rb_time_cnt(unsigned long val)
     606             : {
     607             :         return (val >> RB_TIME_SHIFT) & 3;
     608             : }
     609             : 
     610             : static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
     611             : {
     612             :         u64 val;
     613             : 
     614             :         val = top & RB_TIME_VAL_MASK;
     615             :         val <<= RB_TIME_SHIFT;
     616             :         val |= bottom & RB_TIME_VAL_MASK;
     617             : 
     618             :         return val;
     619             : }
     620             : 
     621             : static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
     622             : {
     623             :         unsigned long top, bottom;
     624             :         unsigned long c;
     625             : 
     626             :         /*
     627             :          * If the read is interrupted by a write, then the cnt will
     628             :          * be different. Loop until both top and bottom have been read
     629             :          * without interruption.
     630             :          */
     631             :         do {
     632             :                 c = local_read(&t->cnt);
     633             :                 top = local_read(&t->top);
     634             :                 bottom = local_read(&t->bottom);
     635             :         } while (c != local_read(&t->cnt));
     636             : 
     637             :         *cnt = rb_time_cnt(top);
     638             : 
     639             :         /* If top and bottom counts don't match, this interrupted a write */
     640             :         if (*cnt != rb_time_cnt(bottom))
     641             :                 return false;
     642             : 
     643             :         *ret = rb_time_val(top, bottom);
     644             :         return true;
     645             : }
     646             : 
     647             : static bool rb_time_read(rb_time_t *t, u64 *ret)
     648             : {
     649             :         unsigned long cnt;
     650             : 
     651             :         return __rb_time_read(t, ret, &cnt);
     652             : }
     653             : 
     654             : static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
     655             : {
     656             :         return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
     657             : }
     658             : 
     659             : static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom)
     660             : {
     661             :         *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
     662             :         *bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
     663             : }
     664             : 
     665             : static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
     666             : {
     667             :         val = rb_time_val_cnt(val, cnt);
     668             :         local_set(t, val);
     669             : }
     670             : 
     671             : static void rb_time_set(rb_time_t *t, u64 val)
     672             : {
     673             :         unsigned long cnt, top, bottom;
     674             : 
     675             :         rb_time_split(val, &top, &bottom);
     676             : 
     677             :         /* Writes always succeed with a valid number even if it gets interrupted. */
     678             :         do {
     679             :                 cnt = local_inc_return(&t->cnt);
     680             :                 rb_time_val_set(&t->top, top, cnt);
     681             :                 rb_time_val_set(&t->bottom, bottom, cnt);
     682             :         } while (cnt != local_read(&t->cnt));
     683             : }
     684             : 
     685             : static inline bool
     686             : rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
     687             : {
     688             :         unsigned long ret;
     689             : 
     690             :         ret = local_cmpxchg(l, expect, set);
     691             :         return ret == expect;
     692             : }
     693             : 
     694             : static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
     695             : {
     696             :         unsigned long cnt, top, bottom;
     697             :         unsigned long cnt2, top2, bottom2;
     698             :         u64 val;
     699             : 
     700             :         /* The cmpxchg always fails if it interrupted an update */
     701             :          if (!__rb_time_read(t, &val, &cnt2))
     702             :                  return false;
     703             : 
     704             :          if (val != expect)
     705             :                  return false;
     706             : 
     707             :          cnt = local_read(&t->cnt);
     708             :          if ((cnt & 3) != cnt2)
     709             :                  return false;
     710             : 
     711             :          cnt2 = cnt + 1;
     712             : 
     713             :          rb_time_split(val, &top, &bottom);
     714             :          top = rb_time_val_cnt(top, cnt);
     715             :          bottom = rb_time_val_cnt(bottom, cnt);
     716             : 
     717             :          rb_time_split(set, &top2, &bottom2);
     718             :          top2 = rb_time_val_cnt(top2, cnt2);
     719             :          bottom2 = rb_time_val_cnt(bottom2, cnt2);
     720             : 
     721             :         if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
     722             :                 return false;
     723             :         if (!rb_time_read_cmpxchg(&t->top, top, top2))
     724             :                 return false;
     725             :         if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
     726             :                 return false;
     727             :         return true;
     728             : }
     729             : 
     730             : #else /* 64 bits */
     731             : 
     732             : /* local64_t always succeeds */
     733             : 
     734           0 : static inline bool rb_time_read(rb_time_t *t, u64 *ret)
     735             : {
     736           0 :         *ret = local64_read(&t->time);
     737           0 :         return true;
     738             : }
     739           0 : static void rb_time_set(rb_time_t *t, u64 val)
     740             : {
     741           0 :         local64_set(&t->time, val);
     742           0 : }
     743             : 
     744           0 : static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
     745             : {
     746           0 :         u64 val;
     747           0 :         val = local64_cmpxchg(&t->time, expect, set);
     748           0 :         return val == expect;
     749             : }
     750             : #endif
     751             : 
     752             : /**
     753             :  * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
     754             :  * @buffer: The ring_buffer to get the number of pages from
     755             :  * @cpu: The cpu of the ring_buffer to get the number of pages from
     756             :  *
     757             :  * Returns the number of pages used by a per_cpu buffer of the ring buffer.
     758             :  */
     759           0 : size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
     760             : {
     761           0 :         return buffer->buffers[cpu]->nr_pages;
     762             : }
     763             : 
     764             : /**
     765             :  * ring_buffer_nr_pages_dirty - get the number of used pages in the ring buffer
     766             :  * @buffer: The ring_buffer to get the number of pages from
     767             :  * @cpu: The cpu of the ring_buffer to get the number of pages from
     768             :  *
     769             :  * Returns the number of pages that have content in the ring buffer.
     770             :  */
     771           0 : size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu)
     772             : {
     773           0 :         size_t read;
     774           0 :         size_t cnt;
     775             : 
     776           0 :         read = local_read(&buffer->buffers[cpu]->pages_read);
     777           0 :         cnt = local_read(&buffer->buffers[cpu]->pages_touched);
     778             :         /* The reader can read an empty page, but not more than that */
     779           0 :         if (cnt < read) {
     780           0 :                 WARN_ON_ONCE(read > cnt + 1);
     781             :                 return 0;
     782             :         }
     783             : 
     784           0 :         return cnt - read;
     785             : }
     786             : 
     787             : /*
     788             :  * rb_wake_up_waiters - wake up tasks waiting for ring buffer input
     789             :  *
      790             :  * Called from irq_work context to wake up any tasks that are blocked on the
      791             :  * ring buffer waiters queue.
     792             :  */
     793           0 : static void rb_wake_up_waiters(struct irq_work *work)
     794             : {
     795           0 :         struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
     796             : 
     797           0 :         wake_up_all(&rbwork->waiters);
     798           0 :         if (rbwork->wakeup_full) {
     799           0 :                 rbwork->wakeup_full = false;
     800           0 :                 wake_up_all(&rbwork->full_waiters);
     801             :         }
     802           0 : }
     803             : 
     804             : /**
     805             :  * ring_buffer_wait - wait for input to the ring buffer
     806             :  * @buffer: buffer to wait on
     807             :  * @cpu: the cpu buffer to wait on
     808             :  * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
     809             :  *
     810             :  * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
     811             :  * as data is added to any of the @buffer's cpu buffers. Otherwise
     812             :  * it will wait for data to be added to a specific cpu buffer.
     813             :  */
     814           0 : int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
     815             : {
     816           0 :         struct ring_buffer_per_cpu *cpu_buffer;
     817           0 :         DEFINE_WAIT(wait);
     818           0 :         struct rb_irq_work *work;
     819           0 :         int ret = 0;
     820             : 
     821             :         /*
      822             :          * Depending on whether the caller is waiting for data in
      823             :          * any cpu buffer or in one specific cpu buffer, put the
      824             :          * caller on the appropriate wait queue.
     825             :          */
     826           0 :         if (cpu == RING_BUFFER_ALL_CPUS) {
     827           0 :                 work = &buffer->irq_work;
     828             :                 /* Full only makes sense on per cpu reads */
     829           0 :                 full = 0;
     830             :         } else {
     831           0 :                 if (!cpumask_test_cpu(cpu, buffer->cpumask))
     832             :                         return -ENODEV;
     833           0 :                 cpu_buffer = buffer->buffers[cpu];
     834           0 :                 work = &cpu_buffer->irq_work;
     835             :         }
     836             : 
     837             : 
     838           0 :         while (true) {
     839           0 :                 if (full)
     840           0 :                         prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
     841             :                 else
     842           0 :                         prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
     843             : 
     844             :                 /*
     845             :                  * The events can happen in critical sections where
     846             :                  * checking a work queue can cause deadlocks.
     847             :                  * After adding a task to the queue, this flag is set
     848             :                  * only to notify events to try to wake up the queue
     849             :                  * using irq_work.
     850             :                  *
     851             :                  * We don't clear it even if the buffer is no longer
     852             :                  * empty. The flag only causes the next event to run
      853             :                  * irq_work to do the work queue wake up. The worst
     854             :                  * that can happen if we race with !trace_empty() is that
     855             :                  * an event will cause an irq_work to try to wake up
     856             :                  * an empty queue.
     857             :                  *
     858             :                  * There's no reason to protect this flag either, as
     859             :                  * the work queue and irq_work logic will do the necessary
     860             :                  * synchronization for the wake ups. The only thing
     861             :                  * that is necessary is that the wake up happens after
      862             :                  * a task has been queued. Spurious wake ups are OK.
     863             :                  */
     864           0 :                 if (full)
     865           0 :                         work->full_waiters_pending = true;
     866             :                 else
     867           0 :                         work->waiters_pending = true;
     868             : 
     869           0 :                 if (signal_pending(current)) {
     870             :                         ret = -EINTR;
     871             :                         break;
     872             :                 }
     873             : 
     874           0 :                 if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
     875             :                         break;
     876             : 
     877           0 :                 if (cpu != RING_BUFFER_ALL_CPUS &&
     878           0 :                     !ring_buffer_empty_cpu(buffer, cpu)) {
     879           0 :                         unsigned long flags;
     880           0 :                         bool pagebusy;
     881           0 :                         size_t nr_pages;
     882           0 :                         size_t dirty;
     883             : 
     884           0 :                         if (!full)
     885             :                                 break;
     886             : 
     887           0 :                         raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
     888           0 :                         pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
     889           0 :                         nr_pages = cpu_buffer->nr_pages;
     890           0 :                         dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
     891           0 :                         if (!cpu_buffer->shortest_full ||
     892           0 :                             cpu_buffer->shortest_full < full)
     893           0 :                                 cpu_buffer->shortest_full = full;
     894           0 :                         raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
     895           0 :                         if (!pagebusy &&
     896           0 :                             (!nr_pages || (dirty * 100) > full * nr_pages))
     897             :                                 break;
     898             :                 }
     899             : 
     900           0 :                 schedule();
     901             :         }
     902             : 
     903           0 :         if (full)
     904           0 :                 finish_wait(&work->full_waiters, &wait);
     905             :         else
     906           0 :                 finish_wait(&work->waiters, &wait);
     907             : 
     908             :         return ret;
     909             : }
     910             : 
     911             : /**
     912             :  * ring_buffer_poll_wait - poll on buffer input
     913             :  * @buffer: buffer to wait on
     914             :  * @cpu: the cpu buffer to wait on
     915             :  * @filp: the file descriptor
     916             :  * @poll_table: The poll descriptor
     917             :  *
     918             :  * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
     919             :  * as data is added to any of the @buffer's cpu buffers. Otherwise
     920             :  * it will wait for data to be added to a specific cpu buffer.
     921             :  *
     922             :  * Returns EPOLLIN | EPOLLRDNORM if data exists in the buffers,
     923             :  * zero otherwise.
     924             :  */
     925           0 : __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
     926             :                           struct file *filp, poll_table *poll_table)
     927             : {
     928           0 :         struct ring_buffer_per_cpu *cpu_buffer;
     929           0 :         struct rb_irq_work *work;
     930             : 
     931           0 :         if (cpu == RING_BUFFER_ALL_CPUS)
     932           0 :                 work = &buffer->irq_work;
     933             :         else {
     934           0 :                 if (!cpumask_test_cpu(cpu, buffer->cpumask))
     935             :                         return -EINVAL;
     936             : 
     937           0 :                 cpu_buffer = buffer->buffers[cpu];
     938           0 :                 work = &cpu_buffer->irq_work;
     939             :         }
     940             : 
     941           0 :         poll_wait(filp, &work->waiters, poll_table);
     942           0 :         work->waiters_pending = true;
     943             :         /*
     944             :          * There's a tight race between setting the waiters_pending and
     945             :          * checking if the ring buffer is empty.  Once the waiters_pending bit
     946             :          * is set, the next event will wake the task up, but we can get stuck
      947             :          * if there's only a single event in the buffer.
     948             :          *
     949             :          * FIXME: Ideally, we need a memory barrier on the writer side as well,
     950             :          * but adding a memory barrier to all events will cause too much of a
     951             :          * performance hit in the fast path.  We only need a memory barrier when
     952             :          * the buffer goes from empty to having content.  But as this race is
     953             :          * extremely small, and it's not a problem if another event comes in, we
     954             :          * will fix it later.
     955             :          */
     956           0 :         smp_mb();
     957             : 
     958           0 :         if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
     959           0 :             (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
     960           0 :                 return EPOLLIN | EPOLLRDNORM;
     961             :         return 0;
     962             : }
     963             : 
     964             : /* buffer may be either ring_buffer or ring_buffer_per_cpu */
     965             : #define RB_WARN_ON(b, cond)                                             \
     966             :         ({                                                              \
     967             :                 int _____ret = unlikely(cond);                          \
     968             :                 if (_____ret) {                                         \
     969             :                         if (__same_type(*(b), struct ring_buffer_per_cpu)) { \
     970             :                                 struct ring_buffer_per_cpu *__b =       \
     971             :                                         (void *)b;                      \
     972             :                                 atomic_inc(&__b->buffer->record_disabled); \
     973             :                         } else                                          \
     974             :                                 atomic_inc(&b->record_disabled); \
     975             :                         WARN_ON(1);                                     \
     976             :                 }                                                       \
     977             :                 _____ret;                                               \
     978             :         })
     979             : 
     980             : /* Up this if you want to test the TIME_EXTENTS and normalization */
     981             : #define DEBUG_SHIFT 0
     982             : 
     983           0 : static inline u64 rb_time_stamp(struct trace_buffer *buffer)
     984             : {
     985           0 :         u64 ts;
     986             : 
     987             :         /* Skip retpolines :-( */
     988           0 :         if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local))
     989             :                 ts = trace_clock_local();
     990             :         else
     991           0 :                 ts = buffer->clock();
     992             : 
     993             :         /* shift to debug/test normalization and TIME_EXTENTS */
     994           0 :         return ts << DEBUG_SHIFT;
     995             : }
     996             : 
     997           0 : u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu)
     998             : {
     999           0 :         u64 time;
    1000             : 
    1001           0 :         preempt_disable_notrace();
    1002           0 :         time = rb_time_stamp(buffer);
    1003           0 :         preempt_enable_notrace();
    1004             : 
    1005           0 :         return time;
    1006             : }
    1007             : EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
    1008             : 
    1009           0 : void ring_buffer_normalize_time_stamp(struct trace_buffer *buffer,
    1010             :                                       int cpu, u64 *ts)
    1011             : {
    1012             :         /* Just stupid testing the normalize function and deltas */
    1013           0 :         *ts >>= DEBUG_SHIFT;
    1014           0 : }
    1015             : EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
    1016             : 
    1017             : /*
     1018             :  * Making the ring buffer lockless makes things tricky.
     1019             :  * Writes only happen on the CPU that they are on, so they
     1020             :  * only need to worry about interrupts. Reads, however, can
     1021             :  * happen on any CPU.
    1022             :  *
    1023             :  * The reader page is always off the ring buffer, but when the
    1024             :  * reader finishes with a page, it needs to swap its page with
    1025             :  * a new one from the buffer. The reader needs to take from
    1026             :  * the head (writes go to the tail). But if a writer is in overwrite
    1027             :  * mode and wraps, it must push the head page forward.
    1028             :  *
    1029             :  * Here lies the problem.
    1030             :  *
    1031             :  * The reader must be careful to replace only the head page, and
    1032             :  * not another one. As described at the top of the file in the
    1033             :  * ASCII art, the reader sets its old page to point to the next
    1034             :  * page after head. It then sets the page after head to point to
    1035             :  * the old reader page. But if the writer moves the head page
    1036             :  * during this operation, the reader could end up with the tail.
    1037             :  *
    1038             :  * We use cmpxchg to help prevent this race. We also do something
    1039             :  * special with the page before head. We set the LSB to 1.
    1040             :  *
    1041             :  * When the writer must push the page forward, it will clear the
    1042             :  * bit that points to the head page, move the head, and then set
    1043             :  * the bit that points to the new head page.
    1044             :  *
     1045             :  * We also don't want an interrupt coming in and moving the head
     1046             :  * page while another writer is already moving it, so we use the
     1047             :  * second LSB to catch that too. Thus:
    1048             :  *
    1049             :  * head->list->prev->next        bit 1          bit 0
    1050             :  *                              -------        -------
    1051             :  * Normal page                     0              0
    1052             :  * Points to head page             0              1
    1053             :  * New head page                   1              0
    1054             :  *
     1055             :  * Note we cannot trust the prev pointer of the head page, because:
    1056             :  *
    1057             :  * +----+       +-----+        +-----+
    1058             :  * |    |------>|  T  |---X--->|  N  |
    1059             :  * |    |<------|     |        |     |
    1060             :  * +----+       +-----+        +-----+
    1061             :  *   ^                           ^ |
    1062             :  *   |          +-----+          | |
    1063             :  *   +----------|  R  |----------+ |
    1064             :  *              |     |<-----------+
    1065             :  *              +-----+
    1066             :  *
    1067             :  * Key:  ---X-->  HEAD flag set in pointer
    1068             :  *         T      Tail page
    1069             :  *         R      Reader page
    1070             :  *         N      Next page
    1071             :  *
    1072             :  * (see __rb_reserve_next() to see where this happens)
    1073             :  *
    1074             :  *  What the above shows is that the reader just swapped out
    1075             :  *  the reader page with a page in the buffer, but before it
     1076             :  *  could make the new head point back to the new page added,
     1077             :  *  it was preempted by a writer. The writer moved forward onto
    1078             :  *  the new page added by the reader and is about to move forward
    1079             :  *  again.
    1080             :  *
     1081             :  *  You can see that it is legitimate for the previous pointer of
    1082             :  *  the head (or any page) not to point back to itself. But only
    1083             :  *  temporarily.
    1084             :  */
    1085             : 
    1086             : #define RB_PAGE_NORMAL          0UL
    1087             : #define RB_PAGE_HEAD            1UL
    1088             : #define RB_PAGE_UPDATE          2UL
    1089             : 
    1090             : 
    1091             : #define RB_FLAG_MASK            3UL
    1092             : 
    1093             : /* PAGE_MOVED is not part of the mask */
    1094             : #define RB_PAGE_MOVED           4UL
    1095             : 
    1096             : /*
     1097             :  * rb_list_head - strip the flag bits from a list pointer
    1098             :  */
    1099          32 : static struct list_head *rb_list_head(struct list_head *list)
    1100             : {
    1101          32 :         unsigned long val = (unsigned long)list;
    1102             : 
    1103          32 :         return (struct list_head *)(val & ~RB_FLAG_MASK);
    1104             : }
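
The scheme described above works because buffer pages are at least pointer-aligned, which leaves the two low bits of every list pointer free to carry state. Below is a standalone sketch of the same pointer-tagging idea; the struct and flag names are illustrative stand-ins, not the kernel's own types.

        #include <assert.h>
        #include <stdint.h>
        #include <stdio.h>

        /* Illustrative stand-in for a buffer page's embedded list_head. */
        struct node { struct node *next; struct node *prev; };

        #define FLAG_MASK   3UL  /* two low bits, as with RB_FLAG_MASK   */
        #define FLAG_HEAD   1UL  /* "next page is the head page"         */
        #define FLAG_UPDATE 2UL  /* "a writer is updating the head page" */

        /* Strip the flag bits, like rb_list_head(). */
        static struct node *clean_ptr(struct node *p)
        {
                return (struct node *)((uintptr_t)p & ~FLAG_MASK);
        }

        /* Tag a pointer with a flag, like rb_set_list_to_head() sets RB_PAGE_HEAD. */
        static struct node *tag_ptr(struct node *p, unsigned long flag)
        {
                return (struct node *)(((uintptr_t)p & ~FLAG_MASK) | flag);
        }

        int main(void)
        {
                struct node before_head, head;

                /* The page *before* the head carries the flag in its next pointer. */
                before_head.next = tag_ptr(&head, FLAG_HEAD);

                assert(((uintptr_t)before_head.next & FLAG_MASK) == FLAG_HEAD);
                assert(clean_ptr(before_head.next) == &head);

                printf("flags=%lu target=%p\n",
                       (unsigned long)((uintptr_t)before_head.next & FLAG_MASK),
                       (void *)clean_ptr(before_head.next));
                return 0;
        }
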
    1105             : 
    1106             : /*
    1107             :  * rb_is_head_page - test if the given page is the head page
    1108             :  *
     1109             :  * Because the reader may move the head_page pointer, we cannot
     1110             :  * trust what the head page is (it may be pointing to
     1111             :  * the reader page). But if the next page is the head page,
     1112             :  * the pointer to it will have non-zero flag bits.
    1113             :  */
    1114             : static inline int
    1115           0 : rb_is_head_page(struct buffer_page *page, struct list_head *list)
    1116             : {
    1117           0 :         unsigned long val;
    1118             : 
    1119           0 :         val = (unsigned long)list->next;
    1120             : 
    1121           0 :         if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
    1122             :                 return RB_PAGE_MOVED;
    1123             : 
    1124           0 :         return val & RB_FLAG_MASK;
    1125             : }
    1126             : 
    1127             : /*
    1128             :  * rb_is_reader_page
    1129             :  *
     1130             :  * The unique thing about the reader page is that, if the
    1131             :  * writer is ever on it, the previous pointer never points
    1132             :  * back to the reader page.
    1133             :  */
    1134           0 : static bool rb_is_reader_page(struct buffer_page *page)
    1135             : {
    1136           0 :         struct list_head *list = page->list.prev;
    1137             : 
    1138           0 :         return rb_list_head(list->next) != &page->list;
    1139             : }
    1140             : 
    1141             : /*
     1142             :  * rb_set_list_to_head - set a list_head to point to the head page.
    1143             :  */
    1144           8 : static void rb_set_list_to_head(struct list_head *list)
    1145             : {
    1146           8 :         unsigned long *ptr;
    1147             : 
    1148           8 :         ptr = (unsigned long *)&list->next;
    1149           8 :         *ptr |= RB_PAGE_HEAD;
    1150           8 :         *ptr &= ~RB_PAGE_UPDATE;
    1151           8 : }
    1152             : 
    1153             : /*
    1154             :  * rb_head_page_activate - sets up head page
    1155             :  */
    1156          16 : static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
    1157             : {
    1158          16 :         struct buffer_page *head;
    1159             : 
    1160          16 :         head = cpu_buffer->head_page;
    1161          16 :         if (!head)
    1162             :                 return;
    1163             : 
    1164             :         /*
    1165             :          * Set the previous list pointer to have the HEAD flag.
    1166             :          */
    1167           8 :         rb_set_list_to_head(head->list.prev);
    1168             : }
    1169             : 
    1170          16 : static void rb_list_head_clear(struct list_head *list)
    1171             : {
    1172          16 :         unsigned long *ptr = (unsigned long *)&list->next;
    1173             : 
    1174          16 :         *ptr &= ~RB_FLAG_MASK;
    1175             : }
    1176             : 
    1177             : /*
    1178             :  * rb_head_page_deactivate - clears head page ptr (for free list)
    1179             :  */
    1180             : static void
    1181           8 : rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
    1182             : {
    1183           8 :         struct list_head *hd;
    1184             : 
    1185             :         /* Go through the whole list and clear any pointers found. */
    1186           8 :         rb_list_head_clear(cpu_buffer->pages);
    1187             : 
    1188          16 :         list_for_each(hd, cpu_buffer->pages)
    1189           8 :                 rb_list_head_clear(hd);
    1190           8 : }
    1191             : 
    1192           0 : static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
    1193             :                             struct buffer_page *head,
    1194             :                             struct buffer_page *prev,
    1195             :                             int old_flag, int new_flag)
    1196             : {
    1197           0 :         struct list_head *list;
    1198           0 :         unsigned long val = (unsigned long)&head->list;
    1199           0 :         unsigned long ret;
    1200             : 
    1201           0 :         list = &prev->list;
    1202             : 
    1203           0 :         val &= ~RB_FLAG_MASK;
    1204             : 
    1205           0 :         ret = cmpxchg((unsigned long *)&list->next,
    1206             :                       val | old_flag, val | new_flag);
    1207             : 
    1208             :         /* check if the reader took the page */
    1209           0 :         if ((ret & ~RB_FLAG_MASK) != val)
    1210             :                 return RB_PAGE_MOVED;
    1211             : 
    1212           0 :         return ret & RB_FLAG_MASK;
    1213             : }
    1214             : 
    1215           0 : static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
    1216             :                                    struct buffer_page *head,
    1217             :                                    struct buffer_page *prev,
    1218             :                                    int old_flag)
    1219             : {
    1220           0 :         return rb_head_page_set(cpu_buffer, head, prev,
    1221             :                                 old_flag, RB_PAGE_UPDATE);
    1222             : }
    1223             : 
    1224           0 : static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
    1225             :                                  struct buffer_page *head,
    1226             :                                  struct buffer_page *prev,
    1227             :                                  int old_flag)
    1228             : {
    1229           0 :         return rb_head_page_set(cpu_buffer, head, prev,
    1230             :                                 old_flag, RB_PAGE_HEAD);
    1231             : }
    1232             : 
    1233           0 : static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
    1234             :                                    struct buffer_page *head,
    1235             :                                    struct buffer_page *prev,
    1236             :                                    int old_flag)
    1237             : {
    1238           0 :         return rb_head_page_set(cpu_buffer, head, prev,
    1239             :                                 old_flag, RB_PAGE_NORMAL);
    1240             : }
    1241             : 
    1242           0 : static inline void rb_inc_page(struct buffer_page **bpage)
    1243             : {
    1244           0 :         struct list_head *p = rb_list_head((*bpage)->list.next);
    1245             : 
    1246           0 :         *bpage = list_entry(p, struct buffer_page, list);
    1247           0 : }
    1248             : 
    1249             : static struct buffer_page *
    1250           0 : rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
    1251             : {
    1252           0 :         struct buffer_page *head;
    1253           0 :         struct buffer_page *page;
    1254           0 :         struct list_head *list;
    1255           0 :         int i;
    1256             : 
    1257           0 :         if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
    1258             :                 return NULL;
    1259             : 
    1260             :         /* sanity check */
    1261           0 :         list = cpu_buffer->pages;
    1262           0 :         if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
    1263             :                 return NULL;
    1264             : 
    1265           0 :         page = head = cpu_buffer->head_page;
    1266             :         /*
     1267             :          * It is possible that the writer moves the head page behind
     1268             :          * where we started, and we miss it in one pass of the loop.
     1269             :          * A second pass should grab the head page, but we'll do
     1270             :          * three passes just because I'm paranoid.
    1271             :          */
    1272           0 :         for (i = 0; i < 3; i++) {
    1273           0 :                 do {
    1274           0 :                         if (rb_is_head_page(page, page->list.prev)) {
    1275           0 :                                 cpu_buffer->head_page = page;
    1276           0 :                                 return page;
    1277             :                         }
    1278           0 :                         rb_inc_page(&page);
    1279           0 :                 } while (page != head);
    1280             :         }
    1281             : 
    1282           0 :         RB_WARN_ON(cpu_buffer, 1);
    1283             : 
    1284           0 :         return NULL;
    1285             : }
    1286             : 
    1287           0 : static int rb_head_page_replace(struct buffer_page *old,
    1288             :                                 struct buffer_page *new)
    1289             : {
    1290           0 :         unsigned long *ptr = (unsigned long *)&old->list.prev->next;
    1291           0 :         unsigned long val;
    1292           0 :         unsigned long ret;
    1293             : 
    1294           0 :         val = *ptr & ~RB_FLAG_MASK;
    1295           0 :         val |= RB_PAGE_HEAD;
    1296             : 
    1297           0 :         ret = cmpxchg(ptr, val, (unsigned long)&new->list);
    1298             : 
    1299           0 :         return ret == val;
    1300             : }
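
rb_head_page_replace() only succeeds when prev->next still carries the HEAD flag and still points at the expected old head; otherwise a writer has already moved the head and the caller must retry. Here is a hedged userspace sketch of that compare-and-swap pattern using C11 atomics (the kernel uses cmpxchg() on the raw pointer word; the names below are made up).

        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdint.h>

        #define FLAG_MASK 3UL
        #define FLAG_HEAD 1UL

        /*
         * Swap the tagged head pointer from old to new, but only if it still
         * points at old with the HEAD flag set, the shape of rb_head_page_replace().
         */
        static bool head_replace(_Atomic uintptr_t *slot, void *old, void *new)
        {
                uintptr_t expected = ((uintptr_t)old & ~FLAG_MASK) | FLAG_HEAD;

                /* On failure, atomic_compare_exchange_strong() rewrites expected. */
                return atomic_compare_exchange_strong(slot, &expected, (uintptr_t)new);
        }

        int main(void)
        {
                int old_page, new_page;              /* stand-ins for buffer pages */
                _Atomic uintptr_t slot =
                        ((uintptr_t)&old_page & ~FLAG_MASK) | FLAG_HEAD;

                bool first  = head_replace(&slot, &old_page, &new_page); /* wins  */
                bool second = head_replace(&slot, &old_page, &new_page); /* fails */

                return (first && !second) ? 0 : 1;
        }

Note that, as in the function above, the new value is stored without any flag bits; the reader fixes up the flags afterwards.
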
    1301             : 
    1302             : /*
    1303             :  * rb_tail_page_update - move the tail page forward
    1304             :  */
    1305           0 : static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
    1306             :                                struct buffer_page *tail_page,
    1307             :                                struct buffer_page *next_page)
    1308             : {
    1309           0 :         unsigned long old_entries;
    1310           0 :         unsigned long old_write;
    1311             : 
    1312             :         /*
    1313             :          * The tail page now needs to be moved forward.
    1314             :          *
    1315             :          * We need to reset the tail page, but without messing
    1316             :          * with possible erasing of data brought in by interrupts
    1317             :          * that have moved the tail page and are currently on it.
    1318             :          *
    1319             :          * We add a counter to the write field to denote this.
    1320             :          */
    1321           0 :         old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
    1322           0 :         old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
    1323             : 
    1324           0 :         local_inc(&cpu_buffer->pages_touched);
    1325             :         /*
    1326             :          * Just make sure we have seen our old_write and synchronize
    1327             :          * with any interrupts that come in.
    1328             :          */
    1329           0 :         barrier();
    1330             : 
    1331             :         /*
    1332             :          * If the tail page is still the same as what we think
    1333             :          * it is, then it is up to us to update the tail
    1334             :          * pointer.
    1335             :          */
    1336           0 :         if (tail_page == READ_ONCE(cpu_buffer->tail_page)) {
    1337             :                 /* Zero the write counter */
    1338           0 :                 unsigned long val = old_write & ~RB_WRITE_MASK;
    1339           0 :                 unsigned long eval = old_entries & ~RB_WRITE_MASK;
    1340             : 
    1341             :                 /*
     1342             :                  * This will only succeed if an interrupt did
     1343             :                  * not come in and change it, in which case we
     1344             :                  * do not want to modify it.
    1345             :                  *
    1346             :                  * We add (void) to let the compiler know that we do not care
    1347             :                  * about the return value of these functions. We use the
    1348             :                  * cmpxchg to only update if an interrupt did not already
    1349             :                  * do it for us. If the cmpxchg fails, we don't care.
    1350             :                  */
    1351           0 :                 (void)local_cmpxchg(&next_page->write, old_write, val);
    1352           0 :                 (void)local_cmpxchg(&next_page->entries, old_entries, eval);
    1353             : 
    1354             :                 /*
     1355             :                  * No need to worry about races with clearing out the commit.
     1356             :                  * It can only increment when a commit takes place. But that
     1357             :                  * only happens in the outermost nested commit.
    1358             :                  */
    1359           0 :                 local_set(&next_page->page->commit, 0);
    1360             : 
    1361             :                 /* Again, either we update tail_page or an interrupt does */
    1362           0 :                 (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page);
    1363             :         }
    1364           0 : }
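
The write and entries fields are split in two: the low bits hold the offset into the page, and the bits above them count how many times the tail move was interrupted (the RB_WRITE_MASK / RB_WRITE_INTCNT macros defined earlier in this file). The sketch below shows why bumping the counter plus the compare-and-exchange keeps a nested interrupt's data intact; the bit positions used here are illustrative assumptions, not the file's definitions.

        #include <stdio.h>

        /* Illustrative values only; the real RB_WRITE_* masks live in this file. */
        #define WRITE_MASK   0xfffffUL
        #define WRITE_INTCNT (1UL << 20)

        int main(void)
        {
                unsigned long write = 0x123;                 /* current offset */

                /* rb_tail_page_update(): bump the counter, remember what we saw. */
                unsigned long old_write = write + WRITE_INTCNT;
                write = old_write;

                /* A nested interrupt appends 0x40 bytes to the same page. */
                write += 0x40;

                /*
                 * The outer context zeroes only the offset part, and only if the
                 * field is still exactly what it saw (local_cmpxchg() in the
                 * kernel). The nested write changed it, so the reset is skipped
                 * and the interrupt's data survives.
                 */
                if (write == old_write)
                        write = old_write & ~WRITE_MASK;

                printf("offset=0x%lx intcnt=%lu\n",
                       write & WRITE_MASK, write >> 20);
                return 0;
        }

The same reasoning applies to the entries field, which is updated alongside write above.
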
    1365             : 
    1366          24 : static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
    1367             :                           struct buffer_page *bpage)
    1368             : {
    1369          24 :         unsigned long val = (unsigned long)bpage;
    1370             : 
    1371          24 :         if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
    1372           0 :                 return 1;
    1373             : 
    1374             :         return 0;
    1375             : }
    1376             : 
    1377             : /**
    1378             :  * rb_check_list - make sure a pointer to a list has the last bits zero
    1379             :  */
    1380          16 : static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
    1381             :                          struct list_head *list)
    1382             : {
    1383          16 :         if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
    1384             :                 return 1;
    1385          16 :         if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
    1386           0 :                 return 1;
    1387             :         return 0;
    1388             : }
    1389             : 
    1390             : /**
    1391             :  * rb_check_pages - integrity check of buffer pages
    1392             :  * @cpu_buffer: CPU buffer with pages to test
    1393             :  *
    1394             :  * As a safety measure we check to make sure the data pages have not
    1395             :  * been corrupted.
    1396             :  */
    1397           8 : static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
    1398             : {
    1399           8 :         struct list_head *head = cpu_buffer->pages;
    1400           8 :         struct buffer_page *bpage, *tmp;
    1401             : 
    1402             :         /* Reset the head page if it exists */
    1403           8 :         if (cpu_buffer->head_page)
    1404           0 :                 rb_set_head_page(cpu_buffer);
    1405             : 
    1406           8 :         rb_head_page_deactivate(cpu_buffer);
    1407             : 
    1408           8 :         if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
    1409             :                 return -1;
    1410           8 :         if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
    1411             :                 return -1;
    1412             : 
    1413           8 :         if (rb_check_list(cpu_buffer, head))
    1414             :                 return -1;
    1415             : 
    1416          16 :         list_for_each_entry_safe(bpage, tmp, head, list) {
    1417           8 :                 if (RB_WARN_ON(cpu_buffer,
    1418             :                                bpage->list.next->prev != &bpage->list))
    1419             :                         return -1;
    1420           8 :                 if (RB_WARN_ON(cpu_buffer,
    1421             :                                bpage->list.prev->next != &bpage->list))
    1422             :                         return -1;
    1423           8 :                 if (rb_check_list(cpu_buffer, &bpage->list))
    1424             :                         return -1;
    1425             :         }
    1426             : 
    1427           8 :         rb_head_page_activate(cpu_buffer);
    1428             : 
    1429             :         return 0;
    1430             : }
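
The core of rb_check_pages() is a walk of the circular page list that verifies every next/prev pair agrees. Below is a minimal standalone sketch of that consistency check on a plain circular doubly linked list; the types are hypothetical, not <linux/list.h>.

        #include <stdbool.h>
        #include <stdio.h>

        struct node { struct node *next, *prev; };

        /* Verify next/prev agreement all the way around a circular list. */
        static bool list_is_consistent(struct node *head)
        {
                struct node *n = head;

                do {
                        if (n->next->prev != n || n->prev->next != n)
                                return false;
                        n = n->next;
                } while (n != head);

                return true;
        }

        int main(void)
        {
                struct node a, b, c;

                /* a <-> b <-> c, closed into a ring. */
                a.next = &b; b.next = &c; c.next = &a;
                a.prev = &c; b.prev = &a; c.prev = &b;

                printf("consistent: %d\n", list_is_consistent(&a));

                b.prev = &c;                    /* corrupt one link */
                printf("consistent: %d\n", list_is_consistent(&a));
                return 0;
        }
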
    1431             : 
    1432           8 : static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
    1433             :                 long nr_pages, struct list_head *pages)
    1434             : {
    1435           8 :         struct buffer_page *bpage, *tmp;
    1436           8 :         bool user_thread = current->mm != NULL;
    1437           8 :         gfp_t mflags;
    1438           8 :         long i;
    1439             : 
    1440             :         /*
     1441             :          * Check that enough memory is available first.
     1442             :          * Note, si_mem_available() only gives us a rough estimate of available
     1443             :          * memory. It may not be accurate, but we don't care; we just want
     1444             :          * to avoid doing any allocation when it is obvious that it is
     1445             :          * not going to succeed.
    1446             :          */
    1447           8 :         i = si_mem_available();
    1448           8 :         if (i < nr_pages)
    1449             :                 return -ENOMEM;
    1450             : 
    1451             :         /*
     1452             :          * The __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
     1453             :          * gracefully without invoking the OOM killer, so the system is not
     1454             :          * destabilized.
    1455             :          */
    1456           8 :         mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
    1457             : 
    1458             :         /*
     1459             :          * If a user thread allocates too much and si_mem_available()
     1460             :          * reports there's enough memory even though there is not,
     1461             :          * make sure the OOM killer kills this thread. This can happen
    1462             :          * even with RETRY_MAYFAIL because another task may be doing
    1463             :          * an allocation after this task has taken all memory.
    1464             :          * This is the task the OOM killer needs to take out during this
    1465             :          * loop, even if it was triggered by an allocation somewhere else.
    1466             :          */
    1467           8 :         if (user_thread)
    1468           0 :                 set_current_oom_origin();
    1469          24 :         for (i = 0; i < nr_pages; i++) {
    1470          16 :                 struct page *page;
    1471             : 
    1472          16 :                 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
    1473             :                                     mflags, cpu_to_node(cpu_buffer->cpu));
    1474          16 :                 if (!bpage)
    1475           0 :                         goto free_pages;
    1476             : 
    1477          16 :                 rb_check_bpage(cpu_buffer, bpage);
    1478             : 
    1479          16 :                 list_add(&bpage->list, pages);
    1480             : 
    1481          16 :                 page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0);
    1482          16 :                 if (!page)
    1483           0 :                         goto free_pages;
    1484          16 :                 bpage->page = page_address(page);
    1485          16 :                 rb_init_page(bpage->page);
    1486             : 
    1487          16 :                 if (user_thread && fatal_signal_pending(current))
    1488           0 :                         goto free_pages;
    1489             :         }
    1490           8 :         if (user_thread)
    1491           0 :                 clear_current_oom_origin();
    1492             : 
    1493             :         return 0;
    1494             : 
    1495           0 : free_pages:
    1496           0 :         list_for_each_entry_safe(bpage, tmp, pages, list) {
    1497           0 :                 list_del_init(&bpage->list);
    1498           0 :                 free_buffer_page(bpage);
    1499             :         }
    1500           0 :         if (user_thread)
    1501           0 :                 clear_current_oom_origin();
    1502             : 
    1503             :         return -ENOMEM;
    1504             : }
    1505             : 
    1506           8 : static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
    1507             :                              unsigned long nr_pages)
    1508             : {
    1509           8 :         LIST_HEAD(pages);
    1510             : 
    1511           8 :         WARN_ON(!nr_pages);
    1512             : 
    1513           8 :         if (__rb_allocate_pages(cpu_buffer, nr_pages, &pages))
    1514             :                 return -ENOMEM;
    1515             : 
    1516             :         /*
    1517             :          * The ring buffer page list is a circular list that does not
    1518             :          * start and end with a list head. All page list items point to
    1519             :          * other pages.
    1520             :          */
    1521           8 :         cpu_buffer->pages = pages.next;
    1522           8 :         list_del(&pages);
    1523             : 
    1524           8 :         cpu_buffer->nr_pages = nr_pages;
    1525             : 
    1526           8 :         rb_check_pages(cpu_buffer);
    1527             : 
    1528           8 :         return 0;
    1529             : }
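
Note the trick at the end of rb_allocate_pages(): the pages are first collected on a temporary LIST_HEAD, then cpu_buffer->pages is pointed at the first real page and the temporary head is unlinked, leaving a circular list made only of pages. A standalone sketch of that shape follows, using a tiny hand-rolled list rather than <linux/list.h>.

        #include <stdio.h>

        struct list_head { struct list_head *next, *prev; };

        static void list_init(struct list_head *h) { h->next = h->prev = h; }

        static void list_add(struct list_head *new, struct list_head *head)
        {
                new->next = head->next;
                new->prev = head;
                head->next->prev = new;
                head->next = new;
        }

        static void list_del(struct list_head *e)
        {
                e->prev->next = e->next;
                e->next->prev = e->prev;
        }

        struct page_stub { struct list_head list; int id; };

        int main(void)
        {
                struct list_head pages;         /* temporary LIST_HEAD */
                struct page_stub p[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };
                struct list_head *ring, *n;
                int i;

                list_init(&pages);
                for (i = 0; i < 3; i++)
                        list_add(&p[i].list, &pages);

                /* As in rb_allocate_pages(): point at a real page, drop the head. */
                ring = pages.next;
                list_del(&pages);

                /* The result is a ring of pages only; walk it once. */
                n = ring;
                do {
                        printf("page %d\n", ((struct page_stub *)n)->id);
                        n = n->next;
                } while (n != ring);

                return 0;
        }
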
    1530             : 
    1531             : static struct ring_buffer_per_cpu *
    1532           8 : rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
    1533             : {
    1534           8 :         struct ring_buffer_per_cpu *cpu_buffer;
    1535           8 :         struct buffer_page *bpage;
    1536           8 :         struct page *page;
    1537           8 :         int ret;
    1538             : 
    1539           8 :         cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
    1540             :                                   GFP_KERNEL, cpu_to_node(cpu));
    1541           8 :         if (!cpu_buffer)
    1542             :                 return NULL;
    1543             : 
    1544           8 :         cpu_buffer->cpu = cpu;
    1545           8 :         cpu_buffer->buffer = buffer;
    1546           8 :         raw_spin_lock_init(&cpu_buffer->reader_lock);
    1547           8 :         lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
    1548           8 :         cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
    1549           8 :         INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
    1550           8 :         init_completion(&cpu_buffer->update_done);
    1551           8 :         init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
    1552           8 :         init_waitqueue_head(&cpu_buffer->irq_work.waiters);
    1553           8 :         init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
    1554             : 
    1555           8 :         bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
    1556             :                             GFP_KERNEL, cpu_to_node(cpu));
    1557           8 :         if (!bpage)
    1558           0 :                 goto fail_free_buffer;
    1559             : 
    1560           8 :         rb_check_bpage(cpu_buffer, bpage);
    1561             : 
    1562           8 :         cpu_buffer->reader_page = bpage;
    1563           8 :         page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
    1564           8 :         if (!page)
    1565           0 :                 goto fail_free_reader;
    1566           8 :         bpage->page = page_address(page);
    1567           8 :         rb_init_page(bpage->page);
    1568             : 
    1569           8 :         INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
    1570           8 :         INIT_LIST_HEAD(&cpu_buffer->new_pages);
    1571             : 
    1572           8 :         ret = rb_allocate_pages(cpu_buffer, nr_pages);
    1573           8 :         if (ret < 0)
    1574           0 :                 goto fail_free_reader;
    1575             : 
    1576           8 :         cpu_buffer->head_page
    1577           8 :                 = list_entry(cpu_buffer->pages, struct buffer_page, list);
    1578           8 :         cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
    1579             : 
    1580           8 :         rb_head_page_activate(cpu_buffer);
    1581             : 
    1582             :         return cpu_buffer;
    1583             : 
    1584           0 :  fail_free_reader:
    1585           0 :         free_buffer_page(cpu_buffer->reader_page);
    1586             : 
    1587           0 :  fail_free_buffer:
    1588           0 :         kfree(cpu_buffer);
    1589           0 :         return NULL;
    1590             : }
    1591             : 
    1592           0 : static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
    1593             : {
    1594           0 :         struct list_head *head = cpu_buffer->pages;
    1595           0 :         struct buffer_page *bpage, *tmp;
    1596             : 
    1597           0 :         free_buffer_page(cpu_buffer->reader_page);
    1598             : 
    1599           0 :         rb_head_page_deactivate(cpu_buffer);
    1600             : 
    1601           0 :         if (head) {
    1602           0 :                 list_for_each_entry_safe(bpage, tmp, head, list) {
    1603           0 :                         list_del_init(&bpage->list);
    1604           0 :                         free_buffer_page(bpage);
    1605             :                 }
    1606           0 :                 bpage = list_entry(head, struct buffer_page, list);
    1607           0 :                 free_buffer_page(bpage);
    1608             :         }
    1609             : 
    1610           0 :         kfree(cpu_buffer);
    1611           0 : }
    1612             : 
    1613             : /**
    1614             :  * __ring_buffer_alloc - allocate a new ring_buffer
    1615             :  * @size: the size in bytes per cpu that is needed.
    1616             :  * @flags: attributes to set for the ring buffer.
    1617             :  * @key: ring buffer reader_lock_key.
    1618             :  *
    1619             :  * Currently the only flag that is available is the RB_FL_OVERWRITE
    1620             :  * flag. This flag means that the buffer will overwrite old data
    1621             :  * when the buffer wraps. If this flag is not set, the buffer will
    1622             :  * drop data when the tail hits the head.
    1623             :  */
    1624           2 : struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
    1625             :                                         struct lock_class_key *key)
    1626             : {
    1627           2 :         struct trace_buffer *buffer;
    1628           2 :         long nr_pages;
    1629           2 :         int bsize;
    1630           2 :         int cpu;
    1631           2 :         int ret;
    1632             : 
    1633             :         /* keep it in its own cache line */
    1634           2 :         buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
    1635             :                          GFP_KERNEL);
    1636           2 :         if (!buffer)
    1637             :                 return NULL;
    1638             : 
    1639           2 :         if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
    1640             :                 goto fail_free_buffer;
    1641             : 
    1642           2 :         nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
    1643           2 :         buffer->flags = flags;
    1644           2 :         buffer->clock = trace_clock_local;
    1645           2 :         buffer->reader_lock_key = key;
    1646             : 
    1647           2 :         init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
    1648           2 :         init_waitqueue_head(&buffer->irq_work.waiters);
    1649             : 
    1650             :         /* need at least two pages */
    1651           2 :         if (nr_pages < 2)
    1652             :                 nr_pages = 2;
    1653             : 
    1654           2 :         buffer->cpus = nr_cpu_ids;
    1655             : 
    1656           2 :         bsize = sizeof(void *) * nr_cpu_ids;
    1657           2 :         buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
    1658             :                                   GFP_KERNEL);
    1659           2 :         if (!buffer->buffers)
    1660           0 :                 goto fail_free_cpumask;
    1661             : 
    1662           2 :         cpu = raw_smp_processor_id();
    1663           2 :         cpumask_set_cpu(cpu, buffer->cpumask);
    1664           2 :         buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
    1665           2 :         if (!buffer->buffers[cpu])
    1666           0 :                 goto fail_free_buffers;
    1667             : 
    1668           2 :         ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
    1669           2 :         if (ret < 0)
    1670           0 :                 goto fail_free_buffers;
    1671             : 
    1672           2 :         mutex_init(&buffer->mutex);
    1673             : 
    1674           2 :         return buffer;
    1675             : 
    1676             :  fail_free_buffers:
    1677           0 :         for_each_buffer_cpu(buffer, cpu) {
    1678           0 :                 if (buffer->buffers[cpu])
    1679           0 :                         rb_free_cpu_buffer(buffer->buffers[cpu]);
    1680             :         }
    1681           0 :         kfree(buffer->buffers);
    1682             : 
    1683           0 :  fail_free_cpumask:
    1684           0 :         free_cpumask_var(buffer->cpumask);
    1685             : 
    1686           0 :  fail_free_buffer:
    1687           0 :         kfree(buffer);
    1688           0 :         return NULL;
    1689             : }
    1690             : EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
    1691             : 
    1692             : /**
    1693             :  * ring_buffer_free - free a ring buffer.
    1694             :  * @buffer: the buffer to free.
    1695             :  */
    1696             : void
    1697           0 : ring_buffer_free(struct trace_buffer *buffer)
    1698             : {
    1699           0 :         int cpu;
    1700             : 
    1701           0 :         cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
    1702             : 
    1703           0 :         for_each_buffer_cpu(buffer, cpu)
    1704           0 :                 rb_free_cpu_buffer(buffer->buffers[cpu]);
    1705             : 
    1706           0 :         kfree(buffer->buffers);
    1707           0 :         free_cpumask_var(buffer->cpumask);
    1708             : 
    1709           0 :         kfree(buffer);
    1710           0 : }
    1711             : EXPORT_SYMBOL_GPL(ring_buffer_free);
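
A typical caller pairs these two through the ring_buffer_alloc() wrapper in <linux/ring_buffer.h>, which supplies the lock_class_key for __ring_buffer_alloc(). The following is a minimal out-of-tree module sketch; the module name and the 64 KB per-CPU size are arbitrary choices for illustration, not an in-tree user.

        #include <linux/module.h>
        #include <linux/ring_buffer.h>

        /* A small per-CPU buffer in overwrite mode; freed again on module exit. */
        static struct trace_buffer *demo_buffer;

        static int __init rb_demo_init(void)
        {
                demo_buffer = ring_buffer_alloc(65536, RB_FL_OVERWRITE);
                if (!demo_buffer)
                        return -ENOMEM;
                return 0;
        }

        static void __exit rb_demo_exit(void)
        {
                ring_buffer_free(demo_buffer);
        }

        module_init(rb_demo_init);
        module_exit(rb_demo_exit);
        MODULE_LICENSE("GPL");
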
    1712             : 
    1713           0 : void ring_buffer_set_clock(struct trace_buffer *buffer,
    1714             :                            u64 (*clock)(void))
    1715             : {
    1716           0 :         buffer->clock = clock;
    1717           0 : }
    1718             : 
    1719           0 : void ring_buffer_set_time_stamp_abs(struct trace_buffer *buffer, bool abs)
    1720             : {
    1721           0 :         buffer->time_stamp_abs = abs;
    1722           0 : }
    1723             : 
    1724           0 : bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer)
    1725             : {
    1726           0 :         return buffer->time_stamp_abs;
    1727             : }
    1728             : 
    1729             : static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
    1730             : 
    1731           0 : static inline unsigned long rb_page_entries(struct buffer_page *bpage)
    1732             : {
    1733           0 :         return local_read(&bpage->entries) & RB_WRITE_MASK;
    1734             : }
    1735             : 
    1736           0 : static inline unsigned long rb_page_write(struct buffer_page *bpage)
    1737             : {
    1738           0 :         return local_read(&bpage->write) & RB_WRITE_MASK;
    1739             : }
    1740             : 
    1741             : static int
    1742           0 : rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
    1743             : {
    1744           0 :         struct list_head *tail_page, *to_remove, *next_page;
    1745           0 :         struct buffer_page *to_remove_page, *tmp_iter_page;
    1746           0 :         struct buffer_page *last_page, *first_page;
    1747           0 :         unsigned long nr_removed;
    1748           0 :         unsigned long head_bit;
    1749           0 :         int page_entries;
    1750             : 
    1751           0 :         head_bit = 0;
    1752             : 
    1753           0 :         raw_spin_lock_irq(&cpu_buffer->reader_lock);
    1754           0 :         atomic_inc(&cpu_buffer->record_disabled);
    1755             :         /*
    1756             :          * We don't race with the readers since we have acquired the reader
    1757             :          * lock. We also don't race with writers after disabling recording.
    1758             :          * This makes it easy to figure out the first and the last page to be
     1759             :          * removed from the list. We unlink all the pages in between, including
     1760             :          * the first and last pages. This is done in a busy loop so that we
     1761             :          * lose as few traces as possible.
    1762             :          * The pages are freed after we restart recording and unlock readers.
    1763             :          */
    1764           0 :         tail_page = &cpu_buffer->tail_page->list;
    1765             : 
    1766             :         /*
     1767             :          * The tail page might be on the reader page; in that case we start
     1768             :          * removing from the next page in the ring buffer.
    1769             :          */
    1770           0 :         if (cpu_buffer->tail_page == cpu_buffer->reader_page)
    1771           0 :                 tail_page = rb_list_head(tail_page->next);
    1772           0 :         to_remove = tail_page;
    1773             : 
    1774             :         /* start of pages to remove */
    1775           0 :         first_page = list_entry(rb_list_head(to_remove->next),
    1776             :                                 struct buffer_page, list);
    1777             : 
    1778           0 :         for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) {
    1779           0 :                 to_remove = rb_list_head(to_remove)->next;
    1780           0 :                 head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;
    1781             :         }
    1782             : 
    1783           0 :         next_page = rb_list_head(to_remove)->next;
    1784             : 
    1785             :         /*
    1786             :          * Now we remove all pages between tail_page and next_page.
     1787             :          * Make sure that the head_bit value is preserved for the
     1788             :          * next page.
    1789             :          */
    1790           0 :         tail_page->next = (struct list_head *)((unsigned long)next_page |
    1791             :                                                 head_bit);
    1792           0 :         next_page = rb_list_head(next_page);
    1793           0 :         next_page->prev = tail_page;
    1794             : 
    1795             :         /* make sure pages points to a valid page in the ring buffer */
    1796           0 :         cpu_buffer->pages = next_page;
    1797             : 
    1798             :         /* update head page */
    1799           0 :         if (head_bit)
    1800           0 :                 cpu_buffer->head_page = list_entry(next_page,
    1801             :                                                 struct buffer_page, list);
    1802             : 
    1803             :         /*
    1804             :          * change read pointer to make sure any read iterators reset
    1805             :          * themselves
    1806             :          */
    1807           0 :         cpu_buffer->read = 0;
    1808             : 
    1809             :         /* pages are removed, resume tracing and then free the pages */
    1810           0 :         atomic_dec(&cpu_buffer->record_disabled);
    1811           0 :         raw_spin_unlock_irq(&cpu_buffer->reader_lock);
    1812             : 
    1813           0 :         RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages));
    1814             : 
    1815             :         /* last buffer page to remove */
    1816           0 :         last_page = list_entry(rb_list_head(to_remove), struct buffer_page,
    1817             :                                 list);
    1818             :         tmp_iter_page = first_page;
    1819             : 
    1820           0 :         do {
    1821           0 :                 cond_resched();
    1822             : 
    1823           0 :                 to_remove_page = tmp_iter_page;
    1824           0 :                 rb_inc_page(&tmp_iter_page);
    1825             : 
    1826             :                 /* update the counters */
    1827           0 :                 page_entries = rb_page_entries(to_remove_page);
    1828           0 :                 if (page_entries) {
    1829             :                         /*
    1830             :                          * If something was added to this page, it was full
    1831             :                          * since it is not the tail page. So we deduct the
     1832             :                          * bytes consumed in the ring buffer from here.
    1833             :                          * Increment overrun to account for the lost events.
    1834             :                          */
    1835           0 :                         local_add(page_entries, &cpu_buffer->overrun);
    1836           0 :                         local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
    1837             :                 }
    1838             : 
    1839             :                 /*
     1840             :                  * We have already removed references to this list item, so just
     1841             :                  * free the buffer_page and its page.
    1842             :                  */
    1843           0 :                 free_buffer_page(to_remove_page);
    1844           0 :                 nr_removed--;
    1845             : 
    1846           0 :         } while (to_remove_page != last_page);
    1847             : 
    1848           0 :         RB_WARN_ON(cpu_buffer, nr_removed);
    1849             : 
    1850           0 :         return nr_removed == 0;
    1851             : }
    1852             : 
    1853             : static int
    1854           0 : rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
    1855             : {
    1856           0 :         struct list_head *pages = &cpu_buffer->new_pages;
    1857           0 :         int retries, success;
    1858             : 
    1859           0 :         raw_spin_lock_irq(&cpu_buffer->reader_lock);
    1860             :         /*
    1861             :          * We are holding the reader lock, so the reader page won't be swapped
     1862             :          * in the ring buffer. Now we are racing with the writer trying to
     1863             :          * move the head page and the tail page.
     1864             :          * We adapt the reader page update process:
     1865             :          * 1. We first splice the list of new pages in between
     1866             :          *    the head page and its previous page.
     1867             :          * 2. We cmpxchg the prev_page->next to point from the head page to
     1868             :          *    the start of the new pages list.
     1869             :          * 3. Finally, we update head->prev to point to the end of the new list.
     1870             :          *
     1871             :          * We will try this process 10 times to make sure that we don't keep
     1872             :          * spinning.
    1873             :          */
    1874           0 :         retries = 10;
    1875           0 :         success = 0;
    1876           0 :         while (retries--) {
    1877           0 :                 struct list_head *head_page, *prev_page, *r;
    1878           0 :                 struct list_head *last_page, *first_page;
     1879           0 :                 struct list_head *head_page_with_bit;
                      :                 struct buffer_page *hpage;
     1880             : 
                      :                 /* rb_set_head_page() may return NULL; check it before using ->list */
     1881           0 :                 hpage = rb_set_head_page(cpu_buffer);
     1882           0 :                 if (!hpage)
     1883             :                         break;
                      :                 head_page = &hpage->list;
    1884           0 :                 prev_page = head_page->prev;
    1885             : 
    1886           0 :                 first_page = pages->next;
    1887           0 :                 last_page  = pages->prev;
    1888             : 
    1889           0 :                 head_page_with_bit = (struct list_head *)
    1890           0 :                                      ((unsigned long)head_page | RB_PAGE_HEAD);
    1891             : 
    1892           0 :                 last_page->next = head_page_with_bit;
    1893           0 :                 first_page->prev = prev_page;
    1894             : 
    1895           0 :                 r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
    1896             : 
    1897           0 :                 if (r == head_page_with_bit) {
    1898             :                         /*
     1899             :                          * yay, we replaced the page pointer to our new list;
     1900             :                          * now we just have to update the head page's prev
     1901             :                          * pointer to point to the end of the list
    1902             :                          */
    1903           0 :                         head_page->prev = last_page;
    1904           0 :                         success = 1;
    1905           0 :                         break;
    1906             :                 }
    1907             :         }
    1908             : 
    1909           0 :         if (success)
    1910           0 :                 INIT_LIST_HEAD(pages);
    1911             :         /*
    1912             :          * If we weren't successful in adding in new pages, warn and stop
     1913             :          * If we weren't successful in adding the new pages, warn and stop
    1914             :          */
    1915           0 :         RB_WARN_ON(cpu_buffer, !success);
    1916           0 :         raw_spin_unlock_irq(&cpu_buffer->reader_lock);
    1917             : 
    1918             :         /* free pages if they weren't inserted */
    1919           0 :         if (!success) {
    1920           0 :                 struct buffer_page *bpage, *tmp;
    1921           0 :                 list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
    1922             :                                          list) {
    1923           0 :                         list_del_init(&bpage->list);
    1924           0 :                         free_buffer_page(bpage);
    1925             :                 }
    1926             :         }
    1927           0 :         return success;
    1928             : }
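
rb_insert_pages() splices the whole chain of new pages in front of the head page with a single cmpxchg() on the predecessor's next pointer, so a writer that has already moved the head (and cleared its HEAD tag) makes the cmpxchg() fail and the insert is retried. Below is a standalone, single-threaded sketch of just the pointer surgery, without the tag bits; the types are hypothetical.

        #include <stdio.h>

        struct node { struct node *next, *prev; int id; };

        /*
         * Splice the chain first..last in front of "head", in the same order
         * of operations rb_insert_pages() uses: fix the new chain's outer
         * links first, then swing prev->next (the kernel does that step with
         * cmpxchg() against the HEAD-tagged value), and finally repair
         * head->prev.
         */
        static void splice_before(struct node *head, struct node *first,
                                  struct node *last)
        {
                struct node *prev = head->prev;

                last->next  = head;     /* 1. new chain's tail points at head      */
                first->prev = prev;     /* 2. new chain's head points back at prev */
                prev->next  = first;    /* 3. the cmpxchg step in the kernel       */
                head->prev  = last;     /* 4. done only after the cmpxchg wins     */
        }

        int main(void)
        {
                struct node a = { .id = 1 }, b = { .id = 2 };   /* existing ring */
                struct node x = { .id = 10 }, y = { .id = 11 }; /* new pages     */
                struct node *n;

                a.next = &b; b.next = &a; a.prev = &b; b.prev = &a;
                x.next = &y; y.prev = &x;                       /* open chain    */

                splice_before(&a, &x, &y);                      /* a is "head"   */

                n = &a;
                do {
                        printf("%d\n", n->id);
                        n = n->next;
                } while (n != &a);
                return 0;
        }
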
    1929             : 
    1930           0 : static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer)
    1931             : {
    1932           0 :         int success;
    1933             : 
    1934           0 :         if (cpu_buffer->nr_pages_to_update > 0)
    1935           0 :                 success = rb_insert_pages(cpu_buffer);
    1936             :         else
    1937           0 :                 success = rb_remove_pages(cpu_buffer,
    1938           0 :                                         -cpu_buffer->nr_pages_to_update);
    1939             : 
    1940           0 :         if (success)
    1941           0 :                 cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update;
    1942           0 : }
    1943             : 
    1944           0 : static void update_pages_handler(struct work_struct *work)
    1945             : {
    1946           0 :         struct ring_buffer_per_cpu *cpu_buffer = container_of(work,
    1947             :                         struct ring_buffer_per_cpu, update_pages_work);
    1948           0 :         rb_update_pages(cpu_buffer);
    1949           0 :         complete(&cpu_buffer->update_done);
    1950           0 : }
    1951             : 
    1952             : /**
    1953             :  * ring_buffer_resize - resize the ring buffer
    1954             :  * @buffer: the buffer to resize.
    1955             :  * @size: the new size.
    1956             :  * @cpu_id: the cpu buffer to resize
    1957             :  *
    1958             :  * Minimum size is 2 * BUF_PAGE_SIZE.
    1959             :  *
    1960             :  * Returns 0 on success and < 0 on failure.
    1961             :  */
    1962           0 : int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
    1963             :                         int cpu_id)
    1964             : {
    1965           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    1966           0 :         unsigned long nr_pages;
    1967           0 :         int cpu, err;
    1968             : 
    1969             :         /*
    1970             :          * Always succeed at resizing a non-existent buffer:
    1971             :          */
    1972           0 :         if (!buffer)
    1973             :                 return 0;
    1974             : 
    1975             :         /* Make sure the requested buffer exists */
    1976           0 :         if (cpu_id != RING_BUFFER_ALL_CPUS &&
    1977           0 :             !cpumask_test_cpu(cpu_id, buffer->cpumask))
    1978             :                 return 0;
    1979             : 
    1980           0 :         nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
    1981             : 
    1982             :         /* we need a minimum of two pages */
    1983           0 :         if (nr_pages < 2)
    1984           0 :                 nr_pages = 2;
    1985             : 
    1986             :         /* prevent another thread from changing buffer sizes */
    1987           0 :         mutex_lock(&buffer->mutex);
    1988             : 
    1989             : 
    1990           0 :         if (cpu_id == RING_BUFFER_ALL_CPUS) {
    1991             :                 /*
    1992             :                  * Don't succeed if resizing is disabled, as a reader might be
     1993             :                  * manipulating the ring buffer and expecting a sane state while
    1994             :                  * this is true.
    1995             :                  */
    1996           0 :                 for_each_buffer_cpu(buffer, cpu) {
    1997           0 :                         cpu_buffer = buffer->buffers[cpu];
    1998           0 :                         if (atomic_read(&cpu_buffer->resize_disabled)) {
    1999           0 :                                 err = -EBUSY;
    2000           0 :                                 goto out_err_unlock;
    2001             :                         }
    2002             :                 }
    2003             : 
    2004             :                 /* calculate the pages to update */
    2005           0 :                 for_each_buffer_cpu(buffer, cpu) {
    2006           0 :                         cpu_buffer = buffer->buffers[cpu];
    2007             : 
    2008           0 :                         cpu_buffer->nr_pages_to_update = nr_pages -
    2009           0 :                                                         cpu_buffer->nr_pages;
    2010             :                         /*
     2011             :                          * Nothing more to do when removing pages or when there is no update.
    2012             :                          */
    2013           0 :                         if (cpu_buffer->nr_pages_to_update <= 0)
    2014           0 :                                 continue;
    2015             :                         /*
    2016             :                          * to add pages, make sure all new pages can be
    2017             :                          * allocated without receiving ENOMEM
    2018             :                          */
    2019           0 :                         INIT_LIST_HEAD(&cpu_buffer->new_pages);
    2020           0 :                         if (__rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update,
    2021             :                                                 &cpu_buffer->new_pages)) {
    2022             :                                 /* not enough memory for new pages */
    2023           0 :                                 err = -ENOMEM;
    2024           0 :                                 goto out_err;
    2025             :                         }
    2026             :                 }
    2027             : 
    2028           0 :                 get_online_cpus();
    2029             :                 /*
    2030             :                  * Fire off all the required work handlers.
    2031             :                  * We can't schedule on offline CPUs, but that's not necessary
    2032             :                  * since we can change their buffer sizes without any race.
    2033             :                  */
    2034           0 :                 for_each_buffer_cpu(buffer, cpu) {
    2035           0 :                         cpu_buffer = buffer->buffers[cpu];
    2036           0 :                         if (!cpu_buffer->nr_pages_to_update)
    2037           0 :                                 continue;
    2038             : 
    2039             :                         /* Can't run something on an offline CPU. */
    2040           0 :                         if (!cpu_online(cpu)) {
    2041           0 :                                 rb_update_pages(cpu_buffer);
    2042           0 :                                 cpu_buffer->nr_pages_to_update = 0;
    2043             :                         } else {
    2044           0 :                                 schedule_work_on(cpu,
    2045             :                                                 &cpu_buffer->update_pages_work);
    2046             :                         }
    2047             :                 }
    2048             : 
    2049             :                 /* wait for all the updates to complete */
    2050           0 :                 for_each_buffer_cpu(buffer, cpu) {
    2051           0 :                         cpu_buffer = buffer->buffers[cpu];
    2052           0 :                         if (!cpu_buffer->nr_pages_to_update)
    2053           0 :                                 continue;
    2054             : 
    2055           0 :                         if (cpu_online(cpu))
    2056           0 :                                 wait_for_completion(&cpu_buffer->update_done);
    2057           0 :                         cpu_buffer->nr_pages_to_update = 0;
    2058             :                 }
    2059             : 
    2060           0 :                 put_online_cpus();
    2061             :         } else {
    2062           0 :                 cpu_buffer = buffer->buffers[cpu_id];
    2063             : 
    2064           0 :                 if (nr_pages == cpu_buffer->nr_pages)
    2065           0 :                         goto out;
    2066             : 
    2067             :                 /*
    2068             :                  * Don't succeed if resizing is disabled, as a reader might be
    2069             :                  * manipulating the ring buffer and expects a sane state while
    2070             :                  * this is true.
    2071             :                  */
    2072           0 :                 if (atomic_read(&cpu_buffer->resize_disabled)) {
    2073           0 :                         err = -EBUSY;
    2074           0 :                         goto out_err_unlock;
    2075             :                 }
    2076             : 
    2077           0 :                 cpu_buffer->nr_pages_to_update = nr_pages -
    2078           0 :                                                 cpu_buffer->nr_pages;
    2079             : 
    2080           0 :                 INIT_LIST_HEAD(&cpu_buffer->new_pages);
    2081           0 :                 if (cpu_buffer->nr_pages_to_update > 0 &&
    2082           0 :                         __rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update,
    2083             :                                             &cpu_buffer->new_pages)) {
    2084           0 :                         err = -ENOMEM;
    2085           0 :                         goto out_err;
    2086             :                 }
    2087             : 
    2088           0 :                 get_online_cpus();
    2089             : 
    2090             :                 /* Can't run something on an offline CPU. */
    2091           0 :                 if (!cpu_online(cpu_id))
    2092           0 :                         rb_update_pages(cpu_buffer);
    2093             :                 else {
    2094           0 :                         schedule_work_on(cpu_id,
    2095             :                                          &cpu_buffer->update_pages_work);
    2096           0 :                         wait_for_completion(&cpu_buffer->update_done);
    2097             :                 }
    2098             : 
    2099           0 :                 cpu_buffer->nr_pages_to_update = 0;
    2100           0 :                 put_online_cpus();
    2101             :         }
    2102             : 
    2103           0 :  out:
    2104             :         /*
    2105             :          * The ring buffer resize can happen with the ring buffer
    2106             :          * enabled, so that the update disturbs the tracing as little
    2107             :          * as possible. But if the buffer is disabled, we do not need
    2108             :          * to worry about that, and we can take the time to verify
    2109             :          * that the buffer is not corrupt.
    2110             :          */
    2111           0 :         if (atomic_read(&buffer->record_disabled)) {
    2112           0 :                 atomic_inc(&buffer->record_disabled);
    2113             :                 /*
    2114             :                  * Even though the buffer was disabled, we must make sure
    2115             :                  * that it is truly disabled before calling rb_check_pages.
    2116             :                  * There could have been a race between checking
    2117             :                  * record_disabled and incrementing it.
    2118             :                  */
    2119           0 :                 synchronize_rcu();
    2120           0 :                 for_each_buffer_cpu(buffer, cpu) {
    2121           0 :                         cpu_buffer = buffer->buffers[cpu];
    2122           0 :                         rb_check_pages(cpu_buffer);
    2123             :                 }
    2124           0 :                 atomic_dec(&buffer->record_disabled);
    2125             :         }
    2126             : 
    2127           0 :         mutex_unlock(&buffer->mutex);
    2128           0 :         return 0;
    2129             : 
    2130             :  out_err:
    2131           0 :         for_each_buffer_cpu(buffer, cpu) {
    2132           0 :                 struct buffer_page *bpage, *tmp;
    2133             : 
    2134           0 :                 cpu_buffer = buffer->buffers[cpu];
    2135           0 :                 cpu_buffer->nr_pages_to_update = 0;
    2136             : 
    2137           0 :                 if (list_empty(&cpu_buffer->new_pages))
    2138           0 :                         continue;
    2139             : 
    2140           0 :                 list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
    2141             :                                         list) {
    2142           0 :                         list_del_init(&bpage->list);
    2143           0 :                         free_buffer_page(bpage);
    2144             :                 }
    2145             :         }
    2146           0 :  out_err_unlock:
    2147           0 :         mutex_unlock(&buffer->mutex);
    2148           0 :         return err;
    2149             : }
    2150             : EXPORT_SYMBOL_GPL(ring_buffer_resize);
    2151             : 
    2152           0 : void ring_buffer_change_overwrite(struct trace_buffer *buffer, int val)
    2153             : {
    2154           0 :         mutex_lock(&buffer->mutex);
    2155           0 :         if (val)
    2156           0 :                 buffer->flags |= RB_FL_OVERWRITE;
    2157             :         else
    2158           0 :                 buffer->flags &= ~RB_FL_OVERWRITE;
    2159           0 :         mutex_unlock(&buffer->mutex);
    2160           0 : }
    2161             : EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
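
As a usage illustration of the two exported calls above, the hedged sketch below (not taken from this file) allocates a private trace_buffer, grows every per-CPU buffer, and then clears overwrite mode so that new events are dropped rather than the oldest ones. ring_buffer_alloc(), ring_buffer_free() and RING_BUFFER_ALL_CPUS are assumed to come from <linux/ring_buffer.h>; the helper name and sizes are invented, and the error handling mirrors the -EBUSY/-ENOMEM paths of ring_buffer_resize().

    /* Hedged sketch: set up and resize a private ring buffer.
     * Only the exported ring buffer API is used; names and sizes are
     * illustrative, not part of this file. */
    #include <linux/errno.h>
    #include <linux/ring_buffer.h>

    static int example_setup_buffer(struct trace_buffer **out)
    {
            struct trace_buffer *buf;
            int err;

            buf = ring_buffer_alloc(64 * 1024, RB_FL_OVERWRITE);
            if (!buf)
                    return -ENOMEM;

            /* Grow every per-CPU buffer to 1 MiB. The size is in bytes and
             * is rounded up to whole buffer pages (minimum of two pages). */
            err = ring_buffer_resize(buf, 1024 * 1024, RING_BUFFER_ALL_CPUS);
            if (err) {      /* -EBUSY while a reader blocks resizing, or -ENOMEM */
                    ring_buffer_free(buf);
                    return err;
            }

            /* Drop new events when full instead of overwriting old ones. */
            ring_buffer_change_overwrite(buf, 0);

            *out = buf;
            return 0;
    }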
    2162             : 
    2163           0 : static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
    2164             : {
    2165           0 :         return bpage->page->data + index;
    2166             : }
    2167             : 
    2168             : static __always_inline struct ring_buffer_event *
    2169           0 : rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
    2170             : {
    2171           0 :         return __rb_page_index(cpu_buffer->reader_page,
    2172           0 :                                cpu_buffer->reader_page->read);
    2173             : }
    2174             : 
    2175           0 : static __always_inline unsigned rb_page_commit(struct buffer_page *bpage)
    2176             : {
    2177           0 :         return local_read(&bpage->page->commit);
    2178             : }
    2179             : 
    2180             : static struct ring_buffer_event *
    2181           0 : rb_iter_head_event(struct ring_buffer_iter *iter)
    2182             : {
    2183           0 :         struct ring_buffer_event *event;
    2184           0 :         struct buffer_page *iter_head_page = iter->head_page;
    2185           0 :         unsigned long commit;
    2186           0 :         unsigned length;
    2187             : 
    2188           0 :         if (iter->head != iter->next_event)
    2189           0 :                 return iter->event;
    2190             : 
    2191             :         /*
    2192             :          * When the writer goes across pages, it issues a cmpxchg which
    2193             :          * is a mb(), which will synchronize with the rmb here.
    2194             :          * (see rb_tail_page_update() and __rb_reserve_next())
    2195             :          */
    2196           0 :         commit = rb_page_commit(iter_head_page);
    2197           0 :         smp_rmb();
    2198           0 :         event = __rb_page_index(iter_head_page, iter->head);
    2199           0 :         length = rb_event_length(event);
    2200             : 
    2201             :         /*
    2202             :          * READ_ONCE() doesn't work on functions and we don't want the
    2203             :          * compiler doing any crazy optimizations with length.
    2204             :          */
    2205           0 :         barrier();
    2206             : 
    2207           0 :         if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE)
    2208             :                 /* Writer corrupted the read? */
    2209           0 :                 goto reset;
    2210             : 
    2211           0 :         memcpy(iter->event, event, length);
    2212             :         /*
    2213             :          * If the page stamp is still the same after this rmb() then the
    2214             :          * event was safely copied without the writer entering the page.
    2215             :          */
    2216           0 :         smp_rmb();
    2217             : 
    2218             :         /* Make sure the page didn't change since we read this */
    2219           0 :         if (iter->page_stamp != iter_head_page->page->time_stamp ||
    2220           0 :             commit > rb_page_commit(iter_head_page))
    2221           0 :                 goto reset;
    2222             : 
    2223           0 :         iter->next_event = iter->head + length;
    2224           0 :         return iter->event;
    2225           0 :  reset:
    2226             :         /* Reset to the beginning */
    2227           0 :         iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp;
    2228           0 :         iter->head = 0;
    2229           0 :         iter->next_event = 0;
    2230           0 :         iter->missed_events = 1;
    2231           0 :         return NULL;
    2232             : }
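
rb_iter_head_event() above is an optimistic reader: it samples the commit count, copies the event out, and then re-checks the page timestamp and commit to detect a writer that raced with the copy, resetting the iterator if the check fails. The self-contained userspace sketch below shows the same copy-then-revalidate idea with invented names; C11 atomics stand in for the kernel's smp_rmb() pairing, and a single stamp counter stands in for the page timestamp.

    /* Hedged sketch of the "copy, then re-validate" read used above.
     * All names are invented for the example. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <string.h>

    struct demo_page {
            _Atomic unsigned long stamp;    /* writer bumps this when it reuses the page */
            char data[256];
    };

    /* Returns true only if @dst holds a consistent snapshot of @len bytes at @off. */
    static bool demo_copy_event(struct demo_page *p, unsigned off, unsigned len, char *dst)
    {
            unsigned long stamp = atomic_load_explicit(&p->stamp, memory_order_acquire);

            memcpy(dst, p->data + off, len);        /* may race with the writer */

            /* If the stamp is unchanged, the writer never entered the page while
             * we copied, so the snapshot can be trusted; otherwise the caller
             * starts over, just as rb_iter_head_event() resets the iterator. */
            atomic_thread_fence(memory_order_acquire);
            return atomic_load_explicit(&p->stamp, memory_order_relaxed) == stamp;
    }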
    2233             : 
    2234             : /* Size is determined by what has been committed */
    2235           0 : static __always_inline unsigned rb_page_size(struct buffer_page *bpage)
    2236             : {
    2237           0 :         return rb_page_commit(bpage);
    2238             : }
    2239             : 
    2240             : static __always_inline unsigned
    2241           0 : rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
    2242             : {
    2243           0 :         return rb_page_commit(cpu_buffer->commit_page);
    2244             : }
    2245             : 
    2246             : static __always_inline unsigned
    2247           0 : rb_event_index(struct ring_buffer_event *event)
    2248             : {
    2249           0 :         unsigned long addr = (unsigned long)event;
    2250             : 
    2251           0 :         return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
    2252             : }
    2253             : 
    2254           0 : static void rb_inc_iter(struct ring_buffer_iter *iter)
    2255             : {
    2256           0 :         struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
    2257             : 
    2258             :         /*
    2259             :          * The iterator could be on the reader page (it starts there).
    2260             :          * But the head could have moved, since the reader was
    2261             :          * found. Check for this case and assign the iterator
    2262             :          * to the head page instead of next.
    2263             :          */
    2264           0 :         if (iter->head_page == cpu_buffer->reader_page)
    2265           0 :                 iter->head_page = rb_set_head_page(cpu_buffer);
    2266             :         else
    2267           0 :                 rb_inc_page(&iter->head_page);
    2268             : 
    2269           0 :         iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp;
    2270           0 :         iter->head = 0;
    2271           0 :         iter->next_event = 0;
    2272           0 : }
    2273             : 
    2274             : /*
    2275             :  * rb_handle_head_page - writer hit the head page
    2276             :  *
    2277             :  * Returns: +1 to retry page
    2278             :  *           0 to continue
    2279             :  *          -1 on error
    2280             :  */
    2281             : static int
    2282           0 : rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
    2283             :                     struct buffer_page *tail_page,
    2284             :                     struct buffer_page *next_page)
    2285             : {
    2286           0 :         struct buffer_page *new_head;
    2287           0 :         int entries;
    2288           0 :         int type;
    2289           0 :         int ret;
    2290             : 
    2291           0 :         entries = rb_page_entries(next_page);
    2292             : 
    2293             :         /*
    2294             :          * The hard part is here. We need to move the head
    2295             :          * forward, and protect against both readers on
    2296             :          * other CPUs and writers coming in via interrupts.
    2297             :          */
    2298           0 :         type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
    2299             :                                        RB_PAGE_HEAD);
    2300             : 
    2301             :         /*
    2302             :          * type can be one of four:
    2303             :          *  NORMAL - an interrupt already moved it for us
    2304             :          *  HEAD   - we are the first to get here.
    2305             :          *  UPDATE - we are the interrupt interrupting
    2306             :          *           a current move.
    2307             :          *  MOVED  - a reader on another CPU moved the next
    2308             :          *           pointer to its reader page. Give up
    2309             :          *           and try again.
    2310             :          */
    2311             : 
    2312           0 :         switch (type) {
    2313           0 :         case RB_PAGE_HEAD:
    2314             :                 /*
    2315             :                  * We changed the head to UPDATE, thus
    2316             :                  * it is our responsibility to update
    2317             :                  * the counters.
    2318             :                  */
    2319           0 :                 local_add(entries, &cpu_buffer->overrun);
    2320           0 :                 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
    2321             : 
    2322             :                 /*
    2323             :                  * The entries will be zeroed out when we move the
    2324             :                  * tail page.
    2325             :                  */
    2326             : 
    2327             :                 /* still more to do */
    2328             :                 break;
    2329             : 
    2330             :         case RB_PAGE_UPDATE:
    2331             :                 /*
    2332             :                  * This is an interrupt that interrupted the
    2333             :                  * previous update. Still more to do.
    2334             :                  */
    2335             :                 break;
    2336             :         case RB_PAGE_NORMAL:
    2337             :                 /*
    2338             :                  * An interrupt came in before the update
    2339             :                  * and processed this for us.
    2340             :                  * Nothing left to do.
    2341             :                  */
    2342             :                 return 1;
    2343             :         case RB_PAGE_MOVED:
    2344             :                 /*
    2345             :                  * The reader is on another CPU and just did
    2346             :                  * a swap with our next_page.
    2347             :                  * Try again.
    2348             :                  */
    2349             :                 return 1;
    2350             :         default:
    2351           0 :                 RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
    2352           0 :                 return -1;
    2353             :         }
    2354             : 
    2355             :         /*
    2356             :          * Now that we are here, the old head pointer is
    2357             :          * set to UPDATE. This will keep the reader from
    2358             :          * swapping the head page with the reader page.
    2359             :          * The reader (on another CPU) will spin till
    2360             :          * we are finished.
    2361             :          *
    2362             :          * We just need to protect against interrupts
    2363             :          * doing the job. We will set the next pointer
    2364             :          * to HEAD. After that, we set the old pointer
    2365             :          * to NORMAL, but only if it was HEAD before.
    2366             :          * Otherwise we are an interrupt, and only
    2367             :          * want the outermost commit to reset it.
    2368             :          */
    2369           0 :         new_head = next_page;
    2370           0 :         rb_inc_page(&new_head);
    2371             : 
    2372           0 :         ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
    2373             :                                     RB_PAGE_NORMAL);
    2374             : 
    2375             :         /*
    2376             :          * Valid returns are:
    2377             :          *  HEAD   - an interrupt came in and already set it.
    2378             :          *  NORMAL - One of two things:
    2379             :          *            1) We really set it.
    2380             :          *            2) A bunch of interrupts came in and moved
    2381             :          *               the page forward again.
    2382             :          */
    2383           0 :         switch (ret) {
    2384             :         case RB_PAGE_HEAD:
    2385             :         case RB_PAGE_NORMAL:
    2386             :                 /* OK */
    2387           0 :                 break;
    2388             :         default:
    2389           0 :                 RB_WARN_ON(cpu_buffer, 1);
    2390           0 :                 return -1;
    2391             :         }
    2392             : 
    2393             :         /*
    2394             :          * It is possible that an interrupt came in,
    2395             :          * set the head up, then more interrupts came in
    2396             :          * and moved it again. When we get back here,
    2397             :          * the page would have been set to NORMAL but we
    2398             :          * just set it back to HEAD.
    2399             :          *
    2400             :          * How do you detect this? Well, if that happened
    2401             :          * the tail page would have moved.
    2402             :          */
    2403           0 :         if (ret == RB_PAGE_NORMAL) {
    2404           0 :                 struct buffer_page *buffer_tail_page;
    2405             : 
    2406           0 :                 buffer_tail_page = READ_ONCE(cpu_buffer->tail_page);
    2407             :                 /*
    2408             :                  * If the tail had moved past next, then we need
    2409             :                  * to reset the pointer.
    2410             :                  */
    2411           0 :                 if (buffer_tail_page != tail_page &&
    2412           0 :                     buffer_tail_page != next_page)
    2413           0 :                         rb_head_page_set_normal(cpu_buffer, new_head,
    2414             :                                                 next_page,
    2415             :                                                 RB_PAGE_HEAD);
    2416             :         }
    2417             : 
    2418             :         /*
    2419             :          * If this was the outermost commit (the one that
    2420             :          * changed the original pointer from HEAD to UPDATE),
    2421             :          * then it is up to us to reset it to NORMAL.
    2422             :          */
    2423           0 :         if (type == RB_PAGE_HEAD) {
    2424           0 :                 ret = rb_head_page_set_normal(cpu_buffer, next_page,
    2425             :                                               tail_page,
    2426             :                                               RB_PAGE_UPDATE);
    2427           0 :                 if (RB_WARN_ON(cpu_buffer,
    2428             :                                ret != RB_PAGE_UPDATE))
    2429           0 :                         return -1;
    2430             :         }
    2431             : 
    2432             :         return 0;
    2433             : }
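
The NORMAL/HEAD/UPDATE states that rb_handle_head_page() moves between live in the low bits of the pages' list pointers, and every transition is done with a compare-and-swap so that an interrupting writer or a remote reader can always tell who won (MOVED is the "a reader beat us" outcome). The sketch below illustrates that pointer-tagging technique; the tag values and helper names are illustrative, not this file's exact definitions.

    /* Hedged sketch: track a page's role by tagging the low bits of its
     * "next" pointer and flipping the tag atomically with compare-and-swap. */
    #include <stdatomic.h>
    #include <stdint.h>

    #define TAG_NORMAL 0UL          /* plain page */
    #define TAG_HEAD   1UL          /* the next page is the head page */
    #define TAG_UPDATE 2UL          /* a writer is currently moving the head */
    #define TAG_MASK   3UL

    static unsigned long tag_ptr(void *page, unsigned long tag)
    {
            return ((uintptr_t)page & ~TAG_MASK) | tag;
    }

    /* Try to change @page's tag in *@next from @old_tag to @new_tag.
     * Returns the tag that was actually found, so the caller can see
     * whether an interrupt or a remote reader got there first. */
    static unsigned long tag_set(_Atomic unsigned long *next, void *page,
                                 unsigned long old_tag, unsigned long new_tag)
    {
            unsigned long expected = tag_ptr(page, old_tag);

            atomic_compare_exchange_strong(next, &expected, tag_ptr(page, new_tag));
            return expected & TAG_MASK;     /* equals old_tag when the swap succeeded */
    }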
    2434             : 
    2435             : static inline void
    2436           0 : rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
    2437             :               unsigned long tail, struct rb_event_info *info)
    2438             : {
    2439           0 :         struct buffer_page *tail_page = info->tail_page;
    2440           0 :         struct ring_buffer_event *event;
    2441           0 :         unsigned long length = info->length;
    2442             : 
    2443             :         /*
    2444             :          * Only the event that crossed the page boundary
    2445             :          * must fill the old tail_page with padding.
    2446             :          */
    2447           0 :         if (tail >= BUF_PAGE_SIZE) {
    2448             :                 /*
    2449             :                  * If the page was filled, then we still need
    2450             :                  * to update the real_end. Reset it to zero
    2451             :                  * and the reader will ignore it.
    2452             :                  */
    2453           0 :                 if (tail == BUF_PAGE_SIZE)
    2454           0 :                         tail_page->real_end = 0;
    2455             : 
    2456           0 :                 local_sub(length, &tail_page->write);
    2457           0 :                 return;
    2458             :         }
    2459             : 
    2460           0 :         event = __rb_page_index(tail_page, tail);
    2461             : 
    2462             :         /* account for padding bytes */
    2463           0 :         local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
    2464             : 
    2465             :         /*
    2466             :          * Save the original length in the metadata.
    2467             :          * The reader will use it to update the lost
    2468             :          * event counter.
    2469             :          */
    2470           0 :         tail_page->real_end = tail;
    2471             : 
    2472             :         /*
    2473             :          * If this event is bigger than the minimum size, then
    2474             :          * we need to be careful that we don't subtract the
    2475             :          * write counter enough to allow another writer to slip
    2476             :          * in on this page.
    2477             :          * We put in a discarded commit instead, to make sure
    2478             :          * that this space is not used again.
    2479             :          *
    2480             :          * If we are less than the minimum size, we don't need to
    2481             :          * worry about it.
    2482             :          */
    2483           0 :         if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
    2484             :                 /* No room for any events */
    2485             : 
    2486             :                 /* Mark the rest of the page with padding */
    2487           0 :                 rb_event_set_padding(event);
    2488             : 
    2489             :                 /* Set the write back to the previous setting */
    2490           0 :                 local_sub(length, &tail_page->write);
    2491           0 :                 return;
    2492             :         }
    2493             : 
    2494             :         /* Put in a discarded event */
    2495           0 :         event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
    2496           0 :         event->type_len = RINGBUF_TYPE_PADDING;
    2497             :         /* time delta must be non zero */
    2498           0 :         event->time_delta = 1;
    2499             : 
    2500             :         /* Set write to end of buffer */
    2501           0 :         length = (tail + length) - BUF_PAGE_SIZE;
    2502           0 :         local_sub(length, &tail_page->write);
    2503             : }
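
To put numbers on the padding path above: when an event cannot fit, the space left on the page becomes a single discarded PADDING event, and array[0] records only the payload size, i.e. the remaining bytes minus the event header. A standalone sketch with assumed constants (the DEMO_* values stand in for BUF_PAGE_SIZE and RB_EVNT_HDR_SIZE and are not read from this build):

    /* Hedged sketch of the rb_reset_tail() padding arithmetic.
     * Assumes a 4096-byte page minus a 16-byte buffer_data_page header,
     * and a 4-byte event header; both are assumptions for the demo. */
    #include <stdio.h>

    #define DEMO_BUF_PAGE_SIZE   4080u
    #define DEMO_RB_EVNT_HDR_SIZE   4u

    int main(void)
    {
            unsigned tail = 4000;   /* where the too-big event would have started */
            unsigned pad  = DEMO_BUF_PAGE_SIZE - tail;

            /* The discarded event covers the rest of the page; array[0]
             * holds just the payload length behind the event header. */
            printf("padding event: %u bytes total, array[0] = %u\n",
                   pad, pad - DEMO_RB_EVNT_HDR_SIZE);
            return 0;
    }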
    2504             : 
    2505             : static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer);
    2506             : 
    2507             : /*
    2508             :  * This is the slow path, force gcc not to inline it.
    2509             :  */
    2510             : static noinline struct ring_buffer_event *
    2511           0 : rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
    2512             :              unsigned long tail, struct rb_event_info *info)
    2513             : {
    2514           0 :         struct buffer_page *tail_page = info->tail_page;
    2515           0 :         struct buffer_page *commit_page = cpu_buffer->commit_page;
    2516           0 :         struct trace_buffer *buffer = cpu_buffer->buffer;
    2517           0 :         struct buffer_page *next_page;
    2518           0 :         int ret;
    2519             : 
    2520           0 :         next_page = tail_page;
    2521             : 
    2522           0 :         rb_inc_page(&next_page);
    2523             : 
    2524             :         /*
    2525             :          * If for some reason, we had an interrupt storm that made
    2526             :          * it all the way around the buffer, bail, and warn
    2527             :          * about it.
    2528             :          */
    2529           0 :         if (unlikely(next_page == commit_page)) {
    2530           0 :                 local_inc(&cpu_buffer->commit_overrun);
    2531           0 :                 goto out_reset;
    2532             :         }
    2533             : 
    2534             :         /*
    2535             :          * This is where the fun begins!
    2536             :          *
    2537             :          * We are fighting against races between a reader that
    2538             :          * could be on another CPU trying to swap its reader
    2539             :          * page with the buffer head.
    2540             :          *
    2541             :          * We are also fighting against interrupts coming in and
    2542             :          * moving the head or tail on us as well.
    2543             :          *
    2544             :          * If the next page is the head page then we have filled
    2545             :          * the buffer, unless the commit page is still on the
    2546             :          * reader page.
    2547             :          */
    2548           0 :         if (rb_is_head_page(next_page, &tail_page->list)) {
    2549             : 
    2550             :                 /*
    2551             :                  * If the commit is not on the reader page, then
    2552             :                  * move the head page.
    2553             :                  */
    2554           0 :                 if (!rb_is_reader_page(cpu_buffer->commit_page)) {
    2555             :                         /*
    2556             :                          * If we are not in overwrite mode,
    2557             :                          * this is easy, just stop here.
    2558             :                          */
    2559           0 :                         if (!(buffer->flags & RB_FL_OVERWRITE)) {
    2560           0 :                                 local_inc(&cpu_buffer->dropped_events);
    2561           0 :                                 goto out_reset;
    2562             :                         }
    2563             : 
    2564           0 :                         ret = rb_handle_head_page(cpu_buffer,
    2565             :                                                   tail_page,
    2566             :                                                   next_page);
    2567           0 :                         if (ret < 0)
    2568           0 :                                 goto out_reset;
    2569           0 :                         if (ret)
    2570           0 :                                 goto out_again;
    2571             :                 } else {
    2572             :                         /*
    2573             :                          * We need to be careful here too. The
    2574             :                          * commit page could still be on the reader
    2575             :                          * page. We could have a small buffer, and
    2576             :                          * have filled up the buffer with events
    2577             :                          * from interrupts and such, and wrapped.
    2578             :                          *
    2579             :                          * Note, if the tail page is also on the
    2580             :                          * reader_page, we let it move out.
    2581             :                          */
    2582           0 :                         if (unlikely((cpu_buffer->commit_page !=
    2583             :                                       cpu_buffer->tail_page) &&
    2584             :                                      (cpu_buffer->commit_page ==
    2585             :                                       cpu_buffer->reader_page))) {
    2586           0 :                                 local_inc(&cpu_buffer->commit_overrun);
    2587           0 :                                 goto out_reset;
    2588             :                         }
    2589             :                 }
    2590             :         }
    2591             : 
    2592           0 :         rb_tail_page_update(cpu_buffer, tail_page, next_page);
    2593             : 
    2594           0 :  out_again:
    2595             : 
    2596           0 :         rb_reset_tail(cpu_buffer, tail, info);
    2597             : 
    2598             :         /* Commit what we have for now. */
    2599           0 :         rb_end_commit(cpu_buffer);
    2600             :         /* rb_end_commit() decs committing */
    2601           0 :         local_inc(&cpu_buffer->committing);
    2602             : 
    2603             :         /* fail and let the caller try again */
    2604           0 :         return ERR_PTR(-EAGAIN);
    2605             : 
    2606           0 :  out_reset:
    2607             :         /* reset write */
    2608           0 :         rb_reset_tail(cpu_buffer, tail, info);
    2609             : 
    2610           0 :         return NULL;
    2611             : }
    2612             : 
    2613             : /* Slow path */
    2614             : static struct ring_buffer_event *
    2615           0 : rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
    2616             : {
    2617           0 :         if (abs)
    2618           0 :                 event->type_len = RINGBUF_TYPE_TIME_STAMP;
    2619             :         else
    2620           0 :                 event->type_len = RINGBUF_TYPE_TIME_EXTEND;
    2621             : 
    2622             :         /* Not the first event on the page, or not delta? */
    2623           0 :         if (abs || rb_event_index(event)) {
    2624           0 :                 event->time_delta = delta & TS_MASK;
    2625           0 :                 event->array[0] = delta >> TS_SHIFT;
    2626             :         } else {
    2627             :                 /* nope, just zero it */
    2628           0 :                 event->time_delta = 0;
    2629           0 :                 event->array[0] = 0;
    2630             :         }
    2631             : 
    2632           0 :         return skip_time_extend(event);
    2633             : }
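
The packing above stores the low 27 bits of an oversized delta in the normal time_delta field and the remaining bits in array[0]; readers reassemble the value with the same shift. A standalone round-trip demonstration, assuming TS_SHIFT is 27 to match the 27-bit time_delta field shown in the entry header dump:

    /* Hedged sketch of the TIME_EXTEND delta packing used above.
     * The DEMO_TS_* values are assumptions for the demo. */
    #include <stdint.h>
    #include <stdio.h>

    #define DEMO_TS_SHIFT 27
    #define DEMO_TS_MASK  ((1ULL << DEMO_TS_SHIFT) - 1)

    int main(void)
    {
            uint64_t delta      = 123456789012ULL;          /* too big for 27 bits */
            uint32_t time_delta = delta & DEMO_TS_MASK;     /* low bits -> header field */
            uint32_t array0     = delta >> DEMO_TS_SHIFT;   /* high bits -> array[0]   */
            uint64_t rebuilt    = ((uint64_t)array0 << DEMO_TS_SHIFT) | time_delta;

            printf("delta=%llu rebuilt=%llu\n",
                   (unsigned long long)delta, (unsigned long long)rebuilt);
            return 0;
    }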
    2634             : 
    2635             : #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
    2636             : static inline bool sched_clock_stable(void)
    2637             : {
    2638             :         return true;
    2639             : }
    2640             : #endif
    2641             : 
    2642             : static void
    2643           0 : rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
    2644             :                    struct rb_event_info *info)
    2645             : {
    2646           0 :         u64 write_stamp;
    2647             : 
    2648           0 :         WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s",
    2649             :                   (unsigned long long)info->delta,
    2650             :                   (unsigned long long)info->ts,
    2651             :                   (unsigned long long)info->before,
    2652             :                   (unsigned long long)info->after,
    2653             :                   (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
    2654             :                   sched_clock_stable() ? "" :
    2655             :                   "If you just came from a suspend/resume,\n"
    2656             :                   "please switch to the trace global clock:\n"
    2657             :                   "  echo global > /sys/kernel/debug/tracing/trace_clock\n"
    2658             :                   "or add trace_clock=global to the kernel command line\n");
    2659           0 : }
    2660             : 
    2661           0 : static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
    2662             :                                       struct ring_buffer_event **event,
    2663             :                                       struct rb_event_info *info,
    2664             :                                       u64 *delta,
    2665             :                                       unsigned int *length)
    2666             : {
    2667           0 :         bool abs = info->add_timestamp &
    2668             :                 (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
    2669             : 
    2670           0 :         if (unlikely(info->delta > (1ULL << 59))) {
    2671             :                 /* did the clock go backwards */
    2672           0 :                 if (info->before == info->after && info->before > info->ts) {
    2673             :                         /* not interrupted */
    2674           0 :                         static int once;
    2675             : 
    2676             :                         /*
    2677             :                          * This is possible with a recalibration of the TSC.
    2678             :                          * Do not produce a call stack, but just report it.
    2679             :                          */
    2680           0 :                         if (!once) {
    2681           0 :                                 once++;
    2682           0 :                                 pr_warn("Ring buffer clock went backwards: %llu -> %llu\n",
    2683             :                                         info->before, info->ts);
    2684             :                         }
    2685             :                 } else
    2686           0 :                         rb_check_timestamp(cpu_buffer, info);
    2687           0 :                 if (!abs)
    2688           0 :                         info->delta = 0;
    2689             :         }
    2690           0 :         *event = rb_add_time_stamp(*event, info->delta, abs);
    2691           0 :         *length -= RB_LEN_TIME_EXTEND;
    2692           0 :         *delta = 0;
    2693           0 : }
    2694             : 
    2695             : /**
    2696             :  * rb_update_event - update event type and data
    2697             :  * @cpu_buffer: The per cpu buffer of the @event
    2698             :  * @event: the event to update
    2699             :  * @info: The info to update the @event with (contains length and delta)
    2700             :  *
    2701             :  * Update the type and data fields of the @event. The length
    2702             :  * is the actual size that is written to the ring buffer,
    2703             :  * and with this, we can determine what to place into the
    2704             :  * data field.
    2705             :  */
    2706             : static void
    2707           0 : rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
    2708             :                 struct ring_buffer_event *event,
    2709             :                 struct rb_event_info *info)
    2710             : {
    2711           0 :         unsigned length = info->length;
    2712           0 :         u64 delta = info->delta;
    2713             : 
    2714             :         /*
    2715             :          * If we need to add a timestamp, then we
    2716             :          * add it to the start of the reserved space.
    2717             :          */
    2718           0 :         if (unlikely(info->add_timestamp))
    2719           0 :                 rb_add_timestamp(cpu_buffer, &event, info, &delta, &length);
    2720             : 
    2721           0 :         event->time_delta = delta;
    2722           0 :         length -= RB_EVNT_HDR_SIZE;
    2723           0 :         if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
    2724           0 :                 event->type_len = 0;
    2725           0 :                 event->array[0] = length;
    2726             :         } else
    2727           0 :                 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
    2728           0 : }
    2729             : 
    2730           0 : static unsigned rb_calculate_event_length(unsigned length)
    2731             : {
    2732           0 :         struct ring_buffer_event event; /* Used only for sizeof array */
    2733             : 
    2734             :         /* zero length can cause confusion */
    2735           0 :         if (!length)
    2736             :                 length++;
    2737             : 
    2738           0 :         if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
    2739           0 :                 length += sizeof(event.array[0]);
    2740             : 
    2741           0 :         length += RB_EVNT_HDR_SIZE;
    2742           0 :         length = ALIGN(length, RB_ARCH_ALIGNMENT);
    2743             : 
    2744             :         /*
    2745             :          * In case the time delta is larger than the 27 bits for it
    2746             :          * in the header, we need to add a timestamp. If another
    2747             :          * event comes in when trying to discard this one to increase
    2748             :          * the length, then the timestamp will be added in the allocated
    2749             :          * space of this event. If length is bigger than the size needed
    2750             :          * for the TIME_EXTEND, then padding has to be used. The event's
    2751             :          * length must be either RB_LEN_TIME_EXTEND, or greater than or equal
    2752             :          * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding.
    2753             :          * As length is a multiple of 4, we only need to worry if it
    2754             :          * is 12 (RB_LEN_TIME_EXTEND + 4).
    2755             :          */
    2756           0 :         if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)
    2757           0 :                 length += RB_ALIGNMENT;
    2758             : 
    2759           0 :         return length;
    2760             : }
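
To make those length rules concrete, the hedged sketch below mirrors the arithmetic for a configuration where RB_FORCE_8BYTE_ALIGNMENT is off; every constant is an assumption for the demo rather than a value read from this build.

    /* Hedged re-implementation of the event length calculation, for
     * illustration only. Constants mirror a common configuration. */
    #include <stdio.h>

    #define DEMO_RB_ALIGNMENT        4u
    #define DEMO_RB_EVNT_HDR_SIZE    4u                         /* type_len + time_delta */
    #define DEMO_RB_MAX_SMALL_DATA   (DEMO_RB_ALIGNMENT * 28)   /* type_len encodes <= 28 units */
    #define DEMO_RB_LEN_TIME_EXTEND  8u
    #define ALIGN_UP(x, a)           (((x) + (a) - 1) & ~((a) - 1))

    static unsigned demo_event_length(unsigned length)
    {
            if (!length)                            /* zero length can cause confusion */
                    length++;
            if (length > DEMO_RB_MAX_SMALL_DATA)    /* big payloads keep length in array[0] */
                    length += sizeof(unsigned);
            length += DEMO_RB_EVNT_HDR_SIZE;
            length = ALIGN_UP(length, DEMO_RB_ALIGNMENT);
            if (length == DEMO_RB_LEN_TIME_EXTEND + DEMO_RB_ALIGNMENT)
                    length += DEMO_RB_ALIGNMENT;    /* avoid the un-paddable 12-byte case */
            return length;
    }

    int main(void)
    {
            unsigned sizes[] = { 0, 4, 8, 110, 120 };
            for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                    printf("payload %3u -> reserved %3u bytes\n",
                           sizes[i], demo_event_length(sizes[i]));
            return 0;
    }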
    2761             : 
    2762           0 : static u64 rb_time_delta(struct ring_buffer_event *event)
    2763             : {
    2764           0 :         switch (event->type_len) {
    2765             :         case RINGBUF_TYPE_PADDING:
    2766             :                 return 0;
    2767             : 
    2768             :         case RINGBUF_TYPE_TIME_EXTEND:
    2769           0 :                 return ring_buffer_event_time_stamp(event);
    2770             : 
    2771             :         case RINGBUF_TYPE_TIME_STAMP:
    2772             :                 return 0;
    2773             : 
    2774           0 :         case RINGBUF_TYPE_DATA:
    2775           0 :                 return event->time_delta;
    2776             :         default:
    2777             :                 return 0;
    2778             :         }
    2779             : }
    2780             : 
    2781             : static inline int
    2782           0 : rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
    2783             :                   struct ring_buffer_event *event)
    2784             : {
    2785           0 :         unsigned long new_index, old_index;
    2786           0 :         struct buffer_page *bpage;
    2787           0 :         unsigned long index;
    2788           0 :         unsigned long addr;
    2789           0 :         u64 write_stamp;
    2790           0 :         u64 delta;
    2791             : 
    2792           0 :         new_index = rb_event_index(event);
    2793           0 :         old_index = new_index + rb_event_ts_length(event);
    2794           0 :         addr = (unsigned long)event;
    2795           0 :         addr &= PAGE_MASK;
    2796             : 
    2797           0 :         bpage = READ_ONCE(cpu_buffer->tail_page);
    2798             : 
    2799           0 :         delta = rb_time_delta(event);
    2800             : 
    2801           0 :         if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
    2802             :                 return 0;
    2803             : 
    2804             :         /* Make sure the write stamp is read before testing the location */
    2805           0 :         barrier();
    2806             : 
    2807           0 :         if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
    2808           0 :                 unsigned long write_mask =
    2809           0 :                         local_read(&bpage->write) & ~RB_WRITE_MASK;
    2810           0 :                 unsigned long event_length = rb_event_length(event);
    2811             : 
    2812             :                 /* Something came in, can't discard */
    2813           0 :                 if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
    2814             :                                        write_stamp, write_stamp - delta))
    2815             :                         return 0;
    2816             : 
    2817             :                 /*
    2818             :                  * It's possible that the event time delta is zero
    2819             :                  * (has the same time stamp as the previous event)
    2820             :                  * in which case write_stamp and before_stamp could
    2821             :                  * be the same. In such a case, force before_stamp
    2822             :                  * to be different from write_stamp. It doesn't
    2823             :                  * matter what it is, as long as it's different.
    2824             :                  */
    2825           0 :                 if (!delta)
    2826           0 :                         rb_time_set(&cpu_buffer->before_stamp, 0);
    2827             : 
    2828             :                 /*
    2829             :                  * If an event were to come in now, it would see that the
    2830             :                  * write_stamp and the before_stamp are different, and assume
    2831             :                  * that this event just added itself before updating
    2832             :                  * the write stamp. The interrupting event will fix the
    2833             :                  * write stamp for us, and use the before stamp as its delta.
    2834             :                  */
    2835             : 
    2836             :                 /*
    2837             :                  * This is on the tail page. It is possible that
    2838             :                  * a write could come in and move the tail page
    2839             :                  * and write to the next page. That is fine
    2840             :                  * because we just shorten what is on this page.
    2841             :                  */
    2842           0 :                 old_index += write_mask;
    2843           0 :                 new_index += write_mask;
    2844           0 :                 index = local_cmpxchg(&bpage->write, old_index, new_index);
    2845           0 :                 if (index == old_index) {
    2846             :                         /* update counters */
    2847           0 :                         local_sub(event_length, &cpu_buffer->entries_bytes);
    2848           0 :                         return 1;
    2849             :                 }
    2850             :         }
    2851             : 
    2852             :         /* could not discard */
    2853             :         return 0;
    2854             : }
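
The discard above can only reclaim space if the event is still the last thing on the tail page: one compare-and-swap pulls the page's write index back over the event, and it fails harmlessly whenever another writer has already advanced the index. A minimal userspace sketch of that "pop only if still on top" step follows; the names are invented, and the real code additionally folds a write mask and the write_stamp bookkeeping into the exchange.

    /* Hedged sketch of the "discard only if nothing came in after us"
     * compare-and-swap at the heart of the function above. */
    #include <stdatomic.h>
    #include <stdbool.h>

    static _Atomic unsigned long page_write;        /* bytes written on the tail page */

    /* Try to give back @len bytes that end exactly at the current write index. */
    static bool demo_try_discard(unsigned long event_start, unsigned long len)
    {
            unsigned long expected = event_start + len;

            /* Fails, and the event is kept, if another writer already moved on. */
            return atomic_compare_exchange_strong(&page_write, &expected, event_start);
    }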
    2855             : 
    2856           0 : static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
    2857             : {
    2858           0 :         local_inc(&cpu_buffer->committing);
    2859           0 :         local_inc(&cpu_buffer->commits);
    2860             : }
    2861             : 
    2862             : static __always_inline void
    2863             : rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
    2864             : {
    2865           0 :         unsigned long max_count;
    2866             : 
    2867             :         /*
    2868             :          * We only race with interrupts and NMIs on this CPU.
    2869             :          * If we own the commit event, then we can commit
    2870             :          * all others that interrupted us, since the interruptions
    2871             :          * are in stack format (they finish before they come
    2872             :          * back to us). This allows us to do a simple loop to
    2873             :          * assign the commit to the tail.
    2874             :          */
    2875           0 :  again:
    2876           0 :         max_count = cpu_buffer->nr_pages * 100;
    2877             : 
    2878           0 :         while (cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)) {
    2879           0 :                 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
    2880             :                         return;
    2881           0 :                 if (RB_WARN_ON(cpu_buffer,
    2882             :                                rb_is_reader_page(cpu_buffer->tail_page)))
    2883             :                         return;
    2884           0 :                 local_set(&cpu_buffer->commit_page->page->commit,
    2885             :                           rb_page_write(cpu_buffer->commit_page));
    2886           0 :                 rb_inc_page(&cpu_buffer->commit_page);
    2887             :                 /* add barrier to keep gcc from optimizing too much */
    2888           0 :                 barrier();
    2889             :         }
    2890           0 :         while (rb_commit_index(cpu_buffer) !=
    2891           0 :                rb_page_write(cpu_buffer->commit_page)) {
    2892             : 
    2893           0 :                 local_set(&cpu_buffer->commit_page->page->commit,
    2894             :                           rb_page_write(cpu_buffer->commit_page));
    2895           0 :                 RB_WARN_ON(cpu_buffer,
    2896             :                            local_read(&cpu_buffer->commit_page->page->commit) &
    2897             :                            ~RB_WRITE_MASK);
    2898           0 :                 barrier();
    2899             :         }
    2900             : 
    2901             :         /* again, keep gcc from optimizing */
    2902           0 :         barrier();
    2903             : 
    2904             :         /*
    2905             :          * If an interrupt came in just after the first while loop
    2906             :          * and pushed the tail page forward, we will be left with
    2907             :          * a dangling commit that will never go forward.
    2908             :          */
    2909           0 :         if (unlikely(cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)))
    2910           0 :                 goto again;
    2911             : }
    2912             : 
    2913           0 : static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
    2914             : {
    2915           0 :         unsigned long commits;
    2916             : 
    2917           0 :         if (RB_WARN_ON(cpu_buffer,
    2918             :                        !local_read(&cpu_buffer->committing)))
    2919             :                 return;
    2920             : 
    2921           0 :  again:
    2922           0 :         commits = local_read(&cpu_buffer->commits);
    2923             :         /* synchronize with interrupts */
    2924           0 :         barrier();
    2925           0 :         if (local_read(&cpu_buffer->committing) == 1)
    2926           0 :                 rb_set_commit_to_write(cpu_buffer);
    2927             : 
    2928           0 :         local_dec(&cpu_buffer->committing);
    2929             : 
    2930             :         /* synchronize with interrupts */
    2931           0 :         barrier();
    2932             : 
    2933             :         /*
    2934             :          * Need to account for interrupts coming in between the
    2935             :          * updating of the commit page and the clearing of the
    2936             :          * committing counter.
    2937             :          */
    2938           0 :         if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
    2939           0 :             !local_read(&cpu_buffer->committing)) {
    2940           0 :                 local_inc(&cpu_buffer->committing);
    2941           0 :                 goto again;
    2942             :         }
    2943             : }
    2944             : 
    2945           0 : static inline void rb_event_discard(struct ring_buffer_event *event)
    2946             : {
    2947           0 :         if (extended_time(event))
    2948           0 :                 event = skip_time_extend(event);
    2949             : 
    2950             :         /* array[0] holds the actual length for the discarded event */
    2951           0 :         event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
    2952           0 :         event->type_len = RINGBUF_TYPE_PADDING;
    2953             :         /* time delta must be non zero */
    2954           0 :         if (!event->time_delta)
    2955           0 :                 event->time_delta = 1;
    2956           0 : }
    2957             : 
    2958           0 : static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
    2959             :                       struct ring_buffer_event *event)
    2960             : {
    2961           0 :         local_inc(&cpu_buffer->entries);
    2962           0 :         rb_end_commit(cpu_buffer);
    2963           0 : }
    2964             : 
    2965             : static __always_inline void
    2966           0 : rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
    2967             : {
    2968           0 :         size_t nr_pages;
    2969           0 :         size_t dirty;
    2970           0 :         size_t full;
    2971             : 
    2972           0 :         if (buffer->irq_work.waiters_pending) {
    2973           0 :                 buffer->irq_work.waiters_pending = false;
    2974             :                 /* irq_work_queue() supplies its own memory barriers */
    2975           0 :                 irq_work_queue(&buffer->irq_work.work);
    2976             :         }
    2977             : 
    2978           0 :         if (cpu_buffer->irq_work.waiters_pending) {
    2979           0 :                 cpu_buffer->irq_work.waiters_pending = false;
    2980             :                 /* irq_work_queue() supplies its own memory barriers */
    2981           0 :                 irq_work_queue(&cpu_buffer->irq_work.work);
    2982             :         }
    2983             : 
    2984           0 :         if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched))
    2985             :                 return;
    2986             : 
    2987           0 :         if (cpu_buffer->reader_page == cpu_buffer->commit_page)
    2988             :                 return;
    2989             : 
    2990           0 :         if (!cpu_buffer->irq_work.full_waiters_pending)
    2991             :                 return;
    2992             : 
    2993           0 :         cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched);
    2994             : 
    2995           0 :         full = cpu_buffer->shortest_full;
    2996           0 :         nr_pages = cpu_buffer->nr_pages;
    2997           0 :         dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu);
    2998           0 :         if (full && nr_pages && (dirty * 100) <= full * nr_pages)
    2999             :                 return;
    3000             : 
    3001           0 :         cpu_buffer->irq_work.wakeup_full = true;
    3002           0 :         cpu_buffer->irq_work.full_waiters_pending = false;
     3003             :         /* irq_work_queue() supplies its own memory barriers */
    3004           0 :         irq_work_queue(&cpu_buffer->irq_work.work);
    3005             : }
    3006             : 
    3007             : #ifdef CONFIG_RING_BUFFER_RECORD_RECURSION
    3008             : # define do_ring_buffer_record_recursion()      \
    3009             :         do_ftrace_record_recursion(_THIS_IP_, _RET_IP_)
    3010             : #else
    3011             : # define do_ring_buffer_record_recursion() do { } while (0)
    3012             : #endif
    3013             : 
    3014             : /*
    3015             :  * The lock and unlock are done within a preempt disable section.
    3016             :  * The current_context per_cpu variable can only be modified
    3017             :  * by the current task between lock and unlock. But it can
    3018             :  * be modified more than once via an interrupt. To pass this
    3019             :  * information from the lock to the unlock without having to
    3020             :  * access the 'in_interrupt()' functions again (which do show
     3021             :  * a bit of overhead in something as critical as function tracing),
    3022             :  * we use a bitmask trick.
    3023             :  *
    3024             :  *  bit 1 =  NMI context
    3025             :  *  bit 2 =  IRQ context
    3026             :  *  bit 3 =  SoftIRQ context
    3027             :  *  bit 4 =  normal context.
    3028             :  *
    3029             :  * This works because this is the order of contexts that can
    3030             :  * preempt other contexts. A SoftIRQ never preempts an IRQ
    3031             :  * context.
    3032             :  *
    3033             :  * When the context is determined, the corresponding bit is
    3034             :  * checked and set (if it was set, then a recursion of that context
    3035             :  * happened).
    3036             :  *
    3037             :  * On unlock, we need to clear this bit. To do so, just subtract
    3038             :  * 1 from the current_context and AND it to itself.
    3039             :  *
    3040             :  * (binary)
    3041             :  *  101 - 1 = 100
    3042             :  *  101 & 100 = 100 (clearing bit zero)
    3043             :  *
    3044             :  *  1010 - 1 = 1001
    3045             :  *  1010 & 1001 = 1000 (clearing bit 1)
    3046             :  *
     3047             :  * The least significant set bit can be cleared this way, and it
    3048             :  * just so happens that it is the same bit corresponding to
    3049             :  * the current context.
    3050             :  *
    3051             :  * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit
    3052             :  * is set when a recursion is detected at the current context, and if
    3053             :  * the TRANSITION bit is already set, it will fail the recursion.
    3054             :  * This is needed because there's a lag between the changing of
    3055             :  * interrupt context and updating the preempt count. In this case,
    3056             :  * a false positive will be found. To handle this, one extra recursion
    3057             :  * is allowed, and this is done by the TRANSITION bit. If the TRANSITION
    3058             :  * bit is already set, then it is considered a recursion and the function
    3059             :  * ends. Otherwise, the TRANSITION bit is set, and that bit is returned.
    3060             :  *
    3061             :  * On the trace_recursive_unlock(), the TRANSITION bit will be the first
    3062             :  * to be cleared. Even if it wasn't the context that set it. That is,
    3063             :  * if an interrupt comes in while NORMAL bit is set and the ring buffer
    3064             :  * is called before preempt_count() is updated, since the check will
    3065             :  * be on the NORMAL bit, the TRANSITION bit will then be set. If an
    3066             :  * NMI then comes in, it will set the NMI bit, but when the NMI code
     3067             :  * does the trace_recursive_unlock() it will clear the TRANSITION bit
    3068             :  * and leave the NMI bit set. But this is fine, because the interrupt
    3069             :  * code that set the TRANSITION bit will then clear the NMI bit when it
    3070             :  * calls trace_recursive_unlock(). If another NMI comes in, it will
    3071             :  * set the TRANSITION bit and continue.
    3072             :  *
     3073             :  * Note: The TRANSITION bit only handles a single transition between contexts.
    3074             :  */
    3075             : 
    3076             : static __always_inline int
    3077           0 : trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
    3078             : {
    3079           0 :         unsigned int val = cpu_buffer->current_context;
    3080           0 :         unsigned long pc = preempt_count();
    3081           0 :         int bit;
    3082             : 
    3083           0 :         if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
    3084             :                 bit = RB_CTX_NORMAL;
    3085             :         else
    3086           0 :                 bit = pc & NMI_MASK ? RB_CTX_NMI :
    3087           0 :                         pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
    3088             : 
    3089           0 :         if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) {
    3090             :                 /*
    3091             :                  * It is possible that this was called by transitioning
    3092             :                  * between interrupt context, and preempt_count() has not
    3093             :                  * been updated yet. In this case, use the TRANSITION bit.
    3094             :                  */
    3095           0 :                 bit = RB_CTX_TRANSITION;
    3096           0 :                 if (val & (1 << (bit + cpu_buffer->nest))) {
    3097             :                         do_ring_buffer_record_recursion();
    3098             :                         return 1;
    3099             :                 }
    3100             :         }
    3101             : 
    3102           0 :         val |= (1 << (bit + cpu_buffer->nest));
    3103           0 :         cpu_buffer->current_context = val;
    3104             : 
    3105           0 :         return 0;
    3106             : }
    3107             : 
    3108             : static __always_inline void
    3109           0 : trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
    3110             : {
    3111           0 :         cpu_buffer->current_context &=
    3112           0 :                 cpu_buffer->current_context - (1 << cpu_buffer->nest);
    3113           0 : }
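
The bit trick documented above is easiest to follow with concrete numbers. Below is a minimal, stand-alone user-space sketch of the same idea (demo_lock(), demo_unlock() and main() are hypothetical illustration code, not part of ring_buffer.c, and the nest shift is omitted): a normal-context trace is interrupted by an IRQ-context trace, and each unlock clears only the lowest set bit, which is always the innermost context.

    #include <stdio.h>

    /* Same bit ordering as the comment above: TRANSITION=0, NMI=1,
     * IRQ=2, SoftIRQ=3, normal=4. */
    enum { CTX_TRANSITION, CTX_NMI, CTX_IRQ, CTX_SOFTIRQ, CTX_NORMAL };

    static unsigned int current_context;

    static int demo_lock(int bit)
    {
            if (current_context & (1 << bit))
                    return 1;                /* recursion in this context */
            current_context |= 1 << bit;
            return 0;
    }

    static void demo_unlock(void)
    {
            /* val &= val - 1 clears the least significant set bit,
             * i.e. the most recently entered context. */
            current_context &= current_context - 1;
    }

    int main(void)
    {
            demo_lock(CTX_NORMAL);              /* task starts tracing */
            demo_lock(CTX_IRQ);                 /* IRQ traces on top   */
            printf("0x%x\n", current_context);  /* 0x14: bits 4 and 2  */
            demo_unlock();                      /* IRQ clears bit 2    */
            printf("0x%x\n", current_context);  /* 0x10: only bit 4    */
            demo_unlock();                      /* task clears bit 4   */
            return 0;
    }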
    3114             : 
    3115             : /* The recursive locking above uses 5 bits */
    3116             : #define NESTED_BITS 5
    3117             : 
    3118             : /**
     3119             :  * ring_buffer_nest_start - Allow tracing while nested
    3120             :  * @buffer: The ring buffer to modify
    3121             :  *
    3122             :  * The ring buffer has a safety mechanism to prevent recursion.
    3123             :  * But there may be a case where a trace needs to be done while
     3124             :  * tracing something else. In this case, calling this function
     3125             :  * will allow another ring_buffer_lock_reserve() to nest within
     3126             :  * a currently active ring_buffer_lock_reserve().
    3127             :  *
    3128             :  * Call this function before calling another ring_buffer_lock_reserve() and
    3129             :  * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit().
    3130             :  */
    3131           0 : void ring_buffer_nest_start(struct trace_buffer *buffer)
    3132             : {
    3133           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    3134           0 :         int cpu;
    3135             : 
    3136             :         /* Enabled by ring_buffer_nest_end() */
    3137           0 :         preempt_disable_notrace();
    3138           0 :         cpu = raw_smp_processor_id();
    3139           0 :         cpu_buffer = buffer->buffers[cpu];
    3140             :         /* This is the shift value for the above recursive locking */
    3141           0 :         cpu_buffer->nest += NESTED_BITS;
    3142           0 : }
    3143             : 
    3144             : /**
     3145             :  * ring_buffer_nest_end - Finish allowing tracing while nested
    3146             :  * @buffer: The ring buffer to modify
    3147             :  *
    3148             :  * Must be called after ring_buffer_nest_start() and after the
    3149             :  * ring_buffer_unlock_commit().
    3150             :  */
    3151           0 : void ring_buffer_nest_end(struct trace_buffer *buffer)
    3152             : {
    3153           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    3154           0 :         int cpu;
    3155             : 
    3156             :         /* disabled by ring_buffer_nest_start() */
    3157           0 :         cpu = raw_smp_processor_id();
    3158           0 :         cpu_buffer = buffer->buffers[cpu];
    3159             :         /* This is the shift value for the above recursive locking */
    3160           0 :         cpu_buffer->nest -= NESTED_BITS;
    3161           0 :         preempt_enable_notrace();
    3162           0 : }
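
As a usage illustration of the two calls above, the following is a hedged sketch (not taken from the kernel tree; the reservation lengths and the helper name nested_write_example() are made up) of one reservation nested inside another:

    #include <linux/ring_buffer.h>

    /* Hypothetical helper: commit a nested event while an outer
     * reservation is still open on this CPU. */
    static void nested_write_example(struct trace_buffer *buffer)
    {
            struct ring_buffer_event *outer, *inner;

            outer = ring_buffer_lock_reserve(buffer, 32);
            if (!outer)
                    return;

            ring_buffer_nest_start(buffer);         /* shift the recursion bits */
            inner = ring_buffer_lock_reserve(buffer, 16);
            if (inner)
                    ring_buffer_unlock_commit(buffer, inner);
            ring_buffer_nest_end(buffer);           /* after the nested commit  */

            ring_buffer_unlock_commit(buffer, outer);
    }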
    3163             : 
    3164             : /**
     3165             :  * ring_buffer_unlock_commit - commit a reserved event
    3166             :  * @buffer: The buffer to commit to
    3167             :  * @event: The event pointer to commit.
    3168             :  *
    3169             :  * This commits the data to the ring buffer, and releases any locks held.
    3170             :  *
    3171             :  * Must be paired with ring_buffer_lock_reserve.
    3172             :  */
    3173           0 : int ring_buffer_unlock_commit(struct trace_buffer *buffer,
    3174             :                               struct ring_buffer_event *event)
    3175             : {
    3176           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    3177           0 :         int cpu = raw_smp_processor_id();
    3178             : 
    3179           0 :         cpu_buffer = buffer->buffers[cpu];
    3180             : 
    3181           0 :         rb_commit(cpu_buffer, event);
    3182             : 
    3183           0 :         rb_wakeups(buffer, cpu_buffer);
    3184             : 
    3185           0 :         trace_recursive_unlock(cpu_buffer);
    3186             : 
    3187           0 :         preempt_enable_notrace();
    3188             : 
    3189           0 :         return 0;
    3190             : }
    3191             : EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
    3192             : 
    3193             : /* Special value to validate all deltas on a page. */
    3194             : #define CHECK_FULL_PAGE         1L
    3195             : 
    3196             : #ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS
    3197             : static void dump_buffer_page(struct buffer_data_page *bpage,
    3198             :                              struct rb_event_info *info,
    3199             :                              unsigned long tail)
    3200             : {
    3201             :         struct ring_buffer_event *event;
    3202             :         u64 ts, delta;
    3203             :         int e;
    3204             : 
    3205             :         ts = bpage->time_stamp;
    3206             :         pr_warn("  [%lld] PAGE TIME STAMP\n", ts);
    3207             : 
    3208             :         for (e = 0; e < tail; e += rb_event_length(event)) {
    3209             : 
    3210             :                 event = (struct ring_buffer_event *)(bpage->data + e);
    3211             : 
    3212             :                 switch (event->type_len) {
    3213             : 
    3214             :                 case RINGBUF_TYPE_TIME_EXTEND:
    3215             :                         delta = ring_buffer_event_time_stamp(event);
    3216             :                         ts += delta;
    3217             :                         pr_warn("  [%lld] delta:%lld TIME EXTEND\n", ts, delta);
    3218             :                         break;
    3219             : 
    3220             :                 case RINGBUF_TYPE_TIME_STAMP:
    3221             :                         delta = ring_buffer_event_time_stamp(event);
    3222             :                         ts = delta;
    3223             :                         pr_warn("  [%lld] absolute:%lld TIME STAMP\n", ts, delta);
    3224             :                         break;
    3225             : 
    3226             :                 case RINGBUF_TYPE_PADDING:
    3227             :                         ts += event->time_delta;
    3228             :                         pr_warn("  [%lld] delta:%d PADDING\n", ts, event->time_delta);
    3229             :                         break;
    3230             : 
    3231             :                 case RINGBUF_TYPE_DATA:
    3232             :                         ts += event->time_delta;
    3233             :                         pr_warn("  [%lld] delta:%d\n", ts, event->time_delta);
    3234             :                         break;
    3235             : 
    3236             :                 default:
    3237             :                         break;
    3238             :                 }
    3239             :         }
    3240             : }
    3241             : 
    3242             : static DEFINE_PER_CPU(atomic_t, checking);
    3243             : static atomic_t ts_dump;
    3244             : 
    3245             : /*
    3246             :  * Check if the current event time stamp matches the deltas on
    3247             :  * the buffer page.
    3248             :  */
    3249             : static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
    3250             :                          struct rb_event_info *info,
    3251             :                          unsigned long tail)
    3252             : {
    3253             :         struct ring_buffer_event *event;
    3254             :         struct buffer_data_page *bpage;
    3255             :         u64 ts, delta;
    3256             :         bool full = false;
    3257             :         int e;
    3258             : 
    3259             :         bpage = info->tail_page->page;
    3260             : 
    3261             :         if (tail == CHECK_FULL_PAGE) {
    3262             :                 full = true;
    3263             :                 tail = local_read(&bpage->commit);
    3264             :         } else if (info->add_timestamp &
    3265             :                    (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)) {
    3266             :                 /* Ignore events with absolute time stamps */
    3267             :                 return;
    3268             :         }
    3269             : 
    3270             :         /*
    3271             :          * Do not check the first event (skip possible extends too).
    3272             :          * Also do not check if previous events have not been committed.
    3273             :          */
    3274             :         if (tail <= 8 || tail > local_read(&bpage->commit))
    3275             :                 return;
    3276             : 
    3277             :         /*
     3278             :          * If this interrupted another buffer check on this CPU, skip this one.
    3279             :          */
    3280             :         if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
    3281             :                 goto out;
    3282             : 
    3283             :         ts = bpage->time_stamp;
    3284             : 
    3285             :         for (e = 0; e < tail; e += rb_event_length(event)) {
    3286             : 
    3287             :                 event = (struct ring_buffer_event *)(bpage->data + e);
    3288             : 
    3289             :                 switch (event->type_len) {
    3290             : 
    3291             :                 case RINGBUF_TYPE_TIME_EXTEND:
    3292             :                         delta = ring_buffer_event_time_stamp(event);
    3293             :                         ts += delta;
    3294             :                         break;
    3295             : 
    3296             :                 case RINGBUF_TYPE_TIME_STAMP:
    3297             :                         delta = ring_buffer_event_time_stamp(event);
    3298             :                         ts = delta;
    3299             :                         break;
    3300             : 
    3301             :                 case RINGBUF_TYPE_PADDING:
    3302             :                         if (event->time_delta == 1)
    3303             :                                 break;
    3304             :                         /* fall through */
    3305             :                 case RINGBUF_TYPE_DATA:
    3306             :                         ts += event->time_delta;
    3307             :                         break;
    3308             : 
    3309             :                 default:
    3310             :                         RB_WARN_ON(cpu_buffer, 1);
    3311             :                 }
    3312             :         }
    3313             :         if ((full && ts > info->ts) ||
    3314             :             (!full && ts + info->delta != info->ts)) {
    3315             :                 /* If another report is happening, ignore this one */
    3316             :                 if (atomic_inc_return(&ts_dump) != 1) {
    3317             :                         atomic_dec(&ts_dump);
    3318             :                         goto out;
    3319             :                 }
    3320             :                 atomic_inc(&cpu_buffer->record_disabled);
     3321             :                 /* There are some cases during boot up where this can happen */
    3322             :                 WARN_ON_ONCE(system_state != SYSTEM_BOOTING);
    3323             :                 pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n",
    3324             :                         cpu_buffer->cpu,
    3325             :                         ts + info->delta, info->ts, info->delta,
    3326             :                         info->before, info->after,
    3327             :                         full ? " (full)" : "");
    3328             :                 dump_buffer_page(bpage, info, tail);
    3329             :                 atomic_dec(&ts_dump);
    3330             :                 /* Do not re-enable checking */
    3331             :                 return;
    3332             :         }
    3333             : out:
    3334             :         atomic_dec(this_cpu_ptr(&checking));
    3335             : }
    3336             : #else
    3337           0 : static inline void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
    3338             :                          struct rb_event_info *info,
    3339             :                          unsigned long tail)
    3340             : {
    3341           0 : }
    3342             : #endif /* CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS */
    3343             : 
    3344             : static struct ring_buffer_event *
    3345           0 : __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
    3346             :                   struct rb_event_info *info)
    3347             : {
    3348           0 :         struct ring_buffer_event *event;
    3349           0 :         struct buffer_page *tail_page;
    3350           0 :         unsigned long tail, write, w;
    3351           0 :         bool a_ok;
    3352           0 :         bool b_ok;
    3353             : 
    3354             :         /* Don't let the compiler play games with cpu_buffer->tail_page */
    3355           0 :         tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
    3356             : 
    3357           0 :  /*A*/  w = local_read(&tail_page->write) & RB_WRITE_MASK;
    3358           0 :         barrier();
    3359           0 :         b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
    3360           0 :         a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
    3361           0 :         barrier();
    3362           0 :         info->ts = rb_time_stamp(cpu_buffer->buffer);
    3363             : 
    3364           0 :         if ((info->add_timestamp & RB_ADD_STAMP_ABSOLUTE)) {
    3365           0 :                 info->delta = info->ts;
    3366             :         } else {
    3367             :                 /*
    3368             :                  * If interrupting an event time update, we may need an
    3369             :                  * absolute timestamp.
    3370             :                  * Don't bother if this is the start of a new page (w == 0).
    3371             :                  */
    3372           0 :                 if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) {
    3373           0 :                         info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
    3374           0 :                         info->length += RB_LEN_TIME_EXTEND;
    3375             :                 } else {
    3376           0 :                         info->delta = info->ts - info->after;
    3377           0 :                         if (unlikely(test_time_stamp(info->delta))) {
    3378           0 :                                 info->add_timestamp |= RB_ADD_STAMP_EXTEND;
    3379           0 :                                 info->length += RB_LEN_TIME_EXTEND;
    3380             :                         }
    3381             :                 }
    3382             :         }
    3383             : 
    3384           0 :  /*B*/  rb_time_set(&cpu_buffer->before_stamp, info->ts);
    3385             : 
    3386           0 :  /*C*/  write = local_add_return(info->length, &tail_page->write);
    3387             : 
    3388             :         /* set write to only the index of the write */
    3389           0 :         write &= RB_WRITE_MASK;
    3390             : 
    3391           0 :         tail = write - info->length;
    3392             : 
     3393             :         /* See if we shot past the end of this buffer page */
    3394           0 :         if (unlikely(write > BUF_PAGE_SIZE)) {
     3395             :                 /* before and after may now be different, fix it up */
    3396           0 :                 b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
    3397           0 :                 a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
    3398           0 :                 if (a_ok && b_ok && info->before != info->after)
    3399           0 :                         (void)rb_time_cmpxchg(&cpu_buffer->before_stamp,
    3400             :                                               info->before, info->after);
    3401           0 :                 if (a_ok && b_ok)
    3402           0 :                         check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
    3403           0 :                 return rb_move_tail(cpu_buffer, tail, info);
    3404             :         }
    3405             : 
    3406           0 :         if (likely(tail == w)) {
    3407           0 :                 u64 save_before;
    3408           0 :                 bool s_ok;
    3409             : 
    3410             :                 /* Nothing interrupted us between A and C */
    3411           0 :  /*D*/          rb_time_set(&cpu_buffer->write_stamp, info->ts);
    3412           0 :                 barrier();
    3413           0 :  /*E*/          s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before);
    3414           0 :                 RB_WARN_ON(cpu_buffer, !s_ok);
    3415           0 :                 if (likely(!(info->add_timestamp &
    3416             :                              (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
    3417             :                         /* This did not interrupt any time update */
    3418           0 :                         info->delta = info->ts - info->after;
    3419             :                 else
    3420             :                         /* Just use full timestamp for interrupting event */
    3421           0 :                         info->delta = info->ts;
    3422           0 :                 barrier();
    3423           0 :                 check_buffer(cpu_buffer, info, tail);
    3424           0 :                 if (unlikely(info->ts != save_before)) {
    3425             :                         /* SLOW PATH - Interrupted between C and E */
    3426             : 
    3427           0 :                         a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
    3428           0 :                         RB_WARN_ON(cpu_buffer, !a_ok);
    3429             : 
    3430             :                         /* Write stamp must only go forward */
    3431           0 :                         if (save_before > info->after) {
    3432             :                                 /*
    3433             :                                  * We do not care about the result, only that
    3434             :                                  * it gets updated atomically.
    3435             :                                  */
    3436           0 :                                 (void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
    3437             :                                                       info->after, save_before);
    3438             :                         }
    3439             :                 }
    3440             :         } else {
    3441           0 :                 u64 ts;
    3442             :                 /* SLOW PATH - Interrupted between A and C */
    3443           0 :                 a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
    3444             :                 /* Was interrupted before here, write_stamp must be valid */
    3445           0 :                 RB_WARN_ON(cpu_buffer, !a_ok);
    3446           0 :                 ts = rb_time_stamp(cpu_buffer->buffer);
    3447           0 :                 barrier();
    3448           0 :  /*E*/          if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
    3449           0 :                     info->after < ts &&
    3450           0 :                     rb_time_cmpxchg(&cpu_buffer->write_stamp,
    3451             :                                     info->after, ts)) {
    3452             :                         /* Nothing came after this event between C and E */
    3453           0 :                         info->delta = ts - info->after;
    3454           0 :                         info->ts = ts;
    3455             :                 } else {
    3456             :                         /*
    3457             :                          * Interrupted between C and E:
     3458             :                          * Lost the previous event's time stamp. Just set the
    3459             :                          * delta to zero, and this will be the same time as
    3460             :                          * the event this event interrupted. And the events that
    3461             :                          * came after this will still be correct (as they would
     3462             :                          * have built their delta on the previous event).
    3463             :                          */
    3464           0 :                         info->delta = 0;
    3465             :                 }
    3466           0 :                 info->add_timestamp &= ~RB_ADD_STAMP_FORCE;
    3467             :         }
    3468             : 
    3469             :         /*
    3470             :          * If this is the first commit on the page, then it has the same
    3471             :          * timestamp as the page itself.
    3472             :          */
    3473           0 :         if (unlikely(!tail && !(info->add_timestamp &
    3474             :                                 (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
    3475           0 :                 info->delta = 0;
    3476             : 
    3477             :         /* We reserved something on the buffer */
    3478             : 
    3479           0 :         event = __rb_page_index(tail_page, tail);
    3480           0 :         rb_update_event(cpu_buffer, event, info);
    3481             : 
    3482           0 :         local_inc(&tail_page->entries);
    3483             : 
    3484             :         /*
    3485             :          * If this is the first commit on the page, then update
    3486             :          * its timestamp.
    3487             :          */
    3488           0 :         if (unlikely(!tail))
    3489           0 :                 tail_page->page->time_stamp = info->ts;
    3490             : 
    3491             :         /* account for these added bytes */
    3492           0 :         local_add(info->length, &cpu_buffer->entries_bytes);
    3493             : 
    3494           0 :         return event;
    3495             : }
    3496             : 
    3497             : static __always_inline struct ring_buffer_event *
    3498           0 : rb_reserve_next_event(struct trace_buffer *buffer,
    3499             :                       struct ring_buffer_per_cpu *cpu_buffer,
    3500             :                       unsigned long length)
    3501             : {
    3502           0 :         struct ring_buffer_event *event;
    3503           0 :         struct rb_event_info info;
    3504           0 :         int nr_loops = 0;
    3505           0 :         int add_ts_default;
    3506             : 
    3507           0 :         rb_start_commit(cpu_buffer);
    3508             :         /* The commit page can not change after this */
    3509             : 
    3510             : #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
    3511             :         /*
    3512             :          * Due to the ability to swap a cpu buffer from a buffer
    3513             :          * it is possible it was swapped before we committed.
    3514             :          * (committing stops a swap). We check for it here and
    3515             :          * if it happened, we have to fail the write.
    3516             :          */
    3517             :         barrier();
    3518             :         if (unlikely(READ_ONCE(cpu_buffer->buffer) != buffer)) {
    3519             :                 local_dec(&cpu_buffer->committing);
    3520             :                 local_dec(&cpu_buffer->commits);
    3521             :                 return NULL;
    3522             :         }
    3523             : #endif
    3524             : 
    3525           0 :         info.length = rb_calculate_event_length(length);
    3526             : 
    3527           0 :         if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
    3528           0 :                 add_ts_default = RB_ADD_STAMP_ABSOLUTE;
    3529           0 :                 info.length += RB_LEN_TIME_EXTEND;
    3530             :         } else {
    3531           0 :                 add_ts_default = RB_ADD_STAMP_NONE;
    3532             :         }
    3533             : 
    3534           0 :  again:
    3535           0 :         info.add_timestamp = add_ts_default;
    3536           0 :         info.delta = 0;
    3537             : 
    3538             :         /*
    3539             :          * We allow for interrupts to reenter here and do a trace.
    3540             :          * If one does, it will cause this original code to loop
    3541             :          * back here. Even with heavy interrupts happening, this
    3542             :          * should only happen a few times in a row. If this happens
    3543             :          * 1000 times in a row, there must be either an interrupt
    3544             :          * storm or we have something buggy.
    3545             :          * Bail!
    3546             :          */
    3547           0 :         if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
    3548           0 :                 goto out_fail;
    3549             : 
    3550           0 :         event = __rb_reserve_next(cpu_buffer, &info);
    3551             : 
    3552           0 :         if (unlikely(PTR_ERR(event) == -EAGAIN)) {
    3553           0 :                 if (info.add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND))
    3554           0 :                         info.length -= RB_LEN_TIME_EXTEND;
    3555           0 :                 goto again;
    3556             :         }
    3557             : 
    3558           0 :         if (likely(event))
    3559             :                 return event;
    3560           0 :  out_fail:
    3561           0 :         rb_end_commit(cpu_buffer);
    3562             :         return NULL;
    3563             : }
    3564             : 
    3565             : /**
    3566             :  * ring_buffer_lock_reserve - reserve a part of the buffer
    3567             :  * @buffer: the ring buffer to reserve from
    3568             :  * @length: the length of the data to reserve (excluding event header)
    3569             :  *
    3570             :  * Returns a reserved event on the ring buffer to copy directly to.
    3571             :  * The user of this interface will need to get the body to write into
    3572             :  * and can use the ring_buffer_event_data() interface.
    3573             :  *
    3574             :  * The length is the length of the data needed, not the event length
    3575             :  * which also includes the event header.
    3576             :  *
    3577             :  * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
    3578             :  * If NULL is returned, then nothing has been allocated or locked.
    3579             :  */
    3580             : struct ring_buffer_event *
    3581           0 : ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length)
    3582             : {
    3583           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    3584           0 :         struct ring_buffer_event *event;
    3585           0 :         int cpu;
    3586             : 
    3587             :         /* If we are tracing schedule, we don't want to recurse */
    3588           0 :         preempt_disable_notrace();
    3589             : 
    3590           0 :         if (unlikely(atomic_read(&buffer->record_disabled)))
    3591           0 :                 goto out;
    3592             : 
    3593           0 :         cpu = raw_smp_processor_id();
    3594             : 
    3595           0 :         if (unlikely(!cpumask_test_cpu(cpu, buffer->cpumask)))
    3596           0 :                 goto out;
    3597             : 
    3598           0 :         cpu_buffer = buffer->buffers[cpu];
    3599             : 
    3600           0 :         if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
    3601           0 :                 goto out;
    3602             : 
    3603           0 :         if (unlikely(length > BUF_MAX_DATA_SIZE))
    3604           0 :                 goto out;
    3605             : 
    3606           0 :         if (unlikely(trace_recursive_lock(cpu_buffer)))
    3607           0 :                 goto out;
    3608             : 
    3609           0 :         event = rb_reserve_next_event(buffer, cpu_buffer, length);
    3610           0 :         if (!event)
    3611           0 :                 goto out_unlock;
    3612             : 
    3613             :         return event;
    3614             : 
    3615           0 :  out_unlock:
    3616           0 :         trace_recursive_unlock(cpu_buffer);
    3617           0 :  out:
    3618           0 :         preempt_enable_notrace();
    3619           0 :         return NULL;
    3620             : }
    3621             : EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
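
Putting ring_buffer_lock_reserve(), ring_buffer_event_data() and ring_buffer_unlock_commit() together, a caller might look roughly like the sketch below (the helper name write_one_event() and its payload are hypothetical; only the three ring buffer calls come from the API documented here):

    #include <linux/ring_buffer.h>
    #include <linux/errno.h>
    #include <linux/string.h>

    static int write_one_event(struct trace_buffer *buffer,
                               const void *data, unsigned long len)
    {
            struct ring_buffer_event *event;
            void *body;

            event = ring_buffer_lock_reserve(buffer, len);
            if (!event)
                    return -EBUSY;  /* nothing reserved, nothing to commit */

            body = ring_buffer_event_data(event);   /* payload of the event */
            memcpy(body, data, len);

            return ring_buffer_unlock_commit(buffer, event);
    }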
    3622             : 
    3623             : /*
     3624             :  * Decrement the entry count of the page that an event is on.
    3625             :  * The event does not even need to exist, only the pointer
    3626             :  * to the page it is on. This may only be called before the commit
    3627             :  * takes place.
    3628             :  */
    3629             : static inline void
    3630           0 : rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
    3631             :                    struct ring_buffer_event *event)
    3632             : {
    3633           0 :         unsigned long addr = (unsigned long)event;
    3634           0 :         struct buffer_page *bpage = cpu_buffer->commit_page;
    3635           0 :         struct buffer_page *start;
    3636             : 
    3637           0 :         addr &= PAGE_MASK;
    3638             : 
    3639             :         /* Do the likely case first */
    3640           0 :         if (likely(bpage->page == (void *)addr)) {
    3641           0 :                 local_dec(&bpage->entries);
    3642           0 :                 return;
    3643             :         }
    3644             : 
    3645             :         /*
    3646             :          * Because the commit page may be on the reader page we
    3647             :          * start with the next page and check the end loop there.
    3648             :          */
    3649           0 :         rb_inc_page(&bpage);
    3650           0 :         start = bpage;
    3651           0 :         do {
    3652           0 :                 if (bpage->page == (void *)addr) {
    3653           0 :                         local_dec(&bpage->entries);
    3654           0 :                         return;
    3655             :                 }
    3656           0 :                 rb_inc_page(&bpage);
    3657           0 :         } while (bpage != start);
    3658             : 
    3659             :         /* commit not part of this buffer?? */
    3660           0 :         RB_WARN_ON(cpu_buffer, 1);
    3661             : }
    3662             : 
    3663             : /**
    3664             :  * ring_buffer_discard_commit - discard an event that has not been committed
    3665             :  * @buffer: the ring buffer
    3666             :  * @event: non committed event to discard
    3667             :  *
    3668             :  * Sometimes an event that is in the ring buffer needs to be ignored.
    3669             :  * This function lets the user discard an event in the ring buffer
    3670             :  * and then that event will not be read later.
    3671             :  *
    3672             :  * This function only works if it is called before the item has been
    3673             :  * committed. It will try to free the event from the ring buffer
    3674             :  * if another event has not been added behind it.
    3675             :  *
    3676             :  * If another event has been added behind it, it will set the event
    3677             :  * up as discarded, and perform the commit.
    3678             :  *
    3679             :  * If this function is called, do not call ring_buffer_unlock_commit on
    3680             :  * the event.
    3681             :  */
    3682           0 : void ring_buffer_discard_commit(struct trace_buffer *buffer,
    3683             :                                 struct ring_buffer_event *event)
    3684             : {
    3685           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    3686           0 :         int cpu;
    3687             : 
    3688             :         /* The event is discarded regardless */
    3689           0 :         rb_event_discard(event);
    3690             : 
    3691           0 :         cpu = smp_processor_id();
    3692           0 :         cpu_buffer = buffer->buffers[cpu];
    3693             : 
    3694             :         /*
    3695             :          * This must only be called if the event has not been
    3696             :          * committed yet. Thus we can assume that preemption
    3697             :          * is still disabled.
    3698             :          */
    3699           0 :         RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
    3700             : 
    3701           0 :         rb_decrement_entry(cpu_buffer, event);
    3702           0 :         if (rb_try_to_discard(cpu_buffer, event))
    3703             :                 goto out;
    3704             : 
    3705           0 :  out:
    3706           0 :         rb_end_commit(cpu_buffer);
    3707             : 
    3708           0 :         trace_recursive_unlock(cpu_buffer);
    3709             : 
    3710           0 :         preempt_enable_notrace();
    3711             : 
    3712           0 : }
    3713             : EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
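
To illustrate the rule spelled out above (either discard or commit a reserved event, never both), here is a hedged sketch with made-up fill() and filter_out() callbacks and a hypothetical helper name:

    #include <linux/ring_buffer.h>
    #include <linux/types.h>

    /* Hypothetical: drop events that a filter rejects after they have
     * already been reserved and filled in. */
    static void write_or_discard(struct trace_buffer *buffer,
                                 void (*fill)(void *body),
                                 bool (*filter_out)(void *body),
                                 unsigned long len)
    {
            struct ring_buffer_event *event;
            void *body;

            event = ring_buffer_lock_reserve(buffer, len);
            if (!event)
                    return;

            body = ring_buffer_event_data(event);
            fill(body);

            if (filter_out(body))
                    ring_buffer_discard_commit(buffer, event); /* no unlock_commit */
            else
                    ring_buffer_unlock_commit(buffer, event);
    }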
    3714             : 
    3715             : /**
    3716             :  * ring_buffer_write - write data to the buffer without reserving
    3717             :  * @buffer: The ring buffer to write to.
    3718             :  * @length: The length of the data being written (excluding the event header)
    3719             :  * @data: The data to write to the buffer.
    3720             :  *
    3721             :  * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
    3722             :  * one function. If you already have the data to write to the buffer, it
    3723             :  * may be easier to simply call this function.
    3724             :  *
    3725             :  * Note, like ring_buffer_lock_reserve, the length is the length of the data
    3726             :  * and not the length of the event which would hold the header.
    3727             :  */
    3728           0 : int ring_buffer_write(struct trace_buffer *buffer,
    3729             :                       unsigned long length,
    3730             :                       void *data)
    3731             : {
    3732           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    3733           0 :         struct ring_buffer_event *event;
    3734           0 :         void *body;
    3735           0 :         int ret = -EBUSY;
    3736           0 :         int cpu;
    3737             : 
    3738           0 :         preempt_disable_notrace();
    3739             : 
    3740           0 :         if (atomic_read(&buffer->record_disabled))
    3741           0 :                 goto out;
    3742             : 
    3743           0 :         cpu = raw_smp_processor_id();
    3744             : 
    3745           0 :         if (!cpumask_test_cpu(cpu, buffer->cpumask))
    3746           0 :                 goto out;
    3747             : 
    3748           0 :         cpu_buffer = buffer->buffers[cpu];
    3749             : 
    3750           0 :         if (atomic_read(&cpu_buffer->record_disabled))
    3751           0 :                 goto out;
    3752             : 
    3753           0 :         if (length > BUF_MAX_DATA_SIZE)
    3754           0 :                 goto out;
    3755             : 
    3756           0 :         if (unlikely(trace_recursive_lock(cpu_buffer)))
    3757           0 :                 goto out;
    3758             : 
    3759           0 :         event = rb_reserve_next_event(buffer, cpu_buffer, length);
    3760           0 :         if (!event)
    3761           0 :                 goto out_unlock;
    3762             : 
    3763           0 :         body = rb_event_data(event);
    3764             : 
    3765           0 :         memcpy(body, data, length);
    3766             : 
    3767           0 :         rb_commit(cpu_buffer, event);
    3768             : 
    3769           0 :         rb_wakeups(buffer, cpu_buffer);
    3770             : 
    3771             :         ret = 0;
    3772             : 
    3773           0 :  out_unlock:
    3774           0 :         trace_recursive_unlock(cpu_buffer);
    3775             : 
    3776           0 :  out:
    3777           0 :         preempt_enable_notrace();
    3778             : 
    3779           0 :         return ret;
    3780             : }
    3781             : EXPORT_SYMBOL_GPL(ring_buffer_write);
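
With the data already in hand, the single call above replaces the reserve/copy/commit sequence. A minimal hedged sketch (the helper name log_value() and its u64 payload are arbitrary):

    #include <linux/ring_buffer.h>
    #include <linux/types.h>

    static int log_value(struct trace_buffer *buffer, u64 value)
    {
            /* One call instead of reserve + memcpy + commit. */
            return ring_buffer_write(buffer, sizeof(value), &value);
    }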
    3782             : 
    3783           0 : static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
    3784             : {
    3785           0 :         struct buffer_page *reader = cpu_buffer->reader_page;
    3786           0 :         struct buffer_page *head = rb_set_head_page(cpu_buffer);
    3787           0 :         struct buffer_page *commit = cpu_buffer->commit_page;
    3788             : 
    3789             :         /* In case of error, head will be NULL */
    3790           0 :         if (unlikely(!head))
    3791             :                 return true;
    3792             : 
    3793           0 :         return reader->read == rb_page_commit(reader) &&
    3794           0 :                 (commit == reader ||
    3795           0 :                  (commit == head &&
    3796           0 :                   head->read == rb_page_commit(commit)));
    3797             : }
    3798             : 
    3799             : /**
    3800             :  * ring_buffer_record_disable - stop all writes into the buffer
    3801             :  * @buffer: The ring buffer to stop writes to.
    3802             :  *
    3803             :  * This prevents all writes to the buffer. Any attempt to write
    3804             :  * to the buffer after this will fail and return NULL.
    3805             :  *
    3806             :  * The caller should call synchronize_rcu() after this.
    3807             :  */
    3808           0 : void ring_buffer_record_disable(struct trace_buffer *buffer)
    3809             : {
    3810           0 :         atomic_inc(&buffer->record_disabled);
    3811           0 : }
    3812             : EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
    3813             : 
    3814             : /**
    3815             :  * ring_buffer_record_enable - enable writes to the buffer
    3816             :  * @buffer: The ring buffer to enable writes
    3817             :  *
    3818             :  * Note, multiple disables will need the same number of enables
    3819             :  * to truly enable the writing (much like preempt_disable).
    3820             :  */
    3821           0 : void ring_buffer_record_enable(struct trace_buffer *buffer)
    3822             : {
    3823           0 :         atomic_dec(&buffer->record_disabled);
    3824           0 : }
    3825             : EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
    3826             : 
    3827             : /**
    3828             :  * ring_buffer_record_off - stop all writes into the buffer
    3829             :  * @buffer: The ring buffer to stop writes to.
    3830             :  *
    3831             :  * This prevents all writes to the buffer. Any attempt to write
    3832             :  * to the buffer after this will fail and return NULL.
    3833             :  *
    3834             :  * This is different than ring_buffer_record_disable() as
     3835             :  * it works like an on/off switch, whereas the disable() version
     3836             :  * must be paired with an enable().
    3837             :  */
    3838           0 : void ring_buffer_record_off(struct trace_buffer *buffer)
    3839             : {
    3840           0 :         unsigned int rd;
    3841           0 :         unsigned int new_rd;
    3842             : 
    3843           0 :         do {
    3844           0 :                 rd = atomic_read(&buffer->record_disabled);
    3845           0 :                 new_rd = rd | RB_BUFFER_OFF;
    3846           0 :         } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
    3847           0 : }
    3848             : EXPORT_SYMBOL_GPL(ring_buffer_record_off);
    3849             : 
    3850             : /**
    3851             :  * ring_buffer_record_on - restart writes into the buffer
    3852             :  * @buffer: The ring buffer to start writes to.
    3853             :  *
    3854             :  * This enables all writes to the buffer that was disabled by
    3855             :  * ring_buffer_record_off().
    3856             :  *
    3857             :  * This is different than ring_buffer_record_enable() as
     3858             :  * it works like an on/off switch, whereas the enable() version
    3859             :  * must be paired with a disable().
    3860             :  */
    3861           0 : void ring_buffer_record_on(struct trace_buffer *buffer)
    3862             : {
    3863           0 :         unsigned int rd;
    3864           0 :         unsigned int new_rd;
    3865             : 
    3866           0 :         do {
    3867           0 :                 rd = atomic_read(&buffer->record_disabled);
    3868           0 :                 new_rd = rd & ~RB_BUFFER_OFF;
    3869           0 :         } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
    3870           0 : }
    3871             : EXPORT_SYMBOL_GPL(ring_buffer_record_on);
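
The difference between the counted interface (disable/enable) and the switch-style interface (off/on) is easiest to see side by side. A hedged sketch (the helper name pause_examples() is hypothetical):

    #include <linux/ring_buffer.h>

    static void pause_examples(struct trace_buffer *buffer)
    {
            /* Counted: every disable needs a matching enable. */
            ring_buffer_record_disable(buffer);
            ring_buffer_record_disable(buffer);
            ring_buffer_record_enable(buffer);  /* still disabled (count is 1) */
            ring_buffer_record_enable(buffer);  /* recording again             */

            /* Switch: a single call each way, regardless of nesting. */
            ring_buffer_record_off(buffer);
            ring_buffer_record_on(buffer);
    }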
    3872             : 
    3873             : /**
    3874             :  * ring_buffer_record_is_on - return true if the ring buffer can write
    3875             :  * @buffer: The ring buffer to see if write is enabled
    3876             :  *
    3877             :  * Returns true if the ring buffer is in a state that it accepts writes.
    3878             :  */
    3879           0 : bool ring_buffer_record_is_on(struct trace_buffer *buffer)
    3880             : {
    3881           0 :         return !atomic_read(&buffer->record_disabled);
    3882             : }
    3883             : 
    3884             : /**
    3885             :  * ring_buffer_record_is_set_on - return true if the ring buffer is set writable
    3886             :  * @buffer: The ring buffer to see if write is set enabled
    3887             :  *
    3888             :  * Returns true if the ring buffer is set writable by ring_buffer_record_on().
    3889             :  * Note that this does NOT mean it is in a writable state.
    3890             :  *
    3891             :  * It may return true when the ring buffer has been disabled by
    3892             :  * ring_buffer_record_disable(), as that is a temporary disabling of
    3893             :  * the ring buffer.
    3894             :  */
    3895           0 : bool ring_buffer_record_is_set_on(struct trace_buffer *buffer)
    3896             : {
    3897           0 :         return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF);
    3898             : }
    3899             : 
    3900             : /**
    3901             :  * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
    3902             :  * @buffer: The ring buffer to stop writes to.
    3903             :  * @cpu: The CPU buffer to stop
    3904             :  *
    3905             :  * This prevents all writes to the buffer. Any attempt to write
    3906             :  * to the buffer after this will fail and return NULL.
    3907             :  *
    3908             :  * The caller should call synchronize_rcu() after this.
    3909             :  */
    3910           0 : void ring_buffer_record_disable_cpu(struct trace_buffer *buffer, int cpu)
    3911             : {
    3912           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    3913             : 
    3914           0 :         if (!cpumask_test_cpu(cpu, buffer->cpumask))
    3915             :                 return;
    3916             : 
    3917           0 :         cpu_buffer = buffer->buffers[cpu];
    3918           0 :         atomic_inc(&cpu_buffer->record_disabled);
    3919             : }
    3920             : EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
    3921             : 
    3922             : /**
    3923             :  * ring_buffer_record_enable_cpu - enable writes to the buffer
    3924             :  * @buffer: The ring buffer to enable writes
    3925             :  * @cpu: The CPU to enable.
    3926             :  *
    3927             :  * Note, multiple disables will need the same number of enables
    3928             :  * to truly enable the writing (much like preempt_disable).
    3929             :  */
    3930           0 : void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu)
    3931             : {
    3932           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    3933             : 
    3934           0 :         if (!cpumask_test_cpu(cpu, buffer->cpumask))
    3935             :                 return;
    3936             : 
    3937           0 :         cpu_buffer = buffer->buffers[cpu];
    3938           0 :         atomic_dec(&cpu_buffer->record_disabled);
    3939             : }
    3940             : EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
    3941             : 
    3942             : /*
     3943             :  * The total number of entries in the ring buffer is the running counter
    3944             :  * of entries entered into the ring buffer, minus the sum of
    3945             :  * the entries read from the ring buffer and the number of
    3946             :  * entries that were overwritten.
    3947             :  */
    3948             : static inline unsigned long
    3949           0 : rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
    3950             : {
    3951           0 :         return local_read(&cpu_buffer->entries) -
    3952           0 :                 (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
    3953             : }
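
For example (illustrative numbers only): with entries = 1000, overrun = 100 and read = 250, rb_num_of_entries() reports 1000 - (100 + 250) = 650 entries still unread in the buffer.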
    3954             : 
    3955             : /**
    3956             :  * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer
    3957             :  * @buffer: The ring buffer
    3958             :  * @cpu: The per CPU buffer to read from.
    3959             :  */
    3960           0 : u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu)
    3961             : {
    3962           0 :         unsigned long flags;
    3963           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    3964           0 :         struct buffer_page *bpage;
    3965           0 :         u64 ret = 0;
    3966             : 
    3967           0 :         if (!cpumask_test_cpu(cpu, buffer->cpumask))
    3968             :                 return 0;
    3969             : 
    3970           0 :         cpu_buffer = buffer->buffers[cpu];
    3971           0 :         raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
    3972             :         /*
    3973             :          * if the tail is on reader_page, oldest time stamp is on the reader
    3974             :          * page
    3975             :          */
    3976           0 :         if (cpu_buffer->tail_page == cpu_buffer->reader_page)
    3977             :                 bpage = cpu_buffer->reader_page;
    3978             :         else
    3979           0 :                 bpage = rb_set_head_page(cpu_buffer);
    3980           0 :         if (bpage)
    3981           0 :                 ret = bpage->page->time_stamp;
    3982           0 :         raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
    3983             : 
    3984           0 :         return ret;
    3985             : }
    3986             : EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
    3987             : 
    3988             : /**
    3989             :  * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
    3990             :  * @buffer: The ring buffer
    3991             :  * @cpu: The per CPU buffer to read from.
    3992             :  */
    3993           0 : unsigned long ring_buffer_bytes_cpu(struct trace_buffer *buffer, int cpu)
    3994             : {
    3995           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    3996           0 :         unsigned long ret;
    3997             : 
    3998           0 :         if (!cpumask_test_cpu(cpu, buffer->cpumask))
    3999             :                 return 0;
    4000             : 
    4001           0 :         cpu_buffer = buffer->buffers[cpu];
    4002           0 :         ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes;
    4003             : 
    4004           0 :         return ret;
    4005             : }
    4006             : EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu);
    4007             : 
    4008             : /**
    4009             :  * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
    4010             :  * @buffer: The ring buffer
    4011             :  * @cpu: The per CPU buffer to get the entries from.
    4012             :  */
    4013           0 : unsigned long ring_buffer_entries_cpu(struct trace_buffer *buffer, int cpu)
    4014             : {
    4015           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    4016             : 
    4017           0 :         if (!cpumask_test_cpu(cpu, buffer->cpumask))
    4018             :                 return 0;
    4019             : 
    4020           0 :         cpu_buffer = buffer->buffers[cpu];
    4021             : 
    4022           0 :         return rb_num_of_entries(cpu_buffer);
    4023             : }
    4024             : EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
    4025             : 
    4026             : /**
    4027             :  * ring_buffer_overrun_cpu - get the number of overruns caused by the ring
    4028             :  * buffer wrapping around (only if RB_FL_OVERWRITE is on).
    4029             :  * @buffer: The ring buffer
    4030             :  * @cpu: The per CPU buffer to get the number of overruns from
    4031             :  */
    4032           0 : unsigned long ring_buffer_overrun_cpu(struct trace_buffer *buffer, int cpu)
    4033             : {
    4034           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    4035           0 :         unsigned long ret;
    4036             : 
    4037           0 :         if (!cpumask_test_cpu(cpu, buffer->cpumask))
    4038             :                 return 0;
    4039             : 
    4040           0 :         cpu_buffer = buffer->buffers[cpu];
    4041           0 :         ret = local_read(&cpu_buffer->overrun);
    4042             : 
    4043           0 :         return ret;
    4044             : }
    4045             : EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
    4046             : 
    4047             : /**
    4048             :  * ring_buffer_commit_overrun_cpu - get the number of overruns caused by
    4049             :  * commits failing due to the buffer wrapping around while there are uncommitted
    4050             :  * events, such as during an interrupt storm.
    4051             :  * @buffer: The ring buffer
    4052             :  * @cpu: The per CPU buffer to get the number of overruns from
    4053             :  */
    4054             : unsigned long
    4055           0 : ring_buffer_commit_overrun_cpu(struct trace_buffer *buffer, int cpu)
    4056             : {
    4057           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    4058           0 :         unsigned long ret;
    4059             : 
    4060           0 :         if (!cpumask_test_cpu(cpu, buffer->cpumask))
    4061             :                 return 0;
    4062             : 
    4063           0 :         cpu_buffer = buffer->buffers[cpu];
    4064           0 :         ret = local_read(&cpu_buffer->commit_overrun);
    4065             : 
    4066           0 :         return ret;
    4067             : }
    4068             : EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
    4069             : 
    4070             : /**
    4071             :  * ring_buffer_dropped_events_cpu - get the number of dropped events caused by
    4072             :  * the ring buffer filling up (only if RB_FL_OVERWRITE is off).
    4073             :  * @buffer: The ring buffer
    4074             :  * @cpu: The per CPU buffer to get the number of overruns from
    4075             :  */
    4076             : unsigned long
    4077           0 : ring_buffer_dropped_events_cpu(struct trace_buffer *buffer, int cpu)
    4078             : {
    4079           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    4080           0 :         unsigned long ret;
    4081             : 
    4082           0 :         if (!cpumask_test_cpu(cpu, buffer->cpumask))
    4083             :                 return 0;
    4084             : 
    4085           0 :         cpu_buffer = buffer->buffers[cpu];
    4086           0 :         ret = local_read(&cpu_buffer->dropped_events);
    4087             : 
    4088           0 :         return ret;
    4089             : }
    4090             : EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
    4091             : 
    4092             : /**
    4093             :  * ring_buffer_read_events_cpu - get the number of events successfully read
    4094             :  * @buffer: The ring buffer
    4095             :  * @cpu: The per CPU buffer to get the number of events read
    4096             :  */
    4097             : unsigned long
    4098           0 : ring_buffer_read_events_cpu(struct trace_buffer *buffer, int cpu)
    4099             : {
    4100           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    4101             : 
    4102           0 :         if (!cpumask_test_cpu(cpu, buffer->cpumask))
    4103             :                 return 0;
    4104             : 
    4105           0 :         cpu_buffer = buffer->buffers[cpu];
    4106           0 :         return cpu_buffer->read;
    4107             : }
    4108             : EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu);
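
Taken together, the per-CPU accessors above give a statistics snapshot that is good enough for debugging. A sketch of how a caller might dump them (the helper name is made up for illustration; pr_info() is assumed from <linux/printk.h>):

static void example_dump_cpu_stats(struct trace_buffer *buffer, int cpu)
{
        pr_info("cpu%d: entries=%lu bytes=%lu overrun=%lu commit_overrun=%lu dropped=%lu read=%lu\n",
                cpu,
                ring_buffer_entries_cpu(buffer, cpu),
                ring_buffer_bytes_cpu(buffer, cpu),
                ring_buffer_overrun_cpu(buffer, cpu),
                ring_buffer_commit_overrun_cpu(buffer, cpu),
                ring_buffer_dropped_events_cpu(buffer, cpu),
                ring_buffer_read_events_cpu(buffer, cpu));
}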
    4109             : 
    4110             : /**
    4111             :  * ring_buffer_entries - get the number of entries in a buffer
    4112             :  * @buffer: The ring buffer
    4113             :  *
    4114             :  * Returns the total number of entries in the ring buffer
    4115             :  * (all CPU entries)
    4116             :  */
    4117           0 : unsigned long ring_buffer_entries(struct trace_buffer *buffer)
    4118             : {
    4119           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    4120           0 :         unsigned long entries = 0;
    4121           0 :         int cpu;
    4122             : 
    4123             :         /* if you care about this being correct, lock the buffer */
    4124           0 :         for_each_buffer_cpu(buffer, cpu) {
    4125           0 :                 cpu_buffer = buffer->buffers[cpu];
    4126           0 :                 entries += rb_num_of_entries(cpu_buffer);
    4127             :         }
    4128             : 
    4129           0 :         return entries;
    4130             : }
    4131             : EXPORT_SYMBOL_GPL(ring_buffer_entries);
    4132             : 
    4133             : /**
    4134             :  * ring_buffer_overruns - get the number of overruns in buffer
    4135             :  * @buffer: The ring buffer
    4136             :  *
    4137             :  * Returns the total number of overruns in the ring buffer
    4138             :  * (all CPU entries)
    4139             :  */
    4140           0 : unsigned long ring_buffer_overruns(struct trace_buffer *buffer)
    4141             : {
    4142           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    4143           0 :         unsigned long overruns = 0;
    4144           0 :         int cpu;
    4145             : 
    4146             :         /* if you care about this being correct, lock the buffer */
    4147           0 :         for_each_buffer_cpu(buffer, cpu) {
    4148           0 :                 cpu_buffer = buffer->buffers[cpu];
    4149           0 :                 overruns += local_read(&cpu_buffer->overrun);
    4150             :         }
    4151             : 
    4152           0 :         return overruns;
    4153             : }
    4154             : EXPORT_SYMBOL_GPL(ring_buffer_overruns);
    4155             : 
    4156           0 : static void rb_iter_reset(struct ring_buffer_iter *iter)
    4157             : {
    4158           0 :         struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
    4159             : 
    4160             :         /* Iterator usage is expected to have record disabled */
    4161           0 :         iter->head_page = cpu_buffer->reader_page;
    4162           0 :         iter->head = cpu_buffer->reader_page->read;
    4163           0 :         iter->next_event = iter->head;
    4164             : 
    4165           0 :         iter->cache_reader_page = iter->head_page;
    4166           0 :         iter->cache_read = cpu_buffer->read;
    4167             : 
    4168           0 :         if (iter->head) {
    4169           0 :                 iter->read_stamp = cpu_buffer->read_stamp;
    4170           0 :                 iter->page_stamp = cpu_buffer->reader_page->page->time_stamp;
    4171             :         } else {
    4172           0 :                 iter->read_stamp = iter->head_page->page->time_stamp;
    4173           0 :                 iter->page_stamp = iter->read_stamp;
    4174             :         }
    4175           0 : }
    4176             : 
    4177             : /**
    4178             :  * ring_buffer_iter_reset - reset an iterator
    4179             :  * @iter: The iterator to reset
    4180             :  *
    4181             :  * Resets the iterator, so that it will start from the beginning
    4182             :  * again.
    4183             :  */
    4184           0 : void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
    4185             : {
    4186           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    4187           0 :         unsigned long flags;
    4188             : 
    4189           0 :         if (!iter)
    4190             :                 return;
    4191             : 
    4192           0 :         cpu_buffer = iter->cpu_buffer;
    4193             : 
    4194           0 :         raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
    4195           0 :         rb_iter_reset(iter);
    4196           0 :         raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
    4197             : }
    4198             : EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
    4199             : 
    4200             : /**
    4201             :  * ring_buffer_iter_empty - check if an iterator has no more to read
    4202             :  * @iter: The iterator to check
    4203             :  */
    4204           0 : int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
    4205             : {
    4206           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    4207           0 :         struct buffer_page *reader;
    4208           0 :         struct buffer_page *head_page;
    4209           0 :         struct buffer_page *commit_page;
    4210           0 :         struct buffer_page *curr_commit_page;
    4211           0 :         unsigned commit;
    4212           0 :         u64 curr_commit_ts;
    4213           0 :         u64 commit_ts;
    4214             : 
    4215           0 :         cpu_buffer = iter->cpu_buffer;
    4216           0 :         reader = cpu_buffer->reader_page;
    4217           0 :         head_page = cpu_buffer->head_page;
    4218           0 :         commit_page = cpu_buffer->commit_page;
    4219           0 :         commit_ts = commit_page->page->time_stamp;
    4220             : 
    4221             :         /*
    4222             :          * When the writer goes across pages, it issues a cmpxchg which
    4223             :          * is a mb(), which will synchronize with the rmb here.
    4224             :          * (see rb_tail_page_update())
    4225             :          */
    4226           0 :         smp_rmb();
    4227           0 :         commit = rb_page_commit(commit_page);
    4228             :         /* We want to make sure that the commit page doesn't change */
    4229           0 :         smp_rmb();
    4230             : 
    4231             :         /* Make sure commit page didn't change */
    4232           0 :         curr_commit_page = READ_ONCE(cpu_buffer->commit_page);
    4233           0 :         curr_commit_ts = READ_ONCE(curr_commit_page->page->time_stamp);
    4234             : 
    4235             :         /* If the commit page changed, then there's more data */
    4236           0 :         if (curr_commit_page != commit_page ||
    4237           0 :             curr_commit_ts != commit_ts)
    4238             :                 return 0;
    4239             : 
    4240             :         /* Still racy, as it may return a false positive, but that's OK */
    4241           0 :         return ((iter->head_page == commit_page && iter->head >= commit) ||
    4242           0 :                 (iter->head_page == reader && commit_page == head_page &&
    4243           0 :                  head_page->read == commit &&
    4244           0 :                  iter->head == rb_page_commit(cpu_buffer->reader_page)));
    4245             : }
    4246             : EXPORT_SYMBOL_GPL(ring_buffer_iter_empty);
    4247             : 
    4248             : static void
    4249           0 : rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
    4250             :                      struct ring_buffer_event *event)
    4251             : {
    4252           0 :         u64 delta;
    4253             : 
    4254           0 :         switch (event->type_len) {
    4255             :         case RINGBUF_TYPE_PADDING:
    4256             :                 return;
    4257             : 
    4258             :         case RINGBUF_TYPE_TIME_EXTEND:
    4259           0 :                 delta = ring_buffer_event_time_stamp(event);
    4260           0 :                 cpu_buffer->read_stamp += delta;
    4261           0 :                 return;
    4262             : 
    4263             :         case RINGBUF_TYPE_TIME_STAMP:
    4264           0 :                 delta = ring_buffer_event_time_stamp(event);
    4265           0 :                 cpu_buffer->read_stamp = delta;
    4266           0 :                 return;
    4267             : 
    4268           0 :         case RINGBUF_TYPE_DATA:
    4269           0 :                 cpu_buffer->read_stamp += event->time_delta;
    4270           0 :                 return;
    4271             : 
    4272             :         default:
    4273           0 :                 RB_WARN_ON(cpu_buffer, 1);
    4274             :         }
    4275           0 :         return;
    4276             : }
    4277             : 
    4278             : static void
    4279           0 : rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
    4280             :                           struct ring_buffer_event *event)
    4281             : {
    4282           0 :         u64 delta;
    4283             : 
    4284           0 :         switch (event->type_len) {
    4285             :         case RINGBUF_TYPE_PADDING:
    4286             :                 return;
    4287             : 
    4288             :         case RINGBUF_TYPE_TIME_EXTEND:
    4289           0 :                 delta = ring_buffer_event_time_stamp(event);
    4290           0 :                 iter->read_stamp += delta;
    4291           0 :                 return;
    4292             : 
    4293             :         case RINGBUF_TYPE_TIME_STAMP:
    4294           0 :                 delta = ring_buffer_event_time_stamp(event);
    4295           0 :                 iter->read_stamp = delta;
    4296           0 :                 return;
    4297             : 
    4298           0 :         case RINGBUF_TYPE_DATA:
    4299           0 :                 iter->read_stamp += event->time_delta;
    4300           0 :                 return;
    4301             : 
    4302             :         default:
    4303           0 :                 RB_WARN_ON(iter->cpu_buffer, 1);
    4304             :         }
    4305           0 :         return;
    4306             : }
    4307             : 
    4308             : static struct buffer_page *
    4309           0 : rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
    4310             : {
    4311           0 :         struct buffer_page *reader = NULL;
    4312           0 :         unsigned long overwrite;
    4313           0 :         unsigned long flags;
    4314           0 :         int nr_loops = 0;
    4315           0 :         int ret;
    4316             : 
    4317           0 :         local_irq_save(flags);
    4318           0 :         arch_spin_lock(&cpu_buffer->lock);
    4319             : 
    4320           0 :  again:
    4321             :         /*
    4322             :          * This should normally only loop twice. But because the
    4323             :          * start of the reader inserts an empty page, it causes
    4324             :          * a case where we will loop three times. There should be no
    4325             :          * reason to loop four times (that I know of).
    4326             :          */
    4327           0 :         if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) {
    4328           0 :                 reader = NULL;
    4329           0 :                 goto out;
    4330             :         }
    4331             : 
    4332           0 :         reader = cpu_buffer->reader_page;
    4333             : 
    4334             :         /* If there's more to read, return this page */
    4335           0 :         if (cpu_buffer->reader_page->read < rb_page_size(reader))
    4336           0 :                 goto out;
    4337             : 
    4338             :         /* Never should we have an index greater than the size */
    4339           0 :         if (RB_WARN_ON(cpu_buffer,
    4340             :                        cpu_buffer->reader_page->read > rb_page_size(reader)))
    4341           0 :                 goto out;
    4342             : 
    4343             :         /* check if we caught up to the tail */
    4344           0 :         reader = NULL;
    4345           0 :         if (cpu_buffer->commit_page == cpu_buffer->reader_page)
    4346           0 :                 goto out;
    4347             : 
    4348             :         /* Don't bother swapping if the ring buffer is empty */
    4349           0 :         if (rb_num_of_entries(cpu_buffer) == 0)
    4350           0 :                 goto out;
    4351             : 
    4352             :         /*
    4353             :          * Reset the reader page to size zero.
    4354             :          */
    4355           0 :         local_set(&cpu_buffer->reader_page->write, 0);
    4356           0 :         local_set(&cpu_buffer->reader_page->entries, 0);
    4357           0 :         local_set(&cpu_buffer->reader_page->page->commit, 0);
    4358           0 :         cpu_buffer->reader_page->real_end = 0;
    4359             : 
    4360           0 :  spin:
    4361             :         /*
    4362             :          * Splice the empty reader page into the list around the head.
    4363             :          */
    4364           0 :         reader = rb_set_head_page(cpu_buffer);
    4365           0 :         if (!reader)
    4366           0 :                 goto out;
    4367           0 :         cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
    4368           0 :         cpu_buffer->reader_page->list.prev = reader->list.prev;
    4369             : 
    4370             :         /*
    4371             :          * cpu_buffer->pages just needs to point to the buffer; it
    4372             :          *  has no specific buffer page to point to. Let's move it out
    4373             :          *  of our way so we don't accidentally swap it.
    4374             :          */
    4375           0 :         cpu_buffer->pages = reader->list.prev;
    4376             : 
    4377             :         /* The reader page will be pointing to the new head */
    4378           0 :         rb_set_list_to_head(&cpu_buffer->reader_page->list);
    4379             : 
    4380             :         /*
    4381             :          * We want to make sure we read the overruns after we set up our
    4382             :          * pointers to the next object. The writer side does a
    4383             :          * cmpxchg to cross pages which acts as the mb on the writer
    4384             :          * side. Note, the reader will constantly fail the swap
    4385             :          * while the writer is updating the pointers, so this
    4386             :          * guarantees that the overwrite recorded here is the one we
    4387             :          * want to compare with the last_overrun.
    4388             :          */
    4389           0 :         smp_mb();
    4390           0 :         overwrite = local_read(&(cpu_buffer->overrun));
    4391             : 
    4392             :         /*
    4393             :          * Here's the tricky part.
    4394             :          *
    4395             :          * We need to move the pointer past the header page.
    4396             :          * But we can only do that if a writer is not currently
    4397             :          * moving it. The page before the header page has the
    4398             :          * flag bit '1' set if it is pointing to the page we want.
    4399             :          * But if the writer is in the process of moving it,
    4400             :          * then it will be '2', or '0' if it has already moved.
    4401             :          */
    4402             : 
    4403           0 :         ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
    4404             : 
    4405             :         /*
    4406             :          * If we did not convert it, then we must try again.
    4407             :          */
    4408           0 :         if (!ret)
    4409           0 :                 goto spin;
    4410             : 
    4411             :         /*
    4412             :          * Yay! We succeeded in replacing the page.
    4413             :          *
    4414             :          * Now make the new head point back to the reader page.
    4415             :          */
    4416           0 :         rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
    4417           0 :         rb_inc_page(&cpu_buffer->head_page);
    4418             : 
    4419           0 :         local_inc(&cpu_buffer->pages_read);
    4420             : 
    4421             :         /* Finally update the reader page to the new head */
    4422           0 :         cpu_buffer->reader_page = reader;
    4423           0 :         cpu_buffer->reader_page->read = 0;
    4424             : 
    4425           0 :         if (overwrite != cpu_buffer->last_overrun) {
    4426           0 :                 cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
    4427           0 :                 cpu_buffer->last_overrun = overwrite;
    4428             :         }
    4429             : 
    4430           0 :         goto again;
    4431             : 
    4432           0 :  out:
    4433             :         /* Update the read_stamp on the first event */
    4434           0 :         if (reader && reader->read == 0)
    4435           0 :                 cpu_buffer->read_stamp = reader->page->time_stamp;
    4436             : 
    4437           0 :         arch_spin_unlock(&cpu_buffer->lock);
    4438           0 :         local_irq_restore(flags);
    4439             : 
    4440           0 :         return reader;
    4441             : }
    4442             : 
    4443           0 : static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
    4444             : {
    4445           0 :         struct ring_buffer_event *event;
    4446           0 :         struct buffer_page *reader;
    4447           0 :         unsigned length;
    4448             : 
    4449           0 :         reader = rb_get_reader_page(cpu_buffer);
    4450             : 
    4451             :         /* This function should not be called when buffer is empty */
    4452           0 :         if (RB_WARN_ON(cpu_buffer, !reader))
    4453             :                 return;
    4454             : 
    4455           0 :         event = rb_reader_event(cpu_buffer);
    4456             : 
    4457           0 :         if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
    4458           0 :                 cpu_buffer->read++;
    4459             : 
    4460           0 :         rb_update_read_stamp(cpu_buffer, event);
    4461             : 
    4462           0 :         length = rb_event_length(event);
    4463           0 :         cpu_buffer->reader_page->read += length;
    4464             : }
    4465             : 
    4466           0 : static void rb_advance_iter(struct ring_buffer_iter *iter)
    4467             : {
    4468           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    4469             : 
    4470           0 :         cpu_buffer = iter->cpu_buffer;
    4471             : 
    4472             :         /* If head == next_event then we need to jump to the next event */
    4473           0 :         if (iter->head == iter->next_event) {
    4474             :                 /* If the event gets overwritten again, there's nothing to do */
    4475           0 :                 if (rb_iter_head_event(iter) == NULL)
    4476             :                         return;
    4477             :         }
    4478             : 
    4479           0 :         iter->head = iter->next_event;
    4480             : 
    4481             :         /*
    4482             :          * Check if we are at the end of the buffer.
    4483             :          */
    4484           0 :         if (iter->next_event >= rb_page_size(iter->head_page)) {
    4485             :                 /* discarded commits can make the page empty */
    4486           0 :                 if (iter->head_page == cpu_buffer->commit_page)
    4487             :                         return;
    4488           0 :                 rb_inc_iter(iter);
    4489           0 :                 return;
    4490             :         }
    4491             : 
    4492           0 :         rb_update_iter_read_stamp(iter, iter->event);
    4493             : }
    4494             : 
    4495           0 : static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
    4496             : {
    4497           0 :         return cpu_buffer->lost_events;
    4498             : }
    4499             : 
    4500             : static struct ring_buffer_event *
    4501           0 : rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
    4502             :                unsigned long *lost_events)
    4503             : {
    4504           0 :         struct ring_buffer_event *event;
    4505           0 :         struct buffer_page *reader;
    4506           0 :         int nr_loops = 0;
    4507             : 
    4508           0 :         if (ts)
    4509           0 :                 *ts = 0;
    4510           0 :  again:
    4511             :         /*
    4512             :          * We repeat when a time extend is encountered.
    4513             :          * Since the time extend is always attached to a data event,
    4514             :          * we should never loop more than once.
    4515             :          * (We never hit the following condition more than twice).
    4516             :          */
    4517           0 :         if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
    4518             :                 return NULL;
    4519             : 
    4520           0 :         reader = rb_get_reader_page(cpu_buffer);
    4521           0 :         if (!reader)
    4522             :                 return NULL;
    4523             : 
    4524           0 :         event = rb_reader_event(cpu_buffer);
    4525             : 
    4526           0 :         switch (event->type_len) {
    4527             :         case RINGBUF_TYPE_PADDING:
    4528           0 :                 if (rb_null_event(event))
    4529           0 :                         RB_WARN_ON(cpu_buffer, 1);
    4530             :                 /*
    4531             :                  * Because the writer could be discarding every
    4532             :                  * event it creates (which would probably be bad)
    4533             :                  * if we were to go back to "again" then we may never
    4534             :                  * catch up, and will trigger the warn on, or lock
    4535             :                  * the box. Return the padding, and we will release
    4536             :                  * the current locks, and try again.
    4537             :                  */
    4538             :                 return event;
    4539             : 
    4540           0 :         case RINGBUF_TYPE_TIME_EXTEND:
    4541             :                 /* Internal data, OK to advance */
    4542           0 :                 rb_advance_reader(cpu_buffer);
    4543           0 :                 goto again;
    4544             : 
    4545           0 :         case RINGBUF_TYPE_TIME_STAMP:
    4546           0 :                 if (ts) {
    4547           0 :                         *ts = ring_buffer_event_time_stamp(event);
    4548           0 :                         ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
    4549             :                                                          cpu_buffer->cpu, ts);
    4550             :                 }
    4551             :                 /* Internal data, OK to advance */
    4552           0 :                 rb_advance_reader(cpu_buffer);
    4553           0 :                 goto again;
    4554             : 
    4555           0 :         case RINGBUF_TYPE_DATA:
    4556           0 :                 if (ts && !(*ts)) {
    4557           0 :                         *ts = cpu_buffer->read_stamp + event->time_delta;
    4558           0 :                         ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
    4559             :                                                          cpu_buffer->cpu, ts);
    4560             :                 }
    4561           0 :                 if (lost_events)
    4562           0 :                         *lost_events = rb_lost_events(cpu_buffer);
    4563             :                 return event;
    4564             : 
    4565             :         default:
    4566           0 :                 RB_WARN_ON(cpu_buffer, 1);
    4567             :         }
    4568             : 
    4569           0 :         return NULL;
    4570             : }
    4571             : EXPORT_SYMBOL_GPL(ring_buffer_peek);
    4572             : 
    4573             : static struct ring_buffer_event *
    4574           0 : rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
    4575             : {
    4576           0 :         struct trace_buffer *buffer;
    4577           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    4578           0 :         struct ring_buffer_event *event;
    4579           0 :         int nr_loops = 0;
    4580             : 
    4581           0 :         if (ts)
    4582           0 :                 *ts = 0;
    4583             : 
    4584           0 :         cpu_buffer = iter->cpu_buffer;
    4585           0 :         buffer = cpu_buffer->buffer;
    4586             : 
    4587             :         /*
    4588             :          * Check if someone performed a consuming read to
    4589             :          * the buffer. A consuming read invalidates the iterator
    4590             :          * and we need to reset the iterator in this case.
    4591             :          */
    4592           0 :         if (unlikely(iter->cache_read != cpu_buffer->read ||
    4593             :                      iter->cache_reader_page != cpu_buffer->reader_page))
    4594           0 :                 rb_iter_reset(iter);
    4595             : 
    4596           0 :  again:
    4597           0 :         if (ring_buffer_iter_empty(iter))
    4598             :                 return NULL;
    4599             : 
    4600             :         /*
    4601             :          * As the writer can mess with what the iterator is trying
    4602             :          * to read, just give up if we fail to get an event after
    4603             :          * three tries. The iterator is not as reliable when reading
    4604             :          * the ring buffer with an active write as the consumer is.
    4605             :          * Do not warn if three failures are reached.
    4606             :          */
    4607           0 :         if (++nr_loops > 3)
    4608             :                 return NULL;
    4609             : 
    4610           0 :         if (rb_per_cpu_empty(cpu_buffer))
    4611             :                 return NULL;
    4612             : 
    4613           0 :         if (iter->head >= rb_page_size(iter->head_page)) {
    4614           0 :                 rb_inc_iter(iter);
    4615           0 :                 goto again;
    4616             :         }
    4617             : 
    4618           0 :         event = rb_iter_head_event(iter);
    4619           0 :         if (!event)
    4620           0 :                 goto again;
    4621             : 
    4622           0 :         switch (event->type_len) {
    4623             :         case RINGBUF_TYPE_PADDING:
    4624           0 :                 if (rb_null_event(event)) {
    4625           0 :                         rb_inc_iter(iter);
    4626           0 :                         goto again;
    4627             :                 }
    4628           0 :                 rb_advance_iter(iter);
    4629           0 :                 return event;
    4630             : 
    4631           0 :         case RINGBUF_TYPE_TIME_EXTEND:
    4632             :                 /* Internal data, OK to advance */
    4633           0 :                 rb_advance_iter(iter);
    4634           0 :                 goto again;
    4635             : 
    4636           0 :         case RINGBUF_TYPE_TIME_STAMP:
    4637           0 :                 if (ts) {
    4638           0 :                         *ts = ring_buffer_event_time_stamp(event);
    4639           0 :                         ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
    4640             :                                                          cpu_buffer->cpu, ts);
    4641             :                 }
    4642             :                 /* Internal data, OK to advance */
    4643           0 :                 rb_advance_iter(iter);
    4644           0 :                 goto again;
    4645             : 
    4646           0 :         case RINGBUF_TYPE_DATA:
    4647           0 :                 if (ts && !(*ts)) {
    4648           0 :                         *ts = iter->read_stamp + event->time_delta;
    4649           0 :                         ring_buffer_normalize_time_stamp(buffer,
    4650             :                                                          cpu_buffer->cpu, ts);
    4651             :                 }
    4652             :                 return event;
    4653             : 
    4654             :         default:
    4655           0 :                 RB_WARN_ON(cpu_buffer, 1);
    4656             :         }
    4657             : 
    4658           0 :         return NULL;
    4659             : }
    4660             : EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
    4661             : 
    4662           0 : static inline bool rb_reader_lock(struct ring_buffer_per_cpu *cpu_buffer)
    4663             : {
    4664           0 :         if (likely(!in_nmi())) {
    4665           0 :                 raw_spin_lock(&cpu_buffer->reader_lock);
    4666           0 :                 return true;
    4667             :         }
    4668             : 
    4669             :         /*
    4670             :          * If an NMI die dumps out the content of the ring buffer,
    4671             :          * trylock must be used to prevent a deadlock if the NMI
    4672             :          * preempted a task that holds the ring buffer locks. If
    4673             :          * we get the lock then all is fine, if not, then continue
    4674             :          * to do the read, but this can corrupt the ring buffer,
    4675             :          * so it must be permanently disabled from future writes.
    4676             :          * Reading from NMI is a one-shot deal.
    4677             :          */
    4678           0 :         if (raw_spin_trylock(&cpu_buffer->reader_lock))
    4679             :                 return true;
    4680             : 
    4681             :         /* Continue without locking, but disable the ring buffer */
    4682           0 :         atomic_inc(&cpu_buffer->record_disabled);
    4683           0 :         return false;
    4684             : }
    4685             : 
    4686             : static inline void
    4687           0 : rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked)
    4688             : {
    4689           0 :         if (likely(locked))
    4690           0 :                 raw_spin_unlock(&cpu_buffer->reader_lock);
    4691           0 :         return;
    4692             : }
    4693             : 
    4694             : /**
    4695             :  * ring_buffer_peek - peek at the next event to be read
    4696             :  * @buffer: The ring buffer to read
    4697             :  * @cpu: The cpu to peek at
    4698             :  * @ts: The timestamp counter of this event.
    4699             :  * @lost_events: a variable to store if events were lost (may be NULL)
    4700             :  *
    4701             :  * This will return the event that will be read next, but does
    4702             :  * not consume the data.
    4703             :  */
    4704             : struct ring_buffer_event *
    4705           0 : ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts,
    4706             :                  unsigned long *lost_events)
    4707             : {
    4708           0 :         struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
    4709           0 :         struct ring_buffer_event *event;
    4710           0 :         unsigned long flags;
    4711           0 :         bool dolock;
    4712             : 
    4713           0 :         if (!cpumask_test_cpu(cpu, buffer->cpumask))
    4714             :                 return NULL;
    4715             : 
    4716           0 :  again:
    4717           0 :         local_irq_save(flags);
    4718           0 :         dolock = rb_reader_lock(cpu_buffer);
    4719           0 :         event = rb_buffer_peek(cpu_buffer, ts, lost_events);
    4720           0 :         if (event && event->type_len == RINGBUF_TYPE_PADDING)
    4721           0 :                 rb_advance_reader(cpu_buffer);
    4722           0 :         rb_reader_unlock(cpu_buffer, dolock);
    4723           0 :         local_irq_restore(flags);
    4724             : 
    4725           0 :         if (event && event->type_len == RINGBUF_TYPE_PADDING)
    4726           0 :                 goto again;
    4727             : 
    4728             :         return event;
    4729             : }
    4730             : 
    4731             : /** ring_buffer_iter_dropped - report if there are dropped events
    4732             :  * @iter: The ring buffer iterator
    4733             :  *
    4734             :  * Returns true if there were dropped events since the last peek.
    4735             :  */
    4736           0 : bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter)
    4737             : {
    4738           0 :         bool ret = iter->missed_events != 0;
    4739             : 
    4740           0 :         iter->missed_events = 0;
    4741           0 :         return ret;
    4742             : }
    4743             : EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped);
    4744             : 
    4745             : /**
    4746             :  * ring_buffer_iter_peek - peek at the next event to be read
    4747             :  * @iter: The ring buffer iterator
    4748             :  * @ts: The timestamp counter of this event.
    4749             :  *
    4750             :  * This will return the event that will be read next, but does
    4751             :  * not increment the iterator.
    4752             :  */
    4753             : struct ring_buffer_event *
    4754           0 : ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
    4755             : {
    4756           0 :         struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
    4757           0 :         struct ring_buffer_event *event;
    4758           0 :         unsigned long flags;
    4759             : 
    4760           0 :  again:
    4761           0 :         raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
    4762           0 :         event = rb_iter_peek(iter, ts);
    4763           0 :         raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
    4764             : 
    4765           0 :         if (event && event->type_len == RINGBUF_TYPE_PADDING)
    4766           0 :                 goto again;
    4767             : 
    4768           0 :         return event;
    4769             : }
    4770             : 
    4771             : /**
    4772             :  * ring_buffer_consume - return an event and consume it
    4773             :  * @buffer: The ring buffer to get the next event from
    4774             :  * @cpu: the cpu to read the buffer from
    4775             :  * @ts: a variable to store the timestamp (may be NULL)
    4776             :  * @lost_events: a variable to store if events were lost (may be NULL)
    4777             :  *
    4778             :  * Returns the next event in the ring buffer, and that event is consumed.
    4779             :  * Meaning that sequential reads will keep returning a different event,
    4780             :  * and eventually empty the ring buffer if the producer is slower.
    4781             :  */
    4782             : struct ring_buffer_event *
    4783           0 : ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
    4784             :                     unsigned long *lost_events)
    4785             : {
    4786           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    4787           0 :         struct ring_buffer_event *event = NULL;
    4788           0 :         unsigned long flags;
    4789           0 :         bool dolock;
    4790             : 
    4791           0 :  again:
    4792             :         /* might be called in atomic */
    4793           0 :         preempt_disable();
    4794             : 
    4795           0 :         if (!cpumask_test_cpu(cpu, buffer->cpumask))
    4796           0 :                 goto out;
    4797             : 
    4798           0 :         cpu_buffer = buffer->buffers[cpu];
    4799           0 :         local_irq_save(flags);
    4800           0 :         dolock = rb_reader_lock(cpu_buffer);
    4801             : 
    4802           0 :         event = rb_buffer_peek(cpu_buffer, ts, lost_events);
    4803           0 :         if (event) {
    4804           0 :                 cpu_buffer->lost_events = 0;
    4805           0 :                 rb_advance_reader(cpu_buffer);
    4806             :         }
    4807             : 
    4808           0 :         rb_reader_unlock(cpu_buffer, dolock);
    4809           0 :         local_irq_restore(flags);
    4810             : 
    4811           0 :  out:
    4812           0 :         preempt_enable();
    4813             : 
    4814           0 :         if (event && event->type_len == RINGBUF_TYPE_PADDING)
    4815           0 :                 goto again;
    4816             : 
    4817           0 :         return event;
    4818             : }
    4819             : EXPORT_SYMBOL_GPL(ring_buffer_consume);
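
A minimal consuming-read loop built on the function above (sketch only: the helper name is invented, and ring_buffer_event_data() is assumed to be available from <linux/ring_buffer.h>):

static void example_drain_cpu(struct trace_buffer *buffer, int cpu)
{
        struct ring_buffer_event *event;
        unsigned long lost = 0;
        u64 ts = 0;

        while ((event = ring_buffer_consume(buffer, cpu, &ts, &lost))) {
                void *payload = ring_buffer_event_data(event);

                /* each iteration consumes one event; 'lost' counts drops since the last one */
                pr_info("cpu%d: ts=%llu lost=%lu data=%p\n",
                        cpu, (unsigned long long)ts, lost, payload);
        }
}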
    4820             : 
    4821             : /**
    4822             :  * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
    4823             :  * @buffer: The ring buffer to read from
    4824             :  * @cpu: The cpu buffer to iterate over
    4825             :  * @flags: gfp flags to use for memory allocation
    4826             :  *
    4827             :  * This performs the initial preparations necessary to iterate
    4828             :  * through the buffer.  Memory is allocated, buffer recording
    4829             :  * is disabled, and the iterator pointer is returned to the caller.
    4830             :  *
    4831             :  * Disabling buffer recording prevents the reading from being
    4832             :  * corrupted. This is not a consuming read, so a producer is not
    4833             :  * expected.
    4834             :  *
    4835             :  * After a sequence of ring_buffer_read_prepare calls, the user is
    4836             :  * expected to make at least one call to ring_buffer_read_prepare_sync.
    4837             :  * Afterwards, ring_buffer_read_start is invoked to get things going
    4838             :  * for real.
    4839             :  *
    4840             :  * This overall must be paired with ring_buffer_read_finish.
    4841             :  */
    4842             : struct ring_buffer_iter *
    4843           0 : ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
    4844             : {
    4845           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    4846           0 :         struct ring_buffer_iter *iter;
    4847             : 
    4848           0 :         if (!cpumask_test_cpu(cpu, buffer->cpumask))
    4849             :                 return NULL;
    4850             : 
    4851           0 :         iter = kzalloc(sizeof(*iter), flags);
    4852           0 :         if (!iter)
    4853             :                 return NULL;
    4854             : 
    4855           0 :         iter->event = kmalloc(BUF_MAX_DATA_SIZE, flags);
    4856           0 :         if (!iter->event) {
    4857           0 :                 kfree(iter);
    4858           0 :                 return NULL;
    4859             :         }
    4860             : 
    4861           0 :         cpu_buffer = buffer->buffers[cpu];
    4862             : 
    4863           0 :         iter->cpu_buffer = cpu_buffer;
    4864             : 
    4865           0 :         atomic_inc(&cpu_buffer->resize_disabled);
    4866             : 
    4867           0 :         return iter;
    4868             : }
    4869             : EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
    4870             : 
    4871             : /**
    4872             :  * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
    4873             :  *
    4874             :  * All previously invoked ring_buffer_read_prepare calls to prepare
    4875             :  * iterators will be synchronized.  Afterwards, ring_buffer_read_start
    4876             :  * calls on those iterators are allowed.
    4877             :  */
    4878             : void
    4879           0 : ring_buffer_read_prepare_sync(void)
    4880             : {
    4881           0 :         synchronize_rcu();
    4882           0 : }
    4883             : EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
    4884             : 
    4885             : /**
    4886             :  * ring_buffer_read_start - start a non consuming read of the buffer
    4887             :  * @iter: The iterator returned by ring_buffer_read_prepare
    4888             :  *
    4889             :  * This finalizes the startup of an iteration through the buffer.
    4890             :  * The iterator comes from a call to ring_buffer_read_prepare and
    4891             :  * an intervening ring_buffer_read_prepare_sync must have been
    4892             :  * performed.
    4893             :  *
    4894             :  * Must be paired with ring_buffer_read_finish.
    4895             :  */
    4896             : void
    4897           0 : ring_buffer_read_start(struct ring_buffer_iter *iter)
    4898             : {
    4899           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    4900           0 :         unsigned long flags;
    4901             : 
    4902           0 :         if (!iter)
    4903             :                 return;
    4904             : 
    4905           0 :         cpu_buffer = iter->cpu_buffer;
    4906             : 
    4907           0 :         raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
    4908           0 :         arch_spin_lock(&cpu_buffer->lock);
    4909           0 :         rb_iter_reset(iter);
    4910           0 :         arch_spin_unlock(&cpu_buffer->lock);
    4911           0 :         raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
    4912             : }
    4913             : EXPORT_SYMBOL_GPL(ring_buffer_read_start);
    4914             : 
    4915             : /**
    4916             :  * ring_buffer_read_finish - finish reading the iterator of the buffer
    4917             :  * @iter: The iterator retrieved by ring_buffer_read_prepare
    4918             :  *
    4919             :  * This re-enables the recording to the buffer, and frees the
    4920             :  * iterator.
    4921             :  */
    4922             : void
    4923           0 : ring_buffer_read_finish(struct ring_buffer_iter *iter)
    4924             : {
    4925           0 :         struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
    4926           0 :         unsigned long flags;
    4927             : 
    4928             :         /*
    4929             :          * Ring buffer is disabled from recording; here's a good place
    4930             :          * to check the integrity of the ring buffer.
    4931             :          * Must prevent readers from trying to read, as the check
    4932             :          * clears the HEAD page and readers require it.
    4933             :          */
    4934           0 :         raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
    4935           0 :         rb_check_pages(cpu_buffer);
    4936           0 :         raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
    4937             : 
    4938           0 :         atomic_dec(&cpu_buffer->resize_disabled);
    4939           0 :         kfree(iter->event);
    4940           0 :         kfree(iter);
    4941           0 : }
    4942             : EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
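
Putting the kernel-doc above together, a non-consuming read follows prepare -> prepare_sync -> read_start -> (peek/advance) -> read_finish. A caller sketch (illustrative only; the helper name and the GFP_KERNEL allocation context are assumptions, not taken from this file):

static void example_iterate_cpu(struct trace_buffer *buffer, int cpu)
{
        struct ring_buffer_iter *iter;
        struct ring_buffer_event *event;
        u64 ts;

        iter = ring_buffer_read_prepare(buffer, cpu, GFP_KERNEL);
        if (!iter)
                return;

        ring_buffer_read_prepare_sync();
        ring_buffer_read_start(iter);

        /* walk the buffer without consuming it; recording stays disabled meanwhile */
        while ((event = ring_buffer_iter_peek(iter, &ts)))
                ring_buffer_iter_advance(iter);

        ring_buffer_read_finish(iter);
}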
    4943             : 
    4944             : /**
    4945             :  * ring_buffer_iter_advance - advance the iterator to the next location
    4946             :  * @iter: The ring buffer iterator
    4947             :  *
    4948             :  * Move the location of the iterator such that the next read will
    4949             :  * be the next location of the iterator.
    4950             :  */
    4951           0 : void ring_buffer_iter_advance(struct ring_buffer_iter *iter)
    4952             : {
    4953           0 :         struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
    4954           0 :         unsigned long flags;
    4955             : 
    4956           0 :         raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
    4957             : 
    4958           0 :         rb_advance_iter(iter);
    4959             : 
    4960           0 :         raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
    4961           0 : }
    4962             : EXPORT_SYMBOL_GPL(ring_buffer_iter_advance);
    4963             : 
    4964             : /**
    4965             :  * ring_buffer_size - return the size of the ring buffer (in bytes)
    4966             :  * @buffer: The ring buffer.
    4967             :  * @cpu: The CPU to get ring buffer size from.
    4968             :  */
    4969           1 : unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
    4970             : {
    4971             :         /*
    4972             :          * Earlier, this method returned
    4973             :          *      BUF_PAGE_SIZE * buffer->nr_pages
    4974             :          * Since the nr_pages field is now removed, we have converted this to
    4975             :          * return the per cpu buffer value.
    4976             :          */
    4977           1 :         if (!cpumask_test_cpu(cpu, buffer->cpumask))
    4978             :                 return 0;
    4979             : 
    4980           1 :         return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
    4981             : }
    4982             : EXPORT_SYMBOL_GPL(ring_buffer_size);
    4983             : 
    4984             : static void
    4985           0 : rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
    4986             : {
    4987           0 :         rb_head_page_deactivate(cpu_buffer);
    4988             : 
    4989           0 :         cpu_buffer->head_page
    4990           0 :                 = list_entry(cpu_buffer->pages, struct buffer_page, list);
    4991           0 :         local_set(&cpu_buffer->head_page->write, 0);
    4992           0 :         local_set(&cpu_buffer->head_page->entries, 0);
    4993           0 :         local_set(&cpu_buffer->head_page->page->commit, 0);
    4994             : 
    4995           0 :         cpu_buffer->head_page->read = 0;
    4996             : 
    4997           0 :         cpu_buffer->tail_page = cpu_buffer->head_page;
    4998           0 :         cpu_buffer->commit_page = cpu_buffer->head_page;
    4999             : 
    5000           0 :         INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
    5001           0 :         INIT_LIST_HEAD(&cpu_buffer->new_pages);
    5002           0 :         local_set(&cpu_buffer->reader_page->write, 0);
    5003           0 :         local_set(&cpu_buffer->reader_page->entries, 0);
    5004           0 :         local_set(&cpu_buffer->reader_page->page->commit, 0);
    5005           0 :         cpu_buffer->reader_page->read = 0;
    5006             : 
    5007           0 :         local_set(&cpu_buffer->entries_bytes, 0);
    5008           0 :         local_set(&cpu_buffer->overrun, 0);
    5009           0 :         local_set(&cpu_buffer->commit_overrun, 0);
    5010           0 :         local_set(&cpu_buffer->dropped_events, 0);
    5011           0 :         local_set(&cpu_buffer->entries, 0);
    5012           0 :         local_set(&cpu_buffer->committing, 0);
    5013           0 :         local_set(&cpu_buffer->commits, 0);
    5014           0 :         local_set(&cpu_buffer->pages_touched, 0);
    5015           0 :         local_set(&cpu_buffer->pages_read, 0);
    5016           0 :         cpu_buffer->last_pages_touch = 0;
    5017           0 :         cpu_buffer->shortest_full = 0;
    5018           0 :         cpu_buffer->read = 0;
    5019           0 :         cpu_buffer->read_bytes = 0;
    5020             : 
    5021           0 :         rb_time_set(&cpu_buffer->write_stamp, 0);
    5022           0 :         rb_time_set(&cpu_buffer->before_stamp, 0);
    5023             : 
    5024           0 :         cpu_buffer->lost_events = 0;
    5025           0 :         cpu_buffer->last_overrun = 0;
    5026             : 
    5027           0 :         rb_head_page_activate(cpu_buffer);
    5028           0 : }
    5029             : 
    5030             : /* Must have disabled the cpu buffer then done a synchronize_rcu */
    5031           0 : static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
    5032             : {
    5033           0 :         unsigned long flags;
    5034             : 
    5035           0 :         raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
    5036             : 
    5037           0 :         if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
    5038           0 :                 goto out;
    5039             : 
    5040           0 :         arch_spin_lock(&cpu_buffer->lock);
    5041             : 
    5042           0 :         rb_reset_cpu(cpu_buffer);
    5043             : 
    5044           0 :         arch_spin_unlock(&cpu_buffer->lock);
    5045             : 
    5046           0 :  out:
    5047           0 :         raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
    5048           0 : }
    5049             : 
    5050             : /**
    5051             :  * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
    5052             :  * @buffer: The ring buffer to reset a per cpu buffer of
    5053             :  * @cpu: The CPU buffer to be reset
    5054             :  */
    5055           0 : void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
    5056             : {
    5057           0 :         struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
    5058             : 
    5059           0 :         if (!cpumask_test_cpu(cpu, buffer->cpumask))
    5060             :                 return;
    5061             : 
    5062             :         /* prevent another thread from changing buffer sizes */
    5063           0 :         mutex_lock(&buffer->mutex);
    5064             : 
    5065           0 :         atomic_inc(&cpu_buffer->resize_disabled);
    5066           0 :         atomic_inc(&cpu_buffer->record_disabled);
    5067             : 
    5068             :         /* Make sure all commits have finished */
    5069           0 :         synchronize_rcu();
    5070             : 
    5071           0 :         reset_disabled_cpu_buffer(cpu_buffer);
    5072             : 
    5073           0 :         atomic_dec(&cpu_buffer->record_disabled);
    5074           0 :         atomic_dec(&cpu_buffer->resize_disabled);
    5075             : 
    5076           0 :         mutex_unlock(&buffer->mutex);
    5077             : }
    5078             : EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
    5079             : 
    5080             : /**
    5081             :  * ring_buffer_reset_online_cpus - reset the online per cpu buffers of a
    5082             :  *                                 ring buffer
    5083             :  * @buffer: The ring buffer whose online per cpu buffers are to be reset
    5084             :  */
    5085           0 : void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
    5086             : {
    5087           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    5088           0 :         int cpu;
    5089             : 
    5090             :         /* prevent another thread from changing buffer sizes */
    5091           0 :         mutex_lock(&buffer->mutex);
    5092             : 
    5093           0 :         for_each_online_buffer_cpu(buffer, cpu) {
    5094           0 :                 cpu_buffer = buffer->buffers[cpu];
    5095             : 
    5096           0 :                 atomic_inc(&cpu_buffer->resize_disabled);
    5097           0 :                 atomic_inc(&cpu_buffer->record_disabled);
    5098             :         }
    5099             : 
    5100             :         /* Make sure all commits have finished */
    5101           0 :         synchronize_rcu();
    5102             : 
    5103           0 :         for_each_online_buffer_cpu(buffer, cpu) {
    5104           0 :                 cpu_buffer = buffer->buffers[cpu];
    5105             : 
    5106           0 :                 reset_disabled_cpu_buffer(cpu_buffer);
    5107             : 
    5108           0 :                 atomic_dec(&cpu_buffer->record_disabled);
    5109           0 :                 atomic_dec(&cpu_buffer->resize_disabled);
    5110             :         }
    5111             : 
    5112           0 :         mutex_unlock(&buffer->mutex);
    5113           0 : }
    5114             : 
    5115             : /**
    5116             :  * ring_buffer_reset - reset a ring buffer
    5117             :  * @buffer: The ring buffer whose per cpu buffers are all to be reset
    5118             :  */
    5119           0 : void ring_buffer_reset(struct trace_buffer *buffer)
    5120             : {
    5121           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    5122           0 :         int cpu;
    5123             : 
    5124           0 :         for_each_buffer_cpu(buffer, cpu) {
    5125           0 :                 cpu_buffer = buffer->buffers[cpu];
    5126             : 
    5127           0 :                 atomic_inc(&cpu_buffer->resize_disabled);
    5128           0 :                 atomic_inc(&cpu_buffer->record_disabled);
    5129             :         }
    5130             : 
    5131             :         /* Make sure all commits have finished */
    5132           0 :         synchronize_rcu();
    5133             : 
    5134           0 :         for_each_buffer_cpu(buffer, cpu) {
    5135           0 :                 cpu_buffer = buffer->buffers[cpu];
    5136             : 
    5137           0 :                 reset_disabled_cpu_buffer(cpu_buffer);
    5138             : 
    5139           0 :                 atomic_dec(&cpu_buffer->record_disabled);
    5140           0 :                 atomic_dec(&cpu_buffer->resize_disabled);
    5141             :         }
    5142           0 : }
    5143             : EXPORT_SYMBOL_GPL(ring_buffer_reset);
    5144             : 
    5145             : /**
    5146             :  * ring_buffer_empty - is the ring buffer empty?
    5147             :  * @buffer: The ring buffer to test
    5148             :  */
    5149           0 : bool ring_buffer_empty(struct trace_buffer *buffer)
    5150             : {
    5151           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    5152           0 :         unsigned long flags;
    5153           0 :         bool dolock;
    5154           0 :         int cpu;
    5155           0 :         int ret;
    5156             : 
    5157             :         /* yes this is racy, but if you don't like the race, lock the buffer */
    5158           0 :         for_each_buffer_cpu(buffer, cpu) {
    5159           0 :                 cpu_buffer = buffer->buffers[cpu];
    5160           0 :                 local_irq_save(flags);
    5161           0 :                 dolock = rb_reader_lock(cpu_buffer);
    5162           0 :                 ret = rb_per_cpu_empty(cpu_buffer);
    5163           0 :                 rb_reader_unlock(cpu_buffer, dolock);
    5164           0 :                 local_irq_restore(flags);
    5165             : 
    5166           0 :                 if (!ret)
    5167             :                         return false;
    5168             :         }
    5169             : 
    5170             :         return true;
    5171             : }
    5172             : EXPORT_SYMBOL_GPL(ring_buffer_empty);
    5173             : 
    5174             : /**
    5175             :  * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
    5176             :  * @buffer: The ring buffer
    5177             :  * @cpu: The CPU buffer to test
    5178             :  */
    5179           0 : bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu)
    5180             : {
    5181           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    5182           0 :         unsigned long flags;
    5183           0 :         bool dolock;
    5184           0 :         int ret;
    5185             : 
    5186           0 :         if (!cpumask_test_cpu(cpu, buffer->cpumask))
    5187             :                 return true;
    5188             : 
    5189           0 :         cpu_buffer = buffer->buffers[cpu];
    5190           0 :         local_irq_save(flags);
    5191           0 :         dolock = rb_reader_lock(cpu_buffer);
    5192           0 :         ret = rb_per_cpu_empty(cpu_buffer);
    5193           0 :         rb_reader_unlock(cpu_buffer, dolock);
    5194           0 :         local_irq_restore(flags);
    5195             : 
    5196             :         return ret;
    5197             : }
    5198             : EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
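/*
 * Editor's sketch, not part of ring_buffer.c: a reader-side use of
 * ring_buffer_empty()/ring_buffer_empty_cpu() together with
 * ring_buffer_consume() (also used by the startup test further down)
 * to drain whatever is currently queued. process_event() is a
 * hypothetical callback.
 */
static void example_drain_once(struct trace_buffer *buffer,
                               void (*process_event)(struct ring_buffer_event *event))
{
        struct ring_buffer_event *event;
        unsigned long lost;
        int cpu;

        /* Cheap, racy check: nothing queued anywhere, nothing to do. */
        if (ring_buffer_empty(buffer))
                return;

        for_each_online_cpu(cpu) {
                if (ring_buffer_empty_cpu(buffer, cpu))
                        continue;
                /* NULL timestamp; &lost receives the dropped-event count */
                while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost)))
                        process_event(event);
        }
}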
    5199             : 
    5200             : #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
    5201             : /**
    5202             :  * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
    5203             :  * @buffer_a: One buffer to swap with
    5204             :  * @buffer_b: The other buffer to swap with
    5205             :  * @cpu: the CPU of the buffers to swap
    5206             :  *
    5207             :  * This function is useful for tracers that want to take a "snapshot"
    5208             :  * of a CPU buffer and have another backup buffer lying around.
    5209             :  * It is expected that the tracer handles the cpu buffer not being
    5210             :  * used at the moment.
    5211             :  */
    5212             : int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
    5213             :                          struct trace_buffer *buffer_b, int cpu)
    5214             : {
    5215             :         struct ring_buffer_per_cpu *cpu_buffer_a;
    5216             :         struct ring_buffer_per_cpu *cpu_buffer_b;
    5217             :         int ret = -EINVAL;
    5218             : 
    5219             :         if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
    5220             :             !cpumask_test_cpu(cpu, buffer_b->cpumask))
    5221             :                 goto out;
    5222             : 
    5223             :         cpu_buffer_a = buffer_a->buffers[cpu];
    5224             :         cpu_buffer_b = buffer_b->buffers[cpu];
    5225             : 
    5226             :         /* At least make sure the two buffers are somewhat the same */
    5227             :         if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
    5228             :                 goto out;
    5229             : 
    5230             :         ret = -EAGAIN;
    5231             : 
    5232             :         if (atomic_read(&buffer_a->record_disabled))
    5233             :                 goto out;
    5234             : 
    5235             :         if (atomic_read(&buffer_b->record_disabled))
    5236             :                 goto out;
    5237             : 
    5238             :         if (atomic_read(&cpu_buffer_a->record_disabled))
    5239             :                 goto out;
    5240             : 
    5241             :         if (atomic_read(&cpu_buffer_b->record_disabled))
    5242             :                 goto out;
    5243             : 
    5244             :         /*
    5245             :          * We can't do a synchronize_rcu here because this
    5246             :          * function can be called in atomic context.
    5247             :          * Normally this will be called from the same CPU as cpu.
    5248             :          * If not it's up to the caller to protect this.
    5249             :          */
    5250             :         atomic_inc(&cpu_buffer_a->record_disabled);
    5251             :         atomic_inc(&cpu_buffer_b->record_disabled);
    5252             : 
    5253             :         ret = -EBUSY;
    5254             :         if (local_read(&cpu_buffer_a->committing))
    5255             :                 goto out_dec;
    5256             :         if (local_read(&cpu_buffer_b->committing))
    5257             :                 goto out_dec;
    5258             : 
    5259             :         buffer_a->buffers[cpu] = cpu_buffer_b;
    5260             :         buffer_b->buffers[cpu] = cpu_buffer_a;
    5261             : 
    5262             :         cpu_buffer_b->buffer = buffer_a;
    5263             :         cpu_buffer_a->buffer = buffer_b;
    5264             : 
    5265             :         ret = 0;
    5266             : 
    5267             : out_dec:
    5268             :         atomic_dec(&cpu_buffer_a->record_disabled);
    5269             :         atomic_dec(&cpu_buffer_b->record_disabled);
    5270             : out:
    5271             :         return ret;
    5272             : }
    5273             : EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
    5274             : #endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */
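/*
 * Editor's sketch, not part of ring_buffer.c: the "snapshot" pattern the
 * ring_buffer_swap_cpu() kernel-doc describes. A tracer keeps a spare
 * trace_buffer of the same per cpu size and swaps the live buffers into
 * it; -EAGAIN (recording disabled) and -EBUSY (commit in flight) are
 * transient and left to the caller to retry. Requires
 * CONFIG_RING_BUFFER_ALLOW_SWAP; the names here are hypothetical.
 */
static int example_snapshot(struct trace_buffer *live, struct trace_buffer *spare)
{
        int cpu, ret = 0;

        for_each_online_cpu(cpu) {
                int err = ring_buffer_swap_cpu(live, spare, cpu);

                /* Remember the first failure but keep trying the other CPUs. */
                if (err && !ret)
                        ret = err;
        }

        return ret;
}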
    5275             : 
    5276             : /**
    5277             :  * ring_buffer_alloc_read_page - allocate a page to read from buffer
    5278             :  * @buffer: the buffer to allocate for.
    5279             :  * @cpu: the cpu buffer to allocate.
    5280             :  *
    5281             :  * This function is used in conjunction with ring_buffer_read_page.
    5282             :  * When reading a full page from the ring buffer, these functions
    5283             :  * can be used to speed up the process. The calling function should
    5284             :  * allocate a few pages first with this function. Then when it
    5285             :  * needs to get pages from the ring buffer, it passes the result
    5286             :  * of this function into ring_buffer_read_page, which will swap
    5287             :  * the page that was allocated, with the read page of the buffer.
    5288             :  *
    5289             :  * Returns:
    5290             :  *  The page allocated, or ERR_PTR
    5291             :  */
    5292           0 : void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
    5293             : {
    5294           0 :         struct ring_buffer_per_cpu *cpu_buffer;
    5295           0 :         struct buffer_data_page *bpage = NULL;
    5296           0 :         unsigned long flags;
    5297           0 :         struct page *page;
    5298             : 
    5299           0 :         if (!cpumask_test_cpu(cpu, buffer->cpumask))
    5300           0 :                 return ERR_PTR(-ENODEV);
    5301             : 
    5302           0 :         cpu_buffer = buffer->buffers[cpu];
    5303           0 :         local_irq_save(flags);
    5304           0 :         arch_spin_lock(&cpu_buffer->lock);
    5305             : 
    5306           0 :         if (cpu_buffer->free_page) {
    5307           0 :                 bpage = cpu_buffer->free_page;
    5308           0 :                 cpu_buffer->free_page = NULL;
    5309             :         }
    5310             : 
    5311           0 :         arch_spin_unlock(&cpu_buffer->lock);
    5312           0 :         local_irq_restore(flags);
    5313             : 
    5314           0 :         if (bpage)
    5315           0 :                 goto out;
    5316             : 
    5317           0 :         page = alloc_pages_node(cpu_to_node(cpu),
    5318             :                                 GFP_KERNEL | __GFP_NORETRY, 0);
    5319           0 :         if (!page)
    5320           0 :                 return ERR_PTR(-ENOMEM);
    5321             : 
    5322           0 :         bpage = page_address(page);
    5323             : 
    5324           0 :  out:
    5325           0 :         rb_init_page(bpage);
    5326             : 
    5327           0 :         return bpage;
    5328             : }
    5329             : EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
    5330             : 
    5331             : /**
    5332             :  * ring_buffer_free_read_page - free an allocated read page
    5333             :  * @buffer: the buffer the page was allocated for
    5334             :  * @cpu: the cpu buffer the page came from
    5335             :  * @data: the page to free
    5336             :  *
    5337             :  * Free a page allocated from ring_buffer_alloc_read_page.
    5338             :  */
    5339           0 : void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
    5340             : {
    5341           0 :         struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
    5342           0 :         struct buffer_data_page *bpage = data;
    5343           0 :         struct page *page = virt_to_page(bpage);
    5344           0 :         unsigned long flags;
    5345             : 
    5346             :         /* If the page is still in use someplace else, we can't reuse it */
    5347           0 :         if (page_ref_count(page) > 1)
    5348           0 :                 goto out;
    5349             : 
    5350           0 :         local_irq_save(flags);
    5351           0 :         arch_spin_lock(&cpu_buffer->lock);
    5352             : 
    5353           0 :         if (!cpu_buffer->free_page) {
    5354           0 :                 cpu_buffer->free_page = bpage;
    5355           0 :                 bpage = NULL;
    5356             :         }
    5357             : 
    5358           0 :         arch_spin_unlock(&cpu_buffer->lock);
    5359           0 :         local_irq_restore(flags);
    5360             : 
    5361           0 :  out:
    5362           0 :         free_page((unsigned long)bpage);
    5363           0 : }
    5364             : EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
    5365             : 
    5366             : /**
    5367             :  * ring_buffer_read_page - extract a page from the ring buffer
    5368             :  * @buffer: buffer to extract from
    5369             :  * @data_page: the page to use allocated from ring_buffer_alloc_read_page
    5370             :  * @len: amount to extract
    5371             :  * @cpu: the cpu of the buffer to extract
    5372             :  * @full: should the extraction only happen when the page is full.
    5373             :  *
    5374             :  * This function will pull out a page from the ring buffer and consume it.
    5375             :  * @data_page must be the address of the variable that was returned
    5376             :  * from ring_buffer_alloc_read_page. This is because the page might be used
    5377             :  * to swap with a page in the ring buffer.
    5378             :  *
    5379             :  * for example:
    5380             :  *      rpage = ring_buffer_alloc_read_page(buffer, cpu);
    5381             :  *      if (IS_ERR(rpage))
    5382             :  *              return PTR_ERR(rpage);
    5383             :  *      ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
    5384             :  *      if (ret >= 0)
    5385             :  *              process_page(rpage, ret);
    5386             :  *
    5387             :  * When @full is set, the function will not succeed unless
    5388             :  * the writer is off the reader page.
    5389             :  *
    5390             :  * Note: it is up to the calling functions to handle sleeps and wakeups.
    5391             :  *  The ring buffer can be used anywhere in the kernel and can not
    5392             :  *  blindly call wake_up. The layer that uses the ring buffer must be
    5393             :  *  responsible for that.
    5394             :  *
    5395             :  * Returns:
    5396             :  *  >=0 if data has been transferred, returns the offset of consumed data.
    5397             :  *  <0 if no data has been transferred.
    5398             :  */
    5399           0 : int ring_buffer_read_page(struct trace_buffer *buffer,
    5400             :                           void **data_page, size_t len, int cpu, int full)
    5401             : {
    5402           0 :         struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
    5403           0 :         struct ring_buffer_event *event;
    5404           0 :         struct buffer_data_page *bpage;
    5405           0 :         struct buffer_page *reader;
    5406           0 :         unsigned long missed_events;
    5407           0 :         unsigned long flags;
    5408           0 :         unsigned int commit;
    5409           0 :         unsigned int read;
    5410           0 :         u64 save_timestamp;
    5411           0 :         int ret = -1;
    5412             : 
    5413           0 :         if (!cpumask_test_cpu(cpu, buffer->cpumask))
    5414           0 :                 goto out;
    5415             : 
    5416             :         /*
    5417             :          * If len is not big enough to hold the page header, then
    5418             :          * we can not copy anything.
    5419             :          */
    5420           0 :         if (len <= BUF_PAGE_HDR_SIZE)
    5421           0 :                 goto out;
    5422             : 
    5423           0 :         len -= BUF_PAGE_HDR_SIZE;
    5424             : 
    5425           0 :         if (!data_page)
    5426           0 :                 goto out;
    5427             : 
    5428           0 :         bpage = *data_page;
    5429           0 :         if (!bpage)
    5430           0 :                 goto out;
    5431             : 
    5432           0 :         raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
    5433             : 
    5434           0 :         reader = rb_get_reader_page(cpu_buffer);
    5435           0 :         if (!reader)
    5436           0 :                 goto out_unlock;
    5437             : 
    5438           0 :         event = rb_reader_event(cpu_buffer);
    5439             : 
    5440           0 :         read = reader->read;
    5441           0 :         commit = rb_page_commit(reader);
    5442             : 
    5443             :         /* Check if any events were dropped */
    5444           0 :         missed_events = cpu_buffer->lost_events;
    5445             : 
    5446             :         /*
    5447             :          * If this page has been partially read or
    5448             :          * if len is not big enough to read the rest of the page or
    5449             :          * a writer is still on the page, then
    5450             :          * we must copy the data from the page to the buffer.
    5451             :          * Otherwise, we can simply swap the page with the one passed in.
    5452             :          */
    5453           0 :         if (read || (len < (commit - read)) ||
    5454           0 :             cpu_buffer->reader_page == cpu_buffer->commit_page) {
    5455           0 :                 struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
    5456           0 :                 unsigned int rpos = read;
    5457           0 :                 unsigned int pos = 0;
    5458           0 :                 unsigned int size;
    5459             : 
    5460           0 :                 if (full)
    5461           0 :                         goto out_unlock;
    5462             : 
    5463           0 :                 if (len > (commit - read))
    5464             :                         len = (commit - read);
    5465             : 
    5466             :                 /* Always keep the time extend and data together */
    5467           0 :                 size = rb_event_ts_length(event);
    5468             : 
    5469           0 :                 if (len < size)
    5470           0 :                         goto out_unlock;
    5471             : 
    5472             :                 /* save the current timestamp, since the user will need it */
    5473           0 :                 save_timestamp = cpu_buffer->read_stamp;
    5474             : 
    5475             :                 /* Need to copy one event at a time */
    5476           0 :                 do {
    5477             :                         /* We need the size of one event, because
    5478             :                          * rb_advance_reader only advances by one event,
    5479             :                          * whereas rb_event_ts_length may include the size of
    5480             :                          * one or two events.
    5481             :                          * We have already ensured there's enough space if this
    5482             :                          * is a time extend. */
    5483           0 :                         size = rb_event_length(event);
    5484           0 :                         memcpy(bpage->data + pos, rpage->data + rpos, size);
    5485             : 
    5486           0 :                         len -= size;
    5487             : 
    5488           0 :                         rb_advance_reader(cpu_buffer);
    5489           0 :                         rpos = reader->read;
    5490           0 :                         pos += size;
    5491             : 
    5492           0 :                         if (rpos >= commit)
    5493             :                                 break;
    5494             : 
    5495           0 :                         event = rb_reader_event(cpu_buffer);
    5496             :                         /* Always keep the time extend and data together */
    5497           0 :                         size = rb_event_ts_length(event);
    5498           0 :                 } while (len >= size);
    5499             : 
    5500             :                 /* update bpage */
    5501           0 :                 local_set(&bpage->commit, pos);
    5502           0 :                 bpage->time_stamp = save_timestamp;
    5503             : 
    5504             :                 /* we copied everything to the beginning */
    5505           0 :                 read = 0;
    5506             :         } else {
    5507             :                 /* update the entry counter */
    5508           0 :                 cpu_buffer->read += rb_page_entries(reader);
    5509           0 :                 cpu_buffer->read_bytes += BUF_PAGE_SIZE;
    5510             : 
    5511             :                 /* swap the pages */
    5512           0 :                 rb_init_page(bpage);
    5513           0 :                 bpage = reader->page;
    5514           0 :                 reader->page = *data_page;
    5515           0 :                 local_set(&reader->write, 0);
    5516           0 :                 local_set(&reader->entries, 0);
    5517           0 :                 reader->read = 0;
    5518           0 :                 *data_page = bpage;
    5519             : 
    5520             :                 /*
    5521             :                  * Use the real_end for the data size,
    5522             :                  * This gives us a chance to store the lost events
    5523             :                  * on the page.
    5524             :                  */
    5525           0 :                 if (reader->real_end)
    5526           0 :                         local_set(&bpage->commit, reader->real_end);
    5527             :         }
    5528           0 :         ret = read;
    5529             : 
    5530           0 :         cpu_buffer->lost_events = 0;
    5531             : 
    5532           0 :         commit = local_read(&bpage->commit);
    5533             :         /*
    5534             :          * Set a flag in the commit field if we lost events
    5535             :          */
    5536           0 :         if (missed_events) {
    5537             :                 /* If there is room at the end of the page to save the
    5538             :                  * missed events, then record it there.
    5539             :                  */
    5540           0 :                 if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
    5541           0 :                         memcpy(&bpage->data[commit], &missed_events,
    5542             :                                sizeof(missed_events));
    5543           0 :                         local_add(RB_MISSED_STORED, &bpage->commit);
    5544           0 :                         commit += sizeof(missed_events);
    5545             :                 }
    5546           0 :                 local_add(RB_MISSED_EVENTS, &bpage->commit);
    5547             :         }
    5548             : 
    5549             :         /*
    5550             :          * This page may be off to user land. Zero it out here.
    5551             :          */
    5552           0 :         if (commit < BUF_PAGE_SIZE)
    5553           0 :                 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
    5554             : 
    5555           0 :  out_unlock:
    5556           0 :         raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
    5557             : 
    5558           0 :  out:
    5559           0 :         return ret;
    5560             : }
    5561             : EXPORT_SYMBOL_GPL(ring_buffer_read_page);
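/*
 * Editor's sketch, not part of ring_buffer.c: the full page-read cycle
 * described by the kernel-docs above. A spare page is allocated, then
 * ring_buffer_read_page() either copies events into it or swaps it with
 * the reader page, and the page is finally handed back. consume_page()
 * is a hypothetical callback.
 */
static int example_read_one_page(struct trace_buffer *buffer, int cpu,
                                 void (*consume_page)(void *data, int offset))
{
        void *spare;
        int ret;

        spare = ring_buffer_alloc_read_page(buffer, cpu);
        if (IS_ERR(spare))
                return PTR_ERR(spare);

        /* full == 0: accept a partially filled reader page as well */
        ret = ring_buffer_read_page(buffer, &spare, PAGE_SIZE, cpu, 0);
        if (ret >= 0)
                /* ret is the offset in the page where the consumed data starts */
                consume_page(spare, ret);

        ring_buffer_free_read_page(buffer, cpu, spare);

        return ret < 0 ? ret : 0;
}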
    5562             : 
    5563             : /*
    5564             :  * We only allocate new buffers, never free them if the CPU goes down.
    5565             :  * If we were to free the buffer, then the user would lose any trace that was in
    5566             :  * the buffer.
    5567             :  */
    5568           8 : int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
    5569             : {
    5570           8 :         struct trace_buffer *buffer;
    5571           8 :         long nr_pages_same;
    5572           8 :         int cpu_i;
    5573           8 :         unsigned long nr_pages;
    5574             : 
    5575           8 :         buffer = container_of(node, struct trace_buffer, node);
    5576           8 :         if (cpumask_test_cpu(cpu, buffer->cpumask))
    5577             :                 return 0;
    5578             : 
    5579             :         nr_pages = 0;
    5580          18 :         nr_pages_same = 1;
    5581             :         /* check if all cpu sizes are same */
    5582          18 :         for_each_buffer_cpu(buffer, cpu_i) {
    5583             :                 /* fill in the size from first enabled cpu */
    5584          12 :                 if (nr_pages == 0)
    5585           6 :                         nr_pages = buffer->buffers[cpu_i]->nr_pages;
    5586          12 :                 if (nr_pages != buffer->buffers[cpu_i]->nr_pages) {
    5587             :                         nr_pages_same = 0;
    5588             :                         break;
    5589             :                 }
    5590             :         }
    5591             :         /* allocate minimum pages, user can later expand it */
    5592           6 :         if (!nr_pages_same)
    5593           0 :                 nr_pages = 2;
    5594          12 :         buffer->buffers[cpu] =
    5595           6 :                 rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
    5596           6 :         if (!buffer->buffers[cpu]) {
    5597           0 :                 WARN(1, "failed to allocate ring buffer on CPU %u\n",
    5598             :                      cpu);
    5599           0 :                 return -ENOMEM;
    5600             :         }
    5601           6 :         smp_wmb();
    5602           6 :         cpumask_set_cpu(cpu, buffer->cpumask);
    5603           6 :         return 0;
    5604             : }
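/*
 * Editor's sketch, not part of ring_buffer.c: trace_rb_cpu_prepare() is a
 * multi-instance CPU-hotplug "prepare" callback. It is wired up roughly
 * as below (the real registration lives elsewhere in the tracing code;
 * the state-name string and the wrapper function are illustrative only).
 */
static int example_register_rb_hotplug(struct trace_buffer *buffer)
{
        int ret;

        /* One-time setup of the hotplug state and its prepare callback. */
        ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE, "trace/RB:prepare",
                                      trace_rb_cpu_prepare, NULL);
        if (ret < 0)
                return ret;

        /* Attach this buffer; the callback runs for CPUs onlined later. */
        return cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
}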
    5605             : 
    5606             : #ifdef CONFIG_RING_BUFFER_STARTUP_TEST
    5607             : /*
    5608             :  * This is a basic integrity check of the ring buffer.
    5609             :  * When configured in, this test runs late in the boot cycle.
    5610             :  * It will kick off a thread per CPU that will go into a loop
    5611             :  * writing to the per cpu ring buffer various sizes of data.
    5612             :  * Some of the data will be large items, some small.
    5613             :  *
    5614             :  * Another thread is created that goes into a spin, sending out
    5615             :  * IPIs to the other CPUs to also write into the ring buffer.
    5616             :  * This is to test the nesting ability of the buffer.
    5617             :  *
    5618             :  * Basic stats are recorded and reported. If something in the
    5619             :  * ring buffer should happen that's not expected, a big warning
    5620             :  * is displayed and all ring buffers are disabled.
    5621             :  */
    5622             : static struct task_struct *rb_threads[NR_CPUS] __initdata;
    5623             : 
    5624             : struct rb_test_data {
    5625             :         struct trace_buffer *buffer;
    5626             :         unsigned long           events;
    5627             :         unsigned long           bytes_written;
    5628             :         unsigned long           bytes_alloc;
    5629             :         unsigned long           bytes_dropped;
    5630             :         unsigned long           events_nested;
    5631             :         unsigned long           bytes_written_nested;
    5632             :         unsigned long           bytes_alloc_nested;
    5633             :         unsigned long           bytes_dropped_nested;
    5634             :         int                     min_size_nested;
    5635             :         int                     max_size_nested;
    5636             :         int                     max_size;
    5637             :         int                     min_size;
    5638             :         int                     cpu;
    5639             :         int                     cnt;
    5640             : };
    5641             : 
    5642             : static struct rb_test_data rb_data[NR_CPUS] __initdata;
    5643             : 
    5644             : /* 1 meg per cpu */
    5645             : #define RB_TEST_BUFFER_SIZE     1048576
    5646             : 
    5647             : static char rb_string[] __initdata =
    5648             :         "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\"
    5649             :         "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890"
    5650             :         "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv";
    5651             : 
    5652             : static bool rb_test_started __initdata;
    5653             : 
    5654             : struct rb_item {
    5655             :         int size;
    5656             :         char str[];
    5657             : };
    5658             : 
    5659             : static __init int rb_write_something(struct rb_test_data *data, bool nested)
    5660             : {
    5661             :         struct ring_buffer_event *event;
    5662             :         struct rb_item *item;
    5663             :         bool started;
    5664             :         int event_len;
    5665             :         int size;
    5666             :         int len;
    5667             :         int cnt;
    5668             : 
    5669             :         /* Have nested writes different than what is written */
    5670             :         cnt = data->cnt + (nested ? 27 : 0);
    5671             : 
    5672             :         /* Multiply cnt by ~e, to make some unique increment */
    5673             :         size = (cnt * 68 / 25) % (sizeof(rb_string) - 1);
    5674             : 
    5675             :         len = size + sizeof(struct rb_item);
    5676             : 
    5677             :         started = rb_test_started;
    5678             :         /* read rb_test_started before checking buffer enabled */
    5679             :         smp_rmb();
    5680             : 
    5681             :         event = ring_buffer_lock_reserve(data->buffer, len);
    5682             :         if (!event) {
    5683             :                 /* Ignore dropped events before test starts. */
    5684             :                 if (started) {
    5685             :                         if (nested)
    5686             :                                 data->bytes_dropped_nested += len;
    5687             :                         else
    5688             :                                 data->bytes_dropped += len;
    5689             :                 }
    5690             :                 return len;
    5691             :         }
    5692             : 
    5693             :         event_len = ring_buffer_event_length(event);
    5694             : 
    5695             :         if (RB_WARN_ON(data->buffer, event_len < len))
    5696             :                 goto out;
    5697             : 
    5698             :         item = ring_buffer_event_data(event);
    5699             :         item->size = size;
    5700             :         memcpy(item->str, rb_string, size);
    5701             : 
    5702             :         if (nested) {
    5703             :                 data->bytes_alloc_nested += event_len;
    5704             :                 data->bytes_written_nested += len;
    5705             :                 data->events_nested++;
    5706             :                 if (!data->min_size_nested || len < data->min_size_nested)
    5707             :                         data->min_size_nested = len;
    5708             :                 if (len > data->max_size_nested)
    5709             :                         data->max_size_nested = len;
    5710             :         } else {
    5711             :                 data->bytes_alloc += event_len;
    5712             :                 data->bytes_written += len;
    5713             :                 data->events++;
    5714             :                 if (!data->min_size || len < data->min_size)
    5715             :                         data->min_size = len;
    5716             :                 if (len > data->max_size)
    5717             :                         data->max_size = len;
    5718             :         }
    5719             : 
    5720             :  out:
    5721             :         ring_buffer_unlock_commit(data->buffer, event);
    5722             : 
    5723             :         return 0;
    5724             : }
    5725             : 
    5726             : static __init int rb_test(void *arg)
    5727             : {
    5728             :         struct rb_test_data *data = arg;
    5729             : 
    5730             :         while (!kthread_should_stop()) {
    5731             :                 rb_write_something(data, false);
    5732             :                 data->cnt++;
    5733             : 
    5734             :                 set_current_state(TASK_INTERRUPTIBLE);
    5735             :                 /* Now sleep between a min of 100-300us and a max of 1ms */
    5736             :                 usleep_range(((data->cnt % 3) + 1) * 100, 1000);
    5737             :         }
    5738             : 
    5739             :         return 0;
    5740             : }
    5741             : 
    5742             : static __init void rb_ipi(void *ignore)
    5743             : {
    5744             :         struct rb_test_data *data;
    5745             :         int cpu = smp_processor_id();
    5746             : 
    5747             :         data = &rb_data[cpu];
    5748             :         rb_write_something(data, true);
    5749             : }
    5750             : 
    5751             : static __init int rb_hammer_test(void *arg)
    5752             : {
    5753             :         while (!kthread_should_stop()) {
    5754             : 
    5755             :                 /* Send an IPI to all cpus to write data! */
    5756             :                 smp_call_function(rb_ipi, NULL, 1);
    5757             :                 /* No sleep, but for non preempt, let others run */
    5758             :                 schedule();
    5759             :         }
    5760             : 
    5761             :         return 0;
    5762             : }
    5763             : 
    5764             : static __init int test_ringbuffer(void)
    5765             : {
    5766             :         struct task_struct *rb_hammer;
    5767             :         struct trace_buffer *buffer;
    5768             :         int cpu;
    5769             :         int ret = 0;
    5770             : 
    5771             :         if (security_locked_down(LOCKDOWN_TRACEFS)) {
    5772             :                 pr_warn("Lockdown is enabled, skipping ring buffer tests\n");
    5773             :                 return 0;
    5774             :         }
    5775             : 
    5776             :         pr_info("Running ring buffer tests...\n");
    5777             : 
    5778             :         buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE);
    5779             :         if (WARN_ON(!buffer))
    5780             :                 return 0;
    5781             : 
    5782             :         /* Disable buffer so that threads can't write to it yet */
    5783             :         ring_buffer_record_off(buffer);
    5784             : 
    5785             :         for_each_online_cpu(cpu) {
    5786             :                 rb_data[cpu].buffer = buffer;
    5787             :                 rb_data[cpu].cpu = cpu;
    5788             :                 rb_data[cpu].cnt = cpu;
    5789             :                 rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
    5790             :                                                  "rbtester/%d", cpu);
    5791             :                 if (WARN_ON(IS_ERR(rb_threads[cpu]))) {
    5792             :                         pr_cont("FAILED\n");
    5793             :                         ret = PTR_ERR(rb_threads[cpu]);
    5794             :                         goto out_free;
    5795             :                 }
    5796             : 
    5797             :                 kthread_bind(rb_threads[cpu], cpu);
    5798             :                 wake_up_process(rb_threads[cpu]);
    5799             :         }
    5800             : 
    5801             :         /* Now create the rb hammer! */
    5802             :         rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer");
    5803             :         if (WARN_ON(IS_ERR(rb_hammer))) {
    5804             :                 pr_cont("FAILED\n");
    5805             :                 ret = PTR_ERR(rb_hammer);
    5806             :                 goto out_free;
    5807             :         }
    5808             : 
    5809             :         ring_buffer_record_on(buffer);
    5810             :         /*
    5811             :          * Show the buffer is enabled before setting rb_test_started.
    5812             :          * Yes, there's a small race window where events could be
    5813             :          * dropped and the thread won't catch it. But when a ring
    5814             :          * buffer gets enabled, there will always be some kind of
    5815             :          * delay before other CPUs see it. Thus, we don't care about
    5816             :          * those dropped events. We care about events dropped after
    5817             :          * the threads see that the buffer is active.
    5818             :          */
    5819             :         smp_wmb();
    5820             :         rb_test_started = true;
    5821             : 
    5822             :         set_current_state(TASK_INTERRUPTIBLE);
    5823             :         /* Just run for 10 seconds */
    5824             :         schedule_timeout(10 * HZ);
    5825             : 
    5826             :         kthread_stop(rb_hammer);
    5827             : 
    5828             :  out_free:
    5829             :         for_each_online_cpu(cpu) {
    5830             :                 if (!rb_threads[cpu])
    5831             :                         break;
    5832             :                 kthread_stop(rb_threads[cpu]);
    5833             :         }
    5834             :         if (ret) {
    5835             :                 ring_buffer_free(buffer);
    5836             :                 return ret;
    5837             :         }
    5838             : 
    5839             :         /* Report! */
    5840             :         pr_info("finished\n");
    5841             :         for_each_online_cpu(cpu) {
    5842             :                 struct ring_buffer_event *event;
    5843             :                 struct rb_test_data *data = &rb_data[cpu];
    5844             :                 struct rb_item *item;
    5845             :                 unsigned long total_events;
    5846             :                 unsigned long total_dropped;
    5847             :                 unsigned long total_written;
    5848             :                 unsigned long total_alloc;
    5849             :                 unsigned long total_read = 0;
    5850             :                 unsigned long total_size = 0;
    5851             :                 unsigned long total_len = 0;
    5852             :                 unsigned long total_lost = 0;
    5853             :                 unsigned long lost;
    5854             :                 int big_event_size;
    5855             :                 int small_event_size;
    5856             : 
    5857             :                 ret = -1;
    5858             : 
    5859             :                 total_events = data->events + data->events_nested;
    5860             :                 total_written = data->bytes_written + data->bytes_written_nested;
    5861             :                 total_alloc = data->bytes_alloc + data->bytes_alloc_nested;
    5862             :                 total_dropped = data->bytes_dropped + data->bytes_dropped_nested;
    5863             : 
    5864             :                 big_event_size = data->max_size + data->max_size_nested;
    5865             :                 small_event_size = data->min_size + data->min_size_nested;
    5866             : 
    5867             :                 pr_info("CPU %d:\n", cpu);
    5868             :                 pr_info("              events:    %ld\n", total_events);
    5869             :                 pr_info("       dropped bytes:    %ld\n", total_dropped);
    5870             :                 pr_info("       alloced bytes:    %ld\n", total_alloc);
    5871             :                 pr_info("       written bytes:    %ld\n", total_written);
    5872             :                 pr_info("       biggest event:    %d\n", big_event_size);
    5873             :                 pr_info("      smallest event:    %d\n", small_event_size);
    5874             : 
    5875             :                 if (RB_WARN_ON(buffer, total_dropped))
    5876             :                         break;
    5877             : 
    5878             :                 ret = 0;
    5879             : 
    5880             :                 while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) {
    5881             :                         total_lost += lost;
    5882             :                         item = ring_buffer_event_data(event);
    5883             :                         total_len += ring_buffer_event_length(event);
    5884             :                         total_size += item->size + sizeof(struct rb_item);
    5885             :                         if (memcmp(&item->str[0], rb_string, item->size) != 0) {
    5886             :                                 pr_info("FAILED!\n");
    5887             :                                 pr_info("buffer had: %.*s\n", item->size, item->str);
    5888             :                                 pr_info("expected:   %.*s\n", item->size, rb_string);
    5889             :                                 RB_WARN_ON(buffer, 1);
    5890             :                                 ret = -1;
    5891             :                                 break;
    5892             :                         }
    5893             :                         total_read++;
    5894             :                 }
    5895             :                 if (ret)
    5896             :                         break;
    5897             : 
    5898             :                 ret = -1;
    5899             : 
    5900             :                 pr_info("         read events:   %ld\n", total_read);
    5901             :                 pr_info("         lost events:   %ld\n", total_lost);
    5902             :                 pr_info("        total events:   %ld\n", total_lost + total_read);
    5903             :                 pr_info("  recorded len bytes:   %ld\n", total_len);
    5904             :                 pr_info(" recorded size bytes:   %ld\n", total_size);
    5905             :                 if (total_lost)
    5906             :                         pr_info(" With dropped events, record len and size may not match\n"
    5907             :                                 " alloced and written from above\n");
    5908             :                 if (!total_lost) {
    5909             :                         if (RB_WARN_ON(buffer, total_len != total_alloc ||
    5910             :                                        total_size != total_written))
    5911             :                                 break;
    5912             :                 }
    5913             :                 if (RB_WARN_ON(buffer, total_lost + total_read != total_events))
    5914             :                         break;
    5915             : 
    5916             :                 ret = 0;
    5917             :         }
    5918             :         if (!ret)
    5919             :                 pr_info("Ring buffer PASSED!\n");
    5920             : 
    5921             :         ring_buffer_free(buffer);
    5922             :         return 0;
    5923             : }
    5924             : 
    5925             : late_initcall(test_ringbuffer);
    5926             : #endif /* CONFIG_RING_BUFFER_STARTUP_TEST */

Generated by: LCOV version 1.14