How to Avoid High CPU Usage While Reading/Writing Character Device

How to avoid high cpu usage while reading/writing character device?

As pointed out by @0andriy, you are not supposed to access iomem directly. There are functions such as memcpy_toio() and memcpy_fromio() that can copy between iomem and normal memory, but they only work on kernel virtual addresses.

NOTE: The use of get_user_pages_fast(), set_page_dirty_lock() and put_page() described below should be changed for Linux kernel version 5.6 onwards. The required changes are described later.

In order to copy from userspace addresses to iomem without using an intermediate data buffer, the userspace memory pages need to be "pinned" into physical memory. That can be done using get_user_pages_fast(). However, the pinned pages may be in "high memory" (highmem) which is outside the permanently mapped memory in the kernel. Such pages need to be temporarily mapped into kernel virtual address space for a short duration using kmap_atomic(). (There are rules governing the use of kmap_atomic(), and there are other functions for longer term mapping of highmem. Check the highmem documentation for details.)

Once a userspace page has been mapped to kernel virtual address space, memcpy_toio() and memcpy_fromio() can be used to copy between that page and iomem.

A page temporarily mapped by kmap_atomic() needs to be unmapped by kunmap_atomic().

User memory pages pinned by get_user_pages_fast() need to be unpinned individually by calling put_page(), but if the page memory has been written to (e.g. by memcpy_fromio()), it must first be flagged as "dirty" by set_page_dirty_lock() before calling put_page().

Note: Change for kernel version 5.6 onwards.

  1. The call to get_user_pages_fast() should be changed to pin_user_pages_fast().
  2. Dirty pages pinned by pin_user_pages_fast() should be unpinned by unpin_user_pages_dirty_lock() with the last argument set true.
  3. Clean pages pinned by pin_user_pages_fast() should be unpinned by unpin_user_page(), unpin_user_pages(), or unpin_user_pages_dirty_lock() with the last argument set false.
  4. put_page() must not be used to unpin pages pinned by pin_user_pages_fast().
  5. For code to be compatible with earlier kernel versions, the availability of pin_user_pages_fast(), unpin_user_page(), etc. can be determined by whether the FOLL_PIN macro has been defined by #include <linux/mm.h>.

Putting all that together, the following functions may be used to copy between user memory and iomem:

#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/io.h>

/**
* my_copy_to_user_from_iomem - copy to user memory from MMIO
* @to: destination in user memory
* @from: source in remapped MMIO
* @n: number of bytes to copy
* Context: process
*
* Returns number of uncopied bytes (0 on full success), mirroring the
* convention of copy_to_user().
*/
long my_copy_to_user_from_iomem(void __user *to, const void __user *from,
unsigned long n)
{
might_fault();
/* Reject an obviously invalid userspace range up front. */
if (!access_ok(to, n))
return n;
while (n) {
/* Pin and process at most PAGE_LIST_LEN user pages per outer iteration. */
enum { PAGE_LIST_LEN = 32 };
struct page *page_list[PAGE_LIST_LEN];
unsigned long start;
unsigned int p_off;
unsigned int part_len;
int nr_pages;
int i;

/* Determine pages to do this iteration. */
p_off = offset_in_page(to);
start = (unsigned long)to - p_off;
nr_pages = min_t(int, PAGE_ALIGN(p_off + n) >> PAGE_SHIFT,
PAGE_LIST_LEN);
/* Lock down (for write) user pages. */
#ifdef FOLL_PIN
nr_pages = pin_user_pages_fast(start, nr_pages, FOLL_WRITE, page_list);
#else
nr_pages = get_user_pages_fast(start, nr_pages, FOLL_WRITE, page_list);
#endif
/* May pin fewer pages than requested; <= 0 means nothing was pinned. */
if (nr_pages <= 0)
break;

/* Limit number of bytes to end of locked-down pages. */
part_len =
min(n, ((unsigned long)nr_pages << PAGE_SHIFT) - p_off);

/* Copy from iomem to locked-down user memory pages. */
for (i = 0; i < nr_pages; i++) {
struct page *page = page_list[i];
unsigned char *p_va;
unsigned int plen;

/* Bytes available in this page, starting at p_off. */
plen = min((unsigned int)PAGE_SIZE - p_off, part_len);
/* Temporarily map the (possibly highmem) page into kernel VA space. */
p_va = kmap_atomic(page);
memcpy_fromio(p_va + p_off, from, plen);
kunmap_atomic(p_va);
#ifndef FOLL_PIN
/* Pre-5.6 path: page was written to, so mark it dirty before release. */
set_page_dirty_lock(page);
put_page(page);
#endif
to = (char __user *)to + plen;
from = (const char __iomem *)from + plen;
n -= plen;
part_len -= plen;
/* Only the first page of a run can start at a non-zero offset. */
p_off = 0;
}
#ifdef FOLL_PIN
/* 5.6+ path: unpin the whole batch, marking all pages dirty. */
unpin_user_pages_dirty_lock(page_list, nr_pages, true);
#endif
}
return n;
}

/**
* my_copy_from_user_to_iomem - copy from user memory to MMIO
* @to: destination in remapped MMIO
* @from: source in user memory
* @n: number of bytes to copy
* Context: process
*
* Returns number of uncopied bytes (0 on full success), mirroring the
* convention of copy_from_user().
*/
long my_copy_from_user_to_iomem(void __iomem *to, const void __user *from,
unsigned long n)
{
might_fault();
/* Reject an obviously invalid userspace range up front. */
if (!access_ok(from, n))
return n;
while (n) {
/* Pin and process at most PAGE_LIST_LEN user pages per outer iteration. */
enum { PAGE_LIST_LEN = 32 };
struct page *page_list[PAGE_LIST_LEN];
unsigned long start;
unsigned int p_off;
unsigned int part_len;
int nr_pages;
int i;

/* Determine pages to do this iteration. */
p_off = offset_in_page(from);
start = (unsigned long)from - p_off;
nr_pages = min_t(int, PAGE_ALIGN(p_off + n) >> PAGE_SHIFT,
PAGE_LIST_LEN);
/* Lock down (for read) user pages. */
#ifdef FOLL_PIN
nr_pages = pin_user_pages_fast(start, nr_pages, 0, page_list);
#else
nr_pages = get_user_pages_fast(start, nr_pages, 0, page_list);
#endif
/* May pin fewer pages than requested; <= 0 means nothing was pinned. */
if (nr_pages <= 0)
break;

/* Limit number of bytes to end of locked-down pages. */
part_len =
min(n, ((unsigned long)nr_pages << PAGE_SHIFT) - p_off);

/* Copy from locked-down user memory pages to iomem. */
for (i = 0; i < nr_pages; i++) {
struct page *page = page_list[i];
unsigned char *p_va;
unsigned int plen;

/* Bytes available in this page, starting at p_off. */
plen = min((unsigned int)PAGE_SIZE - p_off, part_len);
/* Temporarily map the (possibly highmem) page into kernel VA space. */
p_va = kmap_atomic(page);
memcpy_toio(to, p_va + p_off, plen);
kunmap_atomic(p_va);
#ifndef FOLL_PIN
/* Pre-5.6 path: page was only read, so no dirtying needed. */
put_page(page);
#endif
to = (char __iomem *)to + plen;
from = (const char __user *)from + plen;
n -= plen;
part_len -= plen;
/* Only the first page of a run can start at a non-zero offset. */
p_off = 0;
}
#ifdef FOLL_PIN
/* 5.6+ path: unpin the whole batch; pages are clean (read-only use). */
unpin_user_pages(page_list, nr_pages);
#endif
}
return n;
}

Secondly, you might be able to speed up memory access by mapping the iomem as "write combined" by replacing pci_iomap() with pci_iomap_wc().

Thirdly, the only real way to avoid wait-stating the CPU when accessing slow memory is to not use the CPU and use DMA transfers instead. The details of that very much depend on your PCIe device's bus-mastering DMA capabilities (if it has any at all). User memory pages still need to be pinned (e.g. by get_user_pages_fast() or pin_user_pages_fast() as appropriate) during the DMA transfer, but do not need to be temporarily mapped by kmap_atomic().

Linux character device -- what to do if read buffer is too small?

You could act like the datagram socket device driver: it always returns just a single datagram. If the read buffer is smaller, the excess is discarded -- it's the caller's responsibility to provide enough space for a whole datagram (typically, the application protocol specifies the maximum datagram size).

The documentation of your device should specify that it works in 16-byte units, so there's no reason why a caller would want to provide a buffer smaller than this. So any lost data due to the above discarding could be considered a bug in the calling application.

However, it would also be reasonable to return more than 16 bytes at a time if the caller asks for it -- that suggests that the application will split the data up into units itself. This could give better performance, since it minimizes system calls. But if the buffer isn't a multiple of 16, you could discard the remainder of the last unit. Just make sure this is documented, so callers know to make their buffer size a multiple of 16.

If you're worried about generic applications like cat, I don't think you need to. I would expect them to use very large input buffers, simply for performance reasons.

Linux PCIe driver and app showing high CPU Usage

Found the issue.

In the app I run, 4 threads are created, each handling one PCIe interrupt. Apart from that, the main function polled a global variable in an empty while loop, which was the cause of the high CPU usage. Replacing the busy-wait loop with a usleep() call fixed the problem.
CPU usage is less than 20% now.

Thanks for your comments.

Why does Linux C serial read only work with high CPU?

As your code doesn't work in a common condition that you can't prevent from happening, the why of "Why [it] only work[s] with high CPU?" doesn't really matter. It's probably interesting to spend a lot of time and effort finding out the "Why?", but I'd think you're going to have to change your code because anything that stops working when CPU load goes down is, IMO, waaaaay too fragile to trust to work for any time.

First, is threading even useful on your system? If there's only one CPU that can run only one thread at a time, creating multiple threads will be counterproductive. Have you tried a simple single-threaded solution and actually found that it doesn't work?

If you have tried a single-threaded solution and it doesn't work, the first thing I note is that your currently posted code is doing a tremendous amount of extra work it doesn't need to do, and it's likely contending over a single lock when that doesn't help much at all.

So eliminate your extraneous copying of data along with all the unnecessary bookkeeping you're doing.

You also probably have a lot of contention with just a single mutex and condition variable. There's no need to not read because the logging thread is doing something, or the logging thread not processing because the read thread is doing some bookkeeping. You'd almost certainly benefit from finer lock granularity.

I'd do something like this:

#define CHUNK_SIZE ( 4 * 1024 )
#define NUM_BUFFERS 16

/* One slot of the producer/consumer ring shared by the reader and logger
 * threads.  "full" is posted by the reader when count/data are valid;
 * "empty" is posted by the logger once the slot has been consumed.
 * Bug fixed: the declaration of "data" was missing its terminating
 * semicolon, which made the struct fail to compile.
 */
struct dataStruct
{
    sem_t full;                 /* posted when the slot holds data       */
    sem_t empty;                /* posted when the slot may be refilled  */
    ssize_t count;              /* bytes read into data, or <0 on error  */
    char data[ CHUNK_SIZE ];    /* payload from the serial port          */
};

/* The ring of buffers shared between the two threads. */
struct dataStruct dataArray[ NUM_BUFFERS ];

void initDataArray( void )
{
for ( int ii = 0; ii < NUM_BUFFERS; ii++ )
{
// buffers all start empty
sem_init( &( dataArray[ ii ].full ), 0, 0 );
sem_init( &( dataArray[ ii ].empty ), 0, 1 );
}
}

/* Producer thread: reads chunks from the serial port into successive ring
 * slots.  Waits for a slot to be "empty", fills it, then posts "full" so
 * the logger thread can process it.
 * Fixes: the "???" placeholder is replaced by retrieving the fd from the
 * thread argument, and a persistent read error no longer busy-loops
 * forever (the failing count is still published so the consumer sees it).
 */
void *readPortThread( void *arg )
{
    unsigned currBuffer = 0;        /* ring index into dataArray */
    int portFD = *(int *)arg;       /* serial port fd supplied by creator */

    for ( ;; )
    {
        /* Block until the logger has drained this slot. */
        sem_wait( &( dataArray[ currBuffer ].empty ) );

        dataArray[ currBuffer ].count = read( portFD,
                dataArray[ currBuffer ].data,
                sizeof( dataArray[ currBuffer ].data ) );

        /* Publish the slot even on error: count <= 0 tells the logger. */
        sem_post( &( dataArray[ currBuffer ].full ) );

        if ( dataArray[ currBuffer ].count < 0 )
            break;                  /* don't spin forever on a read error */

        currBuffer++;
        currBuffer %= NUM_BUFFERS;
    }
    return( NULL );
}

/* Consume one filled buffer: @data holds @count bytes read from the port.
 * Implementation intentionally omitted in the answer ("left as an
 * exercise"); the return value is unused by logDataThread().
 */
void *processData( char *data, ssize_t count )
{
...
}

/* Consumer thread: waits for each ring slot to become "full", hands its
 * contents to processData(), then posts "empty" so the reader thread can
 * reuse the slot.  Walks the ring in the same order as readPortThread(),
 * so slots are consumed FIFO.
 * Bug fixed: currBuffer was used without being declared in this function.
 */
void *logDataThread( void *arg )
{
    unsigned currBuffer = 0;    /* ring index, in lock-step with producer */

    (void)arg;                  /* unused */

    for ( ;; )
    {
        /* Block until the producer has filled this slot. */
        sem_wait( &( dataArray[ currBuffer ].full ) );

        processData( dataArray[ currBuffer ].data,
                     dataArray[ currBuffer ].count );

        /* Hand the slot back to the producer. */
        sem_post( &( dataArray[ currBuffer ].empty ) );

        currBuffer++;
        currBuffer %= NUM_BUFFERS;
    }
    return( NULL );
}

Note the much finer locking granularity, and the complete lack of extraneous copying of data. Proper headers, all error checking, and full implementation are left as an exercise...

You'll have to test to find optimum values for CHUNK_SIZE and NUM_BUFFERS. Setting the number of buffers dynamically would be a good improvement also.

OT: There's no need for any indirection in your int open_port(int flags, int *fd) function. Simply return the fd value - it's good enough for open() itself:

/* Open /dev/ttyUSB0 with the given open(2) flags and configure it for raw
 * 8-bit non-canonical I/O at BAUDRATE.  Returns the file descriptor on
 * success, or the negative value returned by open() on failure.
 * Bug fixed: memset() was called with "options" (the struct by value)
 * instead of "&options", which does not compile; the struct must be
 * zeroed through its address so no stale termios fields survive.
 */
int open_port(int flags )
{
    struct termios options;
    int fd = open("/dev/ttyUSB0", flags | O_NOCTTY);

    if(fd < 0){
        printf("Error opening port\n");
        return fd;
    }

    printf("Port handle is %d\n", fd);

    /* Zero every field so all flags start from a known state. */
    memset( &options, 0, sizeof( options ) );

    options.c_cflag = BAUDRATE | CS8 | CLOCAL | CREAD;  /* 8 data bits, receiver on */
    options.c_iflag = 0;
    options.c_oflag = 0;
    options.c_lflag = 0;          /* non-canonical (raw) mode */
    options.c_cc[VTIME] = 0;      /* no inter-byte timeout...             */
    options.c_cc[VMIN] = 200;     /* ...block until 200 bytes are available */
    tcflush(fd, TCIFLUSH);
    tcsetattr(fd, TCSANOW, &options);

    return fd;
}


Related Topics



Leave a reply



Submit