一切皆文件之块设备驱动(三)

块设备驱动示例

#include <linux/blk_types.h>
#include <linux/blkdev.h>
#include <linux/device.h>
#include <linux/blk-mq.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/hdreg.h> /* for HDIO_GETGEO */
#include <linux/cdrom.h> /* for CDROM_GET_CAPABILITY */

#define CONFIG_SBLKDEV_REQUESTS_BASED

/*
 * Per-device state of one simple block device whose contents live in a
 * vmalloc'ed buffer.
 */
struct sblkdev_device {
    struct list_head link;  /* Entry in the module's list of created devices */

    sector_t capacity;      /* Device size in sectors */
    u8 *data;           /* The data in virtual memory */
#ifdef CONFIG_SBLKDEV_REQUESTS_BASED
    struct blk_mq_tag_set tag_set;  /* blk-mq tag set, request-based mode only */
#endif
    struct gendisk *disk;   /* Disk object created by sblkdev_add() */
};

struct sblkdev_device *sblkdev_add(int major, int minor, char *name,
                  sector_t capacity);
void sblkdev_remove(struct sblkdev_device *dev);

extern int dump_flag;

#ifdef CONFIG_SBLKDEV_REQUESTS_BASED
static inline int process_request(struct request *rq, unsigned int *nr_bytes)
{
    int ret = 0;
    struct bio_vec bvec;
    struct req_iterator iter;
    struct sblkdev_device *dev = rq->q->queuedata;
    loff_t pos = blk_rq_pos(rq) << SECTOR_SHIFT;
    loff_t dev_size = (dev->capacity << SECTOR_SHIFT);

    dump_stack();
    printk("

    rq_for_each_segment(bvec, rq, iter) {
        unsigned long len = bvec.bv_len;
        void *buf = page_address(bvec.bv_page) + bvec.bv_offset;

        if ((pos + len) > dev_size)
            len = (unsigned long)(dev_size - pos);

        if (rq_data_dir(rq)) {
            printk("
            memcpy(dev->data + pos, buf, len); /* WRITE */
        } else {
            printk("
            memcpy(buf, dev->data + pos, len); /* READ */
        }
        pos += len;
        *nr_bytes += len;
    }

    return ret;
}

static blk_status_t _queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd)
{
    unsigned int nr_bytes = 0;
    blk_status_t status = BLK_STS_OK;
    struct request *rq = bd->rq;

    dump_stack();

    printk("
    //might_sleep();
    cant_sleep(); /* cannot use any locks that make the thread sleep */

    blk_mq_start_request(rq);

    if (process_request(rq, &nr_bytes))
        status = BLK_STS_IOERR;

    pr_debug("request 

    blk_mq_end_request(rq, status);

    return status;
}

static struct blk_mq_ops mq_ops = {
    .queue_rq = _queue_rq,
};

#else  /* CONFIG_SBLKDEV_REQUESTS_BASED */

static inline void process_bio(struct sblkdev_device *dev, struct bio *bio)
{
    struct bio_vec bvec;
    struct bvec_iter iter;
    loff_t pos = bio->bi_iter.bi_sector << SECTOR_SHIFT;
    loff_t dev_size = (dev->capacity << SECTOR_SHIFT);
    unsigned long start_time;

    dump_stack();

    printk("
    start_time = bio_start_io_acct(bio);
    bio_for_each_segment(bvec, bio, iter) {
        unsigned int len = bvec.bv_len;
        void *buf = page_address(bvec.bv_page) + bvec.bv_offset;

        if ((pos + len) > dev_size) {
            /* len = (unsigned long)(dev_size - pos);*/
            bio->bi_status = BLK_STS_IOERR;
            break;
        }

        if (bio_data_dir(bio)) {
            printk("process_bio write\\n");
            memcpy(dev->data + pos, buf, len); /* WRITE */
        } else {
            printk("process_bio read\\n");
            memcpy(buf, dev->data + pos, len); /* READ */
        }

        pos += len;
    }
    bio_end_io_acct(bio, start_time);
    bio_endio(bio);
}

/*
 * _submit_bio() - .submit_bio handler of the bio-based mode.
 * @bio: bio submitted to the disk; the device is taken from the disk's
 *       private_data.
 *
 * Return: always BLK_QC_T_NONE; the bio is completed inside process_bio().
 */
blk_qc_t _submit_bio(struct bio *bio)
{
    blk_qc_t ret = BLK_QC_T_NONE;
    struct sblkdev_device *dev = bio->bi_bdev->bd_disk->private_data;

    printk("%s\n", __func__);
    might_sleep(); /* this path is annotated as allowed to sleep */

    process_bio(dev, bio);

    return ret;
}

#endif /* CONFIG_SBLKDEV_REQUESTS_BASED */


/*
 * _open() - .open handler of the block device.
 * @bdev: block device being opened.
 * @mode: open mode (unused).
 *
 * Sets the externally declared dump_flag (its consumer is not part of this
 * listing) and validates the disk's private data.
 *
 * Return: 0 on success, -ENXIO if the disk carries no private data.
 */
static int _open(struct block_device *bdev, fmode_t mode)
{
    struct sblkdev_device *dev = bdev->bd_disk->private_data;

    dump_flag = 1;
    printk("%s\n", __func__);
    if (!dev) {
        pr_err("Invalid disk private_data\n");
        return -ENXIO;
    }

    pr_debug("Device was opened\n");

    return 0;
}

/*
 * _release() - .release handler of the block device.
 * @disk: disk being closed.
 * @mode: mode the device was opened with (unused).
 *
 * Only logs; no per-open state exists that would need to be torn down.
 */
static void _release(struct gendisk *disk, fmode_t mode)
{
    struct sblkdev_device *dev = disk->private_data;

    printk("%s\n", __func__);
    if (!dev) {
        pr_err("Invalid disk private_data\n");
        return;
    }

    pr_debug("Device was closed\n");
}

static inline int ioctl_hdio_getgeo(struct sblkdev_device *dev, unsigned long arg)
{
    struct hd_geometry geo = {0};

    printk("
    geo.start = 0;
    if (dev->capacity > 63) {
        sector_t quotient;

        geo.sectors = 63;
        quotient = (dev->capacity + (63 - 1)) / 63;

        if (quotient > 255) {
            geo.heads = 255;
            geo.cylinders = (unsigned short)
                ((quotient + (255 - 1)) / 255);
        } else {
            geo.heads = (unsigned char)quotient;
            geo.cylinders = 1;
        }
    } else {
        geo.sectors = (unsigned char)dev->capacity;
        geo.cylinders = 1;
        geo.heads = 1;
    }

    if (copy_to_user((void *)arg, &geo, sizeof(geo)))
        return -EINVAL;

    return 0;
}

/*
 * _ioctl() - .ioctl handler of the block device.
 * @bdev: block device the command is addressed to.
 * @mode: open mode (unused).
 * @cmd: ioctl command number.
 * @arg: command argument; a user-space pointer for HDIO_GETGEO.
 *
 * Return: result of the geometry query for HDIO_GETGEO, -EINVAL for
 * CDROM_GET_CAPABILITY, -ENOTTY for every other command.
 */
static int _ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
{
    struct sblkdev_device *dev = bdev->bd_disk->private_data;

    pr_debug("contol command [0x%x] received\n", cmd);

    printk("%s\n", __func__);
    switch (cmd) {
    case HDIO_GETGEO:
        return ioctl_hdio_getgeo(dev, arg);
    case CDROM_GET_CAPABILITY:
        return -EINVAL;
    default:
        return -ENOTTY;
    }
}

#ifdef CONFIG_COMPAT
/*
 * _compat_ioctl() - .compat_ioctl handler (32-bit user space on a 64-bit
 * kernel). No command is supported on this path.
 *
 * Return: always -ENOTTY.
 */
static int _compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
{
    printk("%s\n", __func__);
    // CONFIG_COMPAT is to allow running 32-bit userspace code on a 64-bit kernel
    return -ENOTTY; // not supported
}
#endif

/*
 * Block device operations installed on every disk created by sblkdev_add().
 * .compat_ioctl is only present with CONFIG_COMPAT; .submit_bio is only
 * present in bio-based mode, i.e. when CONFIG_SBLKDEV_REQUESTS_BASED is not
 * defined.
 */
static const struct block_device_operations fops = {
    .owner = THIS_MODULE,
    .open = _open,
    .release = _release,
    .ioctl = _ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl = _compat_ioctl,
#endif
#ifndef CONFIG_SBLKDEV_REQUESTS_BASED
    .submit_bio = _submit_bio,
#endif
};

/*
 * sblkdev_remove() - Remove simple block device
 * @dev: device previously returned by sblkdev_add(); freed by this call.
 *
 * Unregisters the disk, drops the disk (and, depending on the kernel API
 * available, its queue), releases the tag set in request-based mode and
 * finally frees the data buffer and the device structure itself.
 */
void sblkdev_remove(struct sblkdev_device *dev)
{
    printk("%s\n", __func__);
    del_gendisk(dev->disk);

#ifdef HAVE_BLK_MQ_ALLOC_DISK
#ifdef HAVE_BLK_CLEANUP_DISK
    blk_cleanup_disk(dev->disk);
#else
    put_disk(dev->disk);
#endif
#else
    blk_cleanup_queue(dev->disk->queue);
    put_disk(dev->disk);
#endif

#ifdef CONFIG_SBLKDEV_REQUESTS_BASED
    /* The tag set is released only after the disk and its queue are gone. */
    blk_mq_free_tag_set(&dev->tag_set);
#endif
    vfree(dev->data);

    kfree(dev);

    pr_info("simple block device was removed\n");
}

#ifdef CONFIG_SBLKDEV_REQUESTS_BASED
/*
 * init_tag_set() - Fill in and allocate the blk-mq tag set of a device.
 * @set: tag set to initialise.
 * @data: driver data stored in the tag set (the sblkdev_device).
 *
 * Return: result of blk_mq_alloc_tag_set().
 */
static inline int init_tag_set(struct blk_mq_tag_set *set, void *data)
{
    printk("%s\n", __func__);
    /* blk-mq operations of this driver */
    set->ops = &mq_ops;
    /* number of hardware queues */
    set->nr_hw_queues = 1;
    set->nr_maps = 1;
    /* queue depth */
    set->queue_depth = 128;
    set->numa_node = NUMA_NO_NODE;
    set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;

    /* no per-request driver payload */
    set->cmd_size = 0;
    set->driver_data = data;

    return blk_mq_alloc_tag_set(set);
}
#endif
/*
 * sblkdev_add() - Add simple block device
 * @major: major number the disk is registered under.
 * @minor: first (and only) minor number of the disk.
 * @name: disk name; copied into the disk, truncated to the name buffer size.
 * @capacity: device size in sectors; must be non-zero and small enough that
 *            the size in bytes fits into size_t.
 *
 * Allocates the device structure and its zeroed data buffer, sets up the tag
 * set and disk (request-based mode) or a plain disk (bio-based mode),
 * configures the queue limits and registers the disk.
 *
 * Return: the new device on success, ERR_PTR(-errno) on failure. The caller
 * owns the device and releases it with sblkdev_remove().
 */
struct sblkdev_device *sblkdev_add(int major, int minor, char *name,
                  sector_t capacity)
{
    struct sblkdev_device *dev = NULL;
    int ret = 0;
    struct gendisk *disk;

    pr_info("add device '%s' capacity %llu sectors\n", name,
        (unsigned long long)capacity);

    /* The byte size below must not be zero or overflow the allocation size. */
    if (!capacity || capacity > (SIZE_MAX >> SECTOR_SHIFT)) {
        ret = -EINVAL;
        goto fail;
    }

    dev = kzalloc(sizeof(struct sblkdev_device), GFP_KERNEL);
    if (!dev) {
        ret = -ENOMEM;
        goto fail;
    }

    INIT_LIST_HEAD(&dev->link);
    dev->capacity = capacity;
    dev->data = __vmalloc(capacity << SECTOR_SHIFT, GFP_NOIO | __GFP_ZERO);
    if (!dev->data) {
        ret = -ENOMEM;
        goto fail_kfree;
    }

#ifdef CONFIG_SBLKDEV_REQUESTS_BASED
    ret = init_tag_set(&dev->tag_set, dev);
    if (ret) {
        pr_err("Failed to allocate tag set\n");
        goto fail_vfree;
    }

    /* Both NULL and ERR_PTR results are treated as allocation failure. */
    disk = blk_mq_alloc_disk(&dev->tag_set, dev);
    if (unlikely(!disk)) {
        ret = -ENOMEM;
        pr_err("Failed to allocate disk\n");
        goto fail_free_tag_set;
    }
    if (IS_ERR(disk)) {
        ret = PTR_ERR(disk);
        pr_err("Failed to allocate disk\n");
        goto fail_free_tag_set;
    }

#else
    disk = blk_alloc_disk(NUMA_NO_NODE);
    if (!disk) {
        pr_err("Failed to allocate disk\n");
        ret = -ENOMEM;
        goto fail_vfree;
    }
#endif
    dev->disk = disk;

    /* only one partition */
#ifdef GENHD_FL_NO_PART_SCAN
    disk->flags |= GENHD_FL_NO_PART_SCAN;
#else
    disk->flags |= GENHD_FL_NO_PART;
#endif

    disk->major = major;
    disk->first_minor = minor;
    disk->minors = 1;

    disk->fops = &fops;

    disk->private_data = dev;

    /* Never use the caller's string as a format; bound the copy. */
    snprintf(disk->disk_name, sizeof(disk->disk_name), "%s", name);
    set_capacity(disk, dev->capacity);

#ifdef CONFIG_SBLKDEV_BLOCK_SIZE
    blk_queue_physical_block_size(disk->queue, CONFIG_SBLKDEV_BLOCK_SIZE);
    blk_queue_logical_block_size(disk->queue, CONFIG_SBLKDEV_BLOCK_SIZE);
    blk_queue_io_min(disk->queue, CONFIG_SBLKDEV_BLOCK_SIZE);
    blk_queue_io_opt(disk->queue, CONFIG_SBLKDEV_BLOCK_SIZE);
#else
    blk_queue_physical_block_size(disk->queue, SECTOR_SIZE);
    blk_queue_logical_block_size(disk->queue, SECTOR_SIZE);
#endif
    blk_queue_max_hw_sectors(disk->queue, BLK_DEF_MAX_SECTORS);
    blk_queue_flag_set(QUEUE_FLAG_NOMERGES, disk->queue);


#ifdef HAVE_ADD_DISK_RESULT
    ret = add_disk(disk);
    if (ret) {
        pr_err("Failed to add disk '%s'\n", disk->disk_name);
        goto fail_put_disk;
    }
#else
    add_disk(disk);
#endif

    pr_info("Simple block device [%d:%d] was added\n", major, minor);

    return dev;

#ifdef HAVE_ADD_DISK_RESULT
fail_put_disk:
#ifdef HAVE_BLK_MQ_ALLOC_DISK
#ifdef HAVE_BLK_CLEANUP_DISK
    blk_cleanup_disk(dev->disk);
#else
    put_disk(dev->disk);
#endif
#else
    /* The queue belongs to the disk; the device struct has no queue member. */
    blk_cleanup_queue(dev->disk->queue);
    put_disk(dev->disk);
#endif
#endif /* HAVE_ADD_DISK_RESULT */

#ifdef CONFIG_SBLKDEV_REQUESTS_BASED
fail_free_tag_set:
    blk_mq_free_tag_set(&dev->tag_set);
#endif
fail_vfree:
    vfree(dev->data);
fail_kfree:
    kfree(dev);
fail:
    pr_err("Failed to add block device\n");

    return ERR_PTR(ret);
}
/*
 * A module can create more than one block device.
 * The configuration of block devices is implemented in the simplest way:
 * using the module parameter, which is passed when the module is loaded.
 * Example:
 *    modprobe sblkdev catalog="sblkdev1,2048;sblkdev2,4096"
 */

static int sblkdev_major;
static LIST_HEAD(sblkdev_device_list);
static char *sblkdev_catalog = "sblkdev1,2048;sblkdev2,4096";

/*
 * sblkdev_init() - Entry point 'init'.
 *
 * Executed when the module is loaded. Registers a block major, parses the
 * catalog parameter ("<name>,<capacity sectors>;...") and creates one block
 * device per entry, using consecutive minors starting at 0. Entries without
 * a name or capacity field are skipped.
 *
 * If any entry fails, every device created so far is removed again before the
 * major is unregistered, so a failed load leaves no disk behind.
 *
 * Return: 0 on success, negative errno on failure.
 */
static int __init sblkdev_init(void)
{
    int ret = 0;
    int inx = 0;
    char *catalog;
    char *next_token;
    char *token;
    size_t length;
    struct sblkdev_device *added;

    sblkdev_major = register_blkdev(sblkdev_major, KBUILD_MODNAME);
    if (sblkdev_major <= 0) {
        pr_info("Unable to get major number\n");
        return -EBUSY;
    }

    length = strlen(sblkdev_catalog);
    if ((length < 1) || (length > PAGE_SIZE)) {
        pr_info("Invalid module parameter 'catalog'\n");
        ret = -EINVAL;
        goto fail_unregister;
    }

    /* strsep() modifies its input, so parse a private copy. */
    catalog = kzalloc(length + 1, GFP_KERNEL);
    if (!catalog) {
        ret = -ENOMEM;
        goto fail_unregister;
    }
    strcpy(catalog, sblkdev_catalog);
    next_token = catalog;
    while ((token = strsep(&next_token, ";"))) {
        struct sblkdev_device *dev;
        char *name;
        char *capacity;
        sector_t capacity_value;

        name = strsep(&token, ",");
        if (!name)
            continue;
        capacity = strsep(&token, ",");
        if (!capacity)
            continue;

        ret = kstrtoull(capacity, 10, &capacity_value);
        if (ret)
            break;

        dev = sblkdev_add(sblkdev_major, inx, name, capacity_value);
        if (IS_ERR(dev)) {
            ret = PTR_ERR(dev);
            break;
        }

        list_add(&dev->link, &sblkdev_device_list);
        inx++;
    }
    kfree(catalog);

    if (ret == 0)
        return 0;

    /* Roll back the devices that were added before the failing entry. */
    while ((added = list_first_entry_or_null(&sblkdev_device_list,
                         struct sblkdev_device, link))) {
        list_del(&added->link);
        sblkdev_remove(added);
    }

fail_unregister:
    unregister_blkdev(sblkdev_major, KBUILD_MODNAME);
    return ret;
}

/*
 * sblkdev_exit() - Entry point 'exit'.
 *
 * Runs on module unload: every device still on the module's list is unlinked
 * and removed, then the block major is given back if one was obtained.
 */
static void __exit sblkdev_exit(void)
{
    struct sblkdev_device *cur;
    struct sblkdev_device *next;

    /* The _safe iterator is required because each entry is freed in the loop. */
    list_for_each_entry_safe(cur, next, &sblkdev_device_list, link) {
        list_del(&cur->link);
        sblkdev_remove(cur);
    }

    if (sblkdev_major <= 0)
        return;

    unregister_blkdev(sblkdev_major, KBUILD_MODNAME);
}

module_init(sblkdev_init);
module_exit(sblkdev_exit);

module_param_named(catalog, sblkdev_catalog, charp, 0644);
MODULE_PARM_DESC(catalog, "New block devices catalog in format '<name>,<capacity sectors>;...'");

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Sergei Shtepa");

应用程序示例

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
/*
 * dump_f() - Print a buffer as hexadecimal bytes on stdout.
 * @buff: bytes to print.
 * @len: number of bytes to print.
 *
 * A line break is emitted before every group of 16 bytes and after the last
 * byte. Bytes are printed as unsigned so values >= 0x80 are not sign-extended.
 */
void dump_f(char *buff, int len)
{
    int i;

    for (i = 0; i < len; i++) {
        if (i % 16 == 0)
            printf("\n");
        printf("%02x ", (unsigned char)buff[i]);
    }
    printf("\n");
}

/*
 * Test driver for /dev/sblkdev1.
 *
 *   -r  issue five 4096-byte reads, dumping the first 42 bytes of each
 *   -w  issue five 4096-byte writes of a byte-counter pattern
 *
 * The pauses between operations and the endless sleep afterwards are
 * intentional: they keep the process alive so it can be traced.
 *
 * Returns 1 if the device cannot be opened; otherwise it does not return.
 */
int main(int argc, char *argv[])
{
    int fd;
    char buf[4096] = {0};
    int i;
    int ch = 0;

    sleep(3); //run ./funtion.sh to trace vfs_read of this process
    fd = open("/dev/sblkdev1", O_RDWR);
    printf("fd ====:%d\n", fd);
    if (fd < 0) {
        perror("open /dev/sblkdev1");
        return 1;
    }

    for (;;) {
        ch = getopt(argc, argv, "rw");
        if (ch < 0) {
            printf("not param.\n");
            break;
        }
        switch (ch) {
            case 'r':
                for (i = 1; i <= 5; i++) {
                    printf("test read %d...............\n", i);
                    if (read(fd, buf, sizeof(buf)) < 0)
                        perror("read");
                    dump_f(buf, 42);
                    /* no pause after the last read */
                    if (i < 5)
                        sleep(2);
                }
                while(1)
                    sleep(4);
                break;
            case 'w':
                for (i = 0; i < 4096; i++)
                    buf[i] = (char)i;
                for (i = 1; i <= 5; i++) {
                    printf("test write %d............\n", i);
                    if (write(fd, buf, sizeof(buf)) < 0)
                        perror("write");
                    sleep(3);
                }
                while(1)
                    sleep(4);
                break;
            default:
                printf("Not support\n");
            break;
       }
    }
    while(1)
        sleep(4);
    return 0;
}

trace脚本

# Trace vfs_read() of the test application with the function_graph tracer.
debugfs=/sys/kernel/debug

# Reset the tracer and pause tracing while it is reconfigured.
echo nop > "$debugfs/tracing/current_tracer"

# A space is required before '>': "echo 0>file" redirects fd 0 instead of writing 0.
echo 0 > "$debugfs/tracing/tracing_on"

# Restrict tracing to the test application (replace appxxx with its name).
echo "$(pidof appxxx)" > "$debugfs/tracing/set_ftrace_pid"

echo function_graph > "$debugfs/tracing/current_tracer"

# Only graph calls made underneath vfs_read.
echo vfs_read > "$debugfs/tracing/set_graph_function"

echo 1 > "$debugfs/tracing/tracing_on"

实验

insmod simpleblk.ko

app -w &

app -r &

代码分析

初始化请求队列

请求队列的初始化在init_tag_set中完成:该函数填充struct blk_mq_tag_set *set数据结构。blk_mq_tag_set用于描述与存储设备相关的tag集合,是对存储设备IO特征的抽象。

struct blk_mq_tag_set {
    struct blk_mq_queue_map map[HCTX_MAX_TYPES]; 软件队列CTX到硬件队列hctx的映射表
    unsigned int        nr_maps;                 映射表的数量    
    const struct blk_mq_ops *ops;                块设备驱动的mq函数操作集合
    unsigned int        nr_hw_queues;            块设备的硬件队列hctx数量,大多情况是1
    unsigned int        queue_depth;             块设备硬件队列深度
    unsigned int        reserved_tags;
    unsigned int        cmd_size;                块设备驱动为每个request分配的额外空间大小
    ......
};

init_tag_set中调用blk_mq_alloc_tag_set为一个或者多个请求队列分配tag和request集合。

int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)

    设置硬件队列数量、映射表数量(nr_maps)
    ->set->nr_maps = xxx
    ->set->nr_hw_queues = xxx
    ->set->queue_depth = xxx

    根据硬件队列数量拓展tags数组
    ->blk_mq_alloc_tag_set_tags

    更新映射表(cpu id-> hw queue id)
    ->ret = blk_mq_update_queue_map(set);

    分配request和tag
    ->ret = blk_mq_alloc_map_and_requests(set);

数据处理

数据处理有两种方式,主要区别在于block_device_operations中是否实现了submit_bio:如果实现了该函数,文件系统下发的数据打包成bio后会直接回调该函数;如果没有实现该函数,文件系统下发的数据打包成bio后需要先经过请求队列(request queue)处理,再派发给struct blk_mq_ops中注册的.queue_rq回调进行处理。

请求队列(request queue)里面包含一系列请求(request),request里面包含bio,真正的数据就存储在bio里面。因此对数据的处理就是从request_queue中逐个取出request,再从request里面取出bio;在处理请求时,通过blk_mq_start_request和blk_mq_end_request来标记请求的开始和结束。