一切皆文件之块设备驱动(三)
- Linux
- 2023-06-03
- 181热度
- 0评论
块设备驱动示例
#include <linux/blk_types.h>
#include <linux/blkdev.h>
#include <linux/device.h>
#include <linux/blk-mq.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/hdreg.h> /* for HDIO_GETGEO */
#include <linux/cdrom.h> /* for CDROM_GET_CAPABILITY */
/*
 * Build switch: when defined, I/O goes through the request-based blk-mq path
 * (_queue_rq); when undefined, the bio-based path (_submit_bio) is compiled
 * instead.
 */
#define CONFIG_SBLKDEV_REQUESTS_BASED
/* State of one simple block device whose contents live in a memory buffer. */
struct sblkdev_device {
struct list_head link; /* Node in the module's list of created devices */
sector_t capacity; /* Device size in sectors */
u8 *data; /* The data in virtual memory */
#ifdef CONFIG_SBLKDEV_REQUESTS_BASED
struct blk_mq_tag_set tag_set; /* Tag set initialized by init_tag_set() */
#endif
struct gendisk *disk; /* Disk object allocated in sblkdev_add() */
};
/* Create a device; returns the new device or an ERR_PTR() value on failure. */
struct sblkdev_device *sblkdev_add(int major, int minor, char *name,
sector_t capacity);
/* Unregister a device created by sblkdev_add() and free everything it owns. */
void sblkdev_remove(struct sblkdev_device *dev);
/* Not defined in this file; _open() sets it to 1. */
extern int dump_flag;
#ifdef CONFIG_SBLKDEV_REQUESTS_BASED
static inline int process_request(struct request *rq, unsigned int *nr_bytes)
{
int ret = 0;
struct bio_vec bvec;
struct req_iterator iter;
struct sblkdev_device *dev = rq->q->queuedata;
loff_t pos = blk_rq_pos(rq) << SECTOR_SHIFT;
loff_t dev_size = (dev->capacity << SECTOR_SHIFT);
dump_stack();
printk("
rq_for_each_segment(bvec, rq, iter) {
unsigned long len = bvec.bv_len;
void *buf = page_address(bvec.bv_page) + bvec.bv_offset;
if ((pos + len) > dev_size)
len = (unsigned long)(dev_size - pos);
if (rq_data_dir(rq)) {
printk("
memcpy(dev->data + pos, buf, len); /* WRITE */
} else {
printk("
memcpy(buf, dev->data + pos, len); /* READ */
}
pos += len;
*nr_bytes += len;
}
return ret;
}
static blk_status_t _queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd)
{
unsigned int nr_bytes = 0;
blk_status_t status = BLK_STS_OK;
struct request *rq = bd->rq;
dump_stack();
printk("
//might_sleep();
cant_sleep(); /* cannot use any locks that make the thread sleep */
blk_mq_start_request(rq);
if (process_request(rq, &nr_bytes))
status = BLK_STS_IOERR;
pr_debug("request
blk_mq_end_request(rq, status);
return status;
}
static struct blk_mq_ops mq_ops = {
.queue_rq = _queue_rq,
};
#else /* CONFIG_SBLKDEV_REQUESTS_BASED */
static inline void process_bio(struct sblkdev_device *dev, struct bio *bio)
{
struct bio_vec bvec;
struct bvec_iter iter;
loff_t pos = bio->bi_iter.bi_sector << SECTOR_SHIFT;
loff_t dev_size = (dev->capacity << SECTOR_SHIFT);
unsigned long start_time;
dump_stack();
printk("
start_time = bio_start_io_acct(bio);
bio_for_each_segment(bvec, bio, iter) {
unsigned int len = bvec.bv_len;
void *buf = page_address(bvec.bv_page) + bvec.bv_offset;
if ((pos + len) > dev_size) {
/* len = (unsigned long)(dev_size - pos);*/
bio->bi_status = BLK_STS_IOERR;
break;
}
if (bio_data_dir(bio)) {
printk("process_bio write\\n");
memcpy(dev->data + pos, buf, len); /* WRITE */
} else {
printk("process_bio read\\n");
memcpy(buf, dev->data + pos, len); /* READ */
}
pos += len;
}
bio_end_io_acct(bio, start_time);
bio_endio(bio);
}
/*
 * _submit_bio() - .submit_bio handler of the bio-based build.
 * @bio: bio submitted to the disk; the device is taken from the disk's
 *       private_data.
 *
 * The bio is processed and completed synchronously by process_bio().
 *
 * Return: BLK_QC_T_NONE.
 */
blk_qc_t _submit_bio(struct bio *bio)
{
	blk_qc_t ret = BLK_QC_T_NONE;
	struct sblkdev_device *dev = bio->bi_bdev->bd_disk->private_data;

	printk("%s\n", __func__);

	might_sleep();
	//cant_sleep(); /* cannot use any locks that make the thread sleep */

	process_bio(dev, bio);

	return ret;
}
#endif /* CONFIG_SBLKDEV_REQUESTS_BASED */
/*
 * _open() - .open handler of the block device.
 * @bdev: block device being opened.
 * @mode: open mode (unused).
 *
 * Sets the external dump_flag to 1 and verifies that the disk carries its
 * sblkdev_device in private_data.
 *
 * Return: 0 on success, -ENXIO if the disk has no private data.
 */
static int _open(struct block_device *bdev, fmode_t mode)
{
	struct sblkdev_device *dev = bdev->bd_disk->private_data;

	dump_flag = 1;
	printk("%s\n", __func__);

	if (!dev) {
		pr_err("Invalid disk private_data\n");
		return -ENXIO;
	}

	pr_debug("Device was opened\n");

	return 0;
}
/*
 * _release() - .release handler of the block device.
 * @disk: disk being closed.
 * @mode: mode the device was opened with (unused).
 *
 * Only logs the event; complains if the disk has no private data.
 */
static void _release(struct gendisk *disk, fmode_t mode)
{
	struct sblkdev_device *dev = disk->private_data;

	printk("%s\n", __func__);

	if (!dev) {
		pr_err("Invalid disk private_data\n");
		return;
	}

	pr_debug("Device was closed\n");
}
static inline int ioctl_hdio_getgeo(struct sblkdev_device *dev, unsigned long arg)
{
struct hd_geometry geo = {0};
printk("
geo.start = 0;
if (dev->capacity > 63) {
sector_t quotient;
geo.sectors = 63;
quotient = (dev->capacity + (63 - 1)) / 63;
if (quotient > 255) {
geo.heads = 255;
geo.cylinders = (unsigned short)
((quotient + (255 - 1)) / 255);
} else {
geo.heads = (unsigned char)quotient;
geo.cylinders = 1;
}
} else {
geo.sectors = (unsigned char)dev->capacity;
geo.cylinders = 1;
geo.heads = 1;
}
if (copy_to_user((void *)arg, &geo, sizeof(geo)))
return -EINVAL;
return 0;
}
/*
 * _ioctl() - .ioctl handler of the block device.
 * @bdev: block device the command is addressed to.
 * @mode: open mode (unused).
 * @cmd: ioctl command code.
 * @arg: command argument; a user-space pointer for HDIO_GETGEO.
 *
 * Return: result of ioctl_hdio_getgeo() for HDIO_GETGEO, -EINVAL for
 * CDROM_GET_CAPABILITY, -ENOTTY for every other command.
 */
static int _ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
{
	struct sblkdev_device *dev = bdev->bd_disk->private_data;

	pr_debug("contol command [0x%x] received\n", cmd);
	printk("%s\n", __func__);

	switch (cmd) {
	case HDIO_GETGEO:
		return ioctl_hdio_getgeo(dev, arg);
	case CDROM_GET_CAPABILITY:
		return -EINVAL;
	default:
		return -ENOTTY;
	}
}
#ifdef CONFIG_COMPAT
/*
 * _compat_ioctl() - .compat_ioctl handler (32-bit user space on a 64-bit
 * kernel). No command is supported.
 *
 * Return: always -ENOTTY.
 */
static int _compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
{
	printk("%s\n", __func__);

	// CONFIG_COMPAT is to allow running 32-bit userspace code on a 64-bit kernel
	return -ENOTTY; // not supported
}
#endif
/*
 * Operations table attached to every disk created by this module.
 * .submit_bio is present only in the bio-based build and .compat_ioctl only
 * when the kernel is built with CONFIG_COMPAT.
 */
static const struct block_device_operations fops = {
#ifndef CONFIG_SBLKDEV_REQUESTS_BASED
	.submit_bio = _submit_bio,
#endif
#ifdef CONFIG_COMPAT
	.compat_ioctl = _compat_ioctl,
#endif
	.ioctl = _ioctl,
	.release = _release,
	.open = _open,
	.owner = THIS_MODULE,
};
/*
 * sblkdev_remove() - Remove simple block device
 * @dev: device previously returned by sblkdev_add().
 *
 * Unregisters the disk, drops the disk (and, depending on the kernel API
 * available, its queue), frees the tag set in the request-based build, then
 * releases the data buffer and the device structure itself. @dev must not be
 * used after this call.
 */
void sblkdev_remove(struct sblkdev_device *dev)
{
	printk("%s\n", __func__);

	del_gendisk(dev->disk);

#ifdef HAVE_BLK_MQ_ALLOC_DISK
#ifdef HAVE_BLK_CLEANUP_DISK
	blk_cleanup_disk(dev->disk);
#else
	put_disk(dev->disk);
#endif
#else
	blk_cleanup_queue(dev->disk->queue);
	put_disk(dev->disk);
#endif

#ifdef CONFIG_SBLKDEV_REQUESTS_BASED
	blk_mq_free_tag_set(&dev->tag_set);
#endif
	vfree(dev->data);

	kfree(dev);

	pr_info("simple block device was removed\n");
}
#ifdef CONFIG_SBLKDEV_REQUESTS_BASED
static inline int init_tag_set(struct blk_mq_tag_set *set, void *data)
{
printk("
//设置blk_mq_ops
set->ops = &mq_ops;
//设置硬件队列个数
set->nr_hw_queues = 1;
set->nr_maps = 1;
//设置队列深度
set->queue_depth = 128;
set->numa_node = NUMA_NO_NODE;
set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
set->cmd_size = 0;
set->driver_data = data;
return blk_mq_alloc_tag_set(set);
}
#endif
/*
 * sblkdev_add() - Add simple block device
 * @major: major number the disk is registered under.
 * @minor: first (and only) minor number of the disk.
 * @name: disk name; copied into the disk, truncated if it does not fit.
 * @capacity: device size in sectors.
 *
 * Allocates the device structure and a zeroed data buffer of
 * @capacity sectors, creates the disk (request-based or bio-based depending
 * on CONFIG_SBLKDEV_REQUESTS_BASED), configures its queue and registers it.
 * Everything acquired so far is released again when a later step fails.
 *
 * Return: the new device, or an ERR_PTR() value on failure.
 */
struct sblkdev_device *sblkdev_add(int major, int minor, char *name,
				   sector_t capacity)
{
	struct sblkdev_device *dev = NULL;
	int ret = 0;
	struct gendisk *disk;

	pr_info("add device '%s' capacity %llu sectors\n", name,
		(unsigned long long)capacity);

	dev = kzalloc(sizeof(struct sblkdev_device), GFP_KERNEL);
	if (!dev) {
		ret = -ENOMEM;
		goto fail;
	}

	INIT_LIST_HEAD(&dev->link);
	dev->capacity = capacity;
	dev->data = __vmalloc(capacity << SECTOR_SHIFT, GFP_NOIO | __GFP_ZERO);
	if (!dev->data) {
		ret = -ENOMEM;
		goto fail_kfree;
	}

#ifdef CONFIG_SBLKDEV_REQUESTS_BASED
	ret = init_tag_set(&dev->tag_set, dev);
	if (ret) {
		pr_err("Failed to allocate tag set\n");
		goto fail_vfree;
	}

	disk = blk_mq_alloc_disk(&dev->tag_set, dev);
	if (unlikely(!disk)) {
		ret = -ENOMEM;
		pr_err("Failed to allocate disk\n");
		goto fail_free_tag_set;
	}
	if (IS_ERR(disk)) {
		ret = PTR_ERR(disk);
		pr_err("Failed to allocate disk\n");
		goto fail_free_tag_set;
	}
#else
	disk = blk_alloc_disk(NUMA_NO_NODE);
	if (!disk) {
		pr_err("Failed to allocate disk\n");
		ret = -ENOMEM;
		goto fail_vfree;
	}
#endif
	dev->disk = disk;

	/* only one partition */
#ifdef GENHD_FL_NO_PART_SCAN
	disk->flags |= GENHD_FL_NO_PART_SCAN;
#else
	disk->flags |= GENHD_FL_NO_PART;
#endif

	/* removable device */
	/* disk->flags |= GENHD_FL_REMOVABLE; */

	disk->major = major;
	disk->first_minor = minor;
	disk->minors = 1;

	disk->fops = &fops;

	disk->private_data = dev;

	/*
	 * The name comes from the module parameter: copy it as data with a
	 * bound, never as a format string.
	 */
	snprintf(disk->disk_name, sizeof(disk->disk_name), "%s", name);
	set_capacity(disk, dev->capacity);

#ifdef CONFIG_SBLKDEV_BLOCK_SIZE
	blk_queue_physical_block_size(disk->queue, CONFIG_SBLKDEV_BLOCK_SIZE);
	blk_queue_logical_block_size(disk->queue, CONFIG_SBLKDEV_BLOCK_SIZE);
	blk_queue_io_min(disk->queue, CONFIG_SBLKDEV_BLOCK_SIZE);
	blk_queue_io_opt(disk->queue, CONFIG_SBLKDEV_BLOCK_SIZE);
#else
	blk_queue_physical_block_size(disk->queue, SECTOR_SIZE);
	blk_queue_logical_block_size(disk->queue, SECTOR_SIZE);
#endif
	blk_queue_max_hw_sectors(disk->queue, BLK_DEF_MAX_SECTORS);
	blk_queue_flag_set(QUEUE_FLAG_NOMERGES, disk->queue);

#ifdef HAVE_ADD_DISK_RESULT
	ret = add_disk(disk);
	if (ret) {
		pr_err("Failed to add disk '%s'\n", disk->disk_name);
		goto fail_put_disk;
	}
#else
	add_disk(disk);
#endif

	pr_info("Simple block device [%d:%d] was added\n", major, minor);

	return dev;

#ifdef HAVE_ADD_DISK_RESULT
fail_put_disk:
#ifdef HAVE_BLK_MQ_ALLOC_DISK
#ifdef HAVE_BLK_CLEANUP_DISK
	blk_cleanup_disk(dev->disk);
#else
	put_disk(dev->disk);
#endif
#else
	/* The queue belongs to the disk; the device struct has no queue member. */
	blk_cleanup_queue(dev->disk->queue);
	put_disk(dev->disk);
#endif
#endif /* HAVE_ADD_DISK_RESULT */

#ifdef CONFIG_SBLKDEV_REQUESTS_BASED
fail_free_tag_set:
	blk_mq_free_tag_set(&dev->tag_set);
#endif
fail_vfree:
	vfree(dev->data);
fail_kfree:
	kfree(dev);
fail:
	pr_err("Failed to add block device\n");

	return ERR_PTR(ret);
}
/*
* A module can create more than one block device.
* The configuration of block devices is implemented in the simplest way:
* using the module parameter, which is passed when the module is loaded.
* Example:
* modprobe sblkdev catalog="sblkdev1,2048;sblkdev2,4096"
*/
static int sblkdev_major;
static LIST_HEAD(sblkdev_device_list);
static char *sblkdev_catalog = "sblkdev1,2048;sblkdev2,4096";
/*
 * sblkdev_init() - Entry point 'init'.
 *
 * Executed when the module is loaded. Registers a block major, parses the
 * catalog parameter ("<name>,<capacity>;...") and creates one block device
 * per entry, using consecutive minor numbers starting at 0. Entries that
 * lack a name or a capacity are skipped.
 *
 * If an entry cannot be parsed or a device cannot be created, the devices
 * created so far are removed again and the major is released, so a failed
 * load leaves nothing registered.
 *
 * Return: 0 on success, negative errno on failure.
 */
static int __init sblkdev_init(void)
{
	int ret = 0;
	int inx = 0;
	char *catalog;
	char *next_token;
	char *token;
	size_t length;
	struct sblkdev_device *dev;

	sblkdev_major = register_blkdev(sblkdev_major, KBUILD_MODNAME);
	if (sblkdev_major <= 0) {
		pr_info("Unable to get major number\n");
		return -EBUSY;
	}

	length = strlen(sblkdev_catalog);
	if ((length < 1) || (length > PAGE_SIZE)) {
		pr_info("Invalid module parameter 'catalog'\n");
		ret = -EINVAL;
		goto fail_unregister;
	}

	/* strsep() modifies its input, so work on a private copy. */
	catalog = kzalloc(length + 1, GFP_KERNEL);
	if (!catalog) {
		ret = -ENOMEM;
		goto fail_unregister;
	}
	strcpy(catalog, sblkdev_catalog);

	next_token = catalog;
	while ((token = strsep(&next_token, ";"))) {
		char *name;
		char *capacity;
		sector_t capacity_value;

		name = strsep(&token, ",");
		if (!name)
			continue;
		capacity = strsep(&token, ",");
		if (!capacity)
			continue;

		ret = kstrtoull(capacity, 10, &capacity_value);
		if (ret)
			break;

		dev = sblkdev_add(sblkdev_major, inx, name, capacity_value);
		if (IS_ERR(dev)) {
			ret = PTR_ERR(dev);
			break;
		}

		list_add(&dev->link, &sblkdev_device_list);
		inx++;
	}
	kfree(catalog);

	if (ret == 0)
		return 0;

	/*
	 * The exit handler does not run when init fails, so tear down the
	 * devices that were already created before releasing the major.
	 */
	while ((dev = list_first_entry_or_null(&sblkdev_device_list,
					       struct sblkdev_device, link))) {
		list_del(&dev->link);
		sblkdev_remove(dev);
	}

fail_unregister:
	unregister_blkdev(sblkdev_major, KBUILD_MODNAME);
	return ret;
}
/*
 * sblkdev_exit() - Entry point 'exit'.
 *
 * Runs on module unload: unlinks and removes every device on the module's
 * device list, then gives the block major back if one was obtained.
 */
static void __exit sblkdev_exit(void)
{
	struct sblkdev_device *cur;
	struct sblkdev_device *next;

	/* The _safe iterator is needed because each entry is freed in the loop. */
	list_for_each_entry_safe(cur, next, &sblkdev_device_list, link) {
		list_del(&cur->link);
		sblkdev_remove(cur);
	}

	if (sblkdev_major <= 0)
		return;

	unregister_blkdev(sblkdev_major, KBUILD_MODNAME);
}
/* Module entry points. */
module_init(sblkdev_init);
module_exit(sblkdev_exit);
/* Expose sblkdev_catalog as the string parameter 'catalog' (permissions 0644). */
module_param_named(catalog, sblkdev_catalog, charp, 0644);
MODULE_PARM_DESC(catalog, "New block devices catalog in format '<name>,<capacity sectors>;...'");
/* Module metadata. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Sergei Shtepa");
应用程序示例
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
/*
 * dump_f() - Print a buffer to stdout as hexadecimal bytes.
 * @buff: bytes to print.
 * @len: number of bytes to print.
 *
 * A line break is emitted before every group of 16 bytes (including the
 * first) and once more after the last byte.
 */
void dump_f(char *buff, int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (i % 16 == 0)
			printf("\n");
		/* Go through unsigned char so bytes >= 0x80 print as two digits, not sign-extended. */
		printf("%02x ", (unsigned char)buff[i]);
	}
	printf("\n");
}
/*
 * Test program for /dev/sblkdev1.
 *
 *   -r  perform five 4096-byte reads, dumping the first 42 bytes of each
 *   -w  perform five 4096-byte writes of a counting pattern
 *
 * The program waits 3 seconds before opening the device so that a tracer can
 * be attached, pauses between the individual I/Os, and afterwards sleeps
 * forever so the process stays available for inspection.
 *
 * Return: 1 if the device cannot be opened; otherwise the program never
 * returns on its own.
 */
int main(int argc, char *argv[])
{
	int fd;
	char buf[4096] = {0};
	int i;
	int round;
	int ch = 0;

	sleep(3); //run ./funtion.sh to trace vfs_read of this process
	fd = open("/dev/sblkdev1", O_RDWR);
	printf("fd ====:%d\n", fd);
	if (fd < 0) {
		/* Nothing below can work without the device. */
		perror("open /dev/sblkdev1");
		return 1;
	}

	for (;;) {
		ch = getopt(argc, argv, "rw");
		if (ch < 0) {
			printf("not param.\n");
			break;
		}
		switch (ch) {
		case 'r':
			for (round = 1; round <= 5; round++) {
				printf("test read %d...............\n", round);
				if (read(fd, buf, sizeof(buf)) < 0)
					perror("read");
				else
					dump_f(buf, 42);
				/* Pause between reads, but not after the last one. */
				if (round < 5)
					sleep(2);
			}
			/* Keep the process alive for tracing. */
			while (1)
				sleep(4);
			break;
		case 'w':
			/* Counting pattern; the value intentionally wraps at 256. */
			for (i = 0; i < (int)sizeof(buf); i++)
				buf[i] = (char)i;
			for (round = 1; round <= 5; round++) {
				printf("test write %d............\n", round);
				if (write(fd, buf, sizeof(buf)) < 0)
					perror("write");
				sleep(3);
			}
			/* Keep the process alive for tracing. */
			while (1)
				sleep(4);
			break;
		default:
			printf("Not support\n");
			break;
		}
	}
	/* No (more) options: idle forever. */
	while (1)
		sleep(4);
	return 0;
}
trace脚本
# Trace vfs_read of the test application with the function_graph tracer.
# Must be run as root; "appxxx" is the name of the process to trace.
debugfs=/sys/kernel/debug
# Reset the tracer and stop tracing while it is being configured.
echo nop > "$debugfs/tracing/current_tracer"
# Note the space before '>': "echo 0>file" would redirect fd 0 instead of writing "0".
echo 0 > "$debugfs/tracing/tracing_on"
# Restrict tracing to the test application.
echo "$(pidof appxxx)" > "$debugfs/tracing/set_ftrace_pid"
echo function_graph > "$debugfs/tracing/current_tracer"
# Only graph calls made underneath vfs_read.
echo vfs_read > "$debugfs/tracing/set_graph_function"
echo 1 > "$debugfs/tracing/tracing_on"
实验
insmod simpleblk.ko
app -w &
app -r &
代码分析
初始化请求队列
初始化请求队列在init_tag_set中实现,在init_tag_set函数中填充了struct blk_mq_tag_set *set数据结构,blk_mq_tag_set用于描述与存储设备相关的集合,对存储器IO特征进行的抽象。
struct blk_mq_tag_set {
struct blk_mq_queue_map map[HCTX_MAX_TYPES]; /* 软件队列ctx到硬件队列hctx的映射表 */
unsigned int nr_maps; /* 映射表的数量 */
const struct blk_mq_ops *ops; /* 块设备驱动的mq函数操作集合 */
unsigned int nr_hw_queues; /* 块设备的硬件队列hctx数量,大多情况是1 */
unsigned int queue_depth; /* 块设备硬件队列深度 */
unsigned int reserved_tags;
unsigned int cmd_size; /* 块设备驱动为每个request分配的额外空间大小 */
/* ...... */
};
init_tag_set中调用blk_mq_alloc_tag_set为一个或者多个请求队列分配tag和request集合。
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
设置硬件队列数量、映射表数量(nr_maps)
->set->nr_maps = xxx
->set->nr_hw_queues = xxx
->set->queue_depth = xxx
根据硬件队列数量拓展tags数组
->blk_mq_alloc_tag_set_tags
更新映射表(cpu id-> hw queue id)
->ret = blk_mq_update_queue_map(set);
分配request和tag
->ret = blk_mq_alloc_map_and_requests(set);
数据处理
数据处理有两种方式,区别在于block_device_operations中是否实现了submit_bio:如果实现了该函数,文件系统下发的数据打包成bio后会直接回调该函数;如果没有实现,bio需要先经过request queue进行处理,然后再派发到struct blk_mq_ops中注册的.queue_rq回调进行处理。
请求队列(request queue)里面包含一系列请求(request),request里面包含bio,真正的数据就存储在bio里面。因此对数据的处理就是从request queue中逐个取出request,再从request里面取出bio;在处理请求时,通过blk_mq_start_request和blk_mq_end_request来标记请求的开始和结束。