Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
/*
* Budget Fair Queueing (BFQ) I/O scheduler.
*
* Based on ideas and code from CFQ:
* Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
*
* Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
* Paolo Valente <paolo.valente@unimore.it>
*
* Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
* Arianna Avanzini <avanzini@google.com>
*
* Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* BFQ is a proportional-share I/O scheduler, with some extra
* low-latency capabilities. BFQ also supports full hierarchical
* scheduling through cgroups. Next paragraphs provide an introduction
* on BFQ inner workings. Details on BFQ benefits, usage and
* limitations can be found in Documentation/block/bfq-iosched.txt.
*
* BFQ is a proportional-share storage-I/O scheduling algorithm based
* on the slice-by-slice service scheme of CFQ. But BFQ assigns
* budgets, measured in number of sectors, to processes instead of
* time slices. The device is not granted to the in-service process
* for a given time slice, but until it has exhausted its assigned
* budget. This change from the time to the service domain enables BFQ
* to distribute the device throughput among processes as desired,
* without any distortion due to throughput fluctuations, or to device
* internal queueing. BFQ uses an ad hoc internal scheduler, called
* B-WF2Q+, to schedule processes according to their budgets. More
* precisely, BFQ schedules queues associated with processes. Each
* process/queue is assigned a user-configurable weight, and B-WF2Q+
* guarantees that each queue receives a fraction of the throughput
* proportional to its weight. Thanks to the accurate policy of
* B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
* processes issuing sequential requests (to boost the throughput),
* and yet guarantee a low latency to interactive and soft real-time
* applications.
*
* In particular, to provide these low-latency guarantees, BFQ
* explicitly privileges the I/O of two classes of time-sensitive
* applications: interactive and soft real-time. This feature enables
* BFQ to provide applications in these classes with a very low
* latency. Finally, BFQ also features additional heuristics for
* preserving both a low latency and a high throughput on NCQ-capable,
* rotational or flash-based devices, and to get the job done quickly
* for applications consisting in many I/O-bound processes.
*
* BFQ is described in [1], where also a reference to the initial, more
* theoretical paper on BFQ can be found. The interested reader can find
* in the latter paper full details on the main algorithm, as well as
* formulas of the guarantees and formal proofs of all the properties.
* With respect to the version of BFQ presented in these papers, this
* implementation adds a few more heuristics, such as the one that
* guarantees a low latency to soft real-time applications, and a
* hierarchical extension based on H-WF2Q+.
*
* B-WF2Q+ is based on WF2Q+, which is described in [2], together with
* H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
* with O(log N) complexity derives from the one introduced with EEVDF
* in [3].
*
* [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
* Scheduler", Proceedings of the First Workshop on Mobile System
* Technologies (MST-2015), May 2015.
* http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
*
* [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
* Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
* Oct 1997.
*
* http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
*
* [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
* First: A Flexible and Accurate Mechanism for Proportional Share
* Resource Allocation", technical report.
*
* http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/sbitmap.h>
#include <linux/delay.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
#include <linux/blktrace_api.h>
#include <linux/hrtimer.h>
#include <linux/blk-cgroup.h>
#define BFQ_IOPRIO_CLASSES 3
#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
#define BFQ_MIN_WEIGHT 1
#define BFQ_MAX_WEIGHT 1000
#define BFQ_WEIGHT_CONVERSION_COEFF 10
#define BFQ_DEFAULT_QUEUE_IOPRIO 4
#define BFQ_WEIGHT_LEGACY_DFL 100
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#define BFQ_DEFAULT_GRP_IOPRIO 0
#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
struct bfq_entity;
/**
* struct bfq_service_tree - per ioprio_class service tree.
*
* Each service tree represents a B-WF2Q+ scheduler on its own. Each
* ioprio_class has its own independent scheduler, and so its own
* bfq_service_tree. All the fields are protected by the queue lock
* of the containing bfqd.
*/
struct bfq_service_tree {
/* tree for active entities (i.e., those backlogged) */
struct rb_root active;
/* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
struct rb_root idle;
/* idle entity with minimum F_i */
struct bfq_entity *first_idle;
/* idle entity with maximum F_i */
struct bfq_entity *last_idle;
/* scheduler virtual time */
u64 vtime;
/* scheduler weight sum; active and idle entities contribute to it */
unsigned long wsum;
};
/**
* struct bfq_sched_data - multi-class scheduler.
*
* bfq_sched_data is the basic scheduler queue. It supports three
* ioprio_classes, and can be used either as a toplevel queue or as an
* intermediate queue on a hierarchical setup. @next_in_service
* points to the active entity of the sched_data service trees that
* will be scheduled next. It is used to reduce the number of steps
* needed for each hierarchical-schedule update.
*
* The supported ioprio_classes are the same as in CFQ, in descending
* priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
* Requests from higher priority queues are served before all the
* requests from lower priority queues; among requests of the same
* queue requests are served according to B-WF2Q+.
* All the fields are protected by the queue lock of the containing bfqd.
*/
struct bfq_sched_data {
/* entity in service */
struct bfq_entity *in_service_entity;
/* head-of-line entity (see comments above) */
struct bfq_entity *next_in_service;
/* array of service trees, one per ioprio_class */
struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
/* last time CLASS_IDLE was served */
unsigned long bfq_class_idle_last_service;
};
/**
* struct bfq_entity - schedulable entity.
*
* A bfq_entity is used to represent either a bfq_queue (leaf node in the
* cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
* entity belongs to the sched_data of the parent group in the cgroup
* hierarchy. Non-leaf entities have also their own sched_data, stored
* in @my_sched_data.
*
* Each entity stores independently its priority values; this would
* allow different weights on different devices, but this
* functionality is not exported to userspace by now. Priorities and
* weights are updated lazily, first storing the new values into the
* new_* fields, then setting the @prio_changed flag. As soon as
* there is a transition in the entity state that allows the priority
* update to take place the effective and the requested priority
* values are synchronized.
*
* Unless cgroups are used, the weight value is calculated from the
* ioprio to export the same interface as CFQ. When dealing with
* ``well-behaved'' queues (i.e., queues that do not spend too much
* time to consume their budget and have true sequential behavior, and
* when there are no external factors breaking anticipation) the
* relative weights at each level of the cgroups hierarchy should be
* guaranteed. All the fields are protected by the queue lock of the
* containing bfqd.
*/
struct bfq_entity {
/* service_tree member */
struct rb_node rb_node;
/*
* Flag, true if the entity is on a tree (either the active or
* the idle one of its service_tree) or is in service.
bool on_st;
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
/* B-WF2Q+ start and finish timestamps [sectors/weight] */
u64 start, finish;
/* tree the entity is enqueued into; %NULL if not on a tree */
struct rb_root *tree;
/*
* minimum start time of the (active) subtree rooted at this
* entity; used for O(log N) lookups into active trees
*/
u64 min_start;
/* amount of service received during the last service slot */
int service;
/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
int budget;
/* weight of the queue */
int weight;
/* next weight if a change is in progress */
int new_weight;
/* original weight, used to implement weight boosting */
int orig_weight;
/* parent entity, for hierarchical scheduling */
struct bfq_entity *parent;
/*
* For non-leaf nodes in the hierarchy, the associated
* scheduler queue, %NULL on leaf nodes.
*/
struct bfq_sched_data *my_sched_data;
/* the scheduler queue this entity belongs to */
struct bfq_sched_data *sched_data;
/* flag, set to request a weight, ioprio or ioprio_class change */
int prio_changed;
};
struct bfq_group;
/**
* struct bfq_ttime - per process thinktime stats.
*/
struct bfq_ttime {
/* completion time of the last request */
u64 last_end_request;
/* total process thinktime */
u64 ttime_total;
/* number of thinktime samples */
unsigned long ttime_samples;
/* average process thinktime */
u64 ttime_mean;
};
/**
* struct bfq_queue - leaf schedulable entity.
*
* A bfq_queue is a leaf request queue; it can be associated with an
* io_context or more, if it is async. @cgroup holds a reference to
* the cgroup, to be sure that it does not disappear while a bfqq
* still references it (mostly to avoid races between request issuing
* and task migration followed by cgroup destruction). All the fields
* are protected by the queue lock of the containing bfqd.
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
*/
struct bfq_queue {
/* reference counter */
int ref;
/* parent bfq_data */
struct bfq_data *bfqd;
/* current ioprio and ioprio class */
unsigned short ioprio, ioprio_class;
/* next ioprio and ioprio class if a change is in progress */
unsigned short new_ioprio, new_ioprio_class;
/* sorted list of pending requests */
struct rb_root sort_list;
/* if fifo isn't expired, next request to serve */
struct request *next_rq;
/* number of sync and async requests queued */
int queued[2];
/* number of requests currently allocated */
int allocated;
/* number of pending metadata requests */
int meta_pending;
/* fifo list of requests in sort_list */
struct list_head fifo;
/* entity representing this queue in the scheduler */
struct bfq_entity entity;
/* maximum budget allowed from the feedback mechanism */
int max_budget;
/* budget expiration (in jiffies) */
unsigned long budget_timeout;
/* number of requests on the dispatch list or inside driver */
int dispatched;
/* status flags */
unsigned long flags;
/* node for active/idle bfqq list inside parent bfqd */
struct list_head bfqq_list;
/* associated @bfq_ttime struct */
struct bfq_ttime ttime;
/* bit vector: a 1 for each seeky requests in history */
u32 seek_history;
/* position of the last request enqueued */
sector_t last_request_pos;
/* Number of consecutive pairs of request completion and
* arrival, such that the queue becomes idle after the
* completion, but the next request arrives within an idle
* time slice; used only if the queue's IO_bound flag has been
* cleared.
*/
unsigned int requests_within_timer;
/* pid of the process owning the queue, used for logging purposes */
pid_t pid;
};
/**
* struct bfq_io_cq - per (request_queue, io_context) structure.
*/
struct bfq_io_cq {
/* associated io_cq structure */
struct io_cq icq; /* must be the first member */
/* array of two process queues, the sync and the async */
struct bfq_queue *bfqq[2];
/* per (request_queue, blkcg) ioprio */
int ioprio;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
uint64_t blkcg_serial_nr; /* the current blkcg serial */
#endif
};
/**
* struct bfq_data - per-device data structure.
*
* All the fields are protected by @lock.
*/
struct bfq_data {
/* device request queue */
struct request_queue *queue;
/* dispatch queue */
struct list_head dispatch;
/* root bfq_group for the device */
struct bfq_group *root_group;
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
/*
* Number of bfq_queues containing requests (including the
* queue in service, even if it is idling).
*/
int busy_queues;
/* number of queued requests */
int queued;
/* number of requests dispatched and waiting for completion */
int rq_in_driver;
/*
* Maximum number of requests in driver in the last
* @hw_tag_samples completed requests.
*/
int max_rq_in_driver;
/* number of samples used to calculate hw_tag */
int hw_tag_samples;
/* flag set to one if the driver is showing a queueing behavior */
int hw_tag;
/* number of budgets assigned */
int budgets_assigned;
/*
* Timer set when idling (waiting) for the next request from
* the queue in service.
*/
struct hrtimer idle_slice_timer;
/* bfq_queue in service */
struct bfq_queue *in_service_queue;
/* bfq_io_cq (bic) associated with the @in_service_queue */
struct bfq_io_cq *in_service_bic;
/* on-disk position of the last served request */
sector_t last_position;
/* time of last request completion (ns) */
u64 last_completion;
/* time of first rq dispatch in current observation interval (ns) */
u64 first_dispatch;
/* time of last rq dispatch in current observation interval (ns) */
u64 last_dispatch;
/* beginning of the last budget */
ktime_t last_budget_start;
/* beginning of the last idle slice */
ktime_t last_idling_start;
/* number of samples in current observation interval */
int peak_rate_samples;
/* num of samples of seq dispatches in current observation interval */
u32 sequential_samples;
/* total num of sectors transferred in current observation interval */
u64 tot_sectors_dispatched;
/* max rq size seen during current observation interval (sectors) */
u32 last_rq_max_size;
/* time elapsed from first dispatch in current observ. interval (us) */
u64 delta_from_first;
* Current estimate of the device peak rate, measured in
* [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
* BFQ_RATE_SHIFT is performed to increase precision in
* fixed-point calculations.
*/
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
/* maximum budget allotted to a bfq_queue before rescheduling */
int bfq_max_budget;
/* list of all the bfq_queues active on the device */
struct list_head active_list;
/* list of all the bfq_queues idle on the device */
struct list_head idle_list;
/*
* Timeout for async/sync requests; when it fires, requests
* are served in fifo order.
*/
u64 bfq_fifo_expire[2];
/* weight of backward seeks wrt forward ones */
unsigned int bfq_back_penalty;
/* maximum allowed backward seek */
unsigned int bfq_back_max;
/* maximum idling time */
u32 bfq_slice_idle;
/* user-configured max budget value (0 for auto-tuning) */
int bfq_user_max_budget;
/*
* Timeout for bfq_queues to consume their budget; used to
* prevent seeky queues from imposing long latencies to
* sequential or quasi-sequential ones (this also implies that
* seeky queues cannot receive guarantees in the service
* domain; after a timeout they are charged for the time they
* have been in service, to preserve fairness among them, but
* without service-domain guarantees).
*/
unsigned int bfq_timeout;
/*
* Number of consecutive requests that must be issued within
* the idle time slice to set again idling to a queue which
* was marked as non-I/O-bound (see the definition of the
* IO_bound flag for further details).
*/
unsigned int bfq_requests_within_timer;
/*
* Force device idling whenever needed to provide accurate
* service guarantees, without caring about throughput
* issues. CAVEAT: this may even increase latencies, in case
* of useless idling for processes that did stop doing I/O.
*/
bool strict_guarantees;
/* fallback dummy bfqq for extreme OOM conditions */
struct bfq_queue oom_bfqq;
spinlock_t lock;
/*
* bic associated with the task issuing current bio for
* merging. This and the next field are used as a support to
* be able to perform the bic lookup, needed by bio-merge
* functions, before the scheduler lock is taken, and thus
* avoid taking the request-queue lock while the scheduler
* lock is being held.
*/
struct bfq_io_cq *bio_bic;
/* bfqq associated with the task issuing current bio for merging */
struct bfq_queue *bio_bfqq;
};
enum bfqq_state_flags {
BFQQF_busy = 0, /* has requests or is in service */
BFQQF_wait_request, /* waiting for a request */
BFQQF_non_blocking_wait_rq, /*
* waiting for a request
* without idling the device
*/
BFQQF_fifo_expire, /* FIFO checked in this slice */
BFQQF_idle_window, /* slice idling enabled */
BFQQF_sync, /* synchronous queue */
BFQQF_budget_new, /* no completion with this budget */
BFQQF_IO_bound, /*
* bfqq has timed-out at least once
* having consumed at most 2/10 of
* its budget
*/
};
#define BFQ_BFQQ_FNS(name) \
static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
{ \
__set_bit(BFQQF_##name, &(bfqq)->flags); \
} \
static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
{ \
__clear_bit(BFQQF_##name, &(bfqq)->flags); \
} \
static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
{ \
return test_bit(BFQQF_##name, &(bfqq)->flags); \
}
BFQ_BFQQ_FNS(busy);
BFQ_BFQQ_FNS(wait_request);
BFQ_BFQQ_FNS(non_blocking_wait_rq);
BFQ_BFQQ_FNS(fifo_expire);
BFQ_BFQQ_FNS(idle_window);
BFQ_BFQQ_FNS(sync);
BFQ_BFQQ_FNS(budget_new);
BFQ_BFQQ_FNS(IO_bound);
#undef BFQ_BFQQ_FNS
/* Logging facilities. */
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
char __pbuf[128]; \
\
blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
__pbuf, ##args); \
} while (0)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
char __pbuf[128]; \
\
blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \
} while (0)
#else /* CONFIG_BFQ_GROUP_IOSCHED */
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \
bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
##args)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
#define bfq_log(bfqd, fmt, args...) \
blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
/* Expiration reasons. */
enum bfqq_expiration {
BFQQE_TOO_IDLE = 0, /*
* queue has been idling for
* too long
*/
BFQQE_BUDGET_TIMEOUT, /* budget took too long to be used */
BFQQE_BUDGET_EXHAUSTED, /* budget consumed */
BFQQE_NO_MORE_REQUESTS, /* the queue has no more requests */
BFQQE_PREEMPTED /* preemption in progress */
};
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
struct bfqg_stats {
#ifdef CONFIG_BFQ_GROUP_IOSCHED
/* number of ios merged */
struct blkg_rwstat merged;
/* total time spent on device in ns, may not be accurate w/ queueing */
struct blkg_rwstat service_time;
/* total time spent waiting in scheduler queue in ns */
struct blkg_rwstat wait_time;
/* number of IOs queued up */
struct blkg_rwstat queued;
/* total disk time and nr sectors dispatched by this group */
struct blkg_stat time;
/* sum of number of ios queued across all samples */
struct blkg_stat avg_queue_size_sum;
/* count of samples taken for average */
struct blkg_stat avg_queue_size_samples;
/* how many times this group has been removed from service tree */
struct blkg_stat dequeue;
/* total time spent waiting for it to be assigned a timeslice. */
struct blkg_stat group_wait_time;
/* time spent idling for this blkcg_gq */
struct blkg_stat idle_time;
/* total time with empty current active q with other requests queued */
struct blkg_stat empty_time;
/* fields after this shouldn't be cleared on stat reset */
uint64_t start_group_wait_time;
uint64_t start_idle_time;
uint64_t start_empty_time;
uint16_t flags;
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED
/*
* struct bfq_group_data - per-blkcg storage for the blkio subsystem.
*
* @ps: @blkcg_policy_storage that this structure inherits
* @weight: weight of the bfq_group
*/
struct bfq_group_data {
/* must be the first member */
struct blkcg_policy_data pd;
unsigned short weight;
};
/**
* struct bfq_group - per (device, cgroup) data structure.
* @entity: schedulable entity to insert into the parent group sched_data.
* @sched_data: own sched_data, to contain child entities (they may be
* both bfq_queues and bfq_groups).
* @bfqd: the bfq_data for the device this group acts upon.
* @async_bfqq: array of async queues for all the tasks belonging to
* the group, one queue per ioprio value per ioprio_class,
* except for the idle class that has only one queue.
* @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
* @my_entity: pointer to @entity, %NULL for the toplevel group; used
* to avoid too many special cases during group creation/
* migration.
* @stats: stats for this bfqg.
*
* Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
* there is a set of bfq_groups, each one collecting the lower-level
* entities belonging to the group that are acting on the same device.
*
* Locking works as follows:
* o @bfqd is protected by the queue lock, RCU is used to access it
* from the readers.
* o All the other fields are protected by the @bfqd queue lock.
*/
struct bfq_group {
/* must be the first member */
struct blkg_policy_data pd;
struct bfq_entity entity;
struct bfq_sched_data sched_data;
void *bfqd;
struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
struct bfq_queue *async_idle_bfqq;
struct bfq_entity *my_entity;
struct bfqg_stats stats;
};
#else
struct bfq_group {
struct bfq_sched_data sched_data;
struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
struct bfq_queue *async_idle_bfqq;
struct rb_root rq_pos_tree;
};
#endif
static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
static unsigned int bfq_class_idx(struct bfq_entity *entity)
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
return bfqq ? bfqq->ioprio_class - 1 :
BFQ_DEFAULT_GRP_CLASS - 1;
}
static struct bfq_service_tree *
bfq_entity_service_tree(struct bfq_entity *entity)
{
struct bfq_sched_data *sched_data = entity->sched_data;
unsigned int idx = bfq_class_idx(entity);
return sched_data->service_tree + idx;
}
static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
return bic->bfqq[is_sync];
}
static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
bool is_sync)
{
bic->bfqq[is_sync] = bfqq;
}
static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
{
return bic->icq.q->elevator->elevator_data;
}
static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
static void bfq_put_queue(struct bfq_queue *bfqq);
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
struct bio *bio, bool is_sync,
struct bfq_io_cq *bic);
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
/* Expiration time of sync (0) and async (1) requests, in ns. */
static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
static const int bfq_back_max = 16 * 1024;
/* Penalty of a backwards seek, in number of sectors. */
static const int bfq_back_penalty = 2;
/* Idling period duration, in ns. */
static u64 bfq_slice_idle = NSEC_PER_SEC / 125;
/* Minimum number of assigned budgets for which stats are safe to compute. */
static const int bfq_stats_min_budgets = 194;
/* Default maximum budget values, in sectors and number of requests. */
static const int bfq_default_max_budget = 16 * 1024;
/* Default timeout values, in jiffies, approximating CFQ defaults. */
static const int bfq_timeout = HZ / 8;
static struct kmem_cache *bfq_pool;
/* Below this threshold (in ns), we consider thinktime immediate. */
#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
/* hw_tag detection: parallel requests threshold and min samples needed. */
#define BFQ_HW_QUEUE_THRESHOLD 4
#define BFQ_HW_QUEUE_SAMPLES 32
#define BFQQ_SEEK_THR (sector_t)(8 * 100)
#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
/* Min number of samples required to perform peak-rate update */
#define BFQ_RATE_MIN_SAMPLES 32
/* Min observation time interval required to perform a peak-rate update (ns) */
#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC)
/* Target observation time interval for a peak-rate update (ns) */
#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
/* Shift used for peak rate fixed precision calculations. */
#define BFQ_RATE_SHIFT 16
#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
/**
* icq_to_bic - convert iocontext queue structure to bfq_io_cq.
* @icq: the iocontext queue.
*/
static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
{
/* bic->icq is the first member, %NULL will convert to %NULL */
return container_of(icq, struct bfq_io_cq, icq);
}
/**
* bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
* @bfqd: the lookup key.
* @ioc: the io_context of the process doing I/O.
* @q: the request queue.
*/
static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
struct io_context *ioc,
struct request_queue *q)
{
if (ioc) {
unsigned long flags;
struct bfq_io_cq *icq;
spin_lock_irqsave(q->queue_lock, flags);
icq = icq_to_bic(ioc_lookup_icq(ioc, q));
spin_unlock_irqrestore(q->queue_lock, flags);
return icq;
}
return NULL;
}
/*
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
* Scheduler run of queue, if there are requests pending and no one in the
* driver that will restart queueing.
*/
static void bfq_schedule_dispatch(struct bfq_data *bfqd)
{
if (bfqd->queued != 0) {
bfq_log(bfqd, "schedule dispatch");
blk_mq_run_hw_queues(bfqd->queue, true);
}
}
/**
* bfq_gt - compare two timestamps.
* @a: first ts.
* @b: second ts.
*
* Return @a > @b, dealing with wrapping correctly.
*/
static int bfq_gt(u64 a, u64 b)
{
return (s64)(a - b) > 0;
}
static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
{
struct rb_node *node = tree->rb_node;
return rb_entry(node, struct bfq_entity, rb_node);
}
static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
/**
* bfq_update_next_in_service - update sd->next_in_service
* @sd: sched_data for which to perform the update.
* @new_entity: if not NULL, pointer to the entity whose activation,
* requeueing or repositionig triggered the invocation of
* this function.
*
* This function is called to update sd->next_in_service, which, in
* its turn, may change as a consequence of the insertion or
* extraction of an entity into/from one of the active trees of
* sd. These insertions/extractions occur as a consequence of
* activations/deactivations of entities, with some activations being
* 'true' activations, and other activations being requeueings (i.e.,
* implementing the second, requeueing phase of the mechanism used to
* reposition an entity in its active tree; see comments on
* __bfq_activate_entity and __bfq_requeue_entity for details). In
* both the last two activation sub-cases, new_entity points to the
* just activated or requeued entity.
*
* Returns true if sd->next_in_service changes in such a way that
* entity->parent may become the next_in_service for its parent
* entity.
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
struct bfq_entity *new_entity)
{
struct bfq_entity *next_in_service = sd->next_in_service;
bool parent_sched_may_change = false;
/*
* If this update is triggered by the activation, requeueing
* or repositiong of an entity that does not coincide with
* sd->next_in_service, then a full lookup in the active tree
* can be avoided. In fact, it is enough to check whether the
* just-modified entity has a higher priority than
* sd->next_in_service, or, even if it has the same priority
* as sd->next_in_service, is eligible and has a lower virtual
* finish time than sd->next_in_service. If this compound
* condition holds, then the new entity becomes the new
* next_in_service. Otherwise no change is needed.
*/
if (new_entity && new_entity != sd->next_in_service) {
/*
* Flag used to decide whether to replace
* sd->next_in_service with new_entity. Tentatively
* set to true, and left as true if
* sd->next_in_service is NULL.
*/
bool replace_next = true;
/*
* If there is already a next_in_service candidate
* entity, then compare class priorities or timestamps
* to decide whether to replace sd->service_tree with
* new_entity.
*/
if (next_in_service) {
unsigned int new_entity_class_idx =
bfq_class_idx(new_entity);
struct bfq_service_tree *st =
sd->service_tree + new_entity_class_idx;
/*
* For efficiency, evaluate the most likely
* sub-condition first.
*/
replace_next =
(new_entity_class_idx ==
bfq_class_idx(next_in_service)
&&
!bfq_gt(new_entity->start, st->vtime)
&&
bfq_gt(next_in_service->finish,
new_entity->finish))
||
new_entity_class_idx <
bfq_class_idx(next_in_service);
}
if (replace_next)
next_in_service = new_entity;
} else /* invoked because of a deactivation: lookup needed */
next_in_service = bfq_lookup_next_entity(sd);
if (next_in_service) {
parent_sched_may_change = !sd->next_in_service ||
bfq_update_parent_budget(next_in_service);
}
sd->next_in_service = next_in_service;
if (!next_in_service)
return parent_sched_may_change;
return parent_sched_may_change;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED
/* both next loops stop at one of the child entities of the root group */
#define for_each_entity(entity) \
for (; entity ; entity = entity->parent)
/*
* For each iteration, compute parent in advance, so as to be safe if
* entity is deallocated during the iteration. Such a deallocation may
* happen as a consequence of a bfq_put_queue that frees the bfq_queue
* containing entity.
*/
#define for_each_entity_safe(entity, parent) \
for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
/*
* Returns true if this budget changes may let next_in_service->parent
* become the next_in_service entity for its parent entity.
*/
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
struct bfq_entity *bfqg_entity;
struct bfq_group *bfqg;
struct bfq_sched_data *group_sd;
bool ret = false;
group_sd = next_in_service->sched_data;
bfqg = container_of(group_sd, struct bfq_group, sched_data);
/*
* bfq_group's my_entity field is not NULL only if the group
* is not the root group. We must not touch the root entity
* as it must never become an in-service entity.
*/
bfqg_entity = bfqg->my_entity;
if (bfqg_entity) {
if (bfqg_entity->budget > next_in_service->budget)
ret = true;
bfqg_entity->budget = next_in_service->budget;
}
return ret;
}
/*
* This function tells whether entity stops being a candidate for next
* service, according to the following logic.