Newer
Older
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
INIT_LIST_HEAD(&bfqq->fifo);
bfqq->ref = 0;
bfqq->bfqd = bfqd;
if (bic)
bfq_set_next_ioprio_data(bfqq, bic);
if (is_sync) {
if (!bfq_class_idle(bfqq))
bfq_mark_bfqq_idle_window(bfqq);
bfq_mark_bfqq_sync(bfqq);
} else
bfq_clear_bfqq_sync(bfqq);
/* set end request to minus infinity from now */
bfqq->ttime.last_end_request = ktime_get_ns() + 1;
bfq_mark_bfqq_IO_bound(bfqq);
bfqq->pid = pid;
/* Tentative initial value to trade off between thr and lat */
bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
bfqq->budget_timeout = bfq_smallest_from_now();
/* first request is almost certainly seeky */
bfqq->seek_history = 1;
}
/*
 * Return the address of the slot, inside @bfqg, that holds the shared
 * async queue for the given I/O priority class and level.
 *
 * Returns NULL when @ioprio_class is none of the classes handled below.
 * @bfqd is not used by this function.
 */
static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
					       struct bfq_group *bfqg,
					       int ioprio_class, int ioprio)
{
	if (ioprio_class == IOPRIO_CLASS_IDLE)
		return &bfqg->async_idle_bfqq;

	if (ioprio_class == IOPRIO_CLASS_RT)
		return &bfqg->async_bfqq[0][ioprio];

	/* No class set: use the best-effort row at the IOPRIO_NORM level. */
	if (ioprio_class == IOPRIO_CLASS_NONE)
		return &bfqg->async_bfqq[1][IOPRIO_NORM];

	if (ioprio_class == IOPRIO_CLASS_BE)
		return &bfqg->async_bfqq[1][ioprio];

	return NULL;
}
/*
 * Find or create the bfq_queue to be used for I/O issued through @bic,
 * and take a process reference on it.
 *
 * For async I/O the queue is shared: it is looked up in (and, when newly
 * allocated, published to) the per-group slot selected by the ioprio
 * class and level stored in @bic. Sync I/O always gets a newly allocated
 * queue.
 *
 * The built-in bfqd->oom_bfqq is returned instead when no group can be
 * found for @bio, when no async slot exists for the ioprio class, or
 * when the allocation fails.
 *
 * The whole lookup runs inside an RCU read-side critical section.
 * Returns the queue, never NULL, with its reference count incremented.
 */
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic)
{
	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
	struct bfq_queue **async_bfqq = NULL;
	struct bfq_queue *bfqq;
	struct bfq_group *bfqg;

	rcu_read_lock();

	bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
	if (!bfqg) {
		bfqq = &bfqd->oom_bfqq;
		goto out;
	}

	if (!is_sync) {
		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
						  ioprio);
		if (!async_bfqq) {
			/*
			 * bfq_async_queue_prio() has no slot for this
			 * class: fall back to the oom queue rather than
			 * dereferencing a NULL slot pointer.
			 */
			bfqq = &bfqd->oom_bfqq;
			goto out;
		}
		bfqq = *async_bfqq;
		if (bfqq)
			goto out;
	}

	bfqq = kmem_cache_alloc_node(bfq_pool,
				     GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
				     bfqd->queue->node);

	if (bfqq) {
		bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
			      is_sync);
		bfq_init_entity(&bfqq->entity, bfqg);
		bfq_log_bfqq(bfqd, bfqq, "allocated");
	} else {
		bfqq = &bfqd->oom_bfqq;
		bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
		goto out;
	}

	/*
	 * Pin the queue now that it's allocated, scheduler exit will
	 * prune it.
	 */
	if (async_bfqq) {
		/*
		 * Extra group reference, w.r.t. sync queue. This extra
		 * reference is removed only if bfqq->bfqg disappears, to
		 * guarantee that this queue is not freed until its group
		 * goes away.
		 */
		bfqq->ref++;
		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
			     bfqq, bfqq->ref);
		*async_bfqq = bfqq;
	}

out:
	bfqq->ref++; /* get a process reference to this queue */
	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
	rcu_read_unlock();
	return bfqq;
}
/*
 * Fold a new think-time sample into the running statistics of @bfqq.
 *
 * The sample is the time elapsed since bfqq->ttime.last_end_request,
 * capped at twice bfqd->bfq_slice_idle. The sample count and total are
 * updated as 7/8-weighted averages in fixed point (scale 256), and the
 * mean is recomputed from them with rounding.
 */
static void bfq_update_io_thinktime(struct bfq_data *bfqd,
				    struct bfq_queue *bfqq)
{
	struct bfq_ttime *tt = &bfqq->ttime;
	const u64 cap = 2ULL * bfqd->bfq_slice_idle;
	u64 delta = ktime_get_ns() - tt->last_end_request;

	if (delta > cap)
		delta = cap;

	tt->ttime_samples = (7 * tt->ttime_samples + 256) / 8;
	tt->ttime_total = div_u64(7 * tt->ttime_total + 256 * delta, 8);
	tt->ttime_mean = div64_ul(tt->ttime_total + 128, tt->ttime_samples);
}
/*
 * Shift one new bit into bfqq->seek_history for request @rq.
 *
 * The bit is set when the distance between the start of @rq and
 * bfqq->last_request_pos exceeds BFQQ_SEEK_THR and, in addition, either
 * the device is not flagged non-rotational or @rq is shorter than
 * BFQQ_SECT_THR_NONROT sectors. A zero last_request_pos yields a
 * distance of zero.
 */
static void
bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		       struct request *rq)
{
	const sector_t prev_pos = bfqq->last_request_pos;
	const sector_t cur_pos = blk_rq_pos(rq);
	sector_t dist = 0;

	if (prev_pos)
		dist = prev_pos < cur_pos ? cur_pos - prev_pos
					  : prev_pos - cur_pos;

	bfqq->seek_history <<= 1;
	if (dist > BFQQ_SEEK_THR &&
	    (!blk_queue_nonrot(bfqd->queue) ||
	     blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT))
		bfqq->seek_history |= 1;
}
/*
 * Re-evaluate whether idling should be enabled for @bfqq.
 *
 * Nothing is changed for async queues or queues in the idle class.
 * Otherwise idling is switched off when the issuing io context has no
 * active reference, when bfq_slice_idle is zero, or when the device
 * queues internally (hw_tag) and the queue is seeky. Failing that, if
 * enough think-time samples are available, idling is enabled exactly
 * when the mean think time does not exceed bfq_slice_idle. With too few
 * samples the current setting is kept.
 */
static void bfq_update_idle_window(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq,
				   struct bfq_io_cq *bic)
{
	int idle_ok;

	/* Don't idle for async or idle io prio class. */
	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
		return;

	idle_ok = bfq_bfqq_idle_window(bfqq);

	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
	    bfqd->bfq_slice_idle == 0 ||
	    (bfqd->hw_tag && BFQQ_SEEKY(bfqq))) {
		idle_ok = 0;
	} else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
		idle_ok = bfqq->ttime.ttime_mean <= bfqd->bfq_slice_idle;
	}

	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
		     idle_ok);

	if (!idle_ok)
		bfq_clear_bfqq_idle_window(bfqq);
	else
		bfq_mark_bfqq_idle_window(bfqq);
}
/*
* Called when a new fs request (rq) is added to bfqq. Check if there's
* something we should do about it.
*/
static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct request *rq)
{
struct bfq_io_cq *bic = RQ_BIC(rq);
if (rq->cmd_flags & REQ_META)
bfqq->meta_pending++;
bfq_update_io_thinktime(bfqd, bfqq);
bfq_update_io_seektime(bfqd, bfqq, rq);
if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
!BFQQ_SEEKY(bfqq))
bfq_update_idle_window(bfqd, bfqq, bic);
bfq_log_bfqq(bfqd, bfqq,
"rq_enqueued: idle_window=%d (seeky %d)",
bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));
bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
blk_rq_sectors(rq) < 32;
bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
/*
* There is just this request queued: if the request
* is small and the queue is not to be expired, then
* just exit.
*
* In this way, if the device is being idled to wait
* for a new request from the in-service queue, we
* avoid unplugging the device and committing the
* device to serve just a small request. On the
* contrary, we wait for the block layer to decide
* when to unplug the device: hopefully, new requests
* will be merged to this one quickly, then the device
* will be unplugged and larger requests will be
* dispatched.
*/
if (small_req && !budget_timeout)
return;
/*
* A large enough request arrived, or the queue is to
* be expired: in both cases disk idling is to be
* stopped, so clear wait_request flag and reset
* timer.
*/
bfq_clear_bfqq_wait_request(bfqq);
hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
bfqg_stats_update_idle_time(bfqq_group(bfqq));
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
/*
* The queue is not empty, because a new request just
* arrived. Hence we can safely expire the queue, in
* case of budget timeout, without risking that the
* timestamps of the queue are not updated correctly.
* See [1] for more details.
*/
if (budget_timeout)
bfq_bfqq_expire(bfqd, bfqq, false,
BFQQE_BUDGET_TIMEOUT);
}
}
/*
 * Hand @rq over to the bfq_queue it is attached to (RQ_BFQQ(rq)).
 *
 * The request is added via bfq_add_request(), stamped with a fifo
 * deadline of "now" plus the sync or async bfq_fifo_expire value,
 * appended to the queue's fifo list, and finally reported through
 * bfq_rq_enqueued().
 */
static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{
struct bfq_queue *bfqq = RQ_BFQQ(rq);
bfq_add_request(rq);
/* bfq_fifo_expire[] is indexed by sync-ness: [0] async, [1] sync. */
rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
list_add_tail(&rq->queuelist, &bfqq->fifo);
bfq_rq_enqueued(bfqd, bfqq, rq);
}
/*
 * Insert a single request into the scheduler.
 *
 * A merge into an already queued request is attempted first; if it
 * succeeds there is nothing left to do. Otherwise the request goes to
 * the head of the dispatch list when @at_head is set, to its tail when
 * it is a passthrough request, and into its bfq_queue in every other
 * case (mergeable requests are also added to the merge hash and may
 * become q->last_merge).
 *
 * bfqd->lock is released around blk_mq_sched_request_inserted().
 */
static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
			       bool at_head)
{
	struct request_queue *q = hctx->queue;
	struct bfq_data *bfqd = q->elevator->elevator_data;
	bool merged;

	spin_lock_irq(&bfqd->lock);
	merged = blk_mq_sched_try_insert_merge(q, rq);
	spin_unlock_irq(&bfqd->lock);
	if (merged)
		return;

	blk_mq_sched_request_inserted(rq);

	spin_lock_irq(&bfqd->lock);
	if (at_head) {
		list_add(&rq->queuelist, &bfqd->dispatch);
	} else if (blk_rq_is_passthrough(rq)) {
		list_add_tail(&rq->queuelist, &bfqd->dispatch);
	} else {
		__bfq_insert_request(bfqd, rq);

		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}
	}
	spin_unlock_irq(&bfqd->lock);
}
/*
 * Drain @list, inserting each request in list order through
 * bfq_insert_request(). Every request is unlinked from @list before it
 * is inserted, so @list is empty on return.
 */
static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
				struct list_head *list, bool at_head)
{
	for (;;) {
		struct request *first;

		if (list_empty(list))
			break;

		first = list_first_entry(list, struct request, queuelist);
		list_del_init(&first->queuelist);
		bfq_insert_request(hctx, first, at_head);
	}
}
/*
 * Update bfqd->hw_tag, the flag consulted elsewhere in this file
 * together with the seekiness of a queue when deciding about idling.
 *
 * The largest rq_in_driver value seen so far is always recorded. Once
 * hw_tag is 1 nothing else is done. Otherwise a call counts as a sample
 * only if rq_in_driver + queued reaches BFQ_HW_QUEUE_THRESHOLD; after
 * BFQ_HW_QUEUE_SAMPLES samples have been counted, hw_tag is set to
 * whether the recorded maximum exceeds the threshold and the sampling
 * state is reset.
 */
static void bfq_update_hw_tag(struct bfq_data *bfqd)
{
bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
bfqd->rq_in_driver);
if (bfqd->hw_tag == 1)
return;
/*
 * This sample is valid if the number of outstanding requests
 * is large enough to allow a queueing behavior. Note that the
 * sum is not exact, as it's not taking into account deactivated
 * requests.
 */
if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
return;
/* Count this sample; decide only once enough have been collected. */
if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
return;
bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
bfqd->max_rq_in_driver = 0;
bfqd->hw_tag_samples = 0;
}
/*
 * Account for the completion of one request of @bfqq.
 *
 * Refreshes the hw_tag estimate, decrements the in-driver and
 * dispatched counters and stamps the queue's last_end_request with the
 * current time. If @bfqq is the in-service queue it may then be
 * idled on, or expired, as detailed below.
 */
static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
{
bfq_update_hw_tag(bfqd);
bfqd->rq_in_driver--;
bfqq->dispatched--;
bfqq->ttime.last_end_request = ktime_get_ns();
/*
 * If this is the in-service queue, check if it needs to be expired,
 * or if we want to idle in case it has no pending requests.
 */
if (bfqd->in_service_queue == bfqq) {
if (bfq_bfqq_budget_new(bfqq))
bfq_set_budget_timeout(bfqd);
/* Idling takes precedence over both expiration checks below. */
if (bfq_bfqq_must_idle(bfqq)) {
bfq_arm_slice_timer(bfqd);
return;
} else if (bfq_may_expire_for_budg_timeout(bfqq))
bfq_bfqq_expire(bfqd, bfqq, false,
BFQQE_BUDGET_TIMEOUT);
/*
 * No request left to sort, and either nothing is still
 * dispatched or the queue may not idle: expire it.
 */
else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
(bfqq->dispatched == 0 ||
!bfq_bfqq_may_idle(bfqq)))
bfq_bfqq_expire(bfqd, bfqq, false,
BFQQE_NO_MORE_REQUESTS);
}
}
/*
 * Undo the per-request accounting taken in bfq_get_rq_private():
 * decrement bfqq->allocated and release one queue reference via
 * bfq_put_queue().
 */
static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
{
bfqq->allocated--;
bfq_put_queue(bfqq);
}
static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
{
struct bfq_queue *bfqq = RQ_BFQQ(rq);
struct bfq_data *bfqd = bfqq->bfqd;
if (rq->rq_flags & RQF_STARTED)
bfqg_stats_update_completion(bfqq_group(bfqq),
rq_start_time_ns(rq),
rq_io_start_time_ns(rq),
rq->cmd_flags);
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
if (likely(rq->rq_flags & RQF_STARTED)) {
unsigned long flags;
spin_lock_irqsave(&bfqd->lock, flags);
bfq_completed_request(bfqq, bfqd);
bfq_put_rq_priv_body(bfqq);
spin_unlock_irqrestore(&bfqd->lock, flags);
} else {
/*
* Request rq may be still/already in the scheduler,
* in which case we need to remove it. And we cannot
* defer such a check and removal, to avoid
* inconsistencies in the time interval from the end
* of this function to the start of the deferred work.
* This situation seems to occur only in process
* context, as a consequence of a merge. In the
* current version of the code, this implies that the
* lock is held.
*/
if (!RB_EMPTY_NODE(&rq->rb_node))
bfq_remove_request(q, rq);
bfq_put_rq_priv_body(bfqq);
}
rq->elv.priv[0] = NULL;
rq->elv.priv[1] = NULL;
}
/*
 * Allocate bfq data structures associated with this request.
 *
 * Looks up the bfq_queue for the request's sync-ness in its bic,
 * (re)creating it through bfq_get_queue() when the bic has none or only
 * holds the oom queue. The queue's allocated and reference counters are
 * bumped, and the bic and queue are stored in rq->elv.priv[0] and [1].
 *
 * Runs under bfqd->lock. Returns 0 on success, or 1 if the request has
 * no bic.
 */
static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
			      struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
	const int is_sync = rq_is_sync(rq);
	struct bfq_queue *bfqq;

	spin_lock_irq(&bfqd->lock);

	/* Bail out before bic is handed to any helper. */
	if (!bic)
		goto queue_fail;

	bfq_check_ioprio_change(bic, bio);

	bfq_bic_update_cgroup(bic, bio);

	bfqq = bic_to_bfqq(bic, is_sync);
	if (!bfqq || bfqq == &bfqd->oom_bfqq) {
		/* Drop the bic's reference on the oom queue, if any. */
		if (bfqq)
			bfq_put_queue(bfqq);
		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
		bic_set_bfqq(bic, bfqq, is_sync);
	}

	bfqq->allocated++;
	bfqq->ref++;
	bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
		     rq, bfqq, bfqq->ref);

	rq->elv.priv[0] = bic;
	rq->elv.priv[1] = bfqq;

	spin_unlock_irq(&bfqd->lock);

	return 0;

queue_fail:
	spin_unlock_irq(&bfqd->lock);

	return 1;
}
/*
 * Work done when the idle-slice timer fires for @bfqq.
 *
 * Under bfqd->lock, the wait_request flag of @bfqq is cleared. If @bfqq
 * is no longer the in-service queue nothing else happens. Otherwise the
 * queue is expired when its budget timed out or when it has no queued
 * request at all; in every case where @bfqq was still in service, a
 * dispatch is scheduled after the lock is dropped.
 */
static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
{
struct bfq_data *bfqd = bfqq->bfqd;
enum bfqq_expiration reason;
unsigned long flags;
spin_lock_irqsave(&bfqd->lock, flags);
bfq_clear_bfqq_wait_request(bfqq);
if (bfqq != bfqd->in_service_queue) {
spin_unlock_irqrestore(&bfqd->lock, flags);
return;
}
if (bfq_bfqq_budget_timeout(bfqq))
/*
 * Also here the queue can be safely expired
 * for budget timeout without wasting
 * guarantees
 */
reason = BFQQE_BUDGET_TIMEOUT;
else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
/*
 * The queue may not be empty upon timer expiration,
 * because we may not disable the timer when the
 * first request of the in-service queue arrives
 * during disk idling.
 */
reason = BFQQE_TOO_IDLE;
else
/* Requests are pending and no timeout: keep the queue in service. */
goto schedule_dispatch;
bfq_bfqq_expire(bfqd, bfqq, true, reason);
schedule_dispatch:
spin_unlock_irqrestore(&bfqd->lock, flags);
bfq_schedule_dispatch(bfqd);
}
/*
 * Handler of the expiration of the timer running if the in-service queue
 * is idling inside its time slice.
 *
 * Reads bfqd->in_service_queue without taking bfqd->lock and, if it is
 * set, hands it to bfq_idle_slice_timer_body(). The timer is never
 * restarted from here.
 */
static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
{
struct bfq_data *bfqd = container_of(timer, struct bfq_data,
idle_slice_timer);
struct bfq_queue *bfqq = bfqd->in_service_queue;
/*
 * Theoretical race here: the in-service queue can be NULL or
 * different from the queue that was idling if a new request
 * arrives for the current queue and there is a full dispatch
 * cycle that changes the in-service queue. This can hardly
 * happen, but in the worst case we just expire a queue too
 * early.
 */
if (bfqq)
bfq_idle_slice_timer_body(bfqq);
return HRTIMER_NORESTART;
}
/*
 * Release the async queue stored in *@bfqq_ptr, if there is one.
 *
 * The queue is first moved to the root group, then one reference on it
 * is dropped and the slot is cleared. An empty slot is only logged.
 */
static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
				 struct bfq_queue **bfqq_ptr)
{
	struct bfq_queue *queue = *bfqq_ptr;

	bfq_log(bfqd, "put_async_bfqq: %p", queue);
	if (!queue)
		return;

	bfq_bfqq_move(bfqd, queue, bfqd->root_group);

	bfq_log_bfqq(bfqd, queue, "put_async_bfqq: putting %p, %d",
		     queue, queue->ref);
	bfq_put_queue(queue);
	*bfqq_ptr = NULL;
}
/*
 * Release all the bfqg references to its async queues. If we are
 * deallocating the group these queues may still contain requests, so
 * we reparent them to the root cgroup (i.e., the only one that will
 * exist for sure until all the requests on a device are gone).
 *
 * Walks both rows of bfqg->async_bfqq (IOPRIO_BE_NR slots each) and
 * then the single idle-class slot.
 */
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
{
	int row, level;

	for (row = 0; row < 2; row++) {
		for (level = 0; level < IOPRIO_BE_NR; level++) {
			struct bfq_queue **slot = &bfqg->async_bfqq[row][level];

			__bfq_put_async_bfqq(bfqd, slot);
		}
	}

	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
}
/*
 * Tear down the scheduler instance attached to @e.
 *
 * The idle-slice timer is cancelled, every queue on the idle list is
 * deactivated under bfqd->lock, and the timer is cancelled once more.
 * With group scheduling the blkcg policy is deactivated for the queue;
 * without it the root group's async queues are released and the root
 * group is freed here. Finally the bfq_data itself is freed.
 */
static void bfq_exit_queue(struct elevator_queue *e)
{
struct bfq_data *bfqd = e->elevator_data;
struct bfq_queue *bfqq, *n;
hrtimer_cancel(&bfqd->idle_slice_timer);
spin_lock_irq(&bfqd->lock);
list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
bfq_deactivate_bfqq(bfqd, bfqq, false, false);
spin_unlock_irq(&bfqd->lock);
/* NOTE(review): second cancel presumably guards against a re-arm during the deactivation above - confirm. */
hrtimer_cancel(&bfqd->idle_slice_timer);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
#else
spin_lock_irq(&bfqd->lock);
bfq_put_async_queues(bfqd, bfqd->root_group);
kfree(bfqd->root_group);
spin_unlock_irq(&bfqd->lock);
#endif
kfree(bfqd);
}
/*
 * Initialize the scheduling state of @root_group.
 *
 * With group scheduling the group is marked as having no parent and no
 * entity of its own, and is bound to @bfqd. In all configurations every
 * per-class service tree is reset to BFQ_SERVICE_TREE_INIT and the
 * last-service timestamp of the idle class is set to the current
 * jiffies.
 */
static void bfq_init_root_group(struct bfq_group *root_group,
struct bfq_data *bfqd)
{
int i;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
root_group->entity.parent = NULL;
root_group->my_entity = NULL;
root_group->bfqd = bfqd;
#endif
for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
root_group->sched_data.bfq_class_idle_last_service = jiffies;
}
static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct bfq_data *bfqd;
struct elevator_queue *eq;
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
if (!bfqd) {
kobject_put(&eq->kobj);
return -ENOMEM;
}
eq->elevator_data = bfqd;
spin_lock_irq(q->queue_lock);
q->elevator = eq;
spin_unlock_irq(q->queue_lock);
/*
* Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
* Grab a permanent reference to it, so that the normal code flow
* will not attempt to free it.
*/
bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
bfqd->oom_bfqq.ref++;
bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
bfqd->oom_bfqq.entity.new_weight =
bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
/*
* Trigger weight initialization, according to ioprio, at the
* oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
* class won't be changed any more.
*/
bfqd->oom_bfqq.entity.prio_changed = 1;
bfqd->queue = q;
INIT_LIST_HEAD(&bfqd->dispatch);
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL);
bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
INIT_LIST_HEAD(&bfqd->active_list);
INIT_LIST_HEAD(&bfqd->idle_list);
bfqd->hw_tag = -1;
bfqd->bfq_max_budget = bfq_default_max_budget;
bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
bfqd->bfq_back_max = bfq_back_max;
bfqd->bfq_back_penalty = bfq_back_penalty;
bfqd->bfq_slice_idle = bfq_slice_idle;
bfqd->bfq_timeout = bfq_timeout;
bfqd->bfq_requests_within_timer = 120;
spin_lock_init(&bfqd->lock);
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
/*
* The invocation of the next bfq_create_group_hierarchy
* function is the head of a chain of function calls
* (bfq_create_group_hierarchy->blkcg_activate_policy->
* blk_mq_freeze_queue) that may lead to the invocation of the
* has_work hook function. For this reason,
* bfq_create_group_hierarchy is invoked only after all
* scheduler data has been initialized, apart from the fields
* that can be initialized only after invoking
* bfq_create_group_hierarchy. This, in particular, enables
* has_work to correctly return false. Of course, to avoid
* other inconsistencies, the blk-mq stack must then refrain
* from invoking further scheduler hooks before this init
* function is finished.
*/
bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
if (!bfqd->root_group)
goto out_free;
bfq_init_root_group(bfqd->root_group, bfqd);
bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
return 0;
out_free:
kfree(bfqd);
kobject_put(&eq->kobj);
return -ENOMEM;
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
}
/* Destroy the slab cache (bfq_pool) that bfq_queue objects are allocated from. */
static void bfq_slab_kill(void)
{
kmem_cache_destroy(bfq_pool);
}
/*
 * Create the slab cache used for struct bfq_queue allocations.
 *
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
static int __init bfq_slab_setup(void)
{
	bfq_pool = KMEM_CACHE(bfq_queue, 0);

	return bfq_pool ? 0 : -ENOMEM;
}
/*
 * Format @var as an unsigned decimal followed by a newline into @page.
 *
 * Returns the number of characters written, not counting the
 * terminating NUL.
 */
static ssize_t bfq_var_show(unsigned int var, char *page)
{
	const int written = sprintf(page, "%u\n", var);

	return written;
}
/*
 * Parse @page as an unsigned base-10 integer and store it in *@var.
 *
 * Returns @count on success. If the text cannot be parsed, *@var is
 * left untouched and the negative error code from kstrtoul() is
 * returned; callers must check for a negative return before using
 * *@var.
 */
static ssize_t bfq_var_store(unsigned long *var, const char *page,
			     size_t count)
{
	unsigned long new_val;
	int ret = kstrtoul(page, 10, &new_val);

	if (ret)
		return ret;

	*var = new_val;
	return count;
}

/*
 * Generate a sysfs "show" handler named __FUNC that prints __VAR.
 * __CONV selects the conversion applied first: 1 converts jiffies to
 * milliseconds, 2 converts nanoseconds to milliseconds, anything else
 * prints the raw value.
 */
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	u64 __data = __VAR;						\
	if (__CONV == 1)						\
		__data = jiffies_to_msecs(__data);			\
	else if (__CONV == 2)						\
		__data = div_u64(__data, NSEC_PER_MSEC);		\
	return bfq_var_show(__data, (page));				\
}
SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
#undef SHOW_FUNCTION

/*
 * Generate a sysfs "show" handler named __FUNC that prints __VAR
 * converted from nanoseconds to microseconds.
 */
#define USEC_SHOW_FUNCTION(__FUNC, __VAR)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	u64 __data = __VAR;						\
	__data = div_u64(__data, NSEC_PER_USEC);			\
	return bfq_var_show(__data, (page));				\
}
USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
#undef USEC_SHOW_FUNCTION

/*
 * Generate a sysfs "store" handler named __FUNC.
 *
 * The written value is parsed, clamped to [MIN, MAX] and stored through
 * __PTR. __CONV selects the conversion applied on the way: 1 treats the
 * input as milliseconds and stores jiffies, 2 treats it as milliseconds
 * and stores nanoseconds, anything else stores the value as is.
 *
 * Input that cannot be parsed is rejected with the parser's error code
 * and leaves the setting unchanged.
 */
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
static ssize_t								\
__FUNC(struct elevator_queue *e, const char *page, size_t count)	\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	unsigned long __data = 0;					\
	ssize_t ret = bfq_var_store(&__data, (page), count);		\
	if (ret < 0)							\
		return ret;						\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	if (__CONV == 1)						\
		*(__PTR) = msecs_to_jiffies(__data);			\
	else if (__CONV == 2)						\
		*(__PTR) = (u64)__data * NSEC_PER_MSEC;			\
	else								\
		*(__PTR) = __data;					\
	return ret;							\
}
STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
		INT_MAX, 2);
STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
		INT_MAX, 2);
STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
		INT_MAX, 0);
STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
#undef STORE_FUNCTION

/*
 * Generate a sysfs "store" handler named __FUNC that takes a value in
 * microseconds, clamps it to [MIN, MAX] and stores it through __PTR in
 * nanoseconds. Unparsable input is rejected with the parser's error
 * code and leaves the setting unchanged.
 */
#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)			\
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	unsigned long __data = 0;					\
	ssize_t ret = bfq_var_store(&__data, (page), count);		\
	if (ret < 0)							\
		return ret;						\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	*(__PTR) = (u64)__data * NSEC_PER_USEC;				\
	return ret;							\
}
USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
		    UINT_MAX);
#undef USEC_STORE_FUNCTION

/*
 * Return the max budget to use when the user has not set one: a value
 * computed from the measured peak rate and the current timeout once at
 * least BFQ_PEAK_RATE_SAMPLES samples are available, the compile-time
 * default otherwise.
 */
static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
{
	u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout);

	if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
		return bfq_calc_max_budget(bfqd->peak_rate, timeout);
	else
		return bfq_default_max_budget;
}

/*
 * sysfs store handler for max_budget.
 *
 * Writing 0 selects the estimated budget (see
 * bfq_estimated_max_budget()); any other value is capped at INT_MAX and
 * used directly. The value is also remembered in bfq_user_max_budget.
 * Unparsable input is rejected and changes nothing.
 */
static ssize_t bfq_max_budget_store(struct elevator_queue *e,
				    const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long __data = 0;
	ssize_t ret = bfq_var_store(&__data, (page), count);

	if (ret < 0)
		return ret;

	if (__data == 0)
		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
	else {
		if (__data > INT_MAX)
			__data = INT_MAX;
		bfqd->bfq_max_budget = __data;
	}

	bfqd->bfq_user_max_budget = __data;

	return ret;
}

/*
 * Leaving this name to preserve name compatibility with cfq
 * parameters, but this timeout is used for both sync and async.
 *
 * The written value is in milliseconds, clamped to [1, INT_MAX] and
 * stored in jiffies. If the user has not set a max budget, the
 * estimated one is recomputed for the new timeout. Unparsable input is
 * rejected and changes nothing.
 */
static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
				      const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long __data = 0;
	ssize_t ret = bfq_var_store(&__data, (page), count);

	if (ret < 0)
		return ret;

	if (__data < 1)
		__data = 1;
	else if (__data > INT_MAX)
		__data = INT_MAX;

	bfqd->bfq_timeout = msecs_to_jiffies(__data);
	if (bfqd->bfq_user_max_budget == 0)
		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);

	return ret;
}

/*
 * sysfs store handler for strict_guarantees.
 *
 * Any value above 1 is treated as 1. When the flag goes from off to on,
 * bfq_slice_idle is raised to at least 8 ms. Unparsable input is
 * rejected and changes nothing.
 */
static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
					   const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long __data = 0;
	ssize_t ret = bfq_var_store(&__data, (page), count);

	if (ret < 0)
		return ret;

	if (__data > 1)
		__data = 1;
	if (!bfqd->strict_guarantees && __data == 1
	    && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
		bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;

	bfqd->strict_guarantees = __data;

	return ret;
}
/*
 * Build one sysfs attribute entry, mode 0644, wired to the
 * bfq_<name>_show and bfq_<name>_store handlers.
 */
#define BFQ_ATTR(name) \
__ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)
/* Tunables exported by this scheduler; the list is NULL-terminated. */
static struct elv_fs_entry bfq_attrs[] = {
BFQ_ATTR(fifo_expire_sync),
BFQ_ATTR(fifo_expire_async),
BFQ_ATTR(back_seek_max),
BFQ_ATTR(back_seek_penalty),
BFQ_ATTR(slice_idle),
BFQ_ATTR(slice_idle_us),
BFQ_ATTR(max_budget),
BFQ_ATTR(timeout_sync),
BFQ_ATTR(strict_guarantees),
__ATTR_NULL
};
/*
 * Elevator descriptor registered under the name "bfq". It is flagged as
 * a blk-mq scheduler (uses_mq), binds the hook functions defined in
 * this file, sizes the per-io-context data as struct bfq_io_cq, and
 * exposes the tunables listed in bfq_attrs.
 */
static struct elevator_type iosched_bfq_mq = {
.ops.mq = {
.get_rq_priv = bfq_get_rq_private,
.put_rq_priv = bfq_put_rq_private,
.exit_icq = bfq_exit_icq,
.insert_requests = bfq_insert_requests,
.dispatch_request = bfq_dispatch_request,
.next_request = elv_rb_latter_request,
.former_request = elv_rb_former_request,
.allow_merge = bfq_allow_bio_merge,
.bio_merge = bfq_bio_merge,
.request_merge = bfq_request_merge,
.requests_merged = bfq_requests_merged,
.request_merged = bfq_request_merged,
.has_work = bfq_has_work,
.init_sched = bfq_init_queue,
.exit_sched = bfq_exit_queue,
},
.uses_mq = true,
.icq_size = sizeof(struct bfq_io_cq),
.icq_align = __alignof__(struct bfq_io_cq),
.elevator_attrs = bfq_attrs,
.elevator_name = "bfq",
.elevator_owner = THIS_MODULE,
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED
/*
 * blkcg policy descriptor, only built with CONFIG_BFQ_GROUP_IOSCHED.
 * It binds the cgroup files and the per-cgroup (cpd_*) and per-group
 * (pd_*) callbacks; note that cpd_bind_fn reuses bfq_cpd_init.
 */
static struct blkcg_policy blkcg_policy_bfq = {
.dfl_cftypes = bfq_blkg_files,
.legacy_cftypes = bfq_blkcg_legacy_files,
.cpd_alloc_fn = bfq_cpd_alloc,
.cpd_init_fn = bfq_cpd_init,
.cpd_bind_fn = bfq_cpd_init,
.cpd_free_fn = bfq_cpd_free,
.pd_alloc_fn = bfq_pd_alloc,
.pd_init_fn = bfq_pd_init,
.pd_offline_fn = bfq_pd_offline,
.pd_free_fn = bfq_pd_free,
.pd_reset_stats_fn = bfq_pd_reset_stats,
};
#endif
/*
 * Module init: register the blkcg policy (when group scheduling is
 * built in), create the bfq_queue slab cache and register the elevator.
 *
 * On failure everything set up so far is undone in reverse order and
 * the error code is returned; 0 is returned on success.
 */
static int __init bfq_init(void)
{
	int ret;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	ret = blkcg_policy_register(&blkcg_policy_bfq);
	if (ret)
		return ret;
#endif

	ret = -ENOMEM;
	if (bfq_slab_setup())
		goto err_pol_unreg;

	ret = elv_register(&iosched_bfq_mq);
	if (ret)
		goto slab_kill;

	return 0;

slab_kill:
	/* The slab cache exists at this point: do not leak it. */
	bfq_slab_kill();
err_pol_unreg:
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
	return ret;
}
/*
 * Module exit: unregister the elevator, then the blkcg policy (when
 * group scheduling is built in), and finally destroy the bfq_queue slab
 * cache.
 */
static void __exit bfq_exit(void)
{
elv_unregister(&iosched_bfq_mq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
bfq_slab_kill();
}
module_init(bfq_init);
module_exit(bfq_exit);
MODULE_AUTHOR("Paolo Valente");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");