/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */

#include "sched.h"

#include <linux/slab.h>

#include <trace/events/sched.h>

int sched_rr_timeslice = RR_TIMESLICE;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

struct rt_bandwidth def_rt_bandwidth;
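
/*
 * Replenishment timer for RT bandwidth: each time it expires, forward it
 * by one rt_period and let do_sched_rt_period_timer() refill the
 * per-runqueue runtime; stop once every runqueue reports itself idle.
 */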
static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b =
		container_of(timer, struct rt_bandwidth, rt_period_timer);
	ktime_t now;
	int overrun;
	int idle = 0;

	for (;;) {
		now = hrtimer_cb_get_time(timer);
		overrun = hrtimer_forward(timer, now, rt_b->rt_period);

		if (!overrun)
			break;

		idle = do_sched_rt_period_timer(rt_b, overrun);
	}

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	raw_spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_init(&rt_b->rt_period_timer,
			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

	if (hrtimer_active(&rt_b->rt_period_timer))
		return;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}
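
/*
 * Initialise an rt_rq: empty every priority list, clear the bitmap (with
 * the MAX_RT_PRIO bit set as a search delimiter) and reset the SMP
 * push/pull and bandwidth accounting state.
 */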
void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
{
	struct rt_prio_array *array;
	int i;

	array = &rt_rq->active;
	for (i = 0; i < MAX_RT_PRIO; i++) {
		INIT_LIST_HEAD(array->queue + i);
		__clear_bit(i, array->bitmap);
	}
	/* delimiter for bitsearch: */
	__set_bit(MAX_RT_PRIO, array->bitmap);

#if defined CONFIG_SMP
	rt_rq->highest_prio.curr = MAX_RT_PRIO;
	rt_rq->highest_prio.next = MAX_RT_PRIO;
	rt_rq->rt_nr_migratory = 0;
	rt_rq->overloaded = 0;
	plist_head_init(&rt_rq->pushable_tasks);
#endif

	rt_rq->rt_time = 0;
	rt_rq->rt_throttled = 0;
	rt_rq->rt_runtime = 0;
	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED

static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}

#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
#endif
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return rt_rq->rq;
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	return rt_se->rt_rq;
}

void free_rt_sched_group(struct task_group *tg)
{
	int i;

	if (tg->rt_se)
		destroy_rt_bandwidth(&tg->rt_bandwidth);

	for_each_possible_cpu(i) {
		if (tg->rt_rq)
			kfree(tg->rt_rq[i]);
		if (tg->rt_se)
			kfree(tg->rt_se[i]);
	}

	kfree(tg->rt_rq);
	kfree(tg->rt_se);
}
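
/*
 * Wire up one CPU's rt_rq and scheduling entity for a task group: point
 * them at their runqueue and group, and parent the entity either on the
 * root rt_rq or on the parent group's queue.
 */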
void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
		struct sched_rt_entity *rt_se, int cpu,
		struct sched_rt_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	rt_rq->highest_prio.curr = MAX_RT_PRIO;
	rt_rq->rt_nr_boosted = 0;
	rt_rq->rq = rq;
	rt_rq->tg = tg;

	tg->rt_rq[cpu] = rt_rq;
	tg->rt_se[cpu] = rt_se;

	if (!rt_se)
		return;

	if (!parent)
		rt_se->rt_rq = &rq->rt;
	else
		rt_se->rt_rq = parent->my_q;

	rt_se->my_q = rt_rq;
	rt_se->parent = parent;
	INIT_LIST_HEAD(&rt_se->run_list);
}
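
/*
 * Allocate the per-CPU rt_rq and rt_se arrays for a new task group and
 * hook each CPU's pair into the parent group. Returns 1 on success,
 * 0 on allocation failure.
 */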
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct rt_rq *rt_rq;
	struct sched_rt_entity *rt_se;
	int i;

	tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->rt_rq)
		goto err;
	tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->rt_se)
		goto err;

	init_rt_bandwidth(&tg->rt_bandwidth,
			ktime_to_ns(def_rt_bandwidth.rt_period), 0);

	for_each_possible_cpu(i) {
		rt_rq = kzalloc_node(sizeof(struct rt_rq),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_rq)
			goto err;

		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_se)
			goto err_free_rq;

		init_rt_rq(rt_rq, cpu_rq(i));
		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
	}

	return 1;

err_free_rq:
	kfree(rt_rq);
err:
	return 0;
}

#else /* CONFIG_RT_GROUP_SCHED */

#define rt_entity_is_task(rt_se) (1)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return container_of(rt_rq, struct rq, rt);
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	struct task_struct *p = rt_task_of(rt_se);
	struct rq *rq = task_rq(p);

	return &rq->rt;
}

void free_rt_sched_group(struct task_group *tg) { }

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SMP

static inline int rt_overloaded(struct rq *rq)
{
	return atomic_read(&rq->rd->rto_count);
}
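
/*
 * Mark this runqueue's CPU in the root domain's rto_mask and bump
 * rto_count, so other CPUs know there are pullable RT tasks here.
 */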
static inline void rt_set_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
	/*
	 * Make sure the mask is visible before we set
	 * the overload count. That is checked to determine
	 * if we should look at the mask. It would be a shame
	 * if we looked at the mask, but the mask was not
	 * updated yet.
	 */
	wmb();
	atomic_inc(&rq->rd->rto_count);
}

static inline void rt_clear_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	/* the order here really doesn't matter */
	atomic_dec(&rq->rd->rto_count);
	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}
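
/*
 * An rt_rq is "overloaded" when it has more than one queued RT task and
 * at least one of them can migrate; flip the root-domain overload state
 * whenever that condition changes.
 */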
static void update_rt_migration(struct rt_rq *rt_rq)
{
	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
		if (!rt_rq->overloaded) {
			rt_set_overload(rq_of_rt_rq(rt_rq));
			rt_rq->overloaded = 1;
		}
	} else if (rt_rq->overloaded) {
		rt_clear_overload(rq_of_rt_rq(rt_rq));
		rt_rq->overloaded = 0;
	}
}
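
/*
 * Track, on the root (per-CPU) rt_rq, how many RT tasks are queued and
 * how many of them may run on more than one CPU; group entities are
 * skipped because only real tasks migrate.
 */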
static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	struct task_struct *p;

	if (!rt_entity_is_task(rt_se))
		return;

	p = rt_task_of(rt_se);
	rt_rq = &rq_of_rt_rq(rt_rq)->rt;

	rt_rq->rt_nr_total++;
	if (p->nr_cpus_allowed > 1)
		rt_rq->rt_nr_migratory++;

	update_rt_migration(rt_rq);
}

static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	struct task_struct *p;

	if (!rt_entity_is_task(rt_se))
		return;

	p = rt_task_of(rt_se);
	rt_rq = &rq_of_rt_rq(rt_rq)->rt;

	rt_rq->rt_nr_total--;
	if (p->nr_cpus_allowed > 1)
		rt_rq->rt_nr_migratory--;

	update_rt_migration(rt_rq);
}

static inline int has_pushable_tasks(struct rq *rq)
{
	return !plist_head_empty(&rq->rt.pushable_tasks);
}
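
/*
 * The pushable_tasks plist holds, in priority order, the queued RT tasks
 * that are allowed to run on more than one CPU. Since only migratable
 * tasks ever appear here, it is also the natural place to maintain
 * highest_prio.next, the priority other CPUs compare against when
 * deciding whether this runqueue is worth pulling from.
 */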
static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
	plist_node_init(&p->pushable_tasks, p->prio);
	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the highest prio pushable task */
	if (p->prio < rq->rt.highest_prio.next)
		rq->rt.highest_prio.next = p->prio;
}

static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the new highest prio pushable task */
	if (has_pushable_tasks(rq)) {
		p = plist_first_entry(&rq->rt.pushable_tasks,
				      struct task_struct, pushable_tasks);
		rq->rt.highest_prio.next = p->prio;
	} else
		rq->rt.highest_prio.next = MAX_RT_PRIO;
}

#else

static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline
void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

static inline
void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

#endif /* CONFIG_SMP */

static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
	return !list_empty(&rt_se->run_list);
}

#ifdef CONFIG_RT_GROUP_SCHED

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	if (!rt_rq->tg)
		return RUNTIME_INF;

	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
}
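
/*
 * With group scheduling, a runqueue carries one rt_rq per task group;
 * for_each_rt_rq() visits them by walking the global task_groups list
 * (skipping autogroups) and picking each group's rt_rq for this CPU.
 */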
typedef struct task_group *rt_rq_iter_t;

static inline struct task_group *next_task_group(struct task_group *tg)
{
	do {
		tg = list_entry_rcu(tg->list.next,
			typeof(struct task_group), list);
	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));

	if (&tg->list == &task_groups)
		tg = NULL;

	return tg;
}

#define for_each_rt_rq(rt_rq, iter, rq)					\
	for (iter = container_of(&task_groups, typeof(*iter), list);	\
		(iter = next_task_group(iter)) &&			\
		(rt_rq = iter->rt_rq[cpu_of(rq)]);)

static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
{
	list_add_rcu(&rt_rq->leaf_rt_rq_list,
			&rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
}

static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
{
	list_del_rcu(&rt_rq->leaf_rt_rq_list);
}

#define for_each_leaf_rt_rq(rt_rq, rq) \
	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = rt_se->parent)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return rt_se->my_q;
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
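
/*
 * (Un)throttling helpers: put this group's entity for the local CPU back
 * on (or take it off) its parent queue. The root rt_rq owns no entity of
 * its own, so on unthrottle it may only need to kick the current task.
 */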
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
	struct sched_rt_entity *rt_se;
	int cpu = cpu_of(rq_of_rt_rq(rt_rq));

	rt_se = rt_rq->tg->rt_se[cpu];

	if (rt_rq->rt_nr_running) {
		if (rt_se && !on_rt_rq(rt_se))
			enqueue_rt_entity(rt_se, false);
		if (rt_rq->highest_prio.curr < curr->prio)
			resched_task(curr);
	}
}

static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	struct sched_rt_entity *rt_se;
	int cpu = cpu_of(rq_of_rt_rq(rt_rq));

	rt_se = rt_rq->tg->rt_se[cpu];

	if (rt_se && on_rt_rq(rt_se))
		dequeue_rt_entity(rt_se);
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
}
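
/*
 * A group entity counts as boosted while its rt_rq holds priority-
 * inherited tasks; a plain task is boosted when PI has raised its prio
 * above normal_prio.
 */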
static int rt_se_boosted(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = group_rt_rq(rt_se);
	struct task_struct *p;

	if (rt_rq)
		return !!rt_rq->rt_nr_boosted;

	p = rt_task_of(rt_se);
	return p->prio != p->normal_prio;
}

#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_rq(smp_processor_id())->rd->span;
}
#else
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}
#endif

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &rt_rq->tg->rt_bandwidth;
}

#else /* !CONFIG_RT_GROUP_SCHED */

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(def_rt_bandwidth.rt_period);
}

typedef struct rt_rq *rt_rq_iter_t;

#define for_each_rt_rq(rt_rq, iter, rq) \
	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
{
}

static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
{
}

#define for_each_leaf_rt_rq(rt_rq, rq) \
	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = NULL)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return NULL;
}

static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	if (rt_rq->rt_nr_running)
		resched_task(rq_of_rt_rq(rt_rq)->curr);
}

static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled;
}

static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return &cpu_rq(cpu)->rt;
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &def_rt_bandwidth;
}

#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SMP
/*
 * We ran out of runtime, see if we can borrow some from our neighbours.
 */
static int do_balance_runtime(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
	int i, weight, more = 0;
	u64 rt_period;

	weight = cpumask_weight(rd->span);

	raw_spin_lock(&rt_b->rt_runtime_lock);
	rt_period = ktime_to_ns(rt_b->rt_period);
	for_each_cpu(i, rd->span) {
		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
		s64 diff;

		if (iter == rt_rq)
			continue;

		raw_spin_lock(&iter->rt_runtime_lock);
		/*
		 * Either all rqs have inf runtime and there's nothing to steal
		 * or __disable_runtime() below sets a specific rq to inf to
		 * indicate it's been disabled and disallow stealing.
		 */
		if (iter->rt_runtime == RUNTIME_INF)
			goto next;

		/*
		 * From runqueues with spare time, take 1/n part of their
		 * spare time, but no more than our period.
		 */
		diff = iter->rt_runtime - iter->rt_time;
		if (diff > 0) {
			diff = div_u64((u64)diff, weight);
			if (rt_rq->rt_runtime + diff > rt_period)
				diff = rt_period - rt_rq->rt_runtime;
			iter->rt_runtime -= diff;
			rt_rq->rt_runtime += diff;
			more = 1;
			if (rt_rq->rt_runtime == rt_period) {
				raw_spin_unlock(&iter->rt_runtime_lock);
				break;
			}
		}
next:
		raw_spin_unlock(&iter->rt_runtime_lock);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);

	return more;
}

/*
 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 */
static void __disable_runtime(struct rq *rq)
{
	struct root_domain *rd = rq->rd;
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
		s64 want;
		int i;

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * Either we're all inf and nobody needs to borrow, or we're
		 * already disabled and thus have nothing to do, or we have
		 * exactly the right amount of runtime to take out.
		 */
		if (rt_rq->rt_runtime == RUNTIME_INF ||
				rt_rq->rt_runtime == rt_b->rt_runtime)
			goto balanced;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);

		/*
		 * Calculate the difference between what we started out with
		 * and what we currently have, that's the amount of runtime
		 * we lent and now have to reclaim.
		 */
		want = rt_b->rt_runtime - rt_rq->rt_runtime;

		/*
		 * Greedy reclaim, take back as much as we can.
		 */
		for_each_cpu(i, rd->span) {
			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
			s64 diff;

			/*
			 * Can't reclaim from ourselves or disabled runqueues.
			 */
			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
				continue;

			raw_spin_lock(&iter->rt_runtime_lock);
			if (want > 0) {
				diff = min_t(s64, iter->rt_runtime, want);
				iter->rt_runtime -= diff;
				want -= diff;
			} else {
				iter->rt_runtime -= want;
				want -= want;
			}
			raw_spin_unlock(&iter->rt_runtime_lock);

			if (!want)
				break;
		}

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * We cannot be left wanting - that would mean some runtime
		 * leaked out of the system.
		 */
		BUG_ON(want);
balanced:
		/*
		 * Disable all the borrow logic by pretending we have inf
		 * runtime - in which case borrowing doesn't make sense.
		 */
		rt_rq->rt_runtime = RUNTIME_INF;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);
	}
}

static void __enable_runtime(struct rq *rq)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	/*
	 * Reset each runqueue's bandwidth settings
	 */
	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_b->rt_runtime;
		rt_rq->rt_time = 0;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);
	}
}
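
/*
 * Called with this rt_rq's runtime lock held: if we have overrun the
 * local budget and RT_RUNTIME_SHARE is enabled, drop the lock and try to
 * borrow runtime from the other CPUs in the root domain.
 */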
static int balance_runtime(struct rt_rq *rt_rq)
{
	int more = 0;

	if (!sched_feat(RT_RUNTIME_SHARE))
		return more;

	if (rt_rq->rt_time > rt_rq->rt_runtime) {
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		more = do_balance_runtime(rt_rq);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
	}

	return more;
}
#else /* !CONFIG_SMP */
static inline int balance_runtime(struct rt_rq *rt_rq)
{
	return 0;
}
#endif /* CONFIG_SMP */
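
/*
 * One (or more) period(s) have elapsed: for every rt_rq served by this
 * bandwidth pool, decay the accumulated rt_time, unthrottle and
 * re-enqueue runqueues that are back under budget, and report whether
 * everything went idle so the caller can stop the timer.
 */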
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
	int i, idle = 1, throttled = 0;
	const struct cpumask *span;

	span = sched_rt_period_mask();
#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * FIXME: isolated CPUs should really leave the root task group,
	 * whether they are isolcpus or were isolated via cpusets, lest
	 * the timer run on a CPU which does not service all runqueues,
	 * potentially leaving other CPUs indefinitely throttled. If
	 * isolation is really required, the user will turn the throttle
	 * off to kill the perturbations it causes anyway. Meanwhile,
	 * this maintains functionality for boot and/or troubleshooting.
	 */
	if (rt_b == &root_task_group.rt_bandwidth)
		span = cpu_online_mask;
#endif
	for_each_cpu(i, span) {
		int enqueue = 0;
		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
		struct rq *rq = rq_of_rt_rq(rt_rq);

		raw_spin_lock(&rq->lock);
		if (rt_rq->rt_time) {
			u64 runtime;

			raw_spin_lock(&rt_rq->rt_runtime_lock);
			if (rt_rq->rt_throttled)
				balance_runtime(rt_rq);
			runtime = rt_rq->rt_runtime;
			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
				rt_rq->rt_throttled = 0;
				enqueue = 1;

				/*
				 * Force a clock update if the CPU was idle,
				 * lest wakeup -> unthrottle time accumulate.
				 */
				if (rt_rq->rt_nr_running && rq->curr == rq->idle)
					rq->skip_clock_update = -1;
			}
			if (rt_rq->rt_time || rt_rq->rt_nr_running)
				idle = 0;
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
|
sched: Fix sched rt group scheduling when hierachy is enabled
The current sched rt code is broken when it comes to hierarchical
scheduling, this patch fixes two problems
1. It adds redundant enqueuing (harmless) when it finds a queue
has tasks enqueued, but it has no run time and it is not
throttled.
2. The most important change is in sched_rt_rq_enqueue/dequeue.
The code just picks the rt_rq belonging to the current cpu
on which the period timer runs, the patch fixes it, so that
the correct rt_se is enqueued/dequeued.
Tested with a simple hierarchy
/c/d, c and d assigned similar runtimes of 50,000 and a while
1 loop runs within "d". Both c and d get throttled, without
the patch, the task just stops running and never runs (depends
on where the sched_rt b/w timer runs). With the patch, the
task is throttled and runs as expected.
[ bharata, suggestions on how to pick the rt_se belong to the
rt_rq and correct cpu ]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: stable@kernel.org
LKML-Reference: <20110303113435.GA2868@balbir.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-03-03 12:34:35 +01:00
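The /c/d test hierarchy described in the commit message above can be reproduced from user space. A minimal, illustrative sketch (not part of rt.c), assuming a CONFIG_RT_GROUP_SCHED kernel with the v1 cpu cgroup controller mounted at /sys/fs/cgroup/cpu and root privileges:

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* Nested groups /c and /c/d, each given 50000 us of RT runtime. */
	mkdir("/sys/fs/cgroup/cpu/c", 0755);
	mkdir("/sys/fs/cgroup/cpu/c/d", 0755);
	write_str("/sys/fs/cgroup/cpu/c/cpu.rt_runtime_us", "50000");
	write_str("/sys/fs/cgroup/cpu/c/d/cpu.rt_runtime_us", "50000");
	/*
	 * A SCHED_FIFO while(1) task moved into /c/d (by writing its PID
	 * to /sys/fs/cgroup/cpu/c/d/tasks) should then throttle and run
	 * again each period, as the commit message describes.
	 */
	return 0;
}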
|
|
|
} else if (rt_rq->rt_nr_running) {
|
2008-06-19 14:22:28 +02:00
|
|
|
idle = 0;
|
2011-03-03 12:34:35 +01:00
|
|
|
if (!rt_rq_throttled(rt_rq))
|
|
|
|
enqueue = 1;
|
|
|
|
}
|
2011-10-18 22:03:48 +02:00
|
|
|
if (rt_rq->rt_throttled)
|
|
|
|
throttled = 1;
|
2008-06-19 14:22:26 +02:00
|
|
|
|
|
|
|
if (enqueue)
|
|
|
|
sched_rt_rq_enqueue(rt_rq);
|
2009-11-17 14:28:38 +01:00
|
|
|
raw_spin_unlock(&rq->lock);
|
2008-06-19 14:22:26 +02:00
|
|
|
}
|
|
|
|
|
2011-10-18 22:03:48 +02:00
|
|
|
if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
|
|
|
|
return 1;
|
|
|
|
|
2008-06-19 14:22:26 +02:00
|
|
|
return idle;
|
|
|
|
}
|
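The budget that do_sched_rt_period_timer() hands back each period ultimately comes from the global RT bandwidth sysctls (defaults: period 1000000 us, runtime 950000 us, i.e. RT tasks may consume at most 95% of every second before being throttled). A minimal user-space sketch that reads those knobs, illustrative only and not part of rt.c:

#include <stdio.h>

static long read_long(const char *path)
{
	long val = -1;
	FILE *f = fopen(path, "r");

	if (f) {
		if (fscanf(f, "%ld", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}

int main(void)
{
	printf("sched_rt_period_us  = %ld\n",
	       read_long("/proc/sys/kernel/sched_rt_period_us"));
	printf("sched_rt_runtime_us = %ld\n",
	       read_long("/proc/sys/kernel/sched_rt_runtime_us"));
	return 0;
}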
2008-04-19 19:44:58 +02:00
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
|
|
|
|
{
|
2008-02-13 15:45:40 +01:00
|
|
|
#ifdef CONFIG_RT_GROUP_SCHED
|
2008-01-25 21:08:30 +01:00
|
|
|
struct rt_rq *rt_rq = group_rt_rq(rt_se);
|
|
|
|
|
|
|
|
if (rt_rq)
|
2008-12-29 15:39:49 +01:00
|
|
|
return rt_rq->highest_prio.curr;
|
2008-01-25 21:08:30 +01:00
|
|
|
#endif
|
|
|
|
|
|
|
|
return rt_task_of(rt_se)->prio;
|
|
|
|
}
|
|
|
|
|
2014-06-18 06:43:35 +02:00
|
|
|
static void dump_throttled_rt_tasks(struct rt_rq *rt_rq)
|
|
|
|
{
|
|
|
|
struct rt_prio_array *array = &rt_rq->active;
|
|
|
|
struct sched_rt_entity *rt_se;
|
|
|
|
char buf[500];
|
|
|
|
char *pos = buf;
|
|
|
|
char *end = buf + sizeof(buf);
|
|
|
|
int idx;
|
|
|
|
|
|
|
|
pos += snprintf(pos, sizeof(buf),
|
|
|
|
"sched: RT throttling activated for rt_rq %p (cpu %d)\n",
|
|
|
|
rt_rq, cpu_of(rq_of_rt_rq(rt_rq)));
|
|
|
|
|
|
|
|
if (bitmap_empty(array->bitmap, MAX_RT_PRIO))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
pos += snprintf(pos, end - pos, "potential CPU hogs:\n");
|
|
|
|
idx = sched_find_first_bit(array->bitmap);
|
|
|
|
while (idx < MAX_RT_PRIO) {
|
|
|
|
list_for_each_entry(rt_se, array->queue + idx, run_list) {
|
|
|
|
struct task_struct *p;
|
|
|
|
|
|
|
|
if (!rt_entity_is_task(rt_se))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
p = rt_task_of(rt_se);
|
|
|
|
if (pos < end)
|
|
|
|
pos += snprintf(pos, end - pos, "\t%s (%d)\n",
|
|
|
|
p->comm, p->pid);
|
|
|
|
}
|
|
|
|
idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1);
|
|
|
|
}
|
|
|
|
out:
|
2014-06-19 23:23:33 +02:00
|
|
|
#ifdef CONFIG_PANIC_ON_RT_THROTTLING
|
|
|
|
/*
|
|
|
|
* Use pr_err() in the BUG() case since printk_deferred() will
|
|
|
|
* not get flushed and deadlock is not a concern.
|
|
|
|
*/
|
|
|
|
pr_err("%s", buf);
|
|
|
|
BUG();
|
|
|
|
#else
|
Merge commit 'v3.10.67' into LA.BF64.1.2.9
This merge brings us up to date with upstream kernel.org tag v3.10.67.
It also contains changes to allow forbidden warnings introduced in
the commit 'core, nfqueue, openvswitch: Orphan frags in skb_zerocopy
and handle errors'. Once upstream has corrected these warnings, the
changes to scripts/gcc-wrapper.py, in this commit, can be reverted.
Signed-off-by: Ian Maund <imaund@codeaurora.org>
2015-05-01 22:28:10 +02:00
|
|
|
printk_deferred("%s", buf);
|
2014-06-19 23:23:33 +02:00
|
|
|
#endif
|
2014-06-18 06:43:35 +02:00
|
|
|
}
|
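For reference, the tasks dump_throttled_rt_tasks() lists as "potential CPU hogs" are simply real-time tasks that never block. A minimal, illustrative reproducer (not part of rt.c; needs root or CAP_SYS_NICE, and will pin a CPU until the throttle kicks in):

#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 50 };

	if (sched_setscheduler(0, SCHED_FIFO, &sp)) {
		perror("sched_setscheduler");
		return 1;
	}
	for (;;)
		;	/* never yields: exhausts the rt_rq's runtime */
}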
|
|
|
|
2008-02-13 15:45:39 +01:00
|
|
|
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
|
2008-01-25 21:08:30 +01:00
|
|
|
{
|
2008-02-13 15:45:39 +01:00
|
|
|
u64 runtime = sched_rt_runtime(rt_rq);
|
2008-01-25 21:08:29 +01:00
|
|
|
|
|
|
|
if (rt_rq->rt_throttled)
|
2008-02-13 15:45:39 +01:00
|
|
|
return rt_rq_throttled(rt_rq);
|
2008-01-25 21:08:29 +01:00
|
|
|
|
2011-11-29 04:03:56 +01:00
|
|
|
if (runtime >= sched_rt_period(rt_rq))
|
2008-04-19 19:44:58 +02:00
|
|
|
return 0;
|
|
|
|
|
2008-06-19 14:22:25 +02:00
|
|
|
balance_runtime(rt_rq);
|
|
|
|
runtime = sched_rt_runtime(rt_rq);
|
|
|
|
if (runtime == RUNTIME_INF)
|
|
|
|
return 0;
|
2008-04-19 19:44:58 +02:00
|
|
|
|
2008-02-13 15:45:39 +01:00
|
|
|
if (rt_rq->rt_time > runtime) {
|
2011-10-18 22:03:48 +02:00
|
|
|
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Don't actually throttle groups that have no runtime assigned
|
|
|
|
* but accrue some time due to boosting.
|
|
|
|
*/
|
|
|
|
if (likely(rt_b->rt_runtime)) {
|
2012-02-27 10:47:00 +01:00
|
|
|
static bool once = false;
|
|
|
|
|
2011-10-18 22:03:48 +02:00
|
|
|
rt_rq->rt_throttled = 1;
|
2012-02-27 10:47:00 +01:00
|
|
|
|
|
|
|
if (!once) {
|
|
|
|
once = true;
|
2014-06-18 06:43:35 +02:00
|
|
|
dump_throttled_rt_tasks(rt_rq);
|
2012-02-27 10:47:00 +01:00
|
|
|
}
|
2011-10-18 22:03:48 +02:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* In case we did anyway, make it go away,
|
|
|
|
* replenishment is a joke, since it will replenish us
|
|
|
|
* with exactly 0 ns.
|
|
|
|
*/
|
|
|
|
rt_rq->rt_time = 0;
|
|
|
|
}
|
|
|
|
|
2008-02-13 15:45:39 +01:00
|
|
|
if (rt_rq_throttled(rt_rq)) {
|
2008-02-13 15:45:39 +01:00
|
|
|
sched_rt_rq_dequeue(rt_rq);
|
2008-02-13 15:45:39 +01:00
|
|
|
return 1;
|
|
|
|
}
|
2008-01-25 21:08:29 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
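For context, the budget checked above comes from the RT bandwidth knobs: on a stock kernel the period defaults to 1,000,000 us and the runtime to 950,000 us, exposed as the sched_rt_period_us and sched_rt_runtime_us sysctls, so the RT classes may burn at most 95% of each period before being throttled here. A minimal userspace sketch (not part of rt.c) for reading those two knobs:

#include <stdio.h>

static long read_sysctl_long(const char *path)
{
	FILE *f = fopen(path, "r");
	long val = -1;

	if (f) {
		if (fscanf(f, "%ld", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}

int main(void)
{
	long runtime_us = read_sysctl_long("/proc/sys/kernel/sched_rt_runtime_us");
	long period_us  = read_sysctl_long("/proc/sys/kernel/sched_rt_period_us");

	/* With the defaults (950000 of 1000000 us) the RT classes may consume
	 * at most 95% of every period before sched_rt_runtime_exceeded()
	 * throttles them; a runtime of -1 means RUNTIME_INF, i.e. no throttling. */
	if (period_us > 0 && runtime_us >= 0)
		printf("RT budget: %ld us of every %ld us (%.1f%%)\n",
		       runtime_us, period_us, 100.0 * runtime_us / period_us);
	else
		printf("RT throttling disabled or sysctls unavailable\n");
	return 0;
}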
|
|
|
|
|
2007-07-09 18:51:58 +02:00
|
|
|
/*
|
|
|
|
* Update the current task's runtime statistics. Skip current tasks that
|
|
|
|
* are not in our scheduling class.
|
|
|
|
*/
|
2007-10-15 17:00:13 +02:00
|
|
|
static void update_curr_rt(struct rq *rq)
|
2007-07-09 18:51:58 +02:00
|
|
|
{
|
|
|
|
struct task_struct *curr = rq->curr;
|
2008-01-25 21:08:30 +01:00
|
|
|
struct sched_rt_entity *rt_se = &curr->rt;
|
|
|
|
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
|
2007-07-09 18:51:58 +02:00
|
|
|
u64 delta_exec;
|
|
|
|
|
2011-02-02 13:19:48 +01:00
|
|
|
if (curr->sched_class != &rt_sched_class)
|
2007-07-09 18:51:58 +02:00
|
|
|
return;
|
|
|
|
|
2010-10-05 02:03:21 +02:00
|
|
|
delta_exec = rq->clock_task - curr->se.exec_start;
|
2013-01-30 13:50:36 +01:00
|
|
|
if (unlikely((s64)delta_exec <= 0))
|
|
|
|
return;
|
2007-08-02 17:41:40 +02:00
|
|
|
|
2011-10-18 22:03:48 +02:00
|
|
|
schedstat_set(curr->se.statistics.exec_max,
|
|
|
|
max(curr->se.statistics.exec_max, delta_exec));
|
2007-07-09 18:51:58 +02:00
|
|
|
|
|
|
|
curr->se.sum_exec_runtime += delta_exec;
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. Those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-12 18:54:39 +02:00
|
|
|
account_group_exec_runtime(curr, delta_exec);
|
|
|
|
|
2010-10-05 02:03:21 +02:00
|
|
|
curr->se.exec_start = rq->clock_task;
|
2007-12-02 20:04:49 +01:00
|
|
|
cpuacct_charge(curr, delta_exec);
|
2008-01-25 21:08:29 +01:00
|
|
|
|
2009-09-01 10:34:37 +02:00
|
|
|
sched_rt_avg_update(rq, delta_exec);
|
|
|
|
|
2008-08-19 12:33:04 +02:00
|
|
|
if (!rt_bandwidth_enabled())
|
|
|
|
return;
|
|
|
|
|
2008-04-19 19:44:59 +02:00
|
|
|
for_each_sched_rt_entity(rt_se) {
|
|
|
|
rt_rq = rt_rq_of_se(rt_se);
|
|
|
|
|
2008-08-19 12:33:03 +02:00
|
|
|
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
|
2009-11-17 15:32:06 +01:00
|
|
|
raw_spin_lock(&rt_rq->rt_runtime_lock);
|
2008-08-19 12:33:03 +02:00
|
|
|
rt_rq->rt_time += delta_exec;
|
|
|
|
if (sched_rt_runtime_exceeded(rt_rq))
|
|
|
|
resched_task(curr);
|
2009-11-17 15:32:06 +01:00
|
|
|
raw_spin_unlock(&rt_rq->rt_runtime_lock);
|
2008-08-19 12:33:03 +02:00
|
|
|
}
|
2008-04-19 19:44:59 +02:00
|
|
|
}
|
2007-07-09 18:51:58 +02:00
|
|
|
}
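The account_group_exec_runtime() call in update_curr_rt() above feeds the per-process accounting described in the "timers: fix itimer/many thread hang" changelog quoted earlier. Below is a kernel-context sketch of the SMP idea only (each CPU bumps its own per-cpu task_cputime slot); the sig->cputime.totals field name is an assumption for illustration and may not match the exact layout of that era.

/*
 * Sketch, not verbatim kernel code: per-CPU group runtime accounting as
 * described in the itimer changelog. Each CPU updates only the slot for
 * the CPU it is running on; readers sum the online CPUs' slots when a
 * process-wide snapshot is needed.
 */
static inline void account_group_exec_runtime_sketch(struct task_struct *tsk,
						     unsigned long long ns)
{
	struct signal_struct *sig = tsk->signal;	/* may be NULL late in exit */

	if (unlikely(!sig) || !sig->cputime.totals)	/* totals: assumed per-cpu array */
		return;

	per_cpu_ptr(sig->cputime.totals, smp_processor_id())->sum_exec_runtime += ns;
}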
|
|
|
|
|
2009-01-14 15:10:04 +01:00
|
|
|
#if defined CONFIG_SMP
|
2008-12-29 15:39:49 +01:00
|
|
|
|
2009-01-14 15:10:04 +01:00
|
|
|
static void
|
|
|
|
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
|
2008-01-25 21:08:03 +01:00
|
|
|
{
|
2008-12-29 15:39:49 +01:00
|
|
|
struct rq *rq = rq_of_rt_rq(rt_rq);
|
2008-06-04 21:04:05 +02:00
|
|
|
|
2013-11-27 16:59:13 +01:00
|
|
|
#ifdef CONFIG_RT_GROUP_SCHED
|
|
|
|
/*
|
|
|
|
* Change rq's cpupri only if rt_rq is the top queue.
|
|
|
|
*/
|
|
|
|
if (&rq->rt != rt_rq)
|
|
|
|
return;
|
|
|
|
#endif
|
sched: Use pushable_tasks to determine next highest prio
Hillf Danton proposed a patch (see link) that cleaned up the
sched_rt code that calculates the priority of the next highest priority
task to be used in finding run queues to pull from.
His patch removed the calculation of the next prio and just used the current
prio when determining if we should examine a run queue to pull from. The problem
with his patch was that it caused more false checks. Because we check a run
queue for pushable tasks if the current priority of that run queue is higher
in priority than the task about to run on our run queue. But after grabbing
the locks and doing the real check, we find that there may not actually be a
higher prio task to pull. Thus the locks were taken with nothing to do.
I added some trace_printks() to record when and how many times the run queue
locks were taken to check for pullable tasks, compared to how many times we
pulled a task.
With the current method, it was:
3806 locks taken vs 2812 pulled tasks
With Hillf's patch:
6728 locks taken vs 2804 pulled tasks
The number of times locks were taken to pull a task nearly doubled, with
no improvement in success rate.
But his patch did get me thinking. When we look at the priority of the highest
task to consider taking the locks to do a pull, a failure to pull can be one
of the following: (in order of most likely)
o RT task was pushed off already between the check and taking the lock
o Waiting RT task can not be migrated
o RT task's CPU affinity does not include the target run queue's CPU
o RT task's priority changed between the check and taking the lock
And with Hillf's patch, the thing that caused most of the failures was that
the RT task to pull was not at the right priority to pull (not greater than
the current RT task priority on the target run queue).
Most of the above cases we can't help. But the current method does not check
if the next highest prio RT task can be migrated or not, and if it can not,
we still grab the locks to do the test (we don't find out about this fact until
after we have the locks). I thought about this case, and realized that the
pushable task plist that is maintained only holds RT tasks that can migrate.
If we move the calculation of the next highest prio task from the inc/dec_rt_task()
functions into the queuing of the pushable tasks, then we only measure the
priorities of those tasks that we push, and we get this basically for free.
Not only does this patch make the code a little more efficient, it cleans it
up and makes it a little simpler.
Thanks to Hillf Danton for inspiring me on this patch.
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Gregory Haskins <ghaskins@novell.com>
Link: http://lkml.kernel.org/r/BANLkTimQ67180HxCx5vgMqumqw1EkFh3qg@mail.gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-06-17 03:55:23 +02:00
|
|
|
if (rq->online && prio < prev_prio)
|
|
|
|
cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
|
2009-01-14 15:10:04 +01:00
|
|
|
}
|
2008-01-25 21:08:07 +01:00
|
|
|
|
2009-01-14 15:10:04 +01:00
|
|
|
static void
|
|
|
|
dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
|
|
|
|
{
|
|
|
|
struct rq *rq = rq_of_rt_rq(rt_rq);
|
2008-04-19 19:44:57 +02:00
|
|
|
|
2013-11-27 16:59:13 +01:00
|
|
|
#ifdef CONFIG_RT_GROUP_SCHED
|
|
|
|
/*
|
|
|
|
* Change rq's cpupri only if rt_rq is the top queue.
|
|
|
|
*/
|
|
|
|
if (&rq->rt != rt_rq)
|
|
|
|
return;
|
|
|
|
#endif
|
2009-01-14 15:10:04 +01:00
|
|
|
if (rq->online && rt_rq->highest_prio.curr != prev_prio)
|
|
|
|
cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
|
2008-01-25 21:08:03 +01:00
|
|
|
}
|
|
|
|
|
2009-01-14 15:10:04 +01:00
|
|
|
#else /* CONFIG_SMP */
|
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
static inline
|
2009-01-14 15:10:04 +01:00
|
|
|
void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
|
|
|
|
static inline
|
|
|
|
void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
|
|
|
|
|
|
|
|
#endif /* CONFIG_SMP */
|
2008-05-12 21:21:01 +02:00
|
|
|
|
2008-02-13 15:45:40 +01:00
|
|
|
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
|
2009-01-14 15:10:04 +01:00
|
|
|
static void
|
|
|
|
inc_rt_prio(struct rt_rq *rt_rq, int prio)
|
|
|
|
{
|
|
|
|
int prev_prio = rt_rq->highest_prio.curr;
|
|
|
|
|
|
|
|
if (prio < prev_prio)
|
|
|
|
rt_rq->highest_prio.curr = prio;
|
|
|
|
|
|
|
|
inc_rt_prio_smp(rt_rq, prio, prev_prio);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dec_rt_prio(struct rt_rq *rt_rq, int prio)
|
|
|
|
{
|
|
|
|
int prev_prio = rt_rq->highest_prio.curr;
|
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
if (rt_rq->rt_nr_running) {
|
2008-01-25 21:08:04 +01:00
|
|
|
|
2009-01-14 15:10:04 +01:00
|
|
|
WARN_ON(prio < prev_prio);
|
2008-01-25 21:08:04 +01:00
|
|
|
|
2008-12-29 15:39:49 +01:00
|
|
|
/*
|
2009-01-14 15:10:04 +01:00
|
|
|
* This may have been our highest task, and therefore
|
|
|
|
* we may have some recomputation to do
|
2008-12-29 15:39:49 +01:00
|
|
|
*/
|
2009-01-14 15:10:04 +01:00
|
|
|
if (prio == prev_prio) {
|
2008-12-29 15:39:49 +01:00
|
|
|
struct rt_prio_array *array = &rt_rq->active;
|
|
|
|
|
|
|
|
rt_rq->highest_prio.curr =
|
2008-01-25 21:08:04 +01:00
|
|
|
sched_find_first_bit(array->bitmap);
|
2008-12-29 15:39:49 +01:00
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:04 +01:00
|
|
|
} else
|
2008-12-29 15:39:49 +01:00
|
|
|
rt_rq->highest_prio.curr = MAX_RT_PRIO;
|
2008-01-25 21:08:07 +01:00
|
|
|
|
2009-01-14 15:10:04 +01:00
|
|
|
dec_rt_prio_smp(rt_rq, prio, prev_prio);
|
|
|
|
}
|
2008-06-04 21:04:05 +02:00
|
|
|
|
2009-01-14 15:10:04 +01:00
|
|
|
#else
|
|
|
|
|
|
|
|
static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
|
|
|
|
static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
|
|
|
|
|
|
|
|
#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
|
2008-05-12 21:21:01 +02:00
|
|
|
|
2008-02-13 15:45:40 +01:00
|
|
|
#ifdef CONFIG_RT_GROUP_SCHED
|
2009-01-14 15:10:04 +01:00
|
|
|
|
|
|
|
static void
|
|
|
|
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
|
|
|
|
{
|
|
|
|
if (rt_se_boosted(rt_se))
|
|
|
|
rt_rq->rt_nr_boosted++;
|
|
|
|
|
|
|
|
if (rt_rq->tg)
|
|
|
|
start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
|
|
|
|
{
|
2008-02-13 15:45:39 +01:00
|
|
|
if (rt_se_boosted(rt_se))
|
|
|
|
rt_rq->rt_nr_boosted--;
|
|
|
|
|
|
|
|
WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
|
2009-01-14 15:10:04 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
#else /* CONFIG_RT_GROUP_SCHED */
|
|
|
|
|
|
|
|
static void
|
|
|
|
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
|
|
|
|
{
|
|
|
|
start_rt_bandwidth(&def_rt_bandwidth);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline
|
|
|
|
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
|
|
|
|
|
|
|
|
#endif /* CONFIG_RT_GROUP_SCHED */
|
|
|
|
|
2015-01-16 06:57:31 +01:00
|
|
|
#ifdef CONFIG_SCHED_HMP
|
|
|
|
|
|
|
|
static void
|
|
|
|
inc_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p)
|
|
|
|
{
|
|
|
|
inc_cumulative_runnable_avg(&rq->hmp_stats, p);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dec_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p)
|
|
|
|
{
|
|
|
|
dec_cumulative_runnable_avg(&rq->hmp_stats, p);
|
|
|
|
}
|
|
|
|
|
|
|
|
#else /* CONFIG_SCHED_HMP */
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
inc_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) { }
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
dec_hmp_sched_stats_rt(struct rq *rq, struct task_struct *p) { }
|
|
|
|
|
|
|
|
#endif /* CONFIG_SCHED_HMP */
|
|
|
|
|
2009-01-14 15:10:04 +01:00
|
|
|
static inline
|
|
|
|
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
|
|
|
|
{
|
|
|
|
int prio = rt_se_prio(rt_se);
|
|
|
|
|
|
|
|
WARN_ON(!rt_prio(prio));
|
|
|
|
rt_rq->rt_nr_running++;
|
|
|
|
|
|
|
|
inc_rt_prio(rt_rq, prio);
|
|
|
|
inc_rt_migration(rt_se, rt_rq);
|
|
|
|
inc_rt_group(rt_se, rt_rq);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline
|
|
|
|
void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
|
|
|
|
{
|
|
|
|
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
|
|
|
|
WARN_ON(!rt_rq->rt_nr_running);
|
|
|
|
rt_rq->rt_nr_running--;
|
|
|
|
|
|
|
|
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
|
|
|
|
dec_rt_migration(rt_se, rt_rq);
|
|
|
|
dec_rt_group(rt_se, rt_rq);
|
2008-01-25 21:08:03 +01:00
|
|
|
}
|
|
|
|
|
2010-01-20 21:59:01 +01:00
|
|
|
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
|
2007-07-09 18:51:58 +02:00
|
|
|
{
|
2008-01-25 21:08:30 +01:00
|
|
|
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
|
|
|
|
struct rt_prio_array *array = &rt_rq->active;
|
|
|
|
struct rt_rq *group_rq = group_rt_rq(rt_se);
|
sched: rework of "prioritize non-migratable tasks over migratable ones"
regarding this commit: 45c01e824991b2dd0a332e19efc4901acb31209f
I think we can do it simpler. Please take a look at the patch below.
Instead of having 2 separate arrays (which is + ~800 bytes on x86_32 and
twice so on x86_64), let's add "exclusive" (the ones that are bound to
this CPU) tasks to the head of the queue and "shared" ones -- to the
end.
In case of a few newly woken up "exclusive" tasks, they are 'stacked'
(not queued as now), meaning that a task {i+1} is being placed in front
of the previously woken up task {i}. But I don't think that this
behavior may cause any realistic problems.
There are a couple of changes on top of this one.
(1) in check_preempt_curr_rt()
I don't think there is a need for the "pick_next_rt_entity(rq, &rq->rt)
!= &rq->curr->rt" check.
enqueue_task_rt(p) and check_preempt_curr_rt() are always called one
after another with rq->lock being held so the following check
"p->rt.nr_cpus_allowed == 1 && rq->curr->rt.nr_cpus_allowed != 1" should
be enough (well, just its left part) to guarantee that 'p' has been
queued in front of the 'curr'.
(2) in set_cpus_allowed_rt()
I don't think there is a need for requeue_task_rt() here.
Perhaps, the only case when 'requeue' (+ reschedule) might be useful is
as follows:
i) weight == 1 && cpu_isset(task_cpu(p), *new_mask)
i.e. a task is being bound to this CPU);
ii) 'p' != rq->curr
but here, 'p' has already been on this CPU for a while and was not
migrated. i.e. it's possible that 'rq->curr' would not have high chances
to be migrated right at this particular moment (although, has chance in
a bit longer term), should we allow it to be preempted.
Anyway, I think we should perhaps not make it more complex by trying to
address some rare corner cases. For instance, that's why a single queue
approach would be preferable. Unless I'm missing something obvious, this
approach gives us similar functionality at lower cost.
Verified only compilation-wise.
(Almost)-Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-06-11 00:58:30 +02:00
|
|
|
struct list_head *queue = array->queue + rt_se_prio(rt_se);
|
2007-07-09 18:51:58 +02:00
|
|
|
|
2008-06-19 09:06:57 +02:00
|
|
|
/*
|
|
|
|
* Don't enqueue the group if it's throttled, or when empty.
|
|
|
|
* The latter is a consequence of the former when a child group
|
|
|
|
* gets throttled and the current group doesn't have any other
|
|
|
|
* active members.
|
|
|
|
*/
|
|
|
|
if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
|
2008-01-25 21:08:30 +01:00
|
|
|
return;
|
2008-01-25 21:08:03 +01:00
|
|
|
|
2010-11-16 00:47:01 +01:00
|
|
|
if (!rt_rq->rt_nr_running)
|
|
|
|
list_add_leaf_rt_rq(rt_rq);
|
|
|
|
|
2010-01-20 21:59:01 +01:00
|
|
|
if (head)
|
|
|
|
list_add(&rt_se->run_list, queue);
|
|
|
|
else
|
|
|
|
list_add_tail(&rt_se->run_list, queue);
|
2008-01-25 21:08:30 +01:00
|
|
|
__set_bit(rt_se_prio(rt_se), array->bitmap);
|
2008-01-25 21:08:27 +01:00
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
inc_rt_tasks(rt_se, rt_rq);
|
|
|
|
}
|
|
|
|
|
2008-06-19 09:06:57 +02:00
|
|
|
static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
|
2008-01-25 21:08:30 +01:00
|
|
|
{
|
|
|
|
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
|
|
|
|
struct rt_prio_array *array = &rt_rq->active;
|
|
|
|
|
|
|
|
list_del_init(&rt_se->run_list);
|
|
|
|
if (list_empty(array->queue + rt_se_prio(rt_se)))
|
|
|
|
__clear_bit(rt_se_prio(rt_se), array->bitmap);
|
|
|
|
|
|
|
|
dec_rt_tasks(rt_se, rt_rq);
|
2010-11-16 00:47:01 +01:00
|
|
|
if (!rt_rq->rt_nr_running)
|
|
|
|
list_del_leaf_rt_rq(rt_rq);
|
2008-01-25 21:08:30 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Because the prio of an upper entry depends on the lower
|
|
|
|
* entries, we must remove entries top-down.
|
|
|
|
*/
|
2008-06-19 09:06:57 +02:00
|
|
|
static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
|
2008-01-25 21:08:30 +01:00
|
|
|
{
|
2008-06-19 09:06:57 +02:00
|
|
|
struct sched_rt_entity *back = NULL;
|
2008-01-25 21:08:30 +01:00
|
|
|
|
2008-04-19 19:45:00 +02:00
|
|
|
for_each_sched_rt_entity(rt_se) {
|
|
|
|
rt_se->back = back;
|
|
|
|
back = rt_se;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (rt_se = back; rt_se; rt_se = rt_se->back) {
|
|
|
|
if (on_rt_rq(rt_se))
|
2008-06-19 09:06:57 +02:00
|
|
|
__dequeue_rt_entity(rt_se);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-01-20 21:59:01 +01:00
|
|
|
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
|
2008-06-19 09:06:57 +02:00
|
|
|
{
|
|
|
|
dequeue_rt_stack(rt_se);
|
|
|
|
for_each_sched_rt_entity(rt_se)
|
2010-01-20 21:59:01 +01:00
|
|
|
__enqueue_rt_entity(rt_se, head);
|
2008-06-19 09:06:57 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
|
|
|
|
{
|
|
|
|
dequeue_rt_stack(rt_se);
|
|
|
|
|
|
|
|
for_each_sched_rt_entity(rt_se) {
|
|
|
|
struct rt_rq *rt_rq = group_rt_rq(rt_se);
|
|
|
|
|
|
|
|
if (rt_rq && rt_rq->rt_nr_running)
|
2010-01-20 21:59:01 +01:00
|
|
|
__enqueue_rt_entity(rt_se, false);
|
2008-04-19 19:45:00 +02:00
|
|
|
}
|
2007-07-09 18:51:58 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Adding/removing a task to/from a priority array:
|
|
|
|
*/
|
2010-01-20 21:58:57 +01:00
|
|
|
static void
|
2010-03-24 16:38:48 +01:00
|
|
|
enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
|
2008-01-25 21:08:30 +01:00
|
|
|
{
|
|
|
|
struct sched_rt_entity *rt_se = &p->rt;
|
|
|
|
|
2010-03-24 16:38:48 +01:00
|
|
|
if (flags & ENQUEUE_WAKEUP)
|
2008-01-25 21:08:30 +01:00
|
|
|
rt_se->timeout = 0;
|
|
|
|
|
2010-03-24 16:38:48 +01:00
|
|
|
enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
|
2008-06-27 13:41:14 +02:00
|
|
|
|
2012-04-23 12:11:21 +02:00
|
|
|
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
|
sched: create "pushable_tasks" list to limit pushing to one attempt
The RT scheduler employs a "push/pull" design to actively balance tasks
within the system (on a per disjoint cpuset basis). When a task is
awoken, it is immediately determined if there are any lower priority
cpus which should be preempted. This is opposed to the way normal
SCHED_OTHER tasks behave, which will wait for a periodic rebalancing
operation to occur before spreading out load.
When a particular RQ has more than 1 active RT task, it is said to
be in an "overloaded" state. Once this occurs, the system enters
the active balancing mode, where it will try to push the task away,
or persuade a different cpu to pull it over. The system will stay
in this state until the system falls back below the <= 1 queued RT
task per RQ.
However, the current implementation suffers from a limitation in the
push logic. Once overloaded, all tasks (other than current) on the
RQ are analyzed on every push operation, even if it was previously
unpushable (due to affinity, etc). What's more, the operation stops
at the first task that is unpushable and will not look at items
lower in the queue. This causes two problems:
1) We can have the same tasks analyzed over and over again during each
push, which extends out the fast path in the scheduler for no
gain. Consider a RQ that has dozens of tasks that are bound to a
core. Each one of those tasks will be encountered and skipped
for each push operation while they are queued.
2) There may be lower-priority tasks under the unpushable task that
could have been successfully pushed, but will never be considered
until either the unpushable task is cleared, or a pull operation
succeeds. The net result is a potential latency source for mid
priority tasks.
This patch aims to rectify these two conditions by introducing a new
priority sorted list: "pushable_tasks". A task is added to the list
each time a task is activated or preempted. It is removed from the
list any time it is deactivated, made current, or fails to push.
This works because a task only needs to be attempted to push once.
After an initial failure to push, the other cpus will eventually try to
pull the task when the conditions are proper. This also solves the
problem that we don't completely analyze all tasks due to encountering
an unpushable task. Now every task will have a push attempted (when
appropriate).
This reduces latency both by shortening the critical section of the
rq->lock for certain workloads, and by making sure the algorithm
considers all eligible tasks in the system.
[ rostedt: added a couple more BUG_ONs ]
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
2008-12-29 15:39:53 +01:00
|
|
|
enqueue_pushable_task(rq, p);
|
2011-07-21 18:43:27 +02:00
|
|
|
|
|
|
|
inc_nr_running(rq);
|
2015-01-16 06:57:31 +01:00
|
|
|
inc_hmp_sched_stats_rt(rq, p);
|
2008-01-25 21:08:30 +01:00
|
|
|
}
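For reference, a sketch of what the enqueue_pushable_task() call above does with the priority-sorted list introduced by the "pushable_tasks" changelog quoted earlier; the task-side node name p->pushable_tasks (mirroring the rt_rq's pushable_tasks plist_head) is an assumption here, and the real helper may differ in detail.

/*
 * Sketch, not verbatim rt.c code: (re)insert a migratable task into the
 * priority-sorted pushable list so the push logic can pick the highest
 * priority pushable task without rescanning the whole run queue.
 */
static void enqueue_pushable_task_sketch(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
	plist_node_init(&p->pushable_tasks, p->prio);
	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
}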
|
|
|
|
|
2010-03-24 16:38:48 +01:00
|
|
|
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
|
2007-07-09 18:51:58 +02:00
|
|
|
{
|
2008-01-25 21:08:30 +01:00
|
|
|
struct sched_rt_entity *rt_se = &p->rt;
|
2007-07-09 18:51:58 +02:00
|
|
|
|
2007-08-09 11:16:48 +02:00
|
|
|
update_curr_rt(rq);
|
2008-06-19 09:06:57 +02:00
|
|
|
dequeue_rt_entity(rt_se);
|
2008-06-27 13:41:14 +02:00
|
|
|
|
sched: create "pushable_tasks" list to limit pushing to one attempt
2008-12-29 15:39:53 +01:00
|
|
|
dequeue_pushable_task(rq, p);
|
2011-07-21 18:43:27 +02:00
|
|
|
|
|
|
|
dec_nr_running(rq);
|
2015-01-16 06:57:31 +01:00
|
|
|
dec_hmp_sched_stats_rt(rq, p);
|
2007-07-09 18:51:58 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2011-11-12 18:07:57 +01:00
|
|
|
* Put the task at the head or the end of the run list without the overhead of
|
|
|
|
* dequeue followed by enqueue.
|
2007-07-09 18:51:58 +02:00
|
|
|
*/
|
2008-07-01 23:32:15 +02:00
|
|
|
static void
|
|
|
|
requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
|
2008-01-25 21:08:30 +01:00
|
|
|
{
|
2008-06-19 09:09:15 +02:00
|
|
|
if (on_rt_rq(rt_se)) {
|
2008-07-01 23:32:15 +02:00
|
|
|
struct rt_prio_array *array = &rt_rq->active;
|
|
|
|
struct list_head *queue = array->queue + rt_se_prio(rt_se);
|
|
|
|
|
|
|
|
if (head)
|
|
|
|
list_move(&rt_se->run_list, queue);
|
|
|
|
else
|
|
|
|
list_move_tail(&rt_se->run_list, queue);
|
2008-06-19 09:09:15 +02:00
|
|
|
}
|
2008-01-25 21:08:30 +01:00
|
|
|
}
|
|
|
|
|
2008-07-01 23:32:15 +02:00
|
|
|
static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
|
2007-07-09 18:51:58 +02:00
|
|
|
{
|
2008-01-25 21:08:30 +01:00
|
|
|
struct sched_rt_entity *rt_se = &p->rt;
|
|
|
|
struct rt_rq *rt_rq;
|
2007-07-09 18:51:58 +02:00
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
for_each_sched_rt_entity(rt_se) {
|
|
|
|
rt_rq = rt_rq_of_se(rt_se);
|
2008-07-01 23:32:15 +02:00
|
|
|
requeue_rt_entity(rt_rq, rt_se, head);
|
2008-01-25 21:08:30 +01:00
|
|
|
}
|
2007-07-09 18:51:58 +02:00
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
static void yield_task_rt(struct rq *rq)
|
2007-07-09 18:51:58 +02:00
|
|
|
{
|
2008-07-01 23:32:15 +02:00
|
|
|
requeue_task_rt(rq, rq->curr, 0);
|
2007-07-09 18:51:58 +02:00
|
|
|
}
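yield_task_rt() above is what a userspace sched_yield() from a SCHED_FIFO or SCHED_RR task lands in: the caller is requeued at the tail of its own priority list, so it only gives way to runnable tasks of the same RT priority. A trivial userspace illustration (not part of rt.c):

#include <sched.h>

/* Give other runnable RT tasks of the same priority a turn; if none are
 * queued at this priority the caller simply keeps the CPU. */
void rt_give_way(void)
{
	sched_yield();
}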
|
|
|
|
|
2008-01-25 21:08:09 +01:00
|
|
|
#ifdef CONFIG_SMP
|
2008-01-25 21:08:10 +01:00
|
|
|
static int find_lowest_rq(struct task_struct *task);
|
|
|
|
|
2014-04-24 20:10:44 +02:00
|
|
|
static int
|
2014-07-21 11:05:24 +02:00
|
|
|
select_task_rq_rt_hmp(struct task_struct *p, int sd_flag, int flags)
|
2014-04-24 20:10:44 +02:00
|
|
|
{
|
|
|
|
int cpu, target;
|
|
|
|
|
|
|
|
cpu = task_cpu(p);
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
target = find_lowest_rq(p);
|
|
|
|
if (target != -1)
|
|
|
|
cpu = target;
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
return cpu;
|
|
|
|
}
|
|
|
|
|
2010-03-24 18:34:10 +01:00
|
|
|
static int
|
2011-04-05 17:23:46 +02:00
|
|
|
select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
|
2008-01-25 21:08:09 +01:00
|
|
|
{
|
2011-04-05 17:23:46 +02:00
|
|
|
struct task_struct *curr;
|
|
|
|
struct rq *rq;
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
cpu = task_cpu(p);
|
2011-06-17 03:55:22 +02:00
|
|
|
|
2012-04-23 12:11:21 +02:00
|
|
|
if (p->nr_cpus_allowed == 1)
|
2011-11-22 15:18:24 +01:00
|
|
|
goto out;
|
|
|
|
|
2014-07-21 17:24:04 +02:00
|
|
|
if (sched_enable_hmp)
|
2014-07-21 11:05:24 +02:00
|
|
|
return select_task_rq_rt_hmp(p, sd_flag, flags);
|
|
|
|
|
2011-06-17 03:55:22 +02:00
|
|
|
/* For anything but wake ups, just return the task_cpu */
|
|
|
|
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
|
|
|
|
goto out;
|
|
|
|
|
2011-04-05 17:23:46 +02:00
|
|
|
rq = cpu_rq(cpu);
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
curr = ACCESS_ONCE(rq->curr); /* unlocked access */
|
|
|
|
|
2008-01-25 21:08:10 +01:00
|
|
|
/*
|
2011-04-05 17:23:46 +02:00
|
|
|
* If the current task on @p's runqueue is an RT task, then
|
2008-01-25 21:08:12 +01:00
|
|
|
* try to see if we can wake this RT task up on another
|
|
|
|
* runqueue. Otherwise simply start this RT task
|
|
|
|
* on its current runqueue.
|
|
|
|
*
|
sched: Try not to migrate higher priority RT tasks
When first working on the RT scheduler design, we concentrated on
keeping all CPUs running RT tasks instead of having multiple RT
tasks on a single CPU waiting for the migration thread to move
them. Instead we take a more proactive stance and push or pull RT
tasks from one CPU to another on wakeup or scheduling.
When an RT task wakes up on a CPU that is running another RT task,
instead of preempting it and killing the cache of the running RT
task, we look to see if we can migrate the RT task that is waking
up, even if the RT task waking up is of higher priority.
This may sound a bit odd, but RT tasks should be limited in
migration by the user anyway. But in practice, people do not do
this, which causes high prio RT tasks to bounce around the CPUs.
This becomes even worse when we have priority inheritance, because
a high prio task can block on a lower prio task and boost its
priority. When the lower prio task wakes up the high prio task, if
it happens to be on the same CPU it will migrate off of it.
But in reality, the above does not happen much either, because when
the lower prio task, which has already been boosted, wakes up on the
same CPU as the higher prio task, it would then migrate off of it.
Either way, we do not want to migrate them.
To examine the scheduling, I created a test program and examined it
under kernelshark. The test program created CPU * 2 threads, where
each thread had a different priority. The program takes different
options. The options used in this change log was to have priority
inheritance mutexes or not.
All threads did the following loop:
static void grab_lock(long id, int iter, int l)
{
ftrace_write("thread %ld iter %d, taking lock %d\n",
id, iter, l);
pthread_mutex_lock(&locks[l]);
ftrace_write("thread %ld iter %d, took lock %d\n",
id, iter, l);
busy_loop(nr_tasks - id);
ftrace_write("thread %ld iter %d, unlock lock %d\n",
id, iter, l);
pthread_mutex_unlock(&locks[l]);
}
void *start_task(void *id)
{
[...]
while (!done) {
for (l = 0; l < nr_locks; l++) {
grab_lock(id, i, l);
ftrace_write("thread %ld iter %d sleeping\n",
id, i);
ms_sleep(id);
}
i++;
}
[...]
}
The busy_loop(ms) keeps the CPU spinning for ms milliseconds. The
ms_sleep(ms) sleeps for ms milliseconds. The ftrace_write() writes
to the ftrace buffer to help analyze via ftrace.
The higher the id, the higher the prio, the shorter it does the
busy loop, but the longer it spins. This is usually the case with
RT tasks, the lower priority tasks usually run longer than higher
priority tasks.
At the end of the test, it records the number of loops each thread
took, as well as the number of voluntary preemptions, non-voluntary
preemptions, and number of migrations each thread took, taking the
information from /proc/$$/sched and /proc/$$/status.
Running this on a 4 CPU processor, the results without changes to
the kernel looked like this:
Task vol nonvol migrated iterations
---- --- ------ -------- ----------
0: 53 3220 1470 98
1: 562 773 724 98
2: 752 933 1375 98
3: 749 39 697 98
4: 758 5 515 98
5: 764 2 679 99
6: 761 2 535 99
7: 757 3 346 99
total: 5156 4977 6341 787
Each thread regardless of priority migrated a few hundred times.
The higher priority tasks, were a little better but still took
quite an impact.
By letting higher priority tasks bump the lower prio task from the
CPU, things changed a bit:
Task vol nonvol migrated iterations
---- --- ------ -------- ----------
0: 37 2835 1937 98
1: 666 1821 1865 98
2: 654 1003 1385 98
3: 664 635 973 99
4: 698 197 352 99
5: 703 101 159 99
6: 708 1 75 99
7: 713 1 2 99
total: 4843 6594 6748 789
The total # of migrations did not change (several runs showed the
difference all within the noise). But we now see a dramatic
improvement to the higher priority tasks. (kernelshark showed that
the watchdog timer bumped the highest priority task to give it the
2 count. This was actually consistent with every run).
Notice that the # of iterations did not change either.
The above was with priority inheritance mutexes. That is, when the
higher priority task blocked on a lower priority task, the lower
priority task would inherit the higher priority task (which shows
why task 6 was bumped so many times). When not using priority
inheritance mutexes, the current kernel shows this:
Task vol nonvol migrated iterations
---- --- ------ -------- ----------
0: 56 3101 1892 95
1: 594 713 937 95
2: 625 188 618 95
3: 628 4 491 96
4: 640 7 468 96
5: 631 2 501 96
6: 641 1 466 96
7: 643 2 497 96
total: 4458 4018 5870 765
Not much changed with or without priority inheritance mutexes. But
if we let the high priority task bump lower priority tasks on
wakeup we see:
Task vol nonvol migrated iterations
---- --- ------ -------- ----------
0: 115 3439 2782 98
1: 633 1354 1583 99
2: 652 919 1218 99
3: 645 713 934 99
4: 690 3 3 99
5: 694 1 4 99
6: 720 3 4 99
7: 747 0 1 100
Which shows an even bigger change. The big difference between task 3
and task 4 is because we have only 4 CPUs on the machine, causing
the 4 highest prio tasks to always have preference.
Although I did not measure cache misses, and I'm sure there would
be little to measure since the test was not data intensive, I could
imagine large improvements for higher priority tasks when dealing
with lower priority tasks. Thus, I'm satisfied with making the
change and agreeing with what Gregory Haskins argued a few years
ago when we first had this discussion.
One final note. All tasks in the above tests were RT tasks. Any RT
task will always preempt a non RT task that is running on the CPU
the RT task wants to run on.
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Gregory Haskins <ghaskins@novell.com>
LKML-Reference: <20100921024138.605460343@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-09-21 04:40:03 +02:00
|
|
|
* We want to avoid overloading runqueues. If the woken
|
|
|
|
* task is of higher priority, then it will stay on this CPU
|
|
|
|
* and the lower prio task should be moved to another CPU.
|
|
|
|
* Even though this will probably make the lower prio task
|
|
|
|
* lose its cache, we do not want to bounce a higher task
|
|
|
|
* around just because it gave up its CPU, perhaps for a
|
|
|
|
* lock?
|
|
|
|
*
|
|
|
|
* For equal prio tasks, we just let the scheduler sort it out.
|
2011-04-05 17:23:46 +02:00
|
|
|
*
|
|
|
|
* Otherwise, just let it ride on the affined RQ and the
|
|
|
|
* post-schedule router will push the preempted task away
|
|
|
|
*
|
|
|
|
* This test is optimistic; if we get it wrong the load-balancer
|
|
|
|
* will have to sort it out.
|
2008-01-25 21:08:10 +01:00
|
|
|
*/
|
2011-04-05 17:23:46 +02:00
|
|
|
if (curr && unlikely(rt_task(curr)) &&
|
2012-04-23 12:11:21 +02:00
|
|
|
(curr->nr_cpus_allowed < 2 ||
|
2011-09-12 16:28:04 +02:00
|
|
|
curr->prio <= p->prio) &&
|
2012-04-23 12:11:21 +02:00
|
|
|
(p->nr_cpus_allowed > 1)) {
|
2011-04-05 17:23:46 +02:00
|
|
|
int target = find_lowest_rq(p);
|
2008-01-25 21:08:10 +01:00
|
|
|
|
2011-04-05 17:23:46 +02:00
|
|
|
if (target != -1)
|
|
|
|
cpu = target;
|
2008-01-25 21:08:10 +01:00
|
|
|
}
|
2011-04-05 17:23:46 +02:00
|
|
|
rcu_read_unlock();
|
2008-01-25 21:08:10 +01:00
|
|
|
|
2011-06-17 03:55:22 +02:00
|
|
|
out:
|
2011-04-05 17:23:46 +02:00
|
|
|
return cpu;
|
2008-01-25 21:08:09 +01:00
|
|
|
}
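The wakeup placement above leans on users constraining RT tasks with CPU affinity (note the nr_cpus_allowed == 1 early-out). A minimal userspace sketch of doing exactly that, assuming the caller has CAP_SYS_NICE (or root) for the SCHED_FIFO switch; priority 50 is an arbitrary example value and the snippet is not part of rt.c:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 50 };
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);	/* affine the calling thread to CPU 0 only */

	if (sched_setaffinity(0, sizeof(set), &set))
		perror("sched_setaffinity");
	if (sched_setscheduler(0, SCHED_FIFO, &sp))
		perror("sched_setscheduler");

	/*
	 * With nr_cpus_allowed == 1, select_task_rq_rt() above takes the
	 * early out and never tries to migrate this task on wakeup.
	 */
	return 0;
}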
|
2008-07-01 23:32:15 +02:00
|
|
|
|
|
|
|
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
|
|
|
|
{
|
2012-04-23 12:11:21 +02:00
|
|
|
if (rq->curr->nr_cpus_allowed == 1)
|
2008-07-01 23:32:15 +02:00
|
|
|
return;
|
|
|
|
|
2012-04-23 12:11:21 +02:00
|
|
|
if (p->nr_cpus_allowed != 1
|
2009-03-25 05:31:22 +01:00
|
|
|
&& cpupri_find(&rq->rd->cpupri, p, NULL))
|
|
|
|
return;
|
2008-11-24 17:05:13 +01:00
|
|
|
|
2009-03-25 05:31:22 +01:00
|
|
|
if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
|
|
|
|
return;
|
2008-07-01 23:32:15 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* There appear to be other CPUs that can accept
|
|
|
|
* current and none to run 'p', so let's reschedule
|
|
|
|
* to try and push current away:
|
|
|
|
*/
|
|
|
|
requeue_task_rt(rq, p, 1);
|
|
|
|
resched_task(rq->curr);
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:09 +01:00
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
|
2007-07-09 18:51:58 +02:00
|
|
|
/*
|
|
|
|
* Preempt the current task with a newly woken task if needed:
|
|
|
|
*/
|
2009-09-14 19:55:44 +02:00
|
|
|
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
|
2007-07-09 18:51:58 +02:00
|
|
|
{
|
sched: prioritize non-migratable tasks over migratable ones
Dmitry Adamushko pointed out a known flaw in the rt-balancing algorithm
that could allow suboptimal balancing if a non-migratable task gets
queued behind a running migratable one. It is discussed in this thread:
http://lkml.org/lkml/2008/4/22/296
This issue has been further exacerbated by a recent checkin to
sched-devel (git-id 5eee63a5ebc19a870ac40055c0be49457f3a89a3).
From a pure priority standpoint, the run-queue is doing the "right"
thing. Using Dmitry's nomenclature, if T0 is on cpu1 first, and T1
wakes up at equal or lower priority (affined only to cpu1) later, it
*should* wait for T0 to finish. However, in reality that is likely
suboptimal from a system perspective if there are other cores that
could allow T0 and T1 to run concurrently. Since T1 can not migrate,
the only choice for higher concurrency is to try to move T0. This is
not something we addressed in the recent rt-balancing re-work.
This patch tries to enhance the balancing algorithm by accommodating this
scenario. It accomplishes this by incorporating the migratability of a
task into its priority calculation. Within a numerical tsk->prio, a
non-migratable task is logically higher than a migratable one. We
maintain this by introducing a new per-priority queue (xqueue, or
exclusive-queue) for holding non-migratable tasks. The scheduler will
draw from the xqueue over the standard shared-queue (squeue) when
available.
There are several details for utilizing this properly.
1) During task-wake-up, we not only need to check if the priority
preempts the current task, but we also need to check for this
non-migratable condition. Therefore, if a non-migratable task wakes
up and sees an equal priority migratable task already running, it
will attempt to preempt it *if* there is a likelihood that the
current task will find an immediate home.
2) Tasks only get this non-migratable "priority boost" on wake-up. Any
requeuing will result in the non-migratable task being queued to the
end of the shared queue. This is an attempt to prevent the system
from being completely unfair to migratable tasks during things like
SCHED_RR timeslicing.
I am sure this patch introduces potentially "odd" behavior if you
concoct a scenario where a bunch of non-migratable threads could starve
migratable ones given the right pattern. I am not yet convinced that
this is a problem since we are talking about tasks of equal RT priority
anyway, and there never is much in the way of guarantees against
starvation under that scenario anyway. (e.g. you could come up with a
similar scenario with a specific timing environment verses an affinity
environment). I can be convinced otherwise, but for now I think this is
"ok".
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
CC: Dmitry Adamushko <dmitry.adamushko@gmail.com>
CC: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2008-05-12 21:20:41 +02:00
|
|
|
if (p->prio < rq->curr->prio) {
|
2007-07-09 18:51:58 +02:00
|
|
|
resched_task(rq->curr);
|
sched: prioritize non-migratable tasks over migratable ones
2008-05-12 21:20:41 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
/*
|
|
|
|
* If:
|
|
|
|
*
|
|
|
|
* - the newly woken task is of equal priority to the current task
|
|
|
|
* - the newly woken task is non-migratable while current is migratable
|
|
|
|
* - current will be preempted on the next reschedule
|
|
|
|
*
|
|
|
|
* we should check to see if current can readily move to a different
|
|
|
|
* cpu. If so, we will reschedule to allow the push logic to try
|
|
|
|
* to move current somewhere else, making room for our non-migratable
|
|
|
|
* task.
|
|
|
|
*/
|
2011-06-15 00:36:24 +02:00
|
|
|
if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
|
2008-07-01 23:32:15 +02:00
|
|
|
check_preempt_equal_prio(rq, p);
|
sched: prioritize non-migratable tasks over migratable ones
2008-05-12 21:20:41 +02:00
|
|
|
#endif
|
2007-07-09 18:51:58 +02:00
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
|
|
|
|
struct rt_rq *rt_rq)
|
2007-07-09 18:51:58 +02:00
|
|
|
{
|
2008-01-25 21:08:30 +01:00
|
|
|
struct rt_prio_array *array = &rt_rq->active;
|
|
|
|
struct sched_rt_entity *next = NULL;
|
2007-07-09 18:51:58 +02:00
|
|
|
struct list_head *queue;
|
|
|
|
int idx;
|
|
|
|
|
|
|
|
idx = sched_find_first_bit(array->bitmap);
|
2008-01-25 21:08:30 +01:00
|
|
|
BUG_ON(idx >= MAX_RT_PRIO);
|
2007-07-09 18:51:58 +02:00
|
|
|
|
|
|
|
queue = array->queue + idx;
|
2008-01-25 21:08:30 +01:00
|
|
|
next = list_entry(queue->next, struct sched_rt_entity, run_list);
|
2008-01-25 21:08:34 +01:00
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
return next;
|
|
|
|
}
|
2007-07-09 18:51:58 +02:00
|
|
|
|
sched: create "pushable_tasks" list to limit pushing to one attempt
The RT scheduler employs a "push/pull" design to actively balance tasks
within the system (on a per disjoint cpuset basis). When a task is
awoken, it is immediately determined if there are any lower priority
cpus which should be preempted. This is opposed to the way normal
SCHED_OTHER tasks behave, which will wait for a periodic rebalancing
operation to occur before spreading out load.
When a particular RQ has more than 1 active RT task, it is said to
be in an "overloaded" state. Once this occurs, the system enters
the active balancing mode, where it will try to push the task away,
or persuade a different cpu to pull it over. The system will stay
in this state until the system falls back to <= 1 queued RT
task per RQ.
However, the current implementation suffers from a limitation in the
push logic. Once overloaded, all tasks (other than current) on the
RQ are analyzed on every push operation, even if it was previously
unpushable (due to affinity, etc.). What's more, the operation stops
at the first task that is unpushable and will not look at items
lower in the queue. This causes two problems:
1) We can have the same tasks analyzed over and over again during each
push, which extends out the fast path in the scheduler for no
gain. Consider a RQ that has dozens of tasks that are bound to a
core. Each one of those tasks will be encountered and skipped
for each push operation while they are queued.
2) There may be lower-priority tasks under the unpushable task that
could have been successfully pushed, but will never be considered
until either the unpushable task is cleared, or a pull operation
succeeds. The net result is a potential latency source for mid
priority tasks.
This patch aims to rectify these two conditions by introducing a new
priority sorted list: "pushable_tasks". A task is added to the list
each time a task is activated or preempted. It is removed from the
list any time it is deactivated, made current, or fails to push.
This works because a task only needs to be attempted to push once.
After an initial failure to push, the other cpus will eventually try to
pull the task when the conditions are proper. This also solves the
problem that we don't completely analyze all tasks due to encountering
an unpushable task. Now every task will have a push attempted (when
appropriate).
This reduces latency both by shortening the critical section of the
rq->lock for certain workloads, and by making sure the algorithm
considers all eligible tasks in the system.
[ rostedt: added a couple more BUG_ONs ]
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
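A minimal sketch of how such a priority-sorted pushable list can be kept with a plist (illustrative; the _sketch suffix is only there to avoid clashing with the real enqueue/dequeue helpers used elsewhere in this file):
static void enqueue_pushable_task_sketch(struct rq *rq, struct task_struct *p)
{
	/* Re-insert at the slot given by the task's current priority */
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
	plist_node_init(&p->pushable_tasks, p->prio);
	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
}

static void dequeue_pushable_task_sketch(struct rq *rq, struct task_struct *p)
{
	/* Dropped once the task is deactivated, made current, or fails to push */
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
}
With this in place, a push operation can always start from the head of the plist, so each queued task is attempted at most once rather than being re-scanned on every push.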
2008-12-29 15:39:53 +01:00
|
|
|
static struct task_struct *_pick_next_task_rt(struct rq *rq)
|
2008-01-25 21:08:30 +01:00
|
|
|
{
|
|
|
|
struct sched_rt_entity *rt_se;
|
|
|
|
struct task_struct *p;
|
|
|
|
struct rt_rq *rt_rq;
|
2007-07-09 18:51:58 +02:00
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
rt_rq = &rq->rt;
|
|
|
|
|
2010-12-06 17:28:30 +01:00
|
|
|
if (!rt_rq->rt_nr_running)
|
2008-01-25 21:08:30 +01:00
|
|
|
return NULL;
|
|
|
|
|
2008-02-13 15:45:39 +01:00
|
|
|
if (rt_rq_throttled(rt_rq))
|
2008-01-25 21:08:30 +01:00
|
|
|
return NULL;
|
|
|
|
|
|
|
|
do {
|
|
|
|
rt_se = pick_next_rt_entity(rq, rt_rq);
|
2008-01-25 21:08:34 +01:00
|
|
|
BUG_ON(!rt_se);
|
2008-01-25 21:08:30 +01:00
|
|
|
rt_rq = group_rt_rq(rt_se);
|
|
|
|
} while (rt_rq);
|
|
|
|
|
2014-12-02 09:09:21 +01:00
|
|
|
/*
|
|
|
|
* Force update of rq->clock_task in case we failed to do so in
|
|
|
|
* put_prev_task. A stale value can cause us to over-charge execution
|
|
|
|
* time to a real-time task, which could trigger throttling unnecessarily
|
|
|
|
*/
|
|
|
|
if (rq->skip_clock_update > 0) {
|
|
|
|
rq->skip_clock_update = 0;
|
|
|
|
update_rq_clock(rq);
|
|
|
|
}
|
2008-01-25 21:08:30 +01:00
|
|
|
p = rt_task_of(rt_se);
|
2010-10-05 02:03:21 +02:00
|
|
|
p->se.exec_start = rq->clock_task;
|
2008-12-29 15:39:53 +01:00
|
|
|
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct task_struct *pick_next_task_rt(struct rq *rq)
|
|
|
|
{
|
|
|
|
struct task_struct *p = _pick_next_task_rt(rq);
|
|
|
|
|
|
|
|
/* The running task is never eligible for pushing */
|
|
|
|
if (p)
|
|
|
|
dequeue_pushable_task(rq, p);
|
|
|
|
|
2008-04-19 12:11:10 +02:00
|
|
|
#ifdef CONFIG_SMP
|
2009-07-29 17:08:47 +02:00
|
|
|
/*
|
|
|
|
* We detect this state here so that we can avoid taking the RQ
|
|
|
|
* lock again later if there is no need to push
|
|
|
|
*/
|
|
|
|
rq->post_schedule = has_pushable_tasks(rq);
|
2008-04-19 12:11:10 +02:00
|
|
|
#endif
|
2009-07-29 17:08:47 +02:00
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
return p;
|
2007-07-09 18:51:58 +02:00
|
|
|
}
|
|
|
|
|
2007-08-09 11:16:49 +02:00
|
|
|
static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
|
2007-07-09 18:51:58 +02:00
|
|
|
{
|
2007-08-09 11:16:48 +02:00
|
|
|
update_curr_rt(rq);
|
2008-12-29 15:39:53 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The previous task needs to be made eligible for pushing
|
|
|
|
* if it is still active
|
|
|
|
*/
|
2012-04-23 12:11:21 +02:00
|
|
|
if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
|
2008-12-29 15:39:53 +01:00
|
|
|
enqueue_pushable_task(rq, p);
|
2007-07-09 18:51:58 +02:00
|
|
|
}
|
|
|
|
|
2007-10-24 18:23:51 +02:00
|
|
|
#ifdef CONFIG_SMP
|
2008-01-25 21:08:30 +01:00
|
|
|
|
2008-01-25 21:08:05 +01:00
|
|
|
/* Only try algorithms three times */
|
|
|
|
#define RT_MAX_TRIES 3
|
|
|
|
|
2008-01-25 21:08:07 +01:00
|
|
|
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
|
|
|
|
{
|
|
|
|
if (!task_running(rq, p) &&
|
2013-01-31 15:56:17 +01:00
|
|
|
cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
|
2008-01-25 21:08:07 +01:00
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:05 +01:00
|
|
|
/* Return the second highest RT task, NULL otherwise */
|
2008-01-25 21:08:14 +01:00
|
|
|
static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
|
2008-01-25 21:08:05 +01:00
|
|
|
{
|
2008-01-25 21:08:30 +01:00
|
|
|
struct task_struct *next = NULL;
|
|
|
|
struct sched_rt_entity *rt_se;
|
|
|
|
struct rt_prio_array *array;
|
|
|
|
struct rt_rq *rt_rq;
|
2008-01-25 21:08:05 +01:00
|
|
|
int idx;
|
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
for_each_leaf_rt_rq(rt_rq, rq) {
|
|
|
|
array = &rt_rq->active;
|
|
|
|
idx = sched_find_first_bit(array->bitmap);
|
2010-10-17 21:46:10 +02:00
|
|
|
next_idx:
|
2008-01-25 21:08:30 +01:00
|
|
|
if (idx >= MAX_RT_PRIO)
|
|
|
|
continue;
|
2012-03-19 23:26:19 +01:00
|
|
|
if (next && next->prio <= idx)
|
2008-01-25 21:08:30 +01:00
|
|
|
continue;
|
|
|
|
list_for_each_entry(rt_se, array->queue + idx, run_list) {
|
2010-03-10 17:07:24 +01:00
|
|
|
struct task_struct *p;
|
|
|
|
|
|
|
|
if (!rt_entity_is_task(rt_se))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
p = rt_task_of(rt_se);
|
2008-01-25 21:08:30 +01:00
|
|
|
if (pick_rt_task(rq, p, cpu)) {
|
|
|
|
next = p;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!next) {
|
|
|
|
idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
|
|
|
|
goto next_idx;
|
|
|
|
}
|
2008-01-25 21:08:07 +01:00
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:05 +01:00
|
|
|
return next;
|
|
|
|
}
|
|
|
|
|
2008-11-24 17:05:13 +01:00
|
|
|
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
|
2008-01-25 21:08:05 +01:00
|
|
|
|
2014-07-23 00:12:32 +02:00
|
|
|
#ifdef CONFIG_SCHED_HMP
|
2014-07-21 11:05:24 +02:00
|
|
|
static int find_lowest_rq_hmp(struct task_struct *task)
|
2014-04-24 20:10:44 +02:00
|
|
|
{
|
|
|
|
struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
|
|
|
|
int cpu_cost, min_cost = INT_MAX;
|
|
|
|
int best_cpu = -1;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
/* Make sure the mask is initialized first */
|
|
|
|
if (unlikely(!lowest_mask))
|
|
|
|
return best_cpu;
|
|
|
|
|
|
|
|
if (task->nr_cpus_allowed == 1)
|
|
|
|
return best_cpu; /* No other targets possible */
|
|
|
|
|
|
|
|
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
|
|
|
|
return best_cpu; /* No targets found */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* At this point we have built a mask of cpus representing the
|
|
|
|
* lowest priority tasks in the system. Now we want to elect
|
|
|
|
* the best one based on our affinity and topology.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Skip performance considerations and optimize for power.
|
|
|
|
* Worst case we'll be iterating over all CPUs here. CPU
|
|
|
|
* online mask should be taken care of when constructing
|
|
|
|
* the lowest_mask.
|
|
|
|
*/
|
|
|
|
for_each_cpu(i, lowest_mask) {
|
|
|
|
struct rq *rq = cpu_rq(i);
|
|
|
|
cpu_cost = power_cost_at_freq(i, ACCESS_ONCE(rq->min_freq));
|
2014-11-14 07:04:22 +01:00
|
|
|
trace_sched_cpu_load(rq, idle_cpu(i), mostly_idle_cpu(i),
|
2014-11-25 01:54:59 +01:00
|
|
|
sched_irqload(i), cpu_cost, cpu_temp(i));
|
2014-12-03 19:18:12 +01:00
|
|
|
|
|
|
|
if (sched_boost() && capacity(rq) != max_capacity)
|
|
|
|
continue;
|
|
|
|
|
2014-11-13 23:58:10 +01:00
|
|
|
if (cpu_cost < min_cost && !sched_cpu_high_irqload(i)) {
|
2014-04-24 20:10:44 +02:00
|
|
|
min_cost = cpu_cost;
|
|
|
|
best_cpu = i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return best_cpu;
|
|
|
|
}
|
2015-01-16 06:57:31 +01:00
|
|
|
|
|
|
|
#else /* CONFIG_SCHED_HMP */
|
|
|
|
|
2014-07-23 00:12:32 +02:00
|
|
|
static int find_lowest_rq_hmp(struct task_struct *task)
|
|
|
|
{
|
|
|
|
return -1;
|
|
|
|
}
|
2015-01-16 06:57:31 +01:00
|
|
|
|
|
|
|
#endif /* CONFIG_SCHED_HMP */
|
2014-07-21 11:05:24 +02:00
|
|
|
|
2008-01-25 21:08:11 +01:00
|
|
|
static int find_lowest_rq(struct task_struct *task)
|
|
|
|
{
|
|
|
|
struct sched_domain *sd;
|
2008-11-24 17:05:14 +01:00
|
|
|
struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
|
2008-01-25 21:08:11 +01:00
|
|
|
int this_cpu = smp_processor_id();
|
|
|
|
int cpu = task_cpu(task);
|
2008-01-25 21:08:13 +01:00
|
|
|
|
2014-07-21 17:24:04 +02:00
|
|
|
if (sched_enable_hmp)
|
2014-07-21 11:05:24 +02:00
|
|
|
return find_lowest_rq_hmp(task);
|
|
|
|
|
2011-06-15 00:36:25 +02:00
|
|
|
/* Make sure the mask is initialized first */
|
|
|
|
if (unlikely(!lowest_mask))
|
|
|
|
return -1;
|
|
|
|
|
2012-04-23 12:11:21 +02:00
|
|
|
if (task->nr_cpus_allowed == 1)
|
2008-05-12 21:21:01 +02:00
|
|
|
return -1; /* No other targets possible */
|
2008-01-25 21:08:11 +01:00
|
|
|
|
2008-05-12 21:21:01 +02:00
|
|
|
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
|
|
|
|
return -1; /* No targets found */
|
2008-01-25 21:08:11 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* At this point we have built a mask of cpus representing the
|
|
|
|
* lowest priority tasks in the system. Now we want to elect
|
|
|
|
* the best one based on our affinity and topology.
|
|
|
|
*
|
|
|
|
* We prioritize the last cpu that the task executed on since
|
|
|
|
* it is most likely cache-hot in that location.
|
|
|
|
*/
|
2008-11-24 17:05:14 +01:00
|
|
|
if (cpumask_test_cpu(cpu, lowest_mask))
|
2008-01-25 21:08:11 +01:00
|
|
|
return cpu;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Otherwise, we consult the sched_domains span maps to figure
|
|
|
|
* out which cpu is logically closest to our hot cache data.
|
|
|
|
*/
|
2009-11-03 05:23:15 +01:00
|
|
|
if (!cpumask_test_cpu(this_cpu, lowest_mask))
|
|
|
|
this_cpu = -1; /* Skip this_cpu opt if not among lowest */
|
2008-01-25 21:08:11 +01:00
|
|
|
|
2011-04-22 12:53:54 +02:00
|
|
|
rcu_read_lock();
|
2009-11-03 05:23:15 +01:00
|
|
|
for_each_domain(cpu, sd) {
|
|
|
|
if (sd->flags & SD_WAKE_AFFINE) {
|
|
|
|
int best_cpu;
|
2008-01-25 21:08:11 +01:00
|
|
|
|
2009-11-03 05:23:15 +01:00
|
|
|
/*
|
|
|
|
* "this_cpu" is cheaper to preempt than a
|
|
|
|
* remote processor.
|
|
|
|
*/
|
|
|
|
if (this_cpu != -1 &&
|
2011-04-22 12:53:54 +02:00
|
|
|
cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
|
|
|
|
rcu_read_unlock();
|
2009-11-03 05:23:15 +01:00
|
|
|
return this_cpu;
|
2011-04-22 12:53:54 +02:00
|
|
|
}
|
2009-11-03 05:23:15 +01:00
|
|
|
|
|
|
|
best_cpu = cpumask_first_and(lowest_mask,
|
|
|
|
sched_domain_span(sd));
|
2011-04-22 12:53:54 +02:00
|
|
|
if (best_cpu < nr_cpu_ids) {
|
|
|
|
rcu_read_unlock();
|
2009-11-03 05:23:15 +01:00
|
|
|
return best_cpu;
|
2011-04-22 12:53:54 +02:00
|
|
|
}
|
2008-01-25 21:08:11 +01:00
|
|
|
}
|
|
|
|
}
|
2011-04-22 12:53:54 +02:00
|
|
|
rcu_read_unlock();
|
2008-01-25 21:08:11 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* And finally, if there were no matches within the domains
|
|
|
|
* just give the caller *something* to work with from the compatible
|
|
|
|
* locations.
|
|
|
|
*/
|
2009-11-03 05:23:15 +01:00
|
|
|
if (this_cpu != -1)
|
|
|
|
return this_cpu;
|
|
|
|
|
|
|
|
cpu = cpumask_any(lowest_mask);
|
|
|
|
if (cpu < nr_cpu_ids)
|
|
|
|
return cpu;
|
|
|
|
return -1;
|
2008-01-25 21:08:10 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Will lock the rq it finds */
|
2008-01-25 21:08:15 +01:00
|
|
|
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
|
2008-01-25 21:08:10 +01:00
|
|
|
{
|
|
|
|
struct rq *lowest_rq = NULL;
|
|
|
|
int tries;
|
2008-01-25 21:08:15 +01:00
|
|
|
int cpu;
|
2008-01-25 21:08:05 +01:00
|
|
|
|
2008-01-25 21:08:10 +01:00
|
|
|
for (tries = 0; tries < RT_MAX_TRIES; tries++) {
|
|
|
|
cpu = find_lowest_rq(task);
|
|
|
|
|
2008-01-25 21:08:10 +01:00
|
|
|
if ((cpu == -1) || (cpu == rq->cpu))
|
2008-01-25 21:08:05 +01:00
|
|
|
break;
|
|
|
|
|
2008-01-25 21:08:10 +01:00
|
|
|
lowest_rq = cpu_rq(cpu);
|
|
|
|
|
2008-01-25 21:08:05 +01:00
|
|
|
/* if the prio of this runqueue changed, try again */
|
2008-01-25 21:08:10 +01:00
|
|
|
if (double_lock_balance(rq, lowest_rq)) {
|
2008-01-25 21:08:05 +01:00
|
|
|
/*
|
|
|
|
* We had to unlock the run queue. In
|
|
|
|
* the meantime, the task could have
|
|
|
|
* migrated already or had its affinity changed.
|
|
|
|
* Also make sure that it wasn't scheduled on its rq.
|
|
|
|
*/
|
2008-01-25 21:08:10 +01:00
|
|
|
if (unlikely(task_rq(task) != rq ||
|
2008-11-24 17:05:14 +01:00
|
|
|
!cpumask_test_cpu(lowest_rq->cpu,
|
2011-06-16 12:23:22 +02:00
|
|
|
tsk_cpus_allowed(task)) ||
|
2008-01-25 21:08:10 +01:00
|
|
|
task_running(rq, task) ||
|
2011-04-05 17:23:44 +02:00
|
|
|
!task->on_rq)) {
|
2008-01-25 21:08:15 +01:00
|
|
|
|
2012-05-17 21:19:46 +02:00
|
|
|
double_unlock_balance(rq, lowest_rq);
|
2008-01-25 21:08:05 +01:00
|
|
|
lowest_rq = NULL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If this rq is still suitable use it. */
|
2008-12-29 15:39:49 +01:00
|
|
|
if (lowest_rq->rt.highest_prio.curr > task->prio)
|
2008-01-25 21:08:05 +01:00
|
|
|
break;
|
|
|
|
|
|
|
|
/* try again */
|
2008-08-11 09:30:22 +02:00
|
|
|
double_unlock_balance(rq, lowest_rq);
|
2008-01-25 21:08:05 +01:00
|
|
|
lowest_rq = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return lowest_rq;
|
|
|
|
}
|
|
|
|
|
2008-12-29 15:39:53 +01:00
|
|
|
static struct task_struct *pick_next_pushable_task(struct rq *rq)
|
|
|
|
{
|
|
|
|
struct task_struct *p;
|
|
|
|
|
|
|
|
if (!has_pushable_tasks(rq))
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
p = plist_first_entry(&rq->rt.pushable_tasks,
|
|
|
|
struct task_struct, pushable_tasks);
|
|
|
|
|
|
|
|
BUG_ON(rq->cpu != task_cpu(p));
|
|
|
|
BUG_ON(task_current(rq, p));
|
2012-04-23 12:11:21 +02:00
|
|
|
BUG_ON(p->nr_cpus_allowed <= 1);
|
2008-12-29 15:39:53 +01:00
|
|
|
|
2011-04-05 17:23:44 +02:00
|
|
|
BUG_ON(!p->on_rq);
|
2008-12-29 15:39:53 +01:00
|
|
|
BUG_ON(!rt_task(p));
|
|
|
|
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:05 +01:00
|
|
|
/*
|
|
|
|
* If the current CPU has more than one RT task, see if the non
|
|
|
|
* running task can migrate over to a CPU that is running a task
|
|
|
|
* of lesser priority.
|
|
|
|
*/
|
2008-01-25 21:08:09 +01:00
|
|
|
static int push_rt_task(struct rq *rq)
|
2008-01-25 21:08:05 +01:00
|
|
|
{
|
|
|
|
struct task_struct *next_task;
|
|
|
|
struct rq *lowest_rq;
|
2011-06-17 03:55:20 +02:00
|
|
|
int ret = 0;
|
2008-01-25 21:08:05 +01:00
|
|
|
|
2008-01-25 21:08:12 +01:00
|
|
|
if (!rq->rt.overloaded)
|
|
|
|
return 0;
|
|
|
|
|
2008-12-29 15:39:53 +01:00
|
|
|
next_task = pick_next_pushable_task(rq);
|
2008-01-25 21:08:05 +01:00
|
|
|
if (!next_task)
|
|
|
|
return 0;
|
|
|
|
|
2010-10-17 21:46:10 +02:00
|
|
|
retry:
|
2008-01-25 21:08:09 +01:00
|
|
|
if (unlikely(next_task == rq->curr)) {
|
2008-01-25 21:08:07 +01:00
|
|
|
WARN_ON(1);
|
2008-01-25 21:08:05 +01:00
|
|
|
return 0;
|
2008-01-25 21:08:07 +01:00
|
|
|
}
|
2008-01-25 21:08:05 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* It's possible that the next_task slipped in with a
|
|
|
|
* higher priority than current. If that's the case
|
|
|
|
* just reschedule current.
|
|
|
|
*/
|
2008-01-25 21:08:09 +01:00
|
|
|
if (unlikely(next_task->prio < rq->curr->prio)) {
|
|
|
|
resched_task(rq->curr);
|
2008-01-25 21:08:05 +01:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:09 +01:00
|
|
|
/* We might release rq lock */
|
2008-01-25 21:08:05 +01:00
|
|
|
get_task_struct(next_task);
|
|
|
|
|
|
|
|
/* find_lock_lowest_rq locks the rq if found */
|
2008-01-25 21:08:09 +01:00
|
|
|
lowest_rq = find_lock_lowest_rq(next_task, rq);
|
2008-01-25 21:08:05 +01:00
|
|
|
if (!lowest_rq) {
|
|
|
|
struct task_struct *task;
|
|
|
|
/*
|
2011-06-17 03:55:20 +02:00
|
|
|
* find_lock_lowest_rq releases rq->lock
|
2008-12-29 15:39:53 +01:00
|
|
|
* so it is possible that next_task has migrated.
|
|
|
|
*
|
|
|
|
* We need to make sure that the task is still on the same
|
|
|
|
* run-queue and is also still the next task eligible for
|
|
|
|
* pushing.
|
2008-01-25 21:08:05 +01:00
|
|
|
*/
|
2008-12-29 15:39:53 +01:00
|
|
|
task = pick_next_pushable_task(rq);
|
2008-12-29 15:39:53 +01:00
|
|
|
if (task_cpu(next_task) == rq->cpu && task == next_task) {
|
|
|
|
/*
|
2011-06-17 03:55:20 +02:00
|
|
|
* The task hasn't migrated, and is still the next
|
|
|
|
* eligible task, but we failed to find a run-queue
|
|
|
|
* to push it to. Do not retry in this case, since
|
|
|
|
* other cpus will pull from us when ready.
|
2008-12-29 15:39:53 +01:00
|
|
|
*/
|
|
|
|
goto out;
|
2008-01-25 21:08:05 +01:00
|
|
|
}
|
2008-12-29 15:39:53 +01:00
|
|
|
|
2008-12-29 15:39:53 +01:00
|
|
|
if (!task)
|
|
|
|
/* No more tasks, just exit */
|
|
|
|
goto out;
|
|
|
|
|
2008-12-29 15:39:53 +01:00
|
|
|
/*
|
2008-12-29 15:39:53 +01:00
|
|
|
* Something has shifted, try again.
|
2008-12-29 15:39:53 +01:00
|
|
|
*/
|
2008-12-29 15:39:53 +01:00
|
|
|
put_task_struct(next_task);
|
|
|
|
next_task = task;
|
|
|
|
goto retry;
|
2008-01-25 21:08:05 +01:00
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:09 +01:00
|
|
|
deactivate_task(rq, next_task, 0);
|
2008-01-25 21:08:05 +01:00
|
|
|
set_task_cpu(next_task, lowest_rq->cpu);
|
|
|
|
activate_task(lowest_rq, next_task, 0);
|
2011-06-17 03:55:20 +02:00
|
|
|
ret = 1;
|
2008-01-25 21:08:05 +01:00
|
|
|
|
|
|
|
resched_task(lowest_rq->curr);
|
|
|
|
|
2008-08-11 09:30:22 +02:00
|
|
|
double_unlock_balance(rq, lowest_rq);
|
2008-01-25 21:08:05 +01:00
|
|
|
|
|
|
|
out:
|
|
|
|
put_task_struct(next_task);
|
|
|
|
|
2011-06-17 03:55:20 +02:00
|
|
|
return ret;
|
2008-01-25 21:08:05 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static void push_rt_tasks(struct rq *rq)
|
|
|
|
{
|
|
|
|
/* push_rt_task will return true if it moved an RT */
|
|
|
|
while (push_rt_task(rq))
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:07 +01:00
|
|
|
static int pull_rt_task(struct rq *this_rq)
|
|
|
|
{
|
2008-01-25 21:08:17 +01:00
|
|
|
int this_cpu = this_rq->cpu, ret = 0, cpu;
|
2008-12-29 15:39:49 +01:00
|
|
|
struct task_struct *p;
|
2008-01-25 21:08:07 +01:00
|
|
|
struct rq *src_rq;
|
|
|
|
|
2008-01-25 21:08:18 +01:00
|
|
|
if (likely(!rt_overloaded(this_rq)))
|
2008-01-25 21:08:07 +01:00
|
|
|
return 0;
|
|
|
|
|
2008-11-24 17:05:05 +01:00
|
|
|
for_each_cpu(cpu, this_rq->rd->rto_mask) {
|
2008-01-25 21:08:07 +01:00
|
|
|
if (this_cpu == cpu)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
src_rq = cpu_rq(cpu);
|
2008-12-29 15:39:50 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Don't bother taking the src_rq->lock if the next highest
|
|
|
|
* task is known to be lower-priority than our current task.
|
|
|
|
* This may look racy, but if this value is about to go
|
|
|
|
* logically higher, the src_rq will push this task away.
|
|
|
|
* And if it's going logically lower, we do not care
|
|
|
|
*/
|
|
|
|
if (src_rq->rt.highest_prio.next >=
|
|
|
|
this_rq->rt.highest_prio.curr)
|
|
|
|
continue;
|
|
|
|
|
2008-01-25 21:08:07 +01:00
|
|
|
/*
|
|
|
|
* We can potentially drop this_rq's lock in
|
|
|
|
* double_lock_balance, and another CPU could
|
2008-12-29 15:39:49 +01:00
|
|
|
* alter this_rq
|
2008-01-25 21:08:07 +01:00
|
|
|
*/
|
2008-12-29 15:39:49 +01:00
|
|
|
double_lock_balance(this_rq, src_rq);
|
2008-01-25 21:08:07 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Are there still pullable RT tasks?
|
|
|
|
*/
|
2008-01-25 21:08:30 +01:00
|
|
|
if (src_rq->rt.rt_nr_running <= 1)
|
|
|
|
goto skip;
|
2008-01-25 21:08:07 +01:00
|
|
|
|
|
|
|
p = pick_next_highest_task_rt(src_rq, this_cpu);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do we have an RT task that preempts
|
|
|
|
* the to-be-scheduled task?
|
|
|
|
*/
|
2008-12-29 15:39:49 +01:00
|
|
|
if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
|
2008-01-25 21:08:07 +01:00
|
|
|
WARN_ON(p == src_rq->curr);
|
2011-04-05 17:23:44 +02:00
|
|
|
WARN_ON(!p->on_rq);
|
2008-01-25 21:08:07 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* There's a chance that p is higher in priority
|
|
|
|
* than what's currently running on its cpu.
|
|
|
|
* This is just that p is waking up and hasn't
|
|
|
|
* had a chance to schedule. We only pull
|
|
|
|
* p if it is lower in priority than the
|
2008-12-29 15:39:49 +01:00
|
|
|
* current task on the run queue
|
2008-01-25 21:08:07 +01:00
|
|
|
*/
|
2008-12-29 15:39:49 +01:00
|
|
|
if (p->prio < src_rq->curr->prio)
|
2008-01-25 21:08:30 +01:00
|
|
|
goto skip;
|
2008-01-25 21:08:07 +01:00
|
|
|
|
|
|
|
ret = 1;
|
|
|
|
|
|
|
|
deactivate_task(src_rq, p, 0);
|
|
|
|
set_task_cpu(p, this_cpu);
|
|
|
|
activate_task(this_rq, p, 0);
|
|
|
|
/*
|
|
|
|
* We continue with the search, just in
|
|
|
|
* case there's an even higher prio task
|
2011-03-31 03:57:33 +02:00
|
|
|
* in another runqueue. (low likelihood
|
2008-01-25 21:08:07 +01:00
|
|
|
* but possible)
|
|
|
|
*/
|
|
|
|
}
|
2010-10-17 21:46:10 +02:00
|
|
|
skip:
|
2008-08-11 09:30:22 +02:00
|
|
|
double_unlock_balance(this_rq, src_rq);
|
2008-01-25 21:08:07 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:22 +01:00
|
|
|
static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
|
2008-01-25 21:08:07 +01:00
|
|
|
{
|
|
|
|
/* Try to pull RT tasks here if we lower this rq's prio */
|
2010-02-09 20:43:59 +01:00
|
|
|
if (rq->rt.highest_prio.curr > prev->prio)
|
2008-01-25 21:08:07 +01:00
|
|
|
pull_rt_task(rq);
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:22 +01:00
|
|
|
static void post_schedule_rt(struct rq *rq)
|
2008-01-25 21:08:05 +01:00
|
|
|
{
|
2008-12-29 15:39:52 +01:00
|
|
|
push_rt_tasks(rq);
|
2008-01-25 21:08:05 +01:00
|
|
|
}
|
|
|
|
|
2008-04-23 13:13:29 +02:00
|
|
|
/*
|
|
|
|
* If we are not running and we are not going to reschedule soon, we should
|
|
|
|
* try to push tasks away now
|
|
|
|
*/
|
2009-12-16 18:04:40 +01:00
|
|
|
static void task_woken_rt(struct rq *rq, struct task_struct *p)
|
2008-01-25 21:08:07 +01:00
|
|
|
{
|
2008-01-25 21:08:22 +01:00
|
|
|
if (!task_running(rq, p) &&
|
2008-04-23 13:13:29 +02:00
|
|
|
!test_tsk_need_resched(rq->curr) &&
|
2008-12-29 15:39:53 +01:00
|
|
|
has_pushable_tasks(rq) &&
|
2012-04-23 12:11:21 +02:00
|
|
|
p->nr_cpus_allowed > 1 &&
|
sched: Try not to migrate higher priority RT tasks
When first working on the RT scheduler design, we concentrated on
keeping all CPUs running RT tasks instead of having multiple RT
tasks on a single CPU waiting for the migration thread to move
them. Instead we take a more proactive stance and push or pull RT
tasks from one CPU to another on wakeup or scheduling.
When an RT task wakes up on a CPU that is running another RT task,
instead of preempting it and killing the cache of the running RT
task, we look to see if we can migrate the RT task that is waking
up, even if the RT task waking up is of higher priority.
This may sound a bit odd, but RT tasks should be limited in
migration by the user anyway. But in practice, people do not do
this, which causes high prio RT tasks to bounce around the CPUs.
This becomes even worse when we have priority inheritance, because
a high prio task can block on a lower prio task and boost its
priority. When the lower prio task wakes up the high prio task, if
it happens to be on the same CPU it will migrate off of it.
But in reality, the above does not happen much either, because when
the lower prio task, which has already been boosted, wakes up on the
same CPU as the higher prio task, it would then migrate off of it.
Either way, we do not want to migrate them.
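As a hedged sketch of the wake-up placement decision argued for here (the function name and exact guards are assumptions for illustration, not this commit's code), the waking RT task is sent to a lower-priority CPU instead of displacing an RT task that is already running:
static int select_cpu_for_woken_rt_sketch(struct rq *rq, struct task_struct *p)
{
	struct task_struct *curr = rq->curr;
	int cpu = task_cpu(p);
	int target;

	/*
	 * If this CPU already runs an RT task that either cannot move or is
	 * not strictly lower priority than the waker, and the waker itself
	 * can migrate, look for a CPU running lower-priority work instead.
	 */
	if (rt_task(curr) &&
	    (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio) &&
	    p->nr_cpus_allowed > 1) {
		target = find_lowest_rq(p);
		if (target != -1)
			cpu = target;
	}

	return cpu;
}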
To examine the scheduling, I created a test program and examined it
under kernelshark. The test program created CPU * 2 threads, where
each thread had a different priority. The program takes different
options. The options used in this change log was to have priority
inheritance mutexes or not.
All threads did the following loop:
static void grab_lock(long id, int iter, int l)
{
	ftrace_write("thread %ld iter %d, taking lock %d\n",
		     id, iter, l);
	pthread_mutex_lock(&locks[l]);
	ftrace_write("thread %ld iter %d, took lock %d\n",
		     id, iter, l);
	busy_loop(nr_tasks - id);
	ftrace_write("thread %ld iter %d, unlock lock %d\n",
		     id, iter, l);
	pthread_mutex_unlock(&locks[l]);
}

void *start_task(void *id)
{
	[...]
	while (!done) {
		for (l = 0; l < nr_locks; l++) {
			grab_lock(id, i, l);
			ftrace_write("thread %ld iter %d sleeping\n",
				     id, i);
			ms_sleep(id);
		}
		i++;
	}
	[...]
}
The busy_loop(ms) keeps the CPU spinning for ms milliseconds. The
ms_sleep(ms) sleeps for ms milliseconds. The ftrace_write() writes
to the ftrace buffer to help analyze via ftrace.
The higher the id, the higher the prio, the shorter the busy loop,
but the longer it sleeps. This is usually the case with RT tasks;
lower priority tasks usually run longer than higher priority
tasks.
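The test's helper implementations are not included in this log; a plausible
user-space sketch of busy_loop()/ms_sleep() consistent with the description
above (names and details here are assumptions) is:

#include <time.h>

static void busy_loop(long ms)		/* spin for roughly ms milliseconds */
{
	struct timespec start, now;

	clock_gettime(CLOCK_MONOTONIC, &start);
	do {
		clock_gettime(CLOCK_MONOTONIC, &now);
	} while ((now.tv_sec - start.tv_sec) * 1000 +
		 (now.tv_nsec - start.tv_nsec) / 1000000 < ms);
}

static void ms_sleep(long ms)		/* block for ms milliseconds */
{
	struct timespec ts = {
		.tv_sec  = ms / 1000,
		.tv_nsec = (ms % 1000) * 1000000,
	};

	nanosleep(&ts, NULL);
}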
At the end of the test, it records the number of loops each thread
took, as well as the number of voluntary preemptions, non-voluntary
preemptions, and number of migrations each thread took, taking the
information from /proc/$$/sched and /proc/$$/status.
Running this on a 4 CPU processor, the results without changes to
the kernel looked like this:
Task vol nonvol migrated iterations
---- --- ------ -------- ----------
0: 53 3220 1470 98
1: 562 773 724 98
2: 752 933 1375 98
3: 749 39 697 98
4: 758 5 515 98
5: 764 2 679 99
6: 761 2 535 99
7: 757 3 346 99
total: 5156 4977 6341 787
Each thread, regardless of priority, migrated hundreds of times
(over a thousand in some cases). The higher priority tasks were a
little better but still took quite a hit.
By letting higher priority tasks bump the lower prio task from the
CPU, things changed a bit:
Task vol nonvol migrated iterations
---- --- ------ -------- ----------
0: 37 2835 1937 98
1: 666 1821 1865 98
2: 654 1003 1385 98
3: 664 635 973 99
4: 698 197 352 99
5: 703 101 159 99
6: 708 1 75 99
7: 713 1 2 99
total: 4843 6594 6748 789
The total # of migrations did not change (several runs showed the
difference all within the noise). But we now see a dramatic
improvement to the higher priority tasks. (kernelshark showed that
the watchdog timer bumped the highest priority task to give it the
2 count. This was actually consistent with every run).
Notice that the # of iterations did not change either.
The above was with priority inheritance mutexes. That is, when the
higher priority task blocked on a lower priority task, the lower
priority task would inherit the priority of the higher priority task
(which shows why task 6 was bumped so many times). When not using priority
inheritance mutexes, the current kernel shows this:
Task vol nonvol migrated iterations
---- --- ------ -------- ----------
0: 56 3101 1892 95
1: 594 713 937 95
2: 625 188 618 95
3: 628 4 491 96
4: 640 7 468 96
5: 631 2 501 96
6: 641 1 466 96
7: 643 2 497 96
total: 4458 4018 5870 765
Not much changed with or without priority inheritance mutexes. But
if we let the high priority task bump lower priority tasks on
wakeup we see:
Task vol nonvol migrated iterations
---- --- ------ -------- ----------
0: 115 3439 2782 98
1: 633 1354 1583 99
2: 652 919 1218 99
3: 645 713 934 99
4: 690 3 3 99
5: 694 1 4 99
6: 720 3 4 99
7: 747 0 1 100
Which shows an even bigger change. The big difference between task 3
and task 4 is because we have only 4 CPUs on the machine, causing
the 4 highest prio tasks to always have preference.
Although I did not measure cache misses, and I'm sure there would
be little to measure since the test was not data intensive, I could
imagine large improvements for higher priority tasks when dealing
with lower priority tasks. Thus, I'm satisfied with making the
change and agreeing with what Gregory Haskins argued a few years
ago when we first had this discussion.
One final note. All tasks in the above tests were RT tasks. Any RT
task will always preempt a non-RT task that is running on the CPU
the RT task wants to run on.
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Gregory Haskins <ghaskins@novell.com>
LKML-Reference: <20100921024138.605460343@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-09-21 04:40:03 +02:00
|
|
|
rt_task(rq->curr) &&
|
2012-04-23 12:11:21 +02:00
|
|
|
(rq->curr->nr_cpus_allowed < 2 ||
|
2011-09-12 16:28:04 +02:00
|
|
|
rq->curr->prio <= p->prio))
|
2008-01-25 21:08:07 +01:00
|
|
|
push_rt_tasks(rq);
|
|
|
|
}
|
|
|
|
|
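Read as one expression, the push-on-wakeup test scattered across the
annotated fragments above assembles to roughly the following (the checks on
p and rq->curr that fall before this excerpt are elided):

	if (/* ... earlier checks on p and rq->curr elided ... */
	    has_pushable_tasks(rq) &&
	    p->nr_cpus_allowed > 1 &&
	    rt_task(rq->curr) &&
	    (rq->curr->nr_cpus_allowed < 2 ||
	     rq->curr->prio <= p->prio))
		push_rt_tasks(rq);

In words: only bother pushing when the run queue actually has pushable
tasks, the woken task can run elsewhere, and the currently running RT task
either cannot move to another CPU or is of equal or higher priority than
the woken task.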
2008-03-26 22:23:49 +01:00
|
|
|
static void set_cpus_allowed_rt(struct task_struct *p,
|
2008-11-24 17:05:14 +01:00
|
|
|
const struct cpumask *new_mask)
|
2008-01-25 21:08:07 +01:00
|
|
|
{
|
2012-04-11 07:06:04 +02:00
|
|
|
struct rq *rq;
|
|
|
|
int weight;
|
2008-01-25 21:08:07 +01:00
|
|
|
|
|
|
|
BUG_ON(!rt_task(p));
|
|
|
|
|
2012-04-11 07:06:04 +02:00
|
|
|
if (!p->on_rq)
|
|
|
|
return;
|
2008-12-29 15:39:53 +01:00
|
|
|
|
2012-04-11 07:06:04 +02:00
|
|
|
weight = cpumask_weight(new_mask);
|
2008-12-29 15:39:53 +01:00
|
|
|
|
2012-04-11 07:06:04 +02:00
|
|
|
/*
|
|
|
|
* Only update if the process changes its state from whether it
|
|
|
|
* can migrate or not.
|
|
|
|
*/
|
2012-04-23 12:11:21 +02:00
|
|
|
if ((p->nr_cpus_allowed > 1) == (weight > 1))
|
2012-04-11 07:06:04 +02:00
|
|
|
return;
|
2008-12-29 15:39:53 +01:00
|
|
|
|
2012-04-11 07:06:04 +02:00
|
|
|
rq = task_rq(p);
|
2008-01-25 21:08:07 +01:00
|
|
|
|
2012-04-11 07:06:04 +02:00
|
|
|
/*
|
|
|
|
* The process used to be able to migrate OR it can now migrate
|
|
|
|
*/
|
|
|
|
if (weight <= 1) {
|
|
|
|
if (!task_current(rq, p))
|
|
|
|
dequeue_pushable_task(rq, p);
|
|
|
|
BUG_ON(!rq->rt.rt_nr_migratory);
|
|
|
|
rq->rt.rt_nr_migratory--;
|
|
|
|
} else {
|
|
|
|
if (!task_current(rq, p))
|
|
|
|
enqueue_pushable_task(rq, p);
|
|
|
|
rq->rt.rt_nr_migratory++;
|
2008-01-25 21:08:07 +01:00
|
|
|
}
|
2012-04-11 07:06:04 +02:00
|
|
|
|
|
|
|
update_rt_migration(&rq->rt);
|
2008-01-25 21:08:07 +01:00
|
|
|
}
|
2008-01-25 21:08:15 +01:00
|
|
|
|
2008-01-25 21:08:18 +01:00
|
|
|
/* Assumes rq->lock is held */
|
2008-06-04 21:04:05 +02:00
|
|
|
static void rq_online_rt(struct rq *rq)
|
2008-01-25 21:08:18 +01:00
|
|
|
{
|
|
|
|
if (rq->rt.overloaded)
|
|
|
|
rt_set_overload(rq);
|
2008-05-12 21:21:01 +02:00
|
|
|
|
2008-06-05 14:49:58 +02:00
|
|
|
__enable_runtime(rq);
|
|
|
|
|
2008-12-29 15:39:49 +01:00
|
|
|
cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
|
2008-01-25 21:08:18 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Assumes rq->lock is held */
|
2008-06-04 21:04:05 +02:00
|
|
|
static void rq_offline_rt(struct rq *rq)
|
2008-01-25 21:08:18 +01:00
|
|
|
{
|
|
|
|
if (rq->rt.overloaded)
|
|
|
|
rt_clear_overload(rq);
|
2008-05-12 21:21:01 +02:00
|
|
|
|
2008-06-05 14:49:58 +02:00
|
|
|
__disable_runtime(rq);
|
|
|
|
|
2008-05-12 21:21:01 +02:00
|
|
|
cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
|
2008-01-25 21:08:18 +01:00
|
|
|
}
|
2008-01-25 21:08:22 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* When switch from the rt queue, we bring ourselves to a position
|
|
|
|
* that we might want to pull RT tasks from other runqueues.
|
|
|
|
*/
|
2011-01-17 17:03:27 +01:00
|
|
|
static void switched_from_rt(struct rq *rq, struct task_struct *p)
|
2008-01-25 21:08:22 +01:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* If there are other RT tasks then we will reschedule
|
|
|
|
* and the scheduling of the other RT tasks will handle
|
|
|
|
* the balancing. But if we are the last RT task
|
|
|
|
* we may need to handle the pulling of RT tasks
|
|
|
|
* now.
|
|
|
|
*/
|
2012-11-22 21:02:15 +01:00
|
|
|
if (!p->on_rq || rq->rt.rt_nr_running)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (pull_rt_task(rq))
|
|
|
|
resched_task(rq->curr);
|
2008-01-25 21:08:22 +01:00
|
|
|
}
|
2008-11-25 00:28:41 +01:00
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
void init_sched_rt_class(void)
|
2008-11-25 00:28:41 +01:00
|
|
|
{
|
|
|
|
unsigned int i;
|
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
for_each_possible_cpu(i) {
|
2009-06-06 23:51:36 +02:00
|
|
|
zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
|
2009-01-01 03:08:45 +01:00
|
|
|
GFP_KERNEL, cpu_to_node(i));
|
2011-10-25 10:00:11 +02:00
|
|
|
}
|
2008-11-25 00:28:41 +01:00
|
|
|
}
|
2015-01-16 06:57:31 +01:00
|
|
|
|
2008-01-25 21:08:22 +01:00
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* When switching a task to RT, we may overload the runqueue
|
|
|
|
* with RT tasks. In this case we try to push them off to
|
|
|
|
* other runqueues.
|
|
|
|
*/
|
2011-01-17 17:03:27 +01:00
|
|
|
static void switched_to_rt(struct rq *rq, struct task_struct *p)
|
2008-01-25 21:08:22 +01:00
|
|
|
{
|
|
|
|
int check_resched = 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we are already running, then there's nothing
|
|
|
|
* that needs to be done. But if we are not running
|
|
|
|
* we may need to preempt the current running task.
|
|
|
|
* If that current running task is also an RT task
|
|
|
|
* then see if we can move to another run queue.
|
|
|
|
*/
|
2011-04-05 17:23:44 +02:00
|
|
|
if (p->on_rq && rq->curr != p) {
|
2008-01-25 21:08:22 +01:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
if (rq->rt.overloaded && push_rt_task(rq) &&
|
|
|
|
/* Don't resched if we changed runqueues */
|
|
|
|
rq != task_rq(p))
|
|
|
|
check_resched = 0;
|
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
if (check_resched && p->prio < rq->curr->prio)
|
|
|
|
resched_task(rq->curr);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Priority of the task has changed. This may cause
|
|
|
|
* us to initiate a push or pull.
|
|
|
|
*/
|
2011-01-17 17:03:27 +01:00
|
|
|
static void
|
|
|
|
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
|
2008-01-25 21:08:22 +01:00
|
|
|
{
|
2011-04-05 17:23:44 +02:00
|
|
|
if (!p->on_rq)
|
2011-01-17 17:03:27 +01:00
|
|
|
return;
|
|
|
|
|
|
|
|
if (rq->curr == p) {
|
2008-01-25 21:08:22 +01:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
/*
|
|
|
|
* If our priority decreases while running, we
|
|
|
|
* may need to pull tasks to this runqueue.
|
|
|
|
*/
|
|
|
|
if (oldprio < p->prio)
|
|
|
|
pull_rt_task(rq);
|
|
|
|
/*
|
|
|
|
* If there's a higher priority task waiting to run
|
2008-03-05 16:00:12 +01:00
|
|
|
* then reschedule. Note, the above pull_rt_task
|
|
|
|
* can release the rq lock and p could migrate.
|
|
|
|
* Only reschedule if p is still on the same runqueue.
|
2008-01-25 21:08:22 +01:00
|
|
|
*/
|
2008-12-29 15:39:49 +01:00
|
|
|
if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
|
2008-01-25 21:08:22 +01:00
|
|
|
resched_task(p);
|
|
|
|
#else
|
|
|
|
/* For UP simply resched on drop of prio */
|
|
|
|
if (oldprio < p->prio)
|
|
|
|
resched_task(p);
|
2008-01-25 21:08:05 +01:00
|
|
|
#endif /* CONFIG_SMP */
|
2008-01-25 21:08:22 +01:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* This task is not running, but if it is
|
|
|
|
* greater than the current running task
|
|
|
|
* then reschedule.
|
|
|
|
*/
|
|
|
|
if (p->prio < rq->curr->prio)
|
|
|
|
resched_task(rq->curr);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:27 +01:00
|
|
|
static void watchdog(struct rq *rq, struct task_struct *p)
|
|
|
|
{
|
|
|
|
unsigned long soft, hard;
|
|
|
|
|
2010-03-05 22:42:54 +01:00
|
|
|
/* max may change after cur was read, this will be fixed next tick */
|
|
|
|
soft = task_rlimit(p, RLIMIT_RTTIME);
|
|
|
|
hard = task_rlimit_max(p, RLIMIT_RTTIME);
|
2008-01-25 21:08:27 +01:00
|
|
|
|
|
|
|
if (soft != RLIM_INFINITY) {
|
|
|
|
unsigned long next;
|
|
|
|
|
sched/rt: Avoid updating RT entry timeout twice within one tick period
The issue below was found in 2.6.34-rt rather than the mainline rt
kernel, but the issue still exists upstream as well.
So please let me describe how it was noticed on 2.6.34-rt:
On this version, each softirq has its own thread, which means there
is at least one RT FIFO task per cpu. The priority of these
tasks is set to 49 by default. If a user launches an RT FIFO task
with a priority lower than the softirq RT tasks' 49, it's possible
for two RT FIFO tasks to be enqueued on one cpu runqueue at the
same moment. Under the current strategy of balancing RT tasks, we
really need to put them off to a CPU that they can run on as soon
as possible. Even if it means a bit of cache line flushing, we
want RT tasks to be run with the least latency.
When the user RT FIFO task which just launched before is
running, the sched timer tick of the current cpu happens. In this
tick period, the timeout value of the user RT task will be
updated once. Subsequently, we try to wake up one softirq RT
task on its local cpu. As the priority of current user RT task
is lower than the softirq RT task, the current task will be
preempted by the higher priority softirq RT task. Before
preemption, we check to see if current can readily move to a
different cpu. If so, we will reschedule to allow the RT push logic
to try to move current somewhere else. Whenever the woken
softirq RT task runs, it first tries to migrate the user FIFO RT
task over to a cpu that is running a task of lesser priority. If
migration is done, it will send a reschedule request to the found
cpu by IPI. Once the target cpu responds to the IPI
interrupt, it will pick the migrated user RT task to preempt its
current task. When the user RT task is running on the new cpu,
the sched timer tick of the cpu fires. So it will tick the user
RT task again. This also means the RT task timeout value will be
updated again. As the migration may be done in one tick period,
it means the user RT task timeout value will be updated twice
within one tick.
If we set a limit on the amount of cpu time for the user RT task
by setrlimit(RLIMIT_RTTIME), the SIGXCPU signal should be posted
upon reaching the soft limit.
But exactly when the SIGXCPU signal should be sent depends on the
RT task timeout value. In fact the timeout mechanism of sending
the SIGXCPU signal assumes the RT task timeout is increased once
every tick.
However, currently the timeout value may be added twice per
tick. So it results in the SIGXCPU signal being sent earlier
than expected.
To solve this issue, we prevent the timeout value from increasing
twice within one tick period by remembering the jiffies value of
the last timeout update. As long as the RT task's recorded jiffies
value differs from the global jiffies value, we allow its timeout
to be updated.
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Fan Du <fan.du@windriver.com>
Reviewed-by: Yong Zhang <yong.zhang0@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1342508623-2887-1-git-send-email-ying.xue@windriver.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-07-17 09:03:43 +02:00
|
|
|
if (p->rt.watchdog_stamp != jiffies) {
|
|
|
|
p->rt.timeout++;
|
|
|
|
p->rt.watchdog_stamp = jiffies;
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:27 +01:00
|
|
|
next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
|
2008-01-25 21:08:32 +01:00
|
|
|
if (p->rt.timeout > next)
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
	cputime_t utime;
	cputime_t stime;
	unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
	struct task_cputime totals;
};

struct thread_group_cputime {
	struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. Those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
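As a rough illustration of the SMP accounting helpers named above, one of
them might be sketched as follows. The signal_struct field name and the use
of get_cpu()/put_cpu() are assumptions for illustration only; this is not
the patch's code:

static inline void account_group_user_time(struct task_struct *tsk,
					   cputime_t cputime)
{
	struct task_cputime *times;

	/* update only this CPU's slot of the per-cpu group totals */
	times = per_cpu_ptr(tsk->signal->cputime.totals, get_cpu());
	times->utime = cputime_add(times->utime, cputime);
	put_cpu();
}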
2008-09-12 18:54:39 +02:00
|
|
|
p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
|
2008-01-25 21:08:27 +01:00
|
|
|
}
|
|
|
|
}
|
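The watchdog() above enforces RLIMIT_RTTIME. As a hedged user-space sketch
(not part of this file), arming that limit so a runaway SCHED_FIFO/SCHED_RR
thread receives SIGXCPU at the soft limit looks roughly like:

#include <sys/resource.h>

static int arm_rttime_limit(void)
{
	struct rlimit rl = {
		.rlim_cur = 500000,	/* soft limit: 500 ms of RT CPU time, in microseconds */
		.rlim_max = 1000000,	/* hard limit: 1 s, after which the task is killed */
	};

	return setrlimit(RLIMIT_RTTIME, &rl);
}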
2007-07-09 18:51:58 +02:00
|
|
|
|
2008-01-25 21:08:29 +01:00
|
|
|
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
|
2007-07-09 18:51:58 +02:00
|
|
|
{
|
2012-05-17 06:34:23 +02:00
|
|
|
struct sched_rt_entity *rt_se = &p->rt;
|
|
|
|
|
2007-12-20 15:01:17 +01:00
|
|
|
update_curr_rt(rq);
|
|
|
|
|
2008-01-25 21:08:27 +01:00
|
|
|
watchdog(rq, p);
|
|
|
|
|
2007-07-09 18:51:58 +02:00
|
|
|
/*
|
|
|
|
* RR tasks need a special form of timeslice management.
|
|
|
|
* FIFO tasks have no timeslices.
|
|
|
|
*/
|
|
|
|
if (p->policy != SCHED_RR)
|
|
|
|
return;
|
|
|
|
|
2008-01-25 21:08:27 +01:00
|
|
|
if (--p->rt.time_slice)
|
2007-07-09 18:51:58 +02:00
|
|
|
return;
|
|
|
|
|
2013-02-07 16:47:04 +01:00
|
|
|
p->rt.time_slice = sched_rr_timeslice;
|
2007-07-09 18:51:58 +02:00
|
|
|
|
2007-08-24 20:39:10 +02:00
|
|
|
/*
|
2012-05-17 06:34:23 +02:00
|
|
|
* Requeue to the end of queue if we (and all of our ancestors) are the
|
|
|
|
* only element on the queue
|
2007-08-24 20:39:10 +02:00
|
|
|
*/
|
2012-05-17 06:34:23 +02:00
|
|
|
for_each_sched_rt_entity(rt_se) {
|
|
|
|
if (rt_se->run_list.prev != rt_se->run_list.next) {
|
|
|
|
requeue_task_rt(rq, p, 0);
|
|
|
|
set_tsk_need_resched(p);
|
|
|
|
return;
|
|
|
|
}
|
2007-08-24 20:39:10 +02:00
|
|
|
}
|
2007-07-09 18:51:58 +02:00
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:08 +02:00
|
|
|
static void set_curr_task_rt(struct rq *rq)
|
|
|
|
{
|
|
|
|
struct task_struct *p = rq->curr;
|
|
|
|
|
2010-10-05 02:03:21 +02:00
|
|
|
p->se.exec_start = rq->clock_task;
|
2008-12-29 15:39:53 +01:00
|
|
|
|
|
|
|
/* The running task is never eligible for pushing */
|
|
|
|
dequeue_pushable_task(rq, p);
|
2007-10-15 17:00:08 +02:00
|
|
|
}
|
|
|
|
|
2010-01-14 04:21:52 +01:00
|
|
|
static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
|
2009-09-21 03:31:53 +02:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Time slice is 0 for SCHED_FIFO tasks
|
|
|
|
*/
|
|
|
|
if (task->policy == SCHED_RR)
|
2013-02-07 16:47:04 +01:00
|
|
|
return sched_rr_timeslice;
|
2009-09-21 03:31:53 +02:00
|
|
|
else
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
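On the user-space side, the value reported by get_rr_interval_rt() surfaces
through sched_rr_get_interval(); a small hedged example (not part of this
file):

#include <stdio.h>
#include <sched.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* pid 0 means the calling thread; non-RR policies report 0 */
	if (sched_rr_get_interval(0, &ts) == 0)
		printf("RR timeslice: %ld.%09ld s\n",
		       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}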
2011-10-25 10:00:11 +02:00
|
|
|
const struct sched_class rt_sched_class = {
|
2007-10-15 17:00:12 +02:00
|
|
|
.next = &fair_sched_class,
|
2007-07-09 18:51:58 +02:00
|
|
|
.enqueue_task = enqueue_task_rt,
|
|
|
|
.dequeue_task = dequeue_task_rt,
|
|
|
|
.yield_task = yield_task_rt,
|
|
|
|
|
|
|
|
.check_preempt_curr = check_preempt_curr_rt,
|
|
|
|
|
|
|
|
.pick_next_task = pick_next_task_rt,
|
|
|
|
.put_prev_task = put_prev_task_rt,
|
|
|
|
|
2007-10-24 18:23:51 +02:00
|
|
|
#ifdef CONFIG_SMP
|
2008-10-22 09:25:26 +02:00
|
|
|
.select_task_rq = select_task_rq_rt,
|
|
|
|
|
2008-01-25 21:08:07 +01:00
|
|
|
.set_cpus_allowed = set_cpus_allowed_rt,
|
2008-06-04 21:04:05 +02:00
|
|
|
.rq_online = rq_online_rt,
|
|
|
|
.rq_offline = rq_offline_rt,
|
2008-01-25 21:08:22 +01:00
|
|
|
.pre_schedule = pre_schedule_rt,
|
|
|
|
.post_schedule = post_schedule_rt,
|
2009-12-16 18:04:40 +01:00
|
|
|
.task_woken = task_woken_rt,
|
2008-01-25 21:08:22 +01:00
|
|
|
.switched_from = switched_from_rt,
|
2007-10-24 18:23:51 +02:00
|
|
|
#endif
|
2007-07-09 18:51:58 +02:00
|
|
|
|
2007-10-15 17:00:08 +02:00
|
|
|
.set_curr_task = set_curr_task_rt,
|
2007-07-09 18:51:58 +02:00
|
|
|
.task_tick = task_tick_rt,
|
2008-01-25 21:08:22 +01:00
|
|
|
|
2009-09-21 03:31:53 +02:00
|
|
|
.get_rr_interval = get_rr_interval_rt,
|
|
|
|
|
2008-01-25 21:08:22 +01:00
|
|
|
.prio_changed = prio_changed_rt,
|
|
|
|
.switched_to = switched_to_rt,
|
2015-01-16 06:57:31 +01:00
|
|
|
#ifdef CONFIG_SCHED_HMP
|
|
|
|
.inc_hmp_sched_stats = inc_hmp_sched_stats_rt,
|
|
|
|
.dec_hmp_sched_stats = dec_hmp_sched_stats_rt,
|
|
|
|
#endif
|
2007-07-09 18:51:58 +02:00
|
|
|
};
|
2008-06-19 14:22:24 +02:00
|
|
|
|
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
|
|
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
|
|
|
|
|
2011-10-25 10:00:11 +02:00
|
|
|
void print_rt_stats(struct seq_file *m, int cpu)
|
2008-06-19 14:22:24 +02:00
|
|
|
{
|
2011-05-14 08:20:02 +02:00
|
|
|
rt_rq_iter_t iter;
|
2008-06-19 14:22:24 +02:00
|
|
|
struct rt_rq *rt_rq;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
2011-05-14 08:20:02 +02:00
|
|
|
for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
|
2008-06-19 14:22:24 +02:00
|
|
|
print_rt_rq(m, cpu, rt_rq);
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
2008-06-24 20:09:43 +02:00
|
|
|
#endif /* CONFIG_SCHED_DEBUG */
|