Linux.patch

diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 7a35a6e..b5ee1dc 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -356,3 +356,6 @@
 347	i386	process_vm_readv	sys_process_vm_readv		compat_sys_process_vm_readv
 348	i386	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
 349	i386	kcmp			sys_kcmp
+350	i386	atlas_next			sys_atlas_next
+351	i386	atlas_submit		sys_atlas_submit
+352	i386	atlas_debug			sys_atlas_debug
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index a582bfe..43f9836 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -319,6 +319,9 @@
 310	64	process_vm_readv	sys_process_vm_readv
 311	64	process_vm_writev	sys_process_vm_writev
 312	common	kcmp			sys_kcmp
+313	common	atlas_next		sys_atlas_next
+314	64	atlas_submit		sys_atlas_submit
+315	common	atlas_debug		sys_atlas_debug
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/debian.quantal/config/amd64/config.flavour.atlas b/debian.quantal/config/amd64/config.flavour.atlas
new file mode 100644
index 0000000..06d8b53
--- /dev/null
+++ b/debian.quantal/config/amd64/config.flavour.atlas
@@ -0,0 +1,3 @@
+#
+# Config options for config.flavour.atlas automatically generated by splitconfig.pl
+#
diff --git a/debian.quantal/config/i386/config.flavour.atlas b/debian.quantal/config/i386/config.flavour.atlas
new file mode 100644
index 0000000..06d8b53
--- /dev/null
+++ b/debian.quantal/config/i386/config.flavour.atlas
@@ -0,0 +1,3 @@
+#
+# Config options for config.flavour.atlas automatically generated by splitconfig.pl
+#
diff --git a/debian.quantal/control.d/vars.atlas b/debian.quantal/control.d/vars.atlas
new file mode 100644
index 0000000..83e429e
--- /dev/null
+++ b/debian.quantal/control.d/vars.atlas
@@ -0,0 +1,6 @@
+arch="i386 amd64"
+supported="ATLAS"
+target="Experimental ATLAS scheduler."
+desc="=HUMAN= SMP"
+bootloader="grub-pc | grub-efi-amd64 | grub-efi-ia32 | grub | lilo (>= 19.1)"
+provides="kvm-api-4, redhat-cluster-modules, ivtv-modules, ndiswrapper-modules-1.9"
diff --git a/debian.quantal/rules.d/amd64.mk b/debian.quantal/rules.d/amd64.mk
index 9d00768..2692682 100644
--- a/debian.quantal/rules.d/amd64.mk
+++ b/debian.quantal/rules.d/amd64.mk
@@ -2,7 +2,7 @@ human_arch	= 64 bit x86
 build_arch	= x86_64
 header_arch	= $(build_arch)
 defconfig	= defconfig
-flavours	= generic
+flavours	= atlas
 build_image	= bzImage
 kernel_file	= arch/$(build_arch)/boot/bzImage
 install_file	= vmlinuz
diff --git a/debian.quantal/rules.d/i386.mk b/debian.quantal/rules.d/i386.mk
index 3e82c65..35feef3 100644
--- a/debian.quantal/rules.d/i386.mk
+++ b/debian.quantal/rules.d/i386.mk
@@ -2,7 +2,7 @@ human_arch	= 32 bit x86
 build_arch	= i386
 header_arch	= x86_64
 defconfig	= defconfig
-flavours        = generic
+flavours        = atlas
 build_image	= bzImage
 kernel_file	= arch/$(build_arch)/boot/bzImage
 install_file	= vmlinuz
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index b806b82..5c0e6b8 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -168,6 +168,14 @@ extern struct task_group root_task_group;
 		.run_list	= LIST_HEAD_INIT(tsk.rt.run_list),	\
 		.time_slice	= RR_TIMESLICE,				\
 	},								\
+	.atlas  = {						\
+		.state = ATLAS_UNDEF,		\
+		.flags = 0,					\
+		.jobs = LIST_HEAD_INIT(tsk.atlas.jobs),		\
+		.job  = NULL,		\
+		.real_job = NULL,	\
+		.jobs_lock = __SPIN_LOCK_UNLOCKED(tsk.atlas.jobs_lock),	\
+	},								\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	INIT_PUSHABLE_TASKS(tsk)					\
 	INIT_CGROUP_SCHED(tsk)						\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e63650f..d18a61d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -39,6 +39,8 @@
 #define SCHED_BATCH		3
 /* SCHED_ISO: reserved but not implemented yet */
 #define SCHED_IDLE		5
+/* SCHED_ATLAS: ATLAS scheduler */
+#define SCHED_ATLAS		6
 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
 #define SCHED_RESET_ON_FORK     0x40000000
 
@@ -1084,6 +1086,7 @@ struct sched_domain;
 #endif
 
 #define DEQUEUE_SLEEP		1
+#define DEQUEUE_SKIP_UPDATE 2
 
 struct sched_class {
 	const struct sched_class *next;
@@ -1211,6 +1214,31 @@ struct sched_rt_entity {
 #endif
 };
 
+enum atlas_state {		/* values stored in sched_atlas_entity.state */
+		ATLAS_UNDEF,	/* value the init task starts with */
+		ATLAS_BLOCKED,
+		ATLAS_RUNNING,
+};
+
+/* Per-task ATLAS scheduling state, embedded in task_struct as ->atlas. */
+struct sched_atlas_entity {
+	struct rb_node     run_node; /* node in atlas_rq->tasks_timeline (for normal operation) */
+	//struct list_head   run_list;  ??
+	struct list_head   list;     /*for initialization*/
+	unsigned int       state;    /* holds enum atlas_state values */
+	unsigned long      flags;    /* bit flags, e.g. ATLAS_CFS_ADVANCED, ATLAS_EXECTIME */
+	unsigned int       on_rq;
+	unsigned int       on_recover_rq;
+	ktime_t            start;
+	//struct atlas_rq    *atlas_rq; //needed?
+	
+	struct atlas_job         *job, *real_job; /* job is what entity_before() orders by; may be NULL */
+	struct list_head         jobs;            /* struct atlas_job entries linked via ->list */
+	spinlock_t               jobs_lock;       /* held while jobs is modified */
+
+	struct hrtimer 			 timer;
+};
+
 /*
  * default timeslice is 100 msecs (used only for SCHED_RR tasks).
  * Timeslices get refilled after they expire.
@@ -1244,6 +1272,7 @@ struct task_struct {
 	const struct sched_class *sched_class;
 	struct sched_entity se;
 	struct sched_rt_entity rt;
+	struct sched_atlas_entity atlas;
 #ifdef CONFIG_CGROUP_SCHED
 	struct task_group *sched_task_group;
 #endif
@@ -2005,6 +2034,10 @@ extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
 
+extern unsigned int sysctl_sched_atlas_min_slack;
+extern unsigned int sysctl_sched_atlas_advance_in_cfs;
+
+
 enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_NONE,
 	SCHED_TUNABLESCALING_LOG,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 19439c7..9c26268 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -857,6 +857,14 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
 				      const struct iovec __user *rvec,
 				      unsigned long riovcnt,
 				      unsigned long flags);
+asmlinkage long sys_atlas_next(void);
+asmlinkage long sys_atlas_submit(pid_t pid,
+					struct timeval __user *exectime,
+					struct timeval __user *deadline,
+					int time_base);
+asmlinkage long sys_atlas_debug(int operation,
+					int arg1,
+					int arg2);
 
 asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
 			 unsigned long idx1, unsigned long idx2);
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index ea7a203..708076a 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -150,6 +150,130 @@ TRACE_EVENT(sched_switch,
 );
 
 /*
+ * Tracepoint for pick_next_task.
+ */
+DECLARE_EVENT_CLASS(sched_pick_put_template,
+	/* Shared by sched_pick_next_task and sched_put_prev_task: snapshots p and, if set, p->atlas.job. */
+	TP_PROTO(struct rq *rq, struct task_struct *p),
+
+	TP_ARGS(rq, p),
+
+	TP_STRUCT__entry(
+		__array(	char,	p_comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	policy		)
+		__field(	unsigned long,	pending_work)	/* NOTE(review): never written in TP_fast_assign below (rq is not read there) */
+		__field(	unsigned long,	flags	)
+		__field(	int,	has_sub			)	/* NOTE(review): never written in TP_fast_assign below */
+		__field(    void *, job	            )
+		__field(	s64,	sdeadline		)
+		__field(	s64,	deadline		)
+		__field(	s64,	sexectime		)
+		__field(	s64,	exectime		)
+		__field(	s64,	now				)
+	),
+	/* the four job times are recorded as 0 when p->atlas.job is NULL */
+	TP_fast_assign(
+		memcpy(__entry->p_comm, p->comm, TASK_COMM_LEN);
+		__entry->pid	 = p->pid;
+		__entry->policy  = p->policy;
+		__entry->flags   = p->atlas.flags;
+		__entry->job     = p->atlas.job;
+		__entry->sdeadline = __entry->job ? ktime_to_ns(p->atlas.job->sdeadline) : 0;
+		__entry->deadline  = __entry->job ? ktime_to_ns(p->atlas.job->deadline) : 0;
+		__entry->sexectime = __entry->job ? ktime_to_ns(p->atlas.job->sexectime) : 0;
+		__entry->exectime  = __entry->job ? ktime_to_ns(p->atlas.job->exectime) : 0;
+		__entry->now       = ktime_to_ns(ktime_get());
+	),
+	/* only the pid appears in the formatted text output */
+	TP_printk("pid=%d",
+		__entry->pid)
+);
+
+DEFINE_EVENT(sched_pick_put_template, sched_pick_next_task,
+	     TP_PROTO(struct rq *rq, struct task_struct *p),
+	     TP_ARGS(rq, p));
+DEFINE_EVENT(sched_pick_put_template, sched_put_prev_task,
+	     TP_PROTO(struct rq *rq, struct task_struct *p),
+	     TP_ARGS(rq, p));
+
+/*
+ * Event class behind sched_enqueue_task / sched_dequeue_task: records comm, pid, policy, rq->cpu and a timestamp.
+ */
+DECLARE_EVENT_CLASS(sched_queue_template,
+
+	TP_PROTO(struct task_struct *p, struct rq *rq),
+
+	TP_ARGS(p, rq),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	policy		)
+		__field(	int,	rq_cpu		)
+		__field(	s64,	now			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->policy     = p->policy;
+		__entry->rq_cpu     = rq->cpu;
+		__entry->now        = ktime_to_ns(ktime_get());
+	),
+
+	TP_printk("pid=%d",
+		__entry->pid)
+);
+
+DEFINE_EVENT(sched_queue_template, sched_enqueue_task,
+	     TP_PROTO(struct task_struct *p, struct rq *rq),
+	     TP_ARGS(p, rq));
+
+DEFINE_EVENT(sched_queue_template, sched_dequeue_task,
+	     TP_PROTO(struct task_struct *p, struct rq *rq),
+	     TP_ARGS(p, rq));
+
+TRACE_EVENT(sched_enter,
+	/* Records both ATLAS pending_work masks (atlas_recover's shifted left by 8) plus a timestamp. */
+	TP_PROTO(struct rq *rq),
+
+	TP_ARGS(rq),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	pending_work)
+		__field(	s64,	now			)
+	),
+
+	TP_fast_assign(
+		__entry->pending_work = rq->atlas.pending_work | (rq->atlas_recover.pending_work << 8);
+		__entry->now          = ktime_to_ns(ktime_get());
+	),
+
+	TP_printk("sched_enter (%lld)",	/* now is s64, so print it signed */
+		 __entry->now)
+);
+
+TRACE_EVENT(sched_log,
+	TP_PROTO(const char *msg),
+	/* Stores a copy of msg, truncated to the fixed-size message field, plus a timestamp. */
+	TP_ARGS(msg),
+	
+	TP_STRUCT__entry(
+		__array(	char,	message,	30	)
+		__field(	s64,	now			)
+	),
+	
+	TP_fast_assign(
+		strlcpy(__entry->message, msg, sizeof(__entry->message));	/* never reads past msg's NUL, always terminates */
+		__entry->now          = ktime_to_ns(ktime_get());
+	),
+	
+	TP_printk("sched_log: %s",
+		 __entry->message)
+);
+	
+/*
  * Tracepoint for a task being migrated:
  */
 TRACE_EVENT(sched_migrate_task,
diff --git a/kernel/exit.c b/kernel/exit.c
index 46ce8da..f438a6a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -60,6 +60,7 @@
 #include <asm/mmu_context.h>
 
 static void exit_mm(struct task_struct * tsk);
+extern void exit_atlas(struct task_struct * tsk);
 
 static void __unhash_process(struct task_struct *p, bool group_dead)
 {
@@ -991,6 +992,7 @@ void do_exit(long code)
 		acct_process();
 	trace_sched_process_exit(tsk);
 
+	exit_atlas(tsk);
 	exit_sem(tsk);
 	exit_shm(tsk);
 	exit_files(tsk);
diff --git a/kernel/printk.c b/kernel/printk.c
index 146827f..b827392 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1875,21 +1875,33 @@ int is_console_locked(void)
 /*
  * Delayed printk version, for scheduler-internal messages:
  */
-#define PRINTK_BUF_SIZE		512
+#define PRINTK_BUF_SIZE		128
+#define PRINTK_BUF_NR		256
 
 #define PRINTK_PENDING_WAKEUP	0x01
 #define PRINTK_PENDING_SCHED	0x02
 
 static DEFINE_PER_CPU(int, printk_pending);
-static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
+
+typedef struct {
+	int idx;
+	char buf[PRINTK_BUF_NR][PRINTK_BUF_SIZE];
+} printk_sched_buf_t;
+
+static DEFINE_PER_CPU(printk_sched_buf_t, printk_sched_buf) = {.idx = 0 };
 
 void printk_tick(void)
 {
 	if (__this_cpu_read(printk_pending)) {
 		int pending = __this_cpu_xchg(printk_pending, 0);
 		if (pending & PRINTK_PENDING_SCHED) {
-			char *buf = __get_cpu_var(printk_sched_buf);
-			printk(KERN_WARNING "[sched_delayed] %s", buf);
+			printk_sched_buf_t *data = &__get_cpu_var(printk_sched_buf);
+			int i;
+			for (i = 0; i < data->idx; ++i) {
+				char *buf = data->buf[i]; 
+				printk(KERN_WARNING "[sched_delayed] %s", buf);
+			}
+			data->idx = 0;
 		}
 		if (pending & PRINTK_PENDING_WAKEUP)
 			wake_up_interruptible(&log_wait);
@@ -2362,16 +2374,19 @@ int printk_sched(const char *fmt, ...)
 {
 	unsigned long flags;
 	va_list args;
+	printk_sched_buf_t *data;
 	char *buf;
 	int r;
 
 	local_irq_save(flags);
-	buf = __get_cpu_var(printk_sched_buf);
-
+	data = &__get_cpu_var(printk_sched_buf);
+	if (data->idx >= PRINTK_BUF_NR)	/* every slot used since the last printk_tick() */
+		data->idx = PRINTK_BUF_NR - 1;	/* overwrite the newest slot instead of BUG()ing from a debug print */
+	buf = data->buf[data->idx++];
 	va_start(args, fmt);
 	r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args);
 	va_end(args);
-
+	
 	__this_cpu_or(printk_pending, PRINTK_PENDING_SCHED);
 	local_irq_restore(flags);
 
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 173ea52..445a719 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
-obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
+obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o atlas.o atlas_recover.o
 obj-$(CONFIG_SMP) += cpupri.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/atlas.c b/kernel/sched/atlas.c
new file mode 100644
index 0000000..454f791
--- /dev/null
+++ b/kernel/sched/atlas.c
@@ -0,0 +1,2058 @@
+#include <linux/syscalls.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
+#include "sched.h"
+
+
+const struct sched_class atlas_sched_class;
+
+unsigned int sysctl_sched_atlas_min_slack      = 1000000ULL;
+unsigned int sysctl_sched_atlas_advance_in_cfs = 0;
+
+
+#define TIMER_EXPIRED                0x01
+
+/* bits of atlas_rq->pending_work, set from timer/wakeup paths and handled in atlas_do_pending_work() */
+#define PENDING_STOP_CFS_ADVANCED    0x01
+#define PENDING_START_CFS_ADVANCED   0x02
+#define PENDING_JOB_TIMER            0x04
+#define PENDING_MOVE_TO_CFS          0x08
+#define PENDING_MOVE_TO_RECOVER      0x10
+#define PENDING_MOVE_TO_ATLAS        0x20
+
+#define ATLAS_DEBUG
+
+enum {	/* category bits for DEBUG()/DEBUG_ON(), tested against debug_mask */
+	DEBUG_SYS_NEXT       = 1UL << 0,
+	DEBUG_SYS_SUBMIT     = 1UL << 1,
+	DEBUG_ENQUEUE        = 1UL << 2,
+	DEBUG_DEQUEUE        = 1UL << 3,
+	DEBUG_PICK_NEXT_TASK = 1UL << 4,
+	DEBUG_SET_CURR_TASK  = 1UL << 5,
+	DEBUG_SWITCHED_TO    = 1UL << 6,
+	DEBUG_PUT_PREV_TASK  = 1UL << 7,
+	DEBUG_CHECK_PREEMPT  = 1UL << 8,
+	DEBUG_RBTREE         = 1UL << 9,
+	DEBUG_TIMER          = 1UL << 10,
+	DEBUG_SUBMISSIONS    = 1UL << 11,
+	DEBUG_SWITCH_SCHED   = 1UL << 12,	/* was 1UL << 11, which aliased DEBUG_SUBMISSIONS */
+	DEBUG_ADAPT_SEXEC    = 1UL << 13,
+	DEBUG_SLACK_TIME     = 1UL << 14,
+};
+
+enum update_exec_time {	/* NOTE(review): no use is visible in this part of the file; confirm the meaning at the call sites */
+	UPDATE_EXEC_TIME,
+	NO_UPDATE_EXEC_TIME,
+};
+
+#ifdef ATLAS_DEBUG
+/*
+ * debug_mask selects the DEBUG_* categories that DEBUG()/DEBUG_ON() act on,
+ * e.g. DEBUG_PICK_NEXT_TASK | DEBUG_ENQUEUE | DEBUG_TIMER; 0 silences all.
+ */
+static const unsigned debug_mask = 0;
+
+static int printk_counter = 0;
+	#define DEBUG(T,STR,...) \
+		do { \
+			if ((T) & debug_mask)  { \
+				preempt_disable(); \
+				printk_sched("%d (%d): "#T ": " STR "\n", (printk_counter++), \
+					smp_processor_id(), ##__VA_ARGS__); \
+				preempt_enable(); \
+			} \
+		} while(0)
+	
+	#define DEBUG_ON(T) if (debug_mask & (T))
+			
+#else 
+	#define DEBUG(...)
+	#define DEBUG_ON(T) if (0)
+#endif /* ATLAS_DEBUG */
+
+/* Zero a new job, mark its tree node as unlinked and hand the caller the first reference. */
+static inline void init_job(struct atlas_job *job) {
+	memset(job, 0, sizeof(*job));
+	rb_init_node(&job->rb_node);	/* an all-zero node does not satisfy RB_EMPTY_NODE(), which job_in_rq() tests */
+	atomic_set(&job->count, 1);
+}
+
+/* Take an additional reference on job; NULL is passed through untouched. */
+static inline struct atlas_job *get_job(struct atlas_job *job)
+{
+	if (job != NULL)
+		atomic_inc(&job->count);
+	return job;
+}
+
+/* Drop one reference; the final put releases the job's pid reference and frees the job. */
+static void put_job(struct atlas_job *job)
+{
+	if (job == NULL)
+		return;
+
+	if (!atomic_dec_and_test(&job->count))
+		return;
+
+	put_pid(job->pid);
+	kfree(job);
+}
+
+/* Non-zero iff a's deadline is strictly earlier than b's; neither may be NULL. */
+static inline int job_before(struct atlas_job *a, struct atlas_job *b)
+{
+	BUG_ON(!a);
+	BUG_ON(!b);
+	return ktime_to_ns(b->deadline) > ktime_to_ns(a->deadline);
+}
+
+static int entity_before(struct sched_atlas_entity *a,
+		struct sched_atlas_entity *b)
+{
+	/*
+	 * Ordering predicate for the run tree: non-zero puts "a" (the new
+	 * entity) to the left of "b" (the entry it is compared against).
+	 *
+	 * A preemption within sys_next or a wakeup due to a signal can leave
+	 * se->job NULL, and such entities are queued in the tree as well, so
+	 * both sides have to be checked (o = no job, x = has a job):
+	 *
+	 * new | comparator
+	 * ----------------
+	 *  o  |  o  doesn't matter
+	 *  o  |  x  new goes to the beginning
+	 *  x  |  o  the old entry stays on the left side
+	 *  x  |  x  compare deadlines
+	 */
+
+	if (unlikely(a->job == NULL))
+		return 1;
+
+	if (unlikely(b->job == NULL))
+		return 0;
+
+	return job_before(a->job, b->job);
+}
+
+static void enqueue_entity(struct atlas_rq *atlas_rq,
+		struct sched_atlas_entity *se)
+{
+	struct rb_node **slot = &atlas_rq->tasks_timeline.rb_node;
+	struct rb_node *above = NULL;
+	int went_right = 0;
+
+	//FIXME?
+	rb_init_node(&se->run_node);
+	DEBUG(DEBUG_RBTREE, "enqueue_task_rb_tree");
+	/* descend to the free slot that keeps the tree ordered by entity_before() */
+	while (*slot) {
+		struct sched_atlas_entity *other;
+		above = *slot;
+		other = rb_entry(above, struct sched_atlas_entity, run_node);
+		if (entity_before(se, other)) {
+			slot = &above->rb_left;
+		} else {
+			slot = &above->rb_right;
+			went_right = 1;
+		}
+	}
+
+	/* never turned right: se is the new minimum, so refresh the cached leftmost node */
+	if (!went_right)
+		atlas_rq->rb_leftmost_se = &se->run_node;
+
+	rb_link_node(&se->run_node, above, slot);
+	rb_insert_color(&se->run_node, &atlas_rq->tasks_timeline);
+}
+
+static void dequeue_entity(struct atlas_rq *atlas_rq,
+		struct sched_atlas_entity *se)
+{
+	struct rb_node *victim = &se->run_node;
+
+	DEBUG(DEBUG_RBTREE, "dequeue_task_rb_tree");
+
+	/* When the cached minimum is removed, its in-order successor */
+	/* (as returned by rb_next()) becomes the new cached minimum. */
+	if (victim == atlas_rq->rb_leftmost_se)
+		atlas_rq->rb_leftmost_se = rb_next(victim);
+
+	rb_erase(victim, &atlas_rq->tasks_timeline);
+}
+
+/* Entity behind the cached leftmost node, or NULL if none is cached. */
+static struct sched_atlas_entity *pick_first_entity(struct atlas_rq *atlas_rq)
+{
+	struct rb_node *node = atlas_rq->rb_leftmost_se;
+
+	if (node == NULL)
+		return NULL;
+	return rb_entry(node, struct sched_atlas_entity, run_node);
+}
+
+/*
+ * In-order successor of se in the run tree, or NULL when se has none.
+ */
+static struct sched_atlas_entity *pick_next_entity(struct sched_atlas_entity *se)
+{
+	struct rb_node *succ = rb_next(&se->run_node);
+
+	return succ ? rb_entry(succ, struct sched_atlas_entity, run_node) : NULL;
+}
+
+/* First (leftmost) job in atlas_rq->jobs, or NULL if the tree is empty. */
+static struct atlas_job *pick_first_job(struct atlas_rq *atlas_rq)
+{
+	struct rb_node *node = rb_first(&atlas_rq->jobs);
+	if (node == NULL)
+		return NULL;
+	return rb_entry(node, struct atlas_job, rb_node);
+}
+
+/* Last (rightmost) job in atlas_rq->jobs, or NULL if the tree is empty. */
+static struct atlas_job *pick_last_job(struct atlas_rq *atlas_rq)
+{
+	struct rb_node *node = rb_last(&atlas_rq->jobs);
+	if (node == NULL)
+		return NULL;
+	return rb_entry(node, struct atlas_job, rb_node);
+}
+
+/* In-order successor of s in the job tree, or NULL when s has none. */
+static struct atlas_job *pick_next_job(struct atlas_job *s)
+{
+	struct rb_node *node = rb_next(&s->rb_node);
+	if (node == NULL)
+		return NULL;
+	return rb_entry(node, struct atlas_job, rb_node);
+}
+
+/* In-order predecessor of s in the job tree, or NULL when s has none. */
+static struct atlas_job *pick_prev_job(struct atlas_job *s)
+{
+	struct rb_node *node = rb_prev(&s->rb_node);
+	if (node == NULL)
+		return NULL;
+	return rb_entry(node, struct atlas_job, rb_node);
+}
+/* Non-zero unless the job's rb_node is marked empty (RB_EMPTY_NODE). */
+static inline int job_in_rq(struct atlas_job *s) {
+	return !RB_EMPTY_NODE(&s->rb_node);
+}
+
+/*
+ * Resolve the job's struct pid via get_pid_task(); remember to call put_task_struct(p) after you are done.
+ */
+static inline struct task_struct *task_of_job(struct atlas_job *s) {
+	return get_pid_task(s->pid, PIDTYPE_PID);
+}
+/* Non-zero while the rq timer target is ATLAS_SLACK. */
+static inline int in_slacktime(struct atlas_rq *atlas_rq) {
+	return (atlas_rq->timer_target == ATLAS_SLACK);
+}
+/* Smaller (earlier) of two ktime values, compared in nanoseconds. */
+static inline ktime_t ktime_min(ktime_t a, ktime_t b) {
+	return ns_to_ktime(min(ktime_to_ns(a), ktime_to_ns(b)));
+}
+/* Non-zero if a, converted to nanoseconds, is negative. */
+static inline int ktime_neg(ktime_t a) {
+	return ktime_to_ns(a) < 0;
+}
+/* Non-zero if a equals ktime_set(0,0). */
+static inline int ktime_zero(ktime_t a) {
+	return ktime_equal(ktime_set(0,0), a);
+}
+
+/* Three-way compare: 1 if a is later than b, 0 if they are equal, -1 if a is earlier. */
+static inline int ktime_cmp(ktime_t a, ktime_t b) {
+	const s64 diff_ns = ktime_to_ns(ktime_sub(a, b));
+
+	if (diff_ns == 0)
+		return 0;
+
+	return diff_ns > 0 ? 1 : -1;
+}
+/* sdeadline minus sexectime of the job. */
+static inline ktime_t job_start(struct atlas_job *s) {
+	return ktime_sub(s->sdeadline, s->sexectime);
+}
+/* Non-zero once now has reached or passed the job's deadline. */
+static inline int job_missed_deadline(struct atlas_job *s, ktime_t now) {
+	return ktime_cmp(s->deadline, now) <= 0;
+}
+/* The struct rq that embeds atlas_rq as its ->atlas member. */
+static inline struct rq *rq_of(struct atlas_rq *atlas_rq)
+{
+	return container_of(atlas_rq, struct rq, atlas);
+}
+/* The task_struct that embeds se as its ->atlas member. */
+static inline struct task_struct *task_of(struct sched_atlas_entity *se)
+{
+	return container_of(se, struct task_struct, atlas);
+}
+
+
+
+/*
+ **********************************************************
+ ***                 timer stuff                        ***
+ **********************************************************
+ */
+/* Start timer at tim through __hrtimer_start_range_ns(), passing 0 for both its range and wakeup arguments. */
+static inline int hrtimer_start_nowakeup(struct hrtimer *timer, ktime_t tim,
+		const enum hrtimer_mode mode)
+{
+	return __hrtimer_start_range_ns(timer, tim, 0, mode, 0);
+}
+/* Record ktime as timer_end and start the rq hrtimer for that absolute time; rq->lock must be held. */
+static inline void __setup_rq_timer(struct atlas_rq *atlas_rq, ktime_t ktime) {
+	assert_raw_spin_locked(&rq_of(atlas_rq)->lock);
+	atlas_rq->timer_end = ktime;
+	
+	DEBUG(DEBUG_TIMER, "timer up to: %lld",
+			ktime_to_us(atlas_rq->timer_end));
+	
+	BUG_ON(atlas_rq->timer_target == ATLAS_NONE);	/* start_slack()/start_job() set the target before calling */
+	hrtimer_start_nowakeup(&atlas_rq->timer, ktime, HRTIMER_MODE_ABS_PINNED);
+}
+/* Arm the rq timer to fire `slack` from now and mark it as a slack timer; no timer may be armed yet. */
+static inline void start_slack(struct atlas_rq *atlas_rq, ktime_t slack) {
+	DEBUG(DEBUG_TIMER, "Setup timer for slack");
+	BUG_ON(atlas_rq->timer_target != ATLAS_NONE);
+	slack = ktime_add(slack, ktime_get());	/* relative -> absolute expiry */
+	atlas_rq->timer_target = ATLAS_SLACK;
+	__setup_rq_timer(atlas_rq, slack);
+}
+
+/* Arm the rq timer for job: it fires sexectime from now, but no later than the job's deadline. */
+static inline void start_job(struct atlas_rq *atlas_rq, struct atlas_job *job) {
+	ktime_t expires = ktime_get();
+
+	DEBUG(DEBUG_TIMER, "Setup timer for job");
+	BUG_ON(atlas_rq->timer_target != ATLAS_NONE);
+	atlas_rq->timer_target = ATLAS_JOB;
+
+	expires = ktime_min(ktime_add(expires, job->sexectime), job->deadline);
+	__setup_rq_timer(atlas_rq, expires);
+}
+/* Cancel an armed slack timer; a successful cancel queues PENDING_STOP_CFS_ADVANCED and asks for a reschedule. */
+static void reset_slack_time(struct atlas_rq *atlas_rq) {
+	if (!(atlas_rq->timer_target == ATLAS_SLACK))
+		return;
+	
+	if (hrtimer_cancel(&atlas_rq->timer)) {
+		atlas_rq->pending_work |= PENDING_STOP_CFS_ADVANCED;
+		resched_task(rq_of(atlas_rq)->curr);
+		atlas_rq->timer_target = ATLAS_NONE;
+	}
+
+	BUG_ON(atlas_rq->timer_target != ATLAS_NONE);	/* trips if hrtimer_cancel() returned 0 while the target was still ATLAS_SLACK */
+
+	DEBUG(DEBUG_TIMER, "reset timer programmed for slack time");
+}
+/* Cancel an armed job timer; unlike reset_slack_time() no pending work is queued. */
+static void reset_job_time(struct atlas_rq *atlas_rq) {
+	if (!(atlas_rq->timer_target == ATLAS_JOB))
+		return;
+	
+	if (hrtimer_cancel(&atlas_rq->timer)) {
+		atlas_rq->timer_target = ATLAS_NONE;
+	}
+
+	BUG_ON(atlas_rq->timer_target != ATLAS_NONE);	/* trips if hrtimer_cancel() returned 0 while the target was still ATLAS_JOB */
+
+	DEBUG(DEBUG_TIMER, "reset timer programmed for job");
+}
+
+static int update_execution_time(struct atlas_rq *atlas_rq,
+	struct atlas_job *job, ktime_t delta_exec);
+
+/* Cancel whichever rq timer is armed (slack or job); rq->lock must be held. Leaves timer_target == ATLAS_NONE. */
+static inline void reset_timer(struct atlas_rq *atlas_rq) {
+	
+	assert_raw_spin_locked(&rq_of(atlas_rq)->lock);
+	/* invariant: a task advanced into CFS implies slack time or an already pending stop request */
+	BUG_ON(atlas_rq->advance_in_cfs && atlas_rq->timer_target != ATLAS_SLACK && !(atlas_rq->pending_work & PENDING_STOP_CFS_ADVANCED));
+
+	switch (atlas_rq->timer_target) {
+		case ATLAS_NONE:
+			break;
+		case ATLAS_SLACK:
+			reset_slack_time(atlas_rq);
+			break;
+		case ATLAS_JOB:
+			reset_job_time(atlas_rq);
+			break;
+		default:
+			BUG();
+	}
+	/* with the slack timer gone, an advanced task must have its stop request queued */
+	BUG_ON(atlas_rq->advance_in_cfs && !(atlas_rq->pending_work & PENDING_STOP_CFS_ADVANCED));
+
+	BUG_ON(atlas_rq->timer_target != ATLAS_NONE);
+}
+
+static inline void erase_rq_job(struct atlas_rq *, struct atlas_job *);
+void atlas_switch_scheduler(struct rq *, struct task_struct *, const struct sched_class *);
+static void update_curr_atlas(struct rq *);
+
+/* hrtimer callback of atlas_rq->timer: turns the expired timer_target into a pending_work bit and requests a reschedule. */
+static enum hrtimer_restart timer_rq_func(struct hrtimer *timer)
+{
+	struct atlas_rq *atlas_rq = container_of(timer, struct atlas_rq, timer);
+	struct rq *rq = rq_of(atlas_rq);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	
+	update_rq_clock(rq);
+	if (atlas_rq->curr)
+		update_curr_atlas(rq);
+
+	BUG_ON(atlas_rq->timer_target == ATLAS_NONE);
+
+	sched_log("Timer: %s", atlas_rq->timer_target == ATLAS_JOB ? "JOB" :
+						   atlas_rq->timer_target == ATLAS_SLACK ? "SLACK" : "BUG");
+	
+	switch (atlas_rq->timer_target) {
+		case ATLAS_JOB:
+			BUG_ON(rq->curr->sched_class != &atlas_sched_class);	/* a job timer is only expected while an ATLAS task runs */
+			atlas_rq->pending_work |= PENDING_JOB_TIMER;
+			break;
+		case ATLAS_SLACK:
+			atlas_rq->pending_work |= PENDING_STOP_CFS_ADVANCED;
+			break;
+		default:
+			BUG();
+	}
+
+	atlas_rq->timer_target = ATLAS_NONE;	/* one-shot: the timer is not restarted (HRTIMER_NORESTART below) */
+	DEBUG(DEBUG_TIMER, "timer expired: calling resched_task now");
+	
+	/* resched curr; the pending_work bits are handled by atlas_do_pending_work() */
+	if (rq->curr)
+		resched_task(rq->curr);
+	
+	BUG_ON(atlas_rq->advance_in_cfs &&
+		!(atlas_rq->pending_work & PENDING_STOP_CFS_ADVANCED));
+
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+	return HRTIMER_NORESTART;
+}
+
+
+/*
+ * switching between the schedulers
+ */
+
+#ifdef ATLAS_DEBUG
+/*
+ * Printable label for a scheduling class, used in debug output;
+ * "UNKNOWN" for any class not listed here.
+ */
+static const char * sched_name(const struct sched_class *c)
+{
+	return c == &rt_sched_class            ? "REALTIME" :
+	       c == &atlas_sched_class         ? "ATLAS" :
+	       c == &atlas_recover_sched_class ? "ATLAS_RECOVER" :
+	       c == &fair_sched_class          ? "CFS" :
+	       c == &idle_sched_class          ? "IDLE" :
+	                                         "UNKNOWN";
+}
+#endif /* ATLAS_DEBUG */
+
+/*
+ * Move task p on rq from its current scheduling class to new_sched_class (no-op if equal); rq->lock must be held.
+ */
+
+void atlas_switch_scheduler(struct rq *rq,
+	struct task_struct *p, const struct sched_class *new_sched_class)
+{
+	const struct sched_class *prev_sched_class;
+	int on_rq, running;
+
+	BUG_ON(in_interrupt());
+	assert_raw_spin_locked(&rq->lock);
+
+	//raw_spin_lock(&p->pi_lock);
+	
+	prev_sched_class = p->sched_class;
+
+	if (new_sched_class == prev_sched_class) {
+		//raw_spin_unlock(&p->pi_lock);
+		return;
+	}
+	on_rq = p->on_rq;
+	running = rq->curr == p;
+	
+	DEBUG(DEBUG_SWITCH_SCHED, "pid=%d from %s to %s, on_rq=%d, running=%d",
+		p->pid, sched_name(prev_sched_class), sched_name(new_sched_class), on_rq, running);
+	
+	if (on_rq)	/* take p out of the old class first ... */
+		prev_sched_class->dequeue_task(rq, p, 0);
+	if (running)
+		prev_sched_class->put_prev_task(rq, p);
+
+	p->sched_class = new_sched_class;
+	
+	if (running)	/* ... then re-establish the same state in the new class */
+		new_sched_class->set_curr_task(rq);
+	if (on_rq)
+		new_sched_class->enqueue_task(rq, p, 0);
+
+	if (prev_sched_class->switched_from)	/* switched_from is optional; switched_to is called unconditionally */
+		prev_sched_class->switched_from(rq, p);
+	new_sched_class->switched_to(rq, p);
+	
+	//FIXME: pi-stuff?
+	//raw_spin_unlock(&p->pi_lock);
+	//rt_mutex_adjust_pi(p);
+}
+/* During slack time, hand one runnable ATLAS task to CFS (flagged ATLAS_CFS_ADVANCED); with none runnable, cancel the slack timer. */
+static void advance_thread_in_cfs(struct atlas_rq *atlas_rq) {
+	struct sched_atlas_entity *se;
+	struct task_struct *p;
+
+	BUG_ON(atlas_rq->advance_in_cfs != NULL);	/* only one task is advanced at a time */
+
+	if (!atlas_rq->nr_runnable) {
+		sched_log("advance: no thread ready");
+		reset_slack_time(atlas_rq);
+		return;
+	}
+
+	se = atlas_rq->curr;
+
+	/*
+	 * se can be the blocked entity in cfs (put_prev_task not called yet)
+	 * -> select the first entity from rb-tree
+	 */
+	if (!se || !task_of(se)->on_rq)
+		se = pick_first_entity(atlas_rq);
+	
+	BUG_ON(!se);
+	
+	p = task_of(se);
+	BUG_ON(!p->on_rq);
+	
+	BUG_ON(atlas_rq->timer_target != ATLAS_SLACK);
+	atlas_rq->advance_in_cfs = p;
+	
+	//move p to cfs
+	p->atlas.flags |= ATLAS_CFS_ADVANCED;
+	
+	sched_log("advance: next thread p=%d", p->pid);
+	atlas_switch_scheduler(rq_of(atlas_rq), p, &fair_sched_class);
+}
+/* The CFS-advanced task p is no longer on the rq: give it back to ATLAS and, if slack remains, advance the next one. rq->lock held. */
+void atlas_cfs_blocked(struct rq *rq, struct task_struct *p) {
+	struct atlas_rq *atlas_rq = &rq->atlas;
+
+	assert_raw_spin_locked(&rq->lock);
+	sched_log("advance_in_cfs: blocked");
+	BUG_ON(p->sched_class != &fair_sched_class);
+	BUG_ON(p->on_rq);
+	BUG_ON(!(p->atlas.flags & ATLAS_CFS_ADVANCED));
+
+	/* switch the scheduling class back to atlas */
+	p->atlas.flags &= ~ATLAS_CFS_ADVANCED;
+	atlas_switch_scheduler(rq, p, &atlas_sched_class);
+	atlas_rq->advance_in_cfs = NULL;
+
+	/* move the next ready task to cfs */
+	if (in_slacktime(atlas_rq))
+		advance_thread_in_cfs(atlas_rq);
+}
+
+
+#ifdef ATLAS_DEBUG
+/* Print one job in ms: scheduled start - sdeadline, then (deadline - exectime) - deadline, its address and refcount. */
+static void debug_job(struct atlas_job *s) {
+	if (!s) {
+		printk_sched("DEBUG_JOBS: NULL\n");
+		return;
+	}
+	printk_sched("DEBUG_JOBS: %6lld - %6lld (%6lld - %6lld) (%p, ref=%d)\n",
+		ktime_to_ms(ktime_sub(s->sdeadline, s->sexectime)),
+		ktime_to_ms(s->sdeadline),
+		ktime_to_ms(ktime_sub(s->deadline, s->exectime)),
+		ktime_to_ms(s->deadline),
+		s,
+		atomic_read(&s->count));
+}
+
+/*
+ * Dump every job of atlas_rq->jobs in tree order, reporting non-zero gaps between neighbours.
+ */
+static void __debug_jobs(struct atlas_rq *atlas_rq) {
+	struct atlas_job *cur, *prev = NULL;
+
+	cur = pick_first_job(atlas_rq);
+	printk_sched("DEBUG_JOBS:\n");
+	for (; cur != NULL; prev = cur, cur = pick_next_job(cur)) {
+		if (prev != NULL) {
+			/* gap: from the previous job's sdeadline to this job's start */
+			ktime_t gap_from = prev->sdeadline;
+			ktime_t gap_to = job_start(cur);
+			ktime_t gap = ktime_sub(gap_to, gap_from);
+			if (!ktime_zero(gap)) {
+				printk_sched("DEBUG_JOBS: %6lld - %6lld (gap=%lld)\n",
+					ktime_to_ms(gap_from),
+					ktime_to_ms(gap_to),
+					ktime_to_ms(gap));
+			}
+		}
+		debug_job(cur);
+	}
+	printk_sched("======================\n");
+}
+/* Locked wrapper: takes atlas_rq->lock (irqsave) around __debug_jobs(). */
+static void debug_jobs(struct atlas_rq *atlas_rq) {
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&atlas_rq->lock, flags);
+	__debug_jobs(atlas_rq);
+	raw_spin_unlock_irqrestore(&atlas_rq->lock, flags);
+}
+
+/*
+ * Dump rq's ATLAS state (nr_runnable, current entity, run tree, then the job tree); rq must be locked
+ */
+static void __debug_rq(struct rq *rq) {
+	struct sched_atlas_entity *se;
+	
+	printk_sched("SCHED_ATLAS: DEBUG rq=%d\n", cpu_of(rq));
+	printk_sched("    Currently running: %d\n", rq->atlas.nr_runnable);
+	printk_sched("    Curr: pid=%d\n", rq->atlas.curr ? task_of(rq->atlas.curr)->pid : -1);
+	
+	printk_sched("    DEBUG tasks_timeline:\n");
+	se = pick_first_entity(&rq->atlas);
+	while (se) {
+		printk_sched("        pid=%5d, job=%p\n", task_of(se)->pid, se->job);
+		se = pick_next_entity(se);	
+	}
+	printk_sched("======================\n");
+	debug_jobs(&rq->atlas);	/* takes rq->atlas.lock itself */
+}
+/* Locked wrapper: takes rq->lock (irqsave) around __debug_rq(). */
+static void debug_rq(struct rq *rq) {
+	unsigned long flags;
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	__debug_rq(rq);
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static void debug_task(struct task_struct *p) {	/* dump p's ATLAS state and its queued jobs */
+	unsigned counter = 0;
+	struct atlas_job *job;
+	struct sched_atlas_entity *se = &p->atlas;
+	const char *s;
+	
+	printk_sched("SCHED_ATLAS: DEBUG task pid=%d\n", p->pid);
+	switch (p->atlas.state) {
+	case ATLAS_BLOCKED:
+		s = "ATLAS_BLOCKED";
+		break;
+	case ATLAS_UNDEF:
+		s = "ATLAS_UNDEF";
+		break;
+	case ATLAS_RUNNING:
+		s = "ATLAS_RUNNING";
+		break;
+	default:
+		s = "UNKNOWN";
+	}
+	
+	printk_sched("State: %s\n", s);
+	printk_sched("Submissions:\n");
+	spin_lock(&p->atlas.jobs_lock);	/* keep the job list stable while it is walked */
+	printk_sched("se->job=%p\n", p->atlas.job);
+	list_for_each_entry(job, &se->jobs, list) {
+		counter++;
+		debug_job(job);
+	}
+	printk_sched("    count: %u\n", counter);	/* counter is unsigned, so %u rather than %d */
+	printk_sched("======================\n");
+	spin_unlock(&p->atlas.jobs_lock);
+}
+#endif /* ATLAS_DEBUG */
+
+/*
+ * Sorted insert of new_job into se->jobs (by deadline); must be called with se->jobs_lock held.
+ */
+static void push_task_job(struct sched_atlas_entity *se,
+		struct atlas_job *new_job)
+{
+	struct list_head *pos;
+
+	assert_spin_locked(&se->jobs_lock);
+
+	/*
+	 * Scan backwards, since a new job typically goes to the end. Stop at the last job with
+	 * an earlier deadline; without one, pos ends at the list head and new_job becomes first.
+	 */
+	list_for_each_prev(pos, &se->jobs) {
+		if (job_before(list_entry(pos, struct atlas_job, list), new_job))
+			break;
+	}
+	list_add(&new_job->list, pos);
+	get_job(new_job);
+}
+
+/*
+ * Detach and return the first job of se->jobs, or NULL if the list is empty.
+ * Caller is responsible for calling put_job(job) when done.
+ */
+static struct atlas_job *pop_task_job(struct sched_atlas_entity *se)
+{
+	struct atlas_job *first = NULL;
+
+	spin_lock(&se->jobs_lock);
+
+	/* push_task_job() keeps the list sorted, so the head has the earliest deadline */
+	if (!list_empty(&se->jobs)) {
+		first = list_entry(se->jobs.next, struct atlas_job, list);
+		list_del(&first->list);
+	}
+
+	spin_unlock(&se->jobs_lock);
+
+	return first;
+}
+
+/*
+ * Queue job on p, waking p if it was ATLAS_BLOCKED with an empty job list; must be called with rcu_read_lock held
+ */
+static void assign_task_job(struct task_struct *p, struct atlas_job *job)
+{
+	struct sched_atlas_entity *se;
+	unsigned wakeup = 0;
+	
+	BUG_ON(!p);
+
+	{
+		//ensure that p is mapped to cpu 0
+		cpumask_t test;
+		cpumask_clear(&test);
+		cpumask_set_cpu(0, &test);
+
+		BUG_ON(!cpumask_equal(&test, &p->cpus_allowed));	/* p's affinity must be exactly {cpu 0} */
+	}
+	
+	se = &p->atlas;
+
+	spin_lock(&se->jobs_lock);
+	wakeup = list_empty(&se->jobs) && (se->state == ATLAS_BLOCKED);	/* sampled before the job is queued */
+	push_task_job(se, job);
+	spin_unlock(&se->jobs_lock);
+	
+	/*
+	 * wake up process
+	 */
+	if (wakeup)
+		wake_up_process(p);
+}
+
+/*
+ * Unlink s from the job list it is on and drop one reference via put_job(); NULL is ignored. Call with se->jobs_lock held!
+ */
+void erase_task_job(struct atlas_job *s) {
+	if (unlikely(!s))
+		return;
+	list_del(&s->list);
+	put_job(s);
+}
+
+
+
+/*
+ * Handling of pending work, called by core scheduler.
+ *
+ * Processes every PENDING_* bit set in rq->atlas.pending_work in the fixed
+ * order below and clears each bit once it is handled. Any bit still set at
+ * the end trips the final BUG_ON().
+ *
+ * Called with rq locked.
+ *
+ * The timer interrupt may already have been triggered.
+ */
+void atlas_do_pending_work(struct rq *rq) {
+	struct atlas_rq *atlas_rq = &rq->atlas;
+	struct task_struct *prev = rq->curr;
+
+	update_rq_clock(rq);
+
+	/* stop advancing a task in CFS: switch it back to the ATLAS class */
+	if (atlas_rq->pending_work & PENDING_STOP_CFS_ADVANCED) {
+		struct task_struct **p = &atlas_rq->advance_in_cfs;
+
+		if (*p) {
+			(*p)->atlas.flags &= ~ATLAS_CFS_ADVANCED;
+			atlas_switch_scheduler(rq, *p, &atlas_sched_class);
+			*p = NULL;
+		}
+		
+		atlas_rq->pending_work &= ~ PENDING_STOP_CFS_ADVANCED;
+		BUG_ON(atlas_rq->advance_in_cfs != NULL);
+	}
+
+	if (atlas_rq->pending_work & PENDING_START_CFS_ADVANCED) {
+
+		/* slack time? timer routine may have reset flag already */
+		if (atlas_rq->timer_target == ATLAS_SLACK)
+			advance_thread_in_cfs(atlas_rq);
+		
+		atlas_rq->pending_work &= ~ PENDING_START_CFS_ADVANCED;
+	}
+
+	if (atlas_rq->pending_work & PENDING_JOB_TIMER) {
+		/* deadline miss or execution time overrun */
+		
+		struct sched_atlas_entity *se = &prev->atlas;
+		
+		/*
+		 * At most 30000 ns of scheduled execution time left: flag the
+		 * entity with ATLAS_EXECTIME and move the task to CFS;
+		 * otherwise move it to the recover class.
+		 * NOTE(review): se->job is dereferenced without a NULL check.
+		 */
+		if (ktime_cmp(se->job->sexectime, ktime_set(0,30000)) <= 0) {
+			se->flags |= ATLAS_EXECTIME;
+			printk_sched("PUT_FAIR: job->sexec = %llu, job->exec = %llu\n", 
+				ktime_to_ns(prev->atlas.job->sexectime),
+				ktime_to_ns(prev->atlas.job->exectime));
+			atlas_switch_scheduler(rq, prev, &fair_sched_class);
+		} 
+
+		else {			
+			printk_sched("PUT_RECO: job->sexec = %llu, job->exec = %llu\n", 
+				ktime_to_ns(prev->atlas.job->sexectime),
+				ktime_to_ns(prev->atlas.job->exectime));
+			atlas_switch_scheduler(rq, prev, &atlas_recover_sched_class);
+		}
+		
+		atlas_rq->pending_work &= ~ PENDING_JOB_TIMER;
+	}
+
+	/* the remaining requests are plain class switches */
+	if (atlas_rq->pending_work & PENDING_MOVE_TO_CFS) {
+		atlas_switch_scheduler(rq, prev, &fair_sched_class);
+		atlas_rq->pending_work &= ~ PENDING_MOVE_TO_CFS;
+	}
+	
+	if (atlas_rq->pending_work & PENDING_MOVE_TO_RECOVER) {
+		atlas_switch_scheduler(rq, prev, &atlas_recover_sched_class);
+		atlas_rq->pending_work &= ~ PENDING_MOVE_TO_RECOVER;
+	}
+
+	/* unlike the two moves above, this one targets move_to_atlas, not prev */
+	if (atlas_rq->pending_work & PENDING_MOVE_TO_ATLAS) {
+		atlas_switch_scheduler(rq, atlas_rq->move_to_atlas, &atlas_sched_class);
+		atlas_rq->move_to_atlas = NULL;
+		atlas_rq->pending_work &= ~ PENDING_MOVE_TO_ATLAS;
+	}
+	
+	/* every known request must have been consumed by now */
+	BUG_ON(atlas_rq->pending_work);
+}
+
+/*******************************************************
+ * Scheduler stuff
+ */
+
+/*
+ * Put an ATLAS runqueue into its initial state: no current entity, empty
+ * entity and job trees, an initialized but unarmed timer and no pending work.
+ *
+ * NOTE(review): atlas_rq->lock is not initialized here - presumably done
+ * elsewhere; verify.
+ */
+void init_atlas_rq(struct atlas_rq *atlas_rq)
+{
+	/* runnable entities */
+	atlas_rq->curr = NULL;
+	atlas_rq->tasks_timeline = RB_ROOT;
+	atlas_rq->rb_leftmost_se = NULL;
+	atlas_rq->nr_runnable = 0;
+
+	printk(KERN_INFO "INIT_ATLAS_RUNQUEUE(%d): %p\n",
+		cpu_of(rq_of(atlas_rq)), atlas_rq);
+	printk(KERN_INFO "sizeof(struct atlas_job)=%zu\n",
+		sizeof(struct atlas_job));
+
+	/* execution plan */
+	atlas_rq->jobs = RB_ROOT;
+
+	/* runqueue timer */
+	hrtimer_init(&atlas_rq->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+	atlas_rq->timer.function = &timer_rq_func;
+	atlas_rq->timer_target = ATLAS_NONE;
+
+	/* bookkeeping for deferred work and CFS interaction */
+	atlas_rq->flags = 0;
+	atlas_rq->pending_work = 0;
+	atlas_rq->skip_update_curr = 0;
+	atlas_rq->advance_in_cfs = NULL;
+	atlas_rq->move_to_atlas = NULL;
+	atlas_rq->cfs_job = NULL;
+	atlas_rq->cfs_job_start = ktime_set(0,0);
+}
+
+/*
+ * Start a new accounting period for se: record the runqueue's clock_task as
+ * the exec_start of the owning task and stamp se->start with now.
+ */
+static inline void
+update_stats_curr_start(struct atlas_rq *atlas_rq, struct sched_atlas_entity *se, ktime_t now)
+{
+	struct task_struct *task = task_of(se);
+	struct rq *rq = rq_of(atlas_rq);
+
+	task->se.exec_start = rq->clock_task;
+	se->start = now;
+}
+
+
+/*
+ * Account the runtime of rq->curr since the last update and charge it to
+ * the task's current job.
+ *
+ * Does nothing (besides logging) if rq->curr is not in the ATLAS class.
+ * The generic statistics are fed with the clock_task delta, while the job
+ * is charged with the ktime_get() delta since se->start.
+ */
+static void update_curr_atlas(struct rq *rq)
+{
+    //copied from rt
+	struct task_struct *curr = rq->curr;
+	struct sched_atlas_entity *se = &curr->atlas;
+	struct atlas_rq *atlas_rq = &rq->atlas;
+	u64 delta_exec;
+	struct atlas_job *job = se->job;
+	unsigned long flags;
+	ktime_t diff_ktime, now;
+
+	if (curr->sched_class != &atlas_sched_class) {
+		sched_log("update_curr: wrong scheduling class!");
+		return;
+	}
+
+	delta_exec = rq->clock_task - curr->se.exec_start;
+	
+	/* guard against a clock that appears to have gone backwards */
+	if (unlikely((s64)delta_exec < 0))
+		delta_exec = 0;
+
+	schedstat_set(curr->se.statistics.exec_max,
+		      max(curr->se.statistics.exec_max, delta_exec));
+
+	curr->se.sum_exec_runtime += delta_exec;
+	account_group_exec_runtime(curr, delta_exec);
+
+	/* wall-clock time spent since the period started; then open a new period */
+	now = ktime_get();
+	diff_ktime = ktime_sub(now, se->start);
+	update_stats_curr_start(atlas_rq, se, now);
+	cpuacct_charge(curr, delta_exec);
+	
+	
+	/*
+	 * do not update execution plan if there is no job
+	 */
+	if (unlikely(!job))
+		return;
+
+	raw_spin_lock_irqsave(&atlas_rq->lock, flags);
+	//update_execution_time(atlas_rq, job, ns_to_ktime(delta_exec)); 
+	update_execution_time(atlas_rq, job, diff_ktime); 
+	raw_spin_unlock_irqrestore(&atlas_rq->lock, flags);
+}
+
+
+
+/*
+ * Enqueue task p on the ATLAS runqueue of rq.
+ *
+ * Always called with updated runqueue clock.
+ */
+static void enqueue_task_atlas(struct rq *rq, struct task_struct *p, int flags)
+{
+	struct sched_atlas_entity *se = &p->atlas;
+	struct atlas_rq *atlas_rq = &rq->atlas;
+
+	/* the current entity is not kept in the tree */
+	if (atlas_rq->curr != se)
+		enqueue_entity(atlas_rq, se);
+
+	se->on_rq = 1;
+	atlas_rq->nr_runnable++;
+	inc_nr_running(rq);
+
+	/*
+	 * The slack time calculated earlier depends on the first ready job
+	 * in the rb tree. If the woken entity is now the first one, the old
+	 * slack time might be wrong, so the timer is reset.
+	 *
+	 * check_preempt_curr_atlas is called after the enqueue; the
+	 * reschedule flag gets set there because of the higher scheduling
+	 * class.
+	 */
+	if (!(flags & ENQUEUE_WAKEUP))
+		return;
+	if (!in_slacktime(atlas_rq))
+		return;
+	if (pick_first_entity(atlas_rq) != se)
+		return;
+
+	sched_log("ENQ: reset timer");
+	reset_timer(atlas_rq);
+	BUG_ON(atlas_rq->advance_in_cfs &&
+		!(atlas_rq->pending_work & PENDING_STOP_CFS_ADVANCED));
+}
+
+
+/*
+ * Dequeue task p from the ATLAS runqueue of rq.
+ *
+ * Always called with updated runqueue clock.
+ */
+static void dequeue_task_atlas(struct rq *rq, struct task_struct *p, int flags)
+{
+	struct sched_atlas_entity *se = &p->atlas;
+	struct atlas_rq *atlas_rq = &rq->atlas;
+
+	update_curr_atlas(rq);
+
+	/* the current entity is not in the tree, just forget about it */
+	if (atlas_rq->curr != se)
+		dequeue_entity(atlas_rq, se);
+	else
+		atlas_rq->curr = NULL;
+
+	se->on_rq = 0;
+	atlas_rq->nr_runnable--;
+	dec_nr_running(rq);
+}
+
+/* Yield hook of the ATLAS class: intentionally does nothing. */
+static void yield_task_atlas(struct rq *rq)
+{
+}
+
+/*
+ * Called when the currently running task is scheduled by us and another
+ * task p is woken up. Decides whether p should preempt rq->curr:
+ *   - p without a job always preempts,
+ *   - otherwise a current task without a job is never preempted,
+ *   - otherwise p preempts iff ktime_cmp() of the scheduled deadlines
+ *     (p's job vs. the current job) yields -1.
+ */
+static void check_preempt_curr_atlas(struct rq *rq, struct task_struct *p, int flags)
+{
+	struct task_struct *curr = rq->curr;
+	struct sched_atlas_entity *se = &curr->atlas, *pse = &p->atlas;
+	int curr_has_job = (se->job != NULL);
+	int woken_has_job = (pse->job != NULL);
+	int preempt;
+
+	DEBUG(DEBUG_CHECK_PREEMPT, "pid=%d", p->pid);
+
+	if (unlikely(se == pse)) {
+		DEBUG(DEBUG_CHECK_PREEMPT, "se == pse; pid=%d don't preempt curr->pid=%d",
+			p->pid, curr->pid);
+		return;
+	}
+
+	/* a reschedule is already on its way */
+	if (test_tsk_need_resched(curr)) {
+		DEBUG(DEBUG_CHECK_PREEMPT, "test_tsk_need_resched; pid=%d don't preempt curr->pid=%d",
+			p->pid, curr->pid);
+		return;
+	}
+
+	/* Bug if task is not scheduled by us */
+	BUG_ON(p->sched_class != &atlas_sched_class);
+
+	if (unlikely(!woken_has_job))
+		preempt = 1;
+	else if (unlikely(!curr_has_job))
+		preempt = 0;
+	else
+		preempt = (ktime_cmp(pse->job->sdeadline, se->job->sdeadline) == -1);
+
+	if (!preempt) {
+		DEBUG(DEBUG_CHECK_PREEMPT, "pid=%d don't preempt curr->pid=%d",
+			p->pid, curr->pid);
+		return;
+	}
+
+	DEBUG(DEBUG_CHECK_PREEMPT, "pid=%d preempt curr->pid=%d",
+		p->pid, curr->pid);
+	resched_task(curr);
+}
+
+static int get_slacktime(struct atlas_rq *atlas_rq, ktime_t *slack);
+static void cleanup_rq(struct atlas_rq *atlas_rq, ktime_t ktime);
+
+/*
+ * Pick the next task to run from the ATLAS runqueue of rq.
+ *
+ * Returns NULL if nothing is runnable, if the runqueue is in slack time, or
+ * if no entity ended up as atlas_rq->curr; otherwise the task of
+ * atlas_rq->curr. Class switches are not performed here: they are recorded
+ * as PENDING_* bits in atlas_rq->pending_work.
+ */
+static struct task_struct *pick_next_task_atlas(struct rq *rq)
+{
+	struct atlas_rq *atlas_rq = &rq->atlas;
+	struct sched_atlas_entity *se;
+	ktime_t slack, now;
+	struct atlas_job *job, *job_next;
+	struct task_struct *p;
+	unsigned long flags;
+	/* whether start_job() is called for the picked entity at "out" */
+	int timer = 1;
+
+	/*
+	 * only proceed if there are runnable tasks
+	 */
+	if (likely(!atlas_rq->nr_runnable)) {
+		return NULL;
+	}
+
+	se = pick_first_entity(atlas_rq);
+
+	/*
+	 * threads without a job are doing important things like
+	 * signal handling (by construction always at the beginning
+	 * of the tree)
+	 */
+	if (unlikely(!se->job)) {
+		atlas_rq->curr = se;
+		dequeue_entity(atlas_rq, se);
+		timer = 0;
+		goto out;
+	}
+	
+	/*
+	 * slack time?
+	 */
+	if (in_slacktime(atlas_rq))
+		return NULL;
+
+	/* the first two checks are implied by the third one */
+	BUG_ON(atlas_rq->timer_target == ATLAS_SLACK);
+	BUG_ON(atlas_rq->timer_target == ATLAS_JOB);
+	BUG_ON(atlas_rq->timer_target != ATLAS_NONE);
+	BUG_ON(atlas_rq->advance_in_cfs);
+
+	raw_spin_lock_irqsave(&atlas_rq->lock, flags);
+	
+	now = ktime_get();
+
+	/*
+	 * remove jobs having a deadline in the past
+	 */
+	cleanup_rq(atlas_rq, now);
+		
+	/*
+	 * job of se might be removed by cleanup; the entity is still picked,
+	 * but a move to CFS (no scheduled execution time left) or to the
+	 * recover class is requested
+	 */
+	if (unlikely(!job_in_rq(se->job))) {
+		if (ktime_zero(se->job->sexectime)) {
+			atlas_rq->pending_work |= PENDING_MOVE_TO_CFS;
+		}
+		else {
+			atlas_rq->pending_work |= PENDING_MOVE_TO_RECOVER;
+		}
+		atlas_rq->curr = se;
+		dequeue_entity(atlas_rq, se);
+
+		goto unlock_out;
+	}
+
+	/*
+	 * handle slack time
+	 */
+	if (get_slacktime(atlas_rq, &slack))
+	{
+		start_slack(atlas_rq, slack);
+		
+		if (likely(sysctl_sched_atlas_advance_in_cfs)) {
+			atlas_rq->curr = se;
+			dequeue_entity(atlas_rq, se);
+			// skip setup of timer, it is used for slack
+			timer = 0;
+			atlas_rq->pending_work |= PENDING_START_CFS_ADVANCED;
+		}
+
+		goto unlock_out;
+	}
+
+	/*
+	 * no slack time left
+	 */
+	job = se->job;
+	BUG_ON(job == NULL);
+	BUG_ON(!job_in_rq(job));
+
+	/*
+	 * Walk the plan from its first job up to the job of se. If an earlier
+	 * job belongs to a task that is on a runqueue, that task is requested
+	 * to be moved to ATLAS instead of picking se.
+	 */
+	job_next = pick_first_job(atlas_rq);
+	BUG_ON(job_next == NULL);
+	while (job != job_next) {
+		
+		p = task_of_job(job_next);
+		
+		if (!p) {
+			job_next = pick_next_job(job_next);
+			continue;
+		}
+		
+		/* job blocked? */
+		if (!p->on_rq) {
+			put_task_struct(p);
+			job_next = pick_next_job(job_next);
+			continue;
+		}
+
+		/* ready job and time scheduled in atlas -> move it to atlas */
+		BUG_ON(p->sched_class == &atlas_sched_class);
+
+		/* from here on se refers to the task to be moved, not the first entity */
+		se = &p->atlas;
+
+		atlas_rq->move_to_atlas = p;
+		atlas_rq->pending_work |= PENDING_MOVE_TO_ATLAS;
+			
+		BUG_ON(in_interrupt());
+
+		/* only accessed with preemption disabled */
+		se->job = job_next;
+		se->flags |= ATLAS_PENDING_JOBS;
+
+		put_task_struct(p);
+		goto unlock_out;
+	}
+	
+	/*
+	 * job ready
+	 */
+
+	atlas_rq->curr = se;
+	dequeue_entity(atlas_rq, se);
+
+unlock_out:
+	raw_spin_unlock_irqrestore(&atlas_rq->lock, flags);
+
+out:	
+	if (atlas_rq->curr) {
+		
+		/* pending work requires another pass through the scheduler */
+		if (atlas_rq->pending_work)
+			resched_task(task_of(se));
+		
+		DEBUG(DEBUG_PICK_NEXT_TASK, "pid=%d, need_resched=%d",
+			task_of(atlas_rq->curr)->pid, test_tsk_need_resched(task_of(atlas_rq->curr)));
+		update_stats_curr_start(atlas_rq, atlas_rq->curr, ktime_get());
+
+		if (timer)
+			start_job(atlas_rq, atlas_rq->curr->job);
+
+
+    	return task_of(atlas_rq->curr);
+	} else {
+		DEBUG(DEBUG_PICK_NEXT_TASK, "NULL");
+		return NULL;
+	}
+}
+
+
+
+
+/*
+ * The previously running ATLAS task prev leaves the cpu: reset the job
+ * timer, put a still runnable entity back into the tree and clear
+ * atlas_rq->curr.
+ */
+static void put_prev_task_atlas(struct rq *rq, struct task_struct *prev)
+{
+	struct sched_atlas_entity *se = &prev->atlas;
+	struct atlas_rq *atlas_rq = &rq->atlas;
+
+	DEBUG(DEBUG_PUT_PREV_TASK, "pid=%d (on_rq=%d, timer_expired=%d)", prev->pid,
+		se->on_rq, (atlas_rq->flags & TIMER_EXPIRED) != 0);
+
+	reset_job_time(atlas_rq);
+
+	/* account the runtime first, then requeue */
+	if (se->on_rq) {
+		update_curr_atlas(rq);
+		enqueue_entity(atlas_rq, se);
+	}
+
+	atlas_rq->curr = NULL;
+}
+
+/*
+ * Called when a thread moved to ATLAS. It is rescheduled because of
+ * switch_to; all timer stuff is handled in put_prev_task.
+ *
+ * Starts a new accounting period for rq->curr and installs its entity as
+ * the current one of the ATLAS runqueue, which must not have one yet.
+ */
+static void set_curr_task_atlas(struct rq *rq)
+{
+	struct atlas_rq *atlas_rq = &rq->atlas;
+	struct task_struct *p = rq->curr;
+	struct sched_atlas_entity *se = &p->atlas;
+
+	DEBUG(DEBUG_SET_CURR_TASK, "pid=%d", p->pid);
+	update_stats_curr_start(atlas_rq, se, ktime_get());
+
+	BUG_ON(atlas_rq->curr);
+	atlas_rq->curr = se;
+}
+
+/*
+ * Tick hook of the ATLAS class: does nothing, runtime is not accounted on
+ * the periodic tick.
+ */
+static void task_tick_atlas(struct rq *rq, struct task_struct *p, int queued)
+{
+}
+
+/* Priority change hook of the ATLAS class: intentionally does nothing. */
+static void prio_changed_atlas(struct rq *rq, struct task_struct *p, int oldprio)
+{
+}
+
+/* Hook for a task leaving the ATLAS class: intentionally does nothing. */
+static void switched_from_atlas(struct rq *rq, struct task_struct *p)
+{
+}
+
+/*
+ * Hook for a task p that has just been switched to the ATLAS class. A task
+ * that is not queued needs no action; the running task is rescheduled, any
+ * other task goes through the regular preemption check.
+ */
+static void switched_to_atlas(struct rq *rq, struct task_struct *p)
+{
+	DEBUG(DEBUG_SWITCHED_TO, "pid=%d", p->pid);
+
+	if (p->atlas.on_rq) {
+		if (rq->curr != p)
+			check_preempt_curr(rq, p, 0);
+		else
+			resched_task(p);
+	}
+}
+
+/*
+ * Round-robin interval hook of the ATLAS class: logs the call and always
+ * reports 0.
+ */
+static unsigned int get_rr_interval_atlas(struct rq *rq, struct task_struct *task)
+{
+	const unsigned int interval = 0;
+
+	printk(KERN_INFO "SCHED_ATLAS: get_rr_interval\n");
+
+	return interval;
+}
+
+#ifdef CONFIG_SMP
+/* CPU selection hook: a task always stays on the cpu it is already on. */
+static int select_task_rq_atlas(struct task_struct *p, int sd_flag, int flags)
+{
+	int cpu = task_cpu(p);
+
+	return cpu;
+}
+#endif /* CONFIG_SMP */
+
+ 
+/*
+ * Methods to maintain job tree.
+ */
+
+/*
+ * Return 1 if ktime_cmp() reports the scheduled deadline of a as greater
+ * than the start of b (ktime_cmp() == 1), 0 otherwise.
+ */
+static inline int is_collision(struct atlas_job *a, struct atlas_job *b) {
+	ktime_t start_of_b = job_start(b);
+
+	/* end > start */
+	return ktime_cmp(a->sdeadline, start_of_b) == 1;
+}
+
+/*
+ * Consistency check of the execution plan: walks all jobs of atlas_rq in
+ * order and hits BUG() if two neighbouring jobs collide (see
+ * is_collision()). Expects atlas_rq->lock to be held.
+ *
+ * NOTE(review): the check is guarded by "#ifdef DEBUG", but DEBUG is also
+ * the name of the logging macro used throughout this file. If that macro is
+ * defined in every configuration, the guard is always true - confirm
+ * whether ATLAS_DEBUG was intended.
+ */
+static void check_admission_plan(struct atlas_rq *atlas_rq) {
+#ifdef DEBUG
+	struct atlas_job *prev, *next;
+	
+	assert_raw_spin_locked(&atlas_rq->lock);
+	//__debug_jobs(atlas_rq);
+	
+	prev = pick_first_job(atlas_rq);
+
+	/* an empty plan is trivially consistent */
+	if (!prev)
+		return;
+
+	while ((next = pick_next_job(prev))) {
+		if (is_collision(prev, next)) {
+			BUG();
+		}
+		prev = next;
+	}
+#endif
+}
+
+/*
+ * Resolve a collision between a and its successor b by pulling the
+ * scheduled deadline of a forward to the start of b.
+ *
+ * Assumes that there is a collision.
+ */
+static inline void resolve_collision(struct atlas_job *a,
+		struct atlas_job *b) {
+	ktime_t start_of_b = job_start(b);
+
+	a->sdeadline = start_of_b;
+}
+
+/*
+ * Close the gap between job a and its successor b.
+ *
+ * The scheduled deadline of a is moved to the start of b or to the real
+ * deadline of a, whichever is smaller. With UPDATE_EXEC_TIME the scheduled
+ * execution time of a may additionally grow by the amount of the move,
+ * limited by its execution time.
+ *
+ * Returns 1 iff the start of job a was moved, 0 otherwise.
+ */
+static inline int collapse_jobs(struct atlas_job *a,
+		struct atlas_job *b, enum update_exec_time update) {
+
+	ktime_t old_start, new_end, shift;
+
+	/* scheduled deadline already equals the real one: nothing to do */
+	if (likely(ktime_equal(a->deadline, a->sdeadline)))
+		return 0;
+
+	old_start = job_start(a);
+
+	/* the new end is either the start of the next job or the real deadline */
+	new_end = ktime_min(a->deadline, job_start(b));
+	shift = ktime_sub(new_end, a->sdeadline);
+	a->sdeadline = new_end;
+
+	/* the execution time must not or need not be extended: the start moved */
+	if (update == NO_UPDATE_EXEC_TIME)
+		return 1;
+	if (likely(ktime_equal(a->exectime, a->sexectime)))
+		return 1;
+
+	/* hand the gained time to the job, capped by its execution time */
+	a->sexectime = ktime_min(a->exectime, ktime_add(shift, a->sexectime));
+
+	/* the start only moved if the extension did not use up the whole shift */
+	return !ktime_equal(old_start, job_start(a));
+}
+
+/*
+ * Close gaps in the plan in front of job; called when a job is removed or
+ * when its execution time was updated. Walks towards the beginning of the
+ * plan and stops at the first predecessor whose start did not move.
+ *
+ * note: - whenever a job's execution time is updated, the wall clock time
+ *         moves forward as well. It is therefore illegal to extend the
+ *         execution time of the previous jobs, otherwise execution time
+ *         would be created that isn't available
+ *       - it is completely admissible to extend the execution time of
+ *         previous jobs whenever a job is removed from the execution plan
+ */
+static inline void close_gaps(struct atlas_job *job, enum update_exec_time update) {
+	struct atlas_job *pred;
+
+	for (;;) {
+		pred = pick_prev_job(job);
+		if (!pred)
+			return;
+		if (!collapse_jobs(pred, job, update))
+			return;
+		job = pred;
+	}
+}
+
+/*
+ * Calculate the gap between job a and job b, i.e. the time from the
+ * scheduled deadline of a to the start of b. A negative gap is a bug.
+ */
+static inline ktime_t calc_gap(struct atlas_job *a, struct atlas_job *b) {
+	ktime_t gap = ktime_sub(job_start(b), a->sdeadline);
+
+	BUG_ON(ktime_to_ns(gap) < 0);
+
+	return gap;
+}
+
+
+/*
+ * Insert job into the execution plan (rb-tree atlas_rq->jobs) and take a
+ * reference on it.
+ *
+ * Jobs whose deadline was missed at time now are removed first. After the
+ * insertion the scheduled deadline of the new job is fixed up against its
+ * successor, and the scheduled deadlines of the jobs in front of it are
+ * pulled forward as long as they collide. If this moved the start of the
+ * plan to the left, a reschedule of the owning cpu is triggered.
+ *
+ * must be called with atlas_rq locked
+ */
+static void assign_rq_job(struct atlas_rq *atlas_rq,
+		struct atlas_job *job, ktime_t now) {
+	
+	struct rb_node **link;
+	struct rb_node *parent = NULL;
+	struct atlas_job *entry, *next, *prev, *first;
+	ktime_t first_start = ktime_set(0,0);
+	
+	assert_raw_spin_locked(&atlas_rq->lock);
+	
+	cleanup_rq(atlas_rq, now);
+
+	/*
+	 * needed to decide whether to reset slack
+	 *
+	 * The start has to be sampled now: resolving collisions below may
+	 * modify the scheduled deadline of this very job, and reading its
+	 * start afterwards would hide that it moved.
+	 */
+	first = pick_first_job(atlas_rq);
+	if (first)
+		first_start = job_start(first);
+	
+	/* find the insertion point, ordered by job_before() */
+	link = &atlas_rq->jobs.rb_node;
+	
+	while (*link) {
+		parent = *link;
+		entry = rb_entry(parent, struct atlas_job, rb_node);
+		
+		if (job_before(job, entry))
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+	
+	rb_link_node(&job->rb_node, parent, link);
+	rb_insert_color(&job->rb_node, &atlas_rq->jobs);	
+	
+	//save reference
+	get_job(job);
+
+	/* fix the scheduled deadline of the new job*/
+	next = pick_next_job(job);
+	if (next && is_collision(job, next)) {
+		resolve_collision(job, next);
+	}
+
+	/*
+	 * FIXME: the scheduled deadline might be in the past
+	 * FIXME: what about overload situations: we might have to update the sexectime,
+	 *        for the moment, we skip that (the scheduled execution time
+	 *        of the new job is left untouched)
+	 */
+
+	/*
+	 * update the scheduled deadlines of the jobs placed before
+	 * the new job
+	 */
+	while ((prev = pick_prev_job(job))) {
+		if (!is_collision(prev, job))
+			break;
+		resolve_collision(prev, job);
+		job = prev;
+	}
+
+	/*
+	 * reset slack time iff start moved to the left
+	 *   - we have to initiate a reschedule on the target cpu
+	 *   - a plan that was empty before is not treated as a move, as before
+	 */
+	if (first && ktime_cmp(first_start,
+			job_start(pick_first_job(atlas_rq))) == 1) {
+		resched_cpu(cpu_of(rq_of(atlas_rq)));
+	}
+
+	check_admission_plan(atlas_rq);
+}
+
+/*
+ * Charge delta_exec to job: both the remaining execution time and the
+ * remaining scheduled execution time are reduced and clamped at zero.
+ * Afterwards the gaps in front of the job are closed without extending
+ * anybody's execution time.
+ *
+ * Expects atlas_rq->lock to be held.
+ *
+ * Returns 1 if the execution time went negative, 2 if only the scheduled
+ * execution time went negative, 0 otherwise.
+ */
+static int update_execution_time(struct atlas_rq *atlas_rq,
+	struct atlas_job *job, ktime_t delta_exec) {
+
+	const ktime_t zero = ktime_set(0,0);
+	int overrun = 0;
+
+	assert_raw_spin_locked(&atlas_rq->lock);
+
+	job->exectime = ktime_sub(job->exectime, delta_exec);
+
+	if (unlikely(ktime_neg(job->exectime))) {
+		/* nothing left at all */
+		job->exectime = zero;
+		job->sexectime = zero;
+		overrun = 1;
+	} else {
+		job->sexectime = ktime_sub(job->sexectime, delta_exec);
+		if (ktime_neg(job->sexectime)) {
+			job->sexectime = zero;
+			overrun = 2;
+		}
+	}
+
+	/* adapt admission plan */
+	close_gaps(job, NO_UPDATE_EXEC_TIME);
+	check_admission_plan(atlas_rq);
+
+	return overrun;
+}
+
+/*
+ * Remove job from the execution plan of atlas_rq and release it with
+ * put_job(). The gaps the job leaves behind are closed, which may extend
+ * the scheduled execution time of the jobs in front of it.
+ *
+ * A job that is not part of the plan is silently ignored: jobs are removed
+ * by next as well as by the cleanup in pick_next_task, so both may try.
+ *
+ * atlas_rq->lock must be held!
+ */
+static inline void erase_rq_job(struct atlas_rq *atlas_rq,
+		struct atlas_job *job)
+{
+	ktime_t saved_sexectime;
+
+	if (unlikely(!job_in_rq(job)))
+		return;
+
+	assert_raw_spin_locked(&atlas_rq->lock);
+
+	/* close the gaps with a scheduled execution time of zero, restore it afterwards */
+	saved_sexectime = job->sexectime;
+	job->sexectime = ktime_set(0,0);
+	close_gaps(job, UPDATE_EXEC_TIME);
+
+	rb_erase(&job->rb_node, &atlas_rq->jobs);
+	RB_CLEAR_NODE(&job->rb_node);
+
+	job->sexectime = saved_sexectime;
+	put_job(job);
+
+	check_admission_plan(atlas_rq);
+}
+
+/*
+ * Determine if there is slack time left. atlas_rq has to be locked.
+ *
+ * The slack is the time from now until the start of the job of the first
+ * entity, minus the scheduled execution time of all jobs placed in front
+ * of that job in the plan. It is stored in *slack.
+ *
+ * return: 1 if the slack exceeds sysctl_sched_atlas_min_slack
+ *         0 otherwise
+ *
+ * Assertions:
+ *	- there is a first entity in the rb-tree
+ *	- it has a job, and that job is part of the plan
+ */
+static int get_slacktime(struct atlas_rq *atlas_rq, ktime_t *slack) {
+	struct sched_atlas_entity *first_se;
+	struct atlas_job *job;
+	ktime_t planned = ktime_set(0,0);
+	ktime_t start;
+
+	assert_raw_spin_locked(&atlas_rq->lock);
+
+	first_se = pick_first_entity(atlas_rq);
+	BUG_ON(!first_se);
+	BUG_ON(!first_se->job);
+	BUG_ON(!job_in_rq(first_se->job));
+
+	job = first_se->job;
+	start = job_start(job);
+
+	/* sum up the scheduled execution time of the jobs before */
+	for (job = pick_prev_job(job); job; job = pick_prev_job(job))
+		planned = ktime_add(planned, job->sexectime);
+
+	*slack = ktime_sub(ktime_sub(start, ktime_get()), planned);
+
+	return ktime_to_ns(*slack) > sysctl_sched_atlas_min_slack;
+}
+
+/*
+ * Drop jobs from the front of the execution plan for as long as
+ * job_missed_deadline() reports them as late at time now.
+ *
+ * Expects atlas_rq->lock to be held.
+ */
+static void cleanup_rq(struct atlas_rq *atlas_rq, ktime_t now) {
+	struct atlas_job *job = pick_first_job(atlas_rq);
+	struct atlas_job *expired;
+
+	assert_raw_spin_locked(&atlas_rq->lock);
+
+	while (job && unlikely(job_missed_deadline(job, now))) {
+		/* fetch the successor before the job leaves the tree */
+		expired = job;
+		job = pick_next_job(expired);
+		erase_rq_job(atlas_rq, expired);
+	}
+}
+
+/*
+ * Free the pending jobs of a killed task; called from do_exit().
+ *
+ * Cancels the task's deadline timer, removes its current job and all its
+ * queued jobs from the execution plan and releases the references held by
+ * the task. Nothing is done for tasks whose runqueue is not that of cpu 0.
+ *
+ * NOTE(review): p->atlas.real_job is not released here; sys_atlas_next
+ * treats it as a separate reference when it differs from p->atlas.job -
+ * verify whether that case can reach this point.
+ */
+void exit_atlas(struct task_struct *p) {
+	struct atlas_job *job, *tmp;
+	struct rq *rq = task_rq(p);
+	struct atlas_rq *atlas_rq = &rq->atlas;
+	unsigned long flags;
+	
+	if (cpu_of(rq) != 0)
+		return;
+
+	hrtimer_cancel(&p->atlas.timer);
+	
+	BUG_ON(in_interrupt());
+
+	/*
+	 * Remove the current job from the run queue. The task's reference is
+	 * dropped only after erase_rq_job() is done with the job: if the
+	 * plan no longer holds a reference, put_job() may release the job,
+	 * and erase_rq_job() must not look at it afterwards.
+	 */
+	if ((job = p->atlas.job)) {
+		p->atlas.job = NULL;
+		raw_spin_lock_irqsave(&rq->atlas.lock, flags);
+		erase_rq_job(atlas_rq, job);
+		raw_spin_unlock_irqrestore(&rq->atlas.lock, flags);
+		put_job(job);
+	}
+	
+	/* same order for the queued jobs: plan first, then the task list */
+	spin_lock(&p->atlas.jobs_lock);
+	list_for_each_entry_safe(job, tmp, &p->atlas.jobs, list) {
+		raw_spin_lock_irqsave(&rq->atlas.lock, flags);
+		erase_rq_job(atlas_rq, job);
+		raw_spin_unlock_irqrestore(&rq->atlas.lock, flags);
+
+		erase_task_job(job);
+	}
+	spin_unlock(&p->atlas.jobs_lock);
+}
+
+
+
+
+/*
+ * All the scheduling class methods of the ATLAS class. The next class in
+ * the chain is the ATLAS recover class.
+ */
+const struct sched_class atlas_sched_class = {
+	.next               = &atlas_recover_sched_class,
+	.enqueue_task       = enqueue_task_atlas,
+	.dequeue_task       = dequeue_task_atlas,
+	.yield_task         = yield_task_atlas,
+	//.yield_to_task		= yield_to_task_atlas,
+
+	.check_preempt_curr = check_preempt_curr_atlas,
+
+	.pick_next_task     = pick_next_task_atlas,
+	.put_prev_task      = put_prev_task_atlas,
+
+/* SMP is not supported so far; only cpu selection is provided */
+#ifdef CONFIG_SMP
+	.select_task_rq     = select_task_rq_atlas,
+
+	//.rq_online		= rq_online_atlas,
+	//.rq_offline		= rq_offline_atlas,
+
+	//.task_waking		= task_waking_atlas,
+#endif
+
+	.set_curr_task      = set_curr_task_atlas,
+	.task_tick          = task_tick_atlas,
+	//.task_fork        = task_fork_atlas,
+
+	.prio_changed       = prio_changed_atlas,
+	.switched_from      = switched_from_atlas,
+	.switched_to        = switched_to_atlas,
+
+	.get_rr_interval    = get_rr_interval_atlas,
+
+};
+
+
+
+
+#ifdef ATLAS_DEBUG
+/* operations understood by sys_atlas_debug */
+#define OP_DEBUG_RUNQUEUE 1
+#define OP_UPDATE_EXECTIME 2
+#define OP_DELETE_JOB 3
+#endif
+
+/*
+ * sys_atlas_debug - debugging aid operating on the runqueue of cpu 0
+ *
+ * operation:
+ *   OP_DEBUG_RUNQUEUE   dump the runqueue
+ *   OP_UPDATE_EXECTIME  charge arg2 milliseconds to the arg1-th job of the
+ *                       plan (counted from 0)
+ *   OP_DELETE_JOB       remove the arg1-th job of the plan
+ *
+ * Unknown operations and job indices beyond the end of the plan are
+ * ignored. Without ATLAS_DEBUG the call does nothing. Always returns 0.
+ *
+ * NOTE(review): no privilege check is done although the execution plan can
+ * be modified - confirm this is acceptable for debug builds.
+ */
+SYSCALL_DEFINE3(atlas_debug, int, operation, int, arg1, int, arg2)
+{
+#ifdef ATLAS_DEBUG
+	struct rq *rq = &per_cpu(runqueues, 0);
+	switch (operation) {
+		case OP_DEBUG_RUNQUEUE:
+			debug_rq(rq);
+			break;
+		case OP_UPDATE_EXECTIME: {
+			struct atlas_rq *atlas_rq = &rq->atlas;
+			struct atlas_job *job;
+			int nr_job = arg1;
+			unsigned long flags;
+
+			/*
+			 * ms -> ns; widen before multiplying, the product
+			 * does not fit into an int for arg2 > ~2147
+			 */
+			s64 delta_exec = (s64)arg2 * 1000 * 1000; /*ns*/
+
+			raw_spin_lock_irqsave(&atlas_rq->lock, flags);
+			job = pick_first_job(atlas_rq);
+			while (job && nr_job) {
+				job = pick_next_job(job);
+				nr_job--;
+			}
+			if (job)
+				update_execution_time(atlas_rq, job, ns_to_ktime(delta_exec));
+			raw_spin_unlock_irqrestore(&atlas_rq->lock, flags);
+			break;
+		}
+		case OP_DELETE_JOB: {
+			struct atlas_rq *atlas_rq = &rq->atlas;
+			struct atlas_job *job;
+			int nr_job = arg1;
+			unsigned long flags;
+
+			raw_spin_lock_irqsave(&atlas_rq->lock, flags);
+			job = pick_first_job(atlas_rq);
+			while (job && nr_job) {
+				job = pick_next_job(job);
+				nr_job--;
+			}
+			if (job)
+				erase_rq_job(atlas_rq, job);
+			raw_spin_unlock_irqrestore(&atlas_rq->lock, flags);
+			break;
+		}
+		default:
+			break;
+	}
+#endif
+	return 0;
+}
+
+
+/*
+ * Per-task deadline timer callback; called when a process missed its
+ * deadline, from irq context. Marks the entity with ATLAS_DEADLINE and
+ * sends SIGXCPU to the task. The timer is not restarted.
+ */
+enum hrtimer_restart atlas_timer_task_function(struct hrtimer *timer)
+{
+	struct sched_atlas_entity *se;
+	struct task_struct *p;
+
+	se = container_of(timer, struct sched_atlas_entity, timer);
+	p = task_of(se);
+
+	WARN_ON(!se->job);
+	se->flags |= ATLAS_DEADLINE;
+
+	DEBUG(DEBUG_TIMER, "deadline missed: pid=%d", p->pid);
+
+	/* write barrier between setting the flag and sending the signal */
+	wmb();
+	send_sig(SIGXCPU, p, 0);
+
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * sys_atlas_next
+ *
+ * Finishes the current job of the calling thread and fetches its next one.
+ * If no job is queued, the thread sleeps interruptibly until one has been
+ * submitted. Once a job is available, the per-task deadline timer is armed
+ * for the deadline of the new real job.
+ *
+ * Returns 0 on success and -EINTR if a signal is pending while waiting for
+ * a job.
+ */
+SYSCALL_DEFINE0(atlas_next)
+{
+	int ret = 0;
+	struct atlas_job *next_job;
+	struct sched_atlas_entity *se = &current->atlas;
+	struct rq *rq;
+	struct atlas_rq *atlas_rq;
+	unsigned long flags;
+
+	DEBUG(DEBUG_SYS_NEXT, "pid=%d policy=%s job=%p", current->pid,
+		sched_name(current->sched_class), se->job);
+	
+	/* the deadline timer of the old job is obsolete */
+	hrtimer_cancel(&se->timer);	
+	//reset rq timer
+	//FIXME:
+
+	/* stays disabled until the matching preempt_enable() on each path below */
+	preempt_disable();
+	
+	rq = task_rq(current);
+	atlas_rq = &rq->atlas;
+
+	//remove the old job from the rq
+	
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	sched_log("NEXT pid=%d", current->pid);
+	
+	reset_timer(atlas_rq);
+
+	se->flags &= ~ATLAS_DEADLINE;
+	se->flags &= ~ATLAS_EXECTIME;
+	
+	/* account the runtime of the finished job before it is dropped */
+	if (current->sched_class == &atlas_sched_class) {
+		update_rq_clock(rq);
+		update_curr_atlas(rq);
+	}
+	
+	//clean up
+	if (se->real_job) {
+		raw_spin_lock(&atlas_rq->lock);
+		erase_rq_job(atlas_rq, se->real_job);
+		raw_spin_unlock(&atlas_rq->lock);
+	}
+	
+	//get new job
+	next_job = pop_task_job(se);
+	
+	/*
+	 * se->job and se->real_job differ: only the real job is replaced,
+	 * se->job is left untouched and ATLAS_PENDING_JOBS is set.
+	 */
+	if (unlikely(se->real_job != se->job))
+	{
+		// remove old job
+		put_job(se->real_job);
+
+		// update real job
+		se->real_job = next_job;
+		se->flags |= ATLAS_PENDING_JOBS;
+	
+	} else
+	{
+		//remove old job
+		put_job(se->job);
+
+		se->job = se->real_job = next_job;
+		se->flags &= ~ATLAS_PENDING_JOBS;
+	}
+	
+
+	if (se->job == se->real_job) {
+		/* a task advanced in CFS only gets the timer reset, no class switch */
+		if (atlas_rq->advance_in_cfs == current) {
+			BUG_ON(!(current->atlas.flags & ATLAS_CFS_ADVANCED));
+			BUG_ON(atlas_rq->timer_target != ATLAS_SLACK && !(atlas_rq->pending_work & PENDING_STOP_CFS_ADVANCED));
+			reset_timer(atlas_rq);
+		} else
+			atlas_switch_scheduler(rq, current, &atlas_sched_class);
+	} /* else
+		atlas_switch_scheduler(rq, current, &fair_sched_class); */
+	
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	/* a job is available: skip the waiting part (preemption still disabled) */
+	if (se->real_job)
+		goto out_timer;
+		
+	preempt_enable();
+	se->state = ATLAS_BLOCKED;
+	
+
+	/* wait until a job has been queued or a signal is pending */
+	for(;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		
+		//we are aware of the lost update problem
+		if ((se->job = se->real_job = pop_task_job(se)))
+		{
+			break;
+		}
+		DEBUG(DEBUG_SYS_NEXT, "pid=%d no job, call schedule now", current->pid);
+
+		if (likely(!signal_pending(current))) {
+			schedule();
+			continue;
+		}
+	
+		/*
+		 * pending signal; preemption is enabled here, so "out" is
+		 * entered with a balanced preempt count
+		 */
+		se->state = ATLAS_UNDEF;
+		__set_current_state(TASK_RUNNING);
+		ret = -EINTR;
+		goto out;
+	}
+
+	__set_current_state(TASK_RUNNING);
+	se->state = ATLAS_RUNNING;
+
+	DEBUG(DEBUG_SYS_NEXT, "pid=%d job=%p job->deadline=%llu",
+		current->pid, se->job, ktime_to_us(se->job->deadline));
+	
+	/* match the preemption state of the direct jump to out_timer */
+	preempt_disable();
+
+out_timer:
+
+	set_tsk_need_resched(current);
+	
+
+	/*
+	 * setup new timer
+	 * if the deadline has already passed, the callback will be called
+	 * resulting in a scheduler switch to CFS
+	 */
+	DEBUG(DEBUG_SYS_NEXT, "pid=%d setup timer for real_job %p (job %p) (need_resched=%d).",
+		current->pid, se->real_job, se->job, test_tsk_need_resched(current));
+
+	hrtimer_start(&se->timer, se->real_job->deadline, HRTIMER_MODE_ABS_PINNED);
+	
+	sched_log("NEXT pid=%d job=%p", current->pid, current->atlas.job);
+
+	preempt_enable();
+
+out:	
+	return ret;
+
+}
+
+
+/* values of the time_base argument of sys_atlas_submit */
+#define ATLAS_TIME_ABS 0
+#define ATLAS_TIME_REL 1
+
+/*
+ * sys_atlas_submit - announce a job for thread pid
+ *
+ * @pid:       thread the job is submitted to
+ * @exectime:  user pointer to the execution time of the job
+ * @deadline:  user pointer to the deadline of the job
+ * @time_base: ATLAS_TIME_REL if the deadline is relative to now; any other
+ *             value is treated as absolute (CLOCK_MONOTONIC)
+ *
+ * The job is inserted into the execution plan of the target's runqueue and
+ * queued on the target thread, which is woken up if it waits for a job.
+ *
+ * Returns 0 on success, -EINVAL for a NULL pointer or a negative pid,
+ * -EFAULT if the arguments cannot be read, -ENOMEM if no job could be
+ * allocated and -ESRCH if there is no such thread.
+ */
+SYSCALL_DEFINE4(atlas_submit, pid_t, pid, struct timeval __user *,
+					exectime, struct timeval __user *, deadline, int, time_base)
+					
+{
+	struct timeval lexectime;
+	struct timeval ldeadline;
+	struct atlas_job *job;
+	struct task_struct *t;
+	ktime_t now, kdeadline;
+	struct atlas_rq *atlas_rq;
+	unsigned long flags;
+
+	if (!exectime || !deadline || pid < 0)
+		return -EINVAL;
+					
+	if (copy_from_user(&lexectime, exectime, sizeof(struct timeval)) ||
+		copy_from_user(&ldeadline, deadline, sizeof(struct timeval))) {
+		DEBUG(DEBUG_SYS_SUBMIT, "bad address");
+		return -EFAULT;
+	}
+	DEBUG(DEBUG_SYS_SUBMIT, "pid=%u, exectime=%lld, deadline=%lld, time_base=%s",
+		pid,
+		ktime_to_ms(timeval_to_ktime(lexectime)),
+		ktime_to_ms(timeval_to_ktime(ldeadline)),
+		time_base == 0 ? "ABS" : ( time_base == 1 ? "REL" : "INVALID"));
+
+	/*
+	 * calculate deadline with respect to CLOCK_MONOTONIC
+	 */
+	kdeadline = timeval_to_ktime(ldeadline);
+	if (time_base == ATLAS_TIME_REL)
+		kdeadline = ktime_add(ktime_get(), kdeadline);
+
+	/*
+	 * allocate memory for the new job; we are in process context and
+	 * hold no lock yet, so the allocation may sleep
+	 */
+	job = kmalloc(sizeof(*job), GFP_KERNEL);
+	DEBUG(DEBUG_SYS_SUBMIT, "job=%p", job);
+	if (job == NULL) {
+		return -ENOMEM;
+	}
+
+	rcu_read_lock();
+
+	/*
+	 * check for thread existence
+	 */
+	job->pid = find_get_pid(pid);
+	if (!job->pid)
+		goto out_free;
+	
+	/*
+	 * The pid may exist without a thread attached to it (any more);
+	 * that is an error of the caller, not a kernel bug.
+	 */
+	t = pid_task(job->pid, PIDTYPE_PID);
+	if (!t)
+		goto out_put_pid;
+	atlas_rq = &task_rq(t)->atlas;
+
+	init_job(job);
+	
+	job->deadline = kdeadline; 
+	job->exectime = timeval_to_ktime(lexectime);
+	
+	/* initially the whole request is scheduled */
+	job->sdeadline = job->deadline;
+	job->sexectime = job->exectime;
+
+	raw_spin_lock_irqsave(&atlas_rq->lock, flags);
+	now = ktime_get();
+	
+	assign_rq_job(atlas_rq, job, now);
+
+	if (ktime_cmp(job->exectime, job->sexectime) == 0)
+		DEBUG(DEBUG_SYS_SUBMIT, "sexectime == exectime");
+	else if (ktime_zero(job->sexectime))
+		DEBUG(DEBUG_SYS_SUBMIT, "sexectime == 0");
+	else
+		DEBUG(DEBUG_SYS_SUBMIT, "sexectime < exectime");
+
+	raw_spin_unlock_irqrestore(&atlas_rq->lock, flags);
+
+	/* rcu_read_lock keeps t valid while the job is queued on it */
+	assign_task_job(t, job);
+	
+	DEBUG(DEBUG_SYS_SUBMIT, "ready: job=%p", job);
+	rcu_read_unlock();
+	/* drop our own reference; the plan and the task list hold theirs */
+	put_job(job);
+	return 0;
+
+	/*
+	 * Error exits: the job was never initialized, so it is released by
+	 * hand and must not be passed to put_job().
+	 */
+out_put_pid:
+	put_pid(job->pid);
+out_free:
+	rcu_read_unlock();
+	kfree(job);
+	return -ESRCH;
+}
diff --git a/kernel/sched/atlas_recover.c b/kernel/sched/atlas_recover.c
new file mode 100644
index 0000000..8a52fd3
--- /dev/null
+++ b/kernel/sched/atlas_recover.c
@@ -0,0 +1,526 @@
+#include <linux/rbtree.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
+#include "sched.h"
+
+#define PENDING_MOVE_TO_CFS   0x1
+
+#define ATLAS_DEBUG
+
+enum {	/* DEBUG() category bits, tested against debug_mask; bits 0, 5 and 9 are unassigned */
+		DEBUG_SET_CURR_TASK  = 1UL << 1,
+		DEBUG_ENQUEUE        = 1UL << 2,
+		DEBUG_DEQUEUE        = 1UL << 3,
+		DEBUG_PICK_NEXT_TASK = 1UL << 4,
+		DEBUG_SWITCHED_TO    = 1UL << 6,
+		DEBUG_PUT_PREV_TASK  = 1UL << 7,
+		DEBUG_CHECK_PREEMPT  = 1UL << 8,
+		DEBUG_TIMER          = 1UL << 10,
+		DEBUG_SWITCH_SCHED   = 1UL << 11,
+};
+
+#ifdef ATLAS_DEBUG
+/*
+ * Bitmask of DEBUG_* categories to log (0 = silent), e.g. DEBUG_PICK_NEXT_TASK |
+ * DEBUG_PUT_PREV_TASK | DEBUG_ENQUEUE | DEBUG_DEQUEUE | DEBUG_TIMER | DEBUG_SWITCH_SCHED
+ */
+static const unsigned debug_mask = 0;
+/* running message number; a plain int that is incremented without locking */
+static int printk_counter = 0;
+	#define DEBUG(T,STR,...) \
+		do { \
+			if ((T) & debug_mask)  { \
+				preempt_disable(); \
+				printk_sched("RECOVER: %d (%d): "#T ": " STR "\n", (printk_counter++), \
+					smp_processor_id(), ##__VA_ARGS__); \
+				preempt_enable(); \
+			} \
+		} while(0)
+	/* statement prefix: the statement that follows runs only if category T is enabled */
+	#define DEBUG_ON(T) if (debug_mask & (T))
+#else 
+	#define DEBUG(...) do { } while (0)
+	#define DEBUG_ON(T) if (0)
+#endif /* ATLAS_DEBUG */
+
+const struct sched_class atlas_recover_sched_class;
+
+static inline int ktime_zero(ktime_t a) {
+	return ktime_equal(a, ktime_set(0, 0));	/* non-zero iff a is the zero time */
+}
+
+static inline struct rq *rq_of(struct atlas_recover_rq *atlas_recover_rq)
+{	/* the recover runqueue is the atlas_recover member of struct rq: return that rq */
+	return container_of(atlas_recover_rq, struct rq, atlas_recover);
+}
+
+static inline struct task_struct *task_of(struct sched_atlas_entity *se)
+{	/* the entity is the atlas member of task_struct: return the owning task */
+	return container_of(se, struct task_struct, atlas);
+}
+
+static inline int hrtimer_start_nowakeup(struct hrtimer *timer, ktime_t tim,
+		const enum hrtimer_mode mode)
+{	/* start timer at tim; the two remaining arguments (presumably range and wakeup) are 0 */
+	return __hrtimer_start_range_ns(timer, tim, 0, mode, 0);
+}
+   
+static inline void update_stats_curr_start(struct atlas_recover_rq *atlas_recover_rq,
+			struct sched_atlas_entity *se, ktime_t now)
+{
+	se->start = now;	/* caller-supplied stamp; job budgets are charged from here */
+	task_of(se)->se.exec_start = rq_of(atlas_recover_rq)->clock_task;
+}
+
+/*
+ * Expiry handler of the per-runqueue recover timer, which setup_rq_timer() arms
+ * with a job's remaining sexectime: request the move to CFS and reschedule.
+ */
+static enum hrtimer_restart timer_rq_func(struct hrtimer *timer)
+{
+	struct atlas_recover_rq *atlas_recover_rq =
+			container_of(timer, struct atlas_recover_rq, timer);
+	struct rq *rq = rq_of(atlas_recover_rq);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	/* rq->curr is dereferenced here, so rule out NULL in the same check */
+	BUG_ON(!rq->curr || rq->curr->sched_class != &atlas_recover_sched_class);
+
+	sched_log("Timer Recover");
+
+	/* consumed by atlas_recover_do_pending_work() */
+	atlas_recover_rq->pending_work |= PENDING_MOVE_TO_CFS;
+	resched_task(rq->curr);
+
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	return HRTIMER_NORESTART;
+}
+
+static inline void setup_rq_timer(struct atlas_recover_rq *atlas_recover_rq,
+		struct atlas_job *job) {
+
+	/* Arm the recover timer to expire after the job's remaining sexectime
+	 * (relative, CPU-pinned mode); without a job there is nothing to time. */
+	if (likely(job))
+		hrtimer_start_nowakeup(&atlas_recover_rq->timer,
+				job->sexectime, HRTIMER_MODE_REL_PINNED);
+}
+
+void init_atlas_recover_rq(struct atlas_recover_rq *atlas_recover_rq)
+{	/* reset every field of a recover runqueue; called from sched_init() */
+	atlas_recover_rq->curr = NULL;
+	atlas_recover_rq->tasks_timeline = RB_ROOT;
+	atlas_recover_rq->rb_leftmost_se = NULL;
+	atlas_recover_rq->jobs = RB_ROOT;
+	atlas_recover_rq->nr_runnable = 0;
+	atlas_recover_rq->flags = 0;
+	atlas_recover_rq->pending_work = 0;
+	/* timer_rq_func() requests the move to CFS when this timer expires */
+	hrtimer_init(&atlas_recover_rq->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+	atlas_recover_rq->timer.function = &timer_rq_func;
+}
+
+static inline int job_before(struct atlas_job *a, struct atlas_job *b)
+{
+	/* strict ordering by deadline; both jobs must exist */
+	BUG_ON(!a);
+	BUG_ON(!b);
+	return ktime_to_ns(b->deadline) > ktime_to_ns(a->deadline);
+}
+
+static inline int entity_before(struct sched_atlas_entity *a, struct sched_atlas_entity *b)
+{
+	struct atlas_job *lhs = a->job, *rhs = b->job;	/* entities compare by job deadline */
+	return job_before(lhs, rhs);
+}
+
+
+static void enqueue_entity(struct atlas_recover_rq *atlas_recover_rq,
+		struct sched_atlas_entity *se)
+{
+	struct rb_root *timeline = &atlas_recover_rq->tasks_timeline;
+	struct rb_node **slot = &timeline->rb_node;
+	struct rb_node *above = NULL;
+	int went_right = 0;
+	/*
+	 * Insert se into the timeline ordered by job deadline. An entity whose
+	 * deadline is not strictly earlier descends to the right.
+	 */
+	//FIXME?
+	rb_init_node(&se->run_node);
+
+	while (*slot) {
+		above = *slot;
+		if (entity_before(se, rb_entry(above, struct sched_atlas_entity, run_node))) {
+			slot = &above->rb_left;
+		} else {
+			slot = &above->rb_right;
+			went_right = 1;
+		}
+	}
+	/* never descended right: se is the new leftmost (earliest deadline) */
+	if (!went_right)
+		atlas_recover_rq->rb_leftmost_se = &se->run_node;
+
+	rb_link_node(&se->run_node, above, slot);
+	rb_insert_color(&se->run_node, timeline);
+}
+
+static void dequeue_entity(struct atlas_recover_rq *atlas_recover_rq,
+		struct sched_atlas_entity *se)
+{
+	struct rb_node *victim = &se->run_node;
+
+	/* keep the cached leftmost pointer valid: advance it to the successor */
+	if (atlas_recover_rq->rb_leftmost_se == victim)
+		atlas_recover_rq->rb_leftmost_se = rb_next(victim);
+
+	/* unlink se from the deadline-ordered timeline */
+	rb_erase(victim, &atlas_recover_rq->tasks_timeline);
+}
+
+static inline struct sched_atlas_entity *pick_first_entity
+		(struct atlas_recover_rq *atlas_recover_rq)
+{
+	struct rb_node *first = atlas_recover_rq->rb_leftmost_se;
+
+	/*
+	 * The cached leftmost node holds the earliest deadline; NULL if none queued.
+	 */
+	return first ? rb_entry(first, struct sched_atlas_entity, run_node) : NULL;
+}
+
+static inline struct sched_atlas_entity *pick_next_entity
+		(struct sched_atlas_entity *se)
+{
+	struct rb_node *succ = rb_next(&se->run_node);
+
+	/*
+	 * In-order successor of se in the timeline; NULL if se is the last entity.
+	 */
+	return succ ? rb_entry(succ, struct sched_atlas_entity, run_node) : NULL;
+}
+
+static void update_curr_atlas_recover(struct rq *rq)
+{
+	/* Charge rq->curr for the time run since the last update; accounting copied from rt. */
+	struct task_struct *curr = rq->curr;
+	struct sched_atlas_entity *se = &curr->atlas;
+	struct atlas_recover_rq *atlas_recover_rq = &rq->atlas_recover;
+	u64 delta_exec;
+	struct atlas_job *job = se->job;
+	ktime_t diff_ktime, now;
+	/* only a current task that belongs to this class is accounted here */
+	if (curr->sched_class != &atlas_recover_sched_class)
+		return;
+	/* runtime by rq->clock_task since exec_start; a negative delta is clamped to 0 */
+	delta_exec = rq->clock_task - curr->se.exec_start;
+	if (unlikely((s64)delta_exec < 0))
+		delta_exec = 0;
+
+	schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
+
+	curr->se.sum_exec_runtime += delta_exec;
+	account_group_exec_runtime(curr, delta_exec);
+	/* the job budgets are charged with the ktime_get() time elapsed since se->start */
+	now = ktime_get();
+	diff_ktime = ktime_sub(now, se->start);
+    update_stats_curr_start(atlas_recover_rq, se, now); 
+	cpuacct_charge(curr, delta_exec);
+
+	//it's very unlikely, but possible in sys_atlas_next
+	if (unlikely(!job))
+		return;
+	
+	//update exectime
+	/* exectime used up: zero both budgets and flag the entity */
+	job->exectime = ktime_sub(job->exectime, diff_ktime);
+	if (ktime_to_ns(job->exectime) <= 0) {
+		job->exectime = ktime_set(0,0);
+		job->sexectime = job->exectime;
+		goto exectime_exceeded;
+	}
+	/* otherwise charge sexectime as well; reaching zero also flags the entity */
+	job->sexectime = ktime_sub(job->sexectime, diff_ktime);
+	if (ktime_to_ns(job->sexectime) <= 0) {
+		job->sexectime = ktime_set(0,0);
+		goto exectime_exceeded;
+	}
+	
+	return;
+
+exectime_exceeded:
+	se->flags |= ATLAS_EXECTIME;
+	return;
+}
+
+
+
+/*
+ * Make p runnable on the recover class: queue it in the deadline timeline
+ * (unless it is the currently running entity) and bump the run counters.
+ */
+static void enqueue_task_atlas_recover(struct rq *rq, struct task_struct *p, int flags)
+{
+	struct sched_atlas_entity *se = &p->atlas;
+	struct atlas_recover_rq *recover_rq = &rq->atlas_recover;
+
+	DEBUG(DEBUG_ENQUEUE, "p->pid=%d job->sexec=%lld job->exec=%lld", p->pid,
+		se->job ? ktime_to_ns(se->job->sexectime) : -1, se->job ? ktime_to_ns(se->job->exectime) : -1);
+
+	/* the running entity is kept out of the tree */
+	if (se != recover_rq->curr)
+		enqueue_entity(recover_rq, se);
+
+	se->on_recover_rq = 1;
+	recover_rq->nr_runnable += 1;
+	inc_nr_running(rq);
+}
+
+/*
+ * Take p off the recover class: refresh the accounting, remove p from the
+ * timeline (or clear curr if it is the running entity) and drop the counters.
+ */
+static void dequeue_task_atlas_recover(struct rq *rq, struct task_struct *p, int flags)
+{
+	struct sched_atlas_entity *se = &p->atlas;
+	struct atlas_recover_rq *recover_rq = &rq->atlas_recover;
+
+	DEBUG(DEBUG_DEQUEUE, "p->pid=%d job->sexec=%lld job->exec=%lld", p->pid,
+		se->job ? ktime_to_ns(se->job->sexectime) : -1, se->job ? ktime_to_ns(se->job->exectime) : -1);
+
+	/* bring the accounting of rq->curr up to date first */
+	update_curr_atlas_recover(rq);
+
+	/* the running entity is not in the tree, so only curr needs resetting */
+	if (recover_rq->curr != se)
+		dequeue_entity(recover_rq, se);
+	else
+		recover_rq->curr = NULL;
+
+	se->on_recover_rq = 0;
+	recover_rq->nr_runnable -= 1;
+	dec_nr_running(rq);
+}
+
+static void yield_task_atlas_recover(struct rq *rq)
+{
+	/* yielding is a no-op for this class */
+}
+
+extern void atlas_switch_scheduler(struct rq *,
+	struct task_struct *, const struct sched_class *);
+
+static void put_prev_task_atlas_recover(struct rq *rq, struct task_struct *prev)
+{
+	struct sched_atlas_entity *se = &prev->atlas;
+	struct atlas_recover_rq *recover_rq = &rq->atlas_recover;
+
+	/*
+	 * prev stops running: its budget timer is no longer needed
+	 */
+	hrtimer_cancel(&recover_rq->timer);
+
+	/* still runnable: refresh accounting and put it back into the timeline */
+	if (se->on_recover_rq) {
+		update_curr_atlas_recover(rq);
+		enqueue_entity(recover_rq, se);
+	}
+
+	/* no entity of this class is running until the next pick */
+	recover_rq->curr = NULL;
+}
+
+static void check_preempt_curr_atlas_recover(struct rq *rq, struct task_struct *p, int flags)
+{
+	struct task_struct *curr = rq->curr;
+	struct sched_atlas_entity *cur_se = &curr->atlas, *new_se = &p->atlas;
+	struct atlas_job *cur_job = cur_se->job, *new_job = new_se->job;
+
+	/* decide whether the newly runnable p should preempt rq->curr */
+	if (unlikely(cur_se == new_se))
+		return;
+
+	/* a reschedule is already pending */
+	if (test_tsk_need_resched(curr))
+		return;
+
+	/* Bug if task is not scheduled by us */
+	BUG_ON(curr->sched_class != &atlas_recover_sched_class);
+	BUG_ON(p->sched_class != &atlas_recover_sched_class);
+
+	/*
+	 * Preemption rules:
+	 *  - p without a job always preempts;
+	 *  - otherwise curr without a job is never preempted;
+	 *  - otherwise p needs the strictly earlier job deadline.
+	 */
+	if (new_job) {
+		if (!cur_job)
+			return;
+		if (ktime_to_ns(new_job->deadline) >= ktime_to_ns(cur_job->deadline))
+			return;
+	}
+
+	resched_task(curr);
+}
+
+static void task_tick_atlas_recover(struct rq *rq, struct task_struct *p, int queued)
+{
+	/* scheduler tick: only refresh the runtime/budget accounting of rq->curr */
+	update_curr_atlas_recover(rq);
+}
+
+static void prio_changed_atlas_recover(struct rq *rq, struct task_struct *p, int oldprio)
+{
+	/* priority changes are ignored by this class */
+}
+
+static void switched_from_atlas_recover(struct rq *rq, struct task_struct *p)
+{
+	/* nothing to clean up when a task leaves this class */
+}
+
+static void switched_to_atlas_recover(struct rq *rq, struct task_struct *p)
+{
+	DEBUG(DEBUG_SWITCHED_TO, "pid=%d", p->pid);
+	/*
+	 * p just joined this class. If it is queued here, either reschedule
+	 * (p is the running task) or let the preemption check decide.
+	 */
+	if (p->atlas.on_recover_rq) {
+		if (rq->curr != p)
+			check_preempt_curr(rq, p, 0);
+		else
+			resched_task(rq->curr);
+	}
+}
+
+static unsigned int get_rr_interval_atlas_recover(struct rq *rq, struct task_struct *task)
+{
+    return 0;	/* this class always reports an interval of 0 */
+}
+
+#ifdef CONFIG_SMP
+static int select_task_rq_atlas_recover(struct task_struct *p, int sd_flag, int flags)
+{
+	/* no balancing: the task stays on the CPU reported by task_cpu() */
+	return task_cpu(p);
+}
+#endif /* CONFIG_SMP */
+
+
+
+static struct task_struct *pick_next_task_atlas_recover(struct rq *rq)
+{
+	struct atlas_recover_rq *atlas_recover_rq = &rq->atlas_recover;
+	struct sched_atlas_entity *se;
+
+	BUG_ON(atlas_recover_rq->curr);
+
+	/*
+	 * only proceed if there are runnable tasks
+	 */
+	if (likely(!atlas_recover_rq->nr_runnable)) {
+		//if there is no ready task, no need to set up timer
+		return NULL;
+	}
+
+	/* earliest deadline first; the running entity is taken out of the timeline */
+	se = pick_first_entity(atlas_recover_rq);
+	BUG_ON(!se);
+
+	atlas_recover_rq->curr = se;
+	dequeue_entity(atlas_recover_rq, se);
+
+	/* se->job may be NULL (see the check below), so guard the dereference */
+	DEBUG(DEBUG_PICK_NEXT_TASK, "p->pid=%d job->sexec=%lld job->exec=%lld", task_of(se)->pid,
+		se->job ? ktime_to_ns(se->job->sexectime) : -1, se->job ? ktime_to_ns(se->job->exectime) : -1);
+
+	update_stats_curr_start(atlas_recover_rq, se, ktime_get());
+
+	/* budget already used up: request the move to CFS; else arm the budget timer */
+	if (se->job) {
+		if (ktime_zero(se->job->sexectime))
+			atlas_recover_rq->pending_work |= PENDING_MOVE_TO_CFS;
+		else
+			setup_rq_timer(atlas_recover_rq, se->job);
+	}
+
+	return task_of(se);
+}
+
+static void set_curr_task_atlas_recover(struct rq *rq)
+{
+	struct atlas_recover_rq *recover_rq = &rq->atlas_recover;
+	struct task_struct *running = rq->curr;
+	struct sched_atlas_entity *se = &running->atlas;
+
+	/*
+	 * Stamp the start time of rq->curr and record it as the running
+	 * entity of this class (there must not be one already).
+	 */
+	DEBUG(DEBUG_SET_CURR_TASK, "pid=%d", running->pid);
+	update_stats_curr_start(recover_rq, se, ktime_get());
+
+	BUG_ON(recover_rq->curr);
+	recover_rq->curr = se;
+
+	/*
+	 * restart the budget timer for the new current task (if it has a job)
+	 */
+	hrtimer_cancel(&recover_rq->timer);
+	setup_rq_timer(recover_rq, se->job);
+}
+
+
+void atlas_recover_do_pending_work(struct rq *rq) {
+	struct atlas_recover_rq *recover_rq = &rq->atlas_recover;
+	/* run by the scheduler core when pending_work is set: hand rq->curr over to CFS */
+	if (recover_rq->pending_work & PENDING_MOVE_TO_CFS) {
+		atlas_switch_scheduler(rq, rq->curr, &fair_sched_class);
+		recover_rq->pending_work &= ~PENDING_MOVE_TO_CFS;
+	}
+
+	/* every known request must have been consumed by now */
+	BUG_ON(recover_rq->pending_work);
+}
+
+/*
+ * Method table of the ATLAS recover scheduling class (.next points to CFS):
+ */
+const struct sched_class atlas_recover_sched_class = {
+	.next               = &fair_sched_class,
+	.enqueue_task       = enqueue_task_atlas_recover,
+	.dequeue_task       = dequeue_task_atlas_recover,
+	.yield_task         = yield_task_atlas_recover,
+	//.yield_to_task		= yield_to_task_atlas,
+
+	.check_preempt_curr = check_preempt_curr_atlas_recover,
+
+	.pick_next_task     = pick_next_task_atlas_recover,
+	.put_prev_task      = put_prev_task_atlas_recover,
+
+/* SMP is not supported so far: select_task_rq just returns task_cpu(p) */
+#ifdef CONFIG_SMP
+	.select_task_rq     = select_task_rq_atlas_recover,
+
+	//.rq_online		= rq_online_atlas,
+	//.rq_offline		= rq_offline_atlas,
+
+	//.task_waking		= task_waking_atlas,
+#endif
+
+	.set_curr_task      = set_curr_task_atlas_recover,
+	.task_tick          = task_tick_atlas_recover,
+	//.task_fork        = task_fork_atlas,
+
+	.prio_changed       = prio_changed_atlas_recover,
+	.switched_from      = switched_from_atlas_recover,
+	.switched_to        = switched_to_atlas_recover,
+
+	.get_rr_interval    = get_rr_interval_atlas_recover,
+
+};
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9f81a3a..0466eff 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -88,6 +88,18 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
+void sched_log(const char *fmt, ...) {
+	char msg[50];	/* longer messages are truncated by vsnprintf() */
+	va_list ap;
+
+	/* format the message locally, then hand it to the sched_log tracepoint */
+	va_start(ap, fmt);
+	vsnprintf(msg, sizeof(msg), fmt, ap);
+	va_end(ap);
+
+	trace_sched_log(msg);
+}
+
 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
 	unsigned long delta;
@@ -715,6 +727,7 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	update_rq_clock(rq);
 	sched_info_queued(p);
 	p->sched_class->enqueue_task(rq, p, flags);
+	trace_sched_enqueue_task(p, rq);
 }
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -722,6 +735,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	update_rq_clock(rq);
 	sched_info_dequeued(p);
 	p->sched_class->dequeue_task(rq, p, flags);
+	trace_sched_dequeue_task(p, rq);
 }
 
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1695,6 +1709,8 @@ int wake_up_state(struct task_struct *p, unsigned int state)
 	return try_to_wake_up(p, state, 0);
 }
 
+extern enum hrtimer_restart atlas_timer_task_function(struct hrtimer *timer);
+
 /*
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
@@ -1718,6 +1734,12 @@ static void __sched_fork(struct task_struct *p)
 #endif
 
 	INIT_LIST_HEAD(&p->rt.run_list);
+	
+	/*ATLAS stuff*/
+	INIT_LIST_HEAD(&p->atlas.jobs);
+	p->atlas.on_rq = 0;
+	hrtimer_init(&p->atlas.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+	p->atlas.timer.function = &atlas_timer_task_function;
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -1749,7 +1771,7 @@ void sched_fork(struct task_struct *p)
 	 * Revert to default priority/policy on fork if requested.
 	 */
 	if (unlikely(p->sched_reset_on_fork)) {
-		if (task_has_rt_policy(p)) {
+		if (task_has_rt_policy(p) || unlikely(p->policy == SCHED_ATLAS)) {
 			p->policy = SCHED_NORMAL;
 			p->static_prio = NICE_TO_PRIO(0);
 			p->rt_priority = 0;
@@ -3335,6 +3357,8 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
 	if (prev->on_rq || rq->skip_clock_update < 0)
 		update_rq_clock(rq);
 	prev->sched_class->put_prev_task(rq, prev);
+	
+	trace_sched_put_prev_task(rq, prev);
 }
 
 /*
@@ -3353,16 +3377,20 @@ pick_next_task(struct rq *rq)
 	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
 		p = fair_sched_class.pick_next_task(rq);
 		if (likely(p))
-			return p;
+			goto out;
 	}
 
 	for_each_class(class) {
 		p = class->pick_next_task(rq);
 		if (p)
-			return p;
+			goto out;
 	}
 
-	BUG(); /* the idle class will always have a runnable task */
+out:
+	BUG_ON(!p); /* the idle class will always have a runnable task */
+	
+	trace_sched_pick_next_task(rq, p);
+	return p;
 }
 
 /*
@@ -3381,14 +3409,16 @@ need_resched:
 	rq = cpu_rq(cpu);
 	rcu_note_context_switch(cpu);
 	prev = rq->curr;
-
+	
 	schedule_debug(prev);
 
 	if (sched_feat(HRTICK))
 		hrtick_clear(rq);
-
+	
 	raw_spin_lock_irq(&rq->lock);
 
+	trace_sched_enter(rq);
+
 	switch_count = &prev->nivcsw;
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
 		if (unlikely(signal_pending_state(prev->state, prev))) {
@@ -3396,6 +3426,11 @@ need_resched:
 		} else {
 			deactivate_task(rq, prev, DEQUEUE_SLEEP);
 			prev->on_rq = 0;
+ 
+			/* check if thread was scheduled by cfs to support an atlas job */
+			if (unlikely(prev->atlas.flags & ATLAS_CFS_ADVANCED)) {
+				atlas_cfs_blocked(rq, prev);
+			}
 
 			/*
 			 * If a worker went to sleep, notify and ask workqueue
@@ -3412,6 +3447,16 @@ need_resched:
 		}
 		switch_count = &prev->nvcsw;
 	}
+	BUG_ON((rq->atlas.pending_work & 0x3) == 3 && rq->atlas.advance_in_cfs);
+	BUG_ON(rq->atlas.pending_work & 0x2 && rq->atlas.advance_in_cfs);
+
+	if (unlikely(rq->atlas.pending_work)) {
+		atlas_do_pending_work(rq);
+	}
+	
+	if (unlikely(rq->atlas_recover.pending_work)) {
+		atlas_recover_do_pending_work(rq);
+	}
 
 	pre_schedule(rq, prev);
 
@@ -3419,6 +3464,7 @@ need_resched:
 		idle_balance(cpu, rq);
 
 	put_prev_task(rq, prev);
+
 	next = pick_next_task(rq);
 	clear_tsk_need_resched(prev);
 	rq->skip_clock_update = 0;
@@ -3441,9 +3487,10 @@ need_resched:
 		raw_spin_unlock_irq(&rq->lock);
 
 	post_schedule(rq);
-
+	
 	sched_preempt_enable_no_resched();
-	if (need_resched())
+	if (need_resched() || rq->atlas.pending_work
+			|| rq->atlas_recover.pending_work)
 		goto need_resched;
 }
 
@@ -4223,6 +4270,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 	p->prio = rt_mutex_getprio(p);
 	if (rt_prio(p->prio))
 		p->sched_class = &rt_sched_class;
+	else if (policy == SCHED_ATLAS)
+		p->sched_class = &atlas_sched_class;
 	else
 		p->sched_class = &fair_sched_class;
 	set_load_weight(p);
@@ -4266,7 +4315,7 @@ recheck:
 
 		if (policy != SCHED_FIFO && policy != SCHED_RR &&
 				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-				policy != SCHED_IDLE)
+				policy != SCHED_IDLE && policy != SCHED_ATLAS)
 			return -EINVAL;
 	}
 
@@ -7299,6 +7348,8 @@ void __init sched_init(void)
 		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs);
 		init_rt_rq(&rq->rt, rq);
+		init_atlas_rq(&rq->atlas);
+		init_atlas_recover_rq(&rq->atlas_recover);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -8559,3 +8610,9 @@ struct cgroup_subsys cpuacct_subsys = {
 	.base_cftypes = files,
 };
 #endif	/* CONFIG_CGROUP_CPUACCT */
+
+EXPORT_SYMBOL_GPL(rt_sched_class);
+EXPORT_SYMBOL_GPL(atlas_sched_class);
+EXPORT_SYMBOL_GPL(atlas_recover_sched_class);
+EXPORT_SYMBOL_GPL(fair_sched_class);
+EXPORT_SYMBOL_GPL(idle_sched_class);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 573e1ca..b272fa6 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2038,7 +2038,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
 }
 
 const struct sched_class rt_sched_class = {
-	.next			= &fair_sched_class,
+	.next			= &atlas_sched_class,
 	.enqueue_task		= enqueue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
 	.yield_task		= yield_task_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 749f712..92eafe5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,3 +1,5 @@
+#ifndef _LINUX_SCHED_H_PRIVATE
+#define _LINUX_SCHED_H_PRIVATE
 
 #include <linux/sched.h>
 #include <linux/mutex.h>
@@ -309,6 +311,60 @@ struct rt_rq {
 #endif
 };
 
+/* flag bits; EXECTIME and CFS_ADVANCED are used on sched_atlas_entity.flags */
+#define ATLAS_EXECTIME      0x1	/* set once a job's exectime or sexectime reached zero */
+#define ATLAS_DEADLINE      0x2	/* presumably the deadline counterpart - TODO confirm */
+#define ATLAS_CFS_ADVANCED  0x4	/* tested in __schedule() when a task blocks */
+#define ATLAS_PENDING_JOBS  0x8	/* usage not visible here - TODO confirm */
+
+//needs to be defined here because of trace stuff
+struct atlas_job {
+	struct list_head list;
+	struct rb_node rb_node;
+	struct pid *pid;  //maps a submission to its task AND distinguishes task and gap
+	ktime_t exectime; //relative: remaining execution time, reduced while the job runs
+	ktime_t deadline; //absolute
+	ktime_t sdeadline; //starts as a copy of deadline at submission
+	ktime_t sexectime; //starts as a copy of exectime; the recover timer is armed with it
+	atomic_t count; //presumably a reference count (see put_job()) - TODO confirm
+};
+
+enum atlas_timer_target {	/* type of atlas_rq.timer_target; meanings below are presumed */
+	ATLAS_SLACK,	/* presumably: timer marks the end of slack time - TODO confirm */
+	ATLAS_JOB,	/* presumably: timer belongs to a job - TODO confirm */
+	ATLAS_NONE	/* presumably: no timer purpose set - TODO confirm */
+};
+
+struct atlas_rq {	/* per-runqueue state of the ATLAS class (member atlas of struct rq) */
+	struct sched_atlas_entity *curr;
+	struct rb_root     tasks_timeline;
+	struct rb_node *rb_leftmost_se;
+	struct rb_root     jobs;
+	raw_spinlock_t			lock; //held around assign_rq_job() on job submission
+	int nr_runnable;
+	struct hrtimer timer; //used for slack time and for time to cfs
+	enum atlas_timer_target timer_target;
+	ktime_t timer_end;
+	struct atlas_job *cfs_job;
+	ktime_t cfs_job_start;
+	unsigned long flags;
+	unsigned long pending_work; //checked in __schedule(); handled by atlas_do_pending_work()
+	struct task_struct *advance_in_cfs;
+	struct task_struct *move_to_atlas;
+	int skip_update_curr;
+};
+
+struct atlas_recover_rq {	/* per-runqueue state of the recover class (member atlas_recover of struct rq) */
+	struct sched_atlas_entity *curr; //running entity; kept out of tasks_timeline
+	struct rb_root     tasks_timeline; //queued entities, ordered by job deadline
+	struct rb_node     *rb_leftmost_se; //cached leftmost node of tasks_timeline
+	struct rb_root     jobs; //not referenced by the recover class code
+	int nr_runnable; //tasks enqueued on this class, including the running one
+	struct hrtimer timer; //armed with the running job's sexectime; see timer_rq_func()
+	unsigned long flags;
+	unsigned long pending_work; //checked in __schedule(); handled by atlas_recover_do_pending_work()
+};
+
 #ifdef CONFIG_SMP
 
 /*
@@ -370,6 +426,8 @@ struct rq {
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
+	struct atlas_rq atlas;
+	struct atlas_recover_rq atlas_recover;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
@@ -844,10 +902,11 @@ enum cpuacct_stat_index {
 
 extern const struct sched_class stop_sched_class;
 extern const struct sched_class rt_sched_class;
+extern const struct sched_class atlas_sched_class;
+extern const struct sched_class atlas_recover_sched_class;
 extern const struct sched_class fair_sched_class;
 extern const struct sched_class idle_sched_class;
 
-
 #ifdef CONFIG_SMP
 
 extern void trigger_load_balance(struct rq *rq, int cpu);
@@ -1133,6 +1192,8 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 
 #endif
 
+extern void sched_log(const char *fmt, ...);
+
 extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
 extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
 extern void print_cfs_stats(struct seq_file *m, int cpu);
@@ -1140,6 +1201,11 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
 
 extern void init_cfs_rq(struct cfs_rq *cfs_rq);
 extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
+extern void init_atlas_rq(struct atlas_rq *atlas_rq);
+extern void atlas_cfs_blocked(struct rq *rq, struct task_struct *p);
+extern void atlas_do_pending_work(struct rq *rq);
+extern void init_atlas_recover_rq(struct atlas_recover_rq *atlas_recover_rq);
+extern void atlas_recover_do_pending_work(struct rq *rq);
 extern void unthrottle_offline_cfs_rqs(struct rq *rq);
 
 extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
@@ -1153,3 +1219,5 @@ enum rq_nohz_flag_bits {
 
 #define nohz_flags(cpu)	(&cpu_rq(cpu)->nohz_flags)
 #endif
+
+#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4ab1187..7761576 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -339,6 +339,22 @@ static struct ctl_table kern_table[] = {
 	},
 #endif
 	{
+		.procname	= "sched_atlas_min_slack",
+		.data		= &sysctl_sched_atlas_min_slack,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sched_atlas_advance_in_cfs",
+		.data		= &sysctl_sched_atlas_advance_in_cfs,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{
 		.procname	= "sched_rt_period_us",
 		.data		= &sysctl_sched_rt_period,
 		.maxlen		= sizeof(unsigned int),