diff --git a/src/Makefile b/src/Makefile index 4765219b67..7a00b4e94b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,9 +1,9 @@ MAKEFLAGS=--no-print-directory --section-alignment 0x1000 -I$(PWD) #$(info Make flags $(MAKEFLAGS)) -default: | all cp +default: | all -all: comps plat +all: comps plat cp comps: $(info ) diff --git a/src/components/Makefile.comp b/src/components/Makefile.comp index 6257eb9d5c..2a4887534e 100644 --- a/src/components/Makefile.comp +++ b/src/components/Makefile.comp @@ -11,6 +11,7 @@ MUSLBIN=$(MUSLDIR)/bin MUSLCC=$(MUSLBIN)/musl-$(CC) MUSLINC=-isystem$(MUSLDIR)/include +PSLIBDIR=$(LIBDIR)/ps CKDIR=$(LIBDIR)/ck CKLIBDIR=$(CKDIR)/lib CKINCDIR=$(CKDIR)/include @@ -40,22 +41,14 @@ LUAINC=-I$(LUADIR)/src -I$(LUABASE)/cos/include INC_PATH=-I./ -I$(CDIR)/include/ -I$(CDIR)/interface/ -I$(SHAREDINC) -I$(CKINCDIR) SHARED_FLAGS=-fno-merge-constants -nostdinc -nostdlib -fno-pic -OPT= -g -fvar-tracking -#OPT= -O3 +OPT = -g -fvar-tracking +OPT += -O3 CFLAGS=-m32 -D__x86__ -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -fno-stack-protector -fno-omit-frame-pointer -Wno-unused-variable $(INC_PATH) $(MUSLINC) $(LWIPINC) $(LUAINC) $(OPT) $(SHARED_FLAGS) CXXFLAGS=-fno-exceptions -fno-threadsafe-statics -Wno-write-strings $(CFLAGS) LDFLAGS=-melf_i386 MUSLCFLAGS=$(CFLAGS) -lc -lgcc -Xlinker -r ASFLAGS=-m32 $(INC_PATH) $(SHARED_FLAGS) -SERVER_STUB=s_stub.o -CLIENT_STUB=c_stub.o - -LIBCOSDEFKERN=-lcos_kernel_api -lcos_defkernel_api -LIBSLCORE=$(LIBCOSDEFKERN) -lsl_sched -lheap -lsl_xcpu -lsl_child -lck -LIBSLCAPMGR=$(LIBSLCORE) -lsl_capmgr -LIBSLRAW=$(LIBSLCORE) -lsl_raw - GCC_PIE=$(shell gcc -v 2>&1 | grep -c "\--enable-default-pie") ifeq ($(GCC_PIE),1) MUSLCFLAGS+=-no-pie @@ -63,3 +56,11 @@ LDFLAGS+=-no-pie CFLAGS+=-fno-pie CXXFLAGS+=-fno-pie endif + +SERVER_STUB=s_stub.o +CLIENT_STUB=c_stub.o + +LIBCOSDEFKERN=-lcos_kernel_api -lcos_defkernel_api +LIBSLCORE=$(LIBCOSDEFKERN) -lsl_sched -lheap -lsl_xcore -lsl_child -lck +LIBSLCAPMGR=$(LIBSLCORE) 
-lsl_capmgr +LIBSLRAW=$(LIBSLCORE) -lsl_raw -lcos_dcb diff --git a/src/components/implementation/Makefile.subsubdir b/src/components/implementation/Makefile.subsubdir index 693d3a11a1..89bc44b379 100644 --- a/src/components/implementation/Makefile.subsubdir +++ b/src/components/implementation/Makefile.subsubdir @@ -42,7 +42,7 @@ TMP_STR2=tmp2 INCLUDE=-I../ $(DEP_INC) $(IF_INCLUDE) $(CINC) LIB_LIBRARIES_PRE=$(DEP_LIB_EXIST) LIB_LIBRARIES=$(strip $(LIB_LIBRARIES_PRE)) -LIB_FLAGS=-L$(CKLIBDIR) -L$(LIBDIR) -L$(LIBCXXDIR) $(DEP_LIB) $(LIB_LIBRARIES) $(ADDITIONAL_LIBS) +LIB_FLAGS=-L$(PSLIBDIR) -L$(CKLIBDIR) -L$(LIBDIR) -L$(LIBCXXDIR) $(DEP_LIB) $(LIB_LIBRARIES) $(ADDITIONAL_LIBS) C_SOURCES=$(C_OBJS:%.o=%.c) CXX_SOURCES=$(CXX_OBJS:%.o=%.cc) diff --git a/src/components/implementation/capmgr/naive/Makefile b/src/components/implementation/capmgr/naive/Makefile index 171178b7c5..4a6a2129f4 100644 --- a/src/components/implementation/capmgr/naive/Makefile +++ b/src/components/implementation/capmgr/naive/Makefile @@ -1,7 +1,7 @@ C_OBJS=cap_mgr.c mem_mgr.c init.c ASM_OBJS= COMPONENT=capmgr.o -INTERFACES=capmgr channel +INTERFACES=capmgr channel work DEPENDENCIES= IF_LIB= ADDITIONAL_LIBS=$(LIBSLRAW) -lsl_mod_fprr -lsl_thd_static_backend diff --git a/src/components/implementation/capmgr/naive/cap_info.c b/src/components/implementation/capmgr/naive/cap_info.c index e35eb4486a..bcdce17eca 100644 --- a/src/components/implementation/capmgr/naive/cap_info.c +++ b/src/components/implementation/capmgr/naive/cap_info.c @@ -62,6 +62,27 @@ cap_info_thd_next(struct cap_comp_info *rci) return NULL; } +void +cap_info_cpu_initdcb_init(struct cap_comp_info *rci) +{ + dcbcap_t initdcb = 0; + unsigned short init_off = 0; + vaddr_t initaddr = 0; + struct cos_compinfo *ci = cos_compinfo_get(cap_info_dci(rci)); + struct cap_comp_cpu_info *rci_cpu = cap_info_cpu_local(rci); + + if (rci->cid == 0 || rci->cid == cos_spd_id()) { + cos_dcb_info_init_ext(cap_info_cpu_dcbdata(rci_cpu), 0, 0, 0, 0); + return; 
+ } + + initaddr = rci->init_dcb_start + cos_cpuid() * PAGE_SIZE; + initdcb = cos_dcb_alloc(cos_compinfo_get(cos_defcompinfo_curr_get()), ci->pgtbl_cap, initaddr); + assert(initdcb); + + cos_dcb_info_init_ext(cap_info_cpu_dcbdata(rci_cpu), ci, initdcb, initaddr, init_off); +} + struct cap_comp_info * cap_info_comp_init(spdid_t spdid, captblcap_t captbl_cap, pgtblcap_t pgtbl_cap, compcap_t compcap, capid_t cap_frontier, vaddr_t heap_frontier, spdid_t sched_spdid) @@ -76,13 +97,16 @@ cap_info_comp_init(spdid_t spdid, captblcap_t captbl_cap, pgtblcap_t pgtbl_cap, capci[spdid].cid = spdid; cos_meminfo_init(&ci->mi, 0, 0, 0); - cos_compinfo_init(ci, pgtbl_cap, captbl_cap, compcap, heap_frontier, cap_frontier, - cos_compinfo_get(cos_defcompinfo_curr_get())); + cos_compinfo_init(ci, pgtbl_cap, captbl_cap, compcap, heap_frontier, + cap_frontier, cos_compinfo_get(cos_defcompinfo_curr_get())); memset(rglb, 0, sizeof(struct cap_shmem_glb_info)); memset(cap_shi, 0, sizeof(struct cap_shmem_info)); cap_shi->cinfo = ci; + capci[spdid].init_dcb_start = heap_frontier - (NUM_CPU * PAGE_SIZE); + cap_info_cpu_initdcb_init(&capci[spdid]); + capci[spdid].initflag = 1; ps_faa((unsigned long *)&cap_comp_count, 1); @@ -336,7 +360,8 @@ cap_shmem_region_find(cos_channelkey_t key) cbuf_t i, free = rglb->free_region_id; for (i = 1; i <= free; i++) { - if (ps_load((unsigned long *)&rglb->region_keys[i - 1]) == key) { + cos_channelkey_t *k = &rglb->region_keys[i - 1]; + if (ps_load((unsigned long *)k) == (unsigned long)key) { id = i; break; } diff --git a/src/components/implementation/capmgr/naive/cap_info.h b/src/components/implementation/capmgr/naive/cap_info.h index 99d0bd060b..9919c6c796 100644 --- a/src/components/implementation/capmgr/naive/cap_info.h +++ b/src/components/implementation/capmgr/naive/cap_info.h @@ -15,6 +15,7 @@ #include #include #include +#include #define CAP_INFO_MAX_THREADS (MAX_NUM_THREADS) @@ -29,12 +30,12 @@ struct cap_shmem_glb_info { }; struct cap_comm_info { - 
arcvcap_t rcvcap; /* rcv capid in capmgr! */ - cpuid_t rcvcpuid; - cycles_t ipiwin, ipiwin_start; /* TODO: synchronize TSC on all cores */ - u32_t ipicnt, ipimax; - asndcap_t sndcap[NUM_CPU]; /* for cross-core asnds */ - sinvcap_t sinvcap[NUM_CPU]; /* for each core (except for the same core!) */ + arcvcap_t rcvcap; /* rcv capid in capmgr! */ + cpuid_t rcvcpuid; + cycles_t ipiwin, ipiwin_start; /* TODO: synchronize TSC on all cores */ + unsigned long ipicnt, ipimax; + asndcap_t sndcap[NUM_CPU]; /* for cross-core asnds */ + sinvcap_t sinvcap[NUM_CPU]; /* for each core (except for the same core!) */ } cap_comminfo[CAP_INFO_MAX_THREADS]; struct cap_channelaep_info { @@ -58,6 +59,8 @@ struct cap_comp_cpu_info { int p_thd_iterator; /* iterator for parent to get all threads created by capmgr in this component so far! */ thdcap_t p_initthdcap; /* init thread's cap in parent */ thdid_t initthdid; /* init thread's tid */ + + struct cos_dcbinfo_data dcb_data; } CACHE_ALIGNED; struct cap_comp_info { @@ -65,6 +68,7 @@ struct cap_comp_info { struct cos_defcompinfo defci; struct cap_shmem_info shminfo; int initflag; + vaddr_t init_dcb_start; struct cap_comp_cpu_info cpu_local[NUM_CPU]; }; @@ -74,6 +78,7 @@ struct cap_comp_info *cap_info_comp_init(spdid_t spdid, captblcap_t captbl_cap, struct sl_thd *cap_info_thd_init(struct cap_comp_info *rci, struct sl_thd *t, cos_channelkey_t key); struct sl_thd *cap_info_initthd_init(struct cap_comp_info *rci, struct sl_thd *t, cos_channelkey_t key); +void cap_info_cpu_initdcb_init(struct cap_comp_info *rci); struct cap_comp_info *cap_info_comp_find(spdid_t s); struct sl_thd *cap_info_thd_find(struct cap_comp_info *r, thdid_t t); @@ -116,6 +121,12 @@ cap_info_cpu_local(struct cap_comp_info *c) return &c->cpu_local[cos_cpuid()]; } +static inline struct cos_dcbinfo_data * +cap_info_cpu_dcbdata(struct cap_comp_cpu_info *c) +{ + return &c->dcb_data; +} + static inline struct cap_comp_info * cap_info_parent(struct cap_comp_info *r) { @@ -133,11 
+144,18 @@ cap_info_is_parent(struct cap_comp_info *r, spdid_t p) } static inline int -cap_info_is_sched(spdid_t c) +cap_info_is_sched_core(spdid_t c, cpuid_t core) { + if (core >= NUM_CPU) return 0; if (!c) return 1; /* llbooter! */ - return bitmap_check(cap_info_schedbmp[cos_cpuid()], c - 1); + return bitmap_check(cap_info_schedbmp[core], c - 1); +} + +static inline int +cap_info_is_sched(spdid_t c) +{ + return cap_info_is_sched_core(c, cos_cpuid()); } static inline int diff --git a/src/components/implementation/capmgr/naive/cap_mgr.c b/src/components/implementation/capmgr/naive/cap_mgr.c index 9216a3b35c..fcf938b936 100644 --- a/src/components/implementation/capmgr/naive/cap_mgr.c +++ b/src/components/implementation/capmgr/naive/cap_mgr.c @@ -13,7 +13,7 @@ #include thdcap_t -capmgr_thd_create_cserialized(thdid_t *tid, int *unused, thdclosure_index_t idx) +capmgr_thd_create_cserialized(struct cos_dcb_info **dcb, thdid_t *tid, thdclosure_index_t idx) { spdid_t cur = cos_inv_token(); struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); @@ -21,18 +21,24 @@ capmgr_thd_create_cserialized(thdid_t *tid, int *unused, thdclosure_index_t idx) struct cap_comp_info *r = cap_info_comp_find(cur); struct sl_thd *rt = NULL, *t = NULL; thdcap_t thdcap = 0; + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; + vaddr_t dcbaddr = 0; if (!r || !cap_info_init_check(r)) return 0; if (!cap_info_is_sched(cur)) return 0; if (idx <= 0) return 0; - t = sl_thd_aep_alloc_ext(cap_info_dci(r), NULL, idx, 0, 0, 0, 0, 0, NULL); - if (!t) return 0; + dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(r)), &dcboff, &dcbaddr); + if (!dcbcap || !dcbaddr || !dcboff) return 0; /* dcboff == 0 for initthd in that comp! 
*/ + t = sl_thd_aep_alloc_ext_dcb(cap_info_dci(r), NULL, idx, 0, 0, 0, dcbcap, dcboff, 0, 0, NULL); + if (!t) return 0; thdcap = cos_cap_cpy(cap_info_ci(r), cap_ci, CAP_THD, sl_thd_thdcap(t)); if (!thdcap) goto err; cap_info_thd_init(r, t, 0); *tid = sl_thd_thdid(t); + *dcb = (struct cos_dcb_info *)dcbaddr; return thdcap; err: @@ -42,7 +48,7 @@ capmgr_thd_create_cserialized(thdid_t *tid, int *unused, thdclosure_index_t idx) } thdcap_t -capmgr_thd_create_ext_cserialized(thdid_t *tid, int *unused, spdid_t s, thdclosure_index_t idx) +capmgr_thd_create_ext_cserialized(struct cos_dcb_info **dcb, thdid_t *tid, spdid_t s, thdclosure_index_t idx) { spdid_t cur = cos_inv_token(); struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); @@ -51,6 +57,9 @@ capmgr_thd_create_ext_cserialized(thdid_t *tid, int *unused, spdid_t s, thdclosu struct cap_comp_info *rs = cap_info_comp_find(s); struct sl_thd *t = NULL; thdcap_t thdcap = 0; + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; + vaddr_t dcbaddr = 0; if (!rc || !cap_info_init_check(rc)) return 0; if (!rs || !cap_info_init_check(rs)) return 0; @@ -58,7 +67,10 @@ capmgr_thd_create_ext_cserialized(thdid_t *tid, int *unused, spdid_t s, thdclosu if (cap_info_is_sched(s)) return 0; if (idx <= 0) return 0; - t = sl_thd_aep_alloc_ext(cap_info_dci(rs), NULL, idx, 0, 0, 0, 0, 0, NULL); + /* s is not a scheduler, dcbinfo will be in the scheduler component */ + dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(rc)), &dcboff, &dcbaddr); + if (!dcbcap || !dcbaddr || !dcboff) return 0; /* dcboff == 0 for initthd in that comp! 
*/ + t = sl_thd_aep_alloc_ext_dcb(cap_info_dci(rs), NULL, idx, 0, 0, 0, dcbcap, dcboff, 0, 0, NULL); if (!t) return 0; thdcap = cos_cap_cpy(cap_info_ci(rc), cap_ci, CAP_THD, sl_thd_thdcap(t)); if (!thdcap) goto err; @@ -66,6 +78,7 @@ capmgr_thd_create_ext_cserialized(thdid_t *tid, int *unused, spdid_t s, thdclosu cap_info_thd_init(rc, t, 0); cap_info_thd_init(rs, t, 0); *tid = sl_thd_thdid(t); + *dcb = (struct cos_dcb_info *)dcbaddr; /* child is not a scheduler, don't copy into child */ return thdcap; @@ -78,20 +91,27 @@ capmgr_thd_create_ext_cserialized(thdid_t *tid, int *unused, spdid_t s, thdclosu thdcap_t capmgr_initthd_create_cserialized(thdid_t *tid, int *unused, spdid_t s) { - spdid_t cur = cos_inv_token(); - struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); - struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); - struct cap_comp_info *rc = cap_info_comp_find(cur); - struct cap_comp_info *rs = cap_info_comp_find(s); - struct sl_thd *t = NULL; - thdcap_t thdcap = 0; + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = cap_info_comp_find(cur); + struct cap_comp_info *rs = cap_info_comp_find(s); + struct cap_comp_cpu_info *rs_cpu = cap_info_cpu_local(rs); + struct cos_compinfo *rs_ci = cap_info_ci(rs); + struct sl_thd *t = NULL; + thdcap_t thdcap = 0; + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; + vaddr_t dcbaddr = 0; if (!rc || !cap_info_init_check(rc)) return 0; if (!rs || !cap_info_init_check(rs)) return 0; if (!cap_info_is_sched(cur) || !cap_info_is_child(rc, s)) return 0; if (cap_info_is_sched(s)) return 0; - t = sl_thd_initaep_alloc(cap_info_dci(rs), NULL, 0, 0, 0, 0, 0); + dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(rs_cpu), &dcboff, &dcbaddr); + if (!dcbcap || !dcbaddr || dcboff) return 0; /* dcboff == 0 for initthd in that comp! 
*/ + t = sl_thd_initaep_alloc_dcb(cap_info_dci(rs), NULL, 0, 0, 0, dcbcap, 0, 0); if (!t) return 0; /* child is not a scheduler, don't copy into child */ /* parent only needs the thdcap */ @@ -113,22 +133,26 @@ capmgr_initthd_create_cserialized(thdid_t *tid, int *unused, spdid_t s) thdcap_t capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, u32_t spdid_owntc, u32_t key_ipimax, u32_t ipiwin32b) { - spdid_t cur = cos_inv_token(), s = spdid_owntc >> 16; - struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); - struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); - struct cap_comp_info *rc = cap_info_comp_find(cur); - struct cap_comp_info *rs = cap_info_comp_find(s); - struct sl_thd *t = NULL, *rinit = NULL; - thdcap_t thdcap = 0; - int owntc = (spdid_owntc << 16) >> 16; - cos_channelkey_t key = key_ipimax >> 16; - u32_t ipimax = (key_ipimax << 16) >> 16; - microsec_t ipiwin = (microsec_t)ipiwin32b; - int ret; - tcap_t tc; - arcvcap_t rcv; - asndcap_t snd; - thdid_t tid; + spdid_t cur = cos_inv_token(), s = spdid_owntc >> 16; + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = cap_info_comp_find(cur); + struct cap_comp_info *rs = cap_info_comp_find(s); + struct cap_comp_cpu_info *rs_cpu = cap_info_cpu_local(rs); + struct sl_thd *t = NULL, *rinit = NULL; + thdcap_t thdcap = 0; + int owntc = (spdid_owntc << 16) >> 16; + cos_channelkey_t key = key_ipimax >> 16; + u32_t ipimax = (key_ipimax << 16) >> 16; + microsec_t ipiwin = (microsec_t)ipiwin32b; + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; + vaddr_t dcbaddr = 0; + int ret; + tcap_t tc; + arcvcap_t rcv; + asndcap_t snd; + thdid_t tid; if (!rc || !cap_info_init_check(rc)) return 0; if (!rs || !cap_info_init_check(rs)) return 0; @@ -137,7 +161,9 @@ capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, u32_t spdid rinit = cap_info_initthd(rc); if (!rinit) return 0; - t = 
sl_thd_initaep_alloc(cap_info_dci(rs), rinit, 1, owntc, 0, 0, 0); + dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(rs_cpu), &dcboff, &dcbaddr); + if (!dcbcap || !dcbaddr || dcboff) return 0; /* dcboff == 0 for initthd in that comp! */ + t = sl_thd_initaep_alloc_dcb(cap_info_dci(rs), rinit, 1, owntc, 0, dcbcap, ipimax, ipiwin); if (!t) return 0; /* child is a scheduler.. copy initcaps */ ret = cos_cap_cpy_at(cap_info_ci(rs), BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, cap_ci, sl_thd_thdcap(t)); @@ -166,8 +192,8 @@ capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, u32_t spdid cap_comminfo_init(t, ipiwin, ipimax); cap_info_thd_init(rc, t, key); cap_info_initthd_init(rs, t, 0); - cap_info_cpu_local(rs)->p_initthdcap = thdcap = ret; - cap_info_cpu_local(rs)->initthdid = tid = sl_thd_thdid(t); + rs_cpu->p_initthdcap = thdcap = ret; + rs_cpu->initthdid = tid = sl_thd_thdid(t); *rcvtcret = (rcv << 16) | (tc); *sndtidret = (snd << 16) | (tid); @@ -178,8 +204,33 @@ capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, u32_t spdid return 0; } -thdcap_t -capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, u32_t owntc_spdid_thdidx, u32_t chkey_ipimax, u32_t ipiwin32b) +arcvcap_t +capmgr_aep_rcv_retrieve_cserialized(spdid_t s, thdid_t tid) +{ + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = cap_info_comp_find(cur); + struct cap_comp_info *rs = cap_info_comp_find(s); + struct sl_thd *ti = cap_info_thd_find(rs, tid); + arcvcap_t dstrcv = 0; + + if (!rc || !cap_info_init_check(rc)) return 0; + if (!rs || !cap_info_init_check(rs)) return 0; + if (!cap_info_is_sched(cur) || !cap_info_is_child(rc, s)) return 0; + if (!ti || !sl_thd_thdcap(ti)) return 0; + + /* + * for aep thread.. 
rcv cap should be accessible in the destination component, + * so we return that cap so the scheduler can init proper structure of the dest component. + */ + dstrcv = cos_cap_cpy(cap_info_ci(rs), cap_ci, CAP_ARCV, sl_thd_rcvcap(ti)); + + return dstrcv; +} + +u32_t +capmgr_aep_create_ext_cserialized(struct cos_dcb_info **dcb, u32_t *rcvtcret, u32_t owntc_spdid_thdidx, u32_t chkey_ipimax, u32_t ipiwin32b) { spdid_t cur = cos_inv_token(); spdid_t s = (owntc_spdid_thdidx << 1) >> 17; @@ -195,6 +246,9 @@ capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, u32_t ownt u32_t ipimax = (chkey_ipimax << 16) >> 16; microsec_t ipiwin = (microsec_t)ipiwin32b; arcvcap_t srcrcv, dstrcv; + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; + vaddr_t dcbaddr = 0; tcap_t tc; int ret; @@ -206,17 +260,15 @@ capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, u32_t ownt rinit = cap_info_initthd(rc); if (!rinit) return 0; - t = sl_thd_aep_alloc_ext(cap_info_dci(rs), rinit, tidx, 1, owntc, 0, 0, 0, &srcrcv); + /* if s is not a scheduler, dcbinfo will be in the scheduler component */ + //if (cap_info_is_sched(s)) dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(rs)), &dcboff, &dcbaddr); + /*else*/ dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(rc)), &dcboff, &dcbaddr); + if (!dcbcap || !dcbaddr || !dcboff) return 0; /* dcboff == 0 for initthd in that comp! */ + t = sl_thd_aep_alloc_ext_dcb(cap_info_dci(rs), rinit, tidx, 1, owntc, 0, dcbcap, dcboff, ipiwin, ipimax, &srcrcv); if (!t) return 0; /* cur is a scheduler, copy thdcap */ ret = cos_cap_cpy(cap_info_ci(rc), cap_ci, CAP_THD, sl_thd_thdcap(t)); if (!ret) goto err; - /* - * for aep thread.. rcv cap should be accessible in the destination component, - * so we return that cap so the scheduler can init proper structucap of the dest component. 
- */ - dstrcv = cos_cap_cpy(cap_info_ci(rs), cap_ci, CAP_ARCV, sl_thd_rcvcap(t)); - if (!dstrcv) goto err; if (owntc) { /* @@ -239,8 +291,8 @@ capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, u32_t ownt cap_comminfo_init(t, ipiwin, ipimax); cap_info_thd_init(rc, t, key); cap_info_thd_init(rs, t, 0); - *drcvtidret = (dstrcv << 16 | sl_thd_thdid(t)); - thdcap = ret; + thdcap = ret << 16 | sl_thd_thdid(t); + *dcb = (struct cos_dcb_info *)dcbaddr; return thdcap; err: @@ -249,8 +301,8 @@ capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, u32_t ownt return 0; } -thdcap_t -capmgr_aep_create_cserialized(thdid_t *tid, u32_t *tcrcvret, u32_t owntc_tidx, u32_t key_ipimax, u32_t ipiwin32b) +u32_t +capmgr_aep_create_cserialized(struct cos_dcb_info **dcb, u32_t *tcrcvret, u32_t owntc_tidx, u32_t key_ipimax, u32_t ipiwin32b) { spdid_t cur = cos_inv_token(); struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); @@ -263,6 +315,9 @@ capmgr_aep_create_cserialized(thdid_t *tid, u32_t *tcrcvret, u32_t owntc_tidx, u microsec_t ipiwin = (microsec_t)ipiwin32b; struct sl_thd *t = NULL, *rinit = NULL; thdcap_t thdcap = 0; + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; + vaddr_t dcbaddr = 0; arcvcap_t rcv; tcap_t tc; int ret; @@ -274,7 +329,9 @@ capmgr_aep_create_cserialized(thdid_t *tid, u32_t *tcrcvret, u32_t owntc_tidx, u rinit = cap_info_initthd(rc); if (!rinit) return 0; - t = sl_thd_aep_alloc_ext(cap_info_dci(rc), rinit, tidx, 1, owntc, 0, 0, 0, &rcv); + dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(rc)), &dcboff, &dcbaddr); + if (!dcbcap || !dcbaddr || !dcboff) return 0; /* dcboff == 0 for initthd in that comp! 
*/ + t = sl_thd_aep_alloc_ext_dcb(cap_info_dci(rc), rinit, tidx, 1, owntc, 0, dcbcap, dcboff, ipiwin, ipimax, &rcv); if (!t) return 0; /* current is a sched, so copy */ ret = cos_cap_cpy(cap_info_ci(rc), cap_ci, CAP_THD, sl_thd_thdcap(t)); @@ -294,8 +351,8 @@ capmgr_aep_create_cserialized(thdid_t *tid, u32_t *tcrcvret, u32_t owntc_tidx, u cap_comminfo_init(t, ipiwin, ipimax); cap_info_thd_init(rc, t, key); *tcrcvret = (tc << 16 | rcv); - *tid = sl_thd_thdid(t); - thdcap = ret; + thdcap = ret << 16 | sl_thd_thdid(t); + *dcb = (struct cos_dcb_info *)dcbaddr; return thdcap; err: @@ -304,6 +361,32 @@ capmgr_aep_create_cserialized(thdid_t *tid, u32_t *tcrcvret, u32_t owntc_tidx, u return 0; } +int +capmgr_thd_migrate(thdid_t tid, thdcap_t tc, cpuid_t core) +{ + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = cap_info_comp_find(cur); + struct sl_thd *ti = cap_info_thd_find(rc, tid); + struct cap_comp_cpu_info *rc_cpu = NULL; + int ret; + + if (!rc || !cap_info_init_check(rc)) return -EINVAL; + if (!cap_info_is_sched(cur) || !cap_info_is_sched_core(cur, core)) return -EINVAL; + if (!ti || !sl_thd_thdcap(ti)) return -EINVAL; + rc_cpu = cap_info_cpu_local(rc); + if (tid == rc_cpu->initthdid) return -EINVAL; + + ret = cos_thd_migrate(cap_ci, sl_thd_thdcap(ti), core); + if (ret) return ret; + ret = cos_thdcap_migrate(cap_info_ci(rc), tc); + if (ret) return ret; + ret = sl_thd_migrate(tid, core); + + return ret; +} + thdcap_t capmgr_thd_retrieve_cserialized(thdid_t *inittid, int *unused, spdid_t s, thdid_t tid) { @@ -483,3 +566,47 @@ capmgr_asnd_key_create(cos_channelkey_t key) return (asndcap_t)capret; } + +int +capmgr_hw_attach(hwid_t hwid, thdid_t tid) +{ + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = 
cap_info_comp_find(cur); + struct sl_thd *ti = cap_info_thd_find(rc, tid); + + if (!rc || !cap_info_init_check(rc)) return -EINVAL; + if (!ti || !sl_thd_rcvcap(ti)) return -EINVAL; + + return cos_hw_attach(BOOT_CAPTBL_SELF_INITHW_BASE, hwid, sl_thd_rcvcap(ti)); +} + +int +capmgr_hw_periodic_attach(hwid_t hwid, thdid_t tid, unsigned int period_us) +{ + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = cap_info_comp_find(cur); + struct sl_thd *ti = cap_info_thd_find(rc, tid); + + if (period_us == 0) return -EINVAL; + if (!rc || !cap_info_init_check(rc)) return -EINVAL; + if (!ti || !sl_thd_rcvcap(ti)) return -EINVAL; + + return cos_hw_periodic_attach(BOOT_CAPTBL_SELF_INITHW_BASE, hwid, sl_thd_rcvcap(ti), period_us); +} + +int +capmgr_hw_detach(hwid_t hwid) +{ + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = cap_info_comp_find(cur); + + if (!rc || !cap_info_init_check(rc)) return -EINVAL; + + return cos_hw_detach(BOOT_CAPTBL_SELF_INITHW_BASE, hwid); +} diff --git a/src/components/implementation/capmgr/naive/init.c b/src/components/implementation/capmgr/naive/init.c index e3fcd1e0d4..0512aab8f3 100644 --- a/src/components/implementation/capmgr/naive/init.c +++ b/src/components/implementation/capmgr/naive/init.c @@ -13,6 +13,7 @@ #include #include #include +#include "spinlib.h" static volatile int capmgr_init_core_done = 0; @@ -22,13 +23,13 @@ capmgr_comp_info_init(struct cap_comp_info *rci, spdid_t spdid) struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(defci); struct cap_comp_info *btinfo = cap_info_comp_find(0); - spdid_t sched_spdid = 0; struct cap_comp_info *rci_sched = NULL; struct cap_comp_cpu_info *rci_cpu = NULL; struct sl_thd *ithd = NULL; 
u64_t chbits = 0, chschbits = 0; int ret = 0, is_sched = 0; int remain_child = 0; + spdid_t sched_spdid = 0; spdid_t childid; comp_flag_t ch_flags; struct cos_aep_info aep; @@ -38,17 +39,21 @@ capmgr_comp_info_init(struct cap_comp_info *rci, spdid_t spdid) assert(cap_info_init_check(rci)); rci_cpu = cap_info_cpu_local(rci); + sched_spdid = hypercall_comp_sched_get(spdid); if (spdid == 0 || (spdid != cos_spd_id() && cap_info_is_child(btinfo, spdid))) { is_sched = (spdid == 0 || cap_info_is_sched_child(btinfo, spdid)) ? 1 : 0; - ret = hypercall_comp_initaep_get(spdid, is_sched, &aep); - assert(ret == 0); + if (!spdid || (spdid && sched_spdid != 0)) { + ret = hypercall_comp_initaep_get(spdid, is_sched, &aep, &sched_spdid); + assert(ret == 0); + } } rci_sched = cap_info_comp_find(sched_spdid); - assert(rci_sched && cap_info_init_check(rci_sched)); + assert(rci_sched); rci_cpu->parent = rci_sched; rci_cpu->thd_used = 1; + if (cos_cpuid() != INIT_CORE) cap_info_cpu_initdcb_init(rci); while ((remain_child = hypercall_comp_child_next(spdid, &childid, &ch_flags)) >= 0) { bitmap_set(rci_cpu->child_bitmap, childid - 1); @@ -66,14 +71,41 @@ capmgr_comp_info_init(struct cap_comp_info *rci, spdid_t spdid) cap_comminfo_init(ithd, 0, 0); cap_info_initthd_init(rci, ithd, 0); } else if (cos_spd_id() == spdid) { - cap_info_initthd_init(rci, sl__globals_cpu()->sched_thd, 0); + cap_info_initthd_init(rci, sl__globals_core()->sched_thd, 0); + } else if (!sched_spdid && spdid) { + struct sl_thd *booter_thd = cap_info_initthd(btinfo); + dcbcap_t dcap; + dcboff_t off = 0; + vaddr_t addr = 0; + struct cos_compinfo *rt_ci = cap_info_ci(rci); + + dcap = cos_dcb_info_alloc(&rci_cpu->dcb_data, &off, &addr); + if (dcap) assert(off == 0 && addr); + + /* root-scheduler, TODO: rate-limiting? */ + ithd = sl_thd_initaep_alloc_dcb(cap_info_dci(rci), booter_thd, is_sched, is_sched ? 
1 : 0, 0, dcap, 0, 0); + assert(ithd); + + ret = cos_cap_cpy_at(rt_ci, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, ci, sl_thd_thdcap(ithd)); + assert(ret == 0); + if (is_sched) { + ret = cos_cap_cpy_at(rt_ci, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, ci, sl_thd_tcap(ithd)); + assert(ret == 0); + ret = cos_cap_cpy_at(rt_ci, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, ci, sl_thd_rcvcap(ithd)); + assert(ret == 0); + } + + ret = hypercall_root_initaep_set(spdid, sl_thd_aepinfo(ithd)); + assert(ret == 0); + cap_info_initthd_init(rci, ithd, 0); + cap_comminfo_init(ithd, 0, 0); } return; } static void -capmgr_comp_info_iter_cpu(void) +capmgr_comp_info_iter_core(void) { int remaining = hypercall_numcomps_get(), i; int num_comps = 0; @@ -142,8 +174,9 @@ cos_init(void) spdid_t child; comp_flag_t ch_flags; int ret = 0, i; + unsigned int cycs_per_us = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - PRINTLOG(PRINT_DEBUG, "CPU cycles per sec: %u\n", cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE)); + PRINTLOG(PRINT_DEBUG, "CPU cycles per sec: %u\n", cycs_per_us); ret = hypercall_comp_frontier_get(cos_spd_id(), &heap_frontier, &cap_frontier); assert(ret == 0); @@ -153,14 +186,17 @@ cos_init(void) BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, heap_frontier, cap_frontier); cap_info_init(); + cos_dcb_info_init_curr(); sl_init(SL_MIN_PERIOD_US); + spinlib_calib(cycs_per_us); capmgr_comp_info_iter(); } else { while (!capmgr_init_core_done) ; /* WAIT FOR INIT CORE TO BE DONE */ cos_defcompinfo_sched_init(); + cos_dcb_info_init_curr(); sl_init(SL_MIN_PERIOD_US); - capmgr_comp_info_iter_cpu(); + capmgr_comp_info_iter_core(); } assert(hypercall_comp_child_next(cos_spd_id(), &child, &ch_flags) == -1); diff --git a/src/components/implementation/capmgr/naive/spinlib.c b/src/components/implementation/capmgr/naive/spinlib.c new file mode 100644 index 0000000000..782cdc3c6f --- /dev/null +++ b/src/components/implementation/capmgr/naive/spinlib.c @@ 
-0,0 +1,110 @@ +#include "spinlib.h" +#include + +#define SPINLIB_CALIB 256 + +static u64_t spinlib_cycs_per_spin_iters = 0; +static u64_t spinlib_usecs_per_spin_iters = 0; +unsigned int spinlib_cycs_per_us = 0; +static unsigned int spinlib_init = 0; + +void spinlib_calib(unsigned int cycs_per_us) __attribute__((optimize("O0"))); +void spinlib_usecs(cycles_t usecs) __attribute__((optimize("O0"))); +void spinlib_cycles(cycles_t cycs) __attribute__((optimize("O0"))); +void spinlib_std_iters(void) __attribute__((optimize("O0"))); + +#define SPINLIB_TEST_NITEMS 4 + +static void +spinlib_calib_test(void) +{ + microsec_t test_us[SPINLIB_TEST_NITEMS] = { 1000, 2000, 3000, 4000 }; + int i; + + for (i = 0; i < SPINLIB_TEST_NITEMS; i++) { + cycles_t st, end, elapsed_cycs; + + rdtscll(st); + spinlib_usecs(test_us[i]); + rdtscll(end); + elapsed_cycs = end - st; + + PRINTC("SPIN %lluus => elapsed :%llucycs %lluus\n", test_us[i], elapsed_cycs, sl_cyc2usec(elapsed_cycs)); + } +} + +void +spinlib_std_iters(void) +{ + unsigned int i; + + for (i = 0 ; i < SPINLIB_ITERS_SPIN ; i++) { + __asm__ __volatile__("nop": : :"memory"); + } +} + +/* time taken in that loop */ +void +spinlib_calib(unsigned int cycs_per_us) +{ + cycles_t total_cycs = 0; + unsigned int iters = 0; + + if (spinlib_init) return; + spinlib_cycs_per_us = cycs_per_us; + + while (iters < SPINLIB_CALIB) { + cycles_t start, end; + + rdtscll(start); + spinlib_std_iters(); + rdtscll(end); + + total_cycs += (end - start); + iters ++; + } + + spinlib_cycs_per_spin_iters = total_cycs / SPINLIB_CALIB; + spinlib_usecs_per_spin_iters = spinlib_cycs_per_spin_iters / spinlib_cycs_per_us; + + spinlib_init = 0; + printc("Spin calibration: ITERS:%u Cycs/ITERS:%llu usecs/ITERS:%llu\n", + SPINLIB_ITERS_SPIN, spinlib_cycs_per_spin_iters, spinlib_usecs_per_spin_iters); + spinlib_calib_test(); +} + +void +spinlib_cycles(cycles_t cycs) +{ + unsigned int i = 0; + unsigned int iters = cycs / spinlib_cycs_per_spin_iters; + unsigned int left = 
cycs % spinlib_cycs_per_spin_iters; + + assert(cycs >= spinlib_cycs_per_spin_iters); + + /* round off to next cycs/spin */ + if (left >= (spinlib_cycs_per_spin_iters / 2)) iters ++; + + while (i < iters) { + spinlib_std_iters(); + i ++; + } +} + +void +spinlib_usecs(cycles_t usecs) +{ + unsigned int i = 0; + unsigned int iters = usecs / spinlib_usecs_per_spin_iters; + unsigned int left = usecs % spinlib_usecs_per_spin_iters; + + assert(usecs >= spinlib_usecs_per_spin_iters); + + /* round off to next usec */ + if (left >= (spinlib_usecs_per_spin_iters / 2)) iters ++; + + while (i < iters) { + spinlib_std_iters(); + i ++; + } +} diff --git a/src/components/implementation/capmgr/naive/spinlib.h b/src/components/implementation/capmgr/naive/spinlib.h new file mode 100644 index 0000000000..6c477fc48c --- /dev/null +++ b/src/components/implementation/capmgr/naive/spinlib.h @@ -0,0 +1,20 @@ +#ifndef SPINLIB_H +#define SPINLIB_H + +#include +#include +#include + +/* + * this is probably the trickiest thing to configure and + * the accuracy of the workgen depends very much on this. 
+ */ +#define SPINLIB_ITERS_SPIN (51000) + +extern unsigned int spinlib_cycs_per_us; + +extern void spinlib_calib(unsigned int cycs_per_us); +extern void spinlib_usecs(cycles_t usecs); +extern void spinlib_cycles(cycles_t cycs); + +#endif /* SPINLIB_H */ diff --git a/src/components/implementation/capmgr/naive/work.c b/src/components/implementation/capmgr/naive/work.c new file mode 100644 index 0000000000..ffd63ca16a --- /dev/null +++ b/src/components/implementation/capmgr/naive/work.c @@ -0,0 +1,38 @@ +#include +#include +#include "spinlib.h" + +int +work_cycs_cserialized(unsigned long *hielapsed, unsigned long *loelapsed, unsigned long hi_cycs, unsigned long lo_cycs) +{ + cycles_t st, end, elapsed, cycs_input = (((cycles_t)hi_cycs << 32) | (cycles_t)lo_cycs); + + rdtscll(st); + spinlib_cycles(cycs_input); + rdtscll(end); + elapsed = end - st; + + *hielapsed = (elapsed >> 32); + *loelapsed = ((elapsed << 32) >> 32); + + return 0; +} + +int +work_usecs_cserialized(unsigned long *hielapsed, unsigned long *loelapsed, unsigned long hi_us, unsigned long lo_us) +{ + cycles_t st, end; + microsec_t elapsed, usecs_input = (((microsec_t)hi_us << 32) | (microsec_t)lo_us); + + rdtscll(st); + spinlib_usecs(usecs_input); + rdtscll(end); + /* perhaps use spinlib to return the elapsed or use sl.. 
*/ + elapsed = sl_cyc2usec(end - st); + + *hielapsed = (elapsed >> 32); + *loelapsed = ((elapsed << 32) >> 32); + + return 0; + +} diff --git a/src/components/implementation/no_interface/llbooter/boot_deps.h b/src/components/implementation/no_interface/llbooter/boot_deps.h index e1140221e9..8c2fab7cbe 100644 --- a/src/components/implementation/no_interface/llbooter/boot_deps.h +++ b/src/components/implementation/no_interface/llbooter/boot_deps.h @@ -29,14 +29,17 @@ struct comp_sched_info { /* The booter uses this to keep track of each comp */ struct comp_cap_info { - struct cos_defcompinfo def_cinfo; - struct usr_inv_cap ST_user_caps[INTERFACE_UNDEF_SYMBS]; - vaddr_t vaddr_user_caps; /* vaddr of user caps table in comp */ - vaddr_t addr_start; - vaddr_t vaddr_mapped_in_booter; - vaddr_t upcall_entry; - u32_t cpu_bitmap[NUM_CPU_BMP_WORDS]; - struct comp_sched_info *schedinfo[NUM_CPU]; + struct cos_defcompinfo def_cinfo; + struct usr_inv_cap ST_user_caps[INTERFACE_UNDEF_SYMBS]; + vaddr_t vaddr_user_caps; /* vaddr of user caps table in comp */ + vaddr_t addr_start; + vaddr_t vaddr_mapped_in_booter; + vaddr_t upcall_entry; + vaddr_t initdcbpgs; + u32_t cpu_bitmap[NUM_CPU_BMP_WORDS]; + struct comp_sched_info *schedinfo[NUM_CPU]; + struct cos_component_information *cobj_info; + scbcap_t scbcap; } new_comp_cap_info[MAX_NUM_SPDS]; int schedule[NUM_CPU][MAX_NUM_SPDS]; @@ -56,6 +59,14 @@ boot_spd_comp_schedinfo_curr_get(void) return &comp_schedinfo[cos_cpuid()][0]; } +static inline struct cos_component_information * +boot_spd_comp_cobj_info_get(spdid_t spdid) +{ + assert(spdid && spdid <= MAX_NUM_SPDS); + + return boot_spd_compcapinfo_get(spdid)->cobj_info; +} + static inline struct comp_sched_info * boot_spd_comp_schedinfo_get(spdid_t spdid) { @@ -147,8 +158,8 @@ boot_capmgr_mem_alloc(void) void boot_comp_mem_alloc(spdid_t spdid) { - struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); - struct cos_compinfo *boot_info = boot_spd_compinfo_curr_get(); + struct 
cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); + struct cos_compinfo *boot_info = boot_spd_compinfo_curr_get(); unsigned long mem_sz = capmgr_spdid ? CAPMGR_MIN_UNTYPED_SZ : LLBOOT_NEWCOMP_UNTYPED_SZ; if (capmgr_spdid) return; @@ -161,14 +172,14 @@ boot_compinfo_init(spdid_t spdid, captblcap_t *ct, pgtblcap_t *pt, u32_t heap_st { struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); struct cos_compinfo *boot_info = boot_spd_compinfo_curr_get(); + struct comp_cap_info *spdinfo = boot_spd_compcapinfo_get(spdid); - *ct = cos_captbl_alloc(boot_info); + *ct = cos_captbl_alloc(boot_info); assert(*ct); - *pt = cos_pgtbl_alloc(boot_info); + *pt = cos_pgtbl_alloc(boot_info); assert(*pt); cos_compinfo_init(compinfo, *pt, *ct, 0, (vaddr_t)heap_start_vaddr, BOOT_CAPTBL_FREE, boot_info); - /* * if this is a capmgr, let it manage its share (ideally rest of system memory) of memory. * if there is no capmgr in the system, allow every component to manage its memory. @@ -190,8 +201,8 @@ boot_newcomp_sinv_alloc(spdid_t spdid) int i = 0; int intr_spdid; void *user_cap_vaddr; - struct cos_compinfo *interface_compinfo; - struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); + struct cos_compinfo *interface_compinfo; + struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); struct comp_cap_info *spdinfo = boot_spd_compcapinfo_get(spdid); /* TODO: Purge rest of booter of spdid convention */ invtoken_t token = (invtoken_t)spdid; @@ -241,8 +252,14 @@ boot_newcomp_defcinfo_init(spdid_t spdid) struct cos_compinfo *child_ci = boot_spd_compinfo_get(spdid); struct cos_compinfo *boot_info = boot_spd_compinfo_curr_get(); struct comp_sched_info *spdsi = boot_spd_comp_schedinfo_get(spdid); + struct comp_cap_info *spdinfo = boot_spd_compcapinfo_get(spdid); + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; - child_aep->thd = cos_initthd_alloc(boot_info, child_ci->comp_cap); + dcbcap = cos_dcb_alloc(boot_info, child_ci->pgtbl_cap, spdinfo->initdcbpgs + cos_cpuid() * 
PAGE_SIZE); + assert(dcbcap); + + child_aep->thd = cos_initthd_alloc(boot_info, child_ci->comp_cap, dcbcap); assert(child_aep->thd); if (spdsi->flags & COMP_FLAG_SCHED) { @@ -266,11 +283,8 @@ boot_comp_sched_set(spdid_t spdid) struct cos_aep_info *child_aep = boot_spd_initaep_get(spdid); int i = 0; - /* capmgr init only on boot core! */ if (!capmgr_spdid) goto set; - /* - * if there is capmgr in the system, set it to be the first (index == 0) to initialize - */ + /* if there is capmgr in the system, set it to be the first (index == 0) to initialize */ if (spdid == capmgr_spdid) goto done; i = 1; @@ -291,8 +305,8 @@ boot_sched_caps_init(spdid_t spdid) struct cos_aep_info *child_aep = boot_spd_initaep_get(spdid); int ret, i; - /* If booter should create the init caps in that component */ - if (compsi->parent_spdid) return; + /* booter uses capmgr to create initthds in root-schedulers */ + if (compsi->parent_spdid || (capmgr_spdid && spdid != capmgr_spdid)) return; boot_newcomp_defcinfo_init(spdid); ret = cos_cap_cpy_at(ci, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, boot_info, child_aep->thd); @@ -360,6 +374,8 @@ boot_newcomp_create(spdid_t spdid, struct cos_compinfo *comp_info) struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); struct cos_compinfo *boot_info = boot_spd_compinfo_curr_get(); struct comp_cap_info *spdinfo = boot_spd_compcapinfo_get(spdid); + struct cos_component_information *cobj_info = boot_spd_comp_cobj_info_get(spdid); + struct comp_sched_info *spdsi = boot_spd_comp_schedinfo_get(spdid); captblcap_t ct = compinfo->captbl_cap; pgtblcap_t pt = compinfo->pgtbl_cap; compcap_t cc; @@ -368,8 +384,31 @@ boot_newcomp_create(spdid_t spdid, struct cos_compinfo *comp_info) int i = 0; invtoken_t token = (invtoken_t)spdid; int ret; + vaddr_t scb_uaddr = 0; + scbcap_t scbcap = 0; + + if (spdsi->flags & COMP_FLAG_SCHED) { + scbcap = cos_scb_alloc(boot_info); + assert(scbcap); + spdinfo->scbcap = scbcap; + scb_uaddr = cos_page_bump_intern_valloc(compinfo, 
COS_SCB_SIZE); + assert(scb_uaddr); + } else if (spdsi->parent_spdid) { + struct comp_cap_info *psi = boot_spd_compcapinfo_get(spdsi->parent_spdid); + scbcap = psi->scbcap; + } - cc = cos_comp_alloc(boot_info, ct, pt, (vaddr_t)spdinfo->upcall_entry); + if (spdinfo->initdcbpgs == 0) { + vaddr_t dcbaddr = 0; + + dcbaddr = cos_page_bump_intern_valloc(compinfo, NUM_CPU * PAGE_SIZE); + assert(dcbaddr); + + spdinfo->initdcbpgs = dcbaddr; + } + + /* scb info created on compinfo_init */ + cc = cos_comp_alloc(boot_info, ct, pt, scbcap, (vaddr_t)spdinfo->upcall_entry, scb_uaddr); assert(cc); compinfo->comp_cap = cc; @@ -394,7 +433,9 @@ boot_bootcomp_init(void) if (first_time) { first_time = 0; cos_meminfo_init(&(boot_info->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_defcompinfo_init(); + cos_defcompinfo_init_ext(BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, + BOOT_CAPTBL_SELF_COMP, (vaddr_t)cos_get_heap_ptr(), LLBOOT_CAPTBL_FREE); } else { cos_defcompinfo_sched_init(); } @@ -402,6 +443,21 @@ boot_bootcomp_init(void) bootsi->flags |= COMP_FLAG_SCHED; } +static void +boot_root_sched_transfer(void) +{ + struct cos_aep_info *root_aep = NULL; + int ret; + + if (!root_spdid[cos_cpuid()]) return; + + root_aep = boot_spd_initaep_get(root_spdid[cos_cpuid()]); + + PRINTLOG(PRINT_DEBUG, "Root scheduler is %u, transferring INF budget now!\n", root_spdid[cos_cpuid()]); + ret = cos_tcap_transfer(root_aep->rcv, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, LLBOOT_ROOTSCHED_PRIO); + assert(ret == 0); +} + static void boot_done(void) { @@ -415,7 +471,6 @@ void boot_root_sched_run(void) { struct cos_aep_info *root_aep = NULL; - int ret; if (!root_spdid[cos_cpuid()]) { PRINTLOG(PRINT_WARN, "No root scheduler!\n"); @@ -426,10 +481,7 @@ boot_root_sched_run(void) root_aep = boot_spd_initaep_get(root_spdid[cos_cpuid()]); PRINTLOG(PRINT_DEBUG, "Root 
scheduler is %u, switching to it now!\n", root_spdid[cos_cpuid()]); - ret = cos_tcap_transfer(root_aep->rcv, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, LLBOOT_ROOTSCHED_PRIO); - assert(ret == 0); - - ret = cos_switch(root_aep->thd, root_aep->tc, LLBOOT_ROOTSCHED_PRIO, TCAP_TIME_NIL, 0, cos_sched_sync()); + cos_switch(root_aep->thd, root_aep->tc, LLBOOT_ROOTSCHED_PRIO, TCAP_TIME_NIL, 0, cos_sched_sync()); PRINTLOG(PRINT_ERROR, "Root scheduler returned.\n"); assert(0); } @@ -512,7 +564,18 @@ boot_comp_cap_cpy_at(spdid_t dstid, capid_t dstslot, spdid_t srcid, cap_t captyp } static inline int -boot_comp_initaep_get(spdid_t dstid, spdid_t srcid, thdcap_t thdslot, arcvcap_t rcvslot, tcap_t tcslot) +boot_comp_sched_get(spdid_t dstid, spdid_t srcid) +{ + struct comp_sched_info *si = NULL; + + if (srcid > num_cobj || dstid > num_cobj) return -EINVAL; + si = boot_spd_comp_schedinfo_get(srcid); + + return si->parent_spdid; +} + +static inline int +boot_comp_initaep_get(spdid_t dstid, spdid_t srcid, thdcap_t thdslot, arcvcap_t rcvslot, tcap_t tcslot, spdid_t *parent) { struct comp_sched_info *si = NULL; int ret = -1; @@ -531,10 +594,48 @@ boot_comp_initaep_get(spdid_t dstid, spdid_t srcid, thdcap_t thdslot, arcvcap_t if (ret) goto done; ret = boot_comp_cap_cpy_at(dstid, tcslot, srcid, CAP_TCAP); + *parent = si->parent_spdid; + done: return ret; } +static inline int +boot_root_initaep_set(spdid_t dstid, spdid_t srcid, thdcap_t thd, arcvcap_t rcv, tcap_t tc) +{ + struct comp_sched_info *si = NULL; + struct cos_aep_info *a = NULL; + struct cos_compinfo *b = cos_compinfo_get(cos_defcompinfo_curr_get()), *c = boot_spd_compinfo_get(dstid); + + if (srcid > num_cobj || dstid > num_cobj) return -EINVAL; + if (!thd) return -EINVAL; + + si = boot_spd_comp_schedinfo_get(srcid); + if (si->parent_spdid != 0) return -EINVAL; + + a = boot_spd_initaep_get(srcid); + if (!a) return -EINVAL; + + a->thd = cos_cap_cpy(b, c, CAP_THD, thd); + assert(a->thd); + if ((si->flags & COMP_FLAG_SCHED) 
== 0) { + assert(!tc && !rcv); + goto done; + } + if (!rcv || !tc) return -EINVAL; + + a->tc = cos_cap_cpy(b, c, CAP_TCAP, tc); + assert(a->tc); + a->rcv = cos_cap_cpy(b, c, CAP_ARCV, rcv); + assert(a->rcv); + if (root_spdid[cos_cpuid()] == srcid) boot_root_sched_transfer(); + +done: + boot_comp_sched_set(srcid); + + return 0; +} + static inline int boot_comp_info_get(spdid_t dstid, spdid_t srcid, pgtblcap_t ptslot, captblcap_t ctslot, compcap_t compslot, spdid_t *parentid) { @@ -706,9 +807,24 @@ hypercall_entry(word_t *ret2, word_t *ret3, int op, word_t arg3, word_t arg4, wo thdcap_t thdslot = (arg3 << 16) >> 16; tcap_t tcslot = (arg4 << 16) >> 16;; arcvcap_t rcvslot = arg4 >> 16; + spdid_t parent_spdid = 0; + + if (!__hypercall_resource_access_check(client, srcid, 0)) return -EACCES; + ret1 = boot_comp_initaep_get(client, srcid, thdslot, rcvslot, tcslot, &parent_spdid); + + *ret2 = (word_t)parent_spdid; + + break; + } + case HYPERCALL_ROOT_INITAEP_SET: + { + spdid_t srcid = arg3 >> 16; + thdcap_t thd = (arg3 << 16) >> 16; + tcap_t tc = (arg4 << 16) >> 16; + arcvcap_t rcv = arg4 >> 16; if (!__hypercall_resource_access_check(client, srcid, 0)) return -EACCES; - ret1 = boot_comp_initaep_get(client, srcid, thdslot, rcvslot, tcslot); + ret1 = boot_root_initaep_set(client, srcid, thd, rcv, tc); break; } @@ -756,26 +872,21 @@ hypercall_entry(word_t *ret2, word_t *ret3, int op, word_t arg3, word_t arg4, wo break; } - case HYPERCALL_COMP_CAPFRONTIER_GET: + case HYPERCALL_COMP_CPUBITMAP_GET: { - vaddr_t vasfr; - capid_t capfr; spdid_t srcid = arg3; if (!__hypercall_resource_access_check(client, srcid, 1)) return -EACCES; - ret1 = boot_comp_frontier_get(client, srcid, &vasfr, &capfr); - if (ret1) goto done; - - *ret2 = vasfr; + ret1 = boot_comp_cpubitmap_get(srcid, (u32_t *)ret2, (u32_t *)ret3); break; } - case HYPERCALL_COMP_CPUBITMAP_GET: + case HYPERCALL_COMP_SCHED_GET: { spdid_t srcid = arg3; if (!__hypercall_resource_access_check(client, srcid, 1)) return -EACCES; - 
ret1 = boot_comp_cpubitmap_get(srcid, (u32_t *)ret2, (u32_t *)ret3); + ret1 = boot_comp_sched_get(client, srcid); break; } diff --git a/src/components/implementation/no_interface/llbooter/llbooter.c b/src/components/implementation/no_interface/llbooter/llbooter.c index e60d732370..0d0acebe6b 100644 --- a/src/components/implementation/no_interface/llbooter/llbooter.c +++ b/src/components/implementation/no_interface/llbooter/llbooter.c @@ -222,15 +222,17 @@ boot_comp_map_populate(struct cobj_header *h, spdid_t spdid, vaddr_t comp_info) } if (sect->flags & COBJ_SECT_CINFO) { + int k; + assert((left % PAGE_SIZE) == 0); assert(comp_info == (dest_daddr + (((left/PAGE_SIZE)-1)*PAGE_SIZE))); boot_process_cinfo(h, spdid, boot_spd_end(h), start_addr + (comp_info - init_daddr), comp_info); ci = (struct cos_component_information *)(start_addr + (comp_info - init_daddr)); + spdinfo->cobj_info = ci; hinfo = boot_spd_compcapinfo_get(h->id); hinfo->upcall_entry = ci->cos_upcall_entry; } - } return 0; @@ -466,7 +468,7 @@ cos_init(void) if (cos_cpuid() == INIT_CORE) { capmgr_spdid = 0; - memset(root_spdid, 0, sizeof(int) * NUM_CPU); + memset(root_spdid, 0, sizeof(spdid_t) * NUM_CPU); memset(new_comp_cap_info, 0, sizeof(struct comp_cap_info) * (MAX_NUM_SPDS)); h = (struct cobj_header *)cos_comp_info.cos_poly[0]; diff --git a/src/components/implementation/no_interface/omp_dijkstra/Makefile b/src/components/implementation/no_interface/omp_dijkstra/Makefile new file mode 100644 index 0000000000..a702328c38 --- /dev/null +++ b/src/components/implementation/no_interface/omp_dijkstra/Makefile @@ -0,0 +1,10 @@ +COMPONENT=omp_dijkstra.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp diff --git 
a/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c b/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c new file mode 100644 index 0000000000..4eb5375c3c --- /dev/null +++ b/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c @@ -0,0 +1,564 @@ +#include +#include +#include +#include + +# define NV 6 + +//int main ( int argc, char **argv ); +int *dijkstra_distance ( int ohd[NV][NV] ); +void find_nearest ( int s, int e, int mind[NV], int connected[NV], int *d, + int *v ); +void init ( int ohd[NV][NV] ); +void timestamp ( void ); +void update_mind ( int s, int e, int mv, int connected[NV], int ohd[NV][NV], + int mind[NV] ); + +/******************************************************************************/ + +int main ( void )//int argc, char **argv ) + +/******************************************************************************/ +/* + Purpose: + + MAIN runs an example of Dijkstra's minimum distance algorithm. + + Discussion: + + Given the distance matrix that defines a graph, we seek a list + of the minimum distances between node 0 and all other nodes. + + This program sets up a small example problem and solves it. + + The correct minimum distances are: + + 0 35 15 45 49 41 + + Licensing: + + This code is distributed under the GNU LGPL license. + + Modified: + + 01 July 2010 + + Author: + + Original C version by Norm Matloff, CS Dept, UC Davis. + This C version by John Burkardt. 
+*/ +{ + int i; + int i4_huge = 2147483647; + int j; + int *mind; + int ohd[NV][NV]; + + timestamp ( ); + PRINTC ( "\n" ); + PRINTC ( "DIJKSTRA_OPENMP\n" ); + PRINTC ( " C version\n" ); + PRINTC ( " Use Dijkstra's algorithm to determine the minimum\n" ); + PRINTC ( " distance from node 0 to each node in a graph,\n" ); + PRINTC ( " given the distances between each pair of nodes.\n" ); + PRINTC ( "\n" ); + PRINTC ( " Although a very small example is considered, we\n" ); + PRINTC ( " demonstrate the use of OpenMP directives for\n" ); + PRINTC ( " parallel execution.\n" ); +/* + Initialize the problem data. +*/ + init ( ohd ); +/* + Print the distance matrix. +*/ + PRINTC ( "\n" ); + PRINTC ( " Distance matrix:\n" ); + PRINTC ( "\n" ); + for ( i = 0; i < NV; i++ ) + { + for ( j = 0; j < NV; j++ ) + { + if ( ohd[i][j] == i4_huge ) + { + PRINTC ( " Inf" ); + } + else + { + PRINTC ( " %3d", ohd[i][j] ); + } + } + PRINTC ( "\n" ); + } +/* + Carry out the algorithm. +*/ + mind = dijkstra_distance ( ohd ); +/* + Print the results. +*/ + PRINTC ( "\n" ); + PRINTC ( " Minimum distances from node 0:\n"); + PRINTC ( "\n" ); + for ( i = 0; i < NV; i++ ) + { + PRINTC ( " %2d %2d\n", i, mind[i] ); + } +/* + Free memory. +*/ + free ( mind ); +/* + Terminate. +*/ + PRINTC ( "\n" ); + PRINTC ( "DIJKSTRA_OPENMP\n" ); + PRINTC ( " Normal end of execution.\n" ); + + PRINTC ( "\n" ); + timestamp ( ); + + return 0; +} +/******************************************************************************/ + +int *dijkstra_distance ( int ohd[NV][NV] ) + +/******************************************************************************/ +/* + Purpose: + + DIJKSTRA_DISTANCE uses Dijkstra's minimum distance algorithm. + + Discussion: + + We essentially build a tree. We start with only node 0 connected + to the tree, and this is indicated by setting CONNECTED[0] = 1. + + We initialize MIND[I] to the one step distance from node 0 to node I. 
+ + Now we search among the unconnected nodes for the node MV whose minimum + distance is smallest, and connect it to the tree. For each remaining + unconnected node I, we check to see whether the distance from 0 to MV + to I is less than that recorded in MIND[I], and if so, we can reduce + the distance. + + After NV-1 steps, we have connected all the nodes to 0, and computed + the correct minimum distances. + + Licensing: + + This code is distributed under the GNU LGPL license. + + Modified: + + 02 July 2010 + + Author: + + Original C version by Norm Matloff, CS Dept, UC Davis. + This C version by John Burkardt. + + Parameters: + + Input, int OHD[NV][NV], the distance of the direct link between + nodes I and J. + + Output, int DIJKSTRA_DISTANCE[NV], the minimum distance from + node 0 to each node. +*/ +{ + int *connected; + int i; + int i4_huge = 2147483647; + int md; + int *mind; + int mv; + int my_first; + int my_id; + int my_last; + int my_md; + int my_mv; + int my_step; + int nth; +/* + Start out with only node 0 connected to the tree. +*/ + connected = ( int * ) malloc ( NV * sizeof ( int ) ); + + connected[0] = 1; + for ( i = 1; i < NV; i++ ) + { + connected[i] = 0; + } +/* + Initial estimate of minimum distance is the 1-step distance. +*/ + mind = ( int * ) malloc ( NV * sizeof ( int ) ); + + for ( i = 0; i < NV; i++ ) + { + mind[i] = ohd[0][i]; + } +/* + Begin the parallel region. +*/ + # pragma omp parallel private ( my_first, my_id, my_last, my_md, my_mv, my_step ) \ + shared ( connected, md, mind, mv, nth, ohd ) + { + my_id = omp_get_thread_num ( ); + nth = omp_get_num_threads ( ); + my_first = ( my_id * NV ) / nth; + my_last = ( ( my_id + 1 ) * NV ) / nth - 1; +/* + The SINGLE directive means that the block is to be executed by only + one thread, and that thread will be whichever one gets here first. 
+*/ + # pragma omp single + { + PRINTC ( "\n" ); + PRINTC ( " P%d: Parallel region begins with %d threads\n", my_id, nth ); + PRINTC ( "\n" ); + } + PRINTC ( " P%d: First=%d Last=%d\n", my_id, my_first, my_last ); + + for ( my_step = 1; my_step < NV; my_step++ ) + { +/* + Before we compare the results of each thread, set the shared variable + MD to a big value. Only one thread needs to do this. +*/ + # pragma omp single + { + md = i4_huge; + mv = -1; + } +/* + Each thread finds the nearest unconnected node in its part of the graph. + Some threads might have no unconnected nodes left. +*/ + find_nearest ( my_first, my_last, mind, connected, &my_md, &my_mv ); +/* + In order to determine the minimum of all the MY_MD's, we must insist + that only one thread at a time execute this block! +*/ + # pragma omp critical + { + if ( my_md < md ) + { + md = my_md; + mv = my_mv; + } + } +/* + This barrier means that ALL threads have executed the critical + block, and therefore MD and MV have the correct value. Only then + can we proceed. +*/ + # pragma omp barrier +/* + If MV is -1, then NO thread found an unconnected node, so we're done early. + OpenMP does not like to BREAK out of a parallel region, so we'll just have + to let the iteration run to the end, while we avoid doing any more updates. + + Otherwise, we connect the nearest node. +*/ + # pragma omp single + { + if ( mv != - 1 ) + { + connected[mv] = 1; + PRINTC ( " P%d: Connecting node %d.\n", my_id, mv ); + } + } +/* + Again, we don't want any thread to proceed until the value of + CONNECTED is updated. +*/ + # pragma omp barrier +/* + Now each thread should update its portion of the MIND vector, + by checking to see whether the trip from 0 to MV plus the step + from MV to a node is closer than the current record. 
+*/ + if ( mv != -1 ) + { + update_mind ( my_first, my_last, mv, connected, ohd, mind ); + } +/* + Before starting the next step of the iteration, we need all threads + to complete the updating, so we set a BARRIER here. +*/ + #pragma omp barrier + } +/* + Once all the nodes have been connected, we can exit. +*/ + # pragma omp single + { + PRINTC ( "\n" ); + PRINTC ( " P%d: Exiting parallel region.\n", my_id ); + } + } + + free ( connected ); + + return mind; +} +/******************************************************************************/ + +void find_nearest ( int s, int e, int mind[NV], int connected[NV], int *d, + int *v ) + +/******************************************************************************/ +/* + Purpose: + + FIND_NEAREST finds the nearest unconnected node. + + Licensing: + + This code is distributed under the GNU LGPL license. + + Modified: + + 02 July 2010 + + Author: + + Original C version by Norm Matloff, CS Dept, UC Davis. + This C version by John Burkardt. + + Parameters: + + Input, int S, E, the first and last nodes that are to be checked. + + Input, int MIND[NV], the currently computed minimum distance from + node 0 to each node. + + Input, int CONNECTED[NV], is 1 for each connected node, whose + minimum distance to node 0 has been determined. + + Output, int *D, the distance from node 0 to the nearest unconnected + node in the range S to E. + + Output, int *V, the index of the nearest unconnected node in the range + S to E. +*/ +{ + int i; + int i4_huge = 2147483647; + + *d = i4_huge; + *v = -1; + + for ( i = s; i <= e; i++ ) + { + if ( !connected[i] && ( mind[i] < *d ) ) + { + *d = mind[i]; + *v = i; + } + } + return; +} +/******************************************************************************/ + +void init ( int ohd[NV][NV] ) + +/******************************************************************************/ +/* + Purpose: + + INIT initializes the problem data. 
+ + Discussion: + + The graph uses 6 nodes, and has the following diagram and + distance matrix: + + N0--15--N2-100--N3 0 40 15 Inf Inf Inf + \ | / 40 0 20 10 25 6 + \ | / 15 20 0 100 Inf Inf + 40 20 10 Inf 10 100 0 Inf Inf + \ | / Inf 25 Inf Inf 0 8 + \ | / Inf 6 Inf Inf 8 0 + N1 + / \ + / \ + 6 25 + / \ + / \ + N5----8-----N4 + + Licensing: + + This code is distributed under the GNU LGPL license. + + Modified: + + 02 July 2010 + + Author: + + Original C version by Norm Matloff, CS Dept, UC Davis. + This C version by John Burkardt. + + Parameters: + + Output, int OHD[NV][NV], the distance of the direct link between + nodes I and J. +*/ +{ + int i; + int i4_huge = 2147483647; + int j; + + for ( i = 0; i < NV; i++ ) + { + for ( j = 0; j < NV; j++ ) + { + if ( i == j ) + { + ohd[i][i] = 0; + } + else + { + ohd[i][j] = i4_huge; + } + } + } + ohd[0][1] = ohd[1][0] = 40; + ohd[0][2] = ohd[2][0] = 15; + ohd[1][2] = ohd[2][1] = 20; + ohd[1][3] = ohd[3][1] = 10; + ohd[1][4] = ohd[4][1] = 25; + ohd[2][3] = ohd[3][2] = 100; + ohd[1][5] = ohd[5][1] = 6; + ohd[4][5] = ohd[5][4] = 8; + + return; +} +/******************************************************************************/ + +void timestamp ( void ) + +/******************************************************************************/ +/* + Purpose: + + TIMESTAMP prints the current YMDHMS date as a time stamp. + + Example: + + 31 May 2001 09:45:54 AM + + Licensing: + + This code is distributed under the GNU LGPL license. 
+ + Modified: + + 24 September 2003 + + Author: + + John Burkardt + + Parameters: + + None +*/ +{ +#if 0 +# define TIME_SIZE 40 + + static char time_buffer[TIME_SIZE]; + const struct tm *tm; + time_t now; + + now = time ( NULL ); + tm = localtime ( &now ); + + strftime ( time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm ); + + PRINTC ( "%s\n", time_buffer ); + + return; +# undef TIME_SIZE +#else + cycles_t now; + + rdtscll(now); + PRINTC("%llu\n", now); +#endif +} +/******************************************************************************/ + +void update_mind ( int s, int e, int mv, int connected[NV], int ohd[NV][NV], + int mind[NV] ) + +/******************************************************************************/ +/* + Purpose: + + UPDATE_MIND updates the minimum distance vector. + + Discussion: + + We've just determined the minimum distance to node MV. + + For each unconnected node I in the range S to E, + check whether the route from node 0 to MV to I is shorter + than the currently known minimum distance. + + Licensing: + + This code is distributed under the GNU LGPL license. + + Modified: + + 02 July 2010 + + Author: + + Original C version by Norm Matloff, CS Dept, UC Davis. + This C version by John Burkardt. + + Parameters: + + Input, int S, E, the first and last nodes that are to be checked. + + Input, int MV, the node whose minimum distance to node 0 + has just been determined. + + Input, int CONNECTED[NV], is 1 for each connected node, whose + minimum distance to node 0 has been determined. + + Input, int OHD[NV][NV], the distance of the direct link between + nodes I and J. + + Input/output, int MIND[NV], the currently computed minimum distances + from node 0 to each node. On output, the values for nodes S through + E have been updated. 
+*/ +{ + int i; + int i4_huge = 2147483647; + + for ( i = s; i <= e; i++ ) + { + if ( !connected[i] ) + { + if ( ohd[mv][i] < i4_huge ) + { + if ( mind[mv] + ohd[mv][i] < mind[i] ) + { + mind[i] = mind[mv] + ohd[mv][i]; + } + } + } + } + return; +} diff --git a/src/components/implementation/no_interface/omp_dijkstra/init.c b/src/components/implementation/no_interface/omp_dijkstra/init.c new file mode 120000 index 0000000000..b2694bf833 --- /dev/null +++ b/src/components/implementation/no_interface/omp_dijkstra/init.c @@ -0,0 +1 @@ +../omp_hello/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c b/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c new file mode 100644 index 0000000000..41c8507068 --- /dev/null +++ b/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c @@ -0,0 +1,126 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * hack for more memory using the most insecure feature in composite: + * map random physical addresses to virtual addresses and do whatever with it! + */ +#define START_PHY round_up_to_page(0x00100000 + COS_PHYMEM_MAX_SZ + PAGE_SIZE) +#define PHY_MAX ((512 * 1024 * 1024) + (256 * 1024 * 1024)) + +static unsigned free_phy_offset = 0; + +void * +__alloc_memory(size_t sz) +{ + void *va = NULL; + struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); + //unsigned off = ps_faa(&free_phy_offset, sz); + unsigned off; + +try_again: + off = ps_load(&free_phy_offset); + + /* + * first use physical memory hack and + * if we run out, then use heap alloc so + * we don't run out of standard memory first + */ + if (off > PHY_MAX || off + sz > PHY_MAX) { + va = cos_page_bump_allocn(ci, round_up_to_page(sz)); + } else { + if (!ps_cas(&free_phy_offset, off, off + sz)) goto try_again; + /* use physical memory hack! 
*/ + va = cos_hw_map(ci, BOOT_CAPTBL_SELF_INITHW_BASE, START_PHY + off, sz); + } + + assert(va); + memset(va, 0, sz); + + return va; +} + +//#include + +// HACK: The hack to end all hacks +void * +cos_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) +{ + void *ret=0; + + if (addr != NULL) { + printc("parameter void *addr is not supported!\n"); + errno = ENOTSUP; + return MAP_FAILED; + } + if (fd != -1) { + printc("file mapping is not supported!\n"); + errno = ENOTSUP; + return MAP_FAILED; + } + + //addr = (void *)memmgr_heap_page_allocn(pages); + addr = __alloc_memory(length); +// addr = (void *)cos_page_bump_allocn(cos_compinfo_get(cos_defcompinfo_curr_get()), round_up_to_page(length)); + if (!addr){ + ret = (void *) -1; + } else { + ret = addr; + } + + if (ret == (void *)-1) { /* return value comes from man page */ + printc("mmap() failed!\n"); + /* This is a best guess about what went wrong */ + errno = ENOMEM; + } + return ret; +} + +long +cos_syscall_handler(int syscall_num, long a, long b, long c, long d, long e, long f, long g) +{ + if (syscall_num == __NR_clock_gettime) { + microsec_t microseconds = ps_tsc() / cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + time_t seconds = microseconds / 1000000; + long rest = microseconds % 1000000; + + *((struct timespec *)b) = (struct timespec) {seconds, rest}; + return 0; + } + + if (syscall_num == __NR_mmap || syscall_num == __NR_mmap2) { + return (long)cos_mmap((void *)a, (size_t)b, (int)c, (int)d, (int)e, (off_t)f); + } + + if (syscall_num == __NR_brk || syscall_num == __NR_munmap) { + return 0; + } + + printc("Unimplemented syscall number %d\n", syscall_num); + assert(0); + return 0; +} + +// Hack around thread local data +static int cancelstate = 0; + +int +pthread_setcancelstate(int new, int *old) +{ + if (new > 2) return EINVAL; + + if (old) *old = cancelstate; + cancelstate = new; + return 0; +} diff --git a/src/components/implementation/no_interface/omp_fft_bots/Makefile 
b/src/components/implementation/no_interface/omp_fft_bots/Makefile new file mode 100644 index 0000000000..c5d7dddf99 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/Makefile @@ -0,0 +1,19 @@ +COMPONENT=omp_fft_bots.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp +# if tied tasks are required +CFLAGS += -DFORCE_TIED_TASKS + +OMPC_FINAL_FLAGS= + +# one per compilation or none +#CFLAGS += -DMANUAL_CUTOFF +#CFLAGS += -DIF_CUTOFF +#CFLAGS += -DFINAL_CUTOFF $(OMPC_FINAL_FLAGS) diff --git a/src/components/implementation/no_interface/omp_fft_bots/app-desc.h b/src/components/implementation/no_interface/omp_fft_bots/app-desc.h new file mode 100644 index 0000000000..d31b29104e --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/app-desc.h @@ -0,0 +1,56 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "omp-tasks-app.h" +#include "fft.h" + +#define BOTS_APP_NAME "FFT" +#define BOTS_APP_PARAMETERS_DESC "Size=%d" +#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size + +#define BOTS_APP_USES_ARG_SIZE +#define BOTS_APP_DEF_ARG_SIZE 32*1024*1024 +#define BOTS_APP_DESC_ARG_SIZE "Matrix Size" + +#define BOTS_APP_INIT int i;\ + COMPLEX *in, *out1=NULL, *out2=NULL;\ + in = malloc(bots_arg_size * sizeof(COMPLEX));\ + +#define KERNEL_INIT\ + out1 = malloc(bots_arg_size * sizeof(COMPLEX));\ + for (i = 0; i < bots_arg_size; ++i) {\ + c_re(in[i]) = 1.0;\ + c_im(in[i]) = 1.0;\ + } +#define KERNEL_CALL fft(bots_arg_size, in, out1); +#define KERNEL_FINI + +#define KERNEL_SEQ_INIT\ + out2 = malloc(bots_arg_size * sizeof(COMPLEX));\ + for (i = 0; i < bots_arg_size; ++i) {\ + c_re(in[i]) = 1.0;\ + c_im(in[i]) = 1.0;\ + } +#define KERNEL_SEQ_CALL fft_seq(bots_arg_size, in, out2); +#define KERNEL_SEQ_FINI + +#undef BOTS_APP_CHECK_USES_SEQ_RESULT +#define KERNEL_CHECK test_correctness(bots_arg_size, out1, out2) + diff --git a/src/components/implementation/no_interface/omp_fft_bots/bots.h b/src/components/implementation/no_interface/omp_fft_bots/bots.h new file mode 120000 index 0000000000..ea0ad2b59f --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/bots.h @@ -0,0 +1 @@ +../omp_fib_bots/bots.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/bots_common.c b/src/components/implementation/no_interface/omp_fft_bots/bots_common.c new file mode 120000 index 0000000000..4802b0cf70 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/bots_common.c @@ -0,0 +1 @@ 
+../omp_fib_bots/bots_common.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/bots_common.h b/src/components/implementation/no_interface/omp_fft_bots/bots_common.h new file mode 120000 index 0000000000..14eda863e4 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/bots_common.h @@ -0,0 +1 @@ +../omp_fib_bots/bots_common.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/bots_main.c b/src/components/implementation/no_interface/omp_fft_bots/bots_main.c new file mode 120000 index 0000000000..14f2dab009 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/bots_main.c @@ -0,0 +1 @@ +../omp_fib_bots/bots_main.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/bots_main.h b/src/components/implementation/no_interface/omp_fft_bots/bots_main.h new file mode 120000 index 0000000000..86c06ad286 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/bots_main.h @@ -0,0 +1 @@ +../omp_fib_bots/bots_main.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/fft.c b/src/components/implementation/no_interface/omp_fft_bots/fft.c new file mode 100644 index 0000000000..b030676e26 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/fft.c @@ -0,0 +1,5015 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at 
your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +/* + * Original code from the Cilk project + * + * Copyright (c) 2000 Massachusetts Institute of Technology + * Copyright (c) 2000 Matteo Frigo + */ + +#include +#include +#include +#include +#include "bots.h" +#include "app-desc.h" + +/* Definitions and operations for complex numbers */ + +/* + * compute the W coefficients (that is, powers of the root of 1) + * and store them into an array. 
+ */ +void compute_w_coefficients(int n, int a, int b, COMPLEX * W) +{ + register double twoPiOverN; + register int k; + register REAL s, c; + + if (b - a < 128) { + twoPiOverN = 2.0 * 3.1415926535897932384626434 / n; + for (k = a; k <= b; ++k) { + c = cos(twoPiOverN * k); + c_re(W[k]) = c_re(W[n - k]) = c; + s = sin(twoPiOverN * k); + c_im(W[k]) = -s; + c_im(W[n - k]) = s; + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + compute_w_coefficients(n, a, ab, W); + #pragma omp task + compute_w_coefficients(n, ab + 1, b, W); +#else + #pragma omp task untied + compute_w_coefficients(n, a, ab, W); + #pragma omp task untied + compute_w_coefficients(n, ab + 1, b, W); +#endif + #pragma omp taskwait + } +} +void compute_w_coefficients_seq(int n, int a, int b, COMPLEX * W) +{ + register double twoPiOverN; + register int k; + register REAL s, c; + + if (b - a < 128) { + twoPiOverN = 2.0 * 3.1415926535897932384626434 / n; + for (k = a; k <= b; ++k) { + c = cos(twoPiOverN * k); + c_re(W[k]) = c_re(W[n - k]) = c; + s = sin(twoPiOverN * k); + c_im(W[k]) = -s; + c_im(W[n - k]) = s; + } + } else { + int ab = (a + b) / 2; + compute_w_coefficients_seq(n, a, ab, W); + compute_w_coefficients_seq(n, ab + 1, b, W); + } +} +/* + * Determine (in a stupid way) if n is divisible by eight, then by four, else + * find the smallest prime factor of n. 
+ */ +int factor(int n) +{ + int r; + + if (n < 2) return 1; + if (n == 64 || n == 128 || n == 256 || n == 1024 || n == 2048 || n == 4096) return 8; + if ((n & 15) == 0) return 16; + if ((n & 7) == 0) return 8; + if ((n & 3) == 0) return 4; + if ((n & 1) == 0) return 2; + + /* try odd numbers up to n (computing the sqrt may be slower) */ + for (r = 3; r < n; r += 2) if (n % r == 0) return r; + + /* n is prime */ + return n; +} + +void unshuffle(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m) +{ + int i, j; + int r4 = r & (~0x3); + const COMPLEX *ip; + COMPLEX *jp; + + if (b - a < 16) { + ip = in + a * r; + for (i = a; i < b; ++i) { + jp = out + i; + for (j = 0; j < r4; j += 4) { + jp[0] = ip[0]; + jp[m] = ip[1]; + jp[2 * m] = ip[2]; + jp[3 * m] = ip[3]; + jp += 4 * m; + ip += 4; + } + for (; j < r; ++j) { + *jp = *ip; + ip++; + jp += m; + } + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + unshuffle(a, ab, in, out, r, m); + #pragma omp task + unshuffle(ab, b, in, out, r, m); +#else + #pragma omp task untied + unshuffle(a, ab, in, out, r, m); + #pragma omp task untied + unshuffle(ab, b, in, out, r, m); +#endif + #pragma omp taskwait + } +} +void unshuffle_seq(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m) +{ + int i, j; + int r4 = r & (~0x3); + const COMPLEX *ip; + COMPLEX *jp; + + if (b - a < 16) { + ip = in + a * r; + for (i = a; i < b; ++i) { + jp = out + i; + for (j = 0; j < r4; j += 4) { + jp[0] = ip[0]; + jp[m] = ip[1]; + jp[2 * m] = ip[2]; + jp[3 * m] = ip[3]; + jp += 4 * m; + ip += 4; + } + for (; j < r; ++j) { + *jp = *ip; + ip++; + jp += m; + } + } + } else { + int ab = (a + b) / 2; + unshuffle_seq(a, ab, in, out, r, m); + unshuffle_seq(ab, b, in, out, r, m); + } +} +void fft_twiddle_gen1(COMPLEX * in, COMPLEX * out, + COMPLEX * W, int r, int m, + int nW, int nWdnti, int nWdntm) +{ + int j, k; + COMPLEX *jp, *kp; + + for (k = 0, kp = out; k < r; ++k, kp += m) { + REAL r0, i0, rt, it, rw, iw; + int 
l1 = nWdnti + nWdntm * k; + int l0; + + r0 = i0 = 0.0; + for (j = 0, jp = in, l0 = 0; j < r; ++j, jp += m) { + rw = c_re(W[l0]); + iw = c_im(W[l0]); + rt = c_re(*jp); + it = c_im(*jp); + r0 += rt * rw - it * iw; + i0 += rt * iw + it * rw; + l0 += l1; + if (l0 > nW) + l0 -= nW; + } + c_re(*kp) = r0; + c_im(*kp) = i0; + } +} + +void fft_twiddle_gen(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int r, int m) +{ +#if defined(FORCE_TIED_TASKS) + if (i == i1 - 1) { + #pragma omp task + fft_twiddle_gen1(in + i, out + i, W, + r, m, nW, nWdn * i, nWdn * m); + } else { + int i2 = (i + i1) / 2; + #pragma omp task + fft_twiddle_gen(i, i2, in, out, W, nW, + nWdn, r, m); + #pragma omp task + fft_twiddle_gen(i2, i1, in, out, W, nW, + nWdn, r, m); + } +#else + if (i == i1 - 1) { + #pragma omp task untied + fft_twiddle_gen1(in + i, out + i, W, + r, m, nW, nWdn * i, nWdn * m); + } else { + int i2 = (i + i1) / 2; + #pragma omp task untied + fft_twiddle_gen(i, i2, in, out, W, nW, + nWdn, r, m); + #pragma omp task untied + fft_twiddle_gen(i2, i1, in, out, W, nW, + nWdn, r, m); + } +#endif + #pragma omp taskwait +} +void fft_twiddle_gen_seq(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W, + int nW, int nWdn, int r, int m) +{ + if (i == i1 - 1) { + fft_twiddle_gen1(in + i, out + i, W, + r, m, nW, nWdn * i, nWdn * m); + } else { + int i2 = (i + i1) / 2; + fft_twiddle_gen_seq(i, i2, in, out, W, nW, + nWdn, r, m); + fft_twiddle_gen_seq(i2, i1, in, out, W, nW, + nWdn, r, m); + } +} +/* machine-generated code begins here */ +void fft_base_2(COMPLEX * in, COMPLEX * out) +{ + REAL r1_0, i1_0; + REAL r1_1, i1_1; + r1_0 = c_re(in[0]); + i1_0 = c_im(in[0]); + r1_1 = c_re(in[1]); + i1_1 = c_im(in[1]); + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[1]) = (r1_0 - r1_1); + c_im(out[1]) = (i1_0 - i1_1); +} +void fft_twiddle_2(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, 
*kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + r1_0 = c_re(jp[0 * m]); + i1_0 = c_im(jp[0 * m]); + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r1_1 = ((wr * tmpr) - (wi * tmpi)); + i1_1 = ((wi * tmpr) + (wr * tmpi)); + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[1 * m]) = (r1_0 - r1_1); + c_im(kp[1 * m]) = (i1_0 - i1_1); + } + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_twiddle_2(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task + fft_twiddle_2(ab, b, in, out, W, nW, nWdn, m); +#else + #pragma omp task untied + fft_twiddle_2(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_2(ab, b, in, out, W, nW, nWdn, m); +#endif + #pragma omp taskwait + } +} +void fft_twiddle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + r1_0 = c_re(jp[0 * m]); + i1_0 = c_im(jp[0 * m]); + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r1_1 = ((wr * tmpr) - (wi * tmpi)); + i1_1 = ((wi * tmpr) + (wr * tmpi)); + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[1 * m]) = (r1_0 - r1_1); + c_im(kp[1 * m]) = (i1_0 - i1_1); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_2_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_2_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_2(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 2; + for (i = a; i < b; ++i) 
{ + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_unshuffle_2(a, ab, in, out, m); + #pragma omp task + fft_unshuffle_2(ab, b, in, out, m); +#else + #pragma omp task untied + fft_unshuffle_2(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_2(ab, b, in, out, m); +#endif + #pragma omp taskwait + } +} +void fft_unshuffle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 2; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_2_seq(a, ab, in, out, m); + fft_unshuffle_2_seq(ab, b, in, out, m); + } +} +void fft_base_4(COMPLEX * in, COMPLEX * out) +{ + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + r2_0 = c_re(in[0]); + i2_0 = c_im(in[0]); + r2_2 = c_re(in[2]); + i2_2 = c_im(in[2]); + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_2 = (r2_0 - r2_2); + i1_2 = (i2_0 - i2_2); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + r2_1 = c_re(in[1]); + i2_1 = c_im(in[1]); + r2_3 = c_re(in[3]); + i2_3 = c_im(in[3]); + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_3 = (r2_1 - r2_3); + i1_3 = (i2_1 - i2_3); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[2]) = (r1_0 - r1_1); + c_im(out[2]) = (i1_0 - i1_1); + c_re(out[1]) = (r1_2 + i1_3); + c_im(out[1]) = (i1_2 - r1_3); + c_re(out[3]) = (r1_2 - i1_3); + c_im(out[3]) = (i1_2 + r1_3); +} +void fft_twiddle_4(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL 
r1_3, i1_3; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + r2_0 = c_re(jp[0 * m]); + i2_0 = c_im(jp[0 * m]); + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r2_2 = ((wr * tmpr) - (wi * tmpi)); + i2_2 = ((wi * tmpr) + (wr * tmpi)); + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_2 = (r2_0 - r2_2); + i1_2 = (i2_0 - i2_2); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r2_1 = ((wr * tmpr) - (wi * tmpi)); + i2_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r2_3 = ((wr * tmpr) - (wi * tmpi)); + i2_3 = ((wi * tmpr) + (wr * tmpi)); + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_3 = (r2_1 - r2_3); + i1_3 = (i2_1 - i2_3); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[2 * m]) = (r1_0 - r1_1); + c_im(kp[2 * m]) = (i1_0 - i1_1); + c_re(kp[1 * m]) = (r1_2 + i1_3); + c_im(kp[1 * m]) = (i1_2 - r1_3); + c_re(kp[3 * m]) = (r1_2 - i1_3); + c_im(kp[3 * m]) = (i1_2 + r1_3); + } + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_twiddle_4(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task + fft_twiddle_4(ab, b, in, out, W, nW, nWdn, m); +#else + #pragma omp task untied + fft_twiddle_4(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_4(ab, b, in, out, W, nW, nWdn, m); +#endif + #pragma omp taskwait + } +} +void fft_twiddle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + r2_0 = c_re(jp[0 * m]); + i2_0 
= c_im(jp[0 * m]); + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r2_2 = ((wr * tmpr) - (wi * tmpi)); + i2_2 = ((wi * tmpr) + (wr * tmpi)); + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_2 = (r2_0 - r2_2); + i1_2 = (i2_0 - i2_2); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r2_1 = ((wr * tmpr) - (wi * tmpi)); + i2_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r2_3 = ((wr * tmpr) - (wi * tmpi)); + i2_3 = ((wi * tmpr) + (wr * tmpi)); + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_3 = (r2_1 - r2_3); + i1_3 = (i2_1 - i2_3); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[2 * m]) = (r1_0 - r1_1); + c_im(kp[2 * m]) = (i1_0 - i1_1); + c_re(kp[1 * m]) = (r1_2 + i1_3); + c_im(kp[1 * m]) = (i1_2 - r1_3); + c_re(kp[3 * m]) = (r1_2 - i1_3); + c_im(kp[3 * m]) = (i1_2 + r1_3); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_4_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_4_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_4(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 4; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_unshuffle_4(a, ab, in, out, m); + #pragma omp task + fft_unshuffle_4(ab, b, in, out, m); +#else + #pragma omp task untied + fft_unshuffle_4(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_4(ab, b, in, out, m); +#endif + #pragma omp taskwait + } +} +void fft_unshuffle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX 
*ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 4; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_4_seq(a, ab, in, out, m); + fft_unshuffle_4_seq(ab, b, in, out, m); + } +} +void fft_base_8(COMPLEX * in, COMPLEX * out) +{ + REAL tmpr, tmpi; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + r3_0 = c_re(in[0]); + i3_0 = c_im(in[0]); + r3_4 = c_re(in[4]); + i3_4 = c_im(in[4]); + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_4 = (r3_0 - r3_4); + i2_4 = (i3_0 - i3_4); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + r3_2 = c_re(in[2]); + i3_2 = c_im(in[2]); + r3_6 = c_re(in[6]); + i3_6 = c_im(in[6]); + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_6 = (r3_2 - r3_6); + i2_6 = (i3_2 - i3_6); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_4 = (r2_0 - r2_2); + i1_4 = (i2_0 - i2_2); + r1_2 = (r2_4 + i2_6); + i1_2 = (i2_4 - r2_6); + r1_6 = (r2_4 - i2_6); + i1_6 = (i2_4 + r2_6); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + r3_1 = c_re(in[1]); + i3_1 = c_im(in[1]); + r3_5 = c_re(in[5]); + i3_5 = c_im(in[5]); + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_5 = (r3_1 - r3_5); + i2_5 = (i3_1 - i3_5); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + r3_3 = c_re(in[3]); + i3_3 = c_im(in[3]); + r3_7 = c_re(in[7]); + i3_7 = c_im(in[7]); + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_7 = (r3_3 - r3_7); + i2_7 = (i3_3 - i3_7); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_5 = (r2_1 - r2_3); + i1_5 = (i2_1 - i2_3); + r1_3 = (r2_5 + i2_7); + i1_3 = (i2_5 - r2_7); + r1_7 = (r2_5 
- i2_7); + i1_7 = (i2_5 + r2_7); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[4]) = (r1_0 - r1_1); + c_im(out[4]) = (i1_0 - i1_1); + tmpr = (0.707106781187 * (r1_3 + i1_3)); + tmpi = (0.707106781187 * (i1_3 - r1_3)); + c_re(out[1]) = (r1_2 + tmpr); + c_im(out[1]) = (i1_2 + tmpi); + c_re(out[5]) = (r1_2 - tmpr); + c_im(out[5]) = (i1_2 - tmpi); + c_re(out[2]) = (r1_4 + i1_5); + c_im(out[2]) = (i1_4 - r1_5); + c_re(out[6]) = (r1_4 - i1_5); + c_im(out[6]) = (i1_4 + r1_5); + tmpr = (0.707106781187 * (i1_7 - r1_7)); + tmpi = (0.707106781187 * (r1_7 + i1_7)); + c_re(out[3]) = (r1_6 + tmpr); + c_im(out[3]) = (i1_6 - tmpi); + c_re(out[7]) = (r1_6 - tmpr); + c_im(out[7]) = (i1_6 + tmpi); + } +} +void fft_twiddle_8(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + r3_0 = c_re(jp[0 * m]); + i3_0 = c_im(jp[0 * m]); + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r3_4 = ((wr * tmpr) - (wi * tmpi)); + i3_4 = ((wi * tmpr) + (wr * tmpi)); + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_4 = (r3_0 - r3_4); + i2_4 = (i3_0 - i3_4); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r3_2 = ((wr * tmpr) - (wi * tmpi)); + i3_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r3_6 = ((wr * tmpr) - (wi * tmpi)); + i3_6 = ((wi * tmpr) + (wr * 
tmpi)); + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_6 = (r3_2 - r3_6); + i2_6 = (i3_2 - i3_6); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_4 = (r2_0 - r2_2); + i1_4 = (i2_0 - i2_2); + r1_2 = (r2_4 + i2_6); + i1_2 = (i2_4 - r2_6); + r1_6 = (r2_4 - i2_6); + i1_6 = (i2_4 + r2_6); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r3_1 = ((wr * tmpr) - (wi * tmpi)); + i3_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r3_5 = ((wr * tmpr) - (wi * tmpi)); + i3_5 = ((wi * tmpr) + (wr * tmpi)); + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_5 = (r3_1 - r3_5); + i2_5 = (i3_1 - i3_5); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r3_3 = ((wr * tmpr) - (wi * tmpi)); + i3_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r3_7 = ((wr * tmpr) - (wi * tmpi)); + i3_7 = ((wi * tmpr) + (wr * tmpi)); + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_7 = (r3_3 - r3_7); + i2_7 = (i3_3 - i3_7); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_5 = (r2_1 - r2_3); + i1_5 = (i2_1 - i2_3); + r1_3 = (r2_5 + i2_7); + i1_3 = (i2_5 - r2_7); + r1_7 = (r2_5 - i2_7); + i1_7 = (i2_5 + r2_7); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[4 * m]) = (r1_0 - r1_1); + c_im(kp[4 * m]) = (i1_0 - i1_1); + tmpr = (0.707106781187 * (r1_3 + i1_3)); + tmpi = (0.707106781187 * (i1_3 - r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[5 * m]) = (r1_2 - tmpr); + c_im(kp[5 * m]) = (i1_2 - tmpi); + c_re(kp[2 * m]) = (r1_4 + i1_5); + c_im(kp[2 * m]) = (i1_4 - 
r1_5); + c_re(kp[6 * m]) = (r1_4 - i1_5); + c_im(kp[6 * m]) = (i1_4 + r1_5); + tmpr = (0.707106781187 * (i1_7 - r1_7)); + tmpi = (0.707106781187 * (r1_7 + i1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 - tmpi); + c_re(kp[7 * m]) = (r1_6 - tmpr); + c_im(kp[7 * m]) = (i1_6 + tmpi); + } + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_twiddle_8(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task + fft_twiddle_8(ab, b, in, out, W, nW, nWdn, m); +#else + #pragma omp task untied + fft_twiddle_8(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_8(ab, b, in, out, W, nW, nWdn, m); +#endif + #pragma omp taskwait + } +} +void fft_twiddle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + r3_0 = c_re(jp[0 * m]); + i3_0 = c_im(jp[0 * m]); + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r3_4 = ((wr * tmpr) - (wi * tmpi)); + i3_4 = ((wi * tmpr) + (wr * tmpi)); + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_4 = (r3_0 - r3_4); + i2_4 = (i3_0 - i3_4); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r3_2 = ((wr * tmpr) - (wi * tmpi)); + i3_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r3_6 = ((wr * tmpr) - (wi * tmpi)); + i3_6 = ((wi * tmpr) + (wr * tmpi)); + r2_2 = 
(r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_6 = (r3_2 - r3_6); + i2_6 = (i3_2 - i3_6); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_4 = (r2_0 - r2_2); + i1_4 = (i2_0 - i2_2); + r1_2 = (r2_4 + i2_6); + i1_2 = (i2_4 - r2_6); + r1_6 = (r2_4 - i2_6); + i1_6 = (i2_4 + r2_6); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r3_1 = ((wr * tmpr) - (wi * tmpi)); + i3_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r3_5 = ((wr * tmpr) - (wi * tmpi)); + i3_5 = ((wi * tmpr) + (wr * tmpi)); + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_5 = (r3_1 - r3_5); + i2_5 = (i3_1 - i3_5); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r3_3 = ((wr * tmpr) - (wi * tmpi)); + i3_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r3_7 = ((wr * tmpr) - (wi * tmpi)); + i3_7 = ((wi * tmpr) + (wr * tmpi)); + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_7 = (r3_3 - r3_7); + i2_7 = (i3_3 - i3_7); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_5 = (r2_1 - r2_3); + i1_5 = (i2_1 - i2_3); + r1_3 = (r2_5 + i2_7); + i1_3 = (i2_5 - r2_7); + r1_7 = (r2_5 - i2_7); + i1_7 = (i2_5 + r2_7); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[4 * m]) = (r1_0 - r1_1); + c_im(kp[4 * m]) = (i1_0 - i1_1); + tmpr = (0.707106781187 * (r1_3 + i1_3)); + tmpi = (0.707106781187 * (i1_3 - r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[5 * m]) = (r1_2 - tmpr); + c_im(kp[5 * m]) = (i1_2 - tmpi); + c_re(kp[2 * m]) = (r1_4 + i1_5); + c_im(kp[2 * m]) = (i1_4 - r1_5); + c_re(kp[6 
* m]) = (r1_4 - i1_5); + c_im(kp[6 * m]) = (i1_4 + r1_5); + tmpr = (0.707106781187 * (i1_7 - r1_7)); + tmpi = (0.707106781187 * (r1_7 + i1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 - tmpi); + c_re(kp[7 * m]) = (r1_6 - tmpr); + c_im(kp[7 * m]) = (i1_6 + tmpi); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_8_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_8_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_8(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 8; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_unshuffle_8(a, ab, in, out, m); + #pragma omp task + fft_unshuffle_8(ab, b, in, out, m); +#else + #pragma omp task untied + fft_unshuffle_8(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_8(ab, b, in, out, m); +#endif + #pragma omp taskwait + } +} +void fft_unshuffle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 8; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_8_seq(a, ab, in, out, m); + fft_unshuffle_8_seq(ab, b, in, out, m); + } +} +void fft_base_16(COMPLEX * in, COMPLEX * out) +{ + REAL tmpr, tmpi; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; 
+ REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + r4_0 = c_re(in[0]); + i4_0 = c_im(in[0]); + r4_8 = c_re(in[8]); + i4_8 = c_im(in[8]); + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_8 = (r4_0 - r4_8); + i3_8 = (i4_0 - i4_8); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + r4_4 = c_re(in[4]); + i4_4 = c_im(in[4]); + r4_12 = c_re(in[12]); + i4_12 = c_im(in[12]); + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_12 = (r4_4 - r4_12); + i3_12 = (i4_4 - i4_12); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_8 = (r3_0 - r3_4); + i2_8 = (i3_0 - i3_4); + r2_4 = (r3_8 + i3_12); + i2_4 = (i3_8 - r3_12); + r2_12 = (r3_8 - i3_12); + i2_12 = (i3_8 + r3_12); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + r4_2 = c_re(in[2]); + i4_2 = c_im(in[2]); + r4_10 = c_re(in[10]); + i4_10 = c_im(in[10]); + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_10 = (r4_2 - r4_10); + i3_10 = (i4_2 - i4_10); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + r4_6 = c_re(in[6]); + i4_6 = c_im(in[6]); + r4_14 = c_re(in[14]); + i4_14 = c_im(in[14]); + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_14 = (r4_6 - r4_14); + i3_14 = (i4_6 - i4_14); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_10 = (r3_2 - r3_6); + i2_10 = (i3_2 - i3_6); + r2_6 = (r3_10 + i3_14); + i2_6 = (i3_10 - r3_14); + r2_14 = (r3_10 - i3_14); + i2_14 = (i3_10 + r3_14); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_8 = (r2_0 - r2_2); + i1_8 = (i2_0 - i2_2); + tmpr = (0.707106781187 * 
(r2_6 + i2_6)); + tmpi = (0.707106781187 * (i2_6 - r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_10 = (r2_4 - tmpr); + i1_10 = (i2_4 - tmpi); + r1_4 = (r2_8 + i2_10); + i1_4 = (i2_8 - r2_10); + r1_12 = (r2_8 - i2_10); + i1_12 = (i2_8 + r2_10); + tmpr = (0.707106781187 * (i2_14 - r2_14)); + tmpi = (0.707106781187 * (r2_14 + i2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 - tmpi); + r1_14 = (r2_12 - tmpr); + i1_14 = (i2_12 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + r4_1 = c_re(in[1]); + i4_1 = c_im(in[1]); + r4_9 = c_re(in[9]); + i4_9 = c_im(in[9]); + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_9 = (r4_1 - r4_9); + i3_9 = (i4_1 - i4_9); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + r4_5 = c_re(in[5]); + i4_5 = c_im(in[5]); + r4_13 = c_re(in[13]); + i4_13 = c_im(in[13]); + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_13 = (r4_5 - r4_13); + i3_13 = (i4_5 - i4_13); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_9 = (r3_1 - r3_5); + i2_9 = (i3_1 - i3_5); + r2_5 = (r3_9 + i3_13); + i2_5 = (i3_9 - r3_13); + r2_13 = (r3_9 - i3_13); + i2_13 = (i3_9 + r3_13); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + r4_3 = c_re(in[3]); + i4_3 = c_im(in[3]); + r4_11 = c_re(in[11]); + i4_11 = c_im(in[11]); + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_11 = (r4_3 - r4_11); + i3_11 = (i4_3 - i4_11); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + r4_7 = c_re(in[7]); + i4_7 = c_im(in[7]); + r4_15 = c_re(in[15]); + i4_15 = c_im(in[15]); + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_15 = (r4_7 - r4_15); + i3_15 = (i4_7 - i4_15); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_11 = 
(r3_3 - r3_7); + i2_11 = (i3_3 - i3_7); + r2_7 = (r3_11 + i3_15); + i2_7 = (i3_11 - r3_15); + r2_15 = (r3_11 - i3_15); + i2_15 = (i3_11 + r3_15); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_9 = (r2_1 - r2_3); + i1_9 = (i2_1 - i2_3); + tmpr = (0.707106781187 * (r2_7 + i2_7)); + tmpi = (0.707106781187 * (i2_7 - r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_11 = (r2_5 - tmpr); + i1_11 = (i2_5 - tmpi); + r1_5 = (r2_9 + i2_11); + i1_5 = (i2_9 - r2_11); + r1_13 = (r2_9 - i2_11); + i1_13 = (i2_9 + r2_11); + tmpr = (0.707106781187 * (i2_15 - r2_15)); + tmpi = (0.707106781187 * (r2_15 + i2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 - tmpi); + r1_15 = (r2_13 - tmpr); + i1_15 = (i2_13 + tmpi); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[8]) = (r1_0 - r1_1); + c_im(out[8]) = (i1_0 - i1_1); + tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3)); + tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3)); + c_re(out[1]) = (r1_2 + tmpr); + c_im(out[1]) = (i1_2 + tmpi); + c_re(out[9]) = (r1_2 - tmpr); + c_im(out[9]) = (i1_2 - tmpi); + tmpr = (0.707106781187 * (r1_5 + i1_5)); + tmpi = (0.707106781187 * (i1_5 - r1_5)); + c_re(out[2]) = (r1_4 + tmpr); + c_im(out[2]) = (i1_4 + tmpi); + c_re(out[10]) = (r1_4 - tmpr); + c_im(out[10]) = (i1_4 - tmpi); + tmpr = ((0.382683432365 * r1_7) + (0.923879532511 * i1_7)); + tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7)); + c_re(out[3]) = (r1_6 + tmpr); + c_im(out[3]) = (i1_6 + tmpi); + c_re(out[11]) = (r1_6 - tmpr); + c_im(out[11]) = (i1_6 - tmpi); + c_re(out[4]) = (r1_8 + i1_9); + c_im(out[4]) = (i1_8 - r1_9); + c_re(out[12]) = (r1_8 - i1_9); + c_im(out[12]) = (i1_8 + r1_9); + tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11)); + tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11)); + c_re(out[5]) = (r1_10 + tmpr); + c_im(out[5]) = (i1_10 - tmpi); + c_re(out[13]) = (r1_10 - tmpr); + c_im(out[13]) = (i1_10 + tmpi); + tmpr = (0.707106781187 * 
(i1_13 - r1_13)); + tmpi = (0.707106781187 * (r1_13 + i1_13)); + c_re(out[6]) = (r1_12 + tmpr); + c_im(out[6]) = (i1_12 - tmpi); + c_re(out[14]) = (r1_12 - tmpr); + c_im(out[14]) = (i1_12 + tmpi); + tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15)); + tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15)); + c_re(out[7]) = (r1_14 + tmpr); + c_im(out[7]) = (i1_14 - tmpi); + c_re(out[15]) = (r1_14 - tmpr); + c_im(out[15]) = (i1_14 + tmpi); + } +} +void fft_twiddle_16(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + r4_0 = c_re(jp[0 * m]); + i4_0 = c_im(jp[0 * m]); + wr = c_re(W[8 * l1]); + wi = c_im(W[8 * l1]); + tmpr = c_re(jp[8 * m]); + tmpi = c_im(jp[8 * m]); + r4_8 = ((wr * tmpr) - (wi * tmpi)); + i4_8 = ((wi * tmpr) + (wr * tmpi)); + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_8 = (r4_0 - r4_8); + i3_8 = (i4_0 - i4_8); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r4_4 = ((wr * tmpr) - (wi * tmpi)); + i4_4 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[12 * l1]); + wi = c_im(W[12 * l1]); + tmpr = c_re(jp[12 * m]); + tmpi = c_im(jp[12 * 
m]); + r4_12 = ((wr * tmpr) - (wi * tmpi)); + i4_12 = ((wi * tmpr) + (wr * tmpi)); + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_12 = (r4_4 - r4_12); + i3_12 = (i4_4 - i4_12); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_8 = (r3_0 - r3_4); + i2_8 = (i3_0 - i3_4); + r2_4 = (r3_8 + i3_12); + i2_4 = (i3_8 - r3_12); + r2_12 = (r3_8 - i3_12); + i2_12 = (i3_8 + r3_12); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r4_2 = ((wr * tmpr) - (wi * tmpi)); + i4_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[10 * l1]); + wi = c_im(W[10 * l1]); + tmpr = c_re(jp[10 * m]); + tmpi = c_im(jp[10 * m]); + r4_10 = ((wr * tmpr) - (wi * tmpi)); + i4_10 = ((wi * tmpr) + (wr * tmpi)); + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_10 = (r4_2 - r4_10); + i3_10 = (i4_2 - i4_10); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r4_6 = ((wr * tmpr) - (wi * tmpi)); + i4_6 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[14 * l1]); + wi = c_im(W[14 * l1]); + tmpr = c_re(jp[14 * m]); + tmpi = c_im(jp[14 * m]); + r4_14 = ((wr * tmpr) - (wi * tmpi)); + i4_14 = ((wi * tmpr) + (wr * tmpi)); + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_14 = (r4_6 - r4_14); + i3_14 = (i4_6 - i4_14); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_10 = (r3_2 - r3_6); + i2_10 = (i3_2 - i3_6); + r2_6 = (r3_10 + i3_14); + i2_6 = (i3_10 - r3_14); + r2_14 = (r3_10 - i3_14); + i2_14 = (i3_10 + r3_14); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_8 = (r2_0 - r2_2); + i1_8 = (i2_0 - i2_2); + tmpr = (0.707106781187 * (r2_6 + i2_6)); + tmpi = (0.707106781187 * (i2_6 - r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_10 = (r2_4 - tmpr); + i1_10 = (i2_4 - tmpi); + r1_4 = (r2_8 + 
i2_10); + i1_4 = (i2_8 - r2_10); + r1_12 = (r2_8 - i2_10); + i1_12 = (i2_8 + r2_10); + tmpr = (0.707106781187 * (i2_14 - r2_14)); + tmpi = (0.707106781187 * (r2_14 + i2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 - tmpi); + r1_14 = (r2_12 - tmpr); + i1_14 = (i2_12 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r4_1 = ((wr * tmpr) - (wi * tmpi)); + i4_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[9 * l1]); + wi = c_im(W[9 * l1]); + tmpr = c_re(jp[9 * m]); + tmpi = c_im(jp[9 * m]); + r4_9 = ((wr * tmpr) - (wi * tmpi)); + i4_9 = ((wi * tmpr) + (wr * tmpi)); + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_9 = (r4_1 - r4_9); + i3_9 = (i4_1 - i4_9); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r4_5 = ((wr * tmpr) - (wi * tmpi)); + i4_5 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[13 * l1]); + wi = c_im(W[13 * l1]); + tmpr = c_re(jp[13 * m]); + tmpi = c_im(jp[13 * m]); + r4_13 = ((wr * tmpr) - (wi * tmpi)); + i4_13 = ((wi * tmpr) + (wr * tmpi)); + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_13 = (r4_5 - r4_13); + i3_13 = (i4_5 - i4_13); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_9 = (r3_1 - r3_5); + i2_9 = (i3_1 - i3_5); + r2_5 = (r3_9 + i3_13); + i2_5 = (i3_9 - r3_13); + r2_13 = (r3_9 - i3_13); + i2_13 = (i3_9 + r3_13); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r4_3 = ((wr * tmpr) - (wi * tmpi)); + 
i4_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[11 * l1]); + wi = c_im(W[11 * l1]); + tmpr = c_re(jp[11 * m]); + tmpi = c_im(jp[11 * m]); + r4_11 = ((wr * tmpr) - (wi * tmpi)); + i4_11 = ((wi * tmpr) + (wr * tmpi)); + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_11 = (r4_3 - r4_11); + i3_11 = (i4_3 - i4_11); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r4_7 = ((wr * tmpr) - (wi * tmpi)); + i4_7 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[15 * l1]); + wi = c_im(W[15 * l1]); + tmpr = c_re(jp[15 * m]); + tmpi = c_im(jp[15 * m]); + r4_15 = ((wr * tmpr) - (wi * tmpi)); + i4_15 = ((wi * tmpr) + (wr * tmpi)); + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_15 = (r4_7 - r4_15); + i3_15 = (i4_7 - i4_15); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_11 = (r3_3 - r3_7); + i2_11 = (i3_3 - i3_7); + r2_7 = (r3_11 + i3_15); + i2_7 = (i3_11 - r3_15); + r2_15 = (r3_11 - i3_15); + i2_15 = (i3_11 + r3_15); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_9 = (r2_1 - r2_3); + i1_9 = (i2_1 - i2_3); + tmpr = (0.707106781187 * (r2_7 + i2_7)); + tmpi = (0.707106781187 * (i2_7 - r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_11 = (r2_5 - tmpr); + i1_11 = (i2_5 - tmpi); + r1_5 = (r2_9 + i2_11); + i1_5 = (i2_9 - r2_11); + r1_13 = (r2_9 - i2_11); + i1_13 = (i2_9 + r2_11); + tmpr = (0.707106781187 * (i2_15 - r2_15)); + tmpi = (0.707106781187 * (r2_15 + i2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 - tmpi); + r1_15 = (r2_13 - tmpr); + i1_15 = (i2_13 + tmpi); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[8 * m]) = (r1_0 - r1_1); + c_im(kp[8 * m]) = (i1_0 - i1_1); + tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3)); + tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[9 * m]) = (r1_2 - tmpr); + c_im(kp[9 
* m]) = (i1_2 - tmpi); + tmpr = (0.707106781187 * (r1_5 + i1_5)); + tmpi = (0.707106781187 * (i1_5 - r1_5)); + c_re(kp[2 * m]) = (r1_4 + tmpr); + c_im(kp[2 * m]) = (i1_4 + tmpi); + c_re(kp[10 * m]) = (r1_4 - tmpr); + c_im(kp[10 * m]) = (i1_4 - tmpi); + tmpr = ((0.382683432365 * r1_7) + (0.923879532511 * i1_7)); + tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 + tmpi); + c_re(kp[11 * m]) = (r1_6 - tmpr); + c_im(kp[11 * m]) = (i1_6 - tmpi); + c_re(kp[4 * m]) = (r1_8 + i1_9); + c_im(kp[4 * m]) = (i1_8 - r1_9); + c_re(kp[12 * m]) = (r1_8 - i1_9); + c_im(kp[12 * m]) = (i1_8 + r1_9); + tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11)); + tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11)); + c_re(kp[5 * m]) = (r1_10 + tmpr); + c_im(kp[5 * m]) = (i1_10 - tmpi); + c_re(kp[13 * m]) = (r1_10 - tmpr); + c_im(kp[13 * m]) = (i1_10 + tmpi); + tmpr = (0.707106781187 * (i1_13 - r1_13)); + tmpi = (0.707106781187 * (r1_13 + i1_13)); + c_re(kp[6 * m]) = (r1_12 + tmpr); + c_im(kp[6 * m]) = (i1_12 - tmpi); + c_re(kp[14 * m]) = (r1_12 - tmpr); + c_im(kp[14 * m]) = (i1_12 + tmpi); + tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15)); + tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15)); + c_re(kp[7 * m]) = (r1_14 + tmpr); + c_im(kp[7 * m]) = (i1_14 - tmpi); + c_re(kp[15 * m]) = (r1_14 - tmpr); + c_im(kp[15 * m]) = (i1_14 + tmpi); + } + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_twiddle_16(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task + fft_twiddle_16(ab, b, in, out, W, nW, nWdn, m); +#else + #pragma omp task untied + fft_twiddle_16(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_16(ab, b, in, out, W, nW, nWdn, m); +#endif + #pragma omp taskwait + } +} +void fft_twiddle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + 
REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + r4_0 = c_re(jp[0 * m]); + i4_0 = c_im(jp[0 * m]); + wr = c_re(W[8 * l1]); + wi = c_im(W[8 * l1]); + tmpr = c_re(jp[8 * m]); + tmpi = c_im(jp[8 * m]); + r4_8 = ((wr * tmpr) - (wi * tmpi)); + i4_8 = ((wi * tmpr) + (wr * tmpi)); + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_8 = (r4_0 - r4_8); + i3_8 = (i4_0 - i4_8); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r4_4 = ((wr * tmpr) - (wi * tmpi)); + i4_4 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[12 * l1]); + wi = c_im(W[12 * l1]); + tmpr = c_re(jp[12 * m]); + tmpi = c_im(jp[12 * m]); + r4_12 = ((wr * tmpr) - (wi * tmpi)); + i4_12 = ((wi * tmpr) + (wr * tmpi)); + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_12 = (r4_4 - r4_12); + i3_12 = (i4_4 - i4_12); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_8 = (r3_0 - r3_4); + i2_8 = (i3_0 - i3_4); + r2_4 = (r3_8 + i3_12); + i2_4 = (i3_8 - r3_12); + r2_12 = (r3_8 - i3_12); + i2_12 = (i3_8 + r3_12); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = 
c_im(jp[2 * m]); + r4_2 = ((wr * tmpr) - (wi * tmpi)); + i4_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[10 * l1]); + wi = c_im(W[10 * l1]); + tmpr = c_re(jp[10 * m]); + tmpi = c_im(jp[10 * m]); + r4_10 = ((wr * tmpr) - (wi * tmpi)); + i4_10 = ((wi * tmpr) + (wr * tmpi)); + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_10 = (r4_2 - r4_10); + i3_10 = (i4_2 - i4_10); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r4_6 = ((wr * tmpr) - (wi * tmpi)); + i4_6 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[14 * l1]); + wi = c_im(W[14 * l1]); + tmpr = c_re(jp[14 * m]); + tmpi = c_im(jp[14 * m]); + r4_14 = ((wr * tmpr) - (wi * tmpi)); + i4_14 = ((wi * tmpr) + (wr * tmpi)); + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_14 = (r4_6 - r4_14); + i3_14 = (i4_6 - i4_14); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_10 = (r3_2 - r3_6); + i2_10 = (i3_2 - i3_6); + r2_6 = (r3_10 + i3_14); + i2_6 = (i3_10 - r3_14); + r2_14 = (r3_10 - i3_14); + i2_14 = (i3_10 + r3_14); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_8 = (r2_0 - r2_2); + i1_8 = (i2_0 - i2_2); + tmpr = (0.707106781187 * (r2_6 + i2_6)); + tmpi = (0.707106781187 * (i2_6 - r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_10 = (r2_4 - tmpr); + i1_10 = (i2_4 - tmpi); + r1_4 = (r2_8 + i2_10); + i1_4 = (i2_8 - r2_10); + r1_12 = (r2_8 - i2_10); + i1_12 = (i2_8 + r2_10); + tmpr = (0.707106781187 * (i2_14 - r2_14)); + tmpi = (0.707106781187 * (r2_14 + i2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 - tmpi); + r1_14 = (r2_12 - tmpr); + i1_14 = (i2_12 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + wr = c_re(W[1 * l1]); + wi = 
c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r4_1 = ((wr * tmpr) - (wi * tmpi)); + i4_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[9 * l1]); + wi = c_im(W[9 * l1]); + tmpr = c_re(jp[9 * m]); + tmpi = c_im(jp[9 * m]); + r4_9 = ((wr * tmpr) - (wi * tmpi)); + i4_9 = ((wi * tmpr) + (wr * tmpi)); + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_9 = (r4_1 - r4_9); + i3_9 = (i4_1 - i4_9); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r4_5 = ((wr * tmpr) - (wi * tmpi)); + i4_5 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[13 * l1]); + wi = c_im(W[13 * l1]); + tmpr = c_re(jp[13 * m]); + tmpi = c_im(jp[13 * m]); + r4_13 = ((wr * tmpr) - (wi * tmpi)); + i4_13 = ((wi * tmpr) + (wr * tmpi)); + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_13 = (r4_5 - r4_13); + i3_13 = (i4_5 - i4_13); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_9 = (r3_1 - r3_5); + i2_9 = (i3_1 - i3_5); + r2_5 = (r3_9 + i3_13); + i2_5 = (i3_9 - r3_13); + r2_13 = (r3_9 - i3_13); + i2_13 = (i3_9 + r3_13); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r4_3 = ((wr * tmpr) - (wi * tmpi)); + i4_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[11 * l1]); + wi = c_im(W[11 * l1]); + tmpr = c_re(jp[11 * m]); + tmpi = c_im(jp[11 * m]); + r4_11 = ((wr * tmpr) - (wi * tmpi)); + i4_11 = ((wi * tmpr) + (wr * tmpi)); + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_11 = (r4_3 - r4_11); + i3_11 = (i4_3 - i4_11); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r4_7 = ((wr * tmpr) - (wi * tmpi)); + i4_7 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[15 * l1]); + wi = c_im(W[15 * l1]); + tmpr 
= c_re(jp[15 * m]); + tmpi = c_im(jp[15 * m]); + r4_15 = ((wr * tmpr) - (wi * tmpi)); + i4_15 = ((wi * tmpr) + (wr * tmpi)); + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_15 = (r4_7 - r4_15); + i3_15 = (i4_7 - i4_15); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_11 = (r3_3 - r3_7); + i2_11 = (i3_3 - i3_7); + r2_7 = (r3_11 + i3_15); + i2_7 = (i3_11 - r3_15); + r2_15 = (r3_11 - i3_15); + i2_15 = (i3_11 + r3_15); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_9 = (r2_1 - r2_3); + i1_9 = (i2_1 - i2_3); + tmpr = (0.707106781187 * (r2_7 + i2_7)); + tmpi = (0.707106781187 * (i2_7 - r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_11 = (r2_5 - tmpr); + i1_11 = (i2_5 - tmpi); + r1_5 = (r2_9 + i2_11); + i1_5 = (i2_9 - r2_11); + r1_13 = (r2_9 - i2_11); + i1_13 = (i2_9 + r2_11); + tmpr = (0.707106781187 * (i2_15 - r2_15)); + tmpi = (0.707106781187 * (r2_15 + i2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 - tmpi); + r1_15 = (r2_13 - tmpr); + i1_15 = (i2_13 + tmpi); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[8 * m]) = (r1_0 - r1_1); + c_im(kp[8 * m]) = (i1_0 - i1_1); + tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3)); + tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[9 * m]) = (r1_2 - tmpr); + c_im(kp[9 * m]) = (i1_2 - tmpi); + tmpr = (0.707106781187 * (r1_5 + i1_5)); + tmpi = (0.707106781187 * (i1_5 - r1_5)); + c_re(kp[2 * m]) = (r1_4 + tmpr); + c_im(kp[2 * m]) = (i1_4 + tmpi); + c_re(kp[10 * m]) = (r1_4 - tmpr); + c_im(kp[10 * m]) = (i1_4 - tmpi); + tmpr = ((0.382683432365 * r1_7) + (0.923879532511 * i1_7)); + tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 + tmpi); + c_re(kp[11 * m]) = (r1_6 - tmpr); + c_im(kp[11 * m]) = (i1_6 - tmpi); + c_re(kp[4 * m]) = (r1_8 + i1_9); + c_im(kp[4 * m]) = (i1_8 - r1_9); + c_re(kp[12 * 
m]) = (r1_8 - i1_9); + c_im(kp[12 * m]) = (i1_8 + r1_9); + tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11)); + tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11)); + c_re(kp[5 * m]) = (r1_10 + tmpr); + c_im(kp[5 * m]) = (i1_10 - tmpi); + c_re(kp[13 * m]) = (r1_10 - tmpr); + c_im(kp[13 * m]) = (i1_10 + tmpi); + tmpr = (0.707106781187 * (i1_13 - r1_13)); + tmpi = (0.707106781187 * (r1_13 + i1_13)); + c_re(kp[6 * m]) = (r1_12 + tmpr); + c_im(kp[6 * m]) = (i1_12 - tmpi); + c_re(kp[14 * m]) = (r1_12 - tmpr); + c_im(kp[14 * m]) = (i1_12 + tmpi); + tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15)); + tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15)); + c_re(kp[7 * m]) = (r1_14 + tmpr); + c_im(kp[7 * m]) = (i1_14 - tmpi); + c_re(kp[15 * m]) = (r1_14 - tmpr); + c_im(kp[15 * m]) = (i1_14 + tmpi); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_16_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_16_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_16(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 16; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task untied + fft_unshuffle_16(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_16(ab, b, in, out, m); +#else + #pragma omp task untied + fft_unshuffle_16(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_16(ab, b, in, 
out, m); +#endif + #pragma omp taskwait + } +} +void fft_unshuffle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 16; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_16_seq(a, ab, in, out, m); + fft_unshuffle_16_seq(ab, b, in, out, m); + } +} +void fft_base_32(COMPLEX * in, COMPLEX * out) +{ + REAL tmpr, tmpi; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + REAL r1_16, i1_16; + REAL r1_17, i1_17; + REAL r1_18, i1_18; + REAL r1_19, i1_19; + REAL r1_20, i1_20; + REAL r1_21, i1_21; + REAL r1_22, i1_22; + REAL r1_23, i1_23; + REAL r1_24, i1_24; + REAL r1_25, i1_25; + REAL r1_26, i1_26; + REAL r1_27, i1_27; + REAL r1_28, i1_28; + REAL r1_29, i1_29; + REAL r1_30, i1_30; + REAL r1_31, i1_31; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + REAL r2_16, i2_16; + REAL r2_18, i2_18; + REAL r2_20, i2_20; + REAL r2_22, i2_22; + REAL r2_24, i2_24; + REAL r2_26, i2_26; + REAL r2_28, i2_28; + REAL r2_30, i2_30; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + REAL 
r3_16, i3_16; + REAL r3_20, i3_20; + REAL r3_24, i3_24; + REAL r3_28, i3_28; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + REAL r4_16, i4_16; + REAL r4_24, i4_24; + { + REAL r5_0, i5_0; + REAL r5_16, i5_16; + r5_0 = c_re(in[0]); + i5_0 = c_im(in[0]); + r5_16 = c_re(in[16]); + i5_16 = c_im(in[16]); + r4_0 = (r5_0 + r5_16); + i4_0 = (i5_0 + i5_16); + r4_16 = (r5_0 - r5_16); + i4_16 = (i5_0 - i5_16); + } + { + REAL r5_8, i5_8; + REAL r5_24, i5_24; + r5_8 = c_re(in[8]); + i5_8 = c_im(in[8]); + r5_24 = c_re(in[24]); + i5_24 = c_im(in[24]); + r4_8 = (r5_8 + r5_24); + i4_8 = (i5_8 + i5_24); + r4_24 = (r5_8 - r5_24); + i4_24 = (i5_8 - i5_24); + } + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_16 = (r4_0 - r4_8); + i3_16 = (i4_0 - i4_8); + r3_8 = (r4_16 + i4_24); + i3_8 = (i4_16 - r4_24); + r3_24 = (r4_16 - i4_24); + i3_24 = (i4_16 + r4_24); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + REAL r4_20, i4_20; + REAL r4_28, i4_28; + { + REAL r5_4, i5_4; + REAL r5_20, i5_20; + r5_4 = c_re(in[4]); + i5_4 = c_im(in[4]); + r5_20 = c_re(in[20]); + i5_20 = c_im(in[20]); + r4_4 = (r5_4 + r5_20); + i4_4 = (i5_4 + i5_20); + r4_20 = (r5_4 - r5_20); + i4_20 = (i5_4 - i5_20); + } + { + REAL r5_12, i5_12; + REAL r5_28, i5_28; + r5_12 = c_re(in[12]); + i5_12 = c_im(in[12]); + r5_28 = c_re(in[28]); + i5_28 = c_im(in[28]); + r4_12 = (r5_12 + r5_28); + i4_12 = (i5_12 + i5_28); + r4_28 = (r5_12 - r5_28); + i4_28 = (i5_12 - i5_28); + } + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_20 = (r4_4 - r4_12); + i3_20 = (i4_4 - i4_12); + r3_12 = (r4_20 + i4_28); + i3_12 = (i4_20 - r4_28); + r3_28 = (r4_20 - i4_28); + i3_28 = (i4_20 + r4_28); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_16 = (r3_0 - r3_4); + i2_16 = (i3_0 - i3_4); + tmpr = (0.707106781187 * (r3_12 + i3_12)); + tmpi = (0.707106781187 * (i3_12 - r3_12)); + r2_4 = (r3_8 + tmpr); + i2_4 = (i3_8 + tmpi); + r2_20 = (r3_8 - tmpr); + i2_20 = (i3_8 - tmpi); + r2_8 = (r3_16 + i3_20); + i2_8 = (i3_16 - r3_20); + r2_24 = 
(r3_16 - i3_20); + i2_24 = (i3_16 + r3_20); + tmpr = (0.707106781187 * (i3_28 - r3_28)); + tmpi = (0.707106781187 * (r3_28 + i3_28)); + r2_12 = (r3_24 + tmpr); + i2_12 = (i3_24 - tmpi); + r2_28 = (r3_24 - tmpr); + i2_28 = (i3_24 + tmpi); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + REAL r3_18, i3_18; + REAL r3_22, i3_22; + REAL r3_26, i3_26; + REAL r3_30, i3_30; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + REAL r4_18, i4_18; + REAL r4_26, i4_26; + { + REAL r5_2, i5_2; + REAL r5_18, i5_18; + r5_2 = c_re(in[2]); + i5_2 = c_im(in[2]); + r5_18 = c_re(in[18]); + i5_18 = c_im(in[18]); + r4_2 = (r5_2 + r5_18); + i4_2 = (i5_2 + i5_18); + r4_18 = (r5_2 - r5_18); + i4_18 = (i5_2 - i5_18); + } + { + REAL r5_10, i5_10; + REAL r5_26, i5_26; + r5_10 = c_re(in[10]); + i5_10 = c_im(in[10]); + r5_26 = c_re(in[26]); + i5_26 = c_im(in[26]); + r4_10 = (r5_10 + r5_26); + i4_10 = (i5_10 + i5_26); + r4_26 = (r5_10 - r5_26); + i4_26 = (i5_10 - i5_26); + } + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_18 = (r4_2 - r4_10); + i3_18 = (i4_2 - i4_10); + r3_10 = (r4_18 + i4_26); + i3_10 = (i4_18 - r4_26); + r3_26 = (r4_18 - i4_26); + i3_26 = (i4_18 + r4_26); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + REAL r4_22, i4_22; + REAL r4_30, i4_30; + { + REAL r5_6, i5_6; + REAL r5_22, i5_22; + r5_6 = c_re(in[6]); + i5_6 = c_im(in[6]); + r5_22 = c_re(in[22]); + i5_22 = c_im(in[22]); + r4_6 = (r5_6 + r5_22); + i4_6 = (i5_6 + i5_22); + r4_22 = (r5_6 - r5_22); + i4_22 = (i5_6 - i5_22); + } + { + REAL r5_14, i5_14; + REAL r5_30, i5_30; + r5_14 = c_re(in[14]); + i5_14 = c_im(in[14]); + r5_30 = c_re(in[30]); + i5_30 = c_im(in[30]); + r4_14 = (r5_14 + r5_30); + i4_14 = (i5_14 + i5_30); + r4_30 = (r5_14 - r5_30); + i4_30 = (i5_14 - i5_30); + } + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_22 = (r4_6 - r4_14); + i3_22 = (i4_6 - i4_14); + r3_14 = (r4_22 + i4_30); + i3_14 = (i4_22 - r4_30); + r3_30 = (r4_22 - i4_30); + i3_30 = (i4_22 + r4_30); + 
} + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_18 = (r3_2 - r3_6); + i2_18 = (i3_2 - i3_6); + tmpr = (0.707106781187 * (r3_14 + i3_14)); + tmpi = (0.707106781187 * (i3_14 - r3_14)); + r2_6 = (r3_10 + tmpr); + i2_6 = (i3_10 + tmpi); + r2_22 = (r3_10 - tmpr); + i2_22 = (i3_10 - tmpi); + r2_10 = (r3_18 + i3_22); + i2_10 = (i3_18 - r3_22); + r2_26 = (r3_18 - i3_22); + i2_26 = (i3_18 + r3_22); + tmpr = (0.707106781187 * (i3_30 - r3_30)); + tmpi = (0.707106781187 * (r3_30 + i3_30)); + r2_14 = (r3_26 + tmpr); + i2_14 = (i3_26 - tmpi); + r2_30 = (r3_26 - tmpr); + i2_30 = (i3_26 + tmpi); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_16 = (r2_0 - r2_2); + i1_16 = (i2_0 - i2_2); + tmpr = ((0.923879532511 * r2_6) + (0.382683432365 * i2_6)); + tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_18 = (r2_4 - tmpr); + i1_18 = (i2_4 - tmpi); + tmpr = (0.707106781187 * (r2_10 + i2_10)); + tmpi = (0.707106781187 * (i2_10 - r2_10)); + r1_4 = (r2_8 + tmpr); + i1_4 = (i2_8 + tmpi); + r1_20 = (r2_8 - tmpr); + i1_20 = (i2_8 - tmpi); + tmpr = ((0.382683432365 * r2_14) + (0.923879532511 * i2_14)); + tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 + tmpi); + r1_22 = (r2_12 - tmpr); + i1_22 = (i2_12 - tmpi); + r1_8 = (r2_16 + i2_18); + i1_8 = (i2_16 - r2_18); + r1_24 = (r2_16 - i2_18); + i1_24 = (i2_16 + r2_18); + tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22)); + tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22)); + r1_10 = (r2_20 + tmpr); + i1_10 = (i2_20 - tmpi); + r1_26 = (r2_20 - tmpr); + i1_26 = (i2_20 + tmpi); + tmpr = (0.707106781187 * (i2_26 - r2_26)); + tmpi = (0.707106781187 * (r2_26 + i2_26)); + r1_12 = (r2_24 + tmpr); + i1_12 = (i2_24 - tmpi); + r1_28 = (r2_24 - tmpr); + i1_28 = (i2_24 + tmpi); + tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30)); + tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30)); + 
r1_14 = (r2_28 + tmpr); + i1_14 = (i2_28 - tmpi); + r1_30 = (r2_28 - tmpr); + i1_30 = (i2_28 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + REAL r2_17, i2_17; + REAL r2_19, i2_19; + REAL r2_21, i2_21; + REAL r2_23, i2_23; + REAL r2_25, i2_25; + REAL r2_27, i2_27; + REAL r2_29, i2_29; + REAL r2_31, i2_31; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + REAL r3_17, i3_17; + REAL r3_21, i3_21; + REAL r3_25, i3_25; + REAL r3_29, i3_29; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + REAL r4_17, i4_17; + REAL r4_25, i4_25; + { + REAL r5_1, i5_1; + REAL r5_17, i5_17; + r5_1 = c_re(in[1]); + i5_1 = c_im(in[1]); + r5_17 = c_re(in[17]); + i5_17 = c_im(in[17]); + r4_1 = (r5_1 + r5_17); + i4_1 = (i5_1 + i5_17); + r4_17 = (r5_1 - r5_17); + i4_17 = (i5_1 - i5_17); + } + { + REAL r5_9, i5_9; + REAL r5_25, i5_25; + r5_9 = c_re(in[9]); + i5_9 = c_im(in[9]); + r5_25 = c_re(in[25]); + i5_25 = c_im(in[25]); + r4_9 = (r5_9 + r5_25); + i4_9 = (i5_9 + i5_25); + r4_25 = (r5_9 - r5_25); + i4_25 = (i5_9 - i5_25); + } + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_17 = (r4_1 - r4_9); + i3_17 = (i4_1 - i4_9); + r3_9 = (r4_17 + i4_25); + i3_9 = (i4_17 - r4_25); + r3_25 = (r4_17 - i4_25); + i3_25 = (i4_17 + r4_25); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + REAL r4_21, i4_21; + REAL r4_29, i4_29; + { + REAL r5_5, i5_5; + REAL r5_21, i5_21; + r5_5 = c_re(in[5]); + i5_5 = c_im(in[5]); + r5_21 = c_re(in[21]); + i5_21 = c_im(in[21]); + r4_5 = (r5_5 + r5_21); + i4_5 = (i5_5 + i5_21); + r4_21 = (r5_5 - r5_21); + i4_21 = (i5_5 - i5_21); + } + { + REAL r5_13, i5_13; + REAL r5_29, i5_29; + r5_13 = c_re(in[13]); + i5_13 = c_im(in[13]); + r5_29 = c_re(in[29]); + i5_29 = c_im(in[29]); + r4_13 = (r5_13 + r5_29); + i4_13 = (i5_13 + i5_29); + r4_29 = (r5_13 - r5_29); + i4_29 = (i5_13 - i5_29); + } + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 
+ i4_13); + r3_21 = (r4_5 - r4_13); + i3_21 = (i4_5 - i4_13); + r3_13 = (r4_21 + i4_29); + i3_13 = (i4_21 - r4_29); + r3_29 = (r4_21 - i4_29); + i3_29 = (i4_21 + r4_29); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_17 = (r3_1 - r3_5); + i2_17 = (i3_1 - i3_5); + tmpr = (0.707106781187 * (r3_13 + i3_13)); + tmpi = (0.707106781187 * (i3_13 - r3_13)); + r2_5 = (r3_9 + tmpr); + i2_5 = (i3_9 + tmpi); + r2_21 = (r3_9 - tmpr); + i2_21 = (i3_9 - tmpi); + r2_9 = (r3_17 + i3_21); + i2_9 = (i3_17 - r3_21); + r2_25 = (r3_17 - i3_21); + i2_25 = (i3_17 + r3_21); + tmpr = (0.707106781187 * (i3_29 - r3_29)); + tmpi = (0.707106781187 * (r3_29 + i3_29)); + r2_13 = (r3_25 + tmpr); + i2_13 = (i3_25 - tmpi); + r2_29 = (r3_25 - tmpr); + i2_29 = (i3_25 + tmpi); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + REAL r3_19, i3_19; + REAL r3_23, i3_23; + REAL r3_27, i3_27; + REAL r3_31, i3_31; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + REAL r4_19, i4_19; + REAL r4_27, i4_27; + { + REAL r5_3, i5_3; + REAL r5_19, i5_19; + r5_3 = c_re(in[3]); + i5_3 = c_im(in[3]); + r5_19 = c_re(in[19]); + i5_19 = c_im(in[19]); + r4_3 = (r5_3 + r5_19); + i4_3 = (i5_3 + i5_19); + r4_19 = (r5_3 - r5_19); + i4_19 = (i5_3 - i5_19); + } + { + REAL r5_11, i5_11; + REAL r5_27, i5_27; + r5_11 = c_re(in[11]); + i5_11 = c_im(in[11]); + r5_27 = c_re(in[27]); + i5_27 = c_im(in[27]); + r4_11 = (r5_11 + r5_27); + i4_11 = (i5_11 + i5_27); + r4_27 = (r5_11 - r5_27); + i4_27 = (i5_11 - i5_27); + } + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_19 = (r4_3 - r4_11); + i3_19 = (i4_3 - i4_11); + r3_11 = (r4_19 + i4_27); + i3_11 = (i4_19 - r4_27); + r3_27 = (r4_19 - i4_27); + i3_27 = (i4_19 + r4_27); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + REAL r4_23, i4_23; + REAL r4_31, i4_31; + { + REAL r5_7, i5_7; + REAL r5_23, i5_23; + r5_7 = c_re(in[7]); + i5_7 = c_im(in[7]); + r5_23 = c_re(in[23]); + i5_23 = c_im(in[23]); + r4_7 = (r5_7 + r5_23); + i4_7 = (i5_7 + i5_23); 
+ r4_23 = (r5_7 - r5_23); + i4_23 = (i5_7 - i5_23); + } + { + REAL r5_15, i5_15; + REAL r5_31, i5_31; + r5_15 = c_re(in[15]); + i5_15 = c_im(in[15]); + r5_31 = c_re(in[31]); + i5_31 = c_im(in[31]); + r4_15 = (r5_15 + r5_31); + i4_15 = (i5_15 + i5_31); + r4_31 = (r5_15 - r5_31); + i4_31 = (i5_15 - i5_31); + } + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_23 = (r4_7 - r4_15); + i3_23 = (i4_7 - i4_15); + r3_15 = (r4_23 + i4_31); + i3_15 = (i4_23 - r4_31); + r3_31 = (r4_23 - i4_31); + i3_31 = (i4_23 + r4_31); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_19 = (r3_3 - r3_7); + i2_19 = (i3_3 - i3_7); + tmpr = (0.707106781187 * (r3_15 + i3_15)); + tmpi = (0.707106781187 * (i3_15 - r3_15)); + r2_7 = (r3_11 + tmpr); + i2_7 = (i3_11 + tmpi); + r2_23 = (r3_11 - tmpr); + i2_23 = (i3_11 - tmpi); + r2_11 = (r3_19 + i3_23); + i2_11 = (i3_19 - r3_23); + r2_27 = (r3_19 - i3_23); + i2_27 = (i3_19 + r3_23); + tmpr = (0.707106781187 * (i3_31 - r3_31)); + tmpi = (0.707106781187 * (r3_31 + i3_31)); + r2_15 = (r3_27 + tmpr); + i2_15 = (i3_27 - tmpi); + r2_31 = (r3_27 - tmpr); + i2_31 = (i3_27 + tmpi); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_17 = (r2_1 - r2_3); + i1_17 = (i2_1 - i2_3); + tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7)); + tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_19 = (r2_5 - tmpr); + i1_19 = (i2_5 - tmpi); + tmpr = (0.707106781187 * (r2_11 + i2_11)); + tmpi = (0.707106781187 * (i2_11 - r2_11)); + r1_5 = (r2_9 + tmpr); + i1_5 = (i2_9 + tmpi); + r1_21 = (r2_9 - tmpr); + i1_21 = (i2_9 - tmpi); + tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15)); + tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 + tmpi); + r1_23 = (r2_13 - tmpr); + i1_23 = (i2_13 - tmpi); + r1_9 = (r2_17 + i2_19); + i1_9 = (i2_17 - r2_19); + r1_25 = (r2_17 - i2_19); + i1_25 = (i2_17 + r2_19); + tmpr = ((0.923879532511 * i2_23) 
- (0.382683432365 * r2_23)); + tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23)); + r1_11 = (r2_21 + tmpr); + i1_11 = (i2_21 - tmpi); + r1_27 = (r2_21 - tmpr); + i1_27 = (i2_21 + tmpi); + tmpr = (0.707106781187 * (i2_27 - r2_27)); + tmpi = (0.707106781187 * (r2_27 + i2_27)); + r1_13 = (r2_25 + tmpr); + i1_13 = (i2_25 - tmpi); + r1_29 = (r2_25 - tmpr); + i1_29 = (i2_25 + tmpi); + tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31)); + tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31)); + r1_15 = (r2_29 + tmpr); + i1_15 = (i2_29 - tmpi); + r1_31 = (r2_29 - tmpr); + i1_31 = (i2_29 + tmpi); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[16]) = (r1_0 - r1_1); + c_im(out[16]) = (i1_0 - i1_1); + tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3)); + tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3)); + c_re(out[1]) = (r1_2 + tmpr); + c_im(out[1]) = (i1_2 + tmpi); + c_re(out[17]) = (r1_2 - tmpr); + c_im(out[17]) = (i1_2 - tmpi); + tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5)); + tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5)); + c_re(out[2]) = (r1_4 + tmpr); + c_im(out[2]) = (i1_4 + tmpi); + c_re(out[18]) = (r1_4 - tmpr); + c_im(out[18]) = (i1_4 - tmpi); + tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7)); + tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7)); + c_re(out[3]) = (r1_6 + tmpr); + c_im(out[3]) = (i1_6 + tmpi); + c_re(out[19]) = (r1_6 - tmpr); + c_im(out[19]) = (i1_6 - tmpi); + tmpr = (0.707106781187 * (r1_9 + i1_9)); + tmpi = (0.707106781187 * (i1_9 - r1_9)); + c_re(out[4]) = (r1_8 + tmpr); + c_im(out[4]) = (i1_8 + tmpi); + c_re(out[20]) = (r1_8 - tmpr); + c_im(out[20]) = (i1_8 - tmpi); + tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11)); + tmpi = ((0.55557023302 * i1_11) - (0.831469612303 * r1_11)); + c_re(out[5]) = (r1_10 + tmpr); + c_im(out[5]) = (i1_10 + tmpi); + c_re(out[21]) = (r1_10 - tmpr); + c_im(out[21]) = (i1_10 - tmpi); + tmpr 
= ((0.382683432365 * r1_13) + (0.923879532511 * i1_13)); + tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13)); + c_re(out[6]) = (r1_12 + tmpr); + c_im(out[6]) = (i1_12 + tmpi); + c_re(out[22]) = (r1_12 - tmpr); + c_im(out[22]) = (i1_12 - tmpi); + tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15)); + tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15)); + c_re(out[7]) = (r1_14 + tmpr); + c_im(out[7]) = (i1_14 + tmpi); + c_re(out[23]) = (r1_14 - tmpr); + c_im(out[23]) = (i1_14 - tmpi); + c_re(out[8]) = (r1_16 + i1_17); + c_im(out[8]) = (i1_16 - r1_17); + c_re(out[24]) = (r1_16 - i1_17); + c_im(out[24]) = (i1_16 + r1_17); + tmpr = ((0.980785280403 * i1_19) - (0.195090322016 * r1_19)); + tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19)); + c_re(out[9]) = (r1_18 + tmpr); + c_im(out[9]) = (i1_18 - tmpi); + c_re(out[25]) = (r1_18 - tmpr); + c_im(out[25]) = (i1_18 + tmpi); + tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21)); + tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21)); + c_re(out[10]) = (r1_20 + tmpr); + c_im(out[10]) = (i1_20 - tmpi); + c_re(out[26]) = (r1_20 - tmpr); + c_im(out[26]) = (i1_20 + tmpi); + tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23)); + tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23)); + c_re(out[11]) = (r1_22 + tmpr); + c_im(out[11]) = (i1_22 - tmpi); + c_re(out[27]) = (r1_22 - tmpr); + c_im(out[27]) = (i1_22 + tmpi); + tmpr = (0.707106781187 * (i1_25 - r1_25)); + tmpi = (0.707106781187 * (r1_25 + i1_25)); + c_re(out[12]) = (r1_24 + tmpr); + c_im(out[12]) = (i1_24 - tmpi); + c_re(out[28]) = (r1_24 - tmpr); + c_im(out[28]) = (i1_24 + tmpi); + tmpr = ((0.55557023302 * i1_27) - (0.831469612303 * r1_27)); + tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27)); + c_re(out[13]) = (r1_26 + tmpr); + c_im(out[13]) = (i1_26 - tmpi); + c_re(out[29]) = (r1_26 - tmpr); + c_im(out[29]) = (i1_26 + tmpi); + tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29)); + 
tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29)); + c_re(out[14]) = (r1_28 + tmpr); + c_im(out[14]) = (i1_28 - tmpi); + c_re(out[30]) = (r1_28 - tmpr); + c_im(out[30]) = (i1_28 + tmpi); + tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31)); + tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31)); + c_re(out[15]) = (r1_30 + tmpr); + c_im(out[15]) = (i1_30 - tmpi); + c_re(out[31]) = (r1_30 - tmpr); + c_im(out[31]) = (i1_30 + tmpi); + } +} +void fft_twiddle_32(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + REAL r1_16, i1_16; + REAL r1_17, i1_17; + REAL r1_18, i1_18; + REAL r1_19, i1_19; + REAL r1_20, i1_20; + REAL r1_21, i1_21; + REAL r1_22, i1_22; + REAL r1_23, i1_23; + REAL r1_24, i1_24; + REAL r1_25, i1_25; + REAL r1_26, i1_26; + REAL r1_27, i1_27; + REAL r1_28, i1_28; + REAL r1_29, i1_29; + REAL r1_30, i1_30; + REAL r1_31, i1_31; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + REAL r2_16, i2_16; + REAL r2_18, i2_18; + REAL r2_20, i2_20; + REAL r2_22, i2_22; + REAL r2_24, i2_24; + REAL r2_26, i2_26; + REAL r2_28, i2_28; + REAL r2_30, i2_30; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + REAL r3_16, i3_16; + REAL r3_20, i3_20; + REAL r3_24, i3_24; + REAL r3_28, i3_28; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + REAL r4_16, i4_16; + REAL r4_24, i4_24; + { + REAL 
r5_0, i5_0; + REAL r5_16, i5_16; + r5_0 = c_re(jp[0 * m]); + i5_0 = c_im(jp[0 * m]); + wr = c_re(W[16 * l1]); + wi = c_im(W[16 * l1]); + tmpr = c_re(jp[16 * m]); + tmpi = c_im(jp[16 * m]); + r5_16 = ((wr * tmpr) - (wi * tmpi)); + i5_16 = ((wi * tmpr) + (wr * tmpi)); + r4_0 = (r5_0 + r5_16); + i4_0 = (i5_0 + i5_16); + r4_16 = (r5_0 - r5_16); + i4_16 = (i5_0 - i5_16); + } + { + REAL r5_8, i5_8; + REAL r5_24, i5_24; + wr = c_re(W[8 * l1]); + wi = c_im(W[8 * l1]); + tmpr = c_re(jp[8 * m]); + tmpi = c_im(jp[8 * m]); + r5_8 = ((wr * tmpr) - (wi * tmpi)); + i5_8 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[24 * l1]); + wi = c_im(W[24 * l1]); + tmpr = c_re(jp[24 * m]); + tmpi = c_im(jp[24 * m]); + r5_24 = ((wr * tmpr) - (wi * tmpi)); + i5_24 = ((wi * tmpr) + (wr * tmpi)); + r4_8 = (r5_8 + r5_24); + i4_8 = (i5_8 + i5_24); + r4_24 = (r5_8 - r5_24); + i4_24 = (i5_8 - i5_24); + } + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_16 = (r4_0 - r4_8); + i3_16 = (i4_0 - i4_8); + r3_8 = (r4_16 + i4_24); + i3_8 = (i4_16 - r4_24); + r3_24 = (r4_16 - i4_24); + i3_24 = (i4_16 + r4_24); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + REAL r4_20, i4_20; + REAL r4_28, i4_28; + { + REAL r5_4, i5_4; + REAL r5_20, i5_20; + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r5_4 = ((wr * tmpr) - (wi * tmpi)); + i5_4 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[20 * l1]); + wi = c_im(W[20 * l1]); + tmpr = c_re(jp[20 * m]); + tmpi = c_im(jp[20 * m]); + r5_20 = ((wr * tmpr) - (wi * tmpi)); + i5_20 = ((wi * tmpr) + (wr * tmpi)); + r4_4 = (r5_4 + r5_20); + i4_4 = (i5_4 + i5_20); + r4_20 = (r5_4 - r5_20); + i4_20 = (i5_4 - i5_20); + } + { + REAL r5_12, i5_12; + REAL r5_28, i5_28; + wr = c_re(W[12 * l1]); + wi = c_im(W[12 * l1]); + tmpr = c_re(jp[12 * m]); + tmpi = c_im(jp[12 * m]); + r5_12 = ((wr * tmpr) - (wi * tmpi)); + i5_12 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[28 * l1]); + wi = c_im(W[28 * l1]); + tmpr = c_re(jp[28 * m]); + tmpi = 
c_im(jp[28 * m]); + r5_28 = ((wr * tmpr) - (wi * tmpi)); + i5_28 = ((wi * tmpr) + (wr * tmpi)); + r4_12 = (r5_12 + r5_28); + i4_12 = (i5_12 + i5_28); + r4_28 = (r5_12 - r5_28); + i4_28 = (i5_12 - i5_28); + } + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_20 = (r4_4 - r4_12); + i3_20 = (i4_4 - i4_12); + r3_12 = (r4_20 + i4_28); + i3_12 = (i4_20 - r4_28); + r3_28 = (r4_20 - i4_28); + i3_28 = (i4_20 + r4_28); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_16 = (r3_0 - r3_4); + i2_16 = (i3_0 - i3_4); + tmpr = (0.707106781187 * (r3_12 + i3_12)); + tmpi = (0.707106781187 * (i3_12 - r3_12)); + r2_4 = (r3_8 + tmpr); + i2_4 = (i3_8 + tmpi); + r2_20 = (r3_8 - tmpr); + i2_20 = (i3_8 - tmpi); + r2_8 = (r3_16 + i3_20); + i2_8 = (i3_16 - r3_20); + r2_24 = (r3_16 - i3_20); + i2_24 = (i3_16 + r3_20); + tmpr = (0.707106781187 * (i3_28 - r3_28)); + tmpi = (0.707106781187 * (r3_28 + i3_28)); + r2_12 = (r3_24 + tmpr); + i2_12 = (i3_24 - tmpi); + r2_28 = (r3_24 - tmpr); + i2_28 = (i3_24 + tmpi); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + REAL r3_18, i3_18; + REAL r3_22, i3_22; + REAL r3_26, i3_26; + REAL r3_30, i3_30; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + REAL r4_18, i4_18; + REAL r4_26, i4_26; + { + REAL r5_2, i5_2; + REAL r5_18, i5_18; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r5_2 = ((wr * tmpr) - (wi * tmpi)); + i5_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[18 * l1]); + wi = c_im(W[18 * l1]); + tmpr = c_re(jp[18 * m]); + tmpi = c_im(jp[18 * m]); + r5_18 = ((wr * tmpr) - (wi * tmpi)); + i5_18 = ((wi * tmpr) + (wr * tmpi)); + r4_2 = (r5_2 + r5_18); + i4_2 = (i5_2 + i5_18); + r4_18 = (r5_2 - r5_18); + i4_18 = (i5_2 - i5_18); + } + { + REAL r5_10, i5_10; + REAL r5_26, i5_26; + wr = c_re(W[10 * l1]); + wi = c_im(W[10 * l1]); + tmpr = c_re(jp[10 * m]); + tmpi = c_im(jp[10 * m]); + r5_10 = ((wr * tmpr) - (wi * tmpi)); + i5_10 = ((wi * tmpr) + (wr * 
tmpi)); + wr = c_re(W[26 * l1]); + wi = c_im(W[26 * l1]); + tmpr = c_re(jp[26 * m]); + tmpi = c_im(jp[26 * m]); + r5_26 = ((wr * tmpr) - (wi * tmpi)); + i5_26 = ((wi * tmpr) + (wr * tmpi)); + r4_10 = (r5_10 + r5_26); + i4_10 = (i5_10 + i5_26); + r4_26 = (r5_10 - r5_26); + i4_26 = (i5_10 - i5_26); + } + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_18 = (r4_2 - r4_10); + i3_18 = (i4_2 - i4_10); + r3_10 = (r4_18 + i4_26); + i3_10 = (i4_18 - r4_26); + r3_26 = (r4_18 - i4_26); + i3_26 = (i4_18 + r4_26); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + REAL r4_22, i4_22; + REAL r4_30, i4_30; + { + REAL r5_6, i5_6; + REAL r5_22, i5_22; + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r5_6 = ((wr * tmpr) - (wi * tmpi)); + i5_6 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[22 * l1]); + wi = c_im(W[22 * l1]); + tmpr = c_re(jp[22 * m]); + tmpi = c_im(jp[22 * m]); + r5_22 = ((wr * tmpr) - (wi * tmpi)); + i5_22 = ((wi * tmpr) + (wr * tmpi)); + r4_6 = (r5_6 + r5_22); + i4_6 = (i5_6 + i5_22); + r4_22 = (r5_6 - r5_22); + i4_22 = (i5_6 - i5_22); + } + { + REAL r5_14, i5_14; + REAL r5_30, i5_30; + wr = c_re(W[14 * l1]); + wi = c_im(W[14 * l1]); + tmpr = c_re(jp[14 * m]); + tmpi = c_im(jp[14 * m]); + r5_14 = ((wr * tmpr) - (wi * tmpi)); + i5_14 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[30 * l1]); + wi = c_im(W[30 * l1]); + tmpr = c_re(jp[30 * m]); + tmpi = c_im(jp[30 * m]); + r5_30 = ((wr * tmpr) - (wi * tmpi)); + i5_30 = ((wi * tmpr) + (wr * tmpi)); + r4_14 = (r5_14 + r5_30); + i4_14 = (i5_14 + i5_30); + r4_30 = (r5_14 - r5_30); + i4_30 = (i5_14 - i5_30); + } + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_22 = (r4_6 - r4_14); + i3_22 = (i4_6 - i4_14); + r3_14 = (r4_22 + i4_30); + i3_14 = (i4_22 - r4_30); + r3_30 = (r4_22 - i4_30); + i3_30 = (i4_22 + r4_30); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_18 = (r3_2 - r3_6); + i2_18 = (i3_2 - i3_6); + tmpr = (0.707106781187 * (r3_14 + i3_14)); + tmpi = 
(0.707106781187 * (i3_14 - r3_14)); + r2_6 = (r3_10 + tmpr); + i2_6 = (i3_10 + tmpi); + r2_22 = (r3_10 - tmpr); + i2_22 = (i3_10 - tmpi); + r2_10 = (r3_18 + i3_22); + i2_10 = (i3_18 - r3_22); + r2_26 = (r3_18 - i3_22); + i2_26 = (i3_18 + r3_22); + tmpr = (0.707106781187 * (i3_30 - r3_30)); + tmpi = (0.707106781187 * (r3_30 + i3_30)); + r2_14 = (r3_26 + tmpr); + i2_14 = (i3_26 - tmpi); + r2_30 = (r3_26 - tmpr); + i2_30 = (i3_26 + tmpi); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_16 = (r2_0 - r2_2); + i1_16 = (i2_0 - i2_2); + tmpr = ((0.923879532511 * r2_6) + (0.382683432365 * i2_6)); + tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_18 = (r2_4 - tmpr); + i1_18 = (i2_4 - tmpi); + tmpr = (0.707106781187 * (r2_10 + i2_10)); + tmpi = (0.707106781187 * (i2_10 - r2_10)); + r1_4 = (r2_8 + tmpr); + i1_4 = (i2_8 + tmpi); + r1_20 = (r2_8 - tmpr); + i1_20 = (i2_8 - tmpi); + tmpr = ((0.382683432365 * r2_14) + (0.923879532511 * i2_14)); + tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 + tmpi); + r1_22 = (r2_12 - tmpr); + i1_22 = (i2_12 - tmpi); + r1_8 = (r2_16 + i2_18); + i1_8 = (i2_16 - r2_18); + r1_24 = (r2_16 - i2_18); + i1_24 = (i2_16 + r2_18); + tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22)); + tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22)); + r1_10 = (r2_20 + tmpr); + i1_10 = (i2_20 - tmpi); + r1_26 = (r2_20 - tmpr); + i1_26 = (i2_20 + tmpi); + tmpr = (0.707106781187 * (i2_26 - r2_26)); + tmpi = (0.707106781187 * (r2_26 + i2_26)); + r1_12 = (r2_24 + tmpr); + i1_12 = (i2_24 - tmpi); + r1_28 = (r2_24 - tmpr); + i1_28 = (i2_24 + tmpi); + tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30)); + tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30)); + r1_14 = (r2_28 + tmpr); + i1_14 = (i2_28 - tmpi); + r1_30 = (r2_28 - tmpr); + i1_30 = (i2_28 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL 
r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + REAL r2_17, i2_17; + REAL r2_19, i2_19; + REAL r2_21, i2_21; + REAL r2_23, i2_23; + REAL r2_25, i2_25; + REAL r2_27, i2_27; + REAL r2_29, i2_29; + REAL r2_31, i2_31; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + REAL r3_17, i3_17; + REAL r3_21, i3_21; + REAL r3_25, i3_25; + REAL r3_29, i3_29; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + REAL r4_17, i4_17; + REAL r4_25, i4_25; + { + REAL r5_1, i5_1; + REAL r5_17, i5_17; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r5_1 = ((wr * tmpr) - (wi * tmpi)); + i5_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[17 * l1]); + wi = c_im(W[17 * l1]); + tmpr = c_re(jp[17 * m]); + tmpi = c_im(jp[17 * m]); + r5_17 = ((wr * tmpr) - (wi * tmpi)); + i5_17 = ((wi * tmpr) + (wr * tmpi)); + r4_1 = (r5_1 + r5_17); + i4_1 = (i5_1 + i5_17); + r4_17 = (r5_1 - r5_17); + i4_17 = (i5_1 - i5_17); + } + { + REAL r5_9, i5_9; + REAL r5_25, i5_25; + wr = c_re(W[9 * l1]); + wi = c_im(W[9 * l1]); + tmpr = c_re(jp[9 * m]); + tmpi = c_im(jp[9 * m]); + r5_9 = ((wr * tmpr) - (wi * tmpi)); + i5_9 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[25 * l1]); + wi = c_im(W[25 * l1]); + tmpr = c_re(jp[25 * m]); + tmpi = c_im(jp[25 * m]); + r5_25 = ((wr * tmpr) - (wi * tmpi)); + i5_25 = ((wi * tmpr) + (wr * tmpi)); + r4_9 = (r5_9 + r5_25); + i4_9 = (i5_9 + i5_25); + r4_25 = (r5_9 - r5_25); + i4_25 = (i5_9 - i5_25); + } + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_17 = (r4_1 - r4_9); + i3_17 = (i4_1 - i4_9); + r3_9 = (r4_17 + i4_25); + i3_9 = (i4_17 - r4_25); + r3_25 = (r4_17 - i4_25); + i3_25 = (i4_17 + r4_25); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + REAL r4_21, i4_21; + REAL r4_29, i4_29; + { + REAL r5_5, i5_5; + REAL r5_21, i5_21; + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r5_5 = ((wr 
* tmpr) - (wi * tmpi)); + i5_5 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[21 * l1]); + wi = c_im(W[21 * l1]); + tmpr = c_re(jp[21 * m]); + tmpi = c_im(jp[21 * m]); + r5_21 = ((wr * tmpr) - (wi * tmpi)); + i5_21 = ((wi * tmpr) + (wr * tmpi)); + r4_5 = (r5_5 + r5_21); + i4_5 = (i5_5 + i5_21); + r4_21 = (r5_5 - r5_21); + i4_21 = (i5_5 - i5_21); + } + { + REAL r5_13, i5_13; + REAL r5_29, i5_29; + wr = c_re(W[13 * l1]); + wi = c_im(W[13 * l1]); + tmpr = c_re(jp[13 * m]); + tmpi = c_im(jp[13 * m]); + r5_13 = ((wr * tmpr) - (wi * tmpi)); + i5_13 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[29 * l1]); + wi = c_im(W[29 * l1]); + tmpr = c_re(jp[29 * m]); + tmpi = c_im(jp[29 * m]); + r5_29 = ((wr * tmpr) - (wi * tmpi)); + i5_29 = ((wi * tmpr) + (wr * tmpi)); + r4_13 = (r5_13 + r5_29); + i4_13 = (i5_13 + i5_29); + r4_29 = (r5_13 - r5_29); + i4_29 = (i5_13 - i5_29); + } + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_21 = (r4_5 - r4_13); + i3_21 = (i4_5 - i4_13); + r3_13 = (r4_21 + i4_29); + i3_13 = (i4_21 - r4_29); + r3_29 = (r4_21 - i4_29); + i3_29 = (i4_21 + r4_29); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_17 = (r3_1 - r3_5); + i2_17 = (i3_1 - i3_5); + tmpr = (0.707106781187 * (r3_13 + i3_13)); + tmpi = (0.707106781187 * (i3_13 - r3_13)); + r2_5 = (r3_9 + tmpr); + i2_5 = (i3_9 + tmpi); + r2_21 = (r3_9 - tmpr); + i2_21 = (i3_9 - tmpi); + r2_9 = (r3_17 + i3_21); + i2_9 = (i3_17 - r3_21); + r2_25 = (r3_17 - i3_21); + i2_25 = (i3_17 + r3_21); + tmpr = (0.707106781187 * (i3_29 - r3_29)); + tmpi = (0.707106781187 * (r3_29 + i3_29)); + r2_13 = (r3_25 + tmpr); + i2_13 = (i3_25 - tmpi); + r2_29 = (r3_25 - tmpr); + i2_29 = (i3_25 + tmpi); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + REAL r3_19, i3_19; + REAL r3_23, i3_23; + REAL r3_27, i3_27; + REAL r3_31, i3_31; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + REAL r4_19, i4_19; + REAL r4_27, i4_27; + { + REAL r5_3, i5_3; + REAL r5_19, i5_19; + wr = c_re(W[3 * l1]); + 
wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r5_3 = ((wr * tmpr) - (wi * tmpi)); + i5_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[19 * l1]); + wi = c_im(W[19 * l1]); + tmpr = c_re(jp[19 * m]); + tmpi = c_im(jp[19 * m]); + r5_19 = ((wr * tmpr) - (wi * tmpi)); + i5_19 = ((wi * tmpr) + (wr * tmpi)); + r4_3 = (r5_3 + r5_19); + i4_3 = (i5_3 + i5_19); + r4_19 = (r5_3 - r5_19); + i4_19 = (i5_3 - i5_19); + } + { + REAL r5_11, i5_11; + REAL r5_27, i5_27; + wr = c_re(W[11 * l1]); + wi = c_im(W[11 * l1]); + tmpr = c_re(jp[11 * m]); + tmpi = c_im(jp[11 * m]); + r5_11 = ((wr * tmpr) - (wi * tmpi)); + i5_11 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[27 * l1]); + wi = c_im(W[27 * l1]); + tmpr = c_re(jp[27 * m]); + tmpi = c_im(jp[27 * m]); + r5_27 = ((wr * tmpr) - (wi * tmpi)); + i5_27 = ((wi * tmpr) + (wr * tmpi)); + r4_11 = (r5_11 + r5_27); + i4_11 = (i5_11 + i5_27); + r4_27 = (r5_11 - r5_27); + i4_27 = (i5_11 - i5_27); + } + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_19 = (r4_3 - r4_11); + i3_19 = (i4_3 - i4_11); + r3_11 = (r4_19 + i4_27); + i3_11 = (i4_19 - r4_27); + r3_27 = (r4_19 - i4_27); + i3_27 = (i4_19 + r4_27); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + REAL r4_23, i4_23; + REAL r4_31, i4_31; + { + REAL r5_7, i5_7; + REAL r5_23, i5_23; + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r5_7 = ((wr * tmpr) - (wi * tmpi)); + i5_7 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[23 * l1]); + wi = c_im(W[23 * l1]); + tmpr = c_re(jp[23 * m]); + tmpi = c_im(jp[23 * m]); + r5_23 = ((wr * tmpr) - (wi * tmpi)); + i5_23 = ((wi * tmpr) + (wr * tmpi)); + r4_7 = (r5_7 + r5_23); + i4_7 = (i5_7 + i5_23); + r4_23 = (r5_7 - r5_23); + i4_23 = (i5_7 - i5_23); + } + { + REAL r5_15, i5_15; + REAL r5_31, i5_31; + wr = c_re(W[15 * l1]); + wi = c_im(W[15 * l1]); + tmpr = c_re(jp[15 * m]); + tmpi = c_im(jp[15 * m]); + r5_15 = ((wr * tmpr) - (wi * tmpi)); + i5_15 = ((wi * tmpr) + (wr * tmpi)); + 
wr = c_re(W[31 * l1]); + wi = c_im(W[31 * l1]); + tmpr = c_re(jp[31 * m]); + tmpi = c_im(jp[31 * m]); + r5_31 = ((wr * tmpr) - (wi * tmpi)); + i5_31 = ((wi * tmpr) + (wr * tmpi)); + r4_15 = (r5_15 + r5_31); + i4_15 = (i5_15 + i5_31); + r4_31 = (r5_15 - r5_31); + i4_31 = (i5_15 - i5_31); + } + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_23 = (r4_7 - r4_15); + i3_23 = (i4_7 - i4_15); + r3_15 = (r4_23 + i4_31); + i3_15 = (i4_23 - r4_31); + r3_31 = (r4_23 - i4_31); + i3_31 = (i4_23 + r4_31); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_19 = (r3_3 - r3_7); + i2_19 = (i3_3 - i3_7); + tmpr = (0.707106781187 * (r3_15 + i3_15)); + tmpi = (0.707106781187 * (i3_15 - r3_15)); + r2_7 = (r3_11 + tmpr); + i2_7 = (i3_11 + tmpi); + r2_23 = (r3_11 - tmpr); + i2_23 = (i3_11 - tmpi); + r2_11 = (r3_19 + i3_23); + i2_11 = (i3_19 - r3_23); + r2_27 = (r3_19 - i3_23); + i2_27 = (i3_19 + r3_23); + tmpr = (0.707106781187 * (i3_31 - r3_31)); + tmpi = (0.707106781187 * (r3_31 + i3_31)); + r2_15 = (r3_27 + tmpr); + i2_15 = (i3_27 - tmpi); + r2_31 = (r3_27 - tmpr); + i2_31 = (i3_27 + tmpi); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_17 = (r2_1 - r2_3); + i1_17 = (i2_1 - i2_3); + tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7)); + tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_19 = (r2_5 - tmpr); + i1_19 = (i2_5 - tmpi); + tmpr = (0.707106781187 * (r2_11 + i2_11)); + tmpi = (0.707106781187 * (i2_11 - r2_11)); + r1_5 = (r2_9 + tmpr); + i1_5 = (i2_9 + tmpi); + r1_21 = (r2_9 - tmpr); + i1_21 = (i2_9 - tmpi); + tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15)); + tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 + tmpi); + r1_23 = (r2_13 - tmpr); + i1_23 = (i2_13 - tmpi); + r1_9 = (r2_17 + i2_19); + i1_9 = (i2_17 - r2_19); + r1_25 = (r2_17 - i2_19); + i1_25 = (i2_17 + r2_19); + tmpr = ((0.923879532511 * i2_23) - (0.382683432365 
* r2_23)); + tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23)); + r1_11 = (r2_21 + tmpr); + i1_11 = (i2_21 - tmpi); + r1_27 = (r2_21 - tmpr); + i1_27 = (i2_21 + tmpi); + tmpr = (0.707106781187 * (i2_27 - r2_27)); + tmpi = (0.707106781187 * (r2_27 + i2_27)); + r1_13 = (r2_25 + tmpr); + i1_13 = (i2_25 - tmpi); + r1_29 = (r2_25 - tmpr); + i1_29 = (i2_25 + tmpi); + tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31)); + tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31)); + r1_15 = (r2_29 + tmpr); + i1_15 = (i2_29 - tmpi); + r1_31 = (r2_29 - tmpr); + i1_31 = (i2_29 + tmpi); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[16 * m]) = (r1_0 - r1_1); + c_im(kp[16 * m]) = (i1_0 - i1_1); + tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3)); + tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[17 * m]) = (r1_2 - tmpr); + c_im(kp[17 * m]) = (i1_2 - tmpi); + tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5)); + tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5)); + c_re(kp[2 * m]) = (r1_4 + tmpr); + c_im(kp[2 * m]) = (i1_4 + tmpi); + c_re(kp[18 * m]) = (r1_4 - tmpr); + c_im(kp[18 * m]) = (i1_4 - tmpi); + tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7)); + tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 + tmpi); + c_re(kp[19 * m]) = (r1_6 - tmpr); + c_im(kp[19 * m]) = (i1_6 - tmpi); + tmpr = (0.707106781187 * (r1_9 + i1_9)); + tmpi = (0.707106781187 * (i1_9 - r1_9)); + c_re(kp[4 * m]) = (r1_8 + tmpr); + c_im(kp[4 * m]) = (i1_8 + tmpi); + c_re(kp[20 * m]) = (r1_8 - tmpr); + c_im(kp[20 * m]) = (i1_8 - tmpi); + tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11)); + tmpi = ((0.55557023302 * i1_11) - (0.831469612303 * r1_11)); + c_re(kp[5 * m]) = (r1_10 + tmpr); + c_im(kp[5 * m]) = (i1_10 + tmpi); + c_re(kp[21 * m]) = 
(r1_10 - tmpr); + c_im(kp[21 * m]) = (i1_10 - tmpi); + tmpr = ((0.382683432365 * r1_13) + (0.923879532511 * i1_13)); + tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13)); + c_re(kp[6 * m]) = (r1_12 + tmpr); + c_im(kp[6 * m]) = (i1_12 + tmpi); + c_re(kp[22 * m]) = (r1_12 - tmpr); + c_im(kp[22 * m]) = (i1_12 - tmpi); + tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15)); + tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15)); + c_re(kp[7 * m]) = (r1_14 + tmpr); + c_im(kp[7 * m]) = (i1_14 + tmpi); + c_re(kp[23 * m]) = (r1_14 - tmpr); + c_im(kp[23 * m]) = (i1_14 - tmpi); + c_re(kp[8 * m]) = (r1_16 + i1_17); + c_im(kp[8 * m]) = (i1_16 - r1_17); + c_re(kp[24 * m]) = (r1_16 - i1_17); + c_im(kp[24 * m]) = (i1_16 + r1_17); + tmpr = ((0.980785280403 * i1_19) - (0.195090322016 * r1_19)); + tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19)); + c_re(kp[9 * m]) = (r1_18 + tmpr); + c_im(kp[9 * m]) = (i1_18 - tmpi); + c_re(kp[25 * m]) = (r1_18 - tmpr); + c_im(kp[25 * m]) = (i1_18 + tmpi); + tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21)); + tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21)); + c_re(kp[10 * m]) = (r1_20 + tmpr); + c_im(kp[10 * m]) = (i1_20 - tmpi); + c_re(kp[26 * m]) = (r1_20 - tmpr); + c_im(kp[26 * m]) = (i1_20 + tmpi); + tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23)); + tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23)); + c_re(kp[11 * m]) = (r1_22 + tmpr); + c_im(kp[11 * m]) = (i1_22 - tmpi); + c_re(kp[27 * m]) = (r1_22 - tmpr); + c_im(kp[27 * m]) = (i1_22 + tmpi); + tmpr = (0.707106781187 * (i1_25 - r1_25)); + tmpi = (0.707106781187 * (r1_25 + i1_25)); + c_re(kp[12 * m]) = (r1_24 + tmpr); + c_im(kp[12 * m]) = (i1_24 - tmpi); + c_re(kp[28 * m]) = (r1_24 - tmpr); + c_im(kp[28 * m]) = (i1_24 + tmpi); + tmpr = ((0.55557023302 * i1_27) - (0.831469612303 * r1_27)); + tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27)); + c_re(kp[13 * m]) = (r1_26 + tmpr); + c_im(kp[13 * m]) = 
(i1_26 - tmpi); + c_re(kp[29 * m]) = (r1_26 - tmpr); + c_im(kp[29 * m]) = (i1_26 + tmpi); + tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29)); + tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29)); + c_re(kp[14 * m]) = (r1_28 + tmpr); + c_im(kp[14 * m]) = (i1_28 - tmpi); + c_re(kp[30 * m]) = (r1_28 - tmpr); + c_im(kp[30 * m]) = (i1_28 + tmpi); + tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31)); + tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31)); + c_re(kp[15 * m]) = (r1_30 + tmpr); + c_im(kp[15 * m]) = (i1_30 - tmpi); + c_re(kp[31 * m]) = (r1_30 - tmpr); + c_im(kp[31 * m]) = (i1_30 + tmpi); + } + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_twiddle_32(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task + fft_twiddle_32(ab, b, in, out, W, nW, nWdn, m); +#else + #pragma omp task untied + fft_twiddle_32(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_32(ab, b, in, out, W, nW, nWdn, m); +#endif + #pragma omp taskwait + } +} +void fft_twiddle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + REAL r1_16, i1_16; + REAL r1_17, i1_17; + REAL r1_18, i1_18; + REAL r1_19, i1_19; + REAL r1_20, i1_20; + REAL r1_21, i1_21; + REAL r1_22, i1_22; + REAL r1_23, i1_23; + REAL r1_24, i1_24; + REAL r1_25, i1_25; + REAL r1_26, i1_26; + REAL r1_27, i1_27; + REAL r1_28, i1_28; + REAL r1_29, i1_29; + REAL r1_30, i1_30; + REAL r1_31, i1_31; + { + 
REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + REAL r2_16, i2_16; + REAL r2_18, i2_18; + REAL r2_20, i2_20; + REAL r2_22, i2_22; + REAL r2_24, i2_24; + REAL r2_26, i2_26; + REAL r2_28, i2_28; + REAL r2_30, i2_30; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + REAL r3_16, i3_16; + REAL r3_20, i3_20; + REAL r3_24, i3_24; + REAL r3_28, i3_28; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + REAL r4_16, i4_16; + REAL r4_24, i4_24; + { + REAL r5_0, i5_0; + REAL r5_16, i5_16; + r5_0 = c_re(jp[0 * m]); + i5_0 = c_im(jp[0 * m]); + wr = c_re(W[16 * l1]); + wi = c_im(W[16 * l1]); + tmpr = c_re(jp[16 * m]); + tmpi = c_im(jp[16 * m]); + r5_16 = ((wr * tmpr) - (wi * tmpi)); + i5_16 = ((wi * tmpr) + (wr * tmpi)); + r4_0 = (r5_0 + r5_16); + i4_0 = (i5_0 + i5_16); + r4_16 = (r5_0 - r5_16); + i4_16 = (i5_0 - i5_16); + } + { + REAL r5_8, i5_8; + REAL r5_24, i5_24; + wr = c_re(W[8 * l1]); + wi = c_im(W[8 * l1]); + tmpr = c_re(jp[8 * m]); + tmpi = c_im(jp[8 * m]); + r5_8 = ((wr * tmpr) - (wi * tmpi)); + i5_8 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[24 * l1]); + wi = c_im(W[24 * l1]); + tmpr = c_re(jp[24 * m]); + tmpi = c_im(jp[24 * m]); + r5_24 = ((wr * tmpr) - (wi * tmpi)); + i5_24 = ((wi * tmpr) + (wr * tmpi)); + r4_8 = (r5_8 + r5_24); + i4_8 = (i5_8 + i5_24); + r4_24 = (r5_8 - r5_24); + i4_24 = (i5_8 - i5_24); + } + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_16 = (r4_0 - r4_8); + i3_16 = (i4_0 - i4_8); + r3_8 = (r4_16 + i4_24); + i3_8 = (i4_16 - r4_24); + r3_24 = (r4_16 - i4_24); + i3_24 = (i4_16 + r4_24); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + REAL r4_20, i4_20; + REAL r4_28, i4_28; + { + REAL r5_4, i5_4; + REAL r5_20, i5_20; + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r5_4 = ((wr * tmpr) - (wi * tmpi)); + i5_4 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[20 * 
l1]); + wi = c_im(W[20 * l1]); + tmpr = c_re(jp[20 * m]); + tmpi = c_im(jp[20 * m]); + r5_20 = ((wr * tmpr) - (wi * tmpi)); + i5_20 = ((wi * tmpr) + (wr * tmpi)); + r4_4 = (r5_4 + r5_20); + i4_4 = (i5_4 + i5_20); + r4_20 = (r5_4 - r5_20); + i4_20 = (i5_4 - i5_20); + } + { + REAL r5_12, i5_12; + REAL r5_28, i5_28; + wr = c_re(W[12 * l1]); + wi = c_im(W[12 * l1]); + tmpr = c_re(jp[12 * m]); + tmpi = c_im(jp[12 * m]); + r5_12 = ((wr * tmpr) - (wi * tmpi)); + i5_12 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[28 * l1]); + wi = c_im(W[28 * l1]); + tmpr = c_re(jp[28 * m]); + tmpi = c_im(jp[28 * m]); + r5_28 = ((wr * tmpr) - (wi * tmpi)); + i5_28 = ((wi * tmpr) + (wr * tmpi)); + r4_12 = (r5_12 + r5_28); + i4_12 = (i5_12 + i5_28); + r4_28 = (r5_12 - r5_28); + i4_28 = (i5_12 - i5_28); + } + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_20 = (r4_4 - r4_12); + i3_20 = (i4_4 - i4_12); + r3_12 = (r4_20 + i4_28); + i3_12 = (i4_20 - r4_28); + r3_28 = (r4_20 - i4_28); + i3_28 = (i4_20 + r4_28); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_16 = (r3_0 - r3_4); + i2_16 = (i3_0 - i3_4); + tmpr = (0.707106781187 * (r3_12 + i3_12)); + tmpi = (0.707106781187 * (i3_12 - r3_12)); + r2_4 = (r3_8 + tmpr); + i2_4 = (i3_8 + tmpi); + r2_20 = (r3_8 - tmpr); + i2_20 = (i3_8 - tmpi); + r2_8 = (r3_16 + i3_20); + i2_8 = (i3_16 - r3_20); + r2_24 = (r3_16 - i3_20); + i2_24 = (i3_16 + r3_20); + tmpr = (0.707106781187 * (i3_28 - r3_28)); + tmpi = (0.707106781187 * (r3_28 + i3_28)); + r2_12 = (r3_24 + tmpr); + i2_12 = (i3_24 - tmpi); + r2_28 = (r3_24 - tmpr); + i2_28 = (i3_24 + tmpi); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + REAL r3_18, i3_18; + REAL r3_22, i3_22; + REAL r3_26, i3_26; + REAL r3_30, i3_30; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + REAL r4_18, i4_18; + REAL r4_26, i4_26; + { + REAL r5_2, i5_2; + REAL r5_18, i5_18; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r5_2 
= ((wr * tmpr) - (wi * tmpi)); + i5_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[18 * l1]); + wi = c_im(W[18 * l1]); + tmpr = c_re(jp[18 * m]); + tmpi = c_im(jp[18 * m]); + r5_18 = ((wr * tmpr) - (wi * tmpi)); + i5_18 = ((wi * tmpr) + (wr * tmpi)); + r4_2 = (r5_2 + r5_18); + i4_2 = (i5_2 + i5_18); + r4_18 = (r5_2 - r5_18); + i4_18 = (i5_2 - i5_18); + } + { + REAL r5_10, i5_10; + REAL r5_26, i5_26; + wr = c_re(W[10 * l1]); + wi = c_im(W[10 * l1]); + tmpr = c_re(jp[10 * m]); + tmpi = c_im(jp[10 * m]); + r5_10 = ((wr * tmpr) - (wi * tmpi)); + i5_10 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[26 * l1]); + wi = c_im(W[26 * l1]); + tmpr = c_re(jp[26 * m]); + tmpi = c_im(jp[26 * m]); + r5_26 = ((wr * tmpr) - (wi * tmpi)); + i5_26 = ((wi * tmpr) + (wr * tmpi)); + r4_10 = (r5_10 + r5_26); + i4_10 = (i5_10 + i5_26); + r4_26 = (r5_10 - r5_26); + i4_26 = (i5_10 - i5_26); + } + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_18 = (r4_2 - r4_10); + i3_18 = (i4_2 - i4_10); + r3_10 = (r4_18 + i4_26); + i3_10 = (i4_18 - r4_26); + r3_26 = (r4_18 - i4_26); + i3_26 = (i4_18 + r4_26); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + REAL r4_22, i4_22; + REAL r4_30, i4_30; + { + REAL r5_6, i5_6; + REAL r5_22, i5_22; + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r5_6 = ((wr * tmpr) - (wi * tmpi)); + i5_6 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[22 * l1]); + wi = c_im(W[22 * l1]); + tmpr = c_re(jp[22 * m]); + tmpi = c_im(jp[22 * m]); + r5_22 = ((wr * tmpr) - (wi * tmpi)); + i5_22 = ((wi * tmpr) + (wr * tmpi)); + r4_6 = (r5_6 + r5_22); + i4_6 = (i5_6 + i5_22); + r4_22 = (r5_6 - r5_22); + i4_22 = (i5_6 - i5_22); + } + { + REAL r5_14, i5_14; + REAL r5_30, i5_30; + wr = c_re(W[14 * l1]); + wi = c_im(W[14 * l1]); + tmpr = c_re(jp[14 * m]); + tmpi = c_im(jp[14 * m]); + r5_14 = ((wr * tmpr) - (wi * tmpi)); + i5_14 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[30 * l1]); + wi = c_im(W[30 * l1]); + tmpr = c_re(jp[30 * m]); + tmpi 
= c_im(jp[30 * m]); + r5_30 = ((wr * tmpr) - (wi * tmpi)); + i5_30 = ((wi * tmpr) + (wr * tmpi)); + r4_14 = (r5_14 + r5_30); + i4_14 = (i5_14 + i5_30); + r4_30 = (r5_14 - r5_30); + i4_30 = (i5_14 - i5_30); + } + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_22 = (r4_6 - r4_14); + i3_22 = (i4_6 - i4_14); + r3_14 = (r4_22 + i4_30); + i3_14 = (i4_22 - r4_30); + r3_30 = (r4_22 - i4_30); + i3_30 = (i4_22 + r4_30); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_18 = (r3_2 - r3_6); + i2_18 = (i3_2 - i3_6); + tmpr = (0.707106781187 * (r3_14 + i3_14)); + tmpi = (0.707106781187 * (i3_14 - r3_14)); + r2_6 = (r3_10 + tmpr); + i2_6 = (i3_10 + tmpi); + r2_22 = (r3_10 - tmpr); + i2_22 = (i3_10 - tmpi); + r2_10 = (r3_18 + i3_22); + i2_10 = (i3_18 - r3_22); + r2_26 = (r3_18 - i3_22); + i2_26 = (i3_18 + r3_22); + tmpr = (0.707106781187 * (i3_30 - r3_30)); + tmpi = (0.707106781187 * (r3_30 + i3_30)); + r2_14 = (r3_26 + tmpr); + i2_14 = (i3_26 - tmpi); + r2_30 = (r3_26 - tmpr); + i2_30 = (i3_26 + tmpi); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_16 = (r2_0 - r2_2); + i1_16 = (i2_0 - i2_2); + tmpr = ((0.923879532511 * r2_6) + (0.382683432365 * i2_6)); + tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_18 = (r2_4 - tmpr); + i1_18 = (i2_4 - tmpi); + tmpr = (0.707106781187 * (r2_10 + i2_10)); + tmpi = (0.707106781187 * (i2_10 - r2_10)); + r1_4 = (r2_8 + tmpr); + i1_4 = (i2_8 + tmpi); + r1_20 = (r2_8 - tmpr); + i1_20 = (i2_8 - tmpi); + tmpr = ((0.382683432365 * r2_14) + (0.923879532511 * i2_14)); + tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 + tmpi); + r1_22 = (r2_12 - tmpr); + i1_22 = (i2_12 - tmpi); + r1_8 = (r2_16 + i2_18); + i1_8 = (i2_16 - r2_18); + r1_24 = (r2_16 - i2_18); + i1_24 = (i2_16 + r2_18); + tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22)); + tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22)); + r1_10 
= (r2_20 + tmpr); + i1_10 = (i2_20 - tmpi); + r1_26 = (r2_20 - tmpr); + i1_26 = (i2_20 + tmpi); + tmpr = (0.707106781187 * (i2_26 - r2_26)); + tmpi = (0.707106781187 * (r2_26 + i2_26)); + r1_12 = (r2_24 + tmpr); + i1_12 = (i2_24 - tmpi); + r1_28 = (r2_24 - tmpr); + i1_28 = (i2_24 + tmpi); + tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30)); + tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30)); + r1_14 = (r2_28 + tmpr); + i1_14 = (i2_28 - tmpi); + r1_30 = (r2_28 - tmpr); + i1_30 = (i2_28 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + REAL r2_17, i2_17; + REAL r2_19, i2_19; + REAL r2_21, i2_21; + REAL r2_23, i2_23; + REAL r2_25, i2_25; + REAL r2_27, i2_27; + REAL r2_29, i2_29; + REAL r2_31, i2_31; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + REAL r3_17, i3_17; + REAL r3_21, i3_21; + REAL r3_25, i3_25; + REAL r3_29, i3_29; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + REAL r4_17, i4_17; + REAL r4_25, i4_25; + { + REAL r5_1, i5_1; + REAL r5_17, i5_17; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r5_1 = ((wr * tmpr) - (wi * tmpi)); + i5_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[17 * l1]); + wi = c_im(W[17 * l1]); + tmpr = c_re(jp[17 * m]); + tmpi = c_im(jp[17 * m]); + r5_17 = ((wr * tmpr) - (wi * tmpi)); + i5_17 = ((wi * tmpr) + (wr * tmpi)); + r4_1 = (r5_1 + r5_17); + i4_1 = (i5_1 + i5_17); + r4_17 = (r5_1 - r5_17); + i4_17 = (i5_1 - i5_17); + } + { + REAL r5_9, i5_9; + REAL r5_25, i5_25; + wr = c_re(W[9 * l1]); + wi = c_im(W[9 * l1]); + tmpr = c_re(jp[9 * m]); + tmpi = c_im(jp[9 * m]); + r5_9 = ((wr * tmpr) - (wi * tmpi)); + i5_9 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[25 * l1]); + wi = c_im(W[25 * l1]); + tmpr = c_re(jp[25 * m]); + tmpi = c_im(jp[25 * m]); + r5_25 = ((wr * tmpr) - (wi * tmpi)); + i5_25 = ((wi * tmpr) 
+ (wr * tmpi)); + r4_9 = (r5_9 + r5_25); + i4_9 = (i5_9 + i5_25); + r4_25 = (r5_9 - r5_25); + i4_25 = (i5_9 - i5_25); + } + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_17 = (r4_1 - r4_9); + i3_17 = (i4_1 - i4_9); + r3_9 = (r4_17 + i4_25); + i3_9 = (i4_17 - r4_25); + r3_25 = (r4_17 - i4_25); + i3_25 = (i4_17 + r4_25); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + REAL r4_21, i4_21; + REAL r4_29, i4_29; + { + REAL r5_5, i5_5; + REAL r5_21, i5_21; + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r5_5 = ((wr * tmpr) - (wi * tmpi)); + i5_5 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[21 * l1]); + wi = c_im(W[21 * l1]); + tmpr = c_re(jp[21 * m]); + tmpi = c_im(jp[21 * m]); + r5_21 = ((wr * tmpr) - (wi * tmpi)); + i5_21 = ((wi * tmpr) + (wr * tmpi)); + r4_5 = (r5_5 + r5_21); + i4_5 = (i5_5 + i5_21); + r4_21 = (r5_5 - r5_21); + i4_21 = (i5_5 - i5_21); + } + { + REAL r5_13, i5_13; + REAL r5_29, i5_29; + wr = c_re(W[13 * l1]); + wi = c_im(W[13 * l1]); + tmpr = c_re(jp[13 * m]); + tmpi = c_im(jp[13 * m]); + r5_13 = ((wr * tmpr) - (wi * tmpi)); + i5_13 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[29 * l1]); + wi = c_im(W[29 * l1]); + tmpr = c_re(jp[29 * m]); + tmpi = c_im(jp[29 * m]); + r5_29 = ((wr * tmpr) - (wi * tmpi)); + i5_29 = ((wi * tmpr) + (wr * tmpi)); + r4_13 = (r5_13 + r5_29); + i4_13 = (i5_13 + i5_29); + r4_29 = (r5_13 - r5_29); + i4_29 = (i5_13 - i5_29); + } + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_21 = (r4_5 - r4_13); + i3_21 = (i4_5 - i4_13); + r3_13 = (r4_21 + i4_29); + i3_13 = (i4_21 - r4_29); + r3_29 = (r4_21 - i4_29); + i3_29 = (i4_21 + r4_29); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_17 = (r3_1 - r3_5); + i2_17 = (i3_1 - i3_5); + tmpr = (0.707106781187 * (r3_13 + i3_13)); + tmpi = (0.707106781187 * (i3_13 - r3_13)); + r2_5 = (r3_9 + tmpr); + i2_5 = (i3_9 + tmpi); + r2_21 = (r3_9 - tmpr); + i2_21 = (i3_9 - tmpi); + r2_9 = (r3_17 + i3_21); + i2_9 = (i3_17 - r3_21); + 
r2_25 = (r3_17 - i3_21); + i2_25 = (i3_17 + r3_21); + tmpr = (0.707106781187 * (i3_29 - r3_29)); + tmpi = (0.707106781187 * (r3_29 + i3_29)); + r2_13 = (r3_25 + tmpr); + i2_13 = (i3_25 - tmpi); + r2_29 = (r3_25 - tmpr); + i2_29 = (i3_25 + tmpi); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + REAL r3_19, i3_19; + REAL r3_23, i3_23; + REAL r3_27, i3_27; + REAL r3_31, i3_31; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + REAL r4_19, i4_19; + REAL r4_27, i4_27; + { + REAL r5_3, i5_3; + REAL r5_19, i5_19; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r5_3 = ((wr * tmpr) - (wi * tmpi)); + i5_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[19 * l1]); + wi = c_im(W[19 * l1]); + tmpr = c_re(jp[19 * m]); + tmpi = c_im(jp[19 * m]); + r5_19 = ((wr * tmpr) - (wi * tmpi)); + i5_19 = ((wi * tmpr) + (wr * tmpi)); + r4_3 = (r5_3 + r5_19); + i4_3 = (i5_3 + i5_19); + r4_19 = (r5_3 - r5_19); + i4_19 = (i5_3 - i5_19); + } + { + REAL r5_11, i5_11; + REAL r5_27, i5_27; + wr = c_re(W[11 * l1]); + wi = c_im(W[11 * l1]); + tmpr = c_re(jp[11 * m]); + tmpi = c_im(jp[11 * m]); + r5_11 = ((wr * tmpr) - (wi * tmpi)); + i5_11 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[27 * l1]); + wi = c_im(W[27 * l1]); + tmpr = c_re(jp[27 * m]); + tmpi = c_im(jp[27 * m]); + r5_27 = ((wr * tmpr) - (wi * tmpi)); + i5_27 = ((wi * tmpr) + (wr * tmpi)); + r4_11 = (r5_11 + r5_27); + i4_11 = (i5_11 + i5_27); + r4_27 = (r5_11 - r5_27); + i4_27 = (i5_11 - i5_27); + } + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_19 = (r4_3 - r4_11); + i3_19 = (i4_3 - i4_11); + r3_11 = (r4_19 + i4_27); + i3_11 = (i4_19 - r4_27); + r3_27 = (r4_19 - i4_27); + i3_27 = (i4_19 + r4_27); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + REAL r4_23, i4_23; + REAL r4_31, i4_31; + { + REAL r5_7, i5_7; + REAL r5_23, i5_23; + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r5_7 = ((wr * tmpr) 
- (wi * tmpi)); + i5_7 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[23 * l1]); + wi = c_im(W[23 * l1]); + tmpr = c_re(jp[23 * m]); + tmpi = c_im(jp[23 * m]); + r5_23 = ((wr * tmpr) - (wi * tmpi)); + i5_23 = ((wi * tmpr) + (wr * tmpi)); + r4_7 = (r5_7 + r5_23); + i4_7 = (i5_7 + i5_23); + r4_23 = (r5_7 - r5_23); + i4_23 = (i5_7 - i5_23); + } + { + REAL r5_15, i5_15; + REAL r5_31, i5_31; + wr = c_re(W[15 * l1]); + wi = c_im(W[15 * l1]); + tmpr = c_re(jp[15 * m]); + tmpi = c_im(jp[15 * m]); + r5_15 = ((wr * tmpr) - (wi * tmpi)); + i5_15 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[31 * l1]); + wi = c_im(W[31 * l1]); + tmpr = c_re(jp[31 * m]); + tmpi = c_im(jp[31 * m]); + r5_31 = ((wr * tmpr) - (wi * tmpi)); + i5_31 = ((wi * tmpr) + (wr * tmpi)); + r4_15 = (r5_15 + r5_31); + i4_15 = (i5_15 + i5_31); + r4_31 = (r5_15 - r5_31); + i4_31 = (i5_15 - i5_31); + } + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_23 = (r4_7 - r4_15); + i3_23 = (i4_7 - i4_15); + r3_15 = (r4_23 + i4_31); + i3_15 = (i4_23 - r4_31); + r3_31 = (r4_23 - i4_31); + i3_31 = (i4_23 + r4_31); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_19 = (r3_3 - r3_7); + i2_19 = (i3_3 - i3_7); + tmpr = (0.707106781187 * (r3_15 + i3_15)); + tmpi = (0.707106781187 * (i3_15 - r3_15)); + r2_7 = (r3_11 + tmpr); + i2_7 = (i3_11 + tmpi); + r2_23 = (r3_11 - tmpr); + i2_23 = (i3_11 - tmpi); + r2_11 = (r3_19 + i3_23); + i2_11 = (i3_19 - r3_23); + r2_27 = (r3_19 - i3_23); + i2_27 = (i3_19 + r3_23); + tmpr = (0.707106781187 * (i3_31 - r3_31)); + tmpi = (0.707106781187 * (r3_31 + i3_31)); + r2_15 = (r3_27 + tmpr); + i2_15 = (i3_27 - tmpi); + r2_31 = (r3_27 - tmpr); + i2_31 = (i3_27 + tmpi); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_17 = (r2_1 - r2_3); + i1_17 = (i2_1 - i2_3); + tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7)); + tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_19 = (r2_5 - tmpr); + i1_19 = (i2_5 - tmpi); + 
tmpr = (0.707106781187 * (r2_11 + i2_11)); + tmpi = (0.707106781187 * (i2_11 - r2_11)); + r1_5 = (r2_9 + tmpr); + i1_5 = (i2_9 + tmpi); + r1_21 = (r2_9 - tmpr); + i1_21 = (i2_9 - tmpi); + tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15)); + tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 + tmpi); + r1_23 = (r2_13 - tmpr); + i1_23 = (i2_13 - tmpi); + r1_9 = (r2_17 + i2_19); + i1_9 = (i2_17 - r2_19); + r1_25 = (r2_17 - i2_19); + i1_25 = (i2_17 + r2_19); + tmpr = ((0.923879532511 * i2_23) - (0.382683432365 * r2_23)); + tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23)); + r1_11 = (r2_21 + tmpr); + i1_11 = (i2_21 - tmpi); + r1_27 = (r2_21 - tmpr); + i1_27 = (i2_21 + tmpi); + tmpr = (0.707106781187 * (i2_27 - r2_27)); + tmpi = (0.707106781187 * (r2_27 + i2_27)); + r1_13 = (r2_25 + tmpr); + i1_13 = (i2_25 - tmpi); + r1_29 = (r2_25 - tmpr); + i1_29 = (i2_25 + tmpi); + tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31)); + tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31)); + r1_15 = (r2_29 + tmpr); + i1_15 = (i2_29 - tmpi); + r1_31 = (r2_29 - tmpr); + i1_31 = (i2_29 + tmpi); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[16 * m]) = (r1_0 - r1_1); + c_im(kp[16 * m]) = (i1_0 - i1_1); + tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3)); + tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[17 * m]) = (r1_2 - tmpr); + c_im(kp[17 * m]) = (i1_2 - tmpi); + tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5)); + tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5)); + c_re(kp[2 * m]) = (r1_4 + tmpr); + c_im(kp[2 * m]) = (i1_4 + tmpi); + c_re(kp[18 * m]) = (r1_4 - tmpr); + c_im(kp[18 * m]) = (i1_4 - tmpi); + tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7)); + tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7)); + c_re(kp[3 * m]) = 
(r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 + tmpi); + c_re(kp[19 * m]) = (r1_6 - tmpr); + c_im(kp[19 * m]) = (i1_6 - tmpi); + tmpr = (0.707106781187 * (r1_9 + i1_9)); + tmpi = (0.707106781187 * (i1_9 - r1_9)); + c_re(kp[4 * m]) = (r1_8 + tmpr); + c_im(kp[4 * m]) = (i1_8 + tmpi); + c_re(kp[20 * m]) = (r1_8 - tmpr); + c_im(kp[20 * m]) = (i1_8 - tmpi); + tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11)); + tmpi = ((0.55557023302 * i1_11) - (0.831469612303 * r1_11)); + c_re(kp[5 * m]) = (r1_10 + tmpr); + c_im(kp[5 * m]) = (i1_10 + tmpi); + c_re(kp[21 * m]) = (r1_10 - tmpr); + c_im(kp[21 * m]) = (i1_10 - tmpi); + tmpr = ((0.382683432365 * r1_13) + (0.923879532511 * i1_13)); + tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13)); + c_re(kp[6 * m]) = (r1_12 + tmpr); + c_im(kp[6 * m]) = (i1_12 + tmpi); + c_re(kp[22 * m]) = (r1_12 - tmpr); + c_im(kp[22 * m]) = (i1_12 - tmpi); + tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15)); + tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15)); + c_re(kp[7 * m]) = (r1_14 + tmpr); + c_im(kp[7 * m]) = (i1_14 + tmpi); + c_re(kp[23 * m]) = (r1_14 - tmpr); + c_im(kp[23 * m]) = (i1_14 - tmpi); + c_re(kp[8 * m]) = (r1_16 + i1_17); + c_im(kp[8 * m]) = (i1_16 - r1_17); + c_re(kp[24 * m]) = (r1_16 - i1_17); + c_im(kp[24 * m]) = (i1_16 + r1_17); + tmpr = ((0.980785280403 * i1_19) - (0.195090322016 * r1_19)); + tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19)); + c_re(kp[9 * m]) = (r1_18 + tmpr); + c_im(kp[9 * m]) = (i1_18 - tmpi); + c_re(kp[25 * m]) = (r1_18 - tmpr); + c_im(kp[25 * m]) = (i1_18 + tmpi); + tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21)); + tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21)); + c_re(kp[10 * m]) = (r1_20 + tmpr); + c_im(kp[10 * m]) = (i1_20 - tmpi); + c_re(kp[26 * m]) = (r1_20 - tmpr); + c_im(kp[26 * m]) = (i1_20 + tmpi); + tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23)); + tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23)); + 
c_re(kp[11 * m]) = (r1_22 + tmpr); + c_im(kp[11 * m]) = (i1_22 - tmpi); + c_re(kp[27 * m]) = (r1_22 - tmpr); + c_im(kp[27 * m]) = (i1_22 + tmpi); + tmpr = (0.707106781187 * (i1_25 - r1_25)); + tmpi = (0.707106781187 * (r1_25 + i1_25)); + c_re(kp[12 * m]) = (r1_24 + tmpr); + c_im(kp[12 * m]) = (i1_24 - tmpi); + c_re(kp[28 * m]) = (r1_24 - tmpr); + c_im(kp[28 * m]) = (i1_24 + tmpi); + tmpr = ((0.55557023302 * i1_27) - (0.831469612303 * r1_27)); + tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27)); + c_re(kp[13 * m]) = (r1_26 + tmpr); + c_im(kp[13 * m]) = (i1_26 - tmpi); + c_re(kp[29 * m]) = (r1_26 - tmpr); + c_im(kp[29 * m]) = (i1_26 + tmpi); + tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29)); + tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29)); + c_re(kp[14 * m]) = (r1_28 + tmpr); + c_im(kp[14 * m]) = (i1_28 - tmpi); + c_re(kp[30 * m]) = (r1_28 - tmpr); + c_im(kp[30 * m]) = (i1_28 + tmpi); + tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31)); + tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31)); + c_re(kp[15 * m]) = (r1_30 + tmpr); + c_im(kp[15 * m]) = (i1_30 - tmpi); + c_re(kp[31 * m]) = (r1_30 - tmpr); + c_im(kp[31 * m]) = (i1_30 + tmpi); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_32_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_32_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_32(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 32; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + 
jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_unshuffle_32(a, ab, in, out, m); + #pragma omp task + fft_unshuffle_32(ab, b, in, out, m); +#else + #pragma omp task untied + fft_unshuffle_32(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_32(ab, b, in, out, m); +#endif + #pragma omp taskwait + } +} +void fft_unshuffle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 32; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + 
jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_32_seq(a, ab, in, out, m); + fft_unshuffle_32_seq(ab, b, in, out, m); + } +} +/* end of machine-generated code */ + +/* + * Recursive complex FFT on the n complex components of the array in: + * basic Cooley-Tukey algorithm, with some improvements for + * n power of two. The result is placed in the array out. n is arbitrary. + * The algorithm runs in time O(n*(r1 + ... + rk)) where r1, ..., rk + * are prime numbers, and r1 * r2 * ... * rk = n. + * + * n: size of the input + * in: pointer to input + * out: pointer to output + * factors: list of factors of n, precomputed + * W: twiddle factors + * nW: size of W, that is, size of the original transform + * + */ +void fft_aux(int n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW) +{ + int r, m; + int k; + + /* special cases */ + if (n == 32) { + fft_base_32(in, out); + return; + } + if (n == 16) { + fft_base_16(in, out); + return; + } + if (n == 8) { + fft_base_8(in, out); + return; + } + if (n == 4) { + fft_base_4(in, out); + return; + } + if (n == 2) { + fft_base_2(in, out); + return; + } + /* + * the cases n == 3, n == 5, and maybe 7 should be implemented as well + */ + + r = *factors; + m = n / r; + + if (r < n) { + /* + * split the DFT of length n into r DFTs of length n/r, and + * recurse + */ +#if defined(FORCE_TIED_TASKS) + if (r == 32) { + #pragma omp task + fft_unshuffle_32(0, m, in, out, m); + } else if (r == 16) { + #pragma omp task + fft_unshuffle_16(0, m, in, out, m); + } else if (r == 8) { + #pragma omp task + fft_unshuffle_8(0, m, in, out, m); + } else if (r == 4) { + #pragma omp task + fft_unshuffle_4(0, m, in, out, m); + } else if (r == 2) { + #pragma omp task + fft_unshuffle_2(0, m, in, out, m); + } else + unshuffle(0, m, in, out, r, m); +#else + if (r == 32) { + #pragma omp task untied + fft_unshuffle_32(0, m, in, out, m); + } else if (r == 16) { + #pragma omp task untied + 
fft_unshuffle_16(0, m, in, out, m); + } else if (r == 8) { + #pragma omp task untied + fft_unshuffle_8(0, m, in, out, m); + } else if (r == 4) { + #pragma omp task untied + fft_unshuffle_4(0, m, in, out, m); + } else if (r == 2) { + #pragma omp task untied + fft_unshuffle_2(0, m, in, out, m); + } else + unshuffle(0, m, in, out, r, m); + +#endif + #pragma omp taskwait + + for (k = 0; k < n; k += m) { +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_aux(m, out + k, in + k, factors + 1, W, nW); +#else + #pragma omp task untied + fft_aux(m, out + k, in + k, factors + 1, W, nW); +#endif + } + #pragma omp taskwait + } + /* + * now multiply by the twiddle factors, and perform m FFTs + * of length r + */ +#if defined(FORCE_TIED_TASKS) + if (r == 2) { + #pragma omp task untied + fft_twiddle_2(0, m, in, out, W, nW, nW / n, m); + } else if (r == 4) { + #pragma omp task untied + fft_twiddle_4(0, m, in, out, W, nW, nW / n, m); + } else if (r == 8) { + #pragma omp task untied + fft_twiddle_8(0, m, in, out, W, nW, nW / n, m); + } else if (r == 16) { + #pragma omp task untied + fft_twiddle_16(0, m, in, out, W, nW, nW / n, m); + } else if (r == 32) { + #pragma omp task untied + fft_twiddle_32(0, m, in, out, W, nW, nW / n, m); + } else { + #pragma omp task untied + fft_twiddle_gen(0, m, in, out, W, nW, nW / n, r, m); + } +#else + if (r == 2) { + #pragma omp task untied + fft_twiddle_2(0, m, in, out, W, nW, nW / n, m); + } else if (r == 4) { + #pragma omp task untied + fft_twiddle_4(0, m, in, out, W, nW, nW / n, m); + } else if (r == 8) { + #pragma omp task untied + fft_twiddle_8(0, m, in, out, W, nW, nW / n, m); + } else if (r == 16) { + #pragma omp task untied + fft_twiddle_16(0, m, in, out, W, nW, nW / n, m); + } else if (r == 32) { + #pragma omp task untied + fft_twiddle_32(0, m, in, out, W, nW, nW / n, m); + } else { + #pragma omp task untied + fft_twiddle_gen(0, m, in, out, W, nW, nW / n, r, m); + } +#endif + + #pragma omp taskwait + + return; +} + +void fft_aux_seq(int 
n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW) +{ + int r, m; + int k; + + /* special cases */ + if (n == 32) { + fft_base_32(in, out); + return; + } + if (n == 16) { + fft_base_16(in, out); + return; + } + if (n == 8) { + fft_base_8(in, out); + return; + } + if (n == 4) { + fft_base_4(in, out); + return; + } + if (n == 2) { + fft_base_2(in, out); + return; + } + /* + * the cases n == 3, n == 5, and maybe 7 should be implemented as well + */ + + r = *factors; + m = n / r; + + if (r < n) { + /* + * split the DFT of length n into r DFTs of length n/r, and + * recurse + */ + if (r == 32) fft_unshuffle_32_seq(0, m, in, out, m); + else if (r == 16) fft_unshuffle_16_seq(0, m, in, out, m); + else if (r == 8) fft_unshuffle_8_seq(0, m, in, out, m); + else if (r == 4) fft_unshuffle_4_seq(0, m, in, out, m); + else if (r == 2) fft_unshuffle_2_seq(0, m, in, out, m); + else unshuffle_seq(0, m, in, out, r, m); + + for (k = 0; k < n; k += m) { + fft_aux_seq(m, out + k, in + k, factors + 1, W, nW); + } + } + /* + * now multiply by the twiddle factors, and perform m FFTs + * of length r + */ + if (r == 2) fft_twiddle_2_seq(0, m, in, out, W, nW, nW / n, m); + else if (r == 4) fft_twiddle_4_seq(0, m, in, out, W, nW, nW / n, m); + else if (r == 8) fft_twiddle_8_seq(0, m, in, out, W, nW, nW / n, m); + else if (r == 16) fft_twiddle_16_seq(0, m, in, out, W, nW, nW / n, m); + else if (r == 32) fft_twiddle_32_seq(0, m, in, out, W, nW, nW / n, m); + else fft_twiddle_gen_seq(0, m, in, out, W, nW, nW / n, r, m); + + return; +} +/* + * user interface for fft_aux + */ +void fft(int n, COMPLEX * in, COMPLEX * out) +{ + int factors[40]; /* allows FFTs up to at least 3^40 */ + int *p = factors; + int l = n; + int r; + COMPLEX *W; + + bots_message("Computing coefficients "); + W = (COMPLEX *) malloc((n + 1) * sizeof(COMPLEX)); + #pragma omp parallel + #pragma omp single +#if defined(FORCE_TIED_TASKS) + #pragma omp task +#else + #pragma omp task untied +#endif + 
compute_w_coefficients(n, 0, n / 2, W); + bots_message(" completed!\n"); + + /* + * find factors of n, first 8, then 4 and then primes in ascending + * order + */ + do { + r = factor(l); + *p++ = r; + l /= r; + } while (l > 1); + + bots_message("Computing FFT "); + #pragma omp parallel + #pragma omp single +#if defined(FORCE_TIED_TASKS) + #pragma omp task +#else + #pragma omp task untied +#endif + fft_aux(n, in, out, factors, W, n); + bots_message(" completed!\n"); + + free(W); + return; +} +void fft_seq(int n, COMPLEX * in, COMPLEX * out) +{ + int factors[40]; /* allows FFTs up to at least 3^40 */ + int *p = factors; + int l = n; + int r; + COMPLEX *W; + + W = (COMPLEX *) malloc((n + 1) * sizeof(COMPLEX)); + compute_w_coefficients_seq(n, 0, n / 2, W); + + /* + * find factors of n, first 8, then 4 and then primes in ascending + * order + */ + do { + r = factor(l); + *p++ = r; + l /= r; + } while (l > 1); + + fft_aux_seq(n, in, out, factors, W, n); + + free(W); + return; +} +int test_correctness(int n, COMPLEX *out1, COMPLEX *out2) +{ + int i; + double a,d,error = 0.0; + + for (i = 0; i < n; ++i) { + a = sqrt((c_re(out1[i]) - c_re(out2[i])) * + (c_re(out1[i]) - c_re(out2[i])) + + (c_im(out1[i]) - c_im(out2[i])) * + (c_im(out1[i]) - c_im(out2[i]))); + d = sqrt(c_re(out2[i]) * c_re(out2[i]) + + c_im(out2[i]) * c_im(out2[i])); + if (d < -1.0e-10 || d > 1.0e-10) a /= d; + if (a > error) error = a; + } + bots_message("relative error=%e\n", error); + if (error > 1e-3) return BOTS_RESULT_UNSUCCESSFUL; + else return BOTS_RESULT_SUCCESSFUL; +} + diff --git a/src/components/implementation/no_interface/omp_fft_bots/fft.h b/src/components/implementation/no_interface/omp_fft_bots/fft.h new file mode 100644 index 0000000000..ebafa9fb4d --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/fft.h @@ -0,0 +1,55 @@ +#ifndef FFT_H +#define FFT_H + +/* our real numbers */ +typedef double REAL; + +/* Complex numbers and operations */ +typedef struct { + REAL re, 
im; +} COMPLEX; + +#define c_re(c) ((c).re) +#define c_im(c) ((c).im) + +void compute_w_coefficients(int n, int a, int b, COMPLEX * W); +void compute_w_coefficients_seq(int n, int a, int b, COMPLEX * W); +int factor(int n); +void unshuffle(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m); +void unshuffle_seq(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m); +void fft_twiddle_gen1(COMPLEX * in, COMPLEX * out, COMPLEX * W, int r, int m, int nW, int nWdnti, int nWdntm); +void fft_twiddle_gen(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int r, int m); +void fft_twiddle_gen_seq(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int r, int m); +void fft_base_2(COMPLEX * in, COMPLEX * out); +void fft_twiddle_2(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_2(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_base_4(COMPLEX * in, COMPLEX * out); +void fft_twiddle_4(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_4(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_base_8(COMPLEX * in, COMPLEX * out); +void fft_twiddle_8(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_8(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_base_16(COMPLEX * in, COMPLEX * out); +void 
fft_twiddle_16(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_16(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_base_32(COMPLEX * in, COMPLEX * out); +void fft_twiddle_32(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_32(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_aux(int n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW); +void fft_aux_seq(int n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW); +void fft(int n, COMPLEX * in, COMPLEX * out); +void fft_seq(int n, COMPLEX * in, COMPLEX * out); +int test_correctness(int n, COMPLEX *out1, COMPLEX *out2); + +#endif + diff --git a/src/components/implementation/no_interface/omp_fft_bots/init.c b/src/components/implementation/no_interface/omp_fft_bots/init.c new file mode 120000 index 0000000000..9e09b82e77 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/init.c @@ -0,0 +1 @@ +../omp_fib_bots/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/omp-tasks-app.h b/src/components/implementation/no_interface/omp_fft_bots/omp-tasks-app.h new file mode 120000 index 0000000000..9fba574408 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/omp-tasks-app.h @@ -0,0 +1 @@ +../omp_fib_bots/omp-tasks-app.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/posix_basic.c b/src/components/implementation/no_interface/omp_fft_bots/posix_basic.c new file mode 
120000 index 0000000000..9afee078fb --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/posix_basic.c @@ -0,0 +1 @@ +../omp_fib_bots/posix_basic.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fib_bots/Makefile b/src/components/implementation/no_interface/omp_fib_bots/Makefile new file mode 100644 index 0000000000..bee96fd0aa --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/Makefile @@ -0,0 +1,19 @@ +COMPONENT=omp_fib_bots.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp +# if tied tasks are required +CFLAGS += -DFORCE_TIED_TASKS + +OMPC_FINAL_FLAGS= + +# one per compilation or none +#CFLAGS += -DMANUAL_CUTOFF +#CFLAGS += -DIF_CUTOFF +#CFLAGS += -DFINAL_CUTOFF $(OMPC_FINAL_FLAGS) diff --git a/src/components/implementation/no_interface/omp_fib_bots/app-desc.h b/src/components/implementation/no_interface/omp_fib_bots/app-desc.h new file mode 100644 index 0000000000..e8af171324 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/app-desc.h @@ -0,0 +1,47 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. 
*/ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "omp-tasks-app.h" + +#define BOTS_APP_NAME "Fibonacci" +#define BOTS_APP_PARAMETERS_DESC "N=%d" +#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size + +#define BOTS_APP_USES_ARG_SIZE +#define BOTS_APP_DEF_ARG_SIZE 20 +#define BOTS_APP_DESC_ARG_SIZE "Number to compute" + +int fib_verify(int); +void fib0 (int); +void fib0_seq (int); + +//#define KERNEL_INIT +#define KERNEL_CALL fib0(bots_arg_size) +//#define KERNEL_FINI + +//#define KERNEL_SEQ_INIT +#define KERNEL_SEQ_CALL fib0_seq(bots_arg_size) +//#define KERNEL_SEQ_FINI + + +#define KERNEL_CHECK fib_verify(bots_arg_size) + +#define BOTS_CUTOFF_DEF_VALUE 10 + diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots.h b/src/components/implementation/no_interface/omp_fib_bots/bots.h new file mode 100644 index 0000000000..fee71a7eb2 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/bots.h @@ -0,0 +1,113 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ 
+/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#ifndef _BOTS_H_ + +#include +#include +#include + +/* common flags */ +extern int bots_sequential_flag; +extern int bots_benchmark_flag; +extern int bots_check_flag; +extern int bots_result; +extern int bots_output_format; +extern int bots_print_header; +/* common variables */ +extern char bots_name[]; +extern char bots_parameters[]; +extern char bots_model[]; +extern char bots_resources[]; +/* compile and execution information */ +extern char bots_exec_date[]; +extern char bots_exec_message[]; +extern char bots_comp_date[]; +extern char bots_comp_message[]; +extern char bots_cc[]; +extern char bots_cflags[]; +extern char bots_ld[]; +extern char bots_ldflags[]; +/* time variables */ +extern double bots_time_program; +extern double bots_time_sequential; + +/* number of tasks variable */ +extern unsigned long long bots_number_of_tasks; /* forcing 8 bytes size on -m32 and -m64 */ + +extern char bots_cutoff[]; +extern int bots_cutoff_value; + +extern int bots_app_cutoff_value; +extern int bots_app_cutoff_value_1; +extern int bots_app_cutoff_value_2; + +extern int bots_arg_size; +extern int bots_arg_size_1; +extern int bots_arg_size_2; + +/* function could be used in app. 
code but are implemented in bots_common.c */ +long bots_usecs(); +void bots_error(int error, char *message); +void bots_warning(int warning, char *message); + +#define BOTS_RESULT_NA 0 +#define BOTS_RESULT_SUCCESSFUL 1 +#define BOTS_RESULT_UNSUCCESSFUL 2 +#define BOTS_RESULT_NOT_REQUESTED 3 + + +typedef enum { BOTS_VERBOSE_NONE=0, + BOTS_VERBOSE_DEFAULT, + BOTS_VERBOSE_DEBUG } bots_verbose_mode_t; + +extern bots_verbose_mode_t bots_verbose_mode; + +#define bots_message(msg, ...) \ + {\ + if ( bots_verbose_mode >= BOTS_VERBOSE_DEFAULT ) {\ + printc(msg , ##__VA_ARGS__);\ + }\ + } + +#ifdef BOTS_DEBUG +#define bots_debug(msg, ...) \ + {\ + if ( bots_verbose_mode >= BOTS_VERBOSE_DEBUG ) {\ + PRINTC(msg , ##__VA_ARGS__);\ + }\ + } +#define bots_debug_with_location_info(msg, ...) \ + {\ + if ( bots_verbose_mode >= BOTS_VERBOSE_DEBUG ) {\ + PRINTC("%s:%d:%s:" msg ,__FILE__, __LINE__,__func__,##__VA_ARGS__);\ + }\ + } +#else +#define bots_debug(msg, ...) +#define bots_debug_with_location_info(msg, ...) +#endif + +#define FALSE 0 +#define TRUE 1 + +#endif + + diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots_common.c b/src/components/implementation/no_interface/omp_fib_bots/bots_common.c new file mode 100644 index 0000000000..49af8a438e --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/bots_common.c @@ -0,0 +1,342 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. 
*/ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bots_common.h" +#include "bots_main.h" +#include "bots.h" +#include + +void +bots_error(int error, char *message) +{ + if (message == NULL) + { + switch(error) + { + case BOTS_ERROR: + PRINTC("Error (%d): %s\n",error,"Unspecified error."); + break; + case BOTS_ERROR_NOT_ENOUGH_MEMORY: + PRINTC("Error (%d): %s\n",error,"Not enough memory."); + break; + case BOTS_ERROR_UNRECOGNIZED_PARAMETER: + PRINTC("Error (%d): %s\n",error,"Unrecognized parameter."); + bots_print_usage(); + break; + default: + PRINTC("Error (%d): %s\n",error,"Invalid error code."); + break; + } + } + else PRINTC("Error (%d): %s\n",error,message); + cos_exit(100+error); +} + +void +bots_warning(int warning, char *message) +{ + if (message == NULL) + { + switch(warning) + { + case BOTS_WARNING: + PRINTC("Warning (%d): %s\n",warning,"Unspecified warning."); + break; + default: + PRINTC("Warning (%d): %s\n",warning,"Invalid warning code."); + break; + } + } + else PRINTC("Warning (%d): %s\n",warning,message); +} + +long bots_usecs (void) +{ + //struct timeval t; + //gettimeofday(&t,NULL); + //return t.tv_sec*1000000+t.tv_usec; + return (long)time_now_usec(); +} + +void +bots_get_date(char *str) +{ + time_t now; + time(&now); + //strftime(str, 32, "%Y/%m/%d;%H:%M", gmtime(&now)); + strncpy(str, "01/01/0001", 32); +} + 
+void bots_get_architecture(char *str) +{ + snprintf(str, BOTS_TMP_STR_SZ, "Composite-i386;%d", NUM_CPU); +// int ncpus = sysconf(_SC_NPROCESSORS_CONF); +// struct utsname architecture; +// +// uname(&architecture); +// snprintf(str, BOTS_TMP_STR_SZ, "%s-%s;%d" ,architecture.sysname, architecture.machine, ncpus); +} + +#undef __linux +#if defined (__linux) +/* ****************************************************************** */ +void bots_get_load_average(char *str) +{ + double loadavg[3]; + getloadavg (loadavg, 3); + snprintf(str, BOTS_TMP_STR_SZ, "%.2f;%.2f;%.2f",loadavg[0],loadavg[1],loadavg[2]); +} +#else +/* ****************************************************************** */ +void bots_get_load_average(char *str) { sprintf(str,";;"); } +#endif + +void bots_print_results() +{ + char str_name[BOTS_TMP_STR_SZ]; + char str_parameters[BOTS_TMP_STR_SZ]; + char str_model[BOTS_TMP_STR_SZ]; + char str_resources[BOTS_TMP_STR_SZ]; + char str_result[15]; + char str_time_program[15]; + char str_time_sequential[15]; + char str_speed_up[15]; + char str_number_of_tasks[15]; + char str_number_of_tasks_per_second[15]; + char str_exec_date[BOTS_TMP_STR_SZ]; + char str_exec_message[BOTS_TMP_STR_SZ]; + char str_architecture[BOTS_TMP_STR_SZ]; + char str_load_avg[BOTS_TMP_STR_SZ]; + char str_comp_date[BOTS_TMP_STR_SZ]; + char str_comp_message[BOTS_TMP_STR_SZ]; + char str_cc[BOTS_TMP_STR_SZ]; + char str_cflags[BOTS_TMP_STR_SZ]; + char str_ld[BOTS_TMP_STR_SZ]; + char str_ldflags[BOTS_TMP_STR_SZ]; + char str_cutoff[BOTS_TMP_STR_SZ]; + + /* compute output strings */ + sprintf(str_name, "%s", bots_name); + sprintf(str_parameters, "%s", bots_parameters); + sprintf(str_model, "%s", bots_model); + sprintf(str_cutoff, "%s", bots_cutoff); + sprintf(str_resources, "%s", bots_resources); + switch(bots_result) + { + case BOTS_RESULT_NA: + sprintf(str_result, "n/a"); + break; + case BOTS_RESULT_SUCCESSFUL: + sprintf(str_result, "successful"); + break; + case BOTS_RESULT_UNSUCCESSFUL: + 
sprintf(str_result, "UNSUCCESSFUL"); + break; + case BOTS_RESULT_NOT_REQUESTED: + sprintf(str_result, "Not requested"); + break; + default: + sprintf(str_result, "error"); + break; + } + sprintf(str_time_program, "%f", bots_time_program); + if (bots_sequential_flag) sprintf(str_time_sequential, "%f", bots_time_sequential); + else sprintf(str_time_sequential, "n/a"); + if (bots_sequential_flag) + sprintf(str_speed_up, "%3.2f", bots_time_sequential/bots_time_program); + else sprintf(str_speed_up, "n/a"); + + sprintf(str_number_of_tasks, "%3.2f", (float) bots_number_of_tasks); + sprintf(str_number_of_tasks_per_second, "%3.2f", (float) bots_number_of_tasks/bots_time_program); + + sprintf(str_exec_date, "%s", bots_exec_date); + sprintf(str_exec_message, "%s", bots_exec_message); + bots_get_architecture(str_architecture); + bots_get_load_average(str_load_avg); + sprintf(str_comp_date, "%s", bots_comp_date); + sprintf(str_comp_message, "%s", bots_comp_message); + sprintf(str_cc, "%s", bots_cc); + sprintf(str_cflags, "%s", bots_cflags); + sprintf(str_ld, "%s", bots_ld); + sprintf(str_ldflags, "%s", bots_ldflags); + + if(bots_print_header) + { + switch(bots_output_format) + { + case 0: + break; + case 1: + break; + case 2: +PRINTC( +"Benchmark;Parameters;Model;Cutoff;Resources;Result;\ +Time;Sequential;Speed-up;\ +Nodes;Nodes/Sec;\ +Exec Date;Exec Time;Exec Message;\ +Architecture;Processors;Load Avg-1;Load Avg-5;Load Avg-15;\ +Comp Date;Comp Time;Comp Message;CC;CFLAGS;LD;LDFLAGS\n"); + break; + case 3: + break; + case 4: +PRINTC( +"Benchmark;Parameters;Model;Cutoff;Resources;Result;\ +Time;Sequential;Speed-up;\ +Nodes;Nodes/Sec;\n"); + break; + default: + break; + } + } + + /* print results */ + switch(bots_output_format) + { + case 0: + break; + case 1: + PRINTC("\n"); + PRINTC("Program = %s\n", str_name); /*fix*/ + PRINTC("Parameters = %s\n", str_parameters); /*fix*/ + PRINTC("Model = %s\n", str_model); + PRINTC("Embedded cut-off = %s\n", str_cutoff); + PRINTC("# of 
Threads = %s\n", str_resources); + PRINTC("Verification = %s\n", str_result); + + PRINTC("Time Program = %s seconds\n", str_time_program); + if (bots_sequential_flag) { + PRINTC("Time Sequential = %s seconds\n", str_time_sequential); + PRINTC("Speed-up = %s\n", str_speed_up); + } + + if ( bots_number_of_tasks > 0 ) { + PRINTC("Nodes = %s\n", str_number_of_tasks); + PRINTC("Nodes/Sec = %s\n", str_number_of_tasks_per_second); + } + + PRINTC("Execution Date = %s\n", str_exec_date); + PRINTC("Execution Message = %s\n", str_exec_message); + + PRINTC("Architecture = %s\n", str_architecture); + PRINTC("Load Avg [1:5:15] = %s\n", str_load_avg); + + PRINTC("Compilation Date = %s\n", str_comp_date); + PRINTC("Compilation Message = %s\n", str_comp_message); + + PRINTC("Compiler = %s\n", str_cc); + PRINTC("Compiler Flags = %s\n", str_cflags); + PRINTC("Linker = %s\n", str_ld); + PRINTC("Linker Flags = %s\n", str_ldflags); + fflush(stdout); + break; + case 2: + PRINTC("%s;%s;%s;%s;%s;%s;", + str_name, + str_parameters, + str_model, + str_cutoff, + str_resources, + str_result + ); + PRINTC("%s;%s;%s;", + str_time_program, + str_time_sequential, + str_speed_up + ); + PRINTC("%s;%s;", + str_number_of_tasks, + str_number_of_tasks_per_second + ); + PRINTC("%s;%s;", + str_exec_date, + str_exec_message + ); + PRINTC("%s;%s;", + str_architecture, + str_load_avg + ); + PRINTC("%s;%s;", + str_comp_date, + str_comp_message + ); + PRINTC("%s;%s;%s;%s;", + str_cc, + str_cflags, + str_ld, + str_ldflags + ); + PRINTC("\n"); + break; + case 3: + PRINTC("\n"); + PRINTC("Program = %s\n", str_name); /*fix*/ + PRINTC("Parameters = %s\n", str_parameters); /*fix*/ + PRINTC("Model = %s\n", str_model); + PRINTC("Embedded cut-off = %s\n", str_cutoff); + PRINTC("# of Threads = %s\n", str_resources); + PRINTC("Verification = %s\n", str_result); + + PRINTC("Time Program = %s seconds\n", str_time_program); + if (bots_sequential_flag) { + PRINTC("Time Sequential = %s seconds\n", str_time_sequential); + 
PRINTC("Speed-up = %s\n", str_speed_up); + } + + if ( bots_number_of_tasks > 0 ) { + PRINTC("Nodes = %s\n", str_number_of_tasks); + PRINTC("Nodes/Sec = %s\n", str_number_of_tasks_per_second); + } + break; + case 4: + PRINTC("%s;%s;%s;%s;%s;%s;", + str_name, + str_parameters, + str_model, + str_cutoff, + str_resources, + str_result + ); + PRINTC("%s;%s;%s;", + str_time_program, + str_time_sequential, + str_speed_up + ); + PRINTC("%s;%s;", + str_number_of_tasks, + str_number_of_tasks_per_second + ); + PRINTC("\n"); + break; + default: + bots_error(BOTS_ERROR,"No valid output format\n"); + break; + } +} + diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots_common.h b/src/components/implementation/no_interface/omp_fib_bots/bots_common.h new file mode 100644 index 0000000000..9d38799ef1 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/bots_common.h @@ -0,0 +1,56 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#ifndef _COMMON_H +#define _COMMON_H + +#ifndef CC +#define CC "GCC" +#endif +#ifndef CFLAGS +#define CFLAGS "-fopenmp" +#endif +#ifndef LD +#define LD "LD" +#endif +#ifndef LDFLAGS +#define LDFLAGS "-fopenmp -lcos_gomp" +#endif +#ifndef CDATE +#define CDATE "01/01/0001" +#endif +#ifndef CMESSAGE +#define CMESSAGE "Done!" +#endif + +#define BOTS_ERROR 0 +#define BOTS_ERROR_NOT_ENOUGH_MEMORY 1 +#define BOTS_ERROR_UNRECOGNIZED_PARAMETER 2 + +#define BOTS_WARNING 0 + +void bots_get_date(char *str); +void bots_get_architecture(char *str); +void bots_get_load_average(char *str); +void bots_print_results(void); + +#define BOTS_TMP_STR_SZ 64 + +#endif diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots_main.c b/src/components/implementation/no_interface/omp_fib_bots/bots_main.c new file mode 100644 index 0000000000..e70ca2fccb --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/bots_main.c @@ -0,0 +1,540 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. 
*/ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +/*********************************************************************** + * main function & common behaviour of the benchmark. + **********************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include "bots_common.h" +#include "bots_main.h" +#include "bots.h" +#include "app-desc.h" +#include + +/*********************************************************************** + * DEFAULT VALUES + *********************************************************************/ +/* common flags */ +int bots_sequential_flag = FALSE; +int bots_check_flag = FALSE; +bots_verbose_mode_t bots_verbose_mode = BOTS_VERBOSE_DEFAULT; +int bots_result = BOTS_RESULT_NOT_REQUESTED; +int bots_output_format = 1; +int bots_print_header = FALSE; +/* common variables */ +char bots_name[BOTS_TMP_STR_SZ]; +char bots_execname[BOTS_TMP_STR_SZ]; +char bots_parameters[BOTS_TMP_STR_SZ]; +char bots_model[BOTS_TMP_STR_SZ]; +char bots_resources[BOTS_TMP_STR_SZ]; +/* compile and execution information */ +char bots_exec_date[BOTS_TMP_STR_SZ]; +char bots_exec_message[BOTS_TMP_STR_SZ]; +char bots_comp_date[BOTS_TMP_STR_SZ]; +char bots_comp_message[BOTS_TMP_STR_SZ]; +char bots_cc[BOTS_TMP_STR_SZ]; +char bots_cflags[BOTS_TMP_STR_SZ]; +char bots_ld[BOTS_TMP_STR_SZ]; +char bots_ldflags[BOTS_TMP_STR_SZ]; +char bots_cutoff[BOTS_TMP_STR_SZ]; + 
+/* time variables */ +double bots_time_program = 0.0; +double bots_time_sequential = 0.0; +unsigned long long bots_number_of_tasks = 0; /* forcing 8 bytes size in -m32 and -m64 */ + +/* + * Application dependent info + */ + +#ifndef BOTS_APP_NAME +#error "Application name must be defined (#define BOTS_APP_NAME)" +#endif + +#ifndef BOTS_APP_PARAMETERS_DESC +#define BOTS_APP_PARAMETERS_DESC "" +#endif + +#ifndef BOTS_APP_PARAMETERS_LIST +#define BOTS_APP_PARAMETERS_LIST +#endif + +#ifndef BOTS_APP_INIT +#define BOTS_APP_INIT +#endif + +#ifndef BOTS_APP_FINI +#define BOTS_APP_FINI +#endif + +#ifndef KERNEL_CALL +#error "Initial kernell call must be specified (#define KERNEL_CALL)" +#endif + +#ifndef KERNEL_INIT +#define KERNEL_INIT +#endif + +#ifndef KERNEL_FINI +#define KERNEL_FINI +#endif + +#ifndef KERNEL_SEQ_INIT +#define KERNEL_SEQ_INIT +#endif + +#ifndef KERNEL_SEQ_FINI +#define KERNEL_SEQ_FINI +#endif + +#ifndef BOTS_MODEL_DESC +#define BOTS_MODEL_DESC "Unknown" +#endif + +#ifdef BOTS_APP_USES_ARG_SIZE +#ifndef BOTS_APP_DEF_ARG_SIZE +#error "Default vaule for argument size must be specified (#define BOTS_APP_DEF_ARG_SIZE)" +#endif +#ifndef BOTS_APP_DESC_ARG_SIZE +#error "Help description for argument size must be specified (#define BOTS_APP_DESC_ARG_SIZE)" +#endif +int bots_arg_size = BOTS_APP_DEF_ARG_SIZE; +#endif + +#ifdef BOTS_APP_USES_ARG_SIZE_1 +#ifndef BOTS_APP_DEF_ARG_SIZE_1 +#error "Default vaule for argument size must be specified (#define BOTS_APP_DEF_ARG_SIZE_1)" +#endif +#ifndef BOTS_APP_DESC_ARG_SIZE_1 +#error "Help description for argument size must be specified (#define BOTS_APP_DESC_ARG_SIZE_1)" +#endif +int bots_arg_size_1 = BOTS_APP_DEF_ARG_SIZE_1; +#endif + +#ifdef BOTS_APP_USES_ARG_SIZE_2 +#ifndef BOTS_APP_DEF_ARG_SIZE_2 +#error "Default vaule for argument size must be specified (#define BOTS_APP_DEF_ARG_SIZE_2)" +#endif +#ifndef BOTS_APP_DESC_ARG_SIZE_2 +#error "Help description for argument size must be specified (#define 
BOTS_APP_DESC_ARG_SIZE_2)" +#endif +int bots_arg_size_2 = BOTS_APP_DEF_ARG_SIZE_2; +#endif + +#ifdef BOTS_APP_USES_ARG_REPETITIONS +#ifndef BOTS_APP_DEF_ARG_REPETITIONS +#error "Default vaule for argument repetitions must be specified (#define BOTS_APP_DEF_ARG_REPETITIONS)" +#endif +#ifndef BOTS_APP_DESC_ARG_REPETITIONS +#error "Help description for argument repetitions must be specified (#define BOTS_APP_DESC_ARG_REPETITIONS)" +#endif +int bots_arg_repetitions = BOTS_APP_DEF_ARG_REPETITIONS; +#endif + +#ifdef BOTS_APP_USES_ARG_FILE +#ifndef BOTS_APP_DESC_ARG_FILE +#error "Help description for argument file must be specified (#define BOTS_APP_DESC_ARG_FILE)" +#endif +char bots_arg_file[255]=""; +#endif + +#ifdef BOTS_APP_USES_ARG_BLOCK +#ifndef BOTS_APP_DEF_ARG_BLOCK +#error "Default value for argument block must be specified (#define BOTS_APP_DEF_ARG_BLOCK)" +#endif +#ifndef BOTS_APP_DESC_ARG_BLOCK +#error "Help description for argument block must be specified (#define BOTS_APP_DESC_ARG_BLOCK)" +#endif +int bots_arg_block = BOTS_APP_DEF_ARG_BLOCK; +#endif + +#ifdef BOTS_APP_USES_ARG_CUTOFF +#ifndef BOTS_APP_DEF_ARG_CUTOFF +#error "Default value for argument cutoff must be specified (#define BOTS_APP_DEF_ARG_CUTOFF)" +#endif +#ifndef BOTS_APP_DESC_ARG_CUTOFF +#error "Help description for argument cutoff must be specified (#define BOTS_APP_DESC_ARG_CUTOFF)" +#endif +int bots_app_cutoff_value = BOTS_APP_DEF_ARG_CUTOFF; +#endif + +#ifdef BOTS_APP_USES_ARG_CUTOFF_1 +#ifndef BOTS_APP_DEF_ARG_CUTOFF_1 +#error "Default value for argument cutoff must be specified (#define BOTS_APP_DEF_ARG_CUTOFF_1)" +#endif +#ifndef BOTS_APP_DESC_ARG_CUTOFF_1 +#error "Help description for argument cutoff must be specified (#define BOTS_APP_DESC_ARG_CUTOFF_1)" +#endif +int bots_app_cutoff_value_1 = BOTS_APP_DEF_ARG_CUTOFF_1; +#endif + +#ifdef BOTS_APP_USES_ARG_CUTOFF_2 +#ifndef BOTS_APP_DEF_ARG_CUTOFF_2 +#error "Default value for argument cutoff must be specified (#define 
BOTS_APP_DEF_ARG_CUTOFF_2)" +#endif +#ifndef BOTS_APP_DESC_ARG_CUTOFF_2 +#error "Help description for argument cutoff must be specified (#define BOTS_APP_DESC_ARG_CUTOFF_2)" +#endif +int bots_app_cutoff_value_2 = BOTS_APP_DEF_ARG_CUTOFF_2; +#endif + +#if defined(MANUAL_CUTOFF) || defined(IF_CUTOFF) || defined(FINAL_CUTOFF) +int bots_cutoff_value = BOTS_CUTOFF_DEF_VALUE; +#endif + +/*********************************************************************** + * print_usage: + **********************************************************************/ +void bots_print_usage() +{ + PRINTC("\n"); + PRINTC("Usage: %s -[options]\n", bots_execname); + PRINTC("\n"); + PRINTC("Where options are:\n"); +#ifdef BOTS_APP_USES_REPETITIONS + PRINTC(" -r : Set the number of repetitions (default = 1).\n"); +#endif +#ifdef BOTS_APP_USES_ARG_SIZE + PRINTC(" -n : "BOTS_APP_DESC_ARG_SIZE"\n"); +#endif +#ifdef BOTS_APP_USES_ARG_SIZE_1 + PRINTC(" -m : "BOTS_APP_DESC_ARG_SIZE_1"\n"); +#endif +#ifdef BOTS_APP_USES_ARG_SIZE_2 + PRINTC(" -l : "BOTS_APP_DESC_ARG_SIZE_2"\n"); +#endif +#ifdef BOTS_APP_USES_ARG_FILE + PRINTC(" -f : "BOTS_APP_DESC_ARG_FILE"\n"); +#endif +#if defined(MANUAL_CUTOFF) || defined(IF_CUTOFF) || defined(FINAL_CUTOFF) + PRINTC(" -x : OpenMP tasks cut-off value (default=%d)\n",BOTS_CUTOFF_DEF_VALUE); +#endif +#ifdef BOTS_APP_USES_ARG_CUTOFF + PRINTC(" -y : "BOTS_APP_DESC_ARG_CUTOFF"(default=%d)\n", BOTS_APP_DEF_ARG_CUTOFF); +#endif +#ifdef BOTS_APP_USES_ARG_CUTOFF_1 + PRINTC(" -a : "BOTS_APP_DESC_ARG_CUTOFF_1"(default=%d)\n", BOTS_APP_DEF_ARG_CUTOFF_1); +#endif +#ifdef BOTS_APP_USES_ARG_CUTOFF_2 + PRINTC(" -b : "BOTS_APP_DESC_ARG_CUTOFF_2"(default=%d)\n", BOTS_APP_DEF_ARG_CUTOFF_2); +#endif + + PRINTC("\n"); + PRINTC(" -e : Include 'str' execution message.\n"); + PRINTC(" -v : Set verbose level (default = 1).\n"); + PRINTC(" 0 - none.\n"); + PRINTC(" 1 - default.\n"); + PRINTC(" 2 - debug.\n"); + PRINTC(" -o : Set output format mode (default = 1).\n"); + PRINTC(" 0 - no benchmark 
output.\n"); + PRINTC(" 1 - detailed list format.\n"); + PRINTC(" 2 - detailed row format.\n"); + PRINTC(" 3 - abridged list format.\n"); + PRINTC(" 4 - abridged row format.\n"); + PRINTC(" -z : Print row header (if output format is a row variant).\n"); + PRINTC("\n"); +#ifdef KERNEL_SEQ_CALL + PRINTC(" -s : Run sequential version.\n"); +#endif +#ifdef BOTS_APP_CHECK_USES_SEQ_RESULT + PRINTC(" -c : Check mode ON (implies running sequential version).\n"); +#else + PRINTC(" -c : Check mode ON.\n"); +#endif + PRINTC("\n"); + PRINTC(" -h : Print program's usage (this help).\n"); + PRINTC("\n"); +} +/*********************************************************************** + * bots_get_params_common: + **********************************************************************/ +void +bots_get_params_common(int argc, char **argv) +{ + int i; + strcpy(bots_execname, basename(argv[0])); + bots_get_date(bots_exec_date); + strcpy(bots_exec_message,""); + for (i=1; i 1 ) { + PRINTC("Error: Configure the suite using '--debug' option in order to use a verbose level greather than 1.\n"); + cos_exit(100); + } +#endif + break; +#if defined(MANUAL_CUTOFF) || defined(IF_CUTOFF) || defined(FINAL_CUTOFF) + case 'x': + argv[i][1] = '*'; + i++; + if (argc == i) { bots_print_usage(); cos_exit(100); } + bots_cutoff_value = atoi(argv[i]); + break; +#endif +#ifdef BOTS_APP_USES_ARG_CUTOFF + case 'y': + argv[i][1] = '*'; + i++; + if (argc == i) { bots_print_usage(); cos_exit(100); } + bots_app_cutoff_value = atoi(argv[i]); + break; +#endif + case 'z': + argv[i][1] = '*'; + bots_print_header = TRUE; + break; + default: + // As at the moment there are only common paramenters + // we launch an error. Otherwise we have to ignore the + // parameter and to check, after specific parameters are + // completely read, if there are unrecognized parameters. 
+ PRINTC("Error: Unrecognized parameter.\n"); + bots_print_usage(); + cos_exit (100); + } + } + else + { + // As at the moment there are only common paramenters + // we launch an error. Otherwise we have to ignore the + // parameter and to check, after specific parameters are + // completely read, if there are unrecognized parameters. + PRINTC("Error: Unrecognized parameter.\n"); + bots_print_usage(); + cos_exit (100); + } + } + + /* always verify? */ + bots_check_flag = TRUE; +} +/*********************************************************************** + * bots_get_params_common: + **********************************************************************/ +void +bots_get_params(int argc, char **argv) +{ + bots_get_params_common(argc, argv); +// bots_get_params_specific(argc, argv); +} + + +/*********************************************************************** + * bots_set_info + **********************************************************************/ +void bots_set_info () +{ + /* program specific info */ + snprintf(bots_name, BOTS_TMP_STR_SZ, BOTS_APP_NAME); + snprintf(bots_parameters, BOTS_TMP_STR_SZ, BOTS_APP_PARAMETERS_DESC BOTS_APP_PARAMETERS_LIST); + snprintf(bots_model, BOTS_TMP_STR_SZ, BOTS_MODEL_DESC); + snprintf(bots_resources, BOTS_TMP_STR_SZ, "%d", omp_get_max_threads()); + + /* compilation info (do not modify) */ + snprintf(bots_comp_date, BOTS_TMP_STR_SZ, CDATE); + snprintf(bots_comp_message, BOTS_TMP_STR_SZ, CMESSAGE); + snprintf(bots_cc, BOTS_TMP_STR_SZ, CC); + snprintf(bots_cflags, BOTS_TMP_STR_SZ, CFLAGS); + snprintf(bots_ld, BOTS_TMP_STR_SZ, LD); + snprintf(bots_ldflags, BOTS_TMP_STR_SZ, LDFLAGS); + +#if defined(MANUAL_CUTOFF) + snprintf(bots_cutoff, BOTS_TMP_STR_SZ, "manual (%d)",bots_cutoff_value); +#elif defined(IF_CUTOFF) + snprintf(bots_cutoff, BOTS_TMP_STR_SZ, "pragma-if (%d)",bots_cutoff_value); +#elif defined(FINAL_CUTOFF) + snprintf(bots_cutoff, BOTS_TMP_STR_SZ, "final (%d)",bots_cutoff_value); +#else + strcpy(bots_cutoff,"none"); +#endif 
+} + +/*********************************************************************** + * main: + **********************************************************************/ +int +main(void) +{ + /* TODO: app specific args? */ + int argc = 1; + char *app = "bots_app"; + char **argv = &app; + +#ifndef BOTS_APP_SELF_TIMING + long bots_t_start; + long bots_t_end; +#endif + + bots_get_params(argc,argv); + BOTS_APP_INIT; + bots_set_info(); + +#ifdef KERNEL_SEQ_CALL +#ifdef BOTS_APP_CHECK_USES_SEQ_RESULT + if (bots_sequential_flag || bots_check_flag) +#else + if (bots_sequential_flag) +#endif + { + bots_sequential_flag = 1; + KERNEL_SEQ_INIT; +#ifdef BOTS_APP_SELF_TIMING + bots_time_sequential = KERNEL_SEQ_CALL; +#else + bots_t_start = bots_usecs(); + KERNEL_SEQ_CALL; + bots_t_end = bots_usecs(); + bots_time_sequential = ((double)(bots_t_end-bots_t_start))/1000000; +#endif + KERNEL_SEQ_FINI; + } +#endif + + KERNEL_INIT; +#ifdef BOTS_APP_SELF_TIMING + bots_time_program = KERNEL_CALL; +#else + bots_t_start = bots_usecs(); + KERNEL_CALL; + bots_t_end = bots_usecs(); + bots_time_program = ((double)(bots_t_end-bots_t_start))/1000000; +#endif + KERNEL_FINI; + +#ifdef KERNEL_CHECK + if (bots_check_flag) { + bots_result = KERNEL_CHECK; + } +#endif + + BOTS_APP_FINI; + + bots_print_results(); + return (0); +} + diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots_main.h b/src/components/implementation/no_interface/omp_fib_bots/bots_main.h new file mode 100644 index 0000000000..8d1a9ca9a6 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/bots_main.h @@ -0,0 +1,53 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it 
and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#define BOTS_PARAM_TYPE_NONE 0 +#define BOTS_PARAM_TYPE_INT 1 +#define BOTS_PARAM_TYPE_BOOL 2 +#define BOTS_PARAM_TYPE_STR 3 + +#ifdef _OPENMP +# include +#else +# define omp_get_max_threads() 1 +# define omp_get_thread_num() 0 +# define omp_set_num_threads(x) +#endif + +void bots_print_usage(void); +void bots_print_usage_option(char opt, int type, char* description, char *val, int subc, char **subv); + +/*********************************************************************** + * BENCHMARK HEADERS + *********************************************************************/ +void bots_initialize(); +void bots_finalize(); +void bots_sequential_ini(); +long bots_sequential(); +void bots_sequential_fini(); +int bots_check_result(); +void bots_print_usage_specific(); +void bots_get_params_specific(int argc, char **argv); +void bots_set_info(); + +void bots_get_params_common(int argc, char **argv); +void bots_get_params(int argc, char **argv); + +extern void cos_exit(int x); diff --git a/src/components/implementation/no_interface/omp_fib_bots/fib.c b/src/components/implementation/no_interface/omp_fib_bots/fib.c new file mode 100644 index 0000000000..445b1b40d5 --- /dev/null +++ 
b/src/components/implementation/no_interface/omp_fib_bots/fib.c @@ -0,0 +1,235 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "bots.h" +#include "fib.h" + +#define FIB_RESULTS_PRE 41 +long long fib_results[FIB_RESULTS_PRE] = {0,1,1,2,3,5,8,13,21,34,55,89,144,233,377,610,987,1597,2584,4181,6765,10946,17711,28657,46368,75025,121393,196418,317811,514229,832040,1346269,2178309,3524578,5702887,9227465,14930352,24157817,39088169,63245986,102334155}; + +long long fib_seq (int n) +{ + int x, y; + if (n < 2) return n; + + x = fib_seq(n - 1); + y = fib_seq(n - 2); + + return x + y; +} + +#if defined(FORCE_TIED_TASKS) +#if defined(IF_CUTOFF) + +long long fib (int n,int d) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task shared(x) firstprivate(n) if(d < bots_cutoff_value) + x = fib(n - 1,d+1); + + #pragma omp task 
shared(y) firstprivate(n) if(d < bots_cutoff_value) + y = fib(n - 2,d+1); + + #pragma omp taskwait + return x + y; +} + +#elif defined(FINAL_CUTOFF) + +long long fib (int n,int d) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task shared(x) firstprivate(n) final(d+1 >= bots_cutoff_value) mergeable + x = fib(n - 1,d+1); + + #pragma omp task shared(y) firstprivate(n) final(d+1 >= bots_cutoff_value) mergeable + y = fib(n - 2,d+1); + + #pragma omp taskwait + return x + y; +} + +#elif defined(MANUAL_CUTOFF) + +long long fib (int n, int d) +{ + long long x, y; + if (n < 2) return n; + + if ( d < bots_cutoff_value ) { + #pragma omp task shared(x) firstprivate(n) + x = fib(n - 1,d+1); + + #pragma omp task shared(y) firstprivate(n) + y = fib(n - 2,d+1); + + #pragma omp taskwait + } else { + x = fib_seq(n-1); + y = fib_seq(n-2); + } + + return x + y; +} + +#else + +long long fib (int n) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task shared(x) firstprivate(n) + x = fib(n - 1); + #pragma omp task shared(y) firstprivate(n) + y = fib(n - 2); + + #pragma omp taskwait + return x + y; +} + +#endif +#else + +#if defined(IF_CUTOFF) + +long long fib (int n,int d) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task untied shared(x) firstprivate(n) if(d < bots_cutoff_value) + x = fib(n - 1,d+1); + + #pragma omp task untied shared(y) firstprivate(n) if(d < bots_cutoff_value) + y = fib(n - 2,d+1); + + #pragma omp taskwait + return x + y; +} + +#elif defined(FINAL_CUTOFF) + +long long fib (int n,int d) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task untied shared(x) firstprivate(n) final(d+1 >= bots_cutoff_value) mergeable + x = fib(n - 1,d+1); + + #pragma omp task untied shared(y) firstprivate(n) final(d+1 >= bots_cutoff_value) mergeable + y = fib(n - 2,d+1); + + #pragma omp taskwait + return x + y; +} + +#elif defined(MANUAL_CUTOFF) + +long long fib (int n, int d) +{ + long long x, y; + if (n < 2) return n; + + if ( d < 
bots_cutoff_value ) { + #pragma omp task untied shared(x) firstprivate(n) + x = fib(n - 1,d+1); + + #pragma omp task untied shared(y) firstprivate(n) + y = fib(n - 2,d+1); + + #pragma omp taskwait + } else { + x = fib_seq(n-1); + y = fib_seq(n-2); + } + + return x + y; +} + +#else + +long long fib (int n) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task untied shared(x) firstprivate(n) + x = fib(n - 1); + #pragma omp task untied shared(y) firstprivate(n) + y = fib(n - 2); + + #pragma omp taskwait + return x + y; +} + +#endif +#endif + +static long long par_res, seq_res; + +void fib0 (int n) +{ + #pragma omp parallel + #pragma omp single +#if defined(MANUAL_CUTOFF) || defined(IF_CUTOFF) || defined(FINAL_CUTOFF) + par_res = fib(n,0); +#else + par_res = fib(n); +#endif + bots_message("Fibonacci result for %d is %lld\n",n,par_res); +} + +void fib0_seq (int n) +{ + seq_res = fib_seq(n); + bots_message("Fibonacci result for %d is %lld\n",n,seq_res); +} + +long long fib_verify_value(int n) +{ + if (n < FIB_RESULTS_PRE) return fib_results[n]; + return ( fib_verify_value(n-1) + fib_verify_value(n-2)); +} + +int fib_verify (int n) +{ + int result; + + if (bots_sequential_flag) + { + if (par_res == seq_res) result = BOTS_RESULT_SUCCESSFUL; + else result = BOTS_RESULT_UNSUCCESSFUL; + } + else + { + seq_res = fib_verify_value(n); + if (par_res == seq_res) result = BOTS_RESULT_SUCCESSFUL; + else result = BOTS_RESULT_UNSUCCESSFUL; + } + + return result; +} + diff --git a/src/components/implementation/no_interface/omp_fib_bots/fib.h b/src/components/implementation/no_interface/omp_fib_bots/fib.h new file mode 100644 index 0000000000..e3d2983e7c --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/fib.h @@ -0,0 +1,40 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro 
Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ +#ifndef FIB_H +#define FIB_H +#if defined(IF_CUTOFF) +long long fib (int n,int d); +#elif defined(FINAL_CUTOFF) +long long fib (int n,int d); +#elif defined(MANUAL_CUTOFF) +long long fib (int n,int d); +#else +long long fib (int n); +#endif + +long long fib_seq (int n); + +void fib0 (int n); +void fib0_seq (int n); + +int fib_verify (int n); +long long fib_verify_value(int n); +#endif + diff --git a/src/components/implementation/no_interface/omp_fib_bots/init.c b/src/components/implementation/no_interface/omp_fib_bots/init.c new file mode 120000 index 0000000000..b2694bf833 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/init.c @@ -0,0 +1 @@ +../omp_hello/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fib_bots/omp-tasks-app.h b/src/components/implementation/no_interface/omp_fib_bots/omp-tasks-app.h new file mode 100644 index 0000000000..9cbc9282b2 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/omp-tasks-app.h @@ 
-0,0 +1,31 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include + +#define MODEL OMP-TASKS + +#ifdef FORCE_TIED_TASKS +#define BOTS_MODEL_DESC "OpenMP (using tied tasks)" +#else +#define BOTS_MODEL_DESC "OpenMP (using tasks)" +#endif + + diff --git a/src/components/implementation/no_interface/omp_fib_bots/posix_basic.c b/src/components/implementation/no_interface/omp_fib_bots/posix_basic.c new file mode 120000 index 0000000000..99b9e18548 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/posix_basic.c @@ -0,0 +1 @@ +../omp_dijkstra/posix_basic.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_hello/Makefile b/src/components/implementation/no_interface/omp_hello/Makefile new file mode 100644 index 0000000000..ba90175127 --- /dev/null +++ 
b/src/components/implementation/no_interface/omp_hello/Makefile @@ -0,0 +1,10 @@ +COMPONENT=omp_hello.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp diff --git a/src/components/implementation/no_interface/omp_hello/hello_omp.c b/src/components/implementation/no_interface/omp_hello/hello_omp.c new file mode 100644 index 0000000000..f96d49d3fc --- /dev/null +++ b/src/components/implementation/no_interface/omp_hello/hello_omp.c @@ -0,0 +1,92 @@ +#include +#include + +/******************************************************************************/ + +int main ( void ) + +/******************************************************************************/ +/* + Purpose: + + HELLO has each thread print out its ID. + + Discussion: + + HELLO is a "Hello, World" program for OpenMP. + + Licensing: + + This code is distributed under the GNU LGPL license. + + Modified: + + 23 June 2010 + + Author: + + John Burkardt +*/ +{ + int id; + double wtime; + + PRINTC ( "\n" ); + PRINTC ( "HELLO_OPENMP\n" ); + PRINTC ( " C/OpenMP version\n" ); + + PRINTC ( "\n" ); + PRINTC ( " Number of processors available = %d\n", omp_get_num_procs ( ) ); + PRINTC ( " Number of threads = %d\n", omp_get_max_threads ( ) ); + + wtime = omp_get_wtime ( ); + + PRINTC ( "\n" ); + PRINTC ( " OUTSIDE the parallel region.\n" ); + PRINTC ( "\n" ); + + id = omp_get_thread_num ( ); + PRINTC ( " HELLO from process %d\n", id ) ; + + PRINTC ( "\n" ); + PRINTC ( " Going INSIDE the parallel region:\n" ); + PRINTC ( "\n" ); +/* + INSIDE THE PARALLEL REGION, have each thread say hello. 
+*/ +#if 1 +#pragma omp parallel private(id) + { +#pragma omp for + for (id = 0; id < 10; id++) + { + PRINTC("id:%u\n", id); + } + } +#else +# pragma omp parallel\ + private ( id ) + { + id = omp_get_thread_num ( ); + PRINTC (" Hello from process %d\n", id ); + } +#endif +/* + Finish up by measuring the elapsed time. +*/ + wtime = omp_get_wtime ( ) - wtime; + + PRINTC ( "\n" ); + PRINTC ( " Back OUTSIDE the parallel region.\n" ); +/* + Terminate. +*/ + PRINTC ( "\n" ); + PRINTC ( "HELLO_OPENMP\n" ); + PRINTC ( " Normal end of execution.\n" ); + + PRINTC ( "\n" ); + PRINTC ( " Elapsed wall clock time = %f\n", wtime ); + + return 0; +} diff --git a/src/components/implementation/no_interface/omp_hello/init.c b/src/components/implementation/no_interface/omp_hello/init.c new file mode 100644 index 0000000000..ddba532393 --- /dev/null +++ b/src/components/implementation/no_interface/omp_hello/init.c @@ -0,0 +1,78 @@ +#include +#include +#include +#include +#include +#include + +int main(void); + +void +cos_exit(int x) +{ + PRINTC("Exit code: %d\n", x); + while (1) ; +} + +static void +cos_main(void *d) +{ + assert(sl_thd_thdid(sl_thd_curr()) == cos_thdid()); + main(); + + while (1) ; +} + +extern void cos_gomp_init(void); + +void +cos_init(void *d) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + int i; + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + static unsigned b1 = 0, b2 = 0, b3 = 0; + + PRINTC("In an OpenMP program!\n"); + if (ps_cas(&first, NUM_CPU + 1, cos_cpuid())) { + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_llinit(); + } else { + while (!ps_load(&init_done[first])) ; + + cos_defcompinfo_sched_init(); + } + cos_dcb_info_init_curr(); + ps_faa(&init_done[cos_cpuid()], 1); + + /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! 
*/ + for (i = 0; i < NUM_CPU; i++) { + while (!ps_load(&init_done[i])) ; + } + sl_init(SL_MIN_PERIOD_US*100); + /* barrier, wait for sl_init to be done on all cores */ + ps_faa(&b1, 1); + while (ps_load(&b1) != NUM_CPU) ; + cos_gomp_init(); + /* barrier, wait for gomp_init to be done on all cores */ + ps_faa(&b2, 1); + while (ps_load(&b2) != NUM_CPU) ; + + if (!cos_cpuid()) { + struct sl_thd *t = NULL; + + t = sl_thd_alloc(cos_main, NULL); + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); + } + /* wait for all cores to reach this point, so all threads wait for main thread to be ready! */ + ps_faa(&b3, 1); + while (ps_load(&b3) != NUM_CPU) ; + + sl_sched_loop_nonblock(); + + PRINTC("Should never get here!\n"); + assert(0); +} + diff --git a/src/components/implementation/no_interface/omp_sort_bots/Makefile b/src/components/implementation/no_interface/omp_sort_bots/Makefile new file mode 100644 index 0000000000..05d43d1f94 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/Makefile @@ -0,0 +1,19 @@ +COMPONENT=omp_sort_bots.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp +# if tied tasks are required +CFLAGS += -DFORCE_TIED_TASKS + +OMPC_FINAL_FLAGS= + +# one per compilation or none +#CFLAGS += -DMANUAL_CUTOFF +#CFLAGS += -DIF_CUTOFF +#CFLAGS += -DFINAL_CUTOFF $(OMPC_FINAL_FLAGS) diff --git a/src/components/implementation/no_interface/omp_sort_bots/app-desc.h b/src/components/implementation/no_interface/omp_sort_bots/app-desc.h new file mode 100644 index 0000000000..85e6e47782 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/app-desc.h @@ -0,0 +1,66 @@ 
+/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "omp-tasks-app.h" + +#define BOTS_APP_NAME "Sort" +#define BOTS_APP_PARAMETERS_DESC "N=%d:Q=%d:I=%d:M=%d" +#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size,bots_app_cutoff_value_1,bots_app_cutoff_value_2,bots_app_cutoff_value + +#define BOTS_APP_USES_ARG_SIZE +#define BOTS_APP_DEF_ARG_SIZE (32*1024*1024) +#define BOTS_APP_DESC_ARG_SIZE "Array size" + +#define BOTS_APP_USES_ARG_CUTOFF +#define BOTS_APP_DEF_ARG_CUTOFF (2*1024) +#define BOTS_APP_DESC_ARG_CUTOFF "Sequential Merge cutoff value" + +#define BOTS_APP_USES_ARG_CUTOFF_1 +#define BOTS_APP_DEF_ARG_CUTOFF_1 (2*1024) +#define BOTS_APP_DESC_ARG_CUTOFF_1 "Sequential Quicksort cutoff value" + +#define BOTS_APP_USES_ARG_CUTOFF_2 +#define BOTS_APP_DEF_ARG_CUTOFF_2 (20) +#define BOTS_APP_DESC_ARG_CUTOFF_2 "Sequential Insertion cutoff 
value" + +typedef long ELM; + +void seqquick(ELM *low, ELM *high); +void seqmerge(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest); +ELM *binsplit(ELM val, ELM *low, ELM *high); +void cilkmerge(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest); +void cilkmerge_par(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest); +void cilksort(ELM *low, ELM *tmp, long size); +void cilksort_par(ELM *low, ELM *tmp, long size); +void scramble_array( ELM *array ); +void fill_array( ELM *array ); +void sort ( void ); + +void sort_par (void); +void sort_init (void); +int sort_verify (void); + +#define BOTS_APP_INIT sort_init() + +#define KERNEL_INIT +#define KERNEL_CALL sort_par() +#define KERNEL_CHECK sort_verify() + + diff --git a/src/components/implementation/no_interface/omp_sort_bots/bots.h b/src/components/implementation/no_interface/omp_sort_bots/bots.h new file mode 120000 index 0000000000..ea0ad2b59f --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/bots.h @@ -0,0 +1 @@ +../omp_fib_bots/bots.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/bots_common.c b/src/components/implementation/no_interface/omp_sort_bots/bots_common.c new file mode 120000 index 0000000000..4802b0cf70 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/bots_common.c @@ -0,0 +1 @@ +../omp_fib_bots/bots_common.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/bots_common.h b/src/components/implementation/no_interface/omp_sort_bots/bots_common.h new file mode 120000 index 0000000000..14eda863e4 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/bots_common.h @@ -0,0 +1 @@ +../omp_fib_bots/bots_common.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/bots_main.c b/src/components/implementation/no_interface/omp_sort_bots/bots_main.c new file mode 120000 
index 0000000000..14f2dab009 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/bots_main.c @@ -0,0 +1 @@ +../omp_fib_bots/bots_main.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/bots_main.h b/src/components/implementation/no_interface/omp_sort_bots/bots_main.h new file mode 120000 index 0000000000..86c06ad286 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/bots_main.h @@ -0,0 +1 @@ +../omp_fib_bots/bots_main.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/init.c b/src/components/implementation/no_interface/omp_sort_bots/init.c new file mode 120000 index 0000000000..9e09b82e77 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/init.c @@ -0,0 +1 @@ +../omp_fib_bots/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/omp-tasks-app.h b/src/components/implementation/no_interface/omp_sort_bots/omp-tasks-app.h new file mode 120000 index 0000000000..9fba574408 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/omp-tasks-app.h @@ -0,0 +1 @@ +../omp_fib_bots/omp-tasks-app.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/posix_basic.c b/src/components/implementation/no_interface/omp_sort_bots/posix_basic.c new file mode 120000 index 0000000000..9afee078fb --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/posix_basic.c @@ -0,0 +1 @@ +../omp_fib_bots/posix_basic.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/sort.c b/src/components/implementation/no_interface/omp_sort_bots/sort.c new file mode 100644 index 0000000000..d8140970d6 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/sort.c @@ -0,0 +1,517 @@ 
+/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +/* + * Original code from the Cilk project + * + * Copyright (c) 2000 Massachusetts Institute of Technology + * Copyright (c) 2000 Matteo Frigo + */ + +/* + * this program uses an algorithm that we call `cilksort'. + * The algorithm is essentially mergesort: + * + * cilksort(in[1..n]) = + * spawn cilksort(in[1..n/2], tmp[1..n/2]) + * spawn cilksort(in[n/2..n], tmp[n/2..n]) + * sync + * spawn cilkmerge(tmp[1..n/2], tmp[n/2..n], in[1..n]) + * + * + * The procedure cilkmerge does the following: + * + * cilkmerge(A[1..n], B[1..m], C[1..(n+m)]) = + * find the median of A \union B using binary + * search. 
The binary search gives a pair + * (ma, mb) such that ma + mb = (n + m)/2 + * and all elements in A[1..ma] are smaller than + * B[mb..m], and all the B[1..mb] are smaller + * than all elements in A[ma..n]. + * + * spawn cilkmerge(A[1..ma], B[1..mb], C[1..(n+m)/2]) + * spawn cilkmerge(A[ma..m], B[mb..n], C[(n+m)/2 .. (n+m)]) + * sync + * + * The algorithm appears for the first time (AFAIK) in S. G. Akl and + * N. Santoro, "Optimal Parallel Merging and Sorting Without Memory + * Conflicts", IEEE Trans. Comp., Vol. C-36 No. 11, Nov. 1987 . The + * paper does not express the algorithm using recursion, but the + * idea of finding the median is there. + * + * For cilksort of n elements, T_1 = O(n log n) and + * T_\infty = O(log^3 n). There is a way to shave a + * log factor in the critical path (left as homework). + */ + +#include +#include +#include +#include "bots.h" +#include "app-desc.h" + +ELM *array, *tmp; + +static unsigned long rand_nxt = 0; + +static inline unsigned long my_rand(void) +{ + rand_nxt = rand_nxt * 1103515245 + 12345; + return rand_nxt; +} + +static inline void my_srand(unsigned long seed) +{ + rand_nxt = seed; +} + +static inline ELM med3(ELM a, ELM b, ELM c) +{ + if (a < b) { + if (b < c) { + return b; + } else { + if (a < c) + return c; + else + return a; + } + } else { + if (b > c) { + return b; + } else { + if (a > c) + return c; + else + return a; + } + } +} + +/* + * simple approach for now; a better median-finding + * may be preferable + */ +static inline ELM choose_pivot(ELM *low, ELM *high) +{ + return med3(*low, *high, low[(high - low) / 2]); +} + +static ELM *seqpart(ELM *low, ELM *high) +{ + ELM pivot; + ELM h, l; + ELM *curr_low = low; + ELM *curr_high = high; + + pivot = choose_pivot(low, high); + + while (1) { + while ((h = *curr_high) > pivot) + curr_high--; + + while ((l = *curr_low) < pivot) + curr_low++; + + if (curr_low >= curr_high) + break; + + *curr_high-- = l; + *curr_low++ = h; + } + + /* + * I don't know if this is really 
necessary. + * The problem is that the pivot is not always the + * first element, and the partition may be trivial. + * However, if the partition is trivial, then + * *high is the largest element, whence the following + * code. + */ + if (curr_high < high) + return curr_high; + else + return curr_high - 1; +} + +#define swap(a, b) \ +{ \ + ELM tmp;\ + tmp = a;\ + a = b;\ + b = tmp;\ +} + +static void insertion_sort(ELM *low, ELM *high) +{ + ELM *p, *q; + ELM a, b; + + for (q = low + 1; q <= high; ++q) { + a = q[0]; + for (p = q - 1; p >= low && (b = p[0]) > a; p--) + p[1] = b; + p[1] = a; + } +} + +/* + * tail-recursive quicksort, almost unrecognizable :-) + */ +void seqquick(ELM *low, ELM *high) +{ + ELM *p; + + while (high - low >= bots_app_cutoff_value_2) { + p = seqpart(low, high); + seqquick(low, p); + low = p + 1; + } + + insertion_sort(low, high); +} + +void seqmerge(ELM *low1, ELM *high1, ELM *low2, ELM *high2, + ELM *lowdest) +{ + ELM a1, a2; + + /* + * The following 'if' statement is not necessary + * for the correctness of the algorithm, and is + * in fact subsumed by the rest of the function. + * However, it is a few percent faster. Here is why. + * + * The merging loop below has something like + * if (a1 < a2) { + * *dest++ = a1; + * ++low1; + * if (end of array) break; + * a1 = *low1; + * } + * + * Now, a1 is needed immediately in the next iteration + * and there is no way to mask the latency of the load. + * A better approach is to load a1 *before* the end-of-array + * check; the problem is that we may be speculatively + * loading an element out of range. While this is + * probably not a problem in practice, yet I don't feel + * comfortable with an incorrect algorithm. Therefore, + * I use the 'fast' loop on the array (except for the last + * element) and the 'slow' loop for the rest, saving both + * performance and correctness. 
+ */ + + if (low1 < high1 && low2 < high2) { + a1 = *low1; + a2 = *low2; + for (;;) { + if (a1 < a2) { + *lowdest++ = a1; + a1 = *++low1; + if (low1 >= high1) + break; + } else { + *lowdest++ = a2; + a2 = *++low2; + if (low2 >= high2) + break; + } + } + } + if (low1 <= high1 && low2 <= high2) { + a1 = *low1; + a2 = *low2; + for (;;) { + if (a1 < a2) { + *lowdest++ = a1; + ++low1; + if (low1 > high1) + break; + a1 = *low1; + } else { + *lowdest++ = a2; + ++low2; + if (low2 > high2) + break; + a2 = *low2; + } + } + } + if (low1 > high1) { + memcpy(lowdest, low2, sizeof(ELM) * (high2 - low2 + 1)); + } else { + memcpy(lowdest, low1, sizeof(ELM) * (high1 - low1 + 1)); + } +} + +#define swap_indices(a, b) \ +{ \ + ELM *tmp;\ + tmp = a;\ + a = b;\ + b = tmp;\ +} + +ELM *binsplit(ELM val, ELM *low, ELM *high) +{ + /* + * returns index which contains greatest element <= val. If val is + * less than all elements, returns low-1 + */ + ELM *mid; + + while (low != high) { + mid = low + ((high - low + 1) >> 1); + if (val <= *mid) + high = mid - 1; + else + low = mid; + } + + if (*low > val) + return low - 1; + else + return low; +} + + +void cilkmerge_par(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest) +{ + /* + * Cilkmerge: Merges range [low1, high1] with range [low2, high2] + * into the range [lowdest, ...] + */ + + ELM *split1, *split2; /* + * where each of the ranges are broken for + * recursive merge + */ + long int lowsize; /* + * total size of lower halves of two + * ranges - 2 + */ + + /* + * We want to take the middle element (indexed by split1) from the + * larger of the two arrays. The following code assumes that split1 + * is taken from range [low1, high1]. 
So if [low1, high1] is + * actually the smaller range, we should swap it with [low2, high2] + */ + + if (high2 - low2 > high1 - low1) { + swap_indices(low1, low2); + swap_indices(high1, high2); + } + if (high2 < low2) { + /* smaller range is empty */ + memcpy(lowdest, low1, sizeof(ELM) * (high1 - low1)); + return; + } + if (high2 - low2 < bots_app_cutoff_value ) { + seqmerge(low1, high1, low2, high2, lowdest); + return; + } + /* + * Basic approach: Find the middle element of one range (indexed by + * split1). Find where this element would fit in the other range + * (indexed by split 2). Then merge the two lower halves and the two + * upper halves. + */ + + split1 = ((high1 - low1 + 1) / 2) + low1; + split2 = binsplit(*split1, low2, high2); + lowsize = split1 - low1 + split2 - low2; + + /* + * directly put the splitting element into + * the appropriate location + */ + *(lowdest + lowsize + 1) = *split1; +#if defined(FORCE_TIED_TASKS) +#pragma omp task + cilkmerge_par(low1, split1 - 1, low2, split2, lowdest); +#pragma omp task + cilkmerge_par(split1 + 1, high1, split2 + 1, high2, + lowdest + lowsize + 2); +#else +#pragma omp task untied + cilkmerge_par(low1, split1 - 1, low2, split2, lowdest); +#pragma omp task untied + cilkmerge_par(split1 + 1, high1, split2 + 1, high2, + lowdest + lowsize + 2); +#endif +#pragma omp taskwait + + return; +} + +void cilksort_par(ELM *low, ELM *tmp, long size) +{ + /* + * divide the input in four parts of the same size (A, B, C, D) + * Then: + * 1) recursively sort A, B, C, and D (in parallel) + * 2) merge A and B into tmp1, and C and D into tmp2 (in parallel) + * 3) merge tmp1 and tmp2 into the original array + */ + long quarter = size / 4; + ELM *A, *B, *C, *D, *tmpA, *tmpB, *tmpC, *tmpD; + + if (size < bots_app_cutoff_value_1 ) { + /* quicksort when less than 1024 elements */ + seqquick(low, low + size - 1); + return; + } + A = low; + tmpA = tmp; + B = A + quarter; + tmpB = tmpA + quarter; + C = B + quarter; + tmpC = tmpB + quarter; 
+ D = C + quarter; + tmpD = tmpC + quarter; + +#if defined(FORCE_TIED_TASKS) +#pragma omp task + cilksort_par(A, tmpA, quarter); +#pragma omp task + cilksort_par(B, tmpB, quarter); +#pragma omp task + cilksort_par(C, tmpC, quarter); +#pragma omp task + cilksort_par(D, tmpD, size - 3 * quarter); +#else +#pragma omp task untied + cilksort_par(A, tmpA, quarter); +#pragma omp task untied + cilksort_par(B, tmpB, quarter); +#pragma omp task untied + cilksort_par(C, tmpC, quarter); +#pragma omp task untied + cilksort_par(D, tmpD, size - 3 * quarter); +#endif +#pragma omp taskwait + +#if defined(FORCE_TIED_TASKS) +#pragma omp task + cilkmerge_par(A, A + quarter - 1, B, B + quarter - 1, tmpA); +#pragma omp task + cilkmerge_par(C, C + quarter - 1, D, low + size - 1, tmpC); +#else +#pragma omp task untied + cilkmerge_par(A, A + quarter - 1, B, B + quarter - 1, tmpA); +#pragma omp task untied + cilkmerge_par(C, C + quarter - 1, D, low + size - 1, tmpC); +#endif +#pragma omp taskwait + + cilkmerge_par(tmpA, tmpC - 1, tmpC, tmpA + size - 1, A); +} + +void scramble_array( ELM *array ) +{ + unsigned long i; + unsigned long j; + + for (i = 0; i < bots_arg_size; ++i) { + j = my_rand(); + j = j % bots_arg_size; + swap(array[i], array[j]); + } +} + +void fill_array( ELM *array ) +{ + unsigned long i; + + my_srand(1); + /* first, fill with integers 1..size */ + for (i = 0; i < bots_arg_size; ++i) { + array[i] = i; + } +} + +void sort_init ( void ) +{ + /* Checking arguments */ + if (bots_arg_size < 4) { + bots_message("%s can not be less than 4, using 4 as a parameter.\n", BOTS_APP_DESC_ARG_SIZE ); + bots_arg_size = 4; + } + + if (bots_app_cutoff_value < 2) { + bots_message("%s can not be less than 2, using 2 as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF); + bots_app_cutoff_value = 2; + } + else if (bots_app_cutoff_value > bots_arg_size ) { + bots_message("%s can not be greather than vector size, using %d as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF, bots_arg_size); + 
bots_app_cutoff_value = bots_arg_size; + } + + if (bots_app_cutoff_value_1 > bots_arg_size ) { + bots_message("%s can not be greather than vector size, using %d as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF_1, bots_arg_size); + bots_app_cutoff_value_1 = bots_arg_size; + } + if (bots_app_cutoff_value_2 > bots_arg_size ) { + bots_message("%s can not be greather than vector size, using %d as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF_2, bots_arg_size); + bots_app_cutoff_value_2 = bots_arg_size; + } + + if (bots_app_cutoff_value_2 > bots_app_cutoff_value_1) { + bots_message("%s can not be greather than %s, using %d as a parameter.\n", + BOTS_APP_DESC_ARG_CUTOFF_2, + BOTS_APP_DESC_ARG_CUTOFF_1, + bots_app_cutoff_value_1 + ); + bots_app_cutoff_value_2 = bots_app_cutoff_value_1; + } + + array = (ELM *) malloc(bots_arg_size * sizeof(ELM)); + tmp = (ELM *) malloc(bots_arg_size * sizeof(ELM)); + fill_array(array); + scramble_array(array); +} + +void sort_par ( void ) +{ + bots_message("Computing multisort algorithm (n=%d) ", bots_arg_size); + #pragma omp parallel + #pragma omp single nowait +#if defined(FORCE_TIED_TASKS) + #pragma omp task + cilksort_par(array, tmp, bots_arg_size); +#else + #pragma omp task untied + cilksort_par(array, tmp, bots_arg_size); +#endif + bots_message(" completed!\n"); +} + +int sort_verify ( void ) +{ + int i, success = 1; + for (i = 0; i < bots_arg_size; ++i) + if (array[i] != i) + success = 0; + + return success ? 
BOTS_RESULT_SUCCESSFUL : BOTS_RESULT_UNSUCCESSFUL; +} + diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/Makefile b/src/components/implementation/no_interface/omp_sparselu_for_bots/Makefile new file mode 100644 index 0000000000..901901a2cb --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/Makefile @@ -0,0 +1,19 @@ +COMPONENT=omp_sparselu_for_bots.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp +# if tied tasks are required +CFLAGS += -DFORCE_TIED_TASKS + +OMPC_FINAL_FLAGS= + +# one per compilation or none +#CFLAGS += -DMANUAL_CUTOFF +#CFLAGS += -DIF_CUTOFF +#CFLAGS += -DFINAL_CUTOFF $(OMPC_FINAL_FLAGS) diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/app-desc.h b/src/components/implementation/no_interface/omp_sparselu_for_bots/app-desc.h new file mode 100644 index 0000000000..50e655cf0b --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/app-desc.h @@ -0,0 +1,56 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. 
*/ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "omp-tasks-app.h" + +#define BOTS_APP_NAME "SparseLU (For version)" +#define BOTS_APP_PARAMETERS_DESC "S1=%dx%d, S2=%dx%d" +#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size,bots_arg_size,bots_arg_size_1,bots_arg_size_1 + +#define BOTS_APP_USES_ARG_SIZE +#define BOTS_APP_DEF_ARG_SIZE 50 +#define BOTS_APP_DESC_ARG_SIZE "Matrix Size" + +#define BOTS_APP_USES_ARG_SIZE_1 +#define BOTS_APP_DEF_ARG_SIZE_1 100 +#define BOTS_APP_DESC_ARG_SIZE_1 "Submatrix Size" + +#define BOTS_APP_INIT float **SEQ,**BENCH; + +void sparselu_init(float ***pM, char *pass); +void sparselu_fini(float **M, char *pass); +void sparselu_seq_call(float **SEQ); +void sparselu_par_call(float **BENCH); +int sparselu_check(float **SEQ, float **BENCH); + +#define KERNEL_INIT sparselu_init(&BENCH,"benchmark"); +#define KERNEL_CALL sparselu_par_call(BENCH); +#define KERNEL_FINI sparselu_fini(BENCH,"benchmark"); + +#define KERNEL_SEQ_INIT sparselu_init(&SEQ,"serial"); +#define KERNEL_SEQ_CALL sparselu_seq_call(SEQ); +#define KERNEL_SEQ_FINI sparselu_fini(SEQ,"serial"); + +/* + * Phani: start without sequencial test + */ +#undef BOTS_APP_CHECK_USES_SEQ_RESULT +#define KERNEL_CHECK sparselu_check(SEQ,BENCH); + diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/bots.h b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots.h new file mode 120000 index 
0000000000..828039f356 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots.h @@ -0,0 +1 @@ +../omp_sparselu_single_bots/bots.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.c b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.c new file mode 120000 index 0000000000..8517c18eeb --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.c @@ -0,0 +1 @@ +../omp_sparselu_single_bots/bots_common.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.h b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.h new file mode 120000 index 0000000000..7eb55ec523 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.h @@ -0,0 +1 @@ +../omp_sparselu_single_bots/bots_common.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.c b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.c new file mode 120000 index 0000000000..29ad202b50 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.c @@ -0,0 +1 @@ +../omp_sparselu_single_bots/bots_main.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.h b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.h new file mode 120000 index 0000000000..2d1387edd5 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.h @@ -0,0 +1 @@ +../omp_sparselu_single_bots/bots_main.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/init.c b/src/components/implementation/no_interface/omp_sparselu_for_bots/init.c new file mode 120000 index 
0000000000..a7a03a9e37 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/init.c @@ -0,0 +1 @@ +../omp_sparselu_single_bots/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/omp-tasks-app.h b/src/components/implementation/no_interface/omp_sparselu_for_bots/omp-tasks-app.h new file mode 120000 index 0000000000..1c1cf79526 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/omp-tasks-app.h @@ -0,0 +1 @@ +../omp_sparselu_single_bots/omp-tasks-app.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/posix_basic.c b/src/components/implementation/no_interface/omp_sparselu_for_bots/posix_basic.c new file mode 120000 index 0000000000..0b1896b27e --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/posix_basic.c @@ -0,0 +1 @@ +../omp_sparselu_single_bots/posix_basic.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/sparselu.c b/src/components/implementation/no_interface/omp_sparselu_for_bots/sparselu.c new file mode 100644 index 0000000000..b441389dc9 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/sparselu.c @@ -0,0 +1,326 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. 
*/ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include "bots.h" +#include "sparselu.h" + +/*********************************************************************** + * checkmat: + **********************************************************************/ +int checkmat (float *M, float *N) +{ + int i, j; + float r_err; + + for (i = 0; i < bots_arg_size_1; i++) + { + for (j = 0; j < bots_arg_size_1; j++) + { + r_err = M[i*bots_arg_size_1+j] - N[i*bots_arg_size_1+j]; + if ( r_err == 0.0 ) continue; + + if (r_err < 0.0 ) r_err = -r_err; + + if ( M[i*bots_arg_size_1+j] == 0 ) + { + bots_message("Checking failure: A[%d][%d]=%f B[%d][%d]=%f; \n", + i,j, M[i*bots_arg_size_1+j], i,j, N[i*bots_arg_size_1+j]); + return FALSE; + } + r_err = r_err / M[i*bots_arg_size_1+j]; + if(r_err > EPSILON) + { + bots_message("Checking failure: A[%d][%d]=%f B[%d][%d]=%f; Relative Error=%f\n", + i,j, M[i*bots_arg_size_1+j], i,j, N[i*bots_arg_size_1+j], r_err); + return FALSE; + } + } + } + return TRUE; +} +/*********************************************************************** + * genmat: + **********************************************************************/ +void genmat (float *M[]) +{ + int null_entry, init_val, i, j, ii, jj; + float *p; + int a=0,b=0; + + init_val = 1325; + + /* generating the structure */ + for (ii=0; ii < bots_arg_size; ii++) + { + for (jj=0; jj < bots_arg_size; 
jj++) + { + /* computing null entries */ + null_entry=FALSE; + if ((iijj) && (jj%3 !=0)) null_entry = TRUE; + if (ii%2==1) null_entry = TRUE; + if (jj%2==1) null_entry = TRUE; + if (ii==jj) null_entry = FALSE; + if (ii==jj-1) null_entry = FALSE; + if (ii-1 == jj) null_entry = FALSE; + /* allocating matrix */ + if (null_entry == FALSE){ + a++; + M[ii*bots_arg_size+jj] = (float *) malloc(bots_arg_size_1*bots_arg_size_1*sizeof(float)); + if ((M[ii*bots_arg_size+jj] == NULL)) + { + bots_message("Error: Out of memory\n"); + exit(101); + } + /* initializing matrix */ + p = M[ii*bots_arg_size+jj]; + for (i = 0; i < bots_arg_size_1; i++) + { + for (j = 0; j < bots_arg_size_1; j++) + { + init_val = (3125 * init_val) % 65536; + (*p) = (float)((init_val - 32768.0) / 16384.0); + p++; + } + } + } + else + { + b++; + M[ii*bots_arg_size+jj] = NULL; + } + } + } + bots_debug("allo = %d, no = %d, total = %d, factor = %f\n",a,b,a+b,(float)((float)a/(float)(a+b))); +} +/*********************************************************************** + * print_structure: + **********************************************************************/ +void print_structure(char *name, float *M[]) +{ + int ii, jj; + bots_message("Structure for matrix %s @ 0x%p\n",name, M); + for (ii = 0; ii < bots_arg_size; ii++) { + for (jj = 0; jj < bots_arg_size; jj++) { + if (M[ii*bots_arg_size+jj]!=NULL) {bots_message("x");} + else bots_message(" "); + } + bots_message("\n"); + } + bots_message("\n"); +} +/*********************************************************************** + * allocate_clean_block: + **********************************************************************/ +float * allocate_clean_block() +{ + int i,j; + float *p, *q; + + p = (float *) malloc(bots_arg_size_1*bots_arg_size_1*sizeof(float)); + q=p; + if (p!=NULL){ + for (i = 0; i < bots_arg_size_1; i++) + for (j = 0; j < bots_arg_size_1; j++){(*p)=0.0; p++;} + + } + else + { + bots_message("Error: Out of memory\n"); + exit (101); + } + return (q); 
+} + +/*********************************************************************** + * lu0: + **********************************************************************/ +void lu0(float *diag) +{ + int i, j, k; + + for (k=0; k +#include +#include +#include +#include +#include +#include "bots.h" +#include "sparselu.h" + +/*********************************************************************** + * checkmat: + **********************************************************************/ +int checkmat (float *M, float *N) +{ + int i, j; + float r_err; + + for (i = 0; i < bots_arg_size_1; i++) + { + for (j = 0; j < bots_arg_size_1; j++) + { + r_err = M[i*bots_arg_size_1+j] - N[i*bots_arg_size_1+j]; + if ( r_err == 0.0 ) continue; + + if (r_err < 0.0 ) r_err = -r_err; + + if ( M[i*bots_arg_size_1+j] == 0 ) + { + bots_message("Checking failure: A[%d][%d]=%f B[%d][%d]=%f; \n", + i,j, M[i*bots_arg_size_1+j], i,j, N[i*bots_arg_size_1+j]); + return FALSE; + } + r_err = r_err / M[i*bots_arg_size_1+j]; + if(r_err > EPSILON) + { + bots_message("Checking failure: A[%d][%d]=%f B[%d][%d]=%f; Relative Error=%f\n", + i,j, M[i*bots_arg_size_1+j], i,j, N[i*bots_arg_size_1+j], r_err); + return FALSE; + } + } + } + return TRUE; +} +/*********************************************************************** + * genmat: + **********************************************************************/ +void genmat (float *M[]) +{ + int null_entry, init_val, i, j, ii, jj; + float *p; + + init_val = 1325; + + /* generating the structure */ + for (ii=0; ii < bots_arg_size; ii++) + { + for (jj=0; jj < bots_arg_size; jj++) + { + /* computing null entries */ + null_entry=FALSE; + if ((iijj) && (jj%3 !=0)) null_entry = TRUE; + if (ii%2==1) null_entry = TRUE; + if (jj%2==1) null_entry = TRUE; + if (ii==jj) null_entry = FALSE; + if (ii==jj-1) null_entry = FALSE; + if (ii-1 == jj) null_entry = FALSE; + /* allocating matrix */ + if (null_entry == FALSE){ + M[ii*bots_arg_size+jj] = (float *) 
malloc(bots_arg_size_1*bots_arg_size_1*sizeof(float)); + if ((M[ii*bots_arg_size+jj] == NULL)) + { + bots_message("Error: Out of memory\n"); + exit(101); + } + /* initializing matrix */ + p = M[ii*bots_arg_size+jj]; + for (i = 0; i < bots_arg_size_1; i++) + { + for (j = 0; j < bots_arg_size_1; j++) + { + init_val = (3125 * init_val) % 65536; + (*p) = (float)((init_val - 32768.0) / 16384.0); + p++; + } + } + } + else + { + M[ii*bots_arg_size+jj] = NULL; + } + } + } +} +/*********************************************************************** + * print_structure: + **********************************************************************/ +void print_structure(char *name, float *M[]) +{ + int ii, jj; + bots_message("Structure for matrix %s @ 0x%p\n",name, M); + for (ii = 0; ii < bots_arg_size; ii++) { + for (jj = 0; jj < bots_arg_size; jj++) { + if (M[ii*bots_arg_size+jj]!=NULL) {bots_message("x");} + else bots_message(" "); + } + bots_message("\n"); + } + bots_message("\n"); +} +/*********************************************************************** + * allocate_clean_block: + **********************************************************************/ +float * allocate_clean_block() +{ + int i,j; + float *p, *q; + + p = (float *) malloc(bots_arg_size_1*bots_arg_size_1*sizeof(float)); + q=p; + if (p!=NULL){ + for (i = 0; i < bots_arg_size_1; i++) + for (j = 0; j < bots_arg_size_1; j++){(*p)=0.0; p++;} + + } + else + { + bots_message("Error: Out of memory\n"); + exit (101); + } + return (q); +} + +/*********************************************************************** + * lu0: + **********************************************************************/ +void lu0(float *diag) +{ + int i, j, k; + + for (k=0; k +#include +#include +#include "app-desc.h" +#include "bots.h" +#include "strassen.h" + +/*********************************************************************** + * Naive sequential algorithm, for comparison purposes + 
**********************************************************************/ +void matrixmul(int n, REAL *A, int an, REAL *B, int bn, REAL *C, int cn) +{ + int i, j, k; + REAL s; + + for (i = 0; i < n; ++i) + { + for (j = 0; j < n; ++j) + { + s = 0.0; + for (k = 0; k < n; ++k) s += ELEM(A, an, i, k) * ELEM(B, bn, k, j); + ELEM(C, cn, i, j) = s; + } + } +} +/***************************************************************************** +** +** FastNaiveMatrixMultiply +** +** For small to medium sized matrices A, B, and C of size +** MatrixSize * MatrixSize this function performs the operation +** C = A x B efficiently. +** +** Note MatrixSize must be divisible by 8. +** +** INPUT: +** C = (*C WRITE) Address of top left element of matrix C. +** A = (*A IS READ ONLY) Address of top left element of matrix A. +** B = (*B IS READ ONLY) Address of top left element of matrix B. +** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) +** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] +** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] +** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] +** +** OUTPUT: +** C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.) 
+** +*****************************************************************************/ +void FastNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB) +{ + /* Assumes size of real is 8 bytes */ + PTR RowWidthBInBytes = RowWidthB << 3; + PTR RowWidthAInBytes = RowWidthA << 3; + PTR MatrixWidthInBytes = MatrixSize << 3; + PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3; + unsigned Horizontal, Vertical; + + REAL *ARowStart = A; + for (Vertical = 0; Vertical < MatrixSize; Vertical++) { + for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) { + REAL *BColumnStart = B + Horizontal; + REAL FirstARowValue = *ARowStart++; + + REAL Sum0 = FirstARowValue * (*BColumnStart); + REAL Sum1 = FirstARowValue * (*(BColumnStart+1)); + REAL Sum2 = FirstARowValue * (*(BColumnStart+2)); + REAL Sum3 = FirstARowValue * (*(BColumnStart+3)); + REAL Sum4 = FirstARowValue * (*(BColumnStart+4)); + REAL Sum5 = FirstARowValue * (*(BColumnStart+5)); + REAL Sum6 = FirstARowValue * (*(BColumnStart+6)); + REAL Sum7 = FirstARowValue * (*(BColumnStart+7)); + + unsigned Products; + for (Products = 1; Products < MatrixSize; Products++) { + REAL ARowValue = *ARowStart++; + BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes); + Sum0 += ARowValue * (*BColumnStart); + Sum1 += ARowValue * (*(BColumnStart+1)); + Sum2 += ARowValue * (*(BColumnStart+2)); + Sum3 += ARowValue * (*(BColumnStart+3)); + Sum4 += ARowValue * (*(BColumnStart+4)); + Sum5 += ARowValue * (*(BColumnStart+5)); + Sum6 += ARowValue * (*(BColumnStart+6)); + Sum7 += ARowValue * (*(BColumnStart+7)); + } + ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes); + + *(C) = Sum0; + *(C+1) = Sum1; + *(C+2) = Sum2; + *(C+3) = Sum3; + *(C+4) = Sum4; + *(C+5) = Sum5; + *(C+6) = Sum6; + *(C+7) = Sum7; + C+=8; + } + ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes ); + C = (REAL*) ( ((PTR) C) + RowIncrementC ); + } +} 
+/***************************************************************************** +** +** FastAdditiveNaiveMatrixMultiply +** +** For small to medium sized matrices A, B, and C of size +** MatrixSize * MatrixSize this function performs the operation +** C += A x B efficiently. +** +** Note MatrixSize must be divisible by 8. +** +** INPUT: +** C = (*C READ/WRITE) Address of top left element of matrix C. +** A = (*A IS READ ONLY) Address of top left element of matrix A. +** B = (*B IS READ ONLY) Address of top left element of matrix B. +** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) +** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] +** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] +** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] +** +** OUTPUT: +** C = (*C READ/WRITE) Matrix C contains C + A x B. +** +*****************************************************************************/ +void FastAdditiveNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB) +{ + /* Assumes size of real is 8 bytes */ + PTR RowWidthBInBytes = RowWidthB << 3; + PTR RowWidthAInBytes = RowWidthA << 3; + PTR MatrixWidthInBytes = MatrixSize << 3; + PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3; + unsigned Horizontal, Vertical; + + REAL *ARowStart = A; + for (Vertical = 0; Vertical < MatrixSize; Vertical++) { + for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) { + REAL *BColumnStart = B + Horizontal; + + REAL Sum0 = *C; + REAL Sum1 = *(C+1); + REAL Sum2 = *(C+2); + REAL Sum3 = *(C+3); + REAL Sum4 = *(C+4); + REAL Sum5 = *(C+5); + REAL Sum6 = *(C+6); + REAL Sum7 = *(C+7); + + unsigned Products; + for (Products = 0; Products < MatrixSize; Products++) { + REAL ARowValue = *ARowStart++; + + Sum0 += ARowValue * (*BColumnStart); + Sum1 += ARowValue * (*(BColumnStart+1)); + Sum2 += ARowValue * (*(BColumnStart+2)); + Sum3 += 
ARowValue * (*(BColumnStart+3)); + Sum4 += ARowValue * (*(BColumnStart+4)); + Sum5 += ARowValue * (*(BColumnStart+5)); + Sum6 += ARowValue * (*(BColumnStart+6)); + Sum7 += ARowValue * (*(BColumnStart+7)); + + BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes); + + } + ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes); + + *(C) = Sum0; + *(C+1) = Sum1; + *(C+2) = Sum2; + *(C+3) = Sum3; + *(C+4) = Sum4; + *(C+5) = Sum5; + *(C+6) = Sum6; + *(C+7) = Sum7; + C+=8; + } + + ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes ); + C = (REAL*) ( ((PTR) C) + RowIncrementC ); + } +} +/***************************************************************************** +** +** MultiplyByDivideAndConquer +** +** For medium to medium-large (would you like fries with that) sized +** matrices A, B, and C of size MatrixSize * MatrixSize this function +** efficiently performs the operation +** C = A x B (if AdditiveMode == 0) +** C += A x B (if AdditiveMode != 0) +** +** Note MatrixSize must be divisible by 16. +** +** INPUT: +** C = (*C READ/WRITE) Address of top left element of matrix C. +** A = (*A IS READ ONLY) Address of top left element of matrix A. +** B = (*B IS READ ONLY) Address of top left element of matrix B. +** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) +** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] +** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] +** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] +** AdditiveMode = 0 if we want C = A x B, otherwise we'll do C += A x B +** +** OUTPUT: +** C (+)= A x B. 
(+ if AdditiveMode != 0) +** +*****************************************************************************/ +void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B, + unsigned MatrixSize, + unsigned RowWidthC, + unsigned RowWidthA, + unsigned RowWidthB, + int AdditiveMode + ) +{ + #define A00 A + #define B00 B + #define C00 C + REAL *A01, *A10, *A11, *B01, *B10, *B11, *C01, *C10, *C11; + unsigned QuadrantSize = MatrixSize >> 1; + + /* partition the matrix */ + A01 = A00 + QuadrantSize; + A10 = A00 + RowWidthA * QuadrantSize; + A11 = A10 + QuadrantSize; + + B01 = B00 + QuadrantSize; + B10 = B00 + RowWidthB * QuadrantSize; + B11 = B10 + QuadrantSize; + + C01 = C00 + QuadrantSize; + C10 = C00 + RowWidthC * QuadrantSize; + C11 = C10 + QuadrantSize; + + if (QuadrantSize > SizeAtWhichNaiveAlgorithmIsMoreEfficient) { + + MultiplyByDivideAndConquer(C00, A00, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + AdditiveMode); + + MultiplyByDivideAndConquer(C01, A00, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + AdditiveMode); + + MultiplyByDivideAndConquer(C11, A10, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + AdditiveMode); + + MultiplyByDivideAndConquer(C10, A10, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + AdditiveMode); + + MultiplyByDivideAndConquer(C00, A01, B10, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + 1); + + MultiplyByDivideAndConquer(C01, A01, B11, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + 1); + + MultiplyByDivideAndConquer(C11, A11, B11, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + 1); + + MultiplyByDivideAndConquer(C10, A11, B10, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + 1); + + } else { + + if (AdditiveMode) { + FastAdditiveNaiveMatrixMultiply(C00, A00, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastAdditiveNaiveMatrixMultiply(C01, A00, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastAdditiveNaiveMatrixMultiply(C11, A10, B01, QuadrantSize, + RowWidthC, 
RowWidthA, RowWidthB); + + FastAdditiveNaiveMatrixMultiply(C10, A10, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + } else { + + FastNaiveMatrixMultiply(C00, A00, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastNaiveMatrixMultiply(C01, A00, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastNaiveMatrixMultiply(C11, A10, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastNaiveMatrixMultiply(C10, A10, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + } + + FastAdditiveNaiveMatrixMultiply(C00, A01, B10, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastAdditiveNaiveMatrixMultiply(C01, A01, B11, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastAdditiveNaiveMatrixMultiply(C11, A11, B11, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastAdditiveNaiveMatrixMultiply(C10, A11, B10, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + } + return; +} +/***************************************************************************** +** +** OptimizedStrassenMultiply +** +** For large matrices A, B, and C of size MatrixSize * MatrixSize this +** function performs the operation C = A x B efficiently. +** +** INPUT: +** C = (*C WRITE) Address of top left element of matrix C. +** A = (*A IS READ ONLY) Address of top left element of matrix A. +** B = (*B IS READ ONLY) Address of top left element of matrix B. +** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) +** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] +** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] +** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] +** +** OUTPUT: +** C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.) 
+** +*****************************************************************************/ +void OptimizedStrassenMultiply_seq(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth) +{ + unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ + unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize + + 32; + unsigned Column, Row; + + /************************************************************************ + ** For each matrix A, B, and C, we'll want pointers to each quandrant + ** in the matrix. These quandrants will be addressed as follows: + ** -- -- + ** | A11 A12 | + ** | | + ** | A21 A22 | + ** -- -- + ************************************************************************/ + REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12, + *A21, *B21, *C21, *A22, *B22, *C22; + + REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; + #define T2sMULT C22 + #define NumberOfVariables 11 + + PTR TempMatrixOffset = 0; + PTR MatrixOffsetA = 0; + PTR MatrixOffsetB = 0; + + char *Heap; + void *StartHeap; + + /* Distance between the end of a matrix row and the start of the next row */ + PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; + PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; + PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; + + if (MatrixSize <= bots_app_cutoff_value) { + MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0); + return; + } + + /* Initialize quandrant matrices */ + #define A11 A + #define B11 B + #define C11 C + A12 = A11 + QuadrantSize; + B12 = B11 + QuadrantSize; + C12 = C11 + QuadrantSize; + A21 = A + (RowWidthA * QuadrantSize); + B21 = B + (RowWidthB * QuadrantSize); + C21 = C + (RowWidthC * QuadrantSize); + A22 = A21 + QuadrantSize; + B22 = B21 + QuadrantSize; + C22 = C21 + QuadrantSize; + + /* Allocate Heap Space Here */ + StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables); + /* ensure that heap is on 
cache boundary */ + if ( ((PTR) Heap) & 31) + Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); + + /* Distribute the heap space over the variables */ + S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column++) { + + /*********************************************************** + ** Within this loop, the following holds for MatrixOffset: + ** MatrixOffset = (Row * RowWidth) + Column + ** (note: that the unit of the offset is number of reals) + ***********************************************************/ + /* Element of Global Matrix, such as A, B, C */ + #define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) + #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) + #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) + + /* FIXME - may 
pay to expand these out - got higher speed-ups below */ + /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ + E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); + + /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */ + E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); + + /* S3 = A11 - A21 */ + E(S3) = EA(A11) - EA(A21); + + /* S7 = B22 - B12 */ + E(S7) = EB(B22) - EB(B12); + + TempMatrixOffset += sizeof(REAL); + MatrixOffsetA += sizeof(REAL); + MatrixOffsetB += sizeof(REAL); + } /* end row loop*/ + + MatrixOffsetA += RowIncrementA; + MatrixOffsetB += RowIncrementB; + } /* end column loop */ + + /* M2 = A11 x B11 */ + OptimizedStrassenMultiply_seq(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + OptimizedStrassenMultiply_seq(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + OptimizedStrassenMultiply_seq(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + OptimizedStrassenMultiply_seq(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + OptimizedStrassenMultiply_seq(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + OptimizedStrassenMultiply_seq(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + OptimizedStrassenMultiply_seq(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < 
QuadrantSize; Row++) { + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column += 4) { + REAL LocalM5_0 = *(M5); + REAL LocalM5_1 = *(M5+1); + REAL LocalM5_2 = *(M5+2); + REAL LocalM5_3 = *(M5+3); + REAL LocalM2_0 = *(M2); + REAL LocalM2_1 = *(M2+1); + REAL LocalM2_2 = *(M2+2); + REAL LocalM2_3 = *(M2+3); + REAL T1_0 = *(T1sMULT) + LocalM2_0; + REAL T1_1 = *(T1sMULT+1) + LocalM2_1; + REAL T1_2 = *(T1sMULT+2) + LocalM2_2; + REAL T1_3 = *(T1sMULT+3) + LocalM2_3; + REAL T2_0 = *(C22) + T1_0; + REAL T2_1 = *(C22+1) + T1_1; + REAL T2_2 = *(C22+2) + T1_2; + REAL T2_3 = *(C22+3) + T1_3; + (*(C11)) += LocalM2_0; + (*(C11+1)) += LocalM2_1; + (*(C11+2)) += LocalM2_2; + (*(C11+3)) += LocalM2_3; + (*(C12)) += LocalM5_0 + T1_0; + (*(C12+1)) += LocalM5_1 + T1_1; + (*(C12+2)) += LocalM5_2 + T1_2; + (*(C12+3)) += LocalM5_3 + T1_3; + (*(C22)) = LocalM5_0 + T2_0; + (*(C22+1)) = LocalM5_1 + T2_1; + (*(C22+2)) = LocalM5_2 + T2_2; + (*(C22+3)) = LocalM5_3 + T2_3; + (*(C21 )) = (- *(C21 )) + T2_0; + (*(C21+1)) = (- *(C21+1)) + T2_1; + (*(C21+2)) = (- *(C21+2)) + T2_2; + (*(C21+3)) = (- *(C21+3)) + T2_3; + M5 += 4; + M2 += 4; + T1sMULT += 4; + C11 += 4; + C12 += 4; + C21 += 4; + C22 += 4; + } + C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); + C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); + C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); + C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); + } + free(StartHeap); +} +#if defined(IF_CUTOFF) +void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth) +{ + unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ + unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * 
QuadrantSize + + 32; + unsigned Column, Row; + + /************************************************************************ + ** For each matrix A, B, and C, we'll want pointers to each quandrant + ** in the matrix. These quandrants will be addressed as follows: + ** -- -- + ** | A11 A12 | + ** | | + ** | A21 A22 | + ** -- -- + ************************************************************************/ + REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12, + *A21, *B21, *C21, *A22, *B22, *C22; + + REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; + #define T2sMULT C22 + #define NumberOfVariables 11 + + PTR TempMatrixOffset = 0; + PTR MatrixOffsetA = 0; + PTR MatrixOffsetB = 0; + + char *Heap; + void *StartHeap; + + /* Distance between the end of a matrix row and the start of the next row */ + PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; + PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; + PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; + + if (MatrixSize <= bots_app_cutoff_value) { + MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0); + return; + } + + /* Initialize quandrant matrices */ + #define A11 A + #define B11 B + #define C11 C + A12 = A11 + QuadrantSize; + B12 = B11 + QuadrantSize; + C12 = C11 + QuadrantSize; + A21 = A + (RowWidthA * QuadrantSize); + B21 = B + (RowWidthB * QuadrantSize); + C21 = C + (RowWidthC * QuadrantSize); + A22 = A21 + QuadrantSize; + B22 = B21 + QuadrantSize; + C22 = C21 + QuadrantSize; + + /* Allocate Heap Space Here */ + StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables); + /* ensure that heap is on cache boundary */ + if ( ((PTR) Heap) & 31) + Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); + + /* Distribute the heap space over the variables */ + S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S5 = 
(REAL*) Heap; Heap += QuadrantSizeInBytes; + S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column++) { + + /*********************************************************** + ** Within this loop, the following holds for MatrixOffset: + ** MatrixOffset = (Row * RowWidth) + Column + ** (note: that the unit of the offset is number of reals) + ***********************************************************/ + /* Element of Global Matrix, such as A, B, C */ + #define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) + #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) + #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) + + /* FIXME - may pay to expand these out - got higher speed-ups below */ + /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ + E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); + + /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */ + E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); + + /* S3 = A11 - A21 */ + E(S3) = EA(A11) - EA(A21); + + 
/* S7 = B22 - B12 */ + E(S7) = EB(B22) - EB(B12); + + TempMatrixOffset += sizeof(REAL); + MatrixOffsetA += sizeof(REAL); + MatrixOffsetB += sizeof(REAL); + } /* end row loop*/ + + MatrixOffsetA += RowIncrementA; + MatrixOffsetB += RowIncrementB; + } /* end column loop */ + +#if defined(FORCE_TIED_TASKS) + /* M2 = A11 x B11 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); +#else + /* M2 = A11 x B11 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, 
QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); +#endif + /********************************************** + ** Synchronization Point + **********************************************/ + #pragma omp taskwait + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column += 4) { + REAL LocalM5_0 = *(M5); + REAL LocalM5_1 = 
*(M5+1); + REAL LocalM5_2 = *(M5+2); + REAL LocalM5_3 = *(M5+3); + REAL LocalM2_0 = *(M2); + REAL LocalM2_1 = *(M2+1); + REAL LocalM2_2 = *(M2+2); + REAL LocalM2_3 = *(M2+3); + REAL T1_0 = *(T1sMULT) + LocalM2_0; + REAL T1_1 = *(T1sMULT+1) + LocalM2_1; + REAL T1_2 = *(T1sMULT+2) + LocalM2_2; + REAL T1_3 = *(T1sMULT+3) + LocalM2_3; + REAL T2_0 = *(C22) + T1_0; + REAL T2_1 = *(C22+1) + T1_1; + REAL T2_2 = *(C22+2) + T1_2; + REAL T2_3 = *(C22+3) + T1_3; + (*(C11)) += LocalM2_0; + (*(C11+1)) += LocalM2_1; + (*(C11+2)) += LocalM2_2; + (*(C11+3)) += LocalM2_3; + (*(C12)) += LocalM5_0 + T1_0; + (*(C12+1)) += LocalM5_1 + T1_1; + (*(C12+2)) += LocalM5_2 + T1_2; + (*(C12+3)) += LocalM5_3 + T1_3; + (*(C22)) = LocalM5_0 + T2_0; + (*(C22+1)) = LocalM5_1 + T2_1; + (*(C22+2)) = LocalM5_2 + T2_2; + (*(C22+3)) = LocalM5_3 + T2_3; + (*(C21 )) = (- *(C21 )) + T2_0; + (*(C21+1)) = (- *(C21+1)) + T2_1; + (*(C21+2)) = (- *(C21+2)) + T2_2; + (*(C21+3)) = (- *(C21+3)) + T2_3; + M5 += 4; + M2 += 4; + T1sMULT += 4; + C11 += 4; + C12 += 4; + C21 += 4; + C22 += 4; + } + C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); + C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); + C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); + C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); + } + free(StartHeap); +} +#elif defined(MANUAL_CUTOFF) +void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth) +{ + unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ + unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize + + 32; + unsigned Column, Row; + + /************************************************************************ + ** For each matrix A, B, and C, we'll want pointers to each quandrant + ** in the matrix. 
These quandrants will be addressed as follows: + ** -- -- + ** | A11 A12 | + ** | | + ** | A21 A22 | + ** -- -- + ************************************************************************/ + REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12, + *A21, *B21, *C21, *A22, *B22, *C22; + + REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; + #define T2sMULT C22 + #define NumberOfVariables 11 + + PTR TempMatrixOffset = 0; + PTR MatrixOffsetA = 0; + PTR MatrixOffsetB = 0; + + char *Heap; + void *StartHeap; + + /* Distance between the end of a matrix row and the start of the next row */ + PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; + PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; + PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; + + if (MatrixSize <= bots_app_cutoff_value) { + MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0); + return; + } + + /* Initialize quandrant matrices */ + #define A11 A + #define B11 B + #define C11 C + A12 = A11 + QuadrantSize; + B12 = B11 + QuadrantSize; + C12 = C11 + QuadrantSize; + A21 = A + (RowWidthA * QuadrantSize); + B21 = B + (RowWidthB * QuadrantSize); + C21 = C + (RowWidthC * QuadrantSize); + A22 = A21 + QuadrantSize; + B22 = B21 + QuadrantSize; + C22 = C21 + QuadrantSize; + + /* Allocate Heap Space Here */ + StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables); + /* ensure that heap is on cache boundary */ + if ( ((PTR) Heap) & 31) + Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); + + /* Distribute the heap space over the variables */ + S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M2 = (REAL*) Heap; Heap 
+= QuadrantSizeInBytes; + M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column++) { + + /*********************************************************** + ** Within this loop, the following holds for MatrixOffset: + ** MatrixOffset = (Row * RowWidth) + Column + ** (note: that the unit of the offset is number of reals) + ***********************************************************/ + /* Element of Global Matrix, such as A, B, C */ + #define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) + #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) + #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) + + /* FIXME - may pay to expand these out - got higher speed-ups below */ + /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ + E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); + + /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */ + E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); + + /* S3 = A11 - A21 */ + E(S3) = EA(A11) - EA(A21); + + /* S7 = B22 - B12 */ + E(S7) = EB(B22) - EB(B12); + + TempMatrixOffset += sizeof(REAL); + MatrixOffsetA += sizeof(REAL); + MatrixOffsetB += sizeof(REAL); + } /* end row loop*/ + + MatrixOffsetA += RowIncrementA; + 
MatrixOffsetB += RowIncrementB; + } /* end column loop */ + + if (Depth < bots_cutoff_value) + { +#if defined(FORCE_TIED_TASKS) + /* M2 = A11 x B11 */ + #pragma omp task + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); +#else + /* M2 = A11 x B11 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + 
#pragma omp task untied + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); +#endif + + /********************************************** + ** Synchronization Point + **********************************************/ + #pragma omp taskwait + } + else + { + /* M2 = A11 x B11 */ + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + /* M5 = S1 * S5 */ + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + /* Step 1 of T1 = S2 x S6 + M2 */ + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + /* Step 1 of T2 = T1 + S3 x S7 */ + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + /* Step 1 of C11 = M2 + A12 * B21 */ + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + /* Step 1 of C21 = T2 - A22 * S8 */ + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); + } + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; 
Row < QuadrantSize; Row++) { + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column += 4) { + REAL LocalM5_0 = *(M5); + REAL LocalM5_1 = *(M5+1); + REAL LocalM5_2 = *(M5+2); + REAL LocalM5_3 = *(M5+3); + REAL LocalM2_0 = *(M2); + REAL LocalM2_1 = *(M2+1); + REAL LocalM2_2 = *(M2+2); + REAL LocalM2_3 = *(M2+3); + REAL T1_0 = *(T1sMULT) + LocalM2_0; + REAL T1_1 = *(T1sMULT+1) + LocalM2_1; + REAL T1_2 = *(T1sMULT+2) + LocalM2_2; + REAL T1_3 = *(T1sMULT+3) + LocalM2_3; + REAL T2_0 = *(C22) + T1_0; + REAL T2_1 = *(C22+1) + T1_1; + REAL T2_2 = *(C22+2) + T1_2; + REAL T2_3 = *(C22+3) + T1_3; + (*(C11)) += LocalM2_0; + (*(C11+1)) += LocalM2_1; + (*(C11+2)) += LocalM2_2; + (*(C11+3)) += LocalM2_3; + (*(C12)) += LocalM5_0 + T1_0; + (*(C12+1)) += LocalM5_1 + T1_1; + (*(C12+2)) += LocalM5_2 + T1_2; + (*(C12+3)) += LocalM5_3 + T1_3; + (*(C22)) = LocalM5_0 + T2_0; + (*(C22+1)) = LocalM5_1 + T2_1; + (*(C22+2)) = LocalM5_2 + T2_2; + (*(C22+3)) = LocalM5_3 + T2_3; + (*(C21 )) = (- *(C21 )) + T2_0; + (*(C21+1)) = (- *(C21+1)) + T2_1; + (*(C21+2)) = (- *(C21+2)) + T2_2; + (*(C21+3)) = (- *(C21+3)) + T2_3; + M5 += 4; + M2 += 4; + T1sMULT += 4; + C11 += 4; + C12 += 4; + C21 += 4; + C22 += 4; + } + C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); + C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); + C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); + C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); + } + free(StartHeap); +} +#else +void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth) +{ + unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ + unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * 
QuadrantSize + + 32; + unsigned Column, Row; + + /************************************************************************ + ** For each matrix A, B, and C, we'll want pointers to each quandrant + ** in the matrix. These quandrants will be addressed as follows: + ** -- -- + ** | A11 A12 | + ** | | + ** | A21 A22 | + ** -- -- + ************************************************************************/ + REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12, + *A21, *B21, *C21, *A22, *B22, *C22; + + REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; + #define T2sMULT C22 + #define NumberOfVariables 11 + + PTR TempMatrixOffset = 0; + PTR MatrixOffsetA = 0; + PTR MatrixOffsetB = 0; + + char *Heap; + void *StartHeap; + + /* Distance between the end of a matrix row and the start of the next row */ + PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; + PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; + PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; + + if (MatrixSize <= bots_app_cutoff_value) { + MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0); + return; + } + + /* Initialize quandrant matrices */ + #define A11 A + #define B11 B + #define C11 C + A12 = A11 + QuadrantSize; + B12 = B11 + QuadrantSize; + C12 = C11 + QuadrantSize; + A21 = A + (RowWidthA * QuadrantSize); + B21 = B + (RowWidthB * QuadrantSize); + C21 = C + (RowWidthC * QuadrantSize); + A22 = A21 + QuadrantSize; + B22 = B21 + QuadrantSize; + C22 = C21 + QuadrantSize; + + /* Allocate Heap Space Here */ + StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables); + /* ensure that heap is on cache boundary */ + if ( ((PTR) Heap) & 31) + Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); + + /* Distribute the heap space over the variables */ + S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S5 = 
(REAL*) Heap; Heap += QuadrantSizeInBytes; + S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column++) { + + /*********************************************************** + ** Within this loop, the following holds for MatrixOffset: + ** MatrixOffset = (Row * RowWidth) + Column + ** (note: that the unit of the offset is number of reals) + ***********************************************************/ + /* Element of Global Matrix, such as A, B, C */ + #define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) + #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) + #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) + + /* FIXME - may pay to expand these out - got higher speed-ups below */ + /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ + E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); + + /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */ + E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); + + /* S3 = A11 - A21 */ + E(S3) = EA(A11) - EA(A21); + + 
/* S7 = B22 - B12 */ + E(S7) = EB(B22) - EB(B12); + + TempMatrixOffset += sizeof(REAL); + MatrixOffsetA += sizeof(REAL); + MatrixOffsetB += sizeof(REAL); + } /* end row loop*/ + + MatrixOffsetA += RowIncrementA; + MatrixOffsetB += RowIncrementB; + } /* end column loop */ + +#if defined(FORCE_TIED_TASKS) + /* M2 = A11 x B11 */ + #pragma omp task + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); +#else + /* M2 = A11 x B11 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma 
omp task untied + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); +#endif + + /********************************************** + ** Synchronization Point + **********************************************/ + #pragma omp taskwait + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column += 4) { + REAL LocalM5_0 = *(M5); + REAL LocalM5_1 = *(M5+1); + REAL LocalM5_2 = *(M5+2); + REAL LocalM5_3 = *(M5+3); + REAL LocalM2_0 = *(M2); + REAL LocalM2_1 = *(M2+1); + REAL LocalM2_2 = *(M2+2); + REAL LocalM2_3 = *(M2+3); + REAL T1_0 = *(T1sMULT) + LocalM2_0; + REAL T1_1 = *(T1sMULT+1) + LocalM2_1; + REAL T1_2 = *(T1sMULT+2) + LocalM2_2; + REAL T1_3 = *(T1sMULT+3) + LocalM2_3; + REAL T2_0 = *(C22) + T1_0; + REAL T2_1 = *(C22+1) + T1_1; + REAL T2_2 = *(C22+2) + T1_2; + REAL 
T2_3 = *(C22+3) + T1_3; + (*(C11)) += LocalM2_0; + (*(C11+1)) += LocalM2_1; + (*(C11+2)) += LocalM2_2; + (*(C11+3)) += LocalM2_3; + (*(C12)) += LocalM5_0 + T1_0; + (*(C12+1)) += LocalM5_1 + T1_1; + (*(C12+2)) += LocalM5_2 + T1_2; + (*(C12+3)) += LocalM5_3 + T1_3; + (*(C22)) = LocalM5_0 + T2_0; + (*(C22+1)) = LocalM5_1 + T2_1; + (*(C22+2)) = LocalM5_2 + T2_2; + (*(C22+3)) = LocalM5_3 + T2_3; + (*(C21 )) = (- *(C21 )) + T2_0; + (*(C21+1)) = (- *(C21+1)) + T2_1; + (*(C21+2)) = (- *(C21+2)) + T2_2; + (*(C21+3)) = (- *(C21+3)) + T2_3; + M5 += 4; + M2 += 4; + T1sMULT += 4; + C11 += 4; + C12 += 4; + C21 += 4; + C22 += 4; + } + C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); + C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); + C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); + C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); + } + free(StartHeap); +} +#endif +/* + * Set an n by n matrix A to random values. The distance between + * rows is an + */ +void init_matrix(int n, REAL *A, int an) +{ + int i, j; + + for (i = 0; i < n; ++i) + for (j = 0; j < n; ++j) + ELEM(A, an, i, j) = ((double) rand()) / (double) RAND_MAX; +} + +/* + * Compare two matrices. Print an error message if they differ by + * more than EPSILON. 
+ */ +int compare_matrix(int n, REAL *A, int an, REAL *B, int bn) +{ + int i, j; + REAL c; + + for (i = 0; i < n; ++i) + for (j = 0; j < n; ++j) { + /* compute the relative error c */ + c = ELEM(A, an, i, j) - ELEM(B, bn, i, j); + if (c < 0.0) + c = -c; + + c = c / ELEM(A, an, i, j); + if (c > EPSILON) { + bots_message("Strassen: Wrong answer!\n"); + return BOTS_RESULT_UNSUCCESSFUL; + } + } + + return BOTS_RESULT_SUCCESSFUL; +} + +/* + * Allocate a matrix of side n (therefore n^2 elements) + */ +REAL *alloc_matrix(int n) +{ + return malloc(n * n * sizeof(REAL)); +} + +void strassen_main_par(REAL *A, REAL *B, REAL *C, int n) +{ + bots_message("Computing parallel Strassen algorithm (n=%d) ", n); + #pragma omp parallel + #pragma omp single +#if defined(FORCE_TIED_TASKS) + #pragma omp task +#else + #pragma omp task untied +#endif + OptimizedStrassenMultiply_par(C, A, B, n, n, n, n, 1); + bots_message(" completed!\n"); +} +void strassen_main_seq(REAL *A, REAL *B, REAL *C, int n) +{ + bots_message("Computing sequential Strassen algorithm (n=%d) ", n); + OptimizedStrassenMultiply_seq(C, A, B, n, n, n, n, 1); + bots_message(" completed!\n"); +} + diff --git a/src/components/implementation/no_interface/omp_strassen_bots/strassen.h b/src/components/implementation/no_interface/omp_strassen_bots/strassen.h new file mode 100644 index 0000000000..7944f77880 --- /dev/null +++ b/src/components/implementation/no_interface/omp_strassen_bots/strassen.h @@ -0,0 +1,66 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either 
version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ +#ifndef _STRASSEN_H +#define _STRASSEN_H +/* ******************************************************************* */ +/* STRASSEN APPLICATION CUT OFF's */ +/* ******************************************************************* */ +/* Strassen uses three different functions to compute Matrix Multiply. */ +/* Each of them is related to an application cut off value: */ +/* - Initial algorithm: OptimizedStrassenMultiply() */ +/* - bots_app_cutoff_value: MultiplyByDivideAndConquer() */ +/* - SizeAtWhichNaiveAlgorithmIsMoreEfficient: FastAdditiveNaiveMatrixMultiply() */ +/* ******************************************************************* */ + +/*FIXME: at the moment we use a constant value, change to parameter ???*/ +/* Below this cut off strassen uses FastAdditiveNaiveMatrixMultiply algorithm */ +#define SizeAtWhichNaiveAlgorithmIsMoreEfficient 16 + +/*********************************************************************** + * maximum tolerable relative error (for the checking routine) + **********************************************************************/ +#define EPSILON (1.0E-6) +/*********************************************************************** + * Matrices are stored in row-major order; A is a pointer to + * the first element of the matrix, and an is the number of elements + * between two rows. 
This macro produces the element A[i,j] + * given A, an, i and j + **********************************************************************/ +#define ELEM(A, an, i, j) (A[(i)*(an)+(j)]) + +void matrixmul(int n, REAL *A, int an, REAL *B, int bn, REAL *C, int cn); +void FastNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB); +void FastAdditiveNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB); +void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B, + unsigned MatrixSize, + unsigned RowWidthC, + unsigned RowWidthA, + unsigned RowWidthB, + int AdditiveMode + ); +void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth); +void OptimizedStrassenMultiply_seq(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth); +REAL *alloc_matrix(int n); +#endif + diff --git a/src/components/implementation/no_interface/omp_ubench/Makefile b/src/components/implementation/no_interface/omp_ubench/Makefile new file mode 100644 index 0000000000..d93533c7e5 --- /dev/null +++ b/src/components/implementation/no_interface/omp_ubench/Makefile @@ -0,0 +1,19 @@ +COMPONENT=omp_ubench.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp +# if tied tasks are required +#CFLAGS += -DFORCE_TIED_TASKS + +#OMPC_FINAL_FLAGS= + +# one per compilation or none +#CFLAGS += -DMANUAL_CUTOFF +#CFLAGS += -DIF_CUTOFF +#CFLAGS += -DFINAL_CUTOFF $(OMPC_FINAL_FLAGS) diff --git a/src/components/implementation/no_interface/omp_ubench/init.c 
b/src/components/implementation/no_interface/omp_ubench/init.c new file mode 120000 index 0000000000..9e09b82e77 --- /dev/null +++ b/src/components/implementation/no_interface/omp_ubench/init.c @@ -0,0 +1 @@ +../omp_fib_bots/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_ubench/posix_basic.c b/src/components/implementation/no_interface/omp_ubench/posix_basic.c new file mode 120000 index 0000000000..9afee078fb --- /dev/null +++ b/src/components/implementation/no_interface/omp_ubench/posix_basic.c @@ -0,0 +1 @@ +../omp_fib_bots/posix_basic.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_ubench/ubench.c b/src/components/implementation/no_interface/omp_ubench/ubench.c new file mode 100644 index 0000000000..6d22daaf25 --- /dev/null +++ b/src/components/implementation/no_interface/omp_ubench/ubench.c @@ -0,0 +1,156 @@ +#include +#include +#include +#include + +#define ITERS 1000 +#define RECUR 4 + +#define DISPLAY_VALS + +void +test_parallel(void) +{ + cycles_t max = 0, total = 0; + int i, x = 0; + + for (i = 0; i < ITERS; i++) { + cycles_t st, en, diff; + + rdtscll(st); + #pragma omp parallel + { + x++; + } + rdtscll(en); + + diff = en - st; + total += diff; + if (diff > max) max = diff; +#ifdef DISPLAY_VALS + PRINTC("%llu\n", diff); +#endif + } + + PRINTC("uBench Parallel (NCORES:%u, NITERS=%d): AVG:%llu WC:%llu\n", NUM_CPU, ITERS, total / ITERS, max); +} + +void +test_parallel_critical(void) +{ + cycles_t max = 0, total = 0; + int i, x = 0; + + for (i = 0; i < ITERS; i++) { + cycles_t st, en, diff; + + rdtscll(st); + #pragma omp parallel + { + #pragma omp critical + { + x++; + } + } + rdtscll(en); + + diff = en - st; + total += diff; + if (diff > max) max = diff; +#ifdef DISPLAY_VALS + PRINTC("%llu\n", diff); +#endif + } + + PRINTC("uBench Parallel+Critical (NCORES:%u, NITERS=%d): AVG:%llu WC:%llu\n", NUM_CPU, ITERS, total / ITERS, max); +} + +void +test_parallel_task(void) +{ + 
cycles_t max = 0, total = 0; + int i, x = 0, y = 0; + + for (i = 0; i < ITERS; i++) { + cycles_t st, en, diff; + + rdtscll(st); + #pragma omp parallel + { + x++; + #pragma omp task + { + y++; + } + #pragma omp taskwait + } + rdtscll(en); + + diff = en - st; + total += diff; + if (diff > max) max = diff; +#ifdef DISPLAY_VALS + PRINTC("%llu\n", diff); +#endif + } + + PRINTC("uBench Parallel+Task+Taskwait (NCORES:%u, NITERS=%d): AVG:%llu WC:%llu\n", NUM_CPU, ITERS, total / ITERS, max); +} + +void +test_parallel_task_4levels(void) +{ + cycles_t max = 0, total = 0; + int i, x = 0, y = 0; + + for (i = 0; i < ITERS; i++) { + cycles_t st, en, diff; + + rdtscll(st); + #pragma omp parallel + { + x++; + #pragma omp task + { + #pragma omp task + { + #pragma omp task + { + #pragma omp task + { + y++; + } + #pragma omp taskwait + y++; + } + #pragma omp taskwait + y++; + } + #pragma omp taskwait + y++; + } + #pragma omp taskwait + } + rdtscll(en); + + diff = en - st; + total += diff; + if (diff > max) max = diff; +#ifdef DISPLAY_VALS + PRINTC("%llu\n", diff); +#endif + } + + PRINTC("uBench Parallel+Task 4levels+Taskwait (NCORES:%u, NITERS=%d): AVG:%llu WC:%llu\n", NUM_CPU, ITERS, total / ITERS, max); +} + +int +main(void) +{ +// test_parallel(); +// test_parallel_critical(); + test_parallel_task(); +// test_parallel_task_4levels(); + + return 0; +} diff --git a/src/components/implementation/no_interface/omp_workconservation/Makefile b/src/components/implementation/no_interface/omp_workconservation/Makefile new file mode 100644 index 0000000000..816ae03c7e --- /dev/null +++ b/src/components/implementation/no_interface/omp_workconservation/Makefile @@ -0,0 +1,10 @@ +COMPONENT=omp_workconsprob.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS 
+= -fopenmp diff --git a/src/components/implementation/no_interface/omp_workconservation/init.c b/src/components/implementation/no_interface/omp_workconservation/init.c new file mode 120000 index 0000000000..b2694bf833 --- /dev/null +++ b/src/components/implementation/no_interface/omp_workconservation/init.c @@ -0,0 +1 @@ +../omp_hello/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_workconservation/work_problem.c b/src/components/implementation/no_interface/omp_workconservation/work_problem.c new file mode 100644 index 0000000000..e395df3eeb --- /dev/null +++ b/src/components/implementation/no_interface/omp_workconservation/work_problem.c @@ -0,0 +1,89 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define GETTID() cos_thdid() +#define sched_getcpu() cos_cpuid() +#define CYC_US 3200 + +/* + * From Chaos tests! + * NOTE: number obtained by running composite instance with no interference.. + * (validated with fiasco so far, it is 10us) + */ +#define ITERS_10US 5850 +#define MULTIPLE 10000 + +#define SPIN_ITERS (ITERS_10US*MULTIPLE) + +static void __spin_fn(void) __attribute__((optimize("O0"))); + +static void +__spin_fn(void) +{ + unsigned int spin = 0; + + while (spin < SPIN_ITERS) { + __asm__ __volatile__("nop": : :"memory"); + spin++; + } +} + +#define ITERS 1000 + +int main(void) +{ + unsigned long long max = 0, total = 0; + int i; + unsigned long long x, y; + + rdtscll(x); + __spin_fn(); + rdtscll(y); + printc("%llu:%llu\n\n\n", y - x, sl_cyc2usec(y - x)); + + + for (i = 0; i < ITERS; i++) { + volatile unsigned long long st = 0, en = 0; + + rdtscll(st); + #pragma omp parallel + { + #pragma omp single + { + #pragma omp task + { + #pragma omp task + { + __spin_fn(); + } + #pragma omp taskwait + } + + #pragma omp task + { + __spin_fn(); + } + __spin_fn(); + #pragma omp taskwait + } + } + rdtscll(en); + long diff = en - st; + assert(diff > 0); + + total += diff; + if ((unsigned 
long long) diff > max) max = diff; + printc("%ld, %ld\n", diff, diff / CYC_US); + } + + printc("(cyc) Avg: %llu, Max: %llu\n", (total / ITERS), max); + printc("(us) Avg: %llu, Max: %llu\n", (total / ITERS) / CYC_US, max / CYC_US); + + return 0; +} diff --git a/src/components/implementation/sched/Makefile b/src/components/implementation/sched/Makefile index c0cd0d6743..43e1e0e4f1 100644 --- a/src/components/implementation/sched/Makefile +++ b/src/components/implementation/sched/Makefile @@ -1,3 +1,3 @@ -INTERFACES=sched schedinit +INTERFACES=sched schedinit crt include ../Makefile.subdir diff --git a/src/components/implementation/sched/chan_backend.c b/src/components/implementation/sched/chan_backend.c new file mode 100644 index 0000000000..10736eabdf --- /dev/null +++ b/src/components/implementation/sched/chan_backend.c @@ -0,0 +1,45 @@ +#include + +#include +#include + +struct __sched_inout_chan { + struct crt_chan *in, *out; +} __sched_thds[NUM_CPU][MAX_NUM_THREADS]; + +void +__sched_stdio_init(void) +{ + memset(__sched_thds[cos_cpuid()], 0, MAX_NUM_THREADS * sizeof(struct __sched_inout_chan)); +} + +void +__sched_stdio_thd_init(thdid_t tid, struct crt_chan *in, struct crt_chan *out) +{ + __sched_thds[cos_cpuid()][tid].in = in; + __sched_thds[cos_cpuid()][tid].out = out; +} + +int +chan_out(unsigned long item) +{ + struct crt_chan *co = __sched_thds[cos_cpuid()][cos_thdid()].out; + + assert(co != NULL); + return crt_chan_send_LU(co, &item); +} + +unsigned long +chan_in(void) +{ + unsigned long item = 0; + int ret = 0; + struct crt_chan *ci = __sched_thds[cos_cpuid()][cos_thdid()].in; + + assert(ci != NULL); + + ret = crt_chan_recv_LU(ci, &item); + assert(ret == 0); + + return item; +} diff --git a/src/components/implementation/sched/hier_fprr/Makefile b/src/components/implementation/sched/hier_fprr/Makefile index 5be22a8cbd..f19b907991 100644 --- a/src/components/implementation/sched/hier_fprr/Makefile +++ b/src/components/implementation/sched/hier_fprr/Makefile 
@@ -5,7 +5,7 @@ INTERFACES=sched schedinit DEPENDENCIES=capmgr sched schedinit IF_LIB= FN_PREPEND=parent_ -ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr +ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/sched/hier_fprr/init.c b/src/components/implementation/sched/hier_fprr/init.c index 78a4e578cc..90369146e4 100644 --- a/src/components/implementation/sched/hier_fprr/init.c +++ b/src/components/implementation/sched/hier_fprr/init.c @@ -55,14 +55,33 @@ __init_done(void *d) void sched_child_init(struct sched_childinfo *schedci) { - struct sl_thd *initthd = NULL; + vaddr_t dcbaddr; assert(schedci); - initthd = sched_child_initthd_get(schedci); - assert(initthd); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_WINDOW, FIXED_PERIOD_MS)); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_BUDGET, FIXED_BUDGET_MS)); + schedci->initthd = sl_thd_initaep_alloc(sched_child_defci_get(schedci), NULL, schedci->flags & COMP_FLAG_SCHED, schedci->flags & COMP_FLAG_SCHED ? 1 : 0, 0, 0, 0, &dcbaddr); + assert(schedci->initthd); + + sl_thd_param_set(schedci->initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); + sl_thd_param_set(schedci->initthd, sched_param_pack(SCHEDP_WINDOW, FIXED_PERIOD_MS)); + sl_thd_param_set(schedci->initthd, sched_param_pack(SCHEDP_BUDGET, FIXED_BUDGET_MS)); +} + +thdid_t +sched_child_thd_create(struct sched_childinfo *schedci, thdclosure_index_t idx) +{ + vaddr_t addr; + struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 0, 0, 0, 0, 0, &addr, NULL); + + return t ? 
sl_thd_thdid(t) : 0; +} + +thdid_t +sched_child_aep_create(struct sched_childinfo *schedci, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +{ + vaddr_t addr; + struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 1, owntc, key, 0, 0, &addr, extrcv); + + return t ? sl_thd_thdid(t) : 0; } void @@ -70,7 +89,7 @@ cos_init(void) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(defci); - static volatile int first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; static u32_t cpubmp[NUM_CPU_BMP_WORDS] = { 0 }; int i; @@ -94,7 +113,7 @@ cos_init(void) while (!ps_load((unsigned long *)&init_done[i])) ; } - sl_init_cpubmp(SL_MIN_PERIOD_US, cpubmp); + sl_init_corebmp(SL_MIN_PERIOD_US, cpubmp); sched_childinfo_init(); __initializer_thd[cos_cpuid()] = sl_thd_alloc(__init_done, NULL); assert(__initializer_thd[cos_cpuid()]); diff --git a/src/components/implementation/sched/root_fprr/Makefile b/src/components/implementation/sched/root_fprr/Makefile index ec245e6d9f..e811b24382 100644 --- a/src/components/implementation/sched/root_fprr/Makefile +++ b/src/components/implementation/sched/root_fprr/Makefile @@ -4,7 +4,7 @@ COMPONENT=root_fprr.o INTERFACES=sched schedinit DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr +ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/sched/root_fprr/init.c b/src/components/implementation/sched/root_fprr/init.c index 9ea0ef3812..54f819c7ff 100644 --- a/src/components/implementation/sched/root_fprr/init.c +++ b/src/components/implementation/sched/root_fprr/init.c @@ -17,8 +17,8 @@ u32_t cycs_per_usec = 0; #define INITIALIZE_BUDGET_MS (2000) #define 
FIXED_PRIO 2 -#define FIXED_PERIOD_MS (10000) -#define FIXED_BUDGET_MS (4000) +#define FIXED_PERIOD_MS (50000) +#define FIXED_BUDGET_MS (100000) static struct sl_thd *__initializer_thd[NUM_CPU] CACHE_ALIGNED; @@ -45,14 +45,45 @@ __init_done(void *d) void sched_child_init(struct sched_childinfo *schedci) { - struct sl_thd *initthd = NULL; + vaddr_t dcbaddr; + struct sl_thd *initthd; + tcap_prio_t p = FIXED_PRIO; assert(schedci); - initthd = sched_child_initthd_get(schedci); - assert(initthd); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_WINDOW, FIXED_PERIOD_MS)); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_BUDGET, FIXED_BUDGET_MS)); + if (schedci->id != 1) p = FIXED_PRIO; + else p = FIXED_PRIO + 1; + schedci->initthd = sl_thd_initaep_alloc(sched_child_defci_get(schedci), NULL, schedci->flags & COMP_FLAG_SCHED, schedci->flags & COMP_FLAG_SCHED ? 1 : 0, 0, 0, 0, &dcbaddr); + assert(schedci->initthd); + initthd = schedci->initthd; + + if (schedci->flags & COMP_FLAG_SCHED) { + if (cos_tcap_transfer(sl_thd_rcvcap(initthd), BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, p)) { + PRINTC("Failed to transfer INF budget\n"); + assert(0); + } else { + sl_thd_param_set(initthd, sched_param_pack(SCHEDP_BUDGET, FIXED_BUDGET_MS)); + } + sl_thd_param_set(initthd, sched_param_pack(SCHEDP_WINDOW, FIXED_PERIOD_MS)); + } + sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, p)); +} + +thdid_t +sched_child_thd_create(struct sched_childinfo *schedci, thdclosure_index_t idx) +{ + vaddr_t addr; + struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 0, 0, 0, 0, 0, &addr, NULL); + + return t ? 
sl_thd_thdid(t) : 0; +} + +thdid_t +sched_child_aep_create(struct sched_childinfo *schedci, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +{ + vaddr_t addr; + struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 1, owntc, key, ipiwin, ipimax, &addr, extrcv); + + return t ? sl_thd_thdid(t) : 0; } void @@ -60,7 +91,7 @@ cos_init(void) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(defci); - static volatile int first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; static u32_t cpubmp[NUM_CPU_BMP_WORDS] = { 0 }; int i; @@ -84,7 +115,7 @@ cos_init(void) while (!ps_load((unsigned long *)&init_done[i])) ; } - sl_init_cpubmp(SL_MIN_PERIOD_US, cpubmp); + sl_init_corebmp(100*SL_MIN_PERIOD_US, cpubmp); sched_childinfo_init(); __initializer_thd[cos_cpuid()] = sl_thd_alloc(__init_done, NULL); assert(__initializer_thd[cos_cpuid()]); diff --git a/src/components/implementation/sched/root_fprr_raw/Makefile b/src/components/implementation/sched/root_fprr_raw/Makefile index 5061883b7f..206a88478a 100644 --- a/src/components/implementation/sched/root_fprr_raw/Makefile +++ b/src/components/implementation/sched/root_fprr_raw/Makefile @@ -4,7 +4,7 @@ COMPONENT=root_fprr_raw.o INTERFACES=sched schedinit DEPENDENCIES= IF_LIB= -ADDITIONAL_LIBS=$(LIBSLRAW) -lsl_thd_static_backend -lsl_mod_fprr +ADDITIONAL_LIBS=$(LIBSLRAW) -lsl_thd_static_backend -lsl_mod_fprr -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/sched/root_fprr_raw/init.c b/src/components/implementation/sched/root_fprr_raw/init.c index 854992f962..14d2c6ab46 100644 --- a/src/components/implementation/sched/root_fprr_raw/init.c +++ b/src/components/implementation/sched/root_fprr_raw/init.c @@ -24,13 +24,28 @@ capmgr_thd_retrieve_next(spdid_t 
child, thdid_t *tid) void sched_child_init(struct sched_childinfo *schedci) { - struct sl_thd *initthd = NULL; - assert(schedci); - initthd = sched_child_initthd_get(schedci); - assert(initthd); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); + schedci->initthd = sl_thd_initaep_alloc_dcb(sched_child_defci_get(schedci), NULL, schedci->flags & COMP_FLAG_SCHED, schedci->flags & COMP_FLAG_SCHED ? 1 : 0, 0, 0, 0, 0); + + assert(schedci->initthd); + sl_thd_param_set(schedci->initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); +} + +thdid_t +sched_child_thd_create(struct sched_childinfo *schedci, thdclosure_index_t idx) +{ + struct sl_thd *t = sl_thd_aep_alloc_ext_dcb(sched_child_defci_get(schedci), NULL, idx, 0, 0, 0, 0, 0, 0, 0, NULL); + + return t ? sl_thd_thdid(t) : 0; +} + +thdid_t +sched_child_aep_create(struct sched_childinfo *schedci, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +{ + struct sl_thd *t = sl_thd_aep_alloc_ext_dcb(sched_child_defci_get(schedci), NULL, idx, 1, owntc, key, 0, 0, 0, 0, extrcv); + + return t ? 
sl_thd_thdid(t) : 0; } void @@ -62,7 +77,7 @@ cos_init(void) while (!ps_load(&init_done[i])) ; } - sl_init_cpubmp(SL_MIN_PERIOD_US, cpubmp); + sl_init_corebmp(SL_MIN_PERIOD_US, cpubmp); sched_childinfo_init_raw(); hypercall_comp_init_done(); diff --git a/src/components/implementation/sched/sched.c b/src/components/implementation/sched/sched.c index 5943ed5217..1103dbddaa 100644 --- a/src/components/implementation/sched/sched.c +++ b/src/components/implementation/sched/sched.c @@ -50,25 +50,20 @@ thdid_t sched_thd_create_cserialized(thdclosure_index_t idx) { spdid_t c = cos_inv_token(); - struct cos_defcompinfo *dci; - struct sl_thd *t = NULL; + struct sched_childinfo *sci; if (!c) return 0; - dci = sched_child_defci_get(sched_childinfo_find(c)); - if (!dci) return 0; + sci = sched_childinfo_find(c); + if (!sci) return 0; - t = sl_thd_aep_alloc_ext(dci, NULL, idx, 0, 0, 0, 0, 0, NULL); - if (!t) return 0; - - return sl_thd_thdid(t); + return sched_child_thd_create(sci, idx); } thdid_t sched_aep_create_cserialized(arcvcap_t *extrcv, int *unused, u32_t thdidx_owntc, u32_t key_ipimax, u32_t ipiwin32b) { spdid_t c = cos_inv_token(); - struct cos_defcompinfo *dci; - struct sl_thd *t = NULL; + struct sched_childinfo *sci; int owntc = (thdidx_owntc << 16) >> 16; thdclosure_index_t idx = (thdidx_owntc >> 16); microsec_t ipiwin = (microsec_t)ipiwin32b; @@ -76,13 +71,10 @@ sched_aep_create_cserialized(arcvcap_t *extrcv, int *unused, u32_t thdidx_owntc, cos_channelkey_t key = (key_ipimax >> 16); if (!c) return 0; - dci = sched_child_defci_get(sched_childinfo_find(c)); - if (!dci) return 0; - - t = sl_thd_aep_alloc_ext(dci, NULL, idx, 1, owntc, key, ipiwin, ipimax, extrcv); - if (!t) return 0; + sci = sched_childinfo_find(c); + if (!sci) return 0; - return sl_thd_thdid(t); + return sched_child_aep_create(sci, idx, owntc, key, ipiwin, ipimax, extrcv); } int diff --git a/src/components/implementation/sched/sched_info.c b/src/components/implementation/sched/sched_info.c index 
96c3c5b360..5a9bb457a2 100644 --- a/src/components/implementation/sched/sched_info.c +++ b/src/components/implementation/sched/sched_info.c @@ -70,6 +70,8 @@ sched_num_childsched_get(void) return sched_num_childsched[cos_cpuid()]; } +extern void __sched_stdio_init(void); + static void sched_childinfo_init_intern(int is_raw) { @@ -78,11 +80,10 @@ sched_childinfo_init_intern(int is_raw) comp_flag_t childflags; memset(childinfo[cos_cpuid()], 0, sizeof(struct sched_childinfo) * SCHED_MAX_CHILD_COMPS); + __sched_stdio_init(); while ((remaining = hypercall_comp_child_next(cos_spd_id(), &child, &childflags)) >= 0) { - struct cos_defcompinfo *child_dci = NULL; struct sched_childinfo *schedinfo = NULL; - struct sl_thd *initthd = NULL; compcap_t compcap = 0; if (is_raw) { @@ -92,15 +93,10 @@ sched_childinfo_init_intern(int is_raw) schedinfo = sched_childinfo_alloc(child, compcap, childflags); assert(schedinfo); - child_dci = sched_child_defci_get(schedinfo); hypercall_comp_cpubitmap_get(child, schedinfo->cpubmp); if (bitmap_check(schedinfo->cpubmp, cos_cpuid())) { PRINTLOG(PRINT_DEBUG, "Initializing child component %u, is_sched=%d\n", child, childflags & COMP_FLAG_SCHED); - initthd = sl_thd_initaep_alloc(child_dci, NULL, childflags & COMP_FLAG_SCHED, childflags & COMP_FLAG_SCHED ? 
1 : 0, 0, 0, 0); /* TODO: rate information */ - assert(initthd); - sched_child_initthd_set(schedinfo, initthd); - sched_child_init(schedinfo); if (childflags & COMP_FLAG_SCHED) ps_faa((unsigned long *)&sched_num_childsched[cos_cpuid()], 1); } diff --git a/src/components/implementation/sched/sched_info.h b/src/components/implementation/sched/sched_info.h index 30bd318fa5..a1895d717e 100644 --- a/src/components/implementation/sched/sched_info.h +++ b/src/components/implementation/sched/sched_info.h @@ -11,8 +11,10 @@ #include #include #include +#include #define SCHED_MAX_CHILD_COMPS 8 +CRT_CHAN_TYPE_PROTOTYPES(LU, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); struct sched_childinfo { struct cos_defcompinfo defcinfo; @@ -30,6 +32,8 @@ void sched_childinfo_init(void); void sched_childinfo_init_raw(void); extern unsigned int self_init[], num_child_init[]; +extern thdid_t sched_child_thd_create(struct sched_childinfo *schedci, thdclosure_index_t idx); +extern thdid_t sched_child_aep_create(struct sched_childinfo *schedci, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv); static inline struct cos_defcompinfo * sched_child_defci_get(struct sched_childinfo *sci) diff --git a/src/components/implementation/sched/sched_init.c b/src/components/implementation/sched/sched_init.c index 7b47764cad..d528982980 100644 --- a/src/components/implementation/sched/sched_init.c +++ b/src/components/implementation/sched/sched_init.c @@ -37,7 +37,7 @@ schedinit_child(void) if (!init) return 0; tcur = sl_thd_curr(); if (!tcur) return 0; - assert(tcur->schedthd == init); + assert(tcur->schedthd == init || tcur == init); /* thd retrieve */ do { @@ -52,7 +52,7 @@ schedinit_child(void) if (unlikely(t)) continue; aep.tid = thdid; - aep.tc = sl_thd_tcap(sl__globals_cpu()->sched_thd); + aep.tc = sl_thd_tcap(sl__globals_core()->sched_thd); t = sl_thd_init_ext(&aep, init); if (!t) return 0; } while (thdid); diff --git 
a/src/components/implementation/sched/test_sched/Makefile b/src/components/implementation/sched/test_sched/Makefile new file mode 100644 index 0000000000..b6383ecc8c --- /dev/null +++ b/src/components/implementation/sched/test_sched/Makefile @@ -0,0 +1,10 @@ +C_OBJS= +ASM_OBJS= +COMPONENT=test_sched.o +INTERFACES=sched schedinit crt +DEPENDENCIES=capmgr channel +IF_LIB= +ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr -lsl_blkpt + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/sched/test_sched/init.c b/src/components/implementation/sched/test_sched/init.c new file mode 100644 index 0000000000..83db6ea806 --- /dev/null +++ b/src/components/implementation/sched/test_sched/init.c @@ -0,0 +1,293 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. + * + * Copyright 2018, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include +#include +#include +#include +#include +#include +#include + +#define MAX_USE_PIPE_SZ 1 +#define INITIALIZE_PRIO 1 +#define INITIALIZE_PERIOD_MS (4000) +#define INITIALIZE_BUDGET_MS (2000) + +static struct sl_thd *__initializer_thd[NUM_CPU] CACHE_ALIGNED; + +u32_t cycs_per_usec = 0; +cycles_t *int_start = NULL; +volatile unsigned long *rdy = NULL; + +void +sched_child_init(struct sched_childinfo *schedci) +{ + vaddr_t dcbaddr; + struct sl_thd *initthd; + + assert(schedci); + assert(!(schedci->flags & COMP_FLAG_SCHED)); + schedci->initthd = sl_thd_initaep_alloc(sched_child_defci_get(schedci), NULL, 0, 0, 0, 0, 0, &dcbaddr); + assert(schedci->initthd); + initthd = schedci->initthd; + + sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, 2)); +} + +extern void __sched_stdio_thd_init(thdid_t, struct crt_chan *, struct crt_chan *); +#define MAX_PIPE_SZ 8 +CRT_CHAN_STATIC_ALLOC(c0, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c1, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); 
+CRT_CHAN_STATIC_ALLOC(c2, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c3, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c4, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c5, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c6, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c7, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); + +#define SPDID_INT 5 +#define SPDID_W1 6 +#define SPDID_W3 7 + +#define PRIO_START (MAX_PIPE_SZ + 8) + +#define PRIO_INT PRIO_START +#define PRIO_W0 (PRIO_START - 1) +#define PRIO_W1 (PRIO_START - 2) +#define PRIO_W2 (PRIO_START - 3) +#define PRIO_W3 (PRIO_START - 4) +#define PRIO_W4 (PRIO_START - 5) +#define PRIO_W5 (PRIO_START - 6) +#define PRIO_W6 (PRIO_START - 7) + +#define SND_DATA 0x1234 + +#define SHMCHANNEL_KEY 0x2020 +#define MAX_ITERS 100000 +cycles_t vals[MAX_ITERS] = { 0 }; +int iters = 0; +cycles_t tot = 0, wc = 0; +static int pc, tc; + +struct __thd_info { + struct sl_thd *t; + tcap_prio_t p; +} iot[MAX_PIPE_SZ + 1]; + +struct __pipe_info { + struct sl_thd *sndr, *rcvr; /* p2p channels */ + struct crt_chan *c; +} iop[MAX_PIPE_SZ]; + +static int +schedinit_self(void) +{ + if (ps_load(&tc) < (MAX_USE_PIPE_SZ + 1)) return 1; + + assert(ps_load(&tc) == (MAX_USE_PIPE_SZ + 1)); + + return 0; +} + +static void +__init_done(void *d) +{ + while (schedinit_self()) sl_thd_block_periodic(0); + + int i; + + for (i = 0; i < MAX_USE_PIPE_SZ; i++) { + if (i == 0) { + crt_chan_init_LU(iop[i].c); + } else { + assert(iop[i].sndr && iop[i].rcvr); + crt_chan_p2p_init_LU(iop[i].c, iop[i].sndr, iop[i].rcvr); + } + } + + /* don't want the threads to run before channels are initialized! */ + for (i = MAX_USE_PIPE_SZ; i >= 0; i--) { + PRINTC("%d, %lx, %u\n", i, (unsigned long)(iot[i].t), sl_thd_thdid(iot[i].t)); + assert(iot[i].t); + sl_thd_param_set(iot[i].t, sched_param_pack(SCHEDP_PRIO, iot[i].p)); + } + PRINTLOG(PRINT_DEBUG, "SELF (inc. 
CHILD) INIT DONE.\n"); + + sl_thd_exit(); + + assert(0); +} + + +static void +work_thd_fn(void *data) +{ + int is_last = (int)data; + + ps_faa(rdy, 1); + + while (1) { + chan_in(); + if (unlikely(is_last)) { + cycles_t end, diff; + if (iters >= MAX_ITERS) continue; + rdtscll(end); + assert(int_start); + diff = end - *int_start; + if (wc < diff) wc = diff; + tot += diff; + vals[iters] = diff; + //printc("%llu\n", diff); + iters++; + if (iters % 1000 == 0) printc("."); + + if (iters == MAX_ITERS) { + int i; + + for (i = 0; i < MAX_ITERS; i++) printc("%llu\n", vals[i]); + PRINTC("%llu, %llu\n", tot / iters, wc); + //tot = wc = 0; + //iters = 0; + } + continue; + } + chan_out(SND_DATA); + } +} + +thdid_t +sched_child_thd_create(struct sched_childinfo *schedci, thdclosure_index_t idx) +{ + vaddr_t addr; + struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 0, 0, 0, 0, 0, &addr, NULL); + assert(t); + if (cos_inv_token() == SPDID_W1) { + iot[2].t = t; + iot[2].p = PRIO_W1; + iop[1].rcvr = t; + iop[2].sndr = t; + __sched_stdio_thd_init(sl_thd_thdid(t), c1, c2); + } else if (cos_inv_token() == SPDID_W3) { + iot[4].t = t; + iot[4].p = PRIO_W3; + iop[3].rcvr = t; + __sched_stdio_thd_init(sl_thd_thdid(t), c3, NULL); + } + ps_faa(&tc, 1); + + return t ? sl_thd_thdid(t) : 0; +} + +thdid_t +sched_child_aep_create(struct sched_childinfo *schedci, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +{ + assert(cos_inv_token() == SPDID_INT); + int first = 1; + vaddr_t addr; + /* only 1 aep */ + if (!ps_cas(&first, 1, 0)) assert(0); + struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 1, owntc, key, ipiwin, ipimax, &addr, extrcv); + assert(t); + __sched_stdio_thd_init(sl_thd_thdid(t), NULL, c0); + iot[0].t = t; + iot[0].p = PRIO_INT; + iop[0].sndr = t; + ps_faa(&tc, 1); + + return t ? 
sl_thd_thdid(t) : 0; +} + +void +test_pipes_init(void) +{ + struct sl_thd *t = sl_thd_alloc(work_thd_fn, MAX_USE_PIPE_SZ == 1 ? (void *)1 : (void *)0); + assert(t); + iot[1].t = t; + iot[1].p = PRIO_W0; + iop[0].rcvr = t; /* no optimized path for rcving from INT thread */ + iop[1].sndr = t; + __sched_stdio_thd_init(sl_thd_thdid(t), c0, c1); + ps_faa(&tc, 1); + if (MAX_USE_PIPE_SZ >= 3) { + t = sl_thd_alloc(work_thd_fn, MAX_USE_PIPE_SZ == 3 ? (void *)1 : (void *)0); + assert(t); + iot[3].t = t; + iot[3].p = PRIO_W2; + iop[2].rcvr = t; + iop[3].sndr = t; + __sched_stdio_thd_init(sl_thd_thdid(t), c2, c3); + ps_faa(&tc, 1); + } +} + +void +cos_init(void) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo *ci = cos_compinfo_get(defci); + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + static u32_t cpubmp[NUM_CPU_BMP_WORDS] = { 0 }; + int i; + + assert(NUM_CPU == 1); + assert(MAX_USE_PIPE_SZ <= MAX_PIPE_SZ); + memset(iop, 0, sizeof(struct __pipe_info) * MAX_PIPE_SZ); + memset(iot, 0, sizeof(struct __thd_info) * (MAX_PIPE_SZ + 1)); + pc = tc = 0; + iop[0].c = c0; + iop[1].c = c1; + iop[2].c = c2; + iop[3].c = c3; + iop[4].c = c4; + iop[5].c = c5; + iop[6].c = c6; + iop[7].c = c7; + + PRINTLOG(PRINT_DEBUG, "CPU cycles per sec: %u\n", cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE)); + + if (ps_cas((unsigned long *)&first, NUM_CPU + 1, cos_cpuid())) { + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_init(); + cos_init_args_cpubmp(cpubmp); + } else { + while (!ps_load((unsigned long *)&init_done[first])) ; + + cos_defcompinfo_sched_init(); + } + ps_faa((unsigned long *)&init_done[cos_cpuid()], 1); + + /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! 
*/ + for (i = 0; i < NUM_CPU; i++) { + if (!bitmap_check(cpubmp, i)) continue; + + while (!ps_load((unsigned long *)&init_done[i])) ; + } + + sl_init_corebmp(100*SL_MIN_PERIOD_US, cpubmp); + vaddr_t tscaddr = 0; + cbuf_t id = channel_shared_page_alloc(SHMCHANNEL_KEY, &tscaddr); + assert(id > 0); + int_start = (cycles_t *)tscaddr; + *int_start = 0ULL; + rdy = (volatile unsigned long *)(int_start + 1); + *rdy = 0; + sched_childinfo_init(); + test_pipes_init(); + __initializer_thd[cos_cpuid()] = sl_thd_alloc(__init_done, NULL); + assert(__initializer_thd[cos_cpuid()]); + sl_thd_param_set(__initializer_thd[cos_cpuid()], sched_param_pack(SCHEDP_PRIO, INITIALIZE_PRIO)); + sl_thd_param_set(__initializer_thd[cos_cpuid()], sched_param_pack(SCHEDP_WINDOW, INITIALIZE_BUDGET_MS)); + sl_thd_param_set(__initializer_thd[cos_cpuid()], sched_param_pack(SCHEDP_BUDGET, INITIALIZE_PERIOD_MS)); + + hypercall_comp_init_done(); + + sl_sched_loop_nonblock(); + + PRINTLOG(PRINT_ERROR, "Should never have reached this point!!!\n"); + assert(0); +} diff --git a/src/components/implementation/srv_dummy/Makefile b/src/components/implementation/srv_dummy/Makefile index 0490a703e3..53929a7ceb 100644 --- a/src/components/implementation/srv_dummy/Makefile +++ b/src/components/implementation/srv_dummy/Makefile @@ -1,3 +1,3 @@ -INTERFACES=sched schedinit srv_dummy +INTERFACES=sched schedinit crt srv_dummy include ../Makefile.subdir diff --git a/src/components/implementation/srv_dummy/cdummy/Makefile b/src/components/implementation/srv_dummy/cdummy/Makefile index 1762e85c90..f6165eca08 100644 --- a/src/components/implementation/srv_dummy/cdummy/Makefile +++ b/src/components/implementation/srv_dummy/cdummy/Makefile @@ -5,7 +5,7 @@ INTERFACES=sched schedinit srv_dummy DEPENDENCIES=capmgr sched schedinit channel IF_LIB= FN_PREPEND=parent_ -ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr -lsinv_client +ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr -lsinv_client 
-lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/srv_dummy/cdummy/init.c b/src/components/implementation/srv_dummy/cdummy/init.c index cf568ceb2c..cdd8184421 100644 --- a/src/components/implementation/srv_dummy/cdummy/init.c +++ b/src/components/implementation/srv_dummy/cdummy/init.c @@ -97,7 +97,7 @@ cos_init(void) while (!ps_load(&init_done[i])) ; } - sl_init_cpubmp(SL_MIN_PERIOD_US, cpubmp); + sl_init_corebmp(SL_MIN_PERIOD_US, cpubmp); sched_childinfo_init(); __initializer_thd[cos_cpuid()] = sl_thd_alloc(__init_done, NULL); assert(__initializer_thd[cos_cpuid()]); diff --git a/src/components/implementation/srv_dummy/chan_backend.c b/src/components/implementation/srv_dummy/chan_backend.c new file mode 120000 index 0000000000..1f996d8e9b --- /dev/null +++ b/src/components/implementation/srv_dummy/chan_backend.c @@ -0,0 +1 @@ +../sched/chan_backend.c \ No newline at end of file diff --git a/src/components/implementation/srv_dummy/sched.c b/src/components/implementation/srv_dummy/sched.c index 338c99723b..9980008529 100644 --- a/src/components/implementation/srv_dummy/sched.c +++ b/src/components/implementation/srv_dummy/sched.c @@ -52,12 +52,13 @@ sched_thd_create_cserialized(thdclosure_index_t idx) spdid_t c = cos_inv_token(); struct cos_defcompinfo *dci; struct sl_thd *t = NULL; + vaddr_t dcbaddr; if (!c) return 0; dci = sched_child_defci_get(sched_childinfo_find(c)); if (!dci) return 0; - t = sl_thd_aep_alloc_ext(dci, NULL, idx, 0, 0, 0, 0, 0, NULL); + t = sl_thd_aep_alloc_ext(dci, NULL, idx, 0, 0, 0, 0, 0, &dcbaddr, NULL); if (!t) return 0; srv_dummy_thdinit(sl_thd_thdid(t), 0); @@ -75,12 +76,13 @@ sched_aep_create_cserialized(arcvcap_t *extrcv, int *unused, u32_t thdidx_owntc, microsec_t ipiwin = (microsec_t)ipiwin32b; u32_t ipimax = (key_ipimax << 16) >> 16; cos_channelkey_t key = (key_ipimax >> 16); + vaddr_t dcbaddr; if (!c) return 0; dci = sched_child_defci_get(sched_childinfo_find(c)); if 
(!dci) return 0; - t = sl_thd_aep_alloc_ext(dci, NULL, idx, 1, owntc, key, ipiwin, ipimax, extrcv); + t = sl_thd_aep_alloc_ext(dci, NULL, idx, 1, owntc, key, ipiwin, ipimax, &dcbaddr, extrcv); if (!t) return 0; srv_dummy_thdinit(sl_thd_thdid(t), 1); diff --git a/src/components/implementation/srv_dummy/sched_info.h b/src/components/implementation/srv_dummy/sched_info.h index 7cb898ec51..b922a2464b 100644 --- a/src/components/implementation/srv_dummy/sched_info.h +++ b/src/components/implementation/srv_dummy/sched_info.h @@ -11,8 +11,10 @@ #include #include #include +#include #define SCHED_MAX_CHILD_COMPS 8 +CRT_CHAN_TYPE_PROTOTYPES(LU, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); struct sched_childinfo { struct cos_defcompinfo defcinfo; diff --git a/src/components/implementation/srv_dummy/sched_init.c b/src/components/implementation/srv_dummy/sched_init.c index 7f88b858c5..ecd962e63f 100644 --- a/src/components/implementation/srv_dummy/sched_init.c +++ b/src/components/implementation/srv_dummy/sched_init.c @@ -56,7 +56,7 @@ schedinit_child(void) if (unlikely(t)) continue; aep.tid = thdid; - aep.tc = sl_thd_tcap(sl__globals_cpu()->sched_thd); + aep.tc = sl_thd_tcap(sl__globals_core()->sched_thd); t = sl_thd_init_ext(&aep, init); if (!t) return 0; } while (thdid); diff --git a/src/components/implementation/srv_dummy/sdummy/Makefile b/src/components/implementation/srv_dummy/sdummy/Makefile index 75fff5cefa..89e1ccf634 100644 --- a/src/components/implementation/srv_dummy/sdummy/Makefile +++ b/src/components/implementation/srv_dummy/sdummy/Makefile @@ -5,7 +5,7 @@ INTERFACES=sched schedinit srv_dummy DEPENDENCIES=capmgr sched schedinit IF_LIB= FN_PREPEND=parent_ -ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr +ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/srv_dummy/sdummy/init.c 
b/src/components/implementation/srv_dummy/sdummy/init.c index 367a575cc3..9cecb2e145 100644 --- a/src/components/implementation/srv_dummy/sdummy/init.c +++ b/src/components/implementation/srv_dummy/sdummy/init.c @@ -94,7 +94,7 @@ cos_init(void) while (!ps_load(&init_done[i])) ; } - sl_init_cpubmp(SL_MIN_PERIOD_US, cpubmp); + sl_init_corebmp(SL_MIN_PERIOD_US, cpubmp); sched_childinfo_init(); __initializer_thd[cos_cpuid()] = sl_thd_alloc(__init_done, NULL); assert(__initializer_thd[cos_cpuid()]); diff --git a/src/components/implementation/tests/crt_tests/Makefile b/src/components/implementation/tests/crt_tests/Makefile new file mode 100644 index 0000000000..1469929f49 --- /dev/null +++ b/src/components/implementation/tests/crt_tests/Makefile @@ -0,0 +1,8 @@ +COMPONENT=crtt.o +INTERFACES= +DEPENDENCIES= +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_fprr -lsl_thd_static_backend -lsl_blkpt + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/crt_tests/crttests.c b/src/components/implementation/tests/crt_tests/crttests.c new file mode 100644 index 0000000000..ac8882afac --- /dev/null +++ b/src/components/implementation/tests/crt_tests/crttests.c @@ -0,0 +1,248 @@ +/* + * Copyright 2016, Phani Gadepalli and Gabriel Parmer, GWU, gparmer@gwu.edu. + * + * This uses a two clause BSD License. 
+ */ + +#include +#include +#include +#include + +#include +#include + +struct cos_compinfo *ci; + +#define CHAN_ITER 1000000 +#define NCHANTHDS 5 +#define CHAN_BATCH 3 + +CRT_CHAN_STATIC_ALLOC(c0, int, 4); +CRT_CHAN_STATIC_ALLOC(c1, int, 4); +CRT_CHAN_STATIC_ALLOC(c2, int, 4); +CRT_CHAN_STATIC_ALLOC(c3, int, 4); +CRT_CHAN_STATIC_ALLOC(c4, int, 4); + +CRT_CHAN_TYPE_PROTOTYPES(test, int, 4); +struct crt_chan *chans[NCHANTHDS + 1]; +struct sl_thd *chan_thds[NCHANTHDS] = {NULL, }; + +typedef enum { CHILLING = 0, RECVING, SENDING } actions_t; +unsigned long status[NCHANTHDS]; +unsigned long cnts[NCHANTHDS] = {0, }; + +int +chantest_is_deadlocked(void) +{ + int i; + actions_t s = status[0]; + + /* Are all threads in the same blocked state? */ + for (i = 0; i < NCHANTHDS; i++) { + if (status[i] == CHILLING || status[i] != s) return 0; + } + + return 1; +} + +void +chantest_send(int thd_off, struct crt_chan *c) +{ + int send = cos_thdid(); + + if (crt_chan_full_test(c)) status[thd_off] = SENDING; + if (!chantest_is_deadlocked()) { + /* printc("\t%d: send\n", cos_thdid()); */ + crt_chan_send_test(c, &send); + } + status[thd_off] = CHILLING; +} + +void +chantest_recv(int thd_off, struct crt_chan *c) +{ + int recv; + + if (crt_chan_empty_test(c)) status[thd_off] = RECVING; + if (!chantest_is_deadlocked()) { + /* printc("\t%d: recv\n", cos_thdid()); */ + crt_chan_recv_test(c, &recv); + cnts[thd_off]++; + } + status[thd_off] = CHILLING; +} + +void +chan_thd(void *d) +{ + int thd_off = (int)d; + struct crt_chan **chan_pair = &chans[thd_off]; + int recv; + int i; + + for (i = 0; i < CHAN_ITER; i++) { + int j; + + /* printc("%d: pre-send\n", cos_thdid()); */ + for (j = 0; j < CHAN_BATCH; j++) { + chantest_send(thd_off, chan_pair[1]); + } + + /* printc("%d: pre-recv\n", cos_thdid()); */ + for (j = 0; j < CHAN_BATCH; j++) { + chantest_recv(thd_off, chan_pair[0]); + } + } + + printc("SUCCESS! 
Counts (should be within %d of each other): ", NCHANTHDS * CHAN_BATCH); + for (i = 0; i < NCHANTHDS; i++) { + printc("\t%ld", cnts[i]); + } + printc("\n"); + while (1) ; +} + +void +idle_thd(void *d) +{ + printc("FAILURE: deadlock!\n"); + while (1) ; +} + +void +test_chan(void) +{ + int i; + struct sl_thd *idle; + union sched_param_union idle_param = {.c = {.type = SCHEDP_PRIO, .value = 10}}; + + union sched_param_union sps[] = { + {.c = {.type = SCHEDP_PRIO, .value = 7}}, + {.c = {.type = SCHEDP_PRIO, .value = 6}}, + {.c = {.type = SCHEDP_PRIO, .value = 8}}, + {.c = {.type = SCHEDP_PRIO, .value = 5}}, + {.c = {.type = SCHEDP_PRIO, .value = 5}} + }; + + chans[0] = c0; + chans[1] = c1; + chans[2] = c2; + chans[3] = c3; + chans[4] = c4; + chans[5] = c0; + + for (i = 0; i < NCHANTHDS; i++) { + crt_chan_init_test(chans[i]); + } + + printc("Create threads:\n"); + for (i = 0; i < NCHANTHDS; i++) { + chan_thds[i] = sl_thd_alloc(chan_thd, (void *)i); + assert(chan_thds[i]); + printc("\tcreating thread %d at prio %d\n", sl_thd_thdid(chan_thds[i]), sps[i].c.value); + sl_thd_param_set(chan_thds[i], sps[i].v); + } + idle = sl_thd_alloc(idle_thd, NULL); + printc("\tcreating IDLE %d at prio %d\n", sl_thd_thdid(idle), idle_param.c.value); + sl_thd_param_set(idle, idle_param.v); + +} + +#define LOCK_ITER 1000000 +#define NLOCKTHDS 4 +struct crt_lock lock; +struct sl_thd *lock_thds[NLOCKTHDS] = {NULL, }; +unsigned int progress[NLOCKTHDS] = {0, }; +volatile thdid_t holder; + +thdid_t +next_thd(void) +{ + return sl_thd_thdid(lock_thds[(unsigned int)(ps_tsc() % NLOCKTHDS)]); +} + +void +lock_thd(void *d) +{ + int i, cnt, me = -1; + + for (i = 0; i < NLOCKTHDS; i++) { + if (sl_thd_thdid(lock_thds[i]) != cos_thdid()) continue; + + me = i; + } + assert(me != -1); + + sl_thd_yield(sl_thd_thdid(lock_thds[1])); + + for (i = 0; i < LOCK_ITER; i++) { + crt_lock_take(&lock); + + progress[me]++; + holder = cos_thdid(); + + sl_thd_yield(next_thd()); + + if (holder != cos_thdid()) { + 
printc("FAILURE\n"); + BUG(); + } + crt_lock_release(&lock); + sl_thd_yield(next_thd()); + } + + for (i = 0; i < NLOCKTHDS; i++) { + if (i == me) continue; + + if (progress[i] < LOCK_ITER) { + sl_thd_yield(sl_thd_thdid(lock_thds[i])); + } + } + + printc("SUCCESS!"); + while (1) ; +} + +void +test_lock(void) +{ + int i; + union sched_param_union sps[] = { + {.c = {.type = SCHEDP_PRIO, .value = 5}}, + {.c = {.type = SCHEDP_PRIO, .value = 6}}, + {.c = {.type = SCHEDP_PRIO, .value = 6}}, + {.c = {.type = SCHEDP_PRIO, .value = 7}} + }; + + crt_lock_init(&lock); + + printc("Create threads:\n"); + for (i = 0; i < NLOCKTHDS; i++) { + lock_thds[i] = sl_thd_alloc(lock_thd, NULL); + printc("\tcreating thread %d at prio %d\n", sl_thd_thdid(lock_thds[i]), sps[i].c.value); + sl_thd_param_set(lock_thds[i], sps[i].v); + } +} + +void +cos_init(void) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + ci = cos_compinfo_get(defci); + + printc("Unit-test for the crt (sl)\n"); + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_init(); + sl_init(SL_MIN_PERIOD_US); + + test_lock(); +// test_chan(); + + printc("Running benchmark...\n"); + sl_sched_loop_nonblock(); + + assert(0); + + return; +} diff --git a/src/components/implementation/tests/kernel_tests/k_perf_tests.c b/src/components/implementation/tests/kernel_tests/k_perf_tests.c index a49d4af3ff..81812d60c5 100644 --- a/src/components/implementation/tests/kernel_tests/k_perf_tests.c +++ b/src/components/implementation/tests/kernel_tests/k_perf_tests.c @@ -23,58 +23,58 @@ volatile cycles_t main_thd = 0, side_thd = 0; static void bounceback(void *d) { - while (1) { - rdtscll(side_thd); - cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - } + while (1) { + rdtscll(side_thd); + cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + } } static void test_thds_create_switch(void) { - thdcap_t ts; - int ret, i; + thdcap_t ts; + int ret, i; - 
perfdata_init(&pd[cos_cpuid()], "COS THD => COS_THD_SWITCH", test_results, ARRAY_SIZE); + perfdata_init(&pd[cos_cpuid()], "COS THD => COS_THD_SWITCH", test_results, ARRAY_SIZE); - ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, bounceback, NULL); - if (EXPECT_LL_LT(1, ts, "Thread Creation: Cannot Allocate")) { - return; - } + ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, bounceback, NULL, 0, 0); + if (EXPECT_LL_LT(1, ts, "Thread Creation: Cannot Allocate")) { + return; + } - for (i = 0; i < ITER; i++) { - rdtscll(main_thd); - ret = cos_thd_switch(ts); - EXPECT_LL_NEQ(0, ret, "COS Switch Error"); + for (i = 0; i < ITER; i++) { + rdtscll(main_thd); + ret = cos_thd_switch(ts); + EXPECT_LL_NEQ(0, ret, "COS Switch Error"); - perfdata_add(&pd[cos_cpuid()], (side_thd - main_thd)); - } + perfdata_add(&pd[cos_cpuid()], (side_thd - main_thd)); + } - perfdata_calc(&pd[cos_cpuid()]); + perfdata_calc(&pd[cos_cpuid()]); - PRINTC("\tCOS THD => COS_THD_SWITCH:\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), perfdata_sz(&pd[cos_cpuid()])); + PRINTC("\tCOS THD => COS_THD_SWITCH:\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), perfdata_sz(&pd[cos_cpuid()])); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), perfdata_99ptile(&pd[cos_cpuid()])); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), perfdata_99ptile(&pd[cos_cpuid()])); - perfdata_init(&pd[cos_cpuid()], "COS THD => COS_SWITCH", test_results, ARRAY_SIZE); + perfdata_init(&pd[cos_cpuid()], "COS THD => COS_SWITCH", test_results, ARRAY_SIZE); - for (i = 0; i < ITER; i++) { - 
rdtscll(main_thd); - ret = cos_switch(ts, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, 0, 0, 0); - EXPECT_LL_NEQ(0, ret, "COS Switch Error"); + for (i = 0; i < ITER; i++) { + rdtscll(main_thd); + ret = cos_switch(ts, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, 0, 0, 0); + EXPECT_LL_NEQ(0, ret, "COS Switch Error"); - perfdata_add(&pd[cos_cpuid()], (side_thd - main_thd)); - } + perfdata_add(&pd[cos_cpuid()], (side_thd - main_thd)); + } - perfdata_calc(&pd[cos_cpuid()]); + perfdata_calc(&pd[cos_cpuid()]); - PRINTC("\tCOS THD => COS_SWITCH:\t\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), perfdata_sz(&pd[cos_cpuid()])); + PRINTC("\tCOS THD => COS_SWITCH:\t\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), perfdata_sz(&pd[cos_cpuid()])); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), perfdata_99ptile(&pd[cos_cpuid()])); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), perfdata_99ptile(&pd[cos_cpuid()])); } /* @@ -86,162 +86,162 @@ test_thds_create_switch(void) static void async_thd_fn_perf(void *thdcap) { - thdcap_t tc = (thdcap_t)thdcap; - asndcap_t sc = scc_global[cos_cpuid()]; - arcvcap_t rc = rcc_global[cos_cpuid()]; - int i, ret, pending = 0; + thdcap_t tc = (thdcap_t)thdcap; + asndcap_t sc = scc_global[cos_cpuid()]; + arcvcap_t rc = rcc_global[cos_cpuid()]; + int i, ret, pending = 0; - for (i = 0; i < ITER; i++) { - cos_rcv(rc, 0, NULL); - cos_asnd(sc, 1); - } + for (i = 0; i < ITER; i++) { + cos_rcv(rc, 0); + cos_asnd(sc, 1); + } - cos_thd_switch(tc); + cos_thd_switch(tc); - for (i = 0; i < ITER + 1; i++) { - cos_rcv(rc, 0, NULL); - } + for (i 
= 0; i < ITER + 1; i++) { + cos_rcv(rc, 0); + } - ret = cos_thd_switch(tc); - EXPECT_LL_NEQ(0, ret, "COS Switch Error"); + ret = cos_thd_switch(tc); + EXPECT_LL_NEQ(0, ret, "COS Switch Error"); - EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); - assert(0); + EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); + assert(0); } static void async_thd_parent_perf(void *thdcap) { - thdcap_t tc = (thdcap_t)thdcap; - asndcap_t sc = scp_global[cos_cpuid()]; - arcvcap_t rc = rcc_global[cos_cpuid()]; - long long e = 0, s = 0; - int i, pending = 0; + thdcap_t tc = (thdcap_t)thdcap; + asndcap_t sc = scp_global[cos_cpuid()]; + arcvcap_t rc = rcc_global[cos_cpuid()]; + long long e = 0, s = 0; + int i, pending = 0; - perfdata_init(&pd[cos_cpuid()], "Async Endpoints => Roundtrip", test_results, ARRAY_SIZE); + perfdata_init(&pd[cos_cpuid()], "Async Endpoints => Roundtrip", test_results, ARRAY_SIZE); - for (i = 0; i < ITER; i++) { - rdtscll(s); - cos_asnd(sc, 1); - cos_rcv(rc, 0, NULL); - rdtscll(e); + for (i = 0; i < ITER; i++) { + rdtscll(s); + cos_asnd(sc, 1); + cos_rcv(rc, 0); + rdtscll(e); - perfdata_add(&pd[cos_cpuid()], (e - s)); - } + perfdata_add(&pd[cos_cpuid()], (e - s)); + } - perfdata_calc(&pd[cos_cpuid()]); + perfdata_calc(&pd[cos_cpuid()]); - PRINTC("\tAsync Endpoints => Roundtrip:\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), - perfdata_sz(&pd[cos_cpuid()])); + PRINTC("\tAsync Endpoints => Roundtrip:\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), + perfdata_sz(&pd[cos_cpuid()])); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), - perfdata_99ptile(&pd[cos_cpuid()])); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + 
perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), + perfdata_99ptile(&pd[cos_cpuid()])); - perfdata_init(&pd[cos_cpuid()], "Async Endpoints => One Way", test_results, ARRAY_SIZE); + perfdata_init(&pd[cos_cpuid()], "Async Endpoints => One Way", test_results, ARRAY_SIZE); - for (i = 0; i < ITER; i++) { - rdtscll(s); - cos_asnd(sc, 1); - rdtscll(e); + for (i = 0; i < ITER; i++) { + rdtscll(s); + cos_asnd(sc, 1); + rdtscll(e); - perfdata_add(&pd[cos_cpuid()], (e - s)); - } + perfdata_add(&pd[cos_cpuid()], (e - s)); + } - perfdata_calc(&pd[cos_cpuid()]); + perfdata_calc(&pd[cos_cpuid()]); - PRINTC("\tAsync Endpoints => One Way:\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), - perfdata_sz(&pd[cos_cpuid()])); + PRINTC("\tAsync Endpoints => One Way:\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), + perfdata_sz(&pd[cos_cpuid()])); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), - perfdata_99ptile(&pd[cos_cpuid()])); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), + perfdata_99ptile(&pd[cos_cpuid()])); - async_test_flag_[cos_cpuid()] = 0; - while (1) cos_thd_switch(tc); + async_test_flag_[cos_cpuid()] = 0; + while (1) cos_thd_switch(tc); } static void test_async_endpoints_perf(void) { - thdcap_t tcp, tcc; - tcap_t tccp, tccc; - arcvcap_t rcp, rcc; - - /* parent rcv capabilities */ - tcp = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_parent_perf, - (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - if(EXPECT_LL_LT(1, tcp, "Test Async Endpoints")) return; - tccp = 
cos_tcap_alloc(&booter_info); - if(EXPECT_LL_LT(1, tccp, "Test Async Endpoints")) return; - rcp = cos_arcv_alloc(&booter_info, tcp, tccp, booter_info.comp_cap, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE); - if(EXPECT_LL_LT(1, rcp, "Test Async Endpoints")) return; - if(EXPECT_LL_NEQ(0,cos_tcap_transfer(rcp, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, - TCAP_PRIO_MAX + 1), "Test Async Endpoints")) { - return; - } - - /* child rcv capabilities */ - tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn_perf, (void *)tcp); - if(EXPECT_LL_LT(1, tcc, "Test Async Endpoints")) return; - tccc = cos_tcap_alloc(&booter_info); - if(EXPECT_LL_LT(1, tccc, "Test Async Endpoints")) return; - rcc = cos_arcv_alloc(&booter_info, tcc, tccc, booter_info.comp_cap, rcp); - if(EXPECT_LL_LT(1, rcc, "Test Async Endpoints")) return; - if(EXPECT_LL_NEQ(0,cos_tcap_transfer(rcc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, - TCAP_PRIO_MAX), "Test Async Endpoints")) - return; - - /* make the snd channel to the child */ - scp_global[cos_cpuid()] = cos_asnd_alloc(&booter_info, rcc, booter_info.captbl_cap); - if(EXPECT_LL_EQ(0, scp_global[cos_cpuid()], "Test Async Endpoints")) return; - - /* make the snd channel to the parent */ - scc_global[cos_cpuid()] = cos_asnd_alloc(&booter_info, rcp, booter_info.captbl_cap); - if(EXPECT_LL_EQ(0, scp_global[cos_cpuid()], "Test Async Endpoints")) return; - - rcc_global[cos_cpuid()] = rcc; - rcp_global[cos_cpuid()] = rcp; - - async_test_flag_[cos_cpuid()] = 1; - while (async_test_flag_[cos_cpuid()]) cos_thd_switch(tcp); + thdcap_t tcp, tcc; + tcap_t tccp, tccc; + arcvcap_t rcp, rcc; + + /* parent rcv capabilities */ + tcp = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_parent_perf, + (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, 0, 0); + if(EXPECT_LL_LT(1, tcp, "Test Async Endpoints")) return; + tccp = cos_tcap_alloc(&booter_info); + if(EXPECT_LL_LT(1, tccp, "Test Async Endpoints")) return; + rcp = cos_arcv_alloc(&booter_info, tcp, 
tccp, booter_info.comp_cap, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE); + if(EXPECT_LL_LT(1, rcp, "Test Async Endpoints")) return; + if(EXPECT_LL_NEQ(0,cos_tcap_transfer(rcp, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, + TCAP_PRIO_MAX + 1), "Test Async Endpoints")) { + return; + } + + /* child rcv capabilities */ + tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn_perf, (void *)tcp, 0, 0); + if(EXPECT_LL_LT(1, tcc, "Test Async Endpoints")) return; + tccc = cos_tcap_alloc(&booter_info); + if(EXPECT_LL_LT(1, tccc, "Test Async Endpoints")) return; + rcc = cos_arcv_alloc(&booter_info, tcc, tccc, booter_info.comp_cap, rcp); + if(EXPECT_LL_LT(1, rcc, "Test Async Endpoints")) return; + if(EXPECT_LL_NEQ(0,cos_tcap_transfer(rcc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, + TCAP_PRIO_MAX), "Test Async Endpoints")) + return; + + /* make the snd channel to the child */ + scp_global[cos_cpuid()] = cos_asnd_alloc(&booter_info, rcc, booter_info.captbl_cap); + if(EXPECT_LL_EQ(0, scp_global[cos_cpuid()], "Test Async Endpoints")) return; + + /* make the snd channel to the parent */ + scc_global[cos_cpuid()] = cos_asnd_alloc(&booter_info, rcp, booter_info.captbl_cap); + if(EXPECT_LL_EQ(0, scp_global[cos_cpuid()], "Test Async Endpoints")) return; + + rcc_global[cos_cpuid()] = rcc; + rcp_global[cos_cpuid()] = rcp; + + async_test_flag_[cos_cpuid()] = 1; + while (async_test_flag_[cos_cpuid()]) cos_thd_switch(tcp); } void test_print_ubench(void) { - PRINTC("\tSINV:\t\t\t\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - result_sinv.avg, result_sinv.max, result_sinv.max, - result_sinv.sz); + PRINTC("\tSINV:\t\t\t\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + result_sinv.avg, result_sinv.max, result_sinv.max, + result_sinv.sz); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - result_sinv.sd, result_sinv.p90tile, result_sinv.p95tile, - result_sinv.p99tile); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + result_sinv.sd, 
result_sinv.p90tile, result_sinv.p95tile, + result_sinv.p99tile); - PRINTC("\tTimer => Timeout Overhead: \t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - result_test_timer.avg, result_test_timer.max, result_test_timer.min, - result_test_timer.sz); + PRINTC("\tTimer => Timeout Overhead: \t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + result_test_timer.avg, result_test_timer.max, result_test_timer.min, + result_test_timer.sz); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - result_test_timer.sd, result_test_timer.p90tile, result_test_timer.p95tile, - result_test_timer.p99tile); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + result_test_timer.sd, result_test_timer.p90tile, result_test_timer.p95tile, + result_test_timer.p99tile); - PRINTC("\tTimer => Budget based: \t\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - result_budgets_single.avg, result_budgets_single.max, result_budgets_single.min, - result_budgets_single.sz); + PRINTC("\tTimer => Budget based: \t\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + result_budgets_single.avg, result_budgets_single.max, result_budgets_single.min, + result_budgets_single.sz); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - result_budgets_single.sd, result_budgets_single.p90tile, result_budgets_single.p95tile, - result_budgets_single.p99tile); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + result_budgets_single.sd, result_budgets_single.p90tile, result_budgets_single.p95tile, + result_budgets_single.p99tile); } void test_run_perf_kernel(void) { - cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - test_thds_create_switch(); - test_async_endpoints_perf(); - test_print_ubench(); + cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + test_thds_create_switch(); + test_async_endpoints_perf(); + test_print_ubench(); } diff --git a/src/components/implementation/tests/kernel_tests/k_test_async.c 
b/src/components/implementation/tests/kernel_tests/k_test_async.c index 19d155f2c6..e32db4c61b 100644 --- a/src/components/implementation/tests/kernel_tests/k_test_async.c +++ b/src/components/implementation/tests/kernel_tests/k_test_async.c @@ -17,139 +17,136 @@ static int failure = 0; static void async_thd_fn(void *thdcap) { - thdcap_t tc = (thdcap_t)thdcap; - arcvcap_t rc = rcc_global[cos_cpuid()]; - int pending, rcvd, ret; - - pending = cos_rcv(rc, RCV_NON_BLOCKING, NULL); - if (EXPECT_LL_NEQ(3, pending, "Test Async Endpoints")) failure = 1; - - pending = cos_rcv(rc, RCV_NON_BLOCKING | RCV_ALL_PENDING, &rcvd); - if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; - - pending = cos_rcv(rc, RCV_ALL_PENDING, &rcvd); - /* switch */ - if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; - - pending = cos_rcv(rc, 0, NULL); - /* switch */ - if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; - - pending = cos_rcv(rc, 0, NULL); - /* switch */ - if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; - - pending = cos_rcv(rc, RCV_NON_BLOCKING, NULL); - if (EXPECT_LL_NEQ(pending, -EAGAIN, "Test Async Endpoints")) failure = 1; - - pending = cos_rcv(rc, 0, NULL); - /* switch */ - if (EXPECT_LL_NEQ(0, 1, "Test Async Endpoints")) failure = 1; - - ret = cos_thd_switch(tc); - if (EXPECT_LL_NEQ(0, ret, "COS Switch Error") || - EXPECT_LL_NEQ(0, 1, "Test Async Endpoints")) { - failure = 1; - } - while (1) cos_thd_switch(tc); + thdcap_t tc = (thdcap_t)thdcap; + arcvcap_t rc = rcc_global[cos_cpuid()]; + int pending, rcvd, ret; + + pending = cos_rcv(rc, RCV_NON_BLOCKING); + if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; + + pending = cos_rcv(rc, 0); + /* switch */ + if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; + + pending = cos_rcv(rc, 0); + /* switch */ + if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; + + pending = cos_rcv(rc, 0); + /* switch */ + if 
(EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; + + pending = cos_rcv(rc, RCV_NON_BLOCKING); + if (EXPECT_LL_NEQ(pending, -EAGAIN, "Test Async Endpoints")) failure = 1; + + pending = cos_rcv(rc, 0); + /* switch */ + if (EXPECT_LL_NEQ(0, 1, "Test Async Endpoints")) failure = 1; + + ret = cos_thd_switch(tc); + if (EXPECT_LL_NEQ(0, ret, "COS Switch Error") || + EXPECT_LL_NEQ(0, 1, "Test Async Endpoints")) { + failure = 1; + } + while (1) cos_thd_switch(tc); } static void async_thd_parent(void *thdcap) { - thdcap_t tc = (thdcap_t)thdcap; - arcvcap_t rc = rcp_global[cos_cpuid()]; - asndcap_t sc = scp_global[cos_cpuid()]; - int ret; - thdid_t tid; - int blocked, rcvd; - cycles_t cycles, now; - tcap_time_t thd_timeout; - - /* NON_BLOCKING ASND with 0 as arg*/ - ret = cos_asnd(sc, 0); - ret = cos_asnd(sc, 0); - ret = cos_asnd(sc, 0); - ret = cos_asnd(sc, 1); - - /* switch */ - /* child blocked at this point, parent is using child's tcap, this call yields to the child */ - ret = cos_asnd(sc, 0); - - /* switch */ - ret = cos_asnd(sc, 0); - if (EXPECT_LL_NEQ(0, ret, "Test Async Endpoints")) failure = 1; - - /* switch */ - ret = cos_asnd(sc, 1); - if (EXPECT_LL_NEQ(0, ret, "Test Async Endpoints")) failure = 1; - - /* switch */ - cos_sched_rcv(rc, RCV_ALL_PENDING, 0, &rcvd, &tid, &blocked, &cycles, &thd_timeout); - rdtscll(now); - - async_test_flag_[cos_cpuid()] = 0; - while (1) cos_thd_switch(tc); + thdcap_t tc = (thdcap_t)thdcap; + arcvcap_t rc = rcp_global[cos_cpuid()]; + asndcap_t sc = scp_global[cos_cpuid()]; + int ret; + thdid_t tid; + int blocked; + cycles_t cycles, now; + tcap_time_t thd_timeout; + + /* NON_BLOCKING ASND with 0 as arg*/ + ret = cos_asnd(sc, 0); + ret = cos_asnd(sc, 0); + ret = cos_asnd(sc, 0); + ret = cos_asnd(sc, 1); + + /* switch */ + /* child blocked at this point, parent is using child's tcap, this call yields to the child */ + ret = cos_asnd(sc, 0); + + /* switch */ + ret = cos_asnd(sc, 0); + if (EXPECT_LL_NEQ(0, ret, "Test Async 
Endpoints")) failure = 1; + + /* switch */ + ret = cos_asnd(sc, 1); + if (EXPECT_LL_NEQ(0, ret, "Test Async Endpoints")) failure = 1; + + /* switch to parent */ + cos_sched_rcv(rc, 0, 0, &tid, &blocked, &cycles, &thd_timeout); + rdtscll(now); + + async_test_flag_[cos_cpuid()] = 0; + while (1) cos_thd_switch(tc); } void test_async_endpoints(void) { - thdcap_t tcp, tcc; - tcap_t tccp, tccc; - arcvcap_t rcp, rcc; - asndcap_t scr; - - /* parent rcv capabilities */ - tcp = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_parent, - (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - if (EXPECT_LL_LT(1, tcp, "Test Async Endpoints")) { - return; - } - tccp = cos_tcap_alloc(&booter_info); - if (EXPECT_LL_LT(1, tccp, "Test Async Endpoints")) { - return; - } - rcp = cos_arcv_alloc(&booter_info, tcp, tccp, booter_info.comp_cap, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE); - if (EXPECT_LL_LT(1, rcp, "Test Async Endpoints")) { - return; - } - if (EXPECT_LL_NEQ(0,cos_tcap_transfer(rcp, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, TCAP_PRIO_MAX), - "Test Async Endpoints")) { - return; - } - - /* child rcv capabilities */ - tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn, (void *)tcp); - if (EXPECT_LL_LT(1, tcc, "Test Async Endpoints")) { - return; - } - tccc = cos_tcap_alloc(&booter_info); - if (EXPECT_LL_LT(1, tccc, "Test Async Endpoints")) { - return; - } - rcc = cos_arcv_alloc(&booter_info, tcc, tccc, booter_info.comp_cap, rcp); - if (EXPECT_LL_LT(1, rcc, "Test Async Endpoints")) { - return; - } - if (EXPECT_LL_NEQ(0,cos_tcap_transfer(rcc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, - TCAP_PRIO_MAX + 1), "Test Async Endpoints")) { - return; - } - - /* make the snd channel to the child */ - scp_global[cos_cpuid()] = cos_asnd_alloc(&booter_info, rcc, booter_info.captbl_cap); - if (EXPECT_LL_EQ(0, scp_global[cos_cpuid()], "Test Async Endpoints")) return; - scr = cos_asnd_alloc(&booter_info, rcp, booter_info.captbl_cap); - if (EXPECT_LL_EQ(0, scr, "Test 
Async Endpoints")) return; - - rcc_global[cos_cpuid()] = rcc; - rcp_global[cos_cpuid()] = rcp; - - async_test_flag_[cos_cpuid()] = 1; - while (async_test_flag_[cos_cpuid()]) cos_asnd(scr, 1); - - CHECK_STATUS_FLAG(); - PRINTC("\t%s: \t\tSuccess\n", "Asynchronous Endpoints"); - EXIT_FN(); + thdcap_t tcp, tcc; + tcap_t tccp, tccc; + arcvcap_t rcp, rcc; + asndcap_t scr; + + /* parent rcv capabilities */ + tcp = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_parent, + (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, 0, 0); + if (EXPECT_LL_LT(1, tcp, "Test Async Endpoints")) { + return; + } + tccp = cos_tcap_alloc(&booter_info); + if (EXPECT_LL_LT(1, tccp, "Test Async Endpoints")) { + return; + } + rcp = cos_arcv_alloc(&booter_info, tcp, tccp, booter_info.comp_cap, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE); + if (EXPECT_LL_LT(1, rcp, "Test Async Endpoints")) { + return; + } + if (EXPECT_LL_NEQ(0,cos_tcap_transfer(rcp, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, TCAP_PRIO_MAX), + "Test Async Endpoints")) { + return; + } + + /* child rcv capabilities */ + tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn, (void *)tcp, 0, 0); + if (EXPECT_LL_LT(1, tcc, "Test Async Endpoints")) { + return; + } + tccc = cos_tcap_alloc(&booter_info); + if (EXPECT_LL_LT(1, tccc, "Test Async Endpoints")) { + return; + } + rcc = cos_arcv_alloc(&booter_info, tcc, tccc, booter_info.comp_cap, rcp); + if (EXPECT_LL_LT(1, rcc, "Test Async Endpoints")) { + return; + } + if (EXPECT_LL_NEQ(0,cos_tcap_transfer(rcc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, + TCAP_PRIO_MAX + 1), "Test Async Endpoints")) { + return; + } + + /* make the snd channel to the child */ + scp_global[cos_cpuid()] = cos_asnd_alloc(&booter_info, rcc, booter_info.captbl_cap); + if (EXPECT_LL_EQ(0, scp_global[cos_cpuid()], "Test Async Endpoints")) return; + scr = cos_asnd_alloc(&booter_info, rcp, booter_info.captbl_cap); + if (EXPECT_LL_EQ(0, scr, "Test Async Endpoints")) return; + + 
rcc_global[cos_cpuid()] = rcc; + rcp_global[cos_cpuid()] = rcp; + + async_test_flag_[cos_cpuid()] = 1; + while (async_test_flag_[cos_cpuid()]) cos_asnd(scr, 1); + + CHECK_STATUS_FLAG(); + PRINTC("\t%s: \t\tSuccess\n", "Asynchronous Endpoints"); + EXIT_FN(); } diff --git a/src/components/implementation/tests/kernel_tests/k_test_captbl.c b/src/components/implementation/tests/kernel_tests/k_test_captbl.c index 4365195e08..76532eeef0 100644 --- a/src/components/implementation/tests/kernel_tests/k_test_captbl.c +++ b/src/components/implementation/tests/kernel_tests/k_test_captbl.c @@ -14,20 +14,20 @@ extern void *__inv_test_serverfn(int a, int b, int c); void test_captbl_expands(void) { - int i; - compcap_t cc; + int i; + compcap_t cc; - cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, (vaddr_t)NULL); - if (EXPECT_LL_LT(1, cc, "Capability Table Expansion")) { - return; - } - for (i = 0; i < CAPTBL_ITER; i++) { - sinvcap_t ic; + cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, 0, (vaddr_t)NULL, (vaddr_t)NULL); + if (EXPECT_LL_LT(1, cc, "Capability Table Expansion")) { + return; + } + for (i = 0; i < CAPTBL_ITER; i++) { + sinvcap_t ic; - ic = cos_sinv_alloc(&booter_info, cc, (vaddr_t)__inv_test_serverfn, 0); - if(EXPECT_LL_LT(1, ic, "Capability Table: Cannot Allocate")) { - return; - } - } - PRINTC("\t%s: \t\tSuccess\n", "Capability Table Expansion"); + ic = cos_sinv_alloc(&booter_info, cc, (vaddr_t)__inv_test_serverfn, 0); + if(EXPECT_LL_LT(1, ic, "Capability Table: Cannot Allocate")) { + return; + } + } + PRINTC("\t%s: \t\tSuccess\n", "Capability Table Expansion"); } diff --git a/src/components/implementation/tests/kernel_tests/k_test_inv.c b/src/components/implementation/tests/kernel_tests/k_test_inv.c index fcb9fa132a..f6833c36ba 100644 --- a/src/components/implementation/tests/kernel_tests/k_test_inv.c +++ b/src/components/implementation/tests/kernel_tests/k_test_inv.c @@ -16,7 +16,7 @@ static cycles_t 
test_results[ARRAY_SIZE] = { 0 }; int test_serverfn(int a, int b, int c) { - return 0xDEADBEEF; + return 0xDEADBEEF; } extern void *__inv_test_serverfn(int a, int b, int c); @@ -24,68 +24,68 @@ extern void *__inv_test_serverfn(int a, int b, int c); static inline int call_cap_mb(u32_t cap_no, int arg1, int arg2, int arg3) { - int ret; - - /* - * Which stack should we use for this invocation? Simple, use - * this stack, at the current sp. This is essentially a - * function call into another component, with odd calling - * conventions. - */ - cap_no = (cap_no + 1) << COS_CAPABILITY_OFFSET; - - __asm__ __volatile__("pushl %%ebp\n\t" - "movl %%esp, %%ebp\n\t" - "movl %%esp, %%edx\n\t" - "movl $1f, %%ecx\n\t" - "sysenter\n\t" - "1:\n\t" - "popl %%ebp" - : "=a"(ret) - : "a"(cap_no), "b"(arg1), "S"(arg2), "D"(arg3) - : "memory", "cc", "ecx", "edx"); - - return ret; + int ret; + + /* + * Which stack should we use for this invocation? Simple, use + * this stack, at the current sp. This is essentially a + * function call into another component, with odd calling + * conventions. 
+ */ + cap_no = (cap_no + 1) << COS_CAPABILITY_OFFSET; + + __asm__ __volatile__("pushl %%ebp\n\t" + "movl %%esp, %%ebp\n\t" + "movl %%esp, %%edx\n\t" + "movl $1f, %%ecx\n\t" + "sysenter\n\t" + "1:\n\t" + "popl %%ebp" + : "=a"(ret) + : "a"(cap_no), "b"(arg1), "S"(arg2), "D"(arg3) + : "memory", "cc", "ecx", "edx"); + + return ret; } void test_inv(void) { - compcap_t cc; - sinvcap_t ic; - unsigned int r; - int i; - cycles_t start_cycles = 0LL, end_cycles = 0LL; - - perfdata_init(&result, "SINV", test_results, ARRAY_SIZE); - - cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, (vaddr_t)NULL); - if (EXPECT_LL_LT(1, cc, "Invocation: Cannot Allocate")) return; - ic = cos_sinv_alloc(&booter_info, cc, (vaddr_t)__inv_test_serverfn, 0); - if (EXPECT_LL_LT(1, ic, "Invocation: Cannot Allocate")) return; - - r = call_cap_mb(ic, 1, 2, 3); - if (EXPECT_LLU_NEQ(0xDEADBEEF, r, "Test Invocation")) return; - - for (i = 0; i < ITER; i++) { - rdtscll(start_cycles); - call_cap_mb(ic, 1, 2, 3); - rdtscll(end_cycles); - - perfdata_add(&result, end_cycles - start_cycles); - } - - perfdata_calc(&result); - result_sinv.avg = perfdata_avg(&result); - result_sinv.max = perfdata_avg(&result); - result_sinv.min = perfdata_avg(&result); - result_sinv.sz = perfdata_avg(&result); - result_sinv.sd = perfdata_avg(&result); - result_sinv.p90tile = perfdata_avg(&result); - result_sinv.p95tile = perfdata_avg(&result); - result_sinv.p99tile = perfdata_avg(&result); - - CHECK_STATUS_FLAG(); - PRINTC("\t%s: \t\tSuccess\n", "Synchronous Invocations"); - EXIT_FN(); + compcap_t cc; + sinvcap_t ic; + unsigned int r; + int i; + cycles_t start_cycles = 0LL, end_cycles = 0LL; + + perfdata_init(&result, "SINV", test_results, ARRAY_SIZE); + + cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, 0, (vaddr_t)NULL, (vaddr_t)NULL); + if (EXPECT_LL_LT(1, cc, "Invocation: Cannot Allocate")) return; + ic = cos_sinv_alloc(&booter_info, cc, 
(vaddr_t)__inv_test_serverfn, 0); + if (EXPECT_LL_LT(1, ic, "Invocation: Cannot Allocate")) return; + + r = call_cap_mb(ic, 1, 2, 3); + if (EXPECT_LLU_NEQ(0xDEADBEEF, r, "Test Invocation")) return; + + for (i = 0; i < ITER; i++) { + rdtscll(start_cycles); + call_cap_mb(ic, 1, 2, 3); + rdtscll(end_cycles); + + perfdata_add(&result, end_cycles - start_cycles); + } + + perfdata_calc(&result); + result_sinv.avg = perfdata_avg(&result); + result_sinv.max = perfdata_avg(&result); + result_sinv.min = perfdata_avg(&result); + result_sinv.sz = perfdata_avg(&result); + result_sinv.sd = perfdata_avg(&result); + result_sinv.p90tile = perfdata_avg(&result); + result_sinv.p95tile = perfdata_avg(&result); + result_sinv.p99tile = perfdata_avg(&result); + + CHECK_STATUS_FLAG(); + PRINTC("\t%s: \t\tSuccess\n", "Synchronous Invocations"); + EXIT_FN(); } diff --git a/src/components/implementation/tests/kernel_tests/k_test_mem.c b/src/components/implementation/tests/kernel_tests/k_test_mem.c index b10fa54e94..4da2919749 100644 --- a/src/components/implementation/tests/kernel_tests/k_test_mem.c +++ b/src/components/implementation/tests/kernel_tests/k_test_mem.c @@ -13,48 +13,48 @@ void test_mem_alloc(void) { - char * p, *s, *t, *prev; - int i; - const char *chk = "SUCCESS"; - int fail_contiguous = 0; + char * p, *s, *t, *prev; + int i; + const char *chk = "SUCCESS"; + int fail_contiguous = 0; - p = cos_page_bump_alloc(&booter_info); - if (p == NULL) { - EXPECT_LL_NEQ(0, 1, "Memory Test: Cannot Allocate"); - return; - } - PRINTC("\t%s: \t\t\tSuccess\n", "Memory => Allocation"); - strcpy(p, chk); + p = cos_page_bump_alloc(&booter_info); + if (p == NULL) { + EXPECT_LL_NEQ(0, 1, "Memory Test: Cannot Allocate"); + return; + } + PRINTC("\t%s: \t\t\tSuccess\n", "Memory => Allocation"); + strcpy(p, chk); - if (EXPECT_LL_NEQ(0, strcmp(chk, p), "Memory Test: Wrong STRCPY")) { - return; - } + if (EXPECT_LL_NEQ(0, strcmp(chk, p), "Memory Test: Wrong STRCPY")) { + return; + } - s = 
cos_page_bump_alloc(&booter_info); - assert(s); - prev = s; - for (i = 0; i < TEST_NPAGES; i++) { - t = cos_page_bump_alloc(&booter_info); - if (t == NULL){ - EXPECT_LL_EQ(0, 1, "Memory Test: Cannot Allocate"); - return; - } - if (t != prev + PAGE_SIZE) { - fail_contiguous = 1; - } - prev = t; - } - if (!fail_contiguous) { - memset(s, 0, TEST_NPAGES * PAGE_SIZE); - } else if (EXPECT_LL_EQ(i, TEST_NPAGES,"Memory Test: Cannot Allocate contiguous")) { - return; - } + s = cos_page_bump_alloc(&booter_info); + assert(s); + prev = s; + for (i = 0; i < TEST_NPAGES; i++) { + t = cos_page_bump_alloc(&booter_info); + if (t == NULL){ + EXPECT_LL_EQ(0, 1, "Memory Test: Cannot Allocate"); + return; + } + if (t != prev + PAGE_SIZE) { + fail_contiguous = 1; + } + prev = t; + } + if (!fail_contiguous) { + memset(s, 0, TEST_NPAGES * PAGE_SIZE); + } else if (EXPECT_LL_EQ(i, TEST_NPAGES,"Memory Test: Cannot Allocate contiguous")) { + return; + } - t = cos_page_bump_allocn(&booter_info, TEST_NPAGES * PAGE_SIZE); - if (t == NULL) { - EXPECT_LL_NEQ(0, 1, "Memory Test: Cannot Allocate"); - return; - } - memset(t, 0, TEST_NPAGES * PAGE_SIZE); - PRINTC("\t%s: \t\t\tSuccess\n", "Memory => R & W"); + t = cos_page_bump_allocn(&booter_info, TEST_NPAGES * PAGE_SIZE); + if (t == NULL) { + EXPECT_LL_NEQ(0, 1, "Memory Test: Cannot Allocate"); + return; + } + memset(t, 0, TEST_NPAGES * PAGE_SIZE); + PRINTC("\t%s: \t\t\tSuccess\n", "Memory => R & W"); } diff --git a/src/components/implementation/tests/kernel_tests/k_test_tcap.c b/src/components/implementation/tests/kernel_tests/k_test_tcap.c index f69dfa5fed..6b1a311552 100644 --- a/src/components/implementation/tests/kernel_tests/k_test_tcap.c +++ b/src/components/implementation/tests/kernel_tests/k_test_tcap.c @@ -8,6 +8,7 @@ #include #include "kernel_tests.h" +#include struct results result_test_timer; struct results result_budgets_single; @@ -19,157 +20,156 @@ static cycles_t test_results[ARRAY_SIZE] = { 0 }; static void spinner(void *d) { - 
while (1); + while (1); } void -sched_events_clear(void) +sched_events_clear_nonblock(void) { - thdid_t tid; - int blocked, rcvd; - cycles_t cycles, now; - tcap_time_t timer, thd_timeout; - - while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_ALL_PENDING, 0, - &rcvd, &tid, &blocked, &cycles, &thd_timeout) != 0); + struct cos_sched_event e; + cos_ul_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_NON_BLOCKING, 0, &e); +} +void +sched_events_clear(void) +{ + struct cos_sched_event e; + while (cos_ul_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, &e)) ; } void test_timer(void) { - thdcap_t tc; - cycles_t c = 0, p = 0; - int i, ret; - cycles_t s, e; - thdid_t tid; - int blocked, rcvd; - cycles_t cycles, now, utime; - long long time, mask; - tcap_time_t timer, thd_timeout; - - tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, spinner, NULL); - - perfdata_init(&result, "COS THD => COS_THD_SWITCH", test_results, ARRAY_SIZE); - - for (i = 0; i <= TEST_ITER; i++){ - rdtscll(now); - timer = tcap_cyc2time(now + GRANULARITY * cyc_per_usec); - cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, - cos_sched_sync()); - p = c; - rdtscll(c); - time = (c - now - (cycles_t)(GRANULARITY * cyc_per_usec)); - mask = (time >> (sizeof(long long) * CHAR_BIT - 1)); - utime = (time + mask) ^ mask; - - if (i > 0) { - perfdata_add(&result, utime); - - if (EXPECT_LLU_LT((long long unsigned)(c-now), (unsigned)(GRANULARITY * cyc_per_usec * MAX_THDS), - "Timer: Failure on MAX") || - EXPECT_LLU_LT((unsigned)(GRANULARITY * cyc_per_usec * MIN_THDS), (long long unsigned)(c-now), - "Timer: failure on MIN")) { - return; - } - } - sched_events_clear(); - } - - perfdata_calc(&result); - result_test_timer.avg = perfdata_avg(&result); - result_test_timer.max = perfdata_avg(&result); - result_test_timer.min = perfdata_avg(&result); - result_test_timer.sz = perfdata_avg(&result); - result_test_timer.sd = perfdata_avg(&result); - 
result_test_timer.p90tile = perfdata_avg(&result); - result_test_timer.p95tile = perfdata_avg(&result); - result_test_timer.p99tile = perfdata_avg(&result); - - /* Timer in past */ - c = 0, p = 0; - - rdtscll(c); - timer = tcap_cyc2time(c - GRANULARITY * cyc_per_usec); - cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, - cos_sched_sync()); - p = c; - rdtscll(c); - - if (EXPECT_LLU_LT((long long unsigned)(c-p), (unsigned)(GRANULARITY * cyc_per_usec), "Timer: Past")) { - return; - } - - sched_events_clear(); - - /* Timer now */ - c = 0, p = 0; - - rdtscll(c); - timer = tcap_cyc2time(c); - cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, - cos_sched_sync()); - p = c; - rdtscll(c); - - if (EXPECT_LLU_LT((long long unsigned)(c-p), (unsigned)(GRANULARITY * cyc_per_usec), "Timer: Now")) { - return; - } - - cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_ALL_PENDING, 0, - &rcvd, &tid, &blocked, &cycles, &thd_timeout) - ; - - EXPECT_LLU_LT((long long unsigned)cycles, (long long unsigned)(c-p), "Timer => Cycles time"); - - sched_events_clear(); - PRINTC("\t%s: \t\t\tSuccess\n", "One-Shot Timeout"); + thdcap_t tc; + cycles_t c = 0, p = 0; + int i, ret; + cycles_t s, e; + cycles_t cycles, now, utime; + long long time, mask; + tcap_time_t timer, thd_timeout; + + tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, spinner, NULL, 0, 0); + + perfdata_init(&result, "COS THD => COS_THD_SWITCH", test_results, ARRAY_SIZE); + + for (i = 0; i <= TEST_ITER; i++){ + rdtscll(now); + timer = tcap_cyc2time(now + GRANULARITY * cyc_per_usec); + cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, + cos_sched_sync()); + p = c; + rdtscll(c); + time = (c - now - (cycles_t)(GRANULARITY * cyc_per_usec)); + mask = (time >> (sizeof(long long) * CHAR_BIT - 1)); + utime = (time + mask) ^ mask; + + if (i > 0) { + perfdata_add(&result, utime); + + if 
(EXPECT_LLU_LT((long long unsigned)(c-now), (unsigned)(GRANULARITY * cyc_per_usec * MAX_THDS), + "Timer: Failure on MAX") || + EXPECT_LLU_LT((unsigned)(GRANULARITY * cyc_per_usec * MIN_THDS), (long long unsigned)(c-now), + "Timer: failure on MIN")) { + return; + } + } + sched_events_clear(); + } + + perfdata_calc(&result); + result_test_timer.avg = perfdata_avg(&result); + result_test_timer.max = perfdata_avg(&result); + result_test_timer.min = perfdata_avg(&result); + result_test_timer.sz = perfdata_avg(&result); + result_test_timer.sd = perfdata_avg(&result); + result_test_timer.p90tile = perfdata_avg(&result); + result_test_timer.p95tile = perfdata_avg(&result); + result_test_timer.p99tile = perfdata_avg(&result); + + /* Timer in past */ + c = 0, p = 0; + + rdtscll(c); + timer = tcap_cyc2time(c - GRANULARITY * cyc_per_usec); + cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, + cos_sched_sync()); + p = c; + rdtscll(c); + + if (EXPECT_LLU_LT((long long unsigned)(c-p), (unsigned)(GRANULARITY * cyc_per_usec), "Timer: Past")) { + return; + } + + sched_events_clear(); + + /* Timer now */ + c = 0, p = 0; + + rdtscll(c); + timer = tcap_cyc2time(c); + cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, + cos_sched_sync()); + p = c; + rdtscll(c); + + if (EXPECT_LLU_LT((long long unsigned)(c-p), (unsigned)(GRANULARITY * cyc_per_usec), "Timer: Now")) { + return; + } + + struct cos_sched_event ev; + cos_ul_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, &ev); + cycles = ev.evt.elapsed_cycs; + + EXPECT_LLU_LT((long long unsigned)cycles, (long long unsigned)(c-p), "Timer => Cycles time"); + + sched_events_clear(); + PRINTC("\t%s: \t\t\tSuccess\n", "One-Shot Timeout"); } struct exec_cluster { - thdcap_t tc; - arcvcap_t rc; - tcap_t tcc; - cycles_t cyc; - asndcap_t sc; /* send-cap to send to rc */ - tcap_prio_t prio; - int xseq; /* expected activation sequence number for this thread */ 
+ thdcap_t tc; + arcvcap_t rc; + tcap_t tcc; + cycles_t cyc; + asndcap_t sc; /* send-cap to send to rc */ + tcap_prio_t prio; + int xseq; /* expected activation sequence number for this thread */ }; struct budget_test_data { - /* p=parent, c=child, g=grand-child */ - struct exec_cluster p, c, g; + /* p=parent, c=child, g=grand-child */ + struct exec_cluster p, c, g; } bt[NUM_CPU], mbt[NUM_CPU]; -static int + static int exec_cluster_alloc(struct exec_cluster *e, cos_thd_fn_t fn, void *d, arcvcap_t parentc) { - e->tcc = cos_tcap_alloc(&booter_info); - if (EXPECT_LL_LT(1, e->tcc, "Cluster Allocation: TCAP ALLOC")) return -1; - e->tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, fn, d); - if (EXPECT_LL_LT(1, e->tc, "Cluster Allocation: THD ALLOC")) return -1; - e->rc = cos_arcv_alloc(&booter_info, e->tc, e->tcc, booter_info.comp_cap, parentc); - if (EXPECT_LL_LT(1, e->rc, "Cluster Allocation: ARCV ALLOC")) return -1; - e->sc = cos_asnd_alloc(&booter_info, e->rc, booter_info.captbl_cap); - if (EXPECT_LL_LT(1, e->sc, "Cluster Allocation: ASND ALLOC")) return -1; - - e->cyc = 0; - - return 0; + e->tcc = cos_tcap_alloc(&booter_info); + if (EXPECT_LL_LT(1, e->tcc, "Cluster Allocation: TCAP ALLOC")) return -1; + e->tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, fn, d, 0, 0); + if (EXPECT_LL_LT(1, e->tc, "Cluster Allocation: THD ALLOC")) return -1; + e->rc = cos_arcv_alloc(&booter_info, e->tc, e->tcc, booter_info.comp_cap, parentc); + if (EXPECT_LL_LT(1, e->rc, "Cluster Allocation: ARCV ALLOC")) return -1; + e->sc = cos_asnd_alloc(&booter_info, e->rc, booter_info.captbl_cap); + if (EXPECT_LL_LT(1, e->sc, "Cluster Allocation: ASND ALLOC")) return -1; + + e->cyc = 0; + + return 0; } static void parent(void *d) { - assert(0); + assert(0); } static void spinner_cyc(void *d) { - cycles_t *p = (cycles_t *)d; + cycles_t *p = (cycles_t *)d; - while (1) rdtscll(*p); + while (1) rdtscll(*p); } #define TIMER_TIME 100 @@ -177,70 +177,70 @@ spinner_cyc(void *d) void 
test_2timers(void) { - int ret; - cycles_t s, e, timer; - - if (EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].p, parent, &bt[cos_cpuid()].p, - BOOT_CAPTBL_SELF_INITRCV_CPU_BASE), "TCAP v. Timer: Cannot Allocate")) { - return; - } - if (EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].c, spinner, &bt[cos_cpuid()].c, - bt[cos_cpuid()].p.rc), "TCAP v. Timer: Cannot Allocate")) { - return; - } - - /* Timer > TCAP */ - - ret = cos_tcap_transfer(bt[cos_cpuid()].c.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, - GRANULARITY * TIMER_TIME, TCAP_PRIO_MAX + 2); - if (EXPECT_LL_NEQ(0, ret, "TCAP v. Timer : TCAP Transfer")) { - return; - } - - rdtscll(s); - timer = tcap_cyc2time(s + GRANULARITY * cyc_per_usec); - if (cos_switch(bt[cos_cpuid()].c.tc, bt[cos_cpuid()].c.tcc, TCAP_PRIO_MAX + 2, - timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync())) { - EXPECT_LL_NEQ(0, 1, "TCAP v. Timer: COS Switch"); - return; - } - rdtscll(e); - - if (EXPECT_LLU_LT((long long unsigned)(e-s), (unsigned)(GRANULARITY * cyc_per_usec), - "TCAP v. Timer: Timer > TCAP") || - EXPECT_LLU_LT((unsigned)(GRANULARITY * TIMER_TIME), (long long unsigned)(e-s), - "TCAP v. Timer: Interreupt Under")) { - return; - } - - sched_events_clear(); - - /* Timer < TCAP */ - - ret = cos_tcap_transfer(bt[cos_cpuid()].c.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, - GRANULARITY * cyc_per_usec, TCAP_PRIO_MAX + 2); - if (EXPECT_LL_NEQ(0, ret, "TCAP v. Timer: TCAP Transfer")) { - return; - } - - rdtscll(s); - timer = tcap_cyc2time(s + GRANULARITY * TIMER_TIME); - if (EXPECT_LL_NEQ(0, cos_switch(bt[cos_cpuid()].c.tc, bt[cos_cpuid()].c.tcc, TCAP_PRIO_MAX + 2, timer, - BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync()), "TCAP v. TImer: COS Switch")) { - return; - } - - rdtscll(e); - - if (EXPECT_LLU_LT((long long unsigned)(e-s), (unsigned)(GRANULARITY * cyc_per_usec), - "TCAP v. Timer: Timer < TCAP") || - EXPECT_LLU_LT((unsigned)(GRANULARITY * TIMER_TIME), (long long unsigned)(e-s), - "TCAP v. 
Timer: Interreupt Under")) { - return; - } - - sched_events_clear(); - PRINTC("\t%s: \t\tSuccess\n", "Timer => Timeout v. Budget"); + int ret; + cycles_t s, e, timer; + + if (EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].p, parent, &bt[cos_cpuid()].p, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE), "TCAP v. Timer: Cannot Allocate")) { + return; + } + if (EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].c, spinner, &bt[cos_cpuid()].c, + bt[cos_cpuid()].p.rc), "TCAP v. Timer: Cannot Allocate")) { + return; + } + + /* Timer > TCAP */ + + ret = cos_tcap_transfer(bt[cos_cpuid()].c.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, + GRANULARITY * TIMER_TIME, TCAP_PRIO_MAX + 2); + if (EXPECT_LL_NEQ(0, ret, "TCAP v. Timer : TCAP Transfer")) { + return; + } + + rdtscll(s); + timer = tcap_cyc2time(s + GRANULARITY * cyc_per_usec); + if (cos_switch(bt[cos_cpuid()].c.tc, bt[cos_cpuid()].c.tcc, TCAP_PRIO_MAX + 2, + timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync())) { + EXPECT_LL_NEQ(0, 1, "TCAP v. Timer: COS Switch"); + return; + } + rdtscll(e); + + if (EXPECT_LLU_LT((long long unsigned)(e-s), (unsigned)(GRANULARITY * cyc_per_usec), + "TCAP v. Timer: Timer > TCAP") || + EXPECT_LLU_LT((unsigned)(GRANULARITY * TIMER_TIME), (long long unsigned)(e-s), + "TCAP v. Timer: Interreupt Under")) { + return; + } + + sched_events_clear(); + + /* Timer < TCAP */ + + ret = cos_tcap_transfer(bt[cos_cpuid()].c.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, + GRANULARITY * cyc_per_usec, TCAP_PRIO_MAX + 2); + if (EXPECT_LL_NEQ(0, ret, "TCAP v. Timer: TCAP Transfer")) { + return; + } + + rdtscll(s); + timer = tcap_cyc2time(s + GRANULARITY * TIMER_TIME); + if (EXPECT_LL_NEQ(0, cos_switch(bt[cos_cpuid()].c.tc, bt[cos_cpuid()].c.tcc, TCAP_PRIO_MAX + 2, timer, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync()), "TCAP v. TImer: COS Switch")) { + return; + } + + rdtscll(e); + + if (EXPECT_LLU_LT((long long unsigned)(e-s), (unsigned)(GRANULARITY * cyc_per_usec), + "TCAP v. 
Timer: Timer < TCAP") || + EXPECT_LLU_LT((unsigned)(GRANULARITY * TIMER_TIME), (long long unsigned)(e-s), + "TCAP v. Timer: Interreupt Under")) { + return; + } + + sched_events_clear(); + PRINTC("\t%s: \t\tSuccess\n", "Timer => Timeout v. Budget"); } #define BUDGET_TIME 100 @@ -248,64 +248,64 @@ test_2timers(void) static void test_tcap_budgets_single(void) { - int i; - cycles_t s = 0, e = 0; - cycles_t time, mask; - int ret; - - perfdata_init(&result, "Timer => Budget based", test_results, ARRAY_SIZE); - - if (EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].p, parent, &bt[cos_cpuid()].p, - BOOT_CAPTBL_SELF_INITRCV_CPU_BASE), "Single Budget: Cannot Allocate") || - EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].c, spinner, &bt[cos_cpuid()].c, - bt[cos_cpuid()].p.rc), "Single Budget: Cannot Allocate")) { - return; - } - for (i = 1; i <= TEST_ITER; i++) { - - ret = cos_tcap_transfer(bt[cos_cpuid()].c.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, - GRANULARITY * BUDGET_TIME, TCAP_PRIO_MAX + 2); - if (EXPECT_LL_NEQ(0, ret, "Single Budget: TCAP Transfer")) { - return; - } - - rdtscll(s); - if (cos_switch(bt[cos_cpuid()].c.tc, bt[cos_cpuid()].c.tcc, TCAP_PRIO_MAX + 2, TCAP_TIME_NIL, - BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync())){ - EXPECT_LL_NEQ(0, 1, "Single Budget: COS Switch"); - return; - } - rdtscll(e); - - if (i > 1) { - /* Performant absolute value function instead of branching */ - time = (e - s - (GRANULARITY * BUDGET_TIME)); - mask = (time >> (sizeof(cycles_t) * CHAR_BIT - 1)); - time = (time + mask) ^ mask; - - perfdata_add(&result, time); - - if (EXPECT_LLU_LT((long long unsigned)(e-s), (unsigned)(GRANULARITY * BUDGET_TIME * MAX_THDS), - "Single Budget: MAX Bound") || - EXPECT_LLU_LT((unsigned)(GRANULARITY * BUDGET_TIME * MIN_THDS), (long long unsigned)(e-s), - "Single Budget: MIN Bound")) { - return; - } - } - sched_events_clear(); - } - - perfdata_calc(&result); - result_budgets_single.avg = perfdata_avg(&result); - result_budgets_single.max = 
perfdata_avg(&result); - result_budgets_single.min = perfdata_avg(&result); - result_budgets_single.sz = perfdata_avg(&result); - result_budgets_single.sd = perfdata_avg(&result); - result_budgets_single.p90tile = perfdata_avg(&result); - result_budgets_single.p95tile = perfdata_avg(&result); - result_budgets_single.p99tile = perfdata_avg(&result); - - PRINTC("\t%s: \t\t\tSuccess\n", "Timer => Budget based"); + int i; + cycles_t s = 0, e = 0; + cycles_t time, mask; + int ret; + + perfdata_init(&result, "Timer => Budget based", test_results, ARRAY_SIZE); + + if (EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].p, parent, &bt[cos_cpuid()].p, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE), "Single Budget: Cannot Allocate") || + EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].c, spinner, &bt[cos_cpuid()].c, + bt[cos_cpuid()].p.rc), "Single Budget: Cannot Allocate")) { + return; + } + for (i = 1; i <= TEST_ITER; i++) { + + ret = cos_tcap_transfer(bt[cos_cpuid()].c.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, + GRANULARITY * BUDGET_TIME, TCAP_PRIO_MAX + 2); + if (EXPECT_LL_NEQ(0, ret, "Single Budget: TCAP Transfer")) { + return; + } + + rdtscll(s); + if (cos_switch(bt[cos_cpuid()].c.tc, bt[cos_cpuid()].c.tcc, TCAP_PRIO_MAX + 2, TCAP_TIME_NIL, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync())){ + EXPECT_LL_NEQ(0, 1, "Single Budget: COS Switch"); + return; + } + rdtscll(e); + + if (i > 1) { + /* Performant absolute value function instead of branching */ + time = (e - s - (GRANULARITY * BUDGET_TIME)); + mask = (time >> (sizeof(cycles_t) * CHAR_BIT - 1)); + time = (time + mask) ^ mask; + + perfdata_add(&result, time); + + if (EXPECT_LLU_LT((long long unsigned)(e-s), (unsigned)(GRANULARITY * BUDGET_TIME * MAX_THDS), + "Single Budget: MAX Bound") || + EXPECT_LLU_LT((unsigned)(GRANULARITY * BUDGET_TIME * MIN_THDS), (long long unsigned)(e-s), + "Single Budget: MIN Bound")) { + return; + } + } + sched_events_clear(); + } + + perfdata_calc(&result); + result_budgets_single.avg = 
perfdata_avg(&result); + result_budgets_single.max = perfdata_avg(&result); + result_budgets_single.min = perfdata_avg(&result); + result_budgets_single.sz = perfdata_avg(&result); + result_budgets_single.sd = perfdata_avg(&result); + result_budgets_single.p90tile = perfdata_avg(&result); + result_budgets_single.p95tile = perfdata_avg(&result); + result_budgets_single.p99tile = perfdata_avg(&result); + + PRINTC("\t%s: \t\t\tSuccess\n", "Timer => Budget based"); } #define RATE_1 1600 @@ -314,80 +314,78 @@ test_tcap_budgets_single(void) static void test_tcap_budgets_multi(void) { - int i; - - if(EXPECT_LL_NEQ(0, exec_cluster_alloc(&mbt[cos_cpuid()].p, spinner_cyc, &(mbt[cos_cpuid()].p.cyc), - BOOT_CAPTBL_SELF_INITRCV_CPU_BASE), "Multi Budget: Cannot Allocate") || - EXPECT_LL_NEQ(0, exec_cluster_alloc(&mbt[cos_cpuid()].c, spinner_cyc, &(mbt[cos_cpuid()].c.cyc), - mbt[cos_cpuid()].p.rc), "Multi Budget: Cannot Allocate") || - EXPECT_LL_NEQ(0, exec_cluster_alloc(&mbt[cos_cpuid()].g, spinner_cyc, &(mbt[cos_cpuid()].g.cyc), - mbt[cos_cpuid()].c.rc), "Multi Budget: Cannot allocate")) { - return; - } - - for (i = 1; i <= TEST_ITER; i++) { - tcap_res_t res; - thdid_t tid; - int blocked; - cycles_t cycles, s, e; - tcap_time_t thd_timeout; - - /* test both increasing budgets and constant budgets */ - if (i > (TEST_ITER/2)) - res = GRANULARITY * RATE_1; - else - res = i * GRANULARITY * RATE_2; - - if (EXPECT_LL_NEQ(0, cos_tcap_transfer(mbt[cos_cpuid()].p.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, - res, TCAP_PRIO_MAX + 2), "Multi Budget: TCAP Transfer") || - EXPECT_LL_NEQ(0, cos_tcap_transfer(mbt[cos_cpuid()].c.rc, mbt[cos_cpuid()].p.tcc, res / 2, - TCAP_PRIO_MAX + 2), "Multi Budget: TCAP Transfer") || - EXPECT_LL_NEQ(0, cos_tcap_transfer(mbt[cos_cpuid()].g.rc, mbt[cos_cpuid()].c.tcc, res / 4, - TCAP_PRIO_MAX + 2), "Multi Budget: TCAP Transfer")) { - return; - } - - mbt[cos_cpuid()].p.cyc = mbt[cos_cpuid()].c.cyc = mbt[cos_cpuid()].g.cyc = 0; - rdtscll(s); - if 
(cos_switch(mbt[cos_cpuid()].g.tc, mbt[cos_cpuid()].g.tcc, TCAP_PRIO_MAX + 2, TCAP_TIME_NIL, - BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync())) { - EXPECT_LL_NEQ(0, 1, "Multi Budget: COS Switch"); - return; - } - rdtscll(e); - - cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, NULL, &tid, &blocked, &cycles, &thd_timeout); - - if ( i > 1) { - - /* To measure time of execution, we need a min time - * as well as a max time to determine - * if the interrupt happened when it was supposed to - * thus MAX bound and MIN bound - * MAX_THDS and MIN_THDS are #defined to give it some flexibility - * from the user - */ - - if (EXPECT_LLU_LT((mbt[cos_cpuid()].g.cyc - s), (res / 4 * MAX_THDS), "Multi Budget: G") || - EXPECT_LLU_LT(mbt[cos_cpuid()].g.cyc - s, res / 4 * MAX_THDS, "Multi Budget: G MAX Bound") || - EXPECT_LLU_LT(res / 4 * MIN_THDS, mbt[cos_cpuid()].g.cyc - s, "Multi Budget: G MIN Bound") || - EXPECT_LLU_LT(mbt[cos_cpuid()].c.cyc - s, res / 2 * MAX_THDS, "Multi Budget: C MAX Bound") || - EXPECT_LLU_LT(res / 2 * MIN_THDS, mbt[cos_cpuid()].c.cyc - s, "Multi Budget: C MIN Bound") || - EXPECT_LLU_LT(mbt[cos_cpuid()].p.cyc - s, res * MAX_THDS, "Multi Budget: P MAX Bound") || - EXPECT_LLU_LT(res * MIN_THDS, mbt[cos_cpuid()].p.cyc - s, "Multi Budget: P MIN BOund")) { - return; - } - } - } - PRINTC("\t%s: \t\tSuccess\n", "Timer => Hierarchical Budget"); + int i; + + if(EXPECT_LL_NEQ(0, exec_cluster_alloc(&mbt[cos_cpuid()].p, spinner_cyc, &(mbt[cos_cpuid()].p.cyc), + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE), "Multi Budget: Cannot Allocate") || + EXPECT_LL_NEQ(0, exec_cluster_alloc(&mbt[cos_cpuid()].c, spinner_cyc, &(mbt[cos_cpuid()].c.cyc), + mbt[cos_cpuid()].p.rc), "Multi Budget: Cannot Allocate") || + EXPECT_LL_NEQ(0, exec_cluster_alloc(&mbt[cos_cpuid()].g, spinner_cyc, &(mbt[cos_cpuid()].g.cyc), + mbt[cos_cpuid()].c.rc), "Multi Budget: Cannot allocate")) { + return; + } + + for (i = 1; i <= TEST_ITER; i++) { + tcap_res_t res; + cycles_t s, e; + tcap_time_t thd_timeout; 
+ + /* test both increasing budgets and constant budgets */ + if (i > (TEST_ITER/2)) + res = GRANULARITY * RATE_1; + else + res = i * GRANULARITY * RATE_2; + + if (EXPECT_LL_NEQ(0, cos_tcap_transfer(mbt[cos_cpuid()].p.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, + res, TCAP_PRIO_MAX + 2), "Multi Budget: TCAP Transfer") || + EXPECT_LL_NEQ(0, cos_tcap_transfer(mbt[cos_cpuid()].c.rc, mbt[cos_cpuid()].p.tcc, res / 2, + TCAP_PRIO_MAX + 2), "Multi Budget: TCAP Transfer") || + EXPECT_LL_NEQ(0, cos_tcap_transfer(mbt[cos_cpuid()].g.rc, mbt[cos_cpuid()].c.tcc, res / 4, + TCAP_PRIO_MAX + 2), "Multi Budget: TCAP Transfer")) { + return; + } + + mbt[cos_cpuid()].p.cyc = mbt[cos_cpuid()].c.cyc = mbt[cos_cpuid()].g.cyc = 0; + rdtscll(s); + if (cos_switch(mbt[cos_cpuid()].g.tc, mbt[cos_cpuid()].g.tcc, TCAP_PRIO_MAX + 2, TCAP_TIME_NIL, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync())) { + EXPECT_LL_NEQ(0, 1, "Multi Budget: COS Switch"); + return; + } + rdtscll(e); + + sched_events_clear(); + + if ( i > 1) { + + /* To measure time of execution, we need a min time + * as well as a max time to determine + * if the interrupt happened when it was supposed to + * thus MAX bound and MIN bound + * MAX_THDS and MIN_THDS are #defined to give it some flexibility + * from the user + */ + + if (EXPECT_LLU_LT((mbt[cos_cpuid()].g.cyc - s), (res / 4 * MAX_THDS), "Multi Budget: G") || + EXPECT_LLU_LT(mbt[cos_cpuid()].g.cyc - s, res / 4 * MAX_THDS, "Multi Budget: G MAX Bound") || + EXPECT_LLU_LT(res / 4 * MIN_THDS, mbt[cos_cpuid()].g.cyc - s, "Multi Budget: G MIN Bound") || + EXPECT_LLU_LT(mbt[cos_cpuid()].c.cyc - s, res / 2 * MAX_THDS, "Multi Budget: C MAX Bound") || + EXPECT_LLU_LT(res / 2 * MIN_THDS, mbt[cos_cpuid()].c.cyc - s, "Multi Budget: C MIN Bound") || + EXPECT_LLU_LT(mbt[cos_cpuid()].p.cyc - s, res * MAX_THDS, "Multi Budget: P MAX Bound") || + EXPECT_LLU_LT(res * MIN_THDS, mbt[cos_cpuid()].p.cyc - s, "Multi Budget: P MIN BOund")) { + return; + } + } + } + PRINTC("\t%s: \t\tSuccess\n", 
"Timer => Hierarchical Budget"); } void test_tcap_budgets(void) { - /* single-level budgets test */ - test_tcap_budgets_single(); + /* single-level budgets test */ + test_tcap_budgets_single(); - /* multi-level budgets test */ - test_tcap_budgets_multi(); + /* multi-level budgets test */ + test_tcap_budgets_multi(); } diff --git a/src/components/implementation/tests/kernel_tests/k_test_thd.c b/src/components/implementation/tests/kernel_tests/k_test_thd.c index 90483b39fb..a4cffad5c7 100644 --- a/src/components/implementation/tests/kernel_tests/k_test_thd.c +++ b/src/components/implementation/tests/kernel_tests/k_test_thd.c @@ -11,50 +11,50 @@ static int failure = 0; static void test_thd_arg(void *d) { - int ret = 0; + int ret = 0; - if (EXPECT_LL_NEQ((int)d, THD_ARG, "Thread Creation: Argument Incorrect")) failure = 1; - while (1) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - PRINTC("Error, shouldn't get here!\n"); + if (EXPECT_LL_NEQ((int)d, THD_ARG, "Thread Creation: Argument Incorrect")) failure = 1; + while (1) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + PRINTC("Error, shouldn't get here!\n"); } static void test_thds_create_switch(void) { - thdcap_t ts; - intptr_t i = THD_ARG; - int ret; - - ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_thd_arg, (void *)i); - if (EXPECT_LL_LT(1, ts, "Thread Creation: Cannot Allocate")) { - return; - } - ret = cos_thd_switch(ts); - EXPECT_LL_NEQ(0, ret, "COS Switch Error"); - - CHECK_STATUS_FLAG(); - PRINTC("\t%s: \t\t\tSuccess\n", "THD => Creation & ARG"); - EXIT_FN(); + thdcap_t ts; + intptr_t i = THD_ARG; + int ret; + + ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_thd_arg, (void *)i, 0, 0); + if (EXPECT_LL_LT(1, ts, "Thread Creation: Cannot Allocate")) { + return; + } + ret = cos_thd_switch(ts); + EXPECT_LL_NEQ(0, ret, "COS Switch Error"); + + CHECK_STATUS_FLAG(); + PRINTC("\t%s: \t\t\tSuccess\n", "THD => Creation & ARG"); + EXIT_FN(); } static void thd_fn_mthds_ring(void *d) { - 
int ret; + int ret; - if (count != (int) d) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + if (count != (int) d) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - int next = (++count) % TEST_NTHDS; - if (!next) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + int next = (++count) % TEST_NTHDS; + if (!next) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - ret = cos_thd_switch(thd_test[next]); - if (EXPECT_LL_NEQ(0, ret, "Thread Ring: COS Switch Error")) failure = 1; + ret = cos_thd_switch(thd_test[next]); + if (EXPECT_LL_NEQ(0, ret, "Thread Ring: COS Switch Error")) failure = 1; - while (1) { - cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - } - EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); - assert(0); + while (1) { + cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + } + EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); + assert(0); } /* Ring Multithreaded Test @@ -66,39 +66,39 @@ thd_fn_mthds_ring(void *d) static void test_mthds_ring(void) { - int i, ret; + int i, ret; - count = 0; + count = 0; - for (i = 0; i < TEST_NTHDS; i++) { - thd_test[i] = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn_mthds_ring, (void *)i); - if (EXPECT_LL_LT(1, thd_test[i], "Thread Ring: Cannot Allocate")) { - return; - } - } + for (i = 0; i < TEST_NTHDS; i++) { + thd_test[i] = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn_mthds_ring, (void *)i, 0, 0); + if (EXPECT_LL_LT(1, thd_test[i], "Thread Ring: Cannot Allocate")) { + return; + } + } - ret = cos_thd_switch(thd_test[0]); - EXPECT_LL_NEQ(0, ret, "Thread Ring: COS Switch Error"); + ret = cos_thd_switch(thd_test[0]); + EXPECT_LL_NEQ(0, ret, "Thread Ring: COS Switch Error"); - if (EXPECT_LL_NEQ(count, TEST_NTHDS, "Thread Ring: Failure # of THDS")) { - return; - } + if (EXPECT_LL_NEQ(count, TEST_NTHDS, "Thread Ring: Failure # of THDS")) { + return; + } - CHECK_STATUS_FLAG(); - PRINTC("\t%s: \t\t\tSuccess\n", "THD => Switch Cyclic" ); - EXIT_FN(); + CHECK_STATUS_FLAG(); + 
PRINTC("\t%s: \t\t\tSuccess\n", "THD => Switch Cyclic" ); + EXIT_FN(); } static void thd_fn_mthds_classic(void *d) { - cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - while (1) { - cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - } - EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); - assert(0); + while (1) { + cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + } + EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); + assert(0); } /* Classic Multithreaded Test @@ -109,31 +109,31 @@ thd_fn_mthds_classic(void *d) static void test_mthds_classic(void) { - thdcap_t ts; - int i, ret; - - ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn_mthds_classic, NULL); - if (EXPECT_LL_LT(1, ts, "Thread Classic: Cannot Allocate")) { - return; - } - - for (i = 0; i < ITER; i++) { - ret = cos_thd_switch(ts); - if(EXPECT_LL_NEQ(0, ret, "Thread Classic: COS Switch Error")) return; - } - CHECK_STATUS_FLAG(); - PRINTC("\t%s: \t\tSuccess\n", "THD => Switch in pairs"); - EXIT_FN(); + thdcap_t ts; + int i, ret; + + ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn_mthds_classic, NULL, 0, 0); + if (EXPECT_LL_LT(1, ts, "Thread Classic: Cannot Allocate")) { + return; + } + + for (i = 0; i < ITER; i++) { + ret = cos_thd_switch(ts); + if(EXPECT_LL_NEQ(0, ret, "Thread Classic: COS Switch Error")) return; + } + CHECK_STATUS_FLAG(); + PRINTC("\t%s: \t\tSuccess\n", "THD => Switch in pairs"); + EXIT_FN(); } static void thd_tls(void *d) { - if (EXPECT_LLU_NEQ((long unsigned)tls_get(0), (long unsigned)tls_test[cos_cpuid()][(int)d], - "Thread TLS: ARG not correct")) failure = 1; - while (1) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); - assert(0); + if (EXPECT_LLU_NEQ((long unsigned)tls_get(0), (long unsigned)tls_test[cos_cpuid()][(int)d], + "Thread TLS: ARG not correct")) failure = 1; + while (1) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + 
EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); + assert(0); } /* Test the TLS support @@ -142,32 +142,32 @@ thd_tls(void *d) static void test_thds_tls(void) { - thdcap_t ts[TEST_NTHDS]; - intptr_t i; - int ret; - - for (i = 0; i < TEST_NTHDS; i++) { - ts[i] = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_tls, (void *)i); - if (EXPECT_LL_LT(1, ts[i], "Thread TLS: Cannot Allocate")) { - return; - } - tls_test[cos_cpuid()][i] = i; - cos_thd_mod(&booter_info, ts[i], &tls_test[cos_cpuid()][i]); - ret = cos_thd_switch(ts[i]); - if (EXPECT_LL_NEQ(0, ret, "Thread TLS: COS Switch Error")) return; - } - - CHECK_STATUS_FLAG(); - PRINTC("\t%s: \t\t\tSuccess\n", "THD => Creation & TLS"); - EXIT_FN(); + thdcap_t ts[TEST_NTHDS]; + intptr_t i; + int ret; + + for (i = 0; i < TEST_NTHDS; i++) { + ts[i] = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_tls, (void *)i, 0, 0); + if (EXPECT_LL_LT(1, ts[i], "Thread TLS: Cannot Allocate")) { + return; + } + tls_test[cos_cpuid()][i] = i; + cos_thd_mod(&booter_info, ts[i], &tls_test[cos_cpuid()][i]); + ret = cos_thd_switch(ts[i]); + if (EXPECT_LL_NEQ(0, ret, "Thread TLS: COS Switch Error")) return; + } + + CHECK_STATUS_FLAG(); + PRINTC("\t%s: \t\t\tSuccess\n", "THD => Creation & TLS"); + EXIT_FN(); } void test_thds(void) { - test_thds_create_switch(); - test_thds_tls(); - test_mthds_classic(); - test_mthds_ring(); + test_thds_create_switch(); + test_thds_tls(); + test_mthds_classic(); + test_mthds_ring(); } diff --git a/src/components/implementation/tests/kernel_tests/kernel_test_booter.c b/src/components/implementation/tests/kernel_tests/kernel_test_booter.c index 4998ee861b..50ebeb8d59 100644 --- a/src/components/implementation/tests/kernel_tests/kernel_test_booter.c +++ b/src/components/implementation/tests/kernel_tests/kernel_test_booter.c @@ -14,7 +14,7 @@ int count = 0; void term_fn(void *d) { - SPIN(); + SPIN(); } static int test_done[NUM_CPU]; @@ -22,52 +22,52 @@ static int test_done[NUM_CPU]; void cos_init(void) 
{ - int cycs, i; - static int first_init = 1, init_done = 0; - - cycs = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - printc("\t%d cycles per microsecond\n", cycs); - - if (first_init) { - first_init = 0; - cos_meminfo_init(&booter_info.mi, BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_compinfo_init(&booter_info, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, - (vaddr_t)cos_get_heap_ptr(), BOOT_CAPTBL_FREE, &booter_info); - init_done = 1; - } - - while (!init_done); - - termthd[cos_cpuid()] = cos_thd_alloc(&booter_info, booter_info.comp_cap, term_fn, NULL); - assert(termthd[cos_cpuid()]); - PRINTC("Kernel Tests\n"); - printc("\nUnit Test Started:\n\n"); - - /* Kernel Tests */ - cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - test_timer(); - test_tcap_budgets(); - test_2timers(); - test_thds(); - test_mem_alloc(); - test_async_endpoints(); - test_inv(); - test_captbl_expands(); - - printc("\nuBenchamarks Started:\n\n"); - - test_run_perf_kernel(); - - /* NOTE: This is just to make sense of the output on HW! To understand that microbooter runs to completion on all cores! 
*/ - test_done[cos_cpuid()] = 1; - for (i = 0; i < NUM_CPU; i++) { - while (!test_done[i]) ; - } - - printc("\n"); - PRINTC("Kernel Tests done.\n"); - - cos_thd_switch(termthd[cos_cpuid()]); - - return; + int cycs, i; + static int first_init = 1, init_done = 0; + + cycs = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + printc("\t%d cycles per microsecond\n", cycs); + + if (first_init) { + first_init = 0; + cos_meminfo_init(&booter_info.mi, BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_compinfo_init(&booter_info, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, + (vaddr_t)cos_get_heap_ptr(), LLBOOT_CAPTBL_FREE, &booter_info); + init_done = 1; + } + + while (!init_done); + + termthd[cos_cpuid()] = cos_thd_alloc(&booter_info, booter_info.comp_cap, term_fn, NULL, 0, 0); + assert(termthd[cos_cpuid()]); + PRINTC("Kernel Tests\n"); + printc("\nUnit Test Started:\n\n"); + + /* Kernel Tests */ + cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + test_timer(); + test_tcap_budgets(); + test_2timers(); + test_thds(); + test_mem_alloc(); + test_async_endpoints(); + test_inv(); + test_captbl_expands(); + + printc("\nuBenchamarks Started:\n\n"); + + test_run_perf_kernel(); + + /* NOTE: This is just to make sense of the output on HW! To understand that microbooter runs to completion on all cores! 
*/ + test_done[cos_cpuid()] = 1; + for (i = 0; i < NUM_CPU; i++) { + while (!test_done[i]) ; + } + + printc("\n"); + PRINTC("Kernel Tests done.\n"); + + cos_thd_switch(termthd[cos_cpuid()]); + + return; } diff --git a/src/components/implementation/tests/kernel_tests/kernel_tests.h b/src/components/implementation/tests/kernel_tests/kernel_tests.h index 82741bef12..4668e89297 100644 --- a/src/components/implementation/tests/kernel_tests/kernel_tests.h +++ b/src/components/implementation/tests/kernel_tests/kernel_tests.h @@ -10,22 +10,22 @@ #undef assert /* On assert, immediately switch to the "exit" thread */ #define assert(node) \ - do { \ - if (unlikely(!(node))) { \ - debug_print("assert error in @ "); \ - cos_thd_switch(termthd[cos_cpuid()]); \ - } \ - } while (0) + do { \ + if (unlikely(!(node))) { \ + debug_print("assert error in @ "); \ + cos_thd_switch(termthd[cos_cpuid()]); \ + } \ + } while (0) #define EXIT_FN() \ - exit_fn: return; +exit_fn: return; #define CHECK_STATUS_FLAG() \ - do { \ - if (failure) { \ - goto exit_fn; \ - } \ - } while (0) + do { \ + if (failure) { \ + goto exit_fn; \ + } \ + } while (0) #include #include @@ -56,30 +56,30 @@ extern unsigned long thd_test[TEST_NTHDS]; extern int num, den, count; struct results { - long long unsigned avg; - long long unsigned max; - long long unsigned min; - long long unsigned sd; - int sz; - long long unsigned p90tile; - long long unsigned p95tile; - long long unsigned p99tile; + long long unsigned avg; + long long unsigned max; + long long unsigned min; + long long unsigned sd; + int sz; + long long unsigned p90tile; + long long unsigned p95tile; + long long unsigned p99tile; }; -static unsigned long + static unsigned long tls_get(size_t off) { - unsigned long val; + unsigned long val; - __asm__ __volatile__("movl %%gs:(%1), %0" : "=r"(val) : "r"(off) :); + __asm__ __volatile__("movl %%gs:(%1), %0" : "=r"(val) : "r"(off) :); - return val; + return val; } -static void + static void tls_set(size_t off, 
unsigned long val) { - __asm__ __volatile__("movl %0, %%gs:(%1)" : : "r"(val), "r"(off) : "memory"); + __asm__ __volatile__("movl %0, %%gs:(%1)" : : "r"(val), "r"(off) : "memory"); } extern void test_run_perf_kernel(void); diff --git a/src/components/implementation/tests/micro_chan/Makefile b/src/components/implementation/tests/micro_chan/Makefile new file mode 100644 index 0000000000..9ecb1154a8 --- /dev/null +++ b/src/components/implementation/tests/micro_chan/Makefile @@ -0,0 +1,8 @@ +COMPONENT=micro_chan.o +INTERFACES= +DEPENDENCIES= +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_fprr -lsl_thd_static_backend -lsl_blkpt + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/micro_chan/unit_schedlib.c b/src/components/implementation/tests/micro_chan/unit_schedlib.c new file mode 100644 index 0000000000..bfc8c2340d --- /dev/null +++ b/src/components/implementation/tests/micro_chan/unit_schedlib.c @@ -0,0 +1,497 @@ +/* + * Copyright 2016, Phani Gadepalli and Gabriel Parmer, GWU, gparmer@gwu.edu. + * + * This uses a two clause BSD License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Iterations, channels */ +#define CHAN_ITER 1000000 +#define NCHANTHDS 2 +#define CHAN_BATCH 3 + +unsigned long long iters[CHAN_ITER] = { 0 }; + +CRT_CHAN_STATIC_ALLOC(c0, int, 4); +CRT_CHAN_TYPE_PROTOTYPES(test, int, 4); +struct crt_lock lock; + +unsigned int one_only = 0; + +typedef enum { CHILLING = 0, RECVING, SENDING } actions_t; +unsigned long status[NCHANTHDS]; +unsigned long cnts[NCHANTHDS] = {0, }; + +/* sl also defines a SPIN macro */ +#undef SPIN +#define SPIN(iters) \ + do { \ + if (iters > 0) { \ + for (; iters > 0; iters--) \ + ; \ + } else { \ + while (1) \ + ; \ + } \ + } while (0) + + +#define N_TESTTHDS 2 +#define WORKITERS 100 + +#define N_TESTTHDS_PERF 2 +#define PERF_ITERS 1000 + +static volatile cycles_t mid_cycs = 0; +static volatile int testing = 1; + +void +test_thd_perffn(void *data) +{ + cycles_t start_cycs = 0, end_cycs = 0, wc_cycs = 0, total_cycs = 0; + unsigned int i = 0; + + rdtscll(start_cycs); + sl_thd_yield(0); + rdtscll(end_cycs); + assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + for (i = 0; i < PERF_ITERS; i++) { + cycles_t diff1_cycs = 0, diff2_cycs = 0; + + mid_cycs = 0; + rdtscll(start_cycs); + sl_thd_yield(0); + rdtscll(end_cycs); + assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + diff1_cycs = mid_cycs - start_cycs; + diff2_cycs = end_cycs - mid_cycs; + + if (diff1_cycs > wc_cycs) wc_cycs = diff1_cycs; + if (diff2_cycs > wc_cycs) wc_cycs = diff2_cycs; + total_cycs += (diff1_cycs + diff2_cycs); + } + + PRINTC("SWITCH UBENCH: avg: %llu, wc: %llu, iters:%u\n", (total_cycs / (2 * PERF_ITERS)), wc_cycs, PERF_ITERS); + testing = 0; + /* done testing! let the spinfn cleanup! 
*/ + sl_thd_yield(0); + + sl_thd_exit(); +} + +void +test_thd_spinfn(void *data) +{ + while (likely(testing)) { + rdtscll(mid_cycs); + sl_thd_yield(0); + } + + sl_thd_exit(); +} +/* Get the numbers */ +volatile unsigned long long start_time; +volatile unsigned long long end_time; +//void +//test_thd_fn(void *data) +//{ +// cycles_t time; +// cycles_t iters; +// int rounds = 0; +// if (data!=0) { +// while (1) { +// rounds++; +// rdtscll(start_time); +// sl_thd_yield(3); +// rdtscll(end_time); +// print_uint((unsigned long)(end_time-start_time)); +// print_string("\r\n"); +// if(rounds == 10000) +// while(1); +// } +// } +// else { +// while (1) { +// sl_thd_yield(4); +// } +// } +//} + +#define RCV 0 +#define SND 1 + +void +test_thd_fn(void *data) +{ + cycles_t time; +// cycles_t iters; + cycles_t total = 0, max = 0, diff; + int send; + int recv; + int rounds = 0; + if (data==RCV) { + while (1) { + rounds ++; + crt_chan_recv_test(c0, &recv); + rdtscll(end_time); + assert(ps_faa(&one_only, -1) == 1); + + diff = end_time - start_time; + if (diff > max) max = diff; + total += diff; + iters[rounds - 1] = diff; + //printc("%llu, ", diff); + + if (rounds == CHAN_ITER) { + int i; + + for (i = 0; i < CHAN_ITER; i++) { + printc("%llu\n", iters[i]); + } + printc("\nAvg: %llu, Wc:%llu\n", total / CHAN_ITER, max); + + while (1) ; + } + //print_uint((unsigned long)(end_time-start_time)); + //print_string("\r\n"); + //if(rounds == 10000) + // while(1); + } + } + else { + send = 0x1234; + while (1) { + assert(ps_faa(&one_only, 1) == 0); + rdtscll(start_time); + crt_chan_send_test(c0, &send); + } + } +} + +//void +//test_thd_fn(void *data) +//{ +// cycles_t time; +// cycles_t iters; +// int send; +// int recv; +// int rounds = 0; +// +// if (data!=0) { +// while (1) { +// rounds ++; +// +// crt_lock_take(&lock); +// sl_thd_yield(0); +// rdtscll(end_time); +// crt_lock_release(&lock); +// sl_thd_yield(0); +// +// print_uint((unsigned long)(end_time-start_time)); +// 
print_string("\r\n"); +// if(rounds == 10000) +// while(1); +// } +// } +// else { +// crt_lock_init(&lock); +// while (1) { +// rdtscll(start_time); +// crt_lock_take(&lock); +// crt_lock_release(&lock); +// sl_thd_yield(0); +// } +// } +//} +// +//volatile unsigned long long int_tsc; +//void +//test_thd_fn(capid_t cap, void *data) +//{ +// cycles_t time; +// cycles_t iters; +// int send; +// int recv; +// unsigned int result; +// int rounds = 0; +// if (data==0) { +// while (1) { +// //print_string("*"); +// } +// } +// else { +// /* Higher priority on this branch */ +// cos_hw_attach(BOOT_CAPTBL_SELF_INITHW_BASE, 63, sl_thd_rcvcap(sl_thd_lkup(sl_thdid()))); +// cos_hw_custom(BOOT_CAPTBL_SELF_INITHW_BASE); +// while (1) { +// /* We are doing this receive anyway */ +// cos_rcv(sl_thd_rcvcap(sl_thd_lkup(sl_thdid())), 0); +// rdtscll(end_time); +// addr[rounds] = (unsigned int)(end_time-int_tsc); +// rounds ++; +// if(rounds == 10000) +// { +// for (rounds = 0; rounds < 10000; rounds ++) +// { +// print_uint(addr[rounds]); +// print_string("\r\n"); +// } +// while(1); +// } +// } +// } +//} + +// int rounds = 0; +//void +//test_thd_fn(capid_t cap, void *data) +//{ +// cycles_t time; +// cycles_t iters; +// int send; +// int recv; +// unsigned int result; +// /* if (data == 0) { +// while (1) { +// print_string("*"); +// } +// } +// else */if (data == 0) +// { +// /* Higher priority on this branch - receiving stuff from interrupt */ +// cos_hw_attach(BOOT_CAPTBL_SELF_INITHW_BASE, 63, sl_thd_rcvcap(sl_thd_lkup(sl_thdid()))); +// cos_hw_custom(BOOT_CAPTBL_SELF_INITHW_BASE); +// while (1) { +//// print_string(" :1a: \r\n"); +// /* We are doing this receive anyway */ +//// sl_thd_rcv(RCV_ULONLY); +// cos_rcv(sl_thd_rcvcap(sl_thd_lkup(sl_thdid())), 0); +//// print_string(" :1b: "); +// /* Send to the guy immediately */ +// crt_chan_send_test(c0, &send); +// //sl_thd_wakeup(4); +//// print_string(" :1c: "); +// //rdtscll(end_time); +// //addr[rounds] = (unsigned 
int)(end_time-int_tsc); +// } +// } +// else { +// while(1) { +// /* Finally, we send what we receive here */ +//// print_string(" :2a: "); +// //sl_thd_block(0); +// crt_chan_recv_test(c0, &recv); +//// print_string(" :2b: "); +// rdtscll(end_time); +// //print_uint(addr[rounds]); +// //print_string(" - "); +// addr[rounds] = (unsigned int)(end_time-int_tsc); +// //print_uint(addr[rounds]); +// //print_string("\r\n"); +// rounds ++; +// if(rounds == 10000) +// { +// for (rounds = 0; rounds < 10000; rounds ++) +// { +// print_uint(addr[rounds]); +// print_string("\r\n"); +// } +// while(1); +// } +// } +// } +//} + +//void +//test_yield_perf(void) +//{ +// int i; +// struct sl_thd *threads[N_TESTTHDS_PERF]; +// union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 31}}; +// +// for (i = 0; i < N_TESTTHDS_PERF; i++) { +// if (i == 1) threads[i] = sl_thd_alloc(test_thd_perffn, (void *)&threads[0]); +// else threads[i] = sl_thd_alloc(test_thd_spinfn, NULL); +// assert(threads[i]); +// sl_thd_param_set(threads[i], sp.v); +// PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); +// } +//} + +//void +//test_yields(void) +//{ +// int i; +// struct sl_thd * threads[N_TESTTHDS]; +// union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 10}}; +// +// for (i = 0; i < N_TESTTHDS; i++) { +// threads[i] = sl_thd_alloc(test_thd_fn, (void *)i); +// assert(threads[i]); +// sl_thd_param_set(threads[i], sp.v); +// PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); +// } +//} + +void +test_yields(void) +{ + int i; + struct sl_thd * threads[N_TESTTHDS]; + union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 0}}; + + start_time = end_time = 0; + + for (i = 0; i < N_TESTTHDS; i++) { + threads[i] = sl_thd_alloc(test_thd_fn, (void *)i); + assert(threads[i]); + if (i == RCV) sp.c.value = 2; + else sp.c.value = 5; + sl_thd_param_set(threads[i], sp.v); + PRINTC("Thread %u:%lu 
created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); + //sl_thd_yield_thd(threads[i]); + } + assert(N_TESTTHDS == 2); + //crt_chan_p2p_init_test(c0, threads[SND], threads[RCV]); + crt_chan_init_test(c0); +} + +//void +//test_yields(void) +//{ +// int i; +// struct sl_thd * threads[N_TESTTHDS]; +// union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 10}}; +// +// crt_chan_init_test(&c0); +// for (i = 0; i < N_TESTTHDS; i++) { +// threads[i] = sl_thd_aep_alloc(test_thd_fn, (void *)i, 0, 0, 0, 0); +// assert(threads[i]); +// if(i != 0) +// sp.c.value = 9; +// sl_thd_param_set(threads[i], sp.v); +// PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); +// } +//} + +void +test_high(void *data) +{ + struct sl_thd *t = data; + + while (1) { + sl_thd_yield(sl_thd_thdid(t)); + printc("h"); + } +} + +void +test_low(void *data) +{ + while (1) { + int workiters = WORKITERS * 10; + SPIN(workiters); + printc("l"); + } +} + +void +test_blocking_directed_yield(void) +{ + struct sl_thd * low, *high; + union sched_param_union sph = {.c = {.type = SCHEDP_PRIO, .value = 5}}; + union sched_param_union spl = {.c = {.type = SCHEDP_PRIO, .value = 10}}; + + low = sl_thd_alloc(test_low, NULL); + high = sl_thd_alloc(test_high, low); + sl_thd_param_set(low, spl.v); + sl_thd_param_set(high, sph.v); +} + +#define TEST_ITERS 1000 + +void +test_high_wakeup(void *data) +{ + unsigned int toggle = 0, iters = 0; + struct sl_thd *t = data; + cycles_t start = sl_now(); + + while (1) { + cycles_t timeout = sl_now() + sl_usec2cyc(100); + + if (toggle % 10 == 0) + printc(".h:%llums.", sl_cyc2usec(sl_thd_block_timeout(0, timeout))); + else + printc(".h:%up.", sl_thd_block_periodic(0)); + + toggle++; + iters++; + + if (iters == TEST_ITERS) { + printc("\nTest done! (Duration: %llu ms)\n", sl_cyc2usec(sl_now() - start) / 1000); + printc("Deleting all threads. 
Idle thread should take over!\n"); + sl_thd_free(t); + sl_thd_free(sl_thd_curr()); + + /* should not be scheduled. */ + assert(0); + } + } +} + +void +test_timeout_wakeup(void) +{ + struct sl_thd * low, *high; + union sched_param_union sph = {.c = {.type = SCHEDP_PRIO, .value = 5}}; + union sched_param_union spl = {.c = {.type = SCHEDP_PRIO, .value = 10}}; + union sched_param_union spw = {.c = {.type = SCHEDP_WINDOW, .value = 1000}}; + + low = sl_thd_alloc(test_low, NULL); + sl_thd_param_set(low, spl.v); + sl_thd_param_set(low, spw.v); + + high = sl_thd_alloc(test_high_wakeup, low); + sl_thd_param_set(high, sph.v); + sl_thd_param_set(high, spw.v); +} + +void +cos_init(void) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + + printc("Unit-test for the scheduling library (sl)\n"); + /* This is a hack, we know where the heap is */ + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_llinit(); + cos_dcb_info_init_curr(); + sl_init(SL_MIN_PERIOD_US*50); + + //test_yield_perf(); + test_yields(); + //test_blocking_directed_yield(); + //test_timeout_wakeup(); + + sl_sched_loop_nonblock(); + + assert(0); + + return; +} diff --git a/src/components/implementation/tests/micro_ipi/micro_ipi.c b/src/components/implementation/tests/micro_ipi/micro_ipi.c index 2a3a180b20..45ce1f2198 100644 --- a/src/components/implementation/tests/micro_ipi/micro_ipi.c +++ b/src/components/implementation/tests/micro_ipi/micro_ipi.c @@ -49,13 +49,13 @@ hiprio_c0_lat_fn(arcvcap_t r, void *d) assert(snd); while (1) { - int pending = 0, rcvd = 0, ret = 0; + int pending = 0, ret = 0; cycles_t now; if (unlikely(testing == 0)) break; - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); - assert(pending == 0 && rcvd == 1); + pending = cos_rcv(r, 0); + assert(pending == 0); rdtscll(now); #ifdef RCV_UB_TEST @@ -94,7 +94,7 @@ hiprio_cn_lat_fn(arcvcap_t r, void *d) while (1) { 
cycles_t st, en, rpcen; - int pending = 0, rcvd = 0, ret = 0; + int pending = 0, ret = 0; if (unlikely(testing == 0)) break; @@ -119,8 +119,8 @@ hiprio_cn_lat_fn(arcvcap_t r, void *d) #endif #ifndef CN_SND_ONLY - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); - assert(pending == 0 && rcvd == 1); + pending = cos_rcv(r, 0); + assert(pending == 0); rdtscll(rpcen); #ifdef RPC_UB_TEST iters ++; @@ -297,12 +297,12 @@ loprio_rate_c0_fn(arcvcap_t r, void *d) while (testing == 0) ; while (1) { - int pending = 0, rcvd = 0, ret = 0; + int pending = 0, ret = 0; if (unlikely(testing == 0)) break; - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); - assert(pending == 0 && rcvd == 1); + pending = cos_rcv(r, 0); + assert(pending == 0); ret = cos_asnd(snd, 0); assert(ret == 0); @@ -320,7 +320,7 @@ hiprio_rate_cn_fn(arcvcap_t r, void *d) while (testing == 0) ; while (1) { - int pending = 0, rcvd = 0, ret = 0; + int pending = 0, ret = 0; if (unlikely(testing == 0)) break; @@ -330,8 +330,8 @@ hiprio_rate_cn_fn(arcvcap_t r, void *d) assert(ret == 0); #ifndef CN_SND_ONLY - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); - assert(pending == 0 && rcvd == 1); + pending = cos_rcv(r, 0); + assert(pending == 0); #endif } @@ -412,7 +412,7 @@ static void c0_ipc_fn(arcvcap_t r, void *d) { asndcap_t snd = c0_cn_asnd[cos_cpuid()]; - int iters; + int iters = 0; cycles_t rtt_total = 0, one_total = 0, rtt_wc = 0, one_wc = 0, rone_total = 0, rone_wc = 0; PRINTC("Testing Cross-core IPC:\n"); @@ -423,7 +423,7 @@ c0_ipc_fn(arcvcap_t r, void *d) testing = 1; while (1) { - int pending = 0, rcvd = 0, ret = 0; + int pending = 0, ret = 0; cycles_t rtt_diff, one_diff = 0, rone_diff = 0; rdtscll(c0_start); @@ -431,8 +431,8 @@ c0_ipc_fn(arcvcap_t r, void *d) assert(ret == 0); rdtscll(c0_mid); - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); - assert(pending == 0 && rcvd == 1); + pending = cos_rcv(r, 0); + assert(pending == 0); rdtscll(c0_end); rtt_diff = (c0_end - c0_start); @@ -466,13 +466,13 @@ c1_ipc_fn(arcvcap_t r, 
void *d) while (testing == 0) ; while (1) { - int pending = 0, rcvd = 0, ret = 0; + int pending = 0, ret = 0; if (unlikely(testing == 0)) break; rdtscll(c1_start); - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); - assert(pending == 0 && rcvd == 1); + pending = cos_rcv(r, 0); + assert(pending == 0); rdtscll(c1_mid); ret = cos_asnd(snd, 0); @@ -487,7 +487,7 @@ static void test_ipc_setup(void) { #ifdef TEST_IPC - static volatile int cdone[NUM_CPU] = { 0 }; + static volatile unsigned long cdone[NUM_CPU] = { 0 }; int i, ret; struct sl_thd *t = NULL; asndcap_t snd = 0; diff --git a/src/components/implementation/tests/micro_xcores/micro_xcores.c b/src/components/implementation/tests/micro_xcores/micro_xcores.c index 62c22be39e..7a4aebf008 100644 --- a/src/components/implementation/tests/micro_xcores/micro_xcores.c +++ b/src/components/implementation/tests/micro_xcores/micro_xcores.c @@ -29,13 +29,13 @@ cos_init(void) first_init = 0; cos_meminfo_init(&booter_info.mi, BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_compinfo_init(&booter_info, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, - (vaddr_t)cos_get_heap_ptr(), BOOT_CAPTBL_FREE, &booter_info); + (vaddr_t)cos_get_heap_ptr(), LLBOOT_CAPTBL_FREE, &booter_info); init_done = 1; } while (!init_done); - termthd[cos_cpuid()] = cos_thd_alloc(&booter_info, booter_info.comp_cap, term_fn, NULL); + termthd[cos_cpuid()] = cos_thd_alloc(&booter_info, booter_info.comp_cap, term_fn, NULL, 0, 0); assert(termthd[cos_cpuid()]); if (cos_cpuid() == 0) PRINTC("Micro Booter Xcore started.\n"); diff --git a/src/components/implementation/tests/micro_xcores/test_ipi_interference.c b/src/components/implementation/tests/micro_xcores/test_ipi_interference.c index f301f9d873..fbe59951ca 100644 --- a/src/components/implementation/tests/micro_xcores/test_ipi_interference.c +++ b/src/components/implementation/tests/micro_xcores/test_ipi_interference.c @@ -2,7 +2,7 @@ #include "micro_xcores.h" -extern void 
sched_events_clear(int* rcvd, thdid_t* tid, int* blocked, cycles_t* cycles, tcap_time_t* thd_timeout); +extern void sched_events_clear(thdid_t* tid, int* blocked, cycles_t* cycles, tcap_time_t* thd_timeout); /* Test RCV 2: Close Loop at higher priority => Measure Kernel involvement */ @@ -38,7 +38,7 @@ test_rcv(arcvcap_t r) { int pending = 0, rcvd = 0; - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); + pending = cos_rcv(r, 0); assert(pending == 0); total_rcvd[cos_cpuid()] += rcvd; @@ -76,13 +76,13 @@ test_rcv_fn(void *d) static void test_sched_loop(void) { - int blocked, rcvd, pending, ret; + int blocked, pending, ret; cycles_t cycles; tcap_time_t timeout, thd_timeout; thdid_t thdid; /* Clear Scheduler */ - sched_events_clear(&rcvd, &thdid, &blocked, &cycles, &thd_timeout); + sched_events_clear(&thdid, &blocked, &cycles, &thd_timeout); while (1) { if(cos_cpuid() == TEST_RCV_CORE) { @@ -90,8 +90,8 @@ test_sched_loop(void) ret = cos_switch(spinner_thd[cos_cpuid()], BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_PRIO_MAX + 2, 0, 0, 0); } while (ret == -EAGAIN); } - while ((pending = cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_ALL_PENDING, 0, - &rcvd, &thdid, &blocked, &cycles, &thd_timeout)) >= 0) { + while ((pending = cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, + &thdid, &blocked, &cycles, &thd_timeout)) >= 0) { if (!thdid) goto done; assert(thdid == tid[cos_cpuid()]); blkd[cos_cpuid()] = blocked; @@ -181,6 +181,7 @@ test_ipi_interference(void) thdcap_t t = 0; tcap_t tcc = 0; + if (NUM_CPU <= 1) return; if (cos_cpuid() == TEST_RCV_CORE) { @@ -190,7 +191,7 @@ test_ipi_interference(void) if (EXPECT_LL_LT(1, tcc, "IPI Interference: TCAP Allocation")) return; - t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_rcv_fn, NULL); + t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_rcv_fn, NULL, 0, 0); if (EXPECT_LL_LT(1, t, "IPI Inteference: Thread Allocation")) return; @@ -205,7 +206,7 @@ test_ipi_interference(void) rcv[cos_cpuid()] = r; while 
(!rcv[TEST_SND_CORE]) ; - t = cos_thd_alloc(&booter_info, booter_info.comp_cap, rcv_spinner, NULL); + t = cos_thd_alloc(&booter_info, booter_info.comp_cap, rcv_spinner, NULL, 0, 0); if (EXPECT_LL_LT(1, t, "IPI Interference: Thread Allocation")) return; @@ -230,7 +231,7 @@ test_ipi_interference(void) if (EXPECT_LL_LT(1, tcc, "IPI Interference: TCAP Allocation")) return; - t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_asnd_fn, NULL); + t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_asnd_fn, NULL, 0, 0); if (EXPECT_LL_LT(1, t, "IPI Interference: Thread Allocation")) return; diff --git a/src/components/implementation/tests/micro_xcores/test_ipi_n_n.c b/src/components/implementation/tests/micro_xcores/test_ipi_n_n.c index a8d9c9bfb9..a0c90f6510 100644 --- a/src/components/implementation/tests/micro_xcores/test_ipi_n_n.c +++ b/src/components/implementation/tests/micro_xcores/test_ipi_n_n.c @@ -26,7 +26,7 @@ test_ipi_fn(void *d) r = cos_asnd(snd, 1); assert(r == 0); - p = cos_rcv(rcv, RCV_ALL_PENDING, &r); + p = cos_rcv(rcv, 0); assert(p >= 0); } } @@ -49,7 +49,7 @@ test_rcv_crt(void) asndcap_t snd = 0; if (cos_cpuid() == i) continue; - thd = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_ipi_fn, (void *)i); + thd = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_ipi_fn, (void *)i, 0, 0); assert(thd); rcv = cos_arcv_alloc(&booter_info, thd, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, booter_info.comp_cap, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE); @@ -147,7 +147,7 @@ test_ipi_n_n(void) rdtscll(now); prev = now; while (1) { - int blocked, rcvd, pending; + int blocked, pending; cycles_t cycles; tcap_time_t timeout, thd_timeout; thdid_t tid; @@ -158,8 +158,8 @@ test_ipi_n_n(void) if (now - prev > wc) wc = now - prev; test_thd_act(); - while ((pending = cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_ALL_PENDING, 0, - &rcvd, &tid, &blocked, &cycles, &thd_timeout)) >= 0) { + while ((pending = cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, 
+ &tid, &blocked, &cycles, &thd_timeout)) >= 0) { if (!tid) goto done; j = test_find_tid(tid); assert(j >= 0); diff --git a/src/components/implementation/tests/micro_xcores/test_ipi_roundtrip.c b/src/components/implementation/tests/micro_xcores/test_ipi_roundtrip.c index 24dd2c13ad..005c79940d 100644 --- a/src/components/implementation/tests/micro_xcores/test_ipi_roundtrip.c +++ b/src/components/implementation/tests/micro_xcores/test_ipi_roundtrip.c @@ -2,7 +2,7 @@ #include "micro_xcores.h" -extern void sched_events_clear(int* rcvd, thdid_t* tid, int* blocked, cycles_t* cycles, tcap_time_t* thd_timeout); +extern void sched_events_clear(thdid_t* tid, int* blocked, cycles_t* cycles, tcap_time_t* thd_timeout); /* Test Sender Time + Receiver Time Roundtrip */ @@ -29,14 +29,14 @@ static cycles_t results[2][ARRAY_SIZE]; static void test_rcv(arcvcap_t r) { - int pending = 0, rcvd = 0; + int pending = 0; - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); + pending = cos_rcv(r, 0); assert(pending == 0); if (EXPECT_LL_LT(1, r, "IPI Roundtrip: Allocation on RCV")) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - total_rcvd[cos_cpuid()] += rcvd; + total_rcvd[cos_cpuid()] += 1; } static void @@ -64,18 +64,18 @@ test_rcv_fn(void *d) static void test_sched_loop(void) { - int blocked, rcvd, pending, ret; + int blocked, pending, ret; cycles_t cycles; tcap_time_t timeout, thd_timeout; thdid_t thdid; /* Clear Scheduler */ - sched_events_clear(&rcvd, &thdid, &blocked, &cycles, &thd_timeout); + sched_events_clear(&thdid, &blocked, &cycles, &thd_timeout); while (1) { - while ((pending = cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_ALL_PENDING, 0, - &rcvd, &thdid, &blocked, &cycles, &thd_timeout)) >= 0) { + while ((pending = cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, + &thdid, &blocked, &cycles, &thd_timeout)) >= 0) { if (!thdid) goto done; assert(thdid == tid[cos_cpuid()]); blkd[cos_cpuid()] = blocked; @@ -157,6 +157,7 @@ test_ipi_roundtrip(void) thdcap_t t = 0; 
tcap_t tcc = 0; + if (NUM_CPU <= 1) return; if (cos_cpuid() == TEST_RCV_CORE) { @@ -167,7 +168,7 @@ test_ipi_roundtrip(void) return; - t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_rcv_fn, NULL); + t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_rcv_fn, NULL, 0, 0); if (EXPECT_LL_LT(1, t, "IPI ROUNDTRIP: Thread Allocation")) return; @@ -196,7 +197,7 @@ test_ipi_roundtrip(void) /* Test Sender Time */ - t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_asnd_fn, NULL); + t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_asnd_fn, NULL, 0, 0); if (EXPECT_LL_LT(1, t, "IPI ROUNDTRIP: Thread Allocation")) return; diff --git a/src/components/implementation/tests/micro_xcores/test_ipi_switch.c b/src/components/implementation/tests/micro_xcores/test_ipi_switch.c index e13dbda15d..4c368244cc 100644 --- a/src/components/implementation/tests/micro_xcores/test_ipi_switch.c +++ b/src/components/implementation/tests/micro_xcores/test_ipi_switch.c @@ -3,10 +3,10 @@ #include "micro_xcores.h" void -sched_events_clear(int* rcvd, thdid_t* tid, int* blocked, cycles_t* cycles, tcap_time_t* thd_timeout) +sched_events_clear(thdid_t* tid, int* blocked, cycles_t* cycles, tcap_time_t* thd_timeout) { - while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_ALL_PENDING, 0, - rcvd, tid, blocked, cycles, thd_timeout) != 0) + while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, + tid, blocked, cycles, thd_timeout) != 0) ; } @@ -40,12 +40,12 @@ static cycles_t results[ARRAY_SIZE]; static void test_rcv(arcvcap_t r) { - int pending = 0, rcvd = 0; + int pending = 0; - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); + pending = cos_rcv(r, 0); assert(pending == 0); - total_rcvd[cos_cpuid()] += rcvd; + total_rcvd[cos_cpuid()] += 1; } static void @@ -80,16 +80,16 @@ rcv_spinner(void *d) static void test_rcv_1(arcvcap_t r) { - int pending = 0, rcvd = 0; + int pending = 0; - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); + pending = cos_rcv(r, 0); 
rdtscll(global_time[1]); time = (global_time[1] - global_time[0]); perfdata_add(&pd, time); assert(pending == 0); - total_rcvd[cos_cpuid()] += rcvd; + total_rcvd[cos_cpuid()] += 1; } static void @@ -138,13 +138,13 @@ test_asnd_fn(void *d) static void test_sched_loop(void) { - int blocked, rcvd, pending, ret; + int blocked, pending, ret; cycles_t cycles; tcap_time_t timeout, thd_timeout; thdid_t thdid; /* Clear Scheduler */ - sched_events_clear(&rcvd, &thdid, &blocked, &cycles, &thd_timeout); + sched_events_clear(&thdid, &blocked, &cycles, &thd_timeout); while (1) { @@ -153,8 +153,8 @@ test_sched_loop(void) ret = cos_switch(spinner_thd[cos_cpuid()], BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_PRIO_MAX + 2, 0, 0, 0); } while (ret == -EAGAIN); } - while ((pending = cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_ALL_PENDING, 0, - &rcvd, &thdid, &blocked, &cycles, &thd_timeout)) >= 0) { + while ((pending = cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, + &thdid, &blocked, &cycles, &thd_timeout)) >= 0) { if (!thdid) goto done; assert(thdid == tid[cos_cpuid()]); @@ -183,6 +183,7 @@ test_ipi_switch(void) thdcap_t t = 0; tcap_t tcc = 0; + if (NUM_CPU <= 1) return; if (cos_cpuid() == TEST_RCV_CORE) { @@ -192,7 +193,7 @@ test_ipi_switch(void) if (EXPECT_LL_LT(1, tcc, "IPI SWITCH: TCAP Allocation")) return; - t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_rcv_fn, NULL); + t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_rcv_fn, NULL, 0, 0); if (EXPECT_LL_LT(1, t, "IPI SWITCH: Thread Allocation")) return; @@ -207,7 +208,7 @@ test_ipi_switch(void) rcv[cos_cpuid()] = r; while (!rcv[TEST_SND_CORE]) ; - t = cos_thd_alloc(&booter_info, booter_info.comp_cap, rcv_spinner, NULL); + t = cos_thd_alloc(&booter_info, booter_info.comp_cap, rcv_spinner, NULL, 0, 0); if (EXPECT_LL_LT(1, t, "IPI SWITCH: Thread Allocation")) return; @@ -226,7 +227,7 @@ test_ipi_switch(void) /* Test RCV1: Corresponding Send */ - t = cos_thd_alloc(&booter_info, 
booter_info.comp_cap, test_asnd_fn, NULL); + t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_asnd_fn, NULL, 0, 0); if (EXPECT_LL_LT(1, t, "IPI SWITCH: Thread Allocation")) return; diff --git a/src/components/implementation/tests/part_test/Makefile b/src/components/implementation/tests/part_test/Makefile new file mode 100644 index 0000000000..3fcb066f74 --- /dev/null +++ b/src/components/implementation/tests/part_test/Makefile @@ -0,0 +1,8 @@ +COMPONENT=part_test.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/part_test/init.c b/src/components/implementation/tests/part_test/init.c new file mode 100644 index 0000000000..3511588c85 --- /dev/null +++ b/src/components/implementation/tests/part_test/init.c @@ -0,0 +1,79 @@ +#include +#include +#include +#include +#include +#include +#include + +int main(void); + +void +cos_exit(int x) +{ + PRINTC("Exit code: %d\n", x); + while (1) ; +} + +static void +cos_main(void *d) +{ + assert(sl_thd_thdid(sl_thd_curr()) == cos_thdid()); + main(); + + while (1) ; +} + +extern void cos_gomp_init(void); + +void +cos_init(void *d) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + int i; + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + static unsigned b1 = 0, b2 = 0, b3 = 0; + + PRINTC("In a parallel program!\n"); + if (ps_cas(&first, NUM_CPU + 1, cos_cpuid())) { + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_llinit(); + } else { + while (!ps_load(&init_done[first])) ; + + cos_defcompinfo_sched_init(); + } + cos_dcb_info_init_curr(); + ps_faa(&init_done[cos_cpuid()], 1); + + /* make sure 
the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! */ + for (i = 0; i < NUM_CPU; i++) { + while (!ps_load(&init_done[i])) ; + } + sl_init(SL_MIN_PERIOD_US*100); + /* barrier, wait for sl_init to be done on all cores */ + ps_faa(&b1, 1); + while (ps_load(&b1) != NUM_CPU) ; + part_init(); + /* barrier, wait for gomp_init to be done on all cores */ + ps_faa(&b2, 1); + while (ps_load(&b2) != NUM_CPU) ; + + if (!cos_cpuid()) { + struct sl_thd *t = NULL; + + t = sl_thd_alloc(cos_main, NULL); + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); + } + /* wait for all cores to reach this point, so all threads wait for main thread to be ready! */ + ps_faa(&b3, 1); + while (ps_load(&b3) != NUM_CPU) ; + + sl_sched_loop_nonblock(); + + PRINTC("Should never get here!\n"); + assert(0); +} + diff --git a/src/components/implementation/tests/part_test/main.c b/src/components/implementation/tests/part_test/main.c new file mode 100644 index 0000000000..b751b97ece --- /dev/null +++ b/src/components/implementation/tests/part_test/main.c @@ -0,0 +1,32 @@ +#include +#include + +#define NTHDS 2 + +void +work_fn(void *d) +{ + PRINTC("Sharing work!\n"); +} + +int +main(void) +{ + struct sl_thd *c = sl_thd_curr(); + struct part_task *p = (struct part_task *)c->part_context, *pt = &main_task; + int n = NTHDS > PART_MAX_PAR_THDS ? 
PART_MAX_PAR_THDS : NTHDS; + + assert(p == NULL); + + pt->state = PART_TASK_S_ALLOCATED; + part_task_init(pt, PART_TASK_T_WORKSHARE, p, n, work_fn, NULL, NULL); + assert(pt->nthds = n); + + c->part_context = pt; + part_list_append(pt); + + work_fn(NULL); + part_task_end(pt); + + PRINTC("Done!\n"); +} diff --git a/src/components/implementation/tests/spin_comp/Makefile b/src/components/implementation/tests/spin_comp/Makefile new file mode 100644 index 0000000000..bb7f30634e --- /dev/null +++ b/src/components/implementation/tests/spin_comp/Makefile @@ -0,0 +1,10 @@ +C_OBJS=init.o +ASM_OBJS= +COMPONENT=spin_comp.o +INTERFACES= +DEPENDENCIES=capmgr schedinit +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/spin_comp/init.c b/src/components/implementation/tests/spin_comp/init.c new file mode 100644 index 0000000000..15cdd385f5 --- /dev/null +++ b/src/components/implementation/tests/spin_comp/init.c @@ -0,0 +1,17 @@ +#include +#include +#include +#include +#include + +void +cos_init(void) +{ + PRINTC("Spin Init!\n"); + schedinit_child(); + + while (1) ; + + PRINTLOG(PRINT_ERROR, "Cannot reach here!\n"); + assert(0); +} diff --git a/src/components/implementation/tests/test_schedinv/Makefile b/src/components/implementation/tests/test_schedinv/Makefile new file mode 100644 index 0000000000..859fb3dd71 --- /dev/null +++ b/src/components/implementation/tests/test_schedinv/Makefile @@ -0,0 +1,8 @@ +COMPONENT=test_sched_inv.o +INTERFACES= +DEPENDENCIES= crt sched capmgr channel +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api #required for cos_sinv in llboot.h! 
+ +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/test_schedinv/test_schedinv.c b/src/components/implementation/tests/test_schedinv/test_schedinv.c new file mode 100644 index 0000000000..2e71cb8ef3 --- /dev/null +++ b/src/components/implementation/tests/test_schedinv/test_schedinv.c @@ -0,0 +1,133 @@ +/* + * Copyright 2018, Phani Gadepalli and Gabriel Parmer, GWU, gparmer@gwu.edu. + * + * This uses a two clause BSD License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define SPDID_INT 5 +#define SPDID_W1 6 +#define SPDID_W3 7 + +static u32_t cycs_per_usec = 0; + +#define MAX_USE_PIPE_SZ 1 + +#define SND_DATA 0x4321 +#define HPET_PERIOD_TEST_US 20000 + +#define SHMCHANNEL_KEY 0x2020 +static cycles_t *sttsc = NULL; +volatile unsigned long *rdy = NULL; +int iters = 0; +#define ITERS 100000 +cycles_t vals[ITERS] = { 0 }; + +static void +__test_int_fn(arcvcap_t rcv, void *data) +{ + ps_faa(rdy, 1); + + while (ps_load(rdy) <= MAX_USE_PIPE_SZ) sched_thd_block_timeout(0, time_now() + time_usec2cyc(HPET_PERIOD_TEST_US)); + int a = capmgr_hw_periodic_attach(HW_HPET_PERIODIC, cos_thdid(), HPET_PERIOD_TEST_US); + assert(a == 0); + + /* TODO: register to HPET */ + while (1) { + cos_rcv(rcv, 0); + iters++; + rdtscll(*sttsc); + chan_out(SND_DATA); + + if (iters == ITERS) capmgr_hw_detach(HW_HPET_PERIODIC); + } + + sched_thd_exit(); +} + +cycles_t tot = 0, wc = 0; + +static void +__test_wrk_fn(void *data) +{ + int e = (int) data; + ps_faa(rdy, 1); + while (1) { + chan_in(); + + if (unlikely(e)) { + cycles_t en, diff; + + if (unlikely(iters >= ITERS)) continue; + rdtscll(en); + assert(sttsc); + diff = en - *sttsc; + if (diff > wc) wc = diff; + tot += diff; + vals[iters] = diff; + //printc("%llu\n", diff); + iters++; + if (iters % 1000 == 0) printc(","); + if (iters == ITERS) { + int i; + + for (i = 0; i < ITERS; i++) printc("%llu\n", vals[i]); + PRINTC("%llu, %llu\n", tot 
/ ITERS, wc); + tot = wc = 0; + //iters = 0; + } + continue; + } + chan_out(SND_DATA); + } +} + +struct cos_aep_info intaep; + +static void +test_aeps(void) +{ + thdid_t tid; + int ret; + int i = 0; + + if (cos_spd_id() == SPDID_INT) { + tid = sched_aep_create(&intaep, __test_int_fn, (void *)0, 0, 0, 0, 0); + } else { + tid = sched_thd_create(__test_wrk_fn, + ((cos_spd_id() == SPDID_W3 && MAX_USE_PIPE_SZ == 4) + || (cos_spd_id() == SPDID_W1 && MAX_USE_PIPE_SZ == 2)) + ? (void *)1: (void *)0); + } + assert(tid); +} + +void +cos_init(void) +{ + spdid_t child; + comp_flag_t childflags; + + vaddr_t addr = 0; + unsigned long pages = 0; + cbuf_t id = channel_shared_page_map(SHMCHANNEL_KEY, &addr, &pages); + assert(id > 0 && addr && pages == 1); + sttsc = (cycles_t *)addr; + rdy = (volatile unsigned long *)(sttsc + 1); + + cycs_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + + assert(hypercall_comp_child_next(cos_spd_id(), &child, &childflags) == -1); + test_aeps(); + PRINTC("Init Done!\n"); + + sched_thd_exit(); +} diff --git a/src/components/implementation/tests/unit_capmgr/unit_capmgr.c b/src/components/implementation/tests/unit_capmgr/unit_capmgr.c index 91b24e7e47..880428378c 100644 --- a/src/components/implementation/tests/unit_capmgr/unit_capmgr.c +++ b/src/components/implementation/tests/unit_capmgr/unit_capmgr.c @@ -33,7 +33,9 @@ test_thds(void) int failure = 0; for (; i < TEST_N_THDS; i++) { - test_ts[cos_cpuid()][i] = capmgr_thd_create(__test_thd_fn, (void *)i, &tid); + struct cos_dcb_info *dcb; + + test_ts[cos_cpuid()][i] = capmgr_thd_create(__test_thd_fn, (void *)i, &tid, &dcb); assert(test_ts[cos_cpuid()][i]); if (cos_thd_switch(test_ts[cos_cpuid()][i])) { diff --git a/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c b/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c index 21133d1e21..0083657d72 100644 --- a/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c +++ 
b/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c @@ -35,7 +35,7 @@ aep_thd_fn(arcvcap_t rcv, void *data) { printc("\tSwitched to aep %d\n", (int)data); while (1) { - cos_rcv(rcv, 0, NULL); + cos_rcv(rcv, 0); } } @@ -56,7 +56,7 @@ test_aeps(void) asndcap_t snd; printc("\tCreating AEP [%d]\n", i); - ret = cos_aep_tcap_alloc(&(test_aep[i]), BOOT_CAPTBL_SELF_INITTCAP_BASE, aep_thd_fn, (void *)i); + ret = cos_aep_tcap_alloc(&(test_aep[i]), BOOT_CAPTBL_SELF_INITTCAP_BASE, aep_thd_fn, (void *)i, 0, 0); assert(ret == 0); snd = cos_asnd_alloc(ci, test_aep[i].rcv, ci->captbl_cap); @@ -66,7 +66,7 @@ test_aeps(void) TCAP_DELEG_YIELD); assert(ret == 0); - while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, 0, 0, NULL, &tid, &blocked, &cycs, &thd_timeout)) + while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, 0, 0, &tid, &blocked, &cycs, &thd_timeout)) ; } @@ -85,7 +85,7 @@ test_childcomps(void) thdid_t tid; tcap_time_t thd_timeout; - while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, 0, 0, NULL, &tid, &blocked, &cycs, &thd_timeout)) + while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, 0, 0, &tid, &blocked, &cycs, &thd_timeout)) ; printc("\tSwitching to [%d] component\n", id); if (id == CHILD_SCHED_ID) { @@ -122,10 +122,10 @@ cos_init(void) is_booter = 0; printc("Unit-test for defcompinfo API\n"); cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_defcompinfo_init(); + cos_defcompinfo_llinit(); for (id = 0; id < CHILD_COMP_COUNT; id++) { - vaddr_t vm_range, addr; + vaddr_t vm_range, addr, dcbaddr; pgtblcap_t child_utpt; int is_sched = ((id == CHILD_SCHED_ID) ? 
1 : 0); struct cos_compinfo *child_ci = cos_compinfo_get(&child_defci[id]); @@ -136,7 +136,7 @@ cos_init(void) cos_meminfo_init(&(child_ci->mi), BOOT_MEM_KM_BASE, CHILD_UNTYPED_SIZE, child_utpt); cos_defcompinfo_child_alloc(&child_defci[id], (vaddr_t)&cos_upcall_entry, - (vaddr_t)BOOT_MEM_VM_BASE, BOOT_CAPTBL_FREE, is_sched); + (vaddr_t)BOOT_MEM_VM_BASE, BOOT_CAPTBL_FREE, is_sched, &dcbaddr); printc("\t\tCopying new capabilities\n"); ret = cos_cap_cpy_at(child_ci, BOOT_CAPTBL_SELF_CT, ci, child_ci->captbl_cap); @@ -147,6 +147,7 @@ cos_init(void) assert(ret == 0); ret = cos_cap_cpy_at(child_ci, BOOT_CAPTBL_SELF_COMP, ci, child_ci->comp_cap); assert(ret == 0); + /* FIXME: copy BOOT_CAPTBL_SELF_SCB cap?? */ ret = cos_cap_cpy_at(child_ci, BOOT_CAPTBL_SELF_INITTHD_BASE, ci, cos_sched_aep_get(&child_defci[id])->thd); @@ -207,7 +208,7 @@ cos_init(void) /* TEST BLOCKING */ /* TODO: Challenge - how does a component know at runtime if can call cos_rcv or not? - It does not at * runtime. */ - cos_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, 0, NULL); + cos_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, 0); printc("\tThis is a simple component\n"); SPIN(); diff --git a/src/components/implementation/tests/unit_fprr/Makefile b/src/components/implementation/tests/unit_fprr/Makefile index dd4186daef..66f9041230 100644 --- a/src/components/implementation/tests/unit_fprr/Makefile +++ b/src/components/implementation/tests/unit_fprr/Makefile @@ -1,3 +1,5 @@ +C_OBJS=unit_fprr.o +ASM_OBJS= COMPONENT=unit_fprr_test.o INTERFACES= DEPENDENCIES= diff --git a/src/components/implementation/tests/unit_fprr/unit_fprr.c b/src/components/implementation/tests/unit_fprr/unit_fprr.c index 6149b0a0d9..093ba6c25b 100644 --- a/src/components/implementation/tests/unit_fprr/unit_fprr.c +++ b/src/components/implementation/tests/unit_fprr/unit_fprr.c @@ -8,11 +8,12 @@ #include #include #include +#include /* Ensure this is the same as what is in sl_mod_fprr.c */ #define SL_FPRR_NPRIOS 32 -#define LOWEST_PRIORITY (SL_FPRR_NPRIOS 
- 1) +#define LOWEST_PRIORITY (15) #define LOW_PRIORITY (LOWEST_PRIORITY - 1) #define HIGH_PRIORITY (LOWEST_PRIORITY - 10) @@ -106,51 +107,55 @@ test_swapping(void) sl_thd_block_timeout(0, wakeup); } -#define XCPU_THDS (NUM_CPU-1) +#define XCORE_THDS (NUM_CPU-1) #define THD_SLEEP_US (100 * 1000) -volatile unsigned int xcpu_thd_data[NUM_CPU][XCPU_THDS]; -volatile unsigned int xcpu_thd_counter[NUM_CPU]; +volatile unsigned int xcore_thd_data[NUM_CPU][XCORE_THDS]; +volatile unsigned int xcore_thd_counter[NUM_CPU]; static void -test_xcpu_fn(void *data) +test_xcore_fn(void *data) { cycles_t wakeup, elapsed; int cpu = *((unsigned int *)data) >> 16; int i = (*((unsigned int *)data) << 16) >> 16; - assert(i < XCPU_THDS); + assert(i < XCORE_THDS); wakeup = sl_now() + sl_usec2cyc(THD_SLEEP_US); elapsed = sl_thd_block_timeout(0, wakeup); - if (elapsed) xcpu_thd_counter[cpu] ++; + if (elapsed) xcore_thd_counter[cpu] ++; sl_thd_exit(); } static void -run_xcpu_tests() +run_xcore_tests() { int ret = 0, i, cpu = 0; if (NUM_CPU == 1) return; - memset((void *)xcpu_thd_data[cos_cpuid()], 0, sizeof(unsigned int) * XCPU_THDS); - xcpu_thd_counter[cos_cpuid()] = 0; + memset((void *)xcore_thd_data[cos_cpuid()], 0, sizeof(unsigned int) * XCORE_THDS); + xcore_thd_counter[cos_cpuid()] = 0; - for (i = 0; i < XCPU_THDS; i++) { + for (i = 0; i < XCORE_THDS; i++) { sched_param_t p[1]; + struct sl_xcore_thd *t = NULL; if (cpu == cos_cpuid()) cpu++; cpu %= NUM_CPU; - xcpu_thd_data[cos_cpuid()][i] = (cpu << 16) | i; + xcore_thd_data[cos_cpuid()][i] = (cpu << 16) | i; p[0] = sched_param_pack(SCHEDP_PRIO, HIGH_PRIORITY); - ret = sl_xcpu_thd_alloc(cpu, test_xcpu_fn, (void *)&xcpu_thd_data[cos_cpuid()][i], p); - if (ret) break; + t = sl_xcore_thd_alloc(cpu, test_xcore_fn, (void *)&xcore_thd_data[cos_cpuid()][i], 1, p); + if (!t) { + ret = -1; + break; + } cpu++; } - PRINTC("%s: Creating cross-CPU threads!\n", ret ? 
"FAILURE" : "SUCCESS"); - while (xcpu_thd_counter[cos_cpuid()] != XCPU_THDS) ; + PRINTC("%s: Creating cross-core threads!\n", ret ? "FAILURE" : "SUCCESS"); + while (xcore_thd_counter[cos_cpuid()] != XCORE_THDS) ; } static void @@ -161,7 +166,7 @@ run_tests() test_swapping(); PRINTC("%s: Swap back and forth!\n", (thd1_ran[cos_cpuid()] && thd2_ran[cos_cpuid()]) ? "SUCCESS" : "FAILURE"); - run_xcpu_tests(); +// run_xcore_tests(); PRINTC("Unit-test done!\n"); sl_thd_exit(); @@ -176,11 +181,9 @@ cos_init(void) struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(defci); - PRINTC("Unit-test for the scheduling library (sl)\n"); - if (ps_cas(&first, NUM_CPU + 1, cos_cpuid())) { cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_defcompinfo_init(); + cos_defcompinfo_llinit(); } else { while (!ps_load(&init_done[first])) ; @@ -191,6 +194,8 @@ cos_init(void) while (!ps_load(&init_done[i])) ; } + PRINTC("Unit-test for the scheduling library (sl)\n"); + sl_init(SL_MIN_PERIOD_US); testing_thread = sl_thd_alloc(run_tests, NULL); diff --git a/src/components/implementation/tests/unit_schedaep/Makefile b/src/components/implementation/tests/unit_schedappaep/Makefile similarity index 85% rename from src/components/implementation/tests/unit_schedaep/Makefile rename to src/components/implementation/tests/unit_schedappaep/Makefile index b6f56f58bf..da9e217045 100644 --- a/src/components/implementation/tests/unit_schedaep/Makefile +++ b/src/components/implementation/tests/unit_schedappaep/Makefile @@ -1,4 +1,4 @@ -COMPONENT=unit_schedaep_test.o +COMPONENT=unit_schedappaep_test.o INTERFACES= DEPENDENCIES=sched capmgr IF_LIB= diff --git a/src/components/implementation/tests/unit_schedaep/unit_schedaep.c b/src/components/implementation/tests/unit_schedappaep/unit_schedappaep.c similarity index 98% rename from src/components/implementation/tests/unit_schedaep/unit_schedaep.c rename to 
src/components/implementation/tests/unit_schedappaep/unit_schedappaep.c index 371c8144fd..4702c82253 100644 --- a/src/components/implementation/tests/unit_schedaep/unit_schedaep.c +++ b/src/components/implementation/tests/unit_schedappaep/unit_schedappaep.c @@ -26,7 +26,7 @@ __test_child(arcvcap_t rcv, void *data) assert(taeps[cos_cpuid()][(int)data].rcv == rcv); while (child_rcvd[cos_cpuid()] < TEST_ITERS) { - ret = cos_rcv(rcv, 0, NULL); + ret = cos_rcv(rcv, 0); assert(ret >= 0); child_rcvd[cos_cpuid()]++; @@ -43,7 +43,7 @@ __test_parent(arcvcap_t rcv, void *data) assert(taeps[cos_cpuid()][(int)data].rcv == rcv); while (parent_sent[cos_cpuid()] < TEST_ITERS) { - ret = cos_rcv(rcv, 0, NULL); + ret = cos_rcv(rcv, 0); assert(ret >= 0); do { diff --git a/src/components/implementation/tests/unit_schedappcomp/Makefile b/src/components/implementation/tests/unit_schedappcomp/Makefile new file mode 100644 index 0000000000..dfe5cbcf92 --- /dev/null +++ b/src/components/implementation/tests/unit_schedappcomp/Makefile @@ -0,0 +1,8 @@ +COMPONENT=unit_schedappcomp_test.o +INTERFACES= +DEPENDENCIES=sched +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api #required for cos_sinv in llboot.h! 
+ +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/unit_schedcomp/unit_schedcomp.c b/src/components/implementation/tests/unit_schedappcomp/unit_schedappcomp.c similarity index 99% rename from src/components/implementation/tests/unit_schedcomp/unit_schedcomp.c rename to src/components/implementation/tests/unit_schedappcomp/unit_schedappcomp.c index 98a5ccbc8d..a7cf4db127 100644 --- a/src/components/implementation/tests/unit_schedcomp/unit_schedcomp.c +++ b/src/components/implementation/tests/unit_schedappcomp/unit_schedappcomp.c @@ -131,6 +131,7 @@ cos_init(void) assert(hypercall_comp_child_next(cos_spd_id(), &child, &childflag) == -1); testtid = sched_thd_create(run_tests, NULL); + assert(testtid); sched_thd_param_set(testtid, sched_param_pack(SCHEDP_PRIO, LOWEST_PRIORITY)); while (1) { diff --git a/src/components/implementation/tests/unit_schedcomp/Makefile b/src/components/implementation/tests/unit_schedcomp/Makefile index 3edcf1b36d..1134e9cb60 100644 --- a/src/components/implementation/tests/unit_schedcomp/Makefile +++ b/src/components/implementation/tests/unit_schedcomp/Makefile @@ -1,8 +1,8 @@ COMPONENT=unit_schedcomp_test.o INTERFACES= -DEPENDENCIES=sched +DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api #required for cos_sinv in llboot.h! +ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_rr -lcos_defkernel_api include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c b/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c new file mode 100644 index 0000000000..e5527bb269 --- /dev/null +++ b/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c @@ -0,0 +1,209 @@ +/* + * Copyright 2016, Phani Gadepalli and Gabriel Parmer, GWU, gparmer@gwu.edu. + * + * This uses a two clause BSD License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include + +/* sl also defines a SPIN macro */ +#undef SPIN +#define SPIN(iters) \ + do { \ + if (iters > 0) { \ + for (; iters > 0; iters--) \ + ; \ + } else { \ + while (1) \ + ; \ + } \ + } while (0) + + +#define N_TESTTHDS 8 +#define WORKITERS 10000 + +#define PERF_ITERS 1000000 + +static cycles_t rdtscp_min = 0, rdtscp_max = 0, rdtscp_avg = 0; +static volatile int switched = 0; +static volatile cycles_t mid_cycs = 0; +static volatile int testing = 1; +static struct sl_thd *perf_thd, *spin_thd; + +void +test_thd_perffn(void *data) +{ + thdid_t yield_to = sl_thd_thdid(spin_thd); + cycles_t start_cycs = 0, end_cycs = 0, wc_cycs = 0, total_cycs = 0, bc_cycs = 500; + unsigned int i = 0; + int ret = 0; + + assert(perf_thd == sl_thd_curr()); + rdtscll(start_cycs); + //printc("a"); + //sl_thd_yield(yield_to); + //ret = sl_thd_dispatch(spin_thd, cos_sched_sync(), perf_thd); + //sl_thd_yield_thd_c(perf_thd, spin_thd); + sl_thd_yield_thd(spin_thd); + //assert(ret == 0); + rdtscll(end_cycs); + //assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + assert(switched); + + for (i = 0; i < PERF_ITERS; i++) { + cycles_t diff1_cycs = 0, diff2_cycs = 0; + + end_cycs = start_cycs = 0; + //mid_cycs = 0; + switched = 0; + //cos_rdtscp(start_cycs); + rdtscll(start_cycs); + //ret = sl_thd_dispatch(spin_thd, cos_sched_sync(), perf_thd); + //printc("a"); + //sl_thd_yield(yield_to); + //sl_thd_yield_thd_c(perf_thd, spin_thd); + sl_thd_yield_thd(spin_thd); + rdtscll(end_cycs); + //cos_rdtscp(end_cycs); + assert(switched); + assert(ret == 0); + //assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + //diff1_cycs = mid_cycs - start_cycs; + diff2_cycs = end_cycs - start_cycs; + //assert(diff2_cycs > rdtscp_min); + //diff2_cycs -= rdtscp_min; + + //if (diff1_cycs > wc_cycs) wc_cycs = diff1_cycs; + if (diff2_cycs > wc_cycs) wc_cycs = diff2_cycs; + if (diff2_cycs < bc_cycs) bc_cycs = diff2_cycs; 
+ total_cycs += diff2_cycs; + } + + PRINTC("SWITCH UBENCH : avg: %llu, wc: %llu, bc: %llu, iters:%u\n", (total_cycs / (PERF_ITERS)) / 2, wc_cycs / 2, bc_cycs / 2, PERF_ITERS); + testing = 0; + /* done testing! free the spin thread! */ + while (1) ; +// sl_thd_free(spin_thd); + +// sl_thd_exit(); +} + +void +test_thd_spinfn(void *data) +{ + thdid_t yield_to = sl_thd_thdid(perf_thd); + assert(sl_thd_curr() == spin_thd); + + while (likely(testing)) { + //rdtscll(mid_cycs); + switched = 1; + //sl_thd_dispatch(perf_thd, cos_sched_sync(), spin_thd); + //printc("b"); + //sl_thd_yield(yield_to); + //sl_thd_yield_thd_c(spin_thd, perf_thd); + sl_thd_yield_thd(perf_thd); + } + + //sl_thd_dispatch(perf_thd, cos_sched_sync(), spin_thd); + sl_thd_yield(yield_to); + //sl_thd_yield_thd_c(spin_thd, perf_thd); + //sl_thd_yield_thd(perf_thd); + //assert(0); +} + +void +test_thd_fn(void *data) +{ + while (1) { + int workiters = WORKITERS * ((int)data); + + printc("%c", 'a' + (int)data); + //SPIN(workiters); + sl_thd_yield(0); + } +} + +void +test_yield_perf(void) +{ + union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 31}}; + + assert(NUM_CPU == 1); + + spin_thd = sl_thd_alloc(test_thd_spinfn, NULL); + assert(spin_thd); + sl_thd_param_set(spin_thd, sp.v); + PRINTC("Spin thread %u:%lu created\n", sl_thd_thdid(spin_thd), sl_thd_thdcap(spin_thd)); + + perf_thd = sl_thd_alloc(test_thd_perffn, NULL); + assert(perf_thd); + sl_thd_param_set(perf_thd, sp.v); + PRINTC("Perf thread %u:%lu created\n", sl_thd_thdid(perf_thd), sl_thd_thdcap(perf_thd)); + + sl_thd_yield(sl_thd_thdid(perf_thd)); + //sl_thd_dispatch(perf_thd, cos_sched_sync(), sl_thd_curr()); + //sl_thd_yield_thd_c(sl_thd_curr(), perf_thd); + //sl_thd_yield_thd(perf_thd); + while (1); +} + +void +test_yields(void) +{ + int i; + struct sl_thd * threads[N_TESTTHDS]; + union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 10}}; + + for (i = 0; i < N_TESTTHDS; i++) { + threads[i] = sl_thd_alloc(test_thd_fn, 
(void *)i); + assert(threads[i]); + sl_thd_param_set(threads[i], sp.v); + PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); + } +} + +void +cos_init(void) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + static int first_time = 1, init_done = 0; + + PRINTC("Unit-test for the scheduling library (sl) with capmgr usage\n"); + PRINTC("CPU cycles per sec: %u\n", cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE)); + + if (first_time) { + first_time = 0; + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_init(); + cos_rdtscp_calib(&rdtscp_min, &rdtscp_avg, &rdtscp_max); + PRINTC("RDTSCP MIN:%llu MAX:%llu AVG:%llu\n", rdtscp_min, rdtscp_max, rdtscp_avg); + + init_done = 1; + } else { + while (!init_done) ; + + cos_defcompinfo_sched_init(); + } + + sl_init(SL_MIN_PERIOD_US); + hypercall_comp_init_done(); + + test_yield_perf(); + //test_yields(); + + sl_sched_loop_nonblock(); + + assert(0); + + return; +} diff --git a/src/components/implementation/tests/unit_schedtests/Makefile b/src/components/implementation/tests/unit_schedtests/Makefile index e46827dc8d..1735aff577 100644 --- a/src/components/implementation/tests/unit_schedtests/Makefile +++ b/src/components/implementation/tests/unit_schedtests/Makefile @@ -2,7 +2,7 @@ COMPONENT=unit_schedlibtests.o INTERFACES= DEPENDENCIES= IF_LIB= -ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_fprr -lsl_thd_static_backend +ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_rr -lsl_thd_static_backend include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/unit_schedtests/inv.S b/src/components/implementation/tests/unit_schedtests/inv.S new file mode 120000 index 0000000000..b9e55311b4 --- /dev/null +++ b/src/components/implementation/tests/unit_schedtests/inv.S @@ -0,0 +1 @@ +../kernel_tests/inv.S \ No 
newline at end of file diff --git a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c index 2ca97b36ff..807776f25c 100644 --- a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c +++ b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c @@ -13,6 +13,7 @@ #include #include #include +#include /* sl also defines a SPIN macro */ #undef SPIN @@ -31,18 +32,208 @@ #define N_TESTTHDS 8 #define WORKITERS 10000 +#define N_TESTTHDS_PERF 2 +#define PERF_ITERS 1000000 + +#define MAGIC_RET 0xDEADBEEF + +#undef INV_TEST +static volatile cycles_t mid_cycs = 0; +static volatile int testing = 1; + + +int +test_serverfn(int a, int b, int c) +{ + //rdtscll(midinv_cycles[cos_cpuid()]); + return MAGIC_RET; +} + +extern void *__inv_test_serverfn(int a, int b, int c); + +static inline int +call_cap_mb(u32_t cap_no, int arg1, int arg2, int arg3) +{ + int ret; + + /* + * Which stack should we use for this invocation? Simple, use + * this stack, at the current sp. This is essentially a + * function call into another component, with odd calling + * conventions. 
+ */ + cap_no = (cap_no + 1) << COS_CAPABILITY_OFFSET; + + __asm__ __volatile__("pushl %%ebp\n\t" + "movl %%esp, %%ebp\n\t" + "movl %%esp, %%edx\n\t" + "movl $1f, %%ecx\n\t" + "sysenter\n\t" + "1:\n\t" + "popl %%ebp" + : "=a"(ret) + : "a"(cap_no), "b"(arg1), "S"(arg2), "D"(arg3) + : "memory", "cc", "ecx", "edx"); + + return ret; +} + +sinvcap_t sinv_cap = 0; + +static inline void +test_inv_setup(void) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + compcap_t cc; + sinvcap_t ic; + int i; + unsigned int ret; + + cc = cos_comp_alloc(ci, ci->captbl_cap, ci->pgtbl_cap, 0, (vaddr_t)NULL, 0); + assert(cc > 0); + ic = cos_sinv_alloc(ci, cc, (vaddr_t)__inv_test_serverfn, 0); + assert(ic > 0); + ret = call_cap_mb(ic, 1, 2, 3); + assert(ret == MAGIC_RET); + + sinv_cap = ic; +} + +static struct sl_thd *perf_thd = NULL, *spin_thd = NULL; + +void +test_thd_perffn(void *data) +{ + cycles_t start_cycs = 0, end_cycs = 0, wc_cycs = 0, total_cycs = 0; + unsigned int i = 0; + struct sl_thd *c = sl_thd_curr(); + + rdtscll(start_cycs); + sl_thd_yield_thd(spin_thd); + rdtscll(end_cycs); + assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + for (i = 0; i < PERF_ITERS; i++) { + cycles_t diff1_cycs = 0, diff2_cycs = 0; + + mid_cycs = 0; + rdtscll(start_cycs); + sl_thd_yield_thd_c(c, spin_thd); + rdtscll(end_cycs); + assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + diff1_cycs = mid_cycs - start_cycs; + diff2_cycs = end_cycs - mid_cycs; + + if (diff1_cycs > wc_cycs) wc_cycs = diff1_cycs; + if (diff2_cycs > wc_cycs) wc_cycs = diff2_cycs; + total_cycs += (diff1_cycs + diff2_cycs); + } + + PRINTC("SWITCH UBENCH: avg: %llu, wc: %llu, iters:%u\n", (total_cycs / (2 * PERF_ITERS)), wc_cycs, PERF_ITERS); + testing = 0; + /* done testing! let the spinfn cleanup! 
*/ + sl_thd_yield_thd(spin_thd); + + sl_thd_exit(); +} + +void +test_inv_perffn(void *data) +{ + cycles_t start_cycs = 0, end_cycs = 0, wc_cycs = 0, total_cycs = 0; + unsigned int i = 0; + struct sl_thd *c = sl_thd_curr(); + + test_inv_setup(); + + rdtscll(start_cycs); + sl_thd_yield_thd(spin_thd); + rdtscll(end_cycs); + assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + for (i = 0; i < PERF_ITERS; i++) { + cycles_t diff_cycs = 0; + int ret; + + sl_thd_yield_thd_c(c, spin_thd); + mid_cycs = 0; + rdtscll(start_cycs); + ret = call_cap_mb(sinv_cap, 1, 2, 3); + rdtscll(end_cycs); + assert(ret == (int)MAGIC_RET); +// assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); +// +// diff1_cycs = mid_cycs - start_cycs; +// diff2_cycs = end_cycs - mid_cycs; +// +// if (diff1_cycs > wc_cycs) wc_cycs = diff1_cycs; +// if (diff2_cycs > wc_cycs) wc_cycs = diff2_cycs; +// total_cycs += (diff1_cycs + diff2_cycs); + diff_cycs = end_cycs - start_cycs; + if (diff_cycs > wc_cycs) wc_cycs = diff_cycs; + total_cycs += diff_cycs; + } + + PRINTC("INV UBENCH: avg: %llu, wc: %llu, iters:%u\n", (total_cycs / PERF_ITERS), wc_cycs, PERF_ITERS); + testing = 0; + /* done testing! let the spinfn cleanup! 
*/ + sl_thd_yield_thd(spin_thd); + + sl_thd_exit(); +} + +void +test_thd_spinfn(void *data) +{ + struct sl_thd *c = sl_thd_curr(); + + while (likely(testing)) { + rdtscll(mid_cycs); + sl_thd_yield_thd_c(c, perf_thd); + } + + sl_thd_exit(); +} + void test_thd_fn(void *data) { while (1) { int workiters = WORKITERS * ((int)data); - printc("%d", (int)data); - SPIN(workiters); + printc("%c", 'a' + (int)data); + //SPIN(workiters); sl_thd_yield(0); } } +void +test_yield_perf(void) +{ + int i; + struct sl_thd *threads[N_TESTTHDS_PERF]; + union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 31}}; + + for (i = 0; i < N_TESTTHDS_PERF; i++) { + if (i == 1) { +#ifdef INV_TEST + threads[i] = sl_thd_alloc(test_inv_perffn, (void *)&threads[0]); +#else + threads[i] = sl_thd_alloc(test_thd_perffn, (void *)&threads[0]); +#endif + perf_thd = threads[i]; + } else { + threads[i] = sl_thd_alloc(test_thd_spinfn, NULL); + spin_thd = threads[i]; + } + assert(threads[i]); + sl_thd_param_set(threads[i], sp.v); + PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); + } +} + void test_yields(void) { @@ -51,9 +242,10 @@ test_yields(void) union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 10}}; for (i = 0; i < N_TESTTHDS; i++) { - threads[i] = sl_thd_alloc(test_thd_fn, (void *)(intptr_t)(i + 1)); + threads[i] = sl_thd_alloc(test_thd_fn, (void *)i); assert(threads[i]); sl_thd_param_set(threads[i], sp.v); + PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); } } @@ -148,12 +340,14 @@ cos_init(void) printc("Unit-test for the scheduling library (sl)\n"); cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_defcompinfo_init(); - sl_init(SL_MIN_PERIOD_US); + cos_defcompinfo_llinit(); + cos_dcb_info_init_curr(); + sl_init(SL_MIN_PERIOD_US*100); - // test_yields(); - // test_blocking_directed_yield(); - test_timeout_wakeup(); + test_yield_perf(); + 
//test_yields(); + //test_blocking_directed_yield(); + //test_timeout_wakeup(); sl_sched_loop_nonblock(); diff --git a/src/components/implementation/tests/unit_slrcv/Makefile b/src/components/implementation/tests/unit_slrcv/Makefile new file mode 100644 index 0000000000..3500d01777 --- /dev/null +++ b/src/components/implementation/tests/unit_slrcv/Makefile @@ -0,0 +1,8 @@ +COMPONENT=unit_slrcvtest.o +INTERFACES= +DEPENDENCIES=capmgr schedinit work +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format $(LIBSLCAPMGR) -lsl_mod_fprr -lsl_thd_static_backend -lcos_dcb -lsl_blkpt + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/unit_slrcv/init.c b/src/components/implementation/tests/unit_slrcv/init.c new file mode 100644 index 0000000000..aa5a85741c --- /dev/null +++ b/src/components/implementation/tests/unit_slrcv/init.c @@ -0,0 +1,238 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct sl_xcore_thd *ping; +static struct sl_xcore_thd *pong; + +#define HPET_PERIOD_TEST_US 20000 + +#define WORK_US (1000) + +static inline void +ping_fn(void *d) +{ + asndcap_t s = *(asndcap_t *)d; + + while (1) { + printc("s"); + int r = cos_asnd(s, 0); + + assert(r == 0); + work_usecs(WORK_US); + } + sl_thd_exit(); +} + +unsigned int iter = 0; +volatile cycles_t st = 0, en = 0, tot = 0, wc = 0; +CRT_CHAN_STATIC_ALLOC(c0, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c1, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c2, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c3, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c4, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c5, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_TYPE_PROTOTYPES(test, int, 4); + +#define PIPELINE_LEN 4 +#define PRIO_START (TCAP_PRIO_MAX + 10 + PIPELINE_LEN + 1) +#define PRIO_INT (PRIO_START + 1) +#define ITERS 100000 
+static cycles_t vals[ITERS] = { 0 }; +static int pipe_line = 0; +static int pipe_send = 0, pipe_rcv = 0; + +static inline void +chrcv(int i) +{ + int r; + + if (i == 0) { + assert(ps_cas(&pipe_rcv, 0, PIPELINE_LEN)); + } + + //printc("[r%d,%d]", i, pipe_line); + switch(i) { + case 0: crt_chan_recv_test(c0, &r); break; + case 1: crt_chan_recv_test(c1, &r); break; + case 2: crt_chan_recv_test(c2, &r); break; + case 3: crt_chan_recv_test(c3, &r); break; + case 4: crt_chan_recv_test(c4, &r); break; + case 5: crt_chan_recv_test(c5, &r); break; + default: assert(0); + } + assert(ps_faa(&pipe_line, -1) == 1); + //printc("[d%d,%d]", i, pipe_line); + assert(ps_faa(&pipe_rcv, -1) == (PIPELINE_LEN - i)); +} + +static inline void +chsnd(int i) +{ + int s = 0xDEAD0000 | i; + + if (i == 0) { + assert(ps_cas(&pipe_send, 0, PIPELINE_LEN)); + } + assert(ps_faa(&pipe_send, -1) == (PIPELINE_LEN - i)); + //printc("[s%d,%d]", i, pipe_line); + assert(ps_faa(&pipe_line, 1) == 0); + switch(i) { + case 0: crt_chan_send_test(c0, &s); break; + case 1: crt_chan_send_test(c1, &s); break; + case 2: crt_chan_send_test(c2, &s); break; + case 3: crt_chan_send_test(c3, &s); break; + case 4: crt_chan_send_test(c4, &s); break; + case 5: crt_chan_send_test(c5, &s); break; + default: assert(0); + } + //printc("[o%d,%d]", i, pipe_line); +} + +static inline void +chinit(int i, struct sl_thd *s, struct sl_thd *r) +{ + switch(i) { + case 0: crt_chan_init_test(c0); break; + case 1: crt_chan_p2p_init_test(c1, s, r); break; + case 2: crt_chan_p2p_init_test(c2, s, r); break; + case 3: crt_chan_p2p_init_test(c3, s, r); break; + case 4: crt_chan_p2p_init_test(c4, s, r); break; + case 5: crt_chan_p2p_init_test(c5, s, r); break; + default: assert(0); + } +} + +static inline void +work_fn(void *x) +{ + int chid = (int)x; + while (1) { + chrcv(chid); + + if (likely(chid + 1 < PIPELINE_LEN)) chsnd(chid + 1); + else { + rdtscll(en); + if (iter >= ITERS) continue; + assert(en > st); + cycles_t diff = en - st; + if 
(diff > wc) wc = diff; + //printc("%llu\n", diff); + vals[iter] = diff; + tot += diff; + iter ++; + if (unlikely(iter == ITERS)) { + int i; + for (i = 0; i < ITERS; i++) printc("%llu\n", vals[i]); + PRINTC("%d: %llu %llu\n", iter, tot / iter, wc); + iter = 0; + wc = tot = 0; + } + } + } + sl_thd_exit(); +} + +struct sl_thd *wt[PIPELINE_LEN] = { NULL }; + +static inline void +pong_fn(arcvcap_t r, void *d) +{ + PRINTC("Hpet Register\n"); + int a = capmgr_hw_periodic_attach(HW_HPET_PERIODIC, cos_thdid(), HPET_PERIOD_TEST_US); + assert(a == 0); + + while (1) { + //printc("I"); + int p = sl_thd_rcv(RCV_ULONLY); + //work_usecs(WORK_US); + rdtscll(st); + chsnd(0); + if (iter == ITERS) capmgr_hw_detach(HW_HPET_PERIODIC); + } + sl_thd_exit(); +} + +void +cos_init(void *d) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + int i; + static volatile unsigned long init_done[NUM_CPU] = { 0 }; + static volatile arcvcap_t r = 0; + static volatile asndcap_t s = 0; + unsigned int cycs_per_us = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + +// if (NUM_CPU == 2) { +// assert(0); // need to rework.. 
+// if (cos_cpuid() == 0) { +// cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); +// cos_defcompinfo_llinit(); +// cos_dcb_info_init_curr(); +// sl_init(SL_MIN_PERIOD_US); +// +// struct sl_thd *t = sl_thd_aep_alloc(pong_fn, NULL, 0, 0, 0, 0); +// assert(t); +// sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); +// r = sl_thd_rcvcap(t); +// assert(r); +// } else { +// while (!ps_load(&init_done[0])) ; +// +// cos_defcompinfo_sched_init(); +// cos_dcb_info_init_curr(); +// sl_init(SL_MIN_PERIOD_US); +// +// struct sl_thd *t = sl_thd_alloc(ping_fn, (void *)&s); +// assert(t); +// sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); +// +// while (!r) ; +// s = cos_asnd_alloc(ci, r, ci->captbl_cap); +// assert(s); +// } +// } else { + assert(NUM_CPU == 1); + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_init(); + sl_init(SL_MIN_PERIOD_US*100); + //int i; + struct sl_thd *rt = sl_thd_aep_alloc(pong_fn, NULL, 0, 0, 0, 0); + assert(rt); + + //sl_thd_param_set(rt, sched_param_pack(SCHEDP_PRIO, PRIO_INT)); + for (i = 0; i < PIPELINE_LEN; i++) { + wt[i] = sl_thd_alloc(work_fn, (void *)i); + assert(wt[i]); + //sl_thd_param_set(wt[i], sched_param_pack(SCHEDP_PRIO, PRIO_START-i)); + if (i == 0) chinit(i, 0, 0); + else chinit(i, wt[i-1], wt[i]); + } + +// } + ps_faa(&init_done[cos_cpuid()], 1); + + /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! 
*/ + for (i = 0; i < NUM_CPU; i++) { + while (!ps_load(&init_done[i])) ; + } + PRINTC("Int component init done!\n"); + //hypercall_comp_init_done(); + schedinit_child(); + for (i = 0; i < PIPELINE_LEN; i++) sl_thd_param_set(wt[i], sched_param_pack(SCHEDP_PRIO, PRIO_START-i)); + sl_thd_param_set(rt, sched_param_pack(SCHEDP_PRIO, PRIO_INT)); + + sl_sched_loop(); + + PRINTC("Should never get here!\n"); + assert(0); +} diff --git a/src/components/implementation/tests/unit_slxcore/Makefile b/src/components/implementation/tests/unit_slxcore/Makefile new file mode 100644 index 0000000000..0bc62b21b8 --- /dev/null +++ b/src/components/implementation/tests/unit_slxcore/Makefile @@ -0,0 +1,8 @@ +COMPONENT=unit_slxcoretests.o +INTERFACES= +DEPENDENCIES= +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_rr -lsl_thd_static_backend -lcos_dcb + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/unit_slxcore/init.c b/src/components/implementation/tests/unit_slxcore/init.c new file mode 100644 index 0000000000..7038d767fc --- /dev/null +++ b/src/components/implementation/tests/unit_slxcore/init.c @@ -0,0 +1,86 @@ +#include +#include +#include +#include +#include +#include + +#define MAX_PONG 20 +static struct sl_xcore_thd *ping; +static struct sl_xcore_thd *pong[MAX_PONG]; + +static inline void +ping_fn(void *d) +{ + int k = 0; + + while (1) { + sl_xcore_thd_wakeup(pong[k % MAX_PONG]); + k++; + } +} + +static inline void +pong_fn(void *d) +{ + while (1) { + sl_thd_block(0); + } +} + +void +cos_init(void *d) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + int i; + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + static unsigned b1 = 0, b2 = 0, b3 = 0; + + if (ps_cas(&first, NUM_CPU + 1, cos_cpuid())) { + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + 
cos_defcompinfo_llinit(); + } else { + while (!ps_load(&init_done[first])) ; + + cos_defcompinfo_sched_init(); + } + cos_dcb_info_init_curr(); + ps_faa(&init_done[cos_cpuid()], 1); + + /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! */ + for (i = 0; i < NUM_CPU; i++) { + while (!ps_load(&init_done[i])) ; + } + sl_init(SL_MIN_PERIOD_US); + /* barrier, wait for sl_init to be done on all cores */ + ps_faa(&b1, 1); + while (ps_load(&b1) != NUM_CPU) ; + if (cos_cpuid()) { + for (i = 0; i < MAX_PONG; i++) { + struct sl_thd *t = sl_thd_alloc(pong_fn, NULL); + + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); + pong[i] = sl_xcore_thd_lookup(sl_thd_thdid(t)); + assert(pong[i]); + } + } else { + struct sl_thd *t = sl_thd_alloc(ping_fn, NULL); + + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); + + ping = sl_xcore_thd_lookup(sl_thd_thdid(t)); + assert(ping); + } + ps_faa(&b2, 1); + while (ps_load(&b2) != NUM_CPU) ; + PRINTC("Ready!"); +// hypercall_comp_init_done(); + + sl_sched_loop_nonblock(); + + PRINTC("Should never get here!\n"); + assert(0); +} diff --git a/src/components/include/cirque.h b/src/components/include/cirque.h new file mode 100644 index 0000000000..8c63772322 --- /dev/null +++ b/src/components/include/cirque.h @@ -0,0 +1,128 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ +#ifndef CIRQUE_H +#define CIRQUE_H + +/* remember to use multi-core locks as these are really single producer, single consumer */ +#define CIRQUE_MAX_SZ 4096 + +#define CIRQUE_PROTOTYPE(name, type) \ +struct cirque_##name { \ + type wrk[CIRQUE_MAX_SZ]; \ + size_t size; \ + size_t mask; \ + \ + volatile long head; \ + volatile long tail; \ +}; \ + \ +static inline void \ +cirque_init_##name(struct cirque_##name *q, size_t sz) \ +{ \ + memset(q, 0, sizeof(struct cirque_##name)); \ + \ + if (sz) { \ + /* only for size with pow of 2 */ \ + assert(round_to_pow2(sz) == sz); \ + assert(sz <= CIRQUE_MAX_SZ); \ + } else { \ + sz = CIRQUE_MAX_SZ; \ + } \ + \ + q->head = q->tail = 0; \ + q->size = sz; \ + q->mask = sz - 1; \ +} \ + \ +static inline int \ +cirque_insert_##name(struct cirque_##name *q, type *w) \ +{ \ + long ct = ps_load((unsigned long *)&q->tail); \ + long ch = ps_load((unsigned long *)&q->head); \ + \ + if ((ct == 0 && ch == q->mask) || \ + ((ch + 1) & q->mask) == ct) return -ENOSPC; \ + \ + ps_mem_fence(); \ + if (!ps_cas((unsigned long *)q->head, ch, \ + (ch + 1) & q->mask)) return -EAGAIN; \ + q->wrk[ch] = *w; \ + \ + return 0; \ +} \ + \ +static inline int \ +cirque_delete_##name(struct cirque_##name *q, type *w) \ +{ \ + long ct = ps_load((unsigned long *)&q->tail); \ + long ch = ps_load((unsigned long *)&q->head); \ + \ + if (ct >= ch) return -ENOENT; \ + \ + *w = q->wrk[ct]; \ + if (!ps_cas((unsigned long *)q->tail, ct, \ + (ct + 1) & q->mask)) return -EAGAIN; \ + \ + return 0; \ +} \ + \ +static inline int \ +cirque_peek_##name(struct cirque_##name *q, type *w) \ +{ \ + long ct = ps_load((unsigned long *)&q->tail); \ + long ch = ps_load((unsigned long *)&q->head); \ + \ + if (ct >= ch) return -ENOENT; \ + \ + *w = q->wrk[ct]; \ + \ + return 0; \ +} \ + \ +static inline type * \ +cirque_allocptr_##name(struct cirque_##name *q) \ +{ \ + long 
ct = ps_load((unsigned long *)&q->tail); \ + long ch = ps_load((unsigned long *)&q->head); \ + \ + if ((ct == 0 && ch == q->mask) || \ + ((ch + 1) & q->mask) == ct) return NULL; \ + \ + ps_mem_fence(); \ + if (!ps_cas((unsigned long *)q->head, ch, \ + (ch + 1) & q->mask)) return NULL; \ + \ + return &q->wrk[ch]; \ +} \ + \ +static inline void \ +cirque_freeptr_##name(struct cirque_##name *q) \ +{ \ + long ct = ps_load((unsigned long *)&q->tail); \ + long ch = ps_load((unsigned long *)&q->head); \ + \ + if (ct >= ch) return; \ + \ + if (ps_cas((unsigned long *)q->tail, ct, (ct + 1) & q->mask)) { \ + memset(&q->wrk[ct], 0, sizeof(type)); \ + } \ + \ + return; \ +} \ + \ +static inline type * \ +cirque_peekptr_##name(struct cirque_##name *q) \ +{ \ + long ct = ps_load((unsigned long *)&q->tail); \ + long ch = ps_load((unsigned long *)&q->head); \ + \ + if (ct >= ch) return NULL; \ + \ + return &q->wrk[ct]; \ +} + +#endif /* CIRQUE_H */ diff --git a/src/components/include/cos_asm_simple_stacks.h b/src/components/include/cos_asm_simple_stacks.h index b6dd7b9e21..46eb349cf7 100644 --- a/src/components/include/cos_asm_simple_stacks.h +++ b/src/components/include/cos_asm_simple_stacks.h @@ -16,7 +16,8 @@ shr $MAX_STACK_SZ_BYTE_ORDER, %eax; \ shr $16, %edx; \ pushl %edx; \ - pushl %eax; + pushl %eax; \ + pushl $0; #define COS_ASM_GET_STACK \ COS_ASM_GET_STACK_BASIC \ diff --git a/src/components/include/cos_component.h b/src/components/include/cos_component.h index aa64a093ec..e229fdac00 100644 --- a/src/components/include/cos_component.h +++ b/src/components/include/cos_component.h @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -53,6 +54,7 @@ call_cap_asm(u32_t cap_no, u32_t op, int arg1, int arg2, int arg3, int arg4) return ret; } +/* NOTE: make sure the memory locations r1, r2 & r3 are at least word-sized as the register stores are word-sized! 
*/ static inline int call_cap_retvals_asm(u32_t cap_no, u32_t op, int arg1, int arg2, int arg3, int arg4, unsigned long *r1, unsigned long *r2, unsigned long *r3) @@ -84,6 +86,7 @@ call_cap_retvals_asm(u32_t cap_no, u32_t op, int arg1, int arg2, int arg3, int a return ret; } +/* NOTE: make sure the memory locations r1 & r2 are at least word-sized as the register stores are word-sized! */ static inline int call_cap_2retvals_asm(u32_t cap_no, u32_t op, int arg1, int arg2, int arg3, int arg4, unsigned long *r1, unsigned long *r2) @@ -145,9 +148,8 @@ extern struct cos_component_information cos_comp_info; static inline long get_stk_data(int offset) { - unsigned long curr_stk_pointer; + unsigned long curr_stk_pointer = 0; - __asm__("movl %%esp, %0;" : "=r"(curr_stk_pointer)); /* * We save the CPU_ID and thread id in the stack for fast * access. We want to find the struct cos_stk (see the stkmgr @@ -155,7 +157,15 @@ get_stk_data(int offset) * cpu_id. This struct is at the _top_ of the current stack, * and cpu_id is at the top of the struct (it is a u32_t). 
*/ - return *(long *)((curr_stk_pointer & ~(COS_STACK_SZ - 1)) + COS_STACK_SZ - offset * sizeof(u32_t)); + return *(long *)((((unsigned long)(&curr_stk_pointer)) & ~(COS_STACK_SZ - 1)) + COS_STACK_SZ - offset * sizeof(u32_t)); +} + +static inline void +set_stk_data(int offset, long val) +{ + unsigned long curr_stk_pointer = 0; + + *(long *)((((unsigned long)&curr_stk_pointer) & ~(COS_STACK_SZ - 1)) + COS_STACK_SZ - offset * sizeof(u32_t)) = val; } #define GET_CURR_CPU cos_cpuid() @@ -195,6 +205,18 @@ cos_thdid(void) return cos_get_thd_id(); } +static void * +cos_get_slthd_ptr(void) +{ + return (void *)get_stk_data(SLTHDPTR_OFFSET); +} + +static void +cos_set_slthd_ptr(void *ptr) +{ + set_stk_data(SLTHDPTR_OFFSET, (long)ptr); +} + #define ERR_THROW(errval, label) \ do { \ ret = errval; \ @@ -210,12 +232,36 @@ cos_spd_id(void) static inline void * cos_get_heap_ptr(void) { - return (void *)cos_comp_info.cos_heap_ptr; + /* page at heap_ptr is actually the SCB_PAGE for any component. */ + unsigned int off = COS_SCB_SIZE + (PAGE_SIZE * NUM_CPU); + void *heap_ptr = ((void *)(cos_comp_info.cos_heap_ptr + off)); + + return heap_ptr; +} + +static inline struct cos_scb_info * +cos_scb_info_get(void) +{ + return (struct cos_scb_info *)(cos_comp_info.cos_heap_ptr); +} + +static inline struct cos_scb_info * +cos_scb_info_get_core(void) +{ + return cos_scb_info_get() + cos_cpuid(); +} + +static inline struct cos_dcb_info * +cos_init_dcb_get(void) +{ + /* created at boot-time for the first component in the system! */ + return (struct cos_dcb_info *)(cos_comp_info.cos_heap_ptr + COS_SCB_SIZE + (PAGE_SIZE * cos_cpuid())); } static inline void cos_set_heap_ptr(void *addr) { + /* FIXME: fix this for the hack if it's not going to work! 
*/ cos_comp_info.cos_heap_ptr = (vaddr_t)addr; } diff --git a/src/components/include/cos_dcb.h b/src/components/include/cos_dcb.h new file mode 100644 index 0000000000..1fc6298da6 --- /dev/null +++ b/src/components/include/cos_dcb.h @@ -0,0 +1,28 @@ +#ifndef COS_DCB_H +#define COS_DCB_H + +#include +#include + +#define COS_DCB_PERPG_MAX (PAGE_SIZE / sizeof(struct cos_dcb_info)) + +#define COS_DCB_MAX_CAPS (MAX_NUM_THREADS / COS_DCB_PERPG_MAX + 1) + +struct cos_dcbinfo_data { + dcbcap_t dcbcaps[COS_DCB_MAX_CAPS]; + vaddr_t dcbaddr[COS_DCB_MAX_CAPS]; + dcboff_t curr_cap_off; + unsigned short curr_cap; + + struct cos_compinfo *ci; +} CACHE_ALIGNED; + +void cos_dcb_info_init(struct cos_dcbinfo_data *cdi, struct cos_compinfo *ci); +void cos_dcb_info_init_ext(struct cos_dcbinfo_data *cdi, struct cos_compinfo *ci, dcbcap_t initdcbcap, vaddr_t initdcbaddr, dcboff_t start_off); +dcbcap_t cos_dcb_info_alloc(struct cos_dcbinfo_data *cdi, dcboff_t *dcboff, vaddr_t *dcbaddr); + +void cos_dcb_info_init_curr(void); +void cos_dcb_info_init_curr_ext(dcbcap_t initdcbcap, vaddr_t initdcbaddr, dcboff_t st_off); +dcbcap_t cos_dcb_info_alloc_curr(dcboff_t *dcboff, vaddr_t *dcbaddr); + +#endif /* COS_DCB_H */ diff --git a/src/components/include/cos_debug.h b/src/components/include/cos_debug.h index c646c1b977..6e8bb00825 100644 --- a/src/components/include/cos_debug.h +++ b/src/components/include/cos_debug.h @@ -9,7 +9,7 @@ #endif #ifndef PRINT_FN -#define PRINT_FN prints +#define PRINT_FN PRINTC #endif #include diff --git a/src/components/include/cos_defkernel_api.h b/src/components/include/cos_defkernel_api.h index fa083c27ef..b98796c129 100644 --- a/src/components/include/cos_defkernel_api.h +++ b/src/components/include/cos_defkernel_api.h @@ -36,7 +36,7 @@ struct cos_aep_info { thdid_t tid; arcvcap_t rcv; cos_aepthd_fn_t fn; - void * data; + void *data; }; /* Default Component information */ @@ -53,7 +53,7 @@ cos_aepthd_fn(void *data) { struct cos_aep_info *aep_info = (struct 
cos_aep_info *)data; cos_aepthd_fn_t aep_fn = aep_info->fn; - void * fn_data = aep_info->data; + void *fn_data = aep_info->data; (aep_fn)(aep_info->rcv, fn_data); @@ -81,6 +81,7 @@ struct cos_aep_info *cos_sched_aep_get(struct cos_defcompinfo *defci); * capabilities layout. */ void cos_defcompinfo_init(void); +void cos_defcompinfo_llinit(void); /* * cos_defcompinfo_init_ext: initialize the current component's global cos_defcompinfo struct using the parameters * passed. @@ -96,44 +97,49 @@ void cos_defcompinfo_sched_init(void); * cos_defcompinfo_child_alloc: called to create a new child component including initial capabilities like pgtbl, * captbl, compcap, aep. if is_sched is set, scheduling end-point will also be created for the child component, else, * the current component's scheduler will remain the scheduler for the child component. + * TODO: initdcb cap and initdcb addr? */ int cos_defcompinfo_child_alloc(struct cos_defcompinfo *child_defci, vaddr_t entry, vaddr_t heap_ptr, - capid_t cap_frontier, int is_sched); + capid_t cap_frontier, int is_sched, dcbcap_t *initdcbcap); /* * cos_aep_alloc: creates a new async activation end-point which includes thread, tcap and rcv capabilities. * struct cos_aep_info passed in, must not be stack allocated. */ -int cos_aep_alloc(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data); +int cos_aep_alloc(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, dcbcap_t dcap, dcboff_t doff); /* * cos_aep_alloc: creates a new async activation end-point, using an existing tcap. * struct cos_aep_info passed in, must not be stack allocated. */ -int cos_aep_tcap_alloc(struct cos_aep_info *aep, tcap_t tc, cos_aepthd_fn_t fn, void *data); +int cos_aep_tcap_alloc(struct cos_aep_info *aep, tcap_t tc, cos_aepthd_fn_t fn, void *data, dcbcap_t dcap, dcboff_t doff); /* * cos_initaep_alloc: create an initaep in the @child_dci and using sched->rcv as the parent, sets up cos_sched_ape_get(@child_dci) with the init capabilities. 
* if @sched == NULL, use the current scheduler in cos_sched_aep_get(cos_defcompinfo_get_cur()). * if @is_sched == 0, creates only the init thread (does not need @sched parameter) + * NOTE: dcbuaddr is the address in child_dci page-table. */ -int cos_initaep_alloc(struct cos_defcompinfo *child_dci, struct cos_aep_info *sched, int is_sched); +int cos_initaep_alloc(struct cos_defcompinfo *child_dci, struct cos_aep_info *sched, int is_sched, dcbcap_t dcap); /* * cos_initaep_tcap_alloc: same as cos_initaep_alloc with is_sched == 1, except it doesn't create a new tcap, * uses the tcap passed in @tc. + * NOTE: dcbuaddr is the address in child_dci page-table. */ -int cos_initaep_tcap_alloc(struct cos_defcompinfo *child_dci, tcap_t tc, struct cos_aep_info *sched); +int cos_initaep_tcap_alloc(struct cos_defcompinfo *child_dci, tcap_t tc, struct cos_aep_info *sched, dcbcap_t dcap); /* * cos_aep_alloc_ext: creates a new async activation end-point which includes thread, tcap and rcv capabilities in the child_dci component using sched_aep->rcv. * if @child_dci == NULL, create in the current component. + * NOTE: dcbuaddr is the address in child_dci page-table. */ -int cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *child_dci, struct cos_aep_info *sched_aep, thdclosure_index_t idx); +int cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *child_dci, struct cos_aep_info *sched_aep, thdclosure_index_t idx, dcbcap_t dcap, dcboff_t doff); /* * cos_aep_alloc_ext: creates a new async activation end-point which includes thread, tcap and rcv capabilities in the child_dci component using sched_aep->rcv. * if @child_dci == NULL, create in the current component. + * NOTE: dcbuaddr is the address in child_dci page-table. 
*/ -int cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *child_dci, struct cos_aep_info *sched_aep, tcap_t tc, thdclosure_index_t idx); +int cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *child_dci, struct cos_aep_info *sched_aep, tcap_t tc, thdclosure_index_t idx, dcbcap_t dcap, dcboff_t doff); /* * cos_defswitch: thread switch api using the default scheduling tcap and rcv. diff --git a/src/components/include/cos_kernel_api.h b/src/components/include/cos_kernel_api.h index 911f025e01..542290774d 100644 --- a/src/components/include/cos_kernel_api.h +++ b/src/components/include/cos_kernel_api.h @@ -54,6 +54,9 @@ typedef capid_t compcap_t; typedef capid_t captblcap_t; typedef capid_t pgtblcap_t; typedef capid_t hwcap_t; +typedef capid_t scbcap_t; +typedef capid_t dcbcap_t; +typedef unsigned short dcboff_t; /* Memory source information */ struct cos_meminfo { @@ -81,7 +84,7 @@ struct cos_compinfo { }; void cos_compinfo_init(struct cos_compinfo *ci, pgtblcap_t pgtbl_cap, captblcap_t captbl_cap, compcap_t comp_cap, - vaddr_t heap_ptr, capid_t cap_frontier, struct cos_compinfo *ci_resources); + vaddr_t heap_ptr, capid_t cap_frontier, struct cos_compinfo *ci_resources); /* * This only needs be called on compinfos that are managing resources * (i.e. likely only one). All of the capabilities will be relative @@ -107,24 +110,35 @@ int cos_pgtbl_intern_expandwith(struct cos_compinfo *ci, pgtblcap_t intern, vadd * This uses the next three functions to allocate a new component and * correctly populate ci (allocating all resources from ci_resources). 
*/ -int cos_compinfo_alloc(struct cos_compinfo *ci, vaddr_t heap_ptr, capid_t cap_frontier, vaddr_t entry, - struct cos_compinfo *ci_resources); +int cos_compinfo_alloc(struct cos_compinfo *ci, scbcap_t sc, vaddr_t heap_ptr, capid_t cap_frontier, vaddr_t entry, struct cos_compinfo *ci_resources); captblcap_t cos_captbl_alloc(struct cos_compinfo *ci); pgtblcap_t cos_pgtbl_alloc(struct cos_compinfo *ci); -compcap_t cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, vaddr_t entry); +compcap_t cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, scbcap_t scbc, vaddr_t entry, + vaddr_t scb_addr); +scbcap_t cos_scb_alloc(struct cos_compinfo *ci); +dcbcap_t cos_dcb_alloc(struct cos_compinfo *ci, pgtblcap_t ptc, vaddr_t dcb_uaddr); typedef void (*cos_thd_fn_t)(void *); -thdcap_t cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, cos_thd_fn_t fn, void *data); -thdcap_t cos_thd_alloc_ext(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t idx); +thdcap_t cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, cos_thd_fn_t fn, void *data, dcbcap_t dc, + dcboff_t dcboff); +thdcap_t cos_thd_alloc_ext(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t idx, dcbcap_t dc, + dcboff_t dcboff); /* Create the initial (cos_init) thread */ -thdcap_t cos_initthd_alloc(struct cos_compinfo *ci, compcap_t comp); +thdcap_t cos_initthd_alloc(struct cos_compinfo *ci, compcap_t comp, dcbcap_t dc); +int cos_thd_migrate(struct cos_compinfo *ci, thdcap_t thdc, cpuid_t core); +/* update the thdcap to migrated core */ +int cos_thdcap_migrate(struct cos_compinfo *ci, thdcap_t thdc); sinvcap_t cos_sinv_alloc(struct cos_compinfo *srcci, compcap_t dstcomp, vaddr_t entry, invtoken_t token); -arcvcap_t cos_arcv_alloc(struct cos_compinfo *ci, thdcap_t thdcap, tcap_t tcapcap, compcap_t compcap, arcvcap_t enotif); +arcvcap_t cos_arcv_alloc(struct cos_compinfo *ci, thdcap_t thdcap, tcap_t tcapcap, compcap_t compcap, + arcvcap_t enotif); asndcap_t 
cos_asnd_alloc(struct cos_compinfo *ci, arcvcap_t arcvcap, captblcap_t ctcap); void *cos_page_bump_alloc(struct cos_compinfo *ci); void *cos_page_bump_allocn(struct cos_compinfo *ci, size_t sz); +void *cos_dcbpg_bump_allocn(struct cos_compinfo *ci, size_t sz); +void *cos_scbpg_bump_allocn(struct cos_compinfo *ci, size_t sz); + capid_t cos_cap_cpy(struct cos_compinfo *dstci, struct cos_compinfo *srcci, cap_t srcctype, capid_t srccap); int cos_cap_cpy_at(struct cos_compinfo *dstci, capid_t dstcap, struct cos_compinfo *srcci, capid_t srccap); @@ -152,10 +166,11 @@ int cos_thd_mod(struct cos_compinfo *ci, thdcap_t c, void *tls_addr); /* set tls int cos_sched_asnd(asndcap_t snd, tcap_time_t timeout, arcvcap_t srcv, sched_tok_t stok); /* returns 0 on success and -EINVAL on failure */ int cos_asnd(asndcap_t snd, int yield); -/* returns non-zero if there are still pending events (i.e. there have been pending snds) */ -int cos_rcv(arcvcap_t rcv, rcv_flags_t flags, int *rcvd); +/* returns 0 on success */ +int cos_rcv(arcvcap_t rcv, rcv_flags_t flags); /* returns the same value as cos_rcv, but also information about scheduling events */ -int cos_sched_rcv(arcvcap_t rcv, rcv_flags_t flags, tcap_time_t timeout, int *rcvd, thdid_t *thdid, int *blocked, cycles_t *cycles, tcap_time_t *thd_timeout); +int cos_sched_rcv(arcvcap_t rcv, rcv_flags_t flags, tcap_time_t timeout, thdid_t *thdid, int *blocked, + cycles_t *cycles, tcap_time_t *thd_timeout); int cos_introspect(struct cos_compinfo *ci, capid_t cap, unsigned long op); @@ -188,11 +203,13 @@ int cos_tcap_merge(tcap_t dst, tcap_t rm); /* Hardware (interrupts) operations */ hwcap_t cos_hw_alloc(struct cos_compinfo *ci, u32_t bitmap); int cos_hw_attach(hwcap_t hwc, hwid_t hwid, arcvcap_t rcvcap); +int cos_hw_periodic_attach(hwcap_t hwc, hwid_t hwid, arcvcap_t rcvcap, unsigned int period); int cos_hw_detach(hwcap_t hwc, hwid_t hwid); void *cos_hw_map(struct cos_compinfo *ci, hwcap_t hwc, paddr_t pa, unsigned int len); int 
cos_hw_cycles_per_usec(hwcap_t hwc); int cos_hw_cycles_thresh(hwcap_t hwc); capid_t cos_capid_bump_alloc(struct cos_compinfo *ci, cap_t cap); +vaddr_t cos_page_bump_intern_valloc(struct cos_compinfo *ci, size_t sz); #endif /* COS_KERNEL_API_H */ diff --git a/src/components/include/cos_omp.h b/src/components/include/cos_omp.h new file mode 100644 index 0000000000..8933449ae9 --- /dev/null +++ b/src/components/include/cos_omp.h @@ -0,0 +1,50 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. + * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#ifndef COS_OMP_H +#define COS_OMP_H + +#include +#include +#include + +#define COS_OMP_MAX_NUM_THREADS (PART_MAX_THDS) + +struct cos_icv_data_env { + unsigned dyn_var; + unsigned nest_var; + unsigned nthreads_var; + unsigned run_sched_var; + unsigned bind_var; + unsigned thread_limit_var; + unsigned active_levels_var; + unsigned levels_var; + unsigned default_device_var; +}; + +struct cos_icv_global_env { + unsigned cancel_var; + unsigned max_task_priority_var; +}; + +struct cos_icv_implicittask_env { + unsigned place_partition_var; +}; + +struct cos_icv_device_env { + unsigned def_sched_var; + unsigned stacksize_var; + unsigned wait_policy_var; + unsigned max_active_levels_var; +}; + +extern void cos_omp_icv_data_init(struct cos_icv_data_env *icvde); +extern void cos_omp_icv_implitsk_init(struct cos_icv_implicittask_env *icvite); +extern void cos_omp_icv_device_init(struct cos_icv_device_env *icvdve, unsigned dev_no); +extern void cos_omp_init(void); + +#endif /* COS_OMP_H */ diff --git a/src/components/include/cos_rdtsc.h b/src/components/include/cos_rdtsc.h new file mode 100644 index 0000000000..d8ebfad445 --- /dev/null +++ b/src/components/include/cos_rdtsc.h @@ -0,0 +1,65 @@ +#ifndef COS_RDTSC_H +#define COS_RDTSC_H + +#include + +#define COS_RDTSCP_CALIB_ITERS 1000000 + +#define cos_rdtsc rdtscll + +/* Copied from seL4bench 
*/ +#define cos_rdtscp(var) do { \ + u32_t low, high; \ + asm volatile( \ + "movl $0, %%eax \n" \ + "movl $0, %%ecx \n" \ + "cpuid \n" \ + "rdtsc \n" \ + "movl %%edx, %0 \n" \ + "movl %%eax, %1 \n" \ + "movl $0, %%eax \n" \ + "movl $0, %%ecx \n" \ + "cpuid \n" \ + : \ + "=r"(high), \ + "=r"(low) \ + : \ + : "eax", "ebx", "ecx", "edx" \ + ); \ + (var) = (((u64_t)high) << 32ull) | ((u64_t)low); \ +} while(0) + +/* + * use this to calibrate the rdtscp and perhaps use + * min value to remove from your benchmarks + */ +static inline void +cos_rdtscp_calib(cycles_t *min, cycles_t *avg, cycles_t *max) +{ + int i; + volatile cycles_t st, en, mn = 0, mx = 0, total = 0; + + cos_rdtscp(st); + cos_rdtscp(en); + mn = mx = en - st; + + for (i = 0; i < COS_RDTSCP_CALIB_ITERS; i++) { + cycles_t diff; + + cos_rdtscp(st); + cos_rdtscp(en); + + diff = en - st; + total += diff; + if (diff < mn) mn = diff; + if (diff > mx) mx = diff; + } + + if (min) *min = mn; + if (max) *max = mx; + if (avg) *avg = total / COS_RDTSCP_CALIB_ITERS; + + return; +} + +#endif /* COS_RDTSC_H */ diff --git a/src/components/include/cos_ulsched_rcv.h b/src/components/include/cos_ulsched_rcv.h new file mode 100644 index 0000000000..60ff25d795 --- /dev/null +++ b/src/components/include/cos_ulsched_rcv.h @@ -0,0 +1,80 @@ +#ifndef COS_ULSCHED_RCV_H +#define COS_ULSCHED_RCV_H + +#include + +static inline int +__cos_sched_events_present(struct cos_sched_ring *r) +{ + return (ps_load(&r->tail) != ps_load(&r->head)); +} + +static inline int +cos_sched_ispending(void) +{ + struct cos_scb_info *scb_cpu = cos_scb_info_get_core(); + struct cos_sched_ring *r = &scb_cpu->sched_events; + + return ps_load(&r->more); +} + +static inline int +cos_sched_events_isempty(void) +{ + struct cos_scb_info *scb_cpu = cos_scb_info_get_core(); + struct cos_sched_ring *r = &scb_cpu->sched_events; + + return (ps_load(&r->tail) == ps_load(&r->head)) && !ps_load(&r->more); +} + +static inline int +__cos_sched_event_consume(struct 
cos_sched_ring *r, struct cos_sched_event *e) +{ + int f = 0; + + if (unlikely(!r || !__cos_sched_events_present(r))) return 0; + assert(e); + f = ps_upfaa((unsigned long *)&r->head, 1); + *e = r->event_buf[f]; +// memcpy((void *)e, (void *)&(r->event_buf[f]), sizeof(struct cos_sched_event)); + + return 1; +} + +/* if other than sched-thread calls this, races will need to be handled by the caller! */ +static inline int +cos_ul_sched_rcv(arcvcap_t rcv, rcv_flags_t rfl, tcap_time_t timeout, struct cos_sched_event *evt) +{ + int ret = 0; + struct cos_scb_info *scb_cpu = cos_scb_info_get_core(); + struct cos_sched_ring *r = &scb_cpu->sched_events; + + evt->tid = 0; + assert(scb_cpu); + /* a non-scheduler thread, should call with rcv == 0 to consume user-level events alone */ + if (__cos_sched_event_consume(r, evt) == 0 + && rcv && !(rfl & RCV_ULONLY)) { + + ret = cos_sched_rcv(rcv, rfl, timeout, &(evt->tid), (int *)&(evt->evt.blocked), + (cycles_t *)&(evt->evt.elapsed_cycs), (tcap_time_t *)&(evt->evt.next_timeout)); + if (unlikely(ret < 0)) return ret; + } + + return (ret || __cos_sched_events_present(r) || cos_sched_ispending()); +} + +static inline int +cos_ul_rcv(arcvcap_t rcv, rcv_flags_t rfl, tcap_time_t sched_timeout) +{ + struct cos_sched_event ev = { .tid = 0 }; + int ret = 0; + + if (likely(sched_timeout)) rfl |= RCV_SCHEDTIMEOUT; + ret = cos_sched_rcv(rcv, rfl, sched_timeout, &(ev.tid), (int *)&(ev.evt.blocked), + (cycles_t *)&(ev.evt.elapsed_cycs), (tcap_time_t *)&(ev.evt.next_timeout)); + assert(ev.tid == 0); + + return ret; +} + +#endif /* COS_ULSCHED_RCV_H */ diff --git a/src/components/include/crt_blkpt.h b/src/components/include/crt_blkpt.h new file mode 100644 index 0000000000..d647dc50d9 --- /dev/null +++ b/src/components/include/crt_blkpt.h @@ -0,0 +1,298 @@ +#ifndef CRT_BLKPT_H +#define CRT_BLKPT_H + +#include +#include +#include + +/*** + * The event count/block point is an abstraction to synchronize the + * blocking behavior of different threads 
on abstract events. The + * events are usually tied to a specific state of another + * data-structure (into which the blkpt is embedded). For example, a + * lock is taken and released thus generating an event for any + * blocking threads, or a ring buffer has a data item inserted into + * it, thus generating an event for any threads waiting for + * data. Concretely, we want a number of threads to be able to block, + * and a thread to be able to wake up one, or all of them. The + * challenge is solving a single race-condition: + * + * thd 0: check data-structure, determine the need for blocking and + * waiting for an event + * thd 0: preemption, switching to thd 1 + * thd 1: check data-structure, determine that an event is generated + * thd 1: call the scheduler, and wake all blocked threads (not + * including thd 0 yet) + * thd 1: preempt, and switch to thd 0 + * thd 0: call scheduler to block + * + * The resulting state is that thd 1 should have unblocked thd 0, but + * due to a race, the thd 0 will be blocked awaiting the *next* event + * that may never come. Event counts are meant to solve this + * problem. Traditional systems solve this problem using condition + * variables and a lock around the scheduling logic, but if you want + * to decouple the data-structure from the scheduler (e.g. as they are + * in different modes, or components), this is a fundamental problem. + * + * The event count abstraction: + * + * Assume the data-structure generating events has at least three + * states: + * S0: available + * S1: unavailable + * S2: unavailable & subscribed + * + * The transitions within the data-structure are: + * {S0->S1, S1->S0, S1->S2, S2->S0} + * + * Every transition into S0 is an abstract *event*. Threads that look + * at the state of the data-structure, and must block waiting for its + * state to change, wait for such an event to wakeup. + * + * The data-structure must define its own mapping to this state + * machine. 
A few examples: + * + * Mutexes: + * S0: Not locked. + * S1: Locked and held by thread 0. + * S2: Locked and held by thread 0, and threads 1...N contend the lock + * + * Ring buffer (for simplicity, assuming it never fills): + * S0: data items in ring buffer + * S1: no data in ring buffer + * S2: no data in ring buffer, and thread(s) are waiting for data + * + * The event counts are used to track the threads that use the + * data-structure when transitioning from S1->S2 (block thread), when + * it is in S2 (block additional threads), and when it transitions + * from S2->S0 (wakeup blocked threads). + * + * The event count is used in the following way: + * + * S0->S1: + * data-structure (DS) operation + * E.g. not locked -> locked, or + * dequeue from ring with single data item + * + * S1->S0: + * blkpt_checkpoint(ec) (not used) + * data-structure (DS) operation + * assert(blkpt_has_blocked(ec) == false) (as we're in S1) + * blkpt_trigger(ec) (won't do much as noone is blocked) + * E.g. unlock with no contention, or + * enqueue with no dequeuing threads + * + * S1->S2: + * cp = blkpt_checkpoint(ec) + * data-structure (DS) operation, determine we need to await event + * blkpt_wait(ec, cp) + * retry (this is why event counts can be used with lock-free data-structs) + * E.g. locked -> contended + * empty ring -> waiting for data + * + * S2->S0: + * data-structure (DS) operation + * assert(blkpt_has_blocked(ec) == true) (as we're in S2) + * blkpt_trigger(ec) (wake blocked threads!) + * E.g. unlock with contention, or + * enqueue with dequeuing threads + * + * Event count *optimization*: + * + * We prevent the race above using an epoch (count) for the events + * thus the name. However, to avoid rapid wraparound on the epoch, we + * only increment the epoch when the race condition is possible. That + * is to say, we only increment the event count when the + * data-structure has blocked threads. 
This not only delays + * wraparound, it also will avoid an atomic instruction for all + * operations that don't involve blocked threads (a common-case, + * exemplified by futexes, for example). + * + * Usage optimization: + * + * Because of the event counter optimization to only use expensive + * operations when triggering there are blocked threads, the user of + * this API can trigger whenever transitioning back to S0. + */ + +struct crt_blkpt { + sched_blkpt_id_t id; + /* most significant bit specifies blocked thds */ + sched_blkpt_epoch_t epoch_blocked; +}; + +struct crt_blkpt_checkpoint { + sched_blkpt_epoch_t epoch_blocked; +}; + +typedef enum { + CRT_BLKPT_UNIPROC = 1, /* are the event operations only called on a single core? */ + CRT_BLKPT_CRIT_SECT = 2, /* is only one thread ever going to trigger at a time? */ +} crt_blkpt_flags_t; + +#define CRT_BLKPT_EPOCH_BLKED_BITS (sizeof(sched_blkpt_epoch_t) * 8) +#define CRT_BLKPT_BLKED_MASK (1 << (CRT_BLKPT_EPOCH_BLKED_BITS - 2)) +#define CRT_BLKPT_BLKED(e) ((e) & CRT_BLKPT_BLKED_MASK) +#define CRT_BLKPT_EPOCH(e) ((e) & ~CRT_BLKPT_BLKED_MASK) + +/* Return != 0 on failure: no ids to allocate */ +static inline int +crt_blkpt_init(struct crt_blkpt *blkpt) +{ + sched_blkpt_id_t id; + + id = sched_blkpt_alloc(); + if (id == SCHED_BLKPT_NULL) return -1; + + *blkpt = (struct crt_blkpt){ + .id = id, + .epoch_blocked = 0 + }; + + return 0; +} + +static inline int +crt_blkpt_teardown(struct crt_blkpt *blkpt) +{ + return sched_blkpt_free(blkpt->id); +} + +/* Internal APIs that must be inlined to remove the branches */ +static inline int +__crt_blkpt_atomic_trigger(sched_blkpt_epoch_t *ec, sched_blkpt_epoch_t chkpt, crt_blkpt_flags_t flags) +{ + /* + * Assume that the most significant bit is the blocked + * indicator. This math might reset it to zero, which we want + * to do anyway (as part of CRT_BLKPT_EPOCH). 
+ */ + sched_blkpt_epoch_t new = CRT_BLKPT_EPOCH(chkpt + 1); + + /* inlined so that constant propagation will get rid of condition */ + if (flags == CRT_BLKPT_UNIPROC) { + return ps_upcas(ec, chkpt, new); + } else { + return ps_cas(ec, chkpt, new); + } + /* TODO: faa for CRT_BLKPT_CRIT_SECT? */ +} + +/* + * If we return 1, then the caller will attempt to block, otherwise, + * return 0 and it will re-check the data-structure assuming that + * something happened in the mean time. + */ +static inline int +__crt_blkpt_atomic_wait(sched_blkpt_epoch_t *ec, sched_blkpt_epoch_t chkpt, crt_blkpt_flags_t flags) +{ + sched_blkpt_epoch_t cached = ps_load(ec); + sched_blkpt_epoch_t new = cached | CRT_BLKPT_BLKED_MASK; + int ret; + + /* + * We are the second or later blocker. Blocked already + * set. We're done here. + * + * It isn't clear if it is better to have the additional + * branch here for this to avoid atomic instructions, or to + * just always do the atomic instructions and possibly fail. + */ + if (cached == new) return 1; + + /* function is inlined so that constant propagation will get rid of condition */ + if (flags == CRT_BLKPT_UNIPROC) { + ret = ps_upcas(ec, chkpt, new); + } else { + ret = ps_cas(ec, chkpt, new); + } + if (unlikely(!ret)) { + /* + * CAS failure can mean that 1. another thread + * blocked, and set the blocked bit, or 2. an event is + * triggered. In the former case, we still want to + * block. In the latter case, we want to go back to + * the data-structure. + */ + return ps_load(ec) == new; /* same epoch with blocked set? == success */ + } + + return 1; +} + +/* Trigger an event, waking blocked threads. */ +static inline void +crt_blkpt_trigger(struct crt_blkpt *blkpt, crt_blkpt_flags_t flags) +{ + /* + * Note that the flags should likely be passed in statically, + * as constants. That way they will be inlined the conditions + * in the *_atomic_* function will be removed. 
+ */ + sched_blkpt_epoch_t saved = ps_load(&blkpt->epoch_blocked); + + /* The optimization: don't increment events if noone's listening */ + if (likely(!CRT_BLKPT_BLKED(saved))) return; + + /* slow(er) path for when we have blocked threads */ + if (!__crt_blkpt_atomic_trigger(&blkpt->epoch_blocked, saved, flags)) { + /* + * Race here between triggering threads. In this case, + * someone else already incremented the epoch and + * unblocked the threads. Yeah, helping algorithms! + */ + return; + } + /* + * Note that there is a race here. Multiple threads triggering + * events might pass different epochs down to the next + * level. This is OK as the next level always takes the epoch + * = max(epoch, ...) (for some wraparound-aware version of + * max). + */ + sched_blkpt_trigger(blkpt->id, CRT_BLKPT_EPOCH(saved + 1), 0); +} + +/* Wake only a single, specified thread (tracked manually in the data-structure) */ +/* void crt_blkpt_trigger_one(struct crt_blkpt *blkpt, crt_blkpt_flags_t flags, cos_thdid_t thdid); */ + +/* + * Checkpoint the state of the current event counter. This checkpoint + * is the one that is active during our operations on the + * data-structure. If we determine that we want to wait for an event + * (thus blocking), then the state of the checkpoint will be compared + * versus the state of the event counter to see if we're working off + * of outdated information. + */ +static inline void +crt_blkpt_checkpoint(struct crt_blkpt *blkpt, struct crt_blkpt_checkpoint *chkpt) +{ + chkpt->epoch_blocked = ps_load(&blkpt->epoch_blocked); +} + +/* Wait for an event. */ +static inline void +crt_blkpt_wait(struct crt_blkpt *blkpt, crt_blkpt_flags_t flags, struct crt_blkpt_checkpoint *chkpt) +{ + /* + * If blocked is already set, we can try and block + * directly. Otherwise, go through and try to atomically set + * it. If that fails, then either epoch or blocked has been + * updated, so return and try accessing the data-structure + * again. 
+ */ + if (!CRT_BLKPT_BLKED(chkpt->epoch_blocked) && + !__crt_blkpt_atomic_wait(&blkpt->epoch_blocked, chkpt->epoch_blocked, flags)) return; + + if (unlikely(sched_blkpt_block(blkpt->id, CRT_BLKPT_EPOCH(chkpt->epoch_blocked), 0))) { + BUG(); /* we are using a blkpt id that doesn't exist! */ + } +} + +/* + * Create an execution dependency on the specified thread for, + * e.g. priority inheritance. + */ +/* void crt_blkpt_wait_dep(struct crt_blkpt *blkpt, crt_blkpt_flags_t flags, struct crt_blkpt_checkpoint *chkpt, cos_thdid_t thdid); */ + +#endif /* CRT_BLKPT_H */ diff --git a/src/components/include/crt_chan.h b/src/components/include/crt_chan.h new file mode 100644 index 0000000000..5f4267bb8a --- /dev/null +++ b/src/components/include/crt_chan.h @@ -0,0 +1,323 @@ +/* + * Copyright 2019, Gabriel Parmer, GWU, gparmer@gwu.edu. + * + * This uses a two clause BSD License. + */ + +#ifndef CRT_CHAN_H +#define CRT_CHAN_H + +/*** + * + */ + +#include +#include +#include +#include +#include + +struct crt_chan { + u32_t producer; + /* If the ring is empty, recving threads will block on this blkpt. */ + struct crt_blkpt empty; + char _padding1[CACHE_LINE * 2 - (sizeof(struct crt_blkpt) + sizeof(u32_t))]; + u32_t consumer; + /* If the ring is full, sending thread will block on this blkpt. */ + struct crt_blkpt full; + char _padding2[CACHE_LINE * 2 - (sizeof(struct crt_blkpt) + sizeof(u32_t))]; + /* + * @item_sz is a power of two and corresponds to the + * wraparound_mask. The number of data items that the channel + * can hold is item_sz - 1. @wraparound_mask = nslots-1 (were + * nslots is a power of two) + */ + u32_t item_sz, wraparound_mask; + u32_t nslots; + /* FIXME: p2p channels only SINGLE-CORE for now! */ + unsigned long sender, receiver; /* for p2p channels, sl_thd pointers + MSB for blocked on channel send/recv.. */ + /* The memory for the channel. 
*/ + char mem[0]; +}; + +/* produce a statically allocated channel: the struct plus its backing buffer */ +#define CRT_CHAN_STATIC_ALLOC(name, type, nslots) \ +struct __crt_chan_envelope_##name { \ + struct crt_chan c; \ + char mem[nslots * sizeof(type)]; \ +} __##name; \ +struct crt_chan *name = &__##name.c + +#define CRT_CHAN_TYPE_PROTOTYPES(name, type, nslots) \ +static inline int \ +crt_chan_init_##name(struct crt_chan *c) \ +{ return crt_chan_init(c, sizeof(type), nslots); } \ +static inline int \ +crt_chan_p2p_init_##name(struct crt_chan *c, struct sl_thd *sndr, struct sl_thd *rcvr) \ +{ return crt_chan_p2p_init(c, sizeof(type), nslots, sndr, rcvr); } \ +static inline void \ +crt_chan_teardown_##name(struct crt_chan *c) \ +{ crt_chan_teardown(c); } \ +static inline int \ +crt_chan_empty_##name(struct crt_chan *c) \ +{ return __crt_chan_empty(c, nslots - 1); } \ +static inline int \ +crt_chan_full_##name(struct crt_chan *c) \ +{ return __crt_chan_full(c, nslots - 1); } \ +static inline int \ +crt_chan_send_##name(struct crt_chan *c, void *item) \ +{ \ + assert(pow2(nslots)); \ + return __crt_chan_send(c, item, nslots - 1, sizeof(type)); \ +} \ +static inline int \ +crt_chan_recv_##name(struct crt_chan *c, void *item) \ +{ \ + assert(pow2(nslots)); \ + return __crt_chan_recv(c, item, nslots - 1, sizeof(type)); \ +} \ +static inline int \ +crt_chan_async_send_##name(struct crt_chan *c, void *item) \ +{ \ + assert(pow2(nslots)); \ + if (__crt_chan_produce(c, item, nslots - 1, sizeof(type))) return -EAGAIN; \ + return 0; \ +} \ +static inline int \ +crt_chan_async_recv_##name(struct crt_chan *c, void *item) \ +{ \ + assert(pow2(nslots)); \ + if (__crt_chan_consume(c, item, nslots - 1, sizeof(type))) return -EAGAIN; \ + return 0; \ +} + +#define CRT_CHANCHAN_PROTOTYPES(nslots) \ +CRT_CHAN_TYPE_PROTOTYPES(chan, struct chan *, nslots) + +static inline unsigned int +__crt_chan_buff_idx(struct crt_chan *c, u32_t v, u32_t wraparound_mask) +{ return v & wraparound_mask; } + +static inline int +__crt_chan_full(struct crt_chan *c, u32_t
wraparound_mask) +{ return __crt_chan_buff_idx(c, c->consumer, wraparound_mask) == __crt_chan_buff_idx(c, c->producer + 1, wraparound_mask); } + +static inline int +__crt_chan_empty(struct crt_chan *c, u32_t wraparound_mask) +{ return c->producer == c->consumer; } + +static inline int +__crt_chan_produce(struct crt_chan *c, void *d, u32_t wraparound_mask, u32_t sz) +{ + if (__crt_chan_full(c, wraparound_mask)) return 1; + memcpy(c->mem + (__crt_chan_buff_idx(c, c->producer, wraparound_mask) * sz), d, sz); + c->producer++; + + return 0; +} + +static inline int +__crt_chan_consume(struct crt_chan *c, void *d, u32_t wraparound_mask, u32_t sz) +{ + void *ret; + + if (__crt_chan_empty(c, wraparound_mask)) return 1; + memcpy(d, c->mem + (__crt_chan_buff_idx(c, c->consumer, wraparound_mask) * sz), sz); + c->consumer++; + + return 0; +} + +/* only wake it up if it's blocked on the channel! */ +static inline void +__crt_chan_p2p_wakeup(unsigned long *w) +{ + unsigned long wc, wn; + + sl_cs_enter(); + wc = ps_load(w); + if (likely(wc & (1<<31))) goto blocked; + sl_cs_exit(); + + return; + +blocked: + wn = wc & ~(1<<31); + struct sl_thd *wt = (struct sl_thd *)wn; + if (unlikely(!ps_upcas(w, wc, wn))) BUG(); + sl_thd_wakeup_no_cs(wt); + sl_cs_exit_switchto(wt); +} + +/* block on channel */ +static inline void +__crt_chan_p2p_block(unsigned long *b) +{ + unsigned long bc, bn; + + sl_cs_enter(); + bc = ps_load(b); + assert((bc & (1<<31)) == 0); + bn = bc | (1<<31); + if (unlikely(!ps_upcas(b, bc, bn))) BUG(); + + if (sl_thd_block_no_cs(sl_thd_curr(), SL_THD_BLOCKED, 0)) BUG(); + sl_cs_exit_schedule(); +} + +static inline int +__crt_chan_is_p2p(struct crt_chan *c) +{ + return ((c->sender & ~(1<<31)) && (c->receiver & ~(1<<31))); +} + +/** + * The next two functions pass all of the variables in via arguments, + * so that we can use them for constant propagation along with + * inlining to get rid of the general memcpy code. 
+ */ +static inline int +__crt_chan_send(struct crt_chan *c, void *item, u32_t wraparound_mask, u32_t item_sz) +{ + /* optimizing for p2p */ + if (likely(__crt_chan_is_p2p(c))) { + while (1) { + if (!__crt_chan_produce(c, item, wraparound_mask, item_sz)) { + __crt_chan_p2p_wakeup(&c->receiver); + break; + } + __crt_chan_p2p_block(&c->sender); + } + } else { + while (1) { + struct crt_blkpt_checkpoint chkpt; + + crt_blkpt_checkpoint(&c->full, &chkpt); + if (!__crt_chan_produce(c, item, wraparound_mask, item_sz)) { + /* success! */ + crt_blkpt_trigger(&c->empty, 0); + break; + } + crt_blkpt_wait(&c->full, 0, &chkpt); + } + } + + return 0; +} + +static inline int +__crt_chan_recv(struct crt_chan *c, void *item, u32_t wraparound_mask, u32_t item_sz) +{ + /* optimizing for p2p */ + if (likely(__crt_chan_is_p2p(c))) { + while (1) { + if (!__crt_chan_consume(c, item, wraparound_mask, item_sz)) { + __crt_chan_p2p_wakeup(&c->sender); + break; + } + __crt_chan_p2p_block(&c->receiver); + } + } else { + while (1) { + struct crt_blkpt_checkpoint chkpt; + + crt_blkpt_checkpoint(&c->empty, &chkpt); + if (!__crt_chan_consume(c, item, wraparound_mask, item_sz)) { + /* success! */ + crt_blkpt_trigger(&c->full, 0); + break; + } + crt_blkpt_wait(&c->empty, 0, &chkpt); + } + } + + return 0; +} + + +/* + * We need to know how much to malloc? This function returns that + * requirement. It assumes (and checks) that @slots is a power of two. 
+ */ +static inline int +crt_chan_mem_sz(int item_sz, int slots) +{ + assert(pow2(slots)); + + return sizeof(struct crt_chan) + item_sz * slots; +} + +/* How many slots can we fit into an allocation of a specific mem_sz */ +static inline int +crt_chan_nslots(int item_sz, int mem_sz) +{ + return leqpow2((mem_sz - sizeof(struct crt_chan)) / item_sz); +} + +static inline int +crt_chan_init(struct crt_chan *c, int item_sz, int slots) +{ + assert(pow2(slots)); + if (crt_blkpt_init(&c->empty)) return -1; + if (crt_blkpt_init(&c->full)) return -1; + c->nslots = slots; + c->item_sz = item_sz; + c->wraparound_mask = slots - 1; /* slots is a pow2 */ + c->sender = c->receiver = 0; + + return 0; +} + +static inline int +crt_chan_p2p_init(struct crt_chan *c, int item_sz, int slots, + struct sl_thd *sndr, struct sl_thd *rcvr) +{ + int r = crt_chan_init(c, item_sz, slots); + assert(sndr && rcvr); + + /* FIXME: only single-core for now! */ + if (r) return r; /* crt_chan_init returns -1 on failure, 0 on success */ + c->sender = (unsigned long)sndr; + c->receiver = (unsigned long)rcvr; + assert((c->sender & (1<<31)) == 0); + assert((c->receiver & (1<<31)) == 0); + + return 0; +} + +static inline void +crt_chan_teardown(struct crt_chan *c) +{ + crt_blkpt_teardown(&c->empty); + crt_blkpt_teardown(&c->full); +} + +/* User-facing send and receive APIs: */ + +static inline int +crt_chan_send(struct crt_chan *c, void *item) +{ + return __crt_chan_send(c, item, c->wraparound_mask, c->item_sz); +} + +static inline int +crt_chan_recv(struct crt_chan *c, void *item) +{ + return __crt_chan_recv(c, item, c->wraparound_mask, c->item_sz); +} + +static inline int +crt_chan_async_send(struct crt_chan *c, void *item) +{ + if (__crt_chan_produce(c, item, c->wraparound_mask, c->item_sz)) return -EAGAIN; + return 0; +} + +static inline int +crt_chan_async_recv(struct crt_chan *c, void *item) +{ + if (__crt_chan_consume(c, item, c->wraparound_mask, c->item_sz)) return -EAGAIN; + return 0; +} + +#endif /* CRT_CHAN_H */ diff --git
a/src/components/include/crt_lock.h b/src/components/include/crt_lock.h new file mode 100644 index 0000000000..95e901e52b --- /dev/null +++ b/src/components/include/crt_lock.h @@ -0,0 +1,59 @@ +#ifndef CRT_LOCK_H +#define CRT_LOCK_H + +/*** + * Simple blocking lock. Uses blockpoints to enable the blocking and + * waking of contending threads. This has little to no intelligence, + * for example, not expressing dependencies for PI. + */ + +#include +#include + +struct crt_lock { + unsigned long owner; + struct crt_blkpt blkpt; +}; + +static inline int +crt_lock_init(struct crt_lock *l) +{ + l->owner = 0; + + return crt_blkpt_init(&l->blkpt); +} + +static inline int +crt_lock_teardown(struct crt_lock *l) +{ + assert(l->owner == 0); + + return crt_blkpt_teardown(&l->blkpt); +} + +static inline void +crt_lock_take(struct crt_lock *l) +{ + struct crt_blkpt_checkpoint chkpt; + + while (1) { + crt_blkpt_checkpoint(&l->blkpt, &chkpt); + + if (ps_cas(&l->owner, 0, (unsigned long)(cos_cpuid() << 16 | cos_thdid()))) { + return; /* success! */ + } + /* failure: try and block */ + crt_blkpt_wait(&l->blkpt, 0, &chkpt); + } +} + +static inline void +crt_lock_release(struct crt_lock *l) +{ + assert(l->owner == (unsigned long)(cos_cpuid() << 16 | cos_thdid())); + l->owner = 0; + /* if there are blocked threads, wake 'em up! */ + crt_blkpt_trigger(&l->blkpt, 0); +} + +#endif /* CRT_LOCK_H */ diff --git a/src/components/include/deque.h b/src/components/include/deque.h new file mode 100644 index 0000000000..696eb5781c --- /dev/null +++ b/src/components/include/deque.h @@ -0,0 +1,112 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ +#ifndef DEQUE_H +#define DEQUE_H + +/* + * This was implemented by referring to: + * https://github.com/cpp-taskflow/cpp-taskflow/blob/9c28ccec910346a9937c40db7bdb542262053f9c/taskflow/executor/workstealing.hpp + * + * which is based on the following papers: + * + * The work stealing queue described in the paper, "Dynamic Circular Work-stealing Deque," SPAA, 2015. + * Only the queue owner can perform pop and push operations, while others can steal data from the queue. + * + * PPoPP implementation paper, "Correct and Efficient Work-Stealing for Weak Memory Models" + * https://www.di.ens.fr/~zappa/readings/ppopp13.pdf + */ +#define DEQUE_MAX_SZ (1<<14) + +#define DEQUE_PROTOTYPE(name, type) \ +struct deque_##name { \ + type wrk[DEQUE_MAX_SZ]; \ + long size; \ + \ + volatile long top; \ + volatile long bottom; \ +}; \ + \ +static inline void \ +deque_init_##name(struct deque_##name *q, size_t sz) \ +{ \ + memset(q, 0, sizeof(struct deque_##name)); \ + \ + if (sz) { \ + /* only for size with pow of 2 */ \ + /* assert((sz & (sz - 1)) == 0); */ \ + assert(sz <= DEQUE_MAX_SZ); \ + } else { \ + sz = DEQUE_MAX_SZ; \ + } \ + \ + q->size = sz; \ +} \ + \ +/* Use mutual exclusion locks around push/pop if multi-threaded. */ \ +static inline int \ +deque_push_##name(struct deque_##name *q, type *w) \ +{ \ + long ct, cb; \ + \ + ct = ps_load((unsigned long *)&q->top); \ + cb = ps_load((unsigned long *)&q->bottom); \ + \ + /* nope, fixed size only */ \ + if (q->size - 1 < (cb - ct)) return -ENOSPC; \ + \ + q->wrk[cb] = *w; \ + ps_mem_fence(); \ + if (!ps_upcas((unsigned long *)&q->bottom, cb, cb + 1)) assert(0); \ + \ + return 0; \ +} \ + \ +/* Use mutual exclusion locks around push/pop if multi-threaded. 
*/ \ +static inline int \ +deque_pop_##name(struct deque_##name *q, type *w) \ +{ \ + long ct = 0, sz = 0; \ + long cb = ps_load((unsigned long *)&q->bottom) - 1; \ + int ret = 0; \ + \ + if (!ps_upcas((unsigned long *)&q->bottom, cb + 1, cb)) assert(0); \ + \ + ct = ps_load((unsigned long *)&q->top); \ + sz = cb - ct; \ + if (sz < 0) { \ + if (!ps_upcas((unsigned long *)&q->bottom, cb, ct)) assert(0); \ + \ + return -ENOENT; \ + } \ + \ + *w = q->wrk[cb]; \ + if (sz > 0) return 0; \ + \ + ret = ps_cas((unsigned long *)&q->top, ct, ct + 1); \ + if (!ps_upcas((unsigned long *)&q->bottom, cb, ct + 1)) assert(0); \ + if (!ret) { *w = NULL; return -ENOENT; } \ + \ + return 0; \ +} \ + \ +static inline int \ +deque_steal_##name(struct deque_##name *q, type *w) \ +{ \ + long ct, cb; \ + \ + ct = ps_load((unsigned long *)&q->top); \ + cb = ps_load((unsigned long *)&q->bottom); \ + \ + if (ct >= cb) return -ENOENT; \ + \ + *w = q->wrk[ct]; \ + if (!ps_cas((unsigned long *)&q->top, ct, ct + 1)) return -EAGAIN; \ + \ + return 0; \ +} + +#endif /* DEQUE_H */ diff --git a/src/components/include/hypercall.h b/src/components/include/hypercall.h index 83426c1c4a..ee3caeb312 100644 --- a/src/components/include/hypercall.h +++ b/src/components/include/hypercall.h @@ -13,13 +13,15 @@ enum hypercall_cntl { HYPERCALL_COMP_COMPCAP_GET, HYPERCALL_COMP_CAPTBLCAP_GET, HYPERCALL_COMP_PGTBLCAP_GET, - HYPERCALL_COMP_CAPFRONTIER_GET, HYPERCALL_COMP_INITAEP_GET, HYPERCALL_COMP_CHILD_NEXT, HYPERCALL_COMP_CPUBITMAP_GET, + HYPERCALL_COMP_SCHED_GET, HYPERCALL_NUMCOMPS_GET, + + HYPERCALL_ROOT_INITAEP_SET, /* per-core root-scheduler init-aeps created by capmgr and passed to llbooter */ }; static inline int @@ -48,11 +50,12 @@ hypercall_comp_init_done(void) /* Note: This API can be called ONLY by components that manage capability resources */ static inline int -hypercall_comp_initaep_get(spdid_t spdid, int is_sched, struct cos_aep_info *aep) +hypercall_comp_initaep_get(spdid_t spdid, int is_sched, 
struct cos_aep_info *aep, spdid_t *parent_spdid) { thdcap_t thdslot = 0; arcvcap_t rcvslot = 0; tcap_t tcslot = 0; + word_t r3 = 0; struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); int ret = 0; @@ -68,8 +71,8 @@ hypercall_comp_initaep_get(spdid_t spdid, int is_sched, struct cos_aep_info *aep } /* capid_t though is unsigned long, only assuming it occupies 16bits for packing */ - ret = cos_sinv(BOOT_CAPTBL_SINV_CAP, HYPERCALL_COMP_INITAEP_GET, - spdid << 16 | thdslot, rcvslot << 16 | tcslot, 0); + ret = cos_sinv_rets(BOOT_CAPTBL_SINV_CAP, HYPERCALL_COMP_INITAEP_GET, + spdid << 16 | thdslot, rcvslot << 16 | tcslot, 0, (word_t *)&parent_spdid, &r3); if (ret) return ret; aep->thd = thdslot; @@ -80,6 +83,18 @@ hypercall_comp_initaep_get(spdid_t spdid, int is_sched, struct cos_aep_info *aep return 0; } +static inline int +hypercall_root_initaep_set(spdid_t spdid, struct cos_aep_info *aep) +{ + int ret = 0; + + ret = cos_sinv(BOOT_CAPTBL_SINV_CAP, HYPERCALL_ROOT_INITAEP_SET, spdid << 16 | aep->thd, + aep->rcv << 16 | aep->tc, 0); + if (ret) return ret; + + return 0; +} + /* Note: This API can be called ONLY by components that manage capability resources */ static inline int hypercall_comp_info_get(spdid_t spdid, pgtblcap_t *ptslot, captblcap_t *ctslot, compcap_t *compslot, spdid_t *parentid) @@ -176,15 +191,10 @@ hypercall_comp_pgtblcap_get(spdid_t spdid) return ptslot; } -static inline capid_t -hypercall_comp_capfrontier_get(spdid_t spdid) +static inline spdid_t +hypercall_comp_sched_get(spdid_t spdid) { - word_t unused; - capid_t cap_frontier; - - if (cos_sinv_rets(BOOT_CAPTBL_SINV_CAP, HYPERCALL_COMP_CAPFRONTIER_GET, spdid, 0, 0, &cap_frontier, &unused)) return 0; - - return cap_frontier; + return cos_sinv(BOOT_CAPTBL_SINV_CAP, HYPERCALL_COMP_SCHED_GET, spdid, 0, 0); } static inline int diff --git a/src/components/include/omp.h b/src/components/include/omp.h new file mode 100644 index 0000000000..f3312ec5bc --- /dev/null +++ 
b/src/components/include/omp.h @@ -0,0 +1,174 @@ +/* Copyright (C) 2005-2017 Free Software Foundation, Inc. + Contributed by Richard Henderson . + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* + * NOTE: This header is from gcc 7 customized + * to support only what is required in our environment + */ +#ifndef _OMP_H +#define _OMP_H 1 + +#ifndef _LIBGOMP_OMP_LOCK_DEFINED +#define _LIBGOMP_OMP_LOCK_DEFINED 1 +/* These two structures get edited by the libgomp build process to + reflect the shape of the two types. Their internals are private + to the library. 
*/ + +typedef struct +{ + unsigned char _x[4] + __attribute__((__aligned__(4))); +} omp_lock_t; + +typedef struct +{ +#if defined(__linux__) + unsigned char _x[8 + sizeof (void *)] + __attribute__((__aligned__(sizeof (void *)))); +#else + unsigned char _x[16] + __attribute__((__aligned__(8))); +#endif +} omp_nest_lock_t; +#endif + +typedef enum omp_sched_t +{ + omp_sched_static = 1, + omp_sched_dynamic = 2, + omp_sched_guided = 3, + omp_sched_auto = 4 +} omp_sched_t; + +typedef enum omp_proc_bind_t +{ + omp_proc_bind_false = 0, + omp_proc_bind_true = 1, + omp_proc_bind_master = 2, + omp_proc_bind_close = 3, + omp_proc_bind_spread = 4 +} omp_proc_bind_t; + +typedef enum omp_lock_hint_t +{ + omp_lock_hint_none = 0, + omp_lock_hint_uncontended = 1, + omp_lock_hint_contended = 2, + omp_lock_hint_nonspeculative = 4, + omp_lock_hint_speculative = 8, +} omp_lock_hint_t; + +#ifdef __cplusplus +extern "C" { +# define __GOMP_NOTHROW throw () +#else +# define __GOMP_NOTHROW __attribute__((__nothrow__)) +#endif + +//extern void omp_set_num_threads (int) __GOMP_NOTHROW; +extern int omp_get_num_threads (void) __GOMP_NOTHROW; +extern int omp_get_max_threads (void) __GOMP_NOTHROW; +extern int omp_get_thread_num (void) __GOMP_NOTHROW; +extern int omp_get_num_procs (void) __GOMP_NOTHROW; + +//extern int omp_in_parallel (void) __GOMP_NOTHROW; +// +//extern void omp_set_dynamic (int) __GOMP_NOTHROW; +//extern int omp_get_dynamic (void) __GOMP_NOTHROW; +// +//extern void omp_set_nested (int) __GOMP_NOTHROW; +//extern int omp_get_nested (void) __GOMP_NOTHROW; +// +//extern void omp_init_lock (omp_lock_t *) __GOMP_NOTHROW; +//extern void omp_init_lock_with_hint (omp_lock_t *, omp_lock_hint_t) +// __GOMP_NOTHROW; +//extern void omp_destroy_lock (omp_lock_t *) __GOMP_NOTHROW; +//extern void omp_set_lock (omp_lock_t *) __GOMP_NOTHROW; +//extern void omp_unset_lock (omp_lock_t *) __GOMP_NOTHROW; +//extern int omp_test_lock (omp_lock_t *) __GOMP_NOTHROW; +// +//extern void omp_init_nest_lock 
(omp_nest_lock_t *) __GOMP_NOTHROW; +//extern void omp_init_nest_lock_with_hint (omp_nest_lock_t *, omp_lock_hint_t) +// __GOMP_NOTHROW; +//extern void omp_destroy_nest_lock (omp_nest_lock_t *) __GOMP_NOTHROW; +//extern void omp_set_nest_lock (omp_nest_lock_t *) __GOMP_NOTHROW; +//extern void omp_unset_nest_lock (omp_nest_lock_t *) __GOMP_NOTHROW; +//extern int omp_test_nest_lock (omp_nest_lock_t *) __GOMP_NOTHROW; +// +extern double omp_get_wtime (void) __GOMP_NOTHROW; +//extern double omp_get_wtick (void) __GOMP_NOTHROW; +// +//extern void omp_set_schedule (omp_sched_t, int) __GOMP_NOTHROW; +//extern void omp_get_schedule (omp_sched_t *, int *) __GOMP_NOTHROW; +//extern int omp_get_thread_limit (void) __GOMP_NOTHROW; +//extern void omp_set_max_active_levels (int) __GOMP_NOTHROW; +//extern int omp_get_max_active_levels (void) __GOMP_NOTHROW; +//extern int omp_get_level (void) __GOMP_NOTHROW; +//extern int omp_get_ancestor_thread_num (int) __GOMP_NOTHROW; +//extern int omp_get_team_size (int) __GOMP_NOTHROW; +//extern int omp_get_active_level (void) __GOMP_NOTHROW; +// +//extern int omp_in_final (void) __GOMP_NOTHROW; +// +//extern int omp_get_cancellation (void) __GOMP_NOTHROW; +//extern omp_proc_bind_t omp_get_proc_bind (void) __GOMP_NOTHROW; +//extern int omp_get_num_places (void) __GOMP_NOTHROW; +//extern int omp_get_place_num_procs (int) __GOMP_NOTHROW; +//extern void omp_get_place_proc_ids (int, int *) __GOMP_NOTHROW; +//extern int omp_get_place_num (void) __GOMP_NOTHROW; +//extern int omp_get_partition_num_places (void) __GOMP_NOTHROW; +//extern void omp_get_partition_place_nums (int *) __GOMP_NOTHROW; +// +//extern void omp_set_default_device (int) __GOMP_NOTHROW; +//extern int omp_get_default_device (void) __GOMP_NOTHROW; +//extern int omp_get_num_devices (void) __GOMP_NOTHROW; +//extern int omp_get_num_teams (void) __GOMP_NOTHROW; +//extern int omp_get_team_num (void) __GOMP_NOTHROW; +// +//extern int omp_is_initial_device (void) __GOMP_NOTHROW; +//extern 
int omp_get_initial_device (void) __GOMP_NOTHROW; +//extern int omp_get_max_task_priority (void) __GOMP_NOTHROW; +// +//extern void *omp_target_alloc (__SIZE_TYPE__, int) __GOMP_NOTHROW; +//extern void omp_target_free (void *, int) __GOMP_NOTHROW; +//extern int omp_target_is_present (void *, int) __GOMP_NOTHROW; +//extern int omp_target_memcpy (void *, void *, __SIZE_TYPE__, __SIZE_TYPE__, +// __SIZE_TYPE__, int, int) __GOMP_NOTHROW; +//extern int omp_target_memcpy_rect (void *, void *, __SIZE_TYPE__, int, +// const __SIZE_TYPE__ *, +// const __SIZE_TYPE__ *, +// const __SIZE_TYPE__ *, +// const __SIZE_TYPE__ *, +// const __SIZE_TYPE__ *, int, int) +// __GOMP_NOTHROW; +//extern int omp_target_associate_ptr (void *, void *, __SIZE_TYPE__, +// __SIZE_TYPE__, int) __GOMP_NOTHROW; +//extern int omp_target_disassociate_ptr (void *, int) __GOMP_NOTHROW; + +#ifdef __cplusplus +} +#endif + +#endif /* _OMP_H */ diff --git a/src/components/include/part.h b/src/components/include/part.h new file mode 100644 index 0000000000..f4ea8cc9bc --- /dev/null +++ b/src/components/include/part.h @@ -0,0 +1,437 @@ +#ifndef PART_H +#define PART_H + +#include +#include +#include +#include + +#include +#include + +#undef PART_ENABLE_NESTED +#define PART_ENABLE_BLOCKING +//#include + +DEQUE_PROTOTYPE(part, struct part_task *); +//CIRQUE_PROTOTYPE(part, struct part_task); + +extern struct deque_part *part_dq_percore[]; +//extern struct cirque_par parcq_global; +/* FIXME: use stacklist or another stack like data structure? 
*/ +extern struct ps_list_head part_thdpool_core[]; +extern volatile int in_main_parallel; +#if defined(PART_ENABLE_NESTED) +extern struct ps_list_head part_l_global; +extern struct crt_lock part_l_lock; +#else +extern struct part_task main_task; +#endif + +static inline struct deque_part * +part_deque_curr(void) +{ + return part_dq_percore[cos_cpuid()]; +} + +static inline struct deque_part * +part_deque_core(cpuid_t c) +{ + assert(c < NUM_CPU); + + return part_dq_percore[c]; +} + +static inline struct ps_list_head * +part_thdpool_curr(void) +{ + return &part_thdpool_core[cos_cpuid()]; +} + +//static inline struct cirque_par * +//part_cirque(void) +//{ +// return &parcq_global; +//} + +#if defined(PART_ENABLE_NESTED) +static inline struct ps_list_head * +part_list(void) +{ + return &part_l_global; +} +#endif + +static inline int +part_deque_push(struct part_task *t) +{ + int ret; + + assert(t->type == PART_TASK_T_TASK); + sl_cs_enter(); + ret = deque_push_part(part_deque_curr(), &t); + sl_cs_exit(); + + return ret; +} + +static inline int +part_deque_pop(struct part_task **t) +{ + int ret; + + *t = NULL; + sl_cs_enter(); + ret = deque_pop_part(part_deque_curr(), t); + sl_cs_exit(); + if (unlikely(ret)) *t = NULL; + + if (unlikely(*t && (*t)->type != PART_TASK_T_TASK)) { *t = NULL; ret = -EAGAIN; } + + return ret; +} + +static inline struct part_task * +part_deque_steal(cpuid_t core) +{ +#if NUM_CPU > 1 + int ret; + struct part_task *t = NULL; + + ret = deque_steal_part(part_deque_core(core), &t); + if (unlikely(ret)) return NULL; + assert(t->type == PART_TASK_T_TASK); + + return t; +#else + return NULL; +#endif +} + +static inline struct part_task * +part_deque_steal_any(void) +{ +#if NUM_CPU > 1 + unsigned i = 0, c = (unsigned)(ps_tsc() % NUM_CPU); + + do { + struct part_task *t = NULL; + + i ++; + if (unlikely(c == (unsigned)cos_cpuid())) c = (c + 1) % NUM_CPU; + + t = part_deque_steal(c); + if (likely(t)) return t; + } while (i < NUM_CPU); +#endif + return 
NULL; +} + +static inline void +part_pool_wakeup(void) +{ +#ifdef PART_ENABLE_BLOCKING + struct sl_thd *t = NULL; + int i; + + /* we're still not in main parallel, so don't try to wakeup any threads yet! */ + if (!ps_load(&in_main_parallel)) return; + + sl_cs_enter(); + if (unlikely(ps_list_head_empty(part_thdpool_curr()))) goto done; + + t = ps_list_head_first(part_thdpool_curr(), struct sl_thd, partlist); + /* removal from the list is taken care in mod_part_fifo */ + if (t == sl_thd_curr()) goto done; + sl_thd_wakeup_no_cs(t); +done: + sl_cs_exit(); +#endif +} + +static inline void +part_pool_block(void) +{ +#ifdef PART_ENABLE_BLOCKING + struct sl_thd *t = sl_thd_curr(); + + /* very much a replica of sl_thd_block + adding to thread pool in part */ + sl_cs_enter(); + if (ps_list_singleton(t, partlist)) ps_list_head_append(part_thdpool_curr(), t, partlist); + sl_cs_exit(); + sl_thd_block(0); + assert(sl_thd_is_runnable(t)); +#else + sl_thd_yield(0); +#endif +} + +///* ds memory in a circular queue */ +//static inline struct part_task * +//part_cirque_alloc(void) +//{ +// return cirque_allocptr_par(part_cirque()); +//} +// +//static inline void +//part_cirque_free(void) +//{ +// cirque_freeptr_par(part_cirque()); +//} +// +//static inline struct part_task * +//part_cirque_peek(void) +//{ +// return cirque_peekptr_par(part_cirque()); +//} + +/* TODO: lock for shared list! */ +static inline void +part_list_append(struct part_task *t) +{ + unsigned i; + int in_nest = 0; + + assert(t->type == PART_TASK_T_WORKSHARE); + +#if defined(PART_ENABLE_NESTED) + assert(ps_list_singleton(t, partask)); + /* + * this is not required to be in a cs. + * if multiple appends are called, simultaneously, we at least + * have the main outermost parallel block running!. + */ + if (likely(!ps_list_head_empty(part_list()))) in_nest = 1; + /* so other threads can work on this! 
*/ + if (t->nthds > 1) { + crt_lock_take(&part_l_lock); + ps_list_head_append(part_list(), t, partask); + crt_lock_release(&part_l_lock); + } +#else + if (t != &main_task) { + /* without nesting, all child parallel blocks are run just be the encountering threads -master threads */ + assert(t->nthds == 1); + assert(ps_load(&in_main_parallel)); + + return; + } + assert(ps_load(&in_main_parallel) == 0); +#endif + /* + * wake up as many threads on this core! + * some may not get work if other cores pull work before they get to it. + */ + for (i = 1; i < t->nthds; i++) part_pool_wakeup(); + + /* if this is the first time in a parallel, make everyone know */ + if (likely(!in_nest)) ps_faa(&in_main_parallel, 1); +} + +static inline void +part_list_remove(struct part_task *t) +{ + int in_nest = 0; + + assert(t->type == PART_TASK_T_WORKSHARE); +#if defined(PART_ENABLE_NESTED) + assert(t->nthds > 1); + assert(!ps_list_singleton(t, partask)); + + crt_lock_take(&part_l_lock); + ps_list_rem(t, partask); + if (unlikely(!ps_list_head_empty(part_list()))) in_nest = 1; + crt_lock_release(&part_l_lock); +#else + /* only called for the other parallel region */ + assert(ps_load(&in_main_parallel)); + if (t != &main_task) return; +#endif + + if (likely(!in_nest)) ps_faa(&in_main_parallel, -1); +} + +static inline struct part_task * +part_list_peek(void) +{ + /* there should at least be the outer parallel block for other threads to peek! */ + if (!ps_load(&in_main_parallel)) return NULL; + +#if defined(PART_ENABLE_NESTED) + struct part_task *t = NULL; + int found = 0; + + crt_lock_take(&part_l_lock); + if (unlikely(ps_list_head_empty(part_list()))) goto done; + /* not great! traversing from the first element always! */ + /* TODO: perhaps traverse from the current task? 
*/ + ps_list_foreach(part_list(), t, partask) { + int i; + + assert(t); + assert(t->type == PART_TASK_T_WORKSHARE); + /* coz, master thread adds to list the implicit task and doesn't defer it */ + i = part_task_work_try(t); + assert(i != 0); + + if (likely(i > 0 && !ps_load(&t->end))) { + found = 1; + break; + } + } + +done: + crt_lock_release(&part_l_lock); + + if (unlikely(!found)) return NULL; + + return t; +#else + int i; + + assert(main_task.type == PART_TASK_T_WORKSHARE); + i = part_task_work_try(&main_task); + assert(i != 0); + + if (likely(i > 0 && ps_load(&main_task.end) != main_task.nthds)) return &main_task; + + return NULL; +#endif +} + +void part_init(void); + +unsigned part_isready(void); + +static inline void +part_task_barrier(struct part_task *t, int is_end) +{ + struct sl_thd *ts = sl_thd_curr(); + unsigned cbc = 0; + int is_master = t->master == PART_CURR_THD ? 1 : 0; + + assert(t->type != PART_TASK_T_NONE); + assert(t->state == PART_TASK_S_INITIALIZED); + assert(t->nthds >= 1); + + part_task_wait_children(t); + + if (t->nthds == 1) { + struct part_data *d; + + if (unlikely(!is_end)) return; + + ps_faa(&t->end, 1); + /* remove myself from my parent. 
*/ + part_task_remove_child(t); + if (t->type == PART_TASK_T_WORKSHARE) { + assert(is_master); + ts->part_context = t->parent; + part_list_remove(t); + + return; + } + + ts->part_context = NULL; + d = t->data_env; + + part_task_free(t); + part_data_free(d); + + return; + } + + assert(t->type == PART_TASK_T_WORKSHARE); + + cbc = ps_faa(&t->barrier, -1); + if (cbc > 1) { + sl_thd_block(0); + if (is_master) part_peer_wakeup(t); + } else { + if (ps_cas(&t->barrier, 0, t->nthds)) ps_faa(&t->barrier_epoch, 1); + if (is_master) { + part_peer_wakeup(t); + } + else { + part_master_wakeup(t); + sl_thd_block(0); + } + } + //assert(ps_load(&t->barrier_epoch) == cbep + 1); + + if (!is_end) return; + ps_faa(&t->end, 1); + + if (is_master) { + part_task_remove_child(t); + part_list_remove(t); + ts->part_context = t->parent; + } else { + ts->part_context = NULL; + } +} + +static inline void +part_task_end(struct part_task *t) +{ part_task_barrier(t, 1); } + +static inline void +part_thd_fn(void *d) +{ + struct sl_thd *curr = sl_thd_curr(); + + /* parallel runtime not ready? */ + /* if (unlikely(!part_isready())) part_pool_block(); */ + /* not in the main parallel block? */ + + while (1) { + struct part_task *t = NULL; + int ret; + + if (!ps_load(&in_main_parallel)) part_pool_block(); + + /* FIXME: nested parallel needs love! 
*/ + t = part_list_peek(); + if (likely(t)) goto found; + +single: + ret = part_deque_pop(&t); + if (likely(ret == 0)) { + int thdnum = -1; + + assert(t && t->type == PART_TASK_T_TASK); + thdnum = part_task_work_try(t); + assert(thdnum == 0); + goto found; + } + + if (unlikely(ret == -EAGAIN)) goto single; + + t = part_deque_steal_any(); + if (unlikely(!t)) { + part_pool_block(); + + continue; + } else { + int thdnum = -1; + + assert(t->type == PART_TASK_T_TASK); + thdnum = part_task_work_try(t); + if (thdnum < 0) continue; + assert(thdnum == 0); + } + +found: + assert(t); + curr->part_context = (void *)t; + + t->cs.fn(t->cs.data); + + part_task_end(t); + assert(curr->part_context == NULL); + } + + sl_thd_exit(); +} + +#endif /* PART_H */ diff --git a/src/components/include/part_task.h b/src/components/include/part_task.h new file mode 100644 index 0000000000..8bc9f4ea38 --- /dev/null +++ b/src/components/include/part_task.h @@ -0,0 +1,278 @@ +#ifndef PART_TASK_H +#define PART_TASK_H + +#include +#include +#include +#include + +#define PART_THD(c, t) ((unsigned)(cos_cpuid() << 16 | cos_thdid())) +#define PART_CURR_THD PART_THD(cos_cpuid(), cos_thdid()) +#define PART_THD_COREID(t) (t >> 16) +#define PART_THD_THDID(t) ((t << 16) >> 16) + +#define PART_MAX_TASKS (NUM_CPU < 4 ? 2048 : 8192) +#define PART_MAX_DATA PART_MAX_TASKS +#define PART_MAX_PAR_THDS NUM_CPU /* change this to test more data-parallel tasks on single core configuration */ +#define PART_MAX_CORE_THDS (NUM_CPU == 1 ? 200 : (NUM_CPU == 2 ? 128 : (NUM_CPU < 5 ? 
64 : 48))) +#define PART_MAX_THDS 512 +#define PART_MAX_CHILD 1024 +#define PART_MAX_WORKSHARES 16 + +typedef void (*part_fn_t)(void *); + +typedef enum { + PART_TASK_S_FREED, + PART_TASK_S_ALLOCATED, + PART_TASK_S_INITIALIZED, + PART_TASK_S_RUNNING, + PART_TASK_S_CHILD_WAIT, /* WAIT FOR CHILD TASKS */ + PART_TASK_S_SIBLING_WAIT, /* WAIT FOR SIBLING TASKS */ + PART_TASK_S_PARENT_WAIT, /* WAIT FOR PARENT TASK */ + PART_TASK_S_IN_BARRIER, /* WAIT FOR ALL OTHER THREADS */ +} part_task_state_t; + +typedef enum { + PART_TASK_T_NONE, + PART_TASK_T_WORKSHARE = 1, /* task to put in a shared fifo queue */ + PART_TASK_T_TASK, +} part_task_type_t; + +typedef enum { + PART_WORKSHARE_NONE, + PART_WORKSHARE_LOOP_STATIC, + PART_WORKSHARE_LOOP_DYNAMIC, + PART_WORKSHARE_LOOP_GUIDED, + PART_WORKSHARE_LOOP_RUNTIME, + PART_WORKSHARE_SECTIONS, + PART_WORKSHARE_SINGLE, +} part_workshare_type_t; + +struct part_workshare { + part_workshare_type_t type; + + long chunk_sz; + + long st, end, inc; + + long next; + + unsigned worker_bmp; +}; + +struct part_closure { + part_fn_t fn; + void *data; +}; + +struct part_data { + int flag; /* 0 = not in use, 1 = in use */ + struct part_data *next_free; /* for explicit data allocation/free */ + char data[PART_MAX_DATA]; +}; + +struct part_task { + int id; /* only for debugging */ + part_task_state_t state; + part_task_type_t type; + + struct part_workshare ws[PART_MAX_WORKSHARES]; + struct part_closure cs; + + unsigned nthds; /* number of threads for this task, 1 in case of non-workshare work */ + unsigned nworkers; + unsigned workers[PART_MAX_PAR_THDS]; /* threads sharing this work or thread doing this work! */ + int ws_off[PART_MAX_PAR_THDS]; /* progress of the workshares in each participating thread */ + unsigned master; /* coreid << 16 | thdid of the master */ + unsigned end, barrier, barrier_epoch; + + struct part_data *data_env; + struct part_task *parent; + /* in data-parallel task, each thread waits for its children. 
*/ + int nchildren[PART_MAX_PAR_THDS]; + + struct ps_list partask; + struct part_task *next_free; /* for explicit task allocation/free */ +} CACHE_ALIGNED; + +static inline void +part_task_init(struct part_task *t, part_task_type_t type, struct part_task *p, unsigned nthds, part_fn_t fn, void *data, struct part_data *d) +{ + static unsigned part_id_free = 0; + int i, id = ps_faa(&part_id_free, 1); + + assert(type != PART_TASK_T_NONE); + t->type = type; + if (!ps_cas(&t->state, PART_TASK_S_ALLOCATED, PART_TASK_S_INITIALIZED)) assert(0); + t->id = id; + memset(t->ws, 0, sizeof(struct part_workshare) * PART_MAX_WORKSHARES); + t->cs.fn = fn; + t->cs.data = data; + assert (nthds <= PART_MAX_PAR_THDS); + t->nthds = nthds; + t->nworkers = 0; + memset(t->workers, 0, sizeof(unsigned) * PART_MAX_PAR_THDS); + t->master = PART_CURR_THD; + /* if it's worksharing, current thread is the master and does take part in the par section */ + if (type == PART_TASK_T_WORKSHARE) { + t->nworkers = 1; + t->workers[0] = t->master; + } + for (i = 0; i < PART_MAX_PAR_THDS; i++) t->ws_off[i] = -1; + t->barrier = t->nthds; + t->end = t->barrier_epoch = 0; + t->data_env = d; + t->parent = p; + memset(t->nchildren, 0, sizeof(int) * PART_MAX_PAR_THDS); + + ps_list_init(t, partask); +} + +struct part_task *part_task_alloc(part_task_type_t); +void part_task_free(struct part_task *); +struct part_data *part_data_alloc(void); +void part_data_free(struct part_data *); + +static inline int +part_task_work_try(struct part_task *t) +{ + int i = 0; + unsigned key = PART_CURR_THD; + + assert(t->state == PART_TASK_S_INITIALIZED); + if (t->type == PART_TASK_T_TASK) { + assert(t->nthds == 1); + } else { + assert(t->type == PART_TASK_T_WORKSHARE); + assert(t->master != key && t->master == t->workers[0]); + assert(t->nthds >= 1); + } + + /* task was finished! 
*/ + if (unlikely(ps_load(&t->end) == t->nthds)) return -1; + /* if you can work with this task */ + i = ps_faa(&t->nworkers, 1); + if (unlikely(i >= (int)t->nthds)) return -1; + + t->workers[i] = key; + + return i; +} + +static inline int +part_task_work_thd_num(struct part_task *t, unsigned core_thd) +{ + int i; + unsigned key = core_thd; + + assert(t); + + assert(t->state == PART_TASK_S_INITIALIZED); + if (likely(t->type == PART_TASK_T_TASK)) { + assert(t->nthds == 1); + + if (ps_load(&t->workers[0]) == key) return 0; + + return -1; + } + assert(t->type == PART_TASK_T_WORKSHARE); + + if (key == t->master) return 0; + for (i = 1; i < (int)t->nthds; i++) { + if (t->workers[i] == key) return i; + } + + return -1; +} + +static inline void +part_thd_wakeup(unsigned thd) +{ + thdid_t t = PART_THD_THDID(thd); + cpuid_t c = PART_THD_COREID(thd); + + assert(c >= 0 && c < NUM_CPU); + assert(t < MAX_NUM_THREADS); + + if (thd == PART_CURR_THD) return; + if (c != cos_cpuid()) sl_xcore_thd_wakeup_tid(t, c); + else sl_thd_wakeup(t); +} + +static inline void +part_master_wakeup(struct part_task *t) +{ + assert(t->type == PART_TASK_T_WORKSHARE); + assert(t->state == PART_TASK_S_INITIALIZED); + assert(t->nthds > 1); + assert(t->master && t->master != PART_CURR_THD); + + part_thd_wakeup(t->master); +} + +static inline void +part_peer_wakeup(struct part_task *t) +{ + unsigned i; + + assert(t->type == PART_TASK_T_WORKSHARE); + assert(t->state == PART_TASK_S_INITIALIZED); + assert(t->nthds > 1); + assert(t->master == PART_CURR_THD); + + for (i = 1; i < t->nthds; i++) part_thd_wakeup(t->workers[i]); +} + +static inline int +part_task_add_child(struct part_task *t, struct part_task *c) +{ + int i; + int num = part_task_work_thd_num(t, PART_CURR_THD); + + assert(num >= 0); + assert(t->state == PART_TASK_S_INITIALIZED); + + if (unlikely(!t || !c)) return -1; + + i = ps_faa(&t->nchildren[num], 1); + assert(i < PART_MAX_CHILD); + + return i; +} + +static inline void 
+part_task_remove_child(struct part_task *c) +{ + struct part_task *p = c->parent; + unsigned wkup; + int i, num; + + if (unlikely(!p)) return; + assert(c->state == PART_TASK_S_INITIALIZED); + + if (c->type == PART_TASK_T_TASK) wkup = c->master; + else wkup = p->master; + + num = part_task_work_thd_num(p, wkup); + assert(num >= 0); + + assert(p->nchildren[num] != 0); + i = ps_faa(&p->nchildren[num], -1); + assert(i > 0); + + /* only the last child to wake up the parent */ + if (i == 1) part_thd_wakeup(wkup); +} + +static inline void +part_task_wait_children(struct part_task *t) +{ + int num = part_task_work_thd_num(t, PART_CURR_THD); + + assert(num >= 0); + assert(t->state == PART_TASK_S_INITIALIZED); + + if (ps_load(&(t->nchildren[num])) > 0) sl_thd_block(0); +} + +#endif /* PART_TASK_H */ diff --git a/src/components/include/res_spec.h b/src/components/include/res_spec.h index e109b8a2fb..e81736950a 100644 --- a/src/components/include/res_spec.h +++ b/src/components/include/res_spec.h @@ -64,10 +64,10 @@ sched_param_pack(sched_param_type_t type, unsigned int value) static inline void sched_param_get(sched_param_t sp, sched_param_type_t *type, unsigned int *value) { - struct sched_param_s s = *(struct sched_param_s *)(void *)&sp; + union sched_param_union us = *(union sched_param_union *)&sp; - *type = s.type; - *value = s.value; + *type = us.c.type; + *value = us.c.value; } #endif /* RES_SPEC_H */ diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 1529c7835c..777125322e 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -37,8 +37,13 @@ #include #include #include -#include +#include #include +#include + +#define SL_CS +#undef SL_REPLENISH +#undef SL_PARENTCHILD /* Critical section (cs) API to protect scheduler data-structures */ struct sl_cs { @@ -51,7 +56,7 @@ struct sl_cs { } u; }; -struct sl_global_cpu { +struct sl_global_core { struct sl_cs lock; thdcap_t sched_thdcap; @@ -62,18 +67,29 @@ struct sl_global_cpu 
{ int cyc_per_usec; cycles_t period; - cycles_t timer_next; + cycles_t timer_next, timer_prev; tcap_time_t timeout_next; + struct cos_scb_info *scb_info; struct ps_list_head event_head; /* all pending events for sched end-point */ }; -extern struct sl_global_cpu sl_global_cpu_data[]; +extern struct sl_global_core sl_global_core_data[]; + +typedef u32_t sched_blkpt_id_t; +#define SCHED_BLKPT_NULL 0 +typedef word_t sched_blkpt_epoch_t; + +static inline struct sl_global_core * +sl__globals_core(void) +{ + return &(sl_global_core_data[cos_cpuid()]); +} -static inline struct sl_global_cpu * -sl__globals_cpu(void) +static inline struct cos_scb_info * +sl_scb_info_core(void) { - return &(sl_global_cpu_data[cos_cpuid()]); + return (sl__globals_core()->scb_info); } static inline void @@ -83,15 +99,22 @@ sl_thd_setprio(struct sl_thd *t, tcap_prio_t p) } /* for lazy retrieval of a child component thread in the parent */ -extern struct sl_thd *sl_thd_retrieve(thdid_t tid); +extern struct sl_thd *sl_thd_retrieve_lazy(thdid_t tid); static inline struct sl_thd * sl_thd_lkup(thdid_t tid) { - assert(tid != 0); - if (unlikely(tid > MAX_NUM_THREADS)) return NULL; + struct sl_thd *t; + struct sl_xcore_thd *xt; + + if (unlikely(tid < 1 || tid > MAX_NUM_THREADS)) return NULL; + t = sl_mod_thd_get(sl_thd_lookup_backend(tid)); + if (likely(t && sl_thd_aepinfo(t))) return t; + xt = sl_xcore_thd_lookup(tid); + if (unlikely(xt && xt->core != cos_cpuid())) return NULL; - return sl_thd_retrieve(tid); + /* FIXME: cross-core child threads must be handled in retrieve */ + return sl_thd_retrieve_lazy(tid); } /* only see if it's already sl_thd initialized */ @@ -100,8 +123,7 @@ sl_thd_try_lkup(thdid_t tid) { struct sl_thd *t = NULL; - assert(tid != 0); - if (unlikely(tid > MAX_NUM_THREADS)) return NULL; + if (unlikely(tid < 1 || tid > MAX_NUM_THREADS)) return NULL; t = sl_mod_thd_get(sl_thd_lookup_backend(tid)); if (!sl_thd_aepinfo(t)) return NULL; @@ -112,26 +134,32 @@ sl_thd_try_lkup(thdid_t tid) 
static inline thdid_t sl_thdid(void) { - thdid_t tid = cos_thdid(); - - assert(tid != 0); - assert(tid < MAX_NUM_THREADS); - - return tid; + return cos_thdid(); } +sched_blkpt_id_t sched_blkpt_alloc(void); +int sched_blkpt_free(sched_blkpt_id_t id); +int sched_blkpt_trigger(sched_blkpt_id_t blkpt, sched_blkpt_epoch_t epoch, int single); +int sched_blkpt_block(sched_blkpt_id_t blkpt, sched_blkpt_epoch_t epoch, thdid_t dependency); static inline struct sl_thd * sl_thd_curr(void) { - return sl_thd_lkup(sl_thdid()); + struct sl_thd *t = (struct sl_thd *)cos_get_slthd_ptr(); + + if (likely(t)) return t; + + t = sl_thd_lkup(sl_thdid()); + cos_set_slthd_ptr((void *)t); + + return t; } /* are we the owner of the critical section? */ static inline int sl_cs_owner(void) { - return sl__globals_cpu()->lock.u.s.owner == sl_thd_thdcap(sl_thd_curr()); + return sl__globals_core()->lock.u.s.owner == sl_thd_thdcap(sl_thd_curr()); } /* ...not part of the public API */ @@ -147,7 +175,7 @@ sl_cs_owner(void) * -ve from cos_defswitch failure, allowing caller for ex: the scheduler thread to * check if it was -EBUSY to first recieve pending notifications before retrying lock. 
*/ -int sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, thdcap_t curr, sched_tok_t tok); +int sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_core *gcore, struct sl_thd *curr, sched_tok_t tok); /* * @csi: current critical section value * @cached: a cached copy of @csi @@ -155,28 +183,28 @@ int sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, * * @ret: returns 1 if we need a retry, 0 otherwise */ -int sl_cs_exit_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, sched_tok_t tok); +int sl_cs_exit_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_core *gcore, sched_tok_t tok); /* Enter into the scheduler critical section */ static inline int sl_cs_enter_nospin(void) { +#ifdef SL_CS + struct sl_global_core *gcore = sl__globals_core(); + struct sl_thd *t = sl_thd_curr(); union sl_cs_intern csi, cached; - struct sl_thd * t = sl_thd_curr(); - sched_tok_t tok; assert(t); - tok = cos_sched_sync(); - csi.v = sl__globals_cpu()->lock.u.v; + csi.v = gcore->lock.u.v; cached.v = csi.v; if (unlikely(csi.s.owner)) { - return sl_cs_enter_contention(&csi, &cached, sl_thd_thdcap(t), tok); + return sl_cs_enter_contention(&csi, &cached, gcore, t, cos_sched_sync()); } csi.s.owner = sl_thd_thdcap(t); - if (!ps_cas(&sl__globals_cpu()->lock.u.v, cached.v, csi.v)) return 1; - + if (!ps_upcas(&gcore->lock.u.v, cached.v, csi.v)) return 1; +#endif return 0; } @@ -211,22 +239,23 @@ sl_cs_enter_sched(void) static inline void sl_cs_exit(void) { +#ifdef SL_CS + struct sl_global_core *gcore = sl__globals_core(); union sl_cs_intern csi, cached; - sched_tok_t tok; assert(sl_cs_owner()); - retry: - tok = cos_sched_sync(); - csi.v = sl__globals_cpu()->lock.u.v; + csi.v = gcore->lock.u.v; cached.v = csi.v; if (unlikely(csi.s.contention)) { - if (sl_cs_exit_contention(&csi, &cached, tok)) goto retry; + if (sl_cs_exit_contention(&csi, &cached, gcore, 
cos_sched_sync())) goto retry; + return; } - if (!ps_cas(&sl__globals_cpu()->lock.u.v, cached.v, 0)) goto retry; + if (!ps_upcas(&gcore->lock.u.v, cached.v, 0)) goto retry; +#endif } /* @@ -270,9 +299,15 @@ int sl_thd_sched_wakeup_no_cs(struct sl_thd *t); /* wakeup thread and do not remove from timeout queue if blocked on timeout */ int sl_thd_wakeup_no_cs_rm(struct sl_thd *t); -void sl_thd_yield(thdid_t tid); +void sl_thd_yield_intern(thdid_t tid); +void sl_thd_yield_intern_timeout(cycles_t abs_timeout); + void sl_thd_yield_cs_exit(thdid_t tid); +int sl_thd_migrate_no_cs(struct sl_thd *t, cpuid_t core); +/* @return: 0 - success, -1 - failure */ +int sl_thd_migrate(thdid_t tid, cpuid_t core); + /* The entire thread allocation and free API */ struct sl_thd *sl_thd_alloc(cos_thd_fn_t fn, void *data); struct sl_thd *sl_thd_aep_alloc(cos_aepthd_fn_t fn, void *data, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax); @@ -282,8 +317,10 @@ struct sl_thd *sl_thd_aep_alloc(cos_aepthd_fn_t fn, void *data, int own_tcap, co */ struct sl_thd *sl_thd_comp_init(struct cos_defcompinfo *comp, int is_sched); -struct sl_thd *sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax); -struct sl_thd *sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv); +struct sl_thd *sl_thd_initaep_alloc_dcb(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, dcbcap_t dcap, microsec_t ipiwin, u32_t ipimax); +struct sl_thd *sl_thd_aep_alloc_ext_dcb(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, dcbcap_t dcap, dcboff_t doff, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv); +struct sl_thd 
*sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, vaddr_t *dcbaddr); +struct sl_thd *sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, vaddr_t *dcbaddr, arcvcap_t *extrcv); struct sl_thd *sl_thd_init_ext(struct cos_aep_info *aep, struct sl_thd *sched_thd); @@ -295,13 +332,13 @@ void sl_thd_param_set(struct sl_thd *t, sched_param_t sp); static inline microsec_t sl_cyc2usec(cycles_t cyc) { - return cyc / sl__globals_cpu()->cyc_per_usec; + return cyc / sl__globals_core()->cyc_per_usec; } static inline cycles_t sl_usec2cyc(microsec_t usec) { - return usec * sl__globals_cpu()->cyc_per_usec; + return usec * sl__globals_core()->cyc_per_usec; } static inline cycles_t @@ -333,14 +370,17 @@ void sl_timeout_period(cycles_t period); static inline cycles_t sl_timeout_period_get(void) { - return sl__globals_cpu()->period; + return sl__globals_core()->period; } static inline void sl_timeout_oneshot(cycles_t absolute_us) { - sl__globals_cpu()->timer_next = absolute_us; - sl__globals_cpu()->timeout_next = tcap_cyc2time(absolute_us); + struct sl_global_core *g = sl__globals_core(); + + g->timer_prev = g->timer_next; + g->timer_next = absolute_us; + g->timeout_next = tcap_cyc2time(absolute_us); } static inline void @@ -368,7 +408,7 @@ struct heap *sl_timeout_heap(void); static inline void sl_timeout_wakeup_expired(cycles_t now) { - if (!heap_size(sl_timeout_heap())) return; + if (likely(!heap_size(sl_timeout_heap()))) return; do { struct sl_thd *tp, *th; @@ -396,32 +436,173 @@ sl_thd_is_runnable(struct sl_thd *t) } static inline int -sl_thd_activate(struct sl_thd *t, sched_tok_t tok) +sl_thd_dispatch_kern(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr, tcap_time_t timeout, tcap_t tc, tcap_prio_t p) { - struct cos_defcompinfo *dci = 
cos_defcompinfo_curr_get(); - struct cos_compinfo *ci = &dci->ci; - struct sl_global_cpu *g = sl__globals_cpu(); + volatile struct cos_scb_info *scb = sl_scb_info_core(); + struct sl_global_core *g = sl__globals_core(); + struct cos_dcb_info *cd = sl_thd_dcbinfo(curr), *nd = sl_thd_dcbinfo(next); + word_t a = ((sl_thd_thdcap(next) + 1) << COS_CAPABILITY_OFFSET) + (tok >> 16); + word_t b = (tc << 16) | g->sched_rcv; + word_t S = (p << 32) >> 32; + word_t D = (((p << 16) >> 48) << 16) | ((tok << 16) >> 16); + word_t d = timeout; int ret = 0; - if (t->properties & SL_THD_PROPERTY_SEND) { + assert(curr != next); + if (unlikely(!cd || !nd)) return cos_switch(sl_thd_thdcap(next), sl_thd_tcap(next), next->prio, timeout, g->sched_rcv, tok); + + __asm__ __volatile__ ( \ + "pushl %%ebp\n\t" \ + "movl %%esp, %%ebp\n\t" \ + "movl $1f, (%%esi)\n\t" \ + "movl %%esp, 4(%%esi)\n\t" \ + "movl %%ecx, %%esi\n\t" \ + "movl $2f, %%ecx\n\t" \ + "sysenter\n\t" \ + "jmp 2f\n\t" \ + ".align 4\n\t" \ + "1:\n\t" \ + "movl $0, %%eax\n\t" \ + ".align 4\n\t" \ + "2:\n\t" \ + "popl %%ebp\n\t" \ + : "=a" (ret) + : "a" (a), "b" (b), "S" (cd), "D" (D), "d" (d), "c" (S) + : "memory", "cc"); + + scb = sl_scb_info_core(); + cd = sl_thd_dcbinfo(sl_thd_curr()); + cd->sp = 0; + if (unlikely(ps_load(&scb->sched_tok) != tok)) return -EAGAIN; + + return ret; +} + +static inline int +sl_thd_dispatch_usr(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) +{ + volatile struct cos_scb_info *scb = sl_scb_info_core(); + struct cos_dcb_info *cd = sl_thd_dcbinfo(curr), *nd = sl_thd_dcbinfo(next); + struct sl_global_core *g = sl__globals_core(); + + assert(curr != next); + if (unlikely(!cd || !nd)) return cos_defswitch(sl_thd_thdcap(next), next->prio, g->timeout_next, tok); + + /* + * jump labels in the asm routine: + * + * 1: slowpath dispatch using cos_thd_switch to switch to a thread + * if the dcb sp of the next thread is reset. 
+ * (inlined slowpath sysenter to debug preemption problem) + * + * 2: if user-level dispatch routine completed successfully so + * the register states still retained and in the dispatched thread + * we reset its dcb sp! + * + * 3: if user-level dispatch was either preempted in the middle + * of this routine or kernel at some point had to switch to a + * thread that co-operatively switched away from this routine. + * NOTE: kernel takes care of resetting dcb sp in this case! + * + * a simple cos_thd_switch() kind will disable timers! so, pass in the timeout anyway to + * slowpath thread switch! + */ + + __asm__ __volatile__ ( \ + "pushl %%ebp\n\t" \ + "movl %%esp, %%ebp\n\t" \ + "movl $2f, (%%eax)\n\t" \ + "movl %%esp, 4(%%eax)\n\t" \ + "cmp $0, 4(%%ebx)\n\t" \ + "je 1f\n\t" \ + "movl %%edx, (%%ecx)\n\t" \ + "movl 4(%%ebx), %%esp\n\t" \ + "jmp *(%%ebx)\n\t" \ + ".align 4\n\t" \ + "1:\n\t" \ + "movl $3f, %%ecx\n\t" \ + "movl %%edx, %%eax\n\t" \ + "inc %%eax\n\t" \ + "shl $16, %%eax\n\t" \ + "movl $0, %%ebx\n\t" \ + "movl %%esi, %%edx\n\t" \ + "movl $0, %%esi\n\t" \ + "movl $0, %%edi\n\t" \ + "sysenter\n\t" \ + "jmp 3f\n\t" \ + ".align 4\n\t" \ + "2:\n\t" \ + "movl $0, 4(%%ebx)\n\t" \ + ".align 4\n\t" \ + "3:\n\t" \ + "popl %%ebp\n\t" \ + : + : "a" (cd), "b" (nd), + "S" (g->timeout_next), "D" (tok), + "c" (&(scb->curr_thd)), "d" (sl_thd_thdcap(next)) + : "memory", "cc"); + + scb = sl_scb_info_core(); + if (unlikely(ps_load(&scb->sched_tok) != tok)) return -EAGAIN; + + return 0; +} + +static inline int +sl_thd_activate_c(struct sl_thd *t, sched_tok_t tok, tcap_time_t timeout, tcap_prio_t prio, struct sl_thd *curr, struct sl_global_core *g) +{ + if (unlikely(t->properties & SL_THD_PROPERTY_SEND)) { return cos_sched_asnd(t->sndcap, g->timeout_next, g->sched_rcv, tok); - } else if (t->properties & SL_THD_PROPERTY_OWN_TCAP) { - return cos_switch(sl_thd_thdcap(t), sl_thd_tcap(t), t->prio, - g->timeout_next, g->sched_rcv, tok); + } + + /* there is more events.. 
run scheduler again! */ + if (unlikely(cos_sched_ispending())) { + if (curr == g->sched_thd) return -EBUSY; + return sl_thd_dispatch_usr(g->sched_thd, tok, curr); + } + + if (unlikely(t->properties & SL_THD_PROPERTY_OWN_TCAP)) { + return sl_thd_dispatch_kern(t, tok, curr, timeout, sl_thd_tcap(t), prio == 0 ? t->prio : prio); + } + + /* TODO: there is something in the kernel that seem to disable timers..!! */ + /* WORKAROUND: idle thread is a big cpu hogger.. so make sure there is timeout set around switching to and away! */ + if (unlikely(curr == g->idle_thd || t == g->idle_thd)) { + return sl_thd_dispatch_kern(t, tok, curr, g->timeout_next, g->sched_tcap, prio); + } + + if (unlikely(timeout || prio)) { + return sl_thd_dispatch_kern(t, tok, curr, timeout, g->sched_tcap, prio); } else { - ret = cos_defswitch(sl_thd_thdcap(t), t->prio, t == g->sched_thd ? - TCAP_TIME_NIL : g->timeout_next, tok); - if (likely(t != g->sched_thd && t != g->idle_thd)) return ret; - if (unlikely(ret != -EPERM)) return ret; - - /* - * Attempting to activate scheduler thread or idle thread failed for no budget in it's tcap. - * Force switch to the scheduler with current tcap. - */ - return cos_switch(sl_thd_thdcap(g->sched_thd), 0, t->prio, 0, g->sched_rcv, tok); + assert(t != g->idle_thd); + return sl_thd_dispatch_usr(t, tok, curr); } } + +static inline int +sl_thd_activate(struct sl_thd *t, sched_tok_t tok, tcap_time_t timeout, tcap_prio_t prio) +{ + struct sl_global_core *g = sl__globals_core(); + + return sl_thd_activate_c(t, tok, timeout, prio, sl_thd_curr(), g); +} + +static inline int +sl_cs_exit_schedule_nospin_arg_c(struct sl_thd *curr, struct sl_thd *next) +{ + sched_tok_t tok; +#ifdef SL_CS + if (likely(!sl_cs_owner())) sl_cs_enter(); +#endif + tok = cos_sched_sync(); +#ifdef SL_CS + sl_cs_exit(); +#endif + return sl_thd_activate_c(next, tok, 0, 0, curr, sl__globals_core()); +} + +void sl_thd_replenish_no_cs(struct sl_thd *t, cycles_t now); /* * Do a few things: 1. 
take the critical section if it isn't already * taken, 2. call schedule to find the next thread to run, 3. release @@ -449,24 +630,24 @@ sl_thd_activate(struct sl_thd *t, sched_tok_t tok) static inline int sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) { - struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); - struct cos_compinfo *ci = &dci->ci; - struct sl_thd_policy *pt; - struct sl_thd * t; - struct sl_global_cpu *globals = sl__globals_cpu(); - sched_tok_t tok; - cycles_t now; - s64_t offset; - int ret; + struct sl_thd *t = to, *c = sl_thd_curr(); + struct sl_global_core *globals = sl__globals_core(); + sched_tok_t tok; +#ifdef SL_REPLENISH + cycles_t now; +#endif + s64_t offset; + int ret; /* Don't abuse this, it is only to enable the tight loop around this function for races... */ - if (unlikely(!sl_cs_owner())) sl_cs_enter(); +#ifdef SL_CS + if (likely(!sl_cs_owner())) sl_cs_enter(); +#endif tok = cos_sched_sync(); +#ifdef SL_REPLENISH now = sl_now(); - offset = (s64_t)(globals->timer_next - now); - if (globals->timer_next && offset <= 0) sl_timeout_expended(now, globals->timer_next); - sl_timeout_wakeup_expired(now); +#endif /* * Once we exit, we can't trust t's memory as it could be @@ -475,45 +656,153 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) * catch it. This is a little twitchy and subtle, so lets put * it in a function, here. 
*/ - if (unlikely(to)) { + if (likely(to)) { t = to; - if (!sl_thd_is_runnable(t)) to = NULL; + if (unlikely(!sl_thd_is_runnable(t))) to = NULL; } - if (likely(!to)) { - pt = sl_mod_schedule(); + if (unlikely(!to)) { + struct sl_thd_policy *pt = sl_mod_schedule(); + if (unlikely(!pt)) t = globals->idle_thd; else t = sl_mod_thd_get(pt); } + if (unlikely(!t)) t= globals->sched_thd; + +#ifdef SL_REPLENISH + sl_thd_replenish_no_cs(t, now); +#endif + + assert(t && sl_thd_is_runnable(t)); +#ifdef SL_CS + sl_cs_exit(); +#endif + if (unlikely(t == c)) return 0; + + ret = sl_thd_activate_c(t, tok, 0, 0, c, globals); + + /* + * one observation, in slowpath switch: + * if the kernel decides to switch over to scheduler thread and + * later at some point decides to resume this thread, the ret value + * from the syscall is probably 0, even though token has advanced and + * the switch this thread intended, did not go through. + * + * there is some wierd race in user-level thread switch: + * a thread sl_thd_block()'s itself and decides to switch to a runnable + * thread at user-level. + * if a preemption occurs and eventually this thread is resumed, + * for some reason the token check is not working well. + * + * what is more wierd is, even in slowpath sl_thd_activate(), I see that + * on return from syscall, this thread is not runnable. + * how is this possible? is there a race? i don't think so. + * only the current thread can block itself, of course this is not true for AEPs. + * But for non AEPs, I don't know why this triggers! + * + * I'll need to rethink about some possible scenario, perhaps some bug in the code + * that returns to this thread when it is not runnable. + * something!!!! + */ + if (unlikely(!sl_thd_is_runnable(c))) return -EAGAIN; + +#ifdef SL_REPLENISH + /* + * dispatch failed with -EPERM because tcap associated with thread t does not have budget. + * Block the thread until it's next replenishment and return to the scheduler thread. 
+ * + * If the thread is not replenished by the scheduler (replenished "only" by + * the inter-component delegations), block till next timeout and try again. + */ + if (unlikely(ret == -EPERM)) { + assert(t != globals->sched_thd && t != globals->idle_thd); + sl_thd_block_expiry(t); + if (unlikely(sl_thd_curr() != globals->sched_thd)) ret = sl_thd_activate(globals->sched_thd, tok, globals->timeout_next, 0); + } +#endif + /* either this thread is runnable at this point or a switch failed */ + assert(sl_thd_is_runnable(c) || ret); + + return ret; +} + +static inline int +sl_cs_exit_schedule_nospin_arg_timeout(struct sl_thd *to, cycles_t abs_timeout) +{ + struct sl_thd *t = to, *c = sl_thd_curr(); + struct sl_global_core *globals = sl__globals_core(); + sched_tok_t tok; + cycles_t now; + s64_t offset; + int ret; + struct cos_dcb_info *cb; + tcap_time_t timeout = 0; - if (t->properties & SL_THD_PROPERTY_OWN_TCAP && t->budget) { - assert(t->period); - assert(sl_thd_tcap(t) != globals->sched_tcap); + /* Don't abuse this, it is only to enable the tight loop around this function for races... */ +#ifdef SL_CS + if (likely(!sl_cs_owner())) sl_cs_enter(); +#endif - if (t->last_replenish == 0 || t->last_replenish + t->period <= now) { - tcap_res_t currbudget = 0; - cycles_t replenish = now - ((now - t->last_replenish) % t->period); + tok = cos_sched_sync(); + now = sl_now(); - ret = 0; - currbudget = (tcap_res_t)cos_introspect(ci, sl_thd_tcap(t), TCAP_GET_BUDGET); + offset = (s64_t)(globals->timer_next - now); + if (offset <= 0) sl_timeout_expended(now, globals->timer_next); + sl_timeout_wakeup_expired(now); - if (!cycles_same(currbudget, t->budget, SL_CYCS_DIFF) && currbudget < t->budget) { - tcap_res_t transfer = t->budget - currbudget; + /* + * Once we exit, we can't trust t's memory as it could be + * deallocated/modified, so cache it locally. If these values + * are out of date, the scheduler synchronization tok will + * catch it. 
This is a little twitchy and subtle, so lets put + * it in a function, here. + */ + if (likely(to)) { + t = to; + if (unlikely(!sl_thd_is_runnable(t))) to = NULL; + } + if (unlikely(!to)) { + struct sl_thd_policy *pt = sl_mod_schedule(); - /* tcap_transfer will assign sched_tcap's prio to t's tcap if t->prio == 0, which we don't want. */ - assert(t->prio >= TCAP_PRIO_MAX && t->prio <= TCAP_PRIO_MIN); - ret = cos_tcap_transfer(sl_thd_rcvcap(t), globals->sched_tcap, transfer, t->prio); - } + if (unlikely(!pt)) + t = globals->idle_thd; + else + t = sl_mod_thd_get(pt); + } + if (unlikely(!t)) t= globals->sched_thd; + +#ifdef SL_REPLENISH + sl_thd_replenish_no_cs(t, now); +#endif - if (likely(ret == 0)) t->last_replenish = replenish; - } + assert(t && sl_thd_is_runnable(t)); + if (offset <= 0 || + (abs_timeout > now && abs_timeout > globals->timer_next + globals->cyc_per_usec)) { + timeout = offset <= 0 ? globals->timer_next : (abs_timeout > now ? tcap_cyc2time(abs_timeout) : 0); } - assert(sl_thd_is_runnable(t)); +#ifdef SL_CS sl_cs_exit(); +#endif + if (likely(c == t && t == globals->sched_thd && timeout)) { + /* program the new timer.. */ + return cos_defswitch(globals->sched_thdcap, globals->sched_thd->prio, timeout, tok); + } + if (unlikely(t == c)) return 0; - ret = sl_thd_activate(t, tok); + /* + * if the requested timeout is greater than next timeout + * and timer is already programmed to be over a usec away, don't + * reprogam it. + * + * else, reprogram for an earlier timeout requested. + */ + + ret = sl_thd_activate_c(t, tok, timeout, 0, c, globals); + if (unlikely(!sl_thd_is_runnable(c))) return -EAGAIN; + +#ifdef SL_REPLENISH /* * dispatch failed with -EPERM because tcap associated with thread t does not have budget. * Block the thread until it's next replenishment and return to the scheduler thread. 
@@ -524,8 +813,9 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) if (unlikely(ret == -EPERM)) { assert(t != globals->sched_thd && t != globals->idle_thd); sl_thd_block_expiry(t); - if (unlikely(sl_thd_curr() != globals->sched_thd)) ret = sl_thd_activate(globals->sched_thd, tok); + if (unlikely(sl_thd_curr() != globals->sched_thd)) ret = sl_thd_activate_c(globals->sched_thd, tok, globals->timeout_next, 0, c, globals); } +#endif return ret; } @@ -557,6 +847,41 @@ sl_cs_exit_switchto(struct sl_thd *to) } } +static inline int +sl_cs_exit_schedule_nospin_timeout(cycles_t abs_timeout) +{ + return sl_cs_exit_schedule_nospin_arg_timeout(NULL, abs_timeout); +} + +static inline void +sl_cs_exit_schedule_timeout(cycles_t abs_timeout) +{ + while (sl_cs_exit_schedule_nospin_timeout(abs_timeout) && sl_now() < abs_timeout) + ; +} + +static inline void +sl_cs_exit_switchto_timeout(struct sl_thd *to, cycles_t abs_timeout) +{ + /* + * We only try once, so it is possible that we don't end up + * switching to the desired thread. However, this is always a + * case that the caller has to consider if the current thread + * has a higher priority than the "to" thread. + */ + if (sl_cs_exit_schedule_nospin_arg_timeout(to, abs_timeout)) { + sl_cs_exit_schedule_timeout(abs_timeout); + } +} + +static inline void +sl_cs_exit_switchto_c(struct sl_thd *c, struct sl_thd *n) +{ + if (sl_cs_exit_schedule_nospin_arg_c(c, n)) { + sl_cs_exit_schedule(); + } +} + /* * Initialization protocol in cos_init: initialization of * library-internal data-structures, and then the ability for the @@ -571,7 +896,7 @@ void sl_init(microsec_t period); /* * @cpubmp - cpu/cores on which this scheduler will run on! */ -void sl_init_cpubmp(microsec_t period, u32_t *cpubmp); +void sl_init_corebmp(microsec_t period, u32_t *corebmp); /* * sl_sched_loop internally calls the kernel api - cos_sched_rcv * which blocks (suspends) the calling thread if there are no pending events. 
@@ -590,5 +915,114 @@ void sl_sched_loop(void) __attribute__((noreturn)); * booter receive (INITRCV) end-point at the kernel level. */ void sl_sched_loop_nonblock(void) __attribute__((noreturn)); +static inline void +sl_thd_yield_thd_c(struct sl_thd *c, struct sl_thd *n) +{ + if (likely(c && n)) sl_cs_exit_switchto_c(c, n); + else sl_thd_yield_intern(0); +} + +static inline void +sl_thd_yield_thd(struct sl_thd *n) +{ + if (likely(n)) sl_cs_exit_switchto(n); + else sl_thd_yield_intern(0); +} + +static inline void +sl_thd_yield(thdid_t tid) +{ + if (likely(tid)) { + sl_cs_enter(); + sl_cs_exit_switchto(sl_thd_lkup(tid)); + } else { + sl_thd_yield_intern(0); + } +} + +static inline void +sl_thd_yield_timeout(thdid_t tid, cycles_t abs_timeout) +{ + if (likely(tid)) { + sl_cs_enter(); + sl_cs_exit_switchto_timeout(sl_thd_lkup(tid), abs_timeout); + } else { + sl_thd_yield_intern_timeout(abs_timeout); + } +} + +static inline void +sl_thd_event_info_reset(struct sl_thd *t) +{ + t->event_info.blocked = 0; + t->event_info.elapsed_cycs = 0; + t->event_info.next_timeout = 0; + t->event_info.epoch = 0; +} + +static inline void +sl_thd_event_enqueue(struct sl_thd *t, struct cos_thd_event *e) +{ + struct sl_global_core *g = sl__globals_core(); + + assert(e->epoch); + if (e->epoch <= t->event_info.epoch) return; + + if (ps_list_singleton(t, SL_THD_EVENT_LIST)) ps_list_head_append(&g->event_head, t, SL_THD_EVENT_LIST); + + t->event_info.blocked = e->blocked; + t->event_info.elapsed_cycs += e->elapsed_cycs; + t->event_info.next_timeout = e->next_timeout; +} + +static inline void +sl_thd_event_dequeue(struct sl_thd *t, struct cos_thd_event *e) +{ + ps_list_rem(t, SL_THD_EVENT_LIST); + + e->blocked = t->event_info.blocked; + e->elapsed_cycs = t->event_info.elapsed_cycs; + e->next_timeout = t->event_info.next_timeout; + sl_thd_event_info_reset(t); +} + +static inline int +sl_thd_rcv(rcv_flags_t flags) +{ + return cos_ul_rcv(sl_thd_rcvcap(sl_thd_curr()), flags, 
sl__globals_core()->timeout_next); +// /* FIXME: elapsed_cycs accounting..?? */ +// struct cos_thd_event ev = { .blocked = 1, .next_timeout = 0, .epoch = 0, .elapsed_cycs = 0 }; +// struct sl_thd *t = sl_thd_curr(); +// unsigned long *p = &sl_thd_dcbinfo(t)->pending, q = 0; +// int ret = 0; +// +// assert(sl_thd_rcvcap(t)); +// assert(!(flags & RCV_ULSCHED_RCV)); +// +//recheck: +// if ((q = ps_load(p)) == 0) { +// if (!(flags & RCV_ULONLY)) { +// ret = cos_rcv(sl_thd_rcvcap(t), flags); +// q = ps_load(p); +// goto done; +// } +// if (unlikely(flags & RCV_NON_BLOCKING)) return -EAGAIN; +// +// sl_cs_enter(); +// ev.epoch = sl_now(); +// sl_thd_event_enqueue(t, &ev); +// sl_thd_sched_block_no_cs(t, SL_THD_BLOCKED, 0); +// sl_cs_exit_switchto(sl__globals_core()->sched_thd); +// goto recheck; +// //q = ps_load(p); +// } +// assert(sl_thd_dcbinfo(t)->sp == 0); +// assert(q == 1); /* q should be 1 if the thread did not call COS_RCV and is woken up.. */ +// +//done: +// ps_upcas(p, q, 0); +////if (cos_spd_id() != 4) printc("[R%u]", cos_thdid()); +// return ret; +} #endif /* SL_H */ diff --git a/src/components/include/sl_plugins.h b/src/components/include/sl_plugins.h index 0a7d22db3f..a5266f5bc9 100644 --- a/src/components/include/sl_plugins.h +++ b/src/components/include/sl_plugins.h @@ -16,6 +16,7 @@ */ struct sl_thd_policy *sl_thd_alloc_backend(thdid_t tid); void sl_thd_free_backend(struct sl_thd_policy *t); +struct sl_thd_policy *sl_thd_migrate_backend(struct sl_thd_policy *t, cpuid_t core); /* * cos_aep_info structs cannot be stack allocated! 
* The thread_alloc_backened needs to provide struct cos_aep_info without @@ -42,6 +43,8 @@ static inline struct sl_thd_policy *sl_mod_thd_policy_get(struct sl_thd *t); void sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles); struct sl_thd_policy *sl_mod_schedule(void); +/* give me the thread at the end of the run-queue */ +struct sl_thd_policy *sl_mod_last_schedule(void); void sl_mod_block(struct sl_thd_policy *t); void sl_mod_wakeup(struct sl_thd_policy *t); diff --git a/src/components/include/sl_thd.h b/src/components/include/sl_thd.h index bd1035f27c..632759087f 100644 --- a/src/components/include/sl_thd.h +++ b/src/components/include/sl_thd.h @@ -27,12 +27,6 @@ typedef enum { SL_THD_PROPERTY_SEND = (1<<1), /* use asnd to dispatch to this thread */ } sl_thd_property_t; -struct event_info { - int blocked; /* 1 - blocked. 0 - awoken */ - cycles_t cycles; - tcap_time_t timeout; -}; - struct sl_thd { sl_thd_state_t state; /* @@ -93,10 +87,27 @@ struct sl_thd { cycles_t wakeup_cycs; /* actual last wakeup - used in timeout API for jitter information, etc */ int timeout_idx; /* timeout heap index, used in timeout API */ - struct event_info event_info; + struct cos_thd_event event_info; struct ps_list SL_THD_EVENT_LIST; /* list of events for the scheduler end-point */ + + struct cos_dcb_info *dcb; + + void *part_context; /* used by the parallelism stuff! 
*/ + struct ps_list partlist; }; +static inline struct cos_dcb_info * +sl_thd_dcbinfo(struct sl_thd *t) +{ return t->dcb; } + +static inline unsigned long * +sl_thd_ip(struct sl_thd *t) +{ return &t->dcb->ip; } + +static inline unsigned long * +sl_thd_sp(struct sl_thd *t) +{ return &t->dcb->sp; } + static inline struct cos_aep_info * sl_thd_aepinfo(struct sl_thd *t) { return (t->aepinfo); } diff --git a/src/components/include/sl_xcore.h b/src/components/include/sl_xcore.h new file mode 100644 index 0000000000..b06d3c51b4 --- /dev/null +++ b/src/components/include/sl_xcore.h @@ -0,0 +1,193 @@ +#ifndef SL_XCORE_H +#define SL_XCORE_H + +#include +#include +#include +#include + +#define SL_XCORE_PARAM_MAX 4 +#define SL_XCORE_MIGRATE_MAX 16 +#define SL_XCORE_KEEP_MIN 4 + +typedef enum { + SL_XCORE_THD_ALLOC = 0, + SL_XCORE_THD_ALLOC_EXT, + SL_XCORE_AEP_ALLOC, + SL_XCORE_AEP_ALLOC_EXT, + SL_XCORE_INITAEP_ALLOC, + SL_XCORE_THD_DEALLOC, /* thread delete, need it? */ + + SL_XCORE_THD_PARAM_SET, + SL_XCORE_THD_WAKEUP, + + SL_XCORE_LOAD_BALANCE, +} sl_xcore_req_t; + +struct sl_xcore_response { + /* request type */ + sl_xcore_req_t type; /* set by the client requesting */ + /* response fields */ + volatile int resp_ready; + union { + struct { + thdid_t tid; + } sl_xcore_resp_thd_alloc; + struct { + unsigned nthds; + thdid_t tid[SL_XCORE_MIGRATE_MAX]; + } sl_xcore_resp_load_balance; + }; +}; + +struct sl_xcore_request { + sl_xcore_req_t type; /* request type */ + cpuid_t client_core; /* client cpu making the request */ + thdid_t client_thd; + struct sl_xcore_response *response; + + union { + struct { + cos_thd_fn_t fn; + void *data; + sched_param_t params[SL_XCORE_PARAM_MAX]; /* scheduling parameters */ + int param_count; /* number of parameters */ + } sl_xcore_req_thd_alloc; + struct { + cos_thd_fn_t fn; + void *data; + int own_tcap; + cos_channelkey_t key; + sched_param_t params[SL_XCORE_PARAM_MAX]; /* scheduling parameters */ + int param_count; /* number of parameters */ + } 
sl_xcore_req_aep_alloc; + struct { + thdclosure_index_t idx; /* TODO: create thread in another component ? */ + struct cos_defcompinfo *dci; + sched_param_t params[SL_XCORE_PARAM_MAX]; /* scheduling parameters */ + int param_count; /* number of parameters */ + } sl_xcore_req_thd_alloc_ext; + struct { + thdclosure_index_t idx; + int own_tcap; + cos_channelkey_t key; + struct cos_defcompinfo *dci; + sched_param_t params[SL_XCORE_PARAM_MAX]; /* scheduling parameters */ + int param_count; /* number of parameters */ + } sl_xcore_req_aep_alloc_ext; + struct { + int is_sched; + int own_tcap; + struct cos_defcompinfo *dci, *sched; + sched_param_t params[SL_XCORE_PARAM_MAX]; /* scheduling parameters */ + int param_count; /* number of parameters */ + } sl_xcore_req_initaep_alloc; + struct { + thdid_t tid; + sched_param_t param; + } sl_xcore_req_thd_param_set; + struct { + thdid_t tid; + } sl_xcore_req_thd_wakeup; + struct { + int nthds; /* if 0 - migrate as many as the src can */ + } sl_xcore_req_load_balance; + }; +}; + +CK_RING_PROTOTYPE(xcore, sl_xcore_request); + +#define SL_XCORE_RING_SIZE (64 * sizeof(struct sl_xcore_request)) /* in sl_const.h? */ + +/* + * TODO: unionize with sl_thd? + * + * IMHO, no! This will occupy too much memory if unionized! + * Plus, that would require that we'd need cpuid in the sl_thd and many + * branches around in the code for core-local scheduling! + * Also, making this struct explicit, makes API use explicit. + * I should only be able to use: param_set(), wakeup() and perhaps free(). + * + * Change my mind! This is a shit ton of wastage with CACHE_ALIGNED! 
+ */ +struct sl_xcore_thd { + thdid_t thd; + cpuid_t core; + + asndcap_t asnd[NUM_CPU]; +} CACHE_ALIGNED; + +struct sl_xcore_thd *sl_xcore_thd_lookup(thdid_t tid); +struct sl_xcore_thd *sl_xcore_thd_lookup_init(thdid_t tid, cpuid_t core); +static inline thdid_t +sl_xcore_thd_thdid(struct sl_xcore_thd *t) +{ + return t->thd; +} +static inline cpuid_t +sl_xcore_thd_core(struct sl_xcore_thd *t) +{ + return t->core; +} + +/* perhaps move these to sl.h? */ +struct sl_global { + struct ck_ring xcore_ring[NUM_CPU]; /* mpsc ring! */ + + struct sl_xcore_request xcore_rbuf[NUM_CPU][SL_XCORE_RING_SIZE]; + u32_t core_bmp[(NUM_CPU + 7)/8]; /* bitmap of cores this scheduler is running on! */ + asndcap_t xcore_asnd[NUM_CPU][NUM_CPU]; + unsigned nthds_running[NUM_CPU] CACHE_ALIGNED; + struct cos_scb_info *scb_area; +} CACHE_ALIGNED; + +extern struct sl_global sl_global_data; + +static inline struct sl_global * +sl__globals(void) +{ + return &sl_global_data; +} + +static inline struct ck_ring * +sl__ring(cpuid_t core) +{ + return &(sl__globals()->xcore_ring[core]); +} + +static inline struct ck_ring * +sl__ring_curr(void) +{ + return sl__ring(cos_cpuid()); +} + +static inline struct sl_xcore_request * +sl__ring_buffer(cpuid_t core) +{ + return (sl__globals()->xcore_rbuf[core]); +} + +static inline struct sl_xcore_request * +sl__ring_buffer_curr(void) +{ + return sl__ring_buffer(cos_cpuid()); +} + +static inline int +sl_core_active(void) +{ + return bitmap_check(sl__globals()->core_bmp, cos_cpuid()); +} + +struct sl_xcore_thd *sl_xcore_thd_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int nparams, sched_param_t params[]); +struct sl_xcore_thd *sl_xcore_thd_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, thdclosure_index_t idx, int nparams, sched_param_t params[]); +struct sl_xcore_thd *sl_xcore_aep_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); +struct sl_xcore_thd *sl_xcore_aep_alloc_ext(cpuid_t 
core, struct cos_defcompinfo *dci, thdclosure_index_t idx, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); +struct sl_xcore_thd *sl_xcore_initaep_alloc(cpuid_t core, struct cos_defcompinfo *dci, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); +struct sl_xcore_thd *sl_xcore_initaep_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, struct cos_defcompinfo *sched, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); +void sl_xcore_thd_param_set(struct sl_xcore_thd *t, sched_param_t param); +void sl_xcore_thd_wakeup(struct sl_xcore_thd *t); +void sl_xcore_thd_wakeup_tid(thdid_t tid, cpuid_t core); +int sl_xcore_load_balance(void); + +#endif /* SL_XCORE_H */ diff --git a/src/components/include/sl_xcpu.h b/src/components/include/sl_xcpu.h deleted file mode 100644 index f8c915e471..0000000000 --- a/src/components/include/sl_xcpu.h +++ /dev/null @@ -1,122 +0,0 @@ -/** - * Redistribution of this file is permitted under the BSD two clause license. - * - * Copyright 2018, The George Washington University - * Author: Phani Gadepalli, phanikishoreg@gwu.edu - */ - -#ifndef SL_XCPU_H -#define SL_XCPU_H - -#include -#include -#include -#include - -#define SL_XCPU_PARAM_MAX 4 - -typedef enum { - SL_XCPU_THD_ALLOC = 0, - SL_XCPU_THD_ALLOC_EXT, - SL_XCPU_AEP_ALLOC, - SL_XCPU_AEP_ALLOC_EXT, - SL_XCPU_INITAEP_ALLOC, - SL_XCPU_THD_DEALLOC, /* thread delete, need it? 
*/ -} sl_xcpu_req_t; - -struct sl_xcpu_request { - sl_xcpu_req_t type; /* request type */ - cpuid_t client; /* client cpu making the request */ - int req_response; /* client needs a response */ - sched_param_t params[SL_XCPU_PARAM_MAX]; /* scheduling parameters */ - int param_count; /* number of parameters */ - - union { - struct { - cos_thd_fn_t fn; - void *data; - } sl_xcpu_req_thd_alloc; - struct { - cos_thd_fn_t fn; - void *data; - int own_tcap; - cos_channelkey_t key; - } sl_xcpu_req_aep_alloc; - struct { - thdclosure_index_t idx; /* TODO: create thread in another component ? */ - struct cos_defcompinfo *dci; - } sl_xcpu_req_thd_alloc_ext; - struct { - thdclosure_index_t idx; - int own_tcap; - cos_channelkey_t key; - struct cos_defcompinfo *dci; - } sl_xcpu_req_aep_alloc_ext; - struct { - int is_sched; - int own_tcap; - struct cos_defcompinfo *dci, *sched; - } sl_xcpu_req_initaep_alloc; - }; -}; - -CK_RING_PROTOTYPE(xcpu, sl_xcpu_request); - -#define SL_XCPU_RING_SIZE (64 * sizeof(struct sl_xcpu_request)) /* in sl_const.h? */ - -/* perhaps move these to sl.h? */ -struct sl_global { - struct ck_ring xcpu_ring[NUM_CPU]; /* mpsc ring! */ - - struct sl_xcpu_request xcpu_rbuf[NUM_CPU][SL_XCPU_RING_SIZE]; - u32_t cpu_bmp[NUM_CPU_BMP_WORDS]; /* bitmap of cpus this scheduler is running on! 
*/ - asndcap_t xcpu_asnd[NUM_CPU][NUM_CPU]; -} CACHE_ALIGNED; - -extern struct sl_global sl_global_data; - -static inline struct sl_global * -sl__globals(void) -{ - return &sl_global_data; -} - -static inline int -sl_cpu_active(void) -{ - return bitmap_check(sl__globals()->cpu_bmp, cos_cpuid()); -} - -static inline struct ck_ring * -sl__ring(cpuid_t cpu) -{ - return &(sl__globals()->xcpu_ring[cpu]); -} - -static inline struct ck_ring * -sl__ring_curr(void) -{ - return sl__ring(cos_cpuid()); -} - -static inline struct sl_xcpu_request * -sl__ring_buffer(cpuid_t cpu) -{ - return (sl__globals()->xcpu_rbuf[cpu]); -} - -static inline struct sl_xcpu_request * -sl__ring_buffer_curr(void) -{ - return sl__ring_buffer(cos_cpuid()); -} - -/* perhaps move these to sl.h? */ -int sl_xcpu_thd_alloc(cpuid_t cpu, cos_thd_fn_t fn, void *data, sched_param_t params[]); -int sl_xcpu_thd_alloc_ext(cpuid_t cpu, struct cos_defcompinfo *dci, thdclosure_index_t idx, sched_param_t params[]); -int sl_xcpu_aep_alloc(cpuid_t cpu, cos_thd_fn_t fn, void *data, int own_tcap, cos_channelkey_t key, sched_param_t params[]); -int sl_xcpu_aep_alloc_ext(cpuid_t cpu, struct cos_defcompinfo *dci, thdclosure_index_t idx, int own_tcap, cos_channelkey_t key, sched_param_t params[]); -int sl_xcpu_initaep_alloc(cpuid_t cpu, struct cos_defcompinfo *dci, int own_tcap, cos_channelkey_t key, sched_param_t params[]); -int sl_xcpu_initaep_alloc_ext(cpuid_t cpu, struct cos_defcompinfo *dci, struct cos_defcompinfo *sched, int own_tcap, cos_channelkey_t key, sched_param_t params[]); - -#endif /* SL_XCPU_H */ diff --git a/src/components/include/stacklist.h b/src/components/include/stacklist.h new file mode 100644 index 0000000000..edb6f9fd54 --- /dev/null +++ b/src/components/include/stacklist.h @@ -0,0 +1,94 @@ +#ifndef STACKLIST_H +#define STACKLIST_H + +/** + * Modified to support multi-core via a Treiber stack. This is not 100% + * a great solution as it isn't FIFO. 
However, we release *all* + * threads when unlocking, so the priority scheduling should take over + * at that point. + */ + +#include +#include + +struct stacklist { + cpuid_t coreid; + thdid_t thdid; + struct stacklist *next; +}; + +struct stacklist_head { + struct stacklist *head; +}; + +static inline void +stacklist_init(struct stacklist_head *h) +{ + h->head = NULL; +} + +/* + * Remove a thread from the list that has been woken. Return 0 on + * success, and 1 if it could not be removed. + */ +static inline int +stacklist_rem(struct stacklist *l) +{ + /* + * Not currently supported with Trebor Stack. Threads that + * wake early still have to wait their turn. + */ + return 1; +} + +/* Add a thread that is going to block */ +static inline void +stacklist_add(struct stacklist_head *h, struct stacklist *l) +{ + l->coreid = cos_cpuid(); + l->thdid = cos_thdid(); + l->next = NULL; + assert(h); + + while (1) { + struct stacklist *n = ps_load(&h->head); + + l->next = n; + if (ps_cas((unsigned long *)&h->head, (unsigned long)n, (unsigned long)l)) break; + } +} + +/* Get a thread to wake up, and remove its record! */ +static inline thdid_t +stacklist_dequeue(cpuid_t *core, struct stacklist_head *h) +{ + struct stacklist *sl = NULL; + + /* + * Only a single thread should trigger an event, and dequeue + * threads, but we'll implement this conservatively. Given + * this, please note that this should *not* iterate more than + * once. + */ + do { + sl = ps_load(&h->head); + if (unlikely(!sl)) return 0; + } while (!ps_cas((unsigned long *)&h->head, (unsigned long)sl, (unsigned long)sl->next)); + sl->next = NULL; + *core = sl->coreid; + + return sl->thdid; +} + +/* + * A thread that wakes up after blocking using a stacklist should be + * able to assume that it is no longer on the list. This enables them + * to assert on that fact. 
+ */ +static inline int +stacklist_is_removed(struct stacklist *l) +{ + return l->next == NULL; +} + +#endif /* STACKLIST_H */ diff --git a/src/components/interface/capmgr/capmgr.h b/src/components/interface/capmgr/capmgr.h index eb0f85b6d4..09fc89acbf 100644 --- a/src/components/interface/capmgr/capmgr.h +++ b/src/components/interface/capmgr/capmgr.h @@ -13,10 +13,10 @@ thdcap_t capmgr_initthd_create(spdid_t child, thdid_t *tid); thdcap_t capmgr_initaep_create(spdid_t child, struct cos_aep_info *aep, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, asndcap_t *sndret); -thdcap_t capmgr_thd_create(cos_thd_fn_t fn, void *data, thdid_t *tid); -thdcap_t capmgr_aep_create(struct cos_aep_info *a, cos_aepthd_fn_t fn, void *data, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax); -thdcap_t capmgr_thd_create_ext(spdid_t child, thdclosure_index_t idx, thdid_t *tid); -thdcap_t capmgr_aep_create_ext(spdid_t child, struct cos_aep_info *a, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv); +thdcap_t capmgr_thd_create(cos_thd_fn_t fn, void *data, thdid_t *tid, struct cos_dcb_info **dcb); +thdcap_t capmgr_aep_create(struct cos_aep_info *a, cos_aepthd_fn_t fn, void *data, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, struct cos_dcb_info **dcb); +thdcap_t capmgr_thd_create_ext(spdid_t child, thdclosure_index_t idx, thdid_t *tid, struct cos_dcb_info **dcb); +thdcap_t capmgr_aep_create_ext(spdid_t child, struct cos_aep_info *a, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, struct cos_dcb_info **dcb, arcvcap_t *extrcv); thdcap_t capmgr_thd_retrieve(spdid_t child, thdid_t t, thdid_t *inittid); thdcap_t capmgr_thd_retrieve_next(spdid_t child, thdid_t *tid); arcvcap_t capmgr_rcv_create(spdid_t child, thdid_t tid, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax); @@ -24,4 +24,10 @@ asndcap_t capmgr_asnd_create(spdid_t child, 
thdid_t t); asndcap_t capmgr_asnd_rcv_create(arcvcap_t rcv); asndcap_t capmgr_asnd_key_create(cos_channelkey_t key); +int capmgr_thd_migrate(thdid_t tid, thdcap_t tc, cpuid_t core); + +int capmgr_hw_attach(hwid_t hwid, thdid_t tid); +int capmgr_hw_periodic_attach(hwid_t hwid, thdid_t tid, unsigned int period_us); +int capmgr_hw_detach(hwid_t hwid); + #endif /* CAPMGR_H */ diff --git a/src/components/interface/capmgr/stubs/c_stub.c b/src/components/interface/capmgr/stubs/c_stub.c index a808528adc..c008e15361 100644 --- a/src/components/interface/capmgr/stubs/c_stub.c +++ b/src/components/interface/capmgr/stubs/c_stub.c @@ -10,14 +10,16 @@ #include #include -thdcap_t capmgr_initthd_create_cserialized(thdid_t *tid, int *unused, spdid_t s); -thdcap_t capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, u32_t spdid_owntc, u32_t key_ipimax, u32_t ipiwin32b); -thdcap_t capmgr_thd_create_cserialized(thdid_t *tid, int *unused, thdclosure_index_t idx); -thdcap_t capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, u32_t spdid_owntc_thdidx, u32_t chkey_ipimax, u32_t ipiwin32b); -thdcap_t capmgr_thd_create_ext_cserialized(thdid_t *tid, int *unused, spdid_t s, thdclosure_index_t idx); -thdcap_t capmgr_aep_create_cserialized(thdid_t *tid, u32_t *tcrcvret, u32_t owntc_tidx, u32_t key_ipimax, u32_t ipiwin32b); -thdcap_t capmgr_thd_retrieve_next_cserialized(thdid_t *tid, int *unused, spdid_t s); -thdcap_t capmgr_thd_retrieve_cserialized(thdid_t *inittid, int *unused, spdid_t s, thdid_t tid); +thdcap_t capmgr_initthd_create_cserialized(thdid_t *tid, int *unused, spdid_t s); +thdcap_t capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, u32_t spdid_owntc, u32_t key_ipimax, u32_t ipiwin32b); +thdcap_t capmgr_thd_create_cserialized(struct cos_dcb_info **dcb, thdid_t *tid, thdclosure_index_t idx); +u32_t capmgr_aep_create_ext_cserialized(struct cos_dcb_info **dcb, u32_t *rcvtcret, u32_t spdid_owntc_thdidx, u32_t chkey_ipimax, u32_t 
ipiwin32b); +/* rcvcap for spdid = s shall be obtained through a separate call to capmgr! */ +arcvcap_t capmgr_aep_rcv_retrieve_cserialized(spdid_t s, thdid_t tid); +thdcap_t capmgr_thd_create_ext_cserialized(struct cos_dcb_info **dcb, thdid_t *tid, spdid_t s, thdclosure_index_t idx); +u32_t capmgr_aep_create_cserialized(struct cos_dcb_info **dcb, u32_t *rcvtcret, u32_t owntc_tidx, u32_t key_ipimax, u32_t ipiwin32b); +thdcap_t capmgr_thd_retrieve_next_cserialized(thdid_t *tid, int *unused, spdid_t s); +thdcap_t capmgr_thd_retrieve_cserialized(thdid_t *inittid, int *unused, spdid_t s, thdid_t tid); arcvcap_t capmgr_rcv_create_cserialized(u32_t spd_tid, u32_t key_ipimax, u32_t ipiwin32b); arcvcap_t @@ -33,50 +35,67 @@ capmgr_rcv_create(spdid_t child, thdid_t tid, cos_channelkey_t key, microsec_t i thdcap_t capmgr_thd_retrieve(spdid_t child, thdid_t tid, thdid_t *inittid) { - int unused; + int r1, r2, r3; - return capmgr_thd_retrieve_cserialized(inittid, &unused, child, tid); + r1 = capmgr_thd_retrieve_cserialized((thdid_t *)&r2, &r3, child, tid); + *inittid = r2; + + return r1; } thdcap_t capmgr_thd_retrieve_next(spdid_t child, thdid_t *tid) { - int unused; + int r1, r2, r3; + + r1 = capmgr_thd_retrieve_next_cserialized((thdid_t *)&r2, &r3, child); + *tid = r2; - return capmgr_thd_retrieve_next_cserialized(tid, &unused, child); + return r1; } thdcap_t capmgr_initthd_create(spdid_t child, thdid_t *tid) { - int unused; + int r1, r2, r3; + + r1 = capmgr_initthd_create_cserialized((thdid_t *)&r2, &r3, child); + *tid = r2; - return capmgr_initthd_create_cserialized(tid, &unused, child); + return r1; } thdcap_t -capmgr_thd_create(cos_thd_fn_t fn, void *data, thdid_t *tid) +capmgr_thd_create(cos_thd_fn_t fn, void *data, thdid_t *tid, struct cos_dcb_info **dcb) { - int unused; + int r1, r2, r3; thdclosure_index_t idx = cos_thd_init_alloc(fn, data); - if (idx < 1) return 0; + if (unlikely(idx < 1)) return 0; + + r1 = capmgr_thd_create_cserialized((struct cos_dcb_info **)&r2, 
(thdid_t *)&r3, idx); + *dcb = (struct cos_dcb_info *)r2; + *tid = r3; - return capmgr_thd_create_cserialized(tid, &unused, idx); + return r1; } thdcap_t -capmgr_thd_create_ext(spdid_t child, thdclosure_index_t idx, thdid_t *tid) +capmgr_thd_create_ext(spdid_t child, thdclosure_index_t idx, thdid_t *tid, struct cos_dcb_info **dcb) { - int unused; + int r1, r2, r3; - return capmgr_thd_create_ext_cserialized(tid, &unused, child, idx); + r1 = capmgr_thd_create_ext_cserialized((struct cos_dcb_info **)&r2, (thdid_t *)&r3, child, idx); + *tid = r3; + *dcb = (struct cos_dcb_info *)r2; + + return r1; } thdcap_t -capmgr_aep_create(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax) +capmgr_aep_create(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, struct cos_dcb_info **dcb) { - u32_t tcrcvret = 0; + u32_t tcrcvret = 0, thdtidret = 0; thdcap_t thd = 0; arcvcap_t rcv = 0; tcap_t tc = 0; @@ -88,8 +107,11 @@ capmgr_aep_create(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, int if (idx < 1) return 0; - thd = capmgr_aep_create_cserialized(&tid, &tcrcvret, owntc_idx, key_ipimax, ipiwin32b); - if (!thd) return 0; + thdtidret = capmgr_aep_create_cserialized(dcb, &tcrcvret, owntc_idx, key_ipimax, ipiwin32b); + if (!thdtidret) return 0; + thd = thdtidret >> 16; + tid = (thdtidret << 16) >> 16; + if (!thd || !tid) return 0; aep->fn = fn; aep->data = data; @@ -102,9 +124,9 @@ capmgr_aep_create(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, int } thdcap_t -capmgr_aep_create_ext(spdid_t child, struct cos_aep_info *aep, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +capmgr_aep_create_ext(spdid_t child, struct cos_aep_info *aep, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, struct cos_dcb_info **dcb, arcvcap_t *extrcv) 
{ - u32_t drcvtidret = 0; + u32_t thdtidret = 0; u32_t tcrcvret = 0; thdid_t tid = 0; thdcap_t thd = 0; @@ -112,16 +134,20 @@ capmgr_aep_create_ext(spdid_t child, struct cos_aep_info *aep, thdclosure_index_ u32_t key_ipimax = (key << 16) | ((ipimax << 16) >> 16); u32_t ipiwin32b = (u32_t)ipiwin; - thd = capmgr_aep_create_ext_cserialized(&drcvtidret, &tcrcvret, owntc_spdid_thdidx, key_ipimax, ipiwin32b); - if (!thd) return thd; + thdtidret = capmgr_aep_create_ext_cserialized(dcb, &tcrcvret, owntc_spdid_thdidx, key_ipimax, ipiwin32b); + if (!thdtidret) return thd; + thd = thdtidret >> 16; + tid = (thdtidret << 16) >> 16; + if (!thd || !tid) return 0; aep->fn = NULL; aep->data = NULL; aep->thd = thd; - aep->tid = (drcvtidret << 16) >> 16; + aep->tid = tid; aep->rcv = tcrcvret >> 16; aep->tc = (tcrcvret << 16) >> 16; - *extrcv = drcvtidret >> 16; + *extrcv = capmgr_aep_rcv_retrieve_cserialized(child, tid); + assert(*extrcv); return aep->thd; } diff --git a/src/components/interface/capmgr/stubs/s_stub.S b/src/components/interface/capmgr/stubs/s_stub.S index a7dfb0be87..4059d6a5db 100644 --- a/src/components/interface/capmgr/stubs/s_stub.S +++ b/src/components/interface/capmgr/stubs/s_stub.S @@ -14,12 +14,18 @@ cos_asm_server_stub_rets(capmgr_thd_create_cserialized) cos_asm_server_stub_rets(capmgr_aep_create_cserialized) cos_asm_server_stub_rets(capmgr_thd_create_ext_cserialized) cos_asm_server_stub_rets(capmgr_aep_create_ext_cserialized) +cos_asm_server_stub(capmgr_aep_rcv_retrieve_cserialized) cos_asm_server_stub_rets(capmgr_thd_retrieve_cserialized) cos_asm_server_stub_rets(capmgr_thd_retrieve_next_cserialized) cos_asm_server_stub(capmgr_rcv_create_cserialized) cos_asm_server_stub(capmgr_asnd_create) cos_asm_server_stub(capmgr_asnd_rcv_create) cos_asm_server_stub(capmgr_asnd_key_create) +cos_asm_server_stub(capmgr_thd_migrate) + +cos_asm_server_stub(capmgr_hw_attach) +cos_asm_server_stub(capmgr_hw_periodic_attach) +cos_asm_server_stub(capmgr_hw_detach) 
cos_asm_server_stub(memmgr_heap_page_allocn) cos_asm_server_stub_rets(memmgr_shared_page_allocn_cserialized) diff --git a/src/components/interface/crt/Makefile b/src/components/interface/crt/Makefile new file mode 100644 index 0000000000..6015b0c902 --- /dev/null +++ b/src/components/interface/crt/Makefile @@ -0,0 +1,4 @@ +LIB_OBJS= +LIBS=$(LIB_OBJS:%.o=%.a) + +include ../Makefile.subdir diff --git a/src/components/interface/crt/chan_crt.h b/src/components/interface/crt/chan_crt.h new file mode 100644 index 0000000000..2d93167c45 --- /dev/null +++ b/src/components/interface/crt/chan_crt.h @@ -0,0 +1,7 @@ +#ifndef CHAN_CRT_H +#define CHAN_CRT_H + +int chan_out(unsigned long item); +unsigned long chan_in(void); + +#endif /* CHAN_CRT_H */ diff --git a/src/components/interface/crt/stubs/s_stub.S b/src/components/interface/crt/stubs/s_stub.S new file mode 100644 index 0000000000..806aea9e19 --- /dev/null +++ b/src/components/interface/crt/stubs/s_stub.S @@ -0,0 +1,20 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include + +.text +cos_asm_server_stub(chan_out) +cos_asm_server_stub(chan_in) +//cos_asm_server_stub(chan_init) +//cos_asm_server_stub(chan_teardown) +//cos_asm_server_stub(chan_in_get) +//cos_asm_server_stub(chan_out_get) +//cos_asm_server_stub(chan_send) +//cos_asm_server_stub(chan_recv) +//cos_asm_server_stub(chan_async_send) +//cos_asm_server_stub(chan_async_recv) diff --git a/src/components/interface/work/Makefile b/src/components/interface/work/Makefile new file mode 100644 index 0000000000..800adb919e --- /dev/null +++ b/src/components/interface/work/Makefile @@ -0,0 +1,4 @@ +B_OBJS= +LIBS=$(LIB_OBJS:%.o=%.a) + +include ../Makefile.subdir diff --git a/src/components/interface/work/stubs/c_stub.c b/src/components/interface/work/stubs/c_stub.c new file mode 100644 index 0000000000..aafec59e63 --- /dev/null +++ b/src/components/interface/work/stubs/c_stub.c @@ -0,0 +1,37 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2018, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include + +int work_cycs_cserialized(unsigned long *hielpased, unsigned long *loelapsed, unsigned long hi_cycs, unsigned long lo_cycs); +int work_usecs_cserialized(unsigned long *hielpased, unsigned long *loelapsed, unsigned long hi_usecs, unsigned long lo_usecs); + +cycles_t +work_cycs(cycles_t ncycs) +{ + unsigned long hi_in, lo_in, hi_out, lo_out; + + hi_in = (ncycs >> 32); + lo_in = ((ncycs << 32) >> 32); + + work_cycs_cserialized(&hi_out, &lo_out, hi_in, lo_in); + + return (((cycles_t) hi_out << 32) | (cycles_t)lo_out); +} + +microsec_t +work_usecs(microsec_t nusecs) +{ + unsigned long hi_in, lo_in, hi_out, lo_out; + + hi_in = (nusecs >> 32); + lo_in = ((nusecs << 32) >> 32); + + work_usecs_cserialized(&hi_out, &lo_out, hi_in, lo_in); + + return (((microsec_t) hi_out << 32) | (microsec_t)lo_out); +} diff --git a/src/components/interface/work/stubs/s_stub.S b/src/components/interface/work/stubs/s_stub.S new file mode 100644 index 0000000000..d3245b4e75 --- /dev/null +++ b/src/components/interface/work/stubs/s_stub.S @@ -0,0 +1,12 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2018, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include + +.text +cos_asm_server_stub_rets(work_cycs_cserialized) +cos_asm_server_stub_rets(work_usecs_cserialized) diff --git a/src/components/interface/work/work.h b/src/components/interface/work/work.h new file mode 100644 index 0000000000..9768993ceb --- /dev/null +++ b/src/components/interface/work/work.h @@ -0,0 +1,12 @@ +#ifndef WORK_H +#define WORK_H + +#include + +/* @return: number of actual cycles elapsed */ +cycles_t work_cycs(cycles_t ncycs); +/* @return: number of actual usecs elapsed */ +microsec_t work_usecs(microsec_t nusecs); + + +#endif /* WORK_H */ diff --git a/src/components/lib/Makefile b/src/components/lib/Makefile index cba4d0e8a0..8eb222d861 100644 --- a/src/components/lib/Makefile +++ b/src/components/lib/Makefile @@ -1,6 +1,6 @@ include Makefile.src Makefile.comp -LIB_OBJS=heap.o cobj_format.o cos_kernel_api.o cos_defkernel_api.o cos_ubench.o +LIB_OBJS=heap.o cobj_format.o cos_kernel_api.o cos_defkernel_api.o cos_dcb.o part_raw.o part_capmgr.o cos_ubench.o LIBS=$(LIB_OBJS:%.o=%.a) MANDITORY=c_stub.o cos_asm_upcall.o cos_asm_ainv.o cos_component.o MAND=$(MANDITORY_LIB) @@ -10,8 +10,8 @@ SIMPLE_STKLIB=simple_stklib.o CINC_ENV=$(CINC) export CINC_ENV -.PHONY: all sl ps ck sinv -all: $(LIBS) $(MAND) $(SIMPLE_STKLIB) sl sinv +.PHONY: all sl ps ck sinv cos_gomp posix cxx +all: $(LIBS) $(MAND) $(SIMPLE_STKLIB) sl posix sinv cos_gomp # we have to compile these without dietlibc so that there are not # symbol conflicts and this is why we have the %.a here and don't @@ -30,6 +30,9 @@ $(SIMPLE_STKLIB): $(SIMPLE_STACKS) sl: make $(MAKEFLAGS) -C sl +cos_gomp: + make $(MAKEFLAGS) -C cos_gomp + sinv: make $(MAKEFLAGS) -C sinv_async @@ -42,15 +45,18 @@ sinv: @$(CC) $(CFLAGS) $(CINC) -o $@ -c $^ clean: - $(info | [RM] Cleaning up directory) + $(info | [RM] Cleaning up libraries and directories) @rm -f a.out *.o *.a *.d *~ - make -C sl clean 
+ @make -C sl clean + @make -C sinv_async clean + @make -C posix clean + @make -C cos_gomp clean distclean: + $(info | [RM] Uninstalling external libraries) make -C musl-1.1.11 distclean # keep the following commands in one line. make executes each line # with a new shell. - make -C posix clean make -C libcxx clean make -C ck uninstall @@ -62,12 +68,15 @@ musl: ps: cd ps; ./configure cos x86 general; cd ..; make -C ps config ; make -C ps all - ck: make -C ck all make -C ck install -init: clean distclean musl ck ps all -# keep the following commands in one line. Same as above. +posix: make -C posix + +cxx: make -C libcxx + +init: clean distclean musl ck ps cxx all +# keep the following commands in one line. Same as above. diff --git a/src/components/lib/cos_component.c b/src/components/lib/cos_component.c index 15f5ab3122..d40ca97cb3 100644 --- a/src/components/lib/cos_component.c +++ b/src/components/lib/cos_component.c @@ -200,13 +200,17 @@ cos_upcall_fn(upcall_type_t t, void *arg1, void *arg2, void *arg3) cos_thd_entry_exec(idx); } } - return; + break; } default: /* fault! */ assert(0); return; } + + /* FIXME: for now, don't let threads page-fault on return! 
*/ + while (1) ; + return; } diff --git a/src/components/lib/cos_dcb.c b/src/components/lib/cos_dcb.c new file mode 100644 index 0000000000..e73069af8f --- /dev/null +++ b/src/components/lib/cos_dcb.c @@ -0,0 +1,96 @@ +#include +#include +#include + +static struct cos_dcbinfo_data _cos_dcbinfo[NUM_CPU]; + +void +cos_dcb_info_init_ext(struct cos_dcbinfo_data *cdi, struct cos_compinfo *ci, + dcbcap_t initdcbcap, vaddr_t initdcbaddr, dcboff_t start_off) +{ + memset(cdi, 0, sizeof(struct cos_dcbinfo_data)); + + cdi->dcbcaps[0] = initdcbcap; + cdi->dcbaddr[0] = initdcbaddr; + cdi->curr_cap_off = start_off; + cdi->curr_cap = 0; +} + +void +cos_dcb_info_init(struct cos_dcbinfo_data *cdi, struct cos_compinfo *ci) +{ + if (cos_spd_id() == 0) { + cos_dcb_info_init_ext(cdi, ci, LLBOOT_CAPTBL_CPU_INITDCB, + (vaddr_t)cos_init_dcb_get(), 1); + } else { + cos_dcb_info_init_ext(cdi, ci, 0, 0, 0); + } +} + +void +cos_dcb_info_init_curr(void) +{ + cos_dcb_info_init_curr_ext(0, 0, 0); +} + +void +cos_dcb_info_init_curr_ext(dcbcap_t initdcbcap, vaddr_t initdcbaddr, dcboff_t st_off) +{ + struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); + + if (initdcbcap == 0 && initdcbaddr == 0) { + + if (cos_spd_id() == 0) { + cos_dcb_info_init_ext(&_cos_dcbinfo[cos_cpuid()], ci, + LLBOOT_CAPTBL_CPU_INITDCB, (vaddr_t)cos_init_dcb_get(), 1); + + return; + } else { + initdcbaddr = cos_page_bump_intern_valloc(ci, PAGE_SIZE); + assert(initdcbaddr); + initdcbcap = cos_dcb_alloc(ci, ci->pgtbl_cap, initdcbaddr); + assert(initdcbcap); + st_off = 0; + } + } + cos_dcb_info_init_ext(&_cos_dcbinfo[cos_cpuid()], ci, initdcbcap, initdcbaddr, st_off); +} + +dcbcap_t +cos_dcb_info_alloc_curr(dcboff_t *dcboff, vaddr_t *dcbaddr) +{ + return cos_dcb_info_alloc(&_cos_dcbinfo[cos_cpuid()], dcboff, dcbaddr); +} + +dcbcap_t +cos_dcb_info_alloc(struct cos_dcbinfo_data *cdi, dcboff_t *dcboff, vaddr_t *dcbaddr) +{ + if (unlikely(cdi->dcbcaps[cdi->curr_cap] == 0)) { + *dcboff = 0; + *dcbaddr = 0; + + 
return 0; + } + if (cdi->curr_cap_off >= COS_DCB_PERPG_MAX) { + int ret; + unsigned short curr_off = cdi->curr_cap; + + assert(curr_off + 1 < (unsigned short)COS_DCB_MAX_CAPS && cdi->dcbcaps[curr_off + 1] == 0); + + cdi->dcbaddr[curr_off + 1] = cos_page_bump_intern_valloc(cdi->ci, PAGE_SIZE); + assert(cdi->dcbaddr[curr_off + 1]); + cdi->dcbcaps[curr_off + 1] = cos_dcb_alloc(cos_compinfo_get(cos_defcompinfo_curr_get()), + cdi->ci->pgtbl_cap, cdi->dcbaddr[curr_off + 1]); + + assert(cdi->dcbcaps[curr_off + 1]); + ret = ps_cas((unsigned long *)&cdi->curr_cap, curr_off, curr_off + 1); + assert(ret); + ret = ps_cas((unsigned long *)&cdi->curr_cap_off, cdi->curr_cap_off, 0); + assert(ret); + } + + *dcboff = ps_faa((unsigned long *)&cdi->curr_cap_off, 1); + *dcbaddr = cdi->dcbaddr[cdi->curr_cap] + (sizeof(struct cos_dcb_info) * (*dcboff)); + + return cdi->dcbcaps[cdi->curr_cap]; +} diff --git a/src/components/lib/cos_defkernel_api.c b/src/components/lib/cos_defkernel_api.c index 68caf64dc1..ceed2c2fbf 100644 --- a/src/components/lib/cos_defkernel_api.c +++ b/src/components/lib/cos_defkernel_api.c @@ -46,6 +46,17 @@ cos_defcompinfo_init(void) } +void +cos_defcompinfo_llinit(void) +{ + if (curr_defci_init_status == INITIALIZED) return; + + cos_defcompinfo_init_ext(BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, + BOOT_CAPTBL_SELF_COMP, (vaddr_t)cos_get_heap_ptr(), LLBOOT_CAPTBL_FREE); + +} + void cos_defcompinfo_init_ext(tcap_t sched_tc, thdcap_t sched_thd, arcvcap_t sched_rcv, pgtblcap_t pgtbl_cap, captblcap_t captbl_cap, compcap_t comp_cap, vaddr_t heap_ptr, capid_t cap_frontier) @@ -87,7 +98,7 @@ cos_defcompinfo_sched_init(void) } static int -cos_aep_alloc_intern(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, tcap_t tc, struct cos_aep_info *sched, cos_aepthd_fn_t fn, void *data, thdclosure_index_t idx) +cos_aep_alloc_intern(struct cos_aep_info *aep, struct 
cos_defcompinfo *dst_dci, tcap_t tc, struct cos_aep_info *sched, cos_aepthd_fn_t fn, void *data, thdclosure_index_t idx, dcbcap_t dcbcap, dcboff_t dcboff) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(defci); @@ -97,9 +108,9 @@ cos_aep_alloc_intern(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, assert(curr_defci_init_status == INITIALIZED); memset(aep, 0, sizeof(struct cos_aep_info)); - if (is_init) aep->thd = cos_initthd_alloc(ci, dst_ci->comp_cap); - else if (idx > 0) aep->thd = cos_thd_alloc_ext(ci, dst_ci->comp_cap, idx); - else aep->thd = cos_thd_alloc(ci, dst_ci->comp_cap, cos_aepthd_fn, (void *)aep); + if (is_init) aep->thd = cos_initthd_alloc(ci, dst_ci->comp_cap, dcbcap); + else if (idx > 0) aep->thd = cos_thd_alloc_ext(ci, dst_ci->comp_cap, idx, dcbcap, dcboff); + else aep->thd = cos_thd_alloc(ci, dst_ci->comp_cap, cos_aepthd_fn, (void *)aep, dcbcap, dcboff); assert(aep->thd); aep->tid = cos_introspect(ci, aep->thd, THD_GET_TID); if (!sched && is_init) return 0; @@ -121,7 +132,7 @@ cos_aep_alloc_intern(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, int cos_defcompinfo_child_alloc(struct cos_defcompinfo *child_defci, vaddr_t entry, vaddr_t heap_ptr, capid_t cap_frontier, - int is_sched) + int is_sched, dcbcap_t *initdcbcap) { int ret; struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); @@ -129,11 +140,22 @@ cos_defcompinfo_child_alloc(struct cos_defcompinfo *child_defci, vaddr_t entry, struct cos_compinfo *ci = cos_compinfo_get(defci); struct cos_compinfo *child_ci = cos_compinfo_get(child_defci); struct cos_aep_info *child_aep = cos_sched_aep_get(child_defci); + vaddr_t dcbaddr = 0; + dcbcap_t dcbcap = 0; + scbcap_t scbcap = 0; + + scbcap = cos_scb_alloc(ci); + assert(scbcap); assert(curr_defci_init_status == INITIALIZED); - ret = cos_compinfo_alloc(child_ci, heap_ptr, cap_frontier, entry, ci); + ret = cos_compinfo_alloc(child_ci, scbcap, heap_ptr, cap_frontier, 
entry, ci); if (ret) return ret; - ret = cos_aep_alloc_intern(child_aep, child_defci, 0, is_sched ? sched_aep : NULL, NULL, NULL, 0); + dcbaddr = (vaddr_t)cos_page_bump_intern_valloc(child_ci, PAGE_SIZE); + assert(dcbaddr); + dcbcap = cos_dcb_alloc(ci, child_ci->pgtbl_cap, dcbaddr); + assert(dcbcap); + ret = cos_aep_alloc_intern(child_aep, child_defci, 0, is_sched ? sched_aep : NULL, NULL, NULL, 0, dcbcap, 0); + *initdcbcap = dcbcap; return ret; } @@ -147,29 +169,29 @@ cos_defcompinfo_childid_init(struct cos_defcompinfo *child_defci, spdid_t c) } int -cos_initaep_alloc(struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, int is_sched) +cos_initaep_alloc(struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, int is_sched, dcbcap_t dcap) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); struct cos_aep_info *child_aep = cos_sched_aep_get(dst_dci); struct cos_aep_info *sched_use = is_sched ? (sched ? sched : sched_aep) : NULL; - return cos_aep_alloc_intern(child_aep, dst_dci, 0, sched_use, NULL, NULL, 0); + return cos_aep_alloc_intern(child_aep, dst_dci, 0, sched_use, NULL, NULL, 0, dcap, 0); } int -cos_initaep_tcap_alloc(struct cos_defcompinfo *dst_dci, tcap_t tc, struct cos_aep_info *sched) +cos_initaep_tcap_alloc(struct cos_defcompinfo *dst_dci, tcap_t tc, struct cos_aep_info *sched, dcbcap_t dcap) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); struct cos_aep_info *child_aep = cos_sched_aep_get(dst_dci); struct cos_aep_info *sched_use = sched ? 
sched : sched_aep; - return cos_aep_alloc_intern(child_aep, dst_dci, tc, sched_use, NULL, NULL, 0); + return cos_aep_alloc_intern(child_aep, dst_dci, tc, sched_use, NULL, NULL, 0, dcap, 0); } int -cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, thdclosure_index_t idx) +cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, thdclosure_index_t idx, dcbcap_t dcap, dcboff_t doff) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); @@ -178,11 +200,11 @@ cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, str if (!sched) sched_aep = cos_sched_aep_get(dst_dci); else sched_aep = sched; - return cos_aep_alloc_intern(aep, dst_dci, 0, sched_aep, NULL, NULL, idx); + return cos_aep_alloc_intern(aep, dst_dci, 0, sched_aep, NULL, NULL, idx, dcap, doff); } int -cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, tcap_t tc, thdclosure_index_t idx) +cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, tcap_t tc, thdclosure_index_t idx, dcbcap_t dcap, dcboff_t doff) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); @@ -192,25 +214,25 @@ cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci if (!sched) sched_aep = cos_sched_aep_get(dst_dci); else sched_aep = sched; - return cos_aep_alloc_intern(aep, dst_dci, tc, sched_aep, NULL, NULL, idx); + return cos_aep_alloc_intern(aep, dst_dci, tc, sched_aep, NULL, NULL, idx, dcap, doff); } int -cos_aep_alloc(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data) +cos_aep_alloc(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, dcbcap_t dcap, dcboff_t doff) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct 
cos_aep_info *sched_aep = cos_sched_aep_get(defci); - return cos_aep_alloc_intern(aep, defci, 0, sched_aep, fn, data, 0); + return cos_aep_alloc_intern(aep, defci, 0, sched_aep, fn, data, 0, dcap, doff); } int -cos_aep_tcap_alloc(struct cos_aep_info *aep, tcap_t tc, cos_aepthd_fn_t fn, void *data) +cos_aep_tcap_alloc(struct cos_aep_info *aep, tcap_t tc, cos_aepthd_fn_t fn, void *data, dcbcap_t dcap, dcboff_t doff) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); - return cos_aep_alloc_intern(aep, defci, tc, sched_aep, fn, data, 0); + return cos_aep_alloc_intern(aep, defci, tc, sched_aep, fn, data, 0, dcap, doff); } int diff --git a/src/components/lib/cos_gomp/Makefile b/src/components/lib/cos_gomp/Makefile new file mode 100644 index 0000000000..ad4c1f75f9 --- /dev/null +++ b/src/components/lib/cos_gomp/Makefile @@ -0,0 +1,20 @@ +include Makefile.src Makefile.comp + +OBJS=cos_omp.o cos_gomp.o +LIB=cos_gomp +CINC+=-m32 + +.PHONY: all clean +all: $(LIB) + @cp *.a ../ + +%.o:%.c + $(info | [CC] Compiling C file $^ into $@) + @$(CC) $(CFLAGS) $(CINC) -o $@ -c $< + +$(LIB): $(OBJS) + $(info | [LD] Creating library file lib$(LIB).a) + @$(AR) cr lib$(LIB).a $^ + +clean: + @rm -f *.o *.a *.d diff --git a/src/components/lib/cos_gomp/cos_gomp.c b/src/components/lib/cos_gomp/cos_gomp.c new file mode 100644 index 0000000000..1c338c537b --- /dev/null +++ b/src/components/lib/cos_gomp/cos_gomp.c @@ -0,0 +1,385 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. + * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + * + * + * NOTE: There is no header file for this library! + * This is a backend for GOMP API in GCC and + * replaces LIBGOMP for composite! + */ + +#include +#include +#include +#include /* for now, single core lock! 
*/ +#include + +#include "cos_gomp.h" +#include +#include + +static struct crt_lock _glock; /* global lock for critical sections */ + +static inline struct part_task * +_cos_gomp_alloc_explicit(void) +{ + return part_task_alloc(0); +} + +void +cos_gomp_init(void) +{ + static int first_one = NUM_CPU, init_done = 0; + + if (ps_cas(&first_one, NUM_CPU, cos_cpuid())) { + crt_lock_init(&_glock); + cos_omp_init(); + init_done = 1; + } else { + while(!ps_load(&init_done)) ; + } + part_init(); +} + +static inline void +_gomp_parallel_start(struct part_task *pt, void (*fn) (void *), void *data, unsigned num_threads, unsigned flags) +{ + int parent_off; + struct sl_thd *t = sl_thd_curr(); + struct part_task *parent = (struct part_task *)t->part_context; + + if (parent) assert(ps_load(&in_main_parallel)); + + num_threads = (num_threads == 0 || num_threads > COS_GOMP_MAX_THDS) ? COS_GOMP_MAX_THDS : num_threads; + + /* nesting? */ +#if !defined(PART_ENABLE_NESTED) + if (unlikely(parent)) num_threads = 1; +#endif + + pt->state = PART_TASK_S_ALLOCATED; + part_task_init(pt, PART_TASK_T_WORKSHARE, parent, num_threads, fn, data, NULL); + assert(pt->nthds == num_threads); + if (unlikely(parent)) { + parent_off = part_task_add_child(parent, pt); + assert(parent_off >= 0); + } + t->part_context = pt; + /* should not append to workshare list if it's a task with nthds == 1 */ + part_list_append(pt); +} + +static inline void +_gomp_parallel_end(struct part_task *pt) +{ + /* implicit hard barrier. 
only master thread to deinit task and all other threads just go back to pool */ + part_task_end(pt); +} + +/* GOMP_parallel prototype from libgomp within gcc */ +void +GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, + unsigned int flags) +{ + struct part_task *prt = NULL; + struct part_task pt; + +#if defined(PART_ENABLE_NESTED) + prt = &pt; +#else + struct sl_thd *t = sl_thd_curr(); + struct part_task *parent = (struct part_task *)t->part_context; + + /* child parallel will not be nested, will be run by this thread and also not added to the global list */ + if(parent) prt = &pt; + else prt = &main_task; +#endif + + _gomp_parallel_start(prt, fn, data, num_threads, flags); + fn(data); + _gomp_parallel_end(prt); +} + +bool +GOMP_single_start(void) +{ + struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; + int i; + int coff = part_task_work_thd_num(t, PART_CURR_THD); + unsigned b = 1 << coff; + + assert(coff >= 0 && coff < (int)t->nthds); + for (i = t->ws_off[coff] + 1; i < PART_MAX_WORKSHARES; i++) { + struct part_workshare *pw = &t->ws[i]; + unsigned c; + + if (ps_load(&pw->type) == PART_WORKSHARE_NONE) { + /* perhaps one of the threads just converted it to a single */ + if (!ps_cas(&pw->type, PART_WORKSHARE_NONE, PART_WORKSHARE_SINGLE)) assert(pw->type == PART_WORKSHARE_SINGLE); + } + if (ps_load(&pw->type) != PART_WORKSHARE_SINGLE) continue; + +retry_bmp: + c = ps_load(&pw->worker_bmp); + /* if already went through this, should not have called start! */ + assert(!(c & b)); + + /* + * this thd, add to worker bmp to indicate it reached the construct. + * if this is the first to reach, then return "true", else "false". + * + * if cas failed, try again as you have to indicate that this thd + * has done this construct! + */ + if (ps_cas(&pw->worker_bmp, c, c | b)) { + t->ws_off[coff] = i; + + return c ? false : true; + } + goto retry_bmp; + } + + assert(0); /* exceed the number of workshares? 
*/ + + return false; +} + +void +GOMP_barrier (void) +{ + struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; + + part_task_barrier(t, 0); +} + +static inline bool +_gomp_loop_dynamic_next(struct part_task *t, struct part_workshare *w, long *s, long *e) +{ + long cn, left, wrk = 0; + +retry: + cn = ps_load(&w->next); + left = w->end - cn; + + if (unlikely(left == 0)) return false; + /* todo: incr <= 0 */ + assert(w->inc > 0); + + wrk = w->chunk_sz; + if (unlikely(left < wrk)) wrk = left; + if (!ps_cas(&w->next, cn, cn + wrk)) goto retry; + + *s = cn; + *e = cn + wrk; + + return true; +} + +bool +GOMP_loop_dynamic_start (long start, long end, long incr, long chunk_size, + long *istart, long *iend) +{ + struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; + int i; + int coff = part_task_work_thd_num(t, PART_CURR_THD); + unsigned b = 1 << coff; + + assert(coff >= 0 && coff < (int)t->nthds); + for (i = t->ws_off[coff] + 1; i < PART_MAX_WORKSHARES; i++) { + struct part_workshare *pw = &t->ws[i]; + unsigned c; + + if (ps_load(&pw->type) == PART_WORKSHARE_NONE) { + /* perhaps one of the threads just converted it to a loop */ + if (!ps_cas(&pw->type, PART_WORKSHARE_NONE, PART_WORKSHARE_LOOP_DYNAMIC)) assert(pw->type == PART_WORKSHARE_LOOP_DYNAMIC); + } + + if (ps_load(&pw->type) != PART_WORKSHARE_LOOP_DYNAMIC) continue; + +retry_bmp: + c = ps_load(&pw->worker_bmp); + /* if already went through this, should not have called start! */ + assert(!(c & b)); + + /* + * this thd, add to worker bmp to indicate it reached the construct. 
+ */ + if (ps_cas(&pw->worker_bmp, c, c | b)) t->ws_off[coff] = i; + else goto retry_bmp; + + /* all threads participating will initialize to the same values */ + if (unlikely(!pw->end)) { + pw->chunk_sz = chunk_size; + pw->inc = incr; + pw->st = start; + pw->end = end; + } + + if (likely(istart && iend)) return _gomp_loop_dynamic_next(t, pw, istart, iend); + else return true; + } + + assert(0); + + return false; +} + +void +GOMP_parallel_loop_dynamic (void (*fn) (void *), void *data, + unsigned num_threads, long start, long end, + long incr, long chunk_size, unsigned flags) +{ + struct part_task *prt = NULL; + struct part_task pt; + bool ret; + +#if defined(PART_ENABLE_NESTED) + prt = &pt; +#else + struct sl_thd *t = sl_thd_curr(); + struct part_task *parent = (struct part_task *)t->part_context; + + /* child parallel will not be nested, will be run by this thread and also not added to the global list */ + if (parent) prt = &pt; + else prt = &main_task; +#endif + + _gomp_parallel_start(prt, fn, data, num_threads, flags); + ret = GOMP_loop_dynamic_start(start, end, incr, chunk_size, NULL, NULL); + assert(ret == true); + + fn(data); + _gomp_parallel_end(prt); +} + +bool +GOMP_loop_dynamic_next (long *istart, long *iend) +{ + struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; + unsigned coff = part_task_work_thd_num(t, PART_CURR_THD); + int woff = t->ws_off[coff]; + + if (unlikely(woff < 0)) t->ws_off[coff] = woff = 0; + assert(t->ws[woff].type == PART_WORKSHARE_LOOP_DYNAMIC); + + return _gomp_loop_dynamic_next(t, &t->ws[woff], istart, iend); +} + +void +GOMP_loop_end (void) +{ + struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; + unsigned coff = part_task_work_thd_num(t, PART_CURR_THD); + int woff = t->ws_off[coff], c = 0; + + assert(t->ws[woff].type == PART_WORKSHARE_LOOP_DYNAMIC); + + part_task_barrier(t, 0); +} + +void +GOMP_loop_end_nowait (void) +{ + struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; + 
unsigned coff = part_task_work_thd_num(t, PART_CURR_THD); + int woff = t->ws_off[coff], c = 0; + + assert(t->ws[woff].type == PART_WORKSHARE_LOOP_DYNAMIC); +} + +void +GOMP_critical_start (void) +{ + crt_lock_take(&_glock); +} + +void +GOMP_critical_end (void) +{ + crt_lock_release(&_glock); +} + +void +GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), + long arg_size, long arg_align, bool if_clause, unsigned flags, + void **depend, int priority) +{ + struct part_task *parent = (struct part_task *)sl_thd_curr()->part_context; + int parent_off = -1, ret = -1; + + /* + * There should be nothing that prevents us to enqueue a task that + * has a dependency, in or out! + * The thread that pops this task should potentially do the dependency + * tracking before/after execution of the function. + */ + /* TODO: depend, flags, etc! */ + assert(depend == NULL); + + if (if_clause) { + struct part_task *pt; + struct part_data *d; + char *arg = NULL; + + pt = _cos_gomp_alloc_explicit(); + assert(pt); + d = part_data_alloc(); + assert(d); + + assert(pt && d); + assert(arg_size + arg_align - 1 <= PART_MAX_DATA); + memset(d->data, 0, PART_MAX_DATA); + arg = (char *) (((uintptr_t) d->data + arg_align - 1) + & ~(uintptr_t) (arg_align - 1)); + if (cpyfn) cpyfn(arg, data); + else memcpy(arg, data, arg_size); + + assert(parent); + part_task_init(pt, PART_TASK_T_TASK, parent, 1, fn, arg, d); + parent_off = part_task_add_child(parent, pt); + assert(parent_off >= 0); + assert(pt->type == PART_TASK_T_TASK); + + do { + ret = part_deque_push(pt); + } while (ret == -EAGAIN); + assert(ret == 0); + /* wake up a thread that might potentially run this workload */ + part_pool_wakeup(); + } else { + /* if_clause is false, task is an included/undeferred task */ + struct part_task pt; + + assert(parent); + part_task_init(&pt, PART_TASK_T_TASK, parent, 1, fn, data, NULL); + parent_off = part_task_add_child(parent, &pt); + assert(parent_off >= 0); + sl_thd_curr()->part_context 
= &pt; + pt.workers[0] = PART_CURR_THD; + + if (cpyfn) { + char buf[arg_size + arg_align - 1]; + char *arg = (char *) (((uintptr_t) buf + arg_align - 1) + & ~(uintptr_t) (arg_align - 1)); + + cpyfn(arg, data); + fn(arg); + } else { + fn(data); + } + + part_task_end(&pt); + sl_thd_curr()->part_context = pt.parent; + } +} + +void +GOMP_taskwait (void) +{ + struct part_task *t = sl_thd_curr()->part_context; + + part_task_wait_children(t); + /* no barriers of course! */ +} diff --git a/src/components/lib/cos_gomp/cos_gomp.h b/src/components/lib/cos_gomp/cos_gomp.h new file mode 100644 index 0000000000..3cce60a1fe --- /dev/null +++ b/src/components/lib/cos_gomp/cos_gomp.h @@ -0,0 +1,11 @@ +#ifndef COS_GOMP_H +#define COS_GOMP_H + +#include + +#define COS_GOMP_MAX_THDS PART_MAX_PAR_THDS +#define COS_GOMP_CORE_MAX_THDS PART_MAX_CORE_THDS +#define COS_GOMP_MAX_CHILD PART_MAX_CHILD +#define COS_GOMP_MAX_TASKS PART_MAX_TASKS + +#endif /* COS_GOMP_H */ diff --git a/src/components/lib/cos_gomp/cos_omp.c b/src/components/lib/cos_gomp/cos_omp.c new file mode 100644 index 0000000000..b74ea94785 --- /dev/null +++ b/src/components/lib/cos_gomp/cos_omp.c @@ -0,0 +1,141 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include +#include +#include +#include +#include + +#define COS_OMP_NUM_DEVS 1 + +static struct cos_icv_global_env cos_icv_glbenv; +static struct cos_icv_device_env cos_icv_devenv[COS_OMP_NUM_DEVS]; +static struct cos_icv_data_env cos_icv_init_dataenv; +static struct cos_icv_implicittask_env cos_icv_init_implitskenv; +static unsigned int _cos_omp_init_done = 0; +static unsigned int _cycs_per_usec = 0; + +#define _USEC_TO_SEC_d(x) (((double)x)/(double)(1000*1000)) +#define _CYCS_TO_SEC_d(x) _USEC_TO_SEC_d((x)/(double)_cycs_per_usec) + +__GOMP_NOTHROW double +omp_get_wtime(void) +{ + cycles_t now; + + rdtscll(now); + return _CYCS_TO_SEC_d(now); +} + +__GOMP_NOTHROW int +omp_get_num_procs(void) +{ + return NUM_CPU; +} + +__GOMP_NOTHROW int +omp_get_max_threads(void) +{ + return COS_GOMP_MAX_THDS; +} + +__GOMP_NOTHROW int +omp_get_num_threads(void) +{ + struct sl_thd *t = sl_thd_curr(); + struct part_task *pt = (struct part_task *)t->part_context; + + if (pt) return pt->nthds; + + return 1; +} + +__GOMP_NOTHROW int +omp_get_thread_num(void) +{ + struct sl_thd *t = sl_thd_curr(); + struct part_task *pt = (struct part_task *)t->part_context; + + if (!pt) return 0; + + return part_task_work_thd_num(pt, PART_CURR_THD); +} + +static inline void +cos_omp_icv_global_init(void) +{ + assert(!_cos_omp_init_done); + /* TODO: what is not int? what is not zero? */ + /* cos_icv_glbenv.xxxx = yyyy; */ +} + +void +cos_omp_icv_data_init(struct cos_icv_data_env *icvde) +{ + if (unlikely(icvde == &cos_icv_init_dataenv)) { + assert(!_cos_omp_init_done); /* init only on startup! */ + + /* TODO: what is not int? what is not zero! 
*/ + return; + } + + assert(_cos_omp_init_done); + memcpy(icvde, &cos_icv_init_dataenv, sizeof(struct cos_icv_data_env)); +} + +void +cos_omp_icv_implitsk_init(struct cos_icv_implicittask_env *icvite) +{ + if (unlikely(icvite == &cos_icv_init_implitskenv)) { + assert(!_cos_omp_init_done); /* init only on startup! */ + + /* TODO: what is not int? what is not zero! */ + return; + } + + assert(_cos_omp_init_done); + memcpy(icvite, &cos_icv_init_implitskenv, sizeof(struct cos_icv_implicittask_env)); +} + +void +cos_omp_icv_device_init(struct cos_icv_device_env *icvdve, unsigned dev_no) +{ + assert(dev_no < COS_OMP_NUM_DEVS); + + if (unlikely(icvdve == &cos_icv_devenv[dev_no])) { + assert(!_cos_omp_init_done); /* init only on startup! */ + + /* TODO: what is not int? what is not zero! */ + return; + } + + assert(_cos_omp_init_done); + memcpy(icvdve, &cos_icv_devenv[dev_no], sizeof(struct cos_icv_device_env)); +} + +static inline void +cos_omp_icv_init(void) +{ + cos_omp_icv_global_init(); + + cos_omp_icv_device_init(&cos_icv_devenv[0], 0); + + cos_omp_icv_data_init(&cos_icv_init_dataenv); + cos_omp_icv_implitsk_init(&cos_icv_init_implitskenv); +} + +void +cos_omp_init(void) +{ + _cycs_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + + assert(_cycs_per_usec); + + cos_omp_icv_init(); + _cos_omp_init_done = 1; +} diff --git a/src/components/lib/cos_kernel_api.c b/src/components/lib/cos_kernel_api.c index 54e5afc647..8b0e8d4cd5 100644 --- a/src/components/lib/cos_kernel_api.c +++ b/src/components/lib/cos_kernel_api.c @@ -34,7 +34,7 @@ __compinfo_metacap(struct cos_compinfo *ci) static inline void cos_vasfrontier_init(struct cos_compinfo *ci, vaddr_t heap_ptr) { - ci->vas_frontier = heap_ptr; + ci->vas_frontier = heap_ptr; /* * The first allocation should trigger PTE allocation, unless * it is in the middle of a PGD, in which case we assume one @@ -71,24 +71,23 @@ cos_capfrontier_init(struct cos_compinfo *ci, capid_t cap_frontier) void 
cos_compinfo_init(struct cos_compinfo *ci, pgtblcap_t pgtbl_cap, captblcap_t captbl_cap, compcap_t comp_cap, - vaddr_t heap_ptr, capid_t cap_frontier, struct cos_compinfo *ci_resources) + vaddr_t heap_ptr, capid_t cap_frontier, struct cos_compinfo *ci_resources) { assert(ci && ci_resources); assert(cap_frontier % CAPMAX_ENTRY_SZ == 0); ci->memsrc = ci_resources; assert(ci_resources->memsrc == ci_resources); /* prevent infinite data-structs */ - - ci->pgtbl_cap = pgtbl_cap; - ci->captbl_cap = captbl_cap; - ci->comp_cap = comp_cap; - - cos_vasfrontier_init(ci, heap_ptr); - cos_capfrontier_init(ci, cap_frontier); - ps_lock_init(&ci->cap_lock); ps_lock_init(&ci->mem_lock); ps_lock_init(&ci->va_lock); + + ci->pgtbl_cap = pgtbl_cap; + ci->captbl_cap = captbl_cap; + ci->comp_cap = comp_cap; + + cos_capfrontier_init(ci, cap_frontier); + cos_vasfrontier_init(ci, heap_ptr); } /**************** [Memory Capability Allocation Functions] ***************/ @@ -469,7 +468,7 @@ __page_bump_mem_alloc(struct cos_compinfo *ci, vaddr_t *mem_addr, vaddr_t *mem_f struct cos_compinfo *meta = __compinfo_metacap(ci); size_t rounded; - printd("__page_bump_alloc\n"); + printd("__page_bump_mem_alloc\n"); assert(sz % PAGE_SIZE == 0); assert(meta == __compinfo_metacap(meta)); /* prevent unbounded structures */ @@ -506,8 +505,14 @@ __page_bump_valloc(struct cos_compinfo *ci, size_t sz) return ret_addr; } +vaddr_t +cos_page_bump_intern_valloc(struct cos_compinfo *ci, size_t sz) +{ + return __page_bump_valloc(ci, sz); +} + static vaddr_t -__page_bump_alloc(struct cos_compinfo *ci, size_t sz) +__page_bump_alloc(struct cos_compinfo *ci, size_t sz, int shared) { struct cos_compinfo *meta = __compinfo_metacap(ci); vaddr_t heap_vaddr, heap_cursor, heap_limit; @@ -532,7 +537,7 @@ __page_bump_alloc(struct cos_compinfo *ci, size_t sz) for (heap_cursor = heap_vaddr; heap_cursor < heap_limit; heap_cursor += PAGE_SIZE) { vaddr_t umem; - umem = __umem_bump_alloc(ci); + umem = shared ? 
__kmem_bump_alloc(ci) : __umem_bump_alloc(ci); if (!umem) return 0; /* Actually map in the memory. */ @@ -574,7 +579,7 @@ __alloc_mem_cap(struct cos_compinfo *ci, cap_t ct, vaddr_t *kmem, capid_t *cap) } static thdcap_t -__cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t init_data) +__cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t init_data, dcbcap_t dc, dcboff_t off) { vaddr_t kmem; capid_t cap; @@ -585,9 +590,11 @@ __cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t init if (__alloc_mem_cap(ci, CAP_THD, &kmem, &cap)) return 0; assert(!(init_data & ~((1 << 16) - 1))); - /* TODO: Add cap size checking */ - if (call_cap_op(ci->captbl_cap, CAPTBL_OP_THDACTIVATE, (init_data << 16) | cap, - __compinfo_metacap(ci)->mi.pgtbl_cap, kmem, comp)) + assert(!(off & ~((1 << 9) - 1))); + assert(kmem && (round_to_page(kmem) == kmem)); + + if (call_cap_op(ci->captbl_cap, CAPTBL_OP_THDACTIVATE, __compinfo_metacap(ci)->mi.pgtbl_cap | (cap << 16), + kmem, comp << 16 | dc, off << 16 | init_data)) BUG(); return cap; @@ -596,30 +603,61 @@ __cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t init #include thdcap_t -cos_thd_alloc_ext(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t idx) +cos_thd_alloc_ext(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t idx, dcbcap_t dc, dcboff_t off) { if (idx < 1) return 0; - return __cos_thd_alloc(ci, comp, idx); + return __cos_thd_alloc(ci, comp, idx, dc, off); } thdcap_t -cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, cos_thd_fn_t fn, void *data) +cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, cos_thd_fn_t fn, void *data, dcbcap_t dc, dcboff_t off) { int idx = cos_thd_init_alloc(fn, data); thdcap_t ret; if (idx < 1) return 0; - ret = __cos_thd_alloc(ci, comp, idx); + ret = __cos_thd_alloc(ci, comp, idx, dc, off); if (!ret) cos_thd_init_free(idx); return ret; } thdcap_t -cos_initthd_alloc(struct cos_compinfo *ci, 
compcap_t comp) +cos_initthd_alloc(struct cos_compinfo *ci, compcap_t comp, dcbcap_t dc) +{ + return __cos_thd_alloc(ci, comp, 0, dc, 0); +} + +int +cos_thd_migrate(struct cos_compinfo *ci, thdcap_t t, cpuid_t c) +{ + return call_cap_op(ci->captbl_cap, CAPTBL_OP_THDMIGRATE, t, c, 0, 0); +} + +int +cos_thdcap_migrate(struct cos_compinfo *ci, thdcap_t t) +{ + return call_cap_op(ci->captbl_cap, CAPTBL_OP_THDMIGRATE, t, 0, 1, 0); +} + +dcbcap_t +cos_dcb_alloc(struct cos_compinfo *ci, pgtblcap_t ptcap, vaddr_t uaddr) { - return __cos_thd_alloc(ci, comp, 0); + vaddr_t kmem; + capid_t cap; + u32_t lid = livenessid_bump_alloc(); + + printd("cos_dcb_alloc\n"); + + assert(ci); + + if (__alloc_mem_cap(ci, CAP_DCB, &kmem, &cap)) return 0; + assert(kmem && (round_to_page(kmem) == kmem)); + if (call_cap_op(ci->captbl_cap, CAPTBL_OP_DCB_ACTIVATE, cap << 16 | lid, (__compinfo_metacap(ci)->mi.pgtbl_cap) << 16 | ptcap, kmem, uaddr)) + BUG(); + + return cap; } captblcap_t @@ -656,30 +694,53 @@ cos_pgtbl_alloc(struct cos_compinfo *ci) return cap; } +scbcap_t +cos_scb_alloc(struct cos_compinfo *ci) +{ + vaddr_t kmem; + capid_t cap; + u32_t lid = livenessid_bump_alloc(); + + printd("cos_scb_alloc\n"); + + assert(ci && lid); + + if (__alloc_mem_cap(ci, CAP_SCB, &kmem, &cap)) return 0; + assert(kmem && (round_to_page(kmem) == kmem)); + if (call_cap_op(ci->captbl_cap, CAPTBL_OP_SCB_ACTIVATE, cap, __compinfo_metacap(ci)->mi.pgtbl_cap, kmem, lid)) + BUG(); + + return cap; +} + compcap_t -cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, vaddr_t entry) +cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, scbcap_t scbc, vaddr_t entry, vaddr_t uaddr) { capid_t cap; + /* FIXME: same or diff liveness ids in scb and comp resources? 
*/ u32_t lid = livenessid_bump_alloc(); printd("cos_comp_alloc\n"); assert(ci && ctc && ptc && lid); + /* FIXME: packing scbc in 12 bits */ + assert(scbc < (1 << 12)); cap = __capid_bump_alloc(ci, CAP_COMP); if (!cap) return 0; - if (call_cap_op(ci->captbl_cap, CAPTBL_OP_COMPACTIVATE, cap, (ctc << 16) | ptc, lid, entry)) BUG(); + if (call_cap_op(ci->captbl_cap, CAPTBL_OP_COMPACTIVATE, (lid << 16) | cap, (ctc << 16) | ptc, uaddr | scbc, entry)) BUG(); return cap; } int -cos_compinfo_alloc(struct cos_compinfo *ci, vaddr_t heap_ptr, capid_t cap_frontier, vaddr_t entry, +cos_compinfo_alloc(struct cos_compinfo *ci, scbcap_t sc, vaddr_t heap_ptr, capid_t cap_frontier, vaddr_t entry, struct cos_compinfo *ci_resources) { pgtblcap_t ptc; captblcap_t ctc; compcap_t compc; + vaddr_t scb_vaddr; printd("cos_compinfo_alloc\n"); @@ -687,10 +748,14 @@ cos_compinfo_alloc(struct cos_compinfo *ci, vaddr_t heap_ptr, capid_t cap_fronti assert(ptc); ctc = cos_captbl_alloc(ci_resources); assert(ctc); - compc = cos_comp_alloc(ci_resources, ctc, ptc, entry); - assert(compc); + cos_compinfo_init(ci, ptc, ctc, 0, heap_ptr, cap_frontier, ci_resources); - cos_compinfo_init(ci, ptc, ctc, compc, heap_ptr, cap_frontier, ci_resources); + /* FIXME: make sure this is right at the start of heap_ptr! */ + scb_vaddr = (vaddr_t)__page_bump_valloc(ci, COS_SCB_SIZE); + assert(scb_vaddr); + compc = cos_comp_alloc(ci_resources, ctc, ptc, sc, entry, scb_vaddr); + assert(compc); + ci->comp_cap = compc; return 0; } @@ -779,10 +844,29 @@ cos_hw_alloc(struct cos_compinfo *ci, u32_t bitmap) return cap; } +/* TODO: Can we alias/etc on this page with this logic? 
*/ +void * +cos_dcbpg_bump_allocn(struct cos_compinfo *ci, size_t sz) +{ + assert(sz == PAGE_SIZE); + /* assert(sz % PAGE_SIZE == 0); */ + + return (void *)__page_bump_alloc(ci, sz, 1); +} + +void * +cos_scbpg_bump_allocn(struct cos_compinfo *ci, size_t sz) +{ + assert(sz == PAGE_SIZE); + /* assert(sz % PAGE_SIZE == 0); */ + + return (void *)__page_bump_alloc(ci, sz, 1); +} + void * cos_page_bump_alloc(struct cos_compinfo *ci) { - return (void *)__page_bump_alloc(ci, PAGE_SIZE); + return (void *)__page_bump_alloc(ci, PAGE_SIZE, 0); } void * @@ -790,7 +874,7 @@ cos_page_bump_allocn(struct cos_compinfo *ci, size_t sz) { assert(sz % PAGE_SIZE == 0); - return (void *)__page_bump_alloc(ci, sz); + return (void *)__page_bump_alloc(ci, sz, 0); } capid_t @@ -837,9 +921,7 @@ cos_thd_wakeup(thdcap_t thd, tcap_t tc, tcap_prio_t prio, tcap_res_t res) sched_tok_t cos_sched_sync(void) { - static sched_tok_t stok[NUM_CPU] CACHE_ALIGNED; - - return ps_faa((unsigned long *)&stok[cos_cpuid()], 1); + return ps_load(&cos_scb_info_get_core()->sched_tok); } int @@ -863,7 +945,7 @@ cos_asnd(asndcap_t snd, int yield) int cos_sched_rcv(arcvcap_t rcv, rcv_flags_t flags, tcap_time_t timeout, - int *rcvd, thdid_t *thdid, int *blocked, cycles_t *cycles, tcap_time_t *thd_timeout) + thdid_t *thdid, int *blocked, cycles_t *cycles, tcap_time_t *thd_timeout) { unsigned long thd_state = 0; unsigned long cyc = 0; @@ -875,16 +957,11 @@ cos_sched_rcv(arcvcap_t rcv, rcv_flags_t flags, tcap_time_t timeout, *thdid = (thdid_t)(thd_state & ((1 << (sizeof(thdid_t) * 8)) - 1)); *cycles = cyc; - if (ret >= 0 && flags & RCV_ALL_PENDING) { - *rcvd = (ret >> 1); - ret &= 1; - } - return ret; } int -cos_rcv(arcvcap_t rcv, rcv_flags_t flags, int *rcvd) +cos_rcv(arcvcap_t rcv, rcv_flags_t flags) { thdid_t tid = 0; int blocked; @@ -892,7 +969,7 @@ cos_rcv(arcvcap_t rcv, rcv_flags_t flags, int *rcvd) int ret; tcap_time_t thd_timeout; - ret = cos_sched_rcv(rcv, flags, 0, rcvd, &tid, &blocked, &cyc, &thd_timeout); + ret 
= cos_sched_rcv(rcv, flags, 0, &tid, &blocked, &cyc, &thd_timeout); assert(tid == 0); return ret; @@ -1033,6 +1110,14 @@ cos_hw_attach(hwcap_t hwc, hwid_t hwid, arcvcap_t arcv) return call_cap_op(hwc, CAPTBL_OP_HW_ATTACH, hwid, arcv, 0, 0); } +int +cos_hw_periodic_attach(hwcap_t hwc, hwid_t hwid, arcvcap_t arcv, unsigned int period) +{ + assert(hwid == HW_HPET_PERIODIC); + + return call_cap_op(hwc, CAPTBL_OP_HW_ATTACH, hwid, arcv, period, 0); +} + int cos_hw_detach(hwcap_t hwc, hwid_t hwid) { diff --git a/src/components/lib/part_capmgr.c b/src/components/lib/part_capmgr.c new file mode 100644 index 0000000000..9d09024af4 --- /dev/null +++ b/src/components/lib/part_capmgr.c @@ -0,0 +1,367 @@ +#include +#include +#include +#include +#include <../interface/capmgr/memmgr.h> +#include +#include + +#include +#include + +struct deque_part *part_dq_percore[NUM_CPU]; +//struct cirque_par parcq_global; +static volatile unsigned part_ready = 0; +volatile int in_main_parallel; +#if defined(PART_ENABLE_NESTED) +struct crt_lock part_l_lock; +struct ps_list_head part_l_global; +#else +struct part_task main_task; +#endif +//static struct part_task *part_tasks = NULL; +//static struct part_data *part__data = NULL; +struct ps_list_head part_thdpool_core[NUM_CPU]; + +#define PART_DEQUE_SZ PART_MAX_TASKS +#define _PART_PRIO TCAP_PRIO_MAX +#define _PART_PRIO_PACK() sched_param_pack(SCHEDP_PRIO, _PART_PRIO) + +#define _PART_IDLE_PRIO (_PART_PRIO+4) +#define _PART_IDLE_PRIO_PACK() sched_param_pack(SCHEDP_PRIO, _PART_IDLE_PRIO) + +//struct ps_slab * +//ps_slab_memmgr_alloc(struct ps_mem *m, size_t sz, coreid_t coreid) +//{ +// PRINTC("%s:%d\n", __func__, __LINE__); +// unsigned npages = round_up_to_page(sz) / PAGE_SIZE; +// vaddr_t addr = memmgr_heap_page_allocn(npages); +// +// assert(addr); +// memset((void *)addr, 0, npages * PAGE_SIZE); +// PRINTC("%s:%d\n", __func__, __LINE__); +// +// return (struct ps_slab *)addr; +//} +// +//void +//ps_slab_memmgr_free(struct ps_mem *m, struct 
ps_slab *s, size_t sz, coreid_t coreid) +//{ +// /* do nothing */ +//} + +/* this? */ +//PS_SLAB_CREATE_AFNS(parttask, sizeof(struct part_task), PART_TASKS_MAX_SZ, 0, ps_slab_memmgr_alloc, ps_slab_memmgr_free); +//PS_SLAB_CREATE_AFNS(partdata, sizeof(struct part_data), PART_DATA_MAX_SZ, 0, ps_slab_memmgr_alloc, ps_slab_memmgr_free); +/* or this. */ +//PS_SLAB_CREATE(parttask, sizeof(struct part_task), PART_TASKS_MAX_SZ) +//PS_SLAB_CREATE(partdata, sizeof(struct part_data), PART_DATA_MAX_SZ) + +/* for task pool, per core list. tasks in pool can migrate cores */ +struct parttask_head { + struct part_task *head; +}; + +static inline void +parttask_store_init(struct parttask_head *h) +{ + h->head = NULL; +} + +static inline void +parttask_store_add(struct parttask_head *h, struct part_task *l) +{ + struct part_task *n; + l->next_free = NULL; + + assert(h); + do { + n = ps_load(&h->head); + l->next_free = n; + } while (!ps_cas(&h->head, (unsigned long)n, (unsigned long)l)); +} + +static inline struct part_task * +parttask_store_dequeue(struct parttask_head *h) +{ + struct part_task *l = NULL; + + do { + l = ps_load(&h->head); + if (unlikely(!l)) return NULL; + } while (!ps_cas(&h->head, (unsigned long)l, (unsigned long)l->next_free)); + + l->next_free = NULL; + + return l; +} + +/* for task data, per core pool - task data could migrate pools. 
*/ +struct partdata_head { + struct part_data *head; +}; + +static inline void +partdata_store_init(struct partdata_head *h) +{ + h->head = NULL; +} + +static inline void +partdata_store_add(struct partdata_head *h, struct part_data *l) +{ + struct part_data *n = NULL; + l->next_free = NULL; + + assert(h); + do { + n = ps_load(&h->head); + + l->next_free = n; + } while (!ps_cas(&h->head, (unsigned long)n, (unsigned long)l)); +} + +static inline struct part_data * +partdata_store_dequeue(struct partdata_head *h) +{ + struct part_data *l = NULL; + + do { + l = ps_load(&h->head); + if (unlikely(!l)) return NULL; + } while (!ps_cas(&h->head, (unsigned long)l, (unsigned long)l->next_free)); + + l->next_free = NULL; + + return l; +} + +/* end treiber stacks */ +#define PART_TASKS_MAX_SZ round_up_to_page(PART_MAX_TASKS * sizeof(struct part_task)) +#define PART_MAX_PAGES (PART_TASKS_MAX_SZ / PAGE_SIZE) +#define PART_DATA_MAX_SZ round_up_to_page(PART_MAX_TASKS * sizeof(struct part_data)) +#define PART_MAX_DATA_PAGES (PART_DATA_MAX_SZ / PAGE_SIZE) +#define PART_DEQUE_MAX_PAGES (round_up_to_page(sizeof(struct deque_part)) / PAGE_SIZE) + +struct partdata_head pd_head[NUM_CPU]; + +static inline void +partdata_store_init_all(vaddr_t mem) +{ + int i; + + for (i = 0; i < NUM_CPU; i++) { + int j; + struct part_data *st = (struct part_data *)(mem + (PART_DATA_MAX_SZ * i)); + + partdata_store_init(&pd_head[i]); + + for (j = 0; j < PART_MAX_TASKS; j++) partdata_store_add(&pd_head[i], st + j); + } +} + +struct parttask_head pt_head[NUM_CPU]; + +static inline void +parttask_store_init_all(vaddr_t mem) +{ + int i; + + for (i = 0; i < NUM_CPU; i++) { + int j; + struct part_task *st = (struct part_task *)(mem + (PART_TASKS_MAX_SZ * i)); + + parttask_store_init(&pt_head[i]); + + for (j = 0; j < PART_MAX_TASKS; j++) parttask_store_add(&pt_head[i], st + j); + } +} + +/* idle thread to wakeup when there is nothing to do on this core! 
*/ +static void +part_idle_fn(void *d) +{ + struct sl_thd *sched = sl__globals_core()->sched_thd, *curr = sl_thd_curr(); + + while (1) { + /* + * TODO: threads could be woken up even if there is no work! + */ + if (likely(ps_load(&in_main_parallel))) part_pool_wakeup(); + sl_thd_yield_thd(sched); + } +} + +struct part_data * +part_data_alloc(void) +{ + struct part_data *d = partdata_store_dequeue(&pd_head[cos_cpuid()]); + + if (!d) return d; + if (!ps_cas(&d->flag, 0, 1)) assert(0); + + return d; +// int i; +// struct part_data *d = ps_slab_alloc_partdata(); +// +// if (!ps_cas(&d->flag, 0, 1)) assert(0); +// +// return d; +// for (i = 0; i < PART_MAX_TASKS; i++) { +// d = part__data + i; +// +// if (d->flag) continue; +// +// /* if this fails, someone else just alloced it! */ +// if (!ps_cas(&d->flag, 0, 1)) continue; +// +// return d; +// } +// +// return NULL; +} + +void +part_data_free(struct part_data *d) +{ + if (!ps_cas(&d->flag, 1, 0)) assert(0); + + partdata_store_add(&pd_head[cos_cpuid()], d); +// ps_slab_free_partdata(d); +// int f; +// +// if (!d) return; +// +// do { +// f = d->flag; +// assert(f); +// } while (!ps_cas(&d->flag, f, 0)); +} +struct part_task * +part_task_alloc(part_task_type_t type) +{ + struct part_task *t = parttask_store_dequeue(&pt_head[cos_cpuid()]); + + if (!t) return t; + + /* use upcas ? */ + if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) assert(0); + + return t; +// struct part_task *t = ps_slab_alloc_parttask(); +// +// if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) assert(0); +// +// return t; +// int i; +// struct part_task *t; +// +// for (i = 0; i < PART_MAX_TASKS; i++) { +// t = part_tasks + i; +// +// if (ps_load(&t->state) != PART_TASK_S_FREED) continue; +// +// /* if this fails, someone else just alloced it! 
*/ +// if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) continue; +// +// return t; +// } +// +// return NULL; +} + +void +part_task_free(struct part_task *t) +{ + if (!ps_cas(&t->state, PART_TASK_S_INITIALIZED, PART_TASK_S_FREED)) assert(0); + + parttask_store_add(&pt_head[cos_cpuid()], t); +// ps_slab_free_parttask(t); +// part_task_state_t s = 0; +// +// if (!t) return; +// +// do { +// s = ps_load(&t->state); +// if (s != PART_TASK_S_INITIALIZED) return; +// } while (!ps_cas(&t->state, s, PART_TASK_S_FREED)); +} + +unsigned +part_isready(void) +{ return (part_ready == NUM_CPU); } + +void +part_init(void) +{ + int k; + static volatile int is_first = NUM_CPU; + struct sl_thd *it = NULL; + struct sl_xcore_thd *xit = NULL; + sched_param_t ip = _PART_IDLE_PRIO_PACK(); + static volatile int all_done = 0; + + ps_list_head_init(&part_thdpool_core[cos_cpuid()]); + if (ps_cas(&is_first, NUM_CPU, cos_cpuid())) { + vaddr_t ptmem = 0, pdmem = 0; + + for (k = 0; k < NUM_CPU; k++) { + part_dq_percore[k] = (struct deque_part *)memmgr_heap_page_allocn(PART_DEQUE_MAX_PAGES); + assert(part_dq_percore[k]); + deque_init_part(part_dq_percore[k], PART_DEQUE_SZ); + } + ptmem = memmgr_heap_page_allocn(PART_MAX_PAGES * NUM_CPU); + assert(ptmem); + memset((void *)ptmem, 0, PART_MAX_PAGES * PAGE_SIZE * NUM_CPU); + + pdmem = memmgr_heap_page_allocn(PART_MAX_DATA_PAGES * NUM_CPU); + assert(pdmem); + memset((void *)pdmem, 0, PART_MAX_DATA_PAGES * PAGE_SIZE * NUM_CPU); + + partdata_store_init_all(pdmem); + parttask_store_init_all(ptmem); +// ps_slab_init_parttask(); +// ps_slab_init_partdata(); + +#if defined(PART_ENABLE_NESTED) + ps_list_head_init(&part_l_global); + crt_lock_init(&part_l_lock); +#else + memset(&main_task, 0, sizeof(main_task)); +#endif + in_main_parallel = 0; + } + + for (k = 0; k < PART_MAX_CORE_THDS; k++) { + struct sl_xcore_thd *x; + struct sl_thd *t; + sched_param_t p = _PART_PRIO_PACK(); + + t = sl_thd_alloc(part_thd_fn, NULL); + assert(t); + + 
sl_thd_param_set(t, p); + + x = sl_xcore_thd_lookup_init(sl_thd_thdid(t), cos_cpuid()); + assert(x); + } + +#ifdef PART_ENABLE_BLOCKING + sl_cs_enter(); + /* + * because it's fifo, all threads would go block + * themselves up as there is no work yet + * eventually returning to this main thread on core-0, + * and on all other cores, scheduler would be running! + */ + sl_cs_exit_schedule(); + it = sl_thd_alloc(part_idle_fn, NULL); + assert(it); + sl_thd_param_set(it, ip); +#endif + + ps_faa(&all_done, 1); + while (ps_load(&all_done) != NUM_CPU) ; + + ps_faa(&part_ready, 1); +} diff --git a/src/components/lib/part_raw.c b/src/components/lib/part_raw.c new file mode 100644 index 0000000000..273ce48f4a --- /dev/null +++ b/src/components/lib/part_raw.c @@ -0,0 +1,402 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct deque_part *part_dq_percore[NUM_CPU]; +//struct cirque_par parcq_global; +static volatile unsigned part_ready = 0; +volatile int in_main_parallel; +#if defined(PART_ENABLE_NESTED) +struct crt_lock part_l_lock; +struct ps_list_head part_l_global; +#else +struct part_task main_task; +#endif +//static struct part_task *part_tasks = NULL; +//static struct part_data *part__data = NULL; +struct ps_list_head part_thdpool_core[NUM_CPU]; + +#define PART_DEQUE_SZ PART_MAX_TASKS +#define _PART_PRIO TCAP_PRIO_MAX +#define _PART_PRIO_PACK() sched_param_pack(SCHEDP_PRIO, _PART_PRIO) + +#define _PART_IDLE_PRIO (_PART_PRIO+4) +#define _PART_IDLE_PRIO_PACK() sched_param_pack(SCHEDP_PRIO, _PART_IDLE_PRIO) + +//struct ps_slab * +//ps_slab_memmgr_alloc(struct ps_mem *m, size_t sz, coreid_t coreid) +//{ +// PRINTC("%s:%d\n", __func__, __LINE__); +// unsigned npages = round_up_to_page(sz) / PAGE_SIZE; +// vaddr_t addr = memmgr_heap_page_allocn(npages); +// +// assert(addr); +// memset((void *)addr, 0, npages * PAGE_SIZE); +// PRINTC("%s:%d\n", __func__, __LINE__); +// +// return (struct ps_slab *)addr; +//} +// 
+//void +//ps_slab_memmgr_free(struct ps_mem *m, struct ps_slab *s, size_t sz, coreid_t coreid) +//{ +// /* do nothing */ +//} + +/* this? */ +//PS_SLAB_CREATE_AFNS(parttask, sizeof(struct part_task), PART_TASKS_MAX_SZ, 0, ps_slab_memmgr_alloc, ps_slab_memmgr_free); +//PS_SLAB_CREATE_AFNS(partdata, sizeof(struct part_data), PART_DATA_MAX_SZ, 0, ps_slab_memmgr_alloc, ps_slab_memmgr_free); +/* or this. */ +//PS_SLAB_CREATE(parttask, sizeof(struct part_task), PART_TASKS_MAX_SZ) +//PS_SLAB_CREATE(partdata, sizeof(struct part_data), PART_DATA_MAX_SZ) + +/* for task pool, per core list. tasks in pool can migrate cores */ +struct parttask_head { + struct part_task *head; +}; + +static inline void +parttask_store_init(struct parttask_head *h) +{ + h->head = NULL; +} + +static inline void +parttask_store_add(struct parttask_head *h, struct part_task *l) +{ + struct part_task *n; + l->next_free = NULL; + + assert(h); + do { + n = ps_load(&h->head); + l->next_free = n; + } while (!ps_cas(&h->head, (unsigned long)n, (unsigned long)l)); +} + +static inline struct part_task * +parttask_store_dequeue(struct parttask_head *h) +{ + struct part_task *l = NULL; + + do { + l = ps_load(&h->head); + if (unlikely(!l)) return NULL; + } while (!ps_cas(&h->head, (unsigned long)l, (unsigned long)l->next_free)); + + l->next_free = NULL; + + return l; +} + +/* for task data, per core pool - task data could migrate pools. 
*/ +struct partdata_head { + struct part_data *head; +}; + +static inline void +partdata_store_init(struct partdata_head *h) +{ + h->head = NULL; +} + +static inline void +partdata_store_add(struct partdata_head *h, struct part_data *l) +{ + struct part_data *n = NULL; + l->next_free = NULL; + + assert(h); + do { + n = ps_load(&h->head); + + l->next_free = n; + } while (!ps_cas(&h->head, (unsigned long)n, (unsigned long)l)); +} + +static inline struct part_data * +partdata_store_dequeue(struct partdata_head *h) +{ + struct part_data *l = NULL; + + do { + l = ps_load(&h->head); + if (unlikely(!l)) return NULL; + } while (!ps_cas(&h->head, (unsigned long)l, (unsigned long)l->next_free)); + + l->next_free = NULL; + + return l; +} + +/* end treiber stacks */ +#define PART_TASKS_MAX_SZ round_up_to_page(PART_MAX_TASKS * sizeof(struct part_task)) +#define PART_MAX_PAGES (PART_TASKS_MAX_SZ / PAGE_SIZE) +#define PART_DATA_MAX_SZ round_up_to_page(PART_MAX_TASKS * sizeof(struct part_data)) +#define PART_MAX_DATA_PAGES (PART_DATA_MAX_SZ / PAGE_SIZE) +#define PART_DEQUE_MAX_SZ round_up_to_page(sizeof(struct deque_part)) +#define PART_DEQUE_MAX_PAGES (PART_DEQUE_MAX_SZ / PAGE_SIZE) + +struct partdata_head pd_head[NUM_CPU]; + +static inline void +partdata_store_init_all(vaddr_t mem) +{ + int i; + + for (i = 0; i < NUM_CPU; i++) { + int j; + struct part_data *st = (struct part_data *)(mem + (PART_DATA_MAX_SZ * i)); + + partdata_store_init(&pd_head[i]); + + for (j = 0; j < PART_MAX_TASKS; j++) partdata_store_add(&pd_head[i], st + j); + } +} + +static inline struct part_data * +partdata_store_dequeue_any(void) +{ + struct part_data *p = NULL; + int i = 0; + + for (i = 0; i < NUM_CPU; i++) { + p = partdata_store_dequeue(&pd_head[(cos_cpuid() + i) % NUM_CPU]); + + if (p) break; + } + + return p; +} + +struct parttask_head pt_head[NUM_CPU]; + +static inline void +parttask_store_init_all(vaddr_t mem) +{ + int i; + + for (i = 0; i < NUM_CPU; i++) { + int j; + struct part_task *st = 
(struct part_task *)(mem + (PART_TASKS_MAX_SZ * i)); + + parttask_store_init(&pt_head[i]); + + for (j = 0; j < PART_MAX_TASKS; j++) parttask_store_add(&pt_head[i], st + j); + } +} + +static inline struct part_task * +parttask_store_dequeue_any(void) +{ + struct part_task *p = NULL; + int i = 0; + + for (i = 0; i < NUM_CPU; i++) { + p = parttask_store_dequeue(&pt_head[(cos_cpuid() + i) % NUM_CPU]); + + if (p) break; + } + + return p; +} + +/* idle thread to wakeup when there is nothing to do on this core! */ +static void +part_idle_fn(void *d) +{ + struct sl_thd *sched = sl__globals_core()->sched_thd, *curr = sl_thd_curr(); + + while (1) { + /* + * TODO: threads could be woken up even if there is no work! + */ + if (likely(ps_load(&in_main_parallel))) part_pool_wakeup(); + sl_thd_yield_thd(sched); + } +} + +struct part_data * +part_data_alloc(void) +{ + struct part_data *d = partdata_store_dequeue_any(); + //struct part_data *d = partdata_store_dequeue(&pd_head[cos_cpuid()]); + + if (!d) return d; + if (!ps_cas(&d->flag, 0, 1)) assert(0); + + return d; +// int i; +// struct part_data *d = ps_slab_alloc_partdata(); +// +// if (!ps_cas(&d->flag, 0, 1)) assert(0); +// +// return d; +// for (i = 0; i < PART_MAX_TASKS; i++) { +// d = part__data + i; +// +// if (d->flag) continue; +// +// /* if this fails, someone else just alloced it! */ +// if (!ps_cas(&d->flag, 0, 1)) continue; +// +// return d; +// } +// +// return NULL; +} + +void +part_data_free(struct part_data *d) +{ + if (!ps_cas(&d->flag, 1, 0)) assert(0); + + partdata_store_add(&pd_head[cos_cpuid()], d); +// ps_slab_free_partdata(d); +// int f; +// +// if (!d) return; +// +// do { +// f = d->flag; +// assert(f); +// } while (!ps_cas(&d->flag, f, 0)); +} +struct part_task * +part_task_alloc(part_task_type_t type) +{ + struct part_task *t = parttask_store_dequeue_any(); + //struct part_task *t = parttask_store_dequeue(&pt_head[cos_cpuid()]); + + if (!t) return t; + + /* use upcas ? 
*/ + if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) assert(0); + + return t; +// struct part_task *t = ps_slab_alloc_parttask(); +// +// if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) assert(0); +// +// return t; +// int i; +// struct part_task *t; +// +// for (i = 0; i < PART_MAX_TASKS; i++) { +// t = part_tasks + i; +// +// if (ps_load(&t->state) != PART_TASK_S_FREED) continue; +// +// /* if this fails, someone else just alloced it! */ +// if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) continue; +// +// return t; +// } +// +// return NULL; +} + +void +part_task_free(struct part_task *t) +{ + if (!ps_cas(&t->state, PART_TASK_S_INITIALIZED, PART_TASK_S_FREED)) assert(0); + + parttask_store_add(&pt_head[cos_cpuid()], t); +// ps_slab_free_parttask(t); +// part_task_state_t s = 0; +// +// if (!t) return; +// +// do { +// s = ps_load(&t->state); +// if (s != PART_TASK_S_INITIALIZED) return; +// } while (!ps_cas(&t->state, s, PART_TASK_S_FREED)); +} + +unsigned +part_isready(void) +{ return (part_ready == NUM_CPU); } + +void +part_init(void) +{ + struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); + int k; + static volatile int is_first = NUM_CPU; + struct sl_thd *it = NULL; + struct sl_xcore_thd *xit = NULL; + sched_param_t ip = _PART_IDLE_PRIO_PACK(); + static volatile int all_done = 0; + + ps_list_head_init(&part_thdpool_core[cos_cpuid()]); + if (ps_cas(&is_first, NUM_CPU, cos_cpuid())) { + vaddr_t ptmem = 0, pdmem = 0; + + for (k = 0; k < NUM_CPU; k++) { + part_dq_percore[k] = (struct deque_part *)cos_page_bump_allocn(ci, PART_DEQUE_MAX_SZ); + assert(part_dq_percore[k]); + deque_init_part(part_dq_percore[k], PART_DEQUE_SZ); + } + ptmem = (vaddr_t)cos_page_bump_allocn(ci, PART_TASKS_MAX_SZ * NUM_CPU); + assert(ptmem); + memset((void *)ptmem, 0, PART_MAX_PAGES * PAGE_SIZE * NUM_CPU); + + pdmem = (vaddr_t)cos_page_bump_allocn(ci, PART_DATA_MAX_SZ * NUM_CPU); + assert(pdmem); + memset((void 
*)pdmem, 0, PART_MAX_DATA_PAGES * PAGE_SIZE * NUM_CPU); + + partdata_store_init_all(pdmem); + parttask_store_init_all(ptmem); +// ps_slab_init_parttask(); +// ps_slab_init_partdata(); + +#if defined(PART_ENABLE_NESTED) + ps_list_head_init(&part_l_global); + crt_lock_init(&part_l_lock); +#else + memset(&main_task, 0, sizeof(main_task)); +#endif + in_main_parallel = 0; + } + + for (k = 0; k < PART_MAX_CORE_THDS; k++) { + struct sl_xcore_thd *x; + struct sl_thd *t; + sched_param_t p = _PART_PRIO_PACK(); + + t = sl_thd_alloc(part_thd_fn, NULL); + assert(t); + + sl_thd_param_set(t, p); + + x = sl_xcore_thd_lookup_init(sl_thd_thdid(t), cos_cpuid()); + assert(x); + } + +#ifdef PART_ENABLE_BLOCKING + sl_cs_enter(); + /* + * because it's fifo, all threads would go block + * themselves up as there is no work yet + * eventually returning to this main thread on core-0, + * and on all other cores, scheduler would be running! + */ + sl_cs_exit_schedule(); + it = sl_thd_alloc(part_idle_fn, NULL); + assert(it); + sl_thd_param_set(it, ip); +#endif + + ps_faa(&all_done, 1); + while (ps_load(&all_done) != NUM_CPU) ; + + ps_faa(&part_ready, 1); +} diff --git a/src/components/lib/posix/Makefile b/src/components/lib/posix/Makefile index c72f105cd7..90cdcd62a5 100644 --- a/src/components/lib/posix/Makefile +++ b/src/components/lib/posix/Makefile @@ -8,7 +8,7 @@ INC += -I../ps/ all: posix.o posix.o: posix.c - $(CC) $(INC) $< -o $@ -c $(CFLAGS) + @$(CC) $(INC) $< -o $@ -c $(CFLAGS) clean: - rm -f posix.o + @rm -f posix.o diff --git a/src/components/lib/posix/posix.c b/src/components/lib/posix/posix.c index 73166a7524..fc1e8c366b 100644 --- a/src/components/lib/posix/posix.c +++ b/src/components/lib/posix/posix.c @@ -362,7 +362,7 @@ struct sl_lock futex_lock = SL_LOCK_STATIC_INIT(); int cos_futex_wait(struct futex_data *futex, int *uaddr, int val, const struct timespec *timeout) { - cycles_t deadline; + cycles_t deadline = sl_now(); microsec_t wait_time; struct futex_waiter waiter = (struct 
futex_waiter) { .thdid = sl_thdid() diff --git a/src/components/lib/sinv_async/acom_client.c b/src/components/lib/sinv_async/acom_client.c index da67bb17e5..384c83b2bb 100644 --- a/src/components/lib/sinv_async/acom_client.c +++ b/src/components/lib/sinv_async/acom_client.c @@ -81,7 +81,7 @@ acom_client_request(struct sinv_async_info *s, acom_type_t t, word_t a, word_t b { struct sinv_thdinfo *tinfo = &s->cdata.cthds[cos_thdid()]; volatile unsigned long *reqaddr = (volatile unsigned long *)SINV_POLL_ADDR(tinfo->shmaddr); - int *retval = NULL, ret, rcvd = 0; + int *retval = NULL, ret; struct sinv_call_req *req = NULL; assert(t >= 0 && t < SINV_NUM_MAX); @@ -108,7 +108,7 @@ acom_client_request(struct sinv_async_info *s, acom_type_t t, word_t a, word_t b cos_asnd(tinfo->sndcap, 1); assert(tinfo->rcvcap); - while ((cos_rcv(tinfo->rcvcap, RCV_NON_BLOCKING | RCV_ALL_PENDING, &rcvd) < 0)) { + while ((cos_rcv(tinfo->rcvcap, RCV_NON_BLOCKING) < 0)) { cycles_t timeout = time_now() + time_usec2cyc(SINV_SRV_POLL_US); if (ps_load((unsigned long *)reqaddr) == SINV_REQ_RESET) break; diff --git a/src/components/lib/sinv_async/sinv_client.c b/src/components/lib/sinv_async/sinv_client.c index 501a98e3f6..031ed40f77 100644 --- a/src/components/lib/sinv_async/sinv_client.c +++ b/src/components/lib/sinv_async/sinv_client.c @@ -112,7 +112,7 @@ sinv_client_call_wrets(int wrets, struct sinv_async_info *s, sinv_num_t n, word_ */ cos_asnd(tinfo->sndcap, 1); - while ((tinfo->rcvcap && cos_rcv(tinfo->rcvcap, RCV_NON_BLOCKING, NULL) < 0) && (ps_load((unsigned long *)reqaddr) != SINV_REQ_RESET)) { + while ((tinfo->rcvcap && cos_rcv(tinfo->rcvcap, RCV_NON_BLOCKING) < 0) && (ps_load((unsigned long *)reqaddr) != SINV_REQ_RESET)) { cycles_t timeout = time_now() + time_usec2cyc(SINV_SRV_POLL_US); sl_thd_block_timeout(0, timeout); /* in the scheduler component */ diff --git a/src/components/lib/sinv_async/sinv_server.c b/src/components/lib/sinv_async/sinv_server.c index 680d790cd3..2a6c539c69 100644 
--- a/src/components/lib/sinv_async/sinv_server.c +++ b/src/components/lib/sinv_async/sinv_server.c @@ -114,9 +114,8 @@ sinv_server_aep_fn(arcvcap_t rcv, void *data) asndcap_t snd = t->sndcap; int *retval = (int *)SINV_RET_ADDR(t->shmaddr), ret; struct sinv_call_req *req = (struct sinv_call_req *)SINV_REQ_ADDR(t->shmaddr); - int rcvd = 0; - while ((cos_rcv(rcv, RCV_NON_BLOCKING | RCV_ALL_PENDING, &rcvd) < 0)) { + while ((cos_rcv(rcv, RCV_NON_BLOCKING) < 0)) { cycles_t timeout = time_now() + time_usec2cyc(SINV_SRV_POLL_US); if (ps_load((unsigned long *)reqaddr) == SINV_REQ_SET) break; diff --git a/src/components/lib/sl/Makefile b/src/components/lib/sl/Makefile index 6e908cda0b..d54ad150e6 100644 --- a/src/components/lib/sl/Makefile +++ b/src/components/lib/sl/Makefile @@ -1,6 +1,6 @@ include Makefile.src Makefile.comp -LIB_OBJS=sl_capmgr.o sl_raw.o sl_sched.o sl_xcpu.o sl_child.o sl_mod_fprr.o sl_lock.o sl_thd_static_backend.o +LIB_OBJS=sl_capmgr.o sl_raw.o sl_sched.o sl_xcore.o sl_child.o sl_mod_fprr.o sl_mod_rr.o sl_mod_fifo.o sl_mod_part_fifo.o sl_lock.o sl_thd_static_backend.o sl_blkpt.o LIBS=$(LIB_OBJS:%.o=%.a) CINC+=-m32 @@ -13,5 +13,10 @@ all: $(LIBS) @$(CC) $(CFLAGS) $(CINC) -o $(@:%.a=%.o) -c $< @$(AR) cr lib$@ $(@:%.a=%.o) +%.a:%.S + $(info | [AS] Creating library file $@ from $^) + @$(AS) $(ASFLAGS) -c -o $(@:%.a=%.o) $^ + @$(AR) cr lib$@ $(@:%.a=%.o) + clean: @rm -f *.o *.a *.d diff --git a/src/components/lib/sl/sl_blkpt.c b/src/components/lib/sl/sl_blkpt.c new file mode 100644 index 0000000000..de59ee69a1 --- /dev/null +++ b/src/components/lib/sl/sl_blkpt.c @@ -0,0 +1,140 @@ +#include +#include + +#define NBLKPTS 64 +struct blkpt_mem { + sched_blkpt_id_t id; + sched_blkpt_epoch_t epoch; + struct stacklist_head blocked; +}; +static struct blkpt_mem __blkpts[NBLKPTS]; +static int __blkpt_offset = 1; + +#define BLKPT_EPOCH_BLKED_BITS ((sizeof(sched_blkpt_epoch_t) * 8) +#define BLKPT_EPOCH_DIFF (BLKPT_EPOCH_BLKED_BITS - 2)/2) + +/* + * Is cmp > e? 
This is more complicated than it seems it should be + * only because of wrap-around. We have to consider the case that we + * have, and that we haven't wrapped around. + */ +static int +blkpt_epoch_is_higher(sched_blkpt_epoch_t e, sched_blkpt_epoch_t cmp) +{ + return (e > cmp && (e - cmp) > BLKPT_EPOCH_DIFF) || (e < cmp && (cmp - e) < BLKPT_EPOCH_DIFF); +} + +static struct blkpt_mem * +blkpt_get(sched_blkpt_id_t id) +{ + if (id - 1 == NBLKPTS) return NULL; + + return &__blkpts[id-1]; +} + +sched_blkpt_id_t +sched_blkpt_alloc(void) +{ + sched_blkpt_id_t id; + struct blkpt_mem *m; + sched_blkpt_id_t ret = SCHED_BLKPT_NULL; + + sl_cs_enter(); + + id = (sched_blkpt_id_t)ps_faa(&__blkpt_offset, 1); + m = blkpt_get(id); + if (!m) ERR_THROW(SCHED_BLKPT_NULL, unlock); + + m->id = id; + ret = id; + m->epoch = 0; + stacklist_init(&m->blocked); + /* TODO: undo offset if it failed in an multi-core safe way!*/ +unlock: + sl_cs_exit(); + + return ret; +} + +int +sched_blkpt_free(sched_blkpt_id_t id) +{ + /* alloc only for now */ + return 0; +} + +int +sched_blkpt_trigger(sched_blkpt_id_t blkpt, sched_blkpt_epoch_t epoch, int single) +{ + thdid_t tid; + cpuid_t core; + struct blkpt_mem *m; + int ret = 0; + + sl_cs_enter(); + + m = blkpt_get(blkpt); + if (!m) ERR_THROW(-1, unlock); + + /* is the new epoch more recent than the existing? */ + if (!blkpt_epoch_is_higher(m->epoch, epoch)) ERR_THROW(0, unlock); + + m->epoch = epoch; + while ((tid = stacklist_dequeue(&core, &m->blocked)) != 0) { + if (core == cos_cpuid()) { + struct sl_thd *t = sl_thd_lkup(tid); + + assert(t); + + sl_thd_wakeup_no_cs(t); /* ignore retval: process next thread */ + } else { + struct sl_xcore_thd *t = sl_xcore_thd_lookup(tid); + + assert(t && t->core == core); + /* perhaps sl_xcore_thd_wakeup_no_cs? 
*/ + sl_cs_exit(); + sl_xcore_thd_wakeup(t); + sl_cs_enter(); + } + } + /* most likely we switch to a woken thread here */ + sl_cs_exit_schedule(); + + return 0; +unlock: + sl_cs_exit(); + + return ret; +} + +int +sched_blkpt_block(sched_blkpt_id_t blkpt, sched_blkpt_epoch_t epoch, thdid_t dependency) +{ + struct blkpt_mem *m; + struct sl_thd *t; + struct stacklist sl; /* The stack-based structure we'll use to track ourself */ + int ret = 0; + + sl_cs_enter(); + + m = blkpt_get(blkpt); + if (!m) ERR_THROW(-1, unlock); + + /* Outdated event? don't block! */ + if (blkpt_epoch_is_higher(m->epoch, epoch)) ERR_THROW(0, unlock); + + /* Block! */ + stacklist_add(&m->blocked, &sl); + + t = sl_thd_curr(); + if (sl_thd_block_no_cs(t, SL_THD_BLOCKED, 0)) ERR_THROW(-1, unlock); + + sl_cs_exit_schedule(); + assert(stacklist_is_removed(&sl)); /* we cannot still be on the list */ + + return 0; +unlock: + sl_cs_exit(); + + return ret; +} diff --git a/src/components/lib/sl/sl_capmgr.c b/src/components/lib/sl/sl_capmgr.c index 5eeafe2886..d160c2fadc 100644 --- a/src/components/lib/sl/sl_capmgr.c +++ b/src/components/lib/sl/sl_capmgr.c @@ -14,6 +14,7 @@ #include "../../interface/capmgr/memmgr.h" #include #include +#include extern void sl_thd_event_info_reset(struct sl_thd *t); extern void sl_thd_free_no_cs(struct sl_thd *t); @@ -37,7 +38,7 @@ sl_shm_map(cbuf_t id) } void -sl_xcpu_asnd_alloc(void) +sl_xcore_asnd_alloc(void) { int i; @@ -46,16 +47,16 @@ sl_xcpu_asnd_alloc(void) thdid_t tid; if (i == cos_cpuid()) continue; - if (!bitmap_check(sl__globals()->cpu_bmp, i)) continue; + if (!bitmap_check(sl__globals()->core_bmp, i)) continue; snd = capmgr_asnd_rcv_create(BOOT_CAPTBL_SELF_INITRCV_BASE_CPU(i)); assert(snd); - sl__globals()->xcpu_asnd[cos_cpuid()][i] = snd; + sl__globals()->xcore_asnd[cos_cpuid()][i] = snd; } } struct sl_thd * -sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t prps) +sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, 
sl_thd_property_t prps, struct cos_dcb_info *dcb) { struct sl_thd_policy *tp = NULL; struct sl_thd *t = NULL; @@ -64,6 +65,7 @@ sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t if (!tp) goto done; t = sl_mod_thd_get(tp); + t->dcb = dcb; t->properties = prps; t->aepinfo = aep; t->sndcap = sndcap; @@ -78,7 +80,9 @@ sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t t->timeout_idx = -1; t->prio = TCAP_PRIO_MIN; ps_list_init(t, SL_THD_EVENT_LIST); + ps_list_init(t, partlist); sl_thd_event_info_reset(t); + sl_xcore_thd_lookup_init(aep->tid, cos_cpuid()); done: return t; @@ -88,21 +92,24 @@ struct sl_thd * sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data) { struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); - struct cos_compinfo *ci = &dci->ci; + struct cos_compinfo *ci = cos_compinfo_get(dci); struct sl_thd *t = NULL; struct cos_aep_info *aep = NULL; + struct cos_dcb_info *dcb = NULL; thdcap_t thdcap = 0; thdid_t tid = 0; aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; - aep->thd = capmgr_thd_create(fn, data, &tid); + aep->thd = capmgr_thd_create(fn, data, &tid, &dcb); if (!aep->thd) goto done; aep->tid = tid; + assert(tid); - t = sl_thd_alloc_init(aep, 0, 0); + t = sl_thd_alloc_init(aep, 0, 0, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); done: return t; @@ -128,21 +135,23 @@ sl_thd_comp_init_no_cs(struct cos_defcompinfo *comp, sl_thd_property_t prps, asn assert(snd); } - t = sl_thd_alloc_init(aep, snd, prps); + t = sl_thd_alloc_init(aep, snd, prps, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); done: return t; } static struct sl_thd * -sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx) +sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, vaddr_t *dcbuaddr) { struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct 
cos_compinfo *ci = cos_compinfo_get(dci); struct cos_compinfo *compci = cos_compinfo_get(comp); struct sl_thd *t = NULL; struct cos_aep_info *aep = NULL; + struct cos_dcb_info *dcb = NULL; if (comp == NULL || comp->id == 0) goto done; @@ -150,12 +159,13 @@ sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx) aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; - aep->thd = capmgr_thd_create_ext(comp->id, idx, &aep->tid); + aep->thd = capmgr_thd_create_ext(comp->id, idx, &aep->tid, &dcb); if (!aep->thd) goto done; - aep->tc = sl_thd_tcap(sl__globals_cpu()->sched_thd); + aep->tc = sl_thd_tcap(sl__globals_core()->sched_thd); - t = sl_thd_alloc_init(aep, 0, 0); + t = sl_thd_alloc_init(aep, 0, 0, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); } else { struct cos_aep_info *compaep = cos_sched_aep_get(comp); @@ -173,10 +183,11 @@ sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx) } static struct sl_thd * -sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, thdclosure_index_t idx, sl_thd_property_t prps, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, thdclosure_index_t idx, sl_thd_property_t prps, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, vaddr_t *dcbuaddr, arcvcap_t *extrcv) { struct cos_aep_info *aep = NULL; struct sl_thd *t = NULL; + struct cos_dcb_info *dcb = NULL; asndcap_t snd = 0; int ret = 0, owntc = 0; @@ -198,11 +209,12 @@ sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, t if (!aep) goto done; if (prps & SL_THD_PROPERTY_OWN_TCAP) owntc = 1; - capmgr_aep_create_ext(comp->id, aep, idx, owntc, key, ipiwin, ipimax, extrcv); + capmgr_aep_create_ext(comp->id, aep, idx, owntc, key, ipiwin, ipimax, &dcb, extrcv); if (!aep->thd) goto done; - t = sl_thd_alloc_init(aep, 0, prps); + t = 
sl_thd_alloc_init(aep, 0, prps, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); } done: @@ -214,17 +226,20 @@ sl_thd_aep_alloc_no_cs(cos_aepthd_fn_t fn, void *data, sl_thd_property_t prps, c { struct sl_thd *t = NULL; struct cos_aep_info *aep = NULL; + struct cos_dcb_info *dcb = NULL; int owntc = 0; aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; if (prps & SL_THD_PROPERTY_OWN_TCAP) owntc = 1; - capmgr_aep_create(aep, fn, data, owntc, key, ipiwin, ipimax); + capmgr_aep_create(aep, fn, data, owntc, key, ipiwin, ipimax, &dcb); if (aep->thd == 0) goto done; + assert(aep->tid); - t = sl_thd_alloc_init(aep, 0, prps); + t = sl_thd_alloc_init(aep, 0, prps, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); done: return t; @@ -270,7 +285,15 @@ sl_thd_comp_init(struct cos_defcompinfo *comp, int is_sched) } struct sl_thd * -sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax) +sl_thd_initaep_alloc_dcb(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, dcbcap_t dcap, microsec_t ipiwin, u32_t ipimax) +{ + PRINTC("UNIMPLEMENTED: Using CAPMGR API which should manage the DCB capabilities\n"); + + return NULL; +} + +struct sl_thd * +sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, vaddr_t *dcbuaddr) { struct sl_thd *t = NULL; @@ -278,18 +301,27 @@ sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int sl_cs_enter(); if (!is_sched) { - t = sl_thd_alloc_ext_no_cs(comp, 0); + t = sl_thd_alloc_ext_no_cs(comp, 0, dcbuaddr); } else { t = sl_thd_aep_alloc_ext_no_cs(comp, sched_thd, 0, (is_sched ? SL_THD_PROPERTY_SEND : 0) - | (own_tcap ? 
SL_THD_PROPERTY_OWN_TCAP : 0), key, ipiwin, ipimax, NULL); + | (own_tcap ? SL_THD_PROPERTY_OWN_TCAP : 0), key, ipiwin, ipimax, dcbuaddr, NULL); } sl_cs_exit(); return t; } + struct sl_thd * -sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +sl_thd_aep_alloc_ext_dcb(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, dcbcap_t dcap, dcboff_t doff, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +{ + PRINTC("UNIMPLEMENTED: Using CAPMGR API which should manage the DCB capabilities\n"); + + return NULL; +} + +struct sl_thd * +sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, vaddr_t *dcbuaddr, arcvcap_t *extrcv) { struct sl_thd *t = NULL; @@ -299,9 +331,9 @@ sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thd sl_cs_enter(); if (!is_aep) own_tcap = 0; if (is_aep) { - t = sl_thd_aep_alloc_ext_no_cs(comp, sched_thd, idx, own_tcap ? SL_THD_PROPERTY_OWN_TCAP : 0, key, ipiwin, ipimax, extrcv); + t = sl_thd_aep_alloc_ext_no_cs(comp, sched_thd, idx, own_tcap ? 
SL_THD_PROPERTY_OWN_TCAP : 0, key, ipiwin, ipimax, dcbuaddr, extrcv); } else { - t = sl_thd_alloc_ext_no_cs(comp, idx); + t = sl_thd_alloc_ext_no_cs(comp, idx, dcbuaddr); } sl_cs_exit(); @@ -318,7 +350,7 @@ sl_thd_init_ext_no_cs(struct cos_aep_info *aepthd, struct sl_thd *sched) if (!aep) goto done; *aep = *aepthd; - t = sl_thd_alloc_init(aep, 0, 0); + t = sl_thd_alloc_init(aep, 0, 0, NULL); if (!t) goto done; /* use sched info for parent -> child notifications */ @@ -343,15 +375,14 @@ sl_thd_init_ext(struct cos_aep_info *aepthd, struct sl_thd *sched) } struct sl_thd * -sl_thd_retrieve(thdid_t tid) +sl_thd_retrieve_lazy(thdid_t tid) { - struct sl_thd *t = sl_mod_thd_get(sl_thd_lookup_backend(tid)); + struct sl_thd *t; spdid_t client = cos_inv_token(); thdid_t itid = 0; struct sl_thd *it = NULL; struct cos_aep_info aep; - if (t && sl_thd_aepinfo(t)) return t; if (tid >= SL_MAX_NUM_THDS) return NULL; assert(client); @@ -377,7 +408,7 @@ sl_thd_retrieve(thdid_t tid) it = sl_thd_try_lkup(itid); assert(it); aep.tid = tid; - aep.tc = sl__globals_cpu()->sched_tcap; + aep.tc = sl__globals_core()->sched_tcap; t = sl_thd_init_ext_no_cs(&aep, it); /* if (tid != sl_thdid()) sl_cs_exit(); */ @@ -394,3 +425,39 @@ sl_thd_free(struct sl_thd *t) sl_thd_free_no_cs(t); sl_cs_exit(); } + +int +sl_thd_migrate_no_cs(struct sl_thd *t, cpuid_t core) +{ + struct sl_thd_policy *x = NULL; + int ret; + + if (t->properties) return -1; + if (t->state != SL_THD_RUNNABLE) return -1; + /* capmgr should migrate the thdcap as well */ + ret = capmgr_thd_migrate(sl_thd_thdid(t), sl_thd_thdcap(t), core); + if (ret) return -1; + sl_mod_thd_delete(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), -1); + + x = sl_thd_migrate_backend(sl_mod_thd_policy_get(t), core); + if (!x) return -1; + + return 0; +} + +int +sl_thd_migrate(thdid_t tid, cpuid_t core) +{ + int ret; + struct sl_thd *c = sl_thd_curr(), *t = sl_thd_lkup(tid); + + if (core == cos_cpuid()) return -1; + if 
(sl_thd_rcvcap(t) || sl_thd_tcap(t)) return -1; + assert(c != t); + sl_cs_enter(); + ret = sl_thd_migrate_no_cs(t, core); + sl_cs_exit(); + + return ret; +} diff --git a/src/components/lib/sl/sl_child.c b/src/components/lib/sl/sl_child.c index 45ce8fe18e..badc3bba88 100644 --- a/src/components/lib/sl/sl_child.c +++ b/src/components/lib/sl/sl_child.c @@ -47,6 +47,7 @@ sl_parent_notif_alloc(struct sl_thd *childthd) int sl_parent_notif_enqueue(struct sl_thd *thd, struct sl_child_notification *notif) { +#ifdef SL_PARENTCHILD assert(thd && notif); assert(thd->properties & SL_THD_PROPERTY_SEND); @@ -55,11 +56,14 @@ sl_parent_notif_enqueue(struct sl_thd *thd, struct sl_child_notification *notif) if (ck_ring_enqueue_spsc_child(thd->ch_ring, thd->ch_ringbuf, notif) == false) return -1; if (cos_asnd(sl_thd_asndcap(thd), 0)) return -1; +#else + assert(0); +#endif return 0; } -/* there is only 1 parent per scheduler per cpu */ +/* there is only 1 parent per scheduler per core */ int sl_child_notif_map(cbuf_t id) { @@ -85,6 +89,7 @@ sl_child_notif_map(cbuf_t id) int sl_child_notif_dequeue(struct sl_child_notification *notif) { +#ifdef SL_PARENTCHILD struct ck_ring *cring = child_ring[cos_cpuid()]; struct sl_child_notification *crbuf = child_ringbuf[cos_cpuid()]; @@ -92,38 +97,52 @@ sl_child_notif_dequeue(struct sl_child_notification *notif) if (!cring || !crbuf) return 0; if (ck_ring_dequeue_spsc_child(cring, crbuf, notif) == true) return 1; - +#endif return 0; } int sl_child_notif_empty(void) { +#ifdef SL_PARENTCHILD struct ck_ring *cring = child_ring[cos_cpuid()]; if (!cring) return 1; return (!ck_ring_size(cring)); +#else + return 1; +#endif } int sl_parent_notif_block_no_cs(struct sl_thd *child, struct sl_thd *thd) { +#ifdef SL_PARENTCHILD struct sl_child_notification notif; notif.type = SL_CHILD_THD_BLOCK; notif.tid = sl_thd_thdid(thd); return sl_parent_notif_enqueue(child, ¬if); +#else + assert(0); + return 0; +#endif } int sl_parent_notif_wakeup_no_cs(struct sl_thd 
*child, struct sl_thd *thd) { +#ifdef SL_PARENTCHILD struct sl_child_notification notif; notif.type = SL_CHILD_THD_WAKEUP; notif.tid = sl_thd_thdid(thd); return sl_parent_notif_enqueue(child, &notif); +#else + assert(0); + return 0; +#endif } diff --git a/src/components/lib/sl/sl_mod_fifo.c b/src/components/lib/sl/sl_mod_fifo.c new file mode 100644 index 0000000000..3824356794 --- /dev/null +++ b/src/components/lib/sl/sl_mod_fifo.c @@ -0,0 +1,115 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. + * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include +#include +#include +#include + +#define SL_FPRR_PERIOD_US_MIN SL_MIN_PERIOD_US + +static struct ps_list_head threads[NUM_CPU] CACHE_ALIGNED; + +void +sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles) +{ } + +struct sl_thd_policy * +sl_mod_schedule(void) +{ + struct sl_thd_policy *t = NULL; + + if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; + t = ps_list_head_first_d(&threads[cos_cpuid()], struct sl_thd_policy); + +done: + return t; +} + +struct sl_thd_policy * +sl_mod_last_schedule(void) +{ + struct sl_thd_policy *t = NULL; + + if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; + t = ps_list_head_last_d(&threads[cos_cpuid()], struct sl_thd_policy); + +done: + return t; +} + +void +sl_mod_block(struct sl_thd_policy *t) +{ + ps_list_rem_d(t); +} + +void +sl_mod_wakeup(struct sl_thd_policy *t) +{ + assert(ps_list_singleton_d(t)); + + ps_list_head_append_d(&threads[cos_cpuid()], t); +} + +void +sl_mod_yield(struct sl_thd_policy *t, struct sl_thd_policy *yield_to) +{ + ps_list_rem_d(t); + ps_list_head_append_d(&threads[cos_cpuid()], t); +} + +void +sl_mod_thd_create(struct sl_thd_policy *t) +{ + t->priority = TCAP_PRIO_MIN; + t->period = 0; + t->period_usec = 0; + ps_list_init_d(t); + + /* TODO: add to runq here?
for now, only add when PRIO is set and that's pretty much it's ARRIVAL time! */ +} + +void +sl_mod_thd_delete(struct sl_thd_policy *t) +{ ps_list_rem_d(t); } + +void +sl_mod_thd_param_set(struct sl_thd_policy *t, sched_param_type_t type, unsigned int v) +{ + int cpu = cos_cpuid(); + + switch (type) { + case SCHEDP_PRIO: + { + t->priority = v; + sl_thd_setprio(sl_mod_thd_get(t), t->priority); + ps_list_head_append_d(&threads[cos_cpuid()], t); + + break; + } + case SCHEDP_WINDOW: + { + assert(v >= SL_FPRR_PERIOD_US_MIN); + t->period_usec = v; + t->period = sl_usec2cyc(v); + + break; + } + case SCHEDP_BUDGET: + { + break; + } + default: assert(0); + } +} + +void +sl_mod_init(void) +{ + ps_list_head_init(&threads[cos_cpuid()]); +} diff --git a/src/components/lib/sl/sl_mod_fprr.c b/src/components/lib/sl/sl_mod_fprr.c index 5d1c5dd202..8992ea0a57 100644 --- a/src/components/lib/sl/sl_mod_fprr.c +++ b/src/components/lib/sl/sl_mod_fprr.c @@ -9,9 +9,9 @@ #define SL_FPRR_PERIOD_US_MIN SL_MIN_PERIOD_US -struct ps_list_head threads[NUM_CPU][SL_FPRR_NPRIOS] CACHE_ALIGNED; +static unsigned int thdlist_bmp[NUM_CPU] CACHE_ALIGNED; +static struct ps_list_head threads[NUM_CPU][SL_FPRR_NPRIOS] CACHE_ALIGNED; -/* No RR yet */ void sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles) { } @@ -20,37 +20,66 @@ struct sl_thd_policy * sl_mod_schedule(void) { int i; - struct sl_thd_policy *t; + struct sl_thd_policy *t = NULL; - for (i = 0 ; i < SL_FPRR_NPRIOS ; i++) { - if (ps_list_head_empty(&threads[cos_cpuid()][i])) continue; - t = ps_list_head_first_d(&threads[cos_cpuid()][i], struct sl_thd_policy); + if (unlikely(!thdlist_bmp[cos_cpuid()])) return NULL; + i = __builtin_ctz(thdlist_bmp[cos_cpuid()]); + assert(i < SL_FPRR_NPRIOS); + assert(!ps_list_head_empty(&threads[cos_cpuid()][i])); + t = ps_list_head_first_d(&threads[cos_cpuid()][i], struct sl_thd_policy); + assert(t); - /* - * We want to move the selected thread to the back of the list. 
- * Otherwise fprr won't be truly round robin - */ - ps_list_rem_d(t); - ps_list_head_append_d(&threads[cos_cpuid()][i], t); + ps_list_rem_d(t); + ps_list_head_append_d(&threads[cos_cpuid()][i], t); - return t; - } + return t; +} +struct sl_thd_policy * +sl_mod_last_schedule(void) +{ + /* not supported! */ return NULL; } +static inline void +__sl_mod_bmp_unset(struct sl_thd_policy *t) +{ + unsigned int ctb = ps_load(&thdlist_bmp[cos_cpuid()]); + unsigned int p = t->priority - 1, b = 1 << p; + + if (!ps_list_head_empty(&threads[cos_cpuid()][p])) return; + + /* unset from bitmap if there are no threads at this priority */ + if (unlikely(!ps_upcas(&thdlist_bmp[cos_cpuid()], ctb, ctb & ~b))) assert(0); +} + +static inline void +__sl_mod_bmp_set(struct sl_thd_policy *t) +{ + unsigned int ctb = ps_load(&thdlist_bmp[cos_cpuid()]); + unsigned int p = t->priority - 1, b = 1 << p; + + if (unlikely(ctb & b)) return; + + assert(!ps_list_head_empty(&threads[cos_cpuid()][p])); + /* set to bitmap if this is the first element added at this prio! 
*/ + if (unlikely(!ps_upcas(&thdlist_bmp[cos_cpuid()], ctb, ctb | b))) assert(0); +} + void sl_mod_block(struct sl_thd_policy *t) { ps_list_rem_d(t); + __sl_mod_bmp_unset(t); } void sl_mod_wakeup(struct sl_thd_policy *t) { assert(ps_list_singleton_d(t)); - ps_list_head_append_d(&threads[cos_cpuid()][t->priority - 1], t); + __sl_mod_bmp_set(t); } void @@ -72,7 +101,10 @@ sl_mod_thd_create(struct sl_thd_policy *t) void sl_mod_thd_delete(struct sl_thd_policy *t) -{ ps_list_rem_d(t); } +{ + ps_list_rem_d(t); + __sl_mod_bmp_unset(t); +} void sl_mod_thd_param_set(struct sl_thd_policy *t, sched_param_type_t type, unsigned int v) @@ -81,10 +113,12 @@ sl_mod_thd_param_set(struct sl_thd_policy *t, sched_param_type_t type, unsigned case SCHEDP_PRIO: { assert(v >= SL_FPRR_PRIO_HIGHEST && v <= SL_FPRR_PRIO_LOWEST); - ps_list_rem_d(t); /* if we're already on a list, and we're updating priority */ + /* should not have been on any prio before, this is FP */ + assert(ps_list_singleton_d(t)); t->priority = v; - ps_list_head_append_d(&threads[cos_cpuid()][t->priority - 1], t); - sl_thd_setprio(sl_mod_thd_get(t), t->priority); + ps_list_head_append_d(&threads[cos_cpuid()][v - 1], t); + __sl_mod_bmp_set(t); + sl_thd_setprio(sl_mod_thd_get(t), v); break; } @@ -110,6 +144,7 @@ sl_mod_init(void) { int i; + thdlist_bmp[cos_cpuid()] = 0; memset(threads[cos_cpuid()], 0, sizeof(struct ps_list_head) * SL_FPRR_NPRIOS); for (i = 0 ; i < SL_FPRR_NPRIOS ; i++) { ps_list_head_init(&threads[cos_cpuid()][i]); diff --git a/src/components/lib/sl/sl_mod_part_fifo.c b/src/components/lib/sl/sl_mod_part_fifo.c new file mode 100644 index 0000000000..3584d0dc26 --- /dev/null +++ b/src/components/lib/sl/sl_mod_part_fifo.c @@ -0,0 +1,138 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include +#include +#include +#include + +#define SL_FIFO_PRIO TCAP_PRIO_MAX +#define SL_FIFO_IDLE_PRIO SL_FIFO_PRIO+4 +#define SL_FIFO_PERIOD_US_MIN SL_MIN_PERIOD_US + +static struct ps_list_head threads[NUM_CPU] CACHE_ALIGNED; +static struct sl_thd_policy *idle_thd[NUM_CPU]; + +void +sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles) +{ } + +struct sl_thd_policy * +sl_mod_schedule(void) +{ + struct sl_thd_policy *t = NULL; + + if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; + t = ps_list_head_first_d(&threads[cos_cpuid()], struct sl_thd_policy); + + return t; +done: + if (likely(idle_thd[cos_cpuid()])) return idle_thd[cos_cpuid()]; + + return t; +} + +struct sl_thd_policy * +sl_mod_last_schedule(void) +{ + struct sl_thd_policy *t = NULL; + + if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; + t = ps_list_head_last_d(&threads[cos_cpuid()], struct sl_thd_policy); + +done: + return t; +} + +void +sl_mod_block(struct sl_thd_policy *t) +{ + assert(t != idle_thd[cos_cpuid()]); + ps_list_rem_d(t); +} + +void +sl_mod_wakeup(struct sl_thd_policy *t) +{ + struct sl_thd *tm = sl_mod_thd_get(t); + + assert(t != idle_thd[cos_cpuid()]); + assert(ps_list_singleton_d(t)); + + ps_list_head_append_d(&threads[cos_cpuid()], t); + /* remove from partlist used for tracking free pool of tasks on this core! */ + if (!ps_list_singleton(tm, partlist)) ps_list_rem(tm, partlist); +} + +void +sl_mod_yield(struct sl_thd_policy *t, struct sl_thd_policy *yield_to) +{ + if (unlikely(t == idle_thd[cos_cpuid()])) return; + ps_list_rem_d(t); + ps_list_head_append_d(&threads[cos_cpuid()], t); +} + +void +sl_mod_thd_create(struct sl_thd_policy *t) +{ + t->priority = TCAP_PRIO_MIN; + t->period = 0; + t->period_usec = 0; + ps_list_init_d(t); + + /* TODO: add to runq here? 
for now, only add when PRIO is set and that's pretty much it's ARRIVAL time! */ +} + +void +sl_mod_thd_delete(struct sl_thd_policy *t) +{ + if (unlikely(t == idle_thd[cos_cpuid()])) return; + ps_list_rem_d(t); +} + +void +sl_mod_thd_param_set(struct sl_thd_policy *t, sched_param_type_t type, unsigned int v) +{ + int cpu = cos_cpuid(); + + switch (type) { + case SCHEDP_PRIO: + { + t->priority = v; + sl_thd_setprio(sl_mod_thd_get(t), t->priority); + + if (v == SL_FIFO_IDLE_PRIO) { + assert(idle_thd[cos_cpuid()] == NULL); + idle_thd[cos_cpuid()] = t; + } else { + ps_list_head_append_d(&threads[cos_cpuid()], t); + } + + break; + } + case SCHEDP_WINDOW: + { + assert(v >= SL_FIFO_PERIOD_US_MIN); + t->period_usec = v; + t->period = sl_usec2cyc(v); + + break; + } + case SCHEDP_BUDGET: + { + break; + } + default: assert(0); + } +} + +void +sl_mod_init(void) +{ + idle_thd[cos_cpuid()] = NULL; + ps_list_head_init(&threads[cos_cpuid()]); +} diff --git a/src/components/lib/sl/sl_mod_rr.c b/src/components/lib/sl/sl_mod_rr.c new file mode 100644 index 0000000000..ef3116a97c --- /dev/null +++ b/src/components/lib/sl/sl_mod_rr.c @@ -0,0 +1,109 @@ +#include +#include +#include +#include + +#define SL_FPRR_PERIOD_US_MIN SL_MIN_PERIOD_US + +static struct ps_list_head threads[NUM_CPU] CACHE_ALIGNED; + +void +sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles) +{ } + +struct sl_thd_policy * +sl_mod_schedule(void) +{ + struct sl_thd_policy *t = NULL; + + if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; + t = ps_list_head_first_d(&threads[cos_cpuid()], struct sl_thd_policy); + ps_list_rem_d(t); + ps_list_head_append_d(&threads[cos_cpuid()], t); + +done: + return t; +} + +struct sl_thd_policy * +sl_mod_last_schedule(void) +{ + struct sl_thd_policy *t = NULL, *tl = NULL; + + if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; + t = ps_list_head_last_d(&threads[cos_cpuid()], struct sl_thd_policy); + +done: + return t; +} + +void +sl_mod_block(struct 
sl_thd_policy *t) +{ + ps_list_rem_d(t); +} + +void +sl_mod_wakeup(struct sl_thd_policy *t) +{ + assert(ps_list_singleton_d(t)); + + ps_list_head_append_d(&threads[cos_cpuid()], t); +} + +void +sl_mod_yield(struct sl_thd_policy *t, struct sl_thd_policy *yield_to) +{ + ps_list_rem_d(t); + ps_list_head_append_d(&threads[cos_cpuid()], t); +} + +void +sl_mod_thd_create(struct sl_thd_policy *t) +{ + t->priority = TCAP_PRIO_MIN; + t->period = 0; + t->period_usec = 0; + ps_list_init_d(t); +} + +void +sl_mod_thd_delete(struct sl_thd_policy *t) +{ ps_list_rem_d(t); } + +void +sl_mod_thd_param_set(struct sl_thd_policy *t, sched_param_type_t type, unsigned int v) +{ + int cpu = cos_cpuid(); + + switch (type) { + case SCHEDP_PRIO: + { + t->priority = v; + sl_thd_setprio(sl_mod_thd_get(t), t->priority); + ps_list_head_append_d(&threads[cos_cpuid()], t); + + break; + } + case SCHEDP_WINDOW: + { + assert(v >= SL_FPRR_PERIOD_US_MIN); + t->period_usec = v; + t->period = sl_usec2cyc(v); + /* FIXME: synchronize periods for all tasks */ + + break; + } + case SCHEDP_BUDGET: + { + break; + } + default: assert(0); + } +} + +void +sl_mod_init(void) +{ + ps_list_head_init(&threads[cos_cpuid()]); +} diff --git a/src/components/lib/sl/sl_raw.c b/src/components/lib/sl/sl_raw.c index 77b32f3a29..b73384e10e 100644 --- a/src/components/lib/sl/sl_raw.c +++ b/src/components/lib/sl/sl_raw.c @@ -11,6 +11,7 @@ #include #include #include +#include extern void sl_thd_event_info_reset(struct sl_thd *t); extern void sl_thd_free_no_cs(struct sl_thd *t); @@ -28,7 +29,7 @@ sl_shm_map(cbuf_t id) } void -sl_xcpu_asnd_alloc(void) +sl_xcore_asnd_alloc(void) { struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(dci); @@ -38,16 +39,16 @@ sl_xcpu_asnd_alloc(void) asndcap_t snd; if (i == cos_cpuid()) continue; - if (!bitmap_check(sl__globals()->cpu_bmp, i)) continue; + if (!bitmap_check(sl__globals()->core_bmp, i)) continue; snd = cos_asnd_alloc(ci, 
BOOT_CAPTBL_SELF_INITRCV_BASE_CPU(i), ci->captbl_cap); assert(snd); - sl__globals()->xcpu_asnd[cos_cpuid()][i] = snd; + sl__globals()->xcore_asnd[cos_cpuid()][i] = snd; } } struct sl_thd * -sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t prps) +sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t prps, struct cos_dcb_info *dcb) { struct sl_thd_policy *tp = NULL; struct sl_thd *t = NULL; @@ -57,6 +58,7 @@ sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t if (!tp) goto done; t = sl_mod_thd_get(tp); + t->dcb = dcb; t->properties = prps; t->aepinfo = aep; t->sndcap = sndcap; @@ -71,7 +73,9 @@ sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t t->timeout_idx = -1; t->prio = TCAP_PRIO_MIN; ps_list_init(t, SL_THD_EVENT_LIST); + ps_list_init(t, partlist); sl_thd_event_info_reset(t); + sl_xcore_thd_lookup_init(aep->tid, cos_cpuid()); done: return t; @@ -84,17 +88,23 @@ sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data) struct cos_compinfo *ci = cos_compinfo_get(dci); struct sl_thd *t = NULL; struct cos_aep_info *aep = NULL; + struct cos_dcb_info *dcb = NULL; + dcbcap_t dcap; + dcboff_t doff; aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; + dcap = cos_dcb_info_alloc_curr(&doff, (vaddr_t *)&dcb); + if (dcb && doff) assert(dcap); - aep->thd = cos_thd_alloc(ci, ci->comp_cap, fn, data); + aep->thd = cos_thd_alloc(ci, ci->comp_cap, fn, data, dcap, doff); if (!aep->thd) goto done; aep->tid = cos_introspect(ci, aep->thd, THD_GET_TID); if (!aep->tid) goto done; - t = sl_thd_alloc_init(aep, 0, 0); + t = sl_thd_alloc_init(aep, 0, 0, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); done: return t; @@ -119,15 +129,16 @@ sl_thd_comp_init_no_cs(struct cos_defcompinfo *comp, sl_thd_property_t prps, asn assert(snd); } - t = sl_thd_alloc_init(aep, snd, prps); + t = sl_thd_alloc_init(aep, snd, prps, NULL); 
sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); done: return t; } static struct sl_thd * -sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx) +sl_thd_alloc_ext_dcb_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, dcbcap_t dcbcap, dcboff_t dcboff) { struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(dci); @@ -140,16 +151,17 @@ sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx) aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; - aep->thd = cos_thd_alloc_ext(ci, compci->comp_cap, idx); + aep->thd = cos_thd_alloc_ext(ci, compci->comp_cap, idx, dcbcap, dcboff); if (!aep->thd) goto done; aep->tid = cos_introspect(ci, aep->thd, THD_GET_TID); if (!aep->tid) goto done; - t = sl_thd_alloc_init(aep, 0, 0); + t = sl_thd_alloc_init(aep, 0, 0, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); } else { assert(idx == 0); - ret = cos_initaep_alloc(comp, NULL, 0); + ret = cos_initaep_alloc(comp, NULL, 0, dcbcap); if (ret) goto done; t = sl_thd_comp_init_no_cs(comp, 0, 0); @@ -165,26 +177,32 @@ sl_thd_aep_alloc_no_cs(cos_aepthd_fn_t fn, void *data, sl_thd_property_t prps, c struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct sl_thd *t = NULL; struct cos_aep_info *aep = NULL; + struct cos_dcb_info *dcb = NULL; int ret; + dcbcap_t dcap; + dcboff_t doff; aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; + dcap = cos_dcb_info_alloc_curr(&doff, (vaddr_t *)&dcb); + if (dcb && doff) assert(dcap); /* NOTE: Cannot use stack-allocated cos_aep_info struct here */ - if (prps & SL_THD_PROPERTY_OWN_TCAP) ret = cos_aep_alloc(aep, fn, data); - else ret = cos_aep_tcap_alloc(aep, sl_thd_aepinfo(sl__globals_cpu()->sched_thd)->tc, - fn, data); + if (prps & SL_THD_PROPERTY_OWN_TCAP) ret = cos_aep_alloc(aep, fn, data, dcap, doff); + else ret = 
cos_aep_tcap_alloc(aep, sl_thd_aepinfo(sl__globals_core()->sched_thd)->tc, + fn, data, dcap, doff); if (ret) goto done; - t = sl_thd_alloc_init(aep, 0, prps); + t = sl_thd_alloc_init(aep, 0, prps, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); done: return t; } static struct sl_thd * -sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, thdclosure_index_t idx, sl_thd_property_t prps, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +sl_thd_aep_alloc_ext_dcb_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, thdclosure_index_t idx, sl_thd_property_t prps, cos_channelkey_t key, dcbcap_t dcap, dcboff_t doff, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) { struct cos_aep_info *aep = NULL; struct sl_thd *t = NULL; @@ -192,11 +210,11 @@ sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, t int ret = 0; if (prps & SL_THD_PROPERTY_SEND) { - assert(sched); + assert(sched && !doff); if (prps & SL_THD_PROPERTY_OWN_TCAP) { - ret = cos_initaep_alloc(comp, sl_thd_aepinfo(sched), prps & SL_THD_PROPERTY_SEND); + ret = cos_initaep_alloc(comp, sl_thd_aepinfo(sched), prps & SL_THD_PROPERTY_SEND, dcap); } else { - ret = cos_initaep_tcap_alloc(comp, sl_thd_tcap(sched), sl_thd_aepinfo(sched)); + ret = cos_initaep_tcap_alloc(comp, sl_thd_tcap(sched), sl_thd_aepinfo(sched), dcap); } if (ret) goto done; @@ -208,14 +226,15 @@ sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, t if (!aep) goto done; if (prps & SL_THD_PROPERTY_OWN_TCAP) { - ret = cos_aep_alloc_ext(aep, comp, sl_thd_aepinfo(sched), idx); + ret = cos_aep_alloc_ext(aep, comp, sl_thd_aepinfo(sched), idx, dcap, doff); } else { - ret = cos_aep_tcap_alloc_ext(aep, comp, sl_thd_aepinfo(sched), sl_thd_tcap(sched), idx); + ret = cos_aep_tcap_alloc_ext(aep, comp, sl_thd_aepinfo(sched), sl_thd_tcap(sched), idx, dcap, doff); } if (ret) goto done; - t = 
sl_thd_alloc_init(aep, 0, prps); + t = sl_thd_alloc_init(aep, 0, prps, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); if (extrcv) *extrcv = sl_thd_rcvcap(t); } @@ -264,23 +283,39 @@ sl_thd_comp_init(struct cos_defcompinfo *comp, int is_sched) } struct sl_thd * -sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax) +sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, vaddr_t *dcbaddr) +{ + PRINTC("UNIMPLEMENTED: Using RAW API which cannot manage DCB resource for child components\n"); + + return NULL; +} + +struct sl_thd * +sl_thd_initaep_alloc_dcb(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, dcbcap_t dcap, microsec_t ipiwin, u32_t ipimax) { struct sl_thd *t = NULL; if (!comp) return NULL; sl_cs_enter(); - if (!is_sched) t = sl_thd_alloc_ext_no_cs(comp, 0); - else t = sl_thd_aep_alloc_ext_no_cs(comp, sched_thd, 0, (is_sched ? SL_THD_PROPERTY_SEND : 0) - | (own_tcap ? SL_THD_PROPERTY_OWN_TCAP : 0), key, ipiwin, ipimax, NULL); + if (!is_sched) t = sl_thd_alloc_ext_dcb_no_cs(comp, 0, dcap, 0); + else t = sl_thd_aep_alloc_ext_dcb_no_cs(comp, sched_thd, 0, (is_sched ? SL_THD_PROPERTY_SEND : 0) + | (own_tcap ? 
SL_THD_PROPERTY_OWN_TCAP : 0), key, dcap, 0, ipiwin, ipimax, NULL); sl_cs_exit(); return t; } struct sl_thd * -sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, vaddr_t *dcbaddr, arcvcap_t *extrcv) +{ + PRINTC("UNIMPLEMENTED: Using RAW API which cannot manage DCB resource for child components\n"); + + return NULL; +} + +struct sl_thd * +sl_thd_aep_alloc_ext_dcb(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, dcbcap_t dcap, dcboff_t doff, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) { struct sl_thd *t = NULL; @@ -288,9 +323,9 @@ sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thd sl_cs_enter(); if (!is_aep) own_tcap = 0; if (is_aep) { - t = sl_thd_aep_alloc_ext_no_cs(comp, sched_thd, idx, own_tcap ? SL_THD_PROPERTY_OWN_TCAP : 0, key, ipiwin, ipimax, extrcv); + t = sl_thd_aep_alloc_ext_dcb_no_cs(comp, sched_thd, idx, own_tcap ? 
SL_THD_PROPERTY_OWN_TCAP : 0, key, dcap, doff, ipiwin, ipimax, extrcv); } else { - t = sl_thd_alloc_ext_no_cs(comp, idx); + t = sl_thd_alloc_ext_dcb_no_cs(comp, idx, dcap, doff); } sl_cs_exit(); @@ -311,7 +346,7 @@ sl_thd_init_ext(struct cos_aep_info *aepthd, struct sl_thd *sched) *aep = *aepthd; /* TODO: use sched info for parent -> child notifications */ - t = sl_thd_alloc_init(aep, 0, 0); + t = sl_thd_alloc_init(aep, 0, 0, NULL); done: sl_cs_exit(); @@ -320,9 +355,11 @@ sl_thd_init_ext(struct cos_aep_info *aepthd, struct sl_thd *sched) } struct sl_thd * -sl_thd_retrieve(thdid_t tid) +sl_thd_retrieve_lazy(thdid_t tid) { - return sl_mod_thd_get(sl_thd_lookup_backend(tid)); + /* without capmgr, there is no lazy retrieval of threads! */ + assert(0); + return NULL; } void @@ -334,3 +371,40 @@ sl_thd_free(struct sl_thd *t) sl_thd_free_no_cs(t); sl_cs_exit(); } + +int +sl_thd_migrate_no_cs(struct sl_thd *t, cpuid_t core) +{ + struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *ci = cos_compinfo_get(dci); + struct sl_thd_policy *x = NULL; + int ret; + + if (t->properties) return -1; + if (t->state != SL_THD_RUNNABLE) return -1; + ret = cos_thd_migrate(ci, sl_thd_thdcap(t), core); + if (ret) return -1; + sl_mod_thd_delete(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), -1); + + x = sl_thd_migrate_backend(sl_mod_thd_policy_get(t), core); + if (!x) return -1; + + return 0; +} + +int +sl_thd_migrate(thdid_t tid, cpuid_t core) +{ + int ret; + struct sl_thd *c = sl_thd_curr(), *t = sl_thd_lkup(tid); + + if (core == cos_cpuid()) return -1; + assert(c != t); + sl_cs_enter(); + ret = sl_thd_migrate_no_cs(t, core); + sl_cs_exit(); + + return ret; +} + diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index fe297c1be9..095dffb072 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -7,55 +7,56 @@ #include #include -#include +#include #include #include 
#include #include #include +#include +#include struct sl_global sl_global_data; -struct sl_global_cpu sl_global_cpu_data[NUM_CPU] CACHE_ALIGNED; +struct sl_global_core sl_global_core_data[NUM_CPU] CACHE_ALIGNED; static void sl_sched_loop_intern(int non_block) __attribute__((noreturn)); -extern struct sl_thd *sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t prps); -extern int sl_xcpu_process_no_cs(void); -extern void sl_xcpu_asnd_alloc(void); +extern struct sl_thd *sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t prps, struct cos_dcb_info *dcb); +extern int sl_xcore_process_no_cs(void); +extern void sl_xcore_asnd_alloc(void); /* * These functions are removed from the inlined fast-paths of the * critical section (cs) code to save on code size/locality */ int -sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, thdcap_t curr, sched_tok_t tok) +sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_core *gcore, struct sl_thd *curr, sched_tok_t tok) { - struct sl_thd *t = sl_thd_curr(); - struct sl_global_cpu *g = sl__globals_cpu(); +#ifdef SL_CS int ret; /* recursive locks are not allowed */ - assert(csi->s.owner != sl_thd_thdcap(t)); + assert(csi->s.owner != sl_thd_thdcap(curr)); if (!csi->s.contention) { csi->s.contention = 1; - if (!ps_cas(&g->lock.u.v, cached->v, csi->v)) return 1; + if (!ps_upcas(&gcore->lock.u.v, cached->v, csi->v)) return 1; } /* Switch to the owner of the critical section, with inheritance using our tcap/priority */ - if ((ret = cos_defswitch(csi->s.owner, t->prio, csi->s.owner == sl_thd_thdcap(g->sched_thd) ? - TCAP_TIME_NIL : g->timeout_next, tok))) return ret; + if ((ret = cos_defswitch(csi->s.owner, curr->prio, csi->s.owner == sl_thd_thdcap(gcore->sched_thd) ? 
+ TCAP_TIME_NIL : gcore->timeout_next, tok))) return ret; /* if we have an outdated token, then we want to use the same repeat loop, so return to that */ +#endif return 1; } /* Return 1 if we need a retry, 0 otherwise */ int -sl_cs_exit_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, sched_tok_t tok) +sl_cs_exit_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_core *gcore, sched_tok_t tok) { - struct sl_thd *t = sl_thd_curr(); - struct sl_global_cpu *g = sl__globals_cpu(); - - if (!ps_cas(&g->lock.u.v, cached->v, 0)) return 1; +#ifdef SL_CS + if (!ps_upcas(&gcore->lock.u.v, cached->v, 0)) return 1; /* let the scheduler thread decide which thread to run next, inheriting our budget/priority */ - cos_defswitch(g->sched_thdcap, t->prio, TCAP_TIME_NIL, tok); + cos_defswitch(gcore->sched_thdcap, sl_thd_curr()->prio, TCAP_TIME_NIL, tok); +#endif return 0; } @@ -109,27 +110,6 @@ sl_timeout_remove(struct sl_thd *t) t->timeout_idx = -1; } -void -sl_thd_free_no_cs(struct sl_thd *t) -{ - struct sl_thd *ct = sl_thd_curr(); - - assert(t); - assert(t->state != SL_THD_FREE); - if (t->state == SL_THD_BLOCKED_TIMEOUT) sl_timeout_remove(t); - sl_thd_index_rem_backend(sl_mod_thd_policy_get(t)); - sl_mod_thd_delete(sl_mod_thd_policy_get(t)); - t->state = SL_THD_FREE; - /* TODO: add logic for the graveyard to delay this deallocation if t == current */ - sl_thd_free_backend(sl_mod_thd_policy_get(t)); - - /* thread should not continue to run if it deletes itself. */ - if (unlikely(t == ct)) { - while (1) sl_cs_exit_schedule(); - /* FIXME: should never get here, but tcap mechanism can let a child scheduler run! 
*/ - } -} - static int __sl_timeout_compare_min(void *a, void *b) { @@ -151,6 +131,29 @@ sl_timeout_init(microsec_t period) heap_init(sl_timeout_heap(), SL_MAX_NUM_THDS, __sl_timeout_compare_min, __sl_timeout_update_idx); } +void +sl_thd_free_no_cs(struct sl_thd *t) +{ + struct sl_thd *ct = sl_thd_curr(); + + assert(t); + assert(t->state != SL_THD_FREE); + if (t->state == SL_THD_BLOCKED_TIMEOUT) sl_timeout_remove(t); + sl_thd_index_rem_backend(sl_mod_thd_policy_get(t)); + sl_mod_thd_delete(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), -1); + t->state = SL_THD_FREE; + /* TODO: add logic for the graveyard to delay this deallocation if t == current */ + sl_thd_free_backend(sl_mod_thd_policy_get(t)); + + /* thread should not continue to run if it deletes itself. */ + if (unlikely(t == ct)) { + while (1) { + sl_cs_exit_schedule(); + } + /* FIXME: should never get here, but tcap mechanism can let a child scheduler run! */ + } +} /* * This API is only used by the scheduling thread to block an AEP thread. * AEP thread scheduling events could be redundant. 
@@ -161,7 +164,7 @@ int sl_thd_sched_block_no_cs(struct sl_thd *t, sl_thd_state_t block_type, cycles_t timeout) { assert(t); - assert(t != sl__globals_cpu()->idle_thd && t != sl__globals_cpu()->sched_thd); + assert(t != sl__globals_core()->idle_thd && t != sl__globals_core()->sched_thd); assert(block_type == SL_THD_BLOCKED_TIMEOUT || block_type == SL_THD_BLOCKED); if (t->schedthd) return 0; @@ -178,6 +181,7 @@ sl_thd_sched_block_no_cs(struct sl_thd *t, sl_thd_state_t block_type, cycles_t t assert(sl_thd_is_runnable(t)); sl_mod_block(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), -1); update: t->state = block_type; @@ -212,9 +216,11 @@ sl_thd_sched_unblock_no_cs(struct sl_thd *t) int sl_thd_block_no_cs(struct sl_thd *t, sl_thd_state_t block_type, cycles_t timeout) { - assert(t); - assert(t != sl__globals_cpu()->idle_thd && t != sl__globals_cpu()->sched_thd); - assert(sl_thd_curr() == t); /* only current thread is allowed to block itself */ + assert(t && sl_thd_curr() == t); /* only current thread is allowed to block itself */ + assert(t != sl__globals_core()->idle_thd && t != sl__globals_core()->sched_thd); + /* interrupt thread could run and block itself before scheduler sees any of that! 
*/ + sl_thd_sched_unblock_no_cs(t); + assert(sl_thd_is_runnable(t)); assert(block_type == SL_THD_BLOCKED_TIMEOUT || block_type == SL_THD_BLOCKED); if (t->schedthd) { @@ -230,9 +236,9 @@ sl_thd_block_no_cs(struct sl_thd *t, sl_thd_state_t block_type, cycles_t timeout } /* reset rcv_suspended if the scheduler thinks "curr" was suspended on cos_rcv previously */ - sl_thd_sched_unblock_no_cs(t); assert(t->state == SL_THD_RUNNABLE); sl_mod_block(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), -1); t->state = block_type; if (block_type == SL_THD_BLOCKED_TIMEOUT) sl_timeout_block(t, timeout); @@ -254,6 +260,7 @@ sl_thd_block(thdid_t tid) return; } sl_cs_exit_schedule(); + assert(sl_thd_is_runnable(t)); return; } @@ -326,11 +333,11 @@ sl_thd_block_expiry(struct sl_thd *t) { cycles_t abs_timeout = 0; - assert(t != sl__globals_cpu()->idle_thd && t != sl__globals_cpu()->sched_thd); + assert(t != sl__globals_core()->idle_thd && t != sl__globals_core()->sched_thd); sl_cs_enter(); if (!(t->properties & SL_THD_PROPERTY_OWN_TCAP)) { assert(!t->rcv_suspended); - abs_timeout = sl__globals_cpu()->timeout_next; + abs_timeout = sl__globals_core()->timeout_next; } else { assert(t->period); abs_timeout = t->last_replenish + t->period; @@ -372,6 +379,7 @@ sl_thd_sched_wakeup_no_cs(struct sl_thd *t) if (t->state == SL_THD_BLOCKED_TIMEOUT) sl_timeout_remove(t); t->state = SL_THD_RUNNABLE; sl_mod_wakeup(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); return 0; } @@ -384,11 +392,12 @@ int sl_thd_wakeup_no_cs_rm(struct sl_thd *t) { assert(t); - assert(t != sl__globals_cpu()->idle_thd && t != sl__globals_cpu()->sched_thd); + assert(t != sl__globals_core()->idle_thd && t != sl__globals_core()->sched_thd); assert(t->state == SL_THD_BLOCKED || t->state == SL_THD_BLOCKED_TIMEOUT); t->state = SL_THD_RUNNABLE; sl_mod_wakeup(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); t->rcv_suspended = 0; 
return 0; @@ -406,8 +415,23 @@ sl_thd_wakeup_no_cs(struct sl_thd *t) return 0; } - if (unlikely(sl_thd_is_runnable(t))) { - /* t->state == SL_THD_WOKEN? multiple wakeups? */ +// if (unlikely(sl_thd_is_runnable(t))) { +// /* t->state == SL_THD_WOKEN? multiple wakeups? */ +// t->state = SL_THD_WOKEN; +// return 1; +// } + /* + * TODO: with blockpoints, multiple wakeup problem might go away. + * will try that next! + * + * For now, if a thread creates N tasks and if at least two of them + * complete before master goes to block, which can happen on multi-core + * execution of tasks, then that results in multiple wakeups! + */ + if (unlikely(t->state == SL_THD_WOKEN)) { + t->state = SL_THD_RUNNABLE; + return 1; + } else if (unlikely(t->state == SL_THD_RUNNABLE)) { t->state = SL_THD_WOKEN; return 1; } @@ -435,60 +459,47 @@ sl_thd_wakeup(thdid_t tid) return; } -void -sl_thd_yield_cs_exit(thdid_t tid) +static inline void +sl_thd_yield_cs_exit_intern(thdid_t tid) { struct sl_thd *t = sl_thd_curr(); /* reset rcv_suspended if the scheduler thinks "curr" was suspended on cos_rcv previously */ sl_thd_sched_unblock_no_cs(t); - if (tid) { + if (likely(tid)) { struct sl_thd *to = sl_thd_lkup(tid); - assert(to); sl_cs_exit_switchto(to); } else { - if (likely(t != sl__globals_cpu()->sched_thd && t != sl__globals_cpu()->idle_thd)) sl_mod_yield(sl_mod_thd_policy_get(t), NULL); + if (likely(t != sl__globals_core()->sched_thd && t != sl__globals_core()->idle_thd)) sl_mod_yield(sl_mod_thd_policy_get(t), NULL); sl_cs_exit_schedule(); } } -void -sl_thd_yield(thdid_t tid) -{ - sl_cs_enter(); - sl_thd_yield_cs_exit(tid); -} void -sl_thd_event_info_reset(struct sl_thd *t) +sl_thd_yield_cs_exit(thdid_t tid) { - t->event_info.blocked = 0; - t->event_info.cycles = 0; - t->event_info.timeout = 0; + sl_thd_yield_cs_exit_intern(tid); } -static inline void -sl_thd_event_enqueue(struct sl_thd *t, int blocked, cycles_t cycles, tcap_time_t timeout) +void +sl_thd_yield_intern(thdid_t tid) { - struct 
sl_global_cpu *g = sl__globals_cpu(); - - if (ps_list_singleton(t, SL_THD_EVENT_LIST)) ps_list_head_append(&g->event_head, t, SL_THD_EVENT_LIST); - - t->event_info.blocked = blocked; - t->event_info.cycles += cycles; - t->event_info.timeout = timeout; + sl_cs_enter(); + sl_thd_yield_cs_exit_intern(tid); } -static inline void -sl_thd_event_dequeue(struct sl_thd *t, int *blocked, cycles_t *cycles, tcap_time_t *timeout) +void +sl_thd_yield_intern_timeout(cycles_t abs_timeout) { - ps_list_rem(t, SL_THD_EVENT_LIST); + struct sl_thd *t = sl_thd_curr(); - *blocked = t->event_info.blocked; - *cycles = t->event_info.cycles; - *timeout = t->event_info.timeout; - sl_thd_event_info_reset(t); + sl_cs_enter(); + /* reset rcv_suspended if the scheduler thinks "curr" was suspended on cos_rcv previously */ + sl_thd_sched_unblock_no_cs(t); + if (likely(t != sl__globals_core()->sched_thd && t != sl__globals_core()->idle_thd)) sl_mod_yield(sl_mod_thd_policy_get(t), NULL); + sl_cs_exit_schedule_timeout(abs_timeout); } void @@ -498,7 +509,7 @@ sl_thd_exit() } void -sl_thd_param_set(struct sl_thd *t, sched_param_t sp) +sl_thd_param_set_no_cs(struct sl_thd *t, sched_param_t sp) { sched_param_type_t type; unsigned int value; @@ -525,84 +536,114 @@ sl_thd_param_set(struct sl_thd *t, sched_param_t sp) sl_mod_thd_param_set(sl_mod_thd_policy_get(t), type, value); } +void +sl_thd_param_set(struct sl_thd *t, sched_param_t sp) +{ + assert(t); + + sl_cs_enter(); + + sl_thd_param_set_no_cs(t, sp); + sl_cs_exit(); +} + void sl_timeout_period(microsec_t period) { cycles_t p = sl_usec2cyc(period); - sl__globals_cpu()->period = p; - sl_timeout_relative(p); + sl__globals_core()->period = p; } /* engage space heater mode */ void sl_idle(void *d) -{ while (1) ; } +{ + struct sl_global_core *gc = sl__globals_core(); + + while (1) { + cycles_t now = sl_now(); + + do { + if (cos_sched_ispending() || +#if NUM_CPU > 1 + ck_ring_size(sl__ring_curr()) != 0 || +#endif + !sl_child_notif_empty()) break; + now = 
sl_now(); + } while (now < gc->timer_next); + sl_thd_activate_c(gc->sched_thd, cos_sched_sync(), 0, 0, gc->idle_thd, gc); + } +} /* call from the user? */ static void -sl_global_init(u32_t *cpu_bmp) +sl_global_init(u32_t *core_bmp) { struct sl_global *g = sl__globals(); unsigned int i = 0; memset(g, 0, sizeof(struct sl_global)); + assert(sizeof(struct cos_scb_info) * NUM_CPU <= COS_SCB_SIZE && COS_SCB_SIZE == PAGE_SIZE); + g->scb_area = (struct cos_scb_info *)cos_scb_info_get(); for (i = 0; i < NUM_CPU; i++) { - if (!bitmap_check(cpu_bmp, i)) continue; + if (!bitmap_check(core_bmp, i)) continue; - bitmap_set(g->cpu_bmp, i); - ck_ring_init(sl__ring(i), SL_XCPU_RING_SIZE); + bitmap_set(g->core_bmp, i); + ck_ring_init(sl__ring(i), SL_XCORE_RING_SIZE); } } void -sl_init_cpubmp(microsec_t period, u32_t *cpubmp) +sl_init_corebmp(microsec_t period, u32_t *corebmp) { int i; - static volatile int first = 1, init_done = 0; - struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); - struct cos_compinfo *ci = cos_compinfo_get(dci); - struct sl_global_cpu *g = sl__globals_cpu(); - struct cos_aep_info *saep = cos_sched_aep_get(dci); - - if (ps_cas((unsigned long *)&first, 1, 0)) { - sl_global_init(cpubmp); - + static volatile unsigned long first = NUM_CPU + 1, init_done = 0; + struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *ci = cos_compinfo_get(dci); + struct sl_global_core *g = sl__globals_core(); + struct cos_aep_info *ga = cos_sched_aep_get(dci); + + if (ps_cas((unsigned long *)&first, NUM_CPU + 1, cos_cpuid())) { + sl_global_init(corebmp); ps_faa((unsigned long *)&init_done, 1); } else { /* wait until global ring buffers are initialized correctly! 
*/ while (!ps_load((unsigned long *)&init_done)) ; /* make sure this scheduler is active on this cpu/core */ - assert(sl_cpu_active()); + assert(sl_core_active()); } /* must fit in a word */ assert(sizeof(struct sl_cs) <= sizeof(unsigned long)); - memset(g, 0, sizeof(struct sl_global_cpu)); + memset(g, 0, sizeof(struct sl_global_core)); - g->cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - g->lock.u.v = 0; + g->cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + g->lock.u.v = 0; + g->scb_info = ((sl__globals()->scb_area) + cos_cpuid()); sl_thd_init_backend(); sl_mod_init(); sl_timeout_init(period); - /* Create the scheduler thread for us. cos_sched_aep_get() is from global(static) memory */ - g->sched_thd = sl_thd_alloc_init(saep, 0, 0); + /* Create the scheduler thread for us. */ + g->sched_thd = sl_thd_alloc_init(ga, 0, 0, (struct cos_dcb_info *)cos_init_dcb_get()); assert(g->sched_thd); - g->sched_thdcap = saep->thd; - g->sched_tcap = saep->tc; - g->sched_rcv = saep->rcv; + g->sched_thdcap = ga->thd; + g->sched_tcap = ga->tc; + g->sched_rcv = ga->rcv; assert(g->sched_rcv); g->sched_thd->prio = TCAP_PRIO_MAX; ps_list_head_init(&g->event_head); + assert(cos_thdid() == sl_thd_thdid(g->sched_thd)); + g->scb_info->curr_thd = 0; g->idle_thd = sl_thd_alloc(sl_idle, NULL); assert(g->idle_thd); /* all cores that this sched runs on, must be initialized by now so "asnd"s can be created! */ - sl_xcpu_asnd_alloc(); + sl_xcore_asnd_alloc(); return; } @@ -611,42 +652,106 @@ sl_init_cpubmp(microsec_t period, u32_t *cpubmp) void sl_init(microsec_t period) { - u32_t cpubmp[NUM_CPU_BMP_WORDS] = { 0 }; + u32_t corebmp[NUM_CPU_BMP_WORDS] = { 0 }; /* runs on all cores.. 
*/ - bitmap_set_contig(cpubmp, 0, NUM_CPU, 1); - sl_init_cpubmp(period, cpubmp); + bitmap_set_contig(corebmp, 0, NUM_CPU, 1); + sl_init_corebmp(period, corebmp); +} + +static inline int +__sl_sched_events_present(void) +{ + struct cos_scb_info *scb = sl_scb_info_core(); + struct cos_sched_ring *ring = &scb->sched_events; + + return __cos_sched_events_present(ring); +} + +static inline int +__sl_sched_event_consume(struct cos_sched_event *e) +{ + struct cos_scb_info *scb = sl_scb_info_core(); + struct cos_sched_ring *ring = &scb->sched_events; + + return __cos_sched_event_consume(ring, e); +} + +static inline int +__sl_sched_rcv(rcv_flags_t rf, struct cos_sched_event *e) +{ + struct sl_global_core *g = sl__globals_core(); +#if 0 + struct sl_thd *curr = sl_thd_curr(); + struct cos_dcb_info *cd = sl_thd_dcbinfo(curr); + int ret = 0; +// if (cos_spd_id() != 4) printc("D"); + + assert(curr == g->sched_thd); + if (!cd) return cos_ul_sched_rcv(g->sched_rcv, rf, g->timeout_next, e); + + rf |= RCV_ULSCHED_RCV; + + __asm__ __volatile__ ( \ + "pushl %%ebp\n\t" \ + "movl %%esp, %%ebp\n\t" \ + "movl $1f, (%%eax)\n\t" \ + "movl %%esp, 4(%%eax)\n\t" \ + "movl $2f, %%ecx\n\t" \ + "movl %%edx, %%eax\n\t" \ + "inc %%eax\n\t" \ + "shl $16, %%eax\n\t" \ + "movl $0, %%edx\n\t" \ + "movl $0, %%edi\n\t" \ + "sysenter\n\t" \ + "jmp 2f\n\t" \ + ".align 4\n\t" \ + "1:\n\t" \ + "movl $1, %%eax\n\t" \ + ".align 4\n\t" \ + "2:\n\t" \ + "popl %%ebp\n\t" \ + : "=a" (ret) + : "a" (cd), "b" (rf), "S" (g->timeout_next), "d" (g->sched_rcv) + : "memory", "cc", "ecx", "edi"); + +// if (cos_spd_id() != 4) printc("E"); +// if (cos_thdid() == 7) PRINTC("%s:%d %d\n", __func__, __LINE__, ret); + cd = sl_thd_dcbinfo(sl_thd_curr()); + cd->sp = 0; + + rf |= RCV_ULONLY; +#endif + return cos_ul_sched_rcv(g->sched_rcv, rf, g->timeout_next, e); } static void sl_sched_loop_intern(int non_block) { - struct sl_global_cpu *g = sl__globals_cpu(); - rcv_flags_t rfl = (non_block ? 
RCV_NON_BLOCKING : 0) | RCV_ALL_PENDING; + struct sl_global_core *g = sl__globals_core(); + rcv_flags_t rfl = (non_block ? RCV_NON_BLOCKING : 0); - assert(sl_cpu_active()); + assert(sl_thd_curr() == g->sched_thd); + assert(sl_core_active()); while (1) { int pending; do { - thdid_t tid; - int blocked, rcvd; - cycles_t cycles; - tcap_time_t timeout = g->timeout_next, thd_timeout; struct sl_thd *t = NULL, *tn = NULL; struct sl_child_notification notif; + struct cos_sched_event e = { .tid = 0 }; + /* * a child scheduler may receive both scheduling notifications (block/unblock * states of it's child threads) and normal notifications (mainly activations from * it's parent scheduler). */ - pending = cos_sched_rcv(g->sched_rcv, rfl, timeout, - &rcvd, &tid, &blocked, &cycles, &thd_timeout); - if (!tid) goto pending_events; + pending = __sl_sched_rcv(rfl, &e); + if (pending < 0 || !e.tid) goto pending_events; - t = sl_thd_lkup(tid); + t = sl_thd_lkup(e.tid); assert(t); /* don't report the idle thread or a freed thread */ if (unlikely(t == g->idle_thd || t->state == SL_THD_FREE)) goto pending_events; @@ -658,12 +763,15 @@ sl_sched_loop_intern(int non_block) * To avoid dropping events, add the events to the scheduler event list and processing all * the pending events after the scheduler can successfully take the lock. 
*/ - sl_thd_event_enqueue(t, blocked, cycles, thd_timeout); + sl_thd_event_enqueue(t, &e.evt); pending_events: if (ps_list_head_empty(&g->event_head) && +#if NUM_CPU > 1 ck_ring_size(sl__ring_curr()) == 0 && - sl_child_notif_empty()) continue; +#endif + sl_child_notif_empty() && + !cos_sched_events_isempty()) continue; /* * receiving scheduler notifications is not in critical section mainly for @@ -676,21 +784,21 @@ sl_sched_loop_intern(int non_block) ps_list_foreach_del(&g->event_head, t, tn, SL_THD_EVENT_LIST) { /* remove the event from the list and get event info */ - sl_thd_event_dequeue(t, &blocked, &cycles, &thd_timeout); + sl_thd_event_dequeue(t, &e.evt); /* outdated event for a freed thread */ if (t->state == SL_THD_FREE) continue; - sl_mod_execution(sl_mod_thd_policy_get(t), cycles); + sl_mod_execution(sl_mod_thd_policy_get(t), e.evt.elapsed_cycs); - if (blocked) { + if (e.evt.blocked) { sl_thd_state_t state = SL_THD_BLOCKED; cycles_t abs_timeout = 0; - if (likely(cycles)) { - if (thd_timeout) { + if (likely(e.evt.elapsed_cycs)) { + if (e.evt.next_timeout) { state = SL_THD_BLOCKED_TIMEOUT; - abs_timeout = tcap_time2cyc(thd_timeout, sl_now()); + abs_timeout = tcap_time2cyc(e.evt.next_timeout, sl_now()); } sl_thd_sched_block_no_cs(t, state, abs_timeout); } @@ -707,15 +815,17 @@ sl_sched_loop_intern(int non_block) else sl_thd_wakeup_no_cs(t); } +#if NUM_CPU > 1 /* process cross-core requests */ - sl_xcpu_process_no_cs(); + sl_xcore_process_no_cs(); +#endif sl_cs_exit(); } while (pending > 0); if (sl_cs_enter_sched()) continue; /* If switch returns an inconsistency, we retry anyway */ - sl_cs_exit_schedule_nospin(); + sl_cs_exit_schedule_nospin_timeout(0); } } @@ -730,3 +840,36 @@ sl_sched_loop_nonblock(void) { sl_sched_loop_intern(1); } + +void +sl_thd_replenish_no_cs(struct sl_thd *t, cycles_t now) +{ +#ifdef SL_REPLENISH + struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); + tcap_res_t currbudget = 0; + cycles_t replenish; + int ret; + 
+ if (likely(!(t->properties & SL_THD_PROPERTY_OWN_TCAP))) return; + if (!t->budget) return; + assert(t->period); + assert(sl_thd_tcap(t) != sl__globals_core()->sched_tcap); + + if (!(t->last_replenish == 0 || t->last_replenish + t->period <= now)) return; + + replenish = now - ((now - t->last_replenish) % t->period); + + ret = 0; + currbudget = (tcap_res_t)cos_introspect(ci, sl_thd_tcap(t), TCAP_GET_BUDGET); + + if (!cycles_same(currbudget, t->budget, SL_CYCS_DIFF) && currbudget < t->budget) { + tcap_res_t transfer = t->budget - currbudget; + + /* tcap_transfer will assign sched_tcap's prio to t's tcap if t->prio == 0, which we don't want. */ + assert(t->prio >= TCAP_PRIO_MAX && t->prio <= TCAP_PRIO_MIN); + ret = cos_tcap_transfer(sl_thd_rcvcap(t), sl__globals_core()->sched_tcap, transfer, t->prio); + } + + if (likely(ret == 0)) t->last_replenish = replenish; +#endif +} diff --git a/src/components/lib/sl/sl_thd_static_backend.c b/src/components/lib/sl/sl_thd_static_backend.c index 86aa4eac66..2985f8f5e5 100644 --- a/src/components/lib/sl/sl_thd_static_backend.c +++ b/src/components/lib/sl/sl_thd_static_backend.c @@ -17,26 +17,63 @@ static struct cos_aep_info __sl_aep_infos[NUM_CPU][SL_MAX_NUM_THDS]; static u32_t __sl_aep_free_off[NUM_CPU]; /* Default implementations of backend functions */ -struct sl_thd_policy * -sl_thd_alloc_backend(thdid_t tid) +static inline struct sl_thd_policy * +sl_thd_alloc_backend_core(cpuid_t core, thdid_t tid) { - assert(tid < SL_MAX_NUM_THDS); + assert(tid < SL_MAX_NUM_THDS && core >= 0 && core < NUM_CPU); - return &(__sl_threads[cos_cpuid()][tid]); + return &(__sl_threads[core][tid]); } -struct cos_aep_info * -sl_thd_alloc_aep_backend(void) +static inline struct cos_aep_info * +sl_thd_alloc_aep_backend_core(cpuid_t core) { + int off = 0; struct cos_aep_info *aep = NULL; - assert(__sl_aep_free_off[cos_cpuid()] < SL_MAX_NUM_THDS); - aep = &(__sl_aep_infos[cos_cpuid()][__sl_aep_free_off[cos_cpuid()]]); - ps_faa((unsigned long 
*)&(__sl_aep_free_off[cos_cpuid()]), 1); + assert(core < NUM_CPU && core >= 0); + off = ps_faa((unsigned long *)&__sl_aep_free_off[core], 1); + assert(off < SL_MAX_NUM_THDS); + aep = &__sl_aep_infos[core][off]; return aep; } +struct sl_thd_policy * +sl_thd_migrate_backend(struct sl_thd_policy *t, cpuid_t core) +{ + assert(core != cos_cpuid() && core >= 0 && core < NUM_CPU); + + struct cos_aep_info *a = sl_thd_alloc_aep_backend_core(core); + struct cos_aep_info *b = sl_thd_aepinfo(sl_mod_thd_get(t)); + struct sl_thd_policy *tc = sl_thd_alloc_backend_core(core, b->tid); + struct sl_thd *x = sl_mod_thd_get(tc), *y = sl_mod_thd_get(t); + + memset(a, 0, sizeof(struct cos_aep_info)); + a->tid = b->tid; + a->thd = b->thd; + assert(b->rcv == 0 && b->tc == 0); + memset(b, 0, sizeof(struct cos_aep_info)); + + memcpy(tc, t, sizeof(struct sl_thd_policy)); + x->aepinfo = a; + memset(t, 0, sizeof(struct sl_thd_policy)); + + return tc; +} + +struct sl_thd_policy * +sl_thd_alloc_backend(thdid_t tid) +{ + return sl_thd_alloc_backend_core(cos_cpuid(), tid); +} + +struct cos_aep_info * +sl_thd_alloc_aep_backend(void) +{ + return sl_thd_alloc_aep_backend_core(cos_cpuid()); +} + void sl_thd_free_backend(struct sl_thd_policy *t) { } diff --git a/src/components/lib/sl/sl_xcore.c b/src/components/lib/sl/sl_xcore.c new file mode 100644 index 0000000000..b105a18411 --- /dev/null +++ b/src/components/lib/sl/sl_xcore.c @@ -0,0 +1,413 @@ +#include +#include +#include +#include +#include + +/******************************* Client-side ***************************/ + +/* static xcore thread backend! mainly for bookkeeping across cores! 
*/ +static struct sl_xcore_thd _xcore_thds[MAX_NUM_THREADS]; +extern void sl_thd_param_set_no_cs(struct sl_thd *, sched_param_t); + +static inline void +_sl_xcore_response_wait(struct sl_xcore_response *r) +{ + if (sl_thd_curr() != sl__globals_core()->sched_thd) { + if (!ps_load(&r->resp_ready)) sl_thd_block(0); + } else { + while (!ps_load(&r->resp_ready)) { + if (sl_cs_enter_sched()) continue; + sl_cs_exit_schedule_nospin(); + } + } + assert(r->resp_ready); +} + +static inline struct sl_xcore_thd * +_sl_xcore_thd_backend_lookup(thdid_t tid) +{ + return &_xcore_thds[tid]; +} + +static inline struct sl_xcore_thd * +_sl_xcore_thd_backend_init(thdid_t tid, cpuid_t core, asndcap_t snd) +{ + struct sl_xcore_thd *t = _sl_xcore_thd_backend_lookup(tid); + + if (unlikely(t->thd)) return t; + t->thd = tid; + t->core = core; + + return t; +} + +struct sl_xcore_thd * +sl_xcore_thd_lookup_init(thdid_t tid, cpuid_t core) +{ + struct sl_xcore_thd *t = _sl_xcore_thd_backend_lookup(tid); + + /* TODO: is this safe? a wrong coreid can cause DOS! */ + if (unlikely(!(t->thd))) return _sl_xcore_thd_backend_init(tid, core, 0); + + /* perhaps migrated! 
*/ + if (unlikely(t->core != core)) t->core = core; + /* if (unlikely(t->core != core)) return NULL; */ + + return t; +} + +struct sl_xcore_thd * +sl_xcore_thd_lookup(thdid_t tid) +{ + return _sl_xcore_thd_backend_lookup(tid); +} + +#define SL_XCORE_REQ(req, typ, resp) do { \ + req.type = typ; \ + req.client_core = cos_cpuid(); \ + req.client_thd = cos_thdid(); \ + req.response = resp; \ + } while (0) + +#define SL_XCORE_RESP(resp, typ) do { \ + resp.type = typ; \ + resp.resp_ready = 0; \ + } while (0) + +extern struct sl_thd *sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data); + +#define SL_IPI_ENABLE + +static inline int +_sl_xcore_request_enqueue_no_cs(cpuid_t core, struct sl_xcore_request *rq) +{ + int ret = 0; + + if (unlikely(core >= NUM_CPU)) return -1; + if (unlikely(core == cos_cpuid())) return -1; + if (unlikely(!bitmap_check(sl__globals()->core_bmp, core))) return -1; + ret = ck_ring_enqueue_mpsc_xcore(sl__ring(core), sl__ring_buffer(core), rq); + +#ifdef SL_IPI_ENABLE + asndcap_t snd = sl__globals()->xcore_asnd[cos_cpuid()][core]; + assert(snd); + + /* send an IPI for the request */ + cos_asnd(snd, 0); +#endif + + if (unlikely(ret == false)) return -1; + + return 0; +} + +static inline int +_sl_xcore_request_enqueue(cpuid_t core, struct sl_xcore_request *rq) +{ + int ret = 0; + + if (unlikely(core >= NUM_CPU)) return -1; + sl_cs_enter(); + ret = _sl_xcore_request_enqueue_no_cs(core, rq); + sl_cs_exit(); + if (unlikely(ret)) return -1; + + + return 0; +} + +struct sl_xcore_thd * +sl_xcore_thd_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int nparams, sched_param_t params[]) +{ + int ret = 0; + asndcap_t snd = 0; + struct sl_xcore_request req; + struct sl_xcore_response resp; + thdid_t xcore_tid; + + SL_XCORE_REQ(req, SL_XCORE_THD_ALLOC, &resp); + SL_XCORE_RESP(resp, SL_XCORE_THD_ALLOC); + req.sl_xcore_req_thd_alloc.fn = fn; + req.sl_xcore_req_thd_alloc.data = data; + if (nparams) memcpy(req.sl_xcore_req_thd_alloc.params, params, sizeof(sched_param_t) * 
nparams); + req.sl_xcore_req_thd_alloc.param_count = nparams; + + ret = _sl_xcore_request_enqueue(core, &req); + if (unlikely(ret)) return NULL; + + /* Other core will wake this up after creation! */ + _sl_xcore_response_wait(&resp); + xcore_tid = resp.sl_xcore_resp_thd_alloc.tid; + assert(xcore_tid); + + return _sl_xcore_thd_backend_init(xcore_tid, core, 0); +} + +struct sl_xcore_thd * +sl_xcore_thd_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, thdclosure_index_t idx, int nparams, sched_param_t params[]) +{ + return NULL; +} + +struct sl_xcore_thd * +sl_xcore_aep_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]) +{ + return NULL; +} + +struct sl_xcore_thd * +sl_xcore_aep_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, thdclosure_index_t idx, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]) +{ + return NULL; +} + +struct sl_xcore_thd * +sl_xcore_initaep_alloc(cpuid_t core, struct cos_defcompinfo *dci, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]) +{ + return NULL; +} + +struct sl_xcore_thd * +sl_xcore_initaep_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, struct cos_defcompinfo *sched, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]) +{ + return NULL; +} + +void +sl_xcore_thd_param_set(struct sl_xcore_thd *t, sched_param_t param) +{ + struct sl_xcore_request req; + cpuid_t core = sl_xcore_thd_core(t); + + SL_XCORE_REQ(req, SL_XCORE_THD_PARAM_SET, 0); + req.sl_xcore_req_thd_param_set.tid = sl_xcore_thd_thdid(t); + req.sl_xcore_req_thd_param_set.param = param; + + _sl_xcore_request_enqueue(core, &req); +} + +static inline void +_sl_xcore_thd_wakeup_tid_no_cs(thdid_t tid, cpuid_t core) +{ + struct sl_xcore_request req; + + SL_XCORE_REQ(req, SL_XCORE_THD_WAKEUP, 0); + req.sl_xcore_req_thd_wakeup.tid = tid; + _sl_xcore_request_enqueue_no_cs(core, &req); +} + +void +sl_xcore_thd_wakeup(struct sl_xcore_thd *t) 
+{ + struct sl_xcore_request req; + cpuid_t core = sl_xcore_thd_core(t); + + if (unlikely(!t)) return; + + SL_XCORE_REQ(req, SL_XCORE_THD_WAKEUP, 0); + req.sl_xcore_req_thd_wakeup.tid = sl_xcore_thd_thdid(t); + _sl_xcore_request_enqueue(core, &req); +} + +void +sl_xcore_thd_wakeup_tid(thdid_t tid, cpuid_t core) +{ + struct sl_xcore_thd *t = sl_xcore_thd_lookup(tid); + + assert(t->core == core); + + sl_xcore_thd_wakeup(t); +} + +int +sl_xcore_load_balance(void) +{ + struct sl_xcore_request req; + struct sl_xcore_response resp; + struct sl_global *g = sl__globals(); + unsigned max = 0, i, nthds = 0; + int core = -1, ret; + + for (i = 0; i < NUM_CPU; i++) { + if (!bitmap_check(g->core_bmp, i)) continue; + + if (g->nthds_running[i] <= max) continue; + + max = g->nthds_running[i]; + core = i; + break; + } + + if (max == 0 || core == -1) return -1; + + memset(&req, 0, sizeof(req)); + SL_XCORE_REQ(req, SL_XCORE_LOAD_BALANCE, &resp); + SL_XCORE_RESP(resp, SL_XCORE_LOAD_BALANCE); + req.sl_xcore_req_load_balance.nthds = 1; /* FIXME: lets start with just 1 */ + ret = _sl_xcore_request_enqueue((cpuid_t)core, &req); + if (unlikely(ret)) return -1; + + _sl_xcore_response_wait(&resp); + nthds = resp.sl_xcore_resp_load_balance.nthds; + if (!nthds) return 0; + + assert(nthds < SL_XCORE_MIGRATE_MAX); + sl_cs_enter(); + for (i = 0; i < nthds; i++) { + struct sl_thd *t = sl_thd_lkup(resp.sl_xcore_resp_load_balance.tid[i]); + + assert(t); + assert(t->state == SL_THD_RUNNABLE); + sl_mod_wakeup(sl_mod_thd_policy_get(t)); + ps_faa(&(g->nthds_running[cos_cpuid()]), 1); + } + sl_cs_exit(); + + return nthds; +} + +/******************************* Server-side ***************************/ +static inline void +_sl_xcore_respond(struct sl_xcore_request *req) +{ + struct sl_xcore_response *resp = req->response; + + if (!resp) return; + + assert(resp->type == req->type && ps_load(&resp->resp_ready) == 0); + ps_faa(&resp->resp_ready, 1); + _sl_xcore_thd_wakeup_tid_no_cs(req->client_thd, 
req->client_core); +} + +static inline int +_sl_xcore_req_thd_alloc_no_cs(struct sl_xcore_request *req) +{ + cos_thd_fn_t fn = req->sl_xcore_req_thd_alloc.fn; + void *data = req->sl_xcore_req_thd_alloc.data; + struct sl_thd *t; + struct sl_xcore_response *x = req->response; + int i; + + assert(fn); + + t = sl_thd_alloc_no_cs(fn, data); + assert(t); + if (likely(x)) x->sl_xcore_resp_thd_alloc.tid = sl_thd_thdid(t); + for (i = 0; i < req->sl_xcore_req_thd_alloc.param_count; i++) sl_thd_param_set_no_cs(t, req->sl_xcore_req_thd_alloc.params[i]); + + return 0; +} + +static inline int +_sl_xcore_req_thd_param_set_no_cs(struct sl_xcore_request *req) +{ + struct sl_thd *t = sl_thd_lkup(req->sl_xcore_req_thd_param_set.tid); + + if (!t) return -1; + sl_thd_param_set_no_cs(t, req->sl_xcore_req_thd_param_set.param); + + return 0; +} + +static inline int +_sl_xcore_req_thd_wakeup_no_cs(struct sl_xcore_request *req) +{ + struct sl_thd *t = sl_thd_lkup(req->sl_xcore_req_thd_param_set.tid); + + if (!t) return -1; + if (unlikely(t == sl__globals_core()->sched_thd)) return 0; + sl_thd_wakeup_no_cs(t); + + return 0; +} + +static inline void +_sl_xcore_req_load_balance_no_cs(struct sl_xcore_request *req) +{ + struct sl_global *g = sl__globals(); + int n = g->nthds_running[cos_cpuid()], i, j = 0; + struct sl_xcore_response *rp = req->response; + cpuid_t cl_core = req->client_core; + + if (n <= SL_XCORE_KEEP_MIN) return; + n -= SL_XCORE_KEEP_MIN; + + if (n > SL_XCORE_MIGRATE_MAX) n = SL_XCORE_MIGRATE_MAX; + if (n > req->sl_xcore_req_load_balance.nthds) n = req->sl_xcore_req_load_balance.nthds; + + assert(rp); + for (i = 0; i < n; i++) { + struct sl_thd_policy *t = sl_mod_last_schedule(); + thdid_t tid = 0; + struct sl_xcore_thd *xt = NULL; + + if (!t) break; + tid = sl_thd_thdid(sl_mod_thd_get(t)); + xt = sl_xcore_thd_lookup(tid); + assert(xt); + if (xt->thd == tid) assert(xt->core == cos_cpuid()); + if (sl_thd_migrate_no_cs(sl_mod_thd_get(t), cl_core)) break; + 
sl_xcore_thd_lookup_init(tid, cl_core); + rp->sl_xcore_resp_load_balance.tid[i] = tid; + } + rp->sl_xcore_resp_load_balance.nthds = i; + + return; +} + +int +sl_xcore_process_no_cs(void) +{ + int num = 0; + struct sl_xcore_request xcore_req; + + if (likely(NUM_CPU < 2)) return 0; + + while (ck_ring_dequeue_mpsc_xcore(sl__ring_curr(), sl__ring_buffer_curr(), &xcore_req) == true) { + assert(xcore_req.client_core != cos_cpuid()); + + switch(xcore_req.type) { + case SL_XCORE_THD_ALLOC: + { + _sl_xcore_req_thd_alloc_no_cs(&xcore_req); + break; + } + case SL_XCORE_THD_ALLOC_EXT: + case SL_XCORE_AEP_ALLOC: + case SL_XCORE_AEP_ALLOC_EXT: + case SL_XCORE_INITAEP_ALLOC: + case SL_XCORE_THD_DEALLOC: + { + PRINTC("Unimplemented request! Aborting!\n"); + assert(0); + + break; + } + case SL_XCORE_THD_PARAM_SET: + { + _sl_xcore_req_thd_param_set_no_cs(&xcore_req); + break; + } + case SL_XCORE_THD_WAKEUP: + { + _sl_xcore_req_thd_wakeup_no_cs(&xcore_req); + break; + } + case SL_XCORE_LOAD_BALANCE: + { + _sl_xcore_req_load_balance_no_cs(&xcore_req); + break; + } + default: + { + PRINTC("Unrecognized request! Aborting!\n"); + assert(0); + } + } + _sl_xcore_respond(&xcore_req); + num ++; + } + + return num; /* number of requests processed */ +} diff --git a/src/components/lib/sl/sl_xcpu.c b/src/components/lib/sl/sl_xcpu.c deleted file mode 100644 index 7afcef766e..0000000000 --- a/src/components/lib/sl/sl_xcpu.c +++ /dev/null @@ -1,130 +0,0 @@ -/** - * Redistribution of this file is permitted under the BSD two clause license. 
- * - * Copyright 2018, The George Washington University - * Author: Phani Gadepalli, phanikishoreg@gwu.edu - */ - -#include -#include -#include -#include -#include - -#define SL_REQ_THD_ALLOC(req, fn, data) do { \ - req.type = SL_XCPU_THD_ALLOC; \ - req.client = cos_cpuid(); \ - req.req_response = 0; \ - req.sl_xcpu_req_thd_alloc.fn = fn; \ - req.sl_xcpu_req_thd_alloc.data = data; \ - } while (0) - -extern struct sl_thd *sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data); - -int -sl_xcpu_thd_alloc(cpuid_t cpu, cos_thd_fn_t fn, void *data, sched_param_t params[]) -{ - int i, sz = sizeof(params) / sizeof(params[0]); - int ret = 0; - asndcap_t snd = 0; - struct sl_xcpu_request req; - - if (cpu == cos_cpuid()) return -EINVAL; - if (!bitmap_check(sl__globals()->cpu_bmp, cpu)) return -EINVAL; - - sl_cs_enter(); - - SL_REQ_THD_ALLOC(req, fn, data); - memcpy(req.params, params, sizeof(sched_param_t) * sz); - req.param_count = sz; - if (ck_ring_enqueue_mpsc_xcpu(sl__ring(cpu), sl__ring_buffer(cpu), &req) != true) { - ret = -ENOMEM; - } else { - snd = sl__globals()->xcpu_asnd[cos_cpuid()][cpu]; - assert(snd); - } - - sl_cs_exit(); - - if (!snd || ret) goto done; - - /* send an IPI for the request */ - ret = cos_asnd(snd, 1); - -done: - return ret; -} - -int -sl_xcpu_thd_alloc_ext(cpuid_t cpu, struct cos_defcompinfo *dci, thdclosure_index_t idx, sched_param_t params[]) -{ - return -ENOTSUP; -} - -int -sl_xcpu_aep_alloc(cpuid_t cpu, cos_thd_fn_t fn, void *data, int own_tcap, cos_channelkey_t key, sched_param_t params[]) -{ - return -ENOTSUP; -} - -int -sl_xcpu_aep_alloc_ext(cpuid_t cpu, struct cos_defcompinfo *dci, thdclosure_index_t idx, int own_tcap, cos_channelkey_t key, sched_param_t params[]) -{ - return -ENOTSUP; -} - -int -sl_xcpu_initaep_alloc(cpuid_t cpu, struct cos_defcompinfo *dci, int own_tcap, cos_channelkey_t key, sched_param_t params[]) -{ - return -ENOTSUP; -} - -int -sl_xcpu_initaep_alloc_ext(cpuid_t cpu, struct cos_defcompinfo *dci, struct cos_defcompinfo 
*sched, int own_tcap, cos_channelkey_t key, sched_param_t params[]) -{ - return -ENOTSUP; -} - -int -sl_xcpu_process_no_cs(void) -{ - int num = 0; - struct sl_xcpu_request xcpu_req; - - while (ck_ring_dequeue_mpsc_xcpu(sl__ring_curr(), sl__ring_buffer_curr(), &xcpu_req) == true) { - - assert(xcpu_req.client != cos_cpuid()); - switch(xcpu_req.type) { - case SL_XCPU_THD_ALLOC: - { - cos_thd_fn_t fn = xcpu_req.sl_xcpu_req_thd_alloc.fn; - void *data = xcpu_req.sl_xcpu_req_thd_alloc.data; - struct sl_thd *t; - int i; - - assert(fn); - - t = sl_thd_alloc_no_cs(fn, data); - assert(t); - for (i = 0; i < xcpu_req.param_count; i++) { - sl_thd_param_set(t, xcpu_req.params[i]); - } - - break; - } - case SL_XCPU_THD_ALLOC_EXT: - case SL_XCPU_AEP_ALLOC: - case SL_XCPU_AEP_ALLOC_EXT: - case SL_XCPU_INITAEP_ALLOC: - case SL_XCPU_THD_DEALLOC: - default: - { - PRINTC("Unimplemented request! Aborting!\n"); - assert(0); - } - } - num ++; - } - - return num; /* number of requests processed */ -} diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 0a294301fb..abbdc67bb2 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -15,6 +15,8 @@ #include "include/tcap.h" #include "include/chal/defs.h" #include "include/hw.h" +#include "include/scb.h" +#include "include/dcb.h" #define COS_DEFAULT_RET_CAP 0 @@ -82,6 +84,76 @@ printfn(struct pt_regs *regs) return 0; } +/* TODO: inline fast path and force non-inlined slow-path */ +static inline struct thread * +cap_ulthd_lazyupdate(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, int interrupt, struct comp_info **ci_ptr) +{ + struct thread *thd = thd_current(cos_info); + struct cap_thd *ch_ult = NULL; + struct thread *ulthd = NULL; + capid_t ultc = 0; + struct cos_scb_info *scb_core = NULL; /* per-core scb_info */ + + *ci_ptr = thd_invstk_current_compinfo(thd, cos_info); + + assert(*ci_ptr && (*ci_ptr)->captbl); + + if (unlikely(!(*ci_ptr)->scb_data)) goto done; + scb_core = (((*ci_ptr)->scb_data) + get_cpuid()); + ultc = 
scb_core->curr_thd; + /* reset inconsistency from user-level thd! */ + scb_core->curr_thd = 0; + if (!ultc && !interrupt) goto done; + + if (likely(ultc)) { + ch_ult = (struct cap_thd *)captbl_lkup((*ci_ptr)->captbl, ultc); + if (unlikely(!CAP_TYPECHK_CORE(ch_ult, CAP_THD))) ch_ult = NULL; + else ulthd = ch_ult->t; + } + if (unlikely(!ultc || !ulthd || ulthd->dcbinfo == NULL)) goto done; + if (ulthd == thd) goto done; + + thd_current_update(ulthd, thd, cos_info); + thd = ulthd; + *ci_ptr = thd_invstk_current_compinfo(thd, cos_info); + +done: + return thd; +} + +void +cos_cap_ipi_handling(void) +{ + int idx, end; + struct IPI_receiving_rings *receiver_rings; + struct xcore_ring * ring; + + receiver_rings = &IPI_cap_dest[get_cpuid()]; + + /* We need to scan the entire buffer once. */ + idx = receiver_rings->start; + end = receiver_rings->start - 1; // end is int type. could be -1. + receiver_rings->start = (receiver_rings->start + 1) % NUM_CPU; + + /* scan the first half */ + for (; idx < NUM_CPU; idx++) { + ring = &receiver_rings->IPI_source[idx]; + if (ring->sender != ring->receiver) { + process_ring(ring); + } + } + + /* and scan the second half */ + for (idx = 0; idx <= end; idx++) { + ring = &receiver_rings->IPI_source[idx]; + if (ring->sender != ring->receiver) { + process_ring(ring); + } + } + + return; +} + static void kmem_unalloc(unsigned long *pte) { @@ -287,6 +359,8 @@ cap_cpy(struct captbl *t, capid_t cap_to, capid_t capin_to, capid_t cap_from, ca type = ctfrom->type; sz = __captbl_cap2bytes(type); + /* don't allow cap copy on SCB/DCB */ + if (type == CAP_SCB || type == CAP_DCB) return -EINVAL; ctto = __cap_capactivate_pre(t, cap_to, capin_to, type, &ret); if (!ctto) return -EINVAL; @@ -435,7 +509,7 @@ cap_thd_switch(struct pt_regs *regs, struct thread *curr, struct thread *next, s preempt = thd_switch_update(next, &next->regs, 0); /* if switching to the preempted/awoken thread clear cpu local next_thdinfo */ - if (nti->thd && nti->thd == next) 
thd_next_thdinfo_update(cos_info, 0, 0, 0, 0); + //if (nti->thd && nti->thd == next) thd_next_thdinfo_update(cos_info, 0, 0, 0, 0); copy_all_regs(&next->regs, regs); @@ -447,7 +521,9 @@ notify_parent(struct thread *rcv_thd, int send) { struct thread *curr_notif = NULL, *prev_notif = NULL, *arcv_notif = NULL; int depth = 0; + cycles_t now; + rdtscll(now); /* hierarchical notifications - upto init (bounded by ARCV_NOTIF_DEPTH) */ prev_notif = rcv_thd; curr_notif = arcv_notif = arcv_thd_notif(prev_notif); @@ -455,6 +531,7 @@ notify_parent(struct thread *rcv_thd, int send) while (curr_notif && curr_notif != prev_notif) { assert(depth < ARCV_NOTIF_DEPTH); + prev_notif->event_epoch = now; thd_rcvcap_evt_enqueue(curr_notif, prev_notif); if (!(curr_notif->state & THD_STATE_RCVING)) break; @@ -500,7 +577,7 @@ asnd_process(struct thread *rcv_thd, struct thread *thd, struct tcap *rcv_tcap, { struct thread *next; - thd_rcvcap_pending_inc(rcv_thd); + thd_rcvcap_pending_set(rcv_thd); next = notify_process(rcv_thd, thd, rcv_tcap, tcap, tcap_next, yield); /* @@ -586,11 +663,19 @@ cap_switch(struct pt_regs *regs, struct thread *curr, struct thread *next, struc static int cap_sched_tok_validate(struct thread *rcvt, sched_tok_t usr_tok, struct comp_info *ci, struct cos_cpu_local_info *cos_info) { + struct cos_scb_info *scb_core = ci->scb_data + get_cpuid(); + assert(rcvt && usr_tok < ~0U); - /* race-condition check for user-level thread switches */ - if (thd_rcvcap_get_counter(rcvt) > usr_tok) return -EAGAIN; - thd_rcvcap_set_counter(rcvt, usr_tok); + /* + * Kernel increments the sched_tok on preemption only. + * The rest is all co-operative, so if sched_tok in scb page + * increments after someone fetching a tok, then check for that! + * + * FIXME: make sure we're checking the scb of the scheduling component and not in any other component. + * I don't know if the comp_info here is of the scheduling component! 
+ */ + if (unlikely(scb_core->sched_tok != usr_tok)) return -EAGAIN; return 0; } @@ -624,7 +709,9 @@ cap_thd_op(struct cap_thd *thd_cap, struct thread *thd, struct pt_regs *regs, st ret = cap_sched_tok_validate(rcvt, usr_counter, ci, cos_info); if (ret) return ret; - if (thd_rcvcap_pending(rcvt) > 0) { + /* only if it has scheduler events to process! */ + if (thd_rcvcap_evt_pending(rcvt)) { + printk("%s:%d\n", __func__, __LINE__); if (thd == rcvt) return -EBUSY; next = rcvt; @@ -650,7 +737,7 @@ cap_thd_op(struct cap_thd *thd_cap, struct thread *thd, struct pt_regs *regs, st } ret = cap_switch(regs, thd, next, tcap, timeout, ci, cos_info); - if (tc && tcap_current(cos_info) == tcap) tcap_setprio(tcap, prio); + if (tc && tcap_current(cos_info) == tcap && prio) tcap_setprio(tcap, prio); return ret; } @@ -680,13 +767,11 @@ cap_ipi_process(struct pt_regs *regs) struct tcap *tcap_curr, *tcap_next; struct comp_info *ci; int i, scan_base; - unsigned long ip, sp; - thd_curr = thd_next = thd_current(cos_info); + thd_next = thd_curr = cap_ulthd_lazyupdate(regs, cos_info, 1, &ci); + assert(ci && ci->captbl); receiver_rings = &IPI_cap_dest[get_cpuid()]; tcap_curr = tcap_next = tcap_current(cos_info); - ci = thd_invstk_current(thd_curr, &ip, &sp, cos_info); - assert(ci && ci->captbl); scan_base = receiver_rings->start; receiver_rings->start = (receiver_rings->start + 1) % NUM_CPU; @@ -767,7 +852,8 @@ cap_asnd_op(struct cap_asnd *asnd, struct thread *thd, struct pt_regs *regs, str ret = cap_sched_tok_validate(rcvt, usr_tok, ci, cos_info); if (ret) return ret; - if (thd_rcvcap_pending(rcvt) > 0) { + /* only if the rcvt has scheduler events to process */ + if (thd_rcvcap_evt_pending(rcvt)) { if (thd == rcvt) return -EBUSY; next = rcvt; @@ -794,12 +880,11 @@ int cap_hw_asnd(struct cap_asnd *asnd, struct pt_regs *regs) { int curr_cpu = get_cpuid(); - struct cap_arcv * arcv; + struct cap_arcv *arcv; struct cos_cpu_local_info *cos_info; - struct thread * rcv_thd, *next, *thd; - struct 
tcap * rcv_tcap, *tcap, *tcap_next; - struct comp_info * ci; - unsigned long ip, sp; + struct thread *rcv_thd, *next, *thd; + struct tcap *rcv_tcap, *tcap, *tcap_next; + struct comp_info *ci; if (!CAP_TYPECHK(asnd, CAP_ASND)) return 1; assert(asnd->arcv_capid); @@ -815,12 +900,10 @@ cap_hw_asnd(struct cap_asnd *asnd, struct pt_regs *regs) cos_info = cos_cpu_local_info(); assert(cos_info); - thd = thd_current(cos_info); - tcap = tcap_current(cos_info); - assert(thd); - ci = thd_invstk_current(thd, &ip, &sp, cos_info); - assert(ci && ci->captbl); + thd = cap_ulthd_lazyupdate(regs, cos_info, 1, &ci); + assert(thd && ci && ci->captbl); assert(!(thd->state & THD_STATE_PREEMPTED)); + tcap = tcap_current(cos_info); rcv_thd = arcv->thd; rcv_tcap = rcv_thd->rcvcap.rcvcap_tcap; assert(rcv_tcap && tcap); @@ -829,7 +912,9 @@ cap_hw_asnd(struct cap_asnd *asnd, struct pt_regs *regs) if (next == thd) return 1; thd->state |= THD_STATE_PREEMPTED; - return cap_switch(regs, thd, next, tcap_next, TCAP_TIME_NIL, ci, cos_info); + /* don't disable timer if we're not switching to a diff tcap.. */ + /* TODO: hierarchical timeouts */ + return cap_switch(regs, thd, next, tcap_next, tcap == tcap_next ? 
tcap_cyc2time(cos_info->next_timer) : TCAP_TIME_NIL, ci, cos_info); } int @@ -863,16 +948,13 @@ int timer_process(struct pt_regs *regs) { struct cos_cpu_local_info *cos_info; - struct thread * thd_curr; - struct comp_info * comp; - unsigned long ip, sp; - cycles_t now; + struct thread *thd_curr; + struct comp_info *comp = NULL; cos_info = cos_cpu_local_info(); assert(cos_info); - thd_curr = thd_current(cos_info); + thd_curr = cap_ulthd_lazyupdate(regs, cos_info, 1, &comp); assert(thd_curr && thd_curr->cpuid == get_cpuid()); - comp = thd_invstk_current(thd_curr, &ip, &sp, cos_info); assert(comp); return expended_process(regs, thd_curr, comp, cos_info, 1); @@ -887,21 +969,25 @@ cap_arcv_op(struct cap_arcv *arcv, struct thread *thd, struct pt_regs *regs, str struct next_thdinfo *nti = &cos_info->next_ti; rcv_flags_t rflags = __userregs_get1(regs); tcap_time_t swtimeout = TCAP_TIME_NIL; - tcap_time_t timeout = __userregs_get2(regs); - int all_pending = (!!(rflags & RCV_ALL_PENDING)); + tcap_time_t timeout = TCAP_TIME_NIL, x = __userregs_get2(regs); + if (likely(rflags & RCV_SCHEDTIMEOUT)) swtimeout = x; + else timeout = x; if (unlikely(arcv->thd != thd || arcv->cpuid != get_cpuid())) return -EINVAL; /* deliver pending notifications? 
*/ if (thd_rcvcap_pending(thd)) { __userregs_set(regs, 0, __userregs_getsp(regs), __userregs_getip(regs)); - thd_rcvcap_all_pending_set(thd, all_pending); thd_rcvcap_pending_deliver(thd, regs); + /* for sched_rcv enabling user-level switch */ + //if (thd->dcbinfo) thd->dcbinfo->sp = 0; return 0; } else if (rflags & RCV_NON_BLOCKING) { __userregs_set(regs, 0, __userregs_getsp(regs), __userregs_getip(regs)); __userregs_setretvals(regs, -EAGAIN, 0, 0, 0); + /* for sched_rcv enabling user-level switch */ + //if (thd->dcbinfo) thd->dcbinfo->sp = 0; return 0; } @@ -912,20 +998,20 @@ cap_arcv_op(struct cap_arcv *arcv, struct thread *thd, struct pt_regs *regs, str if (unlikely(tc_next != thd_rcvcap_tcap(thd))) tc_next = thd_rcvcap_tcap(thd); /* if preempted/awoken thread is waiting, switch to that */ - if (nti->thd) { - assert(nti->tc); - - next = nti->thd; - tc_next = nti->tc; - tcap_setprio(nti->tc, nti->prio); - if (nti->budget) { - /* convert budget to timeout */ - cycles_t now; - rdtscll(now); - swtimeout = tcap_cyc2time(now + nti->budget); - } - thd_next_thdinfo_update(cos_info, 0, 0, 0, 0); - } + //if (nti->thd) { + // assert(nti->tc); + + // next = nti->thd; + // tc_next = nti->tc; + // tcap_setprio(nti->tc, nti->prio); + // if (nti->budget) { + // /* convert budget to timeout */ + // cycles_t now; + // rdtscll(now); + // swtimeout = tcap_cyc2time(now + nti->budget); + // } + // thd_next_thdinfo_update(cos_info, 0, 0, 0, 0); + //} /* FIXME: for now, lets just ignore this path...need to plumb tcaps into it */ thd->interrupted_thread = NULL; @@ -939,8 +1025,10 @@ cap_arcv_op(struct cap_arcv *arcv, struct thread *thd, struct pt_regs *regs, str if (likely(thd != next)) { assert(!(thd->state & THD_STATE_PREEMPTED)); thd->state |= THD_STATE_RCVING; - thd_rcvcap_all_pending_set(thd, all_pending); thd->timeout = timeout; + } else { + /* switching back to the thread.. 
don't disable timers..*/ + swtimeout = timeout; } return cap_switch(regs, thd, next, tc_next, swtimeout, ci, cos_info); @@ -960,6 +1048,8 @@ cap_introspect(struct captbl *ct, capid_t capid, u32_t op, unsigned long *retval return tcap_introspect(((struct cap_tcap *)ch)->tcap, op, retval); case CAP_ARCV: return arcv_introspect(((struct cap_arcv *)ch), op, retval); + case CAP_COMP: + return comp_introspect(((struct cap_comp *)ch), op, retval); default: return -EINVAL; } @@ -967,6 +1057,13 @@ cap_introspect(struct captbl *ct, capid_t capid, u32_t op, unsigned long *retval #define ENABLE_KERNEL_PRINT +#define cos_thd_throw(label, thd, errno) \ + { \ + ret = (errno); \ + if (unlikely(thd->dcbinfo)) thd->dcbinfo->sp = 0; \ + goto label; \ + } + static int composite_syscall_slowpath(struct pt_regs *regs, int *thd_switch); COS_SYSCALL __attribute__((section("__ipc_entry"))) int @@ -976,7 +1073,6 @@ composite_syscall_handler(struct pt_regs *regs) struct comp_info * ci; struct thread * thd; capid_t cap; - unsigned long ip, sp; /* * We lookup this struct (which is on stack) only once, and @@ -986,8 +1082,10 @@ composite_syscall_handler(struct pt_regs *regs) int ret = -ENOENT; int thd_switch = 0; + /* Definitely do it for all the fast-path calls. */ + thd = cap_ulthd_lazyupdate(regs, cos_info, 0, &ci); + assert(thd); cap = __userregs_getcap(regs); - thd = thd_current(cos_info); /* printk("thd %d calling cap %d (ip %x, sp %x), operation %d: %x, %x, %x, %x\n", thd->tid, cap, * __userregs_getip(regs), __userregs_getsp(regs), __userregs_getop(regs), @@ -1007,14 +1105,12 @@ composite_syscall_handler(struct pt_regs *regs) return 0; } - ci = thd_invstk_current(thd, &ip, &sp, cos_info); - assert(ci && ci->captbl); - /* * We don't check the liveness of the current component * because it's guaranteed by component quiescence period, * which is at timer tick granularity. 
*/ + assert(ci && ci->captbl); ch = captbl_lkup(ci->captbl, cap); if (unlikely(!ch)) { printk("cos: cap %d not found!\n", (int)cap); @@ -1033,7 +1129,8 @@ composite_syscall_handler(struct pt_regs *regs) switch (ch->type) { case CAP_THD: ret = cap_thd_op((struct cap_thd *)ch, thd, regs, ci, cos_info); - if (ret < 0) cos_throw(done, ret); + //printk("[%d]\n", ret); + if (ret < 0) cos_thd_throw(done, thd, ret); return ret; case CAP_ASND: ret = cap_asnd_op((struct cap_asnd *)ch, thd, regs, ci, cos_info); @@ -1041,7 +1138,7 @@ composite_syscall_handler(struct pt_regs *regs) return ret; case CAP_ARCV: ret = cap_arcv_op((struct cap_arcv *)ch, thd, regs, ci, cos_info); - if (ret < 0) cos_throw(done, ret); + if (ret < 0) cos_thd_throw(done, thd, ret); return ret; default: break; @@ -1212,22 +1309,38 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * break; } case CAPTBL_OP_THDACTIVATE: { - thdclosure_index_t init_data = __userregs_get1(regs) >> 16; - capid_t thd_cap = __userregs_get1(regs) & 0xFFFF; - capid_t pgtbl_cap = __userregs_get2(regs); - capid_t pgtbl_addr = __userregs_get3(regs); - capid_t compcap = __userregs_get4(regs); - - struct thread *thd; - unsigned long *pte = NULL; - - ret = cap_kmem_activate(ct, pgtbl_cap, pgtbl_addr, (unsigned long *)&thd, &pte); + u32_t reg3 = __userregs_get3(regs); + u32_t reg4 = __userregs_get4(regs); + capid_t pgtbl_addr = __userregs_get2(regs); + thdclosure_index_t init_data = (reg4 << 16) >> 16; + capid_t thd_cap = (capin >> 16); + capid_t pgtbl_cap = (capin << 16) >> 16; + capid_t compcap = (reg3 >> 16); + capid_t dcb_cap = (reg3 << 16) >> 16; + unsigned short dcboff = reg4 >> 16; + unsigned long *tpte = NULL, flags; + struct thread *thd; + struct cap_header *ctfrom; + + ret = cap_kmem_activate(ct, pgtbl_cap, pgtbl_addr, (unsigned long *)&thd, &tpte); if (unlikely(ret)) cos_throw(err, ret); - assert(thd && pte); + assert(thd && tpte); /* ret is returned by the overall function */ - ret = 
thd_activate(ct, cap, thd_cap, thd, compcap, init_data); - if (ret) kmem_unalloc(pte); + ret = thd_activate(ct, cap, thd_cap, thd, compcap, init_data, dcb_cap, dcboff); + if (ret) kmem_unalloc(tpte); + + break; + } + case CAPTBL_OP_THDMIGRATE: { + u32_t reg2 = __userregs_get2(regs); + u32_t reg3 = __userregs_get3(regs); + + if (reg3) { + ret = thd_migrate_cap(ct, capin); + } else { + ret = thd_migrate(ct, capin, reg2); + } break; } @@ -1249,7 +1362,7 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * case CAPTBL_OP_THDDEACTIVATE: { livenessid_t lid = __userregs_get2(regs); - ret = thd_deactivate(ct, op_cap, capin, lid, 0, 0, 0); + ret = thd_deactivate(ct, op_cap, capin, lid, 0, 0, 0, 0); break; } case CAPTBL_OP_THDTLSSET: { @@ -1265,7 +1378,7 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * capid_t pgtbl_cap = __userregs_get3(regs); capid_t cosframe_addr = __userregs_get4(regs); - ret = thd_deactivate(ct, op_cap, capin, lid, pgtbl_cap, cosframe_addr, 1); + ret = thd_deactivate(ct, op_cap, capin, lid, pgtbl_cap, cosframe_addr, 0, 1); break; } case CAPTBL_OP_CAPKMEM_FREEZE: { @@ -1277,10 +1390,13 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * case CAPTBL_OP_COMPACTIVATE: { capid_t captbl_cap = __userregs_get2(regs) >> 16; capid_t pgtbl_cap = __userregs_get2(regs) & 0xFFFF; - livenessid_t lid = __userregs_get3(regs); + livenessid_t lid = capin >> 16; + capid_t comp_cap = (capin << 16) >> 16; + vaddr_t scb_uaddr = __userregs_get3(regs) & (~0 << 12); vaddr_t entry_addr = __userregs_get4(regs); + capid_t scb_cap = __userregs_get3(regs) & ((1 << 12) - 1); - ret = comp_activate(ct, cap, capin, captbl_cap, pgtbl_cap, lid, entry_addr, NULL); + ret = comp_activate(ct, cap, comp_cap, captbl_cap, pgtbl_cap, scb_cap, lid, entry_addr, scb_uaddr); break; } case CAPTBL_OP_COMPDEACTIVATE: { @@ -1389,6 +1505,65 @@ static int __attribute__((noinline)) 
composite_syscall_slowpath(struct pt_regs * ret = hw_deactivate(op_cap, capin, lid); break; } + case CAPTBL_OP_SCB_ACTIVATE: { + capid_t ptcap = __userregs_get2(regs); + livenessid_t lid = __userregs_get4(regs); + vaddr_t addr = __userregs_get3(regs); + unsigned long *pte; + struct cos_scb_info *scb; + + ret = cap_kmem_activate(ct, ptcap, addr, (unsigned long *)&scb, &pte); + if (ret) cos_throw(err, ret); + + ret = scb_activate(ct, cap, capin, (vaddr_t)scb, lid); + + break; + } + case CAPTBL_OP_SCB_DEACTIVATE: { + u32_t r2 = __userregs_get2(regs); + livenessid_t lid = r2 >> 16; + capid_t ptcap = (r2 << 16) >> 16; + capid_t cf_addr = __userregs_get3(regs); + + ret = scb_deactivate(op_cap, capin, ptcap, cf_addr, lid); + + break; + } + case CAPTBL_OP_DCB_ACTIVATE: { + u32_t r1 = __userregs_get1(regs); + u32_t r2 = __userregs_get2(regs); + u32_t r3 = __userregs_get3(regs); + u32_t r4 = __userregs_get4(regs); + capid_t dcbcap = r1 >> 16; + capid_t ptcap = r2 >> 16; + livenessid_t lid = (r1 << 16) >> 16; + capid_t ptcapin = (r2 << 16) >> 16; + vaddr_t kaddr = r3; + vaddr_t uaddrin = r4; + struct cos_dcb_info *dcb; + unsigned long *pte; + + ret = cap_kmem_activate(ct, ptcap, kaddr, (unsigned long *)&dcb, &pte); + if (ret) cos_throw(err, ret); + + ret = dcb_activate(ct, cap, dcbcap, (vaddr_t)dcb, lid, ptcapin, uaddrin); + + break; + } + case CAPTBL_OP_DCB_DEACTIVATE: { + u32_t r2 = __userregs_get2(regs); + u32_t r3 = __userregs_get3(regs); + u32_t r4 = __userregs_get4(regs); + livenessid_t lid = r2 >> 16; + capid_t ptcap = (r2 << 16) >> 16; + vaddr_t cf_addr = r3 & (~0 << 12); + vaddr_t uaddrin = r4 & (~0 << 12); + capid_t ptcapin = (r4 << 20) >> 12 | ((r3 << 20) >> 20); + + ret = dcb_deactivate(op_cap, capin, lid, ptcap, cf_addr, ptcapin, uaddrin); + + break; + } default: goto err; } @@ -1645,17 +1820,28 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * struct cap_arcv *rcvc; hwid_t hwid = __userregs_get1(regs); capid_t rcvcap = 
__userregs_get2(regs); + u32_t period = __userregs_get3(regs); rcvc = (struct cap_arcv *)captbl_lkup(ci->captbl, rcvcap); if (!CAP_TYPECHK(rcvc, CAP_ARCV)) cos_throw(err, -EINVAL); ret = hw_attach_rcvcap((struct cap_hw *)ch, hwid, rcvc, rcvcap); + if (!ret) { + if (hwid == HW_HPET_PERIODIC || hwid == HW_HPET_ONESHOT) chal_hpet_periodic_set(hwid, period); + ret = chal_irq_enable(hwid, get_cpuid()); + } + break; } case CAPTBL_OP_HW_DETACH: { hwid_t hwid = __userregs_get1(regs); ret = hw_detach_rcvcap((struct cap_hw *)ch, hwid); + if (!ret) { + if (hwid == HW_HPET_PERIODIC || hwid == HW_HPET_ONESHOT) chal_hpet_disable(hwid); + ret = chal_irq_disable(hwid, get_cpuid()); + } + break; } case CAPTBL_OP_HW_MAP: { diff --git a/src/kernel/include/captbl.h b/src/kernel/include/captbl.h index 102fe147d3..7530b06796 100644 --- a/src/kernel/include/captbl.h +++ b/src/kernel/include/captbl.h @@ -51,7 +51,7 @@ typedef enum { #define CAP_HEAD_AMAP_SZ 4 #define CAP_HEAD_SZ_SZ 2 #define CAP_HEAD_FLAGS_SZ 3 -#define CAP_HEAD_TYPE_SZ 7 +#define CAP_HEAD_TYPE_SZ CAP_TYPE_MAXBITS /* * This is the header for each capability. Includes information about diff --git a/src/kernel/include/chal.h b/src/kernel/include/chal.h index 2caa7dd0ca..b7a4683587 100644 --- a/src/kernel/include/chal.h +++ b/src/kernel/include/chal.h @@ -94,6 +94,12 @@ void chal_send_ipi(int cpu_id); void chal_idle(void); void chal_timer_set(cycles_t cycles); void chal_timer_disable(void); +void chal_hpet_periodic_set(hwid_t, unsigned long); +void chal_hpet_disable(hwid_t); +cycles_t chal_hpet_first_period(void); + +int chal_irq_disable(int irqline, cpuid_t cpu_id); +int chal_irq_enable(int irqline, cpuid_t cpu_id); void chal_init(void); @@ -104,6 +110,8 @@ void chal_init(void); #include "../../platform/include/chal_plat.h" +#define PRINTK(format, ...) 
printk("(CPU%ld:) " format, get_cpuid(), ## __VA_ARGS__) + extern void printk(const char *fmt, ...); void chal_khalt(void); diff --git a/src/kernel/include/component.h b/src/kernel/include/component.h index c837cf22fa..79cfbd5546 100644 --- a/src/kernel/include/component.h +++ b/src/kernel/include/component.h @@ -12,36 +12,44 @@ #include "captbl.h" #include "pgtbl.h" #include "cap_ops.h" +#include "shared/cos_sched.h" struct comp_info { struct liveness_data liveness; pgtbl_t pgtbl; - struct captbl * captbl; - struct cos_sched_data_area *comp_nfo; + struct captbl *captbl; + struct cos_scb_info *scb_data; } __attribute__((packed)); struct cap_comp { struct cap_header h; vaddr_t entry_addr; - struct cap_pgtbl * pgd; + struct cap_pgtbl *pgd; struct cap_captbl *ct_top; struct comp_info info; } __attribute__((packed)); +#include "scb.h" + static int -comp_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t captbl_cap, capid_t pgtbl_cap, livenessid_t lid, - vaddr_t entry_addr, struct cos_sched_data_area *sa) +comp_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t captbl_cap, capid_t pgtbl_cap, capid_t scbcap, + livenessid_t lid, vaddr_t entry_addr, vaddr_t scb_uaddr) { - struct cap_comp * compc; - struct cap_pgtbl * ptc; + struct cap_comp *compc; + struct cap_pgtbl *ptc; struct cap_captbl *ctc; - u32_t v; + u32_t v, flags; int ret = 0; + struct cap_scb *scbc = NULL; ctc = (struct cap_captbl *)captbl_lkup(t, captbl_cap); if (unlikely(!ctc || ctc->h.type != CAP_CAPTBL || ctc->lvl > 0)) return -EINVAL; ptc = (struct cap_pgtbl *)captbl_lkup(t, pgtbl_cap); if (unlikely(!ptc || ptc->h.type != CAP_PGTBL || ptc->lvl > 0)) return -EINVAL; + if (likely(scbcap)) { + scbc = (struct cap_scb *)captbl_lkup(t, scbcap); + if (unlikely(!scbc || scbc->h.type != CAP_SCB)) return -EINVAL; + } v = ptc->refcnt_flags; if (v & CAP_MEM_FROZEN_FLAG) return -EINVAL; @@ -53,14 +61,16 @@ comp_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t captbl_cap, /* undo 
before return */ cos_throw(undo_ptc, -ECASFAIL); } - compc = (struct cap_comp *)__cap_capactivate_pre(t, cap, capin, CAP_COMP, &ret); if (!compc) cos_throw(undo_ctc, ret); + if (likely(scbc)) { + ret = scb_comp_update(t, scbc, compc, ptc, scb_uaddr); + if (ret) cos_throw(undo_capact, ret); + } compc->entry_addr = entry_addr; compc->info.pgtbl = ptc->pgtbl; compc->info.captbl = ctc->captbl; - compc->info.comp_nfo = sa; compc->pgd = ptc; compc->ct_top = ctc; ltbl_get(lid, &compc->info.liveness); @@ -68,6 +78,9 @@ comp_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t captbl_cap, return 0; +/*undo_scb: + scb_comp_remove(t, scbc, pgtbl_cap, scb_uaddr);*/ +undo_capact: undo_ctc: cos_faa((int *)&ctc->refcnt_flags, -1); undo_ptc: @@ -79,8 +92,8 @@ static int comp_deactivate(struct cap_captbl *ct, capid_t capin, livenessid_t lid) { int ret; - struct cap_comp * compc; - struct cap_pgtbl * pgd; + struct cap_comp *compc; + struct cap_pgtbl *pgd; struct cap_captbl *ct_top; compc = (struct cap_comp *)captbl_lkup(ct->captbl, capin); @@ -89,6 +102,8 @@ comp_deactivate(struct cap_captbl *ct, capid_t capin, livenessid_t lid) ltbl_expire(&compc->info.liveness); pgd = compc->pgd; ct_top = compc->ct_top; + /* TODO: right way to remove scb info */ + if (likely(compc->info.scb_data)) scb_comp_remove(ct, 0, 0, 0); ret = cap_capdeactivate(ct, capin, CAP_COMP, lid); if (ret) return ret; @@ -107,4 +122,17 @@ comp_init(void) assert(sizeof(struct cap_comp) <= __captbl_cap2bytes(CAP_COMP)); } +static inline int +comp_introspect(struct cap_comp *t, unsigned long op, unsigned long *retval) +{ + switch (op) { + case COMP_GET_SCB_CURTHD: + *retval = t->info.scb_data->curr_thd; + break; + default: + return -EINVAL; + } + return 0; +} + #endif /* COMPONENT_H */ diff --git a/src/kernel/include/dcb.h b/src/kernel/include/dcb.h new file mode 100644 index 0000000000..eac71fa497 --- /dev/null +++ b/src/kernel/include/dcb.h @@ -0,0 +1,109 @@ +/** + * Copyright 2019 by Phani Gadepalli, 
phanikishoreg@gwu.edu + * + * Redistribution of this file is permitted under the GNU General Public License v2. + */ + +#ifndef DCB_H +#define DCB_H + +#include "cap_ops.h" +#include "pgtbl.h" +#include "retype_tbl.h" +#include "component.h" +#include "thd.h" + +#define DCB_ENTRIES_MAX_PER_PAGE (PAGE_SIZE/sizeof(struct cos_dcb_info)) + +struct cap_dcb { + struct cap_header h; + struct liveness_data liveness; + unsigned int refcnt; + vaddr_t kern_addr; + cpuid_t cpuid; +} __attribute__((packed)); + +static inline int +dcb_activate(struct captbl *t, capid_t ctcap, capid_t dcbcap, vaddr_t kaddr, livenessid_t lid, capid_t ptcapin, vaddr_t uaddr) +{ + struct cap_dcb *dc; + struct cap_pgtbl *ptcin; + int ret; + paddr_t pf = chal_va2pa((void *)kaddr); + + ptcin = (struct cap_pgtbl *)captbl_lkup(t, ptcapin); + if (!ptcin || ptcin->h.type != CAP_PGTBL) return -EINVAL; + + if (pgtbl_mapping_add(ptcin->pgtbl, uaddr, pf, PGTBL_USER_DEF)) return -EINVAL; + + dc = (struct cap_dcb *)__cap_capactivate_pre(t, ctcap, dcbcap, CAP_DCB, &ret); + if (!dc) return -EINVAL; + + ltbl_get(lid, &dc->liveness); + dc->kern_addr = kaddr; + memset((void *)kaddr, 0, PAGE_SIZE); + dc->refcnt = 0; + dc->cpuid = get_cpuid(); + + __cap_capactivate_post(&dc->h, CAP_DCB); + + return 0; +} + +static inline int +dcb_deactivate(struct cap_captbl *ct, capid_t dcbcap, livenessid_t lid, capid_t ptcap, capid_t cosframe_addr, capid_t ptcapin, vaddr_t uaddrin) +{ + struct cap_dcb *dc; + struct cap_pgtbl *ptcin; + unsigned long *pte, addr, flags, old_v; + int ret; + + dc = (struct cap_dcb *)captbl_lkup(ct->captbl, dcbcap); + if (!dc || dc->h.type != CAP_DCB) return -EINVAL; + + if (!ptcapin || !uaddrin) return -EINVAL; + ptcin = (struct cap_pgtbl *)captbl_lkup(ct->captbl, ptcapin); + if (!ptcin || ptcin->h.type != CAP_PGTBL) return -EINVAL; + pte = pgtbl_lkup(ptcin->pgtbl, uaddrin, (u32_t *)&flags); + if (!pte) return -EINVAL; + if ((vaddr_t)pte != dc->kern_addr) return -EINVAL; + + if (dc->refcnt) return -EPERM; 
+ + ltbl_expire(&dc->liveness); + ret = kmem_deact_pre((struct cap_header *)dc, ct->captbl, ptcap, cosframe_addr, &pte, &old_v); + if (ret) return ret; + ret = kmem_deact_post(pte, old_v); + if (ret) return ret; + dc->kern_addr = 0; + + return cap_capdeactivate(ct, dcbcap, CAP_DCB, lid); +} + +static inline int +dcb_thd_ref(struct cap_dcb *dc, struct thread *thd) +{ + if (dc->refcnt >= DCB_ENTRIES_MAX_PER_PAGE) return -EINVAL; + if (dc->cpuid != thd->cpuid) return -EINVAL; + if (!ltbl_isalive(&dc->liveness)) return -EPERM; + + dc->refcnt++; + + return 0; +} + +static inline int +dcb_thd_deref(struct cap_dcb *dc, struct thread *thd) +{ + if (!dc->refcnt) return -EINVAL; + if (dc->cpuid != thd->cpuid) return -EINVAL; + + if ((vaddr_t)thd->dcbinfo < dc->kern_addr || (vaddr_t)thd->dcbinfo > (dc->kern_addr + PAGE_SIZE)) return -EINVAL; + if (!ltbl_isalive(&dc->liveness)) return -EPERM; + + dc->refcnt--; + + return 0; +} + +#endif /* DCB_H */ diff --git a/src/kernel/include/hw.h b/src/kernel/include/hw.h index fafc1ef7e1..4c03f1cd87 100644 --- a/src/kernel/include/hw.h +++ b/src/kernel/include/hw.h @@ -17,17 +17,17 @@ #define HW_IRQ_EXTERNAL_MIN 32 #define HW_IRQ_EXTERNAL_MAX 63 -struct cap_asnd hw_asnd_caps[HW_IRQ_TOTAL]; +struct cap_asnd hw_asnd_caps[NUM_CPU][HW_IRQ_TOTAL]; struct cap_hw { struct cap_header h; u32_t hw_bitmap; } __attribute__((packed)); -static void +static inline void hw_asndcap_init(void) { - memset(&hw_asnd_caps, 0, sizeof(struct cap_asnd) * HW_IRQ_TOTAL); + memset(&hw_asnd_caps, 0, sizeof(struct cap_asnd) * HW_IRQ_TOTAL * NUM_CPU); } /* @@ -36,7 +36,7 @@ hw_asndcap_init(void) * from another, and only with a subset of the bitmap. Any other HW * resources should not be passed on. 
*/ -static int +static inline int hw_activate(struct captbl *t, capid_t cap, capid_t capin, u32_t bitmap) { struct cap_hw *hwc; @@ -52,23 +52,23 @@ hw_activate(struct captbl *t, capid_t cap, capid_t capin, u32_t bitmap) return 0; } -static int +static inline int hw_deactivate(struct cap_captbl *t, capid_t capin, livenessid_t lid) { return cap_capdeactivate(t, capin, CAP_HW, lid); } -static int +static inline int hw_attach_rcvcap(struct cap_hw *hwc, hwid_t hwid, struct cap_arcv *rcvc, capid_t rcv_cap) { if (hwid < HW_IRQ_EXTERNAL_MIN || hwid > HW_IRQ_EXTERNAL_MAX) return -EINVAL; if (!(hwc->hw_bitmap & (1 << (hwid - HW_IRQ_EXTERNAL_MIN)))) return -EINVAL; - if (hw_asnd_caps[hwid].h.type == CAP_ASND) return -EEXIST; + if (hw_asnd_caps[get_cpuid()][hwid].h.type == CAP_ASND) return -EEXIST; - return asnd_construct(&hw_asnd_caps[hwid], rcvc, rcv_cap, 0, 0); + return asnd_construct(&hw_asnd_caps[get_cpuid()][hwid], rcvc, rcv_cap, 0, 0); } -static int +static inline int hw_detach_rcvcap(struct cap_hw *hwc, hwid_t hwid) { if (hwid < HW_IRQ_EXTERNAL_MIN || hwid > HW_IRQ_EXTERNAL_MAX) return -EINVAL; @@ -78,7 +78,7 @@ hw_detach_rcvcap(struct cap_hw *hwc, hwid_t hwid) * FIXME: Need to synchronize using __xx_pre and * __xx_post perhaps in asnd_deconstruct() */ - memset(&hw_asnd_caps[hwid], 0, sizeof(struct cap_asnd)); + memset(&hw_asnd_caps[get_cpuid()][hwid], 0, sizeof(struct cap_asnd)); return 0; } diff --git a/src/kernel/include/inv.h b/src/kernel/include/inv.h index 089c784b54..7ac9cb14b1 100644 --- a/src/kernel/include/inv.h +++ b/src/kernel/include/inv.h @@ -50,7 +50,7 @@ struct cap_arcv { u8_t depth; } __attribute__((packed)); -static int +static inline int sinv_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t comp_cap, vaddr_t entry_addr, invtoken_t token) { struct cap_sinv *sinvc; @@ -72,13 +72,13 @@ sinv_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t comp_cap, va return 0; } -static int +static inline int sinv_deactivate(struct 
cap_captbl *t, capid_t capin, livenessid_t lid) { return cap_capdeactivate(t, capin, CAP_SINV, lid); } -static int +static inline int sret_activate(struct captbl *t, capid_t cap, capid_t capin) { struct cap_sret *sretc; @@ -91,13 +91,13 @@ sret_activate(struct captbl *t, capid_t cap, capid_t capin) return 0; } -static int +static inline int sret_deactivate(struct cap_captbl *t, capid_t capin, livenessid_t lid) { return cap_capdeactivate(t, capin, CAP_SRET, lid); } -static int +static inline int asnd_construct(struct cap_asnd *asndc, struct cap_arcv *arcvc, capid_t rcv_cap, u32_t budget, u32_t period) { /* FIXME: Add synchronization with __xx_pre and __xx_post */ @@ -118,7 +118,7 @@ asnd_construct(struct cap_asnd *asndc, struct cap_arcv *arcvc, capid_t rcv_cap, return 0; } -static int +static inline int asnd_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t rcv_captbl, capid_t rcv_cap, u32_t budget, u32_t period) { @@ -142,7 +142,7 @@ asnd_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t rcv_captbl, return ret; } -static int +static inline int asnd_deactivate(struct cap_captbl *t, capid_t capin, livenessid_t lid) { return cap_capdeactivate(t, capin, CAP_ASND, lid); @@ -153,7 +153,7 @@ int cap_ipi_process(struct pt_regs *regs); /* send to a receive end-point within an interrupt */ int cap_hw_asnd(struct cap_asnd *asnd, struct pt_regs *regs); -static void +static inline void __arcv_setup(struct cap_arcv *arcv, struct thread *thd, struct tcap *tcap, struct thread *notif) { assert(arcv && thd && tcap && !thd_bound2rcvcap(thd)); @@ -168,7 +168,7 @@ __arcv_setup(struct cap_arcv *arcv, struct thread *thd, struct tcap *tcap, struc tcap_promote(tcap, thd); } -static int +static inline int __arcv_teardown(struct cap_arcv *arcv, struct thread *thd) { struct thread *notif; @@ -189,13 +189,13 @@ __arcv_teardown(struct cap_arcv *arcv, struct thread *thd) return 0; } -static struct thread * +static inline struct thread * arcv_thd_notif(struct thread 
*arcvt) { return arcvt->rcvcap.rcvcap_thd_notif; } -static int +static inline int arcv_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t comp_cap, capid_t thd_cap, capid_t tcap_cap, capid_t arcv_cap, int init) { @@ -245,7 +245,7 @@ arcv_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t comp_cap, ca return 0; } -static int +static inline int arcv_deactivate(struct cap_captbl *t, capid_t capin, livenessid_t lid) { struct cap_arcv *arcvc; @@ -345,7 +345,7 @@ sret_ret(struct thread *thd, struct pt_regs *regs, struct cos_cpu_local_info *co __userregs_set(regs, __userregs_getinvret(regs), sp, ip); } -static void +static inline void inv_init(void) { //#define __OUTPUT_CAP_SIZE diff --git a/src/kernel/include/pgtbl.h b/src/kernel/include/pgtbl.h index 7ef95512d8..f07c4b4ad5 100644 --- a/src/kernel/include/pgtbl.h +++ b/src/kernel/include/pgtbl.h @@ -357,6 +357,7 @@ pgtbl_cosframe_add(pgtbl_t pt, u32_t addr, u32_t page, u32_t flags) PGTBL_DEPTH, &accum); orig_v = (u32_t)(pte->next); assert(orig_v == 0); +// printk("%x %x %p %x\n", addr, page, pte, orig_v); return __pgtbl_update_leaf(pte, (void *)(page | flags), 0); } diff --git a/src/kernel/include/scb.h b/src/kernel/include/scb.h new file mode 100644 index 0000000000..b90d66b3d2 --- /dev/null +++ b/src/kernel/include/scb.h @@ -0,0 +1,101 @@ +/** + * Copyright 2019 by Phani Gadepalli, phanikishoreg@gwu.edu + * + * Redistribution of this file is permitted under the GNU General Public License v2. 
+ */ + +#ifndef SCB_H +#define SCB_H + +#include "component.h" +#include "cap_ops.h" +#include "pgtbl.h" +#include "retype_tbl.h" + +struct comp_info; + +struct cap_scb { + struct cap_header h; + struct liveness_data liveness; + struct cap_comp *compc; + vaddr_t kern_addr; +} __attribute__((packed)); + +static inline int +scb_activate(struct captbl *t, capid_t ctcap, capid_t scbcap, vaddr_t kaddr, livenessid_t lid) +{ + struct cap_scb *sc; + int ret; + + sc = (struct cap_scb *)__cap_capactivate_pre(t, ctcap, scbcap, CAP_SCB, &ret); + if (!sc) return -EINVAL; + + ltbl_get(lid, &sc->liveness); + sc->kern_addr = kaddr; + sc->compc = NULL; + memset((void *)kaddr, 0, COS_SCB_SIZE); + + __cap_capactivate_post(&sc->h, CAP_SCB); + + return 0; +} + +static inline int +scb_deactivate(struct cap_captbl *ct, capid_t scbcap, capid_t ptcap, capid_t cosframe_addr, livenessid_t lid) +{ + struct cap_scb *sc; + unsigned long old_v = 0, *pte = NULL; + int ret; + + sc = (struct cap_scb *)captbl_lkup(ct->captbl, scbcap); + if (!sc || sc->h.type != CAP_SCB) return -EINVAL; + + /* FIXME: component using this scbcap is still active! how to handle this? */ + if (sc->compc) return -EPERM; + + ltbl_expire(&sc->liveness); + ret = kmem_deact_pre((struct cap_header *)sc, ct->captbl, ptcap, cosframe_addr, &pte, &old_v); + if (ret) return ret; + ret = kmem_deact_post(pte, old_v); + if (ret) return ret; + + return cap_capdeactivate(ct, scbcap, CAP_SCB, lid); +} + +static inline int +scb_comp_update(struct captbl *ct, struct cap_scb *sc, struct cap_comp *compc, struct cap_pgtbl *ptcin, vaddr_t uaddrin) +{ + paddr_t pf = chal_va2pa((void *)(sc->kern_addr)); + + if (unlikely(!ltbl_isalive(&sc->liveness))) return -EPERM; + /* for non-schedulers, scbs are from schedulers, so uaddrin will be zero and sc->compc should have been set! 
*/ + if (uaddrin && pgtbl_mapping_add(ptcin->pgtbl, uaddrin, pf, PGTBL_USER_DEF)) return -EINVAL; + + if (uaddrin && sc->compc == NULL) sc->compc = compc; + compc->info.scb_data = (struct cos_scb_info *)(sc->kern_addr); + + return 0; +} + +static inline int +scb_comp_remove(struct cap_captbl *ct, struct cap_scb *sc, capid_t ptcapin, vaddr_t uaddrin) +{ + int ret; + + if (unlikely(!ct || !sc || !ptcapin || !uaddrin)) return -EINVAL; + + if (unlikely(!ltbl_isalive(&sc->liveness))) return -EPERM; + if (unlikely(!sc->compc)) return -EINVAL; + + /* TODO: unmap uaddrin in the user-land */ + + return 0; +} + +static inline struct liveness_data * +scb_liveness(struct cap_scb *sc) +{ + return &sc->liveness; +} + +#endif /* SCB_H */ diff --git a/src/kernel/include/shared/consts.h b/src/kernel/include/shared/consts.h index e059c507a7..d5cb53b9d9 100644 --- a/src/kernel/include/shared/consts.h +++ b/src/kernel/include/shared/consts.h @@ -48,7 +48,7 @@ struct pt_regs { #endif #define MAX_SERVICE_DEPTH 31 -#define MAX_NUM_THREADS (64 * NUM_CPU) +#define MAX_NUM_THREADS (2048) /* Stacks are 2 * page_size (expressed in words) */ #define MAX_STACK_SZ_BYTE_ORDER 12 @@ -136,6 +136,7 @@ struct pt_regs { * offsets below are used to access CPU and thread IDs. */ #define CPUID_OFFSET 1 #define THDID_OFFSET 2 -#define INVTOKEN_OFFSET 3 +#define SLTHDPTR_OFFSET 3 +#define INVTOKEN_OFFSET 4 #endif diff --git a/src/kernel/include/shared/cos_config.h b/src/kernel/include/shared/cos_config.h index a80dc56884..bf501b3be9 100644 --- a/src/kernel/include/shared/cos_config.h +++ b/src/kernel/include/shared/cos_config.h @@ -62,6 +62,7 @@ /* Composite user memory uses physical memory above this. */ #define COS_MEM_START COS_MEM_KERN_PA +#define COS_SCB_SIZE (PAGE_SIZE) /* NUM_CPU_SOCKETS defined in cpu_ghz.h. The information is used for * intelligent IPI distribution. 
*/ diff --git a/src/kernel/include/shared/cos_sched.h b/src/kernel/include/shared/cos_sched.h new file mode 100644 index 0000000000..525d7edcb9 --- /dev/null +++ b/src/kernel/include/shared/cos_sched.h @@ -0,0 +1,53 @@ +#ifndef COS_SCHED_H +#define COS_SCHED_H + +#include "./cos_types.h" + +struct cos_thd_event { + u16_t blocked; + u32_t next_timeout; + u64_t elapsed_cycs; + u64_t epoch; +} __attribute__((packed)); + +struct cos_sched_event { + thdid_t tid; + struct cos_thd_event evt; +} __attribute__((packed)); + +#define COS_SCHED_EVENT_RING_SIZE 16 + +struct cos_sched_ring { + int head, tail, more; + struct cos_sched_event event_buf[COS_SCHED_EVENT_RING_SIZE]; +} __attribute__((packed)); + +struct cos_scb_info { + capid_t curr_thd; + cycles_t timer_next; + sched_tok_t sched_tok; + struct cos_sched_ring sched_events; /* kernel-level events only */ +} CACHE_ALIGNED; + +struct cos_dcb_info { + unsigned long ip; + unsigned long sp; + unsigned long pending; /* binary value. TODO: move it to ip or sp */ +} __attribute__((packed)); + +/* + * This is the "ip" the kernel uses to update the thread when it sees that the + * thread is still in user-level dispatch routine. + * This is the offset of instruction after resetting the "next" thread's "sp" to zero + * in a purely user-level dispatch. + * + * Whenever kernel is switching to a thread which has "sp" non-zero, it would switch + * to the "ip" saved in the dcb_info and reset the "sp" of the thread that the kernel + * is dispatching to! + * This is necessary because, if the kernel is dispatching to a thread that was in the + * user-level dispatch routine before, then the only registers that it can restore are + * "ip" and "sp", everything else is either clobbered or saved/loaded at user-level. 
+ */ +#define DCB_IP_KERN_OFF 8 + +#endif /* COS_SCHED_H */ diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index f3714097e2..cee8b006ef 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -72,7 +72,9 @@ typedef enum { typedef enum { RCV_NON_BLOCKING = 1, - RCV_ALL_PENDING = 1 << 1, + RCV_ULONLY = (1 << 1), + RCV_ULSCHED_RCV = (1 << 2), + RCV_SCHEDTIMEOUT = (1 << 3), } rcv_flags_t; #define BOOT_LIVENESS_ID_BASE 2 @@ -84,6 +86,7 @@ typedef enum { CAPTBL_OP_THDACTIVATE, CAPTBL_OP_THDDEACTIVATE, CAPTBL_OP_THDTLSSET, + CAPTBL_OP_THDMIGRATE, CAPTBL_OP_COMPACTIVATE, CAPTBL_OP_COMPDEACTIVATE, CAPTBL_OP_SINVACTIVATE, @@ -125,6 +128,12 @@ typedef enum { CAPTBL_OP_HW_MAP, CAPTBL_OP_HW_CYC_USEC, CAPTBL_OP_HW_CYC_THRESH, + + CAPTBL_OP_SCB_ACTIVATE, + CAPTBL_OP_SCB_DEACTIVATE, + + CAPTBL_OP_DCB_ACTIVATE, + CAPTBL_OP_DCB_DEACTIVATE, } syscall_op_t; typedef enum { @@ -142,8 +151,13 @@ typedef enum { CAP_QUIESCENCE, /* when deactivating, set to track quiescence state */ CAP_TCAP, /* tcap captable entry */ CAP_HW, /* hardware (interrupt) */ + CAP_SCB, /* Scheduler control block (SCB) */ + CAP_DCB, /* Dispatch control block (DCB) */ } cap_t; +/* maximum size allowed for CAP TYPE in a capability header */ +#define CAP_TYPE_MAXBITS 7 +#define CAP_TYPE_MAX ((1 << CAP_TYPE_MAXBITS) - 1) /* TODO: pervasive use of these macros */ /* v \in struct cap_* *, type \in cap_t */ #define CAP_TYPECHK(v, t) ((v) && (v)->h.type == (t)) @@ -192,12 +206,16 @@ typedef int cpuid_t; static inline cap_sz_t __captbl_cap2sz(cap_t c) { + /* if (unlikely(c > CAP_TYPE_MAX)) return CAP_SZ_ERR; */ + /* TODO: optimize for invocation and return */ switch (c) { case CAP_SRET: - case CAP_THD: case CAP_TCAP: + case CAP_THD: return CAP_SZ_16B; + case CAP_SCB: + case CAP_DCB: case CAP_CAPTBL: case CAP_PGTBL: case CAP_HW: /* TODO: 256bits = 32B * 8b */ @@ -260,12 +278,15 @@ enum */ BOOT_CAPTBL_SELF_INITRCV_BASE = 
round_up_to_pow2(BOOT_CAPTBL_SELF_INITTHD_BASE + NUM_CPU * CAP64B_IDSZ, CAPMAX_ENTRY_SZ), + /* BOOT_CAPTBL_SELF_INITTCAP_BASE = round_up_to_pow2(BOOT_CAPTBL_SELF_INITRCV_BASE + NUM_CPU * CAP64B_IDSZ, + CAPMAX_ENTRY_SZ), */ BOOT_CAPTBL_LAST_CAP = BOOT_CAPTBL_SELF_INITRCV_BASE + NUM_CPU * CAP64B_IDSZ, /* round up to next entry */ BOOT_CAPTBL_FREE = round_up_to_pow2(BOOT_CAPTBL_LAST_CAP, CAPMAX_ENTRY_SZ) }; -#define BOOT_CAPTBL_SELF_INITTCAP_BASE (BOOT_CAPTBL_SELF_INITTHD_BASE + CAP16B_IDSZ) +#define BOOT_CAPTBL_SELF_INITTCAP_BASE (BOOT_CAPTBL_SELF_INITTHD_BASE + CAP16B_IDSZ) + #define BOOT_CAPTBL_SELF_INITTHD_CPU_BASE (BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cos_cpuid())) #define BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE (BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cos_cpuid())) #define BOOT_CAPTBL_SELF_INITRCV_CPU_BASE (BOOT_CAPTBL_SELF_INITRCV_BASE_CPU(cos_cpuid())) @@ -274,6 +295,16 @@ enum #define BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpuid) + CAP16B_IDSZ) #define BOOT_CAPTBL_SELF_INITRCV_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITRCV_BASE + cpuid * CAP64B_IDSZ) +enum llboot_scb_dcb_caps +{ + LLBOOT_CAPTBL_SCB = round_up_to_pow2(BOOT_CAPTBL_LAST_CAP, CAPMAX_ENTRY_SZ), + LLBOOT_CAPTBL_INITDCB = LLBOOT_CAPTBL_SCB + CAP64B_IDSZ, + LLBOOT_CAPTBL_FREE = round_up_to_pow2(LLBOOT_CAPTBL_INITDCB + (CAP64B_IDSZ * NUM_CPU), CAPMAX_ENTRY_SZ), +}; + +#define LLBOOT_CAPTBL_INITDCB_CPU(cpuid) (LLBOOT_CAPTBL_INITDCB + (CAP64B_IDSZ * cpuid)) +#define LLBOOT_CAPTBL_CPU_INITDCB LLBOOT_CAPTBL_INITDCB_CPU(cos_cpuid()) + /* * The half of the first page of init captbl is devoted to root node. 
So, the * first page of captbl can contain 128 caps, and every extra page can hold 256 @@ -291,6 +322,8 @@ enum { /* thread id */ THD_GET_TID, + THD_GET_DCB_IP, + THD_GET_DCB_SP, }; enum @@ -307,6 +340,12 @@ enum ARCV_GET_THDID, }; +enum +{ + /* get current thread info from scb */ + COMP_GET_SCB_CURTHD, +}; + /* Macro used to define per core variables */ #define PERCPU(type, name) \ PERCPU_DECL(type, name); \ @@ -408,7 +447,6 @@ struct cos_component_information { vaddr_t cos_heap_allocated, cos_heap_alloc_extent; vaddr_t cos_upcall_entry; vaddr_t cos_async_inv_entry; - // struct cos_sched_data_area *cos_sched_data_area; vaddr_t cos_user_caps; struct restartable_atomic_sequence cos_ras[COS_NUM_ATOMIC_SECTIONS / 2]; vaddr_t cos_poly[COMP_INFO_POLY_NUM]; @@ -484,6 +522,10 @@ typedef unsigned int isolation_level_t; #define MEMMGR_MAX_SHMEM_REGIONS 1024 #define CAPMGR_AEPKEYS_MAX (1<<15) +#define CHAN_CRT_NSLOTS 4 +#define CHAN_CRT_ITEM_TYPE unsigned long +#define CHAN_CRT_ITEM_SZ sizeof(CHAN_CRT_ITEM_TYPE) + #define IPIWIN_DEFAULT_US (1000) /* 1ms */ #define IPIMAX_DEFAULT (64) /* IPIs per ms for each RCV ep */ diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index 8c10d536cc..c9c01c734b 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -34,7 +34,7 @@ struct invstk_entry { */ struct rcvcap_info { /* how many other arcv end-points send notifications to this one? 
*/ - int isbound, pending, refcnt, is_all_pending; + int isbound, pending, refcnt, is_init; sched_tok_t sched_count; struct tcap * rcvcap_tcap; /* This rcvcap's tcap */ struct thread *rcvcap_thd_notif; /* The parent rcvcap thread for notifications */ @@ -69,11 +69,13 @@ struct thread { tcap_time_t timeout; struct thread *interrupted_thread; struct thread *scheduler_thread; + struct cos_dcb_info *dcbinfo; /* rcv end-point data-structures */ struct rcvcap_info rcvcap; struct list event_head; /* all events for *this* end-point */ struct list_node event_list; /* the list of events for another end-point */ + u64_t event_epoch; /* used by user-level for ULSCHED events.. */ } CACHE_ALIGNED; /* @@ -89,6 +91,8 @@ struct cap_thd { cpuid_t cpuid; } __attribute__((packed)); +#include "dcb.h" + static void thd_upcall_setup(struct thread *thd, u32_t entry_addr, int option, int arg1, int arg2, int arg3) { @@ -188,20 +192,43 @@ thd_next_thdinfo_update(struct cos_cpu_local_info *cli, struct thread *thd, stru } static void -thd_rcvcap_init(struct thread *t) +thd_rcvcap_init(struct thread *t, int is_init) { struct rcvcap_info *rc = &t->rcvcap; rc->isbound = rc->pending = rc->refcnt = 0; - rc->is_all_pending = 0; rc->sched_count = 0; + rc->is_init = is_init; rc->rcvcap_thd_notif = NULL; } +static inline struct comp_info * +thd_invstk_peek_compinfo(struct thread *curr_thd, struct cos_cpu_local_info *cos_info, int peek_index) +{ + /* curr_thd should be the current thread! We are using cached invstk_top. */ + return &(curr_thd->invstk[peek_index].comp_info); +} + +static inline int +thd_rcvcap_evt_pending(struct thread *t) +{ + return !list_isempty(&t->event_head); +} + static inline void thd_rcvcap_evt_enqueue(struct thread *head, struct thread *t) { + struct cos_cpu_local_info *cos_info = cos_cpu_local_info(); + struct comp_info *c = thd_invstk_peek_compinfo(head, cos_info, 0); /* in its root component! 
*/ + struct cos_scb_info *scb = NULL; + struct cos_sched_ring *r = NULL; + if (list_empty(&t->event_list) && head != t) list_enqueue(&head->event_head, &t->event_list); + if (unlikely(!c ||!c->scb_data)) return; + + scb = ((c->scb_data) + get_cpuid()); + r = &(scb->sched_events); + r->more = thd_rcvcap_evt_pending(head); } static inline void @@ -227,69 +254,41 @@ thd_track_exec(struct thread *t) return !list_empty(&t->event_list); } -static void -thd_rcvcap_all_pending_set(struct thread *t, int val) -{ - t->rcvcap.is_all_pending = val; -} - -static int -thd_rcvcap_all_pending_get(struct thread *t) -{ - return t->rcvcap.is_all_pending; -} - -static int -thd_rcvcap_all_pending(struct thread *t) -{ - int pending = t->rcvcap.pending; - - /* receive all pending */ - t->rcvcap.pending = 0; - thd_rcvcap_all_pending_set(t, 0); - - return ((pending << 1) | !list_isempty(&t->event_head)); -} - -static int +static inline int thd_rcvcap_pending(struct thread *t) { - if (t->rcvcap.pending) return t->rcvcap.pending; - return !list_isempty(&t->event_head); - ; + if (t->rcvcap.pending || (t->dcbinfo && t->dcbinfo->pending)) return 1; + return thd_rcvcap_evt_pending(t); } -static sched_tok_t +static inline sched_tok_t thd_rcvcap_get_counter(struct thread *t) { return t->rcvcap.sched_count; } -static void +static inline void thd_rcvcap_set_counter(struct thread *t, sched_tok_t cntr) { t->rcvcap.sched_count = cntr; } -static void -thd_rcvcap_pending_inc(struct thread *arcvt) +static inline void +thd_rcvcap_pending_set(struct thread *arcvt) { - arcvt->rcvcap.pending++; + if (likely(arcvt->dcbinfo)) arcvt->dcbinfo->pending = 1; + else arcvt->rcvcap.pending = 1; } -static int -thd_rcvcap_pending_dec(struct thread *arcvt) +static inline void +thd_rcvcap_pending_reset(struct thread *arcvt) { - int pending = arcvt->rcvcap.pending; - - if (pending == 0) return 0; - arcvt->rcvcap.pending--; - - return pending; + if (likely(arcvt->dcbinfo)) arcvt->dcbinfo->pending = 0; + else 
arcvt->rcvcap.pending = 0; } static inline int -thd_state_evt_deliver(struct thread *t, unsigned long *thd_state, unsigned long *cycles, unsigned long *timeout) +thd_state_evt_deliver(struct thread *t, unsigned long *thd_state, unsigned long *cycles, unsigned long *timeout, u64_t *epoch) { struct thread *e = thd_rcvcap_evt_dequeue(t); @@ -301,6 +300,8 @@ thd_state_evt_deliver(struct thread *t, unsigned long *thd_state, unsigned long e->exec = 0; *timeout = e->timeout; e->timeout = 0; + *epoch = e->event_epoch; + e->event_epoch = 0; return 1; } @@ -315,7 +316,7 @@ static inline void thd_current_update(struct thread *next, struct thread *prev, struct cos_cpu_local_info *cos_info) { /* commit the cached data */ - prev->invstk_top = cos_info->invstk_top; + prev->invstk_top = cos_info->invstk_top; cos_info->invstk_top = next->invstk_top; cos_info->curr_thd = next; } @@ -332,17 +333,23 @@ thd_scheduler_set(struct thread *thd, struct thread *sched) if (unlikely(thd->scheduler_thread != sched)) thd->scheduler_thread = sched; } -static int -thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, capid_t compcap, thdclosure_index_t init_data) +static inline int +thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, capid_t compcap, thdclosure_index_t init_data, capid_t dcbcap, unsigned short dcboff) { struct cos_cpu_local_info *cli = cos_cpu_local_info(); - struct cap_thd *tc; - struct cap_comp *compc; + struct cap_thd *tc = NULL; + struct cap_comp *compc = NULL; + struct cap_dcb *dc = NULL; int ret; memset(thd, 0, sizeof(struct thread)); compc = (struct cap_comp *)captbl_lkup(t, compcap); if (unlikely(!compc || compc->h.type != CAP_COMP)) return -EINVAL; + if (likely(dcbcap)) { + dc = (struct cap_dcb *)captbl_lkup(t, dcbcap); + if (unlikely(!dc || dc->h.type != CAP_DCB)) return -EINVAL; + if (dcboff > PAGE_SIZE / sizeof(struct cos_dcb_info)) return -EINVAL; + } tc = (struct cap_thd *)__cap_capactivate_pre(t, cap, capin, 
CAP_THD, &ret); if (!tc) return ret; @@ -354,10 +361,17 @@ thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, c thd->refcnt = 1; thd->invstk_top = 0; thd->cpuid = get_cpuid(); + if (likely(dc)) { + ret = dcb_thd_ref(dc, thd); + if (ret) goto err; /* TODO: cleanup captbl slot */ + thd->dcbinfo = (struct cos_dcb_info *)(dc->kern_addr + (dcboff * sizeof(struct cos_dcb_info))); + memset(thd->dcbinfo, 0, sizeof(struct cos_dcb_info)); + } assert(thd->tid <= MAX_NUM_THREADS); thd_scheduler_set(thd, thd_current(cli)); - thd_rcvcap_init(thd); + /* TODO: fix the way to specify scheduler in a component! */ + thd_rcvcap_init(thd, !init_data); list_head_init(&thd->event_head); list_init(&thd->event_list, thd); @@ -369,15 +383,69 @@ thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, c __cap_capactivate_post(&tc->h, CAP_THD); return 0; + +err: + return ret; } -static int +static inline int +thd_migrate_cap(struct captbl *ct, capid_t thd_cap) +{ + struct thread *thd; + struct cap_thd *tc; + + /* we migrated the capability to core */ + tc = (struct cap_thd *)captbl_lkup(ct, thd_cap); + if (!tc || tc->h.type != CAP_THD || get_cpuid() != tc->cpuid) return -EINVAL; + thd = tc->t; + tc->cpuid = thd->cpuid; + + return 0; +} + +static inline int +thd_migrate(struct captbl *ct, capid_t thd_cap, cpuid_t core) +{ + struct thread *thd; + struct cap_thd *tc; + + tc = (struct cap_thd *)captbl_lkup(ct, thd_cap); + if (!tc || tc->h.type != CAP_THD || get_cpuid() != tc->cpuid) return -EINVAL; + thd = tc->t; + if (NUM_CPU < 2 || core >= NUM_CPU || core < 0) return -EINVAL; + if (tc->cpuid != thd->cpuid) return -EINVAL; /* outdated capability */ + if (thd->cpuid == core) return -EINVAL; /* already migrated. invalid req */ + if (thd->cpuid != get_cpuid()) return -EPERM; /* only push migration */ + + if (thd_current(cos_cpu_local_info()) == thd) return -EPERM; /* not a running thread! 
*/ + if (thd->invstk_top > 0) return -EPERM; /* not if its in an invocation */ + if (thd_bound2rcvcap(thd) || thd->rcvcap.rcvcap_thd_notif) return -EPERM; /* not if it's an AEP */ + if (thd->rcvcap.rcvcap_tcap) return -EPERM; /* not if it has its own tcap on this core */ + + thd->scheduler_thread = NULL; + thd->cpuid = core; + /* we also migrated the capability to core */ + tc->cpuid = core; + + /* + * TODO: + * given that the thread is not running right now, + * and we don't allow migrating a thread that's in an invocation for now, + * i think we can find the COREID_OFFSET/CPUID_OFFSET on stack and fix the + * core id right here?? + */ + + return 0; +} + +static inline int thd_deactivate(struct captbl *ct, struct cap_captbl *dest_ct, unsigned long capin, livenessid_t lid, capid_t pgtbl_cap, - capid_t cosframe_addr, const int root) + capid_t cosframe_addr, capid_t dcbcap, const int root) { struct cos_cpu_local_info *cli = cos_cpu_local_info(); - struct cap_header * thd_header; - struct thread * thd; + struct cap_header *thd_header; + struct thread *thd; + struct cap_dcb *dcb = NULL; unsigned long old_v = 0, *pte = NULL; int ret; @@ -385,6 +453,10 @@ thd_deactivate(struct captbl *ct, struct cap_captbl *dest_ct, unsigned long capi if (!thd_header || thd_header->type != CAP_THD) cos_throw(err, -EINVAL); thd = ((struct cap_thd *)thd_header)->t; assert(thd->refcnt); + if (dcbcap) { + dcb = (struct cap_dcb *)captbl_lkup(ct, dcbcap); + if (!dcb || dcb->h.type != CAP_DCB) cos_throw(err, -EINVAL); + } if (thd->refcnt == 1) { if (!root) cos_throw(err, -EINVAL); @@ -410,6 +482,10 @@ thd_deactivate(struct captbl *ct, struct cap_captbl *dest_ct, unsigned long capi } } + if (dcb) { + ret = dcb_thd_deref(dcb, thd); + if (ret) cos_throw(err, ret); + } ret = cap_capdeactivate(dest_ct, capin, CAP_THD, lid); if (ret) cos_throw(err, ret); @@ -429,7 +505,7 @@ thd_deactivate(struct captbl *ct, struct cap_captbl *dest_ct, unsigned long capi return ret; } -static int +static inline int 
thd_tls_set(struct captbl *ct, capid_t thd_cap, vaddr_t tlsaddr, struct thread *current) { struct cap_thd *tc; @@ -447,7 +523,7 @@ thd_tls_set(struct captbl *ct, capid_t thd_cap, vaddr_t tlsaddr, struct thread * return 0; } -static void +static inline void thd_init(void) { assert(sizeof(struct cap_thd) <= __captbl_cap2bytes(CAP_THD)); @@ -472,6 +548,12 @@ curr_invstk_top(struct cos_cpu_local_info *cos_info) return cos_info->invstk_top; } +static inline struct comp_info * +thd_invstk_current_compinfo(struct thread *curr_thd, struct cos_cpu_local_info *cos_info) +{ + return &(curr_thd->invstk[curr_invstk_top(cos_info)].comp_info); +} + static inline struct comp_info * thd_invstk_current(struct thread *curr_thd, unsigned long *ip, unsigned long *sp, struct cos_cpu_local_info *cos_info) { @@ -531,38 +613,80 @@ thd_preemption_state_update(struct thread *curr, struct thread *next, struct pt_ memcpy(&curr->regs, regs, sizeof(struct pt_regs)); } +static inline int +thd_sched_events_produce(struct thread *thd, struct cos_cpu_local_info *cos_info) +{ + int delta = 0, inv_top = curr_invstk_top(cos_info); + struct cos_scb_info *scb = NULL; + struct cos_sched_ring *r = NULL; + struct comp_info *c = NULL; + + if (unlikely(inv_top != 0 || thd->rcvcap.is_init == 0)) return 0; + + c = thd_invstk_peek_compinfo(thd, cos_info, inv_top); + if (unlikely(!c || !c->scb_data)) return -ENOENT; + + scb = ((c->scb_data) + get_cpuid()); + r = &(scb->sched_events); + /* + * only produce more if the ring is empty! + * so the user only calls after dequeueing all previous events. 
+ */ + if (unlikely(r->head != r->tail)) return -EAGAIN; + + r->head = r->tail = 0; + while (delta < COS_SCHED_EVENT_RING_SIZE) { + struct cos_sched_event *e = &(r->event_buf[delta]); + unsigned long thd_state; + + if (!thd_state_evt_deliver(thd, &thd_state, (unsigned long *)&(e->evt.elapsed_cycs), + (unsigned long *)&(e->evt.next_timeout), &(e->evt.epoch))) break; + e->tid = (thd_state << 1) >> 1; + e->evt.blocked = (thd_state >> 31); + + delta++; + } + + r->tail += delta; + r->more = thd_rcvcap_evt_pending(thd); + + return delta; +} + static inline void thd_rcvcap_pending_deliver(struct thread *thd, struct pt_regs *regs) { - unsigned long thd_state = 0, cycles = 0, timeout = 0, pending = 0; - int all_pending = thd_rcvcap_all_pending_get(thd); + unsigned long thd_state = 0, cycles = 0, timeout = 0; + u64_t epoch = 0; - thd_state_evt_deliver(thd, &thd_state, &cycles, &timeout); - if (all_pending) { - pending = thd_rcvcap_all_pending(thd); - } else { - thd_rcvcap_pending_dec(thd); - pending = thd_rcvcap_pending(thd); + /* events only in scb now, no return values... */ + thd_rcvcap_pending_reset(thd); + if (thd_sched_events_produce(thd, cos_cpu_local_info()) == -ENOENT) { + thd_state_evt_deliver(thd, &thd_state, &cycles, &timeout, &epoch); } - __userregs_setretvals(regs, pending, thd_state, cycles, timeout); + __userregs_setretvals(regs, thd_rcvcap_pending(thd), thd_state, cycles, timeout); } static inline int thd_switch_update(struct thread *thd, struct pt_regs *regs, int issame) { - int preempt = 0; + int preempt = 0, pending = 0; /* TODO: check FPU */ /* fpu_save(thd); */ if (thd->state & THD_STATE_PREEMPTED) { - assert(!(thd->state & THD_STATE_RCVING)); + /* TODO: assert that its a scheduler thread */ + /* assert(!(thd->state & THD_STATE_RCVING)); */ thd->state &= ~THD_STATE_PREEMPTED; preempt = 1; - } else if (thd->state & THD_STATE_RCVING) { + } + + /* FIXME: can the thread be in race with the kernel? 
*/ + if (thd->state & THD_STATE_RCVING) { assert(!(thd->state & THD_STATE_PREEMPTED)); thd->state &= ~THD_STATE_RCVING; thd_rcvcap_pending_deliver(thd, regs); - + pending = thd_rcvcap_pending(thd); /* * If a scheduler thread was running using child tcap and blocked on RCVING * and budget expended logic decided to run the scheduler thread with it's @@ -570,8 +694,15 @@ thd_switch_update(struct thread *thd, struct pt_regs *regs, int issame) */ } + if (unlikely(thd->dcbinfo && thd->dcbinfo->sp)) { + assert(preempt == 0); + regs->dx = regs->ip = thd->dcbinfo->ip + DCB_IP_KERN_OFF; + regs->cx = regs->sp = thd->dcbinfo->sp; + thd->dcbinfo->sp = 0; + } + if (issame && preempt == 0) { - __userregs_set(regs, 0, __userregs_getsp(regs), __userregs_getip(regs)); + __userregs_set(regs, pending, __userregs_getsp(regs), __userregs_getip(regs)); } return preempt; @@ -584,6 +715,12 @@ thd_introspect(struct thread *t, unsigned long op, unsigned long *retval) case THD_GET_TID: *retval = t->tid; break; + case THD_GET_DCB_IP: + *retval = t->dcbinfo->ip; + break; + case THD_GET_DCB_SP: + *retval = t->dcbinfo->sp; + break; default: return -EINVAL; } diff --git a/src/platform/i386/Makefile b/src/platform/i386/Makefile index 9a4f0e0614..0b222c5920 100644 --- a/src/platform/i386/Makefile +++ b/src/platform/i386/Makefile @@ -32,6 +32,8 @@ CFLAGS += $(WARNINGS) OBJS += kernel.o OBJS += gdt.o OBJS += idt.o +OBJS += pic.o +OBJS += ioapic.o OBJS += vm.o OBJS += printk.o OBJS += string.o @@ -42,9 +44,9 @@ OBJS += serial.o OBJS += hpet.o OBJS += chal.o OBJS += boot_comp.o -OBJS += miniacpi.o -#OBJS += console.o +OBJS += acpi.o OBJS += vga.o +OBJS += keyboard.o OBJS += exception.o OBJS += lapic.o diff --git a/src/platform/i386/miniacpi.c b/src/platform/i386/acpi.c similarity index 65% rename from src/platform/i386/miniacpi.c rename to src/platform/i386/acpi.c index c1647cfd25..68aabe763b 100644 --- a/src/platform/i386/miniacpi.c +++ b/src/platform/i386/acpi.c @@ -2,6 +2,8 @@ #include "string.h" 
#include "mem_layout.h" #include "pgtbl.h" +#include "apic_cntl.h" +#include "ioapic.h" #define RSDP_LO_ADDRESS ((unsigned char *)0xc00E0000) #define RSDP_HI_ADDRESS ((unsigned char *)0xc00FFFFF) @@ -32,9 +34,10 @@ struct rsdt { struct rsdt *entry[0]; } __attribute__((packed)); -extern u8_t * boot_comp_pgd; -static u32_t basepage; -static struct rsdt *rsdt; +extern u8_t * boot_comp_pgd; +static u32_t basepage; +static struct rsdt *rsdt; +static unsigned char *madt; static inline void * pa2va(void *pa) @@ -78,7 +81,7 @@ acpi_find_rsdt(void) } void * -acpi_find_timer(void) +acpi_find_hpet(void) { pgtbl_t pgtbl = (pgtbl_t)boot_comp_pgd; size_t i; @@ -142,3 +145,52 @@ acpi_set_rsdt_page(u32_t page) basepage = page * (1 << 22); rsdt = (struct rsdt *)pa2va(rsdt); } + +void +acpi_madt_intsrc_iter(unsigned char *addr) +{ + struct int_cntl_head *h = NULL, *end = NULL; + u32_t len = 0; + int nl = 0, nio = 0; + + assert(addr); + madt = addr; + h = (struct int_cntl_head *)(madt + APIC_CNTR_ARR_OFF); + len = *(u32_t *)(madt + APIC_HDR_LEN_OFF); + end = (struct int_cntl_head *)(madt + len); + + printk("\tMADT length %d (base struct %d)\n", len, APIC_CNTR_ARR_OFF); + assert(h <= end); + for (; h < end; h = (struct int_cntl_head *)((char *)h + h->len)) { + /* termination condition */ + assert(h->len >= sizeof(struct int_cntl_head)); + switch (h->type) { + case APIC_CNTL_LAPIC: { + nl++; + lapic_iter((struct lapic_cntl *)h); + break; + } + case APIC_CNTL_IOAPIC: { + nio++; + ioapic_iter((struct ioapic_cntl *)h); + break; + } + case APIC_CNTL_ISO: { + ioapic_int_override((struct intsrcovrride_cntl *)h); + break; + } + default: + /* See 5.2.12 in the ACPI 5.0 Spec */ + printk("\tInterrupt controller type %d: ignoring\n", h->type); + break; + } + } + + printk("\tMADT => LAPICs=%d, IOAPICs=%d\n", nl, nio); + + if (nl < NUM_CPU) { + printk("Number of LAPICs processed =%d not meeting the requirement = %d\n", nl, NUM_CPU); + printk("Please reconfigure NUM_CPU in Composite/HW-BIOS\n"); + 
assert(0); + } +} diff --git a/src/platform/i386/acpi.h b/src/platform/i386/acpi.h new file mode 100644 index 0000000000..a46ed82e7a --- /dev/null +++ b/src/platform/i386/acpi.h @@ -0,0 +1,10 @@ +#ifndef ACPI_H +#define ACPI_H + +void *acpi_find_apic(void); +void *acpi_find_rsdt(void); +void *acpi_find_hpet(void); +void acpi_set_rsdt_page(u32_t); +void acpi_madt_intsrc_iter(unsigned char *); + +#endif /* ACPI_H */ diff --git a/src/platform/i386/apic_cntl.h b/src/platform/i386/apic_cntl.h new file mode 100644 index 0000000000..47f3073698 --- /dev/null +++ b/src/platform/i386/apic_cntl.h @@ -0,0 +1,63 @@ +#ifndef APIC_CNTL_H +#define APIC_CNTL_H + +#define APIC_DEFAULT_PHYS 0xFEE00000 +#define APIC_HDR_LEN_OFF 0x04 +#define APIC_CNTRLR_ADDR_OFF 0x24 +#define APIC_CNTRLR_FLAGS_OFF 0x28 +#define APIC_CNTR_ARR_OFF 0x2C + +/* See 5.2.12 in the ACPI 5.0 Spec */ +enum +{ + APIC_CNTL_LAPIC = 0, + APIC_CNTL_IOAPIC = 1, + APIC_CNTL_ISO = 2, +}; + +struct int_cntl_head { + u8_t type; + u8_t len; +} __attribute__((packed)); + +struct lapic_cntl { + /* type == APIC_CNTL_LAPIC */ + struct int_cntl_head header; + u8_t proc_id; + u8_t apic_id; + u32_t flags; /* 0 = dead processor */ +} __attribute__((packed)); + +struct ioapic_cntl { + /* type == APIC_CNTL_IOAPIC */ + struct int_cntl_head header; + u8_t ioapic_id; + u8_t reserved; + u32_t ioapic_phys_addr; + u32_t glb_int_num_off; /* I/O APIC's interrupt base number offset */ +} __attribute__((packed)); + +struct intsrcovrride_cntl { + /* type == APIC_CNTL_ISO */ + struct int_cntl_head header; + u8_t bus; + u8_t source; + u32_t glb_int_num_off; + u16_t flags; +} __attribute__((packed)); + +enum acpi_madt_iso_polarity { + ACPI_MADT_ISO_POL_CONFORMS = 0, + ACPI_MADT_ISO_POL_ACTHIGH, + ACPI_MADT_ISO_POL_RESERVED, + ACPI_MADT_ISO_POL_ACTLOW, +}; + +enum acpi_madt_iso_trigger { + ACPI_MADT_ISO_TRIG_CONFORMS = 0, + ACPI_MADT_ISO_TRIG_EDGE, + ACPI_MADT_ISO_TRIG_RESERVED, + ACPI_MADT_ISO_TRIG_LEVEL, +}; + +#endif /* APIC_CNTL_H */ diff 
--git a/src/platform/i386/boot_comp.c b/src/platform/i386/boot_comp.c index 82b363de1f..b023d8e471 100644 --- a/src/platform/i386/boot_comp.c +++ b/src/platform/i386/boot_comp.c @@ -9,10 +9,13 @@ #include #include #include +#include +#include extern u8_t *boot_comp_pgd; -void *thd_mem[NUM_CPU], *tcap_mem[NUM_CPU]; +vaddr_t dcb_addr, dcb_uaddr; +void *thd_mem, *tcap_mem; struct captbl *glb_boot_ct; int @@ -34,7 +37,8 @@ boot_pgtbl_mappings_add(struct captbl *ct, capid_t pgdcap, capid_t ptecap, const pgd_cap = (struct cap_pgtbl *)captbl_lkup(ct, pgdcap); if (!pgd_cap || !CAP_TYPECHK(pgd_cap, CAP_PGTBL)) assert(0); pgtbl = (pgtbl_t)pgd_cap->pgtbl; - nptes = boot_nptes(range); + if (!uvm) nptes = boot_nptes(range); + else nptes = boot_nptes(range + COS_SCB_SIZE); ptes = mem_boot_alloc(nptes); assert(ptes); @@ -89,16 +93,18 @@ boot_pgtbl_mappings_add(struct captbl *ct, capid_t pgdcap, capid_t ptecap, const return 0; } -/* FIXME: loops to create threads/tcaps/rcv caps per core. */ static void -kern_boot_thd(struct captbl *ct, void *thd_mem, void *tcap_mem, const cpuid_t cpu_id) +kern_boot_thd(struct captbl *ct, const cpuid_t cpu_id) { + void *tmem = (void *)((vaddr_t)thd_mem + cpu_id * PAGE_SIZE); + void *tcmem = (void *)((vaddr_t)tcap_mem + cpu_id * PAGE_SIZE); + vaddr_t dcbmem = dcb_addr + cpu_id * PAGE_SIZE, dcbumem = dcb_uaddr + cpu_id * PAGE_SIZE; struct cos_cpu_local_info *cos_info = cos_cpu_local_info(); - struct thread * t = thd_mem; - struct tcap * tc = tcap_mem; + struct thread *t = tmem; + struct tcap *tc = tcmem; tcap_res_t expended; int ret; - struct cap_pgtbl * cap_pt; + struct cap_pgtbl *cap_pt; pgtbl_t pgtbl; assert(cpu_id >= 0); @@ -108,16 +114,18 @@ kern_boot_thd(struct captbl *ct, void *thd_mem, void *tcap_mem, const cpuid_t cp cos_info->cpuid = cpu_id; cos_info->invstk_top = 0; cos_info->overflow_check = 0xDEADBEEF; - ret = thd_activate(ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpu_id), thd_mem, BOOT_CAPTBL_SELF_COMP, 0); + ret = 
dcb_activate(ct, BOOT_CAPTBL_SELF_CT, LLBOOT_CAPTBL_INITDCB_CPU(cpu_id), dcbmem, 0, BOOT_CAPTBL_SELF_PT, dcbumem); + assert(!ret); + ret = thd_activate(ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpu_id), tmem, BOOT_CAPTBL_SELF_COMP, 0, LLBOOT_CAPTBL_INITDCB_CPU(cpu_id), 0); assert(!ret); tcap_active_init(cos_info); - ret = tcap_activate(ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cpu_id), tcap_mem); + ret = tcap_activate(ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cpu_id), tcmem); assert(!ret); tc->budget.cycles = TCAP_RES_INF; /* Chronos's got all the time in the world */ tc->perm_prio = 0; - tcap_setprio(tc, 0); /* Chronos gets preempted by no one! */ + tcap_setprio(tc, 0); /* Chronos gets preempted by no one! */ list_enqueue(&cos_info->tcaps, &tc->active_list); /* Chronos on the TCap active list */ cos_info->tcap_uid = 1; cos_info->cycles = tsc(); @@ -131,10 +139,7 @@ kern_boot_thd(struct captbl *ct, void *thd_mem, void *tcap_mem, const cpuid_t cp BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpu_id), BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cpu_id), 0, 1); assert(!ret); - /* - * boot component's mapped into SELF_PT, - * switching to boot component's pgd - */ + /* boot component's mapped into SELF_PT, switching to boot component's pgd. 
*/ cap_pt = (struct cap_pgtbl *)captbl_lkup(ct, BOOT_CAPTBL_SELF_PT); if (!cap_pt || !CAP_TYPECHK(cap_pt, CAP_PGTBL)) assert(0); pgtbl = cap_pt->pgtbl; @@ -152,12 +157,13 @@ kern_boot_comp(const cpuid_t cpu_id) u8_t * boot_comp_captbl; pgtbl_t pgtbl = (pgtbl_t)chal_va2pa(&boot_comp_pgd), boot_vm_pgd; u32_t hw_bitmap = 0xFFFFFFFF; + vaddr_t scb_uaddr = 0, scb_kaddr = 0; assert(cpu_id >= 0); if (NUM_CPU > 1 && cpu_id > 0) { assert(glb_boot_ct); pgtbl_update(pgtbl); - kern_boot_thd(glb_boot_ct, thd_mem[cpu_id], tcap_mem[cpu_id], cpu_id); + kern_boot_thd(glb_boot_ct, cpu_id); return; } @@ -178,11 +184,13 @@ kern_boot_comp(const cpuid_t cpu_id) assert(!ret); } - for (i = 0; i < NUM_CPU; i++) { - thd_mem[i] = mem_boot_alloc(1); - tcap_mem[i] = mem_boot_alloc(1); - assert(thd_mem[i] && tcap_mem[i]); - } + scb_kaddr = (vaddr_t)mem_boot_alloc(1); + assert(scb_kaddr); + + dcb_addr = (vaddr_t)mem_boot_alloc(NUM_CPU); + thd_mem = mem_boot_alloc(NUM_CPU); + tcap_mem = mem_boot_alloc(NUM_CPU); + assert(thd_mem && tcap_mem && dcb_addr); if (captbl_activate_boot(glb_boot_ct, BOOT_CAPTBL_SELF_CT)) assert(0); if (sret_activate(glb_boot_ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SRET)) assert(0); @@ -203,6 +211,9 @@ kern_boot_comp(const cpuid_t cpu_id) ret = boot_pgtbl_mappings_add(glb_boot_ct, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_BOOTVM_PTE, "booter VM", mem_bootc_start(), (unsigned long)mem_bootc_vaddr(), mem_bootc_end() - mem_bootc_start(), 1); assert(ret == 0); + scb_uaddr = (vaddr_t)(mem_bootc_vaddr() + (mem_bootc_end() - mem_bootc_start())); + assert(COS_SCB_SIZE == PAGE_SIZE); + dcb_uaddr = scb_uaddr + COS_SCB_SIZE; /* * This _must_ be the last allocation. 
The bump pointer @@ -218,17 +229,19 @@ kern_boot_comp(const cpuid_t cpu_id) mem_utmem_end() - mem_boot_nalloc_end(nkmemptes), 0); assert(ret == 0); - printk("\tCapability table and page-table created.\n"); - /* Shut off further bump allocations */ glb_memlayout.allocs_avail = 0; + if (scb_activate(glb_boot_ct, BOOT_CAPTBL_SELF_CT, LLBOOT_CAPTBL_SCB, scb_kaddr, 0)) assert(0); - if (comp_activate(glb_boot_ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_PT, 0, - (vaddr_t)mem_bootc_entry(), NULL)) + printk("\tCapability table and page-table created.\n"); + + if (comp_activate(glb_boot_ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_PT, + LLBOOT_CAPTBL_SCB, 0, (vaddr_t)mem_bootc_entry(), scb_uaddr)) assert(0); + printk("\tCreated boot component structure from page-table and capability-table.\n"); - kern_boot_thd(glb_boot_ct, thd_mem[cpu_id], tcap_mem[cpu_id], cpu_id); + kern_boot_thd(glb_boot_ct, cpu_id); printk("\tBoot component initialization complete.\n"); } diff --git a/src/platform/i386/chal/chal_config.h b/src/platform/i386/chal/chal_config.h index 2624ad9fab..1af6cd703f 100644 --- a/src/platform/i386/chal/chal_config.h +++ b/src/platform/i386/chal/chal_config.h @@ -13,8 +13,10 @@ typedef signed int s32_t; typedef signed long long s64_t; #endif +#define HW_IRQ_START 32 + typedef enum { - HW_PERIODIC = 32, /* periodic timer interrupt */ + HW_HPET_PERIODIC = HW_IRQ_START, /* periodic timer interrupt */ HW_KEYBOARD, /* keyboard interrupt */ HW_ID3, HW_ID4, @@ -22,7 +24,7 @@ typedef enum { HW_ID6, HW_ID7, HW_ID8, - HW_ONESHOT, /* onetime timer interrupt */ + HW_HPET_ONESHOT, /* onetime timer interrupt */ HW_ID10, HW_ID11, HW_ID12, diff --git a/src/platform/i386/console.c b/src/platform/i386/console.c deleted file mode 100644 index 7003a5b15b..0000000000 --- a/src/platform/i386/console.c +++ /dev/null @@ -1,152 +0,0 @@ -#define ENABLE_CONSOLE - -#include "io.h" -#include "string.h" -#include 
"isr.h" -#include "kernel.h" - -#define VIDEO_MEM 0xb8000 - -#define VGA_CTL_REG 0x3D4 -#define VGA_DATA_REG 0x3D5 - -#define KEY_DEVICE 0x60 -#define KEY_PENDING 0x64 - -#define COLUMNS 80 -#define LINES 25 - -/* FIXME these should go somewhere else */ -#define BACKSPACE 0x08 -#define TAB 0x09 - -enum vga_colors -{ - BLACK = 0x00, - BLUE, - GREEN, - CYAN, - RED, - MAGENTA, - BROWN, - LIGHT_GREY, - DARK_GREY, - LIGHT_BLUE, - LIGHT_GREEN, - LIGHT_CYAN, - LIGHT_RED, - LIGHT_MAGENTA, - LIGHT_BROWN, - WHITE -}; - -static u16_t *video_mem = (u16_t *)VIDEO_MEM; -static u8_t cursor_x; -static u8_t cursor_y; - -static void -wmemset(void *dst, int c, size_t count) -{ - unsigned short *tmp = (unsigned short *)dst; - - for (; count != 0; count--) *tmp++ = c; -} - -static inline u8_t -gen_color(u8_t forground, u8_t background) -{ - return (background << 4) | (forground & 0x0F); -} - -static void -update_cursor(u8_t row, u8_t col) -{ - u16_t pos = row * COLUMNS + col; - - outb(VGA_CTL_REG, 0x0E); - outb(VGA_DATA_REG, pos >> 8); - outb(VGA_CTL_REG, 0x0F); - outb(VGA_DATA_REG, pos); -} - -static void -scroll(void) -{ - u16_t blank = ((u8_t)' ') | gen_color(WHITE, BLACK); - unsigned i; - - if (cursor_y < LINES) return; - - for (i = 0; i < (LINES - 1) * COLUMNS; i++) video_mem[i] = video_mem[i + COLUMNS]; - - wmemset(video_mem + ((LINES - 1) * COLUMNS), blank, COLUMNS); - cursor_y = LINES - 1; -} - -static void -vga_putch(char c) -{ - u8_t color = gen_color(LIGHT_GREY, BLACK); - u16_t attribute = color << 8; - u16_t *location; - - if (c == BACKSPACE && cursor_x) - cursor_x--; - else if (c == TAB) - cursor_x = (cursor_x + 8) & ~(8 - 1); - else if (c == '\r') - cursor_x = 0; - else if (c == '\n') { - cursor_x = 0; - cursor_y++; - } else if (c >= ' ') { - location = video_mem + (cursor_y * COLUMNS + cursor_x); - *location = c | attribute; - cursor_x++; - } - - if (cursor_x >= COLUMNS) { - cursor_x = 0; - cursor_y++; - } - - scroll(); - update_cursor(cursor_y, cursor_x); -} - -void 
-vga_puts(const char *s) -{ - for (; *s != '\0'; s++) vga_putch(*s); -} - -void -vga_clear(void) -{ - u8_t color = gen_color(WHITE, BLACK); - u16_t blank = ((u8_t)' ') | color << 8; - wmemset(video_mem, blank, COLUMNS * LINES); -} - -int -keyboard_handler(struct pt_regs *regs) -{ - u16_t scancode; - int preempt = 1; - - ack_irq(IRQ_KEYBOARD); - - while (inb(KEY_PENDING) & 2) { - /* wait for keypress to be ready */ - } - scancode = inb(KEY_DEVICE); - printk("Keyboard press: %d\n", scancode); - return preempt; -} - -void -console_init(void) -{ - vga_clear(); - printk_register_handler(vga_puts); -} diff --git a/src/platform/i386/entry.S b/src/platform/i386/entry.S index 9479ed54b1..07b5954aab 100644 --- a/src/platform/i386/entry.S +++ b/src/platform/i386/entry.S @@ -131,7 +131,7 @@ IRQ(smid_float_pt_except_fault) IRQ(virtualization_except_fault) IRQ_CODE(security_except_fault) -IRQ(periodic) +IRQ(hpet_periodic) IRQ(keyboard) IRQ_ID(34) IRQ_ID(35) @@ -139,7 +139,7 @@ IRQ(serial) IRQ_ID(37) IRQ_ID(38) IRQ_ID(39) -IRQ(oneshot) +IRQ(hpet_oneshot) IRQ_ID(41) IRQ_ID(42) IRQ_ID(43) diff --git a/src/platform/i386/exception.c b/src/platform/i386/exception.c index 5b6694c01a..b4d2e4c538 100644 --- a/src/platform/i386/exception.c +++ b/src/platform/i386/exception.c @@ -6,7 +6,6 @@ #include "isr.h" #include "chal_cpu.h" -#define PRINTK(format, ...) 
printk("(CPU%ld:) " format, get_cpuid(), ## __VA_ARGS__) void print_regs_state(struct pt_regs *regs) diff --git a/src/platform/i386/hpet.c b/src/platform/i386/hpet.c index 719ea4af87..840754ef2c 100644 --- a/src/platform/i386/hpet.c +++ b/src/platform/i386/hpet.c @@ -41,20 +41,24 @@ /* Bits in HPET_Tn_CONFIG */ /* 1 << 0 is reserved */ -#define TN_INT_TYPE_CNF (1ll << 1) /* 0 = edge trigger, 1 = level trigger */ -#define TN_INT_ENB_CNF (1ll << 2) /* 0 = no interrupt, 1 = interrupt */ -#define TN_TYPE_CNF (1ll << 3) /* 0 = one-shot, 1 = periodic */ -#define TN_PER_INT_CAP (1ll << 4) /* read only, 1 = periodic supported */ -#define TN_SIZE_CAP (1ll << 5) /* 0 = 32-bit, 1 = 64-bit */ -#define TN_VAL_SET_CNF (1ll << 6) /* set to allow directly setting accumulator */ +#define HPET_TN_INT_TYPE_CNF (1ll << 1) /* 0 = edge trigger, 1 = level trigger */ +#define HPET_TN_INT_ENB_CNF (1ll << 2) /* 0 = no interrupt, 1 = interrupt */ +#define HPET_TN_TYPE_CNF (1ll << 3) /* 0 = one-shot, 1 = periodic */ +#define HPET_TN_PER_INT_CAP (1ll << 4) /* read only, 1 = periodic supported */ +#define HPET_TN_SIZE_CAP (1ll << 5) /* 0 = 32-bit, 1 = 64-bit */ +#define HPET_TN_VAL_SET_CNF (1ll << 6) /* set to allow directly setting accumulator */ /* 1 << 7 is reserved */ -#define TN_32MODE_CNF (1ll << 8) /* 1 = force 32-bit access to 64-bit timer */ -/* #define TN_INT_ROUTE_CNF (1<<9:1<<13)*/ /* routing for interrupt */ -#define TN_FSB_EN_CNF (1ll << 14) /* 1 = deliver interrupts via FSB instead of APIC */ -#define TN_FSB_INT_DEL_CAP (1ll << 15) /* read only, 1 = FSB delivery available */ +#define HPET_TN_32MODE_CNF (1ll << 8) /* 1 = force 32-bit access to 64-bit timer */ +#define HPET_TN_INT_ROUTE_CNF (9) /* routing for interrupt */ +#define HPET_TN_FSB_EN_CNF (1ll << 14) /* 1 = deliver interrupts via FSB instead of APIC */ +#define HPET_TN_FSB_INT_DEL_CAP (1ll << 15) /* read only, 1 = FSB delivery available */ #define HPET_INT_ENABLE(n) (*hpet_interrupt = (0x1 << n)) /* Clears the INT n for 
level-triggered mode. */ +/* vector for interrupts */ +#define HPET_PERIODIC_VEC 0ll +#define HPET_ONESHOT_VEC 8ll + static volatile u32_t *hpet_capabilities; static volatile u64_t *hpet_config; static volatile u64_t *hpet_interrupt; @@ -69,7 +73,7 @@ volatile struct hpet_timer { /* * When determining how many CPU cycles are in a HPET tick, we must - * execute a number of periodic ticks (TIMER_CALIBRATION_ITER) at a + * execute a number of periodic ticks (HPET_CALIBRATION_ITER) at a * controlled interval, and use the HPET tick granularity to compute * how many CPU cycles per HPET tick there are. Unfortunately, this * can be quite low (e.g. HPET tick of 10ns, CPU tick of 2ns) leading @@ -79,33 +83,36 @@ volatile struct hpet_timer { * Practically, this will lead to the divisor in the conversion being * smaller than it should be, thus causing timers to go off _later_ * than they should. Thus we use a multiplicative factor - * (TIMER_ERROR_BOUND_FACTOR) to lessen the rounding error. + * (HPET_ERROR_BOUND_FACTOR) to lessen the rounding error. 
* * All of the hardware is documented in the HPET specification @ * http://www.intel.com/content/dam/www/public/us/en/documents/technical-specifications/software-developers-hpet-spec-1-0a.pdf */ -#define PICO_PER_MICRO 1000000UL -#define FEMPTO_PER_PICO 1000UL -#define TIMER_CALIBRATION_ITER 256 -#define TIMER_ERROR_BOUND_FACTOR 256 -static int timer_calibration_init = 0; -static unsigned long timer_cycles_per_hpetcyc = TIMER_ERROR_BOUND_FACTOR; -static unsigned long cycles_per_tick; -static unsigned long hpetcyc_per_tick; #define ULONG_MAX 4294967295UL +#define HPET_PICO_PER_MICRO 1000000UL +#define HPET_FEMPTO_PER_PICO 1000UL +#define HPET_CALIBRATION_ITER 256 +#define HPET_ERROR_BOUND_FACTOR 256 +#define HPET_DEFAULT_PERIOD_US 1000 /* US = microseconds */ +static int hpet_calibration_init = 0; +static unsigned long hpet_cpucyc_per_hpetcyc = HPET_ERROR_BOUND_FACTOR; +static unsigned long hpet_cpucyc_per_tick; +static unsigned long hpet_hpetcyc_per_tick; +static unsigned long hpet_periodicity_curr[2] = { 0 }; +static cycles_t hpet_first_hpet_period = 0; /* for timer 0 = HPET_PERIODIC */ extern u32_t chal_msr_mhz; static inline u64_t -timer_cpu2hpet_cycles(u64_t cycles) +hpet_cpu2hpet_cycles(u64_t cycles) { unsigned long cyc; /* demote precision to enable word-sized math */ cyc = (unsigned long)cycles; - if (unlikely((u64_t)cyc < cycles)) cyc= ULONG_MAX; + if (unlikely((u64_t)cyc < cycles)) cyc = ULONG_MAX; /* convert from CPU cycles to HPET cycles */ - cyc = (cyc / timer_cycles_per_hpetcyc) * TIMER_ERROR_BOUND_FACTOR; + cyc = (cyc / hpet_cpucyc_per_hpetcyc) * HPET_ERROR_BOUND_FACTOR; /* promote the precision to interact with the hardware correctly */ cycles = cyc; @@ -113,7 +120,7 @@ timer_cpu2hpet_cycles(u64_t cycles) } static void -timer_disable(timer_type_t timer_type) +hpet_disable(hpet_type_t timer_type) { /* Disable timer interrupts */ *hpet_config &= ~HPET_ENABLE_CNF; @@ -127,10 +134,10 @@ timer_disable(timer_type_t timer_type) } static void 
-timer_calibration(void) +hpet_calibration(void) { - static int cnt = 0; - static u64_t cycle = 0, tot = 0, prev; + static int cnt = 0; + static u64_t cycle = 0, tot = 0, prev; static u32_t apic_curr = 0, apic_tot = 0, apic_prev; /* calibration only on BSP */ @@ -145,30 +152,31 @@ timer_calibration(void) tot += cycle - prev; apic_tot += (apic_prev - apic_curr); } - if (cnt >= TIMER_CALIBRATION_ITER) { - assert(hpetcyc_per_tick); - timer_calibration_init = 0; - cycles_per_tick = (unsigned long)(tot / TIMER_CALIBRATION_ITER); - assert(cycles_per_tick > hpetcyc_per_tick); + if (cnt >= HPET_CALIBRATION_ITER) { + assert(hpet_hpetcyc_per_tick); + hpet_calibration_init = 0; + hpet_cpucyc_per_tick = (unsigned long)(tot / HPET_CALIBRATION_ITER); + assert(hpet_cpucyc_per_tick > hpet_hpetcyc_per_tick); if (lapic_timer_calib_init) { u32_t cycs_to_apic_ratio = 0, apic_cycs_per_tick = 0; - apic_cycs_per_tick = apic_tot / TIMER_CALIBRATION_ITER; + apic_cycs_per_tick = apic_tot / HPET_CALIBRATION_ITER; assert(apic_cycs_per_tick); - cycs_to_apic_ratio = cycles_per_tick / apic_cycs_per_tick; + cycs_to_apic_ratio = hpet_cpucyc_per_tick / apic_cycs_per_tick; lapic_timer_calibration(cycs_to_apic_ratio); } /* Possibly significant rounding error here. 
Bound by the factor */ - timer_cycles_per_hpetcyc = (TIMER_ERROR_BOUND_FACTOR * cycles_per_tick) / hpetcyc_per_tick; + hpet_cpucyc_per_hpetcyc = (HPET_ERROR_BOUND_FACTOR * hpet_cpucyc_per_tick) / hpet_hpetcyc_per_tick; printk("Timer calibrated:\n\tCPU cycles per HPET tick: %ld\n\tHPET ticks in %d us: %ld\n", - timer_cycles_per_hpetcyc / TIMER_ERROR_BOUND_FACTOR, TIMER_DEFAULT_US_INTERARRIVAL, - hpetcyc_per_tick); + hpet_cpucyc_per_hpetcyc / HPET_ERROR_BOUND_FACTOR, HPET_DEFAULT_PERIOD_US, + hpet_hpetcyc_per_tick); - timer_disable(TIMER_PERIODIC); - timer_disable(TIMER_PERIODIC); + hpet_disable(HPET_PERIODIC); + hpet_disable(HPET_PERIODIC); + chal_irq_disable(HW_HPET_PERIODIC, 0); } cnt++; } @@ -176,57 +184,70 @@ timer_calibration(void) int chal_cyc_usec(void) { - if (lapic_timer_calib_init) return 0; + if (unlikely(lapic_timer_calib_init || hpet_calibration_init)) return 0; + + if (likely(hpet_cpucyc_per_tick)) return hpet_cpucyc_per_tick / HPET_DEFAULT_PERIOD_US; - return cycles_per_tick / TIMER_DEFAULT_US_INTERARRIVAL; + return 0; } int -periodic_handler(struct pt_regs *regs) +hpet_periodic_handler(struct pt_regs *regs) { int preempt = 1; +static int count = 0; + + lapic_ack(); + if (unlikely(hpet_calibration_init)) hpet_calibration(); + if (unlikely(hpet_periodicity_curr[HPET_PERIODIC] && !hpet_first_hpet_period)) { + count++; - if (unlikely(timer_calibration_init)) timer_calibration(); + if (count < 25) goto done; + rdtscll(hpet_first_hpet_period); + } - ack_irq(HW_PERIODIC); - preempt = cap_hw_asnd(&hw_asnd_caps[HW_PERIODIC], regs); - HPET_INT_ENABLE(TIMER_PERIODIC); + preempt = cap_hw_asnd(&hw_asnd_caps[get_cpuid()][HW_HPET_PERIODIC], regs); +done: + HPET_INT_ENABLE(HPET_PERIODIC); return preempt; } -extern int timer_process(struct pt_regs *regs); - int -oneshot_handler(struct pt_regs *regs) +hpet_oneshot_handler(struct pt_regs *regs) { int preempt = 1; - ack_irq(HW_ONESHOT); - preempt = timer_process(regs); - HPET_INT_ENABLE(TIMER_ONESHOT); + 
assert(!hpet_calibration_init); + + lapic_ack(); + preempt = cap_hw_asnd(&hw_asnd_caps[get_cpuid()][HW_HPET_ONESHOT], regs); + HPET_INT_ENABLE(HPET_ONESHOT); return preempt; } void -timer_set(timer_type_t timer_type, u64_t cycles) +hpet_set(hpet_type_t timer_type, u64_t cycles) { - u64_t outconfig = TN_INT_TYPE_CNF | TN_INT_ENB_CNF; + u64_t outconfig = HPET_TN_INT_TYPE_CNF | HPET_TN_INT_ENB_CNF; /* Disable timer interrupts */ *hpet_config &= ~HPET_ENABLE_CNF; /* Reset main counter */ - if (timer_type == TIMER_ONESHOT) { - cycles = timer_cpu2hpet_cycles(cycles); + if (timer_type == HPET_ONESHOT) { + cycles = hpet_cpu2hpet_cycles(cycles); /* Set a static value to count up to */ hpet_timers[timer_type].config = outconfig; + hpet_timers[timer_type].config |= HPET_ONESHOT_VEC << HPET_TN_INT_ROUTE_CNF; cycles += HPET_COUNTER; } else { /* Set a periodic value */ - hpet_timers[timer_type].config = outconfig | TN_TYPE_CNF | TN_VAL_SET_CNF; + hpet_timers[timer_type].config = outconfig | HPET_TN_TYPE_CNF | HPET_TN_VAL_SET_CNF; + /* Set the interrupt vector for periodic timer */ + hpet_timers[timer_type].config |= HPET_PERIODIC_VEC << HPET_TN_INT_ROUTE_CNF; /* Reset main counter */ HPET_COUNTER = 0x00; } @@ -237,7 +258,7 @@ timer_set(timer_type_t timer_type, u64_t cycles) } u64_t -timer_find_hpet(void *timer) +hpet_find(void *timer) { u32_t i; unsigned char sum = 0; @@ -264,7 +285,55 @@ timer_find_hpet(void *timer) } void -timer_set_hpet_page(u32_t page) +chal_hpet_periodic_set(hwid_t hwid, unsigned long usecs_period) +{ + hpet_type_t type = 0; + + assert(hwid == HW_HPET_PERIODIC); + type = HPET_PERIODIC; + + if (hpet_periodicity_curr[type] != usecs_period) { + hpet_disable(type); + hpet_disable(type); + + hpet_periodicity_curr[type] = 0; + } + + if (hpet_periodicity_curr[type] == 0) { + unsigned long tick_multiple = 0; + cycles_t hpetcyc_per_period = 0; + + assert(hpet_calibration_init == 0); + assert((usecs_period >= HPET_DEFAULT_PERIOD_US) && (usecs_period % 
HPET_DEFAULT_PERIOD_US == 0)); + + tick_multiple = usecs_period / HPET_DEFAULT_PERIOD_US; + hpetcyc_per_period = (cycles_t)hpet_hpetcyc_per_tick * (cycles_t)tick_multiple; + hpet_periodicity_curr[type] = usecs_period; + if (type == HPET_PERIODIC) hpet_first_hpet_period = 0; + hpet_set(type, hpetcyc_per_period); + chal_irq_enable(HW_HPET_PERIODIC, 0); + printk("Setting HPET [%u:%u] Periodicity:%lu hpetcyc_per_period:%llu\n", hwid, type, usecs_period, hpetcyc_per_period); + } +} + +cycles_t +chal_hpet_first_period(void) +{ + return hpet_first_hpet_period; +} + +void +chal_hpet_disable(hwid_t hwid) +{ + printk("Disabling HPET %u\n", hwid); + hpet_type_t type = (hwid == HW_HPET_PERIODIC ? HPET_PERIODIC : HPET_ONESHOT); + + hpet_disable(type); + hpet_disable(type); +} + +void +hpet_set_page(u32_t page) { hpet = (void *)(page * (1 << 22) | ((u32_t)hpet & ((1 << 22) - 1))); hpet_capabilities = (u32_t *)((unsigned char *)hpet + HPET_CAPABILITIES); @@ -276,17 +345,24 @@ timer_set_hpet_page(u32_t page) } void -timer_init(void) +hpet_init(void) { unsigned long pico_per_hpetcyc; assert(hpet_capabilities); - pico_per_hpetcyc = hpet_capabilities[1] - / FEMPTO_PER_PICO; /* bits 32-63 are # of femptoseconds per HPET clock tick */ - hpetcyc_per_tick = (TIMER_DEFAULT_US_INTERARRIVAL * PICO_PER_MICRO) / pico_per_hpetcyc; + /* bits 32-63 are # of femptoseconds per HPET clock tick */ + pico_per_hpetcyc = hpet_capabilities[1] / HPET_FEMPTO_PER_PICO; + hpet_hpetcyc_per_tick = (HPET_DEFAULT_PERIOD_US * HPET_PICO_PER_MICRO) / pico_per_hpetcyc; printk("Enabling timer @ %p with tick granularity %ld picoseconds\n", hpet, pico_per_hpetcyc); - /* Enable legacy interrupt routing */ + + /* + * FIXME: For some reason, setting to non-legacy mode isn't working well. + * Periodicity of the HPET fired is wrong and any interval configuration + * is still producing the same wrong interval timing. + * + * So, Enable legacy interrupt routing like we had before! 
+ */ *hpet_config |= HPET_LEG_RT_CNF; /* @@ -294,13 +370,15 @@ timer_init(void) * specification is in hpet cycles (not cpu cycles). */ if (chal_msr_mhz && !lapic_timer_calib_init) { - cycles_per_tick = chal_msr_mhz * TIMER_DEFAULT_US_INTERARRIVAL; - timer_cycles_per_hpetcyc = cycles_per_tick / hpetcyc_per_tick; + hpet_cpucyc_per_tick = chal_msr_mhz * HPET_DEFAULT_PERIOD_US; + hpet_cpucyc_per_hpetcyc = hpet_cpucyc_per_tick / hpet_hpetcyc_per_tick; printk("Timer not calibrated, instead computed using MSR frequency value\n"); return; } - timer_calibration_init = 1; - timer_set(TIMER_PERIODIC, hpetcyc_per_tick); + hpet_calibration_init = 1; + hpet_set(HPET_PERIODIC, hpet_hpetcyc_per_tick); + chal_irq_enable(HW_HPET_PERIODIC, 0); + chal_irq_enable(HW_HPET_ONESHOT, 0); } diff --git a/src/platform/i386/hpet.h b/src/platform/i386/hpet.h new file mode 100644 index 0000000000..f6aa186ce8 --- /dev/null +++ b/src/platform/i386/hpet.h @@ -0,0 +1,14 @@ +#ifndef HPET_H +#define HPET_H + +typedef enum { + HPET_PERIODIC = 0, + HPET_ONESHOT = 1, +} hpet_type_t; + +void hpet_set(hpet_type_t timer_type, u64_t cycles); +void hpet_init(void); +u64_t hpet_find(void *timer); +void hpet_set_page(u32_t page); + +#endif /* HPET_H */ diff --git a/src/platform/i386/idt.c b/src/platform/i386/idt.c index 0d79f8c675..821806bfee 100644 --- a/src/platform/i386/idt.c +++ b/src/platform/i386/idt.c @@ -3,31 +3,6 @@ #include "isr.h" #include "chal/io.h" -/* Information taken from: http://wiki.osdev.org/PIC */ -/* FIXME: Remove magic numbers and replace with this */ -#define PIC1 0x20 -#define PIC2 0xA0 -#define PIC1_COMMAND PIC1 -#define PIC1_DATA (PIC1 + 1) -#define PIC2_COMMAND PIC2 -#define PIC2_DATA (PIC2 + 1) - -/* reinitialize the PIC controllers, giving them specified vector offsets - rather than 8 and 70, as configured by default */ - -#define ICW1_ICW4 0x01 /* ICW4 (not) needed */ -#define ICW1_SINGLE 0x02 /* Single (cascade) mode */ -#define ICW1_INTERVAL4 0x04 /* Call address interval 4 (8) 
*/ -#define ICW1_LEVEL 0x08 /* Level triggered (edge) mode */ -#define ICW1_INIT 0x10 /* Initialization - required! */ - -#define ICW4_8086 0x01 /* 8086/88 (MCS-80/85) mode */ -#define ICW4_AUTO 0x02 /* Auto (normal) EOI */ -#define ICW4_BUF_SLAVE 0x08 /* Buffered mode/slave */ -#define ICW4_BUF_MASTER 0x0C /* Buffered mode/master */ -#define ICW4_SFNM 0x10 /* Special fully nested (not) */ -#define ICW1_ICW4 0x01 - struct idt_entry { u16_t base_lo; // Lower 16 bits of address to jump too after int u16_t sel; // Kernel segment selector @@ -73,42 +48,36 @@ hw_handler(struct pt_regs *regs) * TODO: ack here? or * after user-level interrupt(rcv event) processing? */ - ack_irq(regs->orig_ax); - preempt = cap_hw_asnd(&hw_asnd_caps[regs->orig_ax], regs); + lapic_ack(); + preempt = cap_hw_asnd(&hw_asnd_caps[get_cpuid()][regs->orig_ax], regs); return preempt; } -#if 0 -static inline void -remap_irq_table(void) -{ - u8_t pic1_mask; - u8_t pic2_mask; - - // Save masks - pic1_mask = inb(PIC1_DATA); - pic2_mask = inb(PIC2_DATA); -} -#endif - void idt_init(const cpuid_t cpu_id) { + struct { + unsigned short length; + unsigned long base; + } __attribute__((__packed__)) idtr; + + if (cpu_id != INIT_CORE) goto update; + idt_ptr.limit = (sizeof(struct idt_entry) * NUM_IDT_ENTRIES) - 1; idt_ptr.base = (u32_t)&(idt_entries); memset(&(idt_entries), 0, sizeof(struct idt_entry) * NUM_IDT_ENTRIES); - outb(0x20, 0x11); - outb(0xA0, 0x11); - outb(0x21, 0x20); - outb(0xA1, 0x28); - outb(0x21, 0x04); - outb(0xA1, 0x02); - outb(0x21, 0x01); - outb(0xA1, 0x01); - outb(0x21, 0x0); - outb(0xA1, 0x0); + outb(0x20, 0x11); + outb(0xA0, 0x11); + outb(0x21, 0x20); + outb(0xA1, 0x28); + outb(0x21, 0x04); + outb(0xA1, 0x02); + outb(0x21, 0x01); + outb(0xA1, 0x01); + outb(0x21, 0x0); + outb(0xA1, 0x0); idt_set_gate(IRQ_DIV_BY_ZERO_ERR_FAULT, (u32_t)div_by_zero_err_fault_irq, 0x08, 0x8E); idt_set_gate(IRQ_DEBUG_TRAP, (u32_t)debug_trap_irq, 0x08, 0x8E); @@ -130,7 +99,7 @@ idt_init(const cpuid_t cpu_id) 
idt_set_gate(IRQ_VIRTUALIZATION_EXCEPT_FAULT, (u32_t)virtualization_except_fault_irq, 0x08, 0x8E); idt_set_gate(IRQ_SECURITY_EXCEPT_FAULT, (u32_t)security_except_fault_irq, 0x08, 0x8E); - idt_set_gate(HW_PERIODIC, (u32_t)periodic_irq, 0x08, 0x8E); + idt_set_gate(HW_HPET_PERIODIC, (u32_t)hpet_periodic_irq, 0x08, 0x8E); idt_set_gate(HW_KEYBOARD, (u32_t)keyboard_irq, 0x08, 0x8E); idt_set_gate(HW_ID3, (u32_t)handler_hw_34, 0x08, 0x8E); idt_set_gate(HW_ID4, (u32_t)handler_hw_35, 0x08, 0x8E); @@ -138,7 +107,7 @@ idt_init(const cpuid_t cpu_id) idt_set_gate(HW_ID6, (u32_t)handler_hw_37, 0x08, 0x8E); idt_set_gate(HW_ID7, (u32_t)handler_hw_38, 0x08, 0x8E); idt_set_gate(HW_ID8, (u32_t)handler_hw_39, 0x08, 0x8E); - idt_set_gate(HW_ONESHOT, (u32_t)oneshot_irq, 0x08, 0x8E); + idt_set_gate(HW_HPET_ONESHOT, (u32_t)hpet_oneshot_irq, 0x08, 0x8E); idt_set_gate(HW_ID10, (u32_t)handler_hw_41, 0x08, 0x8E); idt_set_gate(HW_ID11, (u32_t)handler_hw_42, 0x08, 0x8E); idt_set_gate(HW_ID12, (u32_t)handler_hw_43, 0x08, 0x8E); @@ -165,11 +134,7 @@ idt_init(const cpuid_t cpu_id) idt_set_gate(HW_LAPIC_IPI_ASND, (u32_t)lapic_ipi_asnd_irq, 0x08, 0x8E); idt_set_gate(HW_LAPIC_TIMER, (u32_t)lapic_timer_irq, 0x08, 0x8E); - struct { - unsigned short length; - unsigned long base; - } __attribute__((__packed__)) idtr; - +update: idtr.length = idt_ptr.limit; idtr.base = (unsigned long)(&(idt_entries)); diff --git a/src/platform/i386/ioapic.c b/src/platform/i386/ioapic.c new file mode 100644 index 0000000000..1ae8231b7c --- /dev/null +++ b/src/platform/i386/ioapic.c @@ -0,0 +1,393 @@ +#include "kernel.h" +#include "ioapic.h" +#include "pic.h" + +#define IOAPIC_MAX 4 +#define IOAPIC_INT_ISA_MAX 16 /* ACPI 5.0 spec: only ISA interrupts can have overrides */ + +#define IOAPIC_IOAPICID 0x00 +#define IOAPIC_IOAPICVER 0x01 +#define IOAPIC_IOAPICARB 0x02 + +#define IOAPIC_IOREGSEL 0x00 +#define IOAPIC_IOWIN (IOAPIC_IOREGSEL + 0x10) +#define IOAPIC_IOREDTBL 0x10 +#define IOAPIC_IOREDTBL_OFFSET(n) (IOAPIC_IOREDTBL + 
2*n) + +#define IOAPIC_INT_DISABLED (1<<16) + +enum ioapic_deliverymode +{ + IOAPIC_DELIV_FIXED = 0, + IOAPIC_DELIV_LOWEST = 1, + IOAPIC_DELIV_SMI = 2, + IOAPIC_DELIV_NMI = 4, + IOAPIC_DELIV_INIT = 5, + IOAPIC_DELIV_EXTINT = 7, +}; + +enum ioapic_dstmode +{ + IOAPIC_DST_PHYSICAL = 0, + IOAPIC_DST_LOGICAL = 1, +}; + +enum ioapic_pinpolarity +{ + IOAPIC_POL_ACTHIGH = 0, + IOAPIC_POL_ACTLOW = 1, +}; + +enum ioapic_triggermode +{ + IOAPIC_TRIGGER_EDGE = 0, + IOAPIC_TRIGGER_LEVEL = 1, +}; + +struct ioapic_info { + unsigned int ioapicid; + volatile void *io_vaddr; + int nentries; + int glbint_base; +}; + +union ioapic_int_redir_entry { + struct { + u64_t vector: 8; + u64_t delivmod: 3; + u64_t destmod: 1; + u64_t delivsts: 1; + u64_t polarity: 1; + u64_t remoteirr: 1; + u64_t trigger: 1; + u64_t mask: 1; + u64_t reserved: 39; + u64_t destination: 8; + }; + struct { + u32_t low_dword; + u32_t high_dword; + }; +}; + +struct ioapic_isa_override { + int source; + int gsi; + union { + struct { + u16_t polarity:2; + u16_t trigger:2; + u16_t reserved:12; + }; + u16_t flags; + }; +}; + +static struct ioapic_info ioapicinfo[IOAPIC_MAX] = { { 0, NULL, 0, 0} }; +static unsigned int ioapic_count; +static struct ioapic_isa_override ioapic_isainfo[IOAPIC_INT_ISA_MAX]; +static unsigned int ioapic_isaoverride_count; +static unsigned int ioapic_int_count; + +static union ioapic_int_redir_entry ioapic_int_isa_tmpl = { + .delivmod = IOAPIC_DELIV_FIXED, + .destmod = IOAPIC_DST_LOGICAL, + .polarity = IOAPIC_POL_ACTHIGH, + .trigger = IOAPIC_TRIGGER_EDGE, + .mask = 1, +}; + +static union ioapic_int_redir_entry ioapic_int_pci_tmpl = { + .delivmod = IOAPIC_DELIV_FIXED, + .destmod = IOAPIC_DST_LOGICAL, + .polarity = IOAPIC_POL_ACTLOW, + .trigger = IOAPIC_TRIGGER_EDGE, /* ref. 
barrelfish doesn't use level */ + .mask = 1, +}; + +void +ioapic_set_page(struct ioapic_info *io, u32_t page) +{ + io->io_vaddr = (volatile u32_t *)(page * (1 << 22) | ((u32_t)io->io_vaddr & ((1 << 22) - 1))); + + printk("\tSet IOAPIC %d @ %p\n", io->ioapicid, io->io_vaddr); +} + +static void +ioapic_reg_write(struct ioapic_info *io, u8_t offset, u32_t val) +{ + *(volatile u32_t *)(io->io_vaddr + IOAPIC_IOREGSEL) = offset; + *(volatile u32_t *)(io->io_vaddr + IOAPIC_IOWIN) = val; +} + +static u32_t +ioapic_reg_read(struct ioapic_info *io, u8_t offset) +{ + *(volatile u32_t *)(io->io_vaddr + IOAPIC_IOREGSEL) = offset; + + return *(volatile u32_t *)(io->io_vaddr + IOAPIC_IOWIN); +} + +static struct ioapic_info * +ioapic_findbygsi(int gsi) +{ + unsigned int i = 0; + + for (; i < ioapic_count; i++) { + if (gsi >= ioapicinfo[i].glbint_base && gsi < ioapicinfo[i].glbint_base + ioapicinfo[i].nentries) return &ioapicinfo[i]; + } + + return NULL; +} + +static struct ioapic_info * +ioapic_findbyid(int id) +{ + unsigned int i = 0; + + for (; i < ioapic_count; i++) { + if (id == (int)(ioapicinfo[i].ioapicid)) return &ioapicinfo[i]; + } + + return NULL; +} + +static inline void +ioapic_int_entry_write(struct ioapic_info *io, u8_t off, union ioapic_int_redir_entry entry) +{ + int tmpoff = IOAPIC_IOREDTBL_OFFSET(off); + + ioapic_reg_write(io, tmpoff, entry.low_dword); + ioapic_reg_write(io, tmpoff+1, entry.high_dword); +} + +static inline union ioapic_int_redir_entry +ioapic_int_entry_read(struct ioapic_info *io, u8_t off) +{ + union ioapic_int_redir_entry entry; + int tmpoff = IOAPIC_IOREDTBL_OFFSET(off); + + entry.low_dword = ioapic_reg_read(io, tmpoff); + entry.high_dword = ioapic_reg_read(io, tmpoff+1); + + return entry; +} + +static inline void +ioapic_int_mask_set(int gsi, int mask, int dest) +{ + struct ioapic_info *io = ioapic_findbygsi(gsi); + union ioapic_int_redir_entry entry; + u8_t off; + + if (!io) return; + + off = gsi - io->glbint_base; + entry = ioapic_int_entry_read(io, off); + entry.mask 
= mask ? 1 : 0; + entry.destination = dest; + ioapic_int_entry_write(io, off, entry); + entry = ioapic_int_entry_read(io, off); +} + +static inline int +ioapic_int_gsi(int gsi) +{ + int override_gsi = gsi; + int i; + + if (gsi < IOAPIC_INT_ISA_MAX) { + for (i = 0; i < (int)ioapic_isaoverride_count; i++) { + if (ioapic_isainfo[i].source == gsi && ioapic_isainfo[i].gsi != gsi) { + override_gsi = ioapic_isainfo[i].gsi; + break; + } + } + } + + return override_gsi; +} + +void +ioapic_int_mask(int gsi) +{ + /* clear destination when masking */ + ioapic_int_mask_set(ioapic_int_gsi(gsi), 1, 0); +} + +void +ioapic_int_unmask(int gsi, int dest) +{ + ioapic_int_mask_set(ioapic_int_gsi(gsi), 0, dest); +} + +void +ioapic_int_override(struct intsrcovrride_cntl *iso) +{ + union ioapic_int_redir_entry entry = ioapic_int_isa_tmpl; + struct ioapic_info *iogsi = NULL, *iosrc = NULL; + + assert(iso->header.len == sizeof(struct intsrcovrride_cntl)); + + assert(iso->source < IOAPIC_INT_ISA_MAX); + assert(ioapic_isaoverride_count < IOAPIC_INT_ISA_MAX); + + if (iso->source != iso->glb_int_num_off) { + union ioapic_int_redir_entry srcentry = ioapic_int_isa_tmpl; + + iosrc = ioapic_findbygsi(iso->source); + assert(iosrc); + srcentry.vector = iso->glb_int_num_off + HW_IRQ_START; + ioapic_int_entry_write(iosrc, iso->source - iosrc->glbint_base, srcentry); + + ioapic_isainfo[ioapic_isaoverride_count].source = iso->glb_int_num_off; + ioapic_isainfo[ioapic_isaoverride_count].gsi = iso->source; + ioapic_isainfo[ioapic_isaoverride_count].flags = 0; + ioapic_isaoverride_count++; + } + + ioapic_isainfo[ioapic_isaoverride_count].source = iso->source; + ioapic_isainfo[ioapic_isaoverride_count].gsi = iso->glb_int_num_off; + ioapic_isainfo[ioapic_isaoverride_count].flags = iso->flags; + + printk("\tINT Override %u to %u, polarity: %u trigger: %u\n", iso->source, iso->glb_int_num_off, + ioapic_isainfo[ioapic_isaoverride_count].polarity, ioapic_isainfo[ioapic_isaoverride_count].trigger); + + 
switch(ioapic_isainfo[ioapic_isaoverride_count].trigger) { + case ACPI_MADT_ISO_TRIG_CONFORMS: break; + case ACPI_MADT_ISO_TRIG_EDGE: entry.trigger = IOAPIC_TRIGGER_EDGE; break; + case ACPI_MADT_ISO_TRIG_RESERVED: assert(0); break; + case ACPI_MADT_ISO_TRIG_LEVEL: entry.trigger = IOAPIC_TRIGGER_EDGE; break; /* XXX: should be level */ + default: break; + } + + switch(ioapic_isainfo[ioapic_isaoverride_count].polarity) { + case ACPI_MADT_ISO_POL_CONFORMS: break; + case ACPI_MADT_ISO_POL_ACTHIGH: entry.polarity = IOAPIC_POL_ACTHIGH; break; + case ACPI_MADT_ISO_POL_RESERVED: assert(0); break; + case ACPI_MADT_ISO_POL_ACTLOW: entry.polarity = IOAPIC_POL_ACTLOW; break; + default: break; + } + + entry.vector = iso->source + HW_IRQ_START; + iogsi = ioapic_findbygsi(iso->glb_int_num_off); + assert(iogsi); + + ioapic_int_entry_write(iogsi, iso->glb_int_num_off - iogsi->glbint_base, entry); + + ioapic_isaoverride_count++; +} + +void +ioapic_iter(struct ioapic_cntl *io) +{ + u32_t ver; + int ioent, j; + static int more = 0; + unsigned int tmp_count = ioapic_count; + + assert(io); + + if (ioapic_count == IOAPIC_MAX) { + more ++; + printk("\t%d more than %d IOAPICs present..\n", more, IOAPIC_MAX); + + return; + } + + ioapic_count ++; + ioapicinfo[tmp_count].io_vaddr = (volatile void *)(io->ioapic_phys_addr); + ioapicinfo[tmp_count].ioapicid = io->ioapic_id; + ioapic_set_page(&(ioapicinfo[tmp_count]), vm_map_superpage((u32_t)(ioapicinfo[tmp_count].io_vaddr), 0)); + + ver = ioapic_reg_read(&ioapicinfo[tmp_count], IOAPIC_IOAPICVER); + ioent = ((ver >> 16) & 0xFF) + 1; + printk("\tIOAPIC %d (counter:%d): Number of entries = %d\n", io->ioapic_id, tmp_count, ioent); + + ioapicinfo[tmp_count].nentries = ioent; + ioapicinfo[tmp_count].glbint_base = io->glb_int_num_off; + ioapic_int_count += ioent; + + for (j = 0; j < ioent; j++) { + union ioapic_int_redir_entry entry = (io->glb_int_num_off + j) < IOAPIC_INT_ISA_MAX ? 
ioapic_int_isa_tmpl : ioapic_int_pci_tmpl; + + entry.vector = io->glb_int_num_off + j + HW_IRQ_START; + + ioapic_int_entry_write(&ioapicinfo[tmp_count], j, entry); + } +} + +int +chal_irq_enable(int irq, cpuid_t cpu_id) +{ + int gsi = ioapic_int_gsi(irq - HW_IRQ_START); + struct ioapic_info *io = ioapic_findbygsi(gsi); + union ioapic_int_redir_entry entry; + u8_t off; + + if (!io) return -EINVAL; + + off = gsi - io->glbint_base; + entry = ioapic_int_entry_read(io, off); + + /* the destination bitmap is 8 bits */ + if (irq - HW_IRQ_START >= (int)ioapic_int_count || cpu_id > 7) return -EINVAL; + + /* irq should be masked or in logical mode */ + assert(entry.mask || entry.destmod == IOAPIC_DST_LOGICAL); + + /* if irq is masked, destination should be 0 */ + assert(!entry.mask || !entry.destination); + + ioapic_int_unmask(irq - HW_IRQ_START, entry.destination | (u8_t)logical_apicids[cpu_id]); + + return 0; +} + +int +chal_irq_disable(int irq, cpuid_t cpu_id) +{ + int gsi = ioapic_int_gsi(irq - HW_IRQ_START); + struct ioapic_info *io = ioapic_findbygsi(gsi); + union ioapic_int_redir_entry entry; + u8_t off; + + if (!io) return -EINVAL; + + off = gsi - io->glbint_base; + entry = ioapic_int_entry_read(io, off); + + /* the destination bitmap is 8 bits */ + if (irq - HW_IRQ_START >= (int)ioapic_int_count || cpu_id > 7) return -EINVAL; + + assert(entry.mask || entry.destmod == IOAPIC_DST_LOGICAL); + + /* we should disable the irq if we remove the last core */ + if (!(entry.destination & ~logical_apicids[cpu_id])) { + ioapic_int_mask(irq - HW_IRQ_START); + return 0; + } + + ioapic_int_unmask(irq - HW_IRQ_START, entry.destination & ~logical_apicids[cpu_id]); + return 0; +} + +void +ioapic_init(void) +{ + assert(ioapic_count); + pic_disable(); + + printk("Setting up IOAPIC (disabling PIC)\n"); + + /* + * PCI Interrupts may need some attention here. 
+ * https://forum.osdev.org/viewtopic.php?f=1&t=21745 + * The discussion in the above forum suggest modern PCIe devices bypass IOAPIC and send + * interrupts directly to the core. For legacy PCI, we probably need to read some APIC tables. + * + * Update: with BMK_SCREW_INTERRUPT_ROUTING, got Rumpkernel to boot fine on HW as well. + * The effect of that BMK_SCREW_INTERRUPT_ROUTING is mostly in the BMK intr.c to use an array of lists vs + * single list. It doesn't change how NetBSD does interrupt processing. + */ +} diff --git a/src/platform/i386/ioapic.h b/src/platform/i386/ioapic.h new file mode 100644 index 0000000000..3cd3e31ea4 --- /dev/null +++ b/src/platform/i386/ioapic.h @@ -0,0 +1,17 @@ +#ifndef IOAPIC_H +#define IOAPIC_H + +#include "apic_cntl.h" + +void ioapic_init(void); + +void ioapic_iter(struct ioapic_cntl *); +void ioapic_int_mask(int gsi); +void ioapic_int_unmask(int gsi, int dest); + +int chal_irq_enable(int irq, cpuid_t cpu_id); +int chal_irq_disable(int irq, cpuid_t cpu_id); + +void ioapic_int_override(struct intsrcovrride_cntl *); + +#endif /* IOAPIC_H */ diff --git a/src/platform/i386/isr.h b/src/platform/i386/isr.h index 052c0596a8..e14392ee24 100644 --- a/src/platform/i386/isr.h +++ b/src/platform/i386/isr.h @@ -49,7 +49,7 @@ extern void smid_float_pt_except_fault_irq(struct pt_regs *); extern void virtualization_except_fault_irq(struct pt_regs *); extern void security_except_fault_irq(struct pt_regs *); -extern void periodic_irq(struct pt_regs *); +extern void hpet_periodic_irq(struct pt_regs *); extern void keyboard_irq(struct pt_regs *); extern void handler_hw_34(struct pt_regs *); extern void handler_hw_35(struct pt_regs *); @@ -57,7 +57,7 @@ extern void serial_irq(struct pt_regs *); extern void handler_hw_37(struct pt_regs *); extern void handler_hw_38(struct pt_regs *); extern void handler_hw_39(struct pt_regs *); -extern void oneshot_irq(struct pt_regs *); +extern void hpet_oneshot_irq(struct pt_regs *); extern void handler_hw_41(struct pt_regs *); 
extern void handler_hw_42(struct pt_regs *); extern void handler_hw_43(struct pt_regs *); @@ -84,11 +84,4 @@ extern void lapic_spurious_irq(struct pt_regs *); extern void lapic_ipi_asnd_irq(struct pt_regs *); extern void lapic_timer_irq(struct pt_regs *); -static void -ack_irq(int n) -{ - if (n >= 40) outb(0xA0, 0x20); /* Send reset signal to slave */ - outb(0x20, 0x20); -} - #endif /* ISR_H */ diff --git a/src/platform/i386/kernel.c b/src/platform/i386/kernel.c index caf1858803..a91c6f5437 100644 --- a/src/platform/i386/kernel.c +++ b/src/platform/i386/kernel.c @@ -145,9 +145,6 @@ kmain(struct multiboot *mboot, u32_t mboot_magic, u32_t esp) #ifdef ENABLE_SERIAL serial_init(); #endif -#ifdef ENABLE_CONSOLE - console_init(); -#endif #ifdef ENABLE_VGA vga_init(); #endif @@ -163,10 +160,15 @@ kmain(struct multiboot *mboot, u32_t mboot_magic, u32_t esp) comp_init(); thd_init(); paging_init(); - kern_boot_comp(INIT_CORE); lapic_init(); - timer_init(); + hpet_init(); + chal_irq_enable(HW_SERIAL, 0); + pic_init(); + ioapic_init(); +#ifdef ENABLE_SERIAL + serial_late_init(); +#endif smp_init(cores_ready); cores_ready[INIT_CORE] = 1; diff --git a/src/platform/i386/kernel.h b/src/platform/i386/kernel.h index 6c64531537..2829009d32 100644 --- a/src/platform/i386/kernel.h +++ b/src/platform/i386/kernel.h @@ -11,62 +11,23 @@ #include #include -#ifdef ENABLE_CONSOLE -void vga_clear(void); -void vga_puts(const char *s); -void console_init(void); -#endif - -#ifdef ENABLE_VGA -void vga_high_init(void); -void vga_init(void); -void vga_puts(const char *str); -#endif - -#ifdef ENABLE_SERIAL -void serial_init(void); -#endif - -/* These numbers map directly to actual timers in the HPET */ -typedef enum { - TIMER_PERIODIC = 0, - TIMER_ONESHOT = 1, -} timer_type_t; - -#define TIMER_DEFAULT_US_INTERARRIVAL 1000 /* US = microseconds */ - -void timer_set(timer_type_t timer_type, u64_t cycles); -void timer_init(void); -u64_t timer_find_hpet(void *timer); -void timer_set_hpet_page(u32_t page); 
-void timer_thd_init(struct thread *t); - -void tss_init(const cpuid_t cpu_id); -void idt_init(const cpuid_t cpu_id); -void gdt_init(const cpuid_t cpu_id); -void user_init(void); -void paging_init(void); -void *acpi_find_rsdt(void); -void *acpi_find_timer(void); -void acpi_set_rsdt_page(u32_t); -void kern_paging_map_init(void *pa); - -void * acpi_find_apic(void); -u32_t lapic_find_localaddr(void *l); -void lapic_set_page(u32_t page); -void lapic_timer_init(void); -void lapic_init(void); -void lapic_set_timer(int timer_type, cycles_t deadline); -u32_t lapic_get_ccr(void); -void lapic_timer_calibration(u32_t ratio); -void lapic_asnd_ipi_send(const cpuid_t cpu_id); -extern volatile u32_t lapic_timer_calib_init; - -void smp_init(volatile int *cores_ready); +#include "vga.h" +#include "serial.h" +#include "hpet.h" +#include "acpi.h" +#include "lapic.h" +#include "pic.h" +#include "ioapic.h" + +int vm_map_superpage(u32_t addr, int nocache); +void kern_paging_map_init(void *); +void paging_init(void); +void tss_init(cpuid_t); +void gdt_init(cpuid_t); +void idt_init(cpuid_t); void tls_update(u32_t addr); -// void printk(const char *fmt, ...); int printk_register_handler(void (*handler)(const char *)); void khalt(void); diff --git a/src/platform/i386/keyboard.c b/src/platform/i386/keyboard.c new file mode 100644 index 0000000000..b38987faa2 --- /dev/null +++ b/src/platform/i386/keyboard.c @@ -0,0 +1,21 @@ +#include "kernel.h" + +#define KEY_DEVICE 0x60 +#define KEY_PENDING 0x64 + +int +keyboard_handler(struct pt_regs *regs) +{ + u16_t scancode = 0; + int preempt = 1; + + lapic_ack(); + + while (inb(KEY_PENDING) & 2) { + /* wait for keypress to be ready */ + } + scancode = inb(KEY_DEVICE); + PRINTK("Keyboard press: %d\n", scancode); + + return preempt; +} diff --git a/src/platform/i386/lapic.c b/src/platform/i386/lapic.c index a67dca3767..0a5eb894f2 100644 --- a/src/platform/i386/lapic.c +++ b/src/platform/i386/lapic.c @@ -1,44 +1,13 @@ #include "kernel.h" #include 
"chal_cpu.h" #include "isr.h" +#include "apic_cntl.h" -#define APIC_DEFAULT_PHYS 0xfee00000 -#define APIC_HDR_LEN_OFF 0x04 -#define APIC_CNTRLR_ADDR_OFF 0x24 -#define APIC_CNTRLR_FLAGS_OFF 0x28 -#define APIC_CNTR_ARR_OFF 0x2C +#define LAPIC_MAX NUM_CPU -/* See 5.2.12 in the ACPI 5.0 Spec */ -enum -{ - APIC_CNTL_LAPIC = 0, - APIC_CNTL_IOAPIC = 1, -}; - -struct int_cntl_head { - u8_t type; - u8_t len; -} __attribute__((packed)); - -struct lapic_cntl { - /* type == APIC_CNTL_LAPIC */ - struct int_cntl_head header; - u8_t proc_id; - u8_t apic_id; - u32_t flags; /* 0 = dead processor */ -} __attribute__((packed)); - -struct ioapic_cntl { - /* type == APIC_CNTL_IOAPIC */ - struct int_cntl_head header; - u8_t ioapic_id; - u8_t reserved; - u32_t ioapic_phys_addr; - u32_t glb_int_num_off; /* I/O APIC's interrupt base number offset */ -} __attribute__((packed)); - -volatile int ncpus = 1; -volatile int apicids[NUM_CPU]; +int ncpus = 1; +int apicids[NUM_CPU]; +u32_t logical_apicids[NUM_CPU]; #define CMOS_PORT 0x70 @@ -46,6 +15,7 @@ volatile int apicids[NUM_CPU]; #define LAPIC_VERSION_REG 0x030 /* version */ #define LAPIC_TP_REG 0x080 /* Task Priority Register */ +#define LAPIC_LDR_REG 0x0D0 /* Logical destination register */ #define LAPIC_SIV_REG 0x0F0 /* spurious interrupt vector */ #define LAPIC_SIV_ENABLE (1 << 8) /* enable bit in the SIV */ #define LAPIC_EOI_REG 0x0B0 /* ack, or end-of-interrupt */ @@ -87,6 +57,10 @@ volatile int apicids[NUM_CPU]; #define LAPIC_ONESHOT_THRESH (1 << 12) #define LAPIC_TSCDEADLINE_THRESH 0 +#define LAPIC_LDR_OFFSET 24 +#define LAPIC_LDR_MAST (0xfful << LAPIC_LDR_OFFSET) + + extern int timer_process(struct pt_regs *regs); enum lapic_timer_type @@ -124,7 +98,7 @@ lapic_write_reg(u32_t off, u32_t val) *(volatile u32_t *)(lapic + off) = val; } -static void +void lapic_ack(void) { lapic_write_reg(LAPIC_EOI_REG, 0); @@ -175,53 +149,16 @@ lapic_apicid(void) } void -lapic_intsrc_iter(unsigned char *madt) +lapic_iter(struct lapic_cntl *l) { - struct 
int_cntl_head *h = (struct int_cntl_head *)(madt + APIC_CNTR_ARR_OFF); - u32_t len = *(u32_t *)(madt + APIC_HDR_LEN_OFF); - struct int_cntl_head *end = (struct int_cntl_head *)(madt + len); - int us = lapic_apicid(), off = 1; - - apicids[0] = us; - printk("\tMADT length %d (base struct %d)\n", len, APIC_CNTR_ARR_OFF); - assert(h <= end); - for (; h < end; h = (struct int_cntl_head *)((char *)h + h->len)) { - /* termination condition */ - assert(h->len >= sizeof(struct int_cntl_head)); - switch (h->type) { - case APIC_CNTL_LAPIC: { - struct lapic_cntl *l = (struct lapic_cntl *)h; - - assert(l->header.len == sizeof(struct lapic_cntl)); - printk("\tLAPIC found: coreid %d, apicid %d flags %d\n", l->proc_id, l->apic_id, l->flags); - - if (l->apic_id != us && l->flags && ncpus < NUM_CPU && NUM_CPU > 1) { - apicids[off++] = l->apic_id; - ncpus++; - } - - break; - } - case APIC_CNTL_IOAPIC: { - struct ioapic_cntl *io = (struct ioapic_cntl *)h; + static int off = 1; - assert(io->header.len == sizeof(struct ioapic_cntl)); - printk("\tI/O APIC found: ioapicid %d, addr %x, int offset %d\n", io->ioapic_id, - io->ioapic_phys_addr, io->glb_int_num_off); - break; - } - default: - /* See 5.2.12 in the ACPI 5.0 Spec */ - printk("\tInterrupt controller type %d: ignoring\n", h->type); - break; - } - } - printk("\tAPICs processed, %d cores\n", ncpus); + assert(l->header.len == sizeof(struct lapic_cntl)); + printk("\tLAPIC found: coreid %d, apicid %d\n", l->proc_id, l->apic_id); - if (ncpus != NUM_CPU) { - printk("Number of LAPICs processed =%d not meeting the requirement = %d\n", ncpus, NUM_CPU); - printk("Please reconfigure NUM_CPU in Composite/HW-BIOS\n"); - assert(0); + if (l->apic_id != apicids[INIT_CORE] && l->flags && ncpus < NUM_CPU && NUM_CPU > 1) { + apicids[off++] = l->apic_id; + ncpus++; } } @@ -236,6 +173,7 @@ lapic_find_localaddr(void *l) printk("Initializing LAPIC @ %p\n", lapicaddr); + apicids[INIT_CORE] = lapic_apicid(); for (i = 0; i < length; i++) { sum += 
lapicaddr[i]; } @@ -248,7 +186,7 @@ lapic_find_localaddr(void *l) addr = *(u32_t *)(lapicaddr + APIC_CNTRLR_ADDR_OFF); apic_flags = *(u32_t *)(lapicaddr + APIC_CNTRLR_FLAGS_OFF); assert(apic_flags == 1); /* we're assuming the PIC exists */ - lapic_intsrc_iter(lapicaddr); + acpi_madt_intsrc_iter(lapicaddr); printk("\tChecksum is OK\n"); lapic = (void *)(addr); @@ -261,12 +199,40 @@ lapic_find_localaddr(void *l) return addr; } +static u32_t +cons_logical_id(const u32_t id) +{ + /* + * FIXME: xAPIC only support 8 bits bitmap for logical destination, + * So we will configure the logical id of cores with id larger than 7 + * to 0 which means we should find out a way(x2APIC) to fix this when we + * have more than 8 cores in ioapic. + */ + + if (id > 7) return 0; + + return (1ul << id) << LAPIC_LDR_OFFSET; +} + +static u32_t +lapic_set_ldr(const u32_t id) +{ + u32_t lid = cons_logical_id(id); + + lapic_write_reg(LAPIC_LDR_REG, lid | ~LAPIC_LDR_MAST); + return lid >> LAPIC_LDR_OFFSET; +} + void lapic_init(void) { u32_t version; assert(lapic); + + /* setup LDR for logic destination before init lapic */ + logical_apicids[get_cpuid()] = lapic_set_ldr(get_cpuid()); + lapic_write_reg(LAPIC_SIV_REG, LAPIC_SIV_ENABLE | HW_LAPIC_SPURIOUS); version = lapic_read_reg(LAPIC_VERSION_REG); diff --git a/src/platform/i386/lapic.h b/src/platform/i386/lapic.h new file mode 100644 index 0000000000..6156ffc708 --- /dev/null +++ b/src/platform/i386/lapic.h @@ -0,0 +1,23 @@ +#ifndef LAPIC_H +#define LAPIC_H + +#include "apic_cntl.h" + +void lapic_ack(void); +void lapic_iter(struct lapic_cntl *); +u32_t lapic_find_localaddr(void *l); +void lapic_set_page(u32_t page); +void lapic_timer_init(void); +void lapic_set_timer(int timer_type, cycles_t deadline); +u32_t lapic_get_ccr(void); +void lapic_timer_calibration(u32_t ratio); +void lapic_asnd_ipi_send(const cpuid_t cpu_id); + +extern volatile u32_t lapic_timer_calib_init; +extern int apicids[NUM_CPU]; +extern u32_t logical_apicids[NUM_CPU]; + 
+void lapic_init(void); +void smp_init(volatile int *cores_ready); + +#endif /* LAPIC_H */ diff --git a/src/platform/i386/pic.c b/src/platform/i386/pic.c new file mode 100644 index 0000000000..1de14dcabf --- /dev/null +++ b/src/platform/i386/pic.c @@ -0,0 +1,59 @@ +#include "kernel.h" +#include "pic.h" + +#define PIC_IRQ_BASE 0x20 +#define PIC_ALL_DISABLE 0xFF +#define PIC_ALL_ENABLE 0x00 + +/* Information taken from: http://wiki.osdev.org/PIC */ +#define PIC1 0x20 +#define PIC2 0xA0 +#define PIC1_CMD PIC1 +#define PIC1_DATA (PIC1 + 1) +#define PIC2_CMD PIC2 +#define PIC2_DATA (PIC2 + 1) + +/* reinitialize the PIC controllers, giving them specified vector offsets + rather than 8 and 70, as configured by default */ +#define PIC_ICW1_ICW4 0x01 /* ICW4 (not) needed */ +#define PIC_ICW1_SINGLE 0x02 /* Single (cascade) mode */ +#define PIC_ICW1_INTERVAL4 0x04 /* Call address interval 4 (8) */ +#define PIC_ICW1_LEVEL 0x08 /* Level triggered (edge) mode */ +#define PIC_ICW1_INIT 0x10 /* Initialization - required! 
*/ + +#define PIC_ICW4_8086 0x01 /* 8086/88 (MCS-80/85) mode */ +#define PIC_ICW4_AUTO 0x02 /* Auto (normal) EOI */ +#define PIC_ICW4_BUF_SLAVE 0x08 /* Buffered mode/slave */ +#define PIC_ICW4_BUF_MASTER 0x0C /* Buffered mode/master */ +#define PIC_ICW4_SFNM 0x10 /* Special fully nested (not) */ +#define PIC_ICW1_ICW4 0x01 + +void +pic_disable(void) +{ + outb(PIC1_DATA, PIC_ALL_DISABLE); + outb(PIC2_DATA, PIC_ALL_DISABLE); +} + +void +pic_enable(void) +{ + outb(PIC1_DATA, PIC_ALL_ENABLE); + outb(PIC2_DATA, PIC_ALL_ENABLE); +} + +void +pic_init(void) +{ + printk("Setting up PIC\n"); + outb(PIC1_CMD, PIC_ICW1_INIT | PIC_ICW1_ICW4); + outb(PIC2_CMD, PIC_ICW1_INIT | PIC_ICW1_ICW4); + outb(PIC1_DATA, PIC_IRQ_BASE); + outb(PIC2_DATA, PIC_IRQ_BASE + 8); + outb(PIC1_DATA, 4); + outb(PIC2_DATA, 2); + outb(PIC1_DATA, PIC_ICW4_8086); + outb(PIC2_DATA, PIC_ICW4_8086); + + pic_enable(); +} diff --git a/src/platform/i386/pic.h b/src/platform/i386/pic.h new file mode 100644 index 0000000000..ed5b0ffdac --- /dev/null +++ b/src/platform/i386/pic.h @@ -0,0 +1,17 @@ +#ifndef PIC_H +#define PIC_H + +#include "chal/io.h" + +void pic_init(void); +void pic_enable(void); +void pic_disable(void); + +static void +pic_ack_irq(int n) +{ + if (n >= 40) outb(0xA0, 0x20); /* Send reset signal to slave */ + outb(0x20, 0x20); +} + +#endif /* PIC_H */ diff --git a/src/platform/i386/qemu-kvm.sh b/src/platform/i386/qemu-kvm.sh new file mode 100755 index 0000000000..5fb559c299 --- /dev/null +++ b/src/platform/i386/qemu-kvm.sh @@ -0,0 +1,15 @@ +#!/bin/sh +if [ $# != 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +if ! 
[ -r $1 ]; then + echo "Can't open run-script" + exit 1 +fi + +MODULES=$(sh $1 | awk '/^Writing image/ { print $3; }' | tr '\n' ' ') + +#qemu-system-i386 -m 768 -nographic -kernel kernel.img -no-reboot -s -initrd "$(echo $MODULES | tr ' ' ',')" +qemu-system-i386 -enable-kvm -rtc base=localtime,clock=host,driftfix=none -smp sockets=1,cores=2,threads=1 -cpu host -nographic -m 800 -kernel kernel.img -initrd "$(echo $MODULES | tr ' ' ',')" diff --git a/src/platform/i386/runscripts/crttests.sh b/src/platform/i386/runscripts/crttests.sh new file mode 100644 index 0000000000..55c6b0792b --- /dev/null +++ b/src/platform/i386/runscripts/crttests.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cp tests.crt_tests.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub diff --git a/src/platform/i386/runscripts/micro_chan.sh b/src/platform/i386/runscripts/micro_chan.sh new file mode 100644 index 0000000000..381d083c5a --- /dev/null +++ b/src/platform/i386/runscripts/micro_chan.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cp micro_chan.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_dijkstra.sh b/src/platform/i386/runscripts/omp_dijkstra.sh new file mode 100644 index 0000000000..128366ed60 --- /dev/null +++ b/src/platform/i386/runscripts/omp_dijkstra.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_dijkstra.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_dijkstra.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_fft_bots.sh b/src/platform/i386/runscripts/omp_fft_bots.sh new file mode 100644 index 0000000000..858f140dd1 --- /dev/null +++ b/src/platform/i386/runscripts/omp_fft_bots.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_fft_bots.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_fft_bots.o boot.o +#cp 
test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_fib_bots.sh b/src/platform/i386/runscripts/omp_fib_bots.sh new file mode 100644 index 0000000000..5c4465f351 --- /dev/null +++ b/src/platform/i386/runscripts/omp_fib_bots.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_fib_bots.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_fib_bots.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_hello.sh b/src/platform/i386/runscripts/omp_hello.sh new file mode 100644 index 0000000000..342a043e00 --- /dev/null +++ b/src/platform/i386/runscripts/omp_hello.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_hello.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_hello.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_sort_bots.sh b/src/platform/i386/runscripts/omp_sort_bots.sh new file mode 100644 index 0000000000..cf71756905 --- /dev/null +++ b/src/platform/i386/runscripts/omp_sort_bots.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_sort_bots.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_sort_bots.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_sparselu_for_bots.sh b/src/platform/i386/runscripts/omp_sparselu_for_bots.sh new file mode 100644 index 0000000000..785b0eae92 --- /dev/null +++ 
b/src/platform/i386/runscripts/omp_sparselu_for_bots.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_sparselu_for_bots.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_sparselu_for_bots.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_sparselu_single_bots.sh b/src/platform/i386/runscripts/omp_sparselu_single_bots.sh new file mode 100644 index 0000000000..1d1374aef4 --- /dev/null +++ b/src/platform/i386/runscripts/omp_sparselu_single_bots.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_sparselu_single_bots.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_sparselu_single_bots.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_strassen_bots.sh b/src/platform/i386/runscripts/omp_strassen_bots.sh new file mode 100644 index 0000000000..3fe5a88ac3 --- /dev/null +++ b/src/platform/i386/runscripts/omp_strassen_bots.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_strassen_bots.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_strassen_bots.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_ubench.sh b/src/platform/i386/runscripts/omp_ubench.sh new file mode 100644 index 0000000000..100adcb020 --- /dev/null +++ b/src/platform/i386/runscripts/omp_ubench.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_ubench.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_ubench.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker 
"llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_workconsprob.sh b/src/platform/i386/runscripts/omp_workconsprob.sh new file mode 100644 index 0000000000..5e7a8985a6 --- /dev/null +++ b/src/platform/i386/runscripts/omp_workconsprob.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_workconsprob.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_hello.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/part_test.sh b/src/platform/i386/runscripts/part_test.sh new file mode 100644 index 0000000000..a8815e0903 --- /dev/null +++ b/src/platform/i386/runscripts/part_test.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cp part_test.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub diff --git a/src/platform/i386/runscripts/test_slite02.sh b/src/platform/i386/runscripts/test_slite02.sh new file mode 100644 index 0000000000..e51ba080f7 --- /dev/null +++ b/src/platform/i386/runscripts/test_slite02.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +cp llboot_comp.o llboot.o +cp test_sched.o boot.o +cp test_sched_inv.o intcomp.o +cp test_sched_inv.o w1comp.o +cp test_sched_inv.o w3comp.o +cp test_boot.o dummy1.o +cp test_boot.o dummy2.o + +# only int and w0 in root +./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o" ./gen_client_stub + +#int, w0 in root and w1 in comp +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, ;w1comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o" ./gen_client_stub + +# int, w1 - w3 +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, ;w1comp.o, ;w3comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o;w3comp.o-boot.o|capmgr.o" 
./gen_client_stub + +#cp test_boot.o dummy.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o" ./gen_client_stub +# diff --git a/src/platform/i386/runscripts/unit_hierschedcomps.sh b/src/platform/i386/runscripts/unit_hierschedcomps.sh index ba032033bf..5122af6f50 100644 --- a/src/platform/i386/runscripts/unit_hierschedcomps.sh +++ b/src/platform/i386/runscripts/unit_hierschedcomps.sh @@ -5,8 +5,8 @@ cp root_fprr.o boot.o cp hier_fprr.o hier_fprr1.o cp hier_fprr.o hier_fprr2.o cp hier_fprr.o hier_fprr3.o -cp unit_schedcomp_test.o unit_schedcomp_test1.o -cp unit_schedcomp_test.o unit_schedcomp_test2.o -cp unit_schedcomp_test.o unit_schedcomp_test3.o -cp unit_schedcomp_test.o unit_schedcomp_test4.o -./cos_linker "llboot.o, ;*hier_fprr1.o, ;capmgr.o, ;*hier_fprr2.o, ;*boot.o, ;*hier_fprr3.o, ;unit_schedcomp_test1.o, ;unit_schedcomp_test2.o, ;unit_schedcomp_test3.o, ;unit_schedcomp_test4.o, :boot.o-capmgr.o;hier_fprr1.o-capmgr.o|[parent_]boot.o;hier_fprr2.o-capmgr.o|[parent_]boot.o;hier_fprr3.o-capmgr.o|[parent_]hier_fprr1.o;unit_schedcomp_test1.o-boot.o;unit_schedcomp_test2.o-hier_fprr1.o;unit_schedcomp_test3.o-hier_fprr2.o;unit_schedcomp_test4.o-hier_fprr3.o" ./gen_client_stub +cp unit_schedappcomp_test.o unit_schedappcomp_test1.o +cp unit_schedappcomp_test.o unit_schedappcomp_test2.o +cp unit_schedappcomp_test.o unit_schedappcomp_test3.o +cp unit_schedappcomp_test.o unit_schedappcomp_test4.o +./cos_linker "llboot.o, ;*hier_fprr1.o, ;capmgr.o, ;*hier_fprr2.o, ;*boot.o, ;*hier_fprr3.o, ;unit_schedappcomp_test1.o, ;unit_schedappcomp_test2.o, ;unit_schedappcomp_test3.o, ;unit_schedappcomp_test4.o, :boot.o-capmgr.o;hier_fprr1.o-capmgr.o|[parent_]boot.o;hier_fprr2.o-capmgr.o|[parent_]boot.o;hier_fprr3.o-capmgr.o|[parent_]hier_fprr1.o;unit_schedappcomp_test1.o-boot.o;unit_schedappcomp_test2.o-hier_fprr1.o;unit_schedappcomp_test3.o-hier_fprr2.o;unit_schedappcomp_test4.o-hier_fprr3.o" ./gen_client_stub 
diff --git a/src/platform/i386/runscripts/unit_schedappcomps.sh b/src/platform/i386/runscripts/unit_schedappcomps.sh new file mode 100644 index 0000000000..5792230896 --- /dev/null +++ b/src/platform/i386/runscripts/unit_schedappcomps.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +cp llboot_comp.o llboot.o +cp root_fprr.o boot.o +./cos_linker "llboot.o, ;unit_schedappcomp_test.o, ;capmgr.o, ;unit_schedappaep_test.o, ;*boot.o, :boot.o-capmgr.o;unit_schedappcomp_test.o-boot.o;unit_schedappaep_test.o-boot.o|capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/unit_schedcomp.sh b/src/platform/i386/runscripts/unit_schedcomp.sh index 9327f2ae50..7665041768 100644 --- a/src/platform/i386/runscripts/unit_schedcomp.sh +++ b/src/platform/i386/runscripts/unit_schedcomp.sh @@ -1,5 +1,7 @@ #!/bin/sh cp llboot_comp.o llboot.o -cp root_fprr.o boot.o -./cos_linker "llboot.o, ;unit_schedcomp_test.o, ;capmgr.o, ;unit_schedaep_test.o, ;*boot.o, :boot.o-capmgr.o;unit_schedcomp_test.o-boot.o;unit_schedaep_test.o-boot.o|capmgr.o" ./gen_client_stub +cp unit_schedcomp_test.o boot.o +cp test_boot.o dummy1.o +cp test_boot.o dummy2.o +./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/unit_slite01.sh b/src/platform/i386/runscripts/unit_slite01.sh new file mode 100644 index 0000000000..8a887a8a36 --- /dev/null +++ b/src/platform/i386/runscripts/unit_slite01.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +cp llboot_comp.o llboot.o +cp root_fprr.o boot.o +#cp unit_slrcvtest.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +./cos_linker "llboot.o, ;*spin_comp.o, ;capmgr.o, ;*unit_slrcvtest.o, ;*boot.o, :boot.o-capmgr.o;unit_slrcvtest.o-boot.o|capmgr.o;spin_comp.o-boot.o|capmgr.o" ./gen_client_stub +#./cos_linker "llboot.o, ;dummy2.o, ;capmgr.o, ;dummy1.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/unit_slrcv.sh b/src/platform/i386/runscripts/unit_slrcv.sh new 
file mode 100644 index 0000000000..a12a03d75d --- /dev/null +++ b/src/platform/i386/runscripts/unit_slrcv.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cp unit_slrcvtest.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub diff --git a/src/platform/i386/runscripts/unit_slxcore.sh b/src/platform/i386/runscripts/unit_slxcore.sh new file mode 100644 index 0000000000..4cb06cf503 --- /dev/null +++ b/src/platform/i386/runscripts/unit_slxcore.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cp unit_slxcoretests.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub diff --git a/src/platform/i386/serial.c b/src/platform/i386/serial.c index dc21481fd1..5685938af6 100644 --- a/src/platform/i386/serial.c +++ b/src/platform/i386/serial.c @@ -5,8 +5,6 @@ #include "isr.h" #include "kernel.h" -void serial_puts(const char *s); - enum serial_ports { SERIAL_PORT_A = 0x3F8, @@ -43,7 +41,7 @@ serial_handler(struct pt_regs *r) char serial; int preempt = 1; - ack_irq(HW_SERIAL); + lapic_ack(); serial = serial_recv(); @@ -62,18 +60,19 @@ serial_handler(struct pt_regs *r) case 3: /* FIXME: Obviously remove this once we have working components */ die("Break\n"); case 'o': - timer_set(TIMER_ONESHOT, 50000000); - timer_set(TIMER_ONESHOT, 50000000); + hpet_set(HPET_ONESHOT, 50000000); + hpet_set(HPET_ONESHOT, 50000000); break; case 'p': - timer_set(TIMER_PERIODIC, 100000000); - timer_set(TIMER_PERIODIC, 100000000); + hpet_set(HPET_PERIODIC, 100000000); + hpet_set(HPET_PERIODIC, 100000000); break; default: break; } - printk("Serial: %d\n", serial); + PRINTK("Serial: %d\n", serial); + // printk("%c", serial); return preempt; } @@ -81,17 +80,24 @@ serial_handler(struct pt_regs *r) void serial_init(void) { - printk("Enabling serial I/O\n"); printk_register_handler(serial_puts); /* We will initialize the first serial port */ outb(SERIAL_PORT_A + 1, 0x00); outb(SERIAL_PORT_A + 3, 0x80); /* Enable divisor mode */ - outb(SERIAL_PORT_A + 0, 0x03); /* Div Low: 03 Set the port to 38400 bps */ + outb(SERIAL_PORT_A + 0, 0x01); 
/* Div Low: 01 Set the port to 115200 bps */ outb(SERIAL_PORT_A + 1, 0x00); /* Div High: 00 */ outb(SERIAL_PORT_A + 3, 0x03); outb(SERIAL_PORT_A + 2, 0xC7); outb(SERIAL_PORT_A + 4, 0x0B); outb(SERIAL_PORT_A + 1, 0x01); /* Enable interrupts on receive */ + printk("Enabling serial I/O\n"); +} + +void +serial_late_init(void) +{ + chal_irq_enable(HW_SERIAL, 0); + chal_irq_enable(HW_KEYBOARD, 0); } diff --git a/src/platform/i386/serial.h b/src/platform/i386/serial.h new file mode 100644 index 0000000000..777c31078e --- /dev/null +++ b/src/platform/i386/serial.h @@ -0,0 +1,9 @@ +#ifndef SERIAL_H +#define SERIAL_H + +#ifdef ENABLE_SERIAL +void serial_init(void); +void serial_late_init(void); +#endif + +#endif diff --git a/src/platform/i386/vga.c b/src/platform/i386/vga.c index bdfebe2882..bf4b17961f 100644 --- a/src/platform/i386/vga.c +++ b/src/platform/i386/vga.c @@ -44,9 +44,6 @@ #define VGA_CTL_REG 0x3D4 #define VGA_DATA_REG 0x3D5 -#define KEY_DEVICE 0x60 -#define KEY_PENDING 0x64 - /* Variables. */ /* Save the X position. */ static int csr_x; @@ -160,27 +157,6 @@ cls(void) move_csr(); } -/* - * VIDEO virtual address set to HIGH address. - */ -void -vga_high_init(void) -{ - video = chal_pa2va(VIDEO); -} - -/* Clear the screen and initialize VIDEO, XPOS and YPOS. */ -void -vga_init(void) -{ - video = (unsigned char *) VIDEO; - - csr_x = 0; - csr_y = 0; - cls(); - printk_register_handler(vga_puts); -} - /* Put the character C on the screen. */ static void putchar(int c) @@ -222,16 +198,24 @@ puts(unsigned char *text) move_csr(); } +/* + * VIDEO virtual address set to HIGH address. + */ void -keyboard_handler(struct pt_regs *regs) +vga_high_init(void) { - u16_t scancode = 0; + video = chal_pa2va(VIDEO); +} - ack_irq(HW_KEYBOARD); +/* Clear the screen and initialize VIDEO, XPOS and YPOS. 
*/ +void +vga_init(void) +{ + video = (unsigned char *) VIDEO; - while (inb(KEY_PENDING) & 2) { - /* wait for keypress to be ready */ - } - scancode = inb(KEY_DEVICE); - printk("Keyboard press: %d\n", scancode); + csr_x = 0; + csr_y = 0; + cls(); + printk_register_handler(vga_puts); + printk("Enabling VGA\n"); } diff --git a/src/platform/i386/vga.h b/src/platform/i386/vga.h new file mode 100644 index 0000000000..0788eb8b2f --- /dev/null +++ b/src/platform/i386/vga.h @@ -0,0 +1,9 @@ +#ifndef VGA_H +#define VGA_H + +#ifdef ENABLE_VGA +void vga_init(void); +void vga_high_init(void); +#endif + +#endif /* VGA_H */ diff --git a/src/platform/i386/vm.c b/src/platform/i386/vm.c index f7c4719dc9..a9457d5c67 100644 --- a/src/platform/i386/vm.c +++ b/src/platform/i386/vm.c @@ -55,6 +55,21 @@ u8_t *mem_boot_alloc(int npages) /* boot-time, bump-ptr heap */ return r; } +static unsigned long vm_pgd_idx = COS_MEM_KERN_START_VA / PGD_RANGE; + +int +vm_map_superpage(u32_t addr, int nocache) +{ + int idx = vm_pgd_idx; + u32_t page; + + page = round_to_pgd_page(addr); + boot_comp_pgd[idx] = page | PGTBL_PRESENT | PGTBL_WRITABLE | PGTBL_SUPER | PGTBL_GLOBAL | (nocache ? 
PGTBL_NOCACHE : 0); + vm_pgd_idx ++; + + return idx; +} + int kern_setup_image(void) { @@ -76,6 +91,7 @@ kern_setup_image(void) boot_comp_pgd[i / PGD_RANGE] = 0; /* unmap lower addresses */ } + vm_pgd_idx = j; #ifdef ENABLE_VGA /* uses virtual address for VGA */ vga_high_init(); @@ -89,33 +105,22 @@ kern_setup_image(void) u64_t hpet; page = round_up_to_pgd_page(rsdt) - (1 << 22); - boot_comp_pgd[j] = page | PGTBL_PRESENT | PGTBL_WRITABLE | PGTBL_SUPER | PGTBL_GLOBAL; - acpi_set_rsdt_page(j); - j++; - - hpet = timer_find_hpet(acpi_find_timer()); - if (hpet) { - page = round_up_to_pgd_page(hpet & 0xffffffff) - (1 << 22); - boot_comp_pgd[j] = page | PGTBL_PRESENT | PGTBL_WRITABLE | PGTBL_SUPER | PGTBL_GLOBAL; - timer_set_hpet_page(j); - j++; - } + acpi_set_rsdt_page(vm_map_superpage(page, 0)); + + hpet = hpet_find(acpi_find_hpet()); + if (hpet) hpet_set_page(vm_map_superpage(hpet, 0)); /* lapic memory map */ lapic = lapic_find_localaddr(acpi_find_apic()); - if (lapic) { - page = round_up_to_pgd_page(lapic & 0xffffffff) - (1 << 22); - /* - * Intel specification: - * For correct APIC operation, this address space must be mapped to an area of memory - * that has been designated as strong uncacheable (UC). - */ - boot_comp_pgd[j] = page | PGTBL_PRESENT | PGTBL_WRITABLE | PGTBL_SUPER | PGTBL_GLOBAL | PGTBL_NOCACHE; - lapic_set_page(j); - j++; - } + /* + * Intel specification: + * For correct APIC operation, this address space must be mapped to an area of memory + * that has been designated as strong uncacheable (UC). + */ + if (lapic) lapic_set_page(vm_map_superpage(lapic, 1)); } + j = vm_pgd_idx; for (; j < PAGE_SIZE / sizeof(unsigned int); i += PGD_RANGE, j++) { boot_comp_pgd[j] = boot_comp_pgd[i / PGD_RANGE] = 0; }