diff --git a/src/Makefile b/src/Makefile index 4765219b67..7a00b4e94b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,9 +1,9 @@ MAKEFLAGS=--no-print-directory --section-alignment 0x1000 -I$(PWD) #$(info Make flags $(MAKEFLAGS)) -default: | all cp +default: | all -all: comps plat +all: comps plat cp comps: $(info ) diff --git a/src/components/Makefile.comp b/src/components/Makefile.comp index 6257eb9d5c..2a4887534e 100644 --- a/src/components/Makefile.comp +++ b/src/components/Makefile.comp @@ -11,6 +11,7 @@ MUSLBIN=$(MUSLDIR)/bin MUSLCC=$(MUSLBIN)/musl-$(CC) MUSLINC=-isystem$(MUSLDIR)/include +PSLIBDIR=$(LIBDIR)/ps CKDIR=$(LIBDIR)/ck CKLIBDIR=$(CKDIR)/lib CKINCDIR=$(CKDIR)/include @@ -40,22 +41,14 @@ LUAINC=-I$(LUADIR)/src -I$(LUABASE)/cos/include INC_PATH=-I./ -I$(CDIR)/include/ -I$(CDIR)/interface/ -I$(SHAREDINC) -I$(CKINCDIR) SHARED_FLAGS=-fno-merge-constants -nostdinc -nostdlib -fno-pic -OPT= -g -fvar-tracking -#OPT= -O3 +OPT = -g -fvar-tracking +OPT += -O3 CFLAGS=-m32 -D__x86__ -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -fno-stack-protector -fno-omit-frame-pointer -Wno-unused-variable $(INC_PATH) $(MUSLINC) $(LWIPINC) $(LUAINC) $(OPT) $(SHARED_FLAGS) CXXFLAGS=-fno-exceptions -fno-threadsafe-statics -Wno-write-strings $(CFLAGS) LDFLAGS=-melf_i386 MUSLCFLAGS=$(CFLAGS) -lc -lgcc -Xlinker -r ASFLAGS=-m32 $(INC_PATH) $(SHARED_FLAGS) -SERVER_STUB=s_stub.o -CLIENT_STUB=c_stub.o - -LIBCOSDEFKERN=-lcos_kernel_api -lcos_defkernel_api -LIBSLCORE=$(LIBCOSDEFKERN) -lsl_sched -lheap -lsl_xcpu -lsl_child -lck -LIBSLCAPMGR=$(LIBSLCORE) -lsl_capmgr -LIBSLRAW=$(LIBSLCORE) -lsl_raw - GCC_PIE=$(shell gcc -v 2>&1 | grep -c "\--enable-default-pie") ifeq ($(GCC_PIE),1) MUSLCFLAGS+=-no-pie @@ -63,3 +56,11 @@ LDFLAGS+=-no-pie CFLAGS+=-fno-pie CXXFLAGS+=-fno-pie endif + +SERVER_STUB=s_stub.o +CLIENT_STUB=c_stub.o + +LIBCOSDEFKERN=-lcos_kernel_api -lcos_defkernel_api +LIBSLCORE=$(LIBCOSDEFKERN) -lsl_sched -lheap -lsl_xcore -lsl_child -lck +LIBSLCAPMGR=$(LIBSLCORE) 
-lsl_capmgr +LIBSLRAW=$(LIBSLCORE) -lsl_raw -lcos_dcb diff --git a/src/components/implementation/Makefile.subsubdir b/src/components/implementation/Makefile.subsubdir index 693d3a11a1..89bc44b379 100644 --- a/src/components/implementation/Makefile.subsubdir +++ b/src/components/implementation/Makefile.subsubdir @@ -42,7 +42,7 @@ TMP_STR2=tmp2 INCLUDE=-I../ $(DEP_INC) $(IF_INCLUDE) $(CINC) LIB_LIBRARIES_PRE=$(DEP_LIB_EXIST) LIB_LIBRARIES=$(strip $(LIB_LIBRARIES_PRE)) -LIB_FLAGS=-L$(CKLIBDIR) -L$(LIBDIR) -L$(LIBCXXDIR) $(DEP_LIB) $(LIB_LIBRARIES) $(ADDITIONAL_LIBS) +LIB_FLAGS=-L$(PSLIBDIR) -L$(CKLIBDIR) -L$(LIBDIR) -L$(LIBCXXDIR) $(DEP_LIB) $(LIB_LIBRARIES) $(ADDITIONAL_LIBS) C_SOURCES=$(C_OBJS:%.o=%.c) CXX_SOURCES=$(CXX_OBJS:%.o=%.cc) diff --git a/src/components/implementation/capmgr/naive/Makefile b/src/components/implementation/capmgr/naive/Makefile index 171178b7c5..4a6a2129f4 100644 --- a/src/components/implementation/capmgr/naive/Makefile +++ b/src/components/implementation/capmgr/naive/Makefile @@ -1,7 +1,7 @@ C_OBJS=cap_mgr.c mem_mgr.c init.c ASM_OBJS= COMPONENT=capmgr.o -INTERFACES=capmgr channel +INTERFACES=capmgr channel work DEPENDENCIES= IF_LIB= ADDITIONAL_LIBS=$(LIBSLRAW) -lsl_mod_fprr -lsl_thd_static_backend diff --git a/src/components/implementation/capmgr/naive/cap_info.c b/src/components/implementation/capmgr/naive/cap_info.c index e35eb4486a..bcdce17eca 100644 --- a/src/components/implementation/capmgr/naive/cap_info.c +++ b/src/components/implementation/capmgr/naive/cap_info.c @@ -62,6 +62,27 @@ cap_info_thd_next(struct cap_comp_info *rci) return NULL; } +void +cap_info_cpu_initdcb_init(struct cap_comp_info *rci) +{ + dcbcap_t initdcb = 0; + unsigned short init_off = 0; + vaddr_t initaddr = 0; + struct cos_compinfo *ci = cos_compinfo_get(cap_info_dci(rci)); + struct cap_comp_cpu_info *rci_cpu = cap_info_cpu_local(rci); + + if (rci->cid == 0 || rci->cid == cos_spd_id()) { + cos_dcb_info_init_ext(cap_info_cpu_dcbdata(rci_cpu), 0, 0, 0, 0); + return; 
+ } + + initaddr = rci->init_dcb_start + cos_cpuid() * PAGE_SIZE; + initdcb = cos_dcb_alloc(cos_compinfo_get(cos_defcompinfo_curr_get()), ci->pgtbl_cap, initaddr); + assert(initdcb); + + cos_dcb_info_init_ext(cap_info_cpu_dcbdata(rci_cpu), ci, initdcb, initaddr, init_off); +} + struct cap_comp_info * cap_info_comp_init(spdid_t spdid, captblcap_t captbl_cap, pgtblcap_t pgtbl_cap, compcap_t compcap, capid_t cap_frontier, vaddr_t heap_frontier, spdid_t sched_spdid) @@ -76,13 +97,16 @@ cap_info_comp_init(spdid_t spdid, captblcap_t captbl_cap, pgtblcap_t pgtbl_cap, capci[spdid].cid = spdid; cos_meminfo_init(&ci->mi, 0, 0, 0); - cos_compinfo_init(ci, pgtbl_cap, captbl_cap, compcap, heap_frontier, cap_frontier, - cos_compinfo_get(cos_defcompinfo_curr_get())); + cos_compinfo_init(ci, pgtbl_cap, captbl_cap, compcap, heap_frontier, + cap_frontier, cos_compinfo_get(cos_defcompinfo_curr_get())); memset(rglb, 0, sizeof(struct cap_shmem_glb_info)); memset(cap_shi, 0, sizeof(struct cap_shmem_info)); cap_shi->cinfo = ci; + capci[spdid].init_dcb_start = heap_frontier - (NUM_CPU * PAGE_SIZE); + cap_info_cpu_initdcb_init(&capci[spdid]); + capci[spdid].initflag = 1; ps_faa((unsigned long *)&cap_comp_count, 1); @@ -336,7 +360,8 @@ cap_shmem_region_find(cos_channelkey_t key) cbuf_t i, free = rglb->free_region_id; for (i = 1; i <= free; i++) { - if (ps_load((unsigned long *)&rglb->region_keys[i - 1]) == key) { + cos_channelkey_t *k = &rglb->region_keys[i - 1]; + if (ps_load((unsigned long *)k) == (unsigned long)key) { id = i; break; } diff --git a/src/components/implementation/capmgr/naive/cap_info.h b/src/components/implementation/capmgr/naive/cap_info.h index 99d0bd060b..9919c6c796 100644 --- a/src/components/implementation/capmgr/naive/cap_info.h +++ b/src/components/implementation/capmgr/naive/cap_info.h @@ -15,6 +15,7 @@ #include #include #include +#include #define CAP_INFO_MAX_THREADS (MAX_NUM_THREADS) @@ -29,12 +30,12 @@ struct cap_shmem_glb_info { }; struct cap_comm_info { - 
arcvcap_t rcvcap; /* rcv capid in capmgr! */ - cpuid_t rcvcpuid; - cycles_t ipiwin, ipiwin_start; /* TODO: synchronize TSC on all cores */ - u32_t ipicnt, ipimax; - asndcap_t sndcap[NUM_CPU]; /* for cross-core asnds */ - sinvcap_t sinvcap[NUM_CPU]; /* for each core (except for the same core!) */ + arcvcap_t rcvcap; /* rcv capid in capmgr! */ + cpuid_t rcvcpuid; + cycles_t ipiwin, ipiwin_start; /* TODO: synchronize TSC on all cores */ + unsigned long ipicnt, ipimax; + asndcap_t sndcap[NUM_CPU]; /* for cross-core asnds */ + sinvcap_t sinvcap[NUM_CPU]; /* for each core (except for the same core!) */ } cap_comminfo[CAP_INFO_MAX_THREADS]; struct cap_channelaep_info { @@ -58,6 +59,8 @@ struct cap_comp_cpu_info { int p_thd_iterator; /* iterator for parent to get all threads created by capmgr in this component so far! */ thdcap_t p_initthdcap; /* init thread's cap in parent */ thdid_t initthdid; /* init thread's tid */ + + struct cos_dcbinfo_data dcb_data; } CACHE_ALIGNED; struct cap_comp_info { @@ -65,6 +68,7 @@ struct cap_comp_info { struct cos_defcompinfo defci; struct cap_shmem_info shminfo; int initflag; + vaddr_t init_dcb_start; struct cap_comp_cpu_info cpu_local[NUM_CPU]; }; @@ -74,6 +78,7 @@ struct cap_comp_info *cap_info_comp_init(spdid_t spdid, captblcap_t captbl_cap, struct sl_thd *cap_info_thd_init(struct cap_comp_info *rci, struct sl_thd *t, cos_channelkey_t key); struct sl_thd *cap_info_initthd_init(struct cap_comp_info *rci, struct sl_thd *t, cos_channelkey_t key); +void cap_info_cpu_initdcb_init(struct cap_comp_info *rci); struct cap_comp_info *cap_info_comp_find(spdid_t s); struct sl_thd *cap_info_thd_find(struct cap_comp_info *r, thdid_t t); @@ -116,6 +121,12 @@ cap_info_cpu_local(struct cap_comp_info *c) return &c->cpu_local[cos_cpuid()]; } +static inline struct cos_dcbinfo_data * +cap_info_cpu_dcbdata(struct cap_comp_cpu_info *c) +{ + return &c->dcb_data; +} + static inline struct cap_comp_info * cap_info_parent(struct cap_comp_info *r) { @@ -133,11 
+144,18 @@ cap_info_is_parent(struct cap_comp_info *r, spdid_t p) } static inline int -cap_info_is_sched(spdid_t c) +cap_info_is_sched_core(spdid_t c, cpuid_t core) { + if (core >= NUM_CPU) return 0; if (!c) return 1; /* llbooter! */ - return bitmap_check(cap_info_schedbmp[cos_cpuid()], c - 1); + return bitmap_check(cap_info_schedbmp[core], c - 1); +} + +static inline int +cap_info_is_sched(spdid_t c) +{ + return cap_info_is_sched_core(c, cos_cpuid()); } static inline int diff --git a/src/components/implementation/capmgr/naive/cap_mgr.c b/src/components/implementation/capmgr/naive/cap_mgr.c index 9216a3b35c..fcf938b936 100644 --- a/src/components/implementation/capmgr/naive/cap_mgr.c +++ b/src/components/implementation/capmgr/naive/cap_mgr.c @@ -13,7 +13,7 @@ #include thdcap_t -capmgr_thd_create_cserialized(thdid_t *tid, int *unused, thdclosure_index_t idx) +capmgr_thd_create_cserialized(struct cos_dcb_info **dcb, thdid_t *tid, thdclosure_index_t idx) { spdid_t cur = cos_inv_token(); struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); @@ -21,18 +21,24 @@ capmgr_thd_create_cserialized(thdid_t *tid, int *unused, thdclosure_index_t idx) struct cap_comp_info *r = cap_info_comp_find(cur); struct sl_thd *rt = NULL, *t = NULL; thdcap_t thdcap = 0; + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; + vaddr_t dcbaddr = 0; if (!r || !cap_info_init_check(r)) return 0; if (!cap_info_is_sched(cur)) return 0; if (idx <= 0) return 0; - t = sl_thd_aep_alloc_ext(cap_info_dci(r), NULL, idx, 0, 0, 0, 0, 0, NULL); - if (!t) return 0; + dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(r)), &dcboff, &dcbaddr); + if (!dcbcap || !dcbaddr || !dcboff) return 0; /* dcboff == 0 for initthd in that comp! 
*/ + t = sl_thd_aep_alloc_ext_dcb(cap_info_dci(r), NULL, idx, 0, 0, 0, dcbcap, dcboff, 0, 0, NULL); + if (!t) return 0; thdcap = cos_cap_cpy(cap_info_ci(r), cap_ci, CAP_THD, sl_thd_thdcap(t)); if (!thdcap) goto err; cap_info_thd_init(r, t, 0); *tid = sl_thd_thdid(t); + *dcb = (struct cos_dcb_info *)dcbaddr; return thdcap; err: @@ -42,7 +48,7 @@ capmgr_thd_create_cserialized(thdid_t *tid, int *unused, thdclosure_index_t idx) } thdcap_t -capmgr_thd_create_ext_cserialized(thdid_t *tid, int *unused, spdid_t s, thdclosure_index_t idx) +capmgr_thd_create_ext_cserialized(struct cos_dcb_info **dcb, thdid_t *tid, spdid_t s, thdclosure_index_t idx) { spdid_t cur = cos_inv_token(); struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); @@ -51,6 +57,9 @@ capmgr_thd_create_ext_cserialized(thdid_t *tid, int *unused, spdid_t s, thdclosu struct cap_comp_info *rs = cap_info_comp_find(s); struct sl_thd *t = NULL; thdcap_t thdcap = 0; + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; + vaddr_t dcbaddr = 0; if (!rc || !cap_info_init_check(rc)) return 0; if (!rs || !cap_info_init_check(rs)) return 0; @@ -58,7 +67,10 @@ capmgr_thd_create_ext_cserialized(thdid_t *tid, int *unused, spdid_t s, thdclosu if (cap_info_is_sched(s)) return 0; if (idx <= 0) return 0; - t = sl_thd_aep_alloc_ext(cap_info_dci(rs), NULL, idx, 0, 0, 0, 0, 0, NULL); + /* s is not a scheduler, dcbinfo will be in the scheduler component */ + dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(rc)), &dcboff, &dcbaddr); + if (!dcbcap || !dcbaddr || !dcboff) return 0; /* dcboff == 0 for initthd in that comp! 
*/ + t = sl_thd_aep_alloc_ext_dcb(cap_info_dci(rs), NULL, idx, 0, 0, 0, dcbcap, dcboff, 0, 0, NULL); if (!t) return 0; thdcap = cos_cap_cpy(cap_info_ci(rc), cap_ci, CAP_THD, sl_thd_thdcap(t)); if (!thdcap) goto err; @@ -66,6 +78,7 @@ capmgr_thd_create_ext_cserialized(thdid_t *tid, int *unused, spdid_t s, thdclosu cap_info_thd_init(rc, t, 0); cap_info_thd_init(rs, t, 0); *tid = sl_thd_thdid(t); + *dcb = (struct cos_dcb_info *)dcbaddr; /* child is not a scheduler, don't copy into child */ return thdcap; @@ -78,20 +91,27 @@ capmgr_thd_create_ext_cserialized(thdid_t *tid, int *unused, spdid_t s, thdclosu thdcap_t capmgr_initthd_create_cserialized(thdid_t *tid, int *unused, spdid_t s) { - spdid_t cur = cos_inv_token(); - struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); - struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); - struct cap_comp_info *rc = cap_info_comp_find(cur); - struct cap_comp_info *rs = cap_info_comp_find(s); - struct sl_thd *t = NULL; - thdcap_t thdcap = 0; + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = cap_info_comp_find(cur); + struct cap_comp_info *rs = cap_info_comp_find(s); + struct cap_comp_cpu_info *rs_cpu = cap_info_cpu_local(rs); + struct cos_compinfo *rs_ci = cap_info_ci(rs); + struct sl_thd *t = NULL; + thdcap_t thdcap = 0; + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; + vaddr_t dcbaddr = 0; if (!rc || !cap_info_init_check(rc)) return 0; if (!rs || !cap_info_init_check(rs)) return 0; if (!cap_info_is_sched(cur) || !cap_info_is_child(rc, s)) return 0; if (cap_info_is_sched(s)) return 0; - t = sl_thd_initaep_alloc(cap_info_dci(rs), NULL, 0, 0, 0, 0, 0); + dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(rs_cpu), &dcboff, &dcbaddr); + if (!dcbcap || !dcbaddr || dcboff) return 0; /* dcboff == 0 for initthd in that comp! 
*/ + t = sl_thd_initaep_alloc_dcb(cap_info_dci(rs), NULL, 0, 0, 0, dcbcap, 0, 0); if (!t) return 0; /* child is not a scheduler, don't copy into child */ /* parent only needs the thdcap */ @@ -113,22 +133,26 @@ capmgr_initthd_create_cserialized(thdid_t *tid, int *unused, spdid_t s) thdcap_t capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, u32_t spdid_owntc, u32_t key_ipimax, u32_t ipiwin32b) { - spdid_t cur = cos_inv_token(), s = spdid_owntc >> 16; - struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); - struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); - struct cap_comp_info *rc = cap_info_comp_find(cur); - struct cap_comp_info *rs = cap_info_comp_find(s); - struct sl_thd *t = NULL, *rinit = NULL; - thdcap_t thdcap = 0; - int owntc = (spdid_owntc << 16) >> 16; - cos_channelkey_t key = key_ipimax >> 16; - u32_t ipimax = (key_ipimax << 16) >> 16; - microsec_t ipiwin = (microsec_t)ipiwin32b; - int ret; - tcap_t tc; - arcvcap_t rcv; - asndcap_t snd; - thdid_t tid; + spdid_t cur = cos_inv_token(), s = spdid_owntc >> 16; + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = cap_info_comp_find(cur); + struct cap_comp_info *rs = cap_info_comp_find(s); + struct cap_comp_cpu_info *rs_cpu = cap_info_cpu_local(rs); + struct sl_thd *t = NULL, *rinit = NULL; + thdcap_t thdcap = 0; + int owntc = (spdid_owntc << 16) >> 16; + cos_channelkey_t key = key_ipimax >> 16; + u32_t ipimax = (key_ipimax << 16) >> 16; + microsec_t ipiwin = (microsec_t)ipiwin32b; + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; + vaddr_t dcbaddr = 0; + int ret; + tcap_t tc; + arcvcap_t rcv; + asndcap_t snd; + thdid_t tid; if (!rc || !cap_info_init_check(rc)) return 0; if (!rs || !cap_info_init_check(rs)) return 0; @@ -137,7 +161,9 @@ capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, u32_t spdid rinit = cap_info_initthd(rc); if (!rinit) return 0; - t = 
sl_thd_initaep_alloc(cap_info_dci(rs), rinit, 1, owntc, 0, 0, 0); + dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(rs_cpu), &dcboff, &dcbaddr); + if (!dcbcap || !dcbaddr || dcboff) return 0; /* dcboff == 0 for initthd in that comp! */ + t = sl_thd_initaep_alloc_dcb(cap_info_dci(rs), rinit, 1, owntc, 0, dcbcap, ipimax, ipiwin); if (!t) return 0; /* child is a scheduler.. copy initcaps */ ret = cos_cap_cpy_at(cap_info_ci(rs), BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, cap_ci, sl_thd_thdcap(t)); @@ -166,8 +192,8 @@ capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, u32_t spdid cap_comminfo_init(t, ipiwin, ipimax); cap_info_thd_init(rc, t, key); cap_info_initthd_init(rs, t, 0); - cap_info_cpu_local(rs)->p_initthdcap = thdcap = ret; - cap_info_cpu_local(rs)->initthdid = tid = sl_thd_thdid(t); + rs_cpu->p_initthdcap = thdcap = ret; + rs_cpu->initthdid = tid = sl_thd_thdid(t); *rcvtcret = (rcv << 16) | (tc); *sndtidret = (snd << 16) | (tid); @@ -178,8 +204,33 @@ capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, u32_t spdid return 0; } -thdcap_t -capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, u32_t owntc_spdid_thdidx, u32_t chkey_ipimax, u32_t ipiwin32b) +arcvcap_t +capmgr_aep_rcv_retrieve_cserialized(spdid_t s, thdid_t tid) +{ + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = cap_info_comp_find(cur); + struct cap_comp_info *rs = cap_info_comp_find(s); + struct sl_thd *ti = cap_info_thd_find(rs, tid); + arcvcap_t dstrcv = 0; + + if (!rc || !cap_info_init_check(rc)) return 0; + if (!rs || !cap_info_init_check(rs)) return 0; + if (!cap_info_is_sched(cur) || !cap_info_is_child(rc, s)) return 0; + if (!ti || !sl_thd_thdcap(ti)) return 0; + + /* + * for aep thread.. 
rcv cap should be accessible in the destination component, + * so we return that cap so the scheduler can init proper structure of the dest component. + */ + dstrcv = cos_cap_cpy(cap_info_ci(rs), cap_ci, CAP_ARCV, sl_thd_rcvcap(ti)); + + return dstrcv; +} + +u32_t +capmgr_aep_create_ext_cserialized(struct cos_dcb_info **dcb, u32_t *rcvtcret, u32_t owntc_spdid_thdidx, u32_t chkey_ipimax, u32_t ipiwin32b) { spdid_t cur = cos_inv_token(); spdid_t s = (owntc_spdid_thdidx << 1) >> 17; @@ -195,6 +246,9 @@ capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, u32_t ownt u32_t ipimax = (chkey_ipimax << 16) >> 16; microsec_t ipiwin = (microsec_t)ipiwin32b; arcvcap_t srcrcv, dstrcv; + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; + vaddr_t dcbaddr = 0; tcap_t tc; int ret; @@ -206,17 +260,15 @@ capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, u32_t ownt rinit = cap_info_initthd(rc); if (!rinit) return 0; - t = sl_thd_aep_alloc_ext(cap_info_dci(rs), rinit, tidx, 1, owntc, 0, 0, 0, &srcrcv); + /* if s is not a scheduler, dcbinfo will be in the scheduler component */ + //if (cap_info_is_sched(s)) dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(rs)), &dcboff, &dcbaddr); + /*else*/ dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(rc)), &dcboff, &dcbaddr); + if (!dcbcap || !dcbaddr || !dcboff) return 0; /* dcboff == 0 for initthd in that comp! */ + t = sl_thd_aep_alloc_ext_dcb(cap_info_dci(rs), rinit, tidx, 1, owntc, 0, dcbcap, dcboff, ipiwin, ipimax, &srcrcv); if (!t) return 0; /* cur is a scheduler, copy thdcap */ ret = cos_cap_cpy(cap_info_ci(rc), cap_ci, CAP_THD, sl_thd_thdcap(t)); if (!ret) goto err; - /* - * for aep thread.. rcv cap should be accessible in the destination component, - * so we return that cap so the scheduler can init proper structucap of the dest component. 
- */ - dstrcv = cos_cap_cpy(cap_info_ci(rs), cap_ci, CAP_ARCV, sl_thd_rcvcap(t)); - if (!dstrcv) goto err; if (owntc) { /* @@ -239,8 +291,8 @@ capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, u32_t ownt cap_comminfo_init(t, ipiwin, ipimax); cap_info_thd_init(rc, t, key); cap_info_thd_init(rs, t, 0); - *drcvtidret = (dstrcv << 16 | sl_thd_thdid(t)); - thdcap = ret; + thdcap = ret << 16 | sl_thd_thdid(t); + *dcb = (struct cos_dcb_info *)dcbaddr; return thdcap; err: @@ -249,8 +301,8 @@ capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, u32_t ownt return 0; } -thdcap_t -capmgr_aep_create_cserialized(thdid_t *tid, u32_t *tcrcvret, u32_t owntc_tidx, u32_t key_ipimax, u32_t ipiwin32b) +u32_t +capmgr_aep_create_cserialized(struct cos_dcb_info **dcb, u32_t *tcrcvret, u32_t owntc_tidx, u32_t key_ipimax, u32_t ipiwin32b) { spdid_t cur = cos_inv_token(); struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); @@ -263,6 +315,9 @@ capmgr_aep_create_cserialized(thdid_t *tid, u32_t *tcrcvret, u32_t owntc_tidx, u microsec_t ipiwin = (microsec_t)ipiwin32b; struct sl_thd *t = NULL, *rinit = NULL; thdcap_t thdcap = 0; + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; + vaddr_t dcbaddr = 0; arcvcap_t rcv; tcap_t tc; int ret; @@ -274,7 +329,9 @@ capmgr_aep_create_cserialized(thdid_t *tid, u32_t *tcrcvret, u32_t owntc_tidx, u rinit = cap_info_initthd(rc); if (!rinit) return 0; - t = sl_thd_aep_alloc_ext(cap_info_dci(rc), rinit, tidx, 1, owntc, 0, 0, 0, &rcv); + dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(rc)), &dcboff, &dcbaddr); + if (!dcbcap || !dcbaddr || !dcboff) return 0; /* dcboff == 0 for initthd in that comp! 
*/ + t = sl_thd_aep_alloc_ext_dcb(cap_info_dci(rc), rinit, tidx, 1, owntc, 0, dcbcap, dcboff, ipiwin, ipimax, &rcv); if (!t) return 0; /* current is a sched, so copy */ ret = cos_cap_cpy(cap_info_ci(rc), cap_ci, CAP_THD, sl_thd_thdcap(t)); @@ -294,8 +351,8 @@ capmgr_aep_create_cserialized(thdid_t *tid, u32_t *tcrcvret, u32_t owntc_tidx, u cap_comminfo_init(t, ipiwin, ipimax); cap_info_thd_init(rc, t, key); *tcrcvret = (tc << 16 | rcv); - *tid = sl_thd_thdid(t); - thdcap = ret; + thdcap = ret << 16 | sl_thd_thdid(t); + *dcb = (struct cos_dcb_info *)dcbaddr; return thdcap; err: @@ -304,6 +361,32 @@ capmgr_aep_create_cserialized(thdid_t *tid, u32_t *tcrcvret, u32_t owntc_tidx, u return 0; } +int +capmgr_thd_migrate(thdid_t tid, thdcap_t tc, cpuid_t core) +{ + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = cap_info_comp_find(cur); + struct sl_thd *ti = cap_info_thd_find(rc, tid); + struct cap_comp_cpu_info *rc_cpu = NULL; + int ret; + + if (!rc || !cap_info_init_check(rc)) return -EINVAL; + if (!cap_info_is_sched(cur) || !cap_info_is_sched_core(cur, core)) return -EINVAL; + if (!ti || !sl_thd_thdcap(ti)) return -EINVAL; + rc_cpu = cap_info_cpu_local(rc); + if (tid == rc_cpu->initthdid) return -EINVAL; + + ret = cos_thd_migrate(cap_ci, sl_thd_thdcap(ti), core); + if (ret) return ret; + ret = cos_thdcap_migrate(cap_info_ci(rc), tc); + if (ret) return ret; + ret = sl_thd_migrate(tid, core); + + return ret; +} + thdcap_t capmgr_thd_retrieve_cserialized(thdid_t *inittid, int *unused, spdid_t s, thdid_t tid) { @@ -483,3 +566,47 @@ capmgr_asnd_key_create(cos_channelkey_t key) return (asndcap_t)capret; } + +int +capmgr_hw_attach(hwid_t hwid, thdid_t tid) +{ + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = 
cap_info_comp_find(cur); + struct sl_thd *ti = cap_info_thd_find(rc, tid); + + if (!rc || !cap_info_init_check(rc)) return -EINVAL; + if (!ti || !sl_thd_rcvcap(ti)) return -EINVAL; + + return cos_hw_attach(BOOT_CAPTBL_SELF_INITHW_BASE, hwid, sl_thd_rcvcap(ti)); +} + +int +capmgr_hw_periodic_attach(hwid_t hwid, thdid_t tid, unsigned int period_us) +{ + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = cap_info_comp_find(cur); + struct sl_thd *ti = cap_info_thd_find(rc, tid); + + if (period_us == 0) return -EINVAL; + if (!rc || !cap_info_init_check(rc)) return -EINVAL; + if (!ti || !sl_thd_rcvcap(ti)) return -EINVAL; + + return cos_hw_periodic_attach(BOOT_CAPTBL_SELF_INITHW_BASE, hwid, sl_thd_rcvcap(ti), period_us); +} + +int +capmgr_hw_detach(hwid_t hwid) +{ + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = cap_info_comp_find(cur); + + if (!rc || !cap_info_init_check(rc)) return -EINVAL; + + return cos_hw_detach(BOOT_CAPTBL_SELF_INITHW_BASE, hwid); +} diff --git a/src/components/implementation/capmgr/naive/init.c b/src/components/implementation/capmgr/naive/init.c index e3fcd1e0d4..0512aab8f3 100644 --- a/src/components/implementation/capmgr/naive/init.c +++ b/src/components/implementation/capmgr/naive/init.c @@ -13,6 +13,7 @@ #include #include #include +#include "spinlib.h" static volatile int capmgr_init_core_done = 0; @@ -22,13 +23,13 @@ capmgr_comp_info_init(struct cap_comp_info *rci, spdid_t spdid) struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(defci); struct cap_comp_info *btinfo = cap_info_comp_find(0); - spdid_t sched_spdid = 0; struct cap_comp_info *rci_sched = NULL; struct cap_comp_cpu_info *rci_cpu = NULL; struct sl_thd *ithd = NULL; 
u64_t chbits = 0, chschbits = 0; int ret = 0, is_sched = 0; int remain_child = 0; + spdid_t sched_spdid = 0; spdid_t childid; comp_flag_t ch_flags; struct cos_aep_info aep; @@ -38,17 +39,21 @@ capmgr_comp_info_init(struct cap_comp_info *rci, spdid_t spdid) assert(cap_info_init_check(rci)); rci_cpu = cap_info_cpu_local(rci); + sched_spdid = hypercall_comp_sched_get(spdid); if (spdid == 0 || (spdid != cos_spd_id() && cap_info_is_child(btinfo, spdid))) { is_sched = (spdid == 0 || cap_info_is_sched_child(btinfo, spdid)) ? 1 : 0; - ret = hypercall_comp_initaep_get(spdid, is_sched, &aep); - assert(ret == 0); + if (!spdid || (spdid && sched_spdid != 0)) { + ret = hypercall_comp_initaep_get(spdid, is_sched, &aep, &sched_spdid); + assert(ret == 0); + } } rci_sched = cap_info_comp_find(sched_spdid); - assert(rci_sched && cap_info_init_check(rci_sched)); + assert(rci_sched); rci_cpu->parent = rci_sched; rci_cpu->thd_used = 1; + if (cos_cpuid() != INIT_CORE) cap_info_cpu_initdcb_init(rci); while ((remain_child = hypercall_comp_child_next(spdid, &childid, &ch_flags)) >= 0) { bitmap_set(rci_cpu->child_bitmap, childid - 1); @@ -66,14 +71,41 @@ capmgr_comp_info_init(struct cap_comp_info *rci, spdid_t spdid) cap_comminfo_init(ithd, 0, 0); cap_info_initthd_init(rci, ithd, 0); } else if (cos_spd_id() == spdid) { - cap_info_initthd_init(rci, sl__globals_cpu()->sched_thd, 0); + cap_info_initthd_init(rci, sl__globals_core()->sched_thd, 0); + } else if (!sched_spdid && spdid) { + struct sl_thd *booter_thd = cap_info_initthd(btinfo); + dcbcap_t dcap; + dcboff_t off = 0; + vaddr_t addr = 0; + struct cos_compinfo *rt_ci = cap_info_ci(rci); + + dcap = cos_dcb_info_alloc(&rci_cpu->dcb_data, &off, &addr); + if (dcap) assert(off == 0 && addr); + + /* root-scheduler, TODO: rate-limiting? */ + ithd = sl_thd_initaep_alloc_dcb(cap_info_dci(rci), booter_thd, is_sched, is_sched ? 
1 : 0, 0, dcap, 0, 0); + assert(ithd); + + ret = cos_cap_cpy_at(rt_ci, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, ci, sl_thd_thdcap(ithd)); + assert(ret == 0); + if (is_sched) { + ret = cos_cap_cpy_at(rt_ci, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, ci, sl_thd_tcap(ithd)); + assert(ret == 0); + ret = cos_cap_cpy_at(rt_ci, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, ci, sl_thd_rcvcap(ithd)); + assert(ret == 0); + } + + ret = hypercall_root_initaep_set(spdid, sl_thd_aepinfo(ithd)); + assert(ret == 0); + cap_info_initthd_init(rci, ithd, 0); + cap_comminfo_init(ithd, 0, 0); } return; } static void -capmgr_comp_info_iter_cpu(void) +capmgr_comp_info_iter_core(void) { int remaining = hypercall_numcomps_get(), i; int num_comps = 0; @@ -142,8 +174,9 @@ cos_init(void) spdid_t child; comp_flag_t ch_flags; int ret = 0, i; + unsigned int cycs_per_us = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - PRINTLOG(PRINT_DEBUG, "CPU cycles per sec: %u\n", cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE)); + PRINTLOG(PRINT_DEBUG, "CPU cycles per sec: %u\n", cycs_per_us); ret = hypercall_comp_frontier_get(cos_spd_id(), &heap_frontier, &cap_frontier); assert(ret == 0); @@ -153,14 +186,17 @@ cos_init(void) BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, heap_frontier, cap_frontier); cap_info_init(); + cos_dcb_info_init_curr(); sl_init(SL_MIN_PERIOD_US); + spinlib_calib(cycs_per_us); capmgr_comp_info_iter(); } else { while (!capmgr_init_core_done) ; /* WAIT FOR INIT CORE TO BE DONE */ cos_defcompinfo_sched_init(); + cos_dcb_info_init_curr(); sl_init(SL_MIN_PERIOD_US); - capmgr_comp_info_iter_cpu(); + capmgr_comp_info_iter_core(); } assert(hypercall_comp_child_next(cos_spd_id(), &child, &ch_flags) == -1); diff --git a/src/components/implementation/capmgr/naive/spinlib.c b/src/components/implementation/capmgr/naive/spinlib.c new file mode 100644 index 0000000000..782cdc3c6f --- /dev/null +++ b/src/components/implementation/capmgr/naive/spinlib.c @@ 
-0,0 +1,110 @@ +#include "spinlib.h" +#include + +#define SPINLIB_CALIB 256 + +static u64_t spinlib_cycs_per_spin_iters = 0; +static u64_t spinlib_usecs_per_spin_iters = 0; +unsigned int spinlib_cycs_per_us = 0; +static unsigned int spinlib_init = 0; + +void spinlib_calib(unsigned int cycs_per_us) __attribute__((optimize("O0"))); +void spinlib_usecs(cycles_t usecs) __attribute__((optimize("O0"))); +void spinlib_cycles(cycles_t cycs) __attribute__((optimize("O0"))); +void spinlib_std_iters(void) __attribute__((optimize("O0"))); + +#define SPINLIB_TEST_NITEMS 4 + +static void +spinlib_calib_test(void) +{ + microsec_t test_us[SPINLIB_TEST_NITEMS] = { 1000, 2000, 3000, 4000 }; + int i; + + for (i = 0; i < SPINLIB_TEST_NITEMS; i++) { + cycles_t st, end, elapsed_cycs; + + rdtscll(st); + spinlib_usecs(test_us[i]); + rdtscll(end); + elapsed_cycs = end - st; + + PRINTC("SPIN %lluus => elapsed :%llucycs %lluus\n", test_us[i], elapsed_cycs, sl_cyc2usec(elapsed_cycs)); + } +} + +void +spinlib_std_iters(void) +{ + unsigned int i; + + for (i = 0 ; i < SPINLIB_ITERS_SPIN ; i++) { + __asm__ __volatile__("nop": : :"memory"); + } +} + +/* time taken in that loop */ +void +spinlib_calib(unsigned int cycs_per_us) +{ + cycles_t total_cycs = 0; + unsigned int iters = 0; + + if (spinlib_init) return; + spinlib_cycs_per_us = cycs_per_us; + + while (iters < SPINLIB_CALIB) { + cycles_t start, end; + + rdtscll(start); + spinlib_std_iters(); + rdtscll(end); + + total_cycs += (end - start); + iters ++; + } + + spinlib_cycs_per_spin_iters = total_cycs / SPINLIB_CALIB; + spinlib_usecs_per_spin_iters = spinlib_cycs_per_spin_iters / spinlib_cycs_per_us; + + spinlib_init = 0; + printc("Spin calibration: ITERS:%u Cycs/ITERS:%llu usecs/ITERS:%llu\n", + SPINLIB_ITERS_SPIN, spinlib_cycs_per_spin_iters, spinlib_usecs_per_spin_iters); + spinlib_calib_test(); +} + +void +spinlib_cycles(cycles_t cycs) +{ + unsigned int i = 0; + unsigned int iters = cycs / spinlib_cycs_per_spin_iters; + unsigned int left = 
cycs % spinlib_cycs_per_spin_iters; + + assert(cycs >= spinlib_cycs_per_spin_iters); + + /* round off to next cycs/spin */ + if (left >= (spinlib_cycs_per_spin_iters / 2)) iters ++; + + while (i < iters) { + spinlib_std_iters(); + i ++; + } +} + +void +spinlib_usecs(cycles_t usecs) +{ + unsigned int i = 0; + unsigned int iters = usecs / spinlib_usecs_per_spin_iters; + unsigned int left = usecs % spinlib_usecs_per_spin_iters; + + assert(usecs >= spinlib_usecs_per_spin_iters); + + /* round off to next usec */ + if (left >= (spinlib_usecs_per_spin_iters / 2)) iters ++; + + while (i < iters) { + spinlib_std_iters(); + i ++; + } +} diff --git a/src/components/implementation/capmgr/naive/spinlib.h b/src/components/implementation/capmgr/naive/spinlib.h new file mode 100644 index 0000000000..6c477fc48c --- /dev/null +++ b/src/components/implementation/capmgr/naive/spinlib.h @@ -0,0 +1,20 @@ +#ifndef SPINLIB_H +#define SPINLIB_H + +#include +#include +#include + +/* + * this is probably the trickiest thing to configure and + * the accuracy of the workgen depends very much on this. 
+ */ +#define SPINLIB_ITERS_SPIN (51000) + +extern unsigned int spinlib_cycs_per_us; + +extern void spinlib_calib(unsigned int cycs_per_us); +extern void spinlib_usecs(cycles_t usecs); +extern void spinlib_cycles(cycles_t cycs); + +#endif /* SPINLIB_H */ diff --git a/src/components/implementation/capmgr/naive/work.c b/src/components/implementation/capmgr/naive/work.c new file mode 100644 index 0000000000..ffd63ca16a --- /dev/null +++ b/src/components/implementation/capmgr/naive/work.c @@ -0,0 +1,38 @@ +#include +#include +#include "spinlib.h" + +int +work_cycs_cserialized(unsigned long *hielapsed, unsigned long *loelapsed, unsigned long hi_cycs, unsigned long lo_cycs) +{ + cycles_t st, end, elapsed, cycs_input = (((cycles_t)hi_cycs << 32) | (cycles_t)lo_cycs); + + rdtscll(st); + spinlib_cycles(cycs_input); + rdtscll(end); + elapsed = end - st; + + *hielapsed = (elapsed >> 32); + *loelapsed = ((elapsed << 32) >> 32); + + return 0; +} + +int +work_usecs_cserialized(unsigned long *hielapsed, unsigned long *loelapsed, unsigned long hi_us, unsigned long lo_us) +{ + cycles_t st, end; + microsec_t elapsed, usecs_input = (((microsec_t)hi_us << 32) | (microsec_t)lo_us); + + rdtscll(st); + spinlib_usecs(usecs_input); + rdtscll(end); + /* perhaps use spinlib to return the elapsed or use sl.. 
*/ + elapsed = sl_cyc2usec(end - st); + + *hielapsed = (elapsed >> 32); + *loelapsed = ((elapsed << 32) >> 32); + + return 0; + +} diff --git a/src/components/implementation/no_interface/llbooter/boot_deps.h b/src/components/implementation/no_interface/llbooter/boot_deps.h index e1140221e9..8c2fab7cbe 100644 --- a/src/components/implementation/no_interface/llbooter/boot_deps.h +++ b/src/components/implementation/no_interface/llbooter/boot_deps.h @@ -29,14 +29,17 @@ struct comp_sched_info { /* The booter uses this to keep track of each comp */ struct comp_cap_info { - struct cos_defcompinfo def_cinfo; - struct usr_inv_cap ST_user_caps[INTERFACE_UNDEF_SYMBS]; - vaddr_t vaddr_user_caps; /* vaddr of user caps table in comp */ - vaddr_t addr_start; - vaddr_t vaddr_mapped_in_booter; - vaddr_t upcall_entry; - u32_t cpu_bitmap[NUM_CPU_BMP_WORDS]; - struct comp_sched_info *schedinfo[NUM_CPU]; + struct cos_defcompinfo def_cinfo; + struct usr_inv_cap ST_user_caps[INTERFACE_UNDEF_SYMBS]; + vaddr_t vaddr_user_caps; /* vaddr of user caps table in comp */ + vaddr_t addr_start; + vaddr_t vaddr_mapped_in_booter; + vaddr_t upcall_entry; + vaddr_t initdcbpgs; + u32_t cpu_bitmap[NUM_CPU_BMP_WORDS]; + struct comp_sched_info *schedinfo[NUM_CPU]; + struct cos_component_information *cobj_info; + scbcap_t scbcap; } new_comp_cap_info[MAX_NUM_SPDS]; int schedule[NUM_CPU][MAX_NUM_SPDS]; @@ -56,6 +59,14 @@ boot_spd_comp_schedinfo_curr_get(void) return &comp_schedinfo[cos_cpuid()][0]; } +static inline struct cos_component_information * +boot_spd_comp_cobj_info_get(spdid_t spdid) +{ + assert(spdid && spdid <= MAX_NUM_SPDS); + + return boot_spd_compcapinfo_get(spdid)->cobj_info; +} + static inline struct comp_sched_info * boot_spd_comp_schedinfo_get(spdid_t spdid) { @@ -147,8 +158,8 @@ boot_capmgr_mem_alloc(void) void boot_comp_mem_alloc(spdid_t spdid) { - struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); - struct cos_compinfo *boot_info = boot_spd_compinfo_curr_get(); + struct 
cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); + struct cos_compinfo *boot_info = boot_spd_compinfo_curr_get(); unsigned long mem_sz = capmgr_spdid ? CAPMGR_MIN_UNTYPED_SZ : LLBOOT_NEWCOMP_UNTYPED_SZ; if (capmgr_spdid) return; @@ -161,14 +172,14 @@ boot_compinfo_init(spdid_t spdid, captblcap_t *ct, pgtblcap_t *pt, u32_t heap_st { struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); struct cos_compinfo *boot_info = boot_spd_compinfo_curr_get(); + struct comp_cap_info *spdinfo = boot_spd_compcapinfo_get(spdid); - *ct = cos_captbl_alloc(boot_info); + *ct = cos_captbl_alloc(boot_info); assert(*ct); - *pt = cos_pgtbl_alloc(boot_info); + *pt = cos_pgtbl_alloc(boot_info); assert(*pt); cos_compinfo_init(compinfo, *pt, *ct, 0, (vaddr_t)heap_start_vaddr, BOOT_CAPTBL_FREE, boot_info); - /* * if this is a capmgr, let it manage its share (ideally rest of system memory) of memory. * if there is no capmgr in the system, allow every component to manage its memory. @@ -190,8 +201,8 @@ boot_newcomp_sinv_alloc(spdid_t spdid) int i = 0; int intr_spdid; void *user_cap_vaddr; - struct cos_compinfo *interface_compinfo; - struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); + struct cos_compinfo *interface_compinfo; + struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); struct comp_cap_info *spdinfo = boot_spd_compcapinfo_get(spdid); /* TODO: Purge rest of booter of spdid convention */ invtoken_t token = (invtoken_t)spdid; @@ -241,8 +252,14 @@ boot_newcomp_defcinfo_init(spdid_t spdid) struct cos_compinfo *child_ci = boot_spd_compinfo_get(spdid); struct cos_compinfo *boot_info = boot_spd_compinfo_curr_get(); struct comp_sched_info *spdsi = boot_spd_comp_schedinfo_get(spdid); + struct comp_cap_info *spdinfo = boot_spd_compcapinfo_get(spdid); + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; - child_aep->thd = cos_initthd_alloc(boot_info, child_ci->comp_cap); + dcbcap = cos_dcb_alloc(boot_info, child_ci->pgtbl_cap, spdinfo->initdcbpgs + cos_cpuid() * 
PAGE_SIZE); + assert(dcbcap); + + child_aep->thd = cos_initthd_alloc(boot_info, child_ci->comp_cap, dcbcap); assert(child_aep->thd); if (spdsi->flags & COMP_FLAG_SCHED) { @@ -266,11 +283,8 @@ boot_comp_sched_set(spdid_t spdid) struct cos_aep_info *child_aep = boot_spd_initaep_get(spdid); int i = 0; - /* capmgr init only on boot core! */ if (!capmgr_spdid) goto set; - /* - * if there is capmgr in the system, set it to be the first (index == 0) to initialize - */ + /* if there is capmgr in the system, set it to be the first (index == 0) to initialize */ if (spdid == capmgr_spdid) goto done; i = 1; @@ -291,8 +305,8 @@ boot_sched_caps_init(spdid_t spdid) struct cos_aep_info *child_aep = boot_spd_initaep_get(spdid); int ret, i; - /* If booter should create the init caps in that component */ - if (compsi->parent_spdid) return; + /* booter uses capmgr to create initthds in root-schedulers */ + if (compsi->parent_spdid || (capmgr_spdid && spdid != capmgr_spdid)) return; boot_newcomp_defcinfo_init(spdid); ret = cos_cap_cpy_at(ci, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, boot_info, child_aep->thd); @@ -360,6 +374,8 @@ boot_newcomp_create(spdid_t spdid, struct cos_compinfo *comp_info) struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); struct cos_compinfo *boot_info = boot_spd_compinfo_curr_get(); struct comp_cap_info *spdinfo = boot_spd_compcapinfo_get(spdid); + struct cos_component_information *cobj_info = boot_spd_comp_cobj_info_get(spdid); + struct comp_sched_info *spdsi = boot_spd_comp_schedinfo_get(spdid); captblcap_t ct = compinfo->captbl_cap; pgtblcap_t pt = compinfo->pgtbl_cap; compcap_t cc; @@ -368,8 +384,31 @@ boot_newcomp_create(spdid_t spdid, struct cos_compinfo *comp_info) int i = 0; invtoken_t token = (invtoken_t)spdid; int ret; + vaddr_t scb_uaddr = 0; + scbcap_t scbcap = 0; + + if (spdsi->flags & COMP_FLAG_SCHED) { + scbcap = cos_scb_alloc(boot_info); + assert(scbcap); + spdinfo->scbcap = scbcap; + scb_uaddr = cos_page_bump_intern_valloc(compinfo, 
COS_SCB_SIZE); + assert(scb_uaddr); + } else if (spdsi->parent_spdid) { + struct comp_cap_info *psi = boot_spd_compcapinfo_get(spdsi->parent_spdid); + scbcap = psi->scbcap; + } - cc = cos_comp_alloc(boot_info, ct, pt, (vaddr_t)spdinfo->upcall_entry); + if (spdinfo->initdcbpgs == 0) { + vaddr_t dcbaddr = 0; + + dcbaddr = cos_page_bump_intern_valloc(compinfo, NUM_CPU * PAGE_SIZE); + assert(dcbaddr); + + spdinfo->initdcbpgs = dcbaddr; + } + + /* scb info created on compinfo_init */ + cc = cos_comp_alloc(boot_info, ct, pt, scbcap, (vaddr_t)spdinfo->upcall_entry, scb_uaddr); assert(cc); compinfo->comp_cap = cc; @@ -394,7 +433,9 @@ boot_bootcomp_init(void) if (first_time) { first_time = 0; cos_meminfo_init(&(boot_info->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_defcompinfo_init(); + cos_defcompinfo_init_ext(BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, + BOOT_CAPTBL_SELF_COMP, (vaddr_t)cos_get_heap_ptr(), LLBOOT_CAPTBL_FREE); } else { cos_defcompinfo_sched_init(); } @@ -402,6 +443,21 @@ boot_bootcomp_init(void) bootsi->flags |= COMP_FLAG_SCHED; } +static void +boot_root_sched_transfer(void) +{ + struct cos_aep_info *root_aep = NULL; + int ret; + + if (!root_spdid[cos_cpuid()]) return; + + root_aep = boot_spd_initaep_get(root_spdid[cos_cpuid()]); + + PRINTLOG(PRINT_DEBUG, "Root scheduler is %u, transferring INF budget now!\n", root_spdid[cos_cpuid()]); + ret = cos_tcap_transfer(root_aep->rcv, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, LLBOOT_ROOTSCHED_PRIO); + assert(ret == 0); +} + static void boot_done(void) { @@ -415,7 +471,6 @@ void boot_root_sched_run(void) { struct cos_aep_info *root_aep = NULL; - int ret; if (!root_spdid[cos_cpuid()]) { PRINTLOG(PRINT_WARN, "No root scheduler!\n"); @@ -426,10 +481,7 @@ boot_root_sched_run(void) root_aep = boot_spd_initaep_get(root_spdid[cos_cpuid()]); PRINTLOG(PRINT_DEBUG, "Root 
scheduler is %u, switching to it now!\n", root_spdid[cos_cpuid()]); - ret = cos_tcap_transfer(root_aep->rcv, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, LLBOOT_ROOTSCHED_PRIO); - assert(ret == 0); - - ret = cos_switch(root_aep->thd, root_aep->tc, LLBOOT_ROOTSCHED_PRIO, TCAP_TIME_NIL, 0, cos_sched_sync()); + cos_switch(root_aep->thd, root_aep->tc, LLBOOT_ROOTSCHED_PRIO, TCAP_TIME_NIL, 0, cos_sched_sync()); PRINTLOG(PRINT_ERROR, "Root scheduler returned.\n"); assert(0); } @@ -512,7 +564,18 @@ boot_comp_cap_cpy_at(spdid_t dstid, capid_t dstslot, spdid_t srcid, cap_t captyp } static inline int -boot_comp_initaep_get(spdid_t dstid, spdid_t srcid, thdcap_t thdslot, arcvcap_t rcvslot, tcap_t tcslot) +boot_comp_sched_get(spdid_t dstid, spdid_t srcid) +{ + struct comp_sched_info *si = NULL; + + if (srcid > num_cobj || dstid > num_cobj) return -EINVAL; + si = boot_spd_comp_schedinfo_get(srcid); + + return si->parent_spdid; +} + +static inline int +boot_comp_initaep_get(spdid_t dstid, spdid_t srcid, thdcap_t thdslot, arcvcap_t rcvslot, tcap_t tcslot, spdid_t *parent) { struct comp_sched_info *si = NULL; int ret = -1; @@ -531,10 +594,48 @@ boot_comp_initaep_get(spdid_t dstid, spdid_t srcid, thdcap_t thdslot, arcvcap_t if (ret) goto done; ret = boot_comp_cap_cpy_at(dstid, tcslot, srcid, CAP_TCAP); + *parent = si->parent_spdid; + done: return ret; } +static inline int +boot_root_initaep_set(spdid_t dstid, spdid_t srcid, thdcap_t thd, arcvcap_t rcv, tcap_t tc) +{ + struct comp_sched_info *si = NULL; + struct cos_aep_info *a = NULL; + struct cos_compinfo *b = cos_compinfo_get(cos_defcompinfo_curr_get()), *c = boot_spd_compinfo_get(dstid); + + if (srcid > num_cobj || dstid > num_cobj) return -EINVAL; + if (!thd) return -EINVAL; + + si = boot_spd_comp_schedinfo_get(srcid); + if (si->parent_spdid != 0) return -EINVAL; + + a = boot_spd_initaep_get(srcid); + if (!a) return -EINVAL; + + a->thd = cos_cap_cpy(b, c, CAP_THD, thd); + assert(a->thd); + if ((si->flags & COMP_FLAG_SCHED) 
== 0) { + assert(!tc && !rcv); + goto done; + } + if (!rcv || !tc) return -EINVAL; + + a->tc = cos_cap_cpy(b, c, CAP_TCAP, tc); + assert(a->tc); + a->rcv = cos_cap_cpy(b, c, CAP_ARCV, rcv); + assert(a->rcv); + if (root_spdid[cos_cpuid()] == srcid) boot_root_sched_transfer(); + +done: + boot_comp_sched_set(srcid); + + return 0; +} + static inline int boot_comp_info_get(spdid_t dstid, spdid_t srcid, pgtblcap_t ptslot, captblcap_t ctslot, compcap_t compslot, spdid_t *parentid) { @@ -706,9 +807,24 @@ hypercall_entry(word_t *ret2, word_t *ret3, int op, word_t arg3, word_t arg4, wo thdcap_t thdslot = (arg3 << 16) >> 16; tcap_t tcslot = (arg4 << 16) >> 16;; arcvcap_t rcvslot = arg4 >> 16; + spdid_t parent_spdid = 0; + + if (!__hypercall_resource_access_check(client, srcid, 0)) return -EACCES; + ret1 = boot_comp_initaep_get(client, srcid, thdslot, rcvslot, tcslot, &parent_spdid); + + *ret2 = (word_t)parent_spdid; + + break; + } + case HYPERCALL_ROOT_INITAEP_SET: + { + spdid_t srcid = arg3 >> 16; + thdcap_t thd = (arg3 << 16) >> 16; + tcap_t tc = (arg4 << 16) >> 16; + arcvcap_t rcv = arg4 >> 16; if (!__hypercall_resource_access_check(client, srcid, 0)) return -EACCES; - ret1 = boot_comp_initaep_get(client, srcid, thdslot, rcvslot, tcslot); + ret1 = boot_root_initaep_set(client, srcid, thd, rcv, tc); break; } @@ -756,26 +872,21 @@ hypercall_entry(word_t *ret2, word_t *ret3, int op, word_t arg3, word_t arg4, wo break; } - case HYPERCALL_COMP_CAPFRONTIER_GET: + case HYPERCALL_COMP_CPUBITMAP_GET: { - vaddr_t vasfr; - capid_t capfr; spdid_t srcid = arg3; if (!__hypercall_resource_access_check(client, srcid, 1)) return -EACCES; - ret1 = boot_comp_frontier_get(client, srcid, &vasfr, &capfr); - if (ret1) goto done; - - *ret2 = vasfr; + ret1 = boot_comp_cpubitmap_get(srcid, (u32_t *)ret2, (u32_t *)ret3); break; } - case HYPERCALL_COMP_CPUBITMAP_GET: + case HYPERCALL_COMP_SCHED_GET: { spdid_t srcid = arg3; if (!__hypercall_resource_access_check(client, srcid, 1)) return -EACCES; - 
ret1 = boot_comp_cpubitmap_get(srcid, (u32_t *)ret2, (u32_t *)ret3); + ret1 = boot_comp_sched_get(client, srcid); break; } diff --git a/src/components/implementation/no_interface/llbooter/llbooter.c b/src/components/implementation/no_interface/llbooter/llbooter.c index e60d732370..0d0acebe6b 100644 --- a/src/components/implementation/no_interface/llbooter/llbooter.c +++ b/src/components/implementation/no_interface/llbooter/llbooter.c @@ -222,15 +222,17 @@ boot_comp_map_populate(struct cobj_header *h, spdid_t spdid, vaddr_t comp_info) } if (sect->flags & COBJ_SECT_CINFO) { + int k; + assert((left % PAGE_SIZE) == 0); assert(comp_info == (dest_daddr + (((left/PAGE_SIZE)-1)*PAGE_SIZE))); boot_process_cinfo(h, spdid, boot_spd_end(h), start_addr + (comp_info - init_daddr), comp_info); ci = (struct cos_component_information *)(start_addr + (comp_info - init_daddr)); + spdinfo->cobj_info = ci; hinfo = boot_spd_compcapinfo_get(h->id); hinfo->upcall_entry = ci->cos_upcall_entry; } - } return 0; @@ -466,7 +468,7 @@ cos_init(void) if (cos_cpuid() == INIT_CORE) { capmgr_spdid = 0; - memset(root_spdid, 0, sizeof(int) * NUM_CPU); + memset(root_spdid, 0, sizeof(spdid_t) * NUM_CPU); memset(new_comp_cap_info, 0, sizeof(struct comp_cap_info) * (MAX_NUM_SPDS)); h = (struct cobj_header *)cos_comp_info.cos_poly[0]; diff --git a/src/components/implementation/no_interface/omp_dijkstra/Makefile b/src/components/implementation/no_interface/omp_dijkstra/Makefile new file mode 100644 index 0000000000..a702328c38 --- /dev/null +++ b/src/components/implementation/no_interface/omp_dijkstra/Makefile @@ -0,0 +1,10 @@ +COMPONENT=omp_dijkstra.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp diff --git 
a/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c b/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c new file mode 100644 index 0000000000..4eb5375c3c --- /dev/null +++ b/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c @@ -0,0 +1,564 @@ +#include +#include +#include +#include + +# define NV 6 + +//int main ( int argc, char **argv ); +int *dijkstra_distance ( int ohd[NV][NV] ); +void find_nearest ( int s, int e, int mind[NV], int connected[NV], int *d, + int *v ); +void init ( int ohd[NV][NV] ); +void timestamp ( void ); +void update_mind ( int s, int e, int mv, int connected[NV], int ohd[NV][NV], + int mind[NV] ); + +/******************************************************************************/ + +int main ( void )//int argc, char **argv ) + +/******************************************************************************/ +/* + Purpose: + + MAIN runs an example of Dijkstra's minimum distance algorithm. + + Discussion: + + Given the distance matrix that defines a graph, we seek a list + of the minimum distances between node 0 and all other nodes. + + This program sets up a small example problem and solves it. + + The correct minimum distances are: + + 0 35 15 45 49 41 + + Licensing: + + This code is distributed under the GNU LGPL license. + + Modified: + + 01 July 2010 + + Author: + + Original C version by Norm Matloff, CS Dept, UC Davis. + This C version by John Burkardt. 
+*/ +{ + int i; + int i4_huge = 2147483647; + int j; + int *mind; + int ohd[NV][NV]; + + timestamp ( ); + PRINTC ( "\n" ); + PRINTC ( "DIJKSTRA_OPENMP\n" ); + PRINTC ( " C version\n" ); + PRINTC ( " Use Dijkstra's algorithm to determine the minimum\n" ); + PRINTC ( " distance from node 0 to each node in a graph,\n" ); + PRINTC ( " given the distances between each pair of nodes.\n" ); + PRINTC ( "\n" ); + PRINTC ( " Although a very small example is considered, we\n" ); + PRINTC ( " demonstrate the use of OpenMP directives for\n" ); + PRINTC ( " parallel execution.\n" ); +/* + Initialize the problem data. +*/ + init ( ohd ); +/* + Print the distance matrix. +*/ + PRINTC ( "\n" ); + PRINTC ( " Distance matrix:\n" ); + PRINTC ( "\n" ); + for ( i = 0; i < NV; i++ ) + { + for ( j = 0; j < NV; j++ ) + { + if ( ohd[i][j] == i4_huge ) + { + PRINTC ( " Inf" ); + } + else + { + PRINTC ( " %3d", ohd[i][j] ); + } + } + PRINTC ( "\n" ); + } +/* + Carry out the algorithm. +*/ + mind = dijkstra_distance ( ohd ); +/* + Print the results. +*/ + PRINTC ( "\n" ); + PRINTC ( " Minimum distances from node 0:\n"); + PRINTC ( "\n" ); + for ( i = 0; i < NV; i++ ) + { + PRINTC ( " %2d %2d\n", i, mind[i] ); + } +/* + Free memory. +*/ + free ( mind ); +/* + Terminate. +*/ + PRINTC ( "\n" ); + PRINTC ( "DIJKSTRA_OPENMP\n" ); + PRINTC ( " Normal end of execution.\n" ); + + PRINTC ( "\n" ); + timestamp ( ); + + return 0; +} +/******************************************************************************/ + +int *dijkstra_distance ( int ohd[NV][NV] ) + +/******************************************************************************/ +/* + Purpose: + + DIJKSTRA_DISTANCE uses Dijkstra's minimum distance algorithm. + + Discussion: + + We essentially build a tree. We start with only node 0 connected + to the tree, and this is indicated by setting CONNECTED[0] = 1. + + We initialize MIND[I] to the one step distance from node 0 to node I. 
+ + Now we search among the unconnected nodes for the node MV whose minimum + distance is smallest, and connect it to the tree. For each remaining + unconnected node I, we check to see whether the distance from 0 to MV + to I is less than that recorded in MIND[I], and if so, we can reduce + the distance. + + After NV-1 steps, we have connected all the nodes to 0, and computed + the correct minimum distances. + + Licensing: + + This code is distributed under the GNU LGPL license. + + Modified: + + 02 July 2010 + + Author: + + Original C version by Norm Matloff, CS Dept, UC Davis. + This C version by John Burkardt. + + Parameters: + + Input, int OHD[NV][NV], the distance of the direct link between + nodes I and J. + + Output, int DIJKSTRA_DISTANCE[NV], the minimum distance from + node 0 to each node. +*/ +{ + int *connected; + int i; + int i4_huge = 2147483647; + int md; + int *mind; + int mv; + int my_first; + int my_id; + int my_last; + int my_md; + int my_mv; + int my_step; + int nth; +/* + Start out with only node 0 connected to the tree. +*/ + connected = ( int * ) malloc ( NV * sizeof ( int ) ); + + connected[0] = 1; + for ( i = 1; i < NV; i++ ) + { + connected[i] = 0; + } +/* + Initial estimate of minimum distance is the 1-step distance. +*/ + mind = ( int * ) malloc ( NV * sizeof ( int ) ); + + for ( i = 0; i < NV; i++ ) + { + mind[i] = ohd[0][i]; + } +/* + Begin the parallel region. +*/ + # pragma omp parallel private ( my_first, my_id, my_last, my_md, my_mv, my_step ) \ + shared ( connected, md, mind, mv, nth, ohd ) + { + my_id = omp_get_thread_num ( ); + nth = omp_get_num_threads ( ); + my_first = ( my_id * NV ) / nth; + my_last = ( ( my_id + 1 ) * NV ) / nth - 1; +/* + The SINGLE directive means that the block is to be executed by only + one thread, and that thread will be whichever one gets here first. 
+*/ + # pragma omp single + { + PRINTC ( "\n" ); + PRINTC ( " P%d: Parallel region begins with %d threads\n", my_id, nth ); + PRINTC ( "\n" ); + } + PRINTC ( " P%d: First=%d Last=%d\n", my_id, my_first, my_last ); + + for ( my_step = 1; my_step < NV; my_step++ ) + { +/* + Before we compare the results of each thread, set the shared variable + MD to a big value. Only one thread needs to do this. +*/ + # pragma omp single + { + md = i4_huge; + mv = -1; + } +/* + Each thread finds the nearest unconnected node in its part of the graph. + Some threads might have no unconnected nodes left. +*/ + find_nearest ( my_first, my_last, mind, connected, &my_md, &my_mv ); +/* + In order to determine the minimum of all the MY_MD's, we must insist + that only one thread at a time execute this block! +*/ + # pragma omp critical + { + if ( my_md < md ) + { + md = my_md; + mv = my_mv; + } + } +/* + This barrier means that ALL threads have executed the critical + block, and therefore MD and MV have the correct value. Only then + can we proceed. +*/ + # pragma omp barrier +/* + If MV is -1, then NO thread found an unconnected node, so we're done early. + OpenMP does not like to BREAK out of a parallel region, so we'll just have + to let the iteration run to the end, while we avoid doing any more updates. + + Otherwise, we connect the nearest node. +*/ + # pragma omp single + { + if ( mv != - 1 ) + { + connected[mv] = 1; + PRINTC ( " P%d: Connecting node %d.\n", my_id, mv ); + } + } +/* + Again, we don't want any thread to proceed until the value of + CONNECTED is updated. +*/ + # pragma omp barrier +/* + Now each thread should update its portion of the MIND vector, + by checking to see whether the trip from 0 to MV plus the step + from MV to a node is closer than the current record. 
+*/ + if ( mv != -1 ) + { + update_mind ( my_first, my_last, mv, connected, ohd, mind ); + } +/* + Before starting the next step of the iteration, we need all threads + to complete the updating, so we set a BARRIER here. +*/ + #pragma omp barrier + } +/* + Once all the nodes have been connected, we can exit. +*/ + # pragma omp single + { + PRINTC ( "\n" ); + PRINTC ( " P%d: Exiting parallel region.\n", my_id ); + } + } + + free ( connected ); + + return mind; +} +/******************************************************************************/ + +void find_nearest ( int s, int e, int mind[NV], int connected[NV], int *d, + int *v ) + +/******************************************************************************/ +/* + Purpose: + + FIND_NEAREST finds the nearest unconnected node. + + Licensing: + + This code is distributed under the GNU LGPL license. + + Modified: + + 02 July 2010 + + Author: + + Original C version by Norm Matloff, CS Dept, UC Davis. + This C version by John Burkardt. + + Parameters: + + Input, int S, E, the first and last nodes that are to be checked. + + Input, int MIND[NV], the currently computed minimum distance from + node 0 to each node. + + Input, int CONNECTED[NV], is 1 for each connected node, whose + minimum distance to node 0 has been determined. + + Output, int *D, the distance from node 0 to the nearest unconnected + node in the range S to E. + + Output, int *V, the index of the nearest unconnected node in the range + S to E. +*/ +{ + int i; + int i4_huge = 2147483647; + + *d = i4_huge; + *v = -1; + + for ( i = s; i <= e; i++ ) + { + if ( !connected[i] && ( mind[i] < *d ) ) + { + *d = mind[i]; + *v = i; + } + } + return; +} +/******************************************************************************/ + +void init ( int ohd[NV][NV] ) + +/******************************************************************************/ +/* + Purpose: + + INIT initializes the problem data. 
+ + Discussion: + + The graph uses 6 nodes, and has the following diagram and + distance matrix: + + N0--15--N2-100--N3 0 40 15 Inf Inf Inf + \ | / 40 0 20 10 25 6 + \ | / 15 20 0 100 Inf Inf + 40 20 10 Inf 10 100 0 Inf Inf + \ | / Inf 25 Inf Inf 0 8 + \ | / Inf 6 Inf Inf 8 0 + N1 + / \ + / \ + 6 25 + / \ + / \ + N5----8-----N4 + + Licensing: + + This code is distributed under the GNU LGPL license. + + Modified: + + 02 July 2010 + + Author: + + Original C version by Norm Matloff, CS Dept, UC Davis. + This C version by John Burkardt. + + Parameters: + + Output, int OHD[NV][NV], the distance of the direct link between + nodes I and J. +*/ +{ + int i; + int i4_huge = 2147483647; + int j; + + for ( i = 0; i < NV; i++ ) + { + for ( j = 0; j < NV; j++ ) + { + if ( i == j ) + { + ohd[i][i] = 0; + } + else + { + ohd[i][j] = i4_huge; + } + } + } + ohd[0][1] = ohd[1][0] = 40; + ohd[0][2] = ohd[2][0] = 15; + ohd[1][2] = ohd[2][1] = 20; + ohd[1][3] = ohd[3][1] = 10; + ohd[1][4] = ohd[4][1] = 25; + ohd[2][3] = ohd[3][2] = 100; + ohd[1][5] = ohd[5][1] = 6; + ohd[4][5] = ohd[5][4] = 8; + + return; +} +/******************************************************************************/ + +void timestamp ( void ) + +/******************************************************************************/ +/* + Purpose: + + TIMESTAMP prints the current YMDHMS date as a time stamp. + + Example: + + 31 May 2001 09:45:54 AM + + Licensing: + + This code is distributed under the GNU LGPL license. 
+ + Modified: + + 24 September 2003 + + Author: + + John Burkardt + + Parameters: + + None +*/ +{ +#if 0 +# define TIME_SIZE 40 + + static char time_buffer[TIME_SIZE]; + const struct tm *tm; + time_t now; + + now = time ( NULL ); + tm = localtime ( &now ); + + strftime ( time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm ); + + PRINTC ( "%s\n", time_buffer ); + + return; +# undef TIME_SIZE +#else + cycles_t now; + + rdtscll(now); + PRINTC("%llu\n", now); +#endif +} +/******************************************************************************/ + +void update_mind ( int s, int e, int mv, int connected[NV], int ohd[NV][NV], + int mind[NV] ) + +/******************************************************************************/ +/* + Purpose: + + UPDATE_MIND updates the minimum distance vector. + + Discussion: + + We've just determined the minimum distance to node MV. + + For each unconnected node I in the range S to E, + check whether the route from node 0 to MV to I is shorter + than the currently known minimum distance. + + Licensing: + + This code is distributed under the GNU LGPL license. + + Modified: + + 02 July 2010 + + Author: + + Original C version by Norm Matloff, CS Dept, UC Davis. + This C version by John Burkardt. + + Parameters: + + Input, int S, E, the first and last nodes that are to be checked. + + Input, int MV, the node whose minimum distance to node 0 + has just been determined. + + Input, int CONNECTED[NV], is 1 for each connected node, whose + minimum distance to node 0 has been determined. + + Input, int OHD[NV][NV], the distance of the direct link between + nodes I and J. + + Input/output, int MIND[NV], the currently computed minimum distances + from node 0 to each node. On output, the values for nodes S through + E have been updated. 
+*/ +{ + int i; + int i4_huge = 2147483647; + + for ( i = s; i <= e; i++ ) + { + if ( !connected[i] ) + { + if ( ohd[mv][i] < i4_huge ) + { + if ( mind[mv] + ohd[mv][i] < mind[i] ) + { + mind[i] = mind[mv] + ohd[mv][i]; + } + } + } + } + return; +} diff --git a/src/components/implementation/no_interface/omp_dijkstra/init.c b/src/components/implementation/no_interface/omp_dijkstra/init.c new file mode 120000 index 0000000000..b2694bf833 --- /dev/null +++ b/src/components/implementation/no_interface/omp_dijkstra/init.c @@ -0,0 +1 @@ +../omp_hello/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c b/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c new file mode 100644 index 0000000000..41c8507068 --- /dev/null +++ b/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c @@ -0,0 +1,126 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * hack for more memory using the most insecure feature in composite: + * map random physical addresses to virtual addresses and do whatever with it! + */ +#define START_PHY round_up_to_page(0x00100000 + COS_PHYMEM_MAX_SZ + PAGE_SIZE) +#define PHY_MAX ((512 * 1024 * 1024) + (256 * 1024 * 1024)) + +static unsigned free_phy_offset = 0; + +void * +__alloc_memory(size_t sz) +{ + void *va = NULL; + struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); + //unsigned off = ps_faa(&free_phy_offset, sz); + unsigned off; + +try_again: + off = ps_load(&free_phy_offset); + + /* + * first use physical memory hack and + * if we run out, then use heap alloc so + * we don't run out of standard memory first + */ + if (off > PHY_MAX || off + sz > PHY_MAX) { + va = cos_page_bump_allocn(ci, round_up_to_page(sz)); + } else { + if (!ps_cas(&free_phy_offset, off, off + sz)) goto try_again; + /* use physical memory hack! 
*/ + va = cos_hw_map(ci, BOOT_CAPTBL_SELF_INITHW_BASE, START_PHY + off, sz); + } + + assert(va); + memset(va, 0, sz); + + return va; +} + +//#include + +// HACK: The hack to end all hacks +void * +cos_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) +{ + void *ret=0; + + if (addr != NULL) { + printc("parameter void *addr is not supported!\n"); + errno = ENOTSUP; + return MAP_FAILED; + } + if (fd != -1) { + printc("file mapping is not supported!\n"); + errno = ENOTSUP; + return MAP_FAILED; + } + + //addr = (void *)memmgr_heap_page_allocn(pages); + addr = __alloc_memory(length); +// addr = (void *)cos_page_bump_allocn(cos_compinfo_get(cos_defcompinfo_curr_get()), round_up_to_page(length)); + if (!addr){ + ret = (void *) -1; + } else { + ret = addr; + } + + if (ret == (void *)-1) { /* return value comes from man page */ + printc("mmap() failed!\n"); + /* This is a best guess about what went wrong */ + errno = ENOMEM; + } + return ret; +} + +long +cos_syscall_handler(int syscall_num, long a, long b, long c, long d, long e, long f, long g) +{ + if (syscall_num == __NR_clock_gettime) { + microsec_t microseconds = ps_tsc() / cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + time_t seconds = microseconds / 1000000; + long rest = microseconds % 1000000; + + *((struct timespec *)b) = (struct timespec) {seconds, rest}; + return 0; + } + + if (syscall_num == __NR_mmap || syscall_num == __NR_mmap2) { + return (long)cos_mmap((void *)a, (size_t)b, (int)c, (int)d, (int)e, (off_t)f); + } + + if (syscall_num == __NR_brk || syscall_num == __NR_munmap) { + return 0; + } + + printc("Unimplemented syscall number %d\n", syscall_num); + assert(0); + return 0; +} + +// Hack around thread local data +static int cancelstate = 0; + +int +pthread_setcancelstate(int new, int *old) +{ + if (new > 2) return EINVAL; + + if (old) *old = cancelstate; + cancelstate = new; + return 0; +} diff --git a/src/components/implementation/no_interface/omp_fft_bots/Makefile 
b/src/components/implementation/no_interface/omp_fft_bots/Makefile new file mode 100644 index 0000000000..c5d7dddf99 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/Makefile @@ -0,0 +1,19 @@ +COMPONENT=omp_fft_bots.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp +# if tied tasks are required +CFLAGS += -DFORCE_TIED_TASKS + +OMPC_FINAL_FLAGS= + +# one per compilation or none +#CFLAGS += -DMANUAL_CUTOFF +#CFLAGS += -DIF_CUTOFF +#CFLAGS += -DFINAL_CUTOFF $(OMPC_FINAL_FLAGS) diff --git a/src/components/implementation/no_interface/omp_fft_bots/app-desc.h b/src/components/implementation/no_interface/omp_fft_bots/app-desc.h new file mode 100644 index 0000000000..d31b29104e --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/app-desc.h @@ -0,0 +1,56 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "omp-tasks-app.h" +#include "fft.h" + +#define BOTS_APP_NAME "FFT" +#define BOTS_APP_PARAMETERS_DESC "Size=%d" +#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size + +#define BOTS_APP_USES_ARG_SIZE +#define BOTS_APP_DEF_ARG_SIZE 32*1024*1024 +#define BOTS_APP_DESC_ARG_SIZE "Matrix Size" + +#define BOTS_APP_INIT int i;\ + COMPLEX *in, *out1=NULL, *out2=NULL;\ + in = malloc(bots_arg_size * sizeof(COMPLEX));\ + +#define KERNEL_INIT\ + out1 = malloc(bots_arg_size * sizeof(COMPLEX));\ + for (i = 0; i < bots_arg_size; ++i) {\ + c_re(in[i]) = 1.0;\ + c_im(in[i]) = 1.0;\ + } +#define KERNEL_CALL fft(bots_arg_size, in, out1); +#define KERNEL_FINI + +#define KERNEL_SEQ_INIT\ + out2 = malloc(bots_arg_size * sizeof(COMPLEX));\ + for (i = 0; i < bots_arg_size; ++i) {\ + c_re(in[i]) = 1.0;\ + c_im(in[i]) = 1.0;\ + } +#define KERNEL_SEQ_CALL fft_seq(bots_arg_size, in, out2); +#define KERNEL_SEQ_FINI + +#undef BOTS_APP_CHECK_USES_SEQ_RESULT +#define KERNEL_CHECK test_correctness(bots_arg_size, out1, out2) + diff --git a/src/components/implementation/no_interface/omp_fft_bots/bots.h b/src/components/implementation/no_interface/omp_fft_bots/bots.h new file mode 120000 index 0000000000..ea0ad2b59f --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/bots.h @@ -0,0 +1 @@ +../omp_fib_bots/bots.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/bots_common.c b/src/components/implementation/no_interface/omp_fft_bots/bots_common.c new file mode 120000 index 0000000000..4802b0cf70 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/bots_common.c @@ -0,0 +1 @@ 
+../omp_fib_bots/bots_common.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/bots_common.h b/src/components/implementation/no_interface/omp_fft_bots/bots_common.h new file mode 120000 index 0000000000..14eda863e4 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/bots_common.h @@ -0,0 +1 @@ +../omp_fib_bots/bots_common.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/bots_main.c b/src/components/implementation/no_interface/omp_fft_bots/bots_main.c new file mode 120000 index 0000000000..14f2dab009 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/bots_main.c @@ -0,0 +1 @@ +../omp_fib_bots/bots_main.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/bots_main.h b/src/components/implementation/no_interface/omp_fft_bots/bots_main.h new file mode 120000 index 0000000000..86c06ad286 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/bots_main.h @@ -0,0 +1 @@ +../omp_fib_bots/bots_main.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/fft.c b/src/components/implementation/no_interface/omp_fft_bots/fft.c new file mode 100644 index 0000000000..b030676e26 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/fft.c @@ -0,0 +1,5015 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at 
your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +/* + * Original code from the Cilk project + * + * Copyright (c) 2000 Massachusetts Institute of Technology + * Copyright (c) 2000 Matteo Frigo + */ + +#include +#include +#include +#include +#include "bots.h" +#include "app-desc.h" + +/* Definitions and operations for complex numbers */ + +/* + * compute the W coefficients (that is, powers of the root of 1) + * and store them into an array. 
+ */ +void compute_w_coefficients(int n, int a, int b, COMPLEX * W) +{ + register double twoPiOverN; + register int k; + register REAL s, c; + + if (b - a < 128) { + twoPiOverN = 2.0 * 3.1415926535897932384626434 / n; + for (k = a; k <= b; ++k) { + c = cos(twoPiOverN * k); + c_re(W[k]) = c_re(W[n - k]) = c; + s = sin(twoPiOverN * k); + c_im(W[k]) = -s; + c_im(W[n - k]) = s; + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + compute_w_coefficients(n, a, ab, W); + #pragma omp task + compute_w_coefficients(n, ab + 1, b, W); +#else + #pragma omp task untied + compute_w_coefficients(n, a, ab, W); + #pragma omp task untied + compute_w_coefficients(n, ab + 1, b, W); +#endif + #pragma omp taskwait + } +} +void compute_w_coefficients_seq(int n, int a, int b, COMPLEX * W) +{ + register double twoPiOverN; + register int k; + register REAL s, c; + + if (b - a < 128) { + twoPiOverN = 2.0 * 3.1415926535897932384626434 / n; + for (k = a; k <= b; ++k) { + c = cos(twoPiOverN * k); + c_re(W[k]) = c_re(W[n - k]) = c; + s = sin(twoPiOverN * k); + c_im(W[k]) = -s; + c_im(W[n - k]) = s; + } + } else { + int ab = (a + b) / 2; + compute_w_coefficients_seq(n, a, ab, W); + compute_w_coefficients_seq(n, ab + 1, b, W); + } +} +/* + * Determine (in a stupid way) if n is divisible by eight, then by four, else + * find the smallest prime factor of n. 
+ */ +int factor(int n) +{ + int r; + + if (n < 2) return 1; + if (n == 64 || n == 128 || n == 256 || n == 1024 || n == 2048 || n == 4096) return 8; + if ((n & 15) == 0) return 16; + if ((n & 7) == 0) return 8; + if ((n & 3) == 0) return 4; + if ((n & 1) == 0) return 2; + + /* try odd numbers up to n (computing the sqrt may be slower) */ + for (r = 3; r < n; r += 2) if (n % r == 0) return r; + + /* n is prime */ + return n; +} + +void unshuffle(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m) +{ + int i, j; + int r4 = r & (~0x3); + const COMPLEX *ip; + COMPLEX *jp; + + if (b - a < 16) { + ip = in + a * r; + for (i = a; i < b; ++i) { + jp = out + i; + for (j = 0; j < r4; j += 4) { + jp[0] = ip[0]; + jp[m] = ip[1]; + jp[2 * m] = ip[2]; + jp[3 * m] = ip[3]; + jp += 4 * m; + ip += 4; + } + for (; j < r; ++j) { + *jp = *ip; + ip++; + jp += m; + } + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + unshuffle(a, ab, in, out, r, m); + #pragma omp task + unshuffle(ab, b, in, out, r, m); +#else + #pragma omp task untied + unshuffle(a, ab, in, out, r, m); + #pragma omp task untied + unshuffle(ab, b, in, out, r, m); +#endif + #pragma omp taskwait + } +} +void unshuffle_seq(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m) +{ + int i, j; + int r4 = r & (~0x3); + const COMPLEX *ip; + COMPLEX *jp; + + if (b - a < 16) { + ip = in + a * r; + for (i = a; i < b; ++i) { + jp = out + i; + for (j = 0; j < r4; j += 4) { + jp[0] = ip[0]; + jp[m] = ip[1]; + jp[2 * m] = ip[2]; + jp[3 * m] = ip[3]; + jp += 4 * m; + ip += 4; + } + for (; j < r; ++j) { + *jp = *ip; + ip++; + jp += m; + } + } + } else { + int ab = (a + b) / 2; + unshuffle_seq(a, ab, in, out, r, m); + unshuffle_seq(ab, b, in, out, r, m); + } +} +void fft_twiddle_gen1(COMPLEX * in, COMPLEX * out, + COMPLEX * W, int r, int m, + int nW, int nWdnti, int nWdntm) +{ + int j, k; + COMPLEX *jp, *kp; + + for (k = 0, kp = out; k < r; ++k, kp += m) { + REAL r0, i0, rt, it, rw, iw; + int 
l1 = nWdnti + nWdntm * k; + int l0; + + r0 = i0 = 0.0; + for (j = 0, jp = in, l0 = 0; j < r; ++j, jp += m) { + rw = c_re(W[l0]); + iw = c_im(W[l0]); + rt = c_re(*jp); + it = c_im(*jp); + r0 += rt * rw - it * iw; + i0 += rt * iw + it * rw; + l0 += l1; + if (l0 > nW) + l0 -= nW; + } + c_re(*kp) = r0; + c_im(*kp) = i0; + } +} + +void fft_twiddle_gen(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int r, int m) +{ +#if defined(FORCE_TIED_TASKS) + if (i == i1 - 1) { + #pragma omp task + fft_twiddle_gen1(in + i, out + i, W, + r, m, nW, nWdn * i, nWdn * m); + } else { + int i2 = (i + i1) / 2; + #pragma omp task + fft_twiddle_gen(i, i2, in, out, W, nW, + nWdn, r, m); + #pragma omp task + fft_twiddle_gen(i2, i1, in, out, W, nW, + nWdn, r, m); + } +#else + if (i == i1 - 1) { + #pragma omp task untied + fft_twiddle_gen1(in + i, out + i, W, + r, m, nW, nWdn * i, nWdn * m); + } else { + int i2 = (i + i1) / 2; + #pragma omp task untied + fft_twiddle_gen(i, i2, in, out, W, nW, + nWdn, r, m); + #pragma omp task untied + fft_twiddle_gen(i2, i1, in, out, W, nW, + nWdn, r, m); + } +#endif + #pragma omp taskwait +} +void fft_twiddle_gen_seq(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W, + int nW, int nWdn, int r, int m) +{ + if (i == i1 - 1) { + fft_twiddle_gen1(in + i, out + i, W, + r, m, nW, nWdn * i, nWdn * m); + } else { + int i2 = (i + i1) / 2; + fft_twiddle_gen_seq(i, i2, in, out, W, nW, + nWdn, r, m); + fft_twiddle_gen_seq(i2, i1, in, out, W, nW, + nWdn, r, m); + } +} +/* machine-generated code begins here */ +void fft_base_2(COMPLEX * in, COMPLEX * out) +{ + REAL r1_0, i1_0; + REAL r1_1, i1_1; + r1_0 = c_re(in[0]); + i1_0 = c_im(in[0]); + r1_1 = c_re(in[1]); + i1_1 = c_im(in[1]); + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[1]) = (r1_0 - r1_1); + c_im(out[1]) = (i1_0 - i1_1); +} +void fft_twiddle_2(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, 
*kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + r1_0 = c_re(jp[0 * m]); + i1_0 = c_im(jp[0 * m]); + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r1_1 = ((wr * tmpr) - (wi * tmpi)); + i1_1 = ((wi * tmpr) + (wr * tmpi)); + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[1 * m]) = (r1_0 - r1_1); + c_im(kp[1 * m]) = (i1_0 - i1_1); + } + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_twiddle_2(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task + fft_twiddle_2(ab, b, in, out, W, nW, nWdn, m); +#else + #pragma omp task untied + fft_twiddle_2(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_2(ab, b, in, out, W, nW, nWdn, m); +#endif + #pragma omp taskwait + } +} +void fft_twiddle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + r1_0 = c_re(jp[0 * m]); + i1_0 = c_im(jp[0 * m]); + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r1_1 = ((wr * tmpr) - (wi * tmpi)); + i1_1 = ((wi * tmpr) + (wr * tmpi)); + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[1 * m]) = (r1_0 - r1_1); + c_im(kp[1 * m]) = (i1_0 - i1_1); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_2_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_2_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_2(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 2; + for (i = a; i < b; ++i) 
{ + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_unshuffle_2(a, ab, in, out, m); + #pragma omp task + fft_unshuffle_2(ab, b, in, out, m); +#else + #pragma omp task untied + fft_unshuffle_2(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_2(ab, b, in, out, m); +#endif + #pragma omp taskwait + } +} +void fft_unshuffle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 2; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_2_seq(a, ab, in, out, m); + fft_unshuffle_2_seq(ab, b, in, out, m); + } +} +void fft_base_4(COMPLEX * in, COMPLEX * out) +{ + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + r2_0 = c_re(in[0]); + i2_0 = c_im(in[0]); + r2_2 = c_re(in[2]); + i2_2 = c_im(in[2]); + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_2 = (r2_0 - r2_2); + i1_2 = (i2_0 - i2_2); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + r2_1 = c_re(in[1]); + i2_1 = c_im(in[1]); + r2_3 = c_re(in[3]); + i2_3 = c_im(in[3]); + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_3 = (r2_1 - r2_3); + i1_3 = (i2_1 - i2_3); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[2]) = (r1_0 - r1_1); + c_im(out[2]) = (i1_0 - i1_1); + c_re(out[1]) = (r1_2 + i1_3); + c_im(out[1]) = (i1_2 - r1_3); + c_re(out[3]) = (r1_2 - i1_3); + c_im(out[3]) = (i1_2 + r1_3); +} +void fft_twiddle_4(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL 
r1_3, i1_3; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + r2_0 = c_re(jp[0 * m]); + i2_0 = c_im(jp[0 * m]); + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r2_2 = ((wr * tmpr) - (wi * tmpi)); + i2_2 = ((wi * tmpr) + (wr * tmpi)); + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_2 = (r2_0 - r2_2); + i1_2 = (i2_0 - i2_2); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r2_1 = ((wr * tmpr) - (wi * tmpi)); + i2_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r2_3 = ((wr * tmpr) - (wi * tmpi)); + i2_3 = ((wi * tmpr) + (wr * tmpi)); + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_3 = (r2_1 - r2_3); + i1_3 = (i2_1 - i2_3); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[2 * m]) = (r1_0 - r1_1); + c_im(kp[2 * m]) = (i1_0 - i1_1); + c_re(kp[1 * m]) = (r1_2 + i1_3); + c_im(kp[1 * m]) = (i1_2 - r1_3); + c_re(kp[3 * m]) = (r1_2 - i1_3); + c_im(kp[3 * m]) = (i1_2 + r1_3); + } + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_twiddle_4(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task + fft_twiddle_4(ab, b, in, out, W, nW, nWdn, m); +#else + #pragma omp task untied + fft_twiddle_4(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_4(ab, b, in, out, W, nW, nWdn, m); +#endif + #pragma omp taskwait + } +} +void fft_twiddle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + r2_0 = c_re(jp[0 * m]); + i2_0 
= c_im(jp[0 * m]); + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r2_2 = ((wr * tmpr) - (wi * tmpi)); + i2_2 = ((wi * tmpr) + (wr * tmpi)); + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_2 = (r2_0 - r2_2); + i1_2 = (i2_0 - i2_2); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r2_1 = ((wr * tmpr) - (wi * tmpi)); + i2_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r2_3 = ((wr * tmpr) - (wi * tmpi)); + i2_3 = ((wi * tmpr) + (wr * tmpi)); + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_3 = (r2_1 - r2_3); + i1_3 = (i2_1 - i2_3); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[2 * m]) = (r1_0 - r1_1); + c_im(kp[2 * m]) = (i1_0 - i1_1); + c_re(kp[1 * m]) = (r1_2 + i1_3); + c_im(kp[1 * m]) = (i1_2 - r1_3); + c_re(kp[3 * m]) = (r1_2 - i1_3); + c_im(kp[3 * m]) = (i1_2 + r1_3); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_4_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_4_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_4(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 4; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_unshuffle_4(a, ab, in, out, m); + #pragma omp task + fft_unshuffle_4(ab, b, in, out, m); +#else + #pragma omp task untied + fft_unshuffle_4(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_4(ab, b, in, out, m); +#endif + #pragma omp taskwait + } +} +void fft_unshuffle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX 
*ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 4; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_4_seq(a, ab, in, out, m); + fft_unshuffle_4_seq(ab, b, in, out, m); + } +} +void fft_base_8(COMPLEX * in, COMPLEX * out) +{ + REAL tmpr, tmpi; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + r3_0 = c_re(in[0]); + i3_0 = c_im(in[0]); + r3_4 = c_re(in[4]); + i3_4 = c_im(in[4]); + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_4 = (r3_0 - r3_4); + i2_4 = (i3_0 - i3_4); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + r3_2 = c_re(in[2]); + i3_2 = c_im(in[2]); + r3_6 = c_re(in[6]); + i3_6 = c_im(in[6]); + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_6 = (r3_2 - r3_6); + i2_6 = (i3_2 - i3_6); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_4 = (r2_0 - r2_2); + i1_4 = (i2_0 - i2_2); + r1_2 = (r2_4 + i2_6); + i1_2 = (i2_4 - r2_6); + r1_6 = (r2_4 - i2_6); + i1_6 = (i2_4 + r2_6); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + r3_1 = c_re(in[1]); + i3_1 = c_im(in[1]); + r3_5 = c_re(in[5]); + i3_5 = c_im(in[5]); + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_5 = (r3_1 - r3_5); + i2_5 = (i3_1 - i3_5); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + r3_3 = c_re(in[3]); + i3_3 = c_im(in[3]); + r3_7 = c_re(in[7]); + i3_7 = c_im(in[7]); + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_7 = (r3_3 - r3_7); + i2_7 = (i3_3 - i3_7); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_5 = (r2_1 - r2_3); + i1_5 = (i2_1 - i2_3); + r1_3 = (r2_5 + i2_7); + i1_3 = (i2_5 - r2_7); + r1_7 = (r2_5 
- i2_7); + i1_7 = (i2_5 + r2_7); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[4]) = (r1_0 - r1_1); + c_im(out[4]) = (i1_0 - i1_1); + tmpr = (0.707106781187 * (r1_3 + i1_3)); + tmpi = (0.707106781187 * (i1_3 - r1_3)); + c_re(out[1]) = (r1_2 + tmpr); + c_im(out[1]) = (i1_2 + tmpi); + c_re(out[5]) = (r1_2 - tmpr); + c_im(out[5]) = (i1_2 - tmpi); + c_re(out[2]) = (r1_4 + i1_5); + c_im(out[2]) = (i1_4 - r1_5); + c_re(out[6]) = (r1_4 - i1_5); + c_im(out[6]) = (i1_4 + r1_5); + tmpr = (0.707106781187 * (i1_7 - r1_7)); + tmpi = (0.707106781187 * (r1_7 + i1_7)); + c_re(out[3]) = (r1_6 + tmpr); + c_im(out[3]) = (i1_6 - tmpi); + c_re(out[7]) = (r1_6 - tmpr); + c_im(out[7]) = (i1_6 + tmpi); + } +} +void fft_twiddle_8(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + r3_0 = c_re(jp[0 * m]); + i3_0 = c_im(jp[0 * m]); + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r3_4 = ((wr * tmpr) - (wi * tmpi)); + i3_4 = ((wi * tmpr) + (wr * tmpi)); + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_4 = (r3_0 - r3_4); + i2_4 = (i3_0 - i3_4); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r3_2 = ((wr * tmpr) - (wi * tmpi)); + i3_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r3_6 = ((wr * tmpr) - (wi * tmpi)); + i3_6 = ((wi * tmpr) + (wr * 
tmpi)); + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_6 = (r3_2 - r3_6); + i2_6 = (i3_2 - i3_6); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_4 = (r2_0 - r2_2); + i1_4 = (i2_0 - i2_2); + r1_2 = (r2_4 + i2_6); + i1_2 = (i2_4 - r2_6); + r1_6 = (r2_4 - i2_6); + i1_6 = (i2_4 + r2_6); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r3_1 = ((wr * tmpr) - (wi * tmpi)); + i3_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r3_5 = ((wr * tmpr) - (wi * tmpi)); + i3_5 = ((wi * tmpr) + (wr * tmpi)); + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_5 = (r3_1 - r3_5); + i2_5 = (i3_1 - i3_5); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r3_3 = ((wr * tmpr) - (wi * tmpi)); + i3_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r3_7 = ((wr * tmpr) - (wi * tmpi)); + i3_7 = ((wi * tmpr) + (wr * tmpi)); + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_7 = (r3_3 - r3_7); + i2_7 = (i3_3 - i3_7); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_5 = (r2_1 - r2_3); + i1_5 = (i2_1 - i2_3); + r1_3 = (r2_5 + i2_7); + i1_3 = (i2_5 - r2_7); + r1_7 = (r2_5 - i2_7); + i1_7 = (i2_5 + r2_7); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[4 * m]) = (r1_0 - r1_1); + c_im(kp[4 * m]) = (i1_0 - i1_1); + tmpr = (0.707106781187 * (r1_3 + i1_3)); + tmpi = (0.707106781187 * (i1_3 - r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[5 * m]) = (r1_2 - tmpr); + c_im(kp[5 * m]) = (i1_2 - tmpi); + c_re(kp[2 * m]) = (r1_4 + i1_5); + c_im(kp[2 * m]) = (i1_4 - 
r1_5); + c_re(kp[6 * m]) = (r1_4 - i1_5); + c_im(kp[6 * m]) = (i1_4 + r1_5); + tmpr = (0.707106781187 * (i1_7 - r1_7)); + tmpi = (0.707106781187 * (r1_7 + i1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 - tmpi); + c_re(kp[7 * m]) = (r1_6 - tmpr); + c_im(kp[7 * m]) = (i1_6 + tmpi); + } + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_twiddle_8(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task + fft_twiddle_8(ab, b, in, out, W, nW, nWdn, m); +#else + #pragma omp task untied + fft_twiddle_8(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_8(ab, b, in, out, W, nW, nWdn, m); +#endif + #pragma omp taskwait + } +} +void fft_twiddle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + r3_0 = c_re(jp[0 * m]); + i3_0 = c_im(jp[0 * m]); + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r3_4 = ((wr * tmpr) - (wi * tmpi)); + i3_4 = ((wi * tmpr) + (wr * tmpi)); + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_4 = (r3_0 - r3_4); + i2_4 = (i3_0 - i3_4); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r3_2 = ((wr * tmpr) - (wi * tmpi)); + i3_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r3_6 = ((wr * tmpr) - (wi * tmpi)); + i3_6 = ((wi * tmpr) + (wr * tmpi)); + r2_2 = 
(r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_6 = (r3_2 - r3_6); + i2_6 = (i3_2 - i3_6); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_4 = (r2_0 - r2_2); + i1_4 = (i2_0 - i2_2); + r1_2 = (r2_4 + i2_6); + i1_2 = (i2_4 - r2_6); + r1_6 = (r2_4 - i2_6); + i1_6 = (i2_4 + r2_6); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r3_1 = ((wr * tmpr) - (wi * tmpi)); + i3_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r3_5 = ((wr * tmpr) - (wi * tmpi)); + i3_5 = ((wi * tmpr) + (wr * tmpi)); + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_5 = (r3_1 - r3_5); + i2_5 = (i3_1 - i3_5); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r3_3 = ((wr * tmpr) - (wi * tmpi)); + i3_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r3_7 = ((wr * tmpr) - (wi * tmpi)); + i3_7 = ((wi * tmpr) + (wr * tmpi)); + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_7 = (r3_3 - r3_7); + i2_7 = (i3_3 - i3_7); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_5 = (r2_1 - r2_3); + i1_5 = (i2_1 - i2_3); + r1_3 = (r2_5 + i2_7); + i1_3 = (i2_5 - r2_7); + r1_7 = (r2_5 - i2_7); + i1_7 = (i2_5 + r2_7); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[4 * m]) = (r1_0 - r1_1); + c_im(kp[4 * m]) = (i1_0 - i1_1); + tmpr = (0.707106781187 * (r1_3 + i1_3)); + tmpi = (0.707106781187 * (i1_3 - r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[5 * m]) = (r1_2 - tmpr); + c_im(kp[5 * m]) = (i1_2 - tmpi); + c_re(kp[2 * m]) = (r1_4 + i1_5); + c_im(kp[2 * m]) = (i1_4 - r1_5); + c_re(kp[6 
* m]) = (r1_4 - i1_5); + c_im(kp[6 * m]) = (i1_4 + r1_5); + tmpr = (0.707106781187 * (i1_7 - r1_7)); + tmpi = (0.707106781187 * (r1_7 + i1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 - tmpi); + c_re(kp[7 * m]) = (r1_6 - tmpr); + c_im(kp[7 * m]) = (i1_6 + tmpi); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_8_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_8_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_8(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 8; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_unshuffle_8(a, ab, in, out, m); + #pragma omp task + fft_unshuffle_8(ab, b, in, out, m); +#else + #pragma omp task untied + fft_unshuffle_8(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_8(ab, b, in, out, m); +#endif + #pragma omp taskwait + } +} +void fft_unshuffle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 8; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_8_seq(a, ab, in, out, m); + fft_unshuffle_8_seq(ab, b, in, out, m); + } +} +void fft_base_16(COMPLEX * in, COMPLEX * out) +{ + REAL tmpr, tmpi; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; 
+ REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + r4_0 = c_re(in[0]); + i4_0 = c_im(in[0]); + r4_8 = c_re(in[8]); + i4_8 = c_im(in[8]); + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_8 = (r4_0 - r4_8); + i3_8 = (i4_0 - i4_8); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + r4_4 = c_re(in[4]); + i4_4 = c_im(in[4]); + r4_12 = c_re(in[12]); + i4_12 = c_im(in[12]); + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_12 = (r4_4 - r4_12); + i3_12 = (i4_4 - i4_12); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_8 = (r3_0 - r3_4); + i2_8 = (i3_0 - i3_4); + r2_4 = (r3_8 + i3_12); + i2_4 = (i3_8 - r3_12); + r2_12 = (r3_8 - i3_12); + i2_12 = (i3_8 + r3_12); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + r4_2 = c_re(in[2]); + i4_2 = c_im(in[2]); + r4_10 = c_re(in[10]); + i4_10 = c_im(in[10]); + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_10 = (r4_2 - r4_10); + i3_10 = (i4_2 - i4_10); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + r4_6 = c_re(in[6]); + i4_6 = c_im(in[6]); + r4_14 = c_re(in[14]); + i4_14 = c_im(in[14]); + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_14 = (r4_6 - r4_14); + i3_14 = (i4_6 - i4_14); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_10 = (r3_2 - r3_6); + i2_10 = (i3_2 - i3_6); + r2_6 = (r3_10 + i3_14); + i2_6 = (i3_10 - r3_14); + r2_14 = (r3_10 - i3_14); + i2_14 = (i3_10 + r3_14); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_8 = (r2_0 - r2_2); + i1_8 = (i2_0 - i2_2); + tmpr = (0.707106781187 * 
(r2_6 + i2_6)); + tmpi = (0.707106781187 * (i2_6 - r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_10 = (r2_4 - tmpr); + i1_10 = (i2_4 - tmpi); + r1_4 = (r2_8 + i2_10); + i1_4 = (i2_8 - r2_10); + r1_12 = (r2_8 - i2_10); + i1_12 = (i2_8 + r2_10); + tmpr = (0.707106781187 * (i2_14 - r2_14)); + tmpi = (0.707106781187 * (r2_14 + i2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 - tmpi); + r1_14 = (r2_12 - tmpr); + i1_14 = (i2_12 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + r4_1 = c_re(in[1]); + i4_1 = c_im(in[1]); + r4_9 = c_re(in[9]); + i4_9 = c_im(in[9]); + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_9 = (r4_1 - r4_9); + i3_9 = (i4_1 - i4_9); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + r4_5 = c_re(in[5]); + i4_5 = c_im(in[5]); + r4_13 = c_re(in[13]); + i4_13 = c_im(in[13]); + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_13 = (r4_5 - r4_13); + i3_13 = (i4_5 - i4_13); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_9 = (r3_1 - r3_5); + i2_9 = (i3_1 - i3_5); + r2_5 = (r3_9 + i3_13); + i2_5 = (i3_9 - r3_13); + r2_13 = (r3_9 - i3_13); + i2_13 = (i3_9 + r3_13); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + r4_3 = c_re(in[3]); + i4_3 = c_im(in[3]); + r4_11 = c_re(in[11]); + i4_11 = c_im(in[11]); + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_11 = (r4_3 - r4_11); + i3_11 = (i4_3 - i4_11); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + r4_7 = c_re(in[7]); + i4_7 = c_im(in[7]); + r4_15 = c_re(in[15]); + i4_15 = c_im(in[15]); + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_15 = (r4_7 - r4_15); + i3_15 = (i4_7 - i4_15); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_11 = 
(r3_3 - r3_7); + i2_11 = (i3_3 - i3_7); + r2_7 = (r3_11 + i3_15); + i2_7 = (i3_11 - r3_15); + r2_15 = (r3_11 - i3_15); + i2_15 = (i3_11 + r3_15); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_9 = (r2_1 - r2_3); + i1_9 = (i2_1 - i2_3); + tmpr = (0.707106781187 * (r2_7 + i2_7)); + tmpi = (0.707106781187 * (i2_7 - r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_11 = (r2_5 - tmpr); + i1_11 = (i2_5 - tmpi); + r1_5 = (r2_9 + i2_11); + i1_5 = (i2_9 - r2_11); + r1_13 = (r2_9 - i2_11); + i1_13 = (i2_9 + r2_11); + tmpr = (0.707106781187 * (i2_15 - r2_15)); + tmpi = (0.707106781187 * (r2_15 + i2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 - tmpi); + r1_15 = (r2_13 - tmpr); + i1_15 = (i2_13 + tmpi); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[8]) = (r1_0 - r1_1); + c_im(out[8]) = (i1_0 - i1_1); + tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3)); + tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3)); + c_re(out[1]) = (r1_2 + tmpr); + c_im(out[1]) = (i1_2 + tmpi); + c_re(out[9]) = (r1_2 - tmpr); + c_im(out[9]) = (i1_2 - tmpi); + tmpr = (0.707106781187 * (r1_5 + i1_5)); + tmpi = (0.707106781187 * (i1_5 - r1_5)); + c_re(out[2]) = (r1_4 + tmpr); + c_im(out[2]) = (i1_4 + tmpi); + c_re(out[10]) = (r1_4 - tmpr); + c_im(out[10]) = (i1_4 - tmpi); + tmpr = ((0.382683432365 * r1_7) + (0.923879532511 * i1_7)); + tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7)); + c_re(out[3]) = (r1_6 + tmpr); + c_im(out[3]) = (i1_6 + tmpi); + c_re(out[11]) = (r1_6 - tmpr); + c_im(out[11]) = (i1_6 - tmpi); + c_re(out[4]) = (r1_8 + i1_9); + c_im(out[4]) = (i1_8 - r1_9); + c_re(out[12]) = (r1_8 - i1_9); + c_im(out[12]) = (i1_8 + r1_9); + tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11)); + tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11)); + c_re(out[5]) = (r1_10 + tmpr); + c_im(out[5]) = (i1_10 - tmpi); + c_re(out[13]) = (r1_10 - tmpr); + c_im(out[13]) = (i1_10 + tmpi); + tmpr = (0.707106781187 * 
(i1_13 - r1_13)); + tmpi = (0.707106781187 * (r1_13 + i1_13)); + c_re(out[6]) = (r1_12 + tmpr); + c_im(out[6]) = (i1_12 - tmpi); + c_re(out[14]) = (r1_12 - tmpr); + c_im(out[14]) = (i1_12 + tmpi); + tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15)); + tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15)); + c_re(out[7]) = (r1_14 + tmpr); + c_im(out[7]) = (i1_14 - tmpi); + c_re(out[15]) = (r1_14 - tmpr); + c_im(out[15]) = (i1_14 + tmpi); + } +} +void fft_twiddle_16(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + r4_0 = c_re(jp[0 * m]); + i4_0 = c_im(jp[0 * m]); + wr = c_re(W[8 * l1]); + wi = c_im(W[8 * l1]); + tmpr = c_re(jp[8 * m]); + tmpi = c_im(jp[8 * m]); + r4_8 = ((wr * tmpr) - (wi * tmpi)); + i4_8 = ((wi * tmpr) + (wr * tmpi)); + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_8 = (r4_0 - r4_8); + i3_8 = (i4_0 - i4_8); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r4_4 = ((wr * tmpr) - (wi * tmpi)); + i4_4 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[12 * l1]); + wi = c_im(W[12 * l1]); + tmpr = c_re(jp[12 * m]); + tmpi = c_im(jp[12 * 
m]); + r4_12 = ((wr * tmpr) - (wi * tmpi)); + i4_12 = ((wi * tmpr) + (wr * tmpi)); + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_12 = (r4_4 - r4_12); + i3_12 = (i4_4 - i4_12); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_8 = (r3_0 - r3_4); + i2_8 = (i3_0 - i3_4); + r2_4 = (r3_8 + i3_12); + i2_4 = (i3_8 - r3_12); + r2_12 = (r3_8 - i3_12); + i2_12 = (i3_8 + r3_12); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r4_2 = ((wr * tmpr) - (wi * tmpi)); + i4_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[10 * l1]); + wi = c_im(W[10 * l1]); + tmpr = c_re(jp[10 * m]); + tmpi = c_im(jp[10 * m]); + r4_10 = ((wr * tmpr) - (wi * tmpi)); + i4_10 = ((wi * tmpr) + (wr * tmpi)); + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_10 = (r4_2 - r4_10); + i3_10 = (i4_2 - i4_10); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r4_6 = ((wr * tmpr) - (wi * tmpi)); + i4_6 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[14 * l1]); + wi = c_im(W[14 * l1]); + tmpr = c_re(jp[14 * m]); + tmpi = c_im(jp[14 * m]); + r4_14 = ((wr * tmpr) - (wi * tmpi)); + i4_14 = ((wi * tmpr) + (wr * tmpi)); + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_14 = (r4_6 - r4_14); + i3_14 = (i4_6 - i4_14); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_10 = (r3_2 - r3_6); + i2_10 = (i3_2 - i3_6); + r2_6 = (r3_10 + i3_14); + i2_6 = (i3_10 - r3_14); + r2_14 = (r3_10 - i3_14); + i2_14 = (i3_10 + r3_14); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_8 = (r2_0 - r2_2); + i1_8 = (i2_0 - i2_2); + tmpr = (0.707106781187 * (r2_6 + i2_6)); + tmpi = (0.707106781187 * (i2_6 - r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_10 = (r2_4 - tmpr); + i1_10 = (i2_4 - tmpi); + r1_4 = (r2_8 + 
i2_10); + i1_4 = (i2_8 - r2_10); + r1_12 = (r2_8 - i2_10); + i1_12 = (i2_8 + r2_10); + tmpr = (0.707106781187 * (i2_14 - r2_14)); + tmpi = (0.707106781187 * (r2_14 + i2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 - tmpi); + r1_14 = (r2_12 - tmpr); + i1_14 = (i2_12 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r4_1 = ((wr * tmpr) - (wi * tmpi)); + i4_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[9 * l1]); + wi = c_im(W[9 * l1]); + tmpr = c_re(jp[9 * m]); + tmpi = c_im(jp[9 * m]); + r4_9 = ((wr * tmpr) - (wi * tmpi)); + i4_9 = ((wi * tmpr) + (wr * tmpi)); + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_9 = (r4_1 - r4_9); + i3_9 = (i4_1 - i4_9); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r4_5 = ((wr * tmpr) - (wi * tmpi)); + i4_5 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[13 * l1]); + wi = c_im(W[13 * l1]); + tmpr = c_re(jp[13 * m]); + tmpi = c_im(jp[13 * m]); + r4_13 = ((wr * tmpr) - (wi * tmpi)); + i4_13 = ((wi * tmpr) + (wr * tmpi)); + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_13 = (r4_5 - r4_13); + i3_13 = (i4_5 - i4_13); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_9 = (r3_1 - r3_5); + i2_9 = (i3_1 - i3_5); + r2_5 = (r3_9 + i3_13); + i2_5 = (i3_9 - r3_13); + r2_13 = (r3_9 - i3_13); + i2_13 = (i3_9 + r3_13); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r4_3 = ((wr * tmpr) - (wi * tmpi)); + 
i4_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[11 * l1]); + wi = c_im(W[11 * l1]); + tmpr = c_re(jp[11 * m]); + tmpi = c_im(jp[11 * m]); + r4_11 = ((wr * tmpr) - (wi * tmpi)); + i4_11 = ((wi * tmpr) + (wr * tmpi)); + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_11 = (r4_3 - r4_11); + i3_11 = (i4_3 - i4_11); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r4_7 = ((wr * tmpr) - (wi * tmpi)); + i4_7 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[15 * l1]); + wi = c_im(W[15 * l1]); + tmpr = c_re(jp[15 * m]); + tmpi = c_im(jp[15 * m]); + r4_15 = ((wr * tmpr) - (wi * tmpi)); + i4_15 = ((wi * tmpr) + (wr * tmpi)); + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_15 = (r4_7 - r4_15); + i3_15 = (i4_7 - i4_15); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_11 = (r3_3 - r3_7); + i2_11 = (i3_3 - i3_7); + r2_7 = (r3_11 + i3_15); + i2_7 = (i3_11 - r3_15); + r2_15 = (r3_11 - i3_15); + i2_15 = (i3_11 + r3_15); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_9 = (r2_1 - r2_3); + i1_9 = (i2_1 - i2_3); + tmpr = (0.707106781187 * (r2_7 + i2_7)); + tmpi = (0.707106781187 * (i2_7 - r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_11 = (r2_5 - tmpr); + i1_11 = (i2_5 - tmpi); + r1_5 = (r2_9 + i2_11); + i1_5 = (i2_9 - r2_11); + r1_13 = (r2_9 - i2_11); + i1_13 = (i2_9 + r2_11); + tmpr = (0.707106781187 * (i2_15 - r2_15)); + tmpi = (0.707106781187 * (r2_15 + i2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 - tmpi); + r1_15 = (r2_13 - tmpr); + i1_15 = (i2_13 + tmpi); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[8 * m]) = (r1_0 - r1_1); + c_im(kp[8 * m]) = (i1_0 - i1_1); + tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3)); + tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[9 * m]) = (r1_2 - tmpr); + c_im(kp[9 
* m]) = (i1_2 - tmpi); + tmpr = (0.707106781187 * (r1_5 + i1_5)); + tmpi = (0.707106781187 * (i1_5 - r1_5)); + c_re(kp[2 * m]) = (r1_4 + tmpr); + c_im(kp[2 * m]) = (i1_4 + tmpi); + c_re(kp[10 * m]) = (r1_4 - tmpr); + c_im(kp[10 * m]) = (i1_4 - tmpi); + tmpr = ((0.382683432365 * r1_7) + (0.923879532511 * i1_7)); + tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 + tmpi); + c_re(kp[11 * m]) = (r1_6 - tmpr); + c_im(kp[11 * m]) = (i1_6 - tmpi); + c_re(kp[4 * m]) = (r1_8 + i1_9); + c_im(kp[4 * m]) = (i1_8 - r1_9); + c_re(kp[12 * m]) = (r1_8 - i1_9); + c_im(kp[12 * m]) = (i1_8 + r1_9); + tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11)); + tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11)); + c_re(kp[5 * m]) = (r1_10 + tmpr); + c_im(kp[5 * m]) = (i1_10 - tmpi); + c_re(kp[13 * m]) = (r1_10 - tmpr); + c_im(kp[13 * m]) = (i1_10 + tmpi); + tmpr = (0.707106781187 * (i1_13 - r1_13)); + tmpi = (0.707106781187 * (r1_13 + i1_13)); + c_re(kp[6 * m]) = (r1_12 + tmpr); + c_im(kp[6 * m]) = (i1_12 - tmpi); + c_re(kp[14 * m]) = (r1_12 - tmpr); + c_im(kp[14 * m]) = (i1_12 + tmpi); + tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15)); + tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15)); + c_re(kp[7 * m]) = (r1_14 + tmpr); + c_im(kp[7 * m]) = (i1_14 - tmpi); + c_re(kp[15 * m]) = (r1_14 - tmpr); + c_im(kp[15 * m]) = (i1_14 + tmpi); + } + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_twiddle_16(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task + fft_twiddle_16(ab, b, in, out, W, nW, nWdn, m); +#else + #pragma omp task untied + fft_twiddle_16(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_16(ab, b, in, out, W, nW, nWdn, m); +#endif + #pragma omp taskwait + } +} +void fft_twiddle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + 
REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + r4_0 = c_re(jp[0 * m]); + i4_0 = c_im(jp[0 * m]); + wr = c_re(W[8 * l1]); + wi = c_im(W[8 * l1]); + tmpr = c_re(jp[8 * m]); + tmpi = c_im(jp[8 * m]); + r4_8 = ((wr * tmpr) - (wi * tmpi)); + i4_8 = ((wi * tmpr) + (wr * tmpi)); + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_8 = (r4_0 - r4_8); + i3_8 = (i4_0 - i4_8); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r4_4 = ((wr * tmpr) - (wi * tmpi)); + i4_4 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[12 * l1]); + wi = c_im(W[12 * l1]); + tmpr = c_re(jp[12 * m]); + tmpi = c_im(jp[12 * m]); + r4_12 = ((wr * tmpr) - (wi * tmpi)); + i4_12 = ((wi * tmpr) + (wr * tmpi)); + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_12 = (r4_4 - r4_12); + i3_12 = (i4_4 - i4_12); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_8 = (r3_0 - r3_4); + i2_8 = (i3_0 - i3_4); + r2_4 = (r3_8 + i3_12); + i2_4 = (i3_8 - r3_12); + r2_12 = (r3_8 - i3_12); + i2_12 = (i3_8 + r3_12); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = 
c_im(jp[2 * m]); + r4_2 = ((wr * tmpr) - (wi * tmpi)); + i4_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[10 * l1]); + wi = c_im(W[10 * l1]); + tmpr = c_re(jp[10 * m]); + tmpi = c_im(jp[10 * m]); + r4_10 = ((wr * tmpr) - (wi * tmpi)); + i4_10 = ((wi * tmpr) + (wr * tmpi)); + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_10 = (r4_2 - r4_10); + i3_10 = (i4_2 - i4_10); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r4_6 = ((wr * tmpr) - (wi * tmpi)); + i4_6 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[14 * l1]); + wi = c_im(W[14 * l1]); + tmpr = c_re(jp[14 * m]); + tmpi = c_im(jp[14 * m]); + r4_14 = ((wr * tmpr) - (wi * tmpi)); + i4_14 = ((wi * tmpr) + (wr * tmpi)); + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_14 = (r4_6 - r4_14); + i3_14 = (i4_6 - i4_14); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_10 = (r3_2 - r3_6); + i2_10 = (i3_2 - i3_6); + r2_6 = (r3_10 + i3_14); + i2_6 = (i3_10 - r3_14); + r2_14 = (r3_10 - i3_14); + i2_14 = (i3_10 + r3_14); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_8 = (r2_0 - r2_2); + i1_8 = (i2_0 - i2_2); + tmpr = (0.707106781187 * (r2_6 + i2_6)); + tmpi = (0.707106781187 * (i2_6 - r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_10 = (r2_4 - tmpr); + i1_10 = (i2_4 - tmpi); + r1_4 = (r2_8 + i2_10); + i1_4 = (i2_8 - r2_10); + r1_12 = (r2_8 - i2_10); + i1_12 = (i2_8 + r2_10); + tmpr = (0.707106781187 * (i2_14 - r2_14)); + tmpi = (0.707106781187 * (r2_14 + i2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 - tmpi); + r1_14 = (r2_12 - tmpr); + i1_14 = (i2_12 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + wr = c_re(W[1 * l1]); + wi = 
c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r4_1 = ((wr * tmpr) - (wi * tmpi)); + i4_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[9 * l1]); + wi = c_im(W[9 * l1]); + tmpr = c_re(jp[9 * m]); + tmpi = c_im(jp[9 * m]); + r4_9 = ((wr * tmpr) - (wi * tmpi)); + i4_9 = ((wi * tmpr) + (wr * tmpi)); + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_9 = (r4_1 - r4_9); + i3_9 = (i4_1 - i4_9); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r4_5 = ((wr * tmpr) - (wi * tmpi)); + i4_5 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[13 * l1]); + wi = c_im(W[13 * l1]); + tmpr = c_re(jp[13 * m]); + tmpi = c_im(jp[13 * m]); + r4_13 = ((wr * tmpr) - (wi * tmpi)); + i4_13 = ((wi * tmpr) + (wr * tmpi)); + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_13 = (r4_5 - r4_13); + i3_13 = (i4_5 - i4_13); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_9 = (r3_1 - r3_5); + i2_9 = (i3_1 - i3_5); + r2_5 = (r3_9 + i3_13); + i2_5 = (i3_9 - r3_13); + r2_13 = (r3_9 - i3_13); + i2_13 = (i3_9 + r3_13); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r4_3 = ((wr * tmpr) - (wi * tmpi)); + i4_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[11 * l1]); + wi = c_im(W[11 * l1]); + tmpr = c_re(jp[11 * m]); + tmpi = c_im(jp[11 * m]); + r4_11 = ((wr * tmpr) - (wi * tmpi)); + i4_11 = ((wi * tmpr) + (wr * tmpi)); + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_11 = (r4_3 - r4_11); + i3_11 = (i4_3 - i4_11); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r4_7 = ((wr * tmpr) - (wi * tmpi)); + i4_7 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[15 * l1]); + wi = c_im(W[15 * l1]); + tmpr 
= c_re(jp[15 * m]); + tmpi = c_im(jp[15 * m]); + r4_15 = ((wr * tmpr) - (wi * tmpi)); + i4_15 = ((wi * tmpr) + (wr * tmpi)); + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_15 = (r4_7 - r4_15); + i3_15 = (i4_7 - i4_15); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_11 = (r3_3 - r3_7); + i2_11 = (i3_3 - i3_7); + r2_7 = (r3_11 + i3_15); + i2_7 = (i3_11 - r3_15); + r2_15 = (r3_11 - i3_15); + i2_15 = (i3_11 + r3_15); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_9 = (r2_1 - r2_3); + i1_9 = (i2_1 - i2_3); + tmpr = (0.707106781187 * (r2_7 + i2_7)); + tmpi = (0.707106781187 * (i2_7 - r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_11 = (r2_5 - tmpr); + i1_11 = (i2_5 - tmpi); + r1_5 = (r2_9 + i2_11); + i1_5 = (i2_9 - r2_11); + r1_13 = (r2_9 - i2_11); + i1_13 = (i2_9 + r2_11); + tmpr = (0.707106781187 * (i2_15 - r2_15)); + tmpi = (0.707106781187 * (r2_15 + i2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 - tmpi); + r1_15 = (r2_13 - tmpr); + i1_15 = (i2_13 + tmpi); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[8 * m]) = (r1_0 - r1_1); + c_im(kp[8 * m]) = (i1_0 - i1_1); + tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3)); + tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[9 * m]) = (r1_2 - tmpr); + c_im(kp[9 * m]) = (i1_2 - tmpi); + tmpr = (0.707106781187 * (r1_5 + i1_5)); + tmpi = (0.707106781187 * (i1_5 - r1_5)); + c_re(kp[2 * m]) = (r1_4 + tmpr); + c_im(kp[2 * m]) = (i1_4 + tmpi); + c_re(kp[10 * m]) = (r1_4 - tmpr); + c_im(kp[10 * m]) = (i1_4 - tmpi); + tmpr = ((0.382683432365 * r1_7) + (0.923879532511 * i1_7)); + tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 + tmpi); + c_re(kp[11 * m]) = (r1_6 - tmpr); + c_im(kp[11 * m]) = (i1_6 - tmpi); + c_re(kp[4 * m]) = (r1_8 + i1_9); + c_im(kp[4 * m]) = (i1_8 - r1_9); + c_re(kp[12 * 
m]) = (r1_8 - i1_9); + c_im(kp[12 * m]) = (i1_8 + r1_9); + tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11)); + tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11)); + c_re(kp[5 * m]) = (r1_10 + tmpr); + c_im(kp[5 * m]) = (i1_10 - tmpi); + c_re(kp[13 * m]) = (r1_10 - tmpr); + c_im(kp[13 * m]) = (i1_10 + tmpi); + tmpr = (0.707106781187 * (i1_13 - r1_13)); + tmpi = (0.707106781187 * (r1_13 + i1_13)); + c_re(kp[6 * m]) = (r1_12 + tmpr); + c_im(kp[6 * m]) = (i1_12 - tmpi); + c_re(kp[14 * m]) = (r1_12 - tmpr); + c_im(kp[14 * m]) = (i1_12 + tmpi); + tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15)); + tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15)); + c_re(kp[7 * m]) = (r1_14 + tmpr); + c_im(kp[7 * m]) = (i1_14 - tmpi); + c_re(kp[15 * m]) = (r1_14 - tmpr); + c_im(kp[15 * m]) = (i1_14 + tmpi); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_16_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_16_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_16(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 16; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task untied + fft_unshuffle_16(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_16(ab, b, in, out, m); +#else + #pragma omp task untied + fft_unshuffle_16(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_16(ab, b, in, 
out, m); +#endif + #pragma omp taskwait + } +} +void fft_unshuffle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 16; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_16_seq(a, ab, in, out, m); + fft_unshuffle_16_seq(ab, b, in, out, m); + } +} +void fft_base_32(COMPLEX * in, COMPLEX * out) +{ + REAL tmpr, tmpi; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + REAL r1_16, i1_16; + REAL r1_17, i1_17; + REAL r1_18, i1_18; + REAL r1_19, i1_19; + REAL r1_20, i1_20; + REAL r1_21, i1_21; + REAL r1_22, i1_22; + REAL r1_23, i1_23; + REAL r1_24, i1_24; + REAL r1_25, i1_25; + REAL r1_26, i1_26; + REAL r1_27, i1_27; + REAL r1_28, i1_28; + REAL r1_29, i1_29; + REAL r1_30, i1_30; + REAL r1_31, i1_31; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + REAL r2_16, i2_16; + REAL r2_18, i2_18; + REAL r2_20, i2_20; + REAL r2_22, i2_22; + REAL r2_24, i2_24; + REAL r2_26, i2_26; + REAL r2_28, i2_28; + REAL r2_30, i2_30; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + REAL 
r3_16, i3_16; + REAL r3_20, i3_20; + REAL r3_24, i3_24; + REAL r3_28, i3_28; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + REAL r4_16, i4_16; + REAL r4_24, i4_24; + { + REAL r5_0, i5_0; + REAL r5_16, i5_16; + r5_0 = c_re(in[0]); + i5_0 = c_im(in[0]); + r5_16 = c_re(in[16]); + i5_16 = c_im(in[16]); + r4_0 = (r5_0 + r5_16); + i4_0 = (i5_0 + i5_16); + r4_16 = (r5_0 - r5_16); + i4_16 = (i5_0 - i5_16); + } + { + REAL r5_8, i5_8; + REAL r5_24, i5_24; + r5_8 = c_re(in[8]); + i5_8 = c_im(in[8]); + r5_24 = c_re(in[24]); + i5_24 = c_im(in[24]); + r4_8 = (r5_8 + r5_24); + i4_8 = (i5_8 + i5_24); + r4_24 = (r5_8 - r5_24); + i4_24 = (i5_8 - i5_24); + } + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_16 = (r4_0 - r4_8); + i3_16 = (i4_0 - i4_8); + r3_8 = (r4_16 + i4_24); + i3_8 = (i4_16 - r4_24); + r3_24 = (r4_16 - i4_24); + i3_24 = (i4_16 + r4_24); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + REAL r4_20, i4_20; + REAL r4_28, i4_28; + { + REAL r5_4, i5_4; + REAL r5_20, i5_20; + r5_4 = c_re(in[4]); + i5_4 = c_im(in[4]); + r5_20 = c_re(in[20]); + i5_20 = c_im(in[20]); + r4_4 = (r5_4 + r5_20); + i4_4 = (i5_4 + i5_20); + r4_20 = (r5_4 - r5_20); + i4_20 = (i5_4 - i5_20); + } + { + REAL r5_12, i5_12; + REAL r5_28, i5_28; + r5_12 = c_re(in[12]); + i5_12 = c_im(in[12]); + r5_28 = c_re(in[28]); + i5_28 = c_im(in[28]); + r4_12 = (r5_12 + r5_28); + i4_12 = (i5_12 + i5_28); + r4_28 = (r5_12 - r5_28); + i4_28 = (i5_12 - i5_28); + } + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_20 = (r4_4 - r4_12); + i3_20 = (i4_4 - i4_12); + r3_12 = (r4_20 + i4_28); + i3_12 = (i4_20 - r4_28); + r3_28 = (r4_20 - i4_28); + i3_28 = (i4_20 + r4_28); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_16 = (r3_0 - r3_4); + i2_16 = (i3_0 - i3_4); + tmpr = (0.707106781187 * (r3_12 + i3_12)); + tmpi = (0.707106781187 * (i3_12 - r3_12)); + r2_4 = (r3_8 + tmpr); + i2_4 = (i3_8 + tmpi); + r2_20 = (r3_8 - tmpr); + i2_20 = (i3_8 - tmpi); + r2_8 = (r3_16 + i3_20); + i2_8 = (i3_16 - r3_20); + r2_24 = 
(r3_16 - i3_20); + i2_24 = (i3_16 + r3_20); + tmpr = (0.707106781187 * (i3_28 - r3_28)); + tmpi = (0.707106781187 * (r3_28 + i3_28)); + r2_12 = (r3_24 + tmpr); + i2_12 = (i3_24 - tmpi); + r2_28 = (r3_24 - tmpr); + i2_28 = (i3_24 + tmpi); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + REAL r3_18, i3_18; + REAL r3_22, i3_22; + REAL r3_26, i3_26; + REAL r3_30, i3_30; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + REAL r4_18, i4_18; + REAL r4_26, i4_26; + { + REAL r5_2, i5_2; + REAL r5_18, i5_18; + r5_2 = c_re(in[2]); + i5_2 = c_im(in[2]); + r5_18 = c_re(in[18]); + i5_18 = c_im(in[18]); + r4_2 = (r5_2 + r5_18); + i4_2 = (i5_2 + i5_18); + r4_18 = (r5_2 - r5_18); + i4_18 = (i5_2 - i5_18); + } + { + REAL r5_10, i5_10; + REAL r5_26, i5_26; + r5_10 = c_re(in[10]); + i5_10 = c_im(in[10]); + r5_26 = c_re(in[26]); + i5_26 = c_im(in[26]); + r4_10 = (r5_10 + r5_26); + i4_10 = (i5_10 + i5_26); + r4_26 = (r5_10 - r5_26); + i4_26 = (i5_10 - i5_26); + } + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_18 = (r4_2 - r4_10); + i3_18 = (i4_2 - i4_10); + r3_10 = (r4_18 + i4_26); + i3_10 = (i4_18 - r4_26); + r3_26 = (r4_18 - i4_26); + i3_26 = (i4_18 + r4_26); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + REAL r4_22, i4_22; + REAL r4_30, i4_30; + { + REAL r5_6, i5_6; + REAL r5_22, i5_22; + r5_6 = c_re(in[6]); + i5_6 = c_im(in[6]); + r5_22 = c_re(in[22]); + i5_22 = c_im(in[22]); + r4_6 = (r5_6 + r5_22); + i4_6 = (i5_6 + i5_22); + r4_22 = (r5_6 - r5_22); + i4_22 = (i5_6 - i5_22); + } + { + REAL r5_14, i5_14; + REAL r5_30, i5_30; + r5_14 = c_re(in[14]); + i5_14 = c_im(in[14]); + r5_30 = c_re(in[30]); + i5_30 = c_im(in[30]); + r4_14 = (r5_14 + r5_30); + i4_14 = (i5_14 + i5_30); + r4_30 = (r5_14 - r5_30); + i4_30 = (i5_14 - i5_30); + } + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_22 = (r4_6 - r4_14); + i3_22 = (i4_6 - i4_14); + r3_14 = (r4_22 + i4_30); + i3_14 = (i4_22 - r4_30); + r3_30 = (r4_22 - i4_30); + i3_30 = (i4_22 + r4_30); + 
} + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_18 = (r3_2 - r3_6); + i2_18 = (i3_2 - i3_6); + tmpr = (0.707106781187 * (r3_14 + i3_14)); + tmpi = (0.707106781187 * (i3_14 - r3_14)); + r2_6 = (r3_10 + tmpr); + i2_6 = (i3_10 + tmpi); + r2_22 = (r3_10 - tmpr); + i2_22 = (i3_10 - tmpi); + r2_10 = (r3_18 + i3_22); + i2_10 = (i3_18 - r3_22); + r2_26 = (r3_18 - i3_22); + i2_26 = (i3_18 + r3_22); + tmpr = (0.707106781187 * (i3_30 - r3_30)); + tmpi = (0.707106781187 * (r3_30 + i3_30)); + r2_14 = (r3_26 + tmpr); + i2_14 = (i3_26 - tmpi); + r2_30 = (r3_26 - tmpr); + i2_30 = (i3_26 + tmpi); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_16 = (r2_0 - r2_2); + i1_16 = (i2_0 - i2_2); + tmpr = ((0.923879532511 * r2_6) + (0.382683432365 * i2_6)); + tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_18 = (r2_4 - tmpr); + i1_18 = (i2_4 - tmpi); + tmpr = (0.707106781187 * (r2_10 + i2_10)); + tmpi = (0.707106781187 * (i2_10 - r2_10)); + r1_4 = (r2_8 + tmpr); + i1_4 = (i2_8 + tmpi); + r1_20 = (r2_8 - tmpr); + i1_20 = (i2_8 - tmpi); + tmpr = ((0.382683432365 * r2_14) + (0.923879532511 * i2_14)); + tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 + tmpi); + r1_22 = (r2_12 - tmpr); + i1_22 = (i2_12 - tmpi); + r1_8 = (r2_16 + i2_18); + i1_8 = (i2_16 - r2_18); + r1_24 = (r2_16 - i2_18); + i1_24 = (i2_16 + r2_18); + tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22)); + tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22)); + r1_10 = (r2_20 + tmpr); + i1_10 = (i2_20 - tmpi); + r1_26 = (r2_20 - tmpr); + i1_26 = (i2_20 + tmpi); + tmpr = (0.707106781187 * (i2_26 - r2_26)); + tmpi = (0.707106781187 * (r2_26 + i2_26)); + r1_12 = (r2_24 + tmpr); + i1_12 = (i2_24 - tmpi); + r1_28 = (r2_24 - tmpr); + i1_28 = (i2_24 + tmpi); + tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30)); + tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30)); + 
r1_14 = (r2_28 + tmpr); + i1_14 = (i2_28 - tmpi); + r1_30 = (r2_28 - tmpr); + i1_30 = (i2_28 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + REAL r2_17, i2_17; + REAL r2_19, i2_19; + REAL r2_21, i2_21; + REAL r2_23, i2_23; + REAL r2_25, i2_25; + REAL r2_27, i2_27; + REAL r2_29, i2_29; + REAL r2_31, i2_31; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + REAL r3_17, i3_17; + REAL r3_21, i3_21; + REAL r3_25, i3_25; + REAL r3_29, i3_29; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + REAL r4_17, i4_17; + REAL r4_25, i4_25; + { + REAL r5_1, i5_1; + REAL r5_17, i5_17; + r5_1 = c_re(in[1]); + i5_1 = c_im(in[1]); + r5_17 = c_re(in[17]); + i5_17 = c_im(in[17]); + r4_1 = (r5_1 + r5_17); + i4_1 = (i5_1 + i5_17); + r4_17 = (r5_1 - r5_17); + i4_17 = (i5_1 - i5_17); + } + { + REAL r5_9, i5_9; + REAL r5_25, i5_25; + r5_9 = c_re(in[9]); + i5_9 = c_im(in[9]); + r5_25 = c_re(in[25]); + i5_25 = c_im(in[25]); + r4_9 = (r5_9 + r5_25); + i4_9 = (i5_9 + i5_25); + r4_25 = (r5_9 - r5_25); + i4_25 = (i5_9 - i5_25); + } + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_17 = (r4_1 - r4_9); + i3_17 = (i4_1 - i4_9); + r3_9 = (r4_17 + i4_25); + i3_9 = (i4_17 - r4_25); + r3_25 = (r4_17 - i4_25); + i3_25 = (i4_17 + r4_25); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + REAL r4_21, i4_21; + REAL r4_29, i4_29; + { + REAL r5_5, i5_5; + REAL r5_21, i5_21; + r5_5 = c_re(in[5]); + i5_5 = c_im(in[5]); + r5_21 = c_re(in[21]); + i5_21 = c_im(in[21]); + r4_5 = (r5_5 + r5_21); + i4_5 = (i5_5 + i5_21); + r4_21 = (r5_5 - r5_21); + i4_21 = (i5_5 - i5_21); + } + { + REAL r5_13, i5_13; + REAL r5_29, i5_29; + r5_13 = c_re(in[13]); + i5_13 = c_im(in[13]); + r5_29 = c_re(in[29]); + i5_29 = c_im(in[29]); + r4_13 = (r5_13 + r5_29); + i4_13 = (i5_13 + i5_29); + r4_29 = (r5_13 - r5_29); + i4_29 = (i5_13 - i5_29); + } + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 
+ i4_13); + r3_21 = (r4_5 - r4_13); + i3_21 = (i4_5 - i4_13); + r3_13 = (r4_21 + i4_29); + i3_13 = (i4_21 - r4_29); + r3_29 = (r4_21 - i4_29); + i3_29 = (i4_21 + r4_29); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_17 = (r3_1 - r3_5); + i2_17 = (i3_1 - i3_5); + tmpr = (0.707106781187 * (r3_13 + i3_13)); + tmpi = (0.707106781187 * (i3_13 - r3_13)); + r2_5 = (r3_9 + tmpr); + i2_5 = (i3_9 + tmpi); + r2_21 = (r3_9 - tmpr); + i2_21 = (i3_9 - tmpi); + r2_9 = (r3_17 + i3_21); + i2_9 = (i3_17 - r3_21); + r2_25 = (r3_17 - i3_21); + i2_25 = (i3_17 + r3_21); + tmpr = (0.707106781187 * (i3_29 - r3_29)); + tmpi = (0.707106781187 * (r3_29 + i3_29)); + r2_13 = (r3_25 + tmpr); + i2_13 = (i3_25 - tmpi); + r2_29 = (r3_25 - tmpr); + i2_29 = (i3_25 + tmpi); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + REAL r3_19, i3_19; + REAL r3_23, i3_23; + REAL r3_27, i3_27; + REAL r3_31, i3_31; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + REAL r4_19, i4_19; + REAL r4_27, i4_27; + { + REAL r5_3, i5_3; + REAL r5_19, i5_19; + r5_3 = c_re(in[3]); + i5_3 = c_im(in[3]); + r5_19 = c_re(in[19]); + i5_19 = c_im(in[19]); + r4_3 = (r5_3 + r5_19); + i4_3 = (i5_3 + i5_19); + r4_19 = (r5_3 - r5_19); + i4_19 = (i5_3 - i5_19); + } + { + REAL r5_11, i5_11; + REAL r5_27, i5_27; + r5_11 = c_re(in[11]); + i5_11 = c_im(in[11]); + r5_27 = c_re(in[27]); + i5_27 = c_im(in[27]); + r4_11 = (r5_11 + r5_27); + i4_11 = (i5_11 + i5_27); + r4_27 = (r5_11 - r5_27); + i4_27 = (i5_11 - i5_27); + } + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_19 = (r4_3 - r4_11); + i3_19 = (i4_3 - i4_11); + r3_11 = (r4_19 + i4_27); + i3_11 = (i4_19 - r4_27); + r3_27 = (r4_19 - i4_27); + i3_27 = (i4_19 + r4_27); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + REAL r4_23, i4_23; + REAL r4_31, i4_31; + { + REAL r5_7, i5_7; + REAL r5_23, i5_23; + r5_7 = c_re(in[7]); + i5_7 = c_im(in[7]); + r5_23 = c_re(in[23]); + i5_23 = c_im(in[23]); + r4_7 = (r5_7 + r5_23); + i4_7 = (i5_7 + i5_23); 
+ r4_23 = (r5_7 - r5_23); + i4_23 = (i5_7 - i5_23); + } + { + REAL r5_15, i5_15; + REAL r5_31, i5_31; + r5_15 = c_re(in[15]); + i5_15 = c_im(in[15]); + r5_31 = c_re(in[31]); + i5_31 = c_im(in[31]); + r4_15 = (r5_15 + r5_31); + i4_15 = (i5_15 + i5_31); + r4_31 = (r5_15 - r5_31); + i4_31 = (i5_15 - i5_31); + } + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_23 = (r4_7 - r4_15); + i3_23 = (i4_7 - i4_15); + r3_15 = (r4_23 + i4_31); + i3_15 = (i4_23 - r4_31); + r3_31 = (r4_23 - i4_31); + i3_31 = (i4_23 + r4_31); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_19 = (r3_3 - r3_7); + i2_19 = (i3_3 - i3_7); + tmpr = (0.707106781187 * (r3_15 + i3_15)); + tmpi = (0.707106781187 * (i3_15 - r3_15)); + r2_7 = (r3_11 + tmpr); + i2_7 = (i3_11 + tmpi); + r2_23 = (r3_11 - tmpr); + i2_23 = (i3_11 - tmpi); + r2_11 = (r3_19 + i3_23); + i2_11 = (i3_19 - r3_23); + r2_27 = (r3_19 - i3_23); + i2_27 = (i3_19 + r3_23); + tmpr = (0.707106781187 * (i3_31 - r3_31)); + tmpi = (0.707106781187 * (r3_31 + i3_31)); + r2_15 = (r3_27 + tmpr); + i2_15 = (i3_27 - tmpi); + r2_31 = (r3_27 - tmpr); + i2_31 = (i3_27 + tmpi); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_17 = (r2_1 - r2_3); + i1_17 = (i2_1 - i2_3); + tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7)); + tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_19 = (r2_5 - tmpr); + i1_19 = (i2_5 - tmpi); + tmpr = (0.707106781187 * (r2_11 + i2_11)); + tmpi = (0.707106781187 * (i2_11 - r2_11)); + r1_5 = (r2_9 + tmpr); + i1_5 = (i2_9 + tmpi); + r1_21 = (r2_9 - tmpr); + i1_21 = (i2_9 - tmpi); + tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15)); + tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 + tmpi); + r1_23 = (r2_13 - tmpr); + i1_23 = (i2_13 - tmpi); + r1_9 = (r2_17 + i2_19); + i1_9 = (i2_17 - r2_19); + r1_25 = (r2_17 - i2_19); + i1_25 = (i2_17 + r2_19); + tmpr = ((0.923879532511 * i2_23) 
- (0.382683432365 * r2_23)); + tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23)); + r1_11 = (r2_21 + tmpr); + i1_11 = (i2_21 - tmpi); + r1_27 = (r2_21 - tmpr); + i1_27 = (i2_21 + tmpi); + tmpr = (0.707106781187 * (i2_27 - r2_27)); + tmpi = (0.707106781187 * (r2_27 + i2_27)); + r1_13 = (r2_25 + tmpr); + i1_13 = (i2_25 - tmpi); + r1_29 = (r2_25 - tmpr); + i1_29 = (i2_25 + tmpi); + tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31)); + tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31)); + r1_15 = (r2_29 + tmpr); + i1_15 = (i2_29 - tmpi); + r1_31 = (r2_29 - tmpr); + i1_31 = (i2_29 + tmpi); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[16]) = (r1_0 - r1_1); + c_im(out[16]) = (i1_0 - i1_1); + tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3)); + tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3)); + c_re(out[1]) = (r1_2 + tmpr); + c_im(out[1]) = (i1_2 + tmpi); + c_re(out[17]) = (r1_2 - tmpr); + c_im(out[17]) = (i1_2 - tmpi); + tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5)); + tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5)); + c_re(out[2]) = (r1_4 + tmpr); + c_im(out[2]) = (i1_4 + tmpi); + c_re(out[18]) = (r1_4 - tmpr); + c_im(out[18]) = (i1_4 - tmpi); + tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7)); + tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7)); + c_re(out[3]) = (r1_6 + tmpr); + c_im(out[3]) = (i1_6 + tmpi); + c_re(out[19]) = (r1_6 - tmpr); + c_im(out[19]) = (i1_6 - tmpi); + tmpr = (0.707106781187 * (r1_9 + i1_9)); + tmpi = (0.707106781187 * (i1_9 - r1_9)); + c_re(out[4]) = (r1_8 + tmpr); + c_im(out[4]) = (i1_8 + tmpi); + c_re(out[20]) = (r1_8 - tmpr); + c_im(out[20]) = (i1_8 - tmpi); + tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11)); + tmpi = ((0.55557023302 * i1_11) - (0.831469612303 * r1_11)); + c_re(out[5]) = (r1_10 + tmpr); + c_im(out[5]) = (i1_10 + tmpi); + c_re(out[21]) = (r1_10 - tmpr); + c_im(out[21]) = (i1_10 - tmpi); + tmpr 
= ((0.382683432365 * r1_13) + (0.923879532511 * i1_13)); + tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13)); + c_re(out[6]) = (r1_12 + tmpr); + c_im(out[6]) = (i1_12 + tmpi); + c_re(out[22]) = (r1_12 - tmpr); + c_im(out[22]) = (i1_12 - tmpi); + tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15)); + tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15)); + c_re(out[7]) = (r1_14 + tmpr); + c_im(out[7]) = (i1_14 + tmpi); + c_re(out[23]) = (r1_14 - tmpr); + c_im(out[23]) = (i1_14 - tmpi); + c_re(out[8]) = (r1_16 + i1_17); + c_im(out[8]) = (i1_16 - r1_17); + c_re(out[24]) = (r1_16 - i1_17); + c_im(out[24]) = (i1_16 + r1_17); + tmpr = ((0.980785280403 * i1_19) - (0.195090322016 * r1_19)); + tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19)); + c_re(out[9]) = (r1_18 + tmpr); + c_im(out[9]) = (i1_18 - tmpi); + c_re(out[25]) = (r1_18 - tmpr); + c_im(out[25]) = (i1_18 + tmpi); + tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21)); + tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21)); + c_re(out[10]) = (r1_20 + tmpr); + c_im(out[10]) = (i1_20 - tmpi); + c_re(out[26]) = (r1_20 - tmpr); + c_im(out[26]) = (i1_20 + tmpi); + tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23)); + tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23)); + c_re(out[11]) = (r1_22 + tmpr); + c_im(out[11]) = (i1_22 - tmpi); + c_re(out[27]) = (r1_22 - tmpr); + c_im(out[27]) = (i1_22 + tmpi); + tmpr = (0.707106781187 * (i1_25 - r1_25)); + tmpi = (0.707106781187 * (r1_25 + i1_25)); + c_re(out[12]) = (r1_24 + tmpr); + c_im(out[12]) = (i1_24 - tmpi); + c_re(out[28]) = (r1_24 - tmpr); + c_im(out[28]) = (i1_24 + tmpi); + tmpr = ((0.55557023302 * i1_27) - (0.831469612303 * r1_27)); + tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27)); + c_re(out[13]) = (r1_26 + tmpr); + c_im(out[13]) = (i1_26 - tmpi); + c_re(out[29]) = (r1_26 - tmpr); + c_im(out[29]) = (i1_26 + tmpi); + tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29)); + 
tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29)); + c_re(out[14]) = (r1_28 + tmpr); + c_im(out[14]) = (i1_28 - tmpi); + c_re(out[30]) = (r1_28 - tmpr); + c_im(out[30]) = (i1_28 + tmpi); + tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31)); + tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31)); + c_re(out[15]) = (r1_30 + tmpr); + c_im(out[15]) = (i1_30 - tmpi); + c_re(out[31]) = (r1_30 - tmpr); + c_im(out[31]) = (i1_30 + tmpi); + } +} +void fft_twiddle_32(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + REAL r1_16, i1_16; + REAL r1_17, i1_17; + REAL r1_18, i1_18; + REAL r1_19, i1_19; + REAL r1_20, i1_20; + REAL r1_21, i1_21; + REAL r1_22, i1_22; + REAL r1_23, i1_23; + REAL r1_24, i1_24; + REAL r1_25, i1_25; + REAL r1_26, i1_26; + REAL r1_27, i1_27; + REAL r1_28, i1_28; + REAL r1_29, i1_29; + REAL r1_30, i1_30; + REAL r1_31, i1_31; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + REAL r2_16, i2_16; + REAL r2_18, i2_18; + REAL r2_20, i2_20; + REAL r2_22, i2_22; + REAL r2_24, i2_24; + REAL r2_26, i2_26; + REAL r2_28, i2_28; + REAL r2_30, i2_30; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + REAL r3_16, i3_16; + REAL r3_20, i3_20; + REAL r3_24, i3_24; + REAL r3_28, i3_28; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + REAL r4_16, i4_16; + REAL r4_24, i4_24; + { + REAL 
r5_0, i5_0; + REAL r5_16, i5_16; + r5_0 = c_re(jp[0 * m]); + i5_0 = c_im(jp[0 * m]); + wr = c_re(W[16 * l1]); + wi = c_im(W[16 * l1]); + tmpr = c_re(jp[16 * m]); + tmpi = c_im(jp[16 * m]); + r5_16 = ((wr * tmpr) - (wi * tmpi)); + i5_16 = ((wi * tmpr) + (wr * tmpi)); + r4_0 = (r5_0 + r5_16); + i4_0 = (i5_0 + i5_16); + r4_16 = (r5_0 - r5_16); + i4_16 = (i5_0 - i5_16); + } + { + REAL r5_8, i5_8; + REAL r5_24, i5_24; + wr = c_re(W[8 * l1]); + wi = c_im(W[8 * l1]); + tmpr = c_re(jp[8 * m]); + tmpi = c_im(jp[8 * m]); + r5_8 = ((wr * tmpr) - (wi * tmpi)); + i5_8 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[24 * l1]); + wi = c_im(W[24 * l1]); + tmpr = c_re(jp[24 * m]); + tmpi = c_im(jp[24 * m]); + r5_24 = ((wr * tmpr) - (wi * tmpi)); + i5_24 = ((wi * tmpr) + (wr * tmpi)); + r4_8 = (r5_8 + r5_24); + i4_8 = (i5_8 + i5_24); + r4_24 = (r5_8 - r5_24); + i4_24 = (i5_8 - i5_24); + } + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_16 = (r4_0 - r4_8); + i3_16 = (i4_0 - i4_8); + r3_8 = (r4_16 + i4_24); + i3_8 = (i4_16 - r4_24); + r3_24 = (r4_16 - i4_24); + i3_24 = (i4_16 + r4_24); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + REAL r4_20, i4_20; + REAL r4_28, i4_28; + { + REAL r5_4, i5_4; + REAL r5_20, i5_20; + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r5_4 = ((wr * tmpr) - (wi * tmpi)); + i5_4 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[20 * l1]); + wi = c_im(W[20 * l1]); + tmpr = c_re(jp[20 * m]); + tmpi = c_im(jp[20 * m]); + r5_20 = ((wr * tmpr) - (wi * tmpi)); + i5_20 = ((wi * tmpr) + (wr * tmpi)); + r4_4 = (r5_4 + r5_20); + i4_4 = (i5_4 + i5_20); + r4_20 = (r5_4 - r5_20); + i4_20 = (i5_4 - i5_20); + } + { + REAL r5_12, i5_12; + REAL r5_28, i5_28; + wr = c_re(W[12 * l1]); + wi = c_im(W[12 * l1]); + tmpr = c_re(jp[12 * m]); + tmpi = c_im(jp[12 * m]); + r5_12 = ((wr * tmpr) - (wi * tmpi)); + i5_12 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[28 * l1]); + wi = c_im(W[28 * l1]); + tmpr = c_re(jp[28 * m]); + tmpi = 
c_im(jp[28 * m]); + r5_28 = ((wr * tmpr) - (wi * tmpi)); + i5_28 = ((wi * tmpr) + (wr * tmpi)); + r4_12 = (r5_12 + r5_28); + i4_12 = (i5_12 + i5_28); + r4_28 = (r5_12 - r5_28); + i4_28 = (i5_12 - i5_28); + } + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_20 = (r4_4 - r4_12); + i3_20 = (i4_4 - i4_12); + r3_12 = (r4_20 + i4_28); + i3_12 = (i4_20 - r4_28); + r3_28 = (r4_20 - i4_28); + i3_28 = (i4_20 + r4_28); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_16 = (r3_0 - r3_4); + i2_16 = (i3_0 - i3_4); + tmpr = (0.707106781187 * (r3_12 + i3_12)); + tmpi = (0.707106781187 * (i3_12 - r3_12)); + r2_4 = (r3_8 + tmpr); + i2_4 = (i3_8 + tmpi); + r2_20 = (r3_8 - tmpr); + i2_20 = (i3_8 - tmpi); + r2_8 = (r3_16 + i3_20); + i2_8 = (i3_16 - r3_20); + r2_24 = (r3_16 - i3_20); + i2_24 = (i3_16 + r3_20); + tmpr = (0.707106781187 * (i3_28 - r3_28)); + tmpi = (0.707106781187 * (r3_28 + i3_28)); + r2_12 = (r3_24 + tmpr); + i2_12 = (i3_24 - tmpi); + r2_28 = (r3_24 - tmpr); + i2_28 = (i3_24 + tmpi); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + REAL r3_18, i3_18; + REAL r3_22, i3_22; + REAL r3_26, i3_26; + REAL r3_30, i3_30; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + REAL r4_18, i4_18; + REAL r4_26, i4_26; + { + REAL r5_2, i5_2; + REAL r5_18, i5_18; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r5_2 = ((wr * tmpr) - (wi * tmpi)); + i5_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[18 * l1]); + wi = c_im(W[18 * l1]); + tmpr = c_re(jp[18 * m]); + tmpi = c_im(jp[18 * m]); + r5_18 = ((wr * tmpr) - (wi * tmpi)); + i5_18 = ((wi * tmpr) + (wr * tmpi)); + r4_2 = (r5_2 + r5_18); + i4_2 = (i5_2 + i5_18); + r4_18 = (r5_2 - r5_18); + i4_18 = (i5_2 - i5_18); + } + { + REAL r5_10, i5_10; + REAL r5_26, i5_26; + wr = c_re(W[10 * l1]); + wi = c_im(W[10 * l1]); + tmpr = c_re(jp[10 * m]); + tmpi = c_im(jp[10 * m]); + r5_10 = ((wr * tmpr) - (wi * tmpi)); + i5_10 = ((wi * tmpr) + (wr * 
tmpi)); + wr = c_re(W[26 * l1]); + wi = c_im(W[26 * l1]); + tmpr = c_re(jp[26 * m]); + tmpi = c_im(jp[26 * m]); + r5_26 = ((wr * tmpr) - (wi * tmpi)); + i5_26 = ((wi * tmpr) + (wr * tmpi)); + r4_10 = (r5_10 + r5_26); + i4_10 = (i5_10 + i5_26); + r4_26 = (r5_10 - r5_26); + i4_26 = (i5_10 - i5_26); + } + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_18 = (r4_2 - r4_10); + i3_18 = (i4_2 - i4_10); + r3_10 = (r4_18 + i4_26); + i3_10 = (i4_18 - r4_26); + r3_26 = (r4_18 - i4_26); + i3_26 = (i4_18 + r4_26); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + REAL r4_22, i4_22; + REAL r4_30, i4_30; + { + REAL r5_6, i5_6; + REAL r5_22, i5_22; + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r5_6 = ((wr * tmpr) - (wi * tmpi)); + i5_6 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[22 * l1]); + wi = c_im(W[22 * l1]); + tmpr = c_re(jp[22 * m]); + tmpi = c_im(jp[22 * m]); + r5_22 = ((wr * tmpr) - (wi * tmpi)); + i5_22 = ((wi * tmpr) + (wr * tmpi)); + r4_6 = (r5_6 + r5_22); + i4_6 = (i5_6 + i5_22); + r4_22 = (r5_6 - r5_22); + i4_22 = (i5_6 - i5_22); + } + { + REAL r5_14, i5_14; + REAL r5_30, i5_30; + wr = c_re(W[14 * l1]); + wi = c_im(W[14 * l1]); + tmpr = c_re(jp[14 * m]); + tmpi = c_im(jp[14 * m]); + r5_14 = ((wr * tmpr) - (wi * tmpi)); + i5_14 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[30 * l1]); + wi = c_im(W[30 * l1]); + tmpr = c_re(jp[30 * m]); + tmpi = c_im(jp[30 * m]); + r5_30 = ((wr * tmpr) - (wi * tmpi)); + i5_30 = ((wi * tmpr) + (wr * tmpi)); + r4_14 = (r5_14 + r5_30); + i4_14 = (i5_14 + i5_30); + r4_30 = (r5_14 - r5_30); + i4_30 = (i5_14 - i5_30); + } + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_22 = (r4_6 - r4_14); + i3_22 = (i4_6 - i4_14); + r3_14 = (r4_22 + i4_30); + i3_14 = (i4_22 - r4_30); + r3_30 = (r4_22 - i4_30); + i3_30 = (i4_22 + r4_30); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_18 = (r3_2 - r3_6); + i2_18 = (i3_2 - i3_6); + tmpr = (0.707106781187 * (r3_14 + i3_14)); + tmpi = 
(0.707106781187 * (i3_14 - r3_14)); + r2_6 = (r3_10 + tmpr); + i2_6 = (i3_10 + tmpi); + r2_22 = (r3_10 - tmpr); + i2_22 = (i3_10 - tmpi); + r2_10 = (r3_18 + i3_22); + i2_10 = (i3_18 - r3_22); + r2_26 = (r3_18 - i3_22); + i2_26 = (i3_18 + r3_22); + tmpr = (0.707106781187 * (i3_30 - r3_30)); + tmpi = (0.707106781187 * (r3_30 + i3_30)); + r2_14 = (r3_26 + tmpr); + i2_14 = (i3_26 - tmpi); + r2_30 = (r3_26 - tmpr); + i2_30 = (i3_26 + tmpi); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_16 = (r2_0 - r2_2); + i1_16 = (i2_0 - i2_2); + tmpr = ((0.923879532511 * r2_6) + (0.382683432365 * i2_6)); + tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_18 = (r2_4 - tmpr); + i1_18 = (i2_4 - tmpi); + tmpr = (0.707106781187 * (r2_10 + i2_10)); + tmpi = (0.707106781187 * (i2_10 - r2_10)); + r1_4 = (r2_8 + tmpr); + i1_4 = (i2_8 + tmpi); + r1_20 = (r2_8 - tmpr); + i1_20 = (i2_8 - tmpi); + tmpr = ((0.382683432365 * r2_14) + (0.923879532511 * i2_14)); + tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 + tmpi); + r1_22 = (r2_12 - tmpr); + i1_22 = (i2_12 - tmpi); + r1_8 = (r2_16 + i2_18); + i1_8 = (i2_16 - r2_18); + r1_24 = (r2_16 - i2_18); + i1_24 = (i2_16 + r2_18); + tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22)); + tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22)); + r1_10 = (r2_20 + tmpr); + i1_10 = (i2_20 - tmpi); + r1_26 = (r2_20 - tmpr); + i1_26 = (i2_20 + tmpi); + tmpr = (0.707106781187 * (i2_26 - r2_26)); + tmpi = (0.707106781187 * (r2_26 + i2_26)); + r1_12 = (r2_24 + tmpr); + i1_12 = (i2_24 - tmpi); + r1_28 = (r2_24 - tmpr); + i1_28 = (i2_24 + tmpi); + tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30)); + tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30)); + r1_14 = (r2_28 + tmpr); + i1_14 = (i2_28 - tmpi); + r1_30 = (r2_28 - tmpr); + i1_30 = (i2_28 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL 
r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + REAL r2_17, i2_17; + REAL r2_19, i2_19; + REAL r2_21, i2_21; + REAL r2_23, i2_23; + REAL r2_25, i2_25; + REAL r2_27, i2_27; + REAL r2_29, i2_29; + REAL r2_31, i2_31; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + REAL r3_17, i3_17; + REAL r3_21, i3_21; + REAL r3_25, i3_25; + REAL r3_29, i3_29; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + REAL r4_17, i4_17; + REAL r4_25, i4_25; + { + REAL r5_1, i5_1; + REAL r5_17, i5_17; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r5_1 = ((wr * tmpr) - (wi * tmpi)); + i5_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[17 * l1]); + wi = c_im(W[17 * l1]); + tmpr = c_re(jp[17 * m]); + tmpi = c_im(jp[17 * m]); + r5_17 = ((wr * tmpr) - (wi * tmpi)); + i5_17 = ((wi * tmpr) + (wr * tmpi)); + r4_1 = (r5_1 + r5_17); + i4_1 = (i5_1 + i5_17); + r4_17 = (r5_1 - r5_17); + i4_17 = (i5_1 - i5_17); + } + { + REAL r5_9, i5_9; + REAL r5_25, i5_25; + wr = c_re(W[9 * l1]); + wi = c_im(W[9 * l1]); + tmpr = c_re(jp[9 * m]); + tmpi = c_im(jp[9 * m]); + r5_9 = ((wr * tmpr) - (wi * tmpi)); + i5_9 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[25 * l1]); + wi = c_im(W[25 * l1]); + tmpr = c_re(jp[25 * m]); + tmpi = c_im(jp[25 * m]); + r5_25 = ((wr * tmpr) - (wi * tmpi)); + i5_25 = ((wi * tmpr) + (wr * tmpi)); + r4_9 = (r5_9 + r5_25); + i4_9 = (i5_9 + i5_25); + r4_25 = (r5_9 - r5_25); + i4_25 = (i5_9 - i5_25); + } + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_17 = (r4_1 - r4_9); + i3_17 = (i4_1 - i4_9); + r3_9 = (r4_17 + i4_25); + i3_9 = (i4_17 - r4_25); + r3_25 = (r4_17 - i4_25); + i3_25 = (i4_17 + r4_25); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + REAL r4_21, i4_21; + REAL r4_29, i4_29; + { + REAL r5_5, i5_5; + REAL r5_21, i5_21; + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r5_5 = ((wr 
* tmpr) - (wi * tmpi)); + i5_5 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[21 * l1]); + wi = c_im(W[21 * l1]); + tmpr = c_re(jp[21 * m]); + tmpi = c_im(jp[21 * m]); + r5_21 = ((wr * tmpr) - (wi * tmpi)); + i5_21 = ((wi * tmpr) + (wr * tmpi)); + r4_5 = (r5_5 + r5_21); + i4_5 = (i5_5 + i5_21); + r4_21 = (r5_5 - r5_21); + i4_21 = (i5_5 - i5_21); + } + { + REAL r5_13, i5_13; + REAL r5_29, i5_29; + wr = c_re(W[13 * l1]); + wi = c_im(W[13 * l1]); + tmpr = c_re(jp[13 * m]); + tmpi = c_im(jp[13 * m]); + r5_13 = ((wr * tmpr) - (wi * tmpi)); + i5_13 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[29 * l1]); + wi = c_im(W[29 * l1]); + tmpr = c_re(jp[29 * m]); + tmpi = c_im(jp[29 * m]); + r5_29 = ((wr * tmpr) - (wi * tmpi)); + i5_29 = ((wi * tmpr) + (wr * tmpi)); + r4_13 = (r5_13 + r5_29); + i4_13 = (i5_13 + i5_29); + r4_29 = (r5_13 - r5_29); + i4_29 = (i5_13 - i5_29); + } + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_21 = (r4_5 - r4_13); + i3_21 = (i4_5 - i4_13); + r3_13 = (r4_21 + i4_29); + i3_13 = (i4_21 - r4_29); + r3_29 = (r4_21 - i4_29); + i3_29 = (i4_21 + r4_29); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_17 = (r3_1 - r3_5); + i2_17 = (i3_1 - i3_5); + tmpr = (0.707106781187 * (r3_13 + i3_13)); + tmpi = (0.707106781187 * (i3_13 - r3_13)); + r2_5 = (r3_9 + tmpr); + i2_5 = (i3_9 + tmpi); + r2_21 = (r3_9 - tmpr); + i2_21 = (i3_9 - tmpi); + r2_9 = (r3_17 + i3_21); + i2_9 = (i3_17 - r3_21); + r2_25 = (r3_17 - i3_21); + i2_25 = (i3_17 + r3_21); + tmpr = (0.707106781187 * (i3_29 - r3_29)); + tmpi = (0.707106781187 * (r3_29 + i3_29)); + r2_13 = (r3_25 + tmpr); + i2_13 = (i3_25 - tmpi); + r2_29 = (r3_25 - tmpr); + i2_29 = (i3_25 + tmpi); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + REAL r3_19, i3_19; + REAL r3_23, i3_23; + REAL r3_27, i3_27; + REAL r3_31, i3_31; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + REAL r4_19, i4_19; + REAL r4_27, i4_27; + { + REAL r5_3, i5_3; + REAL r5_19, i5_19; + wr = c_re(W[3 * l1]); + 
wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r5_3 = ((wr * tmpr) - (wi * tmpi)); + i5_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[19 * l1]); + wi = c_im(W[19 * l1]); + tmpr = c_re(jp[19 * m]); + tmpi = c_im(jp[19 * m]); + r5_19 = ((wr * tmpr) - (wi * tmpi)); + i5_19 = ((wi * tmpr) + (wr * tmpi)); + r4_3 = (r5_3 + r5_19); + i4_3 = (i5_3 + i5_19); + r4_19 = (r5_3 - r5_19); + i4_19 = (i5_3 - i5_19); + } + { + REAL r5_11, i5_11; + REAL r5_27, i5_27; + wr = c_re(W[11 * l1]); + wi = c_im(W[11 * l1]); + tmpr = c_re(jp[11 * m]); + tmpi = c_im(jp[11 * m]); + r5_11 = ((wr * tmpr) - (wi * tmpi)); + i5_11 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[27 * l1]); + wi = c_im(W[27 * l1]); + tmpr = c_re(jp[27 * m]); + tmpi = c_im(jp[27 * m]); + r5_27 = ((wr * tmpr) - (wi * tmpi)); + i5_27 = ((wi * tmpr) + (wr * tmpi)); + r4_11 = (r5_11 + r5_27); + i4_11 = (i5_11 + i5_27); + r4_27 = (r5_11 - r5_27); + i4_27 = (i5_11 - i5_27); + } + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_19 = (r4_3 - r4_11); + i3_19 = (i4_3 - i4_11); + r3_11 = (r4_19 + i4_27); + i3_11 = (i4_19 - r4_27); + r3_27 = (r4_19 - i4_27); + i3_27 = (i4_19 + r4_27); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + REAL r4_23, i4_23; + REAL r4_31, i4_31; + { + REAL r5_7, i5_7; + REAL r5_23, i5_23; + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r5_7 = ((wr * tmpr) - (wi * tmpi)); + i5_7 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[23 * l1]); + wi = c_im(W[23 * l1]); + tmpr = c_re(jp[23 * m]); + tmpi = c_im(jp[23 * m]); + r5_23 = ((wr * tmpr) - (wi * tmpi)); + i5_23 = ((wi * tmpr) + (wr * tmpi)); + r4_7 = (r5_7 + r5_23); + i4_7 = (i5_7 + i5_23); + r4_23 = (r5_7 - r5_23); + i4_23 = (i5_7 - i5_23); + } + { + REAL r5_15, i5_15; + REAL r5_31, i5_31; + wr = c_re(W[15 * l1]); + wi = c_im(W[15 * l1]); + tmpr = c_re(jp[15 * m]); + tmpi = c_im(jp[15 * m]); + r5_15 = ((wr * tmpr) - (wi * tmpi)); + i5_15 = ((wi * tmpr) + (wr * tmpi)); + 
wr = c_re(W[31 * l1]); + wi = c_im(W[31 * l1]); + tmpr = c_re(jp[31 * m]); + tmpi = c_im(jp[31 * m]); + r5_31 = ((wr * tmpr) - (wi * tmpi)); + i5_31 = ((wi * tmpr) + (wr * tmpi)); + r4_15 = (r5_15 + r5_31); + i4_15 = (i5_15 + i5_31); + r4_31 = (r5_15 - r5_31); + i4_31 = (i5_15 - i5_31); + } + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_23 = (r4_7 - r4_15); + i3_23 = (i4_7 - i4_15); + r3_15 = (r4_23 + i4_31); + i3_15 = (i4_23 - r4_31); + r3_31 = (r4_23 - i4_31); + i3_31 = (i4_23 + r4_31); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_19 = (r3_3 - r3_7); + i2_19 = (i3_3 - i3_7); + tmpr = (0.707106781187 * (r3_15 + i3_15)); + tmpi = (0.707106781187 * (i3_15 - r3_15)); + r2_7 = (r3_11 + tmpr); + i2_7 = (i3_11 + tmpi); + r2_23 = (r3_11 - tmpr); + i2_23 = (i3_11 - tmpi); + r2_11 = (r3_19 + i3_23); + i2_11 = (i3_19 - r3_23); + r2_27 = (r3_19 - i3_23); + i2_27 = (i3_19 + r3_23); + tmpr = (0.707106781187 * (i3_31 - r3_31)); + tmpi = (0.707106781187 * (r3_31 + i3_31)); + r2_15 = (r3_27 + tmpr); + i2_15 = (i3_27 - tmpi); + r2_31 = (r3_27 - tmpr); + i2_31 = (i3_27 + tmpi); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_17 = (r2_1 - r2_3); + i1_17 = (i2_1 - i2_3); + tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7)); + tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_19 = (r2_5 - tmpr); + i1_19 = (i2_5 - tmpi); + tmpr = (0.707106781187 * (r2_11 + i2_11)); + tmpi = (0.707106781187 * (i2_11 - r2_11)); + r1_5 = (r2_9 + tmpr); + i1_5 = (i2_9 + tmpi); + r1_21 = (r2_9 - tmpr); + i1_21 = (i2_9 - tmpi); + tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15)); + tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 + tmpi); + r1_23 = (r2_13 - tmpr); + i1_23 = (i2_13 - tmpi); + r1_9 = (r2_17 + i2_19); + i1_9 = (i2_17 - r2_19); + r1_25 = (r2_17 - i2_19); + i1_25 = (i2_17 + r2_19); + tmpr = ((0.923879532511 * i2_23) - (0.382683432365 
* r2_23)); + tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23)); + r1_11 = (r2_21 + tmpr); + i1_11 = (i2_21 - tmpi); + r1_27 = (r2_21 - tmpr); + i1_27 = (i2_21 + tmpi); + tmpr = (0.707106781187 * (i2_27 - r2_27)); + tmpi = (0.707106781187 * (r2_27 + i2_27)); + r1_13 = (r2_25 + tmpr); + i1_13 = (i2_25 - tmpi); + r1_29 = (r2_25 - tmpr); + i1_29 = (i2_25 + tmpi); + tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31)); + tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31)); + r1_15 = (r2_29 + tmpr); + i1_15 = (i2_29 - tmpi); + r1_31 = (r2_29 - tmpr); + i1_31 = (i2_29 + tmpi); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[16 * m]) = (r1_0 - r1_1); + c_im(kp[16 * m]) = (i1_0 - i1_1); + tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3)); + tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[17 * m]) = (r1_2 - tmpr); + c_im(kp[17 * m]) = (i1_2 - tmpi); + tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5)); + tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5)); + c_re(kp[2 * m]) = (r1_4 + tmpr); + c_im(kp[2 * m]) = (i1_4 + tmpi); + c_re(kp[18 * m]) = (r1_4 - tmpr); + c_im(kp[18 * m]) = (i1_4 - tmpi); + tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7)); + tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 + tmpi); + c_re(kp[19 * m]) = (r1_6 - tmpr); + c_im(kp[19 * m]) = (i1_6 - tmpi); + tmpr = (0.707106781187 * (r1_9 + i1_9)); + tmpi = (0.707106781187 * (i1_9 - r1_9)); + c_re(kp[4 * m]) = (r1_8 + tmpr); + c_im(kp[4 * m]) = (i1_8 + tmpi); + c_re(kp[20 * m]) = (r1_8 - tmpr); + c_im(kp[20 * m]) = (i1_8 - tmpi); + tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11)); + tmpi = ((0.55557023302 * i1_11) - (0.831469612303 * r1_11)); + c_re(kp[5 * m]) = (r1_10 + tmpr); + c_im(kp[5 * m]) = (i1_10 + tmpi); + c_re(kp[21 * m]) = 
(r1_10 - tmpr); + c_im(kp[21 * m]) = (i1_10 - tmpi); + tmpr = ((0.382683432365 * r1_13) + (0.923879532511 * i1_13)); + tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13)); + c_re(kp[6 * m]) = (r1_12 + tmpr); + c_im(kp[6 * m]) = (i1_12 + tmpi); + c_re(kp[22 * m]) = (r1_12 - tmpr); + c_im(kp[22 * m]) = (i1_12 - tmpi); + tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15)); + tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15)); + c_re(kp[7 * m]) = (r1_14 + tmpr); + c_im(kp[7 * m]) = (i1_14 + tmpi); + c_re(kp[23 * m]) = (r1_14 - tmpr); + c_im(kp[23 * m]) = (i1_14 - tmpi); + c_re(kp[8 * m]) = (r1_16 + i1_17); + c_im(kp[8 * m]) = (i1_16 - r1_17); + c_re(kp[24 * m]) = (r1_16 - i1_17); + c_im(kp[24 * m]) = (i1_16 + r1_17); + tmpr = ((0.980785280403 * i1_19) - (0.195090322016 * r1_19)); + tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19)); + c_re(kp[9 * m]) = (r1_18 + tmpr); + c_im(kp[9 * m]) = (i1_18 - tmpi); + c_re(kp[25 * m]) = (r1_18 - tmpr); + c_im(kp[25 * m]) = (i1_18 + tmpi); + tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21)); + tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21)); + c_re(kp[10 * m]) = (r1_20 + tmpr); + c_im(kp[10 * m]) = (i1_20 - tmpi); + c_re(kp[26 * m]) = (r1_20 - tmpr); + c_im(kp[26 * m]) = (i1_20 + tmpi); + tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23)); + tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23)); + c_re(kp[11 * m]) = (r1_22 + tmpr); + c_im(kp[11 * m]) = (i1_22 - tmpi); + c_re(kp[27 * m]) = (r1_22 - tmpr); + c_im(kp[27 * m]) = (i1_22 + tmpi); + tmpr = (0.707106781187 * (i1_25 - r1_25)); + tmpi = (0.707106781187 * (r1_25 + i1_25)); + c_re(kp[12 * m]) = (r1_24 + tmpr); + c_im(kp[12 * m]) = (i1_24 - tmpi); + c_re(kp[28 * m]) = (r1_24 - tmpr); + c_im(kp[28 * m]) = (i1_24 + tmpi); + tmpr = ((0.55557023302 * i1_27) - (0.831469612303 * r1_27)); + tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27)); + c_re(kp[13 * m]) = (r1_26 + tmpr); + c_im(kp[13 * m]) = 
(i1_26 - tmpi); + c_re(kp[29 * m]) = (r1_26 - tmpr); + c_im(kp[29 * m]) = (i1_26 + tmpi); + tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29)); + tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29)); + c_re(kp[14 * m]) = (r1_28 + tmpr); + c_im(kp[14 * m]) = (i1_28 - tmpi); + c_re(kp[30 * m]) = (r1_28 - tmpr); + c_im(kp[30 * m]) = (i1_28 + tmpi); + tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31)); + tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31)); + c_re(kp[15 * m]) = (r1_30 + tmpr); + c_im(kp[15 * m]) = (i1_30 - tmpi); + c_re(kp[31 * m]) = (r1_30 - tmpr); + c_im(kp[31 * m]) = (i1_30 + tmpi); + } + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_twiddle_32(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task + fft_twiddle_32(ab, b, in, out, W, nW, nWdn, m); +#else + #pragma omp task untied + fft_twiddle_32(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_32(ab, b, in, out, W, nW, nWdn, m); +#endif + #pragma omp taskwait + } +} +void fft_twiddle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + REAL r1_16, i1_16; + REAL r1_17, i1_17; + REAL r1_18, i1_18; + REAL r1_19, i1_19; + REAL r1_20, i1_20; + REAL r1_21, i1_21; + REAL r1_22, i1_22; + REAL r1_23, i1_23; + REAL r1_24, i1_24; + REAL r1_25, i1_25; + REAL r1_26, i1_26; + REAL r1_27, i1_27; + REAL r1_28, i1_28; + REAL r1_29, i1_29; + REAL r1_30, i1_30; + REAL r1_31, i1_31; + { + 
REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + REAL r2_16, i2_16; + REAL r2_18, i2_18; + REAL r2_20, i2_20; + REAL r2_22, i2_22; + REAL r2_24, i2_24; + REAL r2_26, i2_26; + REAL r2_28, i2_28; + REAL r2_30, i2_30; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + REAL r3_16, i3_16; + REAL r3_20, i3_20; + REAL r3_24, i3_24; + REAL r3_28, i3_28; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + REAL r4_16, i4_16; + REAL r4_24, i4_24; + { + REAL r5_0, i5_0; + REAL r5_16, i5_16; + r5_0 = c_re(jp[0 * m]); + i5_0 = c_im(jp[0 * m]); + wr = c_re(W[16 * l1]); + wi = c_im(W[16 * l1]); + tmpr = c_re(jp[16 * m]); + tmpi = c_im(jp[16 * m]); + r5_16 = ((wr * tmpr) - (wi * tmpi)); + i5_16 = ((wi * tmpr) + (wr * tmpi)); + r4_0 = (r5_0 + r5_16); + i4_0 = (i5_0 + i5_16); + r4_16 = (r5_0 - r5_16); + i4_16 = (i5_0 - i5_16); + } + { + REAL r5_8, i5_8; + REAL r5_24, i5_24; + wr = c_re(W[8 * l1]); + wi = c_im(W[8 * l1]); + tmpr = c_re(jp[8 * m]); + tmpi = c_im(jp[8 * m]); + r5_8 = ((wr * tmpr) - (wi * tmpi)); + i5_8 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[24 * l1]); + wi = c_im(W[24 * l1]); + tmpr = c_re(jp[24 * m]); + tmpi = c_im(jp[24 * m]); + r5_24 = ((wr * tmpr) - (wi * tmpi)); + i5_24 = ((wi * tmpr) + (wr * tmpi)); + r4_8 = (r5_8 + r5_24); + i4_8 = (i5_8 + i5_24); + r4_24 = (r5_8 - r5_24); + i4_24 = (i5_8 - i5_24); + } + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_16 = (r4_0 - r4_8); + i3_16 = (i4_0 - i4_8); + r3_8 = (r4_16 + i4_24); + i3_8 = (i4_16 - r4_24); + r3_24 = (r4_16 - i4_24); + i3_24 = (i4_16 + r4_24); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + REAL r4_20, i4_20; + REAL r4_28, i4_28; + { + REAL r5_4, i5_4; + REAL r5_20, i5_20; + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r5_4 = ((wr * tmpr) - (wi * tmpi)); + i5_4 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[20 * 
l1]); + wi = c_im(W[20 * l1]); + tmpr = c_re(jp[20 * m]); + tmpi = c_im(jp[20 * m]); + r5_20 = ((wr * tmpr) - (wi * tmpi)); + i5_20 = ((wi * tmpr) + (wr * tmpi)); + r4_4 = (r5_4 + r5_20); + i4_4 = (i5_4 + i5_20); + r4_20 = (r5_4 - r5_20); + i4_20 = (i5_4 - i5_20); + } + { + REAL r5_12, i5_12; + REAL r5_28, i5_28; + wr = c_re(W[12 * l1]); + wi = c_im(W[12 * l1]); + tmpr = c_re(jp[12 * m]); + tmpi = c_im(jp[12 * m]); + r5_12 = ((wr * tmpr) - (wi * tmpi)); + i5_12 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[28 * l1]); + wi = c_im(W[28 * l1]); + tmpr = c_re(jp[28 * m]); + tmpi = c_im(jp[28 * m]); + r5_28 = ((wr * tmpr) - (wi * tmpi)); + i5_28 = ((wi * tmpr) + (wr * tmpi)); + r4_12 = (r5_12 + r5_28); + i4_12 = (i5_12 + i5_28); + r4_28 = (r5_12 - r5_28); + i4_28 = (i5_12 - i5_28); + } + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_20 = (r4_4 - r4_12); + i3_20 = (i4_4 - i4_12); + r3_12 = (r4_20 + i4_28); + i3_12 = (i4_20 - r4_28); + r3_28 = (r4_20 - i4_28); + i3_28 = (i4_20 + r4_28); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_16 = (r3_0 - r3_4); + i2_16 = (i3_0 - i3_4); + tmpr = (0.707106781187 * (r3_12 + i3_12)); + tmpi = (0.707106781187 * (i3_12 - r3_12)); + r2_4 = (r3_8 + tmpr); + i2_4 = (i3_8 + tmpi); + r2_20 = (r3_8 - tmpr); + i2_20 = (i3_8 - tmpi); + r2_8 = (r3_16 + i3_20); + i2_8 = (i3_16 - r3_20); + r2_24 = (r3_16 - i3_20); + i2_24 = (i3_16 + r3_20); + tmpr = (0.707106781187 * (i3_28 - r3_28)); + tmpi = (0.707106781187 * (r3_28 + i3_28)); + r2_12 = (r3_24 + tmpr); + i2_12 = (i3_24 - tmpi); + r2_28 = (r3_24 - tmpr); + i2_28 = (i3_24 + tmpi); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + REAL r3_18, i3_18; + REAL r3_22, i3_22; + REAL r3_26, i3_26; + REAL r3_30, i3_30; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + REAL r4_18, i4_18; + REAL r4_26, i4_26; + { + REAL r5_2, i5_2; + REAL r5_18, i5_18; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r5_2 
= ((wr * tmpr) - (wi * tmpi)); + i5_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[18 * l1]); + wi = c_im(W[18 * l1]); + tmpr = c_re(jp[18 * m]); + tmpi = c_im(jp[18 * m]); + r5_18 = ((wr * tmpr) - (wi * tmpi)); + i5_18 = ((wi * tmpr) + (wr * tmpi)); + r4_2 = (r5_2 + r5_18); + i4_2 = (i5_2 + i5_18); + r4_18 = (r5_2 - r5_18); + i4_18 = (i5_2 - i5_18); + } + { + REAL r5_10, i5_10; + REAL r5_26, i5_26; + wr = c_re(W[10 * l1]); + wi = c_im(W[10 * l1]); + tmpr = c_re(jp[10 * m]); + tmpi = c_im(jp[10 * m]); + r5_10 = ((wr * tmpr) - (wi * tmpi)); + i5_10 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[26 * l1]); + wi = c_im(W[26 * l1]); + tmpr = c_re(jp[26 * m]); + tmpi = c_im(jp[26 * m]); + r5_26 = ((wr * tmpr) - (wi * tmpi)); + i5_26 = ((wi * tmpr) + (wr * tmpi)); + r4_10 = (r5_10 + r5_26); + i4_10 = (i5_10 + i5_26); + r4_26 = (r5_10 - r5_26); + i4_26 = (i5_10 - i5_26); + } + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_18 = (r4_2 - r4_10); + i3_18 = (i4_2 - i4_10); + r3_10 = (r4_18 + i4_26); + i3_10 = (i4_18 - r4_26); + r3_26 = (r4_18 - i4_26); + i3_26 = (i4_18 + r4_26); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + REAL r4_22, i4_22; + REAL r4_30, i4_30; + { + REAL r5_6, i5_6; + REAL r5_22, i5_22; + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r5_6 = ((wr * tmpr) - (wi * tmpi)); + i5_6 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[22 * l1]); + wi = c_im(W[22 * l1]); + tmpr = c_re(jp[22 * m]); + tmpi = c_im(jp[22 * m]); + r5_22 = ((wr * tmpr) - (wi * tmpi)); + i5_22 = ((wi * tmpr) + (wr * tmpi)); + r4_6 = (r5_6 + r5_22); + i4_6 = (i5_6 + i5_22); + r4_22 = (r5_6 - r5_22); + i4_22 = (i5_6 - i5_22); + } + { + REAL r5_14, i5_14; + REAL r5_30, i5_30; + wr = c_re(W[14 * l1]); + wi = c_im(W[14 * l1]); + tmpr = c_re(jp[14 * m]); + tmpi = c_im(jp[14 * m]); + r5_14 = ((wr * tmpr) - (wi * tmpi)); + i5_14 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[30 * l1]); + wi = c_im(W[30 * l1]); + tmpr = c_re(jp[30 * m]); + tmpi 
= c_im(jp[30 * m]); + r5_30 = ((wr * tmpr) - (wi * tmpi)); + i5_30 = ((wi * tmpr) + (wr * tmpi)); + r4_14 = (r5_14 + r5_30); + i4_14 = (i5_14 + i5_30); + r4_30 = (r5_14 - r5_30); + i4_30 = (i5_14 - i5_30); + } + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_22 = (r4_6 - r4_14); + i3_22 = (i4_6 - i4_14); + r3_14 = (r4_22 + i4_30); + i3_14 = (i4_22 - r4_30); + r3_30 = (r4_22 - i4_30); + i3_30 = (i4_22 + r4_30); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_18 = (r3_2 - r3_6); + i2_18 = (i3_2 - i3_6); + tmpr = (0.707106781187 * (r3_14 + i3_14)); + tmpi = (0.707106781187 * (i3_14 - r3_14)); + r2_6 = (r3_10 + tmpr); + i2_6 = (i3_10 + tmpi); + r2_22 = (r3_10 - tmpr); + i2_22 = (i3_10 - tmpi); + r2_10 = (r3_18 + i3_22); + i2_10 = (i3_18 - r3_22); + r2_26 = (r3_18 - i3_22); + i2_26 = (i3_18 + r3_22); + tmpr = (0.707106781187 * (i3_30 - r3_30)); + tmpi = (0.707106781187 * (r3_30 + i3_30)); + r2_14 = (r3_26 + tmpr); + i2_14 = (i3_26 - tmpi); + r2_30 = (r3_26 - tmpr); + i2_30 = (i3_26 + tmpi); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_16 = (r2_0 - r2_2); + i1_16 = (i2_0 - i2_2); + tmpr = ((0.923879532511 * r2_6) + (0.382683432365 * i2_6)); + tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_18 = (r2_4 - tmpr); + i1_18 = (i2_4 - tmpi); + tmpr = (0.707106781187 * (r2_10 + i2_10)); + tmpi = (0.707106781187 * (i2_10 - r2_10)); + r1_4 = (r2_8 + tmpr); + i1_4 = (i2_8 + tmpi); + r1_20 = (r2_8 - tmpr); + i1_20 = (i2_8 - tmpi); + tmpr = ((0.382683432365 * r2_14) + (0.923879532511 * i2_14)); + tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 + tmpi); + r1_22 = (r2_12 - tmpr); + i1_22 = (i2_12 - tmpi); + r1_8 = (r2_16 + i2_18); + i1_8 = (i2_16 - r2_18); + r1_24 = (r2_16 - i2_18); + i1_24 = (i2_16 + r2_18); + tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22)); + tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22)); + r1_10 
= (r2_20 + tmpr); + i1_10 = (i2_20 - tmpi); + r1_26 = (r2_20 - tmpr); + i1_26 = (i2_20 + tmpi); + tmpr = (0.707106781187 * (i2_26 - r2_26)); + tmpi = (0.707106781187 * (r2_26 + i2_26)); + r1_12 = (r2_24 + tmpr); + i1_12 = (i2_24 - tmpi); + r1_28 = (r2_24 - tmpr); + i1_28 = (i2_24 + tmpi); + tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30)); + tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30)); + r1_14 = (r2_28 + tmpr); + i1_14 = (i2_28 - tmpi); + r1_30 = (r2_28 - tmpr); + i1_30 = (i2_28 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + REAL r2_17, i2_17; + REAL r2_19, i2_19; + REAL r2_21, i2_21; + REAL r2_23, i2_23; + REAL r2_25, i2_25; + REAL r2_27, i2_27; + REAL r2_29, i2_29; + REAL r2_31, i2_31; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + REAL r3_17, i3_17; + REAL r3_21, i3_21; + REAL r3_25, i3_25; + REAL r3_29, i3_29; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + REAL r4_17, i4_17; + REAL r4_25, i4_25; + { + REAL r5_1, i5_1; + REAL r5_17, i5_17; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r5_1 = ((wr * tmpr) - (wi * tmpi)); + i5_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[17 * l1]); + wi = c_im(W[17 * l1]); + tmpr = c_re(jp[17 * m]); + tmpi = c_im(jp[17 * m]); + r5_17 = ((wr * tmpr) - (wi * tmpi)); + i5_17 = ((wi * tmpr) + (wr * tmpi)); + r4_1 = (r5_1 + r5_17); + i4_1 = (i5_1 + i5_17); + r4_17 = (r5_1 - r5_17); + i4_17 = (i5_1 - i5_17); + } + { + REAL r5_9, i5_9; + REAL r5_25, i5_25; + wr = c_re(W[9 * l1]); + wi = c_im(W[9 * l1]); + tmpr = c_re(jp[9 * m]); + tmpi = c_im(jp[9 * m]); + r5_9 = ((wr * tmpr) - (wi * tmpi)); + i5_9 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[25 * l1]); + wi = c_im(W[25 * l1]); + tmpr = c_re(jp[25 * m]); + tmpi = c_im(jp[25 * m]); + r5_25 = ((wr * tmpr) - (wi * tmpi)); + i5_25 = ((wi * tmpr) 
+ (wr * tmpi)); + r4_9 = (r5_9 + r5_25); + i4_9 = (i5_9 + i5_25); + r4_25 = (r5_9 - r5_25); + i4_25 = (i5_9 - i5_25); + } + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_17 = (r4_1 - r4_9); + i3_17 = (i4_1 - i4_9); + r3_9 = (r4_17 + i4_25); + i3_9 = (i4_17 - r4_25); + r3_25 = (r4_17 - i4_25); + i3_25 = (i4_17 + r4_25); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + REAL r4_21, i4_21; + REAL r4_29, i4_29; + { + REAL r5_5, i5_5; + REAL r5_21, i5_21; + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r5_5 = ((wr * tmpr) - (wi * tmpi)); + i5_5 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[21 * l1]); + wi = c_im(W[21 * l1]); + tmpr = c_re(jp[21 * m]); + tmpi = c_im(jp[21 * m]); + r5_21 = ((wr * tmpr) - (wi * tmpi)); + i5_21 = ((wi * tmpr) + (wr * tmpi)); + r4_5 = (r5_5 + r5_21); + i4_5 = (i5_5 + i5_21); + r4_21 = (r5_5 - r5_21); + i4_21 = (i5_5 - i5_21); + } + { + REAL r5_13, i5_13; + REAL r5_29, i5_29; + wr = c_re(W[13 * l1]); + wi = c_im(W[13 * l1]); + tmpr = c_re(jp[13 * m]); + tmpi = c_im(jp[13 * m]); + r5_13 = ((wr * tmpr) - (wi * tmpi)); + i5_13 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[29 * l1]); + wi = c_im(W[29 * l1]); + tmpr = c_re(jp[29 * m]); + tmpi = c_im(jp[29 * m]); + r5_29 = ((wr * tmpr) - (wi * tmpi)); + i5_29 = ((wi * tmpr) + (wr * tmpi)); + r4_13 = (r5_13 + r5_29); + i4_13 = (i5_13 + i5_29); + r4_29 = (r5_13 - r5_29); + i4_29 = (i5_13 - i5_29); + } + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_21 = (r4_5 - r4_13); + i3_21 = (i4_5 - i4_13); + r3_13 = (r4_21 + i4_29); + i3_13 = (i4_21 - r4_29); + r3_29 = (r4_21 - i4_29); + i3_29 = (i4_21 + r4_29); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_17 = (r3_1 - r3_5); + i2_17 = (i3_1 - i3_5); + tmpr = (0.707106781187 * (r3_13 + i3_13)); + tmpi = (0.707106781187 * (i3_13 - r3_13)); + r2_5 = (r3_9 + tmpr); + i2_5 = (i3_9 + tmpi); + r2_21 = (r3_9 - tmpr); + i2_21 = (i3_9 - tmpi); + r2_9 = (r3_17 + i3_21); + i2_9 = (i3_17 - r3_21); + 
r2_25 = (r3_17 - i3_21); + i2_25 = (i3_17 + r3_21); + tmpr = (0.707106781187 * (i3_29 - r3_29)); + tmpi = (0.707106781187 * (r3_29 + i3_29)); + r2_13 = (r3_25 + tmpr); + i2_13 = (i3_25 - tmpi); + r2_29 = (r3_25 - tmpr); + i2_29 = (i3_25 + tmpi); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + REAL r3_19, i3_19; + REAL r3_23, i3_23; + REAL r3_27, i3_27; + REAL r3_31, i3_31; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + REAL r4_19, i4_19; + REAL r4_27, i4_27; + { + REAL r5_3, i5_3; + REAL r5_19, i5_19; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r5_3 = ((wr * tmpr) - (wi * tmpi)); + i5_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[19 * l1]); + wi = c_im(W[19 * l1]); + tmpr = c_re(jp[19 * m]); + tmpi = c_im(jp[19 * m]); + r5_19 = ((wr * tmpr) - (wi * tmpi)); + i5_19 = ((wi * tmpr) + (wr * tmpi)); + r4_3 = (r5_3 + r5_19); + i4_3 = (i5_3 + i5_19); + r4_19 = (r5_3 - r5_19); + i4_19 = (i5_3 - i5_19); + } + { + REAL r5_11, i5_11; + REAL r5_27, i5_27; + wr = c_re(W[11 * l1]); + wi = c_im(W[11 * l1]); + tmpr = c_re(jp[11 * m]); + tmpi = c_im(jp[11 * m]); + r5_11 = ((wr * tmpr) - (wi * tmpi)); + i5_11 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[27 * l1]); + wi = c_im(W[27 * l1]); + tmpr = c_re(jp[27 * m]); + tmpi = c_im(jp[27 * m]); + r5_27 = ((wr * tmpr) - (wi * tmpi)); + i5_27 = ((wi * tmpr) + (wr * tmpi)); + r4_11 = (r5_11 + r5_27); + i4_11 = (i5_11 + i5_27); + r4_27 = (r5_11 - r5_27); + i4_27 = (i5_11 - i5_27); + } + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_19 = (r4_3 - r4_11); + i3_19 = (i4_3 - i4_11); + r3_11 = (r4_19 + i4_27); + i3_11 = (i4_19 - r4_27); + r3_27 = (r4_19 - i4_27); + i3_27 = (i4_19 + r4_27); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + REAL r4_23, i4_23; + REAL r4_31, i4_31; + { + REAL r5_7, i5_7; + REAL r5_23, i5_23; + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r5_7 = ((wr * tmpr) 
- (wi * tmpi)); + i5_7 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[23 * l1]); + wi = c_im(W[23 * l1]); + tmpr = c_re(jp[23 * m]); + tmpi = c_im(jp[23 * m]); + r5_23 = ((wr * tmpr) - (wi * tmpi)); + i5_23 = ((wi * tmpr) + (wr * tmpi)); + r4_7 = (r5_7 + r5_23); + i4_7 = (i5_7 + i5_23); + r4_23 = (r5_7 - r5_23); + i4_23 = (i5_7 - i5_23); + } + { + REAL r5_15, i5_15; + REAL r5_31, i5_31; + wr = c_re(W[15 * l1]); + wi = c_im(W[15 * l1]); + tmpr = c_re(jp[15 * m]); + tmpi = c_im(jp[15 * m]); + r5_15 = ((wr * tmpr) - (wi * tmpi)); + i5_15 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[31 * l1]); + wi = c_im(W[31 * l1]); + tmpr = c_re(jp[31 * m]); + tmpi = c_im(jp[31 * m]); + r5_31 = ((wr * tmpr) - (wi * tmpi)); + i5_31 = ((wi * tmpr) + (wr * tmpi)); + r4_15 = (r5_15 + r5_31); + i4_15 = (i5_15 + i5_31); + r4_31 = (r5_15 - r5_31); + i4_31 = (i5_15 - i5_31); + } + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_23 = (r4_7 - r4_15); + i3_23 = (i4_7 - i4_15); + r3_15 = (r4_23 + i4_31); + i3_15 = (i4_23 - r4_31); + r3_31 = (r4_23 - i4_31); + i3_31 = (i4_23 + r4_31); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_19 = (r3_3 - r3_7); + i2_19 = (i3_3 - i3_7); + tmpr = (0.707106781187 * (r3_15 + i3_15)); + tmpi = (0.707106781187 * (i3_15 - r3_15)); + r2_7 = (r3_11 + tmpr); + i2_7 = (i3_11 + tmpi); + r2_23 = (r3_11 - tmpr); + i2_23 = (i3_11 - tmpi); + r2_11 = (r3_19 + i3_23); + i2_11 = (i3_19 - r3_23); + r2_27 = (r3_19 - i3_23); + i2_27 = (i3_19 + r3_23); + tmpr = (0.707106781187 * (i3_31 - r3_31)); + tmpi = (0.707106781187 * (r3_31 + i3_31)); + r2_15 = (r3_27 + tmpr); + i2_15 = (i3_27 - tmpi); + r2_31 = (r3_27 - tmpr); + i2_31 = (i3_27 + tmpi); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_17 = (r2_1 - r2_3); + i1_17 = (i2_1 - i2_3); + tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7)); + tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_19 = (r2_5 - tmpr); + i1_19 = (i2_5 - tmpi); + 
tmpr = (0.707106781187 * (r2_11 + i2_11)); + tmpi = (0.707106781187 * (i2_11 - r2_11)); + r1_5 = (r2_9 + tmpr); + i1_5 = (i2_9 + tmpi); + r1_21 = (r2_9 - tmpr); + i1_21 = (i2_9 - tmpi); + tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15)); + tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 + tmpi); + r1_23 = (r2_13 - tmpr); + i1_23 = (i2_13 - tmpi); + r1_9 = (r2_17 + i2_19); + i1_9 = (i2_17 - r2_19); + r1_25 = (r2_17 - i2_19); + i1_25 = (i2_17 + r2_19); + tmpr = ((0.923879532511 * i2_23) - (0.382683432365 * r2_23)); + tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23)); + r1_11 = (r2_21 + tmpr); + i1_11 = (i2_21 - tmpi); + r1_27 = (r2_21 - tmpr); + i1_27 = (i2_21 + tmpi); + tmpr = (0.707106781187 * (i2_27 - r2_27)); + tmpi = (0.707106781187 * (r2_27 + i2_27)); + r1_13 = (r2_25 + tmpr); + i1_13 = (i2_25 - tmpi); + r1_29 = (r2_25 - tmpr); + i1_29 = (i2_25 + tmpi); + tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31)); + tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31)); + r1_15 = (r2_29 + tmpr); + i1_15 = (i2_29 - tmpi); + r1_31 = (r2_29 - tmpr); + i1_31 = (i2_29 + tmpi); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[16 * m]) = (r1_0 - r1_1); + c_im(kp[16 * m]) = (i1_0 - i1_1); + tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3)); + tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[17 * m]) = (r1_2 - tmpr); + c_im(kp[17 * m]) = (i1_2 - tmpi); + tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5)); + tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5)); + c_re(kp[2 * m]) = (r1_4 + tmpr); + c_im(kp[2 * m]) = (i1_4 + tmpi); + c_re(kp[18 * m]) = (r1_4 - tmpr); + c_im(kp[18 * m]) = (i1_4 - tmpi); + tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7)); + tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7)); + c_re(kp[3 * m]) = 
(r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 + tmpi); + c_re(kp[19 * m]) = (r1_6 - tmpr); + c_im(kp[19 * m]) = (i1_6 - tmpi); + tmpr = (0.707106781187 * (r1_9 + i1_9)); + tmpi = (0.707106781187 * (i1_9 - r1_9)); + c_re(kp[4 * m]) = (r1_8 + tmpr); + c_im(kp[4 * m]) = (i1_8 + tmpi); + c_re(kp[20 * m]) = (r1_8 - tmpr); + c_im(kp[20 * m]) = (i1_8 - tmpi); + tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11)); + tmpi = ((0.55557023302 * i1_11) - (0.831469612303 * r1_11)); + c_re(kp[5 * m]) = (r1_10 + tmpr); + c_im(kp[5 * m]) = (i1_10 + tmpi); + c_re(kp[21 * m]) = (r1_10 - tmpr); + c_im(kp[21 * m]) = (i1_10 - tmpi); + tmpr = ((0.382683432365 * r1_13) + (0.923879532511 * i1_13)); + tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13)); + c_re(kp[6 * m]) = (r1_12 + tmpr); + c_im(kp[6 * m]) = (i1_12 + tmpi); + c_re(kp[22 * m]) = (r1_12 - tmpr); + c_im(kp[22 * m]) = (i1_12 - tmpi); + tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15)); + tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15)); + c_re(kp[7 * m]) = (r1_14 + tmpr); + c_im(kp[7 * m]) = (i1_14 + tmpi); + c_re(kp[23 * m]) = (r1_14 - tmpr); + c_im(kp[23 * m]) = (i1_14 - tmpi); + c_re(kp[8 * m]) = (r1_16 + i1_17); + c_im(kp[8 * m]) = (i1_16 - r1_17); + c_re(kp[24 * m]) = (r1_16 - i1_17); + c_im(kp[24 * m]) = (i1_16 + r1_17); + tmpr = ((0.980785280403 * i1_19) - (0.195090322016 * r1_19)); + tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19)); + c_re(kp[9 * m]) = (r1_18 + tmpr); + c_im(kp[9 * m]) = (i1_18 - tmpi); + c_re(kp[25 * m]) = (r1_18 - tmpr); + c_im(kp[25 * m]) = (i1_18 + tmpi); + tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21)); + tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21)); + c_re(kp[10 * m]) = (r1_20 + tmpr); + c_im(kp[10 * m]) = (i1_20 - tmpi); + c_re(kp[26 * m]) = (r1_20 - tmpr); + c_im(kp[26 * m]) = (i1_20 + tmpi); + tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23)); + tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23)); + 
c_re(kp[11 * m]) = (r1_22 + tmpr); + c_im(kp[11 * m]) = (i1_22 - tmpi); + c_re(kp[27 * m]) = (r1_22 - tmpr); + c_im(kp[27 * m]) = (i1_22 + tmpi); + tmpr = (0.707106781187 * (i1_25 - r1_25)); + tmpi = (0.707106781187 * (r1_25 + i1_25)); + c_re(kp[12 * m]) = (r1_24 + tmpr); + c_im(kp[12 * m]) = (i1_24 - tmpi); + c_re(kp[28 * m]) = (r1_24 - tmpr); + c_im(kp[28 * m]) = (i1_24 + tmpi); + tmpr = ((0.55557023302 * i1_27) - (0.831469612303 * r1_27)); + tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27)); + c_re(kp[13 * m]) = (r1_26 + tmpr); + c_im(kp[13 * m]) = (i1_26 - tmpi); + c_re(kp[29 * m]) = (r1_26 - tmpr); + c_im(kp[29 * m]) = (i1_26 + tmpi); + tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29)); + tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29)); + c_re(kp[14 * m]) = (r1_28 + tmpr); + c_im(kp[14 * m]) = (i1_28 - tmpi); + c_re(kp[30 * m]) = (r1_28 - tmpr); + c_im(kp[30 * m]) = (i1_28 + tmpi); + tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31)); + tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31)); + c_re(kp[15 * m]) = (r1_30 + tmpr); + c_im(kp[15 * m]) = (i1_30 - tmpi); + c_re(kp[31 * m]) = (r1_30 - tmpr); + c_im(kp[31 * m]) = (i1_30 + tmpi); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_32_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_32_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_32(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 32; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + 
jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_unshuffle_32(a, ab, in, out, m); + #pragma omp task + fft_unshuffle_32(ab, b, in, out, m); +#else + #pragma omp task untied + fft_unshuffle_32(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_32(ab, b, in, out, m); +#endif + #pragma omp taskwait + } +} +void fft_unshuffle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 32; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + 
jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_32_seq(a, ab, in, out, m); + fft_unshuffle_32_seq(ab, b, in, out, m); + } +} +/* end of machine-generated code */ + +/* + * Recursive complex FFT on the n complex components of the array in: + * basic Cooley-Tukey algorithm, with some improvements for + * n power of two. The result is placed in the array out. n is arbitrary. + * The algorithm runs in time O(n*(r1 + ... + rk)) where r1, ..., rk + * are prime numbers, and r1 * r2 * ... * rk = n. + * + * n: size of the input + * in: pointer to input + * out: pointer to output + * factors: list of factors of n, precomputed + * W: twiddle factors + * nW: size of W, that is, size of the original transform + * + */ +void fft_aux(int n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW) +{ + int r, m; + int k; + + /* special cases */ + if (n == 32) { + fft_base_32(in, out); + return; + } + if (n == 16) { + fft_base_16(in, out); + return; + } + if (n == 8) { + fft_base_8(in, out); + return; + } + if (n == 4) { + fft_base_4(in, out); + return; + } + if (n == 2) { + fft_base_2(in, out); + return; + } + /* + * the cases n == 3, n == 5, and maybe 7 should be implemented as well + */ + + r = *factors; + m = n / r; + + if (r < n) { + /* + * split the DFT of length n into r DFTs of length n/r, and + * recurse + */ +#if defined(FORCE_TIED_TASKS) + if (r == 32) { + #pragma omp task + fft_unshuffle_32(0, m, in, out, m); + } else if (r == 16) { + #pragma omp task + fft_unshuffle_16(0, m, in, out, m); + } else if (r == 8) { + #pragma omp task + fft_unshuffle_8(0, m, in, out, m); + } else if (r == 4) { + #pragma omp task + fft_unshuffle_4(0, m, in, out, m); + } else if (r == 2) { + #pragma omp task + fft_unshuffle_2(0, m, in, out, m); + } else + unshuffle(0, m, in, out, r, m); +#else + if (r == 32) { + #pragma omp task untied + fft_unshuffle_32(0, m, in, out, m); + } else if (r == 16) { + #pragma omp task untied + 
fft_unshuffle_16(0, m, in, out, m); + } else if (r == 8) { + #pragma omp task untied + fft_unshuffle_8(0, m, in, out, m); + } else if (r == 4) { + #pragma omp task untied + fft_unshuffle_4(0, m, in, out, m); + } else if (r == 2) { + #pragma omp task untied + fft_unshuffle_2(0, m, in, out, m); + } else + unshuffle(0, m, in, out, r, m); + +#endif + #pragma omp taskwait + + for (k = 0; k < n; k += m) { +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_aux(m, out + k, in + k, factors + 1, W, nW); +#else + #pragma omp task untied + fft_aux(m, out + k, in + k, factors + 1, W, nW); +#endif + } + #pragma omp taskwait + } + /* + * now multiply by the twiddle factors, and perform m FFTs + * of length r + */ +#if defined(FORCE_TIED_TASKS) + if (r == 2) { + #pragma omp task untied + fft_twiddle_2(0, m, in, out, W, nW, nW / n, m); + } else if (r == 4) { + #pragma omp task untied + fft_twiddle_4(0, m, in, out, W, nW, nW / n, m); + } else if (r == 8) { + #pragma omp task untied + fft_twiddle_8(0, m, in, out, W, nW, nW / n, m); + } else if (r == 16) { + #pragma omp task untied + fft_twiddle_16(0, m, in, out, W, nW, nW / n, m); + } else if (r == 32) { + #pragma omp task untied + fft_twiddle_32(0, m, in, out, W, nW, nW / n, m); + } else { + #pragma omp task untied + fft_twiddle_gen(0, m, in, out, W, nW, nW / n, r, m); + } +#else + if (r == 2) { + #pragma omp task untied + fft_twiddle_2(0, m, in, out, W, nW, nW / n, m); + } else if (r == 4) { + #pragma omp task untied + fft_twiddle_4(0, m, in, out, W, nW, nW / n, m); + } else if (r == 8) { + #pragma omp task untied + fft_twiddle_8(0, m, in, out, W, nW, nW / n, m); + } else if (r == 16) { + #pragma omp task untied + fft_twiddle_16(0, m, in, out, W, nW, nW / n, m); + } else if (r == 32) { + #pragma omp task untied + fft_twiddle_32(0, m, in, out, W, nW, nW / n, m); + } else { + #pragma omp task untied + fft_twiddle_gen(0, m, in, out, W, nW, nW / n, r, m); + } +#endif + + #pragma omp taskwait + + return; +} + +void fft_aux_seq(int 
n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW) +{ + int r, m; + int k; + + /* special cases */ + if (n == 32) { + fft_base_32(in, out); + return; + } + if (n == 16) { + fft_base_16(in, out); + return; + } + if (n == 8) { + fft_base_8(in, out); + return; + } + if (n == 4) { + fft_base_4(in, out); + return; + } + if (n == 2) { + fft_base_2(in, out); + return; + } + /* + * the cases n == 3, n == 5, and maybe 7 should be implemented as well + */ + + r = *factors; + m = n / r; + + if (r < n) { + /* + * split the DFT of length n into r DFTs of length n/r, and + * recurse + */ + if (r == 32) fft_unshuffle_32_seq(0, m, in, out, m); + else if (r == 16) fft_unshuffle_16_seq(0, m, in, out, m); + else if (r == 8) fft_unshuffle_8_seq(0, m, in, out, m); + else if (r == 4) fft_unshuffle_4_seq(0, m, in, out, m); + else if (r == 2) fft_unshuffle_2_seq(0, m, in, out, m); + else unshuffle_seq(0, m, in, out, r, m); + + for (k = 0; k < n; k += m) { + fft_aux_seq(m, out + k, in + k, factors + 1, W, nW); + } + } + /* + * now multiply by the twiddle factors, and perform m FFTs + * of length r + */ + if (r == 2) fft_twiddle_2_seq(0, m, in, out, W, nW, nW / n, m); + else if (r == 4) fft_twiddle_4_seq(0, m, in, out, W, nW, nW / n, m); + else if (r == 8) fft_twiddle_8_seq(0, m, in, out, W, nW, nW / n, m); + else if (r == 16) fft_twiddle_16_seq(0, m, in, out, W, nW, nW / n, m); + else if (r == 32) fft_twiddle_32_seq(0, m, in, out, W, nW, nW / n, m); + else fft_twiddle_gen_seq(0, m, in, out, W, nW, nW / n, r, m); + + return; +} +/* + * user interface for fft_aux + */ +void fft(int n, COMPLEX * in, COMPLEX * out) +{ + int factors[40]; /* allows FFTs up to at least 3^40 */ + int *p = factors; + int l = n; + int r; + COMPLEX *W; + + bots_message("Computing coefficients "); + W = (COMPLEX *) malloc((n + 1) * sizeof(COMPLEX)); + #pragma omp parallel + #pragma omp single +#if defined(FORCE_TIED_TASKS) + #pragma omp task +#else + #pragma omp task untied +#endif + 
compute_w_coefficients(n, 0, n / 2, W); + bots_message(" completed!\n"); + + /* + * find factors of n, first 8, then 4 and then primes in ascending + * order + */ + do { + r = factor(l); + *p++ = r; + l /= r; + } while (l > 1); + + bots_message("Computing FFT "); + #pragma omp parallel + #pragma omp single +#if defined(FORCE_TIED_TASKS) + #pragma omp task +#else + #pragma omp task untied +#endif + fft_aux(n, in, out, factors, W, n); + bots_message(" completed!\n"); + + free(W); + return; +} +void fft_seq(int n, COMPLEX * in, COMPLEX * out) +{ + int factors[40]; /* allows FFTs up to at least 3^40 */ + int *p = factors; + int l = n; + int r; + COMPLEX *W; + + W = (COMPLEX *) malloc((n + 1) * sizeof(COMPLEX)); + compute_w_coefficients_seq(n, 0, n / 2, W); + + /* + * find factors of n, first 8, then 4 and then primes in ascending + * order + */ + do { + r = factor(l); + *p++ = r; + l /= r; + } while (l > 1); + + fft_aux_seq(n, in, out, factors, W, n); + + free(W); + return; +} +int test_correctness(int n, COMPLEX *out1, COMPLEX *out2) +{ + int i; + double a,d,error = 0.0; + + for (i = 0; i < n; ++i) { + a = sqrt((c_re(out1[i]) - c_re(out2[i])) * + (c_re(out1[i]) - c_re(out2[i])) + + (c_im(out1[i]) - c_im(out2[i])) * + (c_im(out1[i]) - c_im(out2[i]))); + d = sqrt(c_re(out2[i]) * c_re(out2[i]) + + c_im(out2[i]) * c_im(out2[i])); + if (d < -1.0e-10 || d > 1.0e-10) a /= d; + if (a > error) error = a; + } + bots_message("relative error=%e\n", error); + if (error > 1e-3) return BOTS_RESULT_UNSUCCESSFUL; + else return BOTS_RESULT_SUCCESSFUL; +} + diff --git a/src/components/implementation/no_interface/omp_fft_bots/fft.h b/src/components/implementation/no_interface/omp_fft_bots/fft.h new file mode 100644 index 0000000000..ebafa9fb4d --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/fft.h @@ -0,0 +1,55 @@ +#ifndef FFT_H +#define FFT_H + +/* our real numbers */ +typedef double REAL; + +/* Complex numbers and operations */ +typedef struct { + REAL re, 
im; +} COMPLEX; + +#define c_re(c) ((c).re) +#define c_im(c) ((c).im) + +void compute_w_coefficients(int n, int a, int b, COMPLEX * W); +void compute_w_coefficients_seq(int n, int a, int b, COMPLEX * W); +int factor(int n); +void unshuffle(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m); +void unshuffle_seq(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m); +void fft_twiddle_gen1(COMPLEX * in, COMPLEX * out, COMPLEX * W, int r, int m, int nW, int nWdnti, int nWdntm); +void fft_twiddle_gen(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int r, int m); +void fft_twiddle_gen_seq(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int r, int m); +void fft_base_2(COMPLEX * in, COMPLEX * out); +void fft_twiddle_2(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_2(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_base_4(COMPLEX * in, COMPLEX * out); +void fft_twiddle_4(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_4(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_base_8(COMPLEX * in, COMPLEX * out); +void fft_twiddle_8(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_8(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_base_16(COMPLEX * in, COMPLEX * out); +void 
fft_twiddle_16(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_16(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_base_32(COMPLEX * in, COMPLEX * out); +void fft_twiddle_32(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_32(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_aux(int n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW); +void fft_aux_seq(int n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW); +void fft(int n, COMPLEX * in, COMPLEX * out); +void fft_seq(int n, COMPLEX * in, COMPLEX * out); +int test_correctness(int n, COMPLEX *out1, COMPLEX *out2); + +#endif + diff --git a/src/components/implementation/no_interface/omp_fft_bots/init.c b/src/components/implementation/no_interface/omp_fft_bots/init.c new file mode 120000 index 0000000000..9e09b82e77 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/init.c @@ -0,0 +1 @@ +../omp_fib_bots/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/omp-tasks-app.h b/src/components/implementation/no_interface/omp_fft_bots/omp-tasks-app.h new file mode 120000 index 0000000000..9fba574408 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/omp-tasks-app.h @@ -0,0 +1 @@ +../omp_fib_bots/omp-tasks-app.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/posix_basic.c b/src/components/implementation/no_interface/omp_fft_bots/posix_basic.c new file mode 
120000 index 0000000000..9afee078fb --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/posix_basic.c @@ -0,0 +1 @@ +../omp_fib_bots/posix_basic.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fib_bots/Makefile b/src/components/implementation/no_interface/omp_fib_bots/Makefile new file mode 100644 index 0000000000..bee96fd0aa --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/Makefile @@ -0,0 +1,19 @@ +COMPONENT=omp_fib_bots.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp +# if tied tasks are required +CFLAGS += -DFORCE_TIED_TASKS + +OMPC_FINAL_FLAGS= + +# one per compilation or none +#CFLAGS += -DMANUAL_CUTOFF +#CFLAGS += -DIF_CUTOFF +#CFLAGS += -DFINAL_CUTOFF $(OMPC_FINAL_FLAGS) diff --git a/src/components/implementation/no_interface/omp_fib_bots/app-desc.h b/src/components/implementation/no_interface/omp_fib_bots/app-desc.h new file mode 100644 index 0000000000..e8af171324 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/app-desc.h @@ -0,0 +1,47 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. 
*/ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "omp-tasks-app.h" + +#define BOTS_APP_NAME "Fibonacci" +#define BOTS_APP_PARAMETERS_DESC "N=%d" +#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size + +#define BOTS_APP_USES_ARG_SIZE +#define BOTS_APP_DEF_ARG_SIZE 20 +#define BOTS_APP_DESC_ARG_SIZE "Number to compute" + +int fib_verify(int); +void fib0 (int); +void fib0_seq (int); + +//#define KERNEL_INIT +#define KERNEL_CALL fib0(bots_arg_size) +//#define KERNEL_FINI + +//#define KERNEL_SEQ_INIT +#define KERNEL_SEQ_CALL fib0_seq(bots_arg_size) +//#define KERNEL_SEQ_FINI + + +#define KERNEL_CHECK fib_verify(bots_arg_size) + +#define BOTS_CUTOFF_DEF_VALUE 10 + diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots.h b/src/components/implementation/no_interface/omp_fib_bots/bots.h new file mode 100644 index 0000000000..fee71a7eb2 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/bots.h @@ -0,0 +1,113 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ 
+/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#ifndef _BOTS_H_ + +#include +#include +#include + +/* common flags */ +extern int bots_sequential_flag; +extern int bots_benchmark_flag; +extern int bots_check_flag; +extern int bots_result; +extern int bots_output_format; +extern int bots_print_header; +/* common variables */ +extern char bots_name[]; +extern char bots_parameters[]; +extern char bots_model[]; +extern char bots_resources[]; +/* compile and execution information */ +extern char bots_exec_date[]; +extern char bots_exec_message[]; +extern char bots_comp_date[]; +extern char bots_comp_message[]; +extern char bots_cc[]; +extern char bots_cflags[]; +extern char bots_ld[]; +extern char bots_ldflags[]; +/* time variables */ +extern double bots_time_program; +extern double bots_time_sequential; + +/* number of tasks variable */ +extern unsigned long long bots_number_of_tasks; /* forcing 8 bytes size on -m32 and -m64 */ + +extern char bots_cutoff[]; +extern int bots_cutoff_value; + +extern int bots_app_cutoff_value; +extern int bots_app_cutoff_value_1; +extern int bots_app_cutoff_value_2; + +extern int bots_arg_size; +extern int bots_arg_size_1; +extern int bots_arg_size_2; + +/* function could be used in app. 
code but are implemented in bots_common.c */ +long bots_usecs(); +void bots_error(int error, char *message); +void bots_warning(int warning, char *message); + +#define BOTS_RESULT_NA 0 +#define BOTS_RESULT_SUCCESSFUL 1 +#define BOTS_RESULT_UNSUCCESSFUL 2 +#define BOTS_RESULT_NOT_REQUESTED 3 + + +typedef enum { BOTS_VERBOSE_NONE=0, + BOTS_VERBOSE_DEFAULT, + BOTS_VERBOSE_DEBUG } bots_verbose_mode_t; + +extern bots_verbose_mode_t bots_verbose_mode; + +#define bots_message(msg, ...) \ + {\ + if ( bots_verbose_mode >= BOTS_VERBOSE_DEFAULT ) {\ + printc(msg , ##__VA_ARGS__);\ + }\ + } + +#ifdef BOTS_DEBUG +#define bots_debug(msg, ...) \ + {\ + if ( bots_verbose_mode >= BOTS_VERBOSE_DEBUG ) {\ + PRINTC(msg , ##__VA_ARGS__);\ + }\ + } +#define bots_debug_with_location_info(msg, ...) \ + {\ + if ( bots_verbose_mode >= BOTS_VERBOSE_DEBUG ) {\ + PRINTC("%s:%d:%s:" msg ,__FILE__, __LINE__,__func__,##__VA_ARGS__);\ + }\ + } +#else +#define bots_debug(msg, ...) +#define bots_debug_with_location_info(msg, ...) +#endif + +#define FALSE 0 +#define TRUE 1 + +#endif + + diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots_common.c b/src/components/implementation/no_interface/omp_fib_bots/bots_common.c new file mode 100644 index 0000000000..49af8a438e --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/bots_common.c @@ -0,0 +1,342 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. 
*/ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bots_common.h" +#include "bots_main.h" +#include "bots.h" +#include + +void +bots_error(int error, char *message) +{ + if (message == NULL) + { + switch(error) + { + case BOTS_ERROR: + PRINTC("Error (%d): %s\n",error,"Unspecified error."); + break; + case BOTS_ERROR_NOT_ENOUGH_MEMORY: + PRINTC("Error (%d): %s\n",error,"Not enough memory."); + break; + case BOTS_ERROR_UNRECOGNIZED_PARAMETER: + PRINTC("Error (%d): %s\n",error,"Unrecognized parameter."); + bots_print_usage(); + break; + default: + PRINTC("Error (%d): %s\n",error,"Invalid error code."); + break; + } + } + else PRINTC("Error (%d): %s\n",error,message); + cos_exit(100+error); +} + +void +bots_warning(int warning, char *message) +{ + if (message == NULL) + { + switch(warning) + { + case BOTS_WARNING: + PRINTC("Warning (%d): %s\n",warning,"Unspecified warning."); + break; + default: + PRINTC("Warning (%d): %s\n",warning,"Invalid warning code."); + break; + } + } + else PRINTC("Warning (%d): %s\n",warning,message); +} + +long bots_usecs (void) +{ + //struct timeval t; + //gettimeofday(&t,NULL); + //return t.tv_sec*1000000+t.tv_usec; + return (long)time_now_usec(); +} + +void +bots_get_date(char *str) +{ + time_t now; + time(&now); + //strftime(str, 32, "%Y/%m/%d;%H:%M", gmtime(&now)); + strncpy(str, "01/01/0001", 32); +} + 
+void bots_get_architecture(char *str) +{ + snprintf(str, BOTS_TMP_STR_SZ, "Composite-i386;%d", NUM_CPU); +// int ncpus = sysconf(_SC_NPROCESSORS_CONF); +// struct utsname architecture; +// +// uname(&architecture); +// snprintf(str, BOTS_TMP_STR_SZ, "%s-%s;%d" ,architecture.sysname, architecture.machine, ncpus); +} + +#undef __linux +#if defined (__linux) +/* ****************************************************************** */ +void bots_get_load_average(char *str) +{ + double loadavg[3]; + getloadavg (loadavg, 3); + snprintf(str, BOTS_TMP_STR_SZ, "%.2f;%.2f;%.2f",loadavg[0],loadavg[1],loadavg[2]); +} +#else +/* ****************************************************************** */ +void bots_get_load_average(char *str) { sprintf(str,";;"); } +#endif + +void bots_print_results() +{ + char str_name[BOTS_TMP_STR_SZ]; + char str_parameters[BOTS_TMP_STR_SZ]; + char str_model[BOTS_TMP_STR_SZ]; + char str_resources[BOTS_TMP_STR_SZ]; + char str_result[15]; + char str_time_program[15]; + char str_time_sequential[15]; + char str_speed_up[15]; + char str_number_of_tasks[15]; + char str_number_of_tasks_per_second[15]; + char str_exec_date[BOTS_TMP_STR_SZ]; + char str_exec_message[BOTS_TMP_STR_SZ]; + char str_architecture[BOTS_TMP_STR_SZ]; + char str_load_avg[BOTS_TMP_STR_SZ]; + char str_comp_date[BOTS_TMP_STR_SZ]; + char str_comp_message[BOTS_TMP_STR_SZ]; + char str_cc[BOTS_TMP_STR_SZ]; + char str_cflags[BOTS_TMP_STR_SZ]; + char str_ld[BOTS_TMP_STR_SZ]; + char str_ldflags[BOTS_TMP_STR_SZ]; + char str_cutoff[BOTS_TMP_STR_SZ]; + + /* compute output strings */ + sprintf(str_name, "%s", bots_name); + sprintf(str_parameters, "%s", bots_parameters); + sprintf(str_model, "%s", bots_model); + sprintf(str_cutoff, "%s", bots_cutoff); + sprintf(str_resources, "%s", bots_resources); + switch(bots_result) + { + case BOTS_RESULT_NA: + sprintf(str_result, "n/a"); + break; + case BOTS_RESULT_SUCCESSFUL: + sprintf(str_result, "successful"); + break; + case BOTS_RESULT_UNSUCCESSFUL: + 
sprintf(str_result, "UNSUCCESSFUL"); + break; + case BOTS_RESULT_NOT_REQUESTED: + sprintf(str_result, "Not requested"); + break; + default: + sprintf(str_result, "error"); + break; + } + sprintf(str_time_program, "%f", bots_time_program); + if (bots_sequential_flag) sprintf(str_time_sequential, "%f", bots_time_sequential); + else sprintf(str_time_sequential, "n/a"); + if (bots_sequential_flag) + sprintf(str_speed_up, "%3.2f", bots_time_sequential/bots_time_program); + else sprintf(str_speed_up, "n/a"); + + sprintf(str_number_of_tasks, "%3.2f", (float) bots_number_of_tasks); + sprintf(str_number_of_tasks_per_second, "%3.2f", (float) bots_number_of_tasks/bots_time_program); + + sprintf(str_exec_date, "%s", bots_exec_date); + sprintf(str_exec_message, "%s", bots_exec_message); + bots_get_architecture(str_architecture); + bots_get_load_average(str_load_avg); + sprintf(str_comp_date, "%s", bots_comp_date); + sprintf(str_comp_message, "%s", bots_comp_message); + sprintf(str_cc, "%s", bots_cc); + sprintf(str_cflags, "%s", bots_cflags); + sprintf(str_ld, "%s", bots_ld); + sprintf(str_ldflags, "%s", bots_ldflags); + + if(bots_print_header) + { + switch(bots_output_format) + { + case 0: + break; + case 1: + break; + case 2: +PRINTC( +"Benchmark;Parameters;Model;Cutoff;Resources;Result;\ +Time;Sequential;Speed-up;\ +Nodes;Nodes/Sec;\ +Exec Date;Exec Time;Exec Message;\ +Architecture;Processors;Load Avg-1;Load Avg-5;Load Avg-15;\ +Comp Date;Comp Time;Comp Message;CC;CFLAGS;LD;LDFLAGS\n"); + break; + case 3: + break; + case 4: +PRINTC( +"Benchmark;Parameters;Model;Cutoff;Resources;Result;\ +Time;Sequential;Speed-up;\ +Nodes;Nodes/Sec;\n"); + break; + default: + break; + } + } + + /* print results */ + switch(bots_output_format) + { + case 0: + break; + case 1: + PRINTC("\n"); + PRINTC("Program = %s\n", str_name); /*fix*/ + PRINTC("Parameters = %s\n", str_parameters); /*fix*/ + PRINTC("Model = %s\n", str_model); + PRINTC("Embedded cut-off = %s\n", str_cutoff); + PRINTC("# of 
Threads = %s\n", str_resources); + PRINTC("Verification = %s\n", str_result); + + PRINTC("Time Program = %s seconds\n", str_time_program); + if (bots_sequential_flag) { + PRINTC("Time Sequential = %s seconds\n", str_time_sequential); + PRINTC("Speed-up = %s\n", str_speed_up); + } + + if ( bots_number_of_tasks > 0 ) { + PRINTC("Nodes = %s\n", str_number_of_tasks); + PRINTC("Nodes/Sec = %s\n", str_number_of_tasks_per_second); + } + + PRINTC("Execution Date = %s\n", str_exec_date); + PRINTC("Execution Message = %s\n", str_exec_message); + + PRINTC("Architecture = %s\n", str_architecture); + PRINTC("Load Avg [1:5:15] = %s\n", str_load_avg); + + PRINTC("Compilation Date = %s\n", str_comp_date); + PRINTC("Compilation Message = %s\n", str_comp_message); + + PRINTC("Compiler = %s\n", str_cc); + PRINTC("Compiler Flags = %s\n", str_cflags); + PRINTC("Linker = %s\n", str_ld); + PRINTC("Linker Flags = %s\n", str_ldflags); + fflush(stdout); + break; + case 2: + PRINTC("%s;%s;%s;%s;%s;%s;", + str_name, + str_parameters, + str_model, + str_cutoff, + str_resources, + str_result + ); + PRINTC("%s;%s;%s;", + str_time_program, + str_time_sequential, + str_speed_up + ); + PRINTC("%s;%s;", + str_number_of_tasks, + str_number_of_tasks_per_second + ); + PRINTC("%s;%s;", + str_exec_date, + str_exec_message + ); + PRINTC("%s;%s;", + str_architecture, + str_load_avg + ); + PRINTC("%s;%s;", + str_comp_date, + str_comp_message + ); + PRINTC("%s;%s;%s;%s;", + str_cc, + str_cflags, + str_ld, + str_ldflags + ); + PRINTC("\n"); + break; + case 3: + PRINTC("\n"); + PRINTC("Program = %s\n", str_name); /*fix*/ + PRINTC("Parameters = %s\n", str_parameters); /*fix*/ + PRINTC("Model = %s\n", str_model); + PRINTC("Embedded cut-off = %s\n", str_cutoff); + PRINTC("# of Threads = %s\n", str_resources); + PRINTC("Verification = %s\n", str_result); + + PRINTC("Time Program = %s seconds\n", str_time_program); + if (bots_sequential_flag) { + PRINTC("Time Sequential = %s seconds\n", str_time_sequential); + 
PRINTC("Speed-up = %s\n", str_speed_up); + } + + if ( bots_number_of_tasks > 0 ) { + PRINTC("Nodes = %s\n", str_number_of_tasks); + PRINTC("Nodes/Sec = %s\n", str_number_of_tasks_per_second); + } + break; + case 4: + PRINTC("%s;%s;%s;%s;%s;%s;", + str_name, + str_parameters, + str_model, + str_cutoff, + str_resources, + str_result + ); + PRINTC("%s;%s;%s;", + str_time_program, + str_time_sequential, + str_speed_up + ); + PRINTC("%s;%s;", + str_number_of_tasks, + str_number_of_tasks_per_second + ); + PRINTC("\n"); + break; + default: + bots_error(BOTS_ERROR,"No valid output format\n"); + break; + } +} + diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots_common.h b/src/components/implementation/no_interface/omp_fib_bots/bots_common.h new file mode 100644 index 0000000000..9d38799ef1 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/bots_common.h @@ -0,0 +1,56 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#ifndef _COMMON_H +#define _COMMON_H + +#ifndef CC +#define CC "GCC" +#endif +#ifndef CFLAGS +#define CFLAGS "-fopenmp" +#endif +#ifndef LD +#define LD "LD" +#endif +#ifndef LDFLAGS +#define LDFLAGS "-fopenmp -lcos_gomp" +#endif +#ifndef CDATE +#define CDATE "01/01/0001" +#endif +#ifndef CMESSAGE +#define CMESSAGE "Done!" +#endif + +#define BOTS_ERROR 0 +#define BOTS_ERROR_NOT_ENOUGH_MEMORY 1 +#define BOTS_ERROR_UNRECOGNIZED_PARAMETER 2 + +#define BOTS_WARNING 0 + +void bots_get_date(char *str); +void bots_get_architecture(char *str); +void bots_get_load_average(char *str); +void bots_print_results(void); + +#define BOTS_TMP_STR_SZ 64 + +#endif diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots_main.c b/src/components/implementation/no_interface/omp_fib_bots/bots_main.c new file mode 100644 index 0000000000..e70ca2fccb --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/bots_main.c @@ -0,0 +1,540 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. 
*/ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +/*********************************************************************** + * main function & common behaviour of the benchmark. + **********************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include "bots_common.h" +#include "bots_main.h" +#include "bots.h" +#include "app-desc.h" +#include + +/*********************************************************************** + * DEFAULT VALUES + *********************************************************************/ +/* common flags */ +int bots_sequential_flag = FALSE; +int bots_check_flag = FALSE; +bots_verbose_mode_t bots_verbose_mode = BOTS_VERBOSE_DEFAULT; +int bots_result = BOTS_RESULT_NOT_REQUESTED; +int bots_output_format = 1; +int bots_print_header = FALSE; +/* common variables */ +char bots_name[BOTS_TMP_STR_SZ]; +char bots_execname[BOTS_TMP_STR_SZ]; +char bots_parameters[BOTS_TMP_STR_SZ]; +char bots_model[BOTS_TMP_STR_SZ]; +char bots_resources[BOTS_TMP_STR_SZ]; +/* compile and execution information */ +char bots_exec_date[BOTS_TMP_STR_SZ]; +char bots_exec_message[BOTS_TMP_STR_SZ]; +char bots_comp_date[BOTS_TMP_STR_SZ]; +char bots_comp_message[BOTS_TMP_STR_SZ]; +char bots_cc[BOTS_TMP_STR_SZ]; +char bots_cflags[BOTS_TMP_STR_SZ]; +char bots_ld[BOTS_TMP_STR_SZ]; +char bots_ldflags[BOTS_TMP_STR_SZ]; +char bots_cutoff[BOTS_TMP_STR_SZ]; + 
+/* time variables */ +double bots_time_program = 0.0; +double bots_time_sequential = 0.0; +unsigned long long bots_number_of_tasks = 0; /* forcing 8 bytes size in -m32 and -m64 */ + +/* + * Application dependent info + */ + +#ifndef BOTS_APP_NAME +#error "Application name must be defined (#define BOTS_APP_NAME)" +#endif + +#ifndef BOTS_APP_PARAMETERS_DESC +#define BOTS_APP_PARAMETERS_DESC "" +#endif + +#ifndef BOTS_APP_PARAMETERS_LIST +#define BOTS_APP_PARAMETERS_LIST +#endif + +#ifndef BOTS_APP_INIT +#define BOTS_APP_INIT +#endif + +#ifndef BOTS_APP_FINI +#define BOTS_APP_FINI +#endif + +#ifndef KERNEL_CALL +#error "Initial kernell call must be specified (#define KERNEL_CALL)" +#endif + +#ifndef KERNEL_INIT +#define KERNEL_INIT +#endif + +#ifndef KERNEL_FINI +#define KERNEL_FINI +#endif + +#ifndef KERNEL_SEQ_INIT +#define KERNEL_SEQ_INIT +#endif + +#ifndef KERNEL_SEQ_FINI +#define KERNEL_SEQ_FINI +#endif + +#ifndef BOTS_MODEL_DESC +#define BOTS_MODEL_DESC "Unknown" +#endif + +#ifdef BOTS_APP_USES_ARG_SIZE +#ifndef BOTS_APP_DEF_ARG_SIZE +#error "Default vaule for argument size must be specified (#define BOTS_APP_DEF_ARG_SIZE)" +#endif +#ifndef BOTS_APP_DESC_ARG_SIZE +#error "Help description for argument size must be specified (#define BOTS_APP_DESC_ARG_SIZE)" +#endif +int bots_arg_size = BOTS_APP_DEF_ARG_SIZE; +#endif + +#ifdef BOTS_APP_USES_ARG_SIZE_1 +#ifndef BOTS_APP_DEF_ARG_SIZE_1 +#error "Default vaule for argument size must be specified (#define BOTS_APP_DEF_ARG_SIZE_1)" +#endif +#ifndef BOTS_APP_DESC_ARG_SIZE_1 +#error "Help description for argument size must be specified (#define BOTS_APP_DESC_ARG_SIZE_1)" +#endif +int bots_arg_size_1 = BOTS_APP_DEF_ARG_SIZE_1; +#endif + +#ifdef BOTS_APP_USES_ARG_SIZE_2 +#ifndef BOTS_APP_DEF_ARG_SIZE_2 +#error "Default vaule for argument size must be specified (#define BOTS_APP_DEF_ARG_SIZE_2)" +#endif +#ifndef BOTS_APP_DESC_ARG_SIZE_2 +#error "Help description for argument size must be specified (#define 
BOTS_APP_DESC_ARG_SIZE_2)" +#endif +int bots_arg_size_2 = BOTS_APP_DEF_ARG_SIZE_2; +#endif + +#ifdef BOTS_APP_USES_ARG_REPETITIONS +#ifndef BOTS_APP_DEF_ARG_REPETITIONS +#error "Default vaule for argument repetitions must be specified (#define BOTS_APP_DEF_ARG_REPETITIONS)" +#endif +#ifndef BOTS_APP_DESC_ARG_REPETITIONS +#error "Help description for argument repetitions must be specified (#define BOTS_APP_DESC_ARG_REPETITIONS)" +#endif +int bots_arg_repetitions = BOTS_APP_DEF_ARG_REPETITIONS; +#endif + +#ifdef BOTS_APP_USES_ARG_FILE +#ifndef BOTS_APP_DESC_ARG_FILE +#error "Help description for argument file must be specified (#define BOTS_APP_DESC_ARG_FILE)" +#endif +char bots_arg_file[255]=""; +#endif + +#ifdef BOTS_APP_USES_ARG_BLOCK +#ifndef BOTS_APP_DEF_ARG_BLOCK +#error "Default value for argument block must be specified (#define BOTS_APP_DEF_ARG_BLOCK)" +#endif +#ifndef BOTS_APP_DESC_ARG_BLOCK +#error "Help description for argument block must be specified (#define BOTS_APP_DESC_ARG_BLOCK)" +#endif +int bots_arg_block = BOTS_APP_DEF_ARG_BLOCK; +#endif + +#ifdef BOTS_APP_USES_ARG_CUTOFF +#ifndef BOTS_APP_DEF_ARG_CUTOFF +#error "Default value for argument cutoff must be specified (#define BOTS_APP_DEF_ARG_CUTOFF)" +#endif +#ifndef BOTS_APP_DESC_ARG_CUTOFF +#error "Help description for argument cutoff must be specified (#define BOTS_APP_DESC_ARG_CUTOFF)" +#endif +int bots_app_cutoff_value = BOTS_APP_DEF_ARG_CUTOFF; +#endif + +#ifdef BOTS_APP_USES_ARG_CUTOFF_1 +#ifndef BOTS_APP_DEF_ARG_CUTOFF_1 +#error "Default value for argument cutoff must be specified (#define BOTS_APP_DEF_ARG_CUTOFF_1)" +#endif +#ifndef BOTS_APP_DESC_ARG_CUTOFF_1 +#error "Help description for argument cutoff must be specified (#define BOTS_APP_DESC_ARG_CUTOFF_1)" +#endif +int bots_app_cutoff_value_1 = BOTS_APP_DEF_ARG_CUTOFF_1; +#endif + +#ifdef BOTS_APP_USES_ARG_CUTOFF_2 +#ifndef BOTS_APP_DEF_ARG_CUTOFF_2 +#error "Default value for argument cutoff must be specified (#define 
BOTS_APP_DEF_ARG_CUTOFF_2)" +#endif +#ifndef BOTS_APP_DESC_ARG_CUTOFF_2 +#error "Help description for argument cutoff must be specified (#define BOTS_APP_DESC_ARG_CUTOFF_2)" +#endif +int bots_app_cutoff_value_2 = BOTS_APP_DEF_ARG_CUTOFF_2; +#endif + +#if defined(MANUAL_CUTOFF) || defined(IF_CUTOFF) || defined(FINAL_CUTOFF) +int bots_cutoff_value = BOTS_CUTOFF_DEF_VALUE; +#endif + +/*********************************************************************** + * print_usage: + **********************************************************************/ +void bots_print_usage() +{ + PRINTC("\n"); + PRINTC("Usage: %s -[options]\n", bots_execname); + PRINTC("\n"); + PRINTC("Where options are:\n"); +#ifdef BOTS_APP_USES_REPETITIONS + PRINTC(" -r : Set the number of repetitions (default = 1).\n"); +#endif +#ifdef BOTS_APP_USES_ARG_SIZE + PRINTC(" -n : "BOTS_APP_DESC_ARG_SIZE"\n"); +#endif +#ifdef BOTS_APP_USES_ARG_SIZE_1 + PRINTC(" -m : "BOTS_APP_DESC_ARG_SIZE_1"\n"); +#endif +#ifdef BOTS_APP_USES_ARG_SIZE_2 + PRINTC(" -l : "BOTS_APP_DESC_ARG_SIZE_2"\n"); +#endif +#ifdef BOTS_APP_USES_ARG_FILE + PRINTC(" -f : "BOTS_APP_DESC_ARG_FILE"\n"); +#endif +#if defined(MANUAL_CUTOFF) || defined(IF_CUTOFF) || defined(FINAL_CUTOFF) + PRINTC(" -x : OpenMP tasks cut-off value (default=%d)\n",BOTS_CUTOFF_DEF_VALUE); +#endif +#ifdef BOTS_APP_USES_ARG_CUTOFF + PRINTC(" -y : "BOTS_APP_DESC_ARG_CUTOFF"(default=%d)\n", BOTS_APP_DEF_ARG_CUTOFF); +#endif +#ifdef BOTS_APP_USES_ARG_CUTOFF_1 + PRINTC(" -a : "BOTS_APP_DESC_ARG_CUTOFF_1"(default=%d)\n", BOTS_APP_DEF_ARG_CUTOFF_1); +#endif +#ifdef BOTS_APP_USES_ARG_CUTOFF_2 + PRINTC(" -b : "BOTS_APP_DESC_ARG_CUTOFF_2"(default=%d)\n", BOTS_APP_DEF_ARG_CUTOFF_2); +#endif + + PRINTC("\n"); + PRINTC(" -e : Include 'str' execution message.\n"); + PRINTC(" -v : Set verbose level (default = 1).\n"); + PRINTC(" 0 - none.\n"); + PRINTC(" 1 - default.\n"); + PRINTC(" 2 - debug.\n"); + PRINTC(" -o : Set output format mode (default = 1).\n"); + PRINTC(" 0 - no benchmark 
output.\n"); + PRINTC(" 1 - detailed list format.\n"); + PRINTC(" 2 - detailed row format.\n"); + PRINTC(" 3 - abridged list format.\n"); + PRINTC(" 4 - abridged row format.\n"); + PRINTC(" -z : Print row header (if output format is a row variant).\n"); + PRINTC("\n"); +#ifdef KERNEL_SEQ_CALL + PRINTC(" -s : Run sequential version.\n"); +#endif +#ifdef BOTS_APP_CHECK_USES_SEQ_RESULT + PRINTC(" -c : Check mode ON (implies running sequential version).\n"); +#else + PRINTC(" -c : Check mode ON.\n"); +#endif + PRINTC("\n"); + PRINTC(" -h : Print program's usage (this help).\n"); + PRINTC("\n"); +} +/*********************************************************************** + * bots_get_params_common: + **********************************************************************/ +void +bots_get_params_common(int argc, char **argv) +{ + int i; + strcpy(bots_execname, basename(argv[0])); + bots_get_date(bots_exec_date); + strcpy(bots_exec_message,""); + for (i=1; i 1 ) { + PRINTC("Error: Configure the suite using '--debug' option in order to use a verbose level greather than 1.\n"); + cos_exit(100); + } +#endif + break; +#if defined(MANUAL_CUTOFF) || defined(IF_CUTOFF) || defined(FINAL_CUTOFF) + case 'x': + argv[i][1] = '*'; + i++; + if (argc == i) { bots_print_usage(); cos_exit(100); } + bots_cutoff_value = atoi(argv[i]); + break; +#endif +#ifdef BOTS_APP_USES_ARG_CUTOFF + case 'y': + argv[i][1] = '*'; + i++; + if (argc == i) { bots_print_usage(); cos_exit(100); } + bots_app_cutoff_value = atoi(argv[i]); + break; +#endif + case 'z': + argv[i][1] = '*'; + bots_print_header = TRUE; + break; + default: + // As at the moment there are only common paramenters + // we launch an error. Otherwise we have to ignore the + // parameter and to check, after specific parameters are + // completely read, if there are unrecognized parameters. 
+ PRINTC("Error: Unrecognized parameter.\n"); + bots_print_usage(); + cos_exit (100); + } + } + else + { + // As at the moment there are only common paramenters + // we launch an error. Otherwise we have to ignore the + // parameter and to check, after specific parameters are + // completely read, if there are unrecognized parameters. + PRINTC("Error: Unrecognized parameter.\n"); + bots_print_usage(); + cos_exit (100); + } + } + + /* always verify? */ + bots_check_flag = TRUE; +} +/*********************************************************************** + * bots_get_params_common: + **********************************************************************/ +void +bots_get_params(int argc, char **argv) +{ + bots_get_params_common(argc, argv); +// bots_get_params_specific(argc, argv); +} + + +/*********************************************************************** + * bots_set_info + **********************************************************************/ +void bots_set_info () +{ + /* program specific info */ + snprintf(bots_name, BOTS_TMP_STR_SZ, BOTS_APP_NAME); + snprintf(bots_parameters, BOTS_TMP_STR_SZ, BOTS_APP_PARAMETERS_DESC BOTS_APP_PARAMETERS_LIST); + snprintf(bots_model, BOTS_TMP_STR_SZ, BOTS_MODEL_DESC); + snprintf(bots_resources, BOTS_TMP_STR_SZ, "%d", omp_get_max_threads()); + + /* compilation info (do not modify) */ + snprintf(bots_comp_date, BOTS_TMP_STR_SZ, CDATE); + snprintf(bots_comp_message, BOTS_TMP_STR_SZ, CMESSAGE); + snprintf(bots_cc, BOTS_TMP_STR_SZ, CC); + snprintf(bots_cflags, BOTS_TMP_STR_SZ, CFLAGS); + snprintf(bots_ld, BOTS_TMP_STR_SZ, LD); + snprintf(bots_ldflags, BOTS_TMP_STR_SZ, LDFLAGS); + +#if defined(MANUAL_CUTOFF) + snprintf(bots_cutoff, BOTS_TMP_STR_SZ, "manual (%d)",bots_cutoff_value); +#elif defined(IF_CUTOFF) + snprintf(bots_cutoff, BOTS_TMP_STR_SZ, "pragma-if (%d)",bots_cutoff_value); +#elif defined(FINAL_CUTOFF) + snprintf(bots_cutoff, BOTS_TMP_STR_SZ, "final (%d)",bots_cutoff_value); +#else + strcpy(bots_cutoff,"none"); +#endif 
+} + +/*********************************************************************** + * main: + **********************************************************************/ +int +main(void) +{ + /* TODO: app specific args? */ + int argc = 1; + char *app = "bots_app"; + char **argv = &app; + +#ifndef BOTS_APP_SELF_TIMING + long bots_t_start; + long bots_t_end; +#endif + + bots_get_params(argc,argv); + BOTS_APP_INIT; + bots_set_info(); + +#ifdef KERNEL_SEQ_CALL +#ifdef BOTS_APP_CHECK_USES_SEQ_RESULT + if (bots_sequential_flag || bots_check_flag) +#else + if (bots_sequential_flag) +#endif + { + bots_sequential_flag = 1; + KERNEL_SEQ_INIT; +#ifdef BOTS_APP_SELF_TIMING + bots_time_sequential = KERNEL_SEQ_CALL; +#else + bots_t_start = bots_usecs(); + KERNEL_SEQ_CALL; + bots_t_end = bots_usecs(); + bots_time_sequential = ((double)(bots_t_end-bots_t_start))/1000000; +#endif + KERNEL_SEQ_FINI; + } +#endif + + KERNEL_INIT; +#ifdef BOTS_APP_SELF_TIMING + bots_time_program = KERNEL_CALL; +#else + bots_t_start = bots_usecs(); + KERNEL_CALL; + bots_t_end = bots_usecs(); + bots_time_program = ((double)(bots_t_end-bots_t_start))/1000000; +#endif + KERNEL_FINI; + +#ifdef KERNEL_CHECK + if (bots_check_flag) { + bots_result = KERNEL_CHECK; + } +#endif + + BOTS_APP_FINI; + + bots_print_results(); + return (0); +} + diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots_main.h b/src/components/implementation/no_interface/omp_fib_bots/bots_main.h new file mode 100644 index 0000000000..8d1a9ca9a6 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/bots_main.h @@ -0,0 +1,53 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it 
and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#define BOTS_PARAM_TYPE_NONE 0 +#define BOTS_PARAM_TYPE_INT 1 +#define BOTS_PARAM_TYPE_BOOL 2 +#define BOTS_PARAM_TYPE_STR 3 + +#ifdef _OPENMP +# include +#else +# define omp_get_max_threads() 1 +# define omp_get_thread_num() 0 +# define omp_set_num_threads(x) +#endif + +void bots_print_usage(void); +void bots_print_usage_option(char opt, int type, char* description, char *val, int subc, char **subv); + +/*********************************************************************** + * BENCHMARK HEADERS + *********************************************************************/ +void bots_initialize(); +void bots_finalize(); +void bots_sequential_ini(); +long bots_sequential(); +void bots_sequential_fini(); +int bots_check_result(); +void bots_print_usage_specific(); +void bots_get_params_specific(int argc, char **argv); +void bots_set_info(); + +void bots_get_params_common(int argc, char **argv); +void bots_get_params(int argc, char **argv); + +extern void cos_exit(int x); diff --git a/src/components/implementation/no_interface/omp_fib_bots/fib.c b/src/components/implementation/no_interface/omp_fib_bots/fib.c new file mode 100644 index 0000000000..445b1b40d5 --- /dev/null +++ 
b/src/components/implementation/no_interface/omp_fib_bots/fib.c @@ -0,0 +1,235 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "bots.h" +#include "fib.h" + +#define FIB_RESULTS_PRE 41 +long long fib_results[FIB_RESULTS_PRE] = {0,1,1,2,3,5,8,13,21,34,55,89,144,233,377,610,987,1597,2584,4181,6765,10946,17711,28657,46368,75025,121393,196418,317811,514229,832040,1346269,2178309,3524578,5702887,9227465,14930352,24157817,39088169,63245986,102334155}; + +long long fib_seq (int n) +{ + int x, y; + if (n < 2) return n; + + x = fib_seq(n - 1); + y = fib_seq(n - 2); + + return x + y; +} + +#if defined(FORCE_TIED_TASKS) +#if defined(IF_CUTOFF) + +long long fib (int n,int d) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task shared(x) firstprivate(n) if(d < bots_cutoff_value) + x = fib(n - 1,d+1); + + #pragma omp task 
shared(y) firstprivate(n) if(d < bots_cutoff_value) + y = fib(n - 2,d+1); + + #pragma omp taskwait + return x + y; +} + +#elif defined(FINAL_CUTOFF) + +long long fib (int n,int d) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task shared(x) firstprivate(n) final(d+1 >= bots_cutoff_value) mergeable + x = fib(n - 1,d+1); + + #pragma omp task shared(y) firstprivate(n) final(d+1 >= bots_cutoff_value) mergeable + y = fib(n - 2,d+1); + + #pragma omp taskwait + return x + y; +} + +#elif defined(MANUAL_CUTOFF) + +long long fib (int n, int d) +{ + long long x, y; + if (n < 2) return n; + + if ( d < bots_cutoff_value ) { + #pragma omp task shared(x) firstprivate(n) + x = fib(n - 1,d+1); + + #pragma omp task shared(y) firstprivate(n) + y = fib(n - 2,d+1); + + #pragma omp taskwait + } else { + x = fib_seq(n-1); + y = fib_seq(n-2); + } + + return x + y; +} + +#else + +long long fib (int n) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task shared(x) firstprivate(n) + x = fib(n - 1); + #pragma omp task shared(y) firstprivate(n) + y = fib(n - 2); + + #pragma omp taskwait + return x + y; +} + +#endif +#else + +#if defined(IF_CUTOFF) + +long long fib (int n,int d) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task untied shared(x) firstprivate(n) if(d < bots_cutoff_value) + x = fib(n - 1,d+1); + + #pragma omp task untied shared(y) firstprivate(n) if(d < bots_cutoff_value) + y = fib(n - 2,d+1); + + #pragma omp taskwait + return x + y; +} + +#elif defined(FINAL_CUTOFF) + +long long fib (int n,int d) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task untied shared(x) firstprivate(n) final(d+1 >= bots_cutoff_value) mergeable + x = fib(n - 1,d+1); + + #pragma omp task untied shared(y) firstprivate(n) final(d+1 >= bots_cutoff_value) mergeable + y = fib(n - 2,d+1); + + #pragma omp taskwait + return x + y; +} + +#elif defined(MANUAL_CUTOFF) + +long long fib (int n, int d) +{ + long long x, y; + if (n < 2) return n; + + if ( d < 
bots_cutoff_value ) { + #pragma omp task untied shared(x) firstprivate(n) + x = fib(n - 1,d+1); + + #pragma omp task untied shared(y) firstprivate(n) + y = fib(n - 2,d+1); + + #pragma omp taskwait + } else { + x = fib_seq(n-1); + y = fib_seq(n-2); + } + + return x + y; +} + +#else + +long long fib (int n) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task untied shared(x) firstprivate(n) + x = fib(n - 1); + #pragma omp task untied shared(y) firstprivate(n) + y = fib(n - 2); + + #pragma omp taskwait + return x + y; +} + +#endif +#endif + +static long long par_res, seq_res; + +void fib0 (int n) +{ + #pragma omp parallel + #pragma omp single +#if defined(MANUAL_CUTOFF) || defined(IF_CUTOFF) || defined(FINAL_CUTOFF) + par_res = fib(n,0); +#else + par_res = fib(n); +#endif + bots_message("Fibonacci result for %d is %lld\n",n,par_res); +} + +void fib0_seq (int n) +{ + seq_res = fib_seq(n); + bots_message("Fibonacci result for %d is %lld\n",n,seq_res); +} + +long long fib_verify_value(int n) +{ + if (n < FIB_RESULTS_PRE) return fib_results[n]; + return ( fib_verify_value(n-1) + fib_verify_value(n-2)); +} + +int fib_verify (int n) +{ + int result; + + if (bots_sequential_flag) + { + if (par_res == seq_res) result = BOTS_RESULT_SUCCESSFUL; + else result = BOTS_RESULT_UNSUCCESSFUL; + } + else + { + seq_res = fib_verify_value(n); + if (par_res == seq_res) result = BOTS_RESULT_SUCCESSFUL; + else result = BOTS_RESULT_UNSUCCESSFUL; + } + + return result; +} + diff --git a/src/components/implementation/no_interface/omp_fib_bots/fib.h b/src/components/implementation/no_interface/omp_fib_bots/fib.h new file mode 100644 index 0000000000..e3d2983e7c --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/fib.h @@ -0,0 +1,40 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro 
Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ +#ifndef FIB_H +#define FIB_H +#if defined(IF_CUTOFF) +long long fib (int n,int d); +#elif defined(FINAL_CUTOFF) +long long fib (int n,int d); +#elif defined(MANUAL_CUTOFF) +long long fib (int n,int d); +#else +long long fib (int n); +#endif + +long long fib_seq (int n); + +void fib0 (int n); +void fib0_seq (int n); + +int fib_verify (int n); +long long fib_verify_value(int n); +#endif + diff --git a/src/components/implementation/no_interface/omp_fib_bots/init.c b/src/components/implementation/no_interface/omp_fib_bots/init.c new file mode 120000 index 0000000000..b2694bf833 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/init.c @@ -0,0 +1 @@ +../omp_hello/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fib_bots/omp-tasks-app.h b/src/components/implementation/no_interface/omp_fib_bots/omp-tasks-app.h new file mode 100644 index 0000000000..9cbc9282b2 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/omp-tasks-app.h @@ 
-0,0 +1,31 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include + +#define MODEL OMP-TASKS + +#ifdef FORCE_TIED_TASKS +#define BOTS_MODEL_DESC "OpenMP (using tied tasks)" +#else +#define BOTS_MODEL_DESC "OpenMP (using tasks)" +#endif + + diff --git a/src/components/implementation/no_interface/omp_fib_bots/posix_basic.c b/src/components/implementation/no_interface/omp_fib_bots/posix_basic.c new file mode 120000 index 0000000000..99b9e18548 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/posix_basic.c @@ -0,0 +1 @@ +../omp_dijkstra/posix_basic.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_hello/Makefile b/src/components/implementation/no_interface/omp_hello/Makefile new file mode 100644 index 0000000000..ba90175127 --- /dev/null +++ 
b/src/components/implementation/no_interface/omp_hello/Makefile @@ -0,0 +1,10 @@ +COMPONENT=omp_hello.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp diff --git a/src/components/implementation/no_interface/omp_hello/hello_omp.c b/src/components/implementation/no_interface/omp_hello/hello_omp.c new file mode 100644 index 0000000000..f96d49d3fc --- /dev/null +++ b/src/components/implementation/no_interface/omp_hello/hello_omp.c @@ -0,0 +1,92 @@ +#include +#include + +/******************************************************************************/ + +int main ( void ) + +/******************************************************************************/ +/* + Purpose: + + HELLO has each thread print out its ID. + + Discussion: + + HELLO is a "Hello, World" program for OpenMP. + + Licensing: + + This code is distributed under the GNU LGPL license. + + Modified: + + 23 June 2010 + + Author: + + John Burkardt +*/ +{ + int id; + double wtime; + + PRINTC ( "\n" ); + PRINTC ( "HELLO_OPENMP\n" ); + PRINTC ( " C/OpenMP version\n" ); + + PRINTC ( "\n" ); + PRINTC ( " Number of processors available = %d\n", omp_get_num_procs ( ) ); + PRINTC ( " Number of threads = %d\n", omp_get_max_threads ( ) ); + + wtime = omp_get_wtime ( ); + + PRINTC ( "\n" ); + PRINTC ( " OUTSIDE the parallel region.\n" ); + PRINTC ( "\n" ); + + id = omp_get_thread_num ( ); + PRINTC ( " HELLO from process %d\n", id ) ; + + PRINTC ( "\n" ); + PRINTC ( " Going INSIDE the parallel region:\n" ); + PRINTC ( "\n" ); +/* + INSIDE THE PARALLEL REGION, have each thread say hello. 
+*/ +#if 1 +#pragma omp parallel private(id) + { +#pragma omp for + for (id = 0; id < 10; id++) + { + PRINTC("id:%u\n", id); + } + } +#else +# pragma omp parallel\ + private ( id ) + { + id = omp_get_thread_num ( ); + PRINTC (" Hello from process %d\n", id ); + } +#endif +/* + Finish up by measuring the elapsed time. +*/ + wtime = omp_get_wtime ( ) - wtime; + + PRINTC ( "\n" ); + PRINTC ( " Back OUTSIDE the parallel region.\n" ); +/* + Terminate. +*/ + PRINTC ( "\n" ); + PRINTC ( "HELLO_OPENMP\n" ); + PRINTC ( " Normal end of execution.\n" ); + + PRINTC ( "\n" ); + PRINTC ( " Elapsed wall clock time = %f\n", wtime ); + + return 0; +} diff --git a/src/components/implementation/no_interface/omp_hello/init.c b/src/components/implementation/no_interface/omp_hello/init.c new file mode 100644 index 0000000000..ddba532393 --- /dev/null +++ b/src/components/implementation/no_interface/omp_hello/init.c @@ -0,0 +1,78 @@ +#include +#include +#include +#include +#include +#include + +int main(void); + +void +cos_exit(int x) +{ + PRINTC("Exit code: %d\n", x); + while (1) ; +} + +static void +cos_main(void *d) +{ + assert(sl_thd_thdid(sl_thd_curr()) == cos_thdid()); + main(); + + while (1) ; +} + +extern void cos_gomp_init(void); + +void +cos_init(void *d) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + int i; + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + static unsigned b1 = 0, b2 = 0, b3 = 0; + + PRINTC("In an OpenMP program!\n"); + if (ps_cas(&first, NUM_CPU + 1, cos_cpuid())) { + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_llinit(); + } else { + while (!ps_load(&init_done[first])) ; + + cos_defcompinfo_sched_init(); + } + cos_dcb_info_init_curr(); + ps_faa(&init_done[cos_cpuid()], 1); + + /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! 
*/ + for (i = 0; i < NUM_CPU; i++) { + while (!ps_load(&init_done[i])) ; + } + sl_init(SL_MIN_PERIOD_US*100); + /* barrier, wait for sl_init to be done on all cores */ + ps_faa(&b1, 1); + while (ps_load(&b1) != NUM_CPU) ; + cos_gomp_init(); + /* barrier, wait for gomp_init to be done on all cores */ + ps_faa(&b2, 1); + while (ps_load(&b2) != NUM_CPU) ; + + if (!cos_cpuid()) { + struct sl_thd *t = NULL; + + t = sl_thd_alloc(cos_main, NULL); + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); + } + /* wait for all cores to reach this point, so all threads wait for main thread to be ready! */ + ps_faa(&b3, 1); + while (ps_load(&b3) != NUM_CPU) ; + + sl_sched_loop_nonblock(); + + PRINTC("Should never get here!\n"); + assert(0); +} + diff --git a/src/components/implementation/no_interface/omp_sort_bots/Makefile b/src/components/implementation/no_interface/omp_sort_bots/Makefile new file mode 100644 index 0000000000..05d43d1f94 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/Makefile @@ -0,0 +1,19 @@ +COMPONENT=omp_sort_bots.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp +# if tied tasks are required +CFLAGS += -DFORCE_TIED_TASKS + +OMPC_FINAL_FLAGS= + +# one per compilation or none +#CFLAGS += -DMANUAL_CUTOFF +#CFLAGS += -DIF_CUTOFF +#CFLAGS += -DFINAL_CUTOFF $(OMPC_FINAL_FLAGS) diff --git a/src/components/implementation/no_interface/omp_sort_bots/app-desc.h b/src/components/implementation/no_interface/omp_sort_bots/app-desc.h new file mode 100644 index 0000000000..85e6e47782 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/app-desc.h @@ -0,0 +1,66 @@ 
+/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "omp-tasks-app.h" + +#define BOTS_APP_NAME "Sort" +#define BOTS_APP_PARAMETERS_DESC "N=%d:Q=%d:I=%d:M=%d" +#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size,bots_app_cutoff_value_1,bots_app_cutoff_value_2,bots_app_cutoff_value + +#define BOTS_APP_USES_ARG_SIZE +#define BOTS_APP_DEF_ARG_SIZE (32*1024*1024) +#define BOTS_APP_DESC_ARG_SIZE "Array size" + +#define BOTS_APP_USES_ARG_CUTOFF +#define BOTS_APP_DEF_ARG_CUTOFF (2*1024) +#define BOTS_APP_DESC_ARG_CUTOFF "Sequential Merge cutoff value" + +#define BOTS_APP_USES_ARG_CUTOFF_1 +#define BOTS_APP_DEF_ARG_CUTOFF_1 (2*1024) +#define BOTS_APP_DESC_ARG_CUTOFF_1 "Sequential Quicksort cutoff value" + +#define BOTS_APP_USES_ARG_CUTOFF_2 +#define BOTS_APP_DEF_ARG_CUTOFF_2 (20) +#define BOTS_APP_DESC_ARG_CUTOFF_2 "Sequential Insertion cutoff 
value" + +typedef long ELM; + +void seqquick(ELM *low, ELM *high); +void seqmerge(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest); +ELM *binsplit(ELM val, ELM *low, ELM *high); +void cilkmerge(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest); +void cilkmerge_par(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest); +void cilksort(ELM *low, ELM *tmp, long size); +void cilksort_par(ELM *low, ELM *tmp, long size); +void scramble_array( ELM *array ); +void fill_array( ELM *array ); +void sort ( void ); + +void sort_par (void); +void sort_init (void); +int sort_verify (void); + +#define BOTS_APP_INIT sort_init() + +#define KERNEL_INIT +#define KERNEL_CALL sort_par() +#define KERNEL_CHECK sort_verify() + + diff --git a/src/components/implementation/no_interface/omp_sort_bots/bots.h b/src/components/implementation/no_interface/omp_sort_bots/bots.h new file mode 120000 index 0000000000..ea0ad2b59f --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/bots.h @@ -0,0 +1 @@ +../omp_fib_bots/bots.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/bots_common.c b/src/components/implementation/no_interface/omp_sort_bots/bots_common.c new file mode 120000 index 0000000000..4802b0cf70 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/bots_common.c @@ -0,0 +1 @@ +../omp_fib_bots/bots_common.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/bots_common.h b/src/components/implementation/no_interface/omp_sort_bots/bots_common.h new file mode 120000 index 0000000000..14eda863e4 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/bots_common.h @@ -0,0 +1 @@ +../omp_fib_bots/bots_common.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/bots_main.c b/src/components/implementation/no_interface/omp_sort_bots/bots_main.c new file mode 120000 
index 0000000000..14f2dab009 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/bots_main.c @@ -0,0 +1 @@ +../omp_fib_bots/bots_main.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/bots_main.h b/src/components/implementation/no_interface/omp_sort_bots/bots_main.h new file mode 120000 index 0000000000..86c06ad286 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/bots_main.h @@ -0,0 +1 @@ +../omp_fib_bots/bots_main.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/init.c b/src/components/implementation/no_interface/omp_sort_bots/init.c new file mode 120000 index 0000000000..9e09b82e77 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/init.c @@ -0,0 +1 @@ +../omp_fib_bots/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/omp-tasks-app.h b/src/components/implementation/no_interface/omp_sort_bots/omp-tasks-app.h new file mode 120000 index 0000000000..9fba574408 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/omp-tasks-app.h @@ -0,0 +1 @@ +../omp_fib_bots/omp-tasks-app.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/posix_basic.c b/src/components/implementation/no_interface/omp_sort_bots/posix_basic.c new file mode 120000 index 0000000000..9afee078fb --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/posix_basic.c @@ -0,0 +1 @@ +../omp_fib_bots/posix_basic.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/sort.c b/src/components/implementation/no_interface/omp_sort_bots/sort.c new file mode 100644 index 0000000000..d8140970d6 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/sort.c @@ -0,0 +1,517 @@ 
+/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +/* + * Original code from the Cilk project + * + * Copyright (c) 2000 Massachusetts Institute of Technology + * Copyright (c) 2000 Matteo Frigo + */ + +/* + * this program uses an algorithm that we call `cilksort'. + * The algorithm is essentially mergesort: + * + * cilksort(in[1..n]) = + * spawn cilksort(in[1..n/2], tmp[1..n/2]) + * spawn cilksort(in[n/2..n], tmp[n/2..n]) + * sync + * spawn cilkmerge(tmp[1..n/2], tmp[n/2..n], in[1..n]) + * + * + * The procedure cilkmerge does the following: + * + * cilkmerge(A[1..n], B[1..m], C[1..(n+m)]) = + * find the median of A \union B using binary + * search. 
The binary search gives a pair + * (ma, mb) such that ma + mb = (n + m)/2 + * and all elements in A[1..ma] are smaller than + * B[mb..m], and all the B[1..mb] are smaller + * than all elements in A[ma..n]. + * + * spawn cilkmerge(A[1..ma], B[1..mb], C[1..(n+m)/2]) + * spawn cilkmerge(A[ma..m], B[mb..n], C[(n+m)/2 .. (n+m)]) + * sync + * + * The algorithm appears for the first time (AFAIK) in S. G. Akl and + * N. Santoro, "Optimal Parallel Merging and Sorting Without Memory + * Conflicts", IEEE Trans. Comp., Vol. C-36 No. 11, Nov. 1987 . The + * paper does not express the algorithm using recursion, but the + * idea of finding the median is there. + * + * For cilksort of n elements, T_1 = O(n log n) and + * T_\infty = O(log^3 n). There is a way to shave a + * log factor in the critical path (left as homework). + */ + +#include +#include +#include +#include "bots.h" +#include "app-desc.h" + +ELM *array, *tmp; + +static unsigned long rand_nxt = 0; + +static inline unsigned long my_rand(void) +{ + rand_nxt = rand_nxt * 1103515245 + 12345; + return rand_nxt; +} + +static inline void my_srand(unsigned long seed) +{ + rand_nxt = seed; +} + +static inline ELM med3(ELM a, ELM b, ELM c) +{ + if (a < b) { + if (b < c) { + return b; + } else { + if (a < c) + return c; + else + return a; + } + } else { + if (b > c) { + return b; + } else { + if (a > c) + return c; + else + return a; + } + } +} + +/* + * simple approach for now; a better median-finding + * may be preferable + */ +static inline ELM choose_pivot(ELM *low, ELM *high) +{ + return med3(*low, *high, low[(high - low) / 2]); +} + +static ELM *seqpart(ELM *low, ELM *high) +{ + ELM pivot; + ELM h, l; + ELM *curr_low = low; + ELM *curr_high = high; + + pivot = choose_pivot(low, high); + + while (1) { + while ((h = *curr_high) > pivot) + curr_high--; + + while ((l = *curr_low) < pivot) + curr_low++; + + if (curr_low >= curr_high) + break; + + *curr_high-- = l; + *curr_low++ = h; + } + + /* + * I don't know if this is really 
necessary. + * The problem is that the pivot is not always the + * first element, and the partition may be trivial. + * However, if the partition is trivial, then + * *high is the largest element, whence the following + * code. + */ + if (curr_high < high) + return curr_high; + else + return curr_high - 1; +} + +#define swap(a, b) \ +{ \ + ELM tmp;\ + tmp = a;\ + a = b;\ + b = tmp;\ +} + +static void insertion_sort(ELM *low, ELM *high) +{ + ELM *p, *q; + ELM a, b; + + for (q = low + 1; q <= high; ++q) { + a = q[0]; + for (p = q - 1; p >= low && (b = p[0]) > a; p--) + p[1] = b; + p[1] = a; + } +} + +/* + * tail-recursive quicksort, almost unrecognizable :-) + */ +void seqquick(ELM *low, ELM *high) +{ + ELM *p; + + while (high - low >= bots_app_cutoff_value_2) { + p = seqpart(low, high); + seqquick(low, p); + low = p + 1; + } + + insertion_sort(low, high); +} + +void seqmerge(ELM *low1, ELM *high1, ELM *low2, ELM *high2, + ELM *lowdest) +{ + ELM a1, a2; + + /* + * The following 'if' statement is not necessary + * for the correctness of the algorithm, and is + * in fact subsumed by the rest of the function. + * However, it is a few percent faster. Here is why. + * + * The merging loop below has something like + * if (a1 < a2) { + * *dest++ = a1; + * ++low1; + * if (end of array) break; + * a1 = *low1; + * } + * + * Now, a1 is needed immediately in the next iteration + * and there is no way to mask the latency of the load. + * A better approach is to load a1 *before* the end-of-array + * check; the problem is that we may be speculatively + * loading an element out of range. While this is + * probably not a problem in practice, yet I don't feel + * comfortable with an incorrect algorithm. Therefore, + * I use the 'fast' loop on the array (except for the last + * element) and the 'slow' loop for the rest, saving both + * performance and correctness. 
+ */ + + if (low1 < high1 && low2 < high2) { + a1 = *low1; + a2 = *low2; + for (;;) { + if (a1 < a2) { + *lowdest++ = a1; + a1 = *++low1; + if (low1 >= high1) + break; + } else { + *lowdest++ = a2; + a2 = *++low2; + if (low2 >= high2) + break; + } + } + } + if (low1 <= high1 && low2 <= high2) { + a1 = *low1; + a2 = *low2; + for (;;) { + if (a1 < a2) { + *lowdest++ = a1; + ++low1; + if (low1 > high1) + break; + a1 = *low1; + } else { + *lowdest++ = a2; + ++low2; + if (low2 > high2) + break; + a2 = *low2; + } + } + } + if (low1 > high1) { + memcpy(lowdest, low2, sizeof(ELM) * (high2 - low2 + 1)); + } else { + memcpy(lowdest, low1, sizeof(ELM) * (high1 - low1 + 1)); + } +} + +#define swap_indices(a, b) \ +{ \ + ELM *tmp;\ + tmp = a;\ + a = b;\ + b = tmp;\ +} + +ELM *binsplit(ELM val, ELM *low, ELM *high) +{ + /* + * returns index which contains greatest element <= val. If val is + * less than all elements, returns low-1 + */ + ELM *mid; + + while (low != high) { + mid = low + ((high - low + 1) >> 1); + if (val <= *mid) + high = mid - 1; + else + low = mid; + } + + if (*low > val) + return low - 1; + else + return low; +} + + +void cilkmerge_par(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest) +{ + /* + * Cilkmerge: Merges range [low1, high1] with range [low2, high2] + * into the range [lowdest, ...] + */ + + ELM *split1, *split2; /* + * where each of the ranges are broken for + * recursive merge + */ + long int lowsize; /* + * total size of lower halves of two + * ranges - 2 + */ + + /* + * We want to take the middle element (indexed by split1) from the + * larger of the two arrays. The following code assumes that split1 + * is taken from range [low1, high1]. 
So if [low1, high1] is + * actually the smaller range, we should swap it with [low2, high2] + */ + + if (high2 - low2 > high1 - low1) { + swap_indices(low1, low2); + swap_indices(high1, high2); + } + if (high2 < low2) { + /* smaller range is empty */ + memcpy(lowdest, low1, sizeof(ELM) * (high1 - low1)); + return; + } + if (high2 - low2 < bots_app_cutoff_value ) { + seqmerge(low1, high1, low2, high2, lowdest); + return; + } + /* + * Basic approach: Find the middle element of one range (indexed by + * split1). Find where this element would fit in the other range + * (indexed by split 2). Then merge the two lower halves and the two + * upper halves. + */ + + split1 = ((high1 - low1 + 1) / 2) + low1; + split2 = binsplit(*split1, low2, high2); + lowsize = split1 - low1 + split2 - low2; + + /* + * directly put the splitting element into + * the appropriate location + */ + *(lowdest + lowsize + 1) = *split1; +#if defined(FORCE_TIED_TASKS) +#pragma omp task + cilkmerge_par(low1, split1 - 1, low2, split2, lowdest); +#pragma omp task + cilkmerge_par(split1 + 1, high1, split2 + 1, high2, + lowdest + lowsize + 2); +#else +#pragma omp task untied + cilkmerge_par(low1, split1 - 1, low2, split2, lowdest); +#pragma omp task untied + cilkmerge_par(split1 + 1, high1, split2 + 1, high2, + lowdest + lowsize + 2); +#endif +#pragma omp taskwait + + return; +} + +void cilksort_par(ELM *low, ELM *tmp, long size) +{ + /* + * divide the input in four parts of the same size (A, B, C, D) + * Then: + * 1) recursively sort A, B, C, and D (in parallel) + * 2) merge A and B into tmp1, and C and D into tmp2 (in parallel) + * 3) merge tmp1 and tmp2 into the original array + */ + long quarter = size / 4; + ELM *A, *B, *C, *D, *tmpA, *tmpB, *tmpC, *tmpD; + + if (size < bots_app_cutoff_value_1 ) { + /* quicksort when less than 1024 elements */ + seqquick(low, low + size - 1); + return; + } + A = low; + tmpA = tmp; + B = A + quarter; + tmpB = tmpA + quarter; + C = B + quarter; + tmpC = tmpB + quarter; 
+ D = C + quarter; + tmpD = tmpC + quarter; + +#if defined(FORCE_TIED_TASKS) +#pragma omp task + cilksort_par(A, tmpA, quarter); +#pragma omp task + cilksort_par(B, tmpB, quarter); +#pragma omp task + cilksort_par(C, tmpC, quarter); +#pragma omp task + cilksort_par(D, tmpD, size - 3 * quarter); +#else +#pragma omp task untied + cilksort_par(A, tmpA, quarter); +#pragma omp task untied + cilksort_par(B, tmpB, quarter); +#pragma omp task untied + cilksort_par(C, tmpC, quarter); +#pragma omp task untied + cilksort_par(D, tmpD, size - 3 * quarter); +#endif +#pragma omp taskwait + +#if defined(FORCE_TIED_TASKS) +#pragma omp task + cilkmerge_par(A, A + quarter - 1, B, B + quarter - 1, tmpA); +#pragma omp task + cilkmerge_par(C, C + quarter - 1, D, low + size - 1, tmpC); +#else +#pragma omp task untied + cilkmerge_par(A, A + quarter - 1, B, B + quarter - 1, tmpA); +#pragma omp task untied + cilkmerge_par(C, C + quarter - 1, D, low + size - 1, tmpC); +#endif +#pragma omp taskwait + + cilkmerge_par(tmpA, tmpC - 1, tmpC, tmpA + size - 1, A); +} + +void scramble_array( ELM *array ) +{ + unsigned long i; + unsigned long j; + + for (i = 0; i < bots_arg_size; ++i) { + j = my_rand(); + j = j % bots_arg_size; + swap(array[i], array[j]); + } +} + +void fill_array( ELM *array ) +{ + unsigned long i; + + my_srand(1); + /* first, fill with integers 1..size */ + for (i = 0; i < bots_arg_size; ++i) { + array[i] = i; + } +} + +void sort_init ( void ) +{ + /* Checking arguments */ + if (bots_arg_size < 4) { + bots_message("%s can not be less than 4, using 4 as a parameter.\n", BOTS_APP_DESC_ARG_SIZE ); + bots_arg_size = 4; + } + + if (bots_app_cutoff_value < 2) { + bots_message("%s can not be less than 2, using 2 as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF); + bots_app_cutoff_value = 2; + } + else if (bots_app_cutoff_value > bots_arg_size ) { + bots_message("%s can not be greather than vector size, using %d as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF, bots_arg_size); + 
bots_app_cutoff_value = bots_arg_size; + } + + if (bots_app_cutoff_value_1 > bots_arg_size ) { + bots_message("%s can not be greather than vector size, using %d as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF_1, bots_arg_size); + bots_app_cutoff_value_1 = bots_arg_size; + } + if (bots_app_cutoff_value_2 > bots_arg_size ) { + bots_message("%s can not be greather than vector size, using %d as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF_2, bots_arg_size); + bots_app_cutoff_value_2 = bots_arg_size; + } + + if (bots_app_cutoff_value_2 > bots_app_cutoff_value_1) { + bots_message("%s can not be greather than %s, using %d as a parameter.\n", + BOTS_APP_DESC_ARG_CUTOFF_2, + BOTS_APP_DESC_ARG_CUTOFF_1, + bots_app_cutoff_value_1 + ); + bots_app_cutoff_value_2 = bots_app_cutoff_value_1; + } + + array = (ELM *) malloc(bots_arg_size * sizeof(ELM)); + tmp = (ELM *) malloc(bots_arg_size * sizeof(ELM)); + fill_array(array); + scramble_array(array); +} + +void sort_par ( void ) +{ + bots_message("Computing multisort algorithm (n=%d) ", bots_arg_size); + #pragma omp parallel + #pragma omp single nowait +#if defined(FORCE_TIED_TASKS) + #pragma omp task + cilksort_par(array, tmp, bots_arg_size); +#else + #pragma omp task untied + cilksort_par(array, tmp, bots_arg_size); +#endif + bots_message(" completed!\n"); +} + +int sort_verify ( void ) +{ + int i, success = 1; + for (i = 0; i < bots_arg_size; ++i) + if (array[i] != i) + success = 0; + + return success ? 
BOTS_RESULT_SUCCESSFUL : BOTS_RESULT_UNSUCCESSFUL; +} + diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/Makefile b/src/components/implementation/no_interface/omp_sparselu_for_bots/Makefile new file mode 100644 index 0000000000..901901a2cb --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/Makefile @@ -0,0 +1,19 @@ +COMPONENT=omp_sparselu_for_bots.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp +# if tied tasks are required +CFLAGS += -DFORCE_TIED_TASKS + +OMPC_FINAL_FLAGS= + +# one per compilation or none +#CFLAGS += -DMANUAL_CUTOFF +#CFLAGS += -DIF_CUTOFF +#CFLAGS += -DFINAL_CUTOFF $(OMPC_FINAL_FLAGS) diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/app-desc.h b/src/components/implementation/no_interface/omp_sparselu_for_bots/app-desc.h new file mode 100644 index 0000000000..50e655cf0b --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/app-desc.h @@ -0,0 +1,56 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. 
*/ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "omp-tasks-app.h" + +#define BOTS_APP_NAME "SparseLU (For version)" +#define BOTS_APP_PARAMETERS_DESC "S1=%dx%d, S2=%dx%d" +#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size,bots_arg_size,bots_arg_size_1,bots_arg_size_1 + +#define BOTS_APP_USES_ARG_SIZE +#define BOTS_APP_DEF_ARG_SIZE 50 +#define BOTS_APP_DESC_ARG_SIZE "Matrix Size" + +#define BOTS_APP_USES_ARG_SIZE_1 +#define BOTS_APP_DEF_ARG_SIZE_1 100 +#define BOTS_APP_DESC_ARG_SIZE_1 "Submatrix Size" + +#define BOTS_APP_INIT float **SEQ,**BENCH; + +void sparselu_init(float ***pM, char *pass); +void sparselu_fini(float **M, char *pass); +void sparselu_seq_call(float **SEQ); +void sparselu_par_call(float **BENCH); +int sparselu_check(float **SEQ, float **BENCH); + +#define KERNEL_INIT sparselu_init(&BENCH,"benchmark"); +#define KERNEL_CALL sparselu_par_call(BENCH); +#define KERNEL_FINI sparselu_fini(BENCH,"benchmark"); + +#define KERNEL_SEQ_INIT sparselu_init(&SEQ,"serial"); +#define KERNEL_SEQ_CALL sparselu_seq_call(SEQ); +#define KERNEL_SEQ_FINI sparselu_fini(SEQ,"serial"); + +/* + * Phani: start without sequencial test + */ +#undef BOTS_APP_CHECK_USES_SEQ_RESULT +#define KERNEL_CHECK sparselu_check(SEQ,BENCH); + diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/bots.h b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots.h new file mode 120000 index 
0000000000..828039f356 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots.h @@ -0,0 +1 @@ +../omp_sparselu_single_bots/bots.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.c b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.c new file mode 120000 index 0000000000..8517c18eeb --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.c @@ -0,0 +1 @@ +../omp_sparselu_single_bots/bots_common.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.h b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.h new file mode 120000 index 0000000000..7eb55ec523 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.h @@ -0,0 +1 @@ +../omp_sparselu_single_bots/bots_common.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.c b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.c new file mode 120000 index 0000000000..29ad202b50 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.c @@ -0,0 +1 @@ +../omp_sparselu_single_bots/bots_main.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.h b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.h new file mode 120000 index 0000000000..2d1387edd5 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.h @@ -0,0 +1 @@ +../omp_sparselu_single_bots/bots_main.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/init.c b/src/components/implementation/no_interface/omp_sparselu_for_bots/init.c new file mode 120000 index 
0000000000..a7a03a9e37 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/init.c @@ -0,0 +1 @@ +../omp_sparselu_single_bots/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/omp-tasks-app.h b/src/components/implementation/no_interface/omp_sparselu_for_bots/omp-tasks-app.h new file mode 120000 index 0000000000..1c1cf79526 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/omp-tasks-app.h @@ -0,0 +1 @@ +../omp_sparselu_single_bots/omp-tasks-app.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/posix_basic.c b/src/components/implementation/no_interface/omp_sparselu_for_bots/posix_basic.c new file mode 120000 index 0000000000..0b1896b27e --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/posix_basic.c @@ -0,0 +1 @@ +../omp_sparselu_single_bots/posix_basic.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/sparselu.c b/src/components/implementation/no_interface/omp_sparselu_for_bots/sparselu.c new file mode 100644 index 0000000000..b441389dc9 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/sparselu.c @@ -0,0 +1,326 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. 
*/ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include "bots.h" +#include "sparselu.h" + +/*********************************************************************** + * checkmat: + **********************************************************************/ +int checkmat (float *M, float *N) +{ + int i, j; + float r_err; + + for (i = 0; i < bots_arg_size_1; i++) + { + for (j = 0; j < bots_arg_size_1; j++) + { + r_err = M[i*bots_arg_size_1+j] - N[i*bots_arg_size_1+j]; + if ( r_err == 0.0 ) continue; + + if (r_err < 0.0 ) r_err = -r_err; + + if ( M[i*bots_arg_size_1+j] == 0 ) + { + bots_message("Checking failure: A[%d][%d]=%f B[%d][%d]=%f; \n", + i,j, M[i*bots_arg_size_1+j], i,j, N[i*bots_arg_size_1+j]); + return FALSE; + } + r_err = r_err / M[i*bots_arg_size_1+j]; + if(r_err > EPSILON) + { + bots_message("Checking failure: A[%d][%d]=%f B[%d][%d]=%f; Relative Error=%f\n", + i,j, M[i*bots_arg_size_1+j], i,j, N[i*bots_arg_size_1+j], r_err); + return FALSE; + } + } + } + return TRUE; +} +/*********************************************************************** + * genmat: + **********************************************************************/ +void genmat (float *M[]) +{ + int null_entry, init_val, i, j, ii, jj; + float *p; + int a=0,b=0; + + init_val = 1325; + + /* generating the structure */ + for (ii=0; ii < bots_arg_size; ii++) + { + for (jj=0; jj < bots_arg_size; 
jj++) + { + /* computing null entries */ + null_entry=FALSE; + if ((iijj) && (jj%3 !=0)) null_entry = TRUE; + if (ii%2==1) null_entry = TRUE; + if (jj%2==1) null_entry = TRUE; + if (ii==jj) null_entry = FALSE; + if (ii==jj-1) null_entry = FALSE; + if (ii-1 == jj) null_entry = FALSE; + /* allocating matrix */ + if (null_entry == FALSE){ + a++; + M[ii*bots_arg_size+jj] = (float *) malloc(bots_arg_size_1*bots_arg_size_1*sizeof(float)); + if ((M[ii*bots_arg_size+jj] == NULL)) + { + bots_message("Error: Out of memory\n"); + exit(101); + } + /* initializing matrix */ + p = M[ii*bots_arg_size+jj]; + for (i = 0; i < bots_arg_size_1; i++) + { + for (j = 0; j < bots_arg_size_1; j++) + { + init_val = (3125 * init_val) % 65536; + (*p) = (float)((init_val - 32768.0) / 16384.0); + p++; + } + } + } + else + { + b++; + M[ii*bots_arg_size+jj] = NULL; + } + } + } + bots_debug("allo = %d, no = %d, total = %d, factor = %f\n",a,b,a+b,(float)((float)a/(float)(a+b))); +} +/*********************************************************************** + * print_structure: + **********************************************************************/ +void print_structure(char *name, float *M[]) +{ + int ii, jj; + bots_message("Structure for matrix %s @ 0x%p\n",name, M); + for (ii = 0; ii < bots_arg_size; ii++) { + for (jj = 0; jj < bots_arg_size; jj++) { + if (M[ii*bots_arg_size+jj]!=NULL) {bots_message("x");} + else bots_message(" "); + } + bots_message("\n"); + } + bots_message("\n"); +} +/*********************************************************************** + * allocate_clean_block: + **********************************************************************/ +float * allocate_clean_block() +{ + int i,j; + float *p, *q; + + p = (float *) malloc(bots_arg_size_1*bots_arg_size_1*sizeof(float)); + q=p; + if (p!=NULL){ + for (i = 0; i < bots_arg_size_1; i++) + for (j = 0; j < bots_arg_size_1; j++){(*p)=0.0; p++;} + + } + else + { + bots_message("Error: Out of memory\n"); + exit (101); + } + return (q); 
+} + +/*********************************************************************** + * lu0: + **********************************************************************/ +void lu0(float *diag) +{ + int i, j, k; + + for (k=0; k +#include +#include +#include +#include +#include +#include "bots.h" +#include "sparselu.h" + +/*********************************************************************** + * checkmat: + **********************************************************************/ +int checkmat (float *M, float *N) +{ + int i, j; + float r_err; + + for (i = 0; i < bots_arg_size_1; i++) + { + for (j = 0; j < bots_arg_size_1; j++) + { + r_err = M[i*bots_arg_size_1+j] - N[i*bots_arg_size_1+j]; + if ( r_err == 0.0 ) continue; + + if (r_err < 0.0 ) r_err = -r_err; + + if ( M[i*bots_arg_size_1+j] == 0 ) + { + bots_message("Checking failure: A[%d][%d]=%f B[%d][%d]=%f; \n", + i,j, M[i*bots_arg_size_1+j], i,j, N[i*bots_arg_size_1+j]); + return FALSE; + } + r_err = r_err / M[i*bots_arg_size_1+j]; + if(r_err > EPSILON) + { + bots_message("Checking failure: A[%d][%d]=%f B[%d][%d]=%f; Relative Error=%f\n", + i,j, M[i*bots_arg_size_1+j], i,j, N[i*bots_arg_size_1+j], r_err); + return FALSE; + } + } + } + return TRUE; +} +/*********************************************************************** + * genmat: + **********************************************************************/ +void genmat (float *M[]) +{ + int null_entry, init_val, i, j, ii, jj; + float *p; + + init_val = 1325; + + /* generating the structure */ + for (ii=0; ii < bots_arg_size; ii++) + { + for (jj=0; jj < bots_arg_size; jj++) + { + /* computing null entries */ + null_entry=FALSE; + if ((iijj) && (jj%3 !=0)) null_entry = TRUE; + if (ii%2==1) null_entry = TRUE; + if (jj%2==1) null_entry = TRUE; + if (ii==jj) null_entry = FALSE; + if (ii==jj-1) null_entry = FALSE; + if (ii-1 == jj) null_entry = FALSE; + /* allocating matrix */ + if (null_entry == FALSE){ + M[ii*bots_arg_size+jj] = (float *) 
malloc(bots_arg_size_1*bots_arg_size_1*sizeof(float)); + if ((M[ii*bots_arg_size+jj] == NULL)) + { + bots_message("Error: Out of memory\n"); + exit(101); + } + /* initializing matrix */ + p = M[ii*bots_arg_size+jj]; + for (i = 0; i < bots_arg_size_1; i++) + { + for (j = 0; j < bots_arg_size_1; j++) + { + init_val = (3125 * init_val) % 65536; + (*p) = (float)((init_val - 32768.0) / 16384.0); + p++; + } + } + } + else + { + M[ii*bots_arg_size+jj] = NULL; + } + } + } +} +/*********************************************************************** + * print_structure: + **********************************************************************/ +void print_structure(char *name, float *M[]) +{ + int ii, jj; + bots_message("Structure for matrix %s @ 0x%p\n",name, M); + for (ii = 0; ii < bots_arg_size; ii++) { + for (jj = 0; jj < bots_arg_size; jj++) { + if (M[ii*bots_arg_size+jj]!=NULL) {bots_message("x");} + else bots_message(" "); + } + bots_message("\n"); + } + bots_message("\n"); +} +/*********************************************************************** + * allocate_clean_block: + **********************************************************************/ +float * allocate_clean_block() +{ + int i,j; + float *p, *q; + + p = (float *) malloc(bots_arg_size_1*bots_arg_size_1*sizeof(float)); + q=p; + if (p!=NULL){ + for (i = 0; i < bots_arg_size_1; i++) + for (j = 0; j < bots_arg_size_1; j++){(*p)=0.0; p++;} + + } + else + { + bots_message("Error: Out of memory\n"); + exit (101); + } + return (q); +} + +/*********************************************************************** + * lu0: + **********************************************************************/ +void lu0(float *diag) +{ + int i, j, k; + + for (k=0; k +#include +#include +#include "app-desc.h" +#include "bots.h" +#include "strassen.h" + +/*********************************************************************** + * Naive sequential algorithm, for comparison purposes + 
**********************************************************************/ +void matrixmul(int n, REAL *A, int an, REAL *B, int bn, REAL *C, int cn) +{ + int i, j, k; + REAL s; + + for (i = 0; i < n; ++i) + { + for (j = 0; j < n; ++j) + { + s = 0.0; + for (k = 0; k < n; ++k) s += ELEM(A, an, i, k) * ELEM(B, bn, k, j); + ELEM(C, cn, i, j) = s; + } + } +} +/***************************************************************************** +** +** FastNaiveMatrixMultiply +** +** For small to medium sized matrices A, B, and C of size +** MatrixSize * MatrixSize this function performs the operation +** C = A x B efficiently. +** +** Note MatrixSize must be divisible by 8. +** +** INPUT: +** C = (*C WRITE) Address of top left element of matrix C. +** A = (*A IS READ ONLY) Address of top left element of matrix A. +** B = (*B IS READ ONLY) Address of top left element of matrix B. +** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) +** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] +** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] +** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] +** +** OUTPUT: +** C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.) 
+** +*****************************************************************************/ +void FastNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB) +{ + /* Assumes size of real is 8 bytes */ + PTR RowWidthBInBytes = RowWidthB << 3; + PTR RowWidthAInBytes = RowWidthA << 3; + PTR MatrixWidthInBytes = MatrixSize << 3; + PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3; + unsigned Horizontal, Vertical; + + REAL *ARowStart = A; + for (Vertical = 0; Vertical < MatrixSize; Vertical++) { + for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) { + REAL *BColumnStart = B + Horizontal; + REAL FirstARowValue = *ARowStart++; + + REAL Sum0 = FirstARowValue * (*BColumnStart); + REAL Sum1 = FirstARowValue * (*(BColumnStart+1)); + REAL Sum2 = FirstARowValue * (*(BColumnStart+2)); + REAL Sum3 = FirstARowValue * (*(BColumnStart+3)); + REAL Sum4 = FirstARowValue * (*(BColumnStart+4)); + REAL Sum5 = FirstARowValue * (*(BColumnStart+5)); + REAL Sum6 = FirstARowValue * (*(BColumnStart+6)); + REAL Sum7 = FirstARowValue * (*(BColumnStart+7)); + + unsigned Products; + for (Products = 1; Products < MatrixSize; Products++) { + REAL ARowValue = *ARowStart++; + BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes); + Sum0 += ARowValue * (*BColumnStart); + Sum1 += ARowValue * (*(BColumnStart+1)); + Sum2 += ARowValue * (*(BColumnStart+2)); + Sum3 += ARowValue * (*(BColumnStart+3)); + Sum4 += ARowValue * (*(BColumnStart+4)); + Sum5 += ARowValue * (*(BColumnStart+5)); + Sum6 += ARowValue * (*(BColumnStart+6)); + Sum7 += ARowValue * (*(BColumnStart+7)); + } + ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes); + + *(C) = Sum0; + *(C+1) = Sum1; + *(C+2) = Sum2; + *(C+3) = Sum3; + *(C+4) = Sum4; + *(C+5) = Sum5; + *(C+6) = Sum6; + *(C+7) = Sum7; + C+=8; + } + ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes ); + C = (REAL*) ( ((PTR) C) + RowIncrementC ); + } +} 
+/***************************************************************************** +** +** FastAdditiveNaiveMatrixMultiply +** +** For small to medium sized matrices A, B, and C of size +** MatrixSize * MatrixSize this function performs the operation +** C += A x B efficiently. +** +** Note MatrixSize must be divisible by 8. +** +** INPUT: +** C = (*C READ/WRITE) Address of top left element of matrix C. +** A = (*A IS READ ONLY) Address of top left element of matrix A. +** B = (*B IS READ ONLY) Address of top left element of matrix B. +** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) +** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] +** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] +** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] +** +** OUTPUT: +** C = (*C READ/WRITE) Matrix C contains C + A x B. +** +*****************************************************************************/ +void FastAdditiveNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB) +{ + /* Assumes size of real is 8 bytes */ + PTR RowWidthBInBytes = RowWidthB << 3; + PTR RowWidthAInBytes = RowWidthA << 3; + PTR MatrixWidthInBytes = MatrixSize << 3; + PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3; + unsigned Horizontal, Vertical; + + REAL *ARowStart = A; + for (Vertical = 0; Vertical < MatrixSize; Vertical++) { + for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) { + REAL *BColumnStart = B + Horizontal; + + REAL Sum0 = *C; + REAL Sum1 = *(C+1); + REAL Sum2 = *(C+2); + REAL Sum3 = *(C+3); + REAL Sum4 = *(C+4); + REAL Sum5 = *(C+5); + REAL Sum6 = *(C+6); + REAL Sum7 = *(C+7); + + unsigned Products; + for (Products = 0; Products < MatrixSize; Products++) { + REAL ARowValue = *ARowStart++; + + Sum0 += ARowValue * (*BColumnStart); + Sum1 += ARowValue * (*(BColumnStart+1)); + Sum2 += ARowValue * (*(BColumnStart+2)); + Sum3 += 
ARowValue * (*(BColumnStart+3)); + Sum4 += ARowValue * (*(BColumnStart+4)); + Sum5 += ARowValue * (*(BColumnStart+5)); + Sum6 += ARowValue * (*(BColumnStart+6)); + Sum7 += ARowValue * (*(BColumnStart+7)); + + BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes); + + } + ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes); + + *(C) = Sum0; + *(C+1) = Sum1; + *(C+2) = Sum2; + *(C+3) = Sum3; + *(C+4) = Sum4; + *(C+5) = Sum5; + *(C+6) = Sum6; + *(C+7) = Sum7; + C+=8; + } + + ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes ); + C = (REAL*) ( ((PTR) C) + RowIncrementC ); + } +} +/***************************************************************************** +** +** MultiplyByDivideAndConquer +** +** For medium to medium-large (would you like fries with that) sized +** matrices A, B, and C of size MatrixSize * MatrixSize this function +** efficiently performs the operation +** C = A x B (if AdditiveMode == 0) +** C += A x B (if AdditiveMode != 0) +** +** Note MatrixSize must be divisible by 16. +** +** INPUT: +** C = (*C READ/WRITE) Address of top left element of matrix C. +** A = (*A IS READ ONLY) Address of top left element of matrix A. +** B = (*B IS READ ONLY) Address of top left element of matrix B. +** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) +** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] +** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] +** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] +** AdditiveMode = 0 if we want C = A x B, otherwise we'll do C += A x B +** +** OUTPUT: +** C (+)= A x B. 
(+ if AdditiveMode != 0) +** +*****************************************************************************/ +void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B, + unsigned MatrixSize, + unsigned RowWidthC, + unsigned RowWidthA, + unsigned RowWidthB, + int AdditiveMode + ) +{ + #define A00 A + #define B00 B + #define C00 C + REAL *A01, *A10, *A11, *B01, *B10, *B11, *C01, *C10, *C11; + unsigned QuadrantSize = MatrixSize >> 1; + + /* partition the matrix */ + A01 = A00 + QuadrantSize; + A10 = A00 + RowWidthA * QuadrantSize; + A11 = A10 + QuadrantSize; + + B01 = B00 + QuadrantSize; + B10 = B00 + RowWidthB * QuadrantSize; + B11 = B10 + QuadrantSize; + + C01 = C00 + QuadrantSize; + C10 = C00 + RowWidthC * QuadrantSize; + C11 = C10 + QuadrantSize; + + if (QuadrantSize > SizeAtWhichNaiveAlgorithmIsMoreEfficient) { + + MultiplyByDivideAndConquer(C00, A00, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + AdditiveMode); + + MultiplyByDivideAndConquer(C01, A00, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + AdditiveMode); + + MultiplyByDivideAndConquer(C11, A10, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + AdditiveMode); + + MultiplyByDivideAndConquer(C10, A10, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + AdditiveMode); + + MultiplyByDivideAndConquer(C00, A01, B10, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + 1); + + MultiplyByDivideAndConquer(C01, A01, B11, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + 1); + + MultiplyByDivideAndConquer(C11, A11, B11, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + 1); + + MultiplyByDivideAndConquer(C10, A11, B10, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + 1); + + } else { + + if (AdditiveMode) { + FastAdditiveNaiveMatrixMultiply(C00, A00, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastAdditiveNaiveMatrixMultiply(C01, A00, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastAdditiveNaiveMatrixMultiply(C11, A10, B01, QuadrantSize, + RowWidthC, 
RowWidthA, RowWidthB); + + FastAdditiveNaiveMatrixMultiply(C10, A10, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + } else { + + FastNaiveMatrixMultiply(C00, A00, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastNaiveMatrixMultiply(C01, A00, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastNaiveMatrixMultiply(C11, A10, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastNaiveMatrixMultiply(C10, A10, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + } + + FastAdditiveNaiveMatrixMultiply(C00, A01, B10, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastAdditiveNaiveMatrixMultiply(C01, A01, B11, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastAdditiveNaiveMatrixMultiply(C11, A11, B11, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastAdditiveNaiveMatrixMultiply(C10, A11, B10, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + } + return; +} +/***************************************************************************** +** +** OptimizedStrassenMultiply +** +** For large matrices A, B, and C of size MatrixSize * MatrixSize this +** function performs the operation C = A x B efficiently. +** +** INPUT: +** C = (*C WRITE) Address of top left element of matrix C. +** A = (*A IS READ ONLY) Address of top left element of matrix A. +** B = (*B IS READ ONLY) Address of top left element of matrix B. +** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) +** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] +** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] +** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] +** +** OUTPUT: +** C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.) 
+** +*****************************************************************************/ +void OptimizedStrassenMultiply_seq(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth) +{ + unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ + unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize + + 32; + unsigned Column, Row; + + /************************************************************************ + ** For each matrix A, B, and C, we'll want pointers to each quandrant + ** in the matrix. These quandrants will be addressed as follows: + ** -- -- + ** | A11 A12 | + ** | | + ** | A21 A22 | + ** -- -- + ************************************************************************/ + REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12, + *A21, *B21, *C21, *A22, *B22, *C22; + + REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; + #define T2sMULT C22 + #define NumberOfVariables 11 + + PTR TempMatrixOffset = 0; + PTR MatrixOffsetA = 0; + PTR MatrixOffsetB = 0; + + char *Heap; + void *StartHeap; + + /* Distance between the end of a matrix row and the start of the next row */ + PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; + PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; + PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; + + if (MatrixSize <= bots_app_cutoff_value) { + MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0); + return; + } + + /* Initialize quandrant matrices */ + #define A11 A + #define B11 B + #define C11 C + A12 = A11 + QuadrantSize; + B12 = B11 + QuadrantSize; + C12 = C11 + QuadrantSize; + A21 = A + (RowWidthA * QuadrantSize); + B21 = B + (RowWidthB * QuadrantSize); + C21 = C + (RowWidthC * QuadrantSize); + A22 = A21 + QuadrantSize; + B22 = B21 + QuadrantSize; + C22 = C21 + QuadrantSize; + + /* Allocate Heap Space Here */ + StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables); + /* ensure that heap is on 
cache boundary */ + if ( ((PTR) Heap) & 31) + Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); + + /* Distribute the heap space over the variables */ + S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column++) { + + /*********************************************************** + ** Within this loop, the following holds for MatrixOffset: + ** MatrixOffset = (Row * RowWidth) + Column + ** (note: that the unit of the offset is number of reals) + ***********************************************************/ + /* Element of Global Matrix, such as A, B, C */ + #define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) + #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) + #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) + + /* FIXME - may 
pay to expand these out - got higher speed-ups below */ + /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ + E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); + + /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */ + E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); + + /* S3 = A11 - A21 */ + E(S3) = EA(A11) - EA(A21); + + /* S7 = B22 - B12 */ + E(S7) = EB(B22) - EB(B12); + + TempMatrixOffset += sizeof(REAL); + MatrixOffsetA += sizeof(REAL); + MatrixOffsetB += sizeof(REAL); + } /* end row loop*/ + + MatrixOffsetA += RowIncrementA; + MatrixOffsetB += RowIncrementB; + } /* end column loop */ + + /* M2 = A11 x B11 */ + OptimizedStrassenMultiply_seq(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + OptimizedStrassenMultiply_seq(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + OptimizedStrassenMultiply_seq(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + OptimizedStrassenMultiply_seq(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + OptimizedStrassenMultiply_seq(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + OptimizedStrassenMultiply_seq(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + OptimizedStrassenMultiply_seq(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < 
QuadrantSize; Row++) { + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column += 4) { + REAL LocalM5_0 = *(M5); + REAL LocalM5_1 = *(M5+1); + REAL LocalM5_2 = *(M5+2); + REAL LocalM5_3 = *(M5+3); + REAL LocalM2_0 = *(M2); + REAL LocalM2_1 = *(M2+1); + REAL LocalM2_2 = *(M2+2); + REAL LocalM2_3 = *(M2+3); + REAL T1_0 = *(T1sMULT) + LocalM2_0; + REAL T1_1 = *(T1sMULT+1) + LocalM2_1; + REAL T1_2 = *(T1sMULT+2) + LocalM2_2; + REAL T1_3 = *(T1sMULT+3) + LocalM2_3; + REAL T2_0 = *(C22) + T1_0; + REAL T2_1 = *(C22+1) + T1_1; + REAL T2_2 = *(C22+2) + T1_2; + REAL T2_3 = *(C22+3) + T1_3; + (*(C11)) += LocalM2_0; + (*(C11+1)) += LocalM2_1; + (*(C11+2)) += LocalM2_2; + (*(C11+3)) += LocalM2_3; + (*(C12)) += LocalM5_0 + T1_0; + (*(C12+1)) += LocalM5_1 + T1_1; + (*(C12+2)) += LocalM5_2 + T1_2; + (*(C12+3)) += LocalM5_3 + T1_3; + (*(C22)) = LocalM5_0 + T2_0; + (*(C22+1)) = LocalM5_1 + T2_1; + (*(C22+2)) = LocalM5_2 + T2_2; + (*(C22+3)) = LocalM5_3 + T2_3; + (*(C21 )) = (- *(C21 )) + T2_0; + (*(C21+1)) = (- *(C21+1)) + T2_1; + (*(C21+2)) = (- *(C21+2)) + T2_2; + (*(C21+3)) = (- *(C21+3)) + T2_3; + M5 += 4; + M2 += 4; + T1sMULT += 4; + C11 += 4; + C12 += 4; + C21 += 4; + C22 += 4; + } + C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); + C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); + C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); + C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); + } + free(StartHeap); +} +#if defined(IF_CUTOFF) +void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth) +{ + unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ + unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * 
QuadrantSize + + 32; + unsigned Column, Row; + + /************************************************************************ + ** For each matrix A, B, and C, we'll want pointers to each quandrant + ** in the matrix. These quandrants will be addressed as follows: + ** -- -- + ** | A11 A12 | + ** | | + ** | A21 A22 | + ** -- -- + ************************************************************************/ + REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12, + *A21, *B21, *C21, *A22, *B22, *C22; + + REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; + #define T2sMULT C22 + #define NumberOfVariables 11 + + PTR TempMatrixOffset = 0; + PTR MatrixOffsetA = 0; + PTR MatrixOffsetB = 0; + + char *Heap; + void *StartHeap; + + /* Distance between the end of a matrix row and the start of the next row */ + PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; + PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; + PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; + + if (MatrixSize <= bots_app_cutoff_value) { + MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0); + return; + } + + /* Initialize quandrant matrices */ + #define A11 A + #define B11 B + #define C11 C + A12 = A11 + QuadrantSize; + B12 = B11 + QuadrantSize; + C12 = C11 + QuadrantSize; + A21 = A + (RowWidthA * QuadrantSize); + B21 = B + (RowWidthB * QuadrantSize); + C21 = C + (RowWidthC * QuadrantSize); + A22 = A21 + QuadrantSize; + B22 = B21 + QuadrantSize; + C22 = C21 + QuadrantSize; + + /* Allocate Heap Space Here */ + StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables); + /* ensure that heap is on cache boundary */ + if ( ((PTR) Heap) & 31) + Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); + + /* Distribute the heap space over the variables */ + S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S5 = 
(REAL*) Heap; Heap += QuadrantSizeInBytes; + S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column++) { + + /*********************************************************** + ** Within this loop, the following holds for MatrixOffset: + ** MatrixOffset = (Row * RowWidth) + Column + ** (note: that the unit of the offset is number of reals) + ***********************************************************/ + /* Element of Global Matrix, such as A, B, C */ + #define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) + #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) + #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) + + /* FIXME - may pay to expand these out - got higher speed-ups below */ + /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ + E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); + + /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */ + E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); + + /* S3 = A11 - A21 */ + E(S3) = EA(A11) - EA(A21); + + 
/* S7 = B22 - B12 */ + E(S7) = EB(B22) - EB(B12); + + TempMatrixOffset += sizeof(REAL); + MatrixOffsetA += sizeof(REAL); + MatrixOffsetB += sizeof(REAL); + } /* end row loop*/ + + MatrixOffsetA += RowIncrementA; + MatrixOffsetB += RowIncrementB; + } /* end column loop */ + +#if defined(FORCE_TIED_TASKS) + /* M2 = A11 x B11 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); +#else + /* M2 = A11 x B11 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, 
QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); +#endif + /********************************************** + ** Synchronization Point + **********************************************/ + #pragma omp taskwait + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column += 4) { + REAL LocalM5_0 = *(M5); + REAL LocalM5_1 = 
*(M5+1); + REAL LocalM5_2 = *(M5+2); + REAL LocalM5_3 = *(M5+3); + REAL LocalM2_0 = *(M2); + REAL LocalM2_1 = *(M2+1); + REAL LocalM2_2 = *(M2+2); + REAL LocalM2_3 = *(M2+3); + REAL T1_0 = *(T1sMULT) + LocalM2_0; + REAL T1_1 = *(T1sMULT+1) + LocalM2_1; + REAL T1_2 = *(T1sMULT+2) + LocalM2_2; + REAL T1_3 = *(T1sMULT+3) + LocalM2_3; + REAL T2_0 = *(C22) + T1_0; + REAL T2_1 = *(C22+1) + T1_1; + REAL T2_2 = *(C22+2) + T1_2; + REAL T2_3 = *(C22+3) + T1_3; + (*(C11)) += LocalM2_0; + (*(C11+1)) += LocalM2_1; + (*(C11+2)) += LocalM2_2; + (*(C11+3)) += LocalM2_3; + (*(C12)) += LocalM5_0 + T1_0; + (*(C12+1)) += LocalM5_1 + T1_1; + (*(C12+2)) += LocalM5_2 + T1_2; + (*(C12+3)) += LocalM5_3 + T1_3; + (*(C22)) = LocalM5_0 + T2_0; + (*(C22+1)) = LocalM5_1 + T2_1; + (*(C22+2)) = LocalM5_2 + T2_2; + (*(C22+3)) = LocalM5_3 + T2_3; + (*(C21 )) = (- *(C21 )) + T2_0; + (*(C21+1)) = (- *(C21+1)) + T2_1; + (*(C21+2)) = (- *(C21+2)) + T2_2; + (*(C21+3)) = (- *(C21+3)) + T2_3; + M5 += 4; + M2 += 4; + T1sMULT += 4; + C11 += 4; + C12 += 4; + C21 += 4; + C22 += 4; + } + C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); + C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); + C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); + C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); + } + free(StartHeap); +} +#elif defined(MANUAL_CUTOFF) +void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth) +{ + unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ + unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize + + 32; + unsigned Column, Row; + + /************************************************************************ + ** For each matrix A, B, and C, we'll want pointers to each quandrant + ** in the matrix. 
These quandrants will be addressed as follows: + ** -- -- + ** | A11 A12 | + ** | | + ** | A21 A22 | + ** -- -- + ************************************************************************/ + REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12, + *A21, *B21, *C21, *A22, *B22, *C22; + + REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; + #define T2sMULT C22 + #define NumberOfVariables 11 + + PTR TempMatrixOffset = 0; + PTR MatrixOffsetA = 0; + PTR MatrixOffsetB = 0; + + char *Heap; + void *StartHeap; + + /* Distance between the end of a matrix row and the start of the next row */ + PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; + PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; + PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; + + if (MatrixSize <= bots_app_cutoff_value) { + MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0); + return; + } + + /* Initialize quandrant matrices */ + #define A11 A + #define B11 B + #define C11 C + A12 = A11 + QuadrantSize; + B12 = B11 + QuadrantSize; + C12 = C11 + QuadrantSize; + A21 = A + (RowWidthA * QuadrantSize); + B21 = B + (RowWidthB * QuadrantSize); + C21 = C + (RowWidthC * QuadrantSize); + A22 = A21 + QuadrantSize; + B22 = B21 + QuadrantSize; + C22 = C21 + QuadrantSize; + + /* Allocate Heap Space Here */ + StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables); + /* ensure that heap is on cache boundary */ + if ( ((PTR) Heap) & 31) + Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); + + /* Distribute the heap space over the variables */ + S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M2 = (REAL*) Heap; Heap 
+= QuadrantSizeInBytes; + M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column++) { + + /*********************************************************** + ** Within this loop, the following holds for MatrixOffset: + ** MatrixOffset = (Row * RowWidth) + Column + ** (note: that the unit of the offset is number of reals) + ***********************************************************/ + /* Element of Global Matrix, such as A, B, C */ + #define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) + #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) + #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) + + /* FIXME - may pay to expand these out - got higher speed-ups below */ + /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ + E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); + + /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */ + E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); + + /* S3 = A11 - A21 */ + E(S3) = EA(A11) - EA(A21); + + /* S7 = B22 - B12 */ + E(S7) = EB(B22) - EB(B12); + + TempMatrixOffset += sizeof(REAL); + MatrixOffsetA += sizeof(REAL); + MatrixOffsetB += sizeof(REAL); + } /* end row loop*/ + + MatrixOffsetA += RowIncrementA; + 
MatrixOffsetB += RowIncrementB; + } /* end column loop */ + + if (Depth < bots_cutoff_value) + { +#if defined(FORCE_TIED_TASKS) + /* M2 = A11 x B11 */ + #pragma omp task + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); +#else + /* M2 = A11 x B11 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + 
#pragma omp task untied + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); +#endif + + /********************************************** + ** Synchronization Point + **********************************************/ + #pragma omp taskwait + } + else + { + /* M2 = A11 x B11 */ + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + /* M5 = S1 * S5 */ + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + /* Step 1 of T1 = S2 x S6 + M2 */ + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + /* Step 1 of T2 = T1 + S3 x S7 */ + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + /* Step 1 of C11 = M2 + A12 * B21 */ + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + /* Step 1 of C21 = T2 - A22 * S8 */ + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); + } + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; 
Row < QuadrantSize; Row++) { + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column += 4) { + REAL LocalM5_0 = *(M5); + REAL LocalM5_1 = *(M5+1); + REAL LocalM5_2 = *(M5+2); + REAL LocalM5_3 = *(M5+3); + REAL LocalM2_0 = *(M2); + REAL LocalM2_1 = *(M2+1); + REAL LocalM2_2 = *(M2+2); + REAL LocalM2_3 = *(M2+3); + REAL T1_0 = *(T1sMULT) + LocalM2_0; + REAL T1_1 = *(T1sMULT+1) + LocalM2_1; + REAL T1_2 = *(T1sMULT+2) + LocalM2_2; + REAL T1_3 = *(T1sMULT+3) + LocalM2_3; + REAL T2_0 = *(C22) + T1_0; + REAL T2_1 = *(C22+1) + T1_1; + REAL T2_2 = *(C22+2) + T1_2; + REAL T2_3 = *(C22+3) + T1_3; + (*(C11)) += LocalM2_0; + (*(C11+1)) += LocalM2_1; + (*(C11+2)) += LocalM2_2; + (*(C11+3)) += LocalM2_3; + (*(C12)) += LocalM5_0 + T1_0; + (*(C12+1)) += LocalM5_1 + T1_1; + (*(C12+2)) += LocalM5_2 + T1_2; + (*(C12+3)) += LocalM5_3 + T1_3; + (*(C22)) = LocalM5_0 + T2_0; + (*(C22+1)) = LocalM5_1 + T2_1; + (*(C22+2)) = LocalM5_2 + T2_2; + (*(C22+3)) = LocalM5_3 + T2_3; + (*(C21 )) = (- *(C21 )) + T2_0; + (*(C21+1)) = (- *(C21+1)) + T2_1; + (*(C21+2)) = (- *(C21+2)) + T2_2; + (*(C21+3)) = (- *(C21+3)) + T2_3; + M5 += 4; + M2 += 4; + T1sMULT += 4; + C11 += 4; + C12 += 4; + C21 += 4; + C22 += 4; + } + C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); + C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); + C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); + C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); + } + free(StartHeap); +} +#else +void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth) +{ + unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ + unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * 
QuadrantSize + + 32; + unsigned Column, Row; + + /************************************************************************ + ** For each matrix A, B, and C, we'll want pointers to each quandrant + ** in the matrix. These quandrants will be addressed as follows: + ** -- -- + ** | A11 A12 | + ** | | + ** | A21 A22 | + ** -- -- + ************************************************************************/ + REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12, + *A21, *B21, *C21, *A22, *B22, *C22; + + REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; + #define T2sMULT C22 + #define NumberOfVariables 11 + + PTR TempMatrixOffset = 0; + PTR MatrixOffsetA = 0; + PTR MatrixOffsetB = 0; + + char *Heap; + void *StartHeap; + + /* Distance between the end of a matrix row and the start of the next row */ + PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; + PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; + PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; + + if (MatrixSize <= bots_app_cutoff_value) { + MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0); + return; + } + + /* Initialize quandrant matrices */ + #define A11 A + #define B11 B + #define C11 C + A12 = A11 + QuadrantSize; + B12 = B11 + QuadrantSize; + C12 = C11 + QuadrantSize; + A21 = A + (RowWidthA * QuadrantSize); + B21 = B + (RowWidthB * QuadrantSize); + C21 = C + (RowWidthC * QuadrantSize); + A22 = A21 + QuadrantSize; + B22 = B21 + QuadrantSize; + C22 = C21 + QuadrantSize; + + /* Allocate Heap Space Here */ + StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables); + /* ensure that heap is on cache boundary */ + if ( ((PTR) Heap) & 31) + Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); + + /* Distribute the heap space over the variables */ + S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S5 = 
(REAL*) Heap; Heap += QuadrantSizeInBytes; + S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column++) { + + /*********************************************************** + ** Within this loop, the following holds for MatrixOffset: + ** MatrixOffset = (Row * RowWidth) + Column + ** (note: that the unit of the offset is number of reals) + ***********************************************************/ + /* Element of Global Matrix, such as A, B, C */ + #define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) + #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) + #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) + + /* FIXME - may pay to expand these out - got higher speed-ups below */ + /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ + E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); + + /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */ + E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); + + /* S3 = A11 - A21 */ + E(S3) = EA(A11) - EA(A21); + + 
/* S7 = B22 - B12 */ + E(S7) = EB(B22) - EB(B12); + + TempMatrixOffset += sizeof(REAL); + MatrixOffsetA += sizeof(REAL); + MatrixOffsetB += sizeof(REAL); + } /* end row loop*/ + + MatrixOffsetA += RowIncrementA; + MatrixOffsetB += RowIncrementB; + } /* end column loop */ + +#if defined(FORCE_TIED_TASKS) + /* M2 = A11 x B11 */ + #pragma omp task + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); +#else + /* M2 = A11 x B11 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma 
omp task untied + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); +#endif + + /********************************************** + ** Synchronization Point + **********************************************/ + #pragma omp taskwait + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column += 4) { + REAL LocalM5_0 = *(M5); + REAL LocalM5_1 = *(M5+1); + REAL LocalM5_2 = *(M5+2); + REAL LocalM5_3 = *(M5+3); + REAL LocalM2_0 = *(M2); + REAL LocalM2_1 = *(M2+1); + REAL LocalM2_2 = *(M2+2); + REAL LocalM2_3 = *(M2+3); + REAL T1_0 = *(T1sMULT) + LocalM2_0; + REAL T1_1 = *(T1sMULT+1) + LocalM2_1; + REAL T1_2 = *(T1sMULT+2) + LocalM2_2; + REAL T1_3 = *(T1sMULT+3) + LocalM2_3; + REAL T2_0 = *(C22) + T1_0; + REAL T2_1 = *(C22+1) + T1_1; + REAL T2_2 = *(C22+2) + T1_2; + REAL 
T2_3 = *(C22+3) + T1_3; + (*(C11)) += LocalM2_0; + (*(C11+1)) += LocalM2_1; + (*(C11+2)) += LocalM2_2; + (*(C11+3)) += LocalM2_3; + (*(C12)) += LocalM5_0 + T1_0; + (*(C12+1)) += LocalM5_1 + T1_1; + (*(C12+2)) += LocalM5_2 + T1_2; + (*(C12+3)) += LocalM5_3 + T1_3; + (*(C22)) = LocalM5_0 + T2_0; + (*(C22+1)) = LocalM5_1 + T2_1; + (*(C22+2)) = LocalM5_2 + T2_2; + (*(C22+3)) = LocalM5_3 + T2_3; + (*(C21 )) = (- *(C21 )) + T2_0; + (*(C21+1)) = (- *(C21+1)) + T2_1; + (*(C21+2)) = (- *(C21+2)) + T2_2; + (*(C21+3)) = (- *(C21+3)) + T2_3; + M5 += 4; + M2 += 4; + T1sMULT += 4; + C11 += 4; + C12 += 4; + C21 += 4; + C22 += 4; + } + C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); + C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); + C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); + C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); + } + free(StartHeap); +} +#endif +/* + * Set an n by n matrix A to random values. The distance between + * rows is an + */ +void init_matrix(int n, REAL *A, int an) +{ + int i, j; + + for (i = 0; i < n; ++i) + for (j = 0; j < n; ++j) + ELEM(A, an, i, j) = ((double) rand()) / (double) RAND_MAX; +} + +/* + * Compare two matrices. Print an error message if they differ by + * more than EPSILON. 
+ */ +int compare_matrix(int n, REAL *A, int an, REAL *B, int bn) +{ + int i, j; + REAL c; + + for (i = 0; i < n; ++i) + for (j = 0; j < n; ++j) { + /* compute the relative error c */ + c = ELEM(A, an, i, j) - ELEM(B, bn, i, j); + if (c < 0.0) + c = -c; + + c = c / ELEM(A, an, i, j); + if (c > EPSILON) { + bots_message("Strassen: Wrong answer!\n"); + return BOTS_RESULT_UNSUCCESSFUL; + } + } + + return BOTS_RESULT_SUCCESSFUL; +} + +/* + * Allocate a matrix of side n (therefore n^2 elements) + */ +REAL *alloc_matrix(int n) +{ + return malloc(n * n * sizeof(REAL)); +} + +void strassen_main_par(REAL *A, REAL *B, REAL *C, int n) +{ + bots_message("Computing parallel Strassen algorithm (n=%d) ", n); + #pragma omp parallel + #pragma omp single +#if defined(FORCE_TIED_TASKS) + #pragma omp task +#else + #pragma omp task untied +#endif + OptimizedStrassenMultiply_par(C, A, B, n, n, n, n, 1); + bots_message(" completed!\n"); +} +void strassen_main_seq(REAL *A, REAL *B, REAL *C, int n) +{ + bots_message("Computing sequential Strassen algorithm (n=%d) ", n); + OptimizedStrassenMultiply_seq(C, A, B, n, n, n, n, 1); + bots_message(" completed!\n"); +} + diff --git a/src/components/implementation/no_interface/omp_strassen_bots/strassen.h b/src/components/implementation/no_interface/omp_strassen_bots/strassen.h new file mode 100644 index 0000000000..7944f77880 --- /dev/null +++ b/src/components/implementation/no_interface/omp_strassen_bots/strassen.h @@ -0,0 +1,66 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either 
version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ +#ifndef _STRASSEN_H +#define _STRASSEN_H +/* ******************************************************************* */ +/* STRASSEN APPLICATION CUT OFF's */ +/* ******************************************************************* */ +/* Strassen uses three different functions to compute Matrix Multiply. */ +/* Each of them is related to an application cut off value: */ +/* - Initial algorithm: OptimizedStrassenMultiply() */ +/* - bots_app_cutoff_value: MultiplyByDivideAndConquer() */ +/* - SizeAtWhichNaiveAlgorithmIsMoreEfficient: FastAdditiveNaiveMatrixMultiply() */ +/* ******************************************************************* */ + +/*FIXME: at the moment we use a constant value, change to parameter ???*/ +/* Below this cut off strassen uses FastAdditiveNaiveMatrixMultiply algorithm */ +#define SizeAtWhichNaiveAlgorithmIsMoreEfficient 16 + +/*********************************************************************** + * maximum tolerable relative error (for the checking routine) + **********************************************************************/ +#define EPSILON (1.0E-6) +/*********************************************************************** + * Matrices are stored in row-major order; A is a pointer to + * the first element of the matrix, and an is the number of elements + * between two rows. 
This macro produces the element A[i,j] + * given A, an, i and j + **********************************************************************/ +#define ELEM(A, an, i, j) (A[(i)*(an)+(j)]) + +void matrixmul(int n, REAL *A, int an, REAL *B, int bn, REAL *C, int cn); +void FastNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB); +void FastAdditiveNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB); +void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B, + unsigned MatrixSize, + unsigned RowWidthC, + unsigned RowWidthA, + unsigned RowWidthB, + int AdditiveMode + ); +void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth); +void OptimizedStrassenMultiply_seq(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth); +REAL *alloc_matrix(int n); +#endif + diff --git a/src/components/implementation/no_interface/omp_ubench/Makefile b/src/components/implementation/no_interface/omp_ubench/Makefile new file mode 100644 index 0000000000..d93533c7e5 --- /dev/null +++ b/src/components/implementation/no_interface/omp_ubench/Makefile @@ -0,0 +1,19 @@ +COMPONENT=omp_ubench.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp +# if tied tasks are required +#CFLAGS += -DFORCE_TIED_TASKS + +#OMPC_FINAL_FLAGS= + +# one per compilation or none +#CFLAGS += -DMANUAL_CUTOFF +#CFLAGS += -DIF_CUTOFF +#CFLAGS += -DFINAL_CUTOFF $(OMPC_FINAL_FLAGS) diff --git a/src/components/implementation/no_interface/omp_ubench/init.c 
b/src/components/implementation/no_interface/omp_ubench/init.c new file mode 120000 index 0000000000..9e09b82e77 --- /dev/null +++ b/src/components/implementation/no_interface/omp_ubench/init.c @@ -0,0 +1 @@ +../omp_fib_bots/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_ubench/posix_basic.c b/src/components/implementation/no_interface/omp_ubench/posix_basic.c new file mode 120000 index 0000000000..9afee078fb --- /dev/null +++ b/src/components/implementation/no_interface/omp_ubench/posix_basic.c @@ -0,0 +1 @@ +../omp_fib_bots/posix_basic.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_ubench/ubench.c b/src/components/implementation/no_interface/omp_ubench/ubench.c new file mode 100644 index 0000000000..6d22daaf25 --- /dev/null +++ b/src/components/implementation/no_interface/omp_ubench/ubench.c @@ -0,0 +1,156 @@ +#include +#include +#include +#include + +#define ITERS 1000 +#define RECUR 4 + +#define DISPLAY_VALS + +void +test_parallel(void) +{ + cycles_t max = 0, total = 0; + int i, x = 0; + + for (i = 0; i < ITERS; i++) { + cycles_t st, en, diff; + + rdtscll(st); + #pragma omp parallel + { + x++; + } + rdtscll(en); + + diff = en - st; + total += diff; + if (diff > max) max = diff; +#ifdef DISPLAY_VALS + PRINTC("%llu\n", diff); +#endif + } + + PRINTC("uBench Parallel (NCORES:%u, NITERS=%d): AVG:%llu WC:%llu\n", NUM_CPU, ITERS, total / ITERS, max); +} + +void +test_parallel_critical(void) +{ + cycles_t max = 0, total = 0; + int i, x = 0; + + for (i = 0; i < ITERS; i++) { + cycles_t st, en, diff; + + rdtscll(st); + #pragma omp parallel + { + #pragma omp critical + { + x++; + } + } + rdtscll(en); + + diff = en - st; + total += diff; + if (diff > max) max = diff; +#ifdef DISPLAY_VALS + PRINTC("%llu\n", diff); +#endif + } + + PRINTC("uBench Parallel+Critical (NCORES:%u, NITERS=%d): AVG:%llu WC:%llu\n", NUM_CPU, ITERS, total / ITERS, max); +} + +void +test_parallel_task(void) +{ + 
cycles_t max = 0, total = 0; + int i, x = 0, y = 0; + + for (i = 0; i < ITERS; i++) { + cycles_t st, en, diff; + + rdtscll(st); + #pragma omp parallel + { + x++; + #pragma omp task + { + y++; + } + #pragma omp taskwait + } + rdtscll(en); + + diff = en - st; + total += diff; + if (diff > max) max = diff; +#ifdef DISPLAY_VALS + PRINTC("%llu\n", diff); +#endif + } + + PRINTC("uBench Parallel+Task+Taskwait (NCORES:%u, NITERS=%d): AVG:%llu WC:%llu\n", NUM_CPU, ITERS, total / ITERS, max); +} + +void +test_parallel_task_4levels(void) +{ + cycles_t max = 0, total = 0; + int i, x = 0, y = 0; + + for (i = 0; i < ITERS; i++) { + cycles_t st, en, diff; + + rdtscll(st); + #pragma omp parallel + { + x++; + #pragma omp task + { + #pragma omp task + { + #pragma omp task + { + #pragma omp task + { + y++; + } + #pragma omp taskwait + y++; + } + #pragma omp taskwait + y++; + } + #pragma omp taskwait + y++; + } + #pragma omp taskwait + } + rdtscll(en); + + diff = en - st; + total += diff; + if (diff > max) max = diff; +#ifdef DISPLAY_VALS + PRINTC("%llu\n", diff); +#endif + } + + PRINTC("uBench Parallel+Task 4levels+Taskwait (NCORES:%u, NITERS=%d): AVG:%llu WC:%llu\n", NUM_CPU, ITERS, total / ITERS, max); +} + +int +main(void) +{ +// test_parallel(); +// test_parallel_critical(); + test_parallel_task(); +// test_parallel_task_4levels(); + + return 0; +} diff --git a/src/components/implementation/no_interface/omp_workconservation/Makefile b/src/components/implementation/no_interface/omp_workconservation/Makefile new file mode 100644 index 0000000000..816ae03c7e --- /dev/null +++ b/src/components/implementation/no_interface/omp_workconservation/Makefile @@ -0,0 +1,10 @@ +COMPONENT=omp_workconsprob.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS 
+= -fopenmp diff --git a/src/components/implementation/no_interface/omp_workconservation/init.c b/src/components/implementation/no_interface/omp_workconservation/init.c new file mode 120000 index 0000000000..b2694bf833 --- /dev/null +++ b/src/components/implementation/no_interface/omp_workconservation/init.c @@ -0,0 +1 @@ +../omp_hello/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_workconservation/work_problem.c b/src/components/implementation/no_interface/omp_workconservation/work_problem.c new file mode 100644 index 0000000000..e395df3eeb --- /dev/null +++ b/src/components/implementation/no_interface/omp_workconservation/work_problem.c @@ -0,0 +1,89 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define GETTID() cos_thdid() +#define sched_getcpu() cos_cpuid() +#define CYC_US 3200 + +/* + * From Chaos tests! + * NOTE: number obtained by running composite instance with no interference.. + * (validated with fiasco so far, it is 10us) + */ +#define ITERS_10US 5850 +#define MULTIPLE 10000 + +#define SPIN_ITERS (ITERS_10US*MULTIPLE) + +static void __spin_fn(void) __attribute__((optimize("O0"))); + +static void +__spin_fn(void) +{ + unsigned int spin = 0; + + while (spin < SPIN_ITERS) { + __asm__ __volatile__("nop": : :"memory"); + spin++; + } +} + +#define ITERS 1000 + +int main(void) +{ + unsigned long long max = 0, total = 0; + int i; + unsigned long long x, y; + + rdtscll(x); + __spin_fn(); + rdtscll(y); + printc("%llu:%llu\n\n\n", y - x, sl_cyc2usec(y - x)); + + + for (i = 0; i < ITERS; i++) { + volatile unsigned long long st = 0, en = 0; + + rdtscll(st); + #pragma omp parallel + { + #pragma omp single + { + #pragma omp task + { + #pragma omp task + { + __spin_fn(); + } + #pragma omp taskwait + } + + #pragma omp task + { + __spin_fn(); + } + __spin_fn(); + #pragma omp taskwait + } + } + rdtscll(en); + long diff = en - st; + assert(diff > 0); + + total += diff; + if ((unsigned 
long long) diff > max) max = diff; + printc("%ld, %ld\n", diff, diff / CYC_US); + } + + printc("(cyc) Avg: %llu, Max: %llu\n", (total / ITERS), max); + printc("(us) Avg: %llu, Max: %llu\n", (total / ITERS) / CYC_US, max / CYC_US); + + return 0; +} diff --git a/src/components/implementation/sched/Makefile b/src/components/implementation/sched/Makefile index c0cd0d6743..43e1e0e4f1 100644 --- a/src/components/implementation/sched/Makefile +++ b/src/components/implementation/sched/Makefile @@ -1,3 +1,3 @@ -INTERFACES=sched schedinit +INTERFACES=sched schedinit crt include ../Makefile.subdir diff --git a/src/components/implementation/sched/chan_backend.c b/src/components/implementation/sched/chan_backend.c new file mode 100644 index 0000000000..10736eabdf --- /dev/null +++ b/src/components/implementation/sched/chan_backend.c @@ -0,0 +1,45 @@ +#include + +#include +#include + +struct __sched_inout_chan { + struct crt_chan *in, *out; +} __sched_thds[NUM_CPU][MAX_NUM_THREADS]; + +void +__sched_stdio_init(void) +{ + memset(__sched_thds[cos_cpuid()], 0, MAX_NUM_THREADS * sizeof(struct __sched_inout_chan)); +} + +void +__sched_stdio_thd_init(thdid_t tid, struct crt_chan *in, struct crt_chan *out) +{ + __sched_thds[cos_cpuid()][tid].in = in; + __sched_thds[cos_cpuid()][tid].out = out; +} + +int +chan_out(unsigned long item) +{ + struct crt_chan *co = __sched_thds[cos_cpuid()][cos_thdid()].out; + + assert(co != NULL); + return crt_chan_send_LU(co, &item); +} + +unsigned long +chan_in(void) +{ + unsigned long item = 0; + int ret = 0; + struct crt_chan *ci = __sched_thds[cos_cpuid()][cos_thdid()].in; + + assert(ci != NULL); + + ret = crt_chan_recv_LU(ci, &item); + assert(ret == 0); + + return item; +} diff --git a/src/components/implementation/sched/hier_fprr/Makefile b/src/components/implementation/sched/hier_fprr/Makefile index 5be22a8cbd..f19b907991 100644 --- a/src/components/implementation/sched/hier_fprr/Makefile +++ b/src/components/implementation/sched/hier_fprr/Makefile 
@@ -5,7 +5,7 @@ INTERFACES=sched schedinit DEPENDENCIES=capmgr sched schedinit IF_LIB= FN_PREPEND=parent_ -ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr +ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/sched/hier_fprr/init.c b/src/components/implementation/sched/hier_fprr/init.c index 78a4e578cc..90369146e4 100644 --- a/src/components/implementation/sched/hier_fprr/init.c +++ b/src/components/implementation/sched/hier_fprr/init.c @@ -55,14 +55,33 @@ __init_done(void *d) void sched_child_init(struct sched_childinfo *schedci) { - struct sl_thd *initthd = NULL; + vaddr_t dcbaddr; assert(schedci); - initthd = sched_child_initthd_get(schedci); - assert(initthd); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_WINDOW, FIXED_PERIOD_MS)); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_BUDGET, FIXED_BUDGET_MS)); + schedci->initthd = sl_thd_initaep_alloc(sched_child_defci_get(schedci), NULL, schedci->flags & COMP_FLAG_SCHED, schedci->flags & COMP_FLAG_SCHED ? 1 : 0, 0, 0, 0, &dcbaddr); + assert(schedci->initthd); + + sl_thd_param_set(schedci->initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); + sl_thd_param_set(schedci->initthd, sched_param_pack(SCHEDP_WINDOW, FIXED_PERIOD_MS)); + sl_thd_param_set(schedci->initthd, sched_param_pack(SCHEDP_BUDGET, FIXED_BUDGET_MS)); +} + +thdid_t +sched_child_thd_create(struct sched_childinfo *schedci, thdclosure_index_t idx) +{ + vaddr_t addr; + struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 0, 0, 0, 0, 0, &addr, NULL); + + return t ? 
sl_thd_thdid(t) : 0; +} + +thdid_t +sched_child_aep_create(struct sched_childinfo *schedci, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +{ + vaddr_t addr; + struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 1, owntc, key, 0, 0, &addr, extrcv); + + return t ? sl_thd_thdid(t) : 0; } void @@ -70,7 +89,7 @@ cos_init(void) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(defci); - static volatile int first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; static u32_t cpubmp[NUM_CPU_BMP_WORDS] = { 0 }; int i; @@ -94,7 +113,7 @@ cos_init(void) while (!ps_load((unsigned long *)&init_done[i])) ; } - sl_init_cpubmp(SL_MIN_PERIOD_US, cpubmp); + sl_init_corebmp(SL_MIN_PERIOD_US, cpubmp); sched_childinfo_init(); __initializer_thd[cos_cpuid()] = sl_thd_alloc(__init_done, NULL); assert(__initializer_thd[cos_cpuid()]); diff --git a/src/components/implementation/sched/root_fprr/Makefile b/src/components/implementation/sched/root_fprr/Makefile index ec245e6d9f..e811b24382 100644 --- a/src/components/implementation/sched/root_fprr/Makefile +++ b/src/components/implementation/sched/root_fprr/Makefile @@ -4,7 +4,7 @@ COMPONENT=root_fprr.o INTERFACES=sched schedinit DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr +ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/sched/root_fprr/init.c b/src/components/implementation/sched/root_fprr/init.c index 9ea0ef3812..54f819c7ff 100644 --- a/src/components/implementation/sched/root_fprr/init.c +++ b/src/components/implementation/sched/root_fprr/init.c @@ -17,8 +17,8 @@ u32_t cycs_per_usec = 0; #define INITIALIZE_BUDGET_MS (2000) #define 
FIXED_PRIO 2 -#define FIXED_PERIOD_MS (10000) -#define FIXED_BUDGET_MS (4000) +#define FIXED_PERIOD_MS (50000) +#define FIXED_BUDGET_MS (100000) static struct sl_thd *__initializer_thd[NUM_CPU] CACHE_ALIGNED; @@ -45,14 +45,45 @@ __init_done(void *d) void sched_child_init(struct sched_childinfo *schedci) { - struct sl_thd *initthd = NULL; + vaddr_t dcbaddr; + struct sl_thd *initthd; + tcap_prio_t p = FIXED_PRIO; assert(schedci); - initthd = sched_child_initthd_get(schedci); - assert(initthd); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_WINDOW, FIXED_PERIOD_MS)); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_BUDGET, FIXED_BUDGET_MS)); + if (schedci->id != 1) p = FIXED_PRIO; + else p = FIXED_PRIO + 1; + schedci->initthd = sl_thd_initaep_alloc(sched_child_defci_get(schedci), NULL, schedci->flags & COMP_FLAG_SCHED, schedci->flags & COMP_FLAG_SCHED ? 1 : 0, 0, 0, 0, &dcbaddr); + assert(schedci->initthd); + initthd = schedci->initthd; + + if (schedci->flags & COMP_FLAG_SCHED) { + if (cos_tcap_transfer(sl_thd_rcvcap(initthd), BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, p)) { + PRINTC("Failed to transfer INF budget\n"); + assert(0); + } else { + sl_thd_param_set(initthd, sched_param_pack(SCHEDP_BUDGET, FIXED_BUDGET_MS)); + } + sl_thd_param_set(initthd, sched_param_pack(SCHEDP_WINDOW, FIXED_PERIOD_MS)); + } + sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, p)); +} + +thdid_t +sched_child_thd_create(struct sched_childinfo *schedci, thdclosure_index_t idx) +{ + vaddr_t addr; + struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 0, 0, 0, 0, 0, &addr, NULL); + + return t ? 
sl_thd_thdid(t) : 0; +} + +thdid_t +sched_child_aep_create(struct sched_childinfo *schedci, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +{ + vaddr_t addr; + struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 1, owntc, key, ipiwin, ipimax, &addr, extrcv); + + return t ? sl_thd_thdid(t) : 0; } void @@ -60,7 +91,7 @@ cos_init(void) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(defci); - static volatile int first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; static u32_t cpubmp[NUM_CPU_BMP_WORDS] = { 0 }; int i; @@ -84,7 +115,7 @@ cos_init(void) while (!ps_load((unsigned long *)&init_done[i])) ; } - sl_init_cpubmp(SL_MIN_PERIOD_US, cpubmp); + sl_init_corebmp(100*SL_MIN_PERIOD_US, cpubmp); sched_childinfo_init(); __initializer_thd[cos_cpuid()] = sl_thd_alloc(__init_done, NULL); assert(__initializer_thd[cos_cpuid()]); diff --git a/src/components/implementation/sched/root_fprr_raw/Makefile b/src/components/implementation/sched/root_fprr_raw/Makefile index 5061883b7f..206a88478a 100644 --- a/src/components/implementation/sched/root_fprr_raw/Makefile +++ b/src/components/implementation/sched/root_fprr_raw/Makefile @@ -4,7 +4,7 @@ COMPONENT=root_fprr_raw.o INTERFACES=sched schedinit DEPENDENCIES= IF_LIB= -ADDITIONAL_LIBS=$(LIBSLRAW) -lsl_thd_static_backend -lsl_mod_fprr +ADDITIONAL_LIBS=$(LIBSLRAW) -lsl_thd_static_backend -lsl_mod_fprr -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/sched/root_fprr_raw/init.c b/src/components/implementation/sched/root_fprr_raw/init.c index 854992f962..14d2c6ab46 100644 --- a/src/components/implementation/sched/root_fprr_raw/init.c +++ b/src/components/implementation/sched/root_fprr_raw/init.c @@ -24,13 +24,28 @@ capmgr_thd_retrieve_next(spdid_t 
child, thdid_t *tid) void sched_child_init(struct sched_childinfo *schedci) { - struct sl_thd *initthd = NULL; - assert(schedci); - initthd = sched_child_initthd_get(schedci); - assert(initthd); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); + schedci->initthd = sl_thd_initaep_alloc_dcb(sched_child_defci_get(schedci), NULL, schedci->flags & COMP_FLAG_SCHED, schedci->flags & COMP_FLAG_SCHED ? 1 : 0, 0, 0, 0, 0); + + assert(schedci->initthd); + sl_thd_param_set(schedci->initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); +} + +thdid_t +sched_child_thd_create(struct sched_childinfo *schedci, thdclosure_index_t idx) +{ + struct sl_thd *t = sl_thd_aep_alloc_ext_dcb(sched_child_defci_get(schedci), NULL, idx, 0, 0, 0, 0, 0, 0, 0, NULL); + + return t ? sl_thd_thdid(t) : 0; +} + +thdid_t +sched_child_aep_create(struct sched_childinfo *schedci, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +{ + struct sl_thd *t = sl_thd_aep_alloc_ext_dcb(sched_child_defci_get(schedci), NULL, idx, 1, owntc, key, 0, 0, 0, 0, extrcv); + + return t ? 
sl_thd_thdid(t) : 0; } void @@ -62,7 +77,7 @@ cos_init(void) while (!ps_load(&init_done[i])) ; } - sl_init_cpubmp(SL_MIN_PERIOD_US, cpubmp); + sl_init_corebmp(SL_MIN_PERIOD_US, cpubmp); sched_childinfo_init_raw(); hypercall_comp_init_done(); diff --git a/src/components/implementation/sched/sched.c b/src/components/implementation/sched/sched.c index 5943ed5217..1103dbddaa 100644 --- a/src/components/implementation/sched/sched.c +++ b/src/components/implementation/sched/sched.c @@ -50,25 +50,20 @@ thdid_t sched_thd_create_cserialized(thdclosure_index_t idx) { spdid_t c = cos_inv_token(); - struct cos_defcompinfo *dci; - struct sl_thd *t = NULL; + struct sched_childinfo *sci; if (!c) return 0; - dci = sched_child_defci_get(sched_childinfo_find(c)); - if (!dci) return 0; + sci = sched_childinfo_find(c); + if (!sci) return 0; - t = sl_thd_aep_alloc_ext(dci, NULL, idx, 0, 0, 0, 0, 0, NULL); - if (!t) return 0; - - return sl_thd_thdid(t); + return sched_child_thd_create(sci, idx); } thdid_t sched_aep_create_cserialized(arcvcap_t *extrcv, int *unused, u32_t thdidx_owntc, u32_t key_ipimax, u32_t ipiwin32b) { spdid_t c = cos_inv_token(); - struct cos_defcompinfo *dci; - struct sl_thd *t = NULL; + struct sched_childinfo *sci; int owntc = (thdidx_owntc << 16) >> 16; thdclosure_index_t idx = (thdidx_owntc >> 16); microsec_t ipiwin = (microsec_t)ipiwin32b; @@ -76,13 +71,10 @@ sched_aep_create_cserialized(arcvcap_t *extrcv, int *unused, u32_t thdidx_owntc, cos_channelkey_t key = (key_ipimax >> 16); if (!c) return 0; - dci = sched_child_defci_get(sched_childinfo_find(c)); - if (!dci) return 0; - - t = sl_thd_aep_alloc_ext(dci, NULL, idx, 1, owntc, key, ipiwin, ipimax, extrcv); - if (!t) return 0; + sci = sched_childinfo_find(c); + if (!sci) return 0; - return sl_thd_thdid(t); + return sched_child_aep_create(sci, idx, owntc, key, ipiwin, ipimax, extrcv); } int diff --git a/src/components/implementation/sched/sched_info.c b/src/components/implementation/sched/sched_info.c index 
96c3c5b360..5a9bb457a2 100644 --- a/src/components/implementation/sched/sched_info.c +++ b/src/components/implementation/sched/sched_info.c @@ -70,6 +70,8 @@ sched_num_childsched_get(void) return sched_num_childsched[cos_cpuid()]; } +extern void __sched_stdio_init(void); + static void sched_childinfo_init_intern(int is_raw) { @@ -78,11 +80,10 @@ sched_childinfo_init_intern(int is_raw) comp_flag_t childflags; memset(childinfo[cos_cpuid()], 0, sizeof(struct sched_childinfo) * SCHED_MAX_CHILD_COMPS); + __sched_stdio_init(); while ((remaining = hypercall_comp_child_next(cos_spd_id(), &child, &childflags)) >= 0) { - struct cos_defcompinfo *child_dci = NULL; struct sched_childinfo *schedinfo = NULL; - struct sl_thd *initthd = NULL; compcap_t compcap = 0; if (is_raw) { @@ -92,15 +93,10 @@ sched_childinfo_init_intern(int is_raw) schedinfo = sched_childinfo_alloc(child, compcap, childflags); assert(schedinfo); - child_dci = sched_child_defci_get(schedinfo); hypercall_comp_cpubitmap_get(child, schedinfo->cpubmp); if (bitmap_check(schedinfo->cpubmp, cos_cpuid())) { PRINTLOG(PRINT_DEBUG, "Initializing child component %u, is_sched=%d\n", child, childflags & COMP_FLAG_SCHED); - initthd = sl_thd_initaep_alloc(child_dci, NULL, childflags & COMP_FLAG_SCHED, childflags & COMP_FLAG_SCHED ? 
1 : 0, 0, 0, 0); /* TODO: rate information */ - assert(initthd); - sched_child_initthd_set(schedinfo, initthd); - sched_child_init(schedinfo); if (childflags & COMP_FLAG_SCHED) ps_faa((unsigned long *)&sched_num_childsched[cos_cpuid()], 1); } diff --git a/src/components/implementation/sched/sched_info.h b/src/components/implementation/sched/sched_info.h index 30bd318fa5..a1895d717e 100644 --- a/src/components/implementation/sched/sched_info.h +++ b/src/components/implementation/sched/sched_info.h @@ -11,8 +11,10 @@ #include #include #include +#include #define SCHED_MAX_CHILD_COMPS 8 +CRT_CHAN_TYPE_PROTOTYPES(LU, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); struct sched_childinfo { struct cos_defcompinfo defcinfo; @@ -30,6 +32,8 @@ void sched_childinfo_init(void); void sched_childinfo_init_raw(void); extern unsigned int self_init[], num_child_init[]; +extern thdid_t sched_child_thd_create(struct sched_childinfo *schedci, thdclosure_index_t idx); +extern thdid_t sched_child_aep_create(struct sched_childinfo *schedci, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv); static inline struct cos_defcompinfo * sched_child_defci_get(struct sched_childinfo *sci) diff --git a/src/components/implementation/sched/sched_init.c b/src/components/implementation/sched/sched_init.c index 7b47764cad..d528982980 100644 --- a/src/components/implementation/sched/sched_init.c +++ b/src/components/implementation/sched/sched_init.c @@ -37,7 +37,7 @@ schedinit_child(void) if (!init) return 0; tcur = sl_thd_curr(); if (!tcur) return 0; - assert(tcur->schedthd == init); + assert(tcur->schedthd == init || tcur == init); /* thd retrieve */ do { @@ -52,7 +52,7 @@ schedinit_child(void) if (unlikely(t)) continue; aep.tid = thdid; - aep.tc = sl_thd_tcap(sl__globals_cpu()->sched_thd); + aep.tc = sl_thd_tcap(sl__globals_core()->sched_thd); t = sl_thd_init_ext(&aep, init); if (!t) return 0; } while (thdid); diff --git 
a/src/components/implementation/sched/test_sched/Makefile b/src/components/implementation/sched/test_sched/Makefile new file mode 100644 index 0000000000..b6383ecc8c --- /dev/null +++ b/src/components/implementation/sched/test_sched/Makefile @@ -0,0 +1,10 @@ +C_OBJS= +ASM_OBJS= +COMPONENT=test_sched.o +INTERFACES=sched schedinit crt +DEPENDENCIES=capmgr channel +IF_LIB= +ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr -lsl_blkpt + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/sched/test_sched/init.c b/src/components/implementation/sched/test_sched/init.c new file mode 100644 index 0000000000..83db6ea806 --- /dev/null +++ b/src/components/implementation/sched/test_sched/init.c @@ -0,0 +1,293 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. + * + * Copyright 2018, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include +#include +#include +#include +#include +#include +#include + +#define MAX_USE_PIPE_SZ 1 +#define INITIALIZE_PRIO 1 +#define INITIALIZE_PERIOD_MS (4000) +#define INITIALIZE_BUDGET_MS (2000) + +static struct sl_thd *__initializer_thd[NUM_CPU] CACHE_ALIGNED; + +u32_t cycs_per_usec = 0; +cycles_t *int_start = NULL; +volatile unsigned long *rdy = NULL; + +void +sched_child_init(struct sched_childinfo *schedci) +{ + vaddr_t dcbaddr; + struct sl_thd *initthd; + + assert(schedci); + assert(!(schedci->flags & COMP_FLAG_SCHED)); + schedci->initthd = sl_thd_initaep_alloc(sched_child_defci_get(schedci), NULL, 0, 0, 0, 0, 0, &dcbaddr); + assert(schedci->initthd); + initthd = schedci->initthd; + + sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, 2)); +} + +extern void __sched_stdio_thd_init(thdid_t, struct crt_chan *, struct crt_chan *); +#define MAX_PIPE_SZ 8 +CRT_CHAN_STATIC_ALLOC(c0, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c1, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); 
+CRT_CHAN_STATIC_ALLOC(c2, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c3, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c4, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c5, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c6, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c7, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); + +#define SPDID_INT 5 +#define SPDID_W1 6 +#define SPDID_W3 7 + +#define PRIO_START (MAX_PIPE_SZ + 8) + +#define PRIO_INT PRIO_START +#define PRIO_W0 (PRIO_START - 1) +#define PRIO_W1 (PRIO_START - 2) +#define PRIO_W2 (PRIO_START - 3) +#define PRIO_W3 (PRIO_START - 4) +#define PRIO_W4 (PRIO_START - 5) +#define PRIO_W5 (PRIO_START - 6) +#define PRIO_W6 (PRIO_START - 7) + +#define SND_DATA 0x1234 + +#define SHMCHANNEL_KEY 0x2020 +#define MAX_ITERS 100000 +cycles_t vals[MAX_ITERS] = { 0 }; +int iters = 0; +cycles_t tot = 0, wc = 0; +static int pc, tc; + +struct __thd_info { + struct sl_thd *t; + tcap_prio_t p; +} iot[MAX_PIPE_SZ + 1]; + +struct __pipe_info { + struct sl_thd *sndr, *rcvr; /* p2p channels */ + struct crt_chan *c; +} iop[MAX_PIPE_SZ]; + +static int +schedinit_self(void) +{ + if (ps_load(&tc) < (MAX_USE_PIPE_SZ + 1)) return 1; + + assert(ps_load(&tc) == (MAX_USE_PIPE_SZ + 1)); + + return 0; +} + +static void +__init_done(void *d) +{ + while (schedinit_self()) sl_thd_block_periodic(0); + + int i; + + for (i = 0; i < MAX_USE_PIPE_SZ; i++) { + if (i == 0) { + crt_chan_init_LU(iop[i].c); + } else { + assert(iop[i].sndr && iop[i].rcvr); + crt_chan_p2p_init_LU(iop[i].c, iop[i].sndr, iop[i].rcvr); + } + } + + /* don't want the threads to run before channels are initialized! */ + for (i = MAX_USE_PIPE_SZ; i >= 0; i--) { + PRINTC("%d, %lx, %u\n", i, (unsigned long)(iot[i].t), sl_thd_thdid(iot[i].t)); + assert(iot[i].t); + sl_thd_param_set(iot[i].t, sched_param_pack(SCHEDP_PRIO, iot[i].p)); + } + PRINTLOG(PRINT_DEBUG, "SELF (inc. 
CHILD) INIT DONE.\n"); + + sl_thd_exit(); + + assert(0); +} + + +static void +work_thd_fn(void *data) +{ + int is_last = (int)data; + + ps_faa(rdy, 1); + + while (1) { + chan_in(); + if (unlikely(is_last)) { + cycles_t end, diff; + if (iters >= MAX_ITERS) continue; + rdtscll(end); + assert(int_start); + diff = end - *int_start; + if (wc < diff) wc = diff; + tot += diff; + vals[iters] = diff; + //printc("%llu\n", diff); + iters++; + if (iters % 1000 == 0) printc("."); + + if (iters == MAX_ITERS) { + int i; + + for (i = 0; i < MAX_ITERS; i++) printc("%llu\n", vals[i]); + PRINTC("%llu, %llu\n", tot / iters, wc); + //tot = wc = 0; + //iters = 0; + } + continue; + } + chan_out(SND_DATA); + } +} + +thdid_t +sched_child_thd_create(struct sched_childinfo *schedci, thdclosure_index_t idx) +{ + vaddr_t addr; + struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 0, 0, 0, 0, 0, &addr, NULL); + assert(t); + if (cos_inv_token() == SPDID_W1) { + iot[2].t = t; + iot[2].p = PRIO_W1; + iop[1].rcvr = t; + iop[2].sndr = t; + __sched_stdio_thd_init(sl_thd_thdid(t), c1, c2); + } else if (cos_inv_token() == SPDID_W3) { + iot[4].t = t; + iot[4].p = PRIO_W3; + iop[3].rcvr = t; + __sched_stdio_thd_init(sl_thd_thdid(t), c3, NULL); + } + ps_faa(&tc, 1); + + return t ? sl_thd_thdid(t) : 0; +} + +thdid_t +sched_child_aep_create(struct sched_childinfo *schedci, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +{ + assert(cos_inv_token() == SPDID_INT); + int first = 1; + vaddr_t addr; + /* only 1 aep */ + if (!ps_cas(&first, 1, 0)) assert(0); + struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 1, owntc, key, ipiwin, ipimax, &addr, extrcv); + assert(t); + __sched_stdio_thd_init(sl_thd_thdid(t), NULL, c0); + iot[0].t = t; + iot[0].p = PRIO_INT; + iop[0].sndr = t; + ps_faa(&tc, 1); + + return t ? 
sl_thd_thdid(t) : 0; +} + +void +test_pipes_init(void) +{ + struct sl_thd *t = sl_thd_alloc(work_thd_fn, MAX_USE_PIPE_SZ == 1 ? (void *)1 : (void *)0); + assert(t); + iot[1].t = t; + iot[1].p = PRIO_W0; + iop[0].rcvr = t; /* no optimized path for rcving from INT thread */ + iop[1].sndr = t; + __sched_stdio_thd_init(sl_thd_thdid(t), c0, c1); + ps_faa(&tc, 1); + if (MAX_USE_PIPE_SZ >= 3) { + t = sl_thd_alloc(work_thd_fn, MAX_USE_PIPE_SZ == 3 ? (void *)1 : (void *)0); + assert(t); + iot[3].t = t; + iot[3].p = PRIO_W2; + iop[2].rcvr = t; + iop[3].sndr = t; + __sched_stdio_thd_init(sl_thd_thdid(t), c2, c3); + ps_faa(&tc, 1); + } +} + +void +cos_init(void) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo *ci = cos_compinfo_get(defci); + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + static u32_t cpubmp[NUM_CPU_BMP_WORDS] = { 0 }; + int i; + + assert(NUM_CPU == 1); + assert(MAX_USE_PIPE_SZ <= MAX_PIPE_SZ); + memset(iop, 0, sizeof(struct __pipe_info) * MAX_PIPE_SZ); + memset(iot, 0, sizeof(struct __thd_info) * (MAX_PIPE_SZ + 1)); + pc = tc = 0; + iop[0].c = c0; + iop[1].c = c1; + iop[2].c = c2; + iop[3].c = c3; + iop[4].c = c4; + iop[5].c = c5; + iop[6].c = c6; + iop[7].c = c7; + + PRINTLOG(PRINT_DEBUG, "CPU cycles per sec: %u\n", cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE)); + + if (ps_cas((unsigned long *)&first, NUM_CPU + 1, cos_cpuid())) { + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_init(); + cos_init_args_cpubmp(cpubmp); + } else { + while (!ps_load((unsigned long *)&init_done[first])) ; + + cos_defcompinfo_sched_init(); + } + ps_faa((unsigned long *)&init_done[cos_cpuid()], 1); + + /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! 
*/ + for (i = 0; i < NUM_CPU; i++) { + if (!bitmap_check(cpubmp, i)) continue; + + while (!ps_load((unsigned long *)&init_done[i])) ; + } + + sl_init_corebmp(100*SL_MIN_PERIOD_US, cpubmp); + vaddr_t tscaddr = 0; + cbuf_t id = channel_shared_page_alloc(SHMCHANNEL_KEY, &tscaddr); + assert(id > 0); + int_start = (cycles_t *)tscaddr; + *int_start = 0ULL; + rdy = (volatile unsigned long *)(int_start + 1); + *rdy = 0; + sched_childinfo_init(); + test_pipes_init(); + __initializer_thd[cos_cpuid()] = sl_thd_alloc(__init_done, NULL); + assert(__initializer_thd[cos_cpuid()]); + sl_thd_param_set(__initializer_thd[cos_cpuid()], sched_param_pack(SCHEDP_PRIO, INITIALIZE_PRIO)); + sl_thd_param_set(__initializer_thd[cos_cpuid()], sched_param_pack(SCHEDP_WINDOW, INITIALIZE_BUDGET_MS)); + sl_thd_param_set(__initializer_thd[cos_cpuid()], sched_param_pack(SCHEDP_BUDGET, INITIALIZE_PERIOD_MS)); + + hypercall_comp_init_done(); + + sl_sched_loop_nonblock(); + + PRINTLOG(PRINT_ERROR, "Should never have reached this point!!!\n"); + assert(0); +} diff --git a/src/components/implementation/srv_dummy/Makefile b/src/components/implementation/srv_dummy/Makefile index 0490a703e3..53929a7ceb 100644 --- a/src/components/implementation/srv_dummy/Makefile +++ b/src/components/implementation/srv_dummy/Makefile @@ -1,3 +1,3 @@ -INTERFACES=sched schedinit srv_dummy +INTERFACES=sched schedinit crt srv_dummy include ../Makefile.subdir diff --git a/src/components/implementation/srv_dummy/cdummy/Makefile b/src/components/implementation/srv_dummy/cdummy/Makefile index 1762e85c90..f6165eca08 100644 --- a/src/components/implementation/srv_dummy/cdummy/Makefile +++ b/src/components/implementation/srv_dummy/cdummy/Makefile @@ -5,7 +5,7 @@ INTERFACES=sched schedinit srv_dummy DEPENDENCIES=capmgr sched schedinit channel IF_LIB= FN_PREPEND=parent_ -ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr -lsinv_client +ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr -lsinv_client 
-lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/srv_dummy/cdummy/init.c b/src/components/implementation/srv_dummy/cdummy/init.c index cf568ceb2c..cdd8184421 100644 --- a/src/components/implementation/srv_dummy/cdummy/init.c +++ b/src/components/implementation/srv_dummy/cdummy/init.c @@ -97,7 +97,7 @@ cos_init(void) while (!ps_load(&init_done[i])) ; } - sl_init_cpubmp(SL_MIN_PERIOD_US, cpubmp); + sl_init_corebmp(SL_MIN_PERIOD_US, cpubmp); sched_childinfo_init(); __initializer_thd[cos_cpuid()] = sl_thd_alloc(__init_done, NULL); assert(__initializer_thd[cos_cpuid()]); diff --git a/src/components/implementation/srv_dummy/chan_backend.c b/src/components/implementation/srv_dummy/chan_backend.c new file mode 120000 index 0000000000..1f996d8e9b --- /dev/null +++ b/src/components/implementation/srv_dummy/chan_backend.c @@ -0,0 +1 @@ +../sched/chan_backend.c \ No newline at end of file diff --git a/src/components/implementation/srv_dummy/sched.c b/src/components/implementation/srv_dummy/sched.c index 338c99723b..9980008529 100644 --- a/src/components/implementation/srv_dummy/sched.c +++ b/src/components/implementation/srv_dummy/sched.c @@ -52,12 +52,13 @@ sched_thd_create_cserialized(thdclosure_index_t idx) spdid_t c = cos_inv_token(); struct cos_defcompinfo *dci; struct sl_thd *t = NULL; + vaddr_t dcbaddr; if (!c) return 0; dci = sched_child_defci_get(sched_childinfo_find(c)); if (!dci) return 0; - t = sl_thd_aep_alloc_ext(dci, NULL, idx, 0, 0, 0, 0, 0, NULL); + t = sl_thd_aep_alloc_ext(dci, NULL, idx, 0, 0, 0, 0, 0, &dcbaddr, NULL); if (!t) return 0; srv_dummy_thdinit(sl_thd_thdid(t), 0); @@ -75,12 +76,13 @@ sched_aep_create_cserialized(arcvcap_t *extrcv, int *unused, u32_t thdidx_owntc, microsec_t ipiwin = (microsec_t)ipiwin32b; u32_t ipimax = (key_ipimax << 16) >> 16; cos_channelkey_t key = (key_ipimax >> 16); + vaddr_t dcbaddr; if (!c) return 0; dci = sched_child_defci_get(sched_childinfo_find(c)); if 
(!dci) return 0; - t = sl_thd_aep_alloc_ext(dci, NULL, idx, 1, owntc, key, ipiwin, ipimax, extrcv); + t = sl_thd_aep_alloc_ext(dci, NULL, idx, 1, owntc, key, ipiwin, ipimax, &dcbaddr, extrcv); if (!t) return 0; srv_dummy_thdinit(sl_thd_thdid(t), 1); diff --git a/src/components/implementation/srv_dummy/sched_info.h b/src/components/implementation/srv_dummy/sched_info.h index 7cb898ec51..b922a2464b 100644 --- a/src/components/implementation/srv_dummy/sched_info.h +++ b/src/components/implementation/srv_dummy/sched_info.h @@ -11,8 +11,10 @@ #include #include #include +#include #define SCHED_MAX_CHILD_COMPS 8 +CRT_CHAN_TYPE_PROTOTYPES(LU, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); struct sched_childinfo { struct cos_defcompinfo defcinfo; diff --git a/src/components/implementation/srv_dummy/sched_init.c b/src/components/implementation/srv_dummy/sched_init.c index 7f88b858c5..ecd962e63f 100644 --- a/src/components/implementation/srv_dummy/sched_init.c +++ b/src/components/implementation/srv_dummy/sched_init.c @@ -56,7 +56,7 @@ schedinit_child(void) if (unlikely(t)) continue; aep.tid = thdid; - aep.tc = sl_thd_tcap(sl__globals_cpu()->sched_thd); + aep.tc = sl_thd_tcap(sl__globals_core()->sched_thd); t = sl_thd_init_ext(&aep, init); if (!t) return 0; } while (thdid); diff --git a/src/components/implementation/srv_dummy/sdummy/Makefile b/src/components/implementation/srv_dummy/sdummy/Makefile index 75fff5cefa..89e1ccf634 100644 --- a/src/components/implementation/srv_dummy/sdummy/Makefile +++ b/src/components/implementation/srv_dummy/sdummy/Makefile @@ -5,7 +5,7 @@ INTERFACES=sched schedinit srv_dummy DEPENDENCIES=capmgr sched schedinit IF_LIB= FN_PREPEND=parent_ -ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr +ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/srv_dummy/sdummy/init.c 
b/src/components/implementation/srv_dummy/sdummy/init.c index 367a575cc3..9cecb2e145 100644 --- a/src/components/implementation/srv_dummy/sdummy/init.c +++ b/src/components/implementation/srv_dummy/sdummy/init.c @@ -94,7 +94,7 @@ cos_init(void) while (!ps_load(&init_done[i])) ; } - sl_init_cpubmp(SL_MIN_PERIOD_US, cpubmp); + sl_init_corebmp(SL_MIN_PERIOD_US, cpubmp); sched_childinfo_init(); __initializer_thd[cos_cpuid()] = sl_thd_alloc(__init_done, NULL); assert(__initializer_thd[cos_cpuid()]); diff --git a/src/components/implementation/tests/crt_tests/Makefile b/src/components/implementation/tests/crt_tests/Makefile new file mode 100644 index 0000000000..1469929f49 --- /dev/null +++ b/src/components/implementation/tests/crt_tests/Makefile @@ -0,0 +1,8 @@ +COMPONENT=crtt.o +INTERFACES= +DEPENDENCIES= +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_fprr -lsl_thd_static_backend -lsl_blkpt + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/crt_tests/crttests.c b/src/components/implementation/tests/crt_tests/crttests.c new file mode 100644 index 0000000000..ac8882afac --- /dev/null +++ b/src/components/implementation/tests/crt_tests/crttests.c @@ -0,0 +1,248 @@ +/* + * Copyright 2016, Phani Gadepalli and Gabriel Parmer, GWU, gparmer@gwu.edu. + * + * This uses a two clause BSD License. 
+ */ + +#include +#include +#include +#include + +#include +#include + +struct cos_compinfo *ci; + +#define CHAN_ITER 1000000 +#define NCHANTHDS 5 +#define CHAN_BATCH 3 + +CRT_CHAN_STATIC_ALLOC(c0, int, 4); +CRT_CHAN_STATIC_ALLOC(c1, int, 4); +CRT_CHAN_STATIC_ALLOC(c2, int, 4); +CRT_CHAN_STATIC_ALLOC(c3, int, 4); +CRT_CHAN_STATIC_ALLOC(c4, int, 4); + +CRT_CHAN_TYPE_PROTOTYPES(test, int, 4); +struct crt_chan *chans[NCHANTHDS + 1]; +struct sl_thd *chan_thds[NCHANTHDS] = {NULL, }; + +typedef enum { CHILLING = 0, RECVING, SENDING } actions_t; +unsigned long status[NCHANTHDS]; +unsigned long cnts[NCHANTHDS] = {0, }; + +int +chantest_is_deadlocked(void) +{ + int i; + actions_t s = status[0]; + + /* Are all threads in the same blocked state? */ + for (i = 0; i < NCHANTHDS; i++) { + if (status[i] == CHILLING || status[i] != s) return 0; + } + + return 1; +} + +void +chantest_send(int thd_off, struct crt_chan *c) +{ + int send = cos_thdid(); + + if (crt_chan_full_test(c)) status[thd_off] = SENDING; + if (!chantest_is_deadlocked()) { + /* printc("\t%d: send\n", cos_thdid()); */ + crt_chan_send_test(c, &send); + } + status[thd_off] = CHILLING; +} + +void +chantest_recv(int thd_off, struct crt_chan *c) +{ + int recv; + + if (crt_chan_empty_test(c)) status[thd_off] = RECVING; + if (!chantest_is_deadlocked()) { + /* printc("\t%d: recv\n", cos_thdid()); */ + crt_chan_recv_test(c, &recv); + cnts[thd_off]++; + } + status[thd_off] = CHILLING; +} + +void +chan_thd(void *d) +{ + int thd_off = (int)d; + struct crt_chan **chan_pair = &chans[thd_off]; + int recv; + int i; + + for (i = 0; i < CHAN_ITER; i++) { + int j; + + /* printc("%d: pre-send\n", cos_thdid()); */ + for (j = 0; j < CHAN_BATCH; j++) { + chantest_send(thd_off, chan_pair[1]); + } + + /* printc("%d: pre-recv\n", cos_thdid()); */ + for (j = 0; j < CHAN_BATCH; j++) { + chantest_recv(thd_off, chan_pair[0]); + } + } + + printc("SUCCESS! 
Counts (should be within %d of each other): ", NCHANTHDS * CHAN_BATCH); + for (i = 0; i < NCHANTHDS; i++) { + printc("\t%ld", cnts[i]); + } + printc("\n"); + while (1) ; +} + +void +idle_thd(void *d) +{ + printc("FAILURE: deadlock!\n"); + while (1) ; +} + +void +test_chan(void) +{ + int i; + struct sl_thd *idle; + union sched_param_union idle_param = {.c = {.type = SCHEDP_PRIO, .value = 10}}; + + union sched_param_union sps[] = { + {.c = {.type = SCHEDP_PRIO, .value = 7}}, + {.c = {.type = SCHEDP_PRIO, .value = 6}}, + {.c = {.type = SCHEDP_PRIO, .value = 8}}, + {.c = {.type = SCHEDP_PRIO, .value = 5}}, + {.c = {.type = SCHEDP_PRIO, .value = 5}} + }; + + chans[0] = c0; + chans[1] = c1; + chans[2] = c2; + chans[3] = c3; + chans[4] = c4; + chans[5] = c0; + + for (i = 0; i < NCHANTHDS; i++) { + crt_chan_init_test(chans[i]); + } + + printc("Create threads:\n"); + for (i = 0; i < NCHANTHDS; i++) { + chan_thds[i] = sl_thd_alloc(chan_thd, (void *)i); + assert(chan_thds[i]); + printc("\tcreating thread %d at prio %d\n", sl_thd_thdid(chan_thds[i]), sps[i].c.value); + sl_thd_param_set(chan_thds[i], sps[i].v); + } + idle = sl_thd_alloc(idle_thd, NULL); + printc("\tcreating IDLE %d at prio %d\n", sl_thd_thdid(idle), idle_param.c.value); + sl_thd_param_set(idle, idle_param.v); + +} + +#define LOCK_ITER 1000000 +#define NLOCKTHDS 4 +struct crt_lock lock; +struct sl_thd *lock_thds[NLOCKTHDS] = {NULL, }; +unsigned int progress[NLOCKTHDS] = {0, }; +volatile thdid_t holder; + +thdid_t +next_thd(void) +{ + return sl_thd_thdid(lock_thds[(unsigned int)(ps_tsc() % NLOCKTHDS)]); +} + +void +lock_thd(void *d) +{ + int i, cnt, me = -1; + + for (i = 0; i < NLOCKTHDS; i++) { + if (sl_thd_thdid(lock_thds[i]) != cos_thdid()) continue; + + me = i; + } + assert(me != -1); + + sl_thd_yield(sl_thd_thdid(lock_thds[1])); + + for (i = 0; i < LOCK_ITER; i++) { + crt_lock_take(&lock); + + progress[me]++; + holder = cos_thdid(); + + sl_thd_yield(next_thd()); + + if (holder != cos_thdid()) { + 
printc("FAILURE\n"); + BUG(); + } + crt_lock_release(&lock); + sl_thd_yield(next_thd()); + } + + for (i = 0; i < NLOCKTHDS; i++) { + if (i == me) continue; + + if (progress[i] < LOCK_ITER) { + sl_thd_yield(sl_thd_thdid(lock_thds[i])); + } + } + + printc("SUCCESS!"); + while (1) ; +} + +void +test_lock(void) +{ + int i; + union sched_param_union sps[] = { + {.c = {.type = SCHEDP_PRIO, .value = 5}}, + {.c = {.type = SCHEDP_PRIO, .value = 6}}, + {.c = {.type = SCHEDP_PRIO, .value = 6}}, + {.c = {.type = SCHEDP_PRIO, .value = 7}} + }; + + crt_lock_init(&lock); + + printc("Create threads:\n"); + for (i = 0; i < NLOCKTHDS; i++) { + lock_thds[i] = sl_thd_alloc(lock_thd, NULL); + printc("\tcreating thread %d at prio %d\n", sl_thd_thdid(lock_thds[i]), sps[i].c.value); + sl_thd_param_set(lock_thds[i], sps[i].v); + } +} + +void +cos_init(void) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + ci = cos_compinfo_get(defci); + + printc("Unit-test for the crt (sl)\n"); + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_init(); + sl_init(SL_MIN_PERIOD_US); + + test_lock(); +// test_chan(); + + printc("Running benchmark...\n"); + sl_sched_loop_nonblock(); + + assert(0); + + return; +} diff --git a/src/components/implementation/tests/kernel_tests/k_perf_tests.c b/src/components/implementation/tests/kernel_tests/k_perf_tests.c index a49d4af3ff..81812d60c5 100644 --- a/src/components/implementation/tests/kernel_tests/k_perf_tests.c +++ b/src/components/implementation/tests/kernel_tests/k_perf_tests.c @@ -23,58 +23,58 @@ volatile cycles_t main_thd = 0, side_thd = 0; static void bounceback(void *d) { - while (1) { - rdtscll(side_thd); - cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - } + while (1) { + rdtscll(side_thd); + cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + } } static void test_thds_create_switch(void) { - thdcap_t ts; - int ret, i; + thdcap_t ts; + int ret, i; - 
perfdata_init(&pd[cos_cpuid()], "COS THD => COS_THD_SWITCH", test_results, ARRAY_SIZE); + perfdata_init(&pd[cos_cpuid()], "COS THD => COS_THD_SWITCH", test_results, ARRAY_SIZE); - ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, bounceback, NULL); - if (EXPECT_LL_LT(1, ts, "Thread Creation: Cannot Allocate")) { - return; - } + ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, bounceback, NULL, 0, 0); + if (EXPECT_LL_LT(1, ts, "Thread Creation: Cannot Allocate")) { + return; + } - for (i = 0; i < ITER; i++) { - rdtscll(main_thd); - ret = cos_thd_switch(ts); - EXPECT_LL_NEQ(0, ret, "COS Switch Error"); + for (i = 0; i < ITER; i++) { + rdtscll(main_thd); + ret = cos_thd_switch(ts); + EXPECT_LL_NEQ(0, ret, "COS Switch Error"); - perfdata_add(&pd[cos_cpuid()], (side_thd - main_thd)); - } + perfdata_add(&pd[cos_cpuid()], (side_thd - main_thd)); + } - perfdata_calc(&pd[cos_cpuid()]); + perfdata_calc(&pd[cos_cpuid()]); - PRINTC("\tCOS THD => COS_THD_SWITCH:\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), perfdata_sz(&pd[cos_cpuid()])); + PRINTC("\tCOS THD => COS_THD_SWITCH:\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), perfdata_sz(&pd[cos_cpuid()])); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), perfdata_99ptile(&pd[cos_cpuid()])); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), perfdata_99ptile(&pd[cos_cpuid()])); - perfdata_init(&pd[cos_cpuid()], "COS THD => COS_SWITCH", test_results, ARRAY_SIZE); + perfdata_init(&pd[cos_cpuid()], "COS THD => COS_SWITCH", test_results, ARRAY_SIZE); - for (i = 0; i < ITER; i++) { - 
rdtscll(main_thd); - ret = cos_switch(ts, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, 0, 0, 0); - EXPECT_LL_NEQ(0, ret, "COS Switch Error"); + for (i = 0; i < ITER; i++) { + rdtscll(main_thd); + ret = cos_switch(ts, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, 0, 0, 0); + EXPECT_LL_NEQ(0, ret, "COS Switch Error"); - perfdata_add(&pd[cos_cpuid()], (side_thd - main_thd)); - } + perfdata_add(&pd[cos_cpuid()], (side_thd - main_thd)); + } - perfdata_calc(&pd[cos_cpuid()]); + perfdata_calc(&pd[cos_cpuid()]); - PRINTC("\tCOS THD => COS_SWITCH:\t\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), perfdata_sz(&pd[cos_cpuid()])); + PRINTC("\tCOS THD => COS_SWITCH:\t\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), perfdata_sz(&pd[cos_cpuid()])); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), perfdata_99ptile(&pd[cos_cpuid()])); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), perfdata_99ptile(&pd[cos_cpuid()])); } /* @@ -86,162 +86,162 @@ test_thds_create_switch(void) static void async_thd_fn_perf(void *thdcap) { - thdcap_t tc = (thdcap_t)thdcap; - asndcap_t sc = scc_global[cos_cpuid()]; - arcvcap_t rc = rcc_global[cos_cpuid()]; - int i, ret, pending = 0; + thdcap_t tc = (thdcap_t)thdcap; + asndcap_t sc = scc_global[cos_cpuid()]; + arcvcap_t rc = rcc_global[cos_cpuid()]; + int i, ret, pending = 0; - for (i = 0; i < ITER; i++) { - cos_rcv(rc, 0, NULL); - cos_asnd(sc, 1); - } + for (i = 0; i < ITER; i++) { + cos_rcv(rc, 0); + cos_asnd(sc, 1); + } - cos_thd_switch(tc); + cos_thd_switch(tc); - for (i = 0; i < ITER + 1; i++) { - cos_rcv(rc, 0, NULL); - } + for (i 
= 0; i < ITER + 1; i++) { + cos_rcv(rc, 0); + } - ret = cos_thd_switch(tc); - EXPECT_LL_NEQ(0, ret, "COS Switch Error"); + ret = cos_thd_switch(tc); + EXPECT_LL_NEQ(0, ret, "COS Switch Error"); - EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); - assert(0); + EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); + assert(0); } static void async_thd_parent_perf(void *thdcap) { - thdcap_t tc = (thdcap_t)thdcap; - asndcap_t sc = scp_global[cos_cpuid()]; - arcvcap_t rc = rcc_global[cos_cpuid()]; - long long e = 0, s = 0; - int i, pending = 0; + thdcap_t tc = (thdcap_t)thdcap; + asndcap_t sc = scp_global[cos_cpuid()]; + arcvcap_t rc = rcc_global[cos_cpuid()]; + long long e = 0, s = 0; + int i, pending = 0; - perfdata_init(&pd[cos_cpuid()], "Async Endpoints => Roundtrip", test_results, ARRAY_SIZE); + perfdata_init(&pd[cos_cpuid()], "Async Endpoints => Roundtrip", test_results, ARRAY_SIZE); - for (i = 0; i < ITER; i++) { - rdtscll(s); - cos_asnd(sc, 1); - cos_rcv(rc, 0, NULL); - rdtscll(e); + for (i = 0; i < ITER; i++) { + rdtscll(s); + cos_asnd(sc, 1); + cos_rcv(rc, 0); + rdtscll(e); - perfdata_add(&pd[cos_cpuid()], (e - s)); - } + perfdata_add(&pd[cos_cpuid()], (e - s)); + } - perfdata_calc(&pd[cos_cpuid()]); + perfdata_calc(&pd[cos_cpuid()]); - PRINTC("\tAsync Endpoints => Roundtrip:\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), - perfdata_sz(&pd[cos_cpuid()])); + PRINTC("\tAsync Endpoints => Roundtrip:\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), + perfdata_sz(&pd[cos_cpuid()])); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), - perfdata_99ptile(&pd[cos_cpuid()])); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + 
perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), + perfdata_99ptile(&pd[cos_cpuid()])); - perfdata_init(&pd[cos_cpuid()], "Async Endpoints => One Way", test_results, ARRAY_SIZE); + perfdata_init(&pd[cos_cpuid()], "Async Endpoints => One Way", test_results, ARRAY_SIZE); - for (i = 0; i < ITER; i++) { - rdtscll(s); - cos_asnd(sc, 1); - rdtscll(e); + for (i = 0; i < ITER; i++) { + rdtscll(s); + cos_asnd(sc, 1); + rdtscll(e); - perfdata_add(&pd[cos_cpuid()], (e - s)); - } + perfdata_add(&pd[cos_cpuid()], (e - s)); + } - perfdata_calc(&pd[cos_cpuid()]); + perfdata_calc(&pd[cos_cpuid()]); - PRINTC("\tAsync Endpoints => One Way:\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), - perfdata_sz(&pd[cos_cpuid()])); + PRINTC("\tAsync Endpoints => One Way:\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), + perfdata_sz(&pd[cos_cpuid()])); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), - perfdata_99ptile(&pd[cos_cpuid()])); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), + perfdata_99ptile(&pd[cos_cpuid()])); - async_test_flag_[cos_cpuid()] = 0; - while (1) cos_thd_switch(tc); + async_test_flag_[cos_cpuid()] = 0; + while (1) cos_thd_switch(tc); } static void test_async_endpoints_perf(void) { - thdcap_t tcp, tcc; - tcap_t tccp, tccc; - arcvcap_t rcp, rcc; - - /* parent rcv capabilities */ - tcp = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_parent_perf, - (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - if(EXPECT_LL_LT(1, tcp, "Test Async Endpoints")) return; - tccp = 
cos_tcap_alloc(&booter_info); - if(EXPECT_LL_LT(1, tccp, "Test Async Endpoints")) return; - rcp = cos_arcv_alloc(&booter_info, tcp, tccp, booter_info.comp_cap, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE); - if(EXPECT_LL_LT(1, rcp, "Test Async Endpoints")) return; - if(EXPECT_LL_NEQ(0,cos_tcap_transfer(rcp, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, - TCAP_PRIO_MAX + 1), "Test Async Endpoints")) { - return; - } - - /* child rcv capabilities */ - tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn_perf, (void *)tcp); - if(EXPECT_LL_LT(1, tcc, "Test Async Endpoints")) return; - tccc = cos_tcap_alloc(&booter_info); - if(EXPECT_LL_LT(1, tccc, "Test Async Endpoints")) return; - rcc = cos_arcv_alloc(&booter_info, tcc, tccc, booter_info.comp_cap, rcp); - if(EXPECT_LL_LT(1, rcc, "Test Async Endpoints")) return; - if(EXPECT_LL_NEQ(0,cos_tcap_transfer(rcc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, - TCAP_PRIO_MAX), "Test Async Endpoints")) - return; - - /* make the snd channel to the child */ - scp_global[cos_cpuid()] = cos_asnd_alloc(&booter_info, rcc, booter_info.captbl_cap); - if(EXPECT_LL_EQ(0, scp_global[cos_cpuid()], "Test Async Endpoints")) return; - - /* make the snd channel to the parent */ - scc_global[cos_cpuid()] = cos_asnd_alloc(&booter_info, rcp, booter_info.captbl_cap); - if(EXPECT_LL_EQ(0, scp_global[cos_cpuid()], "Test Async Endpoints")) return; - - rcc_global[cos_cpuid()] = rcc; - rcp_global[cos_cpuid()] = rcp; - - async_test_flag_[cos_cpuid()] = 1; - while (async_test_flag_[cos_cpuid()]) cos_thd_switch(tcp); + thdcap_t tcp, tcc; + tcap_t tccp, tccc; + arcvcap_t rcp, rcc; + + /* parent rcv capabilities */ + tcp = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_parent_perf, + (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, 0, 0); + if(EXPECT_LL_LT(1, tcp, "Test Async Endpoints")) return; + tccp = cos_tcap_alloc(&booter_info); + if(EXPECT_LL_LT(1, tccp, "Test Async Endpoints")) return; + rcp = cos_arcv_alloc(&booter_info, tcp, 
tccp, booter_info.comp_cap, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE); + if(EXPECT_LL_LT(1, rcp, "Test Async Endpoints")) return; + if(EXPECT_LL_NEQ(0,cos_tcap_transfer(rcp, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, + TCAP_PRIO_MAX + 1), "Test Async Endpoints")) { + return; + } + + /* child rcv capabilities */ + tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn_perf, (void *)tcp, 0, 0); + if(EXPECT_LL_LT(1, tcc, "Test Async Endpoints")) return; + tccc = cos_tcap_alloc(&booter_info); + if(EXPECT_LL_LT(1, tccc, "Test Async Endpoints")) return; + rcc = cos_arcv_alloc(&booter_info, tcc, tccc, booter_info.comp_cap, rcp); + if(EXPECT_LL_LT(1, rcc, "Test Async Endpoints")) return; + if(EXPECT_LL_NEQ(0,cos_tcap_transfer(rcc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, + TCAP_PRIO_MAX), "Test Async Endpoints")) + return; + + /* make the snd channel to the child */ + scp_global[cos_cpuid()] = cos_asnd_alloc(&booter_info, rcc, booter_info.captbl_cap); + if(EXPECT_LL_EQ(0, scp_global[cos_cpuid()], "Test Async Endpoints")) return; + + /* make the snd channel to the parent */ + scc_global[cos_cpuid()] = cos_asnd_alloc(&booter_info, rcp, booter_info.captbl_cap); + if(EXPECT_LL_EQ(0, scp_global[cos_cpuid()], "Test Async Endpoints")) return; + + rcc_global[cos_cpuid()] = rcc; + rcp_global[cos_cpuid()] = rcp; + + async_test_flag_[cos_cpuid()] = 1; + while (async_test_flag_[cos_cpuid()]) cos_thd_switch(tcp); } void test_print_ubench(void) { - PRINTC("\tSINV:\t\t\t\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - result_sinv.avg, result_sinv.max, result_sinv.max, - result_sinv.sz); + PRINTC("\tSINV:\t\t\t\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + result_sinv.avg, result_sinv.max, result_sinv.max, + result_sinv.sz); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - result_sinv.sd, result_sinv.p90tile, result_sinv.p95tile, - result_sinv.p99tile); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + result_sinv.sd, 
result_sinv.p90tile, result_sinv.p95tile, + result_sinv.p99tile); - PRINTC("\tTimer => Timeout Overhead: \t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - result_test_timer.avg, result_test_timer.max, result_test_timer.min, - result_test_timer.sz); + PRINTC("\tTimer => Timeout Overhead: \t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + result_test_timer.avg, result_test_timer.max, result_test_timer.min, + result_test_timer.sz); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - result_test_timer.sd, result_test_timer.p90tile, result_test_timer.p95tile, - result_test_timer.p99tile); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + result_test_timer.sd, result_test_timer.p90tile, result_test_timer.p95tile, + result_test_timer.p99tile); - PRINTC("\tTimer => Budget based: \t\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - result_budgets_single.avg, result_budgets_single.max, result_budgets_single.min, - result_budgets_single.sz); + PRINTC("\tTimer => Budget based: \t\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + result_budgets_single.avg, result_budgets_single.max, result_budgets_single.min, + result_budgets_single.sz); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - result_budgets_single.sd, result_budgets_single.p90tile, result_budgets_single.p95tile, - result_budgets_single.p99tile); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + result_budgets_single.sd, result_budgets_single.p90tile, result_budgets_single.p95tile, + result_budgets_single.p99tile); } void test_run_perf_kernel(void) { - cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - test_thds_create_switch(); - test_async_endpoints_perf(); - test_print_ubench(); + cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + test_thds_create_switch(); + test_async_endpoints_perf(); + test_print_ubench(); } diff --git a/src/components/implementation/tests/kernel_tests/k_test_async.c 
b/src/components/implementation/tests/kernel_tests/k_test_async.c index 19d155f2c6..e32db4c61b 100644 --- a/src/components/implementation/tests/kernel_tests/k_test_async.c +++ b/src/components/implementation/tests/kernel_tests/k_test_async.c @@ -17,139 +17,136 @@ static int failure = 0; static void async_thd_fn(void *thdcap) { - thdcap_t tc = (thdcap_t)thdcap; - arcvcap_t rc = rcc_global[cos_cpuid()]; - int pending, rcvd, ret; - - pending = cos_rcv(rc, RCV_NON_BLOCKING, NULL); - if (EXPECT_LL_NEQ(3, pending, "Test Async Endpoints")) failure = 1; - - pending = cos_rcv(rc, RCV_NON_BLOCKING | RCV_ALL_PENDING, &rcvd); - if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; - - pending = cos_rcv(rc, RCV_ALL_PENDING, &rcvd); - /* switch */ - if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; - - pending = cos_rcv(rc, 0, NULL); - /* switch */ - if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; - - pending = cos_rcv(rc, 0, NULL); - /* switch */ - if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; - - pending = cos_rcv(rc, RCV_NON_BLOCKING, NULL); - if (EXPECT_LL_NEQ(pending, -EAGAIN, "Test Async Endpoints")) failure = 1; - - pending = cos_rcv(rc, 0, NULL); - /* switch */ - if (EXPECT_LL_NEQ(0, 1, "Test Async Endpoints")) failure = 1; - - ret = cos_thd_switch(tc); - if (EXPECT_LL_NEQ(0, ret, "COS Switch Error") || - EXPECT_LL_NEQ(0, 1, "Test Async Endpoints")) { - failure = 1; - } - while (1) cos_thd_switch(tc); + thdcap_t tc = (thdcap_t)thdcap; + arcvcap_t rc = rcc_global[cos_cpuid()]; + int pending, rcvd, ret; + + pending = cos_rcv(rc, RCV_NON_BLOCKING); + if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; + + pending = cos_rcv(rc, 0); + /* switch */ + if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; + + pending = cos_rcv(rc, 0); + /* switch */ + if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; + + pending = cos_rcv(rc, 0); + /* switch */ + if 
(EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; + + pending = cos_rcv(rc, RCV_NON_BLOCKING); + if (EXPECT_LL_NEQ(pending, -EAGAIN, "Test Async Endpoints")) failure = 1; + + pending = cos_rcv(rc, 0); + /* switch */ + if (EXPECT_LL_NEQ(0, 1, "Test Async Endpoints")) failure = 1; + + ret = cos_thd_switch(tc); + if (EXPECT_LL_NEQ(0, ret, "COS Switch Error") || + EXPECT_LL_NEQ(0, 1, "Test Async Endpoints")) { + failure = 1; + } + while (1) cos_thd_switch(tc); } static void async_thd_parent(void *thdcap) { - thdcap_t tc = (thdcap_t)thdcap; - arcvcap_t rc = rcp_global[cos_cpuid()]; - asndcap_t sc = scp_global[cos_cpuid()]; - int ret; - thdid_t tid; - int blocked, rcvd; - cycles_t cycles, now; - tcap_time_t thd_timeout; - - /* NON_BLOCKING ASND with 0 as arg*/ - ret = cos_asnd(sc, 0); - ret = cos_asnd(sc, 0); - ret = cos_asnd(sc, 0); - ret = cos_asnd(sc, 1); - - /* switch */ - /* child blocked at this point, parent is using child's tcap, this call yields to the child */ - ret = cos_asnd(sc, 0); - - /* switch */ - ret = cos_asnd(sc, 0); - if (EXPECT_LL_NEQ(0, ret, "Test Async Endpoints")) failure = 1; - - /* switch */ - ret = cos_asnd(sc, 1); - if (EXPECT_LL_NEQ(0, ret, "Test Async Endpoints")) failure = 1; - - /* switch */ - cos_sched_rcv(rc, RCV_ALL_PENDING, 0, &rcvd, &tid, &blocked, &cycles, &thd_timeout); - rdtscll(now); - - async_test_flag_[cos_cpuid()] = 0; - while (1) cos_thd_switch(tc); + thdcap_t tc = (thdcap_t)thdcap; + arcvcap_t rc = rcp_global[cos_cpuid()]; + asndcap_t sc = scp_global[cos_cpuid()]; + int ret; + thdid_t tid; + int blocked; + cycles_t cycles, now; + tcap_time_t thd_timeout; + + /* NON_BLOCKING ASND with 0 as arg*/ + ret = cos_asnd(sc, 0); + ret = cos_asnd(sc, 0); + ret = cos_asnd(sc, 0); + ret = cos_asnd(sc, 1); + + /* switch */ + /* child blocked at this point, parent is using child's tcap, this call yields to the child */ + ret = cos_asnd(sc, 0); + + /* switch */ + ret = cos_asnd(sc, 0); + if (EXPECT_LL_NEQ(0, ret, "Test Async 
Endpoints")) failure = 1; + + /* switch */ + ret = cos_asnd(sc, 1); + if (EXPECT_LL_NEQ(0, ret, "Test Async Endpoints")) failure = 1; + + /* switch to parent */ + cos_sched_rcv(rc, 0, 0, &tid, &blocked, &cycles, &thd_timeout); + rdtscll(now); + + async_test_flag_[cos_cpuid()] = 0; + while (1) cos_thd_switch(tc); } void test_async_endpoints(void) { - thdcap_t tcp, tcc; - tcap_t tccp, tccc; - arcvcap_t rcp, rcc; - asndcap_t scr; - - /* parent rcv capabilities */ - tcp = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_parent, - (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - if (EXPECT_LL_LT(1, tcp, "Test Async Endpoints")) { - return; - } - tccp = cos_tcap_alloc(&booter_info); - if (EXPECT_LL_LT(1, tccp, "Test Async Endpoints")) { - return; - } - rcp = cos_arcv_alloc(&booter_info, tcp, tccp, booter_info.comp_cap, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE); - if (EXPECT_LL_LT(1, rcp, "Test Async Endpoints")) { - return; - } - if (EXPECT_LL_NEQ(0,cos_tcap_transfer(rcp, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, TCAP_PRIO_MAX), - "Test Async Endpoints")) { - return; - } - - /* child rcv capabilities */ - tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn, (void *)tcp); - if (EXPECT_LL_LT(1, tcc, "Test Async Endpoints")) { - return; - } - tccc = cos_tcap_alloc(&booter_info); - if (EXPECT_LL_LT(1, tccc, "Test Async Endpoints")) { - return; - } - rcc = cos_arcv_alloc(&booter_info, tcc, tccc, booter_info.comp_cap, rcp); - if (EXPECT_LL_LT(1, rcc, "Test Async Endpoints")) { - return; - } - if (EXPECT_LL_NEQ(0,cos_tcap_transfer(rcc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, - TCAP_PRIO_MAX + 1), "Test Async Endpoints")) { - return; - } - - /* make the snd channel to the child */ - scp_global[cos_cpuid()] = cos_asnd_alloc(&booter_info, rcc, booter_info.captbl_cap); - if (EXPECT_LL_EQ(0, scp_global[cos_cpuid()], "Test Async Endpoints")) return; - scr = cos_asnd_alloc(&booter_info, rcp, booter_info.captbl_cap); - if (EXPECT_LL_EQ(0, scr, "Test 
Async Endpoints")) return; - - rcc_global[cos_cpuid()] = rcc; - rcp_global[cos_cpuid()] = rcp; - - async_test_flag_[cos_cpuid()] = 1; - while (async_test_flag_[cos_cpuid()]) cos_asnd(scr, 1); - - CHECK_STATUS_FLAG(); - PRINTC("\t%s: \t\tSuccess\n", "Asynchronous Endpoints"); - EXIT_FN(); + thdcap_t tcp, tcc; + tcap_t tccp, tccc; + arcvcap_t rcp, rcc; + asndcap_t scr; + + /* parent rcv capabilities */ + tcp = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_parent, + (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, 0, 0); + if (EXPECT_LL_LT(1, tcp, "Test Async Endpoints")) { + return; + } + tccp = cos_tcap_alloc(&booter_info); + if (EXPECT_LL_LT(1, tccp, "Test Async Endpoints")) { + return; + } + rcp = cos_arcv_alloc(&booter_info, tcp, tccp, booter_info.comp_cap, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE); + if (EXPECT_LL_LT(1, rcp, "Test Async Endpoints")) { + return; + } + if (EXPECT_LL_NEQ(0,cos_tcap_transfer(rcp, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, TCAP_PRIO_MAX), + "Test Async Endpoints")) { + return; + } + + /* child rcv capabilities */ + tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn, (void *)tcp, 0, 0); + if (EXPECT_LL_LT(1, tcc, "Test Async Endpoints")) { + return; + } + tccc = cos_tcap_alloc(&booter_info); + if (EXPECT_LL_LT(1, tccc, "Test Async Endpoints")) { + return; + } + rcc = cos_arcv_alloc(&booter_info, tcc, tccc, booter_info.comp_cap, rcp); + if (EXPECT_LL_LT(1, rcc, "Test Async Endpoints")) { + return; + } + if (EXPECT_LL_NEQ(0,cos_tcap_transfer(rcc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, + TCAP_PRIO_MAX + 1), "Test Async Endpoints")) { + return; + } + + /* make the snd channel to the child */ + scp_global[cos_cpuid()] = cos_asnd_alloc(&booter_info, rcc, booter_info.captbl_cap); + if (EXPECT_LL_EQ(0, scp_global[cos_cpuid()], "Test Async Endpoints")) return; + scr = cos_asnd_alloc(&booter_info, rcp, booter_info.captbl_cap); + if (EXPECT_LL_EQ(0, scr, "Test Async Endpoints")) return; + + 
rcc_global[cos_cpuid()] = rcc; + rcp_global[cos_cpuid()] = rcp; + + async_test_flag_[cos_cpuid()] = 1; + while (async_test_flag_[cos_cpuid()]) cos_asnd(scr, 1); + + CHECK_STATUS_FLAG(); + PRINTC("\t%s: \t\tSuccess\n", "Asynchronous Endpoints"); + EXIT_FN(); } diff --git a/src/components/implementation/tests/kernel_tests/k_test_captbl.c b/src/components/implementation/tests/kernel_tests/k_test_captbl.c index 4365195e08..76532eeef0 100644 --- a/src/components/implementation/tests/kernel_tests/k_test_captbl.c +++ b/src/components/implementation/tests/kernel_tests/k_test_captbl.c @@ -14,20 +14,20 @@ extern void *__inv_test_serverfn(int a, int b, int c); void test_captbl_expands(void) { - int i; - compcap_t cc; + int i; + compcap_t cc; - cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, (vaddr_t)NULL); - if (EXPECT_LL_LT(1, cc, "Capability Table Expansion")) { - return; - } - for (i = 0; i < CAPTBL_ITER; i++) { - sinvcap_t ic; + cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, 0, (vaddr_t)NULL, (vaddr_t)NULL); + if (EXPECT_LL_LT(1, cc, "Capability Table Expansion")) { + return; + } + for (i = 0; i < CAPTBL_ITER; i++) { + sinvcap_t ic; - ic = cos_sinv_alloc(&booter_info, cc, (vaddr_t)__inv_test_serverfn, 0); - if(EXPECT_LL_LT(1, ic, "Capability Table: Cannot Allocate")) { - return; - } - } - PRINTC("\t%s: \t\tSuccess\n", "Capability Table Expansion"); + ic = cos_sinv_alloc(&booter_info, cc, (vaddr_t)__inv_test_serverfn, 0); + if(EXPECT_LL_LT(1, ic, "Capability Table: Cannot Allocate")) { + return; + } + } + PRINTC("\t%s: \t\tSuccess\n", "Capability Table Expansion"); } diff --git a/src/components/implementation/tests/kernel_tests/k_test_inv.c b/src/components/implementation/tests/kernel_tests/k_test_inv.c index fcb9fa132a..f6833c36ba 100644 --- a/src/components/implementation/tests/kernel_tests/k_test_inv.c +++ b/src/components/implementation/tests/kernel_tests/k_test_inv.c @@ -16,7 +16,7 @@ static cycles_t 
test_results[ARRAY_SIZE] = { 0 }; int test_serverfn(int a, int b, int c) { - return 0xDEADBEEF; + return 0xDEADBEEF; } extern void *__inv_test_serverfn(int a, int b, int c); @@ -24,68 +24,68 @@ extern void *__inv_test_serverfn(int a, int b, int c); static inline int call_cap_mb(u32_t cap_no, int arg1, int arg2, int arg3) { - int ret; - - /* - * Which stack should we use for this invocation? Simple, use - * this stack, at the current sp. This is essentially a - * function call into another component, with odd calling - * conventions. - */ - cap_no = (cap_no + 1) << COS_CAPABILITY_OFFSET; - - __asm__ __volatile__("pushl %%ebp\n\t" - "movl %%esp, %%ebp\n\t" - "movl %%esp, %%edx\n\t" - "movl $1f, %%ecx\n\t" - "sysenter\n\t" - "1:\n\t" - "popl %%ebp" - : "=a"(ret) - : "a"(cap_no), "b"(arg1), "S"(arg2), "D"(arg3) - : "memory", "cc", "ecx", "edx"); - - return ret; + int ret; + + /* + * Which stack should we use for this invocation? Simple, use + * this stack, at the current sp. This is essentially a + * function call into another component, with odd calling + * conventions. 
+ */ + cap_no = (cap_no + 1) << COS_CAPABILITY_OFFSET; + + __asm__ __volatile__("pushl %%ebp\n\t" + "movl %%esp, %%ebp\n\t" + "movl %%esp, %%edx\n\t" + "movl $1f, %%ecx\n\t" + "sysenter\n\t" + "1:\n\t" + "popl %%ebp" + : "=a"(ret) + : "a"(cap_no), "b"(arg1), "S"(arg2), "D"(arg3) + : "memory", "cc", "ecx", "edx"); + + return ret; } void test_inv(void) { - compcap_t cc; - sinvcap_t ic; - unsigned int r; - int i; - cycles_t start_cycles = 0LL, end_cycles = 0LL; - - perfdata_init(&result, "SINV", test_results, ARRAY_SIZE); - - cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, (vaddr_t)NULL); - if (EXPECT_LL_LT(1, cc, "Invocation: Cannot Allocate")) return; - ic = cos_sinv_alloc(&booter_info, cc, (vaddr_t)__inv_test_serverfn, 0); - if (EXPECT_LL_LT(1, ic, "Invocation: Cannot Allocate")) return; - - r = call_cap_mb(ic, 1, 2, 3); - if (EXPECT_LLU_NEQ(0xDEADBEEF, r, "Test Invocation")) return; - - for (i = 0; i < ITER; i++) { - rdtscll(start_cycles); - call_cap_mb(ic, 1, 2, 3); - rdtscll(end_cycles); - - perfdata_add(&result, end_cycles - start_cycles); - } - - perfdata_calc(&result); - result_sinv.avg = perfdata_avg(&result); - result_sinv.max = perfdata_avg(&result); - result_sinv.min = perfdata_avg(&result); - result_sinv.sz = perfdata_avg(&result); - result_sinv.sd = perfdata_avg(&result); - result_sinv.p90tile = perfdata_avg(&result); - result_sinv.p95tile = perfdata_avg(&result); - result_sinv.p99tile = perfdata_avg(&result); - - CHECK_STATUS_FLAG(); - PRINTC("\t%s: \t\tSuccess\n", "Synchronous Invocations"); - EXIT_FN(); + compcap_t cc; + sinvcap_t ic; + unsigned int r; + int i; + cycles_t start_cycles = 0LL, end_cycles = 0LL; + + perfdata_init(&result, "SINV", test_results, ARRAY_SIZE); + + cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, 0, (vaddr_t)NULL, (vaddr_t)NULL); + if (EXPECT_LL_LT(1, cc, "Invocation: Cannot Allocate")) return; + ic = cos_sinv_alloc(&booter_info, cc, 
(vaddr_t)__inv_test_serverfn, 0); + if (EXPECT_LL_LT(1, ic, "Invocation: Cannot Allocate")) return; + + r = call_cap_mb(ic, 1, 2, 3); + if (EXPECT_LLU_NEQ(0xDEADBEEF, r, "Test Invocation")) return; + + for (i = 0; i < ITER; i++) { + rdtscll(start_cycles); + call_cap_mb(ic, 1, 2, 3); + rdtscll(end_cycles); + + perfdata_add(&result, end_cycles - start_cycles); + } + + perfdata_calc(&result); + result_sinv.avg = perfdata_avg(&result); + result_sinv.max = perfdata_avg(&result); + result_sinv.min = perfdata_avg(&result); + result_sinv.sz = perfdata_avg(&result); + result_sinv.sd = perfdata_avg(&result); + result_sinv.p90tile = perfdata_avg(&result); + result_sinv.p95tile = perfdata_avg(&result); + result_sinv.p99tile = perfdata_avg(&result); + + CHECK_STATUS_FLAG(); + PRINTC("\t%s: \t\tSuccess\n", "Synchronous Invocations"); + EXIT_FN(); } diff --git a/src/components/implementation/tests/kernel_tests/k_test_mem.c b/src/components/implementation/tests/kernel_tests/k_test_mem.c index b10fa54e94..4da2919749 100644 --- a/src/components/implementation/tests/kernel_tests/k_test_mem.c +++ b/src/components/implementation/tests/kernel_tests/k_test_mem.c @@ -13,48 +13,48 @@ void test_mem_alloc(void) { - char * p, *s, *t, *prev; - int i; - const char *chk = "SUCCESS"; - int fail_contiguous = 0; + char * p, *s, *t, *prev; + int i; + const char *chk = "SUCCESS"; + int fail_contiguous = 0; - p = cos_page_bump_alloc(&booter_info); - if (p == NULL) { - EXPECT_LL_NEQ(0, 1, "Memory Test: Cannot Allocate"); - return; - } - PRINTC("\t%s: \t\t\tSuccess\n", "Memory => Allocation"); - strcpy(p, chk); + p = cos_page_bump_alloc(&booter_info); + if (p == NULL) { + EXPECT_LL_NEQ(0, 1, "Memory Test: Cannot Allocate"); + return; + } + PRINTC("\t%s: \t\t\tSuccess\n", "Memory => Allocation"); + strcpy(p, chk); - if (EXPECT_LL_NEQ(0, strcmp(chk, p), "Memory Test: Wrong STRCPY")) { - return; - } + if (EXPECT_LL_NEQ(0, strcmp(chk, p), "Memory Test: Wrong STRCPY")) { + return; + } - s = 
cos_page_bump_alloc(&booter_info); - assert(s); - prev = s; - for (i = 0; i < TEST_NPAGES; i++) { - t = cos_page_bump_alloc(&booter_info); - if (t == NULL){ - EXPECT_LL_EQ(0, 1, "Memory Test: Cannot Allocate"); - return; - } - if (t != prev + PAGE_SIZE) { - fail_contiguous = 1; - } - prev = t; - } - if (!fail_contiguous) { - memset(s, 0, TEST_NPAGES * PAGE_SIZE); - } else if (EXPECT_LL_EQ(i, TEST_NPAGES,"Memory Test: Cannot Allocate contiguous")) { - return; - } + s = cos_page_bump_alloc(&booter_info); + assert(s); + prev = s; + for (i = 0; i < TEST_NPAGES; i++) { + t = cos_page_bump_alloc(&booter_info); + if (t == NULL){ + EXPECT_LL_EQ(0, 1, "Memory Test: Cannot Allocate"); + return; + } + if (t != prev + PAGE_SIZE) { + fail_contiguous = 1; + } + prev = t; + } + if (!fail_contiguous) { + memset(s, 0, TEST_NPAGES * PAGE_SIZE); + } else if (EXPECT_LL_EQ(i, TEST_NPAGES,"Memory Test: Cannot Allocate contiguous")) { + return; + } - t = cos_page_bump_allocn(&booter_info, TEST_NPAGES * PAGE_SIZE); - if (t == NULL) { - EXPECT_LL_NEQ(0, 1, "Memory Test: Cannot Allocate"); - return; - } - memset(t, 0, TEST_NPAGES * PAGE_SIZE); - PRINTC("\t%s: \t\t\tSuccess\n", "Memory => R & W"); + t = cos_page_bump_allocn(&booter_info, TEST_NPAGES * PAGE_SIZE); + if (t == NULL) { + EXPECT_LL_NEQ(0, 1, "Memory Test: Cannot Allocate"); + return; + } + memset(t, 0, TEST_NPAGES * PAGE_SIZE); + PRINTC("\t%s: \t\t\tSuccess\n", "Memory => R & W"); } diff --git a/src/components/implementation/tests/kernel_tests/k_test_tcap.c b/src/components/implementation/tests/kernel_tests/k_test_tcap.c index f69dfa5fed..6b1a311552 100644 --- a/src/components/implementation/tests/kernel_tests/k_test_tcap.c +++ b/src/components/implementation/tests/kernel_tests/k_test_tcap.c @@ -8,6 +8,7 @@ #include #include "kernel_tests.h" +#include struct results result_test_timer; struct results result_budgets_single; @@ -19,157 +20,156 @@ static cycles_t test_results[ARRAY_SIZE] = { 0 }; static void spinner(void *d) { - 
while (1); + while (1); } void -sched_events_clear(void) +sched_events_clear_nonblock(void) { - thdid_t tid; - int blocked, rcvd; - cycles_t cycles, now; - tcap_time_t timer, thd_timeout; - - while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_ALL_PENDING, 0, - &rcvd, &tid, &blocked, &cycles, &thd_timeout) != 0); + struct cos_sched_event e; + cos_ul_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_NON_BLOCKING, 0, &e); +} +void +sched_events_clear(void) +{ + struct cos_sched_event e; + while (cos_ul_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, &e)) ; } void test_timer(void) { - thdcap_t tc; - cycles_t c = 0, p = 0; - int i, ret; - cycles_t s, e; - thdid_t tid; - int blocked, rcvd; - cycles_t cycles, now, utime; - long long time, mask; - tcap_time_t timer, thd_timeout; - - tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, spinner, NULL); - - perfdata_init(&result, "COS THD => COS_THD_SWITCH", test_results, ARRAY_SIZE); - - for (i = 0; i <= TEST_ITER; i++){ - rdtscll(now); - timer = tcap_cyc2time(now + GRANULARITY * cyc_per_usec); - cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, - cos_sched_sync()); - p = c; - rdtscll(c); - time = (c - now - (cycles_t)(GRANULARITY * cyc_per_usec)); - mask = (time >> (sizeof(long long) * CHAR_BIT - 1)); - utime = (time + mask) ^ mask; - - if (i > 0) { - perfdata_add(&result, utime); - - if (EXPECT_LLU_LT((long long unsigned)(c-now), (unsigned)(GRANULARITY * cyc_per_usec * MAX_THDS), - "Timer: Failure on MAX") || - EXPECT_LLU_LT((unsigned)(GRANULARITY * cyc_per_usec * MIN_THDS), (long long unsigned)(c-now), - "Timer: failure on MIN")) { - return; - } - } - sched_events_clear(); - } - - perfdata_calc(&result); - result_test_timer.avg = perfdata_avg(&result); - result_test_timer.max = perfdata_avg(&result); - result_test_timer.min = perfdata_avg(&result); - result_test_timer.sz = perfdata_avg(&result); - result_test_timer.sd = perfdata_avg(&result); - 
result_test_timer.p90tile = perfdata_avg(&result); - result_test_timer.p95tile = perfdata_avg(&result); - result_test_timer.p99tile = perfdata_avg(&result); - - /* Timer in past */ - c = 0, p = 0; - - rdtscll(c); - timer = tcap_cyc2time(c - GRANULARITY * cyc_per_usec); - cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, - cos_sched_sync()); - p = c; - rdtscll(c); - - if (EXPECT_LLU_LT((long long unsigned)(c-p), (unsigned)(GRANULARITY * cyc_per_usec), "Timer: Past")) { - return; - } - - sched_events_clear(); - - /* Timer now */ - c = 0, p = 0; - - rdtscll(c); - timer = tcap_cyc2time(c); - cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, - cos_sched_sync()); - p = c; - rdtscll(c); - - if (EXPECT_LLU_LT((long long unsigned)(c-p), (unsigned)(GRANULARITY * cyc_per_usec), "Timer: Now")) { - return; - } - - cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_ALL_PENDING, 0, - &rcvd, &tid, &blocked, &cycles, &thd_timeout) - ; - - EXPECT_LLU_LT((long long unsigned)cycles, (long long unsigned)(c-p), "Timer => Cycles time"); - - sched_events_clear(); - PRINTC("\t%s: \t\t\tSuccess\n", "One-Shot Timeout"); + thdcap_t tc; + cycles_t c = 0, p = 0; + int i, ret; + cycles_t s, e; + cycles_t cycles, now, utime; + long long time, mask; + tcap_time_t timer, thd_timeout; + + tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, spinner, NULL, 0, 0); + + perfdata_init(&result, "COS THD => COS_THD_SWITCH", test_results, ARRAY_SIZE); + + for (i = 0; i <= TEST_ITER; i++){ + rdtscll(now); + timer = tcap_cyc2time(now + GRANULARITY * cyc_per_usec); + cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, + cos_sched_sync()); + p = c; + rdtscll(c); + time = (c - now - (cycles_t)(GRANULARITY * cyc_per_usec)); + mask = (time >> (sizeof(long long) * CHAR_BIT - 1)); + utime = (time + mask) ^ mask; + + if (i > 0) { + perfdata_add(&result, utime); + + if 
(EXPECT_LLU_LT((long long unsigned)(c-now), (unsigned)(GRANULARITY * cyc_per_usec * MAX_THDS), + "Timer: Failure on MAX") || + EXPECT_LLU_LT((unsigned)(GRANULARITY * cyc_per_usec * MIN_THDS), (long long unsigned)(c-now), + "Timer: failure on MIN")) { + return; + } + } + sched_events_clear(); + } + + perfdata_calc(&result); + result_test_timer.avg = perfdata_avg(&result); + result_test_timer.max = perfdata_avg(&result); + result_test_timer.min = perfdata_avg(&result); + result_test_timer.sz = perfdata_avg(&result); + result_test_timer.sd = perfdata_avg(&result); + result_test_timer.p90tile = perfdata_avg(&result); + result_test_timer.p95tile = perfdata_avg(&result); + result_test_timer.p99tile = perfdata_avg(&result); + + /* Timer in past */ + c = 0, p = 0; + + rdtscll(c); + timer = tcap_cyc2time(c - GRANULARITY * cyc_per_usec); + cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, + cos_sched_sync()); + p = c; + rdtscll(c); + + if (EXPECT_LLU_LT((long long unsigned)(c-p), (unsigned)(GRANULARITY * cyc_per_usec), "Timer: Past")) { + return; + } + + sched_events_clear(); + + /* Timer now */ + c = 0, p = 0; + + rdtscll(c); + timer = tcap_cyc2time(c); + cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, + cos_sched_sync()); + p = c; + rdtscll(c); + + if (EXPECT_LLU_LT((long long unsigned)(c-p), (unsigned)(GRANULARITY * cyc_per_usec), "Timer: Now")) { + return; + } + + struct cos_sched_event ev; + cos_ul_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, &ev); + cycles = ev.evt.elapsed_cycs; + + EXPECT_LLU_LT((long long unsigned)cycles, (long long unsigned)(c-p), "Timer => Cycles time"); + + sched_events_clear(); + PRINTC("\t%s: \t\t\tSuccess\n", "One-Shot Timeout"); } struct exec_cluster { - thdcap_t tc; - arcvcap_t rc; - tcap_t tcc; - cycles_t cyc; - asndcap_t sc; /* send-cap to send to rc */ - tcap_prio_t prio; - int xseq; /* expected activation sequence number for this thread */ 
+ thdcap_t tc; + arcvcap_t rc; + tcap_t tcc; + cycles_t cyc; + asndcap_t sc; /* send-cap to send to rc */ + tcap_prio_t prio; + int xseq; /* expected activation sequence number for this thread */ }; struct budget_test_data { - /* p=parent, c=child, g=grand-child */ - struct exec_cluster p, c, g; + /* p=parent, c=child, g=grand-child */ + struct exec_cluster p, c, g; } bt[NUM_CPU], mbt[NUM_CPU]; -static int + static int exec_cluster_alloc(struct exec_cluster *e, cos_thd_fn_t fn, void *d, arcvcap_t parentc) { - e->tcc = cos_tcap_alloc(&booter_info); - if (EXPECT_LL_LT(1, e->tcc, "Cluster Allocation: TCAP ALLOC")) return -1; - e->tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, fn, d); - if (EXPECT_LL_LT(1, e->tc, "Cluster Allocation: THD ALLOC")) return -1; - e->rc = cos_arcv_alloc(&booter_info, e->tc, e->tcc, booter_info.comp_cap, parentc); - if (EXPECT_LL_LT(1, e->rc, "Cluster Allocation: ARCV ALLOC")) return -1; - e->sc = cos_asnd_alloc(&booter_info, e->rc, booter_info.captbl_cap); - if (EXPECT_LL_LT(1, e->sc, "Cluster Allocation: ASND ALLOC")) return -1; - - e->cyc = 0; - - return 0; + e->tcc = cos_tcap_alloc(&booter_info); + if (EXPECT_LL_LT(1, e->tcc, "Cluster Allocation: TCAP ALLOC")) return -1; + e->tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, fn, d, 0, 0); + if (EXPECT_LL_LT(1, e->tc, "Cluster Allocation: THD ALLOC")) return -1; + e->rc = cos_arcv_alloc(&booter_info, e->tc, e->tcc, booter_info.comp_cap, parentc); + if (EXPECT_LL_LT(1, e->rc, "Cluster Allocation: ARCV ALLOC")) return -1; + e->sc = cos_asnd_alloc(&booter_info, e->rc, booter_info.captbl_cap); + if (EXPECT_LL_LT(1, e->sc, "Cluster Allocation: ASND ALLOC")) return -1; + + e->cyc = 0; + + return 0; } static void parent(void *d) { - assert(0); + assert(0); } static void spinner_cyc(void *d) { - cycles_t *p = (cycles_t *)d; + cycles_t *p = (cycles_t *)d; - while (1) rdtscll(*p); + while (1) rdtscll(*p); } #define TIMER_TIME 100 @@ -177,70 +177,70 @@ spinner_cyc(void *d) void 
test_2timers(void) { - int ret; - cycles_t s, e, timer; - - if (EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].p, parent, &bt[cos_cpuid()].p, - BOOT_CAPTBL_SELF_INITRCV_CPU_BASE), "TCAP v. Timer: Cannot Allocate")) { - return; - } - if (EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].c, spinner, &bt[cos_cpuid()].c, - bt[cos_cpuid()].p.rc), "TCAP v. Timer: Cannot Allocate")) { - return; - } - - /* Timer > TCAP */ - - ret = cos_tcap_transfer(bt[cos_cpuid()].c.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, - GRANULARITY * TIMER_TIME, TCAP_PRIO_MAX + 2); - if (EXPECT_LL_NEQ(0, ret, "TCAP v. Timer : TCAP Transfer")) { - return; - } - - rdtscll(s); - timer = tcap_cyc2time(s + GRANULARITY * cyc_per_usec); - if (cos_switch(bt[cos_cpuid()].c.tc, bt[cos_cpuid()].c.tcc, TCAP_PRIO_MAX + 2, - timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync())) { - EXPECT_LL_NEQ(0, 1, "TCAP v. Timer: COS Switch"); - return; - } - rdtscll(e); - - if (EXPECT_LLU_LT((long long unsigned)(e-s), (unsigned)(GRANULARITY * cyc_per_usec), - "TCAP v. Timer: Timer > TCAP") || - EXPECT_LLU_LT((unsigned)(GRANULARITY * TIMER_TIME), (long long unsigned)(e-s), - "TCAP v. Timer: Interreupt Under")) { - return; - } - - sched_events_clear(); - - /* Timer < TCAP */ - - ret = cos_tcap_transfer(bt[cos_cpuid()].c.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, - GRANULARITY * cyc_per_usec, TCAP_PRIO_MAX + 2); - if (EXPECT_LL_NEQ(0, ret, "TCAP v. Timer: TCAP Transfer")) { - return; - } - - rdtscll(s); - timer = tcap_cyc2time(s + GRANULARITY * TIMER_TIME); - if (EXPECT_LL_NEQ(0, cos_switch(bt[cos_cpuid()].c.tc, bt[cos_cpuid()].c.tcc, TCAP_PRIO_MAX + 2, timer, - BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync()), "TCAP v. TImer: COS Switch")) { - return; - } - - rdtscll(e); - - if (EXPECT_LLU_LT((long long unsigned)(e-s), (unsigned)(GRANULARITY * cyc_per_usec), - "TCAP v. Timer: Timer < TCAP") || - EXPECT_LLU_LT((unsigned)(GRANULARITY * TIMER_TIME), (long long unsigned)(e-s), - "TCAP v. 
Timer: Interreupt Under")) { - return; - } - - sched_events_clear(); - PRINTC("\t%s: \t\tSuccess\n", "Timer => Timeout v. Budget"); + int ret; + cycles_t s, e, timer; + + if (EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].p, parent, &bt[cos_cpuid()].p, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE), "TCAP v. Timer: Cannot Allocate")) { + return; + } + if (EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].c, spinner, &bt[cos_cpuid()].c, + bt[cos_cpuid()].p.rc), "TCAP v. Timer: Cannot Allocate")) { + return; + } + + /* Timer > TCAP */ + + ret = cos_tcap_transfer(bt[cos_cpuid()].c.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, + GRANULARITY * TIMER_TIME, TCAP_PRIO_MAX + 2); + if (EXPECT_LL_NEQ(0, ret, "TCAP v. Timer : TCAP Transfer")) { + return; + } + + rdtscll(s); + timer = tcap_cyc2time(s + GRANULARITY * cyc_per_usec); + if (cos_switch(bt[cos_cpuid()].c.tc, bt[cos_cpuid()].c.tcc, TCAP_PRIO_MAX + 2, + timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync())) { + EXPECT_LL_NEQ(0, 1, "TCAP v. Timer: COS Switch"); + return; + } + rdtscll(e); + + if (EXPECT_LLU_LT((long long unsigned)(e-s), (unsigned)(GRANULARITY * cyc_per_usec), + "TCAP v. Timer: Timer > TCAP") || + EXPECT_LLU_LT((unsigned)(GRANULARITY * TIMER_TIME), (long long unsigned)(e-s), + "TCAP v. Timer: Interreupt Under")) { + return; + } + + sched_events_clear(); + + /* Timer < TCAP */ + + ret = cos_tcap_transfer(bt[cos_cpuid()].c.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, + GRANULARITY * cyc_per_usec, TCAP_PRIO_MAX + 2); + if (EXPECT_LL_NEQ(0, ret, "TCAP v. Timer: TCAP Transfer")) { + return; + } + + rdtscll(s); + timer = tcap_cyc2time(s + GRANULARITY * TIMER_TIME); + if (EXPECT_LL_NEQ(0, cos_switch(bt[cos_cpuid()].c.tc, bt[cos_cpuid()].c.tcc, TCAP_PRIO_MAX + 2, timer, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync()), "TCAP v. TImer: COS Switch")) { + return; + } + + rdtscll(e); + + if (EXPECT_LLU_LT((long long unsigned)(e-s), (unsigned)(GRANULARITY * cyc_per_usec), + "TCAP v. 
Timer: Timer < TCAP") || + EXPECT_LLU_LT((unsigned)(GRANULARITY * TIMER_TIME), (long long unsigned)(e-s), + "TCAP v. Timer: Interreupt Under")) { + return; + } + + sched_events_clear(); + PRINTC("\t%s: \t\tSuccess\n", "Timer => Timeout v. Budget"); } #define BUDGET_TIME 100 @@ -248,64 +248,64 @@ test_2timers(void) static void test_tcap_budgets_single(void) { - int i; - cycles_t s = 0, e = 0; - cycles_t time, mask; - int ret; - - perfdata_init(&result, "Timer => Budget based", test_results, ARRAY_SIZE); - - if (EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].p, parent, &bt[cos_cpuid()].p, - BOOT_CAPTBL_SELF_INITRCV_CPU_BASE), "Single Budget: Cannot Allocate") || - EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].c, spinner, &bt[cos_cpuid()].c, - bt[cos_cpuid()].p.rc), "Single Budget: Cannot Allocate")) { - return; - } - for (i = 1; i <= TEST_ITER; i++) { - - ret = cos_tcap_transfer(bt[cos_cpuid()].c.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, - GRANULARITY * BUDGET_TIME, TCAP_PRIO_MAX + 2); - if (EXPECT_LL_NEQ(0, ret, "Single Budget: TCAP Transfer")) { - return; - } - - rdtscll(s); - if (cos_switch(bt[cos_cpuid()].c.tc, bt[cos_cpuid()].c.tcc, TCAP_PRIO_MAX + 2, TCAP_TIME_NIL, - BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync())){ - EXPECT_LL_NEQ(0, 1, "Single Budget: COS Switch"); - return; - } - rdtscll(e); - - if (i > 1) { - /* Performant absolute value function instead of branching */ - time = (e - s - (GRANULARITY * BUDGET_TIME)); - mask = (time >> (sizeof(cycles_t) * CHAR_BIT - 1)); - time = (time + mask) ^ mask; - - perfdata_add(&result, time); - - if (EXPECT_LLU_LT((long long unsigned)(e-s), (unsigned)(GRANULARITY * BUDGET_TIME * MAX_THDS), - "Single Budget: MAX Bound") || - EXPECT_LLU_LT((unsigned)(GRANULARITY * BUDGET_TIME * MIN_THDS), (long long unsigned)(e-s), - "Single Budget: MIN Bound")) { - return; - } - } - sched_events_clear(); - } - - perfdata_calc(&result); - result_budgets_single.avg = perfdata_avg(&result); - result_budgets_single.max = 
perfdata_avg(&result); - result_budgets_single.min = perfdata_avg(&result); - result_budgets_single.sz = perfdata_avg(&result); - result_budgets_single.sd = perfdata_avg(&result); - result_budgets_single.p90tile = perfdata_avg(&result); - result_budgets_single.p95tile = perfdata_avg(&result); - result_budgets_single.p99tile = perfdata_avg(&result); - - PRINTC("\t%s: \t\t\tSuccess\n", "Timer => Budget based"); + int i; + cycles_t s = 0, e = 0; + cycles_t time, mask; + int ret; + + perfdata_init(&result, "Timer => Budget based", test_results, ARRAY_SIZE); + + if (EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].p, parent, &bt[cos_cpuid()].p, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE), "Single Budget: Cannot Allocate") || + EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].c, spinner, &bt[cos_cpuid()].c, + bt[cos_cpuid()].p.rc), "Single Budget: Cannot Allocate")) { + return; + } + for (i = 1; i <= TEST_ITER; i++) { + + ret = cos_tcap_transfer(bt[cos_cpuid()].c.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, + GRANULARITY * BUDGET_TIME, TCAP_PRIO_MAX + 2); + if (EXPECT_LL_NEQ(0, ret, "Single Budget: TCAP Transfer")) { + return; + } + + rdtscll(s); + if (cos_switch(bt[cos_cpuid()].c.tc, bt[cos_cpuid()].c.tcc, TCAP_PRIO_MAX + 2, TCAP_TIME_NIL, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync())){ + EXPECT_LL_NEQ(0, 1, "Single Budget: COS Switch"); + return; + } + rdtscll(e); + + if (i > 1) { + /* Performant absolute value function instead of branching */ + time = (e - s - (GRANULARITY * BUDGET_TIME)); + mask = (time >> (sizeof(cycles_t) * CHAR_BIT - 1)); + time = (time + mask) ^ mask; + + perfdata_add(&result, time); + + if (EXPECT_LLU_LT((long long unsigned)(e-s), (unsigned)(GRANULARITY * BUDGET_TIME * MAX_THDS), + "Single Budget: MAX Bound") || + EXPECT_LLU_LT((unsigned)(GRANULARITY * BUDGET_TIME * MIN_THDS), (long long unsigned)(e-s), + "Single Budget: MIN Bound")) { + return; + } + } + sched_events_clear(); + } + + perfdata_calc(&result); + result_budgets_single.avg = 
perfdata_avg(&result); + result_budgets_single.max = perfdata_avg(&result); + result_budgets_single.min = perfdata_avg(&result); + result_budgets_single.sz = perfdata_avg(&result); + result_budgets_single.sd = perfdata_avg(&result); + result_budgets_single.p90tile = perfdata_avg(&result); + result_budgets_single.p95tile = perfdata_avg(&result); + result_budgets_single.p99tile = perfdata_avg(&result); + + PRINTC("\t%s: \t\t\tSuccess\n", "Timer => Budget based"); } #define RATE_1 1600 @@ -314,80 +314,78 @@ test_tcap_budgets_single(void) static void test_tcap_budgets_multi(void) { - int i; - - if(EXPECT_LL_NEQ(0, exec_cluster_alloc(&mbt[cos_cpuid()].p, spinner_cyc, &(mbt[cos_cpuid()].p.cyc), - BOOT_CAPTBL_SELF_INITRCV_CPU_BASE), "Multi Budget: Cannot Allocate") || - EXPECT_LL_NEQ(0, exec_cluster_alloc(&mbt[cos_cpuid()].c, spinner_cyc, &(mbt[cos_cpuid()].c.cyc), - mbt[cos_cpuid()].p.rc), "Multi Budget: Cannot Allocate") || - EXPECT_LL_NEQ(0, exec_cluster_alloc(&mbt[cos_cpuid()].g, spinner_cyc, &(mbt[cos_cpuid()].g.cyc), - mbt[cos_cpuid()].c.rc), "Multi Budget: Cannot allocate")) { - return; - } - - for (i = 1; i <= TEST_ITER; i++) { - tcap_res_t res; - thdid_t tid; - int blocked; - cycles_t cycles, s, e; - tcap_time_t thd_timeout; - - /* test both increasing budgets and constant budgets */ - if (i > (TEST_ITER/2)) - res = GRANULARITY * RATE_1; - else - res = i * GRANULARITY * RATE_2; - - if (EXPECT_LL_NEQ(0, cos_tcap_transfer(mbt[cos_cpuid()].p.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, - res, TCAP_PRIO_MAX + 2), "Multi Budget: TCAP Transfer") || - EXPECT_LL_NEQ(0, cos_tcap_transfer(mbt[cos_cpuid()].c.rc, mbt[cos_cpuid()].p.tcc, res / 2, - TCAP_PRIO_MAX + 2), "Multi Budget: TCAP Transfer") || - EXPECT_LL_NEQ(0, cos_tcap_transfer(mbt[cos_cpuid()].g.rc, mbt[cos_cpuid()].c.tcc, res / 4, - TCAP_PRIO_MAX + 2), "Multi Budget: TCAP Transfer")) { - return; - } - - mbt[cos_cpuid()].p.cyc = mbt[cos_cpuid()].c.cyc = mbt[cos_cpuid()].g.cyc = 0; - rdtscll(s); - if 
(cos_switch(mbt[cos_cpuid()].g.tc, mbt[cos_cpuid()].g.tcc, TCAP_PRIO_MAX + 2, TCAP_TIME_NIL, - BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync())) { - EXPECT_LL_NEQ(0, 1, "Multi Budget: COS Switch"); - return; - } - rdtscll(e); - - cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, NULL, &tid, &blocked, &cycles, &thd_timeout); - - if ( i > 1) { - - /* To measure time of execution, we need a min time - * as well as a max time to determine - * if the interrupt happened when it was supposed to - * thus MAX bound and MIN bound - * MAX_THDS and MIN_THDS are #defined to give it some flexibility - * from the user - */ - - if (EXPECT_LLU_LT((mbt[cos_cpuid()].g.cyc - s), (res / 4 * MAX_THDS), "Multi Budget: G") || - EXPECT_LLU_LT(mbt[cos_cpuid()].g.cyc - s, res / 4 * MAX_THDS, "Multi Budget: G MAX Bound") || - EXPECT_LLU_LT(res / 4 * MIN_THDS, mbt[cos_cpuid()].g.cyc - s, "Multi Budget: G MIN Bound") || - EXPECT_LLU_LT(mbt[cos_cpuid()].c.cyc - s, res / 2 * MAX_THDS, "Multi Budget: C MAX Bound") || - EXPECT_LLU_LT(res / 2 * MIN_THDS, mbt[cos_cpuid()].c.cyc - s, "Multi Budget: C MIN Bound") || - EXPECT_LLU_LT(mbt[cos_cpuid()].p.cyc - s, res * MAX_THDS, "Multi Budget: P MAX Bound") || - EXPECT_LLU_LT(res * MIN_THDS, mbt[cos_cpuid()].p.cyc - s, "Multi Budget: P MIN BOund")) { - return; - } - } - } - PRINTC("\t%s: \t\tSuccess\n", "Timer => Hierarchical Budget"); + int i; + + if(EXPECT_LL_NEQ(0, exec_cluster_alloc(&mbt[cos_cpuid()].p, spinner_cyc, &(mbt[cos_cpuid()].p.cyc), + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE), "Multi Budget: Cannot Allocate") || + EXPECT_LL_NEQ(0, exec_cluster_alloc(&mbt[cos_cpuid()].c, spinner_cyc, &(mbt[cos_cpuid()].c.cyc), + mbt[cos_cpuid()].p.rc), "Multi Budget: Cannot Allocate") || + EXPECT_LL_NEQ(0, exec_cluster_alloc(&mbt[cos_cpuid()].g, spinner_cyc, &(mbt[cos_cpuid()].g.cyc), + mbt[cos_cpuid()].c.rc), "Multi Budget: Cannot allocate")) { + return; + } + + for (i = 1; i <= TEST_ITER; i++) { + tcap_res_t res; + cycles_t s, e; + tcap_time_t thd_timeout; 
+ + /* test both increasing budgets and constant budgets */ + if (i > (TEST_ITER/2)) + res = GRANULARITY * RATE_1; + else + res = i * GRANULARITY * RATE_2; + + if (EXPECT_LL_NEQ(0, cos_tcap_transfer(mbt[cos_cpuid()].p.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, + res, TCAP_PRIO_MAX + 2), "Multi Budget: TCAP Transfer") || + EXPECT_LL_NEQ(0, cos_tcap_transfer(mbt[cos_cpuid()].c.rc, mbt[cos_cpuid()].p.tcc, res / 2, + TCAP_PRIO_MAX + 2), "Multi Budget: TCAP Transfer") || + EXPECT_LL_NEQ(0, cos_tcap_transfer(mbt[cos_cpuid()].g.rc, mbt[cos_cpuid()].c.tcc, res / 4, + TCAP_PRIO_MAX + 2), "Multi Budget: TCAP Transfer")) { + return; + } + + mbt[cos_cpuid()].p.cyc = mbt[cos_cpuid()].c.cyc = mbt[cos_cpuid()].g.cyc = 0; + rdtscll(s); + if (cos_switch(mbt[cos_cpuid()].g.tc, mbt[cos_cpuid()].g.tcc, TCAP_PRIO_MAX + 2, TCAP_TIME_NIL, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync())) { + EXPECT_LL_NEQ(0, 1, "Multi Budget: COS Switch"); + return; + } + rdtscll(e); + + sched_events_clear(); + + if ( i > 1) { + + /* To measure time of execution, we need a min time + * as well as a max time to determine + * if the interrupt happened when it was supposed to + * thus MAX bound and MIN bound + * MAX_THDS and MIN_THDS are #defined to give it some flexibility + * from the user + */ + + if (EXPECT_LLU_LT((mbt[cos_cpuid()].g.cyc - s), (res / 4 * MAX_THDS), "Multi Budget: G") || + EXPECT_LLU_LT(mbt[cos_cpuid()].g.cyc - s, res / 4 * MAX_THDS, "Multi Budget: G MAX Bound") || + EXPECT_LLU_LT(res / 4 * MIN_THDS, mbt[cos_cpuid()].g.cyc - s, "Multi Budget: G MIN Bound") || + EXPECT_LLU_LT(mbt[cos_cpuid()].c.cyc - s, res / 2 * MAX_THDS, "Multi Budget: C MAX Bound") || + EXPECT_LLU_LT(res / 2 * MIN_THDS, mbt[cos_cpuid()].c.cyc - s, "Multi Budget: C MIN Bound") || + EXPECT_LLU_LT(mbt[cos_cpuid()].p.cyc - s, res * MAX_THDS, "Multi Budget: P MAX Bound") || + EXPECT_LLU_LT(res * MIN_THDS, mbt[cos_cpuid()].p.cyc - s, "Multi Budget: P MIN BOund")) { + return; + } + } + } + PRINTC("\t%s: \t\tSuccess\n", 
"Timer => Hierarchical Budget"); } void test_tcap_budgets(void) { - /* single-level budgets test */ - test_tcap_budgets_single(); + /* single-level budgets test */ + test_tcap_budgets_single(); - /* multi-level budgets test */ - test_tcap_budgets_multi(); + /* multi-level budgets test */ + test_tcap_budgets_multi(); } diff --git a/src/components/implementation/tests/kernel_tests/k_test_thd.c b/src/components/implementation/tests/kernel_tests/k_test_thd.c index 90483b39fb..a4cffad5c7 100644 --- a/src/components/implementation/tests/kernel_tests/k_test_thd.c +++ b/src/components/implementation/tests/kernel_tests/k_test_thd.c @@ -11,50 +11,50 @@ static int failure = 0; static void test_thd_arg(void *d) { - int ret = 0; + int ret = 0; - if (EXPECT_LL_NEQ((int)d, THD_ARG, "Thread Creation: Argument Incorrect")) failure = 1; - while (1) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - PRINTC("Error, shouldn't get here!\n"); + if (EXPECT_LL_NEQ((int)d, THD_ARG, "Thread Creation: Argument Incorrect")) failure = 1; + while (1) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + PRINTC("Error, shouldn't get here!\n"); } static void test_thds_create_switch(void) { - thdcap_t ts; - intptr_t i = THD_ARG; - int ret; - - ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_thd_arg, (void *)i); - if (EXPECT_LL_LT(1, ts, "Thread Creation: Cannot Allocate")) { - return; - } - ret = cos_thd_switch(ts); - EXPECT_LL_NEQ(0, ret, "COS Switch Error"); - - CHECK_STATUS_FLAG(); - PRINTC("\t%s: \t\t\tSuccess\n", "THD => Creation & ARG"); - EXIT_FN(); + thdcap_t ts; + intptr_t i = THD_ARG; + int ret; + + ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_thd_arg, (void *)i, 0, 0); + if (EXPECT_LL_LT(1, ts, "Thread Creation: Cannot Allocate")) { + return; + } + ret = cos_thd_switch(ts); + EXPECT_LL_NEQ(0, ret, "COS Switch Error"); + + CHECK_STATUS_FLAG(); + PRINTC("\t%s: \t\t\tSuccess\n", "THD => Creation & ARG"); + EXIT_FN(); } static void thd_fn_mthds_ring(void *d) { - 
int ret; + int ret; - if (count != (int) d) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + if (count != (int) d) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - int next = (++count) % TEST_NTHDS; - if (!next) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + int next = (++count) % TEST_NTHDS; + if (!next) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - ret = cos_thd_switch(thd_test[next]); - if (EXPECT_LL_NEQ(0, ret, "Thread Ring: COS Switch Error")) failure = 1; + ret = cos_thd_switch(thd_test[next]); + if (EXPECT_LL_NEQ(0, ret, "Thread Ring: COS Switch Error")) failure = 1; - while (1) { - cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - } - EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); - assert(0); + while (1) { + cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + } + EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); + assert(0); } /* Ring Multithreaded Test @@ -66,39 +66,39 @@ thd_fn_mthds_ring(void *d) static void test_mthds_ring(void) { - int i, ret; + int i, ret; - count = 0; + count = 0; - for (i = 0; i < TEST_NTHDS; i++) { - thd_test[i] = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn_mthds_ring, (void *)i); - if (EXPECT_LL_LT(1, thd_test[i], "Thread Ring: Cannot Allocate")) { - return; - } - } + for (i = 0; i < TEST_NTHDS; i++) { + thd_test[i] = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn_mthds_ring, (void *)i, 0, 0); + if (EXPECT_LL_LT(1, thd_test[i], "Thread Ring: Cannot Allocate")) { + return; + } + } - ret = cos_thd_switch(thd_test[0]); - EXPECT_LL_NEQ(0, ret, "Thread Ring: COS Switch Error"); + ret = cos_thd_switch(thd_test[0]); + EXPECT_LL_NEQ(0, ret, "Thread Ring: COS Switch Error"); - if (EXPECT_LL_NEQ(count, TEST_NTHDS, "Thread Ring: Failure # of THDS")) { - return; - } + if (EXPECT_LL_NEQ(count, TEST_NTHDS, "Thread Ring: Failure # of THDS")) { + return; + } - CHECK_STATUS_FLAG(); - PRINTC("\t%s: \t\t\tSuccess\n", "THD => Switch Cyclic" ); - EXIT_FN(); + CHECK_STATUS_FLAG(); + 
PRINTC("\t%s: \t\t\tSuccess\n", "THD => Switch Cyclic" ); + EXIT_FN(); } static void thd_fn_mthds_classic(void *d) { - cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - while (1) { - cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - } - EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); - assert(0); + while (1) { + cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + } + EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); + assert(0); } /* Classic Multithreaded Test @@ -109,31 +109,31 @@ thd_fn_mthds_classic(void *d) static void test_mthds_classic(void) { - thdcap_t ts; - int i, ret; - - ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn_mthds_classic, NULL); - if (EXPECT_LL_LT(1, ts, "Thread Classic: Cannot Allocate")) { - return; - } - - for (i = 0; i < ITER; i++) { - ret = cos_thd_switch(ts); - if(EXPECT_LL_NEQ(0, ret, "Thread Classic: COS Switch Error")) return; - } - CHECK_STATUS_FLAG(); - PRINTC("\t%s: \t\tSuccess\n", "THD => Switch in pairs"); - EXIT_FN(); + thdcap_t ts; + int i, ret; + + ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn_mthds_classic, NULL, 0, 0); + if (EXPECT_LL_LT(1, ts, "Thread Classic: Cannot Allocate")) { + return; + } + + for (i = 0; i < ITER; i++) { + ret = cos_thd_switch(ts); + if(EXPECT_LL_NEQ(0, ret, "Thread Classic: COS Switch Error")) return; + } + CHECK_STATUS_FLAG(); + PRINTC("\t%s: \t\tSuccess\n", "THD => Switch in pairs"); + EXIT_FN(); } static void thd_tls(void *d) { - if (EXPECT_LLU_NEQ((long unsigned)tls_get(0), (long unsigned)tls_test[cos_cpuid()][(int)d], - "Thread TLS: ARG not correct")) failure = 1; - while (1) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); - assert(0); + if (EXPECT_LLU_NEQ((long unsigned)tls_get(0), (long unsigned)tls_test[cos_cpuid()][(int)d], + "Thread TLS: ARG not correct")) failure = 1; + while (1) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + 
EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); + assert(0); } /* Test the TLS support @@ -142,32 +142,32 @@ thd_tls(void *d) static void test_thds_tls(void) { - thdcap_t ts[TEST_NTHDS]; - intptr_t i; - int ret; - - for (i = 0; i < TEST_NTHDS; i++) { - ts[i] = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_tls, (void *)i); - if (EXPECT_LL_LT(1, ts[i], "Thread TLS: Cannot Allocate")) { - return; - } - tls_test[cos_cpuid()][i] = i; - cos_thd_mod(&booter_info, ts[i], &tls_test[cos_cpuid()][i]); - ret = cos_thd_switch(ts[i]); - if (EXPECT_LL_NEQ(0, ret, "Thread TLS: COS Switch Error")) return; - } - - CHECK_STATUS_FLAG(); - PRINTC("\t%s: \t\t\tSuccess\n", "THD => Creation & TLS"); - EXIT_FN(); + thdcap_t ts[TEST_NTHDS]; + intptr_t i; + int ret; + + for (i = 0; i < TEST_NTHDS; i++) { + ts[i] = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_tls, (void *)i, 0, 0); + if (EXPECT_LL_LT(1, ts[i], "Thread TLS: Cannot Allocate")) { + return; + } + tls_test[cos_cpuid()][i] = i; + cos_thd_mod(&booter_info, ts[i], &tls_test[cos_cpuid()][i]); + ret = cos_thd_switch(ts[i]); + if (EXPECT_LL_NEQ(0, ret, "Thread TLS: COS Switch Error")) return; + } + + CHECK_STATUS_FLAG(); + PRINTC("\t%s: \t\t\tSuccess\n", "THD => Creation & TLS"); + EXIT_FN(); } void test_thds(void) { - test_thds_create_switch(); - test_thds_tls(); - test_mthds_classic(); - test_mthds_ring(); + test_thds_create_switch(); + test_thds_tls(); + test_mthds_classic(); + test_mthds_ring(); } diff --git a/src/components/implementation/tests/kernel_tests/kernel_test_booter.c b/src/components/implementation/tests/kernel_tests/kernel_test_booter.c index 4998ee861b..50ebeb8d59 100644 --- a/src/components/implementation/tests/kernel_tests/kernel_test_booter.c +++ b/src/components/implementation/tests/kernel_tests/kernel_test_booter.c @@ -14,7 +14,7 @@ int count = 0; void term_fn(void *d) { - SPIN(); + SPIN(); } static int test_done[NUM_CPU]; @@ -22,52 +22,52 @@ static int test_done[NUM_CPU]; void cos_init(void) 
{ - int cycs, i; - static int first_init = 1, init_done = 0; - - cycs = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - printc("\t%d cycles per microsecond\n", cycs); - - if (first_init) { - first_init = 0; - cos_meminfo_init(&booter_info.mi, BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_compinfo_init(&booter_info, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, - (vaddr_t)cos_get_heap_ptr(), BOOT_CAPTBL_FREE, &booter_info); - init_done = 1; - } - - while (!init_done); - - termthd[cos_cpuid()] = cos_thd_alloc(&booter_info, booter_info.comp_cap, term_fn, NULL); - assert(termthd[cos_cpuid()]); - PRINTC("Kernel Tests\n"); - printc("\nUnit Test Started:\n\n"); - - /* Kernel Tests */ - cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - test_timer(); - test_tcap_budgets(); - test_2timers(); - test_thds(); - test_mem_alloc(); - test_async_endpoints(); - test_inv(); - test_captbl_expands(); - - printc("\nuBenchamarks Started:\n\n"); - - test_run_perf_kernel(); - - /* NOTE: This is just to make sense of the output on HW! To understand that microbooter runs to completion on all cores! 
*/ - test_done[cos_cpuid()] = 1; - for (i = 0; i < NUM_CPU; i++) { - while (!test_done[i]) ; - } - - printc("\n"); - PRINTC("Kernel Tests done.\n"); - - cos_thd_switch(termthd[cos_cpuid()]); - - return; + int cycs, i; + static int first_init = 1, init_done = 0; + + cycs = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + printc("\t%d cycles per microsecond\n", cycs); + + if (first_init) { + first_init = 0; + cos_meminfo_init(&booter_info.mi, BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_compinfo_init(&booter_info, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, + (vaddr_t)cos_get_heap_ptr(), LLBOOT_CAPTBL_FREE, &booter_info); + init_done = 1; + } + + while (!init_done); + + termthd[cos_cpuid()] = cos_thd_alloc(&booter_info, booter_info.comp_cap, term_fn, NULL, 0, 0); + assert(termthd[cos_cpuid()]); + PRINTC("Kernel Tests\n"); + printc("\nUnit Test Started:\n\n"); + + /* Kernel Tests */ + cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + test_timer(); + test_tcap_budgets(); + test_2timers(); + test_thds(); + test_mem_alloc(); + test_async_endpoints(); + test_inv(); + test_captbl_expands(); + + printc("\nuBenchamarks Started:\n\n"); + + test_run_perf_kernel(); + + /* NOTE: This is just to make sense of the output on HW! To understand that microbooter runs to completion on all cores! 
*/ + test_done[cos_cpuid()] = 1; + for (i = 0; i < NUM_CPU; i++) { + while (!test_done[i]) ; + } + + printc("\n"); + PRINTC("Kernel Tests done.\n"); + + cos_thd_switch(termthd[cos_cpuid()]); + + return; } diff --git a/src/components/implementation/tests/kernel_tests/kernel_tests.h b/src/components/implementation/tests/kernel_tests/kernel_tests.h index 82741bef12..4668e89297 100644 --- a/src/components/implementation/tests/kernel_tests/kernel_tests.h +++ b/src/components/implementation/tests/kernel_tests/kernel_tests.h @@ -10,22 +10,22 @@ #undef assert /* On assert, immediately switch to the "exit" thread */ #define assert(node) \ - do { \ - if (unlikely(!(node))) { \ - debug_print("assert error in @ "); \ - cos_thd_switch(termthd[cos_cpuid()]); \ - } \ - } while (0) + do { \ + if (unlikely(!(node))) { \ + debug_print("assert error in @ "); \ + cos_thd_switch(termthd[cos_cpuid()]); \ + } \ + } while (0) #define EXIT_FN() \ - exit_fn: return; +exit_fn: return; #define CHECK_STATUS_FLAG() \ - do { \ - if (failure) { \ - goto exit_fn; \ - } \ - } while (0) + do { \ + if (failure) { \ + goto exit_fn; \ + } \ + } while (0) #include #include @@ -56,30 +56,30 @@ extern unsigned long thd_test[TEST_NTHDS]; extern int num, den, count; struct results { - long long unsigned avg; - long long unsigned max; - long long unsigned min; - long long unsigned sd; - int sz; - long long unsigned p90tile; - long long unsigned p95tile; - long long unsigned p99tile; + long long unsigned avg; + long long unsigned max; + long long unsigned min; + long long unsigned sd; + int sz; + long long unsigned p90tile; + long long unsigned p95tile; + long long unsigned p99tile; }; -static unsigned long + static unsigned long tls_get(size_t off) { - unsigned long val; + unsigned long val; - __asm__ __volatile__("movl %%gs:(%1), %0" : "=r"(val) : "r"(off) :); + __asm__ __volatile__("movl %%gs:(%1), %0" : "=r"(val) : "r"(off) :); - return val; + return val; } -static void + static void tls_set(size_t off, 
unsigned long val) { - __asm__ __volatile__("movl %0, %%gs:(%1)" : : "r"(val), "r"(off) : "memory"); + __asm__ __volatile__("movl %0, %%gs:(%1)" : : "r"(val), "r"(off) : "memory"); } extern void test_run_perf_kernel(void); diff --git a/src/components/implementation/tests/micro_chan/Makefile b/src/components/implementation/tests/micro_chan/Makefile new file mode 100644 index 0000000000..9ecb1154a8 --- /dev/null +++ b/src/components/implementation/tests/micro_chan/Makefile @@ -0,0 +1,8 @@ +COMPONENT=micro_chan.o +INTERFACES= +DEPENDENCIES= +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_fprr -lsl_thd_static_backend -lsl_blkpt + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/micro_chan/unit_schedlib.c b/src/components/implementation/tests/micro_chan/unit_schedlib.c new file mode 100644 index 0000000000..bfc8c2340d --- /dev/null +++ b/src/components/implementation/tests/micro_chan/unit_schedlib.c @@ -0,0 +1,497 @@ +/* + * Copyright 2016, Phani Gadepalli and Gabriel Parmer, GWU, gparmer@gwu.edu. + * + * This uses a two clause BSD License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Iterations, channels */ +#define CHAN_ITER 1000000 +#define NCHANTHDS 2 +#define CHAN_BATCH 3 + +unsigned long long iters[CHAN_ITER] = { 0 }; + +CRT_CHAN_STATIC_ALLOC(c0, int, 4); +CRT_CHAN_TYPE_PROTOTYPES(test, int, 4); +struct crt_lock lock; + +unsigned int one_only = 0; + +typedef enum { CHILLING = 0, RECVING, SENDING } actions_t; +unsigned long status[NCHANTHDS]; +unsigned long cnts[NCHANTHDS] = {0, }; + +/* sl also defines a SPIN macro */ +#undef SPIN +#define SPIN(iters) \ + do { \ + if (iters > 0) { \ + for (; iters > 0; iters--) \ + ; \ + } else { \ + while (1) \ + ; \ + } \ + } while (0) + + +#define N_TESTTHDS 2 +#define WORKITERS 100 + +#define N_TESTTHDS_PERF 2 +#define PERF_ITERS 1000 + +static volatile cycles_t mid_cycs = 0; +static volatile int testing = 1; + +void +test_thd_perffn(void *data) +{ + cycles_t start_cycs = 0, end_cycs = 0, wc_cycs = 0, total_cycs = 0; + unsigned int i = 0; + + rdtscll(start_cycs); + sl_thd_yield(0); + rdtscll(end_cycs); + assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + for (i = 0; i < PERF_ITERS; i++) { + cycles_t diff1_cycs = 0, diff2_cycs = 0; + + mid_cycs = 0; + rdtscll(start_cycs); + sl_thd_yield(0); + rdtscll(end_cycs); + assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + diff1_cycs = mid_cycs - start_cycs; + diff2_cycs = end_cycs - mid_cycs; + + if (diff1_cycs > wc_cycs) wc_cycs = diff1_cycs; + if (diff2_cycs > wc_cycs) wc_cycs = diff2_cycs; + total_cycs += (diff1_cycs + diff2_cycs); + } + + PRINTC("SWITCH UBENCH: avg: %llu, wc: %llu, iters:%u\n", (total_cycs / (2 * PERF_ITERS)), wc_cycs, PERF_ITERS); + testing = 0; + /* done testing! let the spinfn cleanup! 
*/ + sl_thd_yield(0); + + sl_thd_exit(); +} + +void +test_thd_spinfn(void *data) +{ + while (likely(testing)) { + rdtscll(mid_cycs); + sl_thd_yield(0); + } + + sl_thd_exit(); +} +/* Get the numbers */ +volatile unsigned long long start_time; +volatile unsigned long long end_time; +//void +//test_thd_fn(void *data) +//{ +// cycles_t time; +// cycles_t iters; +// int rounds = 0; +// if (data!=0) { +// while (1) { +// rounds++; +// rdtscll(start_time); +// sl_thd_yield(3); +// rdtscll(end_time); +// print_uint((unsigned long)(end_time-start_time)); +// print_string("\r\n"); +// if(rounds == 10000) +// while(1); +// } +// } +// else { +// while (1) { +// sl_thd_yield(4); +// } +// } +//} + +#define RCV 0 +#define SND 1 + +void +test_thd_fn(void *data) +{ + cycles_t time; +// cycles_t iters; + cycles_t total = 0, max = 0, diff; + int send; + int recv; + int rounds = 0; + if (data==RCV) { + while (1) { + rounds ++; + crt_chan_recv_test(c0, &recv); + rdtscll(end_time); + assert(ps_faa(&one_only, -1) == 1); + + diff = end_time - start_time; + if (diff > max) max = diff; + total += diff; + iters[rounds - 1] = diff; + //printc("%llu, ", diff); + + if (rounds == CHAN_ITER) { + int i; + + for (i = 0; i < CHAN_ITER; i++) { + printc("%llu\n", iters[i]); + } + printc("\nAvg: %llu, Wc:%llu\n", total / CHAN_ITER, max); + + while (1) ; + } + //print_uint((unsigned long)(end_time-start_time)); + //print_string("\r\n"); + //if(rounds == 10000) + // while(1); + } + } + else { + send = 0x1234; + while (1) { + assert(ps_faa(&one_only, 1) == 0); + rdtscll(start_time); + crt_chan_send_test(c0, &send); + } + } +} + +//void +//test_thd_fn(void *data) +//{ +// cycles_t time; +// cycles_t iters; +// int send; +// int recv; +// int rounds = 0; +// +// if (data!=0) { +// while (1) { +// rounds ++; +// +// crt_lock_take(&lock); +// sl_thd_yield(0); +// rdtscll(end_time); +// crt_lock_release(&lock); +// sl_thd_yield(0); +// +// print_uint((unsigned long)(end_time-start_time)); +// 
print_string("\r\n"); +// if(rounds == 10000) +// while(1); +// } +// } +// else { +// crt_lock_init(&lock); +// while (1) { +// rdtscll(start_time); +// crt_lock_take(&lock); +// crt_lock_release(&lock); +// sl_thd_yield(0); +// } +// } +//} +// +//volatile unsigned long long int_tsc; +//void +//test_thd_fn(capid_t cap, void *data) +//{ +// cycles_t time; +// cycles_t iters; +// int send; +// int recv; +// unsigned int result; +// int rounds = 0; +// if (data==0) { +// while (1) { +// //print_string("*"); +// } +// } +// else { +// /* Higher priority on this branch */ +// cos_hw_attach(BOOT_CAPTBL_SELF_INITHW_BASE, 63, sl_thd_rcvcap(sl_thd_lkup(sl_thdid()))); +// cos_hw_custom(BOOT_CAPTBL_SELF_INITHW_BASE); +// while (1) { +// /* We are doing this receive anyway */ +// cos_rcv(sl_thd_rcvcap(sl_thd_lkup(sl_thdid())), 0); +// rdtscll(end_time); +// addr[rounds] = (unsigned int)(end_time-int_tsc); +// rounds ++; +// if(rounds == 10000) +// { +// for (rounds = 0; rounds < 10000; rounds ++) +// { +// print_uint(addr[rounds]); +// print_string("\r\n"); +// } +// while(1); +// } +// } +// } +//} + +// int rounds = 0; +//void +//test_thd_fn(capid_t cap, void *data) +//{ +// cycles_t time; +// cycles_t iters; +// int send; +// int recv; +// unsigned int result; +// /* if (data == 0) { +// while (1) { +// print_string("*"); +// } +// } +// else */if (data == 0) +// { +// /* Higher priority on this branch - receiving stuff from interrupt */ +// cos_hw_attach(BOOT_CAPTBL_SELF_INITHW_BASE, 63, sl_thd_rcvcap(sl_thd_lkup(sl_thdid()))); +// cos_hw_custom(BOOT_CAPTBL_SELF_INITHW_BASE); +// while (1) { +//// print_string(" :1a: \r\n"); +// /* We are doing this receive anyway */ +//// sl_thd_rcv(RCV_ULONLY); +// cos_rcv(sl_thd_rcvcap(sl_thd_lkup(sl_thdid())), 0); +//// print_string(" :1b: "); +// /* Send to the guy immediately */ +// crt_chan_send_test(c0, &send); +// //sl_thd_wakeup(4); +//// print_string(" :1c: "); +// //rdtscll(end_time); +// //addr[rounds] = (unsigned 
int)(end_time-int_tsc); +// } +// } +// else { +// while(1) { +// /* Finally, we send what we receive here */ +//// print_string(" :2a: "); +// //sl_thd_block(0); +// crt_chan_recv_test(c0, &recv); +//// print_string(" :2b: "); +// rdtscll(end_time); +// //print_uint(addr[rounds]); +// //print_string(" - "); +// addr[rounds] = (unsigned int)(end_time-int_tsc); +// //print_uint(addr[rounds]); +// //print_string("\r\n"); +// rounds ++; +// if(rounds == 10000) +// { +// for (rounds = 0; rounds < 10000; rounds ++) +// { +// print_uint(addr[rounds]); +// print_string("\r\n"); +// } +// while(1); +// } +// } +// } +//} + +//void +//test_yield_perf(void) +//{ +// int i; +// struct sl_thd *threads[N_TESTTHDS_PERF]; +// union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 31}}; +// +// for (i = 0; i < N_TESTTHDS_PERF; i++) { +// if (i == 1) threads[i] = sl_thd_alloc(test_thd_perffn, (void *)&threads[0]); +// else threads[i] = sl_thd_alloc(test_thd_spinfn, NULL); +// assert(threads[i]); +// sl_thd_param_set(threads[i], sp.v); +// PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); +// } +//} + +//void +//test_yields(void) +//{ +// int i; +// struct sl_thd * threads[N_TESTTHDS]; +// union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 10}}; +// +// for (i = 0; i < N_TESTTHDS; i++) { +// threads[i] = sl_thd_alloc(test_thd_fn, (void *)i); +// assert(threads[i]); +// sl_thd_param_set(threads[i], sp.v); +// PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); +// } +//} + +void +test_yields(void) +{ + int i; + struct sl_thd * threads[N_TESTTHDS]; + union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 0}}; + + start_time = end_time = 0; + + for (i = 0; i < N_TESTTHDS; i++) { + threads[i] = sl_thd_alloc(test_thd_fn, (void *)i); + assert(threads[i]); + if (i == RCV) sp.c.value = 2; + else sp.c.value = 5; + sl_thd_param_set(threads[i], sp.v); + PRINTC("Thread %u:%lu 
created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); + //sl_thd_yield_thd(threads[i]); + } + assert(N_TESTTHDS == 2); + //crt_chan_p2p_init_test(c0, threads[SND], threads[RCV]); + crt_chan_init_test(c0); +} + +//void +//test_yields(void) +//{ +// int i; +// struct sl_thd * threads[N_TESTTHDS]; +// union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 10}}; +// +// crt_chan_init_test(&c0); +// for (i = 0; i < N_TESTTHDS; i++) { +// threads[i] = sl_thd_aep_alloc(test_thd_fn, (void *)i, 0, 0, 0, 0); +// assert(threads[i]); +// if(i != 0) +// sp.c.value = 9; +// sl_thd_param_set(threads[i], sp.v); +// PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); +// } +//} + +void +test_high(void *data) +{ + struct sl_thd *t = data; + + while (1) { + sl_thd_yield(sl_thd_thdid(t)); + printc("h"); + } +} + +void +test_low(void *data) +{ + while (1) { + int workiters = WORKITERS * 10; + SPIN(workiters); + printc("l"); + } +} + +void +test_blocking_directed_yield(void) +{ + struct sl_thd * low, *high; + union sched_param_union sph = {.c = {.type = SCHEDP_PRIO, .value = 5}}; + union sched_param_union spl = {.c = {.type = SCHEDP_PRIO, .value = 10}}; + + low = sl_thd_alloc(test_low, NULL); + high = sl_thd_alloc(test_high, low); + sl_thd_param_set(low, spl.v); + sl_thd_param_set(high, sph.v); +} + +#define TEST_ITERS 1000 + +void +test_high_wakeup(void *data) +{ + unsigned int toggle = 0, iters = 0; + struct sl_thd *t = data; + cycles_t start = sl_now(); + + while (1) { + cycles_t timeout = sl_now() + sl_usec2cyc(100); + + if (toggle % 10 == 0) + printc(".h:%llums.", sl_cyc2usec(sl_thd_block_timeout(0, timeout))); + else + printc(".h:%up.", sl_thd_block_periodic(0)); + + toggle++; + iters++; + + if (iters == TEST_ITERS) { + printc("\nTest done! (Duration: %llu ms)\n", sl_cyc2usec(sl_now() - start) / 1000); + printc("Deleting all threads. 
Idle thread should take over!\n"); + sl_thd_free(t); + sl_thd_free(sl_thd_curr()); + + /* should not be scheduled. */ + assert(0); + } + } +} + +void +test_timeout_wakeup(void) +{ + struct sl_thd * low, *high; + union sched_param_union sph = {.c = {.type = SCHEDP_PRIO, .value = 5}}; + union sched_param_union spl = {.c = {.type = SCHEDP_PRIO, .value = 10}}; + union sched_param_union spw = {.c = {.type = SCHEDP_WINDOW, .value = 1000}}; + + low = sl_thd_alloc(test_low, NULL); + sl_thd_param_set(low, spl.v); + sl_thd_param_set(low, spw.v); + + high = sl_thd_alloc(test_high_wakeup, low); + sl_thd_param_set(high, sph.v); + sl_thd_param_set(high, spw.v); +} + +void +cos_init(void) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + + printc("Unit-test for the scheduling library (sl)\n"); + /* This is a hack, we know where the heap is */ + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_llinit(); + cos_dcb_info_init_curr(); + sl_init(SL_MIN_PERIOD_US*50); + + //test_yield_perf(); + test_yields(); + //test_blocking_directed_yield(); + //test_timeout_wakeup(); + + sl_sched_loop_nonblock(); + + assert(0); + + return; +} diff --git a/src/components/implementation/tests/micro_ipi/micro_ipi.c b/src/components/implementation/tests/micro_ipi/micro_ipi.c index 2a3a180b20..45ce1f2198 100644 --- a/src/components/implementation/tests/micro_ipi/micro_ipi.c +++ b/src/components/implementation/tests/micro_ipi/micro_ipi.c @@ -49,13 +49,13 @@ hiprio_c0_lat_fn(arcvcap_t r, void *d) assert(snd); while (1) { - int pending = 0, rcvd = 0, ret = 0; + int pending = 0, ret = 0; cycles_t now; if (unlikely(testing == 0)) break; - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); - assert(pending == 0 && rcvd == 1); + pending = cos_rcv(r, 0); + assert(pending == 0); rdtscll(now); #ifdef RCV_UB_TEST @@ -94,7 +94,7 @@ hiprio_cn_lat_fn(arcvcap_t r, void *d) while (1) { 
cycles_t st, en, rpcen; - int pending = 0, rcvd = 0, ret = 0; + int pending = 0, ret = 0; if (unlikely(testing == 0)) break; @@ -119,8 +119,8 @@ hiprio_cn_lat_fn(arcvcap_t r, void *d) #endif #ifndef CN_SND_ONLY - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); - assert(pending == 0 && rcvd == 1); + pending = cos_rcv(r, 0); + assert(pending == 0); rdtscll(rpcen); #ifdef RPC_UB_TEST iters ++; @@ -297,12 +297,12 @@ loprio_rate_c0_fn(arcvcap_t r, void *d) while (testing == 0) ; while (1) { - int pending = 0, rcvd = 0, ret = 0; + int pending = 0, ret = 0; if (unlikely(testing == 0)) break; - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); - assert(pending == 0 && rcvd == 1); + pending = cos_rcv(r, 0); + assert(pending == 0); ret = cos_asnd(snd, 0); assert(ret == 0); @@ -320,7 +320,7 @@ hiprio_rate_cn_fn(arcvcap_t r, void *d) while (testing == 0) ; while (1) { - int pending = 0, rcvd = 0, ret = 0; + int pending = 0, ret = 0; if (unlikely(testing == 0)) break; @@ -330,8 +330,8 @@ hiprio_rate_cn_fn(arcvcap_t r, void *d) assert(ret == 0); #ifndef CN_SND_ONLY - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); - assert(pending == 0 && rcvd == 1); + pending = cos_rcv(r, 0); + assert(pending == 0); #endif } @@ -412,7 +412,7 @@ static void c0_ipc_fn(arcvcap_t r, void *d) { asndcap_t snd = c0_cn_asnd[cos_cpuid()]; - int iters; + int iters = 0; cycles_t rtt_total = 0, one_total = 0, rtt_wc = 0, one_wc = 0, rone_total = 0, rone_wc = 0; PRINTC("Testing Cross-core IPC:\n"); @@ -423,7 +423,7 @@ c0_ipc_fn(arcvcap_t r, void *d) testing = 1; while (1) { - int pending = 0, rcvd = 0, ret = 0; + int pending = 0, ret = 0; cycles_t rtt_diff, one_diff = 0, rone_diff = 0; rdtscll(c0_start); @@ -431,8 +431,8 @@ c0_ipc_fn(arcvcap_t r, void *d) assert(ret == 0); rdtscll(c0_mid); - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); - assert(pending == 0 && rcvd == 1); + pending = cos_rcv(r, 0); + assert(pending == 0); rdtscll(c0_end); rtt_diff = (c0_end - c0_start); @@ -466,13 +466,13 @@ c1_ipc_fn(arcvcap_t r, 
void *d) while (testing == 0) ; while (1) { - int pending = 0, rcvd = 0, ret = 0; + int pending = 0, ret = 0; if (unlikely(testing == 0)) break; rdtscll(c1_start); - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); - assert(pending == 0 && rcvd == 1); + pending = cos_rcv(r, 0); + assert(pending == 0); rdtscll(c1_mid); ret = cos_asnd(snd, 0); @@ -487,7 +487,7 @@ static void test_ipc_setup(void) { #ifdef TEST_IPC - static volatile int cdone[NUM_CPU] = { 0 }; + static volatile unsigned long cdone[NUM_CPU] = { 0 }; int i, ret; struct sl_thd *t = NULL; asndcap_t snd = 0; diff --git a/src/components/implementation/tests/micro_xcores/micro_xcores.c b/src/components/implementation/tests/micro_xcores/micro_xcores.c index 62c22be39e..7a4aebf008 100644 --- a/src/components/implementation/tests/micro_xcores/micro_xcores.c +++ b/src/components/implementation/tests/micro_xcores/micro_xcores.c @@ -29,13 +29,13 @@ cos_init(void) first_init = 0; cos_meminfo_init(&booter_info.mi, BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_compinfo_init(&booter_info, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, - (vaddr_t)cos_get_heap_ptr(), BOOT_CAPTBL_FREE, &booter_info); + (vaddr_t)cos_get_heap_ptr(), LLBOOT_CAPTBL_FREE, &booter_info); init_done = 1; } while (!init_done); - termthd[cos_cpuid()] = cos_thd_alloc(&booter_info, booter_info.comp_cap, term_fn, NULL); + termthd[cos_cpuid()] = cos_thd_alloc(&booter_info, booter_info.comp_cap, term_fn, NULL, 0, 0); assert(termthd[cos_cpuid()]); if (cos_cpuid() == 0) PRINTC("Micro Booter Xcore started.\n"); diff --git a/src/components/implementation/tests/micro_xcores/test_ipi_interference.c b/src/components/implementation/tests/micro_xcores/test_ipi_interference.c index f301f9d873..fbe59951ca 100644 --- a/src/components/implementation/tests/micro_xcores/test_ipi_interference.c +++ b/src/components/implementation/tests/micro_xcores/test_ipi_interference.c @@ -2,7 +2,7 @@ #include "micro_xcores.h" -extern void 
sched_events_clear(int* rcvd, thdid_t* tid, int* blocked, cycles_t* cycles, tcap_time_t* thd_timeout); +extern void sched_events_clear(thdid_t* tid, int* blocked, cycles_t* cycles, tcap_time_t* thd_timeout); /* Test RCV 2: Close Loop at higher priority => Measure Kernel involvement */ @@ -38,7 +38,7 @@ test_rcv(arcvcap_t r) { int pending = 0, rcvd = 0; - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); + pending = cos_rcv(r, 0); assert(pending == 0); total_rcvd[cos_cpuid()] += rcvd; @@ -76,13 +76,13 @@ test_rcv_fn(void *d) static void test_sched_loop(void) { - int blocked, rcvd, pending, ret; + int blocked, pending, ret; cycles_t cycles; tcap_time_t timeout, thd_timeout; thdid_t thdid; /* Clear Scheduler */ - sched_events_clear(&rcvd, &thdid, &blocked, &cycles, &thd_timeout); + sched_events_clear(&thdid, &blocked, &cycles, &thd_timeout); while (1) { if(cos_cpuid() == TEST_RCV_CORE) { @@ -90,8 +90,8 @@ test_sched_loop(void) ret = cos_switch(spinner_thd[cos_cpuid()], BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_PRIO_MAX + 2, 0, 0, 0); } while (ret == -EAGAIN); } - while ((pending = cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_ALL_PENDING, 0, - &rcvd, &thdid, &blocked, &cycles, &thd_timeout)) >= 0) { + while ((pending = cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, + &thdid, &blocked, &cycles, &thd_timeout)) >= 0) { if (!thdid) goto done; assert(thdid == tid[cos_cpuid()]); blkd[cos_cpuid()] = blocked; @@ -181,6 +181,7 @@ test_ipi_interference(void) thdcap_t t = 0; tcap_t tcc = 0; + if (NUM_CPU <= 1) return; if (cos_cpuid() == TEST_RCV_CORE) { @@ -190,7 +191,7 @@ test_ipi_interference(void) if (EXPECT_LL_LT(1, tcc, "IPI Interference: TCAP Allocation")) return; - t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_rcv_fn, NULL); + t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_rcv_fn, NULL, 0, 0); if (EXPECT_LL_LT(1, t, "IPI Inteference: Thread Allocation")) return; @@ -205,7 +206,7 @@ test_ipi_interference(void) rcv[cos_cpuid()] = r; while 
(!rcv[TEST_SND_CORE]) ; - t = cos_thd_alloc(&booter_info, booter_info.comp_cap, rcv_spinner, NULL); + t = cos_thd_alloc(&booter_info, booter_info.comp_cap, rcv_spinner, NULL, 0, 0); if (EXPECT_LL_LT(1, t, "IPI Interference: Thread Allocation")) return; @@ -230,7 +231,7 @@ test_ipi_interference(void) if (EXPECT_LL_LT(1, tcc, "IPI Interference: TCAP Allocation")) return; - t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_asnd_fn, NULL); + t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_asnd_fn, NULL, 0, 0); if (EXPECT_LL_LT(1, t, "IPI Interference: Thread Allocation")) return; diff --git a/src/components/implementation/tests/micro_xcores/test_ipi_n_n.c b/src/components/implementation/tests/micro_xcores/test_ipi_n_n.c index a8d9c9bfb9..a0c90f6510 100644 --- a/src/components/implementation/tests/micro_xcores/test_ipi_n_n.c +++ b/src/components/implementation/tests/micro_xcores/test_ipi_n_n.c @@ -26,7 +26,7 @@ test_ipi_fn(void *d) r = cos_asnd(snd, 1); assert(r == 0); - p = cos_rcv(rcv, RCV_ALL_PENDING, &r); + p = cos_rcv(rcv, 0); assert(p >= 0); } } @@ -49,7 +49,7 @@ test_rcv_crt(void) asndcap_t snd = 0; if (cos_cpuid() == i) continue; - thd = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_ipi_fn, (void *)i); + thd = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_ipi_fn, (void *)i, 0, 0); assert(thd); rcv = cos_arcv_alloc(&booter_info, thd, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, booter_info.comp_cap, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE); @@ -147,7 +147,7 @@ test_ipi_n_n(void) rdtscll(now); prev = now; while (1) { - int blocked, rcvd, pending; + int blocked, pending; cycles_t cycles; tcap_time_t timeout, thd_timeout; thdid_t tid; @@ -158,8 +158,8 @@ test_ipi_n_n(void) if (now - prev > wc) wc = now - prev; test_thd_act(); - while ((pending = cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_ALL_PENDING, 0, - &rcvd, &tid, &blocked, &cycles, &thd_timeout)) >= 0) { + while ((pending = cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, 
+ &tid, &blocked, &cycles, &thd_timeout)) >= 0) { if (!tid) goto done; j = test_find_tid(tid); assert(j >= 0); diff --git a/src/components/implementation/tests/micro_xcores/test_ipi_roundtrip.c b/src/components/implementation/tests/micro_xcores/test_ipi_roundtrip.c index 24dd2c13ad..005c79940d 100644 --- a/src/components/implementation/tests/micro_xcores/test_ipi_roundtrip.c +++ b/src/components/implementation/tests/micro_xcores/test_ipi_roundtrip.c @@ -2,7 +2,7 @@ #include "micro_xcores.h" -extern void sched_events_clear(int* rcvd, thdid_t* tid, int* blocked, cycles_t* cycles, tcap_time_t* thd_timeout); +extern void sched_events_clear(thdid_t* tid, int* blocked, cycles_t* cycles, tcap_time_t* thd_timeout); /* Test Sender Time + Receiver Time Roundtrip */ @@ -29,14 +29,14 @@ static cycles_t results[2][ARRAY_SIZE]; static void test_rcv(arcvcap_t r) { - int pending = 0, rcvd = 0; + int pending = 0; - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); + pending = cos_rcv(r, 0); assert(pending == 0); if (EXPECT_LL_LT(1, r, "IPI Roundtrip: Allocation on RCV")) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - total_rcvd[cos_cpuid()] += rcvd; + total_rcvd[cos_cpuid()] += 1; } static void @@ -64,18 +64,18 @@ test_rcv_fn(void *d) static void test_sched_loop(void) { - int blocked, rcvd, pending, ret; + int blocked, pending, ret; cycles_t cycles; tcap_time_t timeout, thd_timeout; thdid_t thdid; /* Clear Scheduler */ - sched_events_clear(&rcvd, &thdid, &blocked, &cycles, &thd_timeout); + sched_events_clear(&thdid, &blocked, &cycles, &thd_timeout); while (1) { - while ((pending = cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_ALL_PENDING, 0, - &rcvd, &thdid, &blocked, &cycles, &thd_timeout)) >= 0) { + while ((pending = cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, + &thdid, &blocked, &cycles, &thd_timeout)) >= 0) { if (!thdid) goto done; assert(thdid == tid[cos_cpuid()]); blkd[cos_cpuid()] = blocked; @@ -157,6 +157,7 @@ test_ipi_roundtrip(void) thdcap_t t = 0; 
tcap_t tcc = 0; + if (NUM_CPU <= 1) return; if (cos_cpuid() == TEST_RCV_CORE) { @@ -167,7 +168,7 @@ test_ipi_roundtrip(void) return; - t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_rcv_fn, NULL); + t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_rcv_fn, NULL, 0, 0); if (EXPECT_LL_LT(1, t, "IPI ROUNDTRIP: Thread Allocation")) return; @@ -196,7 +197,7 @@ test_ipi_roundtrip(void) /* Test Sender Time */ - t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_asnd_fn, NULL); + t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_asnd_fn, NULL, 0, 0); if (EXPECT_LL_LT(1, t, "IPI ROUNDTRIP: Thread Allocation")) return; diff --git a/src/components/implementation/tests/micro_xcores/test_ipi_switch.c b/src/components/implementation/tests/micro_xcores/test_ipi_switch.c index e13dbda15d..4c368244cc 100644 --- a/src/components/implementation/tests/micro_xcores/test_ipi_switch.c +++ b/src/components/implementation/tests/micro_xcores/test_ipi_switch.c @@ -3,10 +3,10 @@ #include "micro_xcores.h" void -sched_events_clear(int* rcvd, thdid_t* tid, int* blocked, cycles_t* cycles, tcap_time_t* thd_timeout) +sched_events_clear(thdid_t* tid, int* blocked, cycles_t* cycles, tcap_time_t* thd_timeout) { - while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_ALL_PENDING, 0, - rcvd, tid, blocked, cycles, thd_timeout) != 0) + while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, + tid, blocked, cycles, thd_timeout) != 0) ; } @@ -40,12 +40,12 @@ static cycles_t results[ARRAY_SIZE]; static void test_rcv(arcvcap_t r) { - int pending = 0, rcvd = 0; + int pending = 0; - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); + pending = cos_rcv(r, 0); assert(pending == 0); - total_rcvd[cos_cpuid()] += rcvd; + total_rcvd[cos_cpuid()] += 1; } static void @@ -80,16 +80,16 @@ rcv_spinner(void *d) static void test_rcv_1(arcvcap_t r) { - int pending = 0, rcvd = 0; + int pending = 0; - pending = cos_rcv(r, RCV_ALL_PENDING, &rcvd); + pending = cos_rcv(r, 0); 
rdtscll(global_time[1]); time = (global_time[1] - global_time[0]); perfdata_add(&pd, time); assert(pending == 0); - total_rcvd[cos_cpuid()] += rcvd; + total_rcvd[cos_cpuid()] += 1; } static void @@ -138,13 +138,13 @@ test_asnd_fn(void *d) static void test_sched_loop(void) { - int blocked, rcvd, pending, ret; + int blocked, pending, ret; cycles_t cycles; tcap_time_t timeout, thd_timeout; thdid_t thdid; /* Clear Scheduler */ - sched_events_clear(&rcvd, &thdid, &blocked, &cycles, &thd_timeout); + sched_events_clear(&thdid, &blocked, &cycles, &thd_timeout); while (1) { @@ -153,8 +153,8 @@ test_sched_loop(void) ret = cos_switch(spinner_thd[cos_cpuid()], BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_PRIO_MAX + 2, 0, 0, 0); } while (ret == -EAGAIN); } - while ((pending = cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_ALL_PENDING, 0, - &rcvd, &thdid, &blocked, &cycles, &thd_timeout)) >= 0) { + while ((pending = cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, + &thdid, &blocked, &cycles, &thd_timeout)) >= 0) { if (!thdid) goto done; assert(thdid == tid[cos_cpuid()]); @@ -183,6 +183,7 @@ test_ipi_switch(void) thdcap_t t = 0; tcap_t tcc = 0; + if (NUM_CPU <= 1) return; if (cos_cpuid() == TEST_RCV_CORE) { @@ -192,7 +193,7 @@ test_ipi_switch(void) if (EXPECT_LL_LT(1, tcc, "IPI SWITCH: TCAP Allocation")) return; - t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_rcv_fn, NULL); + t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_rcv_fn, NULL, 0, 0); if (EXPECT_LL_LT(1, t, "IPI SWITCH: Thread Allocation")) return; @@ -207,7 +208,7 @@ test_ipi_switch(void) rcv[cos_cpuid()] = r; while (!rcv[TEST_SND_CORE]) ; - t = cos_thd_alloc(&booter_info, booter_info.comp_cap, rcv_spinner, NULL); + t = cos_thd_alloc(&booter_info, booter_info.comp_cap, rcv_spinner, NULL, 0, 0); if (EXPECT_LL_LT(1, t, "IPI SWITCH: Thread Allocation")) return; @@ -226,7 +227,7 @@ test_ipi_switch(void) /* Test RCV1: Corresponding Send */ - t = cos_thd_alloc(&booter_info, 
booter_info.comp_cap, test_asnd_fn, NULL); + t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_asnd_fn, NULL, 0, 0); if (EXPECT_LL_LT(1, t, "IPI SWITCH: Thread Allocation")) return; diff --git a/src/components/implementation/tests/part_test/Makefile b/src/components/implementation/tests/part_test/Makefile new file mode 100644 index 0000000000..3fcb066f74 --- /dev/null +++ b/src/components/implementation/tests/part_test/Makefile @@ -0,0 +1,8 @@ +COMPONENT=part_test.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/part_test/init.c b/src/components/implementation/tests/part_test/init.c new file mode 100644 index 0000000000..3511588c85 --- /dev/null +++ b/src/components/implementation/tests/part_test/init.c @@ -0,0 +1,79 @@ +#include +#include +#include +#include +#include +#include +#include + +int main(void); + +void +cos_exit(int x) +{ + PRINTC("Exit code: %d\n", x); + while (1) ; +} + +static void +cos_main(void *d) +{ + assert(sl_thd_thdid(sl_thd_curr()) == cos_thdid()); + main(); + + while (1) ; +} + +extern void cos_gomp_init(void); + +void +cos_init(void *d) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + int i; + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + static unsigned b1 = 0, b2 = 0, b3 = 0; + + PRINTC("In a parallel program!\n"); + if (ps_cas(&first, NUM_CPU + 1, cos_cpuid())) { + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_llinit(); + } else { + while (!ps_load(&init_done[first])) ; + + cos_defcompinfo_sched_init(); + } + cos_dcb_info_init_curr(); + ps_faa(&init_done[cos_cpuid()], 1); + + /* make sure 
the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! */ + for (i = 0; i < NUM_CPU; i++) { + while (!ps_load(&init_done[i])) ; + } + sl_init(SL_MIN_PERIOD_US*100); + /* barrier, wait for sl_init to be done on all cores */ + ps_faa(&b1, 1); + while (ps_load(&b1) != NUM_CPU) ; + part_init(); + /* barrier, wait for gomp_init to be done on all cores */ + ps_faa(&b2, 1); + while (ps_load(&b2) != NUM_CPU) ; + + if (!cos_cpuid()) { + struct sl_thd *t = NULL; + + t = sl_thd_alloc(cos_main, NULL); + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); + } + /* wait for all cores to reach this point, so all threads wait for main thread to be ready! */ + ps_faa(&b3, 1); + while (ps_load(&b3) != NUM_CPU) ; + + sl_sched_loop_nonblock(); + + PRINTC("Should never get here!\n"); + assert(0); +} + diff --git a/src/components/implementation/tests/part_test/main.c b/src/components/implementation/tests/part_test/main.c new file mode 100644 index 0000000000..b751b97ece --- /dev/null +++ b/src/components/implementation/tests/part_test/main.c @@ -0,0 +1,32 @@ +#include +#include + +#define NTHDS 2 + +void +work_fn(void *d) +{ + PRINTC("Sharing work!\n"); +} + +int +main(void) +{ + struct sl_thd *c = sl_thd_curr(); + struct part_task *p = (struct part_task *)c->part_context, *pt = &main_task; + int n = NTHDS > PART_MAX_PAR_THDS ? 
PART_MAX_PAR_THDS : NTHDS; + + assert(p == NULL); + + pt->state = PART_TASK_S_ALLOCATED; + part_task_init(pt, PART_TASK_T_WORKSHARE, p, n, work_fn, NULL, NULL); + assert(pt->nthds = n); + + c->part_context = pt; + part_list_append(pt); + + work_fn(NULL); + part_task_end(pt); + + PRINTC("Done!\n"); +} diff --git a/src/components/implementation/tests/spin_comp/Makefile b/src/components/implementation/tests/spin_comp/Makefile new file mode 100644 index 0000000000..bb7f30634e --- /dev/null +++ b/src/components/implementation/tests/spin_comp/Makefile @@ -0,0 +1,10 @@ +C_OBJS=init.o +ASM_OBJS= +COMPONENT=spin_comp.o +INTERFACES= +DEPENDENCIES=capmgr schedinit +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/spin_comp/init.c b/src/components/implementation/tests/spin_comp/init.c new file mode 100644 index 0000000000..15cdd385f5 --- /dev/null +++ b/src/components/implementation/tests/spin_comp/init.c @@ -0,0 +1,17 @@ +#include +#include +#include +#include +#include + +void +cos_init(void) +{ + PRINTC("Spin Init!\n"); + schedinit_child(); + + while (1) ; + + PRINTLOG(PRINT_ERROR, "Cannot reach here!\n"); + assert(0); +} diff --git a/src/components/implementation/tests/test_schedinv/Makefile b/src/components/implementation/tests/test_schedinv/Makefile new file mode 100644 index 0000000000..859fb3dd71 --- /dev/null +++ b/src/components/implementation/tests/test_schedinv/Makefile @@ -0,0 +1,8 @@ +COMPONENT=test_sched_inv.o +INTERFACES= +DEPENDENCIES= crt sched capmgr channel +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api #required for cos_sinv in llboot.h! 
+ +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/test_schedinv/test_schedinv.c b/src/components/implementation/tests/test_schedinv/test_schedinv.c new file mode 100644 index 0000000000..2e71cb8ef3 --- /dev/null +++ b/src/components/implementation/tests/test_schedinv/test_schedinv.c @@ -0,0 +1,133 @@ +/* + * Copyright 2018, Phani Gadepalli and Gabriel Parmer, GWU, gparmer@gwu.edu. + * + * This uses a two clause BSD License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define SPDID_INT 5 +#define SPDID_W1 6 +#define SPDID_W3 7 + +static u32_t cycs_per_usec = 0; + +#define MAX_USE_PIPE_SZ 1 + +#define SND_DATA 0x4321 +#define HPET_PERIOD_TEST_US 20000 + +#define SHMCHANNEL_KEY 0x2020 +static cycles_t *sttsc = NULL; +volatile unsigned long *rdy = NULL; +int iters = 0; +#define ITERS 100000 +cycles_t vals[ITERS] = { 0 }; + +static void +__test_int_fn(arcvcap_t rcv, void *data) +{ + ps_faa(rdy, 1); + + while (ps_load(rdy) <= MAX_USE_PIPE_SZ) sched_thd_block_timeout(0, time_now() + time_usec2cyc(HPET_PERIOD_TEST_US)); + int a = capmgr_hw_periodic_attach(HW_HPET_PERIODIC, cos_thdid(), HPET_PERIOD_TEST_US); + assert(a == 0); + + /* TODO: register to HPET */ + while (1) { + cos_rcv(rcv, 0); + iters++; + rdtscll(*sttsc); + chan_out(SND_DATA); + + if (iters == ITERS) capmgr_hw_detach(HW_HPET_PERIODIC); + } + + sched_thd_exit(); +} + +cycles_t tot = 0, wc = 0; + +static void +__test_wrk_fn(void *data) +{ + int e = (int) data; + ps_faa(rdy, 1); + while (1) { + chan_in(); + + if (unlikely(e)) { + cycles_t en, diff; + + if (unlikely(iters >= ITERS)) continue; + rdtscll(en); + assert(sttsc); + diff = en - *sttsc; + if (diff > wc) wc = diff; + tot += diff; + vals[iters] = diff; + //printc("%llu\n", diff); + iters++; + if (iters % 1000 == 0) printc(","); + if (iters == ITERS) { + int i; + + for (i = 0; i < ITERS; i++) printc("%llu\n", vals[i]); + PRINTC("%llu, %llu\n", tot 
/ ITERS, wc); + tot = wc = 0; + //iters = 0; + } + continue; + } + chan_out(SND_DATA); + } +} + +struct cos_aep_info intaep; + +static void +test_aeps(void) +{ + thdid_t tid; + int ret; + int i = 0; + + if (cos_spd_id() == SPDID_INT) { + tid = sched_aep_create(&intaep, __test_int_fn, (void *)0, 0, 0, 0, 0); + } else { + tid = sched_thd_create(__test_wrk_fn, + ((cos_spd_id() == SPDID_W3 && MAX_USE_PIPE_SZ == 4) + || (cos_spd_id() == SPDID_W1 && MAX_USE_PIPE_SZ == 2)) + ? (void *)1: (void *)0); + } + assert(tid); +} + +void +cos_init(void) +{ + spdid_t child; + comp_flag_t childflags; + + vaddr_t addr = 0; + unsigned long pages = 0; + cbuf_t id = channel_shared_page_map(SHMCHANNEL_KEY, &addr, &pages); + assert(id > 0 && addr && pages == 1); + sttsc = (cycles_t *)addr; + rdy = (volatile unsigned long *)(sttsc + 1); + + cycs_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + + assert(hypercall_comp_child_next(cos_spd_id(), &child, &childflags) == -1); + test_aeps(); + PRINTC("Init Done!\n"); + + sched_thd_exit(); +} diff --git a/src/components/implementation/tests/unit_capmgr/unit_capmgr.c b/src/components/implementation/tests/unit_capmgr/unit_capmgr.c index 91b24e7e47..880428378c 100644 --- a/src/components/implementation/tests/unit_capmgr/unit_capmgr.c +++ b/src/components/implementation/tests/unit_capmgr/unit_capmgr.c @@ -33,7 +33,9 @@ test_thds(void) int failure = 0; for (; i < TEST_N_THDS; i++) { - test_ts[cos_cpuid()][i] = capmgr_thd_create(__test_thd_fn, (void *)i, &tid); + struct cos_dcb_info *dcb; + + test_ts[cos_cpuid()][i] = capmgr_thd_create(__test_thd_fn, (void *)i, &tid, &dcb); assert(test_ts[cos_cpuid()][i]); if (cos_thd_switch(test_ts[cos_cpuid()][i])) { diff --git a/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c b/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c index 21133d1e21..0083657d72 100644 --- a/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c +++ 
b/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c @@ -35,7 +35,7 @@ aep_thd_fn(arcvcap_t rcv, void *data) { printc("\tSwitched to aep %d\n", (int)data); while (1) { - cos_rcv(rcv, 0, NULL); + cos_rcv(rcv, 0); } } @@ -56,7 +56,7 @@ test_aeps(void) asndcap_t snd; printc("\tCreating AEP [%d]\n", i); - ret = cos_aep_tcap_alloc(&(test_aep[i]), BOOT_CAPTBL_SELF_INITTCAP_BASE, aep_thd_fn, (void *)i); + ret = cos_aep_tcap_alloc(&(test_aep[i]), BOOT_CAPTBL_SELF_INITTCAP_BASE, aep_thd_fn, (void *)i, 0, 0); assert(ret == 0); snd = cos_asnd_alloc(ci, test_aep[i].rcv, ci->captbl_cap); @@ -66,7 +66,7 @@ test_aeps(void) TCAP_DELEG_YIELD); assert(ret == 0); - while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, 0, 0, NULL, &tid, &blocked, &cycs, &thd_timeout)) + while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, 0, 0, &tid, &blocked, &cycs, &thd_timeout)) ; } @@ -85,7 +85,7 @@ test_childcomps(void) thdid_t tid; tcap_time_t thd_timeout; - while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, 0, 0, NULL, &tid, &blocked, &cycs, &thd_timeout)) + while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, 0, 0, &tid, &blocked, &cycs, &thd_timeout)) ; printc("\tSwitching to [%d] component\n", id); if (id == CHILD_SCHED_ID) { @@ -122,10 +122,10 @@ cos_init(void) is_booter = 0; printc("Unit-test for defcompinfo API\n"); cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_defcompinfo_init(); + cos_defcompinfo_llinit(); for (id = 0; id < CHILD_COMP_COUNT; id++) { - vaddr_t vm_range, addr; + vaddr_t vm_range, addr, dcbaddr; pgtblcap_t child_utpt; int is_sched = ((id == CHILD_SCHED_ID) ? 
1 : 0); struct cos_compinfo *child_ci = cos_compinfo_get(&child_defci[id]); @@ -136,7 +136,7 @@ cos_init(void) cos_meminfo_init(&(child_ci->mi), BOOT_MEM_KM_BASE, CHILD_UNTYPED_SIZE, child_utpt); cos_defcompinfo_child_alloc(&child_defci[id], (vaddr_t)&cos_upcall_entry, - (vaddr_t)BOOT_MEM_VM_BASE, BOOT_CAPTBL_FREE, is_sched); + (vaddr_t)BOOT_MEM_VM_BASE, BOOT_CAPTBL_FREE, is_sched, &dcbaddr); printc("\t\tCopying new capabilities\n"); ret = cos_cap_cpy_at(child_ci, BOOT_CAPTBL_SELF_CT, ci, child_ci->captbl_cap); @@ -147,6 +147,7 @@ cos_init(void) assert(ret == 0); ret = cos_cap_cpy_at(child_ci, BOOT_CAPTBL_SELF_COMP, ci, child_ci->comp_cap); assert(ret == 0); + /* FIXME: copy BOOT_CAPTBL_SELF_SCB cap?? */ ret = cos_cap_cpy_at(child_ci, BOOT_CAPTBL_SELF_INITTHD_BASE, ci, cos_sched_aep_get(&child_defci[id])->thd); @@ -207,7 +208,7 @@ cos_init(void) /* TEST BLOCKING */ /* TODO: Challenge - how does a component know at runtime if can call cos_rcv or not? - It does not at * runtime. */ - cos_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, 0, NULL); + cos_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, 0); printc("\tThis is a simple component\n"); SPIN(); diff --git a/src/components/implementation/tests/unit_fprr/Makefile b/src/components/implementation/tests/unit_fprr/Makefile index dd4186daef..66f9041230 100644 --- a/src/components/implementation/tests/unit_fprr/Makefile +++ b/src/components/implementation/tests/unit_fprr/Makefile @@ -1,3 +1,5 @@ +C_OBJS=unit_fprr.o +ASM_OBJS= COMPONENT=unit_fprr_test.o INTERFACES= DEPENDENCIES= diff --git a/src/components/implementation/tests/unit_fprr/unit_fprr.c b/src/components/implementation/tests/unit_fprr/unit_fprr.c index 6149b0a0d9..093ba6c25b 100644 --- a/src/components/implementation/tests/unit_fprr/unit_fprr.c +++ b/src/components/implementation/tests/unit_fprr/unit_fprr.c @@ -8,11 +8,12 @@ #include #include #include +#include /* Ensure this is the same as what is in sl_mod_fprr.c */ #define SL_FPRR_NPRIOS 32 -#define LOWEST_PRIORITY (SL_FPRR_NPRIOS 
- 1) +#define LOWEST_PRIORITY (15) #define LOW_PRIORITY (LOWEST_PRIORITY - 1) #define HIGH_PRIORITY (LOWEST_PRIORITY - 10) @@ -106,51 +107,55 @@ test_swapping(void) sl_thd_block_timeout(0, wakeup); } -#define XCPU_THDS (NUM_CPU-1) +#define XCORE_THDS (NUM_CPU-1) #define THD_SLEEP_US (100 * 1000) -volatile unsigned int xcpu_thd_data[NUM_CPU][XCPU_THDS]; -volatile unsigned int xcpu_thd_counter[NUM_CPU]; +volatile unsigned int xcore_thd_data[NUM_CPU][XCORE_THDS]; +volatile unsigned int xcore_thd_counter[NUM_CPU]; static void -test_xcpu_fn(void *data) +test_xcore_fn(void *data) { cycles_t wakeup, elapsed; int cpu = *((unsigned int *)data) >> 16; int i = (*((unsigned int *)data) << 16) >> 16; - assert(i < XCPU_THDS); + assert(i < XCORE_THDS); wakeup = sl_now() + sl_usec2cyc(THD_SLEEP_US); elapsed = sl_thd_block_timeout(0, wakeup); - if (elapsed) xcpu_thd_counter[cpu] ++; + if (elapsed) xcore_thd_counter[cpu] ++; sl_thd_exit(); } static void -run_xcpu_tests() +run_xcore_tests() { int ret = 0, i, cpu = 0; if (NUM_CPU == 1) return; - memset((void *)xcpu_thd_data[cos_cpuid()], 0, sizeof(unsigned int) * XCPU_THDS); - xcpu_thd_counter[cos_cpuid()] = 0; + memset((void *)xcore_thd_data[cos_cpuid()], 0, sizeof(unsigned int) * XCORE_THDS); + xcore_thd_counter[cos_cpuid()] = 0; - for (i = 0; i < XCPU_THDS; i++) { + for (i = 0; i < XCORE_THDS; i++) { sched_param_t p[1]; + struct sl_xcore_thd *t = NULL; if (cpu == cos_cpuid()) cpu++; cpu %= NUM_CPU; - xcpu_thd_data[cos_cpuid()][i] = (cpu << 16) | i; + xcore_thd_data[cos_cpuid()][i] = (cpu << 16) | i; p[0] = sched_param_pack(SCHEDP_PRIO, HIGH_PRIORITY); - ret = sl_xcpu_thd_alloc(cpu, test_xcpu_fn, (void *)&xcpu_thd_data[cos_cpuid()][i], p); - if (ret) break; + t = sl_xcore_thd_alloc(cpu, test_xcore_fn, (void *)&xcore_thd_data[cos_cpuid()][i], 1, p); + if (!t) { + ret = -1; + break; + } cpu++; } - PRINTC("%s: Creating cross-CPU threads!\n", ret ? 
"FAILURE" : "SUCCESS"); - while (xcpu_thd_counter[cos_cpuid()] != XCPU_THDS) ; + PRINTC("%s: Creating cross-core threads!\n", ret ? "FAILURE" : "SUCCESS"); + while (xcore_thd_counter[cos_cpuid()] != XCORE_THDS) ; } static void @@ -161,7 +166,7 @@ run_tests() test_swapping(); PRINTC("%s: Swap back and forth!\n", (thd1_ran[cos_cpuid()] && thd2_ran[cos_cpuid()]) ? "SUCCESS" : "FAILURE"); - run_xcpu_tests(); +// run_xcore_tests(); PRINTC("Unit-test done!\n"); sl_thd_exit(); @@ -176,11 +181,9 @@ cos_init(void) struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(defci); - PRINTC("Unit-test for the scheduling library (sl)\n"); - if (ps_cas(&first, NUM_CPU + 1, cos_cpuid())) { cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_defcompinfo_init(); + cos_defcompinfo_llinit(); } else { while (!ps_load(&init_done[first])) ; @@ -191,6 +194,8 @@ cos_init(void) while (!ps_load(&init_done[i])) ; } + PRINTC("Unit-test for the scheduling library (sl)\n"); + sl_init(SL_MIN_PERIOD_US); testing_thread = sl_thd_alloc(run_tests, NULL); diff --git a/src/components/implementation/tests/unit_schedaep/Makefile b/src/components/implementation/tests/unit_schedappaep/Makefile similarity index 85% rename from src/components/implementation/tests/unit_schedaep/Makefile rename to src/components/implementation/tests/unit_schedappaep/Makefile index b6f56f58bf..da9e217045 100644 --- a/src/components/implementation/tests/unit_schedaep/Makefile +++ b/src/components/implementation/tests/unit_schedappaep/Makefile @@ -1,4 +1,4 @@ -COMPONENT=unit_schedaep_test.o +COMPONENT=unit_schedappaep_test.o INTERFACES= DEPENDENCIES=sched capmgr IF_LIB= diff --git a/src/components/implementation/tests/unit_schedaep/unit_schedaep.c b/src/components/implementation/tests/unit_schedappaep/unit_schedappaep.c similarity index 98% rename from src/components/implementation/tests/unit_schedaep/unit_schedaep.c rename to 
src/components/implementation/tests/unit_schedappaep/unit_schedappaep.c index 371c8144fd..4702c82253 100644 --- a/src/components/implementation/tests/unit_schedaep/unit_schedaep.c +++ b/src/components/implementation/tests/unit_schedappaep/unit_schedappaep.c @@ -26,7 +26,7 @@ __test_child(arcvcap_t rcv, void *data) assert(taeps[cos_cpuid()][(int)data].rcv == rcv); while (child_rcvd[cos_cpuid()] < TEST_ITERS) { - ret = cos_rcv(rcv, 0, NULL); + ret = cos_rcv(rcv, 0); assert(ret >= 0); child_rcvd[cos_cpuid()]++; @@ -43,7 +43,7 @@ __test_parent(arcvcap_t rcv, void *data) assert(taeps[cos_cpuid()][(int)data].rcv == rcv); while (parent_sent[cos_cpuid()] < TEST_ITERS) { - ret = cos_rcv(rcv, 0, NULL); + ret = cos_rcv(rcv, 0); assert(ret >= 0); do { diff --git a/src/components/implementation/tests/unit_schedappcomp/Makefile b/src/components/implementation/tests/unit_schedappcomp/Makefile new file mode 100644 index 0000000000..dfe5cbcf92 --- /dev/null +++ b/src/components/implementation/tests/unit_schedappcomp/Makefile @@ -0,0 +1,8 @@ +COMPONENT=unit_schedappcomp_test.o +INTERFACES= +DEPENDENCIES=sched +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api #required for cos_sinv in llboot.h! 
+ +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/unit_schedcomp/unit_schedcomp.c b/src/components/implementation/tests/unit_schedappcomp/unit_schedappcomp.c similarity index 99% rename from src/components/implementation/tests/unit_schedcomp/unit_schedcomp.c rename to src/components/implementation/tests/unit_schedappcomp/unit_schedappcomp.c index 98a5ccbc8d..a7cf4db127 100644 --- a/src/components/implementation/tests/unit_schedcomp/unit_schedcomp.c +++ b/src/components/implementation/tests/unit_schedappcomp/unit_schedappcomp.c @@ -131,6 +131,7 @@ cos_init(void) assert(hypercall_comp_child_next(cos_spd_id(), &child, &childflag) == -1); testtid = sched_thd_create(run_tests, NULL); + assert(testtid); sched_thd_param_set(testtid, sched_param_pack(SCHEDP_PRIO, LOWEST_PRIORITY)); while (1) { diff --git a/src/components/implementation/tests/unit_schedcomp/Makefile b/src/components/implementation/tests/unit_schedcomp/Makefile index 3edcf1b36d..1134e9cb60 100644 --- a/src/components/implementation/tests/unit_schedcomp/Makefile +++ b/src/components/implementation/tests/unit_schedcomp/Makefile @@ -1,8 +1,8 @@ COMPONENT=unit_schedcomp_test.o INTERFACES= -DEPENDENCIES=sched +DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api #required for cos_sinv in llboot.h! +ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_rr -lcos_defkernel_api include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c b/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c new file mode 100644 index 0000000000..e5527bb269 --- /dev/null +++ b/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c @@ -0,0 +1,209 @@ +/* + * Copyright 2016, Phani Gadepalli and Gabriel Parmer, GWU, gparmer@gwu.edu. + * + * This uses a two clause BSD License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include + +/* sl also defines a SPIN macro */ +#undef SPIN +#define SPIN(iters) \ + do { \ + if (iters > 0) { \ + for (; iters > 0; iters--) \ + ; \ + } else { \ + while (1) \ + ; \ + } \ + } while (0) + + +#define N_TESTTHDS 8 +#define WORKITERS 10000 + +#define PERF_ITERS 1000000 + +static cycles_t rdtscp_min = 0, rdtscp_max = 0, rdtscp_avg = 0; +static volatile int switched = 0; +static volatile cycles_t mid_cycs = 0; +static volatile int testing = 1; +static struct sl_thd *perf_thd, *spin_thd; + +void +test_thd_perffn(void *data) +{ + thdid_t yield_to = sl_thd_thdid(spin_thd); + cycles_t start_cycs = 0, end_cycs = 0, wc_cycs = 0, total_cycs = 0, bc_cycs = 500; + unsigned int i = 0; + int ret = 0; + + assert(perf_thd == sl_thd_curr()); + rdtscll(start_cycs); + //printc("a"); + //sl_thd_yield(yield_to); + //ret = sl_thd_dispatch(spin_thd, cos_sched_sync(), perf_thd); + //sl_thd_yield_thd_c(perf_thd, spin_thd); + sl_thd_yield_thd(spin_thd); + //assert(ret == 0); + rdtscll(end_cycs); + //assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + assert(switched); + + for (i = 0; i < PERF_ITERS; i++) { + cycles_t diff1_cycs = 0, diff2_cycs = 0; + + end_cycs = start_cycs = 0; + //mid_cycs = 0; + switched = 0; + //cos_rdtscp(start_cycs); + rdtscll(start_cycs); + //ret = sl_thd_dispatch(spin_thd, cos_sched_sync(), perf_thd); + //printc("a"); + //sl_thd_yield(yield_to); + //sl_thd_yield_thd_c(perf_thd, spin_thd); + sl_thd_yield_thd(spin_thd); + rdtscll(end_cycs); + //cos_rdtscp(end_cycs); + assert(switched); + assert(ret == 0); + //assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + //diff1_cycs = mid_cycs - start_cycs; + diff2_cycs = end_cycs - start_cycs; + //assert(diff2_cycs > rdtscp_min); + //diff2_cycs -= rdtscp_min; + + //if (diff1_cycs > wc_cycs) wc_cycs = diff1_cycs; + if (diff2_cycs > wc_cycs) wc_cycs = diff2_cycs; + if (diff2_cycs < bc_cycs) bc_cycs = diff2_cycs; 
+ total_cycs += diff2_cycs; + } + + PRINTC("SWITCH UBENCH : avg: %llu, wc: %llu, bc: %llu, iters:%u\n", (total_cycs / (PERF_ITERS)) / 2, wc_cycs / 2, bc_cycs / 2, PERF_ITERS); + testing = 0; + /* done testing! free the spin thread! */ + while (1) ; +// sl_thd_free(spin_thd); + +// sl_thd_exit(); +} + +void +test_thd_spinfn(void *data) +{ + thdid_t yield_to = sl_thd_thdid(perf_thd); + assert(sl_thd_curr() == spin_thd); + + while (likely(testing)) { + //rdtscll(mid_cycs); + switched = 1; + //sl_thd_dispatch(perf_thd, cos_sched_sync(), spin_thd); + //printc("b"); + //sl_thd_yield(yield_to); + //sl_thd_yield_thd_c(spin_thd, perf_thd); + sl_thd_yield_thd(perf_thd); + } + + //sl_thd_dispatch(perf_thd, cos_sched_sync(), spin_thd); + sl_thd_yield(yield_to); + //sl_thd_yield_thd_c(spin_thd, perf_thd); + //sl_thd_yield_thd(perf_thd); + //assert(0); +} + +void +test_thd_fn(void *data) +{ + while (1) { + int workiters = WORKITERS * ((int)data); + + printc("%c", 'a' + (int)data); + //SPIN(workiters); + sl_thd_yield(0); + } +} + +void +test_yield_perf(void) +{ + union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 31}}; + + assert(NUM_CPU == 1); + + spin_thd = sl_thd_alloc(test_thd_spinfn, NULL); + assert(spin_thd); + sl_thd_param_set(spin_thd, sp.v); + PRINTC("Spin thread %u:%lu created\n", sl_thd_thdid(spin_thd), sl_thd_thdcap(spin_thd)); + + perf_thd = sl_thd_alloc(test_thd_perffn, NULL); + assert(perf_thd); + sl_thd_param_set(perf_thd, sp.v); + PRINTC("Perf thread %u:%lu created\n", sl_thd_thdid(perf_thd), sl_thd_thdcap(perf_thd)); + + sl_thd_yield(sl_thd_thdid(perf_thd)); + //sl_thd_dispatch(perf_thd, cos_sched_sync(), sl_thd_curr()); + //sl_thd_yield_thd_c(sl_thd_curr(), perf_thd); + //sl_thd_yield_thd(perf_thd); + while (1); +} + +void +test_yields(void) +{ + int i; + struct sl_thd * threads[N_TESTTHDS]; + union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 10}}; + + for (i = 0; i < N_TESTTHDS; i++) { + threads[i] = sl_thd_alloc(test_thd_fn, 
(void *)i); + assert(threads[i]); + sl_thd_param_set(threads[i], sp.v); + PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); + } +} + +void +cos_init(void) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + static int first_time = 1, init_done = 0; + + PRINTC("Unit-test for the scheduling library (sl) with capmgr usage\n"); + PRINTC("CPU cycles per sec: %u\n", cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE)); + + if (first_time) { + first_time = 0; + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_init(); + cos_rdtscp_calib(&rdtscp_min, &rdtscp_avg, &rdtscp_max); + PRINTC("RDTSCP MIN:%llu MAX:%llu AVG:%llu\n", rdtscp_min, rdtscp_max, rdtscp_avg); + + init_done = 1; + } else { + while (!init_done) ; + + cos_defcompinfo_sched_init(); + } + + sl_init(SL_MIN_PERIOD_US); + hypercall_comp_init_done(); + + test_yield_perf(); + //test_yields(); + + sl_sched_loop_nonblock(); + + assert(0); + + return; +} diff --git a/src/components/implementation/tests/unit_schedtests/Makefile b/src/components/implementation/tests/unit_schedtests/Makefile index e46827dc8d..1735aff577 100644 --- a/src/components/implementation/tests/unit_schedtests/Makefile +++ b/src/components/implementation/tests/unit_schedtests/Makefile @@ -2,7 +2,7 @@ COMPONENT=unit_schedlibtests.o INTERFACES= DEPENDENCIES= IF_LIB= -ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_fprr -lsl_thd_static_backend +ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_rr -lsl_thd_static_backend include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/unit_schedtests/inv.S b/src/components/implementation/tests/unit_schedtests/inv.S new file mode 120000 index 0000000000..b9e55311b4 --- /dev/null +++ b/src/components/implementation/tests/unit_schedtests/inv.S @@ -0,0 +1 @@ +../kernel_tests/inv.S \ No 
newline at end of file diff --git a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c index 2ca97b36ff..807776f25c 100644 --- a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c +++ b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c @@ -13,6 +13,7 @@ #include #include #include +#include /* sl also defines a SPIN macro */ #undef SPIN @@ -31,18 +32,208 @@ #define N_TESTTHDS 8 #define WORKITERS 10000 +#define N_TESTTHDS_PERF 2 +#define PERF_ITERS 1000000 + +#define MAGIC_RET 0xDEADBEEF + +#undef INV_TEST +static volatile cycles_t mid_cycs = 0; +static volatile int testing = 1; + + +int +test_serverfn(int a, int b, int c) +{ + //rdtscll(midinv_cycles[cos_cpuid()]); + return MAGIC_RET; +} + +extern void *__inv_test_serverfn(int a, int b, int c); + +static inline int +call_cap_mb(u32_t cap_no, int arg1, int arg2, int arg3) +{ + int ret; + + /* + * Which stack should we use for this invocation? Simple, use + * this stack, at the current sp. This is essentially a + * function call into another component, with odd calling + * conventions. 
+ */ + cap_no = (cap_no + 1) << COS_CAPABILITY_OFFSET; + + __asm__ __volatile__("pushl %%ebp\n\t" + "movl %%esp, %%ebp\n\t" + "movl %%esp, %%edx\n\t" + "movl $1f, %%ecx\n\t" + "sysenter\n\t" + "1:\n\t" + "popl %%ebp" + : "=a"(ret) + : "a"(cap_no), "b"(arg1), "S"(arg2), "D"(arg3) + : "memory", "cc", "ecx", "edx"); + + return ret; +} + +sinvcap_t sinv_cap = 0; + +static inline void +test_inv_setup(void) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + compcap_t cc; + sinvcap_t ic; + int i; + unsigned int ret; + + cc = cos_comp_alloc(ci, ci->captbl_cap, ci->pgtbl_cap, 0, (vaddr_t)NULL, 0); + assert(cc > 0); + ic = cos_sinv_alloc(ci, cc, (vaddr_t)__inv_test_serverfn, 0); + assert(ic > 0); + ret = call_cap_mb(ic, 1, 2, 3); + assert(ret == MAGIC_RET); + + sinv_cap = ic; +} + +static struct sl_thd *perf_thd = NULL, *spin_thd = NULL; + +void +test_thd_perffn(void *data) +{ + cycles_t start_cycs = 0, end_cycs = 0, wc_cycs = 0, total_cycs = 0; + unsigned int i = 0; + struct sl_thd *c = sl_thd_curr(); + + rdtscll(start_cycs); + sl_thd_yield_thd(spin_thd); + rdtscll(end_cycs); + assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + for (i = 0; i < PERF_ITERS; i++) { + cycles_t diff1_cycs = 0, diff2_cycs = 0; + + mid_cycs = 0; + rdtscll(start_cycs); + sl_thd_yield_thd_c(c, spin_thd); + rdtscll(end_cycs); + assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + diff1_cycs = mid_cycs - start_cycs; + diff2_cycs = end_cycs - mid_cycs; + + if (diff1_cycs > wc_cycs) wc_cycs = diff1_cycs; + if (diff2_cycs > wc_cycs) wc_cycs = diff2_cycs; + total_cycs += (diff1_cycs + diff2_cycs); + } + + PRINTC("SWITCH UBENCH: avg: %llu, wc: %llu, iters:%u\n", (total_cycs / (2 * PERF_ITERS)), wc_cycs, PERF_ITERS); + testing = 0; + /* done testing! let the spinfn cleanup! 
*/ + sl_thd_yield_thd(spin_thd); + + sl_thd_exit(); +} + +void +test_inv_perffn(void *data) +{ + cycles_t start_cycs = 0, end_cycs = 0, wc_cycs = 0, total_cycs = 0; + unsigned int i = 0; + struct sl_thd *c = sl_thd_curr(); + + test_inv_setup(); + + rdtscll(start_cycs); + sl_thd_yield_thd(spin_thd); + rdtscll(end_cycs); + assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + for (i = 0; i < PERF_ITERS; i++) { + cycles_t diff_cycs = 0; + int ret; + + sl_thd_yield_thd_c(c, spin_thd); + mid_cycs = 0; + rdtscll(start_cycs); + ret = call_cap_mb(sinv_cap, 1, 2, 3); + rdtscll(end_cycs); + assert(ret == (int)MAGIC_RET); +// assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); +// +// diff1_cycs = mid_cycs - start_cycs; +// diff2_cycs = end_cycs - mid_cycs; +// +// if (diff1_cycs > wc_cycs) wc_cycs = diff1_cycs; +// if (diff2_cycs > wc_cycs) wc_cycs = diff2_cycs; +// total_cycs += (diff1_cycs + diff2_cycs); + diff_cycs = end_cycs - start_cycs; + if (diff_cycs > wc_cycs) wc_cycs = diff_cycs; + total_cycs += diff_cycs; + } + + PRINTC("INV UBENCH: avg: %llu, wc: %llu, iters:%u\n", (total_cycs / PERF_ITERS), wc_cycs, PERF_ITERS); + testing = 0; + /* done testing! let the spinfn cleanup! 
*/ + sl_thd_yield_thd(spin_thd); + + sl_thd_exit(); +} + +void +test_thd_spinfn(void *data) +{ + struct sl_thd *c = sl_thd_curr(); + + while (likely(testing)) { + rdtscll(mid_cycs); + sl_thd_yield_thd_c(c, perf_thd); + } + + sl_thd_exit(); +} + void test_thd_fn(void *data) { while (1) { int workiters = WORKITERS * ((int)data); - printc("%d", (int)data); - SPIN(workiters); + printc("%c", 'a' + (int)data); + //SPIN(workiters); sl_thd_yield(0); } } +void +test_yield_perf(void) +{ + int i; + struct sl_thd *threads[N_TESTTHDS_PERF]; + union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 31}}; + + for (i = 0; i < N_TESTTHDS_PERF; i++) { + if (i == 1) { +#ifdef INV_TEST + threads[i] = sl_thd_alloc(test_inv_perffn, (void *)&threads[0]); +#else + threads[i] = sl_thd_alloc(test_thd_perffn, (void *)&threads[0]); +#endif + perf_thd = threads[i]; + } else { + threads[i] = sl_thd_alloc(test_thd_spinfn, NULL); + spin_thd = threads[i]; + } + assert(threads[i]); + sl_thd_param_set(threads[i], sp.v); + PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); + } +} + void test_yields(void) { @@ -51,9 +242,10 @@ test_yields(void) union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 10}}; for (i = 0; i < N_TESTTHDS; i++) { - threads[i] = sl_thd_alloc(test_thd_fn, (void *)(intptr_t)(i + 1)); + threads[i] = sl_thd_alloc(test_thd_fn, (void *)i); assert(threads[i]); sl_thd_param_set(threads[i], sp.v); + PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); } } @@ -148,12 +340,14 @@ cos_init(void) printc("Unit-test for the scheduling library (sl)\n"); cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_defcompinfo_init(); - sl_init(SL_MIN_PERIOD_US); + cos_defcompinfo_llinit(); + cos_dcb_info_init_curr(); + sl_init(SL_MIN_PERIOD_US*100); - // test_yields(); - // test_blocking_directed_yield(); - test_timeout_wakeup(); + test_yield_perf(); + 
//test_yields(); + //test_blocking_directed_yield(); + //test_timeout_wakeup(); sl_sched_loop_nonblock(); diff --git a/src/components/implementation/tests/unit_slrcv/Makefile b/src/components/implementation/tests/unit_slrcv/Makefile new file mode 100644 index 0000000000..3500d01777 --- /dev/null +++ b/src/components/implementation/tests/unit_slrcv/Makefile @@ -0,0 +1,8 @@ +COMPONENT=unit_slrcvtest.o +INTERFACES= +DEPENDENCIES=capmgr schedinit work +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format $(LIBSLCAPMGR) -lsl_mod_fprr -lsl_thd_static_backend -lcos_dcb -lsl_blkpt + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/unit_slrcv/init.c b/src/components/implementation/tests/unit_slrcv/init.c new file mode 100644 index 0000000000..aa5a85741c --- /dev/null +++ b/src/components/implementation/tests/unit_slrcv/init.c @@ -0,0 +1,238 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct sl_xcore_thd *ping; +static struct sl_xcore_thd *pong; + +#define HPET_PERIOD_TEST_US 20000 + +#define WORK_US (1000) + +static inline void +ping_fn(void *d) +{ + asndcap_t s = *(asndcap_t *)d; + + while (1) { + printc("s"); + int r = cos_asnd(s, 0); + + assert(r == 0); + work_usecs(WORK_US); + } + sl_thd_exit(); +} + +unsigned int iter = 0; +volatile cycles_t st = 0, en = 0, tot = 0, wc = 0; +CRT_CHAN_STATIC_ALLOC(c0, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c1, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c2, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c3, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c4, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c5, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_TYPE_PROTOTYPES(test, int, 4); + +#define PIPELINE_LEN 4 +#define PRIO_START (TCAP_PRIO_MAX + 10 + PIPELINE_LEN + 1) +#define PRIO_INT (PRIO_START + 1) +#define ITERS 100000 
+static cycles_t vals[ITERS] = { 0 }; +static int pipe_line = 0; +static int pipe_send = 0, pipe_rcv = 0; + +static inline void +chrcv(int i) +{ + int r; + + if (i == 0) { + assert(ps_cas(&pipe_rcv, 0, PIPELINE_LEN)); + } + + //printc("[r%d,%d]", i, pipe_line); + switch(i) { + case 0: crt_chan_recv_test(c0, &r); break; + case 1: crt_chan_recv_test(c1, &r); break; + case 2: crt_chan_recv_test(c2, &r); break; + case 3: crt_chan_recv_test(c3, &r); break; + case 4: crt_chan_recv_test(c4, &r); break; + case 5: crt_chan_recv_test(c5, &r); break; + default: assert(0); + } + assert(ps_faa(&pipe_line, -1) == 1); + //printc("[d%d,%d]", i, pipe_line); + assert(ps_faa(&pipe_rcv, -1) == (PIPELINE_LEN - i)); +} + +static inline void +chsnd(int i) +{ + int s = 0xDEAD0000 | i; + + if (i == 0) { + assert(ps_cas(&pipe_send, 0, PIPELINE_LEN)); + } + assert(ps_faa(&pipe_send, -1) == (PIPELINE_LEN - i)); + //printc("[s%d,%d]", i, pipe_line); + assert(ps_faa(&pipe_line, 1) == 0); + switch(i) { + case 0: crt_chan_send_test(c0, &s); break; + case 1: crt_chan_send_test(c1, &s); break; + case 2: crt_chan_send_test(c2, &s); break; + case 3: crt_chan_send_test(c3, &s); break; + case 4: crt_chan_send_test(c4, &s); break; + case 5: crt_chan_send_test(c5, &s); break; + default: assert(0); + } + //printc("[o%d,%d]", i, pipe_line); +} + +static inline void +chinit(int i, struct sl_thd *s, struct sl_thd *r) +{ + switch(i) { + case 0: crt_chan_init_test(c0); break; + case 1: crt_chan_p2p_init_test(c1, s, r); break; + case 2: crt_chan_p2p_init_test(c2, s, r); break; + case 3: crt_chan_p2p_init_test(c3, s, r); break; + case 4: crt_chan_p2p_init_test(c4, s, r); break; + case 5: crt_chan_p2p_init_test(c5, s, r); break; + default: assert(0); + } +} + +static inline void +work_fn(void *x) +{ + int chid = (int)x; + while (1) { + chrcv(chid); + + if (likely(chid + 1 < PIPELINE_LEN)) chsnd(chid + 1); + else { + rdtscll(en); + if (iter >= ITERS) continue; + assert(en > st); + cycles_t diff = en - st; + if 
(diff > wc) wc = diff; + //printc("%llu\n", diff); + vals[iter] = diff; + tot += diff; + iter ++; + if (unlikely(iter == ITERS)) { + int i; + for (i = 0; i < ITERS; i++) printc("%llu\n", vals[i]); + PRINTC("%d: %llu %llu\n", iter, tot / iter, wc); + iter = 0; + wc = tot = 0; + } + } + } + sl_thd_exit(); +} + +struct sl_thd *wt[PIPELINE_LEN] = { NULL }; + +static inline void +pong_fn(arcvcap_t r, void *d) +{ + PRINTC("Hpet Register\n"); + int a = capmgr_hw_periodic_attach(HW_HPET_PERIODIC, cos_thdid(), HPET_PERIOD_TEST_US); + assert(a == 0); + + while (1) { + //printc("I"); + int p = sl_thd_rcv(RCV_ULONLY); + //work_usecs(WORK_US); + rdtscll(st); + chsnd(0); + if (iter == ITERS) capmgr_hw_detach(HW_HPET_PERIODIC); + } + sl_thd_exit(); +} + +void +cos_init(void *d) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + int i; + static volatile unsigned long init_done[NUM_CPU] = { 0 }; + static volatile arcvcap_t r = 0; + static volatile asndcap_t s = 0; + unsigned int cycs_per_us = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + +// if (NUM_CPU == 2) { +// assert(0); // need to rework.. 
+// if (cos_cpuid() == 0) { +// cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); +// cos_defcompinfo_llinit(); +// cos_dcb_info_init_curr(); +// sl_init(SL_MIN_PERIOD_US); +// +// struct sl_thd *t = sl_thd_aep_alloc(pong_fn, NULL, 0, 0, 0, 0); +// assert(t); +// sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); +// r = sl_thd_rcvcap(t); +// assert(r); +// } else { +// while (!ps_load(&init_done[0])) ; +// +// cos_defcompinfo_sched_init(); +// cos_dcb_info_init_curr(); +// sl_init(SL_MIN_PERIOD_US); +// +// struct sl_thd *t = sl_thd_alloc(ping_fn, (void *)&s); +// assert(t); +// sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); +// +// while (!r) ; +// s = cos_asnd_alloc(ci, r, ci->captbl_cap); +// assert(s); +// } +// } else { + assert(NUM_CPU == 1); + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_init(); + sl_init(SL_MIN_PERIOD_US*100); + //int i; + struct sl_thd *rt = sl_thd_aep_alloc(pong_fn, NULL, 0, 0, 0, 0); + assert(rt); + + //sl_thd_param_set(rt, sched_param_pack(SCHEDP_PRIO, PRIO_INT)); + for (i = 0; i < PIPELINE_LEN; i++) { + wt[i] = sl_thd_alloc(work_fn, (void *)i); + assert(wt[i]); + //sl_thd_param_set(wt[i], sched_param_pack(SCHEDP_PRIO, PRIO_START-i)); + if (i == 0) chinit(i, 0, 0); + else chinit(i, wt[i-1], wt[i]); + } + +// } + ps_faa(&init_done[cos_cpuid()], 1); + + /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! 
*/ + for (i = 0; i < NUM_CPU; i++) { + while (!ps_load(&init_done[i])) ; + } + PRINTC("Int component init done!\n"); + //hypercall_comp_init_done(); + schedinit_child(); + for (i = 0; i < PIPELINE_LEN; i++) sl_thd_param_set(wt[i], sched_param_pack(SCHEDP_PRIO, PRIO_START-i)); + sl_thd_param_set(rt, sched_param_pack(SCHEDP_PRIO, PRIO_INT)); + + sl_sched_loop(); + + PRINTC("Should never get here!\n"); + assert(0); +} diff --git a/src/components/implementation/tests/unit_slxcore/Makefile b/src/components/implementation/tests/unit_slxcore/Makefile new file mode 100644 index 0000000000..0bc62b21b8 --- /dev/null +++ b/src/components/implementation/tests/unit_slxcore/Makefile @@ -0,0 +1,8 @@ +COMPONENT=unit_slxcoretests.o +INTERFACES= +DEPENDENCIES= +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_rr -lsl_thd_static_backend -lcos_dcb + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/unit_slxcore/init.c b/src/components/implementation/tests/unit_slxcore/init.c new file mode 100644 index 0000000000..7038d767fc --- /dev/null +++ b/src/components/implementation/tests/unit_slxcore/init.c @@ -0,0 +1,86 @@ +#include +#include +#include +#include +#include +#include + +#define MAX_PONG 20 +static struct sl_xcore_thd *ping; +static struct sl_xcore_thd *pong[MAX_PONG]; + +static inline void +ping_fn(void *d) +{ + int k = 0; + + while (1) { + sl_xcore_thd_wakeup(pong[k % MAX_PONG]); + k++; + } +} + +static inline void +pong_fn(void *d) +{ + while (1) { + sl_thd_block(0); + } +} + +void +cos_init(void *d) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + int i; + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + static unsigned b1 = 0, b2 = 0, b3 = 0; + + if (ps_cas(&first, NUM_CPU + 1, cos_cpuid())) { + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + 
cos_defcompinfo_llinit(); + } else { + while (!ps_load(&init_done[first])) ; + + cos_defcompinfo_sched_init(); + } + cos_dcb_info_init_curr(); + ps_faa(&init_done[cos_cpuid()], 1); + + /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! */ + for (i = 0; i < NUM_CPU; i++) { + while (!ps_load(&init_done[i])) ; + } + sl_init(SL_MIN_PERIOD_US); + /* barrier, wait for sl_init to be done on all cores */ + ps_faa(&b1, 1); + while (ps_load(&b1) != NUM_CPU) ; + if (cos_cpuid()) { + for (i = 0; i < MAX_PONG; i++) { + struct sl_thd *t = sl_thd_alloc(pong_fn, NULL); + + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); + pong[i] = sl_xcore_thd_lookup(sl_thd_thdid(t)); + assert(pong[i]); + } + } else { + struct sl_thd *t = sl_thd_alloc(ping_fn, NULL); + + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); + + ping = sl_xcore_thd_lookup(sl_thd_thdid(t)); + assert(ping); + } + ps_faa(&b2, 1); + while (ps_load(&b2) != NUM_CPU) ; + PRINTC("Ready!"); +// hypercall_comp_init_done(); + + sl_sched_loop_nonblock(); + + PRINTC("Should never get here!\n"); + assert(0); +} diff --git a/src/components/include/cirque.h b/src/components/include/cirque.h new file mode 100644 index 0000000000..8c63772322 --- /dev/null +++ b/src/components/include/cirque.h @@ -0,0 +1,128 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ +#ifndef CIRQUE_H +#define CIRQUE_H + +/* remember to use multi-core locks as these are really single producer, single consumer */ +#define CIRQUE_MAX_SZ 4096 + +#define CIRQUE_PROTOTYPE(name, type) \ +struct cirque_##name { \ + type wrk[CIRQUE_MAX_SZ]; \ + size_t size; \ + size_t mask; \ + \ + volatile long head; \ + volatile long tail; \ +}; \ + \ +static inline void \ +cirque_init_##name(struct cirque_##name *q, size_t sz) \ +{ \ + memset(q, 0, sizeof(struct cirque_##name)); \ + \ + if (sz) { \ + /* only for size with pow of 2 */ \ + assert(round_to_pow2(sz) == sz); \ + assert(sz <= CIRQUE_MAX_SZ); \ + } else { \ + sz = CIRQUE_MAX_SZ; \ + } \ + \ + q->head = q->tail = 0; \ + q->size = sz; \ + q->mask = sz - 1; \ +} \ + \ +static inline int \ +cirque_insert_##name(struct cirque_##name *q, type *w) \ +{ \ + long ct = ps_load((unsigned long *)&q->tail); \ + long ch = ps_load((unsigned long *)&q->head); \ + \ + if ((ct == 0 && ch == q->mask) || \ + ((ch + 1) & q->mask) == ct) return -ENOSPC; \ + \ + ps_mem_fence(); \ + if (!ps_cas((unsigned long *)q->head, ch, \ + (ch + 1) & q->mask)) return -EAGAIN; \ + q->wrk[ch] = *w; \ + \ + return 0; \ +} \ + \ +static inline int \ +cirque_delete_##name(struct cirque_##name *q, type *w) \ +{ \ + long ct = ps_load((unsigned long *)&q->tail); \ + long ch = ps_load((unsigned long *)&q->head); \ + \ + if (ct >= ch) return -ENOENT; \ + \ + *w = q->wrk[ct]; \ + if (!ps_cas((unsigned long *)q->tail, ct, \ + (ct + 1) & q->mask)) return -EAGAIN; \ + \ + return 0; \ +} \ + \ +static inline int \ +cirque_peek_##name(struct cirque_##name *q, type *w) \ +{ \ + long ct = ps_load((unsigned long *)&q->tail); \ + long ch = ps_load((unsigned long *)&q->head); \ + \ + if (ct >= ch) return -ENOENT; \ + \ + *w = q->wrk[ct]; \ + \ + return 0; \ +} \ + \ +static inline type * \ +cirque_allocptr_##name(struct cirque_##name *q) \ +{ \ + long 
ct = ps_load((unsigned long *)&q->tail); \ + long ch = ps_load((unsigned long *)&q->head); \ + \ + if ((ct == 0 && ch == q->mask) || \ + ((ch + 1) & q->mask) == ct) return NULL; \ + \ + ps_mem_fence(); \ + if (!ps_cas((unsigned long *)q->head, ch, \ + (ch + 1) & q->mask)) return NULL; \ + \ + return &q->wrk[ch]; \ +} \ + \ +static inline void \ +cirque_freeptr_##name(struct cirque_##name *q) \ +{ \ + long ct = ps_load((unsigned long *)&q->tail); \ + long ch = ps_load((unsigned long *)&q->head); \ + \ + if (ct >= ch) return; \ + \ + if (ps_cas((unsigned long *)q->tail, ct, (ct + 1) & q->mask)) { \ + memset(&q->wrk[ct], 0, sizeof(type)); \ + } \ + \ + return; \ +} \ + \ +static inline type * \ +cirque_peekptr_##name(struct cirque_##name *q) \ +{ \ + long ct = ps_load((unsigned long *)&q->tail); \ + long ch = ps_load((unsigned long *)&q->head); \ + \ + if (ct >= ch) return NULL; \ + \ + return &q->wrk[ct]; \ +} + +#endif /* CIRQUE_H */ diff --git a/src/components/include/cos_asm_simple_stacks.h b/src/components/include/cos_asm_simple_stacks.h index b6dd7b9e21..46eb349cf7 100644 --- a/src/components/include/cos_asm_simple_stacks.h +++ b/src/components/include/cos_asm_simple_stacks.h @@ -16,7 +16,8 @@ shr $MAX_STACK_SZ_BYTE_ORDER, %eax; \ shr $16, %edx; \ pushl %edx; \ - pushl %eax; + pushl %eax; \ + pushl $0; #define COS_ASM_GET_STACK \ COS_ASM_GET_STACK_BASIC \ diff --git a/src/components/include/cos_component.h b/src/components/include/cos_component.h index aa64a093ec..e229fdac00 100644 --- a/src/components/include/cos_component.h +++ b/src/components/include/cos_component.h @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -53,6 +54,7 @@ call_cap_asm(u32_t cap_no, u32_t op, int arg1, int arg2, int arg3, int arg4) return ret; } +/* NOTE: make sure the memory locations r1, r2 & r3 are at least word-sized as the register stores are word-sized! 
*/ static inline int call_cap_retvals_asm(u32_t cap_no, u32_t op, int arg1, int arg2, int arg3, int arg4, unsigned long *r1, unsigned long *r2, unsigned long *r3) @@ -84,6 +86,7 @@ call_cap_retvals_asm(u32_t cap_no, u32_t op, int arg1, int arg2, int arg3, int a return ret; } +/* NOTE: make sure the memory locations r1 & r2 are at least word-sized as the register stores are word-sized! */ static inline int call_cap_2retvals_asm(u32_t cap_no, u32_t op, int arg1, int arg2, int arg3, int arg4, unsigned long *r1, unsigned long *r2) @@ -145,9 +148,8 @@ extern struct cos_component_information cos_comp_info; static inline long get_stk_data(int offset) { - unsigned long curr_stk_pointer; + unsigned long curr_stk_pointer = 0; - __asm__("movl %%esp, %0;" : "=r"(curr_stk_pointer)); /* * We save the CPU_ID and thread id in the stack for fast * access. We want to find the struct cos_stk (see the stkmgr @@ -155,7 +157,15 @@ get_stk_data(int offset) * cpu_id. This struct is at the _top_ of the current stack, * and cpu_id is at the top of the struct (it is a u32_t). 
*/ - return *(long *)((curr_stk_pointer & ~(COS_STACK_SZ - 1)) + COS_STACK_SZ - offset * sizeof(u32_t)); + return *(long *)((((unsigned long)(&curr_stk_pointer)) & ~(COS_STACK_SZ - 1)) + COS_STACK_SZ - offset * sizeof(u32_t)); +} + +static inline void +set_stk_data(int offset, long val) +{ + unsigned long curr_stk_pointer = 0; + + *(long *)((((unsigned long)&curr_stk_pointer) & ~(COS_STACK_SZ - 1)) + COS_STACK_SZ - offset * sizeof(u32_t)) = val; } #define GET_CURR_CPU cos_cpuid() @@ -195,6 +205,18 @@ cos_thdid(void) return cos_get_thd_id(); } +static void * +cos_get_slthd_ptr(void) +{ + return (void *)get_stk_data(SLTHDPTR_OFFSET); +} + +static void +cos_set_slthd_ptr(void *ptr) +{ + set_stk_data(SLTHDPTR_OFFSET, (long)ptr); +} + #define ERR_THROW(errval, label) \ do { \ ret = errval; \ @@ -210,12 +232,36 @@ cos_spd_id(void) static inline void * cos_get_heap_ptr(void) { - return (void *)cos_comp_info.cos_heap_ptr; + /* page at heap_ptr is actually the SCB_PAGE for any component. */ + unsigned int off = COS_SCB_SIZE + (PAGE_SIZE * NUM_CPU); + void *heap_ptr = ((void *)(cos_comp_info.cos_heap_ptr + off)); + + return heap_ptr; +} + +static inline struct cos_scb_info * +cos_scb_info_get(void) +{ + return (struct cos_scb_info *)(cos_comp_info.cos_heap_ptr); +} + +static inline struct cos_scb_info * +cos_scb_info_get_core(void) +{ + return cos_scb_info_get() + cos_cpuid(); +} + +static inline struct cos_dcb_info * +cos_init_dcb_get(void) +{ + /* created at boot-time for the first component in the system! */ + return (struct cos_dcb_info *)(cos_comp_info.cos_heap_ptr + COS_SCB_SIZE + (PAGE_SIZE * cos_cpuid())); } static inline void cos_set_heap_ptr(void *addr) { + /* FIXME: fix this for the hack if it's not going to work! 
*/ cos_comp_info.cos_heap_ptr = (vaddr_t)addr; } diff --git a/src/components/include/cos_dcb.h b/src/components/include/cos_dcb.h new file mode 100644 index 0000000000..1fc6298da6 --- /dev/null +++ b/src/components/include/cos_dcb.h @@ -0,0 +1,28 @@ +#ifndef COS_DCB_H +#define COS_DCB_H + +#include +#include + +#define COS_DCB_PERPG_MAX (PAGE_SIZE / sizeof(struct cos_dcb_info)) + +#define COS_DCB_MAX_CAPS (MAX_NUM_THREADS / COS_DCB_PERPG_MAX + 1) + +struct cos_dcbinfo_data { + dcbcap_t dcbcaps[COS_DCB_MAX_CAPS]; + vaddr_t dcbaddr[COS_DCB_MAX_CAPS]; + dcboff_t curr_cap_off; + unsigned short curr_cap; + + struct cos_compinfo *ci; +} CACHE_ALIGNED; + +void cos_dcb_info_init(struct cos_dcbinfo_data *cdi, struct cos_compinfo *ci); +void cos_dcb_info_init_ext(struct cos_dcbinfo_data *cdi, struct cos_compinfo *ci, dcbcap_t initdcbcap, vaddr_t initdcbaddr, dcboff_t start_off); +dcbcap_t cos_dcb_info_alloc(struct cos_dcbinfo_data *cdi, dcboff_t *dcboff, vaddr_t *dcbaddr); + +void cos_dcb_info_init_curr(void); +void cos_dcb_info_init_curr_ext(dcbcap_t initdcbcap, vaddr_t initdcbaddr, dcboff_t st_off); +dcbcap_t cos_dcb_info_alloc_curr(dcboff_t *dcboff, vaddr_t *dcbaddr); + +#endif /* COS_DCB_H */ diff --git a/src/components/include/cos_debug.h b/src/components/include/cos_debug.h index c646c1b977..6e8bb00825 100644 --- a/src/components/include/cos_debug.h +++ b/src/components/include/cos_debug.h @@ -9,7 +9,7 @@ #endif #ifndef PRINT_FN -#define PRINT_FN prints +#define PRINT_FN PRINTC #endif #include diff --git a/src/components/include/cos_defkernel_api.h b/src/components/include/cos_defkernel_api.h index fa083c27ef..b98796c129 100644 --- a/src/components/include/cos_defkernel_api.h +++ b/src/components/include/cos_defkernel_api.h @@ -36,7 +36,7 @@ struct cos_aep_info { thdid_t tid; arcvcap_t rcv; cos_aepthd_fn_t fn; - void * data; + void *data; }; /* Default Component information */ @@ -53,7 +53,7 @@ cos_aepthd_fn(void *data) { struct cos_aep_info *aep_info = (struct 
cos_aep_info *)data; cos_aepthd_fn_t aep_fn = aep_info->fn; - void * fn_data = aep_info->data; + void *fn_data = aep_info->data; (aep_fn)(aep_info->rcv, fn_data); @@ -81,6 +81,7 @@ struct cos_aep_info *cos_sched_aep_get(struct cos_defcompinfo *defci); * capabilities layout. */ void cos_defcompinfo_init(void); +void cos_defcompinfo_llinit(void); /* * cos_defcompinfo_init_ext: initialize the current component's global cos_defcompinfo struct using the parameters * passed. @@ -96,44 +97,49 @@ void cos_defcompinfo_sched_init(void); * cos_defcompinfo_child_alloc: called to create a new child component including initial capabilities like pgtbl, * captbl, compcap, aep. if is_sched is set, scheduling end-point will also be created for the child component, else, * the current component's scheduler will remain the scheduler for the child component. + * TODO: initdcb cap and initdcb addr? */ int cos_defcompinfo_child_alloc(struct cos_defcompinfo *child_defci, vaddr_t entry, vaddr_t heap_ptr, - capid_t cap_frontier, int is_sched); + capid_t cap_frontier, int is_sched, dcbcap_t *initdcbcap); /* * cos_aep_alloc: creates a new async activation end-point which includes thread, tcap and rcv capabilities. * struct cos_aep_info passed in, must not be stack allocated. */ -int cos_aep_alloc(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data); +int cos_aep_alloc(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, dcbcap_t dcap, dcboff_t doff); /* * cos_aep_alloc: creates a new async activation end-point, using an existing tcap. * struct cos_aep_info passed in, must not be stack allocated. */ -int cos_aep_tcap_alloc(struct cos_aep_info *aep, tcap_t tc, cos_aepthd_fn_t fn, void *data); +int cos_aep_tcap_alloc(struct cos_aep_info *aep, tcap_t tc, cos_aepthd_fn_t fn, void *data, dcbcap_t dcap, dcboff_t doff); /* * cos_initaep_alloc: create an initaep in the @child_dci and using sched->rcv as the parent, sets up cos_sched_ape_get(@child_dci) with the init capabilities. 
* if @sched == NULL, use the current scheduler in cos_sched_aep_get(cos_defcompinfo_get_cur()). * if @is_sched == 0, creates only the init thread (does not need @sched parameter) + * NOTE: dcbuaddr is the address in child_dci page-table. */ -int cos_initaep_alloc(struct cos_defcompinfo *child_dci, struct cos_aep_info *sched, int is_sched); +int cos_initaep_alloc(struct cos_defcompinfo *child_dci, struct cos_aep_info *sched, int is_sched, dcbcap_t dcap); /* * cos_initaep_tcap_alloc: same as cos_initaep_alloc with is_sched == 1, except it doesn't create a new tcap, * uses the tcap passed in @tc. + * NOTE: dcbuaddr is the address in child_dci page-table. */ -int cos_initaep_tcap_alloc(struct cos_defcompinfo *child_dci, tcap_t tc, struct cos_aep_info *sched); +int cos_initaep_tcap_alloc(struct cos_defcompinfo *child_dci, tcap_t tc, struct cos_aep_info *sched, dcbcap_t dcap); /* * cos_aep_alloc_ext: creates a new async activation end-point which includes thread, tcap and rcv capabilities in the child_dci component using sched_aep->rcv. * if @child_dci == NULL, create in the current component. + * NOTE: dcbuaddr is the address in child_dci page-table. */ -int cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *child_dci, struct cos_aep_info *sched_aep, thdclosure_index_t idx); +int cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *child_dci, struct cos_aep_info *sched_aep, thdclosure_index_t idx, dcbcap_t dcap, dcboff_t doff); /* * cos_aep_alloc_ext: creates a new async activation end-point which includes thread, tcap and rcv capabilities in the child_dci component using sched_aep->rcv. * if @child_dci == NULL, create in the current component. + * NOTE: dcbuaddr is the address in child_dci page-table. 
*/ -int cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *child_dci, struct cos_aep_info *sched_aep, tcap_t tc, thdclosure_index_t idx); +int cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *child_dci, struct cos_aep_info *sched_aep, tcap_t tc, thdclosure_index_t idx, dcbcap_t dcap, dcboff_t doff); /* * cos_defswitch: thread switch api using the default scheduling tcap and rcv. diff --git a/src/components/include/cos_kernel_api.h b/src/components/include/cos_kernel_api.h index 911f025e01..542290774d 100644 --- a/src/components/include/cos_kernel_api.h +++ b/src/components/include/cos_kernel_api.h @@ -54,6 +54,9 @@ typedef capid_t compcap_t; typedef capid_t captblcap_t; typedef capid_t pgtblcap_t; typedef capid_t hwcap_t; +typedef capid_t scbcap_t; +typedef capid_t dcbcap_t; +typedef unsigned short dcboff_t; /* Memory source information */ struct cos_meminfo { @@ -81,7 +84,7 @@ struct cos_compinfo { }; void cos_compinfo_init(struct cos_compinfo *ci, pgtblcap_t pgtbl_cap, captblcap_t captbl_cap, compcap_t comp_cap, - vaddr_t heap_ptr, capid_t cap_frontier, struct cos_compinfo *ci_resources); + vaddr_t heap_ptr, capid_t cap_frontier, struct cos_compinfo *ci_resources); /* * This only needs be called on compinfos that are managing resources * (i.e. likely only one). All of the capabilities will be relative @@ -107,24 +110,35 @@ int cos_pgtbl_intern_expandwith(struct cos_compinfo *ci, pgtblcap_t intern, vadd * This uses the next three functions to allocate a new component and * correctly populate ci (allocating all resources from ci_resources). 
*/ -int cos_compinfo_alloc(struct cos_compinfo *ci, vaddr_t heap_ptr, capid_t cap_frontier, vaddr_t entry, - struct cos_compinfo *ci_resources); +int cos_compinfo_alloc(struct cos_compinfo *ci, scbcap_t sc, vaddr_t heap_ptr, capid_t cap_frontier, vaddr_t entry, struct cos_compinfo *ci_resources); captblcap_t cos_captbl_alloc(struct cos_compinfo *ci); pgtblcap_t cos_pgtbl_alloc(struct cos_compinfo *ci); -compcap_t cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, vaddr_t entry); +compcap_t cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, scbcap_t scbc, vaddr_t entry, + vaddr_t scb_addr); +scbcap_t cos_scb_alloc(struct cos_compinfo *ci); +dcbcap_t cos_dcb_alloc(struct cos_compinfo *ci, pgtblcap_t ptc, vaddr_t dcb_uaddr); typedef void (*cos_thd_fn_t)(void *); -thdcap_t cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, cos_thd_fn_t fn, void *data); -thdcap_t cos_thd_alloc_ext(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t idx); +thdcap_t cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, cos_thd_fn_t fn, void *data, dcbcap_t dc, + dcboff_t dcboff); +thdcap_t cos_thd_alloc_ext(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t idx, dcbcap_t dc, + dcboff_t dcboff); /* Create the initial (cos_init) thread */ -thdcap_t cos_initthd_alloc(struct cos_compinfo *ci, compcap_t comp); +thdcap_t cos_initthd_alloc(struct cos_compinfo *ci, compcap_t comp, dcbcap_t dc); +int cos_thd_migrate(struct cos_compinfo *ci, thdcap_t thdc, cpuid_t core); +/* update the thdcap to migrated core */ +int cos_thdcap_migrate(struct cos_compinfo *ci, thdcap_t thdc); sinvcap_t cos_sinv_alloc(struct cos_compinfo *srcci, compcap_t dstcomp, vaddr_t entry, invtoken_t token); -arcvcap_t cos_arcv_alloc(struct cos_compinfo *ci, thdcap_t thdcap, tcap_t tcapcap, compcap_t compcap, arcvcap_t enotif); +arcvcap_t cos_arcv_alloc(struct cos_compinfo *ci, thdcap_t thdcap, tcap_t tcapcap, compcap_t compcap, + arcvcap_t enotif); asndcap_t 
cos_asnd_alloc(struct cos_compinfo *ci, arcvcap_t arcvcap, captblcap_t ctcap); void *cos_page_bump_alloc(struct cos_compinfo *ci); void *cos_page_bump_allocn(struct cos_compinfo *ci, size_t sz); +void *cos_dcbpg_bump_allocn(struct cos_compinfo *ci, size_t sz); +void *cos_scbpg_bump_allocn(struct cos_compinfo *ci, size_t sz); + capid_t cos_cap_cpy(struct cos_compinfo *dstci, struct cos_compinfo *srcci, cap_t srcctype, capid_t srccap); int cos_cap_cpy_at(struct cos_compinfo *dstci, capid_t dstcap, struct cos_compinfo *srcci, capid_t srccap); @@ -152,10 +166,11 @@ int cos_thd_mod(struct cos_compinfo *ci, thdcap_t c, void *tls_addr); /* set tls int cos_sched_asnd(asndcap_t snd, tcap_time_t timeout, arcvcap_t srcv, sched_tok_t stok); /* returns 0 on success and -EINVAL on failure */ int cos_asnd(asndcap_t snd, int yield); -/* returns non-zero if there are still pending events (i.e. there have been pending snds) */ -int cos_rcv(arcvcap_t rcv, rcv_flags_t flags, int *rcvd); +/* returns 0 on success */ +int cos_rcv(arcvcap_t rcv, rcv_flags_t flags); /* returns the same value as cos_rcv, but also information about scheduling events */ -int cos_sched_rcv(arcvcap_t rcv, rcv_flags_t flags, tcap_time_t timeout, int *rcvd, thdid_t *thdid, int *blocked, cycles_t *cycles, tcap_time_t *thd_timeout); +int cos_sched_rcv(arcvcap_t rcv, rcv_flags_t flags, tcap_time_t timeout, thdid_t *thdid, int *blocked, + cycles_t *cycles, tcap_time_t *thd_timeout); int cos_introspect(struct cos_compinfo *ci, capid_t cap, unsigned long op); @@ -188,11 +203,13 @@ int cos_tcap_merge(tcap_t dst, tcap_t rm); /* Hardware (interrupts) operations */ hwcap_t cos_hw_alloc(struct cos_compinfo *ci, u32_t bitmap); int cos_hw_attach(hwcap_t hwc, hwid_t hwid, arcvcap_t rcvcap); +int cos_hw_periodic_attach(hwcap_t hwc, hwid_t hwid, arcvcap_t rcvcap, unsigned int period); int cos_hw_detach(hwcap_t hwc, hwid_t hwid); void *cos_hw_map(struct cos_compinfo *ci, hwcap_t hwc, paddr_t pa, unsigned int len); int 
cos_hw_cycles_per_usec(hwcap_t hwc); int cos_hw_cycles_thresh(hwcap_t hwc); capid_t cos_capid_bump_alloc(struct cos_compinfo *ci, cap_t cap); +vaddr_t cos_page_bump_intern_valloc(struct cos_compinfo *ci, size_t sz); #endif /* COS_KERNEL_API_H */ diff --git a/src/components/include/cos_omp.h b/src/components/include/cos_omp.h new file mode 100644 index 0000000000..8933449ae9 --- /dev/null +++ b/src/components/include/cos_omp.h @@ -0,0 +1,50 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. + * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#ifndef COS_OMP_H +#define COS_OMP_H + +#include +#include +#include + +#define COS_OMP_MAX_NUM_THREADS (PART_MAX_THDS) + +struct cos_icv_data_env { + unsigned dyn_var; + unsigned nest_var; + unsigned nthreads_var; + unsigned run_sched_var; + unsigned bind_var; + unsigned thread_limit_var; + unsigned active_levels_var; + unsigned levels_var; + unsigned default_device_var; +}; + +struct cos_icv_global_env { + unsigned cancel_var; + unsigned max_task_priority_var; +}; + +struct cos_icv_implicittask_env { + unsigned place_partition_var; +}; + +struct cos_icv_device_env { + unsigned def_sched_var; + unsigned stacksize_var; + unsigned wait_policy_var; + unsigned max_active_levels_var; +}; + +extern void cos_omp_icv_data_init(struct cos_icv_data_env *icvde); +extern void cos_omp_icv_implitsk_init(struct cos_icv_implicittask_env *icvite); +extern void cos_omp_icv_device_init(struct cos_icv_device_env *icvdve, unsigned dev_no); +extern void cos_omp_init(void); + +#endif /* COS_OMP_H */ diff --git a/src/components/include/cos_rdtsc.h b/src/components/include/cos_rdtsc.h new file mode 100644 index 0000000000..d8ebfad445 --- /dev/null +++ b/src/components/include/cos_rdtsc.h @@ -0,0 +1,65 @@ +#ifndef COS_RDTSC_H +#define COS_RDTSC_H + +#include + +#define COS_RDTSCP_CALIB_ITERS 1000000 + +#define cos_rdtsc rdtscll + +/* Copied from seL4bench 
*/ +#define cos_rdtscp(var) do { \ + u32_t low, high; \ + asm volatile( \ + "movl $0, %%eax \n" \ + "movl $0, %%ecx \n" \ + "cpuid \n" \ + "rdtsc \n" \ + "movl %%edx, %0 \n" \ + "movl %%eax, %1 \n" \ + "movl $0, %%eax \n" \ + "movl $0, %%ecx \n" \ + "cpuid \n" \ + : \ + "=r"(high), \ + "=r"(low) \ + : \ + : "eax", "ebx", "ecx", "edx" \ + ); \ + (var) = (((u64_t)high) << 32ull) | ((u64_t)low); \ +} while(0) + +/* + * use this to calibrate the rdtscp and perhaps use + * min value to remove from your benchmarks + */ +static inline void +cos_rdtscp_calib(cycles_t *min, cycles_t *avg, cycles_t *max) +{ + int i; + volatile cycles_t st, en, mn = 0, mx = 0, total = 0; + + cos_rdtscp(st); + cos_rdtscp(en); + mn = mx = en - st; + + for (i = 0; i < COS_RDTSCP_CALIB_ITERS; i++) { + cycles_t diff; + + cos_rdtscp(st); + cos_rdtscp(en); + + diff = en - st; + total += diff; + if (diff < mn) mn = diff; + if (diff > mx) mx = diff; + } + + if (min) *min = mn; + if (max) *max = mx; + if (avg) *avg = total / COS_RDTSCP_CALIB_ITERS; + + return; +} + +#endif /* COS_RDTSC_H */ diff --git a/src/components/include/cos_ulsched_rcv.h b/src/components/include/cos_ulsched_rcv.h new file mode 100644 index 0000000000..60ff25d795 --- /dev/null +++ b/src/components/include/cos_ulsched_rcv.h @@ -0,0 +1,80 @@ +#ifndef COS_ULSCHED_RCV_H +#define COS_ULSCHED_RCV_H + +#include + +static inline int +__cos_sched_events_present(struct cos_sched_ring *r) +{ + return (ps_load(&r->tail) != ps_load(&r->head)); +} + +static inline int +cos_sched_ispending(void) +{ + struct cos_scb_info *scb_cpu = cos_scb_info_get_core(); + struct cos_sched_ring *r = &scb_cpu->sched_events; + + return ps_load(&r->more); +} + +static inline int +cos_sched_events_isempty(void) +{ + struct cos_scb_info *scb_cpu = cos_scb_info_get_core(); + struct cos_sched_ring *r = &scb_cpu->sched_events; + + return (ps_load(&r->tail) == ps_load(&r->head)) && !ps_load(&r->more); +} + +static inline int +__cos_sched_event_consume(struct 
cos_sched_ring *r, struct cos_sched_event *e) +{ + int f = 0; + + if (unlikely(!r || !__cos_sched_events_present(r))) return 0; + assert(e); + f = ps_upfaa((unsigned long *)&r->head, 1); + *e = r->event_buf[f]; +// memcpy((void *)e, (void *)&(r->event_buf[f]), sizeof(struct cos_sched_event)); + + return 1; +} + +/* if other than sched-thread calls this, races will need to be handled by the caller! */ +static inline int +cos_ul_sched_rcv(arcvcap_t rcv, rcv_flags_t rfl, tcap_time_t timeout, struct cos_sched_event *evt) +{ + int ret = 0; + struct cos_scb_info *scb_cpu = cos_scb_info_get_core(); + struct cos_sched_ring *r = &scb_cpu->sched_events; + + evt->tid = 0; + assert(scb_cpu); + /* a non-scheduler thread, should call with rcv == 0 to consume user-level events alone */ + if (__cos_sched_event_consume(r, evt) == 0 + && rcv && !(rfl & RCV_ULONLY)) { + + ret = cos_sched_rcv(rcv, rfl, timeout, &(evt->tid), (int *)&(evt->evt.blocked), + (cycles_t *)&(evt->evt.elapsed_cycs), (tcap_time_t *)&(evt->evt.next_timeout)); + if (unlikely(ret < 0)) return ret; + } + + return (ret || __cos_sched_events_present(r) || cos_sched_ispending()); +} + +static inline int +cos_ul_rcv(arcvcap_t rcv, rcv_flags_t rfl, tcap_time_t sched_timeout) +{ + struct cos_sched_event ev = { .tid = 0 }; + int ret = 0; + + if (likely(sched_timeout)) rfl |= RCV_SCHEDTIMEOUT; + ret = cos_sched_rcv(rcv, rfl, sched_timeout, &(ev.tid), (int *)&(ev.evt.blocked), + (cycles_t *)&(ev.evt.elapsed_cycs), (tcap_time_t *)&(ev.evt.next_timeout)); + assert(ev.tid == 0); + + return ret; +} + +#endif /* COS_ULSCHED_RCV_H */ diff --git a/src/components/include/crt_blkpt.h b/src/components/include/crt_blkpt.h new file mode 100644 index 0000000000..d647dc50d9 --- /dev/null +++ b/src/components/include/crt_blkpt.h @@ -0,0 +1,298 @@ +#ifndef CRT_BLKPT_H +#define CRT_BLKPT_H + +#include +#include +#include + +/*** + * The event count/block point is an abstraction to synchronize the + * blocking behavior of different threads 
on abstract events. The + * events are usually tied to a specific state of another + * data-structure (into which the blkpt is embedded). For example, a + * lock is taken and released thus generating an event for any + * blocking threads, or a ring buffer has a data item inserted into + * it, thus generating an event for any threads waiting for + * data. Concretely, we want a number of threads to be able to block, + * and a thread to be able to wake up one, or all of them. The + * challenge is solving a single race-condition: + * + * thd 0: check data-structure, determine the need for blocking and + * waiting for an event + * thd 0: preemption, switching to thd 1 + * thd 1: check data-structure, determine that an event is generated + * thd 1: call the scheduler, and wake all blocked threads (not + * including thd 0 yet) + * thd 1: preempt, and switch to thd 0 + * thd 0: call scheduler to block + * + * The resulting state is that thd 1 should have unblocked thd 0, but + * due to a race, the thd 0 will be blocked awaiting the *next* event + * that may never come. Event counts are meant to solve this + * problem. Traditional systems solve this problem using condition + * variables and a lock around the scheduling logic, but if you want + * to decouple the data-structure from the scheduler (e.g. as they are + * in different modes, or components), this is a fundamental problem. + * + * The event count abstraction: + * + * Assume the data-structure generating events has at least three + * states: + * S0: available + * S1: unavailable + * S2: unavailable & subscribed + * + * The transitions within the data-structure are: + * {S0->S1, S1->S0, S1->S2, S2->S0} + * + * Every transition into S0 is an abstract *event*. Threads that look + * at the state of the data-structure, and must block waiting for its + * state to change, wait for such an event to wakeup. + * + * The data-structure must define its own mapping to this state + * machine. 
A few examples: + * + * Mutexes: + * S0: Not locked. + * S1: Locked and held by thread 0. + * S2: Locked and held by thread 0, and threads 1...N contend the lock + * + * Ring buffer (for simplicity, assuming it never fills): + * S0: data items in ring buffer + * S1: no data in ring buffer + * S2: no data in ring buffer, and thread(s) are waiting for data + * + * The event counts are used to track the threads that use the + * data-structure when transitioning from S1->S2 (block thread), when + * it is in S2 (block additional threads), and when it transitions + * from S2->S0 (wakeup blocked threads). + * + * The event count is used in the following way: + * + * S0->S1: + * data-structure (DS) operation + * E.g. not locked -> locked, or + * dequeue from ring with single data item + * + * S1->S0: + * blkpt_checkpoint(ec) (not used) + * data-structure (DS) operation + * assert(blkpt_has_blocked(ec) == false) (as we're in S1) + * blkpt_trigger(ec) (won't do much as noone is blocked) + * E.g. unlock with no contention, or + * enqueue with no dequeuing threads + * + * S1->S2: + * cp = blkpt_checkpoint(ec) + * data-structure (DS) operation, determine we need to await event + * blkpt_wait(ec, cp) + * retry (this is why event counts can be used with lock-free data-structs) + * E.g. locked -> contended + * empty ring -> waiting for data + * + * S2->S0: + * data-structure (DS) operation + * assert(blkpt_has_blocked(ec) == true) (as we're in S2) + * blkpt_trigger(ec) (wake blocked threads!) + * E.g. unlock with contention, or + * enqueue with dequeuing threads + * + * Event count *optimization*: + * + * We prevent the race above using an epoch (count) for the events + * thus the name. However, to avoid rapid wraparound on the epoch, we + * only increment the epoch when the race condition is possible. That + * is to say, we only increment the event count when the + * data-structure has blocked threads. 
This not only delays + * wraparound, it also will avoid an atomic instruction for all + * operations that don't involve blocked threads (a common-case, + * exemplified by futexes, for example). + * + * Usage optimization: + * + * Because of the event counter optimization to only use expensive + * operations when triggering there are blocked threads, the user of + * this API can trigger whenever transitioning back to S0. + */ + +struct crt_blkpt { + sched_blkpt_id_t id; + /* most significant bit specifies blocked thds */ + sched_blkpt_epoch_t epoch_blocked; +}; + +struct crt_blkpt_checkpoint { + sched_blkpt_epoch_t epoch_blocked; +}; + +typedef enum { + CRT_BLKPT_UNIPROC = 1, /* are the event operations only called on a single core? */ + CRT_BLKPT_CRIT_SECT = 2, /* is only one thread ever going to trigger at a time? */ +} crt_blkpt_flags_t; + +#define CRT_BLKPT_EPOCH_BLKED_BITS (sizeof(sched_blkpt_epoch_t) * 8) +#define CRT_BLKPT_BLKED_MASK (1 << (CRT_BLKPT_EPOCH_BLKED_BITS - 2)) +#define CRT_BLKPT_BLKED(e) ((e) & CRT_BLKPT_BLKED_MASK) +#define CRT_BLKPT_EPOCH(e) ((e) & ~CRT_BLKPT_BLKED_MASK) + +/* Return != 0 on failure: no ids to allocate */ +static inline int +crt_blkpt_init(struct crt_blkpt *blkpt) +{ + sched_blkpt_id_t id; + + id = sched_blkpt_alloc(); + if (id == SCHED_BLKPT_NULL) return -1; + + *blkpt = (struct crt_blkpt){ + .id = id, + .epoch_blocked = 0 + }; + + return 0; +} + +static inline int +crt_blkpt_teardown(struct crt_blkpt *blkpt) +{ + return sched_blkpt_free(blkpt->id); +} + +/* Internal APIs that must be inlined to remove the branches */ +static inline int +__crt_blkpt_atomic_trigger(sched_blkpt_epoch_t *ec, sched_blkpt_epoch_t chkpt, crt_blkpt_flags_t flags) +{ + /* + * Assume that the most significant bit is the blocked + * indicator. This math might reset it to zero, which we want + * to do anyway (as part of CRT_BLKPT_EPOCH). 
+ */ + sched_blkpt_epoch_t new = CRT_BLKPT_EPOCH(chkpt + 1); + + /* inlined so that constant propagation will get rid of condition */ + if (flags == CRT_BLKPT_UNIPROC) { + return ps_upcas(ec, chkpt, new); + } else { + return ps_cas(ec, chkpt, new); + } + /* TODO: faa for CRT_BLKPT_CRIT_SECT? */ +} + +/* + * If we return 1, then the caller will attempt to block, otherwise, + * return 0 and it will re-check the data-structure assuming that + * something happened in the mean time. + */ +static inline int +__crt_blkpt_atomic_wait(sched_blkpt_epoch_t *ec, sched_blkpt_epoch_t chkpt, crt_blkpt_flags_t flags) +{ + sched_blkpt_epoch_t cached = ps_load(ec); + sched_blkpt_epoch_t new = cached | CRT_BLKPT_BLKED_MASK; + int ret; + + /* + * We are the second or later blocker. Blocked already + * set. We're done here. + * + * It isn't clear if it is better to have the additional + * branch here for this to avoid atomic instructions, or to + * just always do the atomic instructions and possibly fail. + */ + if (cached == new) return 1; + + /* function is inlined so that constant propagation will get rid of condition */ + if (flags == CRT_BLKPT_UNIPROC) { + ret = ps_upcas(ec, chkpt, new); + } else { + ret = ps_cas(ec, chkpt, new); + } + if (unlikely(!ret)) { + /* + * CAS failure can mean that 1. another thread + * blocked, and set the blocked bit, or 2. an event is + * triggered. In the former case, we still want to + * block. In the latter case, we want to go back to + * the data-structure. + */ + return ps_load(ec) == new; /* same epoch with blocked set? == success */ + } + + return 1; +} + +/* Trigger an event, waking blocked threads. */ +static inline void +crt_blkpt_trigger(struct crt_blkpt *blkpt, crt_blkpt_flags_t flags) +{ + /* + * Note that the flags should likely be passed in statically, + * as constants. That way they will be inlined the conditions + * in the *_atomic_* function will be removed. 
+ */ + sched_blkpt_epoch_t saved = ps_load(&blkpt->epoch_blocked); + + /* The optimization: don't increment events if noone's listening */ + if (likely(!CRT_BLKPT_BLKED(saved))) return; + + /* slow(er) path for when we have blocked threads */ + if (!__crt_blkpt_atomic_trigger(&blkpt->epoch_blocked, saved, flags)) { + /* + * Race here between triggering threads. In this case, + * someone else already incremented the epoch and + * unblocked the threads. Yeah, helping algorithms! + */ + return; + } + /* + * Note that there is a race here. Multiple threads triggering + * events might pass different epochs down to the next + * level. This is OK as the next level always takes the epoch + * = max(epoch, ...) (for some wraparound-aware version of + * max). + */ + sched_blkpt_trigger(blkpt->id, CRT_BLKPT_EPOCH(saved + 1), 0); +} + +/* Wake only a single, specified thread (tracked manually in the data-structure) */ +/* void crt_blkpt_trigger_one(struct crt_blkpt *blkpt, crt_blkpt_flags_t flags, cos_thdid_t thdid); */ + +/* + * Checkpoint the state of the current event counter. This checkpoint + * is the one that is active during our operations on the + * data-structure. If we determine that we want to wait for an event + * (thus blocking), then the state of the checkpoint will be compared + * versus the state of the event counter to see if we're working off + * of outdated information. + */ +static inline void +crt_blkpt_checkpoint(struct crt_blkpt *blkpt, struct crt_blkpt_checkpoint *chkpt) +{ + chkpt->epoch_blocked = ps_load(&blkpt->epoch_blocked); +} + +/* Wait for an event. */ +static inline void +crt_blkpt_wait(struct crt_blkpt *blkpt, crt_blkpt_flags_t flags, struct crt_blkpt_checkpoint *chkpt) +{ + /* + * If blocked is already set, we can try and block + * directly. Otherwise, go through and try to atomically set + * it. If that fails, then either epoch or blocked has been + * updated, so return and try accessing the data-structure + * again. 
+ */ + if (!CRT_BLKPT_BLKED(chkpt->epoch_blocked) && + !__crt_blkpt_atomic_wait(&blkpt->epoch_blocked, chkpt->epoch_blocked, flags)) return; + + if (unlikely(sched_blkpt_block(blkpt->id, CRT_BLKPT_EPOCH(chkpt->epoch_blocked), 0))) { + BUG(); /* we are using a blkpt id that doesn't exist! */ + } +} + +/* + * Create an execution dependency on the specified thread for, + * e.g. priority inheritance. + */ +/* void crt_blkpt_wait_dep(struct crt_blkpt *blkpt, crt_blkpt_flags_t flags, struct crt_blkpt_checkpoint *chkpt, cos_thdid_t thdid); */ + +#endif /* CRT_BLKPT_H */ diff --git a/src/components/include/crt_chan.h b/src/components/include/crt_chan.h new file mode 100644 index 0000000000..5f4267bb8a --- /dev/null +++ b/src/components/include/crt_chan.h @@ -0,0 +1,323 @@ +/* + * Copyright 2019, Gabriel Parmer, GWU, gparmer@gwu.edu. + * + * This uses a two clause BSD License. + */ + +#ifndef CRT_CHAN_H +#define CRT_CHAN_H + +/*** + * + */ + +#include +#include +#include +#include +#include + +struct crt_chan { + u32_t producer; + /* If the ring is empty, recving threads will block on this blkpt. */ + struct crt_blkpt empty; + char _padding1[CACHE_LINE * 2 - (sizeof(struct crt_blkpt) + sizeof(u32_t))]; + u32_t consumer; + /* If the ring is full, sending thread will block on this blkpt. */ + struct crt_blkpt full; + char _padding2[CACHE_LINE * 2 - (sizeof(struct crt_blkpt) + sizeof(u32_t))]; + /* + * @item_sz is a power of two and corresponds to the + * wraparound_mask. The number of data items that the channel + * can hold is item_sz - 1. @wraparound_mask = nslots-1 (were + * nslots is a power of two) + */ + u32_t item_sz, wraparound_mask; + u32_t nslots; + /* FIXME: p2p channels only SINGLE-CORE for now! */ + unsigned long sender, receiver; /* for p2p channels, sl_thd pointers + MSB for blocked on channel send/recv.. */ + /* The memory for the channel. 
*/ + char mem[0]; +}; + +/* produce a statically allocated channel: the struct plus its backing buffer */ +#define CRT_CHAN_STATIC_ALLOC(name, type, nslots) \ +struct __crt_chan_envelope_##name { \ + struct crt_chan c; \ + char mem[nslots * sizeof(type)]; \ +} __##name; \ +struct crt_chan *name = &__##name.c + +#define CRT_CHAN_TYPE_PROTOTYPES(name, type, nslots) \ +static inline int \ +crt_chan_init_##name(struct crt_chan *c) \ +{ return crt_chan_init(c, sizeof(type), nslots); } \ +static inline int \ +crt_chan_p2p_init_##name(struct crt_chan *c, struct sl_thd *sndr, struct sl_thd *rcvr) \ +{ return crt_chan_p2p_init(c, sizeof(type), nslots, sndr, rcvr); } \ +static inline void \ +crt_chan_teardown_##name(struct crt_chan *c) \ +{ crt_chan_teardown(c); } \ +static inline int \ +crt_chan_empty_##name(struct crt_chan *c) \ +{ return __crt_chan_empty(c, nslots - 1); } \ +static inline int \ +crt_chan_full_##name(struct crt_chan *c) \ +{ return __crt_chan_full(c, nslots - 1); } \ +static inline int \ +crt_chan_send_##name(struct crt_chan *c, void *item) \ +{ \ + assert(pow2(nslots)); \ + return __crt_chan_send(c, item, nslots - 1, sizeof(type)); \ +} \ +static inline int \ +crt_chan_recv_##name(struct crt_chan *c, void *item) \ +{ \ + assert(pow2(nslots)); \ + return __crt_chan_recv(c, item, nslots - 1, sizeof(type)); \ +} \ +static inline int \ +crt_chan_async_send_##name(struct crt_chan *c, void *item) \ +{ \ + assert(pow2(nslots)); \ + if (__crt_chan_produce(c, item, nslots - 1, sizeof(type))) return -EAGAIN; \ + return 0; \ +} \ +static inline int \ +crt_chan_async_recv_##name(struct crt_chan *c, void *item) \ +{ \ + assert(pow2(nslots)); \ + if (__crt_chan_consume(c, item, nslots - 1, sizeof(type))) return -EAGAIN; \ + return 0; \ +} + +#define CRT_CHANCHAN_PROTOTYPES(nslots) \ +CRT_CHAN_TYPE_PROTOTYPES(chan, struct chan *, nslots) + +static inline unsigned int +__crt_chan_buff_idx(struct crt_chan *c, u32_t v, u32_t wraparound_mask) +{ return v & wraparound_mask; } + +static inline int +__crt_chan_full(struct crt_chan *c, u32_t
wraparound_mask) +{ return __crt_chan_buff_idx(c, c->consumer, wraparound_mask) == __crt_chan_buff_idx(c, c->producer + 1, wraparound_mask); } + +static inline int +__crt_chan_empty(struct crt_chan *c, u32_t wraparound_mask) +{ return c->producer == c->consumer; } + +static inline int +__crt_chan_produce(struct crt_chan *c, void *d, u32_t wraparound_mask, u32_t sz) +{ + if (__crt_chan_full(c, wraparound_mask)) return 1; + memcpy(c->mem + (__crt_chan_buff_idx(c, c->producer, wraparound_mask) * sz), d, sz); + c->producer++; + + return 0; +} + +static inline int +__crt_chan_consume(struct crt_chan *c, void *d, u32_t wraparound_mask, u32_t sz) +{ + void *ret; + + if (__crt_chan_empty(c, wraparound_mask)) return 1; + memcpy(d, c->mem + (__crt_chan_buff_idx(c, c->consumer, wraparound_mask) * sz), sz); + c->consumer++; + + return 0; +} + +/* only wake it up if it's blocked on the channel! */ +static inline void +__crt_chan_p2p_wakeup(unsigned long *w) +{ + unsigned long wc, wn; + + sl_cs_enter(); + wc = ps_load(w); + if (likely(wc & (1<<31))) goto blocked; + sl_cs_exit(); + + return; + +blocked: + wn = wc & ~(1<<31); + struct sl_thd *wt = (struct sl_thd *)wn; + if (unlikely(!ps_upcas(w, wc, wn))) BUG(); + sl_thd_wakeup_no_cs(wt); + sl_cs_exit_switchto(wt); +} + +/* block on channel */ +static inline void +__crt_chan_p2p_block(unsigned long *b) +{ + unsigned long bc, bn; + + sl_cs_enter(); + bc = ps_load(b); + assert((bc & (1<<31)) == 0); + bn = bc | (1<<31); + if (unlikely(!ps_upcas(b, bc, bn))) BUG(); + + if (sl_thd_block_no_cs(sl_thd_curr(), SL_THD_BLOCKED, 0)) BUG(); + sl_cs_exit_schedule(); +} + +static inline int +__crt_chan_is_p2p(struct crt_chan *c) +{ + return ((c->sender & ~(1<<31)) && (c->receiver & ~(1<<31))); +} + +/** + * The next two functions pass all of the variables in via arguments, + * so that we can use them for constant propagation along with + * inlining to get rid of the general memcpy code. 
+ */ +static inline int +__crt_chan_send(struct crt_chan *c, void *item, u32_t wraparound_mask, u32_t item_sz) +{ + /* optimizing for p2p */ + if (likely(__crt_chan_is_p2p(c))) { + while (1) { + if (!__crt_chan_produce(c, item, wraparound_mask, item_sz)) { + __crt_chan_p2p_wakeup(&c->receiver); + break; + } + __crt_chan_p2p_block(&c->sender); + } + } else { + while (1) { + struct crt_blkpt_checkpoint chkpt; + + crt_blkpt_checkpoint(&c->full, &chkpt); + if (!__crt_chan_produce(c, item, wraparound_mask, item_sz)) { + /* success! */ + crt_blkpt_trigger(&c->empty, 0); + break; + } + crt_blkpt_wait(&c->full, 0, &chkpt); + } + } + + return 0; +} + +static inline int +__crt_chan_recv(struct crt_chan *c, void *item, u32_t wraparound_mask, u32_t item_sz) +{ + /* optimizing for p2p */ + if (likely(__crt_chan_is_p2p(c))) { + while (1) { + if (!__crt_chan_consume(c, item, wraparound_mask, item_sz)) { + __crt_chan_p2p_wakeup(&c->sender); + break; + } + __crt_chan_p2p_block(&c->receiver); + } + } else { + while (1) { + struct crt_blkpt_checkpoint chkpt; + + crt_blkpt_checkpoint(&c->empty, &chkpt); + if (!__crt_chan_consume(c, item, wraparound_mask, item_sz)) { + /* success! */ + crt_blkpt_trigger(&c->full, 0); + break; + } + crt_blkpt_wait(&c->empty, 0, &chkpt); + } + } + + return 0; +} + + +/* + * We need to know how much to malloc? This function returns that + * requirement. It assumes (and checks) that @slots is a power of two. 
+ */ +static inline int +crt_chan_mem_sz(int item_sz, int slots) +{ + assert(pow2(slots)); + + return sizeof(struct crt_chan) + item_sz * slots; +} + +/* How many slots can we fit into an allocation of a specific mem_sz */ +static inline int +crt_chan_nslots(int item_sz, int mem_sz) +{ + return leqpow2((mem_sz - sizeof(struct crt_chan)) / item_sz); +} + +static inline int +crt_chan_init(struct crt_chan *c, int item_sz, int slots) +{ + assert(pow2(slots)); + if (crt_blkpt_init(&c->empty)) return -1; + if (crt_blkpt_init(&c->full)) return -1; + c->nslots = slots; + c->item_sz = item_sz; + c->wraparound_mask = slots - 1; /* slots is a pow2 */ + c->sender = c->receiver = 0; + + return 0; +} + +static inline int +crt_chan_p2p_init(struct crt_chan *c, int item_sz, int slots, + struct sl_thd *sndr, struct sl_thd *rcvr) +{ + int r = crt_chan_init(c, item_sz, slots); + assert(sndr && rcvr); + + /* FIXME: only single-core for now! */ + if (r) return r; /* crt_chan_init returns -1 on failure, 0 on success */ + c->sender = (unsigned long)sndr; + c->receiver = (unsigned long)rcvr; + assert((c->sender & (1<<31)) == 0); + assert((c->receiver & (1<<31)) == 0); + + return 0; +} + +static inline void +crt_chan_teardown(struct crt_chan *c) +{ + crt_blkpt_teardown(&c->empty); + crt_blkpt_teardown(&c->full); +} + +/* User-facing send and receive APIs: */ + +static inline int +crt_chan_send(struct crt_chan *c, void *item) +{ + return __crt_chan_send(c, item, c->wraparound_mask, c->item_sz); +} + +static inline int +crt_chan_recv(struct crt_chan *c, void *item) +{ + return __crt_chan_recv(c, item, c->wraparound_mask, c->item_sz); +} + +static inline int +crt_chan_async_send(struct crt_chan *c, void *item) +{ + if (__crt_chan_produce(c, item, c->wraparound_mask, c->item_sz)) return -EAGAIN; + return 0; +} + +static inline int +crt_chan_async_recv(struct crt_chan *c, void *item) +{ + if (__crt_chan_consume(c, item, c->wraparound_mask, c->item_sz)) return -EAGAIN; + return 0; +} + +#endif /* CRT_CHAN_H */ diff --git
a/src/components/include/crt_lock.h b/src/components/include/crt_lock.h new file mode 100644 index 0000000000..95e901e52b --- /dev/null +++ b/src/components/include/crt_lock.h @@ -0,0 +1,59 @@ +#ifndef CRT_LOCK_H +#define CRT_LOCK_H + +/*** + * Simple blocking lock. Uses blockpoints to enable the blocking and + * waking of contending threads. This has little to no intelligence, + * for example, not expressing dependencies for PI. + */ + +#include +#include + +struct crt_lock { + unsigned long owner; + struct crt_blkpt blkpt; +}; + +static inline int +crt_lock_init(struct crt_lock *l) +{ + l->owner = 0; + + return crt_blkpt_init(&l->blkpt); +} + +static inline int +crt_lock_teardown(struct crt_lock *l) +{ + assert(l->owner == 0); + + return crt_blkpt_teardown(&l->blkpt); +} + +static inline void +crt_lock_take(struct crt_lock *l) +{ + struct crt_blkpt_checkpoint chkpt; + + while (1) { + crt_blkpt_checkpoint(&l->blkpt, &chkpt); + + if (ps_cas(&l->owner, 0, (unsigned long)(cos_cpuid() << 16 | cos_thdid()))) { + return; /* success! */ + } + /* failure: try and block */ + crt_blkpt_wait(&l->blkpt, 0, &chkpt); + } +} + +static inline void +crt_lock_release(struct crt_lock *l) +{ + assert(l->owner == (unsigned long)(cos_cpuid() << 16 | cos_thdid())); + l->owner = 0; + /* if there are blocked threads, wake 'em up! */ + crt_blkpt_trigger(&l->blkpt, 0); +} + +#endif /* CRT_LOCK_H */ diff --git a/src/components/include/deque.h b/src/components/include/deque.h new file mode 100644 index 0000000000..696eb5781c --- /dev/null +++ b/src/components/include/deque.h @@ -0,0 +1,112 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ +#ifndef DEQUE_H +#define DEQUE_H + +/* + * This was implemented by referring to: + * https://github.com/cpp-taskflow/cpp-taskflow/blob/9c28ccec910346a9937c40db7bdb542262053f9c/taskflow/executor/workstealing.hpp + * + * which is based on the following papers: + * + * The work stealing queue described in the paper, "Dynamic Circular Work-stealing Deque," SPAA, 2015. + * Only the queue owner can perform pop and push operations, while others can steal data from the queue. + * + * PPoPP implementation paper, "Correct and Efficient Work-Stealing for Weak Memory Models" + * https://www.di.ens.fr/~zappa/readings/ppopp13.pdf + */ +#define DEQUE_MAX_SZ (1<<14) + +#define DEQUE_PROTOTYPE(name, type) \ +struct deque_##name { \ + type wrk[DEQUE_MAX_SZ]; \ + long size; \ + \ + volatile long top; \ + volatile long bottom; \ +}; \ + \ +static inline void \ +deque_init_##name(struct deque_##name *q, size_t sz) \ +{ \ + memset(q, 0, sizeof(struct deque_##name)); \ + \ + if (sz) { \ + /* only for size with pow of 2 */ \ + /* assert((sz & (sz - 1)) == 0); */ \ + assert(sz <= DEQUE_MAX_SZ); \ + } else { \ + sz = DEQUE_MAX_SZ; \ + } \ + \ + q->size = sz; \ +} \ + \ +/* Use mutual exclusion locks around push/pop if multi-threaded. */ \ +static inline int \ +deque_push_##name(struct deque_##name *q, type *w) \ +{ \ + long ct, cb; \ + \ + ct = ps_load((unsigned long *)&q->top); \ + cb = ps_load((unsigned long *)&q->bottom); \ + \ + /* nope, fixed size only */ \ + if (q->size - 1 < (cb - ct)) return -ENOSPC; \ + \ + q->wrk[cb] = *w; \ + ps_mem_fence(); \ + if (!ps_upcas((unsigned long *)&q->bottom, cb, cb + 1)) assert(0); \ + \ + return 0; \ +} \ + \ +/* Use mutual exclusion locks around push/pop if multi-threaded. 
*/ \ +static inline int \ +deque_pop_##name(struct deque_##name *q, type *w) \ +{ \ + long ct = 0, sz = 0; \ + long cb = ps_load((unsigned long *)&q->bottom) - 1; \ + int ret = 0; \ + \ + if (!ps_upcas((unsigned long *)&q->bottom, cb + 1, cb)) assert(0); \ + \ + ct = ps_load((unsigned long *)&q->top); \ + sz = cb - ct; \ + if (sz < 0) { \ + if (!ps_upcas((unsigned long *)&q->bottom, cb, ct)) assert(0); \ + \ + return -ENOENT; \ + } \ + \ + *w = q->wrk[cb]; \ + if (sz > 0) return 0; \ + \ + ret = ps_cas((unsigned long *)&q->top, ct, ct + 1); \ + if (!ps_upcas((unsigned long *)&q->bottom, cb, ct + 1)) assert(0); \ + if (!ret) { *w = NULL; return -ENOENT; } \ + \ + return 0; \ +} \ + \ +static inline int \ +deque_steal_##name(struct deque_##name *q, type *w) \ +{ \ + long ct, cb; \ + \ + ct = ps_load((unsigned long *)&q->top); \ + cb = ps_load((unsigned long *)&q->bottom); \ + \ + if (ct >= cb) return -ENOENT; \ + \ + *w = q->wrk[ct]; \ + if (!ps_cas((unsigned long *)&q->top, ct, ct + 1)) return -EAGAIN; \ + \ + return 0; \ +} + +#endif /* DEQUE_H */ diff --git a/src/components/include/hypercall.h b/src/components/include/hypercall.h index 83426c1c4a..ee3caeb312 100644 --- a/src/components/include/hypercall.h +++ b/src/components/include/hypercall.h @@ -13,13 +13,15 @@ enum hypercall_cntl { HYPERCALL_COMP_COMPCAP_GET, HYPERCALL_COMP_CAPTBLCAP_GET, HYPERCALL_COMP_PGTBLCAP_GET, - HYPERCALL_COMP_CAPFRONTIER_GET, HYPERCALL_COMP_INITAEP_GET, HYPERCALL_COMP_CHILD_NEXT, HYPERCALL_COMP_CPUBITMAP_GET, + HYPERCALL_COMP_SCHED_GET, HYPERCALL_NUMCOMPS_GET, + + HYPERCALL_ROOT_INITAEP_SET, /* per-core root-scheduler init-aeps created by capmgr and passed to llbooter */ }; static inline int @@ -48,11 +50,12 @@ hypercall_comp_init_done(void) /* Note: This API can be called ONLY by components that manage capability resources */ static inline int -hypercall_comp_initaep_get(spdid_t spdid, int is_sched, struct cos_aep_info *aep) +hypercall_comp_initaep_get(spdid_t spdid, int is_sched, 
struct cos_aep_info *aep, spdid_t *parent_spdid) { thdcap_t thdslot = 0; arcvcap_t rcvslot = 0; tcap_t tcslot = 0; + word_t r3 = 0; struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); int ret = 0; @@ -68,8 +71,8 @@ hypercall_comp_initaep_get(spdid_t spdid, int is_sched, struct cos_aep_info *aep } /* capid_t though is unsigned long, only assuming it occupies 16bits for packing */ - ret = cos_sinv(BOOT_CAPTBL_SINV_CAP, HYPERCALL_COMP_INITAEP_GET, - spdid << 16 | thdslot, rcvslot << 16 | tcslot, 0); + ret = cos_sinv_rets(BOOT_CAPTBL_SINV_CAP, HYPERCALL_COMP_INITAEP_GET, + spdid << 16 | thdslot, rcvslot << 16 | tcslot, 0, (word_t *)&parent_spdid, &r3); if (ret) return ret; aep->thd = thdslot; @@ -80,6 +83,18 @@ hypercall_comp_initaep_get(spdid_t spdid, int is_sched, struct cos_aep_info *aep return 0; } +static inline int +hypercall_root_initaep_set(spdid_t spdid, struct cos_aep_info *aep) +{ + int ret = 0; + + ret = cos_sinv(BOOT_CAPTBL_SINV_CAP, HYPERCALL_ROOT_INITAEP_SET, spdid << 16 | aep->thd, + aep->rcv << 16 | aep->tc, 0); + if (ret) return ret; + + return 0; +} + /* Note: This API can be called ONLY by components that manage capability resources */ static inline int hypercall_comp_info_get(spdid_t spdid, pgtblcap_t *ptslot, captblcap_t *ctslot, compcap_t *compslot, spdid_t *parentid) @@ -176,15 +191,10 @@ hypercall_comp_pgtblcap_get(spdid_t spdid) return ptslot; } -static inline capid_t -hypercall_comp_capfrontier_get(spdid_t spdid) +static inline spdid_t +hypercall_comp_sched_get(spdid_t spdid) { - word_t unused; - capid_t cap_frontier; - - if (cos_sinv_rets(BOOT_CAPTBL_SINV_CAP, HYPERCALL_COMP_CAPFRONTIER_GET, spdid, 0, 0, &cap_frontier, &unused)) return 0; - - return cap_frontier; + return cos_sinv(BOOT_CAPTBL_SINV_CAP, HYPERCALL_COMP_SCHED_GET, spdid, 0, 0); } static inline int diff --git a/src/components/include/omp.h b/src/components/include/omp.h new file mode 100644 index 0000000000..f3312ec5bc --- /dev/null +++ 
b/src/components/include/omp.h @@ -0,0 +1,174 @@ +/* Copyright (C) 2005-2017 Free Software Foundation, Inc. + Contributed by Richard Henderson . + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* + * NOTE: This header is from gcc 7 customized + * to support only what is required in our environment + */ +#ifndef _OMP_H +#define _OMP_H 1 + +#ifndef _LIBGOMP_OMP_LOCK_DEFINED +#define _LIBGOMP_OMP_LOCK_DEFINED 1 +/* These two structures get edited by the libgomp build process to + reflect the shape of the two types. Their internals are private + to the library. 
*/ + +typedef struct +{ + unsigned char _x[4] + __attribute__((__aligned__(4))); +} omp_lock_t; + +typedef struct +{ +#if defined(__linux__) + unsigned char _x[8 + sizeof (void *)] + __attribute__((__aligned__(sizeof (void *)))); +#else + unsigned char _x[16] + __attribute__((__aligned__(8))); +#endif +} omp_nest_lock_t; +#endif + +typedef enum omp_sched_t +{ + omp_sched_static = 1, + omp_sched_dynamic = 2, + omp_sched_guided = 3, + omp_sched_auto = 4 +} omp_sched_t; + +typedef enum omp_proc_bind_t +{ + omp_proc_bind_false = 0, + omp_proc_bind_true = 1, + omp_proc_bind_master = 2, + omp_proc_bind_close = 3, + omp_proc_bind_spread = 4 +} omp_proc_bind_t; + +typedef enum omp_lock_hint_t +{ + omp_lock_hint_none = 0, + omp_lock_hint_uncontended = 1, + omp_lock_hint_contended = 2, + omp_lock_hint_nonspeculative = 4, + omp_lock_hint_speculative = 8, +} omp_lock_hint_t; + +#ifdef __cplusplus +extern "C" { +# define __GOMP_NOTHROW throw () +#else +# define __GOMP_NOTHROW __attribute__((__nothrow__)) +#endif + +//extern void omp_set_num_threads (int) __GOMP_NOTHROW; +extern int omp_get_num_threads (void) __GOMP_NOTHROW; +extern int omp_get_max_threads (void) __GOMP_NOTHROW; +extern int omp_get_thread_num (void) __GOMP_NOTHROW; +extern int omp_get_num_procs (void) __GOMP_NOTHROW; + +//extern int omp_in_parallel (void) __GOMP_NOTHROW; +// +//extern void omp_set_dynamic (int) __GOMP_NOTHROW; +//extern int omp_get_dynamic (void) __GOMP_NOTHROW; +// +//extern void omp_set_nested (int) __GOMP_NOTHROW; +//extern int omp_get_nested (void) __GOMP_NOTHROW; +// +//extern void omp_init_lock (omp_lock_t *) __GOMP_NOTHROW; +//extern void omp_init_lock_with_hint (omp_lock_t *, omp_lock_hint_t) +// __GOMP_NOTHROW; +//extern void omp_destroy_lock (omp_lock_t *) __GOMP_NOTHROW; +//extern void omp_set_lock (omp_lock_t *) __GOMP_NOTHROW; +//extern void omp_unset_lock (omp_lock_t *) __GOMP_NOTHROW; +//extern int omp_test_lock (omp_lock_t *) __GOMP_NOTHROW; +// +//extern void omp_init_nest_lock 
(omp_nest_lock_t *) __GOMP_NOTHROW; +//extern void omp_init_nest_lock_with_hint (omp_nest_lock_t *, omp_lock_hint_t) +// __GOMP_NOTHROW; +//extern void omp_destroy_nest_lock (omp_nest_lock_t *) __GOMP_NOTHROW; +//extern void omp_set_nest_lock (omp_nest_lock_t *) __GOMP_NOTHROW; +//extern void omp_unset_nest_lock (omp_nest_lock_t *) __GOMP_NOTHROW; +//extern int omp_test_nest_lock (omp_nest_lock_t *) __GOMP_NOTHROW; +// +extern double omp_get_wtime (void) __GOMP_NOTHROW; +//extern double omp_get_wtick (void) __GOMP_NOTHROW; +// +//extern void omp_set_schedule (omp_sched_t, int) __GOMP_NOTHROW; +//extern void omp_get_schedule (omp_sched_t *, int *) __GOMP_NOTHROW; +//extern int omp_get_thread_limit (void) __GOMP_NOTHROW; +//extern void omp_set_max_active_levels (int) __GOMP_NOTHROW; +//extern int omp_get_max_active_levels (void) __GOMP_NOTHROW; +//extern int omp_get_level (void) __GOMP_NOTHROW; +//extern int omp_get_ancestor_thread_num (int) __GOMP_NOTHROW; +//extern int omp_get_team_size (int) __GOMP_NOTHROW; +//extern int omp_get_active_level (void) __GOMP_NOTHROW; +// +//extern int omp_in_final (void) __GOMP_NOTHROW; +// +//extern int omp_get_cancellation (void) __GOMP_NOTHROW; +//extern omp_proc_bind_t omp_get_proc_bind (void) __GOMP_NOTHROW; +//extern int omp_get_num_places (void) __GOMP_NOTHROW; +//extern int omp_get_place_num_procs (int) __GOMP_NOTHROW; +//extern void omp_get_place_proc_ids (int, int *) __GOMP_NOTHROW; +//extern int omp_get_place_num (void) __GOMP_NOTHROW; +//extern int omp_get_partition_num_places (void) __GOMP_NOTHROW; +//extern void omp_get_partition_place_nums (int *) __GOMP_NOTHROW; +// +//extern void omp_set_default_device (int) __GOMP_NOTHROW; +//extern int omp_get_default_device (void) __GOMP_NOTHROW; +//extern int omp_get_num_devices (void) __GOMP_NOTHROW; +//extern int omp_get_num_teams (void) __GOMP_NOTHROW; +//extern int omp_get_team_num (void) __GOMP_NOTHROW; +// +//extern int omp_is_initial_device (void) __GOMP_NOTHROW; +//extern 
int omp_get_initial_device (void) __GOMP_NOTHROW; +//extern int omp_get_max_task_priority (void) __GOMP_NOTHROW; +// +//extern void *omp_target_alloc (__SIZE_TYPE__, int) __GOMP_NOTHROW; +//extern void omp_target_free (void *, int) __GOMP_NOTHROW; +//extern int omp_target_is_present (void *, int) __GOMP_NOTHROW; +//extern int omp_target_memcpy (void *, void *, __SIZE_TYPE__, __SIZE_TYPE__, +// __SIZE_TYPE__, int, int) __GOMP_NOTHROW; +//extern int omp_target_memcpy_rect (void *, void *, __SIZE_TYPE__, int, +// const __SIZE_TYPE__ *, +// const __SIZE_TYPE__ *, +// const __SIZE_TYPE__ *, +// const __SIZE_TYPE__ *, +// const __SIZE_TYPE__ *, int, int) +// __GOMP_NOTHROW; +//extern int omp_target_associate_ptr (void *, void *, __SIZE_TYPE__, +// __SIZE_TYPE__, int) __GOMP_NOTHROW; +//extern int omp_target_disassociate_ptr (void *, int) __GOMP_NOTHROW; + +#ifdef __cplusplus +} +#endif + +#endif /* _OMP_H */ diff --git a/src/components/include/part.h b/src/components/include/part.h new file mode 100644 index 0000000000..f4ea8cc9bc --- /dev/null +++ b/src/components/include/part.h @@ -0,0 +1,437 @@ +#ifndef PART_H +#define PART_H + +#include +#include +#include +#include + +#include +#include + +#undef PART_ENABLE_NESTED +#define PART_ENABLE_BLOCKING +//#include + +DEQUE_PROTOTYPE(part, struct part_task *); +//CIRQUE_PROTOTYPE(part, struct part_task); + +extern struct deque_part *part_dq_percore[]; +//extern struct cirque_par parcq_global; +/* FIXME: use stacklist or another stack like data structure? 
*/ +extern struct ps_list_head part_thdpool_core[]; +extern volatile int in_main_parallel; +#if defined(PART_ENABLE_NESTED) +extern struct ps_list_head part_l_global; +extern struct crt_lock part_l_lock; +#else +extern struct part_task main_task; +#endif + +static inline struct deque_part * +part_deque_curr(void) +{ + return part_dq_percore[cos_cpuid()]; +} + +static inline struct deque_part * +part_deque_core(cpuid_t c) +{ + assert(c < NUM_CPU); + + return part_dq_percore[c]; +} + +static inline struct ps_list_head * +part_thdpool_curr(void) +{ + return &part_thdpool_core[cos_cpuid()]; +} + +//static inline struct cirque_par * +//part_cirque(void) +//{ +// return &parcq_global; +//} + +#if defined(PART_ENABLE_NESTED) +static inline struct ps_list_head * +part_list(void) +{ + return &part_l_global; +} +#endif + +static inline int +part_deque_push(struct part_task *t) +{ + int ret; + + assert(t->type == PART_TASK_T_TASK); + sl_cs_enter(); + ret = deque_push_part(part_deque_curr(), &t); + sl_cs_exit(); + + return ret; +} + +static inline int +part_deque_pop(struct part_task **t) +{ + int ret; + + *t = NULL; + sl_cs_enter(); + ret = deque_pop_part(part_deque_curr(), t); + sl_cs_exit(); + if (unlikely(ret)) *t = NULL; + + if (unlikely(*t && (*t)->type != PART_TASK_T_TASK)) { *t = NULL; ret = -EAGAIN; } + + return ret; +} + +static inline struct part_task * +part_deque_steal(cpuid_t core) +{ +#if NUM_CPU > 1 + int ret; + struct part_task *t = NULL; + + ret = deque_steal_part(part_deque_core(core), &t); + if (unlikely(ret)) return NULL; + assert(t->type == PART_TASK_T_TASK); + + return t; +#else + return NULL; +#endif +} + +static inline struct part_task * +part_deque_steal_any(void) +{ +#if NUM_CPU > 1 + unsigned i = 0, c = (unsigned)(ps_tsc() % NUM_CPU); + + do { + struct part_task *t = NULL; + + i ++; + if (unlikely(c == (unsigned)cos_cpuid())) c = (c + 1) % NUM_CPU; + + t = part_deque_steal(c); + if (likely(t)) return t; + } while (i < NUM_CPU); +#endif + return 
NULL; +} + +static inline void +part_pool_wakeup(void) +{ +#ifdef PART_ENABLE_BLOCKING + struct sl_thd *t = NULL; + int i; + + /* we're still not in main parallel, so don't try to wakeup any threads yet! */ + if (!ps_load(&in_main_parallel)) return; + + sl_cs_enter(); + if (unlikely(ps_list_head_empty(part_thdpool_curr()))) goto done; + + t = ps_list_head_first(part_thdpool_curr(), struct sl_thd, partlist); + /* removal from the list is taken care in mod_part_fifo */ + if (t == sl_thd_curr()) goto done; + sl_thd_wakeup_no_cs(t); +done: + sl_cs_exit(); +#endif +} + +static inline void +part_pool_block(void) +{ +#ifdef PART_ENABLE_BLOCKING + struct sl_thd *t = sl_thd_curr(); + + /* very much a replica of sl_thd_block + adding to thread pool in part */ + sl_cs_enter(); + if (ps_list_singleton(t, partlist)) ps_list_head_append(part_thdpool_curr(), t, partlist); + sl_cs_exit(); + sl_thd_block(0); + assert(sl_thd_is_runnable(t)); +#else + sl_thd_yield(0); +#endif +} + +///* ds memory in a circular queue */ +//static inline struct part_task * +//part_cirque_alloc(void) +//{ +// return cirque_allocptr_par(part_cirque()); +//} +// +//static inline void +//part_cirque_free(void) +//{ +// cirque_freeptr_par(part_cirque()); +//} +// +//static inline struct part_task * +//part_cirque_peek(void) +//{ +// return cirque_peekptr_par(part_cirque()); +//} + +/* TODO: lock for shared list! */ +static inline void +part_list_append(struct part_task *t) +{ + unsigned i; + int in_nest = 0; + + assert(t->type == PART_TASK_T_WORKSHARE); + +#if defined(PART_ENABLE_NESTED) + assert(ps_list_singleton(t, partask)); + /* + * this is not required to be in a cs. + * if multiple appends are called, simultaneously, we at least + * have the main outermost parallel block running!. + */ + if (likely(!ps_list_head_empty(part_list()))) in_nest = 1; + /* so other threads can work on this! 
*/ + if (t->nthds > 1) { + crt_lock_take(&part_l_lock); + ps_list_head_append(part_list(), t, partask); + crt_lock_release(&part_l_lock); + } +#else + if (t != &main_task) { + /* without nesting, all child parallel blocks are run just be the encountering threads -master threads */ + assert(t->nthds == 1); + assert(ps_load(&in_main_parallel)); + + return; + } + assert(ps_load(&in_main_parallel) == 0); +#endif + /* + * wake up as many threads on this core! + * some may not get work if other cores pull work before they get to it. + */ + for (i = 1; i < t->nthds; i++) part_pool_wakeup(); + + /* if this is the first time in a parallel, make everyone know */ + if (likely(!in_nest)) ps_faa(&in_main_parallel, 1); +} + +static inline void +part_list_remove(struct part_task *t) +{ + int in_nest = 0; + + assert(t->type == PART_TASK_T_WORKSHARE); +#if defined(PART_ENABLE_NESTED) + assert(t->nthds > 1); + assert(!ps_list_singleton(t, partask)); + + crt_lock_take(&part_l_lock); + ps_list_rem(t, partask); + if (unlikely(!ps_list_head_empty(part_list()))) in_nest = 1; + crt_lock_release(&part_l_lock); +#else + /* only called for the other parallel region */ + assert(ps_load(&in_main_parallel)); + if (t != &main_task) return; +#endif + + if (likely(!in_nest)) ps_faa(&in_main_parallel, -1); +} + +static inline struct part_task * +part_list_peek(void) +{ + /* there should at least be the outer parallel block for other threads to peek! */ + if (!ps_load(&in_main_parallel)) return NULL; + +#if defined(PART_ENABLE_NESTED) + struct part_task *t = NULL; + int found = 0; + + crt_lock_take(&part_l_lock); + if (unlikely(ps_list_head_empty(part_list()))) goto done; + /* not great! traversing from the first element always! */ + /* TODO: perhaps traverse from the current task? 
*/ + ps_list_foreach(part_list(), t, partask) { + int i; + + assert(t); + assert(t->type == PART_TASK_T_WORKSHARE); + /* coz, master thread adds to list the implicit task and doesn't defer it */ + i = part_task_work_try(t); + assert(i != 0); + + if (likely(i > 0 && !ps_load(&t->end))) { + found = 1; + break; + } + } + +done: + crt_lock_release(&part_l_lock); + + if (unlikely(!found)) return NULL; + + return t; +#else + int i; + + assert(main_task.type == PART_TASK_T_WORKSHARE); + i = part_task_work_try(&main_task); + assert(i != 0); + + if (likely(i > 0 && ps_load(&main_task.end) != main_task.nthds)) return &main_task; + + return NULL; +#endif +} + +void part_init(void); + +unsigned part_isready(void); + +static inline void +part_task_barrier(struct part_task *t, int is_end) +{ + struct sl_thd *ts = sl_thd_curr(); + unsigned cbc = 0; + int is_master = t->master == PART_CURR_THD ? 1 : 0; + + assert(t->type != PART_TASK_T_NONE); + assert(t->state == PART_TASK_S_INITIALIZED); + assert(t->nthds >= 1); + + part_task_wait_children(t); + + if (t->nthds == 1) { + struct part_data *d; + + if (unlikely(!is_end)) return; + + ps_faa(&t->end, 1); + /* remove myself from my parent. 
*/ + part_task_remove_child(t); + if (t->type == PART_TASK_T_WORKSHARE) { + assert(is_master); + ts->part_context = t->parent; + part_list_remove(t); + + return; + } + + ts->part_context = NULL; + d = t->data_env; + + part_task_free(t); + part_data_free(d); + + return; + } + + assert(t->type == PART_TASK_T_WORKSHARE); + + cbc = ps_faa(&t->barrier, -1); + if (cbc > 1) { + sl_thd_block(0); + if (is_master) part_peer_wakeup(t); + } else { + if (ps_cas(&t->barrier, 0, t->nthds)) ps_faa(&t->barrier_epoch, 1); + if (is_master) { + part_peer_wakeup(t); + } + else { + part_master_wakeup(t); + sl_thd_block(0); + } + } + //assert(ps_load(&t->barrier_epoch) == cbep + 1); + + if (!is_end) return; + ps_faa(&t->end, 1); + + if (is_master) { + part_task_remove_child(t); + part_list_remove(t); + ts->part_context = t->parent; + } else { + ts->part_context = NULL; + } +} + +static inline void +part_task_end(struct part_task *t) +{ part_task_barrier(t, 1); } + +static inline void +part_thd_fn(void *d) +{ + struct sl_thd *curr = sl_thd_curr(); + + /* parallel runtime not ready? */ + /* if (unlikely(!part_isready())) part_pool_block(); */ + /* not in the main parallel block? */ + + while (1) { + struct part_task *t = NULL; + int ret; + + if (!ps_load(&in_main_parallel)) part_pool_block(); + + /* FIXME: nested parallel needs love! 
*/ + t = part_list_peek(); + if (likely(t)) goto found; + +single: + ret = part_deque_pop(&t); + if (likely(ret == 0)) { + int thdnum = -1; + + assert(t && t->type == PART_TASK_T_TASK); + thdnum = part_task_work_try(t); + assert(thdnum == 0); + goto found; + } + + if (unlikely(ret == -EAGAIN)) goto single; + + t = part_deque_steal_any(); + if (unlikely(!t)) { + part_pool_block(); + + continue; + } else { + int thdnum = -1; + + assert(t->type == PART_TASK_T_TASK); + thdnum = part_task_work_try(t); + if (thdnum < 0) continue; + assert(thdnum == 0); + } + +found: + assert(t); + curr->part_context = (void *)t; + + t->cs.fn(t->cs.data); + + part_task_end(t); + assert(curr->part_context == NULL); + } + + sl_thd_exit(); +} + +#endif /* PART_H */ diff --git a/src/components/include/part_task.h b/src/components/include/part_task.h new file mode 100644 index 0000000000..8bc9f4ea38 --- /dev/null +++ b/src/components/include/part_task.h @@ -0,0 +1,278 @@ +#ifndef PART_TASK_H +#define PART_TASK_H + +#include +#include +#include +#include + +#define PART_THD(c, t) ((unsigned)(cos_cpuid() << 16 | cos_thdid())) +#define PART_CURR_THD PART_THD(cos_cpuid(), cos_thdid()) +#define PART_THD_COREID(t) (t >> 16) +#define PART_THD_THDID(t) ((t << 16) >> 16) + +#define PART_MAX_TASKS (NUM_CPU < 4 ? 2048 : 8192) +#define PART_MAX_DATA PART_MAX_TASKS +#define PART_MAX_PAR_THDS NUM_CPU /* change this to test more data-parallel tasks on single core configuration */ +#define PART_MAX_CORE_THDS (NUM_CPU == 1 ? 200 : (NUM_CPU == 2 ? 128 : (NUM_CPU < 5 ? 
64 : 48))) +#define PART_MAX_THDS 512 +#define PART_MAX_CHILD 1024 +#define PART_MAX_WORKSHARES 16 + +typedef void (*part_fn_t)(void *); + +typedef enum { + PART_TASK_S_FREED, + PART_TASK_S_ALLOCATED, + PART_TASK_S_INITIALIZED, + PART_TASK_S_RUNNING, + PART_TASK_S_CHILD_WAIT, /* WAIT FOR CHILD TASKS */ + PART_TASK_S_SIBLING_WAIT, /* WAIT FOR SIBLING TASKS */ + PART_TASK_S_PARENT_WAIT, /* WAIT FOR PARENT TASK */ + PART_TASK_S_IN_BARRIER, /* WAIT FOR ALL OTHER THREADS */ +} part_task_state_t; + +typedef enum { + PART_TASK_T_NONE, + PART_TASK_T_WORKSHARE = 1, /* task to put in a shared fifo queue */ + PART_TASK_T_TASK, +} part_task_type_t; + +typedef enum { + PART_WORKSHARE_NONE, + PART_WORKSHARE_LOOP_STATIC, + PART_WORKSHARE_LOOP_DYNAMIC, + PART_WORKSHARE_LOOP_GUIDED, + PART_WORKSHARE_LOOP_RUNTIME, + PART_WORKSHARE_SECTIONS, + PART_WORKSHARE_SINGLE, +} part_workshare_type_t; + +struct part_workshare { + part_workshare_type_t type; + + long chunk_sz; + + long st, end, inc; + + long next; + + unsigned worker_bmp; +}; + +struct part_closure { + part_fn_t fn; + void *data; +}; + +struct part_data { + int flag; /* 0 = not in use, 1 = in use */ + struct part_data *next_free; /* for explicit data allocation/free */ + char data[PART_MAX_DATA]; +}; + +struct part_task { + int id; /* only for debugging */ + part_task_state_t state; + part_task_type_t type; + + struct part_workshare ws[PART_MAX_WORKSHARES]; + struct part_closure cs; + + unsigned nthds; /* number of threads for this task, 1 in case of non-workshare work */ + unsigned nworkers; + unsigned workers[PART_MAX_PAR_THDS]; /* threads sharing this work or thread doing this work! */ + int ws_off[PART_MAX_PAR_THDS]; /* progress of the workshares in each participating thread */ + unsigned master; /* coreid << 16 | thdid of the master */ + unsigned end, barrier, barrier_epoch; + + struct part_data *data_env; + struct part_task *parent; + /* in data-parallel task, each thread waits for its children. 
*/ + int nchildren[PART_MAX_PAR_THDS]; + + struct ps_list partask; + struct part_task *next_free; /* for explicit task allocation/free */ +} CACHE_ALIGNED; + +static inline void +part_task_init(struct part_task *t, part_task_type_t type, struct part_task *p, unsigned nthds, part_fn_t fn, void *data, struct part_data *d) +{ + static unsigned part_id_free = 0; + int i, id = ps_faa(&part_id_free, 1); + + assert(type != PART_TASK_T_NONE); + t->type = type; + if (!ps_cas(&t->state, PART_TASK_S_ALLOCATED, PART_TASK_S_INITIALIZED)) assert(0); + t->id = id; + memset(t->ws, 0, sizeof(struct part_workshare) * PART_MAX_WORKSHARES); + t->cs.fn = fn; + t->cs.data = data; + assert (nthds <= PART_MAX_PAR_THDS); + t->nthds = nthds; + t->nworkers = 0; + memset(t->workers, 0, sizeof(unsigned) * PART_MAX_PAR_THDS); + t->master = PART_CURR_THD; + /* if it's worksharing, current thread is the master and does take part in the par section */ + if (type == PART_TASK_T_WORKSHARE) { + t->nworkers = 1; + t->workers[0] = t->master; + } + for (i = 0; i < PART_MAX_PAR_THDS; i++) t->ws_off[i] = -1; + t->barrier = t->nthds; + t->end = t->barrier_epoch = 0; + t->data_env = d; + t->parent = p; + memset(t->nchildren, 0, sizeof(int) * PART_MAX_PAR_THDS); + + ps_list_init(t, partask); +} + +struct part_task *part_task_alloc(part_task_type_t); +void part_task_free(struct part_task *); +struct part_data *part_data_alloc(void); +void part_data_free(struct part_data *); + +static inline int +part_task_work_try(struct part_task *t) +{ + int i = 0; + unsigned key = PART_CURR_THD; + + assert(t->state == PART_TASK_S_INITIALIZED); + if (t->type == PART_TASK_T_TASK) { + assert(t->nthds == 1); + } else { + assert(t->type == PART_TASK_T_WORKSHARE); + assert(t->master != key && t->master == t->workers[0]); + assert(t->nthds >= 1); + } + + /* task was finished! 
*/ + if (unlikely(ps_load(&t->end) == t->nthds)) return -1; + /* if you can work with this task */ + i = ps_faa(&t->nworkers, 1); + if (unlikely(i >= (int)t->nthds)) return -1; + + t->workers[i] = key; + + return i; +} + +static inline int +part_task_work_thd_num(struct part_task *t, unsigned core_thd) +{ + int i; + unsigned key = core_thd; + + assert(t); + + assert(t->state == PART_TASK_S_INITIALIZED); + if (likely(t->type == PART_TASK_T_TASK)) { + assert(t->nthds == 1); + + if (ps_load(&t->workers[0]) == key) return 0; + + return -1; + } + assert(t->type == PART_TASK_T_WORKSHARE); + + if (key == t->master) return 0; + for (i = 1; i < (int)t->nthds; i++) { + if (t->workers[i] == key) return i; + } + + return -1; +} + +static inline void +part_thd_wakeup(unsigned thd) +{ + thdid_t t = PART_THD_THDID(thd); + cpuid_t c = PART_THD_COREID(thd); + + assert(c >= 0 && c < NUM_CPU); + assert(t < MAX_NUM_THREADS); + + if (thd == PART_CURR_THD) return; + if (c != cos_cpuid()) sl_xcore_thd_wakeup_tid(t, c); + else sl_thd_wakeup(t); +} + +static inline void +part_master_wakeup(struct part_task *t) +{ + assert(t->type == PART_TASK_T_WORKSHARE); + assert(t->state == PART_TASK_S_INITIALIZED); + assert(t->nthds > 1); + assert(t->master && t->master != PART_CURR_THD); + + part_thd_wakeup(t->master); +} + +static inline void +part_peer_wakeup(struct part_task *t) +{ + unsigned i; + + assert(t->type == PART_TASK_T_WORKSHARE); + assert(t->state == PART_TASK_S_INITIALIZED); + assert(t->nthds > 1); + assert(t->master == PART_CURR_THD); + + for (i = 1; i < t->nthds; i++) part_thd_wakeup(t->workers[i]); +} + +static inline int +part_task_add_child(struct part_task *t, struct part_task *c) +{ + int i; + int num = part_task_work_thd_num(t, PART_CURR_THD); + + assert(num >= 0); + assert(t->state == PART_TASK_S_INITIALIZED); + + if (unlikely(!t || !c)) return -1; + + i = ps_faa(&t->nchildren[num], 1); + assert(i < PART_MAX_CHILD); + + return i; +} + +static inline void 
+part_task_remove_child(struct part_task *c) +{ + struct part_task *p = c->parent; + unsigned wkup; + int i, num; + + if (unlikely(!p)) return; + assert(c->state == PART_TASK_S_INITIALIZED); + + if (c->type == PART_TASK_T_TASK) wkup = c->master; + else wkup = p->master; + + num = part_task_work_thd_num(p, wkup); + assert(num >= 0); + + assert(p->nchildren[num] != 0); + i = ps_faa(&p->nchildren[num], -1); + assert(i > 0); + + /* only the last child to wake up the parent */ + if (i == 1) part_thd_wakeup(wkup); +} + +static inline void +part_task_wait_children(struct part_task *t) +{ + int num = part_task_work_thd_num(t, PART_CURR_THD); + + assert(num >= 0); + assert(t->state == PART_TASK_S_INITIALIZED); + + if (ps_load(&(t->nchildren[num])) > 0) sl_thd_block(0); +} + +#endif /* PART_TASK_H */ diff --git a/src/components/include/res_spec.h b/src/components/include/res_spec.h index e109b8a2fb..e81736950a 100644 --- a/src/components/include/res_spec.h +++ b/src/components/include/res_spec.h @@ -64,10 +64,10 @@ sched_param_pack(sched_param_type_t type, unsigned int value) static inline void sched_param_get(sched_param_t sp, sched_param_type_t *type, unsigned int *value) { - struct sched_param_s s = *(struct sched_param_s *)(void *)&sp; + union sched_param_union us = *(union sched_param_union *)&sp; - *type = s.type; - *value = s.value; + *type = us.c.type; + *value = us.c.value; } #endif /* RES_SPEC_H */ diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 1529c7835c..777125322e 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -37,8 +37,13 @@ #include #include #include -#include +#include #include +#include + +#define SL_CS +#undef SL_REPLENISH +#undef SL_PARENTCHILD /* Critical section (cs) API to protect scheduler data-structures */ struct sl_cs { @@ -51,7 +56,7 @@ struct sl_cs { } u; }; -struct sl_global_cpu { +struct sl_global_core { struct sl_cs lock; thdcap_t sched_thdcap; @@ -62,18 +67,29 @@ struct sl_global_cpu 
{ int cyc_per_usec; cycles_t period; - cycles_t timer_next; + cycles_t timer_next, timer_prev; tcap_time_t timeout_next; + struct cos_scb_info *scb_info; struct ps_list_head event_head; /* all pending events for sched end-point */ }; -extern struct sl_global_cpu sl_global_cpu_data[]; +extern struct sl_global_core sl_global_core_data[]; + +typedef u32_t sched_blkpt_id_t; +#define SCHED_BLKPT_NULL 0 +typedef word_t sched_blkpt_epoch_t; + +static inline struct sl_global_core * +sl__globals_core(void) +{ + return &(sl_global_core_data[cos_cpuid()]); +} -static inline struct sl_global_cpu * -sl__globals_cpu(void) +static inline struct cos_scb_info * +sl_scb_info_core(void) { - return &(sl_global_cpu_data[cos_cpuid()]); + return (sl__globals_core()->scb_info); } static inline void @@ -83,15 +99,22 @@ sl_thd_setprio(struct sl_thd *t, tcap_prio_t p) } /* for lazy retrieval of a child component thread in the parent */ -extern struct sl_thd *sl_thd_retrieve(thdid_t tid); +extern struct sl_thd *sl_thd_retrieve_lazy(thdid_t tid); static inline struct sl_thd * sl_thd_lkup(thdid_t tid) { - assert(tid != 0); - if (unlikely(tid > MAX_NUM_THREADS)) return NULL; + struct sl_thd *t; + struct sl_xcore_thd *xt; + + if (unlikely(tid < 1 || tid > MAX_NUM_THREADS)) return NULL; + t = sl_mod_thd_get(sl_thd_lookup_backend(tid)); + if (likely(t && sl_thd_aepinfo(t))) return t; + xt = sl_xcore_thd_lookup(tid); + if (unlikely(xt && xt->core != cos_cpuid())) return NULL; - return sl_thd_retrieve(tid); + /* FIXME: cross-core child threads must be handled in retrieve */ + return sl_thd_retrieve_lazy(tid); } /* only see if it's already sl_thd initialized */ @@ -100,8 +123,7 @@ sl_thd_try_lkup(thdid_t tid) { struct sl_thd *t = NULL; - assert(tid != 0); - if (unlikely(tid > MAX_NUM_THREADS)) return NULL; + if (unlikely(tid < 1 || tid > MAX_NUM_THREADS)) return NULL; t = sl_mod_thd_get(sl_thd_lookup_backend(tid)); if (!sl_thd_aepinfo(t)) return NULL; @@ -112,26 +134,32 @@ sl_thd_try_lkup(thdid_t tid) 
static inline thdid_t sl_thdid(void) { - thdid_t tid = cos_thdid(); - - assert(tid != 0); - assert(tid < MAX_NUM_THREADS); - - return tid; + return cos_thdid(); } +sched_blkpt_id_t sched_blkpt_alloc(void); +int sched_blkpt_free(sched_blkpt_id_t id); +int sched_blkpt_trigger(sched_blkpt_id_t blkpt, sched_blkpt_epoch_t epoch, int single); +int sched_blkpt_block(sched_blkpt_id_t blkpt, sched_blkpt_epoch_t epoch, thdid_t dependency); static inline struct sl_thd * sl_thd_curr(void) { - return sl_thd_lkup(sl_thdid()); + struct sl_thd *t = (struct sl_thd *)cos_get_slthd_ptr(); + + if (likely(t)) return t; + + t = sl_thd_lkup(sl_thdid()); + cos_set_slthd_ptr((void *)t); + + return t; } /* are we the owner of the critical section? */ static inline int sl_cs_owner(void) { - return sl__globals_cpu()->lock.u.s.owner == sl_thd_thdcap(sl_thd_curr()); + return sl__globals_core()->lock.u.s.owner == sl_thd_thdcap(sl_thd_curr()); } /* ...not part of the public API */ @@ -147,7 +175,7 @@ sl_cs_owner(void) * -ve from cos_defswitch failure, allowing caller for ex: the scheduler thread to * check if it was -EBUSY to first recieve pending notifications before retrying lock. 
*/ -int sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, thdcap_t curr, sched_tok_t tok); +int sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_core *gcore, struct sl_thd *curr, sched_tok_t tok); /* * @csi: current critical section value * @cached: a cached copy of @csi @@ -155,28 +183,28 @@ int sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, * * @ret: returns 1 if we need a retry, 0 otherwise */ -int sl_cs_exit_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, sched_tok_t tok); +int sl_cs_exit_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_core *gcore, sched_tok_t tok); /* Enter into the scheduler critical section */ static inline int sl_cs_enter_nospin(void) { +#ifdef SL_CS + struct sl_global_core *gcore = sl__globals_core(); + struct sl_thd *t = sl_thd_curr(); union sl_cs_intern csi, cached; - struct sl_thd * t = sl_thd_curr(); - sched_tok_t tok; assert(t); - tok = cos_sched_sync(); - csi.v = sl__globals_cpu()->lock.u.v; + csi.v = gcore->lock.u.v; cached.v = csi.v; if (unlikely(csi.s.owner)) { - return sl_cs_enter_contention(&csi, &cached, sl_thd_thdcap(t), tok); + return sl_cs_enter_contention(&csi, &cached, gcore, t, cos_sched_sync()); } csi.s.owner = sl_thd_thdcap(t); - if (!ps_cas(&sl__globals_cpu()->lock.u.v, cached.v, csi.v)) return 1; - + if (!ps_upcas(&gcore->lock.u.v, cached.v, csi.v)) return 1; +#endif return 0; } @@ -211,22 +239,23 @@ sl_cs_enter_sched(void) static inline void sl_cs_exit(void) { +#ifdef SL_CS + struct sl_global_core *gcore = sl__globals_core(); union sl_cs_intern csi, cached; - sched_tok_t tok; assert(sl_cs_owner()); - retry: - tok = cos_sched_sync(); - csi.v = sl__globals_cpu()->lock.u.v; + csi.v = gcore->lock.u.v; cached.v = csi.v; if (unlikely(csi.s.contention)) { - if (sl_cs_exit_contention(&csi, &cached, tok)) goto retry; + if (sl_cs_exit_contention(&csi, &cached, gcore, 
cos_sched_sync())) goto retry; + return; } - if (!ps_cas(&sl__globals_cpu()->lock.u.v, cached.v, 0)) goto retry; + if (!ps_upcas(&gcore->lock.u.v, cached.v, 0)) goto retry; +#endif } /* @@ -270,9 +299,15 @@ int sl_thd_sched_wakeup_no_cs(struct sl_thd *t); /* wakeup thread and do not remove from timeout queue if blocked on timeout */ int sl_thd_wakeup_no_cs_rm(struct sl_thd *t); -void sl_thd_yield(thdid_t tid); +void sl_thd_yield_intern(thdid_t tid); +void sl_thd_yield_intern_timeout(cycles_t abs_timeout); + void sl_thd_yield_cs_exit(thdid_t tid); +int sl_thd_migrate_no_cs(struct sl_thd *t, cpuid_t core); +/* @return: 0 - success, -1 - failure */ +int sl_thd_migrate(thdid_t tid, cpuid_t core); + /* The entire thread allocation and free API */ struct sl_thd *sl_thd_alloc(cos_thd_fn_t fn, void *data); struct sl_thd *sl_thd_aep_alloc(cos_aepthd_fn_t fn, void *data, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax); @@ -282,8 +317,10 @@ struct sl_thd *sl_thd_aep_alloc(cos_aepthd_fn_t fn, void *data, int own_tcap, co */ struct sl_thd *sl_thd_comp_init(struct cos_defcompinfo *comp, int is_sched); -struct sl_thd *sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax); -struct sl_thd *sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv); +struct sl_thd *sl_thd_initaep_alloc_dcb(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, dcbcap_t dcap, microsec_t ipiwin, u32_t ipimax); +struct sl_thd *sl_thd_aep_alloc_ext_dcb(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, dcbcap_t dcap, dcboff_t doff, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv); +struct sl_thd 
*sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, vaddr_t *dcbaddr); +struct sl_thd *sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, vaddr_t *dcbaddr, arcvcap_t *extrcv); struct sl_thd *sl_thd_init_ext(struct cos_aep_info *aep, struct sl_thd *sched_thd); @@ -295,13 +332,13 @@ void sl_thd_param_set(struct sl_thd *t, sched_param_t sp); static inline microsec_t sl_cyc2usec(cycles_t cyc) { - return cyc / sl__globals_cpu()->cyc_per_usec; + return cyc / sl__globals_core()->cyc_per_usec; } static inline cycles_t sl_usec2cyc(microsec_t usec) { - return usec * sl__globals_cpu()->cyc_per_usec; + return usec * sl__globals_core()->cyc_per_usec; } static inline cycles_t @@ -333,14 +370,17 @@ void sl_timeout_period(cycles_t period); static inline cycles_t sl_timeout_period_get(void) { - return sl__globals_cpu()->period; + return sl__globals_core()->period; } static inline void sl_timeout_oneshot(cycles_t absolute_us) { - sl__globals_cpu()->timer_next = absolute_us; - sl__globals_cpu()->timeout_next = tcap_cyc2time(absolute_us); + struct sl_global_core *g = sl__globals_core(); + + g->timer_prev = g->timer_next; + g->timer_next = absolute_us; + g->timeout_next = tcap_cyc2time(absolute_us); } static inline void @@ -368,7 +408,7 @@ struct heap *sl_timeout_heap(void); static inline void sl_timeout_wakeup_expired(cycles_t now) { - if (!heap_size(sl_timeout_heap())) return; + if (likely(!heap_size(sl_timeout_heap()))) return; do { struct sl_thd *tp, *th; @@ -396,32 +436,173 @@ sl_thd_is_runnable(struct sl_thd *t) } static inline int -sl_thd_activate(struct sl_thd *t, sched_tok_t tok) +sl_thd_dispatch_kern(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr, tcap_time_t timeout, tcap_t tc, tcap_prio_t p) { - struct cos_defcompinfo *dci = 
cos_defcompinfo_curr_get(); - struct cos_compinfo *ci = &dci->ci; - struct sl_global_cpu *g = sl__globals_cpu(); + volatile struct cos_scb_info *scb = sl_scb_info_core(); + struct sl_global_core *g = sl__globals_core(); + struct cos_dcb_info *cd = sl_thd_dcbinfo(curr), *nd = sl_thd_dcbinfo(next); + word_t a = ((sl_thd_thdcap(next) + 1) << COS_CAPABILITY_OFFSET) + (tok >> 16); + word_t b = (tc << 16) | g->sched_rcv; + word_t S = (p << 32) >> 32; + word_t D = (((p << 16) >> 48) << 16) | ((tok << 16) >> 16); + word_t d = timeout; int ret = 0; - if (t->properties & SL_THD_PROPERTY_SEND) { + assert(curr != next); + if (unlikely(!cd || !nd)) return cos_switch(sl_thd_thdcap(next), sl_thd_tcap(next), next->prio, timeout, g->sched_rcv, tok); + + __asm__ __volatile__ ( \ + "pushl %%ebp\n\t" \ + "movl %%esp, %%ebp\n\t" \ + "movl $1f, (%%esi)\n\t" \ + "movl %%esp, 4(%%esi)\n\t" \ + "movl %%ecx, %%esi\n\t" \ + "movl $2f, %%ecx\n\t" \ + "sysenter\n\t" \ + "jmp 2f\n\t" \ + ".align 4\n\t" \ + "1:\n\t" \ + "movl $0, %%eax\n\t" \ + ".align 4\n\t" \ + "2:\n\t" \ + "popl %%ebp\n\t" \ + : "=a" (ret) + : "a" (a), "b" (b), "S" (cd), "D" (D), "d" (d), "c" (S) + : "memory", "cc"); + + scb = sl_scb_info_core(); + cd = sl_thd_dcbinfo(sl_thd_curr()); + cd->sp = 0; + if (unlikely(ps_load(&scb->sched_tok) != tok)) return -EAGAIN; + + return ret; +} + +static inline int +sl_thd_dispatch_usr(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) +{ + volatile struct cos_scb_info *scb = sl_scb_info_core(); + struct cos_dcb_info *cd = sl_thd_dcbinfo(curr), *nd = sl_thd_dcbinfo(next); + struct sl_global_core *g = sl__globals_core(); + + assert(curr != next); + if (unlikely(!cd || !nd)) return cos_defswitch(sl_thd_thdcap(next), next->prio, g->timeout_next, tok); + + /* + * jump labels in the asm routine: + * + * 1: slowpath dispatch using cos_thd_switch to switch to a thread + * if the dcb sp of the next thread is reset. 
+ * (inlined slowpath sysenter to debug preemption problem) + * + * 2: if user-level dispatch routine completed successfully so + * the register states still retained and in the dispatched thread + * we reset its dcb sp! + * + * 3: if user-level dispatch was either preempted in the middle + * of this routine or kernel at some point had to switch to a + * thread that co-operatively switched away from this routine. + * NOTE: kernel takes care of resetting dcb sp in this case! + * + * a simple cos_thd_switch() kind will disable timers! so, pass in the timeout anyway to + * slowpath thread switch! + */ + + __asm__ __volatile__ ( \ + "pushl %%ebp\n\t" \ + "movl %%esp, %%ebp\n\t" \ + "movl $2f, (%%eax)\n\t" \ + "movl %%esp, 4(%%eax)\n\t" \ + "cmp $0, 4(%%ebx)\n\t" \ + "je 1f\n\t" \ + "movl %%edx, (%%ecx)\n\t" \ + "movl 4(%%ebx), %%esp\n\t" \ + "jmp *(%%ebx)\n\t" \ + ".align 4\n\t" \ + "1:\n\t" \ + "movl $3f, %%ecx\n\t" \ + "movl %%edx, %%eax\n\t" \ + "inc %%eax\n\t" \ + "shl $16, %%eax\n\t" \ + "movl $0, %%ebx\n\t" \ + "movl %%esi, %%edx\n\t" \ + "movl $0, %%esi\n\t" \ + "movl $0, %%edi\n\t" \ + "sysenter\n\t" \ + "jmp 3f\n\t" \ + ".align 4\n\t" \ + "2:\n\t" \ + "movl $0, 4(%%ebx)\n\t" \ + ".align 4\n\t" \ + "3:\n\t" \ + "popl %%ebp\n\t" \ + : + : "a" (cd), "b" (nd), + "S" (g->timeout_next), "D" (tok), + "c" (&(scb->curr_thd)), "d" (sl_thd_thdcap(next)) + : "memory", "cc"); + + scb = sl_scb_info_core(); + if (unlikely(ps_load(&scb->sched_tok) != tok)) return -EAGAIN; + + return 0; +} + +static inline int +sl_thd_activate_c(struct sl_thd *t, sched_tok_t tok, tcap_time_t timeout, tcap_prio_t prio, struct sl_thd *curr, struct sl_global_core *g) +{ + if (unlikely(t->properties & SL_THD_PROPERTY_SEND)) { return cos_sched_asnd(t->sndcap, g->timeout_next, g->sched_rcv, tok); - } else if (t->properties & SL_THD_PROPERTY_OWN_TCAP) { - return cos_switch(sl_thd_thdcap(t), sl_thd_tcap(t), t->prio, - g->timeout_next, g->sched_rcv, tok); + } + + /* there is more events.. 
run scheduler again! */ + if (unlikely(cos_sched_ispending())) { + if (curr == g->sched_thd) return -EBUSY; + return sl_thd_dispatch_usr(g->sched_thd, tok, curr); + } + + if (unlikely(t->properties & SL_THD_PROPERTY_OWN_TCAP)) { + return sl_thd_dispatch_kern(t, tok, curr, timeout, sl_thd_tcap(t), prio == 0 ? t->prio : prio); + } + + /* TODO: there is something in the kernel that seem to disable timers..!! */ + /* WORKAROUND: idle thread is a big cpu hogger.. so make sure there is timeout set around switching to and away! */ + if (unlikely(curr == g->idle_thd || t == g->idle_thd)) { + return sl_thd_dispatch_kern(t, tok, curr, g->timeout_next, g->sched_tcap, prio); + } + + if (unlikely(timeout || prio)) { + return sl_thd_dispatch_kern(t, tok, curr, timeout, g->sched_tcap, prio); } else { - ret = cos_defswitch(sl_thd_thdcap(t), t->prio, t == g->sched_thd ? - TCAP_TIME_NIL : g->timeout_next, tok); - if (likely(t != g->sched_thd && t != g->idle_thd)) return ret; - if (unlikely(ret != -EPERM)) return ret; - - /* - * Attempting to activate scheduler thread or idle thread failed for no budget in it's tcap. - * Force switch to the scheduler with current tcap. - */ - return cos_switch(sl_thd_thdcap(g->sched_thd), 0, t->prio, 0, g->sched_rcv, tok); + assert(t != g->idle_thd); + return sl_thd_dispatch_usr(t, tok, curr); } } + +static inline int +sl_thd_activate(struct sl_thd *t, sched_tok_t tok, tcap_time_t timeout, tcap_prio_t prio) +{ + struct sl_global_core *g = sl__globals_core(); + + return sl_thd_activate_c(t, tok, timeout, prio, sl_thd_curr(), g); +} + +static inline int +sl_cs_exit_schedule_nospin_arg_c(struct sl_thd *curr, struct sl_thd *next) +{ + sched_tok_t tok; +#ifdef SL_CS + if (likely(!sl_cs_owner())) sl_cs_enter(); +#endif + tok = cos_sched_sync(); +#ifdef SL_CS + sl_cs_exit(); +#endif + return sl_thd_activate_c(next, tok, 0, 0, curr, sl__globals_core()); +} + +void sl_thd_replenish_no_cs(struct sl_thd *t, cycles_t now); /* * Do a few things: 1. 
take the critical section if it isn't already * taken, 2. call schedule to find the next thread to run, 3. release @@ -449,24 +630,24 @@ sl_thd_activate(struct sl_thd *t, sched_tok_t tok) static inline int sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) { - struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); - struct cos_compinfo *ci = &dci->ci; - struct sl_thd_policy *pt; - struct sl_thd * t; - struct sl_global_cpu *globals = sl__globals_cpu(); - sched_tok_t tok; - cycles_t now; - s64_t offset; - int ret; + struct sl_thd *t = to, *c = sl_thd_curr(); + struct sl_global_core *globals = sl__globals_core(); + sched_tok_t tok; +#ifdef SL_REPLENISH + cycles_t now; +#endif + s64_t offset; + int ret; /* Don't abuse this, it is only to enable the tight loop around this function for races... */ - if (unlikely(!sl_cs_owner())) sl_cs_enter(); +#ifdef SL_CS + if (likely(!sl_cs_owner())) sl_cs_enter(); +#endif tok = cos_sched_sync(); +#ifdef SL_REPLENISH now = sl_now(); - offset = (s64_t)(globals->timer_next - now); - if (globals->timer_next && offset <= 0) sl_timeout_expended(now, globals->timer_next); - sl_timeout_wakeup_expired(now); +#endif /* * Once we exit, we can't trust t's memory as it could be @@ -475,45 +656,153 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) * catch it. This is a little twitchy and subtle, so lets put * it in a function, here. 
*/ - if (unlikely(to)) { + if (likely(to)) { t = to; - if (!sl_thd_is_runnable(t)) to = NULL; + if (unlikely(!sl_thd_is_runnable(t))) to = NULL; } - if (likely(!to)) { - pt = sl_mod_schedule(); + if (unlikely(!to)) { + struct sl_thd_policy *pt = sl_mod_schedule(); + if (unlikely(!pt)) t = globals->idle_thd; else t = sl_mod_thd_get(pt); } + if (unlikely(!t)) t= globals->sched_thd; + +#ifdef SL_REPLENISH + sl_thd_replenish_no_cs(t, now); +#endif + + assert(t && sl_thd_is_runnable(t)); +#ifdef SL_CS + sl_cs_exit(); +#endif + if (unlikely(t == c)) return 0; + + ret = sl_thd_activate_c(t, tok, 0, 0, c, globals); + + /* + * one observation, in slowpath switch: + * if the kernel decides to switch over to scheduler thread and + * later at some point decides to resume this thread, the ret value + * from the syscall is probably 0, even though token has advanced and + * the switch this thread intended, did not go through. + * + * there is some wierd race in user-level thread switch: + * a thread sl_thd_block()'s itself and decides to switch to a runnable + * thread at user-level. + * if a preemption occurs and eventually this thread is resumed, + * for some reason the token check is not working well. + * + * what is more wierd is, even in slowpath sl_thd_activate(), I see that + * on return from syscall, this thread is not runnable. + * how is this possible? is there a race? i don't think so. + * only the current thread can block itself, of course this is not true for AEPs. + * But for non AEPs, I don't know why this triggers! + * + * I'll need to rethink about some possible scenario, perhaps some bug in the code + * that returns to this thread when it is not runnable. + * something!!!! + */ + if (unlikely(!sl_thd_is_runnable(c))) return -EAGAIN; + +#ifdef SL_REPLENISH + /* + * dispatch failed with -EPERM because tcap associated with thread t does not have budget. + * Block the thread until it's next replenishment and return to the scheduler thread. 
+ * + * If the thread is not replenished by the scheduler (replenished "only" by + * the inter-component delegations), block till next timeout and try again. + */ + if (unlikely(ret == -EPERM)) { + assert(t != globals->sched_thd && t != globals->idle_thd); + sl_thd_block_expiry(t); + if (unlikely(sl_thd_curr() != globals->sched_thd)) ret = sl_thd_activate(globals->sched_thd, tok, globals->timeout_next, 0); + } +#endif + /* either this thread is runnable at this point or a switch failed */ + assert(sl_thd_is_runnable(c) || ret); + + return ret; +} + +static inline int +sl_cs_exit_schedule_nospin_arg_timeout(struct sl_thd *to, cycles_t abs_timeout) +{ + struct sl_thd *t = to, *c = sl_thd_curr(); + struct sl_global_core *globals = sl__globals_core(); + sched_tok_t tok; + cycles_t now; + s64_t offset; + int ret; + struct cos_dcb_info *cb; + tcap_time_t timeout = 0; - if (t->properties & SL_THD_PROPERTY_OWN_TCAP && t->budget) { - assert(t->period); - assert(sl_thd_tcap(t) != globals->sched_tcap); + /* Don't abuse this, it is only to enable the tight loop around this function for races... */ +#ifdef SL_CS + if (likely(!sl_cs_owner())) sl_cs_enter(); +#endif - if (t->last_replenish == 0 || t->last_replenish + t->period <= now) { - tcap_res_t currbudget = 0; - cycles_t replenish = now - ((now - t->last_replenish) % t->period); + tok = cos_sched_sync(); + now = sl_now(); - ret = 0; - currbudget = (tcap_res_t)cos_introspect(ci, sl_thd_tcap(t), TCAP_GET_BUDGET); + offset = (s64_t)(globals->timer_next - now); + if (offset <= 0) sl_timeout_expended(now, globals->timer_next); + sl_timeout_wakeup_expired(now); - if (!cycles_same(currbudget, t->budget, SL_CYCS_DIFF) && currbudget < t->budget) { - tcap_res_t transfer = t->budget - currbudget; + /* + * Once we exit, we can't trust t's memory as it could be + * deallocated/modified, so cache it locally. If these values + * are out of date, the scheduler synchronization tok will + * catch it. 
This is a little twitchy and subtle, so lets put + * it in a function, here. + */ + if (likely(to)) { + t = to; + if (unlikely(!sl_thd_is_runnable(t))) to = NULL; + } + if (unlikely(!to)) { + struct sl_thd_policy *pt = sl_mod_schedule(); - /* tcap_transfer will assign sched_tcap's prio to t's tcap if t->prio == 0, which we don't want. */ - assert(t->prio >= TCAP_PRIO_MAX && t->prio <= TCAP_PRIO_MIN); - ret = cos_tcap_transfer(sl_thd_rcvcap(t), globals->sched_tcap, transfer, t->prio); - } + if (unlikely(!pt)) + t = globals->idle_thd; + else + t = sl_mod_thd_get(pt); + } + if (unlikely(!t)) t= globals->sched_thd; + +#ifdef SL_REPLENISH + sl_thd_replenish_no_cs(t, now); +#endif - if (likely(ret == 0)) t->last_replenish = replenish; - } + assert(t && sl_thd_is_runnable(t)); + if (offset <= 0 || + (abs_timeout > now && abs_timeout > globals->timer_next + globals->cyc_per_usec)) { + timeout = offset <= 0 ? globals->timer_next : (abs_timeout > now ? tcap_cyc2time(abs_timeout) : 0); } - assert(sl_thd_is_runnable(t)); +#ifdef SL_CS sl_cs_exit(); +#endif + if (likely(c == t && t == globals->sched_thd && timeout)) { + /* program the new timer.. */ + return cos_defswitch(globals->sched_thdcap, globals->sched_thd->prio, timeout, tok); + } + if (unlikely(t == c)) return 0; - ret = sl_thd_activate(t, tok); + /* + * if the requested timeout is greater than next timeout + * and timer is already programmed to be over a usec away, don't + * reprogam it. + * + * else, reprogram for an earlier timeout requested. + */ + + ret = sl_thd_activate_c(t, tok, timeout, 0, c, globals); + if (unlikely(!sl_thd_is_runnable(c))) return -EAGAIN; + +#ifdef SL_REPLENISH /* * dispatch failed with -EPERM because tcap associated with thread t does not have budget. * Block the thread until it's next replenishment and return to the scheduler thread. 
@@ -524,8 +813,9 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) if (unlikely(ret == -EPERM)) { assert(t != globals->sched_thd && t != globals->idle_thd); sl_thd_block_expiry(t); - if (unlikely(sl_thd_curr() != globals->sched_thd)) ret = sl_thd_activate(globals->sched_thd, tok); + if (unlikely(sl_thd_curr() != globals->sched_thd)) ret = sl_thd_activate_c(globals->sched_thd, tok, globals->timeout_next, 0, c, globals); } +#endif return ret; } @@ -557,6 +847,41 @@ sl_cs_exit_switchto(struct sl_thd *to) } } +static inline int +sl_cs_exit_schedule_nospin_timeout(cycles_t abs_timeout) +{ + return sl_cs_exit_schedule_nospin_arg_timeout(NULL, abs_timeout); +} + +static inline void +sl_cs_exit_schedule_timeout(cycles_t abs_timeout) +{ + while (sl_cs_exit_schedule_nospin_timeout(abs_timeout) && sl_now() < abs_timeout) + ; +} + +static inline void +sl_cs_exit_switchto_timeout(struct sl_thd *to, cycles_t abs_timeout) +{ + /* + * We only try once, so it is possible that we don't end up + * switching to the desired thread. However, this is always a + * case that the caller has to consider if the current thread + * has a higher priority than the "to" thread. + */ + if (sl_cs_exit_schedule_nospin_arg_timeout(to, abs_timeout)) { + sl_cs_exit_schedule_timeout(abs_timeout); + } +} + +static inline void +sl_cs_exit_switchto_c(struct sl_thd *c, struct sl_thd *n) +{ + if (sl_cs_exit_schedule_nospin_arg_c(c, n)) { + sl_cs_exit_schedule(); + } +} + /* * Initialization protocol in cos_init: initialization of * library-internal data-structures, and then the ability for the @@ -571,7 +896,7 @@ void sl_init(microsec_t period); /* * @cpubmp - cpu/cores on which this scheduler will run on! */ -void sl_init_cpubmp(microsec_t period, u32_t *cpubmp); +void sl_init_corebmp(microsec_t period, u32_t *corebmp); /* * sl_sched_loop internally calls the kernel api - cos_sched_rcv * which blocks (suspends) the calling thread if there are no pending events. 
@@ -590,5 +915,114 @@ void sl_sched_loop(void) __attribute__((noreturn)); * booter receive (INITRCV) end-point at the kernel level. */ void sl_sched_loop_nonblock(void) __attribute__((noreturn)); +static inline void +sl_thd_yield_thd_c(struct sl_thd *c, struct sl_thd *n) +{ + if (likely(c && n)) sl_cs_exit_switchto_c(c, n); + else sl_thd_yield_intern(0); +} + +static inline void +sl_thd_yield_thd(struct sl_thd *n) +{ + if (likely(n)) sl_cs_exit_switchto(n); + else sl_thd_yield_intern(0); +} + +static inline void +sl_thd_yield(thdid_t tid) +{ + if (likely(tid)) { + sl_cs_enter(); + sl_cs_exit_switchto(sl_thd_lkup(tid)); + } else { + sl_thd_yield_intern(0); + } +} + +static inline void +sl_thd_yield_timeout(thdid_t tid, cycles_t abs_timeout) +{ + if (likely(tid)) { + sl_cs_enter(); + sl_cs_exit_switchto_timeout(sl_thd_lkup(tid), abs_timeout); + } else { + sl_thd_yield_intern_timeout(abs_timeout); + } +} + +static inline void +sl_thd_event_info_reset(struct sl_thd *t) +{ + t->event_info.blocked = 0; + t->event_info.elapsed_cycs = 0; + t->event_info.next_timeout = 0; + t->event_info.epoch = 0; +} + +static inline void +sl_thd_event_enqueue(struct sl_thd *t, struct cos_thd_event *e) +{ + struct sl_global_core *g = sl__globals_core(); + + assert(e->epoch); + if (e->epoch <= t->event_info.epoch) return; + + if (ps_list_singleton(t, SL_THD_EVENT_LIST)) ps_list_head_append(&g->event_head, t, SL_THD_EVENT_LIST); + + t->event_info.blocked = e->blocked; + t->event_info.elapsed_cycs += e->elapsed_cycs; + t->event_info.next_timeout = e->next_timeout; +} + +static inline void +sl_thd_event_dequeue(struct sl_thd *t, struct cos_thd_event *e) +{ + ps_list_rem(t, SL_THD_EVENT_LIST); + + e->blocked = t->event_info.blocked; + e->elapsed_cycs = t->event_info.elapsed_cycs; + e->next_timeout = t->event_info.next_timeout; + sl_thd_event_info_reset(t); +} + +static inline int +sl_thd_rcv(rcv_flags_t flags) +{ + return cos_ul_rcv(sl_thd_rcvcap(sl_thd_curr()), flags, 
sl__globals_core()->timeout_next); +// /* FIXME: elapsed_cycs accounting..?? */ +// struct cos_thd_event ev = { .blocked = 1, .next_timeout = 0, .epoch = 0, .elapsed_cycs = 0 }; +// struct sl_thd *t = sl_thd_curr(); +// unsigned long *p = &sl_thd_dcbinfo(t)->pending, q = 0; +// int ret = 0; +// +// assert(sl_thd_rcvcap(t)); +// assert(!(flags & RCV_ULSCHED_RCV)); +// +//recheck: +// if ((q = ps_load(p)) == 0) { +// if (!(flags & RCV_ULONLY)) { +// ret = cos_rcv(sl_thd_rcvcap(t), flags); +// q = ps_load(p); +// goto done; +// } +// if (unlikely(flags & RCV_NON_BLOCKING)) return -EAGAIN; +// +// sl_cs_enter(); +// ev.epoch = sl_now(); +// sl_thd_event_enqueue(t, &ev); +// sl_thd_sched_block_no_cs(t, SL_THD_BLOCKED, 0); +// sl_cs_exit_switchto(sl__globals_core()->sched_thd); +// goto recheck; +// //q = ps_load(p); +// } +// assert(sl_thd_dcbinfo(t)->sp == 0); +// assert(q == 1); /* q should be 1 if the thread did not call COS_RCV and is woken up.. */ +// +//done: +// ps_upcas(p, q, 0); +////if (cos_spd_id() != 4) printc("[R%u]", cos_thdid()); +// return ret; +} #endif /* SL_H */ diff --git a/src/components/include/sl_plugins.h b/src/components/include/sl_plugins.h index 0a7d22db3f..a5266f5bc9 100644 --- a/src/components/include/sl_plugins.h +++ b/src/components/include/sl_plugins.h @@ -16,6 +16,7 @@ */ struct sl_thd_policy *sl_thd_alloc_backend(thdid_t tid); void sl_thd_free_backend(struct sl_thd_policy *t); +struct sl_thd_policy *sl_thd_migrate_backend(struct sl_thd_policy *t, cpuid_t core); /* * cos_aep_info structs cannot be stack allocated! 
* The thread_alloc_backened needs to provide struct cos_aep_info without @@ -42,6 +43,8 @@ static inline struct sl_thd_policy *sl_mod_thd_policy_get(struct sl_thd *t); void sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles); struct sl_thd_policy *sl_mod_schedule(void); +/* give me the thread at the end of the run-queue */ +struct sl_thd_policy *sl_mod_last_schedule(void); void sl_mod_block(struct sl_thd_policy *t); void sl_mod_wakeup(struct sl_thd_policy *t); diff --git a/src/components/include/sl_thd.h b/src/components/include/sl_thd.h index bd1035f27c..632759087f 100644 --- a/src/components/include/sl_thd.h +++ b/src/components/include/sl_thd.h @@ -27,12 +27,6 @@ typedef enum { SL_THD_PROPERTY_SEND = (1<<1), /* use asnd to dispatch to this thread */ } sl_thd_property_t; -struct event_info { - int blocked; /* 1 - blocked. 0 - awoken */ - cycles_t cycles; - tcap_time_t timeout; -}; - struct sl_thd { sl_thd_state_t state; /* @@ -93,10 +87,27 @@ struct sl_thd { cycles_t wakeup_cycs; /* actual last wakeup - used in timeout API for jitter information, etc */ int timeout_idx; /* timeout heap index, used in timeout API */ - struct event_info event_info; + struct cos_thd_event event_info; struct ps_list SL_THD_EVENT_LIST; /* list of events for the scheduler end-point */ + + struct cos_dcb_info *dcb; + + void *part_context; /* used by the parallelism stuff! 
*/ + struct ps_list partlist; }; +static inline struct cos_dcb_info * +sl_thd_dcbinfo(struct sl_thd *t) +{ return t->dcb; } + +static inline unsigned long * +sl_thd_ip(struct sl_thd *t) +{ return &t->dcb->ip; } + +static inline unsigned long * +sl_thd_sp(struct sl_thd *t) +{ return &t->dcb->sp; } + static inline struct cos_aep_info * sl_thd_aepinfo(struct sl_thd *t) { return (t->aepinfo); } diff --git a/src/components/include/sl_xcore.h b/src/components/include/sl_xcore.h new file mode 100644 index 0000000000..b06d3c51b4 --- /dev/null +++ b/src/components/include/sl_xcore.h @@ -0,0 +1,193 @@ +#ifndef SL_XCORE_H +#define SL_XCORE_H + +#include +#include +#include +#include + +#define SL_XCORE_PARAM_MAX 4 +#define SL_XCORE_MIGRATE_MAX 16 +#define SL_XCORE_KEEP_MIN 4 + +typedef enum { + SL_XCORE_THD_ALLOC = 0, + SL_XCORE_THD_ALLOC_EXT, + SL_XCORE_AEP_ALLOC, + SL_XCORE_AEP_ALLOC_EXT, + SL_XCORE_INITAEP_ALLOC, + SL_XCORE_THD_DEALLOC, /* thread delete, need it? */ + + SL_XCORE_THD_PARAM_SET, + SL_XCORE_THD_WAKEUP, + + SL_XCORE_LOAD_BALANCE, +} sl_xcore_req_t; + +struct sl_xcore_response { + /* request type */ + sl_xcore_req_t type; /* set by the client requesting */ + /* response fields */ + volatile int resp_ready; + union { + struct { + thdid_t tid; + } sl_xcore_resp_thd_alloc; + struct { + unsigned nthds; + thdid_t tid[SL_XCORE_MIGRATE_MAX]; + } sl_xcore_resp_load_balance; + }; +}; + +struct sl_xcore_request { + sl_xcore_req_t type; /* request type */ + cpuid_t client_core; /* client cpu making the request */ + thdid_t client_thd; + struct sl_xcore_response *response; + + union { + struct { + cos_thd_fn_t fn; + void *data; + sched_param_t params[SL_XCORE_PARAM_MAX]; /* scheduling parameters */ + int param_count; /* number of parameters */ + } sl_xcore_req_thd_alloc; + struct { + cos_thd_fn_t fn; + void *data; + int own_tcap; + cos_channelkey_t key; + sched_param_t params[SL_XCORE_PARAM_MAX]; /* scheduling parameters */ + int param_count; /* number of parameters */ + } 
sl_xcore_req_aep_alloc; + struct { + thdclosure_index_t idx; /* TODO: create thread in another component ? */ + struct cos_defcompinfo *dci; + sched_param_t params[SL_XCORE_PARAM_MAX]; /* scheduling parameters */ + int param_count; /* number of parameters */ + } sl_xcore_req_thd_alloc_ext; + struct { + thdclosure_index_t idx; + int own_tcap; + cos_channelkey_t key; + struct cos_defcompinfo *dci; + sched_param_t params[SL_XCORE_PARAM_MAX]; /* scheduling parameters */ + int param_count; /* number of parameters */ + } sl_xcore_req_aep_alloc_ext; + struct { + int is_sched; + int own_tcap; + struct cos_defcompinfo *dci, *sched; + sched_param_t params[SL_XCORE_PARAM_MAX]; /* scheduling parameters */ + int param_count; /* number of parameters */ + } sl_xcore_req_initaep_alloc; + struct { + thdid_t tid; + sched_param_t param; + } sl_xcore_req_thd_param_set; + struct { + thdid_t tid; + } sl_xcore_req_thd_wakeup; + struct { + int nthds; /* if 0 - migrate as many as the src can */ + } sl_xcore_req_load_balance; + }; +}; + +CK_RING_PROTOTYPE(xcore, sl_xcore_request); + +#define SL_XCORE_RING_SIZE (64 * sizeof(struct sl_xcore_request)) /* in sl_const.h? */ + +/* + * TODO: unionize with sl_thd? + * + * IMHO, no! This will occupy too much memory if unionized! + * Plus, that would require that we'd need cpuid in the sl_thd and many + * branches around in the code for core-local scheduling! + * Also, making this struct explicit, makes API use explicit. + * I should only be able to use: param_set(), wakeup() and perhaps free(). + * + * Change my mind! This is a shit ton of wastage with CACHE_ALIGNED! 
+ */ +struct sl_xcore_thd { + thdid_t thd; + cpuid_t core; + + asndcap_t asnd[NUM_CPU]; +} CACHE_ALIGNED; + +struct sl_xcore_thd *sl_xcore_thd_lookup(thdid_t tid); +struct sl_xcore_thd *sl_xcore_thd_lookup_init(thdid_t tid, cpuid_t core); +static inline thdid_t +sl_xcore_thd_thdid(struct sl_xcore_thd *t) +{ + return t->thd; +} +static inline cpuid_t +sl_xcore_thd_core(struct sl_xcore_thd *t) +{ + return t->core; +} + +/* perhaps move these to sl.h? */ +struct sl_global { + struct ck_ring xcore_ring[NUM_CPU]; /* mpsc ring! */ + + struct sl_xcore_request xcore_rbuf[NUM_CPU][SL_XCORE_RING_SIZE]; + u32_t core_bmp[(NUM_CPU + 7)/8]; /* bitmap of cores this scheduler is running on! */ + asndcap_t xcore_asnd[NUM_CPU][NUM_CPU]; + unsigned nthds_running[NUM_CPU] CACHE_ALIGNED; + struct cos_scb_info *scb_area; +} CACHE_ALIGNED; + +extern struct sl_global sl_global_data; + +static inline struct sl_global * +sl__globals(void) +{ + return &sl_global_data; +} + +static inline struct ck_ring * +sl__ring(cpuid_t core) +{ + return &(sl__globals()->xcore_ring[core]); +} + +static inline struct ck_ring * +sl__ring_curr(void) +{ + return sl__ring(cos_cpuid()); +} + +static inline struct sl_xcore_request * +sl__ring_buffer(cpuid_t core) +{ + return (sl__globals()->xcore_rbuf[core]); +} + +static inline struct sl_xcore_request * +sl__ring_buffer_curr(void) +{ + return sl__ring_buffer(cos_cpuid()); +} + +static inline int +sl_core_active(void) +{ + return bitmap_check(sl__globals()->core_bmp, cos_cpuid()); +} + +struct sl_xcore_thd *sl_xcore_thd_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int nparams, sched_param_t params[]); +struct sl_xcore_thd *sl_xcore_thd_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, thdclosure_index_t idx, int nparams, sched_param_t params[]); +struct sl_xcore_thd *sl_xcore_aep_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); +struct sl_xcore_thd *sl_xcore_aep_alloc_ext(cpuid_t 
core, struct cos_defcompinfo *dci, thdclosure_index_t idx, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); +struct sl_xcore_thd *sl_xcore_initaep_alloc(cpuid_t core, struct cos_defcompinfo *dci, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); +struct sl_xcore_thd *sl_xcore_initaep_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, struct cos_defcompinfo *sched, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); +void sl_xcore_thd_param_set(struct sl_xcore_thd *t, sched_param_t param); +void sl_xcore_thd_wakeup(struct sl_xcore_thd *t); +void sl_xcore_thd_wakeup_tid(thdid_t tid, cpuid_t core); +int sl_xcore_load_balance(void); + +#endif /* SL_XCORE_H */ diff --git a/src/components/include/sl_xcpu.h b/src/components/include/sl_xcpu.h deleted file mode 100644 index f8c915e471..0000000000 --- a/src/components/include/sl_xcpu.h +++ /dev/null @@ -1,122 +0,0 @@ -/** - * Redistribution of this file is permitted under the BSD two clause license. - * - * Copyright 2018, The George Washington University - * Author: Phani Gadepalli, phanikishoreg@gwu.edu - */ - -#ifndef SL_XCPU_H -#define SL_XCPU_H - -#include -#include -#include -#include - -#define SL_XCPU_PARAM_MAX 4 - -typedef enum { - SL_XCPU_THD_ALLOC = 0, - SL_XCPU_THD_ALLOC_EXT, - SL_XCPU_AEP_ALLOC, - SL_XCPU_AEP_ALLOC_EXT, - SL_XCPU_INITAEP_ALLOC, - SL_XCPU_THD_DEALLOC, /* thread delete, need it? 
*/ -} sl_xcpu_req_t; - -struct sl_xcpu_request { - sl_xcpu_req_t type; /* request type */ - cpuid_t client; /* client cpu making the request */ - int req_response; /* client needs a response */ - sched_param_t params[SL_XCPU_PARAM_MAX]; /* scheduling parameters */ - int param_count; /* number of parameters */ - - union { - struct { - cos_thd_fn_t fn; - void *data; - } sl_xcpu_req_thd_alloc; - struct { - cos_thd_fn_t fn; - void *data; - int own_tcap; - cos_channelkey_t key; - } sl_xcpu_req_aep_alloc; - struct { - thdclosure_index_t idx; /* TODO: create thread in another component ? */ - struct cos_defcompinfo *dci; - } sl_xcpu_req_thd_alloc_ext; - struct { - thdclosure_index_t idx; - int own_tcap; - cos_channelkey_t key; - struct cos_defcompinfo *dci; - } sl_xcpu_req_aep_alloc_ext; - struct { - int is_sched; - int own_tcap; - struct cos_defcompinfo *dci, *sched; - } sl_xcpu_req_initaep_alloc; - }; -}; - -CK_RING_PROTOTYPE(xcpu, sl_xcpu_request); - -#define SL_XCPU_RING_SIZE (64 * sizeof(struct sl_xcpu_request)) /* in sl_const.h? */ - -/* perhaps move these to sl.h? */ -struct sl_global { - struct ck_ring xcpu_ring[NUM_CPU]; /* mpsc ring! */ - - struct sl_xcpu_request xcpu_rbuf[NUM_CPU][SL_XCPU_RING_SIZE]; - u32_t cpu_bmp[NUM_CPU_BMP_WORDS]; /* bitmap of cpus this scheduler is running on! 
*/ - asndcap_t xcpu_asnd[NUM_CPU][NUM_CPU]; -} CACHE_ALIGNED; - -extern struct sl_global sl_global_data; - -static inline struct sl_global * -sl__globals(void) -{ - return &sl_global_data; -} - -static inline int -sl_cpu_active(void) -{ - return bitmap_check(sl__globals()->cpu_bmp, cos_cpuid()); -} - -static inline struct ck_ring * -sl__ring(cpuid_t cpu) -{ - return &(sl__globals()->xcpu_ring[cpu]); -} - -static inline struct ck_ring * -sl__ring_curr(void) -{ - return sl__ring(cos_cpuid()); -} - -static inline struct sl_xcpu_request * -sl__ring_buffer(cpuid_t cpu) -{ - return (sl__globals()->xcpu_rbuf[cpu]); -} - -static inline struct sl_xcpu_request * -sl__ring_buffer_curr(void) -{ - return sl__ring_buffer(cos_cpuid()); -} - -/* perhaps move these to sl.h? */ -int sl_xcpu_thd_alloc(cpuid_t cpu, cos_thd_fn_t fn, void *data, sched_param_t params[]); -int sl_xcpu_thd_alloc_ext(cpuid_t cpu, struct cos_defcompinfo *dci, thdclosure_index_t idx, sched_param_t params[]); -int sl_xcpu_aep_alloc(cpuid_t cpu, cos_thd_fn_t fn, void *data, int own_tcap, cos_channelkey_t key, sched_param_t params[]); -int sl_xcpu_aep_alloc_ext(cpuid_t cpu, struct cos_defcompinfo *dci, thdclosure_index_t idx, int own_tcap, cos_channelkey_t key, sched_param_t params[]); -int sl_xcpu_initaep_alloc(cpuid_t cpu, struct cos_defcompinfo *dci, int own_tcap, cos_channelkey_t key, sched_param_t params[]); -int sl_xcpu_initaep_alloc_ext(cpuid_t cpu, struct cos_defcompinfo *dci, struct cos_defcompinfo *sched, int own_tcap, cos_channelkey_t key, sched_param_t params[]); - -#endif /* SL_XCPU_H */ diff --git a/src/components/include/stacklist.h b/src/components/include/stacklist.h new file mode 100644 index 0000000000..edb6f9fd54 --- /dev/null +++ b/src/components/include/stacklist.h @@ -0,0 +1,94 @@ +#ifndef STACKLIST_H +#define STACKLIST_H + +/** + * Modified to support multi-core via a Treiber stack. This is not 100% + * a great solution as it isn't FIFO. 
However, we release *all* + * threads when unlocking, so the priority scheduling should take over + * at that point. + */ + +#include +#include + +struct stacklist { + cpuid_t coreid; + thdid_t thdid; + struct stacklist *next; +}; + +struct stacklist_head { + struct stacklist *head; +}; + +static inline void +stacklist_init(struct stacklist_head *h) +{ + h->head = NULL; +} + +/* + * Remove a thread from the list that has been woken. Return 0 on + * success, and 1 if it could not be removed. + */ +static inline int +stacklist_rem(struct stacklist *l) +{ + /* + * Not currently supported with Trebor Stack. Threads that + * wake early still have to wait their turn. + */ + return 1; +} + +/* Add a thread that is going to block */ +static inline void +stacklist_add(struct stacklist_head *h, struct stacklist *l) +{ + l->coreid = cos_cpuid(); + l->thdid = cos_thdid(); + l->next = NULL; + assert(h); + + while (1) { + struct stacklist *n = ps_load(&h->head); + + l->next = n; + if (ps_cas((unsigned long *)&h->head, (unsigned long)n, (unsigned long)l)) break; + } +} + +/* Get a thread to wake up, and remove its record! */ +static inline thdid_t +stacklist_dequeue(cpuid_t *core, struct stacklist_head *h) +{ + struct stacklist *sl = NULL; + + /* + * Only a single thread should trigger an event, and dequeue + * threads, but we'll implement this conservatively. Given + * this, please note that this should *not* iterate more than + * once. + */ + do { + sl = ps_load(&h->head); + if (unlikely(!sl)) return 0; + } while (!ps_cas((unsigned long *)&h->head, (unsigned long)sl, (unsigned long)sl->next)); + sl->next = NULL; + *core = sl->coreid; + + return sl->thdid; +} + +/* + * A thread that wakes up after blocking using a stacklist should be + * able to assume that it is no longer on the list. This enables them + * to assert on that fact. 
+ */ +static inline int +stacklist_is_removed(struct stacklist *l) +{ + return l->next == NULL; +} + +#endif /* STACKLIST_H */ diff --git a/src/components/interface/capmgr/capmgr.h b/src/components/interface/capmgr/capmgr.h index eb0f85b6d4..09fc89acbf 100644 --- a/src/components/interface/capmgr/capmgr.h +++ b/src/components/interface/capmgr/capmgr.h @@ -13,10 +13,10 @@ thdcap_t capmgr_initthd_create(spdid_t child, thdid_t *tid); thdcap_t capmgr_initaep_create(spdid_t child, struct cos_aep_info *aep, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, asndcap_t *sndret); -thdcap_t capmgr_thd_create(cos_thd_fn_t fn, void *data, thdid_t *tid); -thdcap_t capmgr_aep_create(struct cos_aep_info *a, cos_aepthd_fn_t fn, void *data, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax); -thdcap_t capmgr_thd_create_ext(spdid_t child, thdclosure_index_t idx, thdid_t *tid); -thdcap_t capmgr_aep_create_ext(spdid_t child, struct cos_aep_info *a, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv); +thdcap_t capmgr_thd_create(cos_thd_fn_t fn, void *data, thdid_t *tid, struct cos_dcb_info **dcb); +thdcap_t capmgr_aep_create(struct cos_aep_info *a, cos_aepthd_fn_t fn, void *data, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, struct cos_dcb_info **dcb); +thdcap_t capmgr_thd_create_ext(spdid_t child, thdclosure_index_t idx, thdid_t *tid, struct cos_dcb_info **dcb); +thdcap_t capmgr_aep_create_ext(spdid_t child, struct cos_aep_info *a, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, struct cos_dcb_info **dcb, arcvcap_t *extrcv); thdcap_t capmgr_thd_retrieve(spdid_t child, thdid_t t, thdid_t *inittid); thdcap_t capmgr_thd_retrieve_next(spdid_t child, thdid_t *tid); arcvcap_t capmgr_rcv_create(spdid_t child, thdid_t tid, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax); @@ -24,4 +24,10 @@ asndcap_t capmgr_asnd_create(spdid_t child, 
thdid_t t); asndcap_t capmgr_asnd_rcv_create(arcvcap_t rcv); asndcap_t capmgr_asnd_key_create(cos_channelkey_t key); +int capmgr_thd_migrate(thdid_t tid, thdcap_t tc, cpuid_t core); + +int capmgr_hw_attach(hwid_t hwid, thdid_t tid); +int capmgr_hw_periodic_attach(hwid_t hwid, thdid_t tid, unsigned int period_us); +int capmgr_hw_detach(hwid_t hwid); + #endif /* CAPMGR_H */ diff --git a/src/components/interface/capmgr/stubs/c_stub.c b/src/components/interface/capmgr/stubs/c_stub.c index a808528adc..c008e15361 100644 --- a/src/components/interface/capmgr/stubs/c_stub.c +++ b/src/components/interface/capmgr/stubs/c_stub.c @@ -10,14 +10,16 @@ #include #include -thdcap_t capmgr_initthd_create_cserialized(thdid_t *tid, int *unused, spdid_t s); -thdcap_t capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, u32_t spdid_owntc, u32_t key_ipimax, u32_t ipiwin32b); -thdcap_t capmgr_thd_create_cserialized(thdid_t *tid, int *unused, thdclosure_index_t idx); -thdcap_t capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, u32_t spdid_owntc_thdidx, u32_t chkey_ipimax, u32_t ipiwin32b); -thdcap_t capmgr_thd_create_ext_cserialized(thdid_t *tid, int *unused, spdid_t s, thdclosure_index_t idx); -thdcap_t capmgr_aep_create_cserialized(thdid_t *tid, u32_t *tcrcvret, u32_t owntc_tidx, u32_t key_ipimax, u32_t ipiwin32b); -thdcap_t capmgr_thd_retrieve_next_cserialized(thdid_t *tid, int *unused, spdid_t s); -thdcap_t capmgr_thd_retrieve_cserialized(thdid_t *inittid, int *unused, spdid_t s, thdid_t tid); +thdcap_t capmgr_initthd_create_cserialized(thdid_t *tid, int *unused, spdid_t s); +thdcap_t capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, u32_t spdid_owntc, u32_t key_ipimax, u32_t ipiwin32b); +thdcap_t capmgr_thd_create_cserialized(struct cos_dcb_info **dcb, thdid_t *tid, thdclosure_index_t idx); +u32_t capmgr_aep_create_ext_cserialized(struct cos_dcb_info **dcb, u32_t *rcvtcret, u32_t spdid_owntc_thdidx, u32_t chkey_ipimax, u32_t 
ipiwin32b); +/* rcvcap for spdid = s shall be obtained through a separate call to capmgr! */ +arcvcap_t capmgr_aep_rcv_retrieve_cserialized(spdid_t s, thdid_t tid); +thdcap_t capmgr_thd_create_ext_cserialized(struct cos_dcb_info **dcb, thdid_t *tid, spdid_t s, thdclosure_index_t idx); +u32_t capmgr_aep_create_cserialized(struct cos_dcb_info **dcb, u32_t *rcvtcret, u32_t owntc_tidx, u32_t key_ipimax, u32_t ipiwin32b); +thdcap_t capmgr_thd_retrieve_next_cserialized(thdid_t *tid, int *unused, spdid_t s); +thdcap_t capmgr_thd_retrieve_cserialized(thdid_t *inittid, int *unused, spdid_t s, thdid_t tid); arcvcap_t capmgr_rcv_create_cserialized(u32_t spd_tid, u32_t key_ipimax, u32_t ipiwin32b); arcvcap_t @@ -33,50 +35,67 @@ capmgr_rcv_create(spdid_t child, thdid_t tid, cos_channelkey_t key, microsec_t i thdcap_t capmgr_thd_retrieve(spdid_t child, thdid_t tid, thdid_t *inittid) { - int unused; + int r1, r2, r3; - return capmgr_thd_retrieve_cserialized(inittid, &unused, child, tid); + r1 = capmgr_thd_retrieve_cserialized((thdid_t *)&r2, &r3, child, tid); + *inittid = r2; + + return r1; } thdcap_t capmgr_thd_retrieve_next(spdid_t child, thdid_t *tid) { - int unused; + int r1, r2, r3; + + r1 = capmgr_thd_retrieve_next_cserialized((thdid_t *)&r2, &r3, child); + *tid = r2; - return capmgr_thd_retrieve_next_cserialized(tid, &unused, child); + return r1; } thdcap_t capmgr_initthd_create(spdid_t child, thdid_t *tid) { - int unused; + int r1, r2, r3; + + r1 = capmgr_initthd_create_cserialized((thdid_t *)&r2, &r3, child); + *tid = r2; - return capmgr_initthd_create_cserialized(tid, &unused, child); + return r1; } thdcap_t -capmgr_thd_create(cos_thd_fn_t fn, void *data, thdid_t *tid) +capmgr_thd_create(cos_thd_fn_t fn, void *data, thdid_t *tid, struct cos_dcb_info **dcb) { - int unused; + int r1, r2, r3; thdclosure_index_t idx = cos_thd_init_alloc(fn, data); - if (idx < 1) return 0; + if (unlikely(idx < 1)) return 0; + + r1 = capmgr_thd_create_cserialized((struct cos_dcb_info **)&r2, 
(thdid_t *)&r3, idx); + *dcb = (struct cos_dcb_info *)r2; + *tid = r3; - return capmgr_thd_create_cserialized(tid, &unused, idx); + return r1; } thdcap_t -capmgr_thd_create_ext(spdid_t child, thdclosure_index_t idx, thdid_t *tid) +capmgr_thd_create_ext(spdid_t child, thdclosure_index_t idx, thdid_t *tid, struct cos_dcb_info **dcb) { - int unused; + int r1, r2, r3; - return capmgr_thd_create_ext_cserialized(tid, &unused, child, idx); + r1 = capmgr_thd_create_ext_cserialized((struct cos_dcb_info **)&r2, (thdid_t *)&r3, child, idx); + *tid = r3; + *dcb = (struct cos_dcb_info *)r2; + + return r1; } thdcap_t -capmgr_aep_create(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax) +capmgr_aep_create(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, struct cos_dcb_info **dcb) { - u32_t tcrcvret = 0; + u32_t tcrcvret = 0, thdtidret = 0; thdcap_t thd = 0; arcvcap_t rcv = 0; tcap_t tc = 0; @@ -88,8 +107,11 @@ capmgr_aep_create(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, int if (idx < 1) return 0; - thd = capmgr_aep_create_cserialized(&tid, &tcrcvret, owntc_idx, key_ipimax, ipiwin32b); - if (!thd) return 0; + thdtidret = capmgr_aep_create_cserialized(dcb, &tcrcvret, owntc_idx, key_ipimax, ipiwin32b); + if (!thdtidret) return 0; + thd = thdtidret >> 16; + tid = (thdtidret << 16) >> 16; + if (!thd || !tid) return 0; aep->fn = fn; aep->data = data; @@ -102,9 +124,9 @@ capmgr_aep_create(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, int } thdcap_t -capmgr_aep_create_ext(spdid_t child, struct cos_aep_info *aep, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +capmgr_aep_create_ext(spdid_t child, struct cos_aep_info *aep, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, struct cos_dcb_info **dcb, arcvcap_t *extrcv) 
{ - u32_t drcvtidret = 0; + u32_t thdtidret = 0; u32_t tcrcvret = 0; thdid_t tid = 0; thdcap_t thd = 0; @@ -112,16 +134,20 @@ capmgr_aep_create_ext(spdid_t child, struct cos_aep_info *aep, thdclosure_index_ u32_t key_ipimax = (key << 16) | ((ipimax << 16) >> 16); u32_t ipiwin32b = (u32_t)ipiwin; - thd = capmgr_aep_create_ext_cserialized(&drcvtidret, &tcrcvret, owntc_spdid_thdidx, key_ipimax, ipiwin32b); - if (!thd) return thd; + thdtidret = capmgr_aep_create_ext_cserialized(dcb, &tcrcvret, owntc_spdid_thdidx, key_ipimax, ipiwin32b); + if (!thdtidret) return thd; + thd = thdtidret >> 16; + tid = (thdtidret << 16) >> 16; + if (!thd || !tid) return 0; aep->fn = NULL; aep->data = NULL; aep->thd = thd; - aep->tid = (drcvtidret << 16) >> 16; + aep->tid = tid; aep->rcv = tcrcvret >> 16; aep->tc = (tcrcvret << 16) >> 16; - *extrcv = drcvtidret >> 16; + *extrcv = capmgr_aep_rcv_retrieve_cserialized(child, tid); + assert(*extrcv); return aep->thd; } diff --git a/src/components/interface/capmgr/stubs/s_stub.S b/src/components/interface/capmgr/stubs/s_stub.S index a7dfb0be87..4059d6a5db 100644 --- a/src/components/interface/capmgr/stubs/s_stub.S +++ b/src/components/interface/capmgr/stubs/s_stub.S @@ -14,12 +14,18 @@ cos_asm_server_stub_rets(capmgr_thd_create_cserialized) cos_asm_server_stub_rets(capmgr_aep_create_cserialized) cos_asm_server_stub_rets(capmgr_thd_create_ext_cserialized) cos_asm_server_stub_rets(capmgr_aep_create_ext_cserialized) +cos_asm_server_stub(capmgr_aep_rcv_retrieve_cserialized) cos_asm_server_stub_rets(capmgr_thd_retrieve_cserialized) cos_asm_server_stub_rets(capmgr_thd_retrieve_next_cserialized) cos_asm_server_stub(capmgr_rcv_create_cserialized) cos_asm_server_stub(capmgr_asnd_create) cos_asm_server_stub(capmgr_asnd_rcv_create) cos_asm_server_stub(capmgr_asnd_key_create) +cos_asm_server_stub(capmgr_thd_migrate) + +cos_asm_server_stub(capmgr_hw_attach) +cos_asm_server_stub(capmgr_hw_periodic_attach) +cos_asm_server_stub(capmgr_hw_detach) 
cos_asm_server_stub(memmgr_heap_page_allocn) cos_asm_server_stub_rets(memmgr_shared_page_allocn_cserialized) diff --git a/src/components/interface/crt/Makefile b/src/components/interface/crt/Makefile new file mode 100644 index 0000000000..6015b0c902 --- /dev/null +++ b/src/components/interface/crt/Makefile @@ -0,0 +1,4 @@ +LIB_OBJS= +LIBS=$(LIB_OBJS:%.o=%.a) + +include ../Makefile.subdir diff --git a/src/components/interface/crt/chan_crt.h b/src/components/interface/crt/chan_crt.h new file mode 100644 index 0000000000..2d93167c45 --- /dev/null +++ b/src/components/interface/crt/chan_crt.h @@ -0,0 +1,7 @@ +#ifndef CHAN_CRT_H +#define CHAN_CRT_H + +int chan_out(unsigned long item); +unsigned long chan_in(void); + +#endif /* CHAN_CRT_H */ diff --git a/src/components/interface/crt/stubs/s_stub.S b/src/components/interface/crt/stubs/s_stub.S new file mode 100644 index 0000000000..806aea9e19 --- /dev/null +++ b/src/components/interface/crt/stubs/s_stub.S @@ -0,0 +1,20 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include + +.text +cos_asm_server_stub(chan_out) +cos_asm_server_stub(chan_in) +//cos_asm_server_stub(chan_init) +//cos_asm_server_stub(chan_teardown) +//cos_asm_server_stub(chan_in_get) +//cos_asm_server_stub(chan_out_get) +//cos_asm_server_stub(chan_send) +//cos_asm_server_stub(chan_recv) +//cos_asm_server_stub(chan_async_send) +//cos_asm_server_stub(chan_async_recv) diff --git a/src/components/interface/work/Makefile b/src/components/interface/work/Makefile new file mode 100644 index 0000000000..800adb919e --- /dev/null +++ b/src/components/interface/work/Makefile @@ -0,0 +1,4 @@ +B_OBJS= +LIBS=$(LIB_OBJS:%.o=%.a) + +include ../Makefile.subdir diff --git a/src/components/interface/work/stubs/c_stub.c b/src/components/interface/work/stubs/c_stub.c new file mode 100644 index 0000000000..aafec59e63 --- /dev/null +++ b/src/components/interface/work/stubs/c_stub.c @@ -0,0 +1,37 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2018, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include + +int work_cycs_cserialized(unsigned long *hielpased, unsigned long *loelapsed, unsigned long hi_cycs, unsigned long lo_cycs); +int work_usecs_cserialized(unsigned long *hielpased, unsigned long *loelapsed, unsigned long hi_usecs, unsigned long lo_usecs); + +cycles_t +work_cycs(cycles_t ncycs) +{ + unsigned long hi_in, lo_in, hi_out, lo_out; + + hi_in = (ncycs >> 32); + lo_in = ((ncycs << 32) >> 32); + + work_cycs_cserialized(&hi_out, &lo_out, hi_in, lo_in); + + return (((cycles_t) hi_out << 32) | (cycles_t)lo_out); +} + +microsec_t +work_usecs(microsec_t nusecs) +{ + unsigned long hi_in, lo_in, hi_out, lo_out; + + hi_in = (nusecs >> 32); + lo_in = ((nusecs << 32) >> 32); + + work_usecs_cserialized(&hi_out, &lo_out, hi_in, lo_in); + + return (((microsec_t) hi_out << 32) | (microsec_t)lo_out); +} diff --git a/src/components/interface/work/stubs/s_stub.S b/src/components/interface/work/stubs/s_stub.S new file mode 100644 index 0000000000..d3245b4e75 --- /dev/null +++ b/src/components/interface/work/stubs/s_stub.S @@ -0,0 +1,12 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2018, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include + +.text +cos_asm_server_stub_rets(work_cycs_cserialized) +cos_asm_server_stub_rets(work_usecs_cserialized) diff --git a/src/components/interface/work/work.h b/src/components/interface/work/work.h new file mode 100644 index 0000000000..9768993ceb --- /dev/null +++ b/src/components/interface/work/work.h @@ -0,0 +1,12 @@ +#ifndef WORK_H +#define WORK_H + +#include + +/* @return: number of actual cycles elapsed */ +cycles_t work_cycs(cycles_t ncycs); +/* @return: number of actual usecs elapsed */ +microsec_t work_usecs(microsec_t nusecs); + + +#endif /* WORK_H */ diff --git a/src/components/lib/Makefile b/src/components/lib/Makefile index cba4d0e8a0..8eb222d861 100644 --- a/src/components/lib/Makefile +++ b/src/components/lib/Makefile @@ -1,6 +1,6 @@ include Makefile.src Makefile.comp -LIB_OBJS=heap.o cobj_format.o cos_kernel_api.o cos_defkernel_api.o cos_ubench.o +LIB_OBJS=heap.o cobj_format.o cos_kernel_api.o cos_defkernel_api.o cos_dcb.o part_raw.o part_capmgr.o cos_ubench.o LIBS=$(LIB_OBJS:%.o=%.a) MANDITORY=c_stub.o cos_asm_upcall.o cos_asm_ainv.o cos_component.o MAND=$(MANDITORY_LIB) @@ -10,8 +10,8 @@ SIMPLE_STKLIB=simple_stklib.o CINC_ENV=$(CINC) export CINC_ENV -.PHONY: all sl ps ck sinv -all: $(LIBS) $(MAND) $(SIMPLE_STKLIB) sl sinv +.PHONY: all sl ps ck sinv cos_gomp posix cxx +all: $(LIBS) $(MAND) $(SIMPLE_STKLIB) sl posix sinv cos_gomp # we have to compile these without dietlibc so that there are not # symbol conflicts and this is why we have the %.a here and don't @@ -30,6 +30,9 @@ $(SIMPLE_STKLIB): $(SIMPLE_STACKS) sl: make $(MAKEFLAGS) -C sl +cos_gomp: + make $(MAKEFLAGS) -C cos_gomp + sinv: make $(MAKEFLAGS) -C sinv_async @@ -42,15 +45,18 @@ sinv: @$(CC) $(CFLAGS) $(CINC) -o $@ -c $^ clean: - $(info | [RM] Cleaning up directory) + $(info | [RM] Cleaning up libraries and directories) @rm -f a.out *.o *.a *.d *~ - make -C sl clean 
+ @make -C sl clean + @make -C sinv_async clean + @make -C posix clean + @make -C cos_gomp clean distclean: + $(info | [RM] Uninstalling external libraries) make -C musl-1.1.11 distclean # keep the following commands in one line. make executes each line # with a new shell. - make -C posix clean make -C libcxx clean make -C ck uninstall @@ -62,12 +68,15 @@ musl: ps: cd ps; ./configure cos x86 general; cd ..; make -C ps config ; make -C ps all - ck: make -C ck all make -C ck install -init: clean distclean musl ck ps all -# keep the following commands in one line. Same as above. +posix: make -C posix + +cxx: make -C libcxx + +init: clean distclean musl ck ps cxx all +# keep the following commands in one line. Same as above. diff --git a/src/components/lib/cos_component.c b/src/components/lib/cos_component.c index 15f5ab3122..d40ca97cb3 100644 --- a/src/components/lib/cos_component.c +++ b/src/components/lib/cos_component.c @@ -200,13 +200,17 @@ cos_upcall_fn(upcall_type_t t, void *arg1, void *arg2, void *arg3) cos_thd_entry_exec(idx); } } - return; + break; } default: /* fault! */ assert(0); return; } + + /* FIXME: for now, don't let threads page-fault on return! 
*/ + while (1) ; + return; } diff --git a/src/components/lib/cos_dcb.c b/src/components/lib/cos_dcb.c new file mode 100644 index 0000000000..e73069af8f --- /dev/null +++ b/src/components/lib/cos_dcb.c @@ -0,0 +1,96 @@ +#include +#include +#include + +static struct cos_dcbinfo_data _cos_dcbinfo[NUM_CPU]; + +void +cos_dcb_info_init_ext(struct cos_dcbinfo_data *cdi, struct cos_compinfo *ci, + dcbcap_t initdcbcap, vaddr_t initdcbaddr, dcboff_t start_off) +{ + memset(cdi, 0, sizeof(struct cos_dcbinfo_data)); + + cdi->dcbcaps[0] = initdcbcap; + cdi->dcbaddr[0] = initdcbaddr; + cdi->curr_cap_off = start_off; + cdi->curr_cap = 0; +} + +void +cos_dcb_info_init(struct cos_dcbinfo_data *cdi, struct cos_compinfo *ci) +{ + if (cos_spd_id() == 0) { + cos_dcb_info_init_ext(cdi, ci, LLBOOT_CAPTBL_CPU_INITDCB, + (vaddr_t)cos_init_dcb_get(), 1); + } else { + cos_dcb_info_init_ext(cdi, ci, 0, 0, 0); + } +} + +void +cos_dcb_info_init_curr(void) +{ + cos_dcb_info_init_curr_ext(0, 0, 0); +} + +void +cos_dcb_info_init_curr_ext(dcbcap_t initdcbcap, vaddr_t initdcbaddr, dcboff_t st_off) +{ + struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); + + if (initdcbcap == 0 && initdcbaddr == 0) { + + if (cos_spd_id() == 0) { + cos_dcb_info_init_ext(&_cos_dcbinfo[cos_cpuid()], ci, + LLBOOT_CAPTBL_CPU_INITDCB, (vaddr_t)cos_init_dcb_get(), 1); + + return; + } else { + initdcbaddr = cos_page_bump_intern_valloc(ci, PAGE_SIZE); + assert(initdcbaddr); + initdcbcap = cos_dcb_alloc(ci, ci->pgtbl_cap, initdcbaddr); + assert(initdcbcap); + st_off = 0; + } + } + cos_dcb_info_init_ext(&_cos_dcbinfo[cos_cpuid()], ci, initdcbcap, initdcbaddr, st_off); +} + +dcbcap_t +cos_dcb_info_alloc_curr(dcboff_t *dcboff, vaddr_t *dcbaddr) +{ + return cos_dcb_info_alloc(&_cos_dcbinfo[cos_cpuid()], dcboff, dcbaddr); +} + +dcbcap_t +cos_dcb_info_alloc(struct cos_dcbinfo_data *cdi, dcboff_t *dcboff, vaddr_t *dcbaddr) +{ + if (unlikely(cdi->dcbcaps[cdi->curr_cap] == 0)) { + *dcboff = 0; + *dcbaddr = 0; + + 
return 0; + } + if (cdi->curr_cap_off >= COS_DCB_PERPG_MAX) { + int ret; + unsigned short curr_off = cdi->curr_cap; + + assert(curr_off + 1 < (unsigned short)COS_DCB_MAX_CAPS && cdi->dcbcaps[curr_off + 1] == 0); + + cdi->dcbaddr[curr_off + 1] = cos_page_bump_intern_valloc(cdi->ci, PAGE_SIZE); + assert(cdi->dcbaddr[curr_off + 1]); + cdi->dcbcaps[curr_off + 1] = cos_dcb_alloc(cos_compinfo_get(cos_defcompinfo_curr_get()), + cdi->ci->pgtbl_cap, cdi->dcbaddr[curr_off + 1]); + + assert(cdi->dcbcaps[curr_off + 1]); + ret = ps_cas((unsigned long *)&cdi->curr_cap, curr_off, curr_off + 1); + assert(ret); + ret = ps_cas((unsigned long *)&cdi->curr_cap_off, cdi->curr_cap_off, 0); + assert(ret); + } + + *dcboff = ps_faa((unsigned long *)&cdi->curr_cap_off, 1); + *dcbaddr = cdi->dcbaddr[cdi->curr_cap] + (sizeof(struct cos_dcb_info) * (*dcboff)); + + return cdi->dcbcaps[cdi->curr_cap]; +} diff --git a/src/components/lib/cos_defkernel_api.c b/src/components/lib/cos_defkernel_api.c index 68caf64dc1..ceed2c2fbf 100644 --- a/src/components/lib/cos_defkernel_api.c +++ b/src/components/lib/cos_defkernel_api.c @@ -46,6 +46,17 @@ cos_defcompinfo_init(void) } +void +cos_defcompinfo_llinit(void) +{ + if (curr_defci_init_status == INITIALIZED) return; + + cos_defcompinfo_init_ext(BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, + BOOT_CAPTBL_SELF_COMP, (vaddr_t)cos_get_heap_ptr(), LLBOOT_CAPTBL_FREE); + +} + void cos_defcompinfo_init_ext(tcap_t sched_tc, thdcap_t sched_thd, arcvcap_t sched_rcv, pgtblcap_t pgtbl_cap, captblcap_t captbl_cap, compcap_t comp_cap, vaddr_t heap_ptr, capid_t cap_frontier) @@ -87,7 +98,7 @@ cos_defcompinfo_sched_init(void) } static int -cos_aep_alloc_intern(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, tcap_t tc, struct cos_aep_info *sched, cos_aepthd_fn_t fn, void *data, thdclosure_index_t idx) +cos_aep_alloc_intern(struct cos_aep_info *aep, struct 
cos_defcompinfo *dst_dci, tcap_t tc, struct cos_aep_info *sched, cos_aepthd_fn_t fn, void *data, thdclosure_index_t idx, dcbcap_t dcbcap, dcboff_t dcboff) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(defci); @@ -97,9 +108,9 @@ cos_aep_alloc_intern(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, assert(curr_defci_init_status == INITIALIZED); memset(aep, 0, sizeof(struct cos_aep_info)); - if (is_init) aep->thd = cos_initthd_alloc(ci, dst_ci->comp_cap); - else if (idx > 0) aep->thd = cos_thd_alloc_ext(ci, dst_ci->comp_cap, idx); - else aep->thd = cos_thd_alloc(ci, dst_ci->comp_cap, cos_aepthd_fn, (void *)aep); + if (is_init) aep->thd = cos_initthd_alloc(ci, dst_ci->comp_cap, dcbcap); + else if (idx > 0) aep->thd = cos_thd_alloc_ext(ci, dst_ci->comp_cap, idx, dcbcap, dcboff); + else aep->thd = cos_thd_alloc(ci, dst_ci->comp_cap, cos_aepthd_fn, (void *)aep, dcbcap, dcboff); assert(aep->thd); aep->tid = cos_introspect(ci, aep->thd, THD_GET_TID); if (!sched && is_init) return 0; @@ -121,7 +132,7 @@ cos_aep_alloc_intern(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, int cos_defcompinfo_child_alloc(struct cos_defcompinfo *child_defci, vaddr_t entry, vaddr_t heap_ptr, capid_t cap_frontier, - int is_sched) + int is_sched, dcbcap_t *initdcbcap) { int ret; struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); @@ -129,11 +140,22 @@ cos_defcompinfo_child_alloc(struct cos_defcompinfo *child_defci, vaddr_t entry, struct cos_compinfo *ci = cos_compinfo_get(defci); struct cos_compinfo *child_ci = cos_compinfo_get(child_defci); struct cos_aep_info *child_aep = cos_sched_aep_get(child_defci); + vaddr_t dcbaddr = 0; + dcbcap_t dcbcap = 0; + scbcap_t scbcap = 0; + + scbcap = cos_scb_alloc(ci); + assert(scbcap); assert(curr_defci_init_status == INITIALIZED); - ret = cos_compinfo_alloc(child_ci, heap_ptr, cap_frontier, entry, ci); + ret = cos_compinfo_alloc(child_ci, scbcap, heap_ptr, cap_frontier, 
entry, ci); if (ret) return ret; - ret = cos_aep_alloc_intern(child_aep, child_defci, 0, is_sched ? sched_aep : NULL, NULL, NULL, 0); + dcbaddr = (vaddr_t)cos_page_bump_intern_valloc(child_ci, PAGE_SIZE); + assert(dcbaddr); + dcbcap = cos_dcb_alloc(ci, child_ci->pgtbl_cap, dcbaddr); + assert(dcbcap); + ret = cos_aep_alloc_intern(child_aep, child_defci, 0, is_sched ? sched_aep : NULL, NULL, NULL, 0, dcbcap, 0); + *initdcbcap = dcbcap; return ret; } @@ -147,29 +169,29 @@ cos_defcompinfo_childid_init(struct cos_defcompinfo *child_defci, spdid_t c) } int -cos_initaep_alloc(struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, int is_sched) +cos_initaep_alloc(struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, int is_sched, dcbcap_t dcap) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); struct cos_aep_info *child_aep = cos_sched_aep_get(dst_dci); struct cos_aep_info *sched_use = is_sched ? (sched ? sched : sched_aep) : NULL; - return cos_aep_alloc_intern(child_aep, dst_dci, 0, sched_use, NULL, NULL, 0); + return cos_aep_alloc_intern(child_aep, dst_dci, 0, sched_use, NULL, NULL, 0, dcap, 0); } int -cos_initaep_tcap_alloc(struct cos_defcompinfo *dst_dci, tcap_t tc, struct cos_aep_info *sched) +cos_initaep_tcap_alloc(struct cos_defcompinfo *dst_dci, tcap_t tc, struct cos_aep_info *sched, dcbcap_t dcap) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); struct cos_aep_info *child_aep = cos_sched_aep_get(dst_dci); struct cos_aep_info *sched_use = sched ? 
sched : sched_aep; - return cos_aep_alloc_intern(child_aep, dst_dci, tc, sched_use, NULL, NULL, 0); + return cos_aep_alloc_intern(child_aep, dst_dci, tc, sched_use, NULL, NULL, 0, dcap, 0); } int -cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, thdclosure_index_t idx) +cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, thdclosure_index_t idx, dcbcap_t dcap, dcboff_t doff) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); @@ -178,11 +200,11 @@ cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, str if (!sched) sched_aep = cos_sched_aep_get(dst_dci); else sched_aep = sched; - return cos_aep_alloc_intern(aep, dst_dci, 0, sched_aep, NULL, NULL, idx); + return cos_aep_alloc_intern(aep, dst_dci, 0, sched_aep, NULL, NULL, idx, dcap, doff); } int -cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, tcap_t tc, thdclosure_index_t idx) +cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, tcap_t tc, thdclosure_index_t idx, dcbcap_t dcap, dcboff_t doff) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); @@ -192,25 +214,25 @@ cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci if (!sched) sched_aep = cos_sched_aep_get(dst_dci); else sched_aep = sched; - return cos_aep_alloc_intern(aep, dst_dci, tc, sched_aep, NULL, NULL, idx); + return cos_aep_alloc_intern(aep, dst_dci, tc, sched_aep, NULL, NULL, idx, dcap, doff); } int -cos_aep_alloc(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data) +cos_aep_alloc(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, dcbcap_t dcap, dcboff_t doff) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct 
cos_aep_info *sched_aep = cos_sched_aep_get(defci); - return cos_aep_alloc_intern(aep, defci, 0, sched_aep, fn, data, 0); + return cos_aep_alloc_intern(aep, defci, 0, sched_aep, fn, data, 0, dcap, doff); } int -cos_aep_tcap_alloc(struct cos_aep_info *aep, tcap_t tc, cos_aepthd_fn_t fn, void *data) +cos_aep_tcap_alloc(struct cos_aep_info *aep, tcap_t tc, cos_aepthd_fn_t fn, void *data, dcbcap_t dcap, dcboff_t doff) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); - return cos_aep_alloc_intern(aep, defci, tc, sched_aep, fn, data, 0); + return cos_aep_alloc_intern(aep, defci, tc, sched_aep, fn, data, 0, dcap, doff); } int diff --git a/src/components/lib/cos_gomp/Makefile b/src/components/lib/cos_gomp/Makefile new file mode 100644 index 0000000000..ad4c1f75f9 --- /dev/null +++ b/src/components/lib/cos_gomp/Makefile @@ -0,0 +1,20 @@ +include Makefile.src Makefile.comp + +OBJS=cos_omp.o cos_gomp.o +LIB=cos_gomp +CINC+=-m32 + +.PHONY: all clean +all: $(LIB) + @cp *.a ../ + +%.o:%.c + $(info | [CC] Compiling C file $^ into $@) + @$(CC) $(CFLAGS) $(CINC) -o $@ -c $< + +$(LIB): $(OBJS) + $(info | [LD] Creating library file lib$(LIB).a) + @$(AR) cr lib$(LIB).a $^ + +clean: + @rm -f *.o *.a *.d diff --git a/src/components/lib/cos_gomp/cos_gomp.c b/src/components/lib/cos_gomp/cos_gomp.c new file mode 100644 index 0000000000..1c338c537b --- /dev/null +++ b/src/components/lib/cos_gomp/cos_gomp.c @@ -0,0 +1,385 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. + * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + * + * + * NOTE: There is no header file for this library! + * This is a backend for GOMP API in GCC and + * replaces LIBGOMP for composite! + */ + +#include +#include +#include +#include /* for now, single core lock! 
*/ +#include + +#include "cos_gomp.h" +#include +#include + +static struct crt_lock _glock; /* global lock for critical sections */ + +static inline struct part_task * +_cos_gomp_alloc_explicit(void) +{ + return part_task_alloc(0); +} + +void +cos_gomp_init(void) +{ + static int first_one = NUM_CPU, init_done = 0; + + if (ps_cas(&first_one, NUM_CPU, cos_cpuid())) { + crt_lock_init(&_glock); + cos_omp_init(); + init_done = 1; + } else { + while(!ps_load(&init_done)) ; + } + part_init(); +} + +static inline void +_gomp_parallel_start(struct part_task *pt, void (*fn) (void *), void *data, unsigned num_threads, unsigned flags) +{ + int parent_off; + struct sl_thd *t = sl_thd_curr(); + struct part_task *parent = (struct part_task *)t->part_context; + + if (parent) assert(ps_load(&in_main_parallel)); + + num_threads = (num_threads == 0 || num_threads > COS_GOMP_MAX_THDS) ? COS_GOMP_MAX_THDS : num_threads; + + /* nesting? */ +#if !defined(PART_ENABLE_NESTED) + if (unlikely(parent)) num_threads = 1; +#endif + + pt->state = PART_TASK_S_ALLOCATED; + part_task_init(pt, PART_TASK_T_WORKSHARE, parent, num_threads, fn, data, NULL); + assert(pt->nthds == num_threads); + if (unlikely(parent)) { + parent_off = part_task_add_child(parent, pt); + assert(parent_off >= 0); + } + t->part_context = pt; + /* should not append to workshare list if it's a task with nthds == 1 */ + part_list_append(pt); +} + +static inline void +_gomp_parallel_end(struct part_task *pt) +{ + /* implicit hard barrier. 
only master thread to deinit task and all other threads just go back to pool */ + part_task_end(pt); +} + +/* GOMP_parallel prototype from libgomp within gcc */ +void +GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, + unsigned int flags) +{ + struct part_task *prt = NULL; + struct part_task pt; + +#if defined(PART_ENABLE_NESTED) + prt = &pt; +#else + struct sl_thd *t = sl_thd_curr(); + struct part_task *parent = (struct part_task *)t->part_context; + + /* child parallel will not be nested, will be run by this thread and also not added to the global list */ + if(parent) prt = &pt; + else prt = &main_task; +#endif + + _gomp_parallel_start(prt, fn, data, num_threads, flags); + fn(data); + _gomp_parallel_end(prt); +} + +bool +GOMP_single_start(void) +{ + struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; + int i; + int coff = part_task_work_thd_num(t, PART_CURR_THD); + unsigned b = 1 << coff; + + assert(coff >= 0 && coff < (int)t->nthds); + for (i = t->ws_off[coff] + 1; i < PART_MAX_WORKSHARES; i++) { + struct part_workshare *pw = &t->ws[i]; + unsigned c; + + if (ps_load(&pw->type) == PART_WORKSHARE_NONE) { + /* perhaps one of the threads just converted it to a single */ + if (!ps_cas(&pw->type, PART_WORKSHARE_NONE, PART_WORKSHARE_SINGLE)) assert(pw->type == PART_WORKSHARE_SINGLE); + } + if (ps_load(&pw->type) != PART_WORKSHARE_SINGLE) continue; + +retry_bmp: + c = ps_load(&pw->worker_bmp); + /* if already went through this, should not have called start! */ + assert(!(c & b)); + + /* + * this thd, add to worker bmp to indicate it reached the construct. + * if this is the first to reach, then return "true", else "false". + * + * if cas failed, try again as you have to indicate that this thd + * has done this construct! + */ + if (ps_cas(&pw->worker_bmp, c, c | b)) { + t->ws_off[coff] = i; + + return c ? false : true; + } + goto retry_bmp; + } + + assert(0); /* exceed the number of workshares? 
*/ + + return false; +} + +void +GOMP_barrier (void) +{ + struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; + + part_task_barrier(t, 0); +} + +static inline bool +_gomp_loop_dynamic_next(struct part_task *t, struct part_workshare *w, long *s, long *e) +{ + long cn, left, wrk = 0; + +retry: + cn = ps_load(&w->next); + left = w->end - cn; + + if (unlikely(left == 0)) return false; + /* todo: incr <= 0 */ + assert(w->inc > 0); + + wrk = w->chunk_sz; + if (unlikely(left < wrk)) wrk = left; + if (!ps_cas(&w->next, cn, cn + wrk)) goto retry; + + *s = cn; + *e = cn + wrk; + + return true; +} + +bool +GOMP_loop_dynamic_start (long start, long end, long incr, long chunk_size, + long *istart, long *iend) +{ + struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; + int i; + int coff = part_task_work_thd_num(t, PART_CURR_THD); + unsigned b = 1 << coff; + + assert(coff >= 0 && coff < (int)t->nthds); + for (i = t->ws_off[coff] + 1; i < PART_MAX_WORKSHARES; i++) { + struct part_workshare *pw = &t->ws[i]; + unsigned c; + + if (ps_load(&pw->type) == PART_WORKSHARE_NONE) { + /* perhaps one of the threads just converted it to a loop */ + if (!ps_cas(&pw->type, PART_WORKSHARE_NONE, PART_WORKSHARE_LOOP_DYNAMIC)) assert(pw->type == PART_WORKSHARE_LOOP_DYNAMIC); + } + + if (ps_load(&pw->type) != PART_WORKSHARE_LOOP_DYNAMIC) continue; + +retry_bmp: + c = ps_load(&pw->worker_bmp); + /* if already went through this, should not have called start! */ + assert(!(c & b)); + + /* + * this thd, add to worker bmp to indicate it reached the construct. 
+ */ + if (ps_cas(&pw->worker_bmp, c, c | b)) t->ws_off[coff] = i; + else goto retry_bmp; + + /* all threads participating will initialize to the same values */ + if (unlikely(!pw->end)) { + pw->chunk_sz = chunk_size; + pw->inc = incr; + pw->st = start; + pw->end = end; + } + + if (likely(istart && iend)) return _gomp_loop_dynamic_next(t, pw, istart, iend); + else return true; + } + + assert(0); + + return false; +} + +void +GOMP_parallel_loop_dynamic (void (*fn) (void *), void *data, + unsigned num_threads, long start, long end, + long incr, long chunk_size, unsigned flags) +{ + struct part_task *prt = NULL; + struct part_task pt; + bool ret; + +#if defined(PART_ENABLE_NESTED) + prt = &pt; +#else + struct sl_thd *t = sl_thd_curr(); + struct part_task *parent = (struct part_task *)t->part_context; + + /* child parallel will not be nested, will be run by this thread and also not added to the global list */ + if (parent) prt = &pt; + else prt = &main_task; +#endif + + _gomp_parallel_start(prt, fn, data, num_threads, flags); + ret = GOMP_loop_dynamic_start(start, end, incr, chunk_size, NULL, NULL); + assert(ret == true); + + fn(data); + _gomp_parallel_end(prt); +} + +bool +GOMP_loop_dynamic_next (long *istart, long *iend) +{ + struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; + unsigned coff = part_task_work_thd_num(t, PART_CURR_THD); + int woff = t->ws_off[coff]; + + if (unlikely(woff < 0)) t->ws_off[coff] = woff = 0; + assert(t->ws[woff].type == PART_WORKSHARE_LOOP_DYNAMIC); + + return _gomp_loop_dynamic_next(t, &t->ws[woff], istart, iend); +} + +void +GOMP_loop_end (void) +{ + struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; + unsigned coff = part_task_work_thd_num(t, PART_CURR_THD); + int woff = t->ws_off[coff], c = 0; + + assert(t->ws[woff].type == PART_WORKSHARE_LOOP_DYNAMIC); + + part_task_barrier(t, 0); +} + +void +GOMP_loop_end_nowait (void) +{ + struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; + 
unsigned coff = part_task_work_thd_num(t, PART_CURR_THD); + int woff = t->ws_off[coff], c = 0; + + assert(t->ws[woff].type == PART_WORKSHARE_LOOP_DYNAMIC); +} + +void +GOMP_critical_start (void) +{ + crt_lock_take(&_glock); +} + +void +GOMP_critical_end (void) +{ + crt_lock_release(&_glock); +} + +void +GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), + long arg_size, long arg_align, bool if_clause, unsigned flags, + void **depend, int priority) +{ + struct part_task *parent = (struct part_task *)sl_thd_curr()->part_context; + int parent_off = -1, ret = -1; + + /* + * There should be nothing that prevents us to enqueue a task that + * has a dependency, in or out! + * The thread that pops this task should potentially do the dependency + * tracking before/after execution of the function. + */ + /* TODO: depend, flags, etc! */ + assert(depend == NULL); + + if (if_clause) { + struct part_task *pt; + struct part_data *d; + char *arg = NULL; + + pt = _cos_gomp_alloc_explicit(); + assert(pt); + d = part_data_alloc(); + assert(d); + + assert(pt && d); + assert(arg_size + arg_align - 1 <= PART_MAX_DATA); + memset(d->data, 0, PART_MAX_DATA); + arg = (char *) (((uintptr_t) d->data + arg_align - 1) + & ~(uintptr_t) (arg_align - 1)); + if (cpyfn) cpyfn(arg, data); + else memcpy(arg, data, arg_size); + + assert(parent); + part_task_init(pt, PART_TASK_T_TASK, parent, 1, fn, arg, d); + parent_off = part_task_add_child(parent, pt); + assert(parent_off >= 0); + assert(pt->type == PART_TASK_T_TASK); + + do { + ret = part_deque_push(pt); + } while (ret == -EAGAIN); + assert(ret == 0); + /* wake up a thread that might potentially run this workload */ + part_pool_wakeup(); + } else { + /* if_clause is false, task is an included/undeferred task */ + struct part_task pt; + + assert(parent); + part_task_init(&pt, PART_TASK_T_TASK, parent, 1, fn, data, NULL); + parent_off = part_task_add_child(parent, &pt); + assert(parent_off >= 0); + sl_thd_curr()->part_context 
= &pt; + pt.workers[0] = PART_CURR_THD; + + if (cpyfn) { + char buf[arg_size + arg_align - 1]; + char *arg = (char *) (((uintptr_t) buf + arg_align - 1) + & ~(uintptr_t) (arg_align - 1)); + + cpyfn(arg, data); + fn(arg); + } else { + fn(data); + } + + part_task_end(&pt); + sl_thd_curr()->part_context = pt.parent; + } +} + +void +GOMP_taskwait (void) +{ + struct part_task *t = sl_thd_curr()->part_context; + + part_task_wait_children(t); + /* no barriers of course! */ +} diff --git a/src/components/lib/cos_gomp/cos_gomp.h b/src/components/lib/cos_gomp/cos_gomp.h new file mode 100644 index 0000000000..3cce60a1fe --- /dev/null +++ b/src/components/lib/cos_gomp/cos_gomp.h @@ -0,0 +1,11 @@ +#ifndef COS_GOMP_H +#define COS_GOMP_H + +#include + +#define COS_GOMP_MAX_THDS PART_MAX_PAR_THDS +#define COS_GOMP_CORE_MAX_THDS PART_MAX_CORE_THDS +#define COS_GOMP_MAX_CHILD PART_MAX_CHILD +#define COS_GOMP_MAX_TASKS PART_MAX_TASKS + +#endif /* COS_GOMP_H */ diff --git a/src/components/lib/cos_gomp/cos_omp.c b/src/components/lib/cos_gomp/cos_omp.c new file mode 100644 index 0000000000..b74ea94785 --- /dev/null +++ b/src/components/lib/cos_gomp/cos_omp.c @@ -0,0 +1,141 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include +#include +#include +#include +#include + +#define COS_OMP_NUM_DEVS 1 + +static struct cos_icv_global_env cos_icv_glbenv; +static struct cos_icv_device_env cos_icv_devenv[COS_OMP_NUM_DEVS]; +static struct cos_icv_data_env cos_icv_init_dataenv; +static struct cos_icv_implicittask_env cos_icv_init_implitskenv; +static unsigned int _cos_omp_init_done = 0; +static unsigned int _cycs_per_usec = 0; + +#define _USEC_TO_SEC_d(x) (((double)x)/(double)(1000*1000)) +#define _CYCS_TO_SEC_d(x) _USEC_TO_SEC_d((x)/(double)_cycs_per_usec) + +__GOMP_NOTHROW double +omp_get_wtime(void) +{ + cycles_t now; + + rdtscll(now); + return _CYCS_TO_SEC_d(now); +} + +__GOMP_NOTHROW int +omp_get_num_procs(void) +{ + return NUM_CPU; +} + +__GOMP_NOTHROW int +omp_get_max_threads(void) +{ + return COS_GOMP_MAX_THDS; +} + +__GOMP_NOTHROW int +omp_get_num_threads(void) +{ + struct sl_thd *t = sl_thd_curr(); + struct part_task *pt = (struct part_task *)t->part_context; + + if (pt) return pt->nthds; + + return 1; +} + +__GOMP_NOTHROW int +omp_get_thread_num(void) +{ + struct sl_thd *t = sl_thd_curr(); + struct part_task *pt = (struct part_task *)t->part_context; + + if (!pt) return 0; + + return part_task_work_thd_num(pt, PART_CURR_THD); +} + +static inline void +cos_omp_icv_global_init(void) +{ + assert(!_cos_omp_init_done); + /* TODO: what is not int? what is not zero? */ + /* cos_icv_glbenv.xxxx = yyyy; */ +} + +void +cos_omp_icv_data_init(struct cos_icv_data_env *icvde) +{ + if (unlikely(icvde == &cos_icv_init_dataenv)) { + assert(!_cos_omp_init_done); /* init only on startup! */ + + /* TODO: what is not int? what is not zero! 
*/ + return; + } + + assert(_cos_omp_init_done); + memcpy(icvde, &cos_icv_init_dataenv, sizeof(struct cos_icv_data_env)); +} + +void +cos_omp_icv_implitsk_init(struct cos_icv_implicittask_env *icvite) +{ + if (unlikely(icvite == &cos_icv_init_implitskenv)) { + assert(!_cos_omp_init_done); /* init only on startup! */ + + /* TODO: what is not int? what is not zero! */ + return; + } + + assert(_cos_omp_init_done); + memcpy(icvite, &cos_icv_init_implitskenv, sizeof(struct cos_icv_implicittask_env)); +} + +void +cos_omp_icv_device_init(struct cos_icv_device_env *icvdve, unsigned dev_no) +{ + assert(dev_no < COS_OMP_NUM_DEVS); + + if (unlikely(icvdve == &cos_icv_devenv[dev_no])) { + assert(!_cos_omp_init_done); /* init only on startup! */ + + /* TODO: what is not int? what is not zero! */ + return; + } + + assert(_cos_omp_init_done); + memcpy(icvdve, &cos_icv_devenv[dev_no], sizeof(struct cos_icv_device_env)); +} + +static inline void +cos_omp_icv_init(void) +{ + cos_omp_icv_global_init(); + + cos_omp_icv_device_init(&cos_icv_devenv[0], 0); + + cos_omp_icv_data_init(&cos_icv_init_dataenv); + cos_omp_icv_implitsk_init(&cos_icv_init_implitskenv); +} + +void +cos_omp_init(void) +{ + _cycs_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + + assert(_cycs_per_usec); + + cos_omp_icv_init(); + _cos_omp_init_done = 1; +} diff --git a/src/components/lib/cos_kernel_api.c b/src/components/lib/cos_kernel_api.c index 54e5afc647..8b0e8d4cd5 100644 --- a/src/components/lib/cos_kernel_api.c +++ b/src/components/lib/cos_kernel_api.c @@ -34,7 +34,7 @@ __compinfo_metacap(struct cos_compinfo *ci) static inline void cos_vasfrontier_init(struct cos_compinfo *ci, vaddr_t heap_ptr) { - ci->vas_frontier = heap_ptr; + ci->vas_frontier = heap_ptr; /* * The first allocation should trigger PTE allocation, unless * it is in the middle of a PGD, in which case we assume one @@ -71,24 +71,23 @@ cos_capfrontier_init(struct cos_compinfo *ci, capid_t cap_frontier) void 
cos_compinfo_init(struct cos_compinfo *ci, pgtblcap_t pgtbl_cap, captblcap_t captbl_cap, compcap_t comp_cap, - vaddr_t heap_ptr, capid_t cap_frontier, struct cos_compinfo *ci_resources) + vaddr_t heap_ptr, capid_t cap_frontier, struct cos_compinfo *ci_resources) { assert(ci && ci_resources); assert(cap_frontier % CAPMAX_ENTRY_SZ == 0); ci->memsrc = ci_resources; assert(ci_resources->memsrc == ci_resources); /* prevent infinite data-structs */ - - ci->pgtbl_cap = pgtbl_cap; - ci->captbl_cap = captbl_cap; - ci->comp_cap = comp_cap; - - cos_vasfrontier_init(ci, heap_ptr); - cos_capfrontier_init(ci, cap_frontier); - ps_lock_init(&ci->cap_lock); ps_lock_init(&ci->mem_lock); ps_lock_init(&ci->va_lock); + + ci->pgtbl_cap = pgtbl_cap; + ci->captbl_cap = captbl_cap; + ci->comp_cap = comp_cap; + + cos_capfrontier_init(ci, cap_frontier); + cos_vasfrontier_init(ci, heap_ptr); } /**************** [Memory Capability Allocation Functions] ***************/ @@ -469,7 +468,7 @@ __page_bump_mem_alloc(struct cos_compinfo *ci, vaddr_t *mem_addr, vaddr_t *mem_f struct cos_compinfo *meta = __compinfo_metacap(ci); size_t rounded; - printd("__page_bump_alloc\n"); + printd("__page_bump_mem_alloc\n"); assert(sz % PAGE_SIZE == 0); assert(meta == __compinfo_metacap(meta)); /* prevent unbounded structures */ @@ -506,8 +505,14 @@ __page_bump_valloc(struct cos_compinfo *ci, size_t sz) return ret_addr; } +vaddr_t +cos_page_bump_intern_valloc(struct cos_compinfo *ci, size_t sz) +{ + return __page_bump_valloc(ci, sz); +} + static vaddr_t -__page_bump_alloc(struct cos_compinfo *ci, size_t sz) +__page_bump_alloc(struct cos_compinfo *ci, size_t sz, int shared) { struct cos_compinfo *meta = __compinfo_metacap(ci); vaddr_t heap_vaddr, heap_cursor, heap_limit; @@ -532,7 +537,7 @@ __page_bump_alloc(struct cos_compinfo *ci, size_t sz) for (heap_cursor = heap_vaddr; heap_cursor < heap_limit; heap_cursor += PAGE_SIZE) { vaddr_t umem; - umem = __umem_bump_alloc(ci); + umem = shared ? 
__kmem_bump_alloc(ci) : __umem_bump_alloc(ci); if (!umem) return 0; /* Actually map in the memory. */ @@ -574,7 +579,7 @@ __alloc_mem_cap(struct cos_compinfo *ci, cap_t ct, vaddr_t *kmem, capid_t *cap) } static thdcap_t -__cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t init_data) +__cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t init_data, dcbcap_t dc, dcboff_t off) { vaddr_t kmem; capid_t cap; @@ -585,9 +590,11 @@ __cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t init if (__alloc_mem_cap(ci, CAP_THD, &kmem, &cap)) return 0; assert(!(init_data & ~((1 << 16) - 1))); - /* TODO: Add cap size checking */ - if (call_cap_op(ci->captbl_cap, CAPTBL_OP_THDACTIVATE, (init_data << 16) | cap, - __compinfo_metacap(ci)->mi.pgtbl_cap, kmem, comp)) + assert(!(off & ~((1 << 9) - 1))); + assert(kmem && (round_to_page(kmem) == kmem)); + + if (call_cap_op(ci->captbl_cap, CAPTBL_OP_THDACTIVATE, __compinfo_metacap(ci)->mi.pgtbl_cap | (cap << 16), + kmem, comp << 16 | dc, off << 16 | init_data)) BUG(); return cap; @@ -596,30 +603,61 @@ __cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t init #include thdcap_t -cos_thd_alloc_ext(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t idx) +cos_thd_alloc_ext(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t idx, dcbcap_t dc, dcboff_t off) { if (idx < 1) return 0; - return __cos_thd_alloc(ci, comp, idx); + return __cos_thd_alloc(ci, comp, idx, dc, off); } thdcap_t -cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, cos_thd_fn_t fn, void *data) +cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, cos_thd_fn_t fn, void *data, dcbcap_t dc, dcboff_t off) { int idx = cos_thd_init_alloc(fn, data); thdcap_t ret; if (idx < 1) return 0; - ret = __cos_thd_alloc(ci, comp, idx); + ret = __cos_thd_alloc(ci, comp, idx, dc, off); if (!ret) cos_thd_init_free(idx); return ret; } thdcap_t -cos_initthd_alloc(struct cos_compinfo *ci, 
compcap_t comp) +cos_initthd_alloc(struct cos_compinfo *ci, compcap_t comp, dcbcap_t dc) +{ + return __cos_thd_alloc(ci, comp, 0, dc, 0); +} + +int +cos_thd_migrate(struct cos_compinfo *ci, thdcap_t t, cpuid_t c) +{ + return call_cap_op(ci->captbl_cap, CAPTBL_OP_THDMIGRATE, t, c, 0, 0); +} + +int +cos_thdcap_migrate(struct cos_compinfo *ci, thdcap_t t) +{ + return call_cap_op(ci->captbl_cap, CAPTBL_OP_THDMIGRATE, t, 0, 1, 0); +} + +dcbcap_t +cos_dcb_alloc(struct cos_compinfo *ci, pgtblcap_t ptcap, vaddr_t uaddr) { - return __cos_thd_alloc(ci, comp, 0); + vaddr_t kmem; + capid_t cap; + u32_t lid = livenessid_bump_alloc(); + + printd("cos_dcb_alloc\n"); + + assert(ci); + + if (__alloc_mem_cap(ci, CAP_DCB, &kmem, &cap)) return 0; + assert(kmem && (round_to_page(kmem) == kmem)); + if (call_cap_op(ci->captbl_cap, CAPTBL_OP_DCB_ACTIVATE, cap << 16 | lid, (__compinfo_metacap(ci)->mi.pgtbl_cap) << 16 | ptcap, kmem, uaddr)) + BUG(); + + return cap; } captblcap_t @@ -656,30 +694,53 @@ cos_pgtbl_alloc(struct cos_compinfo *ci) return cap; } +scbcap_t +cos_scb_alloc(struct cos_compinfo *ci) +{ + vaddr_t kmem; + capid_t cap; + u32_t lid = livenessid_bump_alloc(); + + printd("cos_scb_alloc\n"); + + assert(ci && lid); + + if (__alloc_mem_cap(ci, CAP_SCB, &kmem, &cap)) return 0; + assert(kmem && (round_to_page(kmem) == kmem)); + if (call_cap_op(ci->captbl_cap, CAPTBL_OP_SCB_ACTIVATE, cap, __compinfo_metacap(ci)->mi.pgtbl_cap, kmem, lid)) + BUG(); + + return cap; +} + compcap_t -cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, vaddr_t entry) +cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, scbcap_t scbc, vaddr_t entry, vaddr_t uaddr) { capid_t cap; + /* FIXME: same or diff liveness ids in scb and comp resources? 
*/ u32_t lid = livenessid_bump_alloc(); printd("cos_comp_alloc\n"); assert(ci && ctc && ptc && lid); + /* FIXME: packing scbc in 12 bits */ + assert(scbc < (1 << 12)); cap = __capid_bump_alloc(ci, CAP_COMP); if (!cap) return 0; - if (call_cap_op(ci->captbl_cap, CAPTBL_OP_COMPACTIVATE, cap, (ctc << 16) | ptc, lid, entry)) BUG(); + if (call_cap_op(ci->captbl_cap, CAPTBL_OP_COMPACTIVATE, (lid << 16) | cap, (ctc << 16) | ptc, uaddr | scbc, entry)) BUG(); return cap; } int -cos_compinfo_alloc(struct cos_compinfo *ci, vaddr_t heap_ptr, capid_t cap_frontier, vaddr_t entry, +cos_compinfo_alloc(struct cos_compinfo *ci, scbcap_t sc, vaddr_t heap_ptr, capid_t cap_frontier, vaddr_t entry, struct cos_compinfo *ci_resources) { pgtblcap_t ptc; captblcap_t ctc; compcap_t compc; + vaddr_t scb_vaddr; printd("cos_compinfo_alloc\n"); @@ -687,10 +748,14 @@ cos_compinfo_alloc(struct cos_compinfo *ci, vaddr_t heap_ptr, capid_t cap_fronti assert(ptc); ctc = cos_captbl_alloc(ci_resources); assert(ctc); - compc = cos_comp_alloc(ci_resources, ctc, ptc, entry); - assert(compc); + cos_compinfo_init(ci, ptc, ctc, 0, heap_ptr, cap_frontier, ci_resources); - cos_compinfo_init(ci, ptc, ctc, compc, heap_ptr, cap_frontier, ci_resources); + /* FIXME: make sure this is right at the start of heap_ptr! */ + scb_vaddr = (vaddr_t)__page_bump_valloc(ci, COS_SCB_SIZE); + assert(scb_vaddr); + compc = cos_comp_alloc(ci_resources, ctc, ptc, sc, entry, scb_vaddr); + assert(compc); + ci->comp_cap = compc; return 0; } @@ -779,10 +844,29 @@ cos_hw_alloc(struct cos_compinfo *ci, u32_t bitmap) return cap; } +/* TODO: Can we alias/etc on this page with this logic? 
*/ +void * +cos_dcbpg_bump_allocn(struct cos_compinfo *ci, size_t sz) +{ + assert(sz == PAGE_SIZE); + /* assert(sz % PAGE_SIZE == 0); */ + + return (void *)__page_bump_alloc(ci, sz, 1); +} + +void * +cos_scbpg_bump_allocn(struct cos_compinfo *ci, size_t sz) +{ + assert(sz == PAGE_SIZE); + /* assert(sz % PAGE_SIZE == 0); */ + + return (void *)__page_bump_alloc(ci, sz, 1); +} + void * cos_page_bump_alloc(struct cos_compinfo *ci) { - return (void *)__page_bump_alloc(ci, PAGE_SIZE); + return (void *)__page_bump_alloc(ci, PAGE_SIZE, 0); } void * @@ -790,7 +874,7 @@ cos_page_bump_allocn(struct cos_compinfo *ci, size_t sz) { assert(sz % PAGE_SIZE == 0); - return (void *)__page_bump_alloc(ci, sz); + return (void *)__page_bump_alloc(ci, sz, 0); } capid_t @@ -837,9 +921,7 @@ cos_thd_wakeup(thdcap_t thd, tcap_t tc, tcap_prio_t prio, tcap_res_t res) sched_tok_t cos_sched_sync(void) { - static sched_tok_t stok[NUM_CPU] CACHE_ALIGNED; - - return ps_faa((unsigned long *)&stok[cos_cpuid()], 1); + return ps_load(&cos_scb_info_get_core()->sched_tok); } int @@ -863,7 +945,7 @@ cos_asnd(asndcap_t snd, int yield) int cos_sched_rcv(arcvcap_t rcv, rcv_flags_t flags, tcap_time_t timeout, - int *rcvd, thdid_t *thdid, int *blocked, cycles_t *cycles, tcap_time_t *thd_timeout) + thdid_t *thdid, int *blocked, cycles_t *cycles, tcap_time_t *thd_timeout) { unsigned long thd_state = 0; unsigned long cyc = 0; @@ -875,16 +957,11 @@ cos_sched_rcv(arcvcap_t rcv, rcv_flags_t flags, tcap_time_t timeout, *thdid = (thdid_t)(thd_state & ((1 << (sizeof(thdid_t) * 8)) - 1)); *cycles = cyc; - if (ret >= 0 && flags & RCV_ALL_PENDING) { - *rcvd = (ret >> 1); - ret &= 1; - } - return ret; } int -cos_rcv(arcvcap_t rcv, rcv_flags_t flags, int *rcvd) +cos_rcv(arcvcap_t rcv, rcv_flags_t flags) { thdid_t tid = 0; int blocked; @@ -892,7 +969,7 @@ cos_rcv(arcvcap_t rcv, rcv_flags_t flags, int *rcvd) int ret; tcap_time_t thd_timeout; - ret = cos_sched_rcv(rcv, flags, 0, rcvd, &tid, &blocked, &cyc, &thd_timeout); + ret 
= cos_sched_rcv(rcv, flags, 0, &tid, &blocked, &cyc, &thd_timeout); assert(tid == 0); return ret; @@ -1033,6 +1110,14 @@ cos_hw_attach(hwcap_t hwc, hwid_t hwid, arcvcap_t arcv) return call_cap_op(hwc, CAPTBL_OP_HW_ATTACH, hwid, arcv, 0, 0); } +int +cos_hw_periodic_attach(hwcap_t hwc, hwid_t hwid, arcvcap_t arcv, unsigned int period) +{ + assert(hwid == HW_HPET_PERIODIC); + + return call_cap_op(hwc, CAPTBL_OP_HW_ATTACH, hwid, arcv, period, 0); +} + int cos_hw_detach(hwcap_t hwc, hwid_t hwid) { diff --git a/src/components/lib/part_capmgr.c b/src/components/lib/part_capmgr.c new file mode 100644 index 0000000000..9d09024af4 --- /dev/null +++ b/src/components/lib/part_capmgr.c @@ -0,0 +1,367 @@ +#include +#include +#include +#include +#include <../interface/capmgr/memmgr.h> +#include +#include + +#include +#include + +struct deque_part *part_dq_percore[NUM_CPU]; +//struct cirque_par parcq_global; +static volatile unsigned part_ready = 0; +volatile int in_main_parallel; +#if defined(PART_ENABLE_NESTED) +struct crt_lock part_l_lock; +struct ps_list_head part_l_global; +#else +struct part_task main_task; +#endif +//static struct part_task *part_tasks = NULL; +//static struct part_data *part__data = NULL; +struct ps_list_head part_thdpool_core[NUM_CPU]; + +#define PART_DEQUE_SZ PART_MAX_TASKS +#define _PART_PRIO TCAP_PRIO_MAX +#define _PART_PRIO_PACK() sched_param_pack(SCHEDP_PRIO, _PART_PRIO) + +#define _PART_IDLE_PRIO (_PART_PRIO+4) +#define _PART_IDLE_PRIO_PACK() sched_param_pack(SCHEDP_PRIO, _PART_IDLE_PRIO) + +//struct ps_slab * +//ps_slab_memmgr_alloc(struct ps_mem *m, size_t sz, coreid_t coreid) +//{ +// PRINTC("%s:%d\n", __func__, __LINE__); +// unsigned npages = round_up_to_page(sz) / PAGE_SIZE; +// vaddr_t addr = memmgr_heap_page_allocn(npages); +// +// assert(addr); +// memset((void *)addr, 0, npages * PAGE_SIZE); +// PRINTC("%s:%d\n", __func__, __LINE__); +// +// return (struct ps_slab *)addr; +//} +// +//void +//ps_slab_memmgr_free(struct ps_mem *m, struct 
ps_slab *s, size_t sz, coreid_t coreid) +//{ +// /* do nothing */ +//} + +/* this? */ +//PS_SLAB_CREATE_AFNS(parttask, sizeof(struct part_task), PART_TASKS_MAX_SZ, 0, ps_slab_memmgr_alloc, ps_slab_memmgr_free); +//PS_SLAB_CREATE_AFNS(partdata, sizeof(struct part_data), PART_DATA_MAX_SZ, 0, ps_slab_memmgr_alloc, ps_slab_memmgr_free); +/* or this. */ +//PS_SLAB_CREATE(parttask, sizeof(struct part_task), PART_TASKS_MAX_SZ) +//PS_SLAB_CREATE(partdata, sizeof(struct part_data), PART_DATA_MAX_SZ) + +/* for task pool, per core list. tasks in pool can migrate cores */ +struct parttask_head { + struct part_task *head; +}; + +static inline void +parttask_store_init(struct parttask_head *h) +{ + h->head = NULL; +} + +static inline void +parttask_store_add(struct parttask_head *h, struct part_task *l) +{ + struct part_task *n; + l->next_free = NULL; + + assert(h); + do { + n = ps_load(&h->head); + l->next_free = n; + } while (!ps_cas(&h->head, (unsigned long)n, (unsigned long)l)); +} + +static inline struct part_task * +parttask_store_dequeue(struct parttask_head *h) +{ + struct part_task *l = NULL; + + do { + l = ps_load(&h->head); + if (unlikely(!l)) return NULL; + } while (!ps_cas(&h->head, (unsigned long)l, (unsigned long)l->next_free)); + + l->next_free = NULL; + + return l; +} + +/* for task data, per core pool - task data could migrate pools. 
*/ +struct partdata_head { + struct part_data *head; +}; + +static inline void +partdata_store_init(struct partdata_head *h) +{ + h->head = NULL; +} + +static inline void +partdata_store_add(struct partdata_head *h, struct part_data *l) +{ + struct part_data *n = NULL; + l->next_free = NULL; + + assert(h); + do { + n = ps_load(&h->head); + + l->next_free = n; + } while (!ps_cas(&h->head, (unsigned long)n, (unsigned long)l)); +} + +static inline struct part_data * +partdata_store_dequeue(struct partdata_head *h) +{ + struct part_data *l = NULL; + + do { + l = ps_load(&h->head); + if (unlikely(!l)) return NULL; + } while (!ps_cas(&h->head, (unsigned long)l, (unsigned long)l->next_free)); + + l->next_free = NULL; + + return l; +} + +/* end treiber stacks */ +#define PART_TASKS_MAX_SZ round_up_to_page(PART_MAX_TASKS * sizeof(struct part_task)) +#define PART_MAX_PAGES (PART_TASKS_MAX_SZ / PAGE_SIZE) +#define PART_DATA_MAX_SZ round_up_to_page(PART_MAX_TASKS * sizeof(struct part_data)) +#define PART_MAX_DATA_PAGES (PART_DATA_MAX_SZ / PAGE_SIZE) +#define PART_DEQUE_MAX_PAGES (round_up_to_page(sizeof(struct deque_part)) / PAGE_SIZE) + +struct partdata_head pd_head[NUM_CPU]; + +static inline void +partdata_store_init_all(vaddr_t mem) +{ + int i; + + for (i = 0; i < NUM_CPU; i++) { + int j; + struct part_data *st = (struct part_data *)(mem + (PART_DATA_MAX_SZ * i)); + + partdata_store_init(&pd_head[i]); + + for (j = 0; j < PART_MAX_TASKS; j++) partdata_store_add(&pd_head[i], st + j); + } +} + +struct parttask_head pt_head[NUM_CPU]; + +static inline void +parttask_store_init_all(vaddr_t mem) +{ + int i; + + for (i = 0; i < NUM_CPU; i++) { + int j; + struct part_task *st = (struct part_task *)(mem + (PART_TASKS_MAX_SZ * i)); + + parttask_store_init(&pt_head[i]); + + for (j = 0; j < PART_MAX_TASKS; j++) parttask_store_add(&pt_head[i], st + j); + } +} + +/* idle thread to wakeup when there is nothing to do on this core! 
*/ +static void +part_idle_fn(void *d) +{ + struct sl_thd *sched = sl__globals_core()->sched_thd, *curr = sl_thd_curr(); + + while (1) { + /* + * TODO: threads could be woken up even if there is no work! + */ + if (likely(ps_load(&in_main_parallel))) part_pool_wakeup(); + sl_thd_yield_thd(sched); + } +} + +struct part_data * +part_data_alloc(void) +{ + struct part_data *d = partdata_store_dequeue(&pd_head[cos_cpuid()]); + + if (!d) return d; + if (!ps_cas(&d->flag, 0, 1)) assert(0); + + return d; +// int i; +// struct part_data *d = ps_slab_alloc_partdata(); +// +// if (!ps_cas(&d->flag, 0, 1)) assert(0); +// +// return d; +// for (i = 0; i < PART_MAX_TASKS; i++) { +// d = part__data + i; +// +// if (d->flag) continue; +// +// /* if this fails, someone else just alloced it! */ +// if (!ps_cas(&d->flag, 0, 1)) continue; +// +// return d; +// } +// +// return NULL; +} + +void +part_data_free(struct part_data *d) +{ + if (!ps_cas(&d->flag, 1, 0)) assert(0); + + partdata_store_add(&pd_head[cos_cpuid()], d); +// ps_slab_free_partdata(d); +// int f; +// +// if (!d) return; +// +// do { +// f = d->flag; +// assert(f); +// } while (!ps_cas(&d->flag, f, 0)); +} +struct part_task * +part_task_alloc(part_task_type_t type) +{ + struct part_task *t = parttask_store_dequeue(&pt_head[cos_cpuid()]); + + if (!t) return t; + + /* use upcas ? */ + if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) assert(0); + + return t; +// struct part_task *t = ps_slab_alloc_parttask(); +// +// if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) assert(0); +// +// return t; +// int i; +// struct part_task *t; +// +// for (i = 0; i < PART_MAX_TASKS; i++) { +// t = part_tasks + i; +// +// if (ps_load(&t->state) != PART_TASK_S_FREED) continue; +// +// /* if this fails, someone else just alloced it! 
*/ +// if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) continue; +// +// return t; +// } +// +// return NULL; +} + +void +part_task_free(struct part_task *t) +{ + if (!ps_cas(&t->state, PART_TASK_S_INITIALIZED, PART_TASK_S_FREED)) assert(0); + + parttask_store_add(&pt_head[cos_cpuid()], t); +// ps_slab_free_parttask(t); +// part_task_state_t s = 0; +// +// if (!t) return; +// +// do { +// s = ps_load(&t->state); +// if (s != PART_TASK_S_INITIALIZED) return; +// } while (!ps_cas(&t->state, s, PART_TASK_S_FREED)); +} + +unsigned +part_isready(void) +{ return (part_ready == NUM_CPU); } + +void +part_init(void) +{ + int k; + static volatile int is_first = NUM_CPU; + struct sl_thd *it = NULL; + struct sl_xcore_thd *xit = NULL; + sched_param_t ip = _PART_IDLE_PRIO_PACK(); + static volatile int all_done = 0; + + ps_list_head_init(&part_thdpool_core[cos_cpuid()]); + if (ps_cas(&is_first, NUM_CPU, cos_cpuid())) { + vaddr_t ptmem = 0, pdmem = 0; + + for (k = 0; k < NUM_CPU; k++) { + part_dq_percore[k] = (struct deque_part *)memmgr_heap_page_allocn(PART_DEQUE_MAX_PAGES); + assert(part_dq_percore[k]); + deque_init_part(part_dq_percore[k], PART_DEQUE_SZ); + } + ptmem = memmgr_heap_page_allocn(PART_MAX_PAGES * NUM_CPU); + assert(ptmem); + memset((void *)ptmem, 0, PART_MAX_PAGES * PAGE_SIZE * NUM_CPU); + + pdmem = memmgr_heap_page_allocn(PART_MAX_DATA_PAGES * NUM_CPU); + assert(pdmem); + memset((void *)pdmem, 0, PART_MAX_DATA_PAGES * PAGE_SIZE * NUM_CPU); + + partdata_store_init_all(pdmem); + parttask_store_init_all(ptmem); +// ps_slab_init_parttask(); +// ps_slab_init_partdata(); + +#if defined(PART_ENABLE_NESTED) + ps_list_head_init(&part_l_global); + crt_lock_init(&part_l_lock); +#else + memset(&main_task, 0, sizeof(main_task)); +#endif + in_main_parallel = 0; + } + + for (k = 0; k < PART_MAX_CORE_THDS; k++) { + struct sl_xcore_thd *x; + struct sl_thd *t; + sched_param_t p = _PART_PRIO_PACK(); + + t = sl_thd_alloc(part_thd_fn, NULL); + assert(t); + + 
sl_thd_param_set(t, p); + + x = sl_xcore_thd_lookup_init(sl_thd_thdid(t), cos_cpuid()); + assert(x); + } + +#ifdef PART_ENABLE_BLOCKING + sl_cs_enter(); + /* + * because it's fifo, all threads would go block + * themselves up as there is no work yet + * eventually returning to this main thread on core-0, + * and on all other cores, scheduler would be running! + */ + sl_cs_exit_schedule(); + it = sl_thd_alloc(part_idle_fn, NULL); + assert(it); + sl_thd_param_set(it, ip); +#endif + + ps_faa(&all_done, 1); + while (ps_load(&all_done) != NUM_CPU) ; + + ps_faa(&part_ready, 1); +} diff --git a/src/components/lib/part_raw.c b/src/components/lib/part_raw.c new file mode 100644 index 0000000000..273ce48f4a --- /dev/null +++ b/src/components/lib/part_raw.c @@ -0,0 +1,402 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct deque_part *part_dq_percore[NUM_CPU]; +//struct cirque_par parcq_global; +static volatile unsigned part_ready = 0; +volatile int in_main_parallel; +#if defined(PART_ENABLE_NESTED) +struct crt_lock part_l_lock; +struct ps_list_head part_l_global; +#else +struct part_task main_task; +#endif +//static struct part_task *part_tasks = NULL; +//static struct part_data *part__data = NULL; +struct ps_list_head part_thdpool_core[NUM_CPU]; + +#define PART_DEQUE_SZ PART_MAX_TASKS +#define _PART_PRIO TCAP_PRIO_MAX +#define _PART_PRIO_PACK() sched_param_pack(SCHEDP_PRIO, _PART_PRIO) + +#define _PART_IDLE_PRIO (_PART_PRIO+4) +#define _PART_IDLE_PRIO_PACK() sched_param_pack(SCHEDP_PRIO, _PART_IDLE_PRIO) + +//struct ps_slab * +//ps_slab_memmgr_alloc(struct ps_mem *m, size_t sz, coreid_t coreid) +//{ +// PRINTC("%s:%d\n", __func__, __LINE__); +// unsigned npages = round_up_to_page(sz) / PAGE_SIZE; +// vaddr_t addr = memmgr_heap_page_allocn(npages); +// +// assert(addr); +// memset((void *)addr, 0, npages * PAGE_SIZE); +// PRINTC("%s:%d\n", __func__, __LINE__); +// +// return (struct ps_slab *)addr; +//} +// 
+//void +//ps_slab_memmgr_free(struct ps_mem *m, struct ps_slab *s, size_t sz, coreid_t coreid) +//{ +// /* do nothing */ +//} + +/* this? */ +//PS_SLAB_CREATE_AFNS(parttask, sizeof(struct part_task), PART_TASKS_MAX_SZ, 0, ps_slab_memmgr_alloc, ps_slab_memmgr_free); +//PS_SLAB_CREATE_AFNS(partdata, sizeof(struct part_data), PART_DATA_MAX_SZ, 0, ps_slab_memmgr_alloc, ps_slab_memmgr_free); +/* or this. */ +//PS_SLAB_CREATE(parttask, sizeof(struct part_task), PART_TASKS_MAX_SZ) +//PS_SLAB_CREATE(partdata, sizeof(struct part_data), PART_DATA_MAX_SZ) + +/* for task pool, per core list. tasks in pool can migrate cores */ +struct parttask_head { + struct part_task *head; +}; + +static inline void +parttask_store_init(struct parttask_head *h) +{ + h->head = NULL; +} + +static inline void +parttask_store_add(struct parttask_head *h, struct part_task *l) +{ + struct part_task *n; + l->next_free = NULL; + + assert(h); + do { + n = ps_load(&h->head); + l->next_free = n; + } while (!ps_cas(&h->head, (unsigned long)n, (unsigned long)l)); +} + +static inline struct part_task * +parttask_store_dequeue(struct parttask_head *h) +{ + struct part_task *l = NULL; + + do { + l = ps_load(&h->head); + if (unlikely(!l)) return NULL; + } while (!ps_cas(&h->head, (unsigned long)l, (unsigned long)l->next_free)); + + l->next_free = NULL; + + return l; +} + +/* for task data, per core pool - task data could migrate pools. 
*/ +struct partdata_head { + struct part_data *head; +}; + +static inline void +partdata_store_init(struct partdata_head *h) +{ + h->head = NULL; +} + +static inline void +partdata_store_add(struct partdata_head *h, struct part_data *l) +{ + struct part_data *n = NULL; + l->next_free = NULL; + + assert(h); + do { + n = ps_load(&h->head); + + l->next_free = n; + } while (!ps_cas(&h->head, (unsigned long)n, (unsigned long)l)); +} + +static inline struct part_data * +partdata_store_dequeue(struct partdata_head *h) +{ + struct part_data *l = NULL; + + do { + l = ps_load(&h->head); + if (unlikely(!l)) return NULL; + } while (!ps_cas(&h->head, (unsigned long)l, (unsigned long)l->next_free)); + + l->next_free = NULL; + + return l; +} + +/* end treiber stacks */ +#define PART_TASKS_MAX_SZ round_up_to_page(PART_MAX_TASKS * sizeof(struct part_task)) +#define PART_MAX_PAGES (PART_TASKS_MAX_SZ / PAGE_SIZE) +#define PART_DATA_MAX_SZ round_up_to_page(PART_MAX_TASKS * sizeof(struct part_data)) +#define PART_MAX_DATA_PAGES (PART_DATA_MAX_SZ / PAGE_SIZE) +#define PART_DEQUE_MAX_SZ round_up_to_page(sizeof(struct deque_part)) +#define PART_DEQUE_MAX_PAGES (PART_DEQUE_MAX_SZ / PAGE_SIZE) + +struct partdata_head pd_head[NUM_CPU]; + +static inline void +partdata_store_init_all(vaddr_t mem) +{ + int i; + + for (i = 0; i < NUM_CPU; i++) { + int j; + struct part_data *st = (struct part_data *)(mem + (PART_DATA_MAX_SZ * i)); + + partdata_store_init(&pd_head[i]); + + for (j = 0; j < PART_MAX_TASKS; j++) partdata_store_add(&pd_head[i], st + j); + } +} + +static inline struct part_data * +partdata_store_dequeue_any(void) +{ + struct part_data *p = NULL; + int i = 0; + + for (i = 0; i < NUM_CPU; i++) { + p = partdata_store_dequeue(&pd_head[(cos_cpuid() + i) % NUM_CPU]); + + if (p) break; + } + + return p; +} + +struct parttask_head pt_head[NUM_CPU]; + +static inline void +parttask_store_init_all(vaddr_t mem) +{ + int i; + + for (i = 0; i < NUM_CPU; i++) { + int j; + struct part_task *st = 
(struct part_task *)(mem + (PART_TASKS_MAX_SZ * i)); + + parttask_store_init(&pt_head[i]); + + for (j = 0; j < PART_MAX_TASKS; j++) parttask_store_add(&pt_head[i], st + j); + } +} + +static inline struct part_task * +parttask_store_dequeue_any(void) +{ + struct part_task *p = NULL; + int i = 0; + + for (i = 0; i < NUM_CPU; i++) { + p = parttask_store_dequeue(&pt_head[(cos_cpuid() + i) % NUM_CPU]); + + if (p) break; + } + + return p; +} + +/* idle thread to wakeup when there is nothing to do on this core! */ +static void +part_idle_fn(void *d) +{ + struct sl_thd *sched = sl__globals_core()->sched_thd, *curr = sl_thd_curr(); + + while (1) { + /* + * TODO: threads could be woken up even if there is no work! + */ + if (likely(ps_load(&in_main_parallel))) part_pool_wakeup(); + sl_thd_yield_thd(sched); + } +} + +struct part_data * +part_data_alloc(void) +{ + struct part_data *d = partdata_store_dequeue_any(); + //struct part_data *d = partdata_store_dequeue(&pd_head[cos_cpuid()]); + + if (!d) return d; + if (!ps_cas(&d->flag, 0, 1)) assert(0); + + return d; +// int i; +// struct part_data *d = ps_slab_alloc_partdata(); +// +// if (!ps_cas(&d->flag, 0, 1)) assert(0); +// +// return d; +// for (i = 0; i < PART_MAX_TASKS; i++) { +// d = part__data + i; +// +// if (d->flag) continue; +// +// /* if this fails, someone else just alloced it! */ +// if (!ps_cas(&d->flag, 0, 1)) continue; +// +// return d; +// } +// +// return NULL; +} + +void +part_data_free(struct part_data *d) +{ + if (!ps_cas(&d->flag, 1, 0)) assert(0); + + partdata_store_add(&pd_head[cos_cpuid()], d); +// ps_slab_free_partdata(d); +// int f; +// +// if (!d) return; +// +// do { +// f = d->flag; +// assert(f); +// } while (!ps_cas(&d->flag, f, 0)); +} +struct part_task * +part_task_alloc(part_task_type_t type) +{ + struct part_task *t = parttask_store_dequeue_any(); + //struct part_task *t = parttask_store_dequeue(&pt_head[cos_cpuid()]); + + if (!t) return t; + + /* use upcas ? 
*/ + if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) assert(0); + + return t; +// struct part_task *t = ps_slab_alloc_parttask(); +// +// if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) assert(0); +// +// return t; +// int i; +// struct part_task *t; +// +// for (i = 0; i < PART_MAX_TASKS; i++) { +// t = part_tasks + i; +// +// if (ps_load(&t->state) != PART_TASK_S_FREED) continue; +// +// /* if this fails, someone else just alloced it! */ +// if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) continue; +// +// return t; +// } +// +// return NULL; +} + +void +part_task_free(struct part_task *t) +{ + if (!ps_cas(&t->state, PART_TASK_S_INITIALIZED, PART_TASK_S_FREED)) assert(0); + + parttask_store_add(&pt_head[cos_cpuid()], t); +// ps_slab_free_parttask(t); +// part_task_state_t s = 0; +// +// if (!t) return; +// +// do { +// s = ps_load(&t->state); +// if (s != PART_TASK_S_INITIALIZED) return; +// } while (!ps_cas(&t->state, s, PART_TASK_S_FREED)); +} + +unsigned +part_isready(void) +{ return (part_ready == NUM_CPU); } + +void +part_init(void) +{ + struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); + int k; + static volatile int is_first = NUM_CPU; + struct sl_thd *it = NULL; + struct sl_xcore_thd *xit = NULL; + sched_param_t ip = _PART_IDLE_PRIO_PACK(); + static volatile int all_done = 0; + + ps_list_head_init(&part_thdpool_core[cos_cpuid()]); + if (ps_cas(&is_first, NUM_CPU, cos_cpuid())) { + vaddr_t ptmem = 0, pdmem = 0; + + for (k = 0; k < NUM_CPU; k++) { + part_dq_percore[k] = (struct deque_part *)cos_page_bump_allocn(ci, PART_DEQUE_MAX_SZ); + assert(part_dq_percore[k]); + deque_init_part(part_dq_percore[k], PART_DEQUE_SZ); + } + ptmem = (vaddr_t)cos_page_bump_allocn(ci, PART_TASKS_MAX_SZ * NUM_CPU); + assert(ptmem); + memset((void *)ptmem, 0, PART_MAX_PAGES * PAGE_SIZE * NUM_CPU); + + pdmem = (vaddr_t)cos_page_bump_allocn(ci, PART_DATA_MAX_SZ * NUM_CPU); + assert(pdmem); + memset((void 
*)pdmem, 0, PART_MAX_DATA_PAGES * PAGE_SIZE * NUM_CPU); + + partdata_store_init_all(pdmem); + parttask_store_init_all(ptmem); +// ps_slab_init_parttask(); +// ps_slab_init_partdata(); + +#if defined(PART_ENABLE_NESTED) + ps_list_head_init(&part_l_global); + crt_lock_init(&part_l_lock); +#else + memset(&main_task, 0, sizeof(main_task)); +#endif + in_main_parallel = 0; + } + + for (k = 0; k < PART_MAX_CORE_THDS; k++) { + struct sl_xcore_thd *x; + struct sl_thd *t; + sched_param_t p = _PART_PRIO_PACK(); + + t = sl_thd_alloc(part_thd_fn, NULL); + assert(t); + + sl_thd_param_set(t, p); + + x = sl_xcore_thd_lookup_init(sl_thd_thdid(t), cos_cpuid()); + assert(x); + } + +#ifdef PART_ENABLE_BLOCKING + sl_cs_enter(); + /* + * because it's fifo, all threads would go block + * themselves up as there is no work yet + * eventually returning to this main thread on core-0, + * and on all other cores, scheduler would be running! + */ + sl_cs_exit_schedule(); + it = sl_thd_alloc(part_idle_fn, NULL); + assert(it); + sl_thd_param_set(it, ip); +#endif + + ps_faa(&all_done, 1); + while (ps_load(&all_done) != NUM_CPU) ; + + ps_faa(&part_ready, 1); +} diff --git a/src/components/lib/posix/Makefile b/src/components/lib/posix/Makefile index c72f105cd7..90cdcd62a5 100644 --- a/src/components/lib/posix/Makefile +++ b/src/components/lib/posix/Makefile @@ -8,7 +8,7 @@ INC += -I../ps/ all: posix.o posix.o: posix.c - $(CC) $(INC) $< -o $@ -c $(CFLAGS) + @$(CC) $(INC) $< -o $@ -c $(CFLAGS) clean: - rm -f posix.o + @rm -f posix.o diff --git a/src/components/lib/posix/posix.c b/src/components/lib/posix/posix.c index 73166a7524..fc1e8c366b 100644 --- a/src/components/lib/posix/posix.c +++ b/src/components/lib/posix/posix.c @@ -362,7 +362,7 @@ struct sl_lock futex_lock = SL_LOCK_STATIC_INIT(); int cos_futex_wait(struct futex_data *futex, int *uaddr, int val, const struct timespec *timeout) { - cycles_t deadline; + cycles_t deadline = sl_now(); microsec_t wait_time; struct futex_waiter waiter = (struct 
futex_waiter) { .thdid = sl_thdid() diff --git a/src/components/lib/sinv_async/acom_client.c b/src/components/lib/sinv_async/acom_client.c index da67bb17e5..384c83b2bb 100644 --- a/src/components/lib/sinv_async/acom_client.c +++ b/src/components/lib/sinv_async/acom_client.c @@ -81,7 +81,7 @@ acom_client_request(struct sinv_async_info *s, acom_type_t t, word_t a, word_t b { struct sinv_thdinfo *tinfo = &s->cdata.cthds[cos_thdid()]; volatile unsigned long *reqaddr = (volatile unsigned long *)SINV_POLL_ADDR(tinfo->shmaddr); - int *retval = NULL, ret, rcvd = 0; + int *retval = NULL, ret; struct sinv_call_req *req = NULL; assert(t >= 0 && t < SINV_NUM_MAX); @@ -108,7 +108,7 @@ acom_client_request(struct sinv_async_info *s, acom_type_t t, word_t a, word_t b cos_asnd(tinfo->sndcap, 1); assert(tinfo->rcvcap); - while ((cos_rcv(tinfo->rcvcap, RCV_NON_BLOCKING | RCV_ALL_PENDING, &rcvd) < 0)) { + while ((cos_rcv(tinfo->rcvcap, RCV_NON_BLOCKING) < 0)) { cycles_t timeout = time_now() + time_usec2cyc(SINV_SRV_POLL_US); if (ps_load((unsigned long *)reqaddr) == SINV_REQ_RESET) break; diff --git a/src/components/lib/sinv_async/sinv_client.c b/src/components/lib/sinv_async/sinv_client.c index 501a98e3f6..031ed40f77 100644 --- a/src/components/lib/sinv_async/sinv_client.c +++ b/src/components/lib/sinv_async/sinv_client.c @@ -112,7 +112,7 @@ sinv_client_call_wrets(int wrets, struct sinv_async_info *s, sinv_num_t n, word_ */ cos_asnd(tinfo->sndcap, 1); - while ((tinfo->rcvcap && cos_rcv(tinfo->rcvcap, RCV_NON_BLOCKING, NULL) < 0) && (ps_load((unsigned long *)reqaddr) != SINV_REQ_RESET)) { + while ((tinfo->rcvcap && cos_rcv(tinfo->rcvcap, RCV_NON_BLOCKING) < 0) && (ps_load((unsigned long *)reqaddr) != SINV_REQ_RESET)) { cycles_t timeout = time_now() + time_usec2cyc(SINV_SRV_POLL_US); sl_thd_block_timeout(0, timeout); /* in the scheduler component */ diff --git a/src/components/lib/sinv_async/sinv_server.c b/src/components/lib/sinv_async/sinv_server.c index 680d790cd3..2a6c539c69 100644 
--- a/src/components/lib/sinv_async/sinv_server.c +++ b/src/components/lib/sinv_async/sinv_server.c @@ -114,9 +114,8 @@ sinv_server_aep_fn(arcvcap_t rcv, void *data) asndcap_t snd = t->sndcap; int *retval = (int *)SINV_RET_ADDR(t->shmaddr), ret; struct sinv_call_req *req = (struct sinv_call_req *)SINV_REQ_ADDR(t->shmaddr); - int rcvd = 0; - while ((cos_rcv(rcv, RCV_NON_BLOCKING | RCV_ALL_PENDING, &rcvd) < 0)) { + while ((cos_rcv(rcv, RCV_NON_BLOCKING) < 0)) { cycles_t timeout = time_now() + time_usec2cyc(SINV_SRV_POLL_US); if (ps_load((unsigned long *)reqaddr) == SINV_REQ_SET) break; diff --git a/src/components/lib/sl/Makefile b/src/components/lib/sl/Makefile index 6e908cda0b..d54ad150e6 100644 --- a/src/components/lib/sl/Makefile +++ b/src/components/lib/sl/Makefile @@ -1,6 +1,6 @@ include Makefile.src Makefile.comp -LIB_OBJS=sl_capmgr.o sl_raw.o sl_sched.o sl_xcpu.o sl_child.o sl_mod_fprr.o sl_lock.o sl_thd_static_backend.o +LIB_OBJS=sl_capmgr.o sl_raw.o sl_sched.o sl_xcore.o sl_child.o sl_mod_fprr.o sl_mod_rr.o sl_mod_fifo.o sl_mod_part_fifo.o sl_lock.o sl_thd_static_backend.o sl_blkpt.o LIBS=$(LIB_OBJS:%.o=%.a) CINC+=-m32 @@ -13,5 +13,10 @@ all: $(LIBS) @$(CC) $(CFLAGS) $(CINC) -o $(@:%.a=%.o) -c $< @$(AR) cr lib$@ $(@:%.a=%.o) +%.a:%.S + $(info | [AS] Creating library file $@ from $^) + @$(AS) $(ASFLAGS) -c -o $(@:%.a=%.o) $^ + @$(AR) cr lib$@ $(@:%.a=%.o) + clean: @rm -f *.o *.a *.d diff --git a/src/components/lib/sl/sl_blkpt.c b/src/components/lib/sl/sl_blkpt.c new file mode 100644 index 0000000000..de59ee69a1 --- /dev/null +++ b/src/components/lib/sl/sl_blkpt.c @@ -0,0 +1,140 @@ +#include +#include + +#define NBLKPTS 64 +struct blkpt_mem { + sched_blkpt_id_t id; + sched_blkpt_epoch_t epoch; + struct stacklist_head blocked; +}; +static struct blkpt_mem __blkpts[NBLKPTS]; +static int __blkpt_offset = 1; + +#define BLKPT_EPOCH_BLKED_BITS ((sizeof(sched_blkpt_epoch_t) * 8) +#define BLKPT_EPOCH_DIFF (BLKPT_EPOCH_BLKED_BITS - 2)/2) + +/* + * Is cmp > e? 
This is more complicated than it seems it should be + * only because of wrap-around. We have to consider the case that we + * have, and that we haven't wrapped around. + */ +static int +blkpt_epoch_is_higher(sched_blkpt_epoch_t e, sched_blkpt_epoch_t cmp) +{ + return (e > cmp && (e - cmp) > BLKPT_EPOCH_DIFF) || (e < cmp && (cmp - e) < BLKPT_EPOCH_DIFF); +} + +static struct blkpt_mem * +blkpt_get(sched_blkpt_id_t id) +{ + if (id - 1 == NBLKPTS) return NULL; + + return &__blkpts[id-1]; +} + +sched_blkpt_id_t +sched_blkpt_alloc(void) +{ + sched_blkpt_id_t id; + struct blkpt_mem *m; + sched_blkpt_id_t ret = SCHED_BLKPT_NULL; + + sl_cs_enter(); + + id = (sched_blkpt_id_t)ps_faa(&__blkpt_offset, 1); + m = blkpt_get(id); + if (!m) ERR_THROW(SCHED_BLKPT_NULL, unlock); + + m->id = id; + ret = id; + m->epoch = 0; + stacklist_init(&m->blocked); + /* TODO: undo offset if it failed in an multi-core safe way!*/ +unlock: + sl_cs_exit(); + + return ret; +} + +int +sched_blkpt_free(sched_blkpt_id_t id) +{ + /* alloc only for now */ + return 0; +} + +int +sched_blkpt_trigger(sched_blkpt_id_t blkpt, sched_blkpt_epoch_t epoch, int single) +{ + thdid_t tid; + cpuid_t core; + struct blkpt_mem *m; + int ret = 0; + + sl_cs_enter(); + + m = blkpt_get(blkpt); + if (!m) ERR_THROW(-1, unlock); + + /* is the new epoch more recent than the existing? */ + if (!blkpt_epoch_is_higher(m->epoch, epoch)) ERR_THROW(0, unlock); + + m->epoch = epoch; + while ((tid = stacklist_dequeue(&core, &m->blocked)) != 0) { + if (core == cos_cpuid()) { + struct sl_thd *t = sl_thd_lkup(tid); + + assert(t); + + sl_thd_wakeup_no_cs(t); /* ignore retval: process next thread */ + } else { + struct sl_xcore_thd *t = sl_xcore_thd_lookup(tid); + + assert(t && t->core == core); + /* perhaps sl_xcore_thd_wakeup_no_cs? 
*/ + sl_cs_exit(); + sl_xcore_thd_wakeup(t); + sl_cs_enter(); + } + } + /* most likely we switch to a woken thread here */ + sl_cs_exit_schedule(); + + return 0; +unlock: + sl_cs_exit(); + + return ret; +} + +int +sched_blkpt_block(sched_blkpt_id_t blkpt, sched_blkpt_epoch_t epoch, thdid_t dependency) +{ + struct blkpt_mem *m; + struct sl_thd *t; + struct stacklist sl; /* The stack-based structure we'll use to track ourself */ + int ret = 0; + + sl_cs_enter(); + + m = blkpt_get(blkpt); + if (!m) ERR_THROW(-1, unlock); + + /* Outdated event? don't block! */ + if (blkpt_epoch_is_higher(m->epoch, epoch)) ERR_THROW(0, unlock); + + /* Block! */ + stacklist_add(&m->blocked, &sl); + + t = sl_thd_curr(); + if (sl_thd_block_no_cs(t, SL_THD_BLOCKED, 0)) ERR_THROW(-1, unlock); + + sl_cs_exit_schedule(); + assert(stacklist_is_removed(&sl)); /* we cannot still be on the list */ + + return 0; +unlock: + sl_cs_exit(); + + return ret; +} diff --git a/src/components/lib/sl/sl_capmgr.c b/src/components/lib/sl/sl_capmgr.c index 5eeafe2886..d160c2fadc 100644 --- a/src/components/lib/sl/sl_capmgr.c +++ b/src/components/lib/sl/sl_capmgr.c @@ -14,6 +14,7 @@ #include "../../interface/capmgr/memmgr.h" #include #include +#include extern void sl_thd_event_info_reset(struct sl_thd *t); extern void sl_thd_free_no_cs(struct sl_thd *t); @@ -37,7 +38,7 @@ sl_shm_map(cbuf_t id) } void -sl_xcpu_asnd_alloc(void) +sl_xcore_asnd_alloc(void) { int i; @@ -46,16 +47,16 @@ sl_xcpu_asnd_alloc(void) thdid_t tid; if (i == cos_cpuid()) continue; - if (!bitmap_check(sl__globals()->cpu_bmp, i)) continue; + if (!bitmap_check(sl__globals()->core_bmp, i)) continue; snd = capmgr_asnd_rcv_create(BOOT_CAPTBL_SELF_INITRCV_BASE_CPU(i)); assert(snd); - sl__globals()->xcpu_asnd[cos_cpuid()][i] = snd; + sl__globals()->xcore_asnd[cos_cpuid()][i] = snd; } } struct sl_thd * -sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t prps) +sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, 
sl_thd_property_t prps, struct cos_dcb_info *dcb) { struct sl_thd_policy *tp = NULL; struct sl_thd *t = NULL; @@ -64,6 +65,7 @@ sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t if (!tp) goto done; t = sl_mod_thd_get(tp); + t->dcb = dcb; t->properties = prps; t->aepinfo = aep; t->sndcap = sndcap; @@ -78,7 +80,9 @@ sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t t->timeout_idx = -1; t->prio = TCAP_PRIO_MIN; ps_list_init(t, SL_THD_EVENT_LIST); + ps_list_init(t, partlist); sl_thd_event_info_reset(t); + sl_xcore_thd_lookup_init(aep->tid, cos_cpuid()); done: return t; @@ -88,21 +92,24 @@ struct sl_thd * sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data) { struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); - struct cos_compinfo *ci = &dci->ci; + struct cos_compinfo *ci = cos_compinfo_get(dci); struct sl_thd *t = NULL; struct cos_aep_info *aep = NULL; + struct cos_dcb_info *dcb = NULL; thdcap_t thdcap = 0; thdid_t tid = 0; aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; - aep->thd = capmgr_thd_create(fn, data, &tid); + aep->thd = capmgr_thd_create(fn, data, &tid, &dcb); if (!aep->thd) goto done; aep->tid = tid; + assert(tid); - t = sl_thd_alloc_init(aep, 0, 0); + t = sl_thd_alloc_init(aep, 0, 0, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); done: return t; @@ -128,21 +135,23 @@ sl_thd_comp_init_no_cs(struct cos_defcompinfo *comp, sl_thd_property_t prps, asn assert(snd); } - t = sl_thd_alloc_init(aep, snd, prps); + t = sl_thd_alloc_init(aep, snd, prps, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); done: return t; } static struct sl_thd * -sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx) +sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, vaddr_t *dcbuaddr) { struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct 
cos_compinfo *ci = cos_compinfo_get(dci); struct cos_compinfo *compci = cos_compinfo_get(comp); struct sl_thd *t = NULL; struct cos_aep_info *aep = NULL; + struct cos_dcb_info *dcb = NULL; if (comp == NULL || comp->id == 0) goto done; @@ -150,12 +159,13 @@ sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx) aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; - aep->thd = capmgr_thd_create_ext(comp->id, idx, &aep->tid); + aep->thd = capmgr_thd_create_ext(comp->id, idx, &aep->tid, &dcb); if (!aep->thd) goto done; - aep->tc = sl_thd_tcap(sl__globals_cpu()->sched_thd); + aep->tc = sl_thd_tcap(sl__globals_core()->sched_thd); - t = sl_thd_alloc_init(aep, 0, 0); + t = sl_thd_alloc_init(aep, 0, 0, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); } else { struct cos_aep_info *compaep = cos_sched_aep_get(comp); @@ -173,10 +183,11 @@ sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx) } static struct sl_thd * -sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, thdclosure_index_t idx, sl_thd_property_t prps, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, thdclosure_index_t idx, sl_thd_property_t prps, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, vaddr_t *dcbuaddr, arcvcap_t *extrcv) { struct cos_aep_info *aep = NULL; struct sl_thd *t = NULL; + struct cos_dcb_info *dcb = NULL; asndcap_t snd = 0; int ret = 0, owntc = 0; @@ -198,11 +209,12 @@ sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, t if (!aep) goto done; if (prps & SL_THD_PROPERTY_OWN_TCAP) owntc = 1; - capmgr_aep_create_ext(comp->id, aep, idx, owntc, key, ipiwin, ipimax, extrcv); + capmgr_aep_create_ext(comp->id, aep, idx, owntc, key, ipiwin, ipimax, &dcb, extrcv); if (!aep->thd) goto done; - t = sl_thd_alloc_init(aep, 0, prps); + t = 
sl_thd_alloc_init(aep, 0, prps, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); } done: @@ -214,17 +226,20 @@ sl_thd_aep_alloc_no_cs(cos_aepthd_fn_t fn, void *data, sl_thd_property_t prps, c { struct sl_thd *t = NULL; struct cos_aep_info *aep = NULL; + struct cos_dcb_info *dcb = NULL; int owntc = 0; aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; if (prps & SL_THD_PROPERTY_OWN_TCAP) owntc = 1; - capmgr_aep_create(aep, fn, data, owntc, key, ipiwin, ipimax); + capmgr_aep_create(aep, fn, data, owntc, key, ipiwin, ipimax, &dcb); if (aep->thd == 0) goto done; + assert(aep->tid); - t = sl_thd_alloc_init(aep, 0, prps); + t = sl_thd_alloc_init(aep, 0, prps, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); done: return t; @@ -270,7 +285,15 @@ sl_thd_comp_init(struct cos_defcompinfo *comp, int is_sched) } struct sl_thd * -sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax) +sl_thd_initaep_alloc_dcb(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, dcbcap_t dcap, microsec_t ipiwin, u32_t ipimax) +{ + PRINTC("UNIMPLEMENTED: Using CAPMGR API which should manage the DCB capabilities\n"); + + return NULL; +} + +struct sl_thd * +sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, vaddr_t *dcbuaddr) { struct sl_thd *t = NULL; @@ -278,18 +301,27 @@ sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int sl_cs_enter(); if (!is_sched) { - t = sl_thd_alloc_ext_no_cs(comp, 0); + t = sl_thd_alloc_ext_no_cs(comp, 0, dcbuaddr); } else { t = sl_thd_aep_alloc_ext_no_cs(comp, sched_thd, 0, (is_sched ? SL_THD_PROPERTY_SEND : 0) - | (own_tcap ? 
SL_THD_PROPERTY_OWN_TCAP : 0), key, ipiwin, ipimax, NULL); + | (own_tcap ? SL_THD_PROPERTY_OWN_TCAP : 0), key, ipiwin, ipimax, dcbuaddr, NULL); } sl_cs_exit(); return t; } + struct sl_thd * -sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +sl_thd_aep_alloc_ext_dcb(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, dcbcap_t dcap, dcboff_t doff, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +{ + PRINTC("UNIMPLEMENTED: Using CAPMGR API which should manage the DCB capabilities\n"); + + return NULL; +} + +struct sl_thd * +sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, vaddr_t *dcbuaddr, arcvcap_t *extrcv) { struct sl_thd *t = NULL; @@ -299,9 +331,9 @@ sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thd sl_cs_enter(); if (!is_aep) own_tcap = 0; if (is_aep) { - t = sl_thd_aep_alloc_ext_no_cs(comp, sched_thd, idx, own_tcap ? SL_THD_PROPERTY_OWN_TCAP : 0, key, ipiwin, ipimax, extrcv); + t = sl_thd_aep_alloc_ext_no_cs(comp, sched_thd, idx, own_tcap ? 
SL_THD_PROPERTY_OWN_TCAP : 0, key, ipiwin, ipimax, dcbuaddr, extrcv); } else { - t = sl_thd_alloc_ext_no_cs(comp, idx); + t = sl_thd_alloc_ext_no_cs(comp, idx, dcbuaddr); } sl_cs_exit(); @@ -318,7 +350,7 @@ sl_thd_init_ext_no_cs(struct cos_aep_info *aepthd, struct sl_thd *sched) if (!aep) goto done; *aep = *aepthd; - t = sl_thd_alloc_init(aep, 0, 0); + t = sl_thd_alloc_init(aep, 0, 0, NULL); if (!t) goto done; /* use sched info for parent -> child notifications */ @@ -343,15 +375,14 @@ sl_thd_init_ext(struct cos_aep_info *aepthd, struct sl_thd *sched) } struct sl_thd * -sl_thd_retrieve(thdid_t tid) +sl_thd_retrieve_lazy(thdid_t tid) { - struct sl_thd *t = sl_mod_thd_get(sl_thd_lookup_backend(tid)); + struct sl_thd *t; spdid_t client = cos_inv_token(); thdid_t itid = 0; struct sl_thd *it = NULL; struct cos_aep_info aep; - if (t && sl_thd_aepinfo(t)) return t; if (tid >= SL_MAX_NUM_THDS) return NULL; assert(client); @@ -377,7 +408,7 @@ sl_thd_retrieve(thdid_t tid) it = sl_thd_try_lkup(itid); assert(it); aep.tid = tid; - aep.tc = sl__globals_cpu()->sched_tcap; + aep.tc = sl__globals_core()->sched_tcap; t = sl_thd_init_ext_no_cs(&aep, it); /* if (tid != sl_thdid()) sl_cs_exit(); */ @@ -394,3 +425,39 @@ sl_thd_free(struct sl_thd *t) sl_thd_free_no_cs(t); sl_cs_exit(); } + +int +sl_thd_migrate_no_cs(struct sl_thd *t, cpuid_t core) +{ + struct sl_thd_policy *x = NULL; + int ret; + + if (t->properties) return -1; + if (t->state != SL_THD_RUNNABLE) return -1; + /* capmgr should migrate the thdcap as well */ + ret = capmgr_thd_migrate(sl_thd_thdid(t), sl_thd_thdcap(t), core); + if (ret) return -1; + sl_mod_thd_delete(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), -1); + + x = sl_thd_migrate_backend(sl_mod_thd_policy_get(t), core); + if (!x) return -1; + + return 0; +} + +int +sl_thd_migrate(thdid_t tid, cpuid_t core) +{ + int ret; + struct sl_thd *c = sl_thd_curr(), *t = sl_thd_lkup(tid); + + if (core == cos_cpuid()) return -1; + if 
(sl_thd_rcvcap(t) || sl_thd_tcap(t)) return -1; + assert(c != t); + sl_cs_enter(); + ret = sl_thd_migrate_no_cs(t, core); + sl_cs_exit(); + + return ret; +} diff --git a/src/components/lib/sl/sl_child.c b/src/components/lib/sl/sl_child.c index 45ce8fe18e..badc3bba88 100644 --- a/src/components/lib/sl/sl_child.c +++ b/src/components/lib/sl/sl_child.c @@ -47,6 +47,7 @@ sl_parent_notif_alloc(struct sl_thd *childthd) int sl_parent_notif_enqueue(struct sl_thd *thd, struct sl_child_notification *notif) { +#ifdef SL_PARENTCHILD assert(thd && notif); assert(thd->properties & SL_THD_PROPERTY_SEND); @@ -55,11 +56,14 @@ sl_parent_notif_enqueue(struct sl_thd *thd, struct sl_child_notification *notif) if (ck_ring_enqueue_spsc_child(thd->ch_ring, thd->ch_ringbuf, notif) == false) return -1; if (cos_asnd(sl_thd_asndcap(thd), 0)) return -1; +#else + assert(0); +#endif return 0; } -/* there is only 1 parent per scheduler per cpu */ +/* there is only 1 parent per scheduler per core */ int sl_child_notif_map(cbuf_t id) { @@ -85,6 +89,7 @@ sl_child_notif_map(cbuf_t id) int sl_child_notif_dequeue(struct sl_child_notification *notif) { +#ifdef SL_PARENTCHILD struct ck_ring *cring = child_ring[cos_cpuid()]; struct sl_child_notification *crbuf = child_ringbuf[cos_cpuid()]; @@ -92,38 +97,52 @@ sl_child_notif_dequeue(struct sl_child_notification *notif) if (!cring || !crbuf) return 0; if (ck_ring_dequeue_spsc_child(cring, crbuf, notif) == true) return 1; - +#endif return 0; } int sl_child_notif_empty(void) { +#ifdef SL_PARENTCHILD struct ck_ring *cring = child_ring[cos_cpuid()]; if (!cring) return 1; return (!ck_ring_size(cring)); +#else + return 1; +#endif } int sl_parent_notif_block_no_cs(struct sl_thd *child, struct sl_thd *thd) { +#ifdef SL_PARENTCHILD struct sl_child_notification notif; notif.type = SL_CHILD_THD_BLOCK; notif.tid = sl_thd_thdid(thd); return sl_parent_notif_enqueue(child, ¬if); +#else + assert(0); + return 0; +#endif } int sl_parent_notif_wakeup_no_cs(struct sl_thd 
*child, struct sl_thd *thd) { +#ifdef SL_PARENTCHILD struct sl_child_notification notif; notif.type = SL_CHILD_THD_WAKEUP; notif.tid = sl_thd_thdid(thd); return sl_parent_notif_enqueue(child, &notif); +#else + assert(0); + return 0; +#endif } diff --git a/src/components/lib/sl/sl_mod_fifo.c b/src/components/lib/sl/sl_mod_fifo.c new file mode 100644 index 0000000000..3824356794 --- /dev/null +++ b/src/components/lib/sl/sl_mod_fifo.c @@ -0,0 +1,115 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. + * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include +#include +#include +#include + +#define SL_FPRR_PERIOD_US_MIN SL_MIN_PERIOD_US + +static struct ps_list_head threads[NUM_CPU] CACHE_ALIGNED; + +void +sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles) +{ } + +struct sl_thd_policy * +sl_mod_schedule(void) +{ + struct sl_thd_policy *t = NULL; + + if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; + t = ps_list_head_first_d(&threads[cos_cpuid()], struct sl_thd_policy); + +done: + return t; +} + +struct sl_thd_policy * +sl_mod_last_schedule(void) +{ + struct sl_thd_policy *t = NULL; + + if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; + t = ps_list_head_last_d(&threads[cos_cpuid()], struct sl_thd_policy); + +done: + return t; +} + +void +sl_mod_block(struct sl_thd_policy *t) +{ + ps_list_rem_d(t); +} + +void +sl_mod_wakeup(struct sl_thd_policy *t) +{ + assert(ps_list_singleton_d(t)); + + ps_list_head_append_d(&threads[cos_cpuid()], t); +} + +void +sl_mod_yield(struct sl_thd_policy *t, struct sl_thd_policy *yield_to) +{ + ps_list_rem_d(t); + ps_list_head_append_d(&threads[cos_cpuid()], t); +} + +void +sl_mod_thd_create(struct sl_thd_policy *t) +{ + t->priority = TCAP_PRIO_MIN; + t->period = 0; + t->period_usec = 0; + ps_list_init_d(t); + + /* TODO: add to runq here?
for now, only add when PRIO is set and that's pretty much it's ARRIVAL time! */ +} + +void +sl_mod_thd_delete(struct sl_thd_policy *t) +{ ps_list_rem_d(t); } + +void +sl_mod_thd_param_set(struct sl_thd_policy *t, sched_param_type_t type, unsigned int v) +{ + int cpu = cos_cpuid(); + + switch (type) { + case SCHEDP_PRIO: + { + t->priority = v; + sl_thd_setprio(sl_mod_thd_get(t), t->priority); + ps_list_head_append_d(&threads[cos_cpuid()], t); + + break; + } + case SCHEDP_WINDOW: + { + assert(v >= SL_FPRR_PERIOD_US_MIN); + t->period_usec = v; + t->period = sl_usec2cyc(v); + + break; + } + case SCHEDP_BUDGET: + { + break; + } + default: assert(0); + } +} + +void +sl_mod_init(void) +{ + ps_list_head_init(&threads[cos_cpuid()]); +} diff --git a/src/components/lib/sl/sl_mod_fprr.c b/src/components/lib/sl/sl_mod_fprr.c index 5d1c5dd202..8992ea0a57 100644 --- a/src/components/lib/sl/sl_mod_fprr.c +++ b/src/components/lib/sl/sl_mod_fprr.c @@ -9,9 +9,9 @@ #define SL_FPRR_PERIOD_US_MIN SL_MIN_PERIOD_US -struct ps_list_head threads[NUM_CPU][SL_FPRR_NPRIOS] CACHE_ALIGNED; +static unsigned int thdlist_bmp[NUM_CPU] CACHE_ALIGNED; +static struct ps_list_head threads[NUM_CPU][SL_FPRR_NPRIOS] CACHE_ALIGNED; -/* No RR yet */ void sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles) { } @@ -20,37 +20,66 @@ struct sl_thd_policy * sl_mod_schedule(void) { int i; - struct sl_thd_policy *t; + struct sl_thd_policy *t = NULL; - for (i = 0 ; i < SL_FPRR_NPRIOS ; i++) { - if (ps_list_head_empty(&threads[cos_cpuid()][i])) continue; - t = ps_list_head_first_d(&threads[cos_cpuid()][i], struct sl_thd_policy); + if (unlikely(!thdlist_bmp[cos_cpuid()])) return NULL; + i = __builtin_ctz(thdlist_bmp[cos_cpuid()]); + assert(i < SL_FPRR_NPRIOS); + assert(!ps_list_head_empty(&threads[cos_cpuid()][i])); + t = ps_list_head_first_d(&threads[cos_cpuid()][i], struct sl_thd_policy); + assert(t); - /* - * We want to move the selected thread to the back of the list. 
- * Otherwise fprr won't be truly round robin - */ - ps_list_rem_d(t); - ps_list_head_append_d(&threads[cos_cpuid()][i], t); + ps_list_rem_d(t); + ps_list_head_append_d(&threads[cos_cpuid()][i], t); - return t; - } + return t; +} +struct sl_thd_policy * +sl_mod_last_schedule(void) +{ + /* not supported! */ return NULL; } +static inline void +__sl_mod_bmp_unset(struct sl_thd_policy *t) +{ + unsigned int ctb = ps_load(&thdlist_bmp[cos_cpuid()]); + unsigned int p = t->priority - 1, b = 1 << p; + + if (!ps_list_head_empty(&threads[cos_cpuid()][p])) return; + + /* unset from bitmap if there are no threads at this priority */ + if (unlikely(!ps_upcas(&thdlist_bmp[cos_cpuid()], ctb, ctb & ~b))) assert(0); +} + +static inline void +__sl_mod_bmp_set(struct sl_thd_policy *t) +{ + unsigned int ctb = ps_load(&thdlist_bmp[cos_cpuid()]); + unsigned int p = t->priority - 1, b = 1 << p; + + if (unlikely(ctb & b)) return; + + assert(!ps_list_head_empty(&threads[cos_cpuid()][p])); + /* set to bitmap if this is the first element added at this prio! 
*/ + if (unlikely(!ps_upcas(&thdlist_bmp[cos_cpuid()], ctb, ctb | b))) assert(0); +} + void sl_mod_block(struct sl_thd_policy *t) { ps_list_rem_d(t); + __sl_mod_bmp_unset(t); } void sl_mod_wakeup(struct sl_thd_policy *t) { assert(ps_list_singleton_d(t)); - ps_list_head_append_d(&threads[cos_cpuid()][t->priority - 1], t); + __sl_mod_bmp_set(t); } void @@ -72,7 +101,10 @@ sl_mod_thd_create(struct sl_thd_policy *t) void sl_mod_thd_delete(struct sl_thd_policy *t) -{ ps_list_rem_d(t); } +{ + ps_list_rem_d(t); + __sl_mod_bmp_unset(t); +} void sl_mod_thd_param_set(struct sl_thd_policy *t, sched_param_type_t type, unsigned int v) @@ -81,10 +113,12 @@ sl_mod_thd_param_set(struct sl_thd_policy *t, sched_param_type_t type, unsigned case SCHEDP_PRIO: { assert(v >= SL_FPRR_PRIO_HIGHEST && v <= SL_FPRR_PRIO_LOWEST); - ps_list_rem_d(t); /* if we're already on a list, and we're updating priority */ + /* should not have been on any prio before, this is FP */ + assert(ps_list_singleton_d(t)); t->priority = v; - ps_list_head_append_d(&threads[cos_cpuid()][t->priority - 1], t); - sl_thd_setprio(sl_mod_thd_get(t), t->priority); + ps_list_head_append_d(&threads[cos_cpuid()][v - 1], t); + __sl_mod_bmp_set(t); + sl_thd_setprio(sl_mod_thd_get(t), v); break; } @@ -110,6 +144,7 @@ sl_mod_init(void) { int i; + thdlist_bmp[cos_cpuid()] = 0; memset(threads[cos_cpuid()], 0, sizeof(struct ps_list_head) * SL_FPRR_NPRIOS); for (i = 0 ; i < SL_FPRR_NPRIOS ; i++) { ps_list_head_init(&threads[cos_cpuid()][i]); diff --git a/src/components/lib/sl/sl_mod_part_fifo.c b/src/components/lib/sl/sl_mod_part_fifo.c new file mode 100644 index 0000000000..3584d0dc26 --- /dev/null +++ b/src/components/lib/sl/sl_mod_part_fifo.c @@ -0,0 +1,138 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include +#include +#include +#include + +#define SL_FIFO_PRIO TCAP_PRIO_MAX +#define SL_FIFO_IDLE_PRIO SL_FIFO_PRIO+4 +#define SL_FIFO_PERIOD_US_MIN SL_MIN_PERIOD_US + +static struct ps_list_head threads[NUM_CPU] CACHE_ALIGNED; +static struct sl_thd_policy *idle_thd[NUM_CPU]; + +void +sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles) +{ } + +struct sl_thd_policy * +sl_mod_schedule(void) +{ + struct sl_thd_policy *t = NULL; + + if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; + t = ps_list_head_first_d(&threads[cos_cpuid()], struct sl_thd_policy); + + return t; +done: + if (likely(idle_thd[cos_cpuid()])) return idle_thd[cos_cpuid()]; + + return t; +} + +struct sl_thd_policy * +sl_mod_last_schedule(void) +{ + struct sl_thd_policy *t = NULL; + + if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; + t = ps_list_head_last_d(&threads[cos_cpuid()], struct sl_thd_policy); + +done: + return t; +} + +void +sl_mod_block(struct sl_thd_policy *t) +{ + assert(t != idle_thd[cos_cpuid()]); + ps_list_rem_d(t); +} + +void +sl_mod_wakeup(struct sl_thd_policy *t) +{ + struct sl_thd *tm = sl_mod_thd_get(t); + + assert(t != idle_thd[cos_cpuid()]); + assert(ps_list_singleton_d(t)); + + ps_list_head_append_d(&threads[cos_cpuid()], t); + /* remove from partlist used for tracking free pool of tasks on this core! */ + if (!ps_list_singleton(tm, partlist)) ps_list_rem(tm, partlist); +} + +void +sl_mod_yield(struct sl_thd_policy *t, struct sl_thd_policy *yield_to) +{ + if (unlikely(t == idle_thd[cos_cpuid()])) return; + ps_list_rem_d(t); + ps_list_head_append_d(&threads[cos_cpuid()], t); +} + +void +sl_mod_thd_create(struct sl_thd_policy *t) +{ + t->priority = TCAP_PRIO_MIN; + t->period = 0; + t->period_usec = 0; + ps_list_init_d(t); + + /* TODO: add to runq here? 
for now, only add when PRIO is set and that's pretty much it's ARRIVAL time! */ +} + +void +sl_mod_thd_delete(struct sl_thd_policy *t) +{ + if (unlikely(t == idle_thd[cos_cpuid()])) return; + ps_list_rem_d(t); +} + +void +sl_mod_thd_param_set(struct sl_thd_policy *t, sched_param_type_t type, unsigned int v) +{ + int cpu = cos_cpuid(); + + switch (type) { + case SCHEDP_PRIO: + { + t->priority = v; + sl_thd_setprio(sl_mod_thd_get(t), t->priority); + + if (v == SL_FIFO_IDLE_PRIO) { + assert(idle_thd[cos_cpuid()] == NULL); + idle_thd[cos_cpuid()] = t; + } else { + ps_list_head_append_d(&threads[cos_cpuid()], t); + } + + break; + } + case SCHEDP_WINDOW: + { + assert(v >= SL_FIFO_PERIOD_US_MIN); + t->period_usec = v; + t->period = sl_usec2cyc(v); + + break; + } + case SCHEDP_BUDGET: + { + break; + } + default: assert(0); + } +} + +void +sl_mod_init(void) +{ + idle_thd[cos_cpuid()] = NULL; + ps_list_head_init(&threads[cos_cpuid()]); +} diff --git a/src/components/lib/sl/sl_mod_rr.c b/src/components/lib/sl/sl_mod_rr.c new file mode 100644 index 0000000000..ef3116a97c --- /dev/null +++ b/src/components/lib/sl/sl_mod_rr.c @@ -0,0 +1,109 @@ +#include +#include +#include +#include + +#define SL_FPRR_PERIOD_US_MIN SL_MIN_PERIOD_US + +static struct ps_list_head threads[NUM_CPU] CACHE_ALIGNED; + +void +sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles) +{ } + +struct sl_thd_policy * +sl_mod_schedule(void) +{ + struct sl_thd_policy *t = NULL; + + if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; + t = ps_list_head_first_d(&threads[cos_cpuid()], struct sl_thd_policy); + ps_list_rem_d(t); + ps_list_head_append_d(&threads[cos_cpuid()], t); + +done: + return t; +} + +struct sl_thd_policy * +sl_mod_last_schedule(void) +{ + struct sl_thd_policy *t = NULL, *tl = NULL; + + if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; + t = ps_list_head_last_d(&threads[cos_cpuid()], struct sl_thd_policy); + +done: + return t; +} + +void +sl_mod_block(struct 
sl_thd_policy *t) +{ + ps_list_rem_d(t); +} + +void +sl_mod_wakeup(struct sl_thd_policy *t) +{ + assert(ps_list_singleton_d(t)); + + ps_list_head_append_d(&threads[cos_cpuid()], t); +} + +void +sl_mod_yield(struct sl_thd_policy *t, struct sl_thd_policy *yield_to) +{ + ps_list_rem_d(t); + ps_list_head_append_d(&threads[cos_cpuid()], t); +} + +void +sl_mod_thd_create(struct sl_thd_policy *t) +{ + t->priority = TCAP_PRIO_MIN; + t->period = 0; + t->period_usec = 0; + ps_list_init_d(t); +} + +void +sl_mod_thd_delete(struct sl_thd_policy *t) +{ ps_list_rem_d(t); } + +void +sl_mod_thd_param_set(struct sl_thd_policy *t, sched_param_type_t type, unsigned int v) +{ + int cpu = cos_cpuid(); + + switch (type) { + case SCHEDP_PRIO: + { + t->priority = v; + sl_thd_setprio(sl_mod_thd_get(t), t->priority); + ps_list_head_append_d(&threads[cos_cpuid()], t); + + break; + } + case SCHEDP_WINDOW: + { + assert(v >= SL_FPRR_PERIOD_US_MIN); + t->period_usec = v; + t->period = sl_usec2cyc(v); + /* FIXME: synchronize periods for all tasks */ + + break; + } + case SCHEDP_BUDGET: + { + break; + } + default: assert(0); + } +} + +void +sl_mod_init(void) +{ + ps_list_head_init(&threads[cos_cpuid()]); +} diff --git a/src/components/lib/sl/sl_raw.c b/src/components/lib/sl/sl_raw.c index 77b32f3a29..b73384e10e 100644 --- a/src/components/lib/sl/sl_raw.c +++ b/src/components/lib/sl/sl_raw.c @@ -11,6 +11,7 @@ #include #include #include +#include extern void sl_thd_event_info_reset(struct sl_thd *t); extern void sl_thd_free_no_cs(struct sl_thd *t); @@ -28,7 +29,7 @@ sl_shm_map(cbuf_t id) } void -sl_xcpu_asnd_alloc(void) +sl_xcore_asnd_alloc(void) { struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(dci); @@ -38,16 +39,16 @@ sl_xcpu_asnd_alloc(void) asndcap_t snd; if (i == cos_cpuid()) continue; - if (!bitmap_check(sl__globals()->cpu_bmp, i)) continue; + if (!bitmap_check(sl__globals()->core_bmp, i)) continue; snd = cos_asnd_alloc(ci, 
BOOT_CAPTBL_SELF_INITRCV_BASE_CPU(i), ci->captbl_cap); assert(snd); - sl__globals()->xcpu_asnd[cos_cpuid()][i] = snd; + sl__globals()->xcore_asnd[cos_cpuid()][i] = snd; } } struct sl_thd * -sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t prps) +sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t prps, struct cos_dcb_info *dcb) { struct sl_thd_policy *tp = NULL; struct sl_thd *t = NULL; @@ -57,6 +58,7 @@ sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t if (!tp) goto done; t = sl_mod_thd_get(tp); + t->dcb = dcb; t->properties = prps; t->aepinfo = aep; t->sndcap = sndcap; @@ -71,7 +73,9 @@ sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t t->timeout_idx = -1; t->prio = TCAP_PRIO_MIN; ps_list_init(t, SL_THD_EVENT_LIST); + ps_list_init(t, partlist); sl_thd_event_info_reset(t); + sl_xcore_thd_lookup_init(aep->tid, cos_cpuid()); done: return t; @@ -84,17 +88,23 @@ sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data) struct cos_compinfo *ci = cos_compinfo_get(dci); struct sl_thd *t = NULL; struct cos_aep_info *aep = NULL; + struct cos_dcb_info *dcb = NULL; + dcbcap_t dcap; + dcboff_t doff; aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; + dcap = cos_dcb_info_alloc_curr(&doff, (vaddr_t *)&dcb); + if (dcb && doff) assert(dcap); - aep->thd = cos_thd_alloc(ci, ci->comp_cap, fn, data); + aep->thd = cos_thd_alloc(ci, ci->comp_cap, fn, data, dcap, doff); if (!aep->thd) goto done; aep->tid = cos_introspect(ci, aep->thd, THD_GET_TID); if (!aep->tid) goto done; - t = sl_thd_alloc_init(aep, 0, 0); + t = sl_thd_alloc_init(aep, 0, 0, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); done: return t; @@ -119,15 +129,16 @@ sl_thd_comp_init_no_cs(struct cos_defcompinfo *comp, sl_thd_property_t prps, asn assert(snd); } - t = sl_thd_alloc_init(aep, snd, prps); + t = sl_thd_alloc_init(aep, snd, prps, NULL); 
sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); done: return t; } static struct sl_thd * -sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx) +sl_thd_alloc_ext_dcb_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, dcbcap_t dcbcap, dcboff_t dcboff) { struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(dci); @@ -140,16 +151,17 @@ sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx) aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; - aep->thd = cos_thd_alloc_ext(ci, compci->comp_cap, idx); + aep->thd = cos_thd_alloc_ext(ci, compci->comp_cap, idx, dcbcap, dcboff); if (!aep->thd) goto done; aep->tid = cos_introspect(ci, aep->thd, THD_GET_TID); if (!aep->tid) goto done; - t = sl_thd_alloc_init(aep, 0, 0); + t = sl_thd_alloc_init(aep, 0, 0, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); } else { assert(idx == 0); - ret = cos_initaep_alloc(comp, NULL, 0); + ret = cos_initaep_alloc(comp, NULL, 0, dcbcap); if (ret) goto done; t = sl_thd_comp_init_no_cs(comp, 0, 0); @@ -165,26 +177,32 @@ sl_thd_aep_alloc_no_cs(cos_aepthd_fn_t fn, void *data, sl_thd_property_t prps, c struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct sl_thd *t = NULL; struct cos_aep_info *aep = NULL; + struct cos_dcb_info *dcb = NULL; int ret; + dcbcap_t dcap; + dcboff_t doff; aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; + dcap = cos_dcb_info_alloc_curr(&doff, (vaddr_t *)&dcb); + if (dcb && doff) assert(dcap); /* NOTE: Cannot use stack-allocated cos_aep_info struct here */ - if (prps & SL_THD_PROPERTY_OWN_TCAP) ret = cos_aep_alloc(aep, fn, data); - else ret = cos_aep_tcap_alloc(aep, sl_thd_aepinfo(sl__globals_cpu()->sched_thd)->tc, - fn, data); + if (prps & SL_THD_PROPERTY_OWN_TCAP) ret = cos_aep_alloc(aep, fn, data, dcap, doff); + else ret = 
cos_aep_tcap_alloc(aep, sl_thd_aepinfo(sl__globals_core()->sched_thd)->tc, + fn, data, dcap, doff); if (ret) goto done; - t = sl_thd_alloc_init(aep, 0, prps); + t = sl_thd_alloc_init(aep, 0, prps, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); done: return t; } static struct sl_thd * -sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, thdclosure_index_t idx, sl_thd_property_t prps, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +sl_thd_aep_alloc_ext_dcb_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, thdclosure_index_t idx, sl_thd_property_t prps, cos_channelkey_t key, dcbcap_t dcap, dcboff_t doff, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) { struct cos_aep_info *aep = NULL; struct sl_thd *t = NULL; @@ -192,11 +210,11 @@ sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, t int ret = 0; if (prps & SL_THD_PROPERTY_SEND) { - assert(sched); + assert(sched && !doff); if (prps & SL_THD_PROPERTY_OWN_TCAP) { - ret = cos_initaep_alloc(comp, sl_thd_aepinfo(sched), prps & SL_THD_PROPERTY_SEND); + ret = cos_initaep_alloc(comp, sl_thd_aepinfo(sched), prps & SL_THD_PROPERTY_SEND, dcap); } else { - ret = cos_initaep_tcap_alloc(comp, sl_thd_tcap(sched), sl_thd_aepinfo(sched)); + ret = cos_initaep_tcap_alloc(comp, sl_thd_tcap(sched), sl_thd_aepinfo(sched), dcap); } if (ret) goto done; @@ -208,14 +226,15 @@ sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, t if (!aep) goto done; if (prps & SL_THD_PROPERTY_OWN_TCAP) { - ret = cos_aep_alloc_ext(aep, comp, sl_thd_aepinfo(sched), idx); + ret = cos_aep_alloc_ext(aep, comp, sl_thd_aepinfo(sched), idx, dcap, doff); } else { - ret = cos_aep_tcap_alloc_ext(aep, comp, sl_thd_aepinfo(sched), sl_thd_tcap(sched), idx); + ret = cos_aep_tcap_alloc_ext(aep, comp, sl_thd_aepinfo(sched), sl_thd_tcap(sched), idx, dcap, doff); } if (ret) goto done; - t = 
sl_thd_alloc_init(aep, 0, prps); + t = sl_thd_alloc_init(aep, 0, prps, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); if (extrcv) *extrcv = sl_thd_rcvcap(t); } @@ -264,23 +283,39 @@ sl_thd_comp_init(struct cos_defcompinfo *comp, int is_sched) } struct sl_thd * -sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax) +sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, vaddr_t *dcbaddr) +{ + PRINTC("UNIMPLEMENTED: Using RAW API which cannot manage DCB resource for child components\n"); + + return NULL; +} + +struct sl_thd * +sl_thd_initaep_alloc_dcb(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, dcbcap_t dcap, microsec_t ipiwin, u32_t ipimax) { struct sl_thd *t = NULL; if (!comp) return NULL; sl_cs_enter(); - if (!is_sched) t = sl_thd_alloc_ext_no_cs(comp, 0); - else t = sl_thd_aep_alloc_ext_no_cs(comp, sched_thd, 0, (is_sched ? SL_THD_PROPERTY_SEND : 0) - | (own_tcap ? SL_THD_PROPERTY_OWN_TCAP : 0), key, ipiwin, ipimax, NULL); + if (!is_sched) t = sl_thd_alloc_ext_dcb_no_cs(comp, 0, dcap, 0); + else t = sl_thd_aep_alloc_ext_dcb_no_cs(comp, sched_thd, 0, (is_sched ? SL_THD_PROPERTY_SEND : 0) + | (own_tcap ? 
SL_THD_PROPERTY_OWN_TCAP : 0), key, dcap, 0, ipiwin, ipimax, NULL); sl_cs_exit(); return t; } struct sl_thd * -sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, vaddr_t *dcbaddr, arcvcap_t *extrcv) +{ + PRINTC("UNIMPLEMENTED: Using RAW API which cannot manage DCB resource for child components\n"); + + return NULL; +} + +struct sl_thd * +sl_thd_aep_alloc_ext_dcb(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, dcbcap_t dcap, dcboff_t doff, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) { struct sl_thd *t = NULL; @@ -288,9 +323,9 @@ sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thd sl_cs_enter(); if (!is_aep) own_tcap = 0; if (is_aep) { - t = sl_thd_aep_alloc_ext_no_cs(comp, sched_thd, idx, own_tcap ? SL_THD_PROPERTY_OWN_TCAP : 0, key, ipiwin, ipimax, extrcv); + t = sl_thd_aep_alloc_ext_dcb_no_cs(comp, sched_thd, idx, own_tcap ? 
SL_THD_PROPERTY_OWN_TCAP : 0, key, dcap, doff, ipiwin, ipimax, extrcv); } else { - t = sl_thd_alloc_ext_no_cs(comp, idx); + t = sl_thd_alloc_ext_dcb_no_cs(comp, idx, dcap, doff); } sl_cs_exit(); @@ -311,7 +346,7 @@ sl_thd_init_ext(struct cos_aep_info *aepthd, struct sl_thd *sched) *aep = *aepthd; /* TODO: use sched info for parent -> child notifications */ - t = sl_thd_alloc_init(aep, 0, 0); + t = sl_thd_alloc_init(aep, 0, 0, NULL); done: sl_cs_exit(); @@ -320,9 +355,11 @@ sl_thd_init_ext(struct cos_aep_info *aepthd, struct sl_thd *sched) } struct sl_thd * -sl_thd_retrieve(thdid_t tid) +sl_thd_retrieve_lazy(thdid_t tid) { - return sl_mod_thd_get(sl_thd_lookup_backend(tid)); + /* without capmgr, there is no lazy retrieval of threads! */ + assert(0); + return NULL; } void @@ -334,3 +371,40 @@ sl_thd_free(struct sl_thd *t) sl_thd_free_no_cs(t); sl_cs_exit(); } + +int +sl_thd_migrate_no_cs(struct sl_thd *t, cpuid_t core) +{ + struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *ci = cos_compinfo_get(dci); + struct sl_thd_policy *x = NULL; + int ret; + + if (t->properties) return -1; + if (t->state != SL_THD_RUNNABLE) return -1; + ret = cos_thd_migrate(ci, sl_thd_thdcap(t), core); + if (ret) return -1; + sl_mod_thd_delete(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), -1); + + x = sl_thd_migrate_backend(sl_mod_thd_policy_get(t), core); + if (!x) return -1; + + return 0; +} + +int +sl_thd_migrate(thdid_t tid, cpuid_t core) +{ + int ret; + struct sl_thd *c = sl_thd_curr(), *t = sl_thd_lkup(tid); + + if (core == cos_cpuid()) return -1; + assert(c != t); + sl_cs_enter(); + ret = sl_thd_migrate_no_cs(t, core); + sl_cs_exit(); + + return ret; +} + diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index fe297c1be9..095dffb072 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -7,55 +7,56 @@ #include #include -#include +#include #include #include 
#include #include #include +#include +#include struct sl_global sl_global_data; -struct sl_global_cpu sl_global_cpu_data[NUM_CPU] CACHE_ALIGNED; +struct sl_global_core sl_global_core_data[NUM_CPU] CACHE_ALIGNED; static void sl_sched_loop_intern(int non_block) __attribute__((noreturn)); -extern struct sl_thd *sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t prps); -extern int sl_xcpu_process_no_cs(void); -extern void sl_xcpu_asnd_alloc(void); +extern struct sl_thd *sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t prps, struct cos_dcb_info *dcb); +extern int sl_xcore_process_no_cs(void); +extern void sl_xcore_asnd_alloc(void); /* * These functions are removed from the inlined fast-paths of the * critical section (cs) code to save on code size/locality */ int -sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, thdcap_t curr, sched_tok_t tok) +sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_core *gcore, struct sl_thd *curr, sched_tok_t tok) { - struct sl_thd *t = sl_thd_curr(); - struct sl_global_cpu *g = sl__globals_cpu(); +#ifdef SL_CS int ret; /* recursive locks are not allowed */ - assert(csi->s.owner != sl_thd_thdcap(t)); + assert(csi->s.owner != sl_thd_thdcap(curr)); if (!csi->s.contention) { csi->s.contention = 1; - if (!ps_cas(&g->lock.u.v, cached->v, csi->v)) return 1; + if (!ps_upcas(&gcore->lock.u.v, cached->v, csi->v)) return 1; } /* Switch to the owner of the critical section, with inheritance using our tcap/priority */ - if ((ret = cos_defswitch(csi->s.owner, t->prio, csi->s.owner == sl_thd_thdcap(g->sched_thd) ? - TCAP_TIME_NIL : g->timeout_next, tok))) return ret; + if ((ret = cos_defswitch(csi->s.owner, curr->prio, csi->s.owner == sl_thd_thdcap(gcore->sched_thd) ? 
+ TCAP_TIME_NIL : gcore->timeout_next, tok))) return ret; /* if we have an outdated token, then we want to use the same repeat loop, so return to that */ +#endif return 1; } /* Return 1 if we need a retry, 0 otherwise */ int -sl_cs_exit_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, sched_tok_t tok) +sl_cs_exit_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_core *gcore, sched_tok_t tok) { - struct sl_thd *t = sl_thd_curr(); - struct sl_global_cpu *g = sl__globals_cpu(); - - if (!ps_cas(&g->lock.u.v, cached->v, 0)) return 1; +#ifdef SL_CS + if (!ps_upcas(&gcore->lock.u.v, cached->v, 0)) return 1; /* let the scheduler thread decide which thread to run next, inheriting our budget/priority */ - cos_defswitch(g->sched_thdcap, t->prio, TCAP_TIME_NIL, tok); + cos_defswitch(gcore->sched_thdcap, sl_thd_curr()->prio, TCAP_TIME_NIL, tok); +#endif return 0; } @@ -109,27 +110,6 @@ sl_timeout_remove(struct sl_thd *t) t->timeout_idx = -1; } -void -sl_thd_free_no_cs(struct sl_thd *t) -{ - struct sl_thd *ct = sl_thd_curr(); - - assert(t); - assert(t->state != SL_THD_FREE); - if (t->state == SL_THD_BLOCKED_TIMEOUT) sl_timeout_remove(t); - sl_thd_index_rem_backend(sl_mod_thd_policy_get(t)); - sl_mod_thd_delete(sl_mod_thd_policy_get(t)); - t->state = SL_THD_FREE; - /* TODO: add logic for the graveyard to delay this deallocation if t == current */ - sl_thd_free_backend(sl_mod_thd_policy_get(t)); - - /* thread should not continue to run if it deletes itself. */ - if (unlikely(t == ct)) { - while (1) sl_cs_exit_schedule(); - /* FIXME: should never get here, but tcap mechanism can let a child scheduler run! 
*/ - } -} - static int __sl_timeout_compare_min(void *a, void *b) { @@ -151,6 +131,29 @@ sl_timeout_init(microsec_t period) heap_init(sl_timeout_heap(), SL_MAX_NUM_THDS, __sl_timeout_compare_min, __sl_timeout_update_idx); } +void +sl_thd_free_no_cs(struct sl_thd *t) +{ + struct sl_thd *ct = sl_thd_curr(); + + assert(t); + assert(t->state != SL_THD_FREE); + if (t->state == SL_THD_BLOCKED_TIMEOUT) sl_timeout_remove(t); + sl_thd_index_rem_backend(sl_mod_thd_policy_get(t)); + sl_mod_thd_delete(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), -1); + t->state = SL_THD_FREE; + /* TODO: add logic for the graveyard to delay this deallocation if t == current */ + sl_thd_free_backend(sl_mod_thd_policy_get(t)); + + /* thread should not continue to run if it deletes itself. */ + if (unlikely(t == ct)) { + while (1) { + sl_cs_exit_schedule(); + } + /* FIXME: should never get here, but tcap mechanism can let a child scheduler run! */ + } +} /* * This API is only used by the scheduling thread to block an AEP thread. * AEP thread scheduling events could be redundant. 
@@ -161,7 +164,7 @@ int sl_thd_sched_block_no_cs(struct sl_thd *t, sl_thd_state_t block_type, cycles_t timeout) { assert(t); - assert(t != sl__globals_cpu()->idle_thd && t != sl__globals_cpu()->sched_thd); + assert(t != sl__globals_core()->idle_thd && t != sl__globals_core()->sched_thd); assert(block_type == SL_THD_BLOCKED_TIMEOUT || block_type == SL_THD_BLOCKED); if (t->schedthd) return 0; @@ -178,6 +181,7 @@ sl_thd_sched_block_no_cs(struct sl_thd *t, sl_thd_state_t block_type, cycles_t t assert(sl_thd_is_runnable(t)); sl_mod_block(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), -1); update: t->state = block_type; @@ -212,9 +216,11 @@ sl_thd_sched_unblock_no_cs(struct sl_thd *t) int sl_thd_block_no_cs(struct sl_thd *t, sl_thd_state_t block_type, cycles_t timeout) { - assert(t); - assert(t != sl__globals_cpu()->idle_thd && t != sl__globals_cpu()->sched_thd); - assert(sl_thd_curr() == t); /* only current thread is allowed to block itself */ + assert(t && sl_thd_curr() == t); /* only current thread is allowed to block itself */ + assert(t != sl__globals_core()->idle_thd && t != sl__globals_core()->sched_thd); + /* interrupt thread could run and block itself before scheduler sees any of that! 
*/ + sl_thd_sched_unblock_no_cs(t); + assert(sl_thd_is_runnable(t)); assert(block_type == SL_THD_BLOCKED_TIMEOUT || block_type == SL_THD_BLOCKED); if (t->schedthd) { @@ -230,9 +236,9 @@ sl_thd_block_no_cs(struct sl_thd *t, sl_thd_state_t block_type, cycles_t timeout } /* reset rcv_suspended if the scheduler thinks "curr" was suspended on cos_rcv previously */ - sl_thd_sched_unblock_no_cs(t); assert(t->state == SL_THD_RUNNABLE); sl_mod_block(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), -1); t->state = block_type; if (block_type == SL_THD_BLOCKED_TIMEOUT) sl_timeout_block(t, timeout); @@ -254,6 +260,7 @@ sl_thd_block(thdid_t tid) return; } sl_cs_exit_schedule(); + assert(sl_thd_is_runnable(t)); return; } @@ -326,11 +333,11 @@ sl_thd_block_expiry(struct sl_thd *t) { cycles_t abs_timeout = 0; - assert(t != sl__globals_cpu()->idle_thd && t != sl__globals_cpu()->sched_thd); + assert(t != sl__globals_core()->idle_thd && t != sl__globals_core()->sched_thd); sl_cs_enter(); if (!(t->properties & SL_THD_PROPERTY_OWN_TCAP)) { assert(!t->rcv_suspended); - abs_timeout = sl__globals_cpu()->timeout_next; + abs_timeout = sl__globals_core()->timeout_next; } else { assert(t->period); abs_timeout = t->last_replenish + t->period; @@ -372,6 +379,7 @@ sl_thd_sched_wakeup_no_cs(struct sl_thd *t) if (t->state == SL_THD_BLOCKED_TIMEOUT) sl_timeout_remove(t); t->state = SL_THD_RUNNABLE; sl_mod_wakeup(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); return 0; } @@ -384,11 +392,12 @@ int sl_thd_wakeup_no_cs_rm(struct sl_thd *t) { assert(t); - assert(t != sl__globals_cpu()->idle_thd && t != sl__globals_cpu()->sched_thd); + assert(t != sl__globals_core()->idle_thd && t != sl__globals_core()->sched_thd); assert(t->state == SL_THD_BLOCKED || t->state == SL_THD_BLOCKED_TIMEOUT); t->state = SL_THD_RUNNABLE; sl_mod_wakeup(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); t->rcv_suspended = 0; 
return 0; @@ -406,8 +415,23 @@ sl_thd_wakeup_no_cs(struct sl_thd *t) return 0; } - if (unlikely(sl_thd_is_runnable(t))) { - /* t->state == SL_THD_WOKEN? multiple wakeups? */ +// if (unlikely(sl_thd_is_runnable(t))) { +// /* t->state == SL_THD_WOKEN? multiple wakeups? */ +// t->state = SL_THD_WOKEN; +// return 1; +// } + /* + * TODO: with blockpoints, multiple wakeup problem might go away. + * will try that next! + * + * For now, if a thread creates N tasks and if at least two of them + * complete before master goes to block, which can happen on multi-core + * execution of tasks, then that results in multiple wakeups! + */ + if (unlikely(t->state == SL_THD_WOKEN)) { + t->state = SL_THD_RUNNABLE; + return 1; + } else if (unlikely(t->state == SL_THD_RUNNABLE)) { t->state = SL_THD_WOKEN; return 1; } @@ -435,60 +459,47 @@ sl_thd_wakeup(thdid_t tid) return; } -void -sl_thd_yield_cs_exit(thdid_t tid) +static inline void +sl_thd_yield_cs_exit_intern(thdid_t tid) { struct sl_thd *t = sl_thd_curr(); /* reset rcv_suspended if the scheduler thinks "curr" was suspended on cos_rcv previously */ sl_thd_sched_unblock_no_cs(t); - if (tid) { + if (likely(tid)) { struct sl_thd *to = sl_thd_lkup(tid); - assert(to); sl_cs_exit_switchto(to); } else { - if (likely(t != sl__globals_cpu()->sched_thd && t != sl__globals_cpu()->idle_thd)) sl_mod_yield(sl_mod_thd_policy_get(t), NULL); + if (likely(t != sl__globals_core()->sched_thd && t != sl__globals_core()->idle_thd)) sl_mod_yield(sl_mod_thd_policy_get(t), NULL); sl_cs_exit_schedule(); } } -void -sl_thd_yield(thdid_t tid) -{ - sl_cs_enter(); - sl_thd_yield_cs_exit(tid); -} void -sl_thd_event_info_reset(struct sl_thd *t) +sl_thd_yield_cs_exit(thdid_t tid) { - t->event_info.blocked = 0; - t->event_info.cycles = 0; - t->event_info.timeout = 0; + sl_thd_yield_cs_exit_intern(tid); } -static inline void -sl_thd_event_enqueue(struct sl_thd *t, int blocked, cycles_t cycles, tcap_time_t timeout) +void +sl_thd_yield_intern(thdid_t tid) { - struct 
sl_global_cpu *g = sl__globals_cpu(); - - if (ps_list_singleton(t, SL_THD_EVENT_LIST)) ps_list_head_append(&g->event_head, t, SL_THD_EVENT_LIST); - - t->event_info.blocked = blocked; - t->event_info.cycles += cycles; - t->event_info.timeout = timeout; + sl_cs_enter(); + sl_thd_yield_cs_exit_intern(tid); } -static inline void -sl_thd_event_dequeue(struct sl_thd *t, int *blocked, cycles_t *cycles, tcap_time_t *timeout) +void +sl_thd_yield_intern_timeout(cycles_t abs_timeout) { - ps_list_rem(t, SL_THD_EVENT_LIST); + struct sl_thd *t = sl_thd_curr(); - *blocked = t->event_info.blocked; - *cycles = t->event_info.cycles; - *timeout = t->event_info.timeout; - sl_thd_event_info_reset(t); + sl_cs_enter(); + /* reset rcv_suspended if the scheduler thinks "curr" was suspended on cos_rcv previously */ + sl_thd_sched_unblock_no_cs(t); + if (likely(t != sl__globals_core()->sched_thd && t != sl__globals_core()->idle_thd)) sl_mod_yield(sl_mod_thd_policy_get(t), NULL); + sl_cs_exit_schedule_timeout(abs_timeout); } void @@ -498,7 +509,7 @@ sl_thd_exit() } void -sl_thd_param_set(struct sl_thd *t, sched_param_t sp) +sl_thd_param_set_no_cs(struct sl_thd *t, sched_param_t sp) { sched_param_type_t type; unsigned int value; @@ -525,84 +536,114 @@ sl_thd_param_set(struct sl_thd *t, sched_param_t sp) sl_mod_thd_param_set(sl_mod_thd_policy_get(t), type, value); } +void +sl_thd_param_set(struct sl_thd *t, sched_param_t sp) +{ + assert(t); + + sl_cs_enter(); + + sl_thd_param_set_no_cs(t, sp); + sl_cs_exit(); +} + void sl_timeout_period(microsec_t period) { cycles_t p = sl_usec2cyc(period); - sl__globals_cpu()->period = p; - sl_timeout_relative(p); + sl__globals_core()->period = p; } /* engage space heater mode */ void sl_idle(void *d) -{ while (1) ; } +{ + struct sl_global_core *gc = sl__globals_core(); + + while (1) { + cycles_t now = sl_now(); + + do { + if (cos_sched_ispending() || +#if NUM_CPU > 1 + ck_ring_size(sl__ring_curr()) != 0 || +#endif + !sl_child_notif_empty()) break; + now = 
sl_now(); + } while (now < gc->timer_next); + sl_thd_activate_c(gc->sched_thd, cos_sched_sync(), 0, 0, gc->idle_thd, gc); + } +} /* call from the user? */ static void -sl_global_init(u32_t *cpu_bmp) +sl_global_init(u32_t *core_bmp) { struct sl_global *g = sl__globals(); unsigned int i = 0; memset(g, 0, sizeof(struct sl_global)); + assert(sizeof(struct cos_scb_info) * NUM_CPU <= COS_SCB_SIZE && COS_SCB_SIZE == PAGE_SIZE); + g->scb_area = (struct cos_scb_info *)cos_scb_info_get(); for (i = 0; i < NUM_CPU; i++) { - if (!bitmap_check(cpu_bmp, i)) continue; + if (!bitmap_check(core_bmp, i)) continue; - bitmap_set(g->cpu_bmp, i); - ck_ring_init(sl__ring(i), SL_XCPU_RING_SIZE); + bitmap_set(g->core_bmp, i); + ck_ring_init(sl__ring(i), SL_XCORE_RING_SIZE); } } void -sl_init_cpubmp(microsec_t period, u32_t *cpubmp) +sl_init_corebmp(microsec_t period, u32_t *corebmp) { int i; - static volatile int first = 1, init_done = 0; - struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); - struct cos_compinfo *ci = cos_compinfo_get(dci); - struct sl_global_cpu *g = sl__globals_cpu(); - struct cos_aep_info *saep = cos_sched_aep_get(dci); - - if (ps_cas((unsigned long *)&first, 1, 0)) { - sl_global_init(cpubmp); - + static volatile unsigned long first = NUM_CPU + 1, init_done = 0; + struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *ci = cos_compinfo_get(dci); + struct sl_global_core *g = sl__globals_core(); + struct cos_aep_info *ga = cos_sched_aep_get(dci); + + if (ps_cas((unsigned long *)&first, NUM_CPU + 1, cos_cpuid())) { + sl_global_init(corebmp); ps_faa((unsigned long *)&init_done, 1); } else { /* wait until global ring buffers are initialized correctly! 
*/ while (!ps_load((unsigned long *)&init_done)) ; /* make sure this scheduler is active on this cpu/core */ - assert(sl_cpu_active()); + assert(sl_core_active()); } /* must fit in a word */ assert(sizeof(struct sl_cs) <= sizeof(unsigned long)); - memset(g, 0, sizeof(struct sl_global_cpu)); + memset(g, 0, sizeof(struct sl_global_core)); - g->cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - g->lock.u.v = 0; + g->cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + g->lock.u.v = 0; + g->scb_info = ((sl__globals()->scb_area) + cos_cpuid()); sl_thd_init_backend(); sl_mod_init(); sl_timeout_init(period); - /* Create the scheduler thread for us. cos_sched_aep_get() is from global(static) memory */ - g->sched_thd = sl_thd_alloc_init(saep, 0, 0); + /* Create the scheduler thread for us. */ + g->sched_thd = sl_thd_alloc_init(ga, 0, 0, (struct cos_dcb_info *)cos_init_dcb_get()); assert(g->sched_thd); - g->sched_thdcap = saep->thd; - g->sched_tcap = saep->tc; - g->sched_rcv = saep->rcv; + g->sched_thdcap = ga->thd; + g->sched_tcap = ga->tc; + g->sched_rcv = ga->rcv; assert(g->sched_rcv); g->sched_thd->prio = TCAP_PRIO_MAX; ps_list_head_init(&g->event_head); + assert(cos_thdid() == sl_thd_thdid(g->sched_thd)); + g->scb_info->curr_thd = 0; g->idle_thd = sl_thd_alloc(sl_idle, NULL); assert(g->idle_thd); /* all cores that this sched runs on, must be initialized by now so "asnd"s can be created! */ - sl_xcpu_asnd_alloc(); + sl_xcore_asnd_alloc(); return; } @@ -611,42 +652,106 @@ sl_init_cpubmp(microsec_t period, u32_t *cpubmp) void sl_init(microsec_t period) { - u32_t cpubmp[NUM_CPU_BMP_WORDS] = { 0 }; + u32_t corebmp[NUM_CPU_BMP_WORDS] = { 0 }; /* runs on all cores.. 
*/ - bitmap_set_contig(cpubmp, 0, NUM_CPU, 1); - sl_init_cpubmp(period, cpubmp); + bitmap_set_contig(corebmp, 0, NUM_CPU, 1); + sl_init_corebmp(period, corebmp); +} + +static inline int +__sl_sched_events_present(void) +{ + struct cos_scb_info *scb = sl_scb_info_core(); + struct cos_sched_ring *ring = &scb->sched_events; + + return __cos_sched_events_present(ring); +} + +static inline int +__sl_sched_event_consume(struct cos_sched_event *e) +{ + struct cos_scb_info *scb = sl_scb_info_core(); + struct cos_sched_ring *ring = &scb->sched_events; + + return __cos_sched_event_consume(ring, e); +} + +static inline int +__sl_sched_rcv(rcv_flags_t rf, struct cos_sched_event *e) +{ + struct sl_global_core *g = sl__globals_core(); +#if 0 + struct sl_thd *curr = sl_thd_curr(); + struct cos_dcb_info *cd = sl_thd_dcbinfo(curr); + int ret = 0; +// if (cos_spd_id() != 4) printc("D"); + + assert(curr == g->sched_thd); + if (!cd) return cos_ul_sched_rcv(g->sched_rcv, rf, g->timeout_next, e); + + rf |= RCV_ULSCHED_RCV; + + __asm__ __volatile__ ( \ + "pushl %%ebp\n\t" \ + "movl %%esp, %%ebp\n\t" \ + "movl $1f, (%%eax)\n\t" \ + "movl %%esp, 4(%%eax)\n\t" \ + "movl $2f, %%ecx\n\t" \ + "movl %%edx, %%eax\n\t" \ + "inc %%eax\n\t" \ + "shl $16, %%eax\n\t" \ + "movl $0, %%edx\n\t" \ + "movl $0, %%edi\n\t" \ + "sysenter\n\t" \ + "jmp 2f\n\t" \ + ".align 4\n\t" \ + "1:\n\t" \ + "movl $1, %%eax\n\t" \ + ".align 4\n\t" \ + "2:\n\t" \ + "popl %%ebp\n\t" \ + : "=a" (ret) + : "a" (cd), "b" (rf), "S" (g->timeout_next), "d" (g->sched_rcv) + : "memory", "cc", "ecx", "edi"); + +// if (cos_spd_id() != 4) printc("E"); +// if (cos_thdid() == 7) PRINTC("%s:%d %d\n", __func__, __LINE__, ret); + cd = sl_thd_dcbinfo(sl_thd_curr()); + cd->sp = 0; + + rf |= RCV_ULONLY; +#endif + return cos_ul_sched_rcv(g->sched_rcv, rf, g->timeout_next, e); } static void sl_sched_loop_intern(int non_block) { - struct sl_global_cpu *g = sl__globals_cpu(); - rcv_flags_t rfl = (non_block ? 
RCV_NON_BLOCKING : 0) | RCV_ALL_PENDING; + struct sl_global_core *g = sl__globals_core(); + rcv_flags_t rfl = (non_block ? RCV_NON_BLOCKING : 0); - assert(sl_cpu_active()); + assert(sl_thd_curr() == g->sched_thd); + assert(sl_core_active()); while (1) { int pending; do { - thdid_t tid; - int blocked, rcvd; - cycles_t cycles; - tcap_time_t timeout = g->timeout_next, thd_timeout; struct sl_thd *t = NULL, *tn = NULL; struct sl_child_notification notif; + struct cos_sched_event e = { .tid = 0 }; + /* * a child scheduler may receive both scheduling notifications (block/unblock * states of it's child threads) and normal notifications (mainly activations from * it's parent scheduler). */ - pending = cos_sched_rcv(g->sched_rcv, rfl, timeout, - &rcvd, &tid, &blocked, &cycles, &thd_timeout); - if (!tid) goto pending_events; + pending = __sl_sched_rcv(rfl, &e); + if (pending < 0 || !e.tid) goto pending_events; - t = sl_thd_lkup(tid); + t = sl_thd_lkup(e.tid); assert(t); /* don't report the idle thread or a freed thread */ if (unlikely(t == g->idle_thd || t->state == SL_THD_FREE)) goto pending_events; @@ -658,12 +763,15 @@ sl_sched_loop_intern(int non_block) * To avoid dropping events, add the events to the scheduler event list and processing all * the pending events after the scheduler can successfully take the lock. 
*/ - sl_thd_event_enqueue(t, blocked, cycles, thd_timeout); + sl_thd_event_enqueue(t, &e.evt); pending_events: if (ps_list_head_empty(&g->event_head) && +#if NUM_CPU > 1 ck_ring_size(sl__ring_curr()) == 0 && - sl_child_notif_empty()) continue; +#endif + sl_child_notif_empty() && + !cos_sched_events_isempty()) continue; /* * receiving scheduler notifications is not in critical section mainly for @@ -676,21 +784,21 @@ sl_sched_loop_intern(int non_block) ps_list_foreach_del(&g->event_head, t, tn, SL_THD_EVENT_LIST) { /* remove the event from the list and get event info */ - sl_thd_event_dequeue(t, &blocked, &cycles, &thd_timeout); + sl_thd_event_dequeue(t, &e.evt); /* outdated event for a freed thread */ if (t->state == SL_THD_FREE) continue; - sl_mod_execution(sl_mod_thd_policy_get(t), cycles); + sl_mod_execution(sl_mod_thd_policy_get(t), e.evt.elapsed_cycs); - if (blocked) { + if (e.evt.blocked) { sl_thd_state_t state = SL_THD_BLOCKED; cycles_t abs_timeout = 0; - if (likely(cycles)) { - if (thd_timeout) { + if (likely(e.evt.elapsed_cycs)) { + if (e.evt.next_timeout) { state = SL_THD_BLOCKED_TIMEOUT; - abs_timeout = tcap_time2cyc(thd_timeout, sl_now()); + abs_timeout = tcap_time2cyc(e.evt.next_timeout, sl_now()); } sl_thd_sched_block_no_cs(t, state, abs_timeout); } @@ -707,15 +815,17 @@ sl_sched_loop_intern(int non_block) else sl_thd_wakeup_no_cs(t); } +#if NUM_CPU > 1 /* process cross-core requests */ - sl_xcpu_process_no_cs(); + sl_xcore_process_no_cs(); +#endif sl_cs_exit(); } while (pending > 0); if (sl_cs_enter_sched()) continue; /* If switch returns an inconsistency, we retry anyway */ - sl_cs_exit_schedule_nospin(); + sl_cs_exit_schedule_nospin_timeout(0); } } @@ -730,3 +840,36 @@ sl_sched_loop_nonblock(void) { sl_sched_loop_intern(1); } + +void +sl_thd_replenish_no_cs(struct sl_thd *t, cycles_t now) +{ +#ifdef SL_REPLENISH + struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); + tcap_res_t currbudget = 0; + cycles_t replenish; + int ret; + 
+ if (likely(!(t->properties & SL_THD_PROPERTY_OWN_TCAP))) return; + if (!t->budget) return; + assert(t->period); + assert(sl_thd_tcap(t) != sl__globals_core()->sched_tcap); + + if (!(t->last_replenish == 0 || t->last_replenish + t->period <= now)) return; + + replenish = now - ((now - t->last_replenish) % t->period); + + ret = 0; + currbudget = (tcap_res_t)cos_introspect(ci, sl_thd_tcap(t), TCAP_GET_BUDGET); + + if (!cycles_same(currbudget, t->budget, SL_CYCS_DIFF) && currbudget < t->budget) { + tcap_res_t transfer = t->budget - currbudget; + + /* tcap_transfer will assign sched_tcap's prio to t's tcap if t->prio == 0, which we don't want. */ + assert(t->prio >= TCAP_PRIO_MAX && t->prio <= TCAP_PRIO_MIN); + ret = cos_tcap_transfer(sl_thd_rcvcap(t), sl__globals_core()->sched_tcap, transfer, t->prio); + } + + if (likely(ret == 0)) t->last_replenish = replenish; +#endif +} diff --git a/src/components/lib/sl/sl_thd_static_backend.c b/src/components/lib/sl/sl_thd_static_backend.c index 86aa4eac66..2985f8f5e5 100644 --- a/src/components/lib/sl/sl_thd_static_backend.c +++ b/src/components/lib/sl/sl_thd_static_backend.c @@ -17,26 +17,63 @@ static struct cos_aep_info __sl_aep_infos[NUM_CPU][SL_MAX_NUM_THDS]; static u32_t __sl_aep_free_off[NUM_CPU]; /* Default implementations of backend functions */ -struct sl_thd_policy * -sl_thd_alloc_backend(thdid_t tid) +static inline struct sl_thd_policy * +sl_thd_alloc_backend_core(cpuid_t core, thdid_t tid) { - assert(tid < SL_MAX_NUM_THDS); + assert(tid < SL_MAX_NUM_THDS && core >= 0 && core < NUM_CPU); - return &(__sl_threads[cos_cpuid()][tid]); + return &(__sl_threads[core][tid]); } -struct cos_aep_info * -sl_thd_alloc_aep_backend(void) +static inline struct cos_aep_info * +sl_thd_alloc_aep_backend_core(cpuid_t core) { + int off = 0; struct cos_aep_info *aep = NULL; - assert(__sl_aep_free_off[cos_cpuid()] < SL_MAX_NUM_THDS); - aep = &(__sl_aep_infos[cos_cpuid()][__sl_aep_free_off[cos_cpuid()]]); - ps_faa((unsigned long 
*)&(__sl_aep_free_off[cos_cpuid()]), 1); + assert(core < NUM_CPU && core >= 0); + off = ps_faa((unsigned long *)&__sl_aep_free_off[core], 1); + assert(off < SL_MAX_NUM_THDS); + aep = &__sl_aep_infos[core][off]; return aep; } +struct sl_thd_policy * +sl_thd_migrate_backend(struct sl_thd_policy *t, cpuid_t core) +{ + assert(core != cos_cpuid() && core >= 0 && core < NUM_CPU); + + struct cos_aep_info *a = sl_thd_alloc_aep_backend_core(core); + struct cos_aep_info *b = sl_thd_aepinfo(sl_mod_thd_get(t)); + struct sl_thd_policy *tc = sl_thd_alloc_backend_core(core, b->tid); + struct sl_thd *x = sl_mod_thd_get(tc), *y = sl_mod_thd_get(t); + + memset(a, 0, sizeof(struct cos_aep_info)); + a->tid = b->tid; + a->thd = b->thd; + assert(b->rcv == 0 && b->tc == 0); + memset(b, 0, sizeof(struct cos_aep_info)); + + memcpy(tc, t, sizeof(struct sl_thd_policy)); + x->aepinfo = a; + memset(t, 0, sizeof(struct sl_thd_policy)); + + return tc; +} + +struct sl_thd_policy * +sl_thd_alloc_backend(thdid_t tid) +{ + return sl_thd_alloc_backend_core(cos_cpuid(), tid); +} + +struct cos_aep_info * +sl_thd_alloc_aep_backend(void) +{ + return sl_thd_alloc_aep_backend_core(cos_cpuid()); +} + void sl_thd_free_backend(struct sl_thd_policy *t) { } diff --git a/src/components/lib/sl/sl_xcore.c b/src/components/lib/sl/sl_xcore.c new file mode 100644 index 0000000000..b105a18411 --- /dev/null +++ b/src/components/lib/sl/sl_xcore.c @@ -0,0 +1,413 @@ +#include +#include +#include +#include +#include + +/******************************* Client-side ***************************/ + +/* static xcore thread backend! mainly for bookkeeping across cores! 
*/ +static struct sl_xcore_thd _xcore_thds[MAX_NUM_THREADS]; +extern void sl_thd_param_set_no_cs(struct sl_thd *, sched_param_t); + +static inline void +_sl_xcore_response_wait(struct sl_xcore_response *r) +{ + if (sl_thd_curr() != sl__globals_core()->sched_thd) { + if (!ps_load(&r->resp_ready)) sl_thd_block(0); + } else { + while (!ps_load(&r->resp_ready)) { + if (sl_cs_enter_sched()) continue; + sl_cs_exit_schedule_nospin(); + } + } + assert(r->resp_ready); +} + +static inline struct sl_xcore_thd * +_sl_xcore_thd_backend_lookup(thdid_t tid) +{ + return &_xcore_thds[tid]; +} + +static inline struct sl_xcore_thd * +_sl_xcore_thd_backend_init(thdid_t tid, cpuid_t core, asndcap_t snd) +{ + struct sl_xcore_thd *t = _sl_xcore_thd_backend_lookup(tid); + + if (unlikely(t->thd)) return t; + t->thd = tid; + t->core = core; + + return t; +} + +struct sl_xcore_thd * +sl_xcore_thd_lookup_init(thdid_t tid, cpuid_t core) +{ + struct sl_xcore_thd *t = _sl_xcore_thd_backend_lookup(tid); + + /* TODO: is this safe? a wrong coreid can cause DOS! */ + if (unlikely(!(t->thd))) return _sl_xcore_thd_backend_init(tid, core, 0); + + /* perhaps migrated! 
*/ + if (unlikely(t->core != core)) t->core = core; + /* if (unlikely(t->core != core)) return NULL; */ + + return t; +} + +struct sl_xcore_thd * +sl_xcore_thd_lookup(thdid_t tid) +{ + return _sl_xcore_thd_backend_lookup(tid); +} + +#define SL_XCORE_REQ(req, typ, resp) do { \ + req.type = typ; \ + req.client_core = cos_cpuid(); \ + req.client_thd = cos_thdid(); \ + req.response = resp; \ + } while (0) + +#define SL_XCORE_RESP(resp, typ) do { \ + resp.type = typ; \ + resp.resp_ready = 0; \ + } while (0) + +extern struct sl_thd *sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data); + +#define SL_IPI_ENABLE + +static inline int +_sl_xcore_request_enqueue_no_cs(cpuid_t core, struct sl_xcore_request *rq) +{ + int ret = 0; + + if (unlikely(core >= NUM_CPU)) return -1; + if (unlikely(core == cos_cpuid())) return -1; + if (unlikely(!bitmap_check(sl__globals()->core_bmp, core))) return -1; + ret = ck_ring_enqueue_mpsc_xcore(sl__ring(core), sl__ring_buffer(core), rq); + +#ifdef SL_IPI_ENABLE + asndcap_t snd = sl__globals()->xcore_asnd[cos_cpuid()][core]; + assert(snd); + + /* send an IPI for the request */ + cos_asnd(snd, 0); +#endif + + if (unlikely(ret == false)) return -1; + + return 0; +} + +static inline int +_sl_xcore_request_enqueue(cpuid_t core, struct sl_xcore_request *rq) +{ + int ret = 0; + + if (unlikely(core >= NUM_CPU)) return -1; + sl_cs_enter(); + ret = _sl_xcore_request_enqueue_no_cs(core, rq); + sl_cs_exit(); + if (unlikely(ret)) return -1; + + + return 0; +} + +struct sl_xcore_thd * +sl_xcore_thd_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int nparams, sched_param_t params[]) +{ + int ret = 0; + asndcap_t snd = 0; + struct sl_xcore_request req; + struct sl_xcore_response resp; + thdid_t xcore_tid; + + SL_XCORE_REQ(req, SL_XCORE_THD_ALLOC, &resp); + SL_XCORE_RESP(resp, SL_XCORE_THD_ALLOC); + req.sl_xcore_req_thd_alloc.fn = fn; + req.sl_xcore_req_thd_alloc.data = data; + if (nparams) memcpy(req.sl_xcore_req_thd_alloc.params, params, sizeof(sched_param_t) * 
nparams); + req.sl_xcore_req_thd_alloc.param_count = nparams; + + ret = _sl_xcore_request_enqueue(core, &req); + if (unlikely(ret)) return NULL; + + /* Other core will wake this up after creation! */ + _sl_xcore_response_wait(&resp); + xcore_tid = resp.sl_xcore_resp_thd_alloc.tid; + assert(xcore_tid); + + return _sl_xcore_thd_backend_init(xcore_tid, core, 0); +} + +struct sl_xcore_thd * +sl_xcore_thd_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, thdclosure_index_t idx, int nparams, sched_param_t params[]) +{ + return NULL; +} + +struct sl_xcore_thd * +sl_xcore_aep_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]) +{ + return NULL; +} + +struct sl_xcore_thd * +sl_xcore_aep_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, thdclosure_index_t idx, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]) +{ + return NULL; +} + +struct sl_xcore_thd * +sl_xcore_initaep_alloc(cpuid_t core, struct cos_defcompinfo *dci, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]) +{ + return NULL; +} + +struct sl_xcore_thd * +sl_xcore_initaep_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, struct cos_defcompinfo *sched, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]) +{ + return NULL; +} + +void +sl_xcore_thd_param_set(struct sl_xcore_thd *t, sched_param_t param) +{ + struct sl_xcore_request req; + cpuid_t core = sl_xcore_thd_core(t); + + SL_XCORE_REQ(req, SL_XCORE_THD_PARAM_SET, 0); + req.sl_xcore_req_thd_param_set.tid = sl_xcore_thd_thdid(t); + req.sl_xcore_req_thd_param_set.param = param; + + _sl_xcore_request_enqueue(core, &req); +} + +static inline void +_sl_xcore_thd_wakeup_tid_no_cs(thdid_t tid, cpuid_t core) +{ + struct sl_xcore_request req; + + SL_XCORE_REQ(req, SL_XCORE_THD_WAKEUP, 0); + req.sl_xcore_req_thd_wakeup.tid = tid; + _sl_xcore_request_enqueue_no_cs(core, &req); +} + +void +sl_xcore_thd_wakeup(struct sl_xcore_thd *t) 
+{ + struct sl_xcore_request req; + cpuid_t core = sl_xcore_thd_core(t); + + if (unlikely(!t)) return; + + SL_XCORE_REQ(req, SL_XCORE_THD_WAKEUP, 0); + req.sl_xcore_req_thd_wakeup.tid = sl_xcore_thd_thdid(t); + _sl_xcore_request_enqueue(core, &req); +} + +void +sl_xcore_thd_wakeup_tid(thdid_t tid, cpuid_t core) +{ + struct sl_xcore_thd *t = sl_xcore_thd_lookup(tid); + + assert(t->core == core); + + sl_xcore_thd_wakeup(t); +} + +int +sl_xcore_load_balance(void) +{ + struct sl_xcore_request req; + struct sl_xcore_response resp; + struct sl_global *g = sl__globals(); + unsigned max = 0, i, nthds = 0; + int core = -1, ret; + + for (i = 0; i < NUM_CPU; i++) { + if (!bitmap_check(g->core_bmp, i)) continue; + + if (g->nthds_running[i] <= max) continue; + + max = g->nthds_running[i]; + core = i; + break; + } + + if (max == 0 || core == -1) return -1; + + memset(&req, 0, sizeof(req)); + SL_XCORE_REQ(req, SL_XCORE_LOAD_BALANCE, &resp); + SL_XCORE_RESP(resp, SL_XCORE_LOAD_BALANCE); + req.sl_xcore_req_load_balance.nthds = 1; /* FIXME: lets start with just 1 */ + ret = _sl_xcore_request_enqueue((cpuid_t)core, &req); + if (unlikely(ret)) return -1; + + _sl_xcore_response_wait(&resp); + nthds = resp.sl_xcore_resp_load_balance.nthds; + if (!nthds) return 0; + + assert(nthds < SL_XCORE_MIGRATE_MAX); + sl_cs_enter(); + for (i = 0; i < nthds; i++) { + struct sl_thd *t = sl_thd_lkup(resp.sl_xcore_resp_load_balance.tid[i]); + + assert(t); + assert(t->state == SL_THD_RUNNABLE); + sl_mod_wakeup(sl_mod_thd_policy_get(t)); + ps_faa(&(g->nthds_running[cos_cpuid()]), 1); + } + sl_cs_exit(); + + return nthds; +} + +/******************************* Server-side ***************************/ +static inline void +_sl_xcore_respond(struct sl_xcore_request *req) +{ + struct sl_xcore_response *resp = req->response; + + if (!resp) return; + + assert(resp->type == req->type && ps_load(&resp->resp_ready) == 0); + ps_faa(&resp->resp_ready, 1); + _sl_xcore_thd_wakeup_tid_no_cs(req->client_thd, 
req->client_core); +} + +static inline int +_sl_xcore_req_thd_alloc_no_cs(struct sl_xcore_request *req) +{ + cos_thd_fn_t fn = req->sl_xcore_req_thd_alloc.fn; + void *data = req->sl_xcore_req_thd_alloc.data; + struct sl_thd *t; + struct sl_xcore_response *x = req->response; + int i; + + assert(fn); + + t = sl_thd_alloc_no_cs(fn, data); + assert(t); + if (likely(x)) x->sl_xcore_resp_thd_alloc.tid = sl_thd_thdid(t); + for (i = 0; i < req->sl_xcore_req_thd_alloc.param_count; i++) sl_thd_param_set_no_cs(t, req->sl_xcore_req_thd_alloc.params[i]); + + return 0; +} + +static inline int +_sl_xcore_req_thd_param_set_no_cs(struct sl_xcore_request *req) +{ + struct sl_thd *t = sl_thd_lkup(req->sl_xcore_req_thd_param_set.tid); + + if (!t) return -1; + sl_thd_param_set_no_cs(t, req->sl_xcore_req_thd_param_set.param); + + return 0; +} + +static inline int +_sl_xcore_req_thd_wakeup_no_cs(struct sl_xcore_request *req) +{ + struct sl_thd *t = sl_thd_lkup(req->sl_xcore_req_thd_param_set.tid); + + if (!t) return -1; + if (unlikely(t == sl__globals_core()->sched_thd)) return 0; + sl_thd_wakeup_no_cs(t); + + return 0; +} + +static inline void +_sl_xcore_req_load_balance_no_cs(struct sl_xcore_request *req) +{ + struct sl_global *g = sl__globals(); + int n = g->nthds_running[cos_cpuid()], i, j = 0; + struct sl_xcore_response *rp = req->response; + cpuid_t cl_core = req->client_core; + + if (n <= SL_XCORE_KEEP_MIN) return; + n -= SL_XCORE_KEEP_MIN; + + if (n > SL_XCORE_MIGRATE_MAX) n = SL_XCORE_MIGRATE_MAX; + if (n > req->sl_xcore_req_load_balance.nthds) n = req->sl_xcore_req_load_balance.nthds; + + assert(rp); + for (i = 0; i < n; i++) { + struct sl_thd_policy *t = sl_mod_last_schedule(); + thdid_t tid = 0; + struct sl_xcore_thd *xt = NULL; + + if (!t) break; + tid = sl_thd_thdid(sl_mod_thd_get(t)); + xt = sl_xcore_thd_lookup(tid); + assert(xt); + if (xt->thd == tid) assert(xt->core == cos_cpuid()); + if (sl_thd_migrate_no_cs(sl_mod_thd_get(t), cl_core)) break; + 
sl_xcore_thd_lookup_init(tid, cl_core); + rp->sl_xcore_resp_load_balance.tid[i] = tid; + } + rp->sl_xcore_resp_load_balance.nthds = i; + + return; +} + +int +sl_xcore_process_no_cs(void) +{ + int num = 0; + struct sl_xcore_request xcore_req; + + if (likely(NUM_CPU < 2)) return 0; + + while (ck_ring_dequeue_mpsc_xcore(sl__ring_curr(), sl__ring_buffer_curr(), &xcore_req) == true) { + assert(xcore_req.client_core != cos_cpuid()); + + switch(xcore_req.type) { + case SL_XCORE_THD_ALLOC: + { + _sl_xcore_req_thd_alloc_no_cs(&xcore_req); + break; + } + case SL_XCORE_THD_ALLOC_EXT: + case SL_XCORE_AEP_ALLOC: + case SL_XCORE_AEP_ALLOC_EXT: + case SL_XCORE_INITAEP_ALLOC: + case SL_XCORE_THD_DEALLOC: + { + PRINTC("Unimplemented request! Aborting!\n"); + assert(0); + + break; + } + case SL_XCORE_THD_PARAM_SET: + { + _sl_xcore_req_thd_param_set_no_cs(&xcore_req); + break; + } + case SL_XCORE_THD_WAKEUP: + { + _sl_xcore_req_thd_wakeup_no_cs(&xcore_req); + break; + } + case SL_XCORE_LOAD_BALANCE: + { + _sl_xcore_req_load_balance_no_cs(&xcore_req); + break; + } + default: + { + PRINTC("Unrecognized request! Aborting!\n"); + assert(0); + } + } + _sl_xcore_respond(&xcore_req); + num ++; + } + + return num; /* number of requests processed */ +} diff --git a/src/components/lib/sl/sl_xcpu.c b/src/components/lib/sl/sl_xcpu.c deleted file mode 100644 index 7afcef766e..0000000000 --- a/src/components/lib/sl/sl_xcpu.c +++ /dev/null @@ -1,130 +0,0 @@ -/** - * Redistribution of this file is permitted under the BSD two clause license. 
- * - * Copyright 2018, The George Washington University - * Author: Phani Gadepalli, phanikishoreg@gwu.edu - */ - -#include -#include -#include -#include -#include - -#define SL_REQ_THD_ALLOC(req, fn, data) do { \ - req.type = SL_XCPU_THD_ALLOC; \ - req.client = cos_cpuid(); \ - req.req_response = 0; \ - req.sl_xcpu_req_thd_alloc.fn = fn; \ - req.sl_xcpu_req_thd_alloc.data = data; \ - } while (0) - -extern struct sl_thd *sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data); - -int -sl_xcpu_thd_alloc(cpuid_t cpu, cos_thd_fn_t fn, void *data, sched_param_t params[]) -{ - int i, sz = sizeof(params) / sizeof(params[0]); - int ret = 0; - asndcap_t snd = 0; - struct sl_xcpu_request req; - - if (cpu == cos_cpuid()) return -EINVAL; - if (!bitmap_check(sl__globals()->cpu_bmp, cpu)) return -EINVAL; - - sl_cs_enter(); - - SL_REQ_THD_ALLOC(req, fn, data); - memcpy(req.params, params, sizeof(sched_param_t) * sz); - req.param_count = sz; - if (ck_ring_enqueue_mpsc_xcpu(sl__ring(cpu), sl__ring_buffer(cpu), &req) != true) { - ret = -ENOMEM; - } else { - snd = sl__globals()->xcpu_asnd[cos_cpuid()][cpu]; - assert(snd); - } - - sl_cs_exit(); - - if (!snd || ret) goto done; - - /* send an IPI for the request */ - ret = cos_asnd(snd, 1); - -done: - return ret; -} - -int -sl_xcpu_thd_alloc_ext(cpuid_t cpu, struct cos_defcompinfo *dci, thdclosure_index_t idx, sched_param_t params[]) -{ - return -ENOTSUP; -} - -int -sl_xcpu_aep_alloc(cpuid_t cpu, cos_thd_fn_t fn, void *data, int own_tcap, cos_channelkey_t key, sched_param_t params[]) -{ - return -ENOTSUP; -} - -int -sl_xcpu_aep_alloc_ext(cpuid_t cpu, struct cos_defcompinfo *dci, thdclosure_index_t idx, int own_tcap, cos_channelkey_t key, sched_param_t params[]) -{ - return -ENOTSUP; -} - -int -sl_xcpu_initaep_alloc(cpuid_t cpu, struct cos_defcompinfo *dci, int own_tcap, cos_channelkey_t key, sched_param_t params[]) -{ - return -ENOTSUP; -} - -int -sl_xcpu_initaep_alloc_ext(cpuid_t cpu, struct cos_defcompinfo *dci, struct cos_defcompinfo 
*sched, int own_tcap, cos_channelkey_t key, sched_param_t params[]) -{ - return -ENOTSUP; -} - -int -sl_xcpu_process_no_cs(void) -{ - int num = 0; - struct sl_xcpu_request xcpu_req; - - while (ck_ring_dequeue_mpsc_xcpu(sl__ring_curr(), sl__ring_buffer_curr(), &xcpu_req) == true) { - - assert(xcpu_req.client != cos_cpuid()); - switch(xcpu_req.type) { - case SL_XCPU_THD_ALLOC: - { - cos_thd_fn_t fn = xcpu_req.sl_xcpu_req_thd_alloc.fn; - void *data = xcpu_req.sl_xcpu_req_thd_alloc.data; - struct sl_thd *t; - int i; - - assert(fn); - - t = sl_thd_alloc_no_cs(fn, data); - assert(t); - for (i = 0; i < xcpu_req.param_count; i++) { - sl_thd_param_set(t, xcpu_req.params[i]); - } - - break; - } - case SL_XCPU_THD_ALLOC_EXT: - case SL_XCPU_AEP_ALLOC: - case SL_XCPU_AEP_ALLOC_EXT: - case SL_XCPU_INITAEP_ALLOC: - case SL_XCPU_THD_DEALLOC: - default: - { - PRINTC("Unimplemented request! Aborting!\n"); - assert(0); - } - } - num ++; - } - - return num; /* number of requests processed */ -} diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 0a294301fb..abbdc67bb2 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -15,6 +15,8 @@ #include "include/tcap.h" #include "include/chal/defs.h" #include "include/hw.h" +#include "include/scb.h" +#include "include/dcb.h" #define COS_DEFAULT_RET_CAP 0 @@ -82,6 +84,76 @@ printfn(struct pt_regs *regs) return 0; } +/* TODO: inline fast path and force non-inlined slow-path */ +static inline struct thread * +cap_ulthd_lazyupdate(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, int interrupt, struct comp_info **ci_ptr) +{ + struct thread *thd = thd_current(cos_info); + struct cap_thd *ch_ult = NULL; + struct thread *ulthd = NULL; + capid_t ultc = 0; + struct cos_scb_info *scb_core = NULL; /* per-core scb_info */ + + *ci_ptr = thd_invstk_current_compinfo(thd, cos_info); + + assert(*ci_ptr && (*ci_ptr)->captbl); + + if (unlikely(!(*ci_ptr)->scb_data)) goto done; + scb_core = (((*ci_ptr)->scb_data) + get_cpuid()); + ultc = 
scb_core->curr_thd; + /* reset inconsistency from user-level thd! */ + scb_core->curr_thd = 0; + if (!ultc && !interrupt) goto done; + + if (likely(ultc)) { + ch_ult = (struct cap_thd *)captbl_lkup((*ci_ptr)->captbl, ultc); + if (unlikely(!CAP_TYPECHK_CORE(ch_ult, CAP_THD))) ch_ult = NULL; + else ulthd = ch_ult->t; + } + if (unlikely(!ultc || !ulthd || ulthd->dcbinfo == NULL)) goto done; + if (ulthd == thd) goto done; + + thd_current_update(ulthd, thd, cos_info); + thd = ulthd; + *ci_ptr = thd_invstk_current_compinfo(thd, cos_info); + +done: + return thd; +} + +void +cos_cap_ipi_handling(void) +{ + int idx, end; + struct IPI_receiving_rings *receiver_rings; + struct xcore_ring * ring; + + receiver_rings = &IPI_cap_dest[get_cpuid()]; + + /* We need to scan the entire buffer once. */ + idx = receiver_rings->start; + end = receiver_rings->start - 1; // end is int type. could be -1. + receiver_rings->start = (receiver_rings->start + 1) % NUM_CPU; + + /* scan the first half */ + for (; idx < NUM_CPU; idx++) { + ring = &receiver_rings->IPI_source[idx]; + if (ring->sender != ring->receiver) { + process_ring(ring); + } + } + + /* and scan the second half */ + for (idx = 0; idx <= end; idx++) { + ring = &receiver_rings->IPI_source[idx]; + if (ring->sender != ring->receiver) { + process_ring(ring); + } + } + + return; +} + static void kmem_unalloc(unsigned long *pte) { @@ -287,6 +359,8 @@ cap_cpy(struct captbl *t, capid_t cap_to, capid_t capin_to, capid_t cap_from, ca type = ctfrom->type; sz = __captbl_cap2bytes(type); + /* don't allow cap copy on SCB/DCB */ + if (type == CAP_SCB || type == CAP_DCB) return -EINVAL; ctto = __cap_capactivate_pre(t, cap_to, capin_to, type, &ret); if (!ctto) return -EINVAL; @@ -435,7 +509,7 @@ cap_thd_switch(struct pt_regs *regs, struct thread *curr, struct thread *next, s preempt = thd_switch_update(next, &next->regs, 0); /* if switching to the preempted/awoken thread clear cpu local next_thdinfo */ - if (nti->thd && nti->thd == next) 
thd_next_thdinfo_update(cos_info, 0, 0, 0, 0); + //if (nti->thd && nti->thd == next) thd_next_thdinfo_update(cos_info, 0, 0, 0, 0); copy_all_regs(&next->regs, regs); @@ -447,7 +521,9 @@ notify_parent(struct thread *rcv_thd, int send) { struct thread *curr_notif = NULL, *prev_notif = NULL, *arcv_notif = NULL; int depth = 0; + cycles_t now; + rdtscll(now); /* hierarchical notifications - upto init (bounded by ARCV_NOTIF_DEPTH) */ prev_notif = rcv_thd; curr_notif = arcv_notif = arcv_thd_notif(prev_notif); @@ -455,6 +531,7 @@ notify_parent(struct thread *rcv_thd, int send) while (curr_notif && curr_notif != prev_notif) { assert(depth < ARCV_NOTIF_DEPTH); + prev_notif->event_epoch = now; thd_rcvcap_evt_enqueue(curr_notif, prev_notif); if (!(curr_notif->state & THD_STATE_RCVING)) break; @@ -500,7 +577,7 @@ asnd_process(struct thread *rcv_thd, struct thread *thd, struct tcap *rcv_tcap, { struct thread *next; - thd_rcvcap_pending_inc(rcv_thd); + thd_rcvcap_pending_set(rcv_thd); next = notify_process(rcv_thd, thd, rcv_tcap, tcap, tcap_next, yield); /* @@ -586,11 +663,19 @@ cap_switch(struct pt_regs *regs, struct thread *curr, struct thread *next, struc static int cap_sched_tok_validate(struct thread *rcvt, sched_tok_t usr_tok, struct comp_info *ci, struct cos_cpu_local_info *cos_info) { + struct cos_scb_info *scb_core = ci->scb_data + get_cpuid(); + assert(rcvt && usr_tok < ~0U); - /* race-condition check for user-level thread switches */ - if (thd_rcvcap_get_counter(rcvt) > usr_tok) return -EAGAIN; - thd_rcvcap_set_counter(rcvt, usr_tok); + /* + * Kernel increments the sched_tok on preemption only. + * The rest is all co-operative, so if sched_tok in scb page + * increments after someone fetching a tok, then check for that! + * + * FIXME: make sure we're checking the scb of the scheduling component and not in any other component. + * I don't know if the comp_info here is of the scheduling component! 
+ */ + if (unlikely(scb_core->sched_tok != usr_tok)) return -EAGAIN; return 0; } @@ -624,7 +709,9 @@ cap_thd_op(struct cap_thd *thd_cap, struct thread *thd, struct pt_regs *regs, st ret = cap_sched_tok_validate(rcvt, usr_counter, ci, cos_info); if (ret) return ret; - if (thd_rcvcap_pending(rcvt) > 0) { + /* only if it has scheduler events to process! */ + if (thd_rcvcap_evt_pending(rcvt)) { + printk("%s:%d\n", __func__, __LINE__); if (thd == rcvt) return -EBUSY; next = rcvt; @@ -650,7 +737,7 @@ cap_thd_op(struct cap_thd *thd_cap, struct thread *thd, struct pt_regs *regs, st } ret = cap_switch(regs, thd, next, tcap, timeout, ci, cos_info); - if (tc && tcap_current(cos_info) == tcap) tcap_setprio(tcap, prio); + if (tc && tcap_current(cos_info) == tcap && prio) tcap_setprio(tcap, prio); return ret; } @@ -680,13 +767,11 @@ cap_ipi_process(struct pt_regs *regs) struct tcap *tcap_curr, *tcap_next; struct comp_info *ci; int i, scan_base; - unsigned long ip, sp; - thd_curr = thd_next = thd_current(cos_info); + thd_next = thd_curr = cap_ulthd_lazyupdate(regs, cos_info, 1, &ci); + assert(ci && ci->captbl); receiver_rings = &IPI_cap_dest[get_cpuid()]; tcap_curr = tcap_next = tcap_current(cos_info); - ci = thd_invstk_current(thd_curr, &ip, &sp, cos_info); - assert(ci && ci->captbl); scan_base = receiver_rings->start; receiver_rings->start = (receiver_rings->start + 1) % NUM_CPU; @@ -767,7 +852,8 @@ cap_asnd_op(struct cap_asnd *asnd, struct thread *thd, struct pt_regs *regs, str ret = cap_sched_tok_validate(rcvt, usr_tok, ci, cos_info); if (ret) return ret; - if (thd_rcvcap_pending(rcvt) > 0) { + /* only if the rcvt has scheduler events to process */ + if (thd_rcvcap_evt_pending(rcvt)) { if (thd == rcvt) return -EBUSY; next = rcvt; @@ -794,12 +880,11 @@ int cap_hw_asnd(struct cap_asnd *asnd, struct pt_regs *regs) { int curr_cpu = get_cpuid(); - struct cap_arcv * arcv; + struct cap_arcv *arcv; struct cos_cpu_local_info *cos_info; - struct thread * rcv_thd, *next, *thd; - struct 
tcap * rcv_tcap, *tcap, *tcap_next; - struct comp_info * ci; - unsigned long ip, sp; + struct thread *rcv_thd, *next, *thd; + struct tcap *rcv_tcap, *tcap, *tcap_next; + struct comp_info *ci; if (!CAP_TYPECHK(asnd, CAP_ASND)) return 1; assert(asnd->arcv_capid); @@ -815,12 +900,10 @@ cap_hw_asnd(struct cap_asnd *asnd, struct pt_regs *regs) cos_info = cos_cpu_local_info(); assert(cos_info); - thd = thd_current(cos_info); - tcap = tcap_current(cos_info); - assert(thd); - ci = thd_invstk_current(thd, &ip, &sp, cos_info); - assert(ci && ci->captbl); + thd = cap_ulthd_lazyupdate(regs, cos_info, 1, &ci); + assert(thd && ci && ci->captbl); assert(!(thd->state & THD_STATE_PREEMPTED)); + tcap = tcap_current(cos_info); rcv_thd = arcv->thd; rcv_tcap = rcv_thd->rcvcap.rcvcap_tcap; assert(rcv_tcap && tcap); @@ -829,7 +912,9 @@ cap_hw_asnd(struct cap_asnd *asnd, struct pt_regs *regs) if (next == thd) return 1; thd->state |= THD_STATE_PREEMPTED; - return cap_switch(regs, thd, next, tcap_next, TCAP_TIME_NIL, ci, cos_info); + /* don't disable timer if we're not switching to a diff tcap.. */ + /* TODO: hierarchical timeouts */ + return cap_switch(regs, thd, next, tcap_next, tcap == tcap_next ? 
tcap_cyc2time(cos_info->next_timer) : TCAP_TIME_NIL, ci, cos_info); } int @@ -863,16 +948,13 @@ int timer_process(struct pt_regs *regs) { struct cos_cpu_local_info *cos_info; - struct thread * thd_curr; - struct comp_info * comp; - unsigned long ip, sp; - cycles_t now; + struct thread *thd_curr; + struct comp_info *comp = NULL; cos_info = cos_cpu_local_info(); assert(cos_info); - thd_curr = thd_current(cos_info); + thd_curr = cap_ulthd_lazyupdate(regs, cos_info, 1, &comp); assert(thd_curr && thd_curr->cpuid == get_cpuid()); - comp = thd_invstk_current(thd_curr, &ip, &sp, cos_info); assert(comp); return expended_process(regs, thd_curr, comp, cos_info, 1); @@ -887,21 +969,25 @@ cap_arcv_op(struct cap_arcv *arcv, struct thread *thd, struct pt_regs *regs, str struct next_thdinfo *nti = &cos_info->next_ti; rcv_flags_t rflags = __userregs_get1(regs); tcap_time_t swtimeout = TCAP_TIME_NIL; - tcap_time_t timeout = __userregs_get2(regs); - int all_pending = (!!(rflags & RCV_ALL_PENDING)); + tcap_time_t timeout = TCAP_TIME_NIL, x = __userregs_get2(regs); + if (likely(rflags & RCV_SCHEDTIMEOUT)) swtimeout = x; + else timeout = x; if (unlikely(arcv->thd != thd || arcv->cpuid != get_cpuid())) return -EINVAL; /* deliver pending notifications? 
*/ if (thd_rcvcap_pending(thd)) { __userregs_set(regs, 0, __userregs_getsp(regs), __userregs_getip(regs)); - thd_rcvcap_all_pending_set(thd, all_pending); thd_rcvcap_pending_deliver(thd, regs); + /* for sched_rcv enabling user-level switch */ + //if (thd->dcbinfo) thd->dcbinfo->sp = 0; return 0; } else if (rflags & RCV_NON_BLOCKING) { __userregs_set(regs, 0, __userregs_getsp(regs), __userregs_getip(regs)); __userregs_setretvals(regs, -EAGAIN, 0, 0, 0); + /* for sched_rcv enabling user-level switch */ + //if (thd->dcbinfo) thd->dcbinfo->sp = 0; return 0; } @@ -912,20 +998,20 @@ cap_arcv_op(struct cap_arcv *arcv, struct thread *thd, struct pt_regs *regs, str if (unlikely(tc_next != thd_rcvcap_tcap(thd))) tc_next = thd_rcvcap_tcap(thd); /* if preempted/awoken thread is waiting, switch to that */ - if (nti->thd) { - assert(nti->tc); - - next = nti->thd; - tc_next = nti->tc; - tcap_setprio(nti->tc, nti->prio); - if (nti->budget) { - /* convert budget to timeout */ - cycles_t now; - rdtscll(now); - swtimeout = tcap_cyc2time(now + nti->budget); - } - thd_next_thdinfo_update(cos_info, 0, 0, 0, 0); - } + //if (nti->thd) { + // assert(nti->tc); + + // next = nti->thd; + // tc_next = nti->tc; + // tcap_setprio(nti->tc, nti->prio); + // if (nti->budget) { + // /* convert budget to timeout */ + // cycles_t now; + // rdtscll(now); + // swtimeout = tcap_cyc2time(now + nti->budget); + // } + // thd_next_thdinfo_update(cos_info, 0, 0, 0, 0); + //} /* FIXME: for now, lets just ignore this path...need to plumb tcaps into it */ thd->interrupted_thread = NULL; @@ -939,8 +1025,10 @@ cap_arcv_op(struct cap_arcv *arcv, struct thread *thd, struct pt_regs *regs, str if (likely(thd != next)) { assert(!(thd->state & THD_STATE_PREEMPTED)); thd->state |= THD_STATE_RCVING; - thd_rcvcap_all_pending_set(thd, all_pending); thd->timeout = timeout; + } else { + /* switching back to the thread.. 
don't disable timers..*/ + swtimeout = timeout; } return cap_switch(regs, thd, next, tc_next, swtimeout, ci, cos_info); @@ -960,6 +1048,8 @@ cap_introspect(struct captbl *ct, capid_t capid, u32_t op, unsigned long *retval return tcap_introspect(((struct cap_tcap *)ch)->tcap, op, retval); case CAP_ARCV: return arcv_introspect(((struct cap_arcv *)ch), op, retval); + case CAP_COMP: + return comp_introspect(((struct cap_comp *)ch), op, retval); default: return -EINVAL; } @@ -967,6 +1057,13 @@ cap_introspect(struct captbl *ct, capid_t capid, u32_t op, unsigned long *retval #define ENABLE_KERNEL_PRINT +#define cos_thd_throw(label, thd, errno) \ + { \ + ret = (errno); \ + if (unlikely(thd->dcbinfo)) thd->dcbinfo->sp = 0; \ + goto label; \ + } + static int composite_syscall_slowpath(struct pt_regs *regs, int *thd_switch); COS_SYSCALL __attribute__((section("__ipc_entry"))) int @@ -976,7 +1073,6 @@ composite_syscall_handler(struct pt_regs *regs) struct comp_info * ci; struct thread * thd; capid_t cap; - unsigned long ip, sp; /* * We lookup this struct (which is on stack) only once, and @@ -986,8 +1082,10 @@ composite_syscall_handler(struct pt_regs *regs) int ret = -ENOENT; int thd_switch = 0; + /* Definitely do it for all the fast-path calls. */ + thd = cap_ulthd_lazyupdate(regs, cos_info, 0, &ci); + assert(thd); cap = __userregs_getcap(regs); - thd = thd_current(cos_info); /* printk("thd %d calling cap %d (ip %x, sp %x), operation %d: %x, %x, %x, %x\n", thd->tid, cap, * __userregs_getip(regs), __userregs_getsp(regs), __userregs_getop(regs), @@ -1007,14 +1105,12 @@ composite_syscall_handler(struct pt_regs *regs) return 0; } - ci = thd_invstk_current(thd, &ip, &sp, cos_info); - assert(ci && ci->captbl); - /* * We don't check the liveness of the current component * because it's guaranteed by component quiescence period, * which is at timer tick granularity. 
*/ + assert(ci && ci->captbl); ch = captbl_lkup(ci->captbl, cap); if (unlikely(!ch)) { printk("cos: cap %d not found!\n", (int)cap); @@ -1033,7 +1129,8 @@ composite_syscall_handler(struct pt_regs *regs) switch (ch->type) { case CAP_THD: ret = cap_thd_op((struct cap_thd *)ch, thd, regs, ci, cos_info); - if (ret < 0) cos_throw(done, ret); + //printk("[%d]\n", ret); + if (ret < 0) cos_thd_throw(done, thd, ret); return ret; case CAP_ASND: ret = cap_asnd_op((struct cap_asnd *)ch, thd, regs, ci, cos_info); @@ -1041,7 +1138,7 @@ composite_syscall_handler(struct pt_regs *regs) return ret; case CAP_ARCV: ret = cap_arcv_op((struct cap_arcv *)ch, thd, regs, ci, cos_info); - if (ret < 0) cos_throw(done, ret); + if (ret < 0) cos_thd_throw(done, thd, ret); return ret; default: break; @@ -1212,22 +1309,38 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * break; } case CAPTBL_OP_THDACTIVATE: { - thdclosure_index_t init_data = __userregs_get1(regs) >> 16; - capid_t thd_cap = __userregs_get1(regs) & 0xFFFF; - capid_t pgtbl_cap = __userregs_get2(regs); - capid_t pgtbl_addr = __userregs_get3(regs); - capid_t compcap = __userregs_get4(regs); - - struct thread *thd; - unsigned long *pte = NULL; - - ret = cap_kmem_activate(ct, pgtbl_cap, pgtbl_addr, (unsigned long *)&thd, &pte); + u32_t reg3 = __userregs_get3(regs); + u32_t reg4 = __userregs_get4(regs); + capid_t pgtbl_addr = __userregs_get2(regs); + thdclosure_index_t init_data = (reg4 << 16) >> 16; + capid_t thd_cap = (capin >> 16); + capid_t pgtbl_cap = (capin << 16) >> 16; + capid_t compcap = (reg3 >> 16); + capid_t dcb_cap = (reg3 << 16) >> 16; + unsigned short dcboff = reg4 >> 16; + unsigned long *tpte = NULL, flags; + struct thread *thd; + struct cap_header *ctfrom; + + ret = cap_kmem_activate(ct, pgtbl_cap, pgtbl_addr, (unsigned long *)&thd, &tpte); if (unlikely(ret)) cos_throw(err, ret); - assert(thd && pte); + assert(thd && tpte); /* ret is returned by the overall function */ - ret = 
thd_activate(ct, cap, thd_cap, thd, compcap, init_data); - if (ret) kmem_unalloc(pte); + ret = thd_activate(ct, cap, thd_cap, thd, compcap, init_data, dcb_cap, dcboff); + if (ret) kmem_unalloc(tpte); + + break; + } + case CAPTBL_OP_THDMIGRATE: { + u32_t reg2 = __userregs_get2(regs); + u32_t reg3 = __userregs_get3(regs); + + if (reg3) { + ret = thd_migrate_cap(ct, capin); + } else { + ret = thd_migrate(ct, capin, reg2); + } break; } @@ -1249,7 +1362,7 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * case CAPTBL_OP_THDDEACTIVATE: { livenessid_t lid = __userregs_get2(regs); - ret = thd_deactivate(ct, op_cap, capin, lid, 0, 0, 0); + ret = thd_deactivate(ct, op_cap, capin, lid, 0, 0, 0, 0); break; } case CAPTBL_OP_THDTLSSET: { @@ -1265,7 +1378,7 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * capid_t pgtbl_cap = __userregs_get3(regs); capid_t cosframe_addr = __userregs_get4(regs); - ret = thd_deactivate(ct, op_cap, capin, lid, pgtbl_cap, cosframe_addr, 1); + ret = thd_deactivate(ct, op_cap, capin, lid, pgtbl_cap, cosframe_addr, 0, 1); break; } case CAPTBL_OP_CAPKMEM_FREEZE: { @@ -1277,10 +1390,13 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * case CAPTBL_OP_COMPACTIVATE: { capid_t captbl_cap = __userregs_get2(regs) >> 16; capid_t pgtbl_cap = __userregs_get2(regs) & 0xFFFF; - livenessid_t lid = __userregs_get3(regs); + livenessid_t lid = capin >> 16; + capid_t comp_cap = (capin << 16) >> 16; + vaddr_t scb_uaddr = __userregs_get3(regs) & (~0 << 12); vaddr_t entry_addr = __userregs_get4(regs); + capid_t scb_cap = __userregs_get3(regs) & ((1 << 12) - 1); - ret = comp_activate(ct, cap, capin, captbl_cap, pgtbl_cap, lid, entry_addr, NULL); + ret = comp_activate(ct, cap, comp_cap, captbl_cap, pgtbl_cap, scb_cap, lid, entry_addr, scb_uaddr); break; } case CAPTBL_OP_COMPDEACTIVATE: { @@ -1389,6 +1505,65 @@ static int __attribute__((noinline)) 
composite_syscall_slowpath(struct pt_regs * ret = hw_deactivate(op_cap, capin, lid); break; } + case CAPTBL_OP_SCB_ACTIVATE: { + capid_t ptcap = __userregs_get2(regs); + livenessid_t lid = __userregs_get4(regs); + vaddr_t addr = __userregs_get3(regs); + unsigned long *pte; + struct cos_scb_info *scb; + + ret = cap_kmem_activate(ct, ptcap, addr, (unsigned long *)&scb, &pte); + if (ret) cos_throw(err, ret); + + ret = scb_activate(ct, cap, capin, (vaddr_t)scb, lid); + + break; + } + case CAPTBL_OP_SCB_DEACTIVATE: { + u32_t r2 = __userregs_get2(regs); + livenessid_t lid = r2 >> 16; + capid_t ptcap = (r2 << 16) >> 16; + capid_t cf_addr = __userregs_get3(regs); + + ret = scb_deactivate(op_cap, capin, ptcap, cf_addr, lid); + + break; + } + case CAPTBL_OP_DCB_ACTIVATE: { + u32_t r1 = __userregs_get1(regs); + u32_t r2 = __userregs_get2(regs); + u32_t r3 = __userregs_get3(regs); + u32_t r4 = __userregs_get4(regs); + capid_t dcbcap = r1 >> 16; + capid_t ptcap = r2 >> 16; + livenessid_t lid = (r1 << 16) >> 16; + capid_t ptcapin = (r2 << 16) >> 16; + vaddr_t kaddr = r3; + vaddr_t uaddrin = r4; + struct cos_dcb_info *dcb; + unsigned long *pte; + + ret = cap_kmem_activate(ct, ptcap, kaddr, (unsigned long *)&dcb, &pte); + if (ret) cos_throw(err, ret); + + ret = dcb_activate(ct, cap, dcbcap, (vaddr_t)dcb, lid, ptcapin, uaddrin); + + break; + } + case CAPTBL_OP_DCB_DEACTIVATE: { + u32_t r2 = __userregs_get2(regs); + u32_t r3 = __userregs_get3(regs); + u32_t r4 = __userregs_get4(regs); + livenessid_t lid = r2 >> 16; + capid_t ptcap = (r2 << 16) >> 16; + vaddr_t cf_addr = r3 & (~0 << 12); + vaddr_t uaddrin = r4 & (~0 << 12); + capid_t ptcapin = (r4 << 20) >> 12 | ((r3 << 20) >> 20); + + ret = dcb_deactivate(op_cap, capin, lid, ptcap, cf_addr, ptcapin, uaddrin); + + break; + } default: goto err; } @@ -1645,17 +1820,28 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * struct cap_arcv *rcvc; hwid_t hwid = __userregs_get1(regs); capid_t rcvcap = 
__userregs_get2(regs); + u32_t period = __userregs_get3(regs); rcvc = (struct cap_arcv *)captbl_lkup(ci->captbl, rcvcap); if (!CAP_TYPECHK(rcvc, CAP_ARCV)) cos_throw(err, -EINVAL); ret = hw_attach_rcvcap((struct cap_hw *)ch, hwid, rcvc, rcvcap); + if (!ret) { + if (hwid == HW_HPET_PERIODIC || hwid == HW_HPET_ONESHOT) chal_hpet_periodic_set(hwid, period); + ret = chal_irq_enable(hwid, get_cpuid()); + } + break; } case CAPTBL_OP_HW_DETACH: { hwid_t hwid = __userregs_get1(regs); ret = hw_detach_rcvcap((struct cap_hw *)ch, hwid); + if (!ret) { + if (hwid == HW_HPET_PERIODIC || hwid == HW_HPET_ONESHOT) chal_hpet_disable(hwid); + ret = chal_irq_disable(hwid, get_cpuid()); + } + break; } case CAPTBL_OP_HW_MAP: { diff --git a/src/kernel/include/captbl.h b/src/kernel/include/captbl.h index 102fe147d3..7530b06796 100644 --- a/src/kernel/include/captbl.h +++ b/src/kernel/include/captbl.h @@ -51,7 +51,7 @@ typedef enum { #define CAP_HEAD_AMAP_SZ 4 #define CAP_HEAD_SZ_SZ 2 #define CAP_HEAD_FLAGS_SZ 3 -#define CAP_HEAD_TYPE_SZ 7 +#define CAP_HEAD_TYPE_SZ CAP_TYPE_MAXBITS /* * This is the header for each capability. Includes information about diff --git a/src/kernel/include/chal.h b/src/kernel/include/chal.h index 2caa7dd0ca..b7a4683587 100644 --- a/src/kernel/include/chal.h +++ b/src/kernel/include/chal.h @@ -94,6 +94,12 @@ void chal_send_ipi(int cpu_id); void chal_idle(void); void chal_timer_set(cycles_t cycles); void chal_timer_disable(void); +void chal_hpet_periodic_set(hwid_t, unsigned long); +void chal_hpet_disable(hwid_t); +cycles_t chal_hpet_first_period(void); + +int chal_irq_disable(int irqline, cpuid_t cpu_id); +int chal_irq_enable(int irqline, cpuid_t cpu_id); void chal_init(void); @@ -104,6 +110,8 @@ void chal_init(void); #include "../../platform/include/chal_plat.h" +#define PRINTK(format, ...) 
printk("(CPU%ld:) " format, get_cpuid(), ## __VA_ARGS__) + extern void printk(const char *fmt, ...); void chal_khalt(void); diff --git a/src/kernel/include/component.h b/src/kernel/include/component.h index c837cf22fa..79cfbd5546 100644 --- a/src/kernel/include/component.h +++ b/src/kernel/include/component.h @@ -12,36 +12,44 @@ #include "captbl.h" #include "pgtbl.h" #include "cap_ops.h" +#include "shared/cos_sched.h" struct comp_info { struct liveness_data liveness; pgtbl_t pgtbl; - struct captbl * captbl; - struct cos_sched_data_area *comp_nfo; + struct captbl *captbl; + struct cos_scb_info *scb_data; } __attribute__((packed)); struct cap_comp { struct cap_header h; vaddr_t entry_addr; - struct cap_pgtbl * pgd; + struct cap_pgtbl *pgd; struct cap_captbl *ct_top; struct comp_info info; } __attribute__((packed)); +#include "scb.h" + static int -comp_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t captbl_cap, capid_t pgtbl_cap, livenessid_t lid, - vaddr_t entry_addr, struct cos_sched_data_area *sa) +comp_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t captbl_cap, capid_t pgtbl_cap, capid_t scbcap, + livenessid_t lid, vaddr_t entry_addr, vaddr_t scb_uaddr) { - struct cap_comp * compc; - struct cap_pgtbl * ptc; + struct cap_comp *compc; + struct cap_pgtbl *ptc; struct cap_captbl *ctc; - u32_t v; + u32_t v, flags; int ret = 0; + struct cap_scb *scbc = NULL; ctc = (struct cap_captbl *)captbl_lkup(t, captbl_cap); if (unlikely(!ctc || ctc->h.type != CAP_CAPTBL || ctc->lvl > 0)) return -EINVAL; ptc = (struct cap_pgtbl *)captbl_lkup(t, pgtbl_cap); if (unlikely(!ptc || ptc->h.type != CAP_PGTBL || ptc->lvl > 0)) return -EINVAL; + if (likely(scbcap)) { + scbc = (struct cap_scb *)captbl_lkup(t, scbcap); + if (unlikely(!scbc || scbc->h.type != CAP_SCB)) return -EINVAL; + } v = ptc->refcnt_flags; if (v & CAP_MEM_FROZEN_FLAG) return -EINVAL; @@ -53,14 +61,16 @@ comp_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t captbl_cap, /* undo 
before return */ cos_throw(undo_ptc, -ECASFAIL); } - compc = (struct cap_comp *)__cap_capactivate_pre(t, cap, capin, CAP_COMP, &ret); if (!compc) cos_throw(undo_ctc, ret); + if (likely(scbc)) { + ret = scb_comp_update(t, scbc, compc, ptc, scb_uaddr); + if (ret) cos_throw(undo_capact, ret); + } compc->entry_addr = entry_addr; compc->info.pgtbl = ptc->pgtbl; compc->info.captbl = ctc->captbl; - compc->info.comp_nfo = sa; compc->pgd = ptc; compc->ct_top = ctc; ltbl_get(lid, &compc->info.liveness); @@ -68,6 +78,9 @@ comp_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t captbl_cap, return 0; +/*undo_scb: + scb_comp_remove(t, scbc, pgtbl_cap, scb_uaddr);*/ +undo_capact: undo_ctc: cos_faa((int *)&ctc->refcnt_flags, -1); undo_ptc: @@ -79,8 +92,8 @@ static int comp_deactivate(struct cap_captbl *ct, capid_t capin, livenessid_t lid) { int ret; - struct cap_comp * compc; - struct cap_pgtbl * pgd; + struct cap_comp *compc; + struct cap_pgtbl *pgd; struct cap_captbl *ct_top; compc = (struct cap_comp *)captbl_lkup(ct->captbl, capin); @@ -89,6 +102,8 @@ comp_deactivate(struct cap_captbl *ct, capid_t capin, livenessid_t lid) ltbl_expire(&compc->info.liveness); pgd = compc->pgd; ct_top = compc->ct_top; + /* TODO: right way to remove scb info */ + if (likely(compc->info.scb_data)) scb_comp_remove(ct, 0, 0, 0); ret = cap_capdeactivate(ct, capin, CAP_COMP, lid); if (ret) return ret; @@ -107,4 +122,17 @@ comp_init(void) assert(sizeof(struct cap_comp) <= __captbl_cap2bytes(CAP_COMP)); } +static inline int +comp_introspect(struct cap_comp *t, unsigned long op, unsigned long *retval) +{ + switch (op) { + case COMP_GET_SCB_CURTHD: + *retval = t->info.scb_data->curr_thd; + break; + default: + return -EINVAL; + } + return 0; +} + #endif /* COMPONENT_H */ diff --git a/src/kernel/include/dcb.h b/src/kernel/include/dcb.h new file mode 100644 index 0000000000..eac71fa497 --- /dev/null +++ b/src/kernel/include/dcb.h @@ -0,0 +1,109 @@ +/** + * Copyright 2019 by Phani Gadepalli, 
phanikishoreg@gwu.edu + * + * Redistribution of this file is permitted under the GNU General Public License v2. + */ + +#ifndef DCB_H +#define DCB_H + +#include "cap_ops.h" +#include "pgtbl.h" +#include "retype_tbl.h" +#include "component.h" +#include "thd.h" + +#define DCB_ENTRIES_MAX_PER_PAGE (PAGE_SIZE/sizeof(struct cos_dcb_info)) + +struct cap_dcb { + struct cap_header h; + struct liveness_data liveness; + unsigned int refcnt; + vaddr_t kern_addr; + cpuid_t cpuid; +} __attribute__((packed)); + +static inline int +dcb_activate(struct captbl *t, capid_t ctcap, capid_t dcbcap, vaddr_t kaddr, livenessid_t lid, capid_t ptcapin, vaddr_t uaddr) +{ + struct cap_dcb *dc; + struct cap_pgtbl *ptcin; + int ret; + paddr_t pf = chal_va2pa((void *)kaddr); + + ptcin = (struct cap_pgtbl *)captbl_lkup(t, ptcapin); + if (!ptcin || ptcin->h.type != CAP_PGTBL) return -EINVAL; + + if (pgtbl_mapping_add(ptcin->pgtbl, uaddr, pf, PGTBL_USER_DEF)) return -EINVAL; + + dc = (struct cap_dcb *)__cap_capactivate_pre(t, ctcap, dcbcap, CAP_DCB, &ret); + if (!dc) return -EINVAL; + + ltbl_get(lid, &dc->liveness); + dc->kern_addr = kaddr; + memset((void *)kaddr, 0, PAGE_SIZE); + dc->refcnt = 0; + dc->cpuid = get_cpuid(); + + __cap_capactivate_post(&dc->h, CAP_DCB); + + return 0; +} + +static inline int +dcb_deactivate(struct cap_captbl *ct, capid_t dcbcap, livenessid_t lid, capid_t ptcap, capid_t cosframe_addr, capid_t ptcapin, vaddr_t uaddrin) +{ + struct cap_dcb *dc; + struct cap_pgtbl *ptcin; + unsigned long *pte, addr, flags, old_v; + int ret; + + dc = (struct cap_dcb *)captbl_lkup(ct->captbl, dcbcap); + if (!dc || dc->h.type != CAP_DCB) return -EINVAL; + + if (!ptcapin || !uaddrin) return -EINVAL; + ptcin = (struct cap_pgtbl *)captbl_lkup(ct->captbl, ptcapin); + if (!ptcin || ptcin->h.type != CAP_PGTBL) return -EINVAL; + pte = pgtbl_lkup(ptcin->pgtbl, uaddrin, (u32_t *)&flags); + if (!pte) return -EINVAL; + if ((vaddr_t)pte != dc->kern_addr) return -EINVAL; + + if (dc->refcnt) return -EPERM; 
+ + ltbl_expire(&dc->liveness); + ret = kmem_deact_pre((struct cap_header *)dc, ct->captbl, ptcap, cosframe_addr, &pte, &old_v); + if (ret) return ret; + ret = kmem_deact_post(pte, old_v); + if (ret) return ret; + dc->kern_addr = 0; + + return cap_capdeactivate(ct, dcbcap, CAP_DCB, lid); +} + +static inline int +dcb_thd_ref(struct cap_dcb *dc, struct thread *thd) +{ + if (dc->refcnt >= DCB_ENTRIES_MAX_PER_PAGE) return -EINVAL; + if (dc->cpuid != thd->cpuid) return -EINVAL; + if (!ltbl_isalive(&dc->liveness)) return -EPERM; + + dc->refcnt++; + + return 0; +} + +static inline int +dcb_thd_deref(struct cap_dcb *dc, struct thread *thd) +{ + if (!dc->refcnt) return -EINVAL; + if (dc->cpuid != thd->cpuid) return -EINVAL; + + if ((vaddr_t)thd->dcbinfo < dc->kern_addr || (vaddr_t)thd->dcbinfo > (dc->kern_addr + PAGE_SIZE)) return -EINVAL; + if (!ltbl_isalive(&dc->liveness)) return -EPERM; + + dc->refcnt--; + + return 0; +} + +#endif /* DCB_H */ diff --git a/src/kernel/include/hw.h b/src/kernel/include/hw.h index fafc1ef7e1..4c03f1cd87 100644 --- a/src/kernel/include/hw.h +++ b/src/kernel/include/hw.h @@ -17,17 +17,17 @@ #define HW_IRQ_EXTERNAL_MIN 32 #define HW_IRQ_EXTERNAL_MAX 63 -struct cap_asnd hw_asnd_caps[HW_IRQ_TOTAL]; +struct cap_asnd hw_asnd_caps[NUM_CPU][HW_IRQ_TOTAL]; struct cap_hw { struct cap_header h; u32_t hw_bitmap; } __attribute__((packed)); -static void +static inline void hw_asndcap_init(void) { - memset(&hw_asnd_caps, 0, sizeof(struct cap_asnd) * HW_IRQ_TOTAL); + memset(&hw_asnd_caps, 0, sizeof(struct cap_asnd) * HW_IRQ_TOTAL * NUM_CPU); } /* @@ -36,7 +36,7 @@ hw_asndcap_init(void) * from another, and only with a subset of the bitmap. Any other HW * resources should not be passed on. 
*/ -static int +static inline int hw_activate(struct captbl *t, capid_t cap, capid_t capin, u32_t bitmap) { struct cap_hw *hwc; @@ -52,23 +52,23 @@ hw_activate(struct captbl *t, capid_t cap, capid_t capin, u32_t bitmap) return 0; } -static int +static inline int hw_deactivate(struct cap_captbl *t, capid_t capin, livenessid_t lid) { return cap_capdeactivate(t, capin, CAP_HW, lid); } -static int +static inline int hw_attach_rcvcap(struct cap_hw *hwc, hwid_t hwid, struct cap_arcv *rcvc, capid_t rcv_cap) { if (hwid < HW_IRQ_EXTERNAL_MIN || hwid > HW_IRQ_EXTERNAL_MAX) return -EINVAL; if (!(hwc->hw_bitmap & (1 << (hwid - HW_IRQ_EXTERNAL_MIN)))) return -EINVAL; - if (hw_asnd_caps[hwid].h.type == CAP_ASND) return -EEXIST; + if (hw_asnd_caps[get_cpuid()][hwid].h.type == CAP_ASND) return -EEXIST; - return asnd_construct(&hw_asnd_caps[hwid], rcvc, rcv_cap, 0, 0); + return asnd_construct(&hw_asnd_caps[get_cpuid()][hwid], rcvc, rcv_cap, 0, 0); } -static int +static inline int hw_detach_rcvcap(struct cap_hw *hwc, hwid_t hwid) { if (hwid < HW_IRQ_EXTERNAL_MIN || hwid > HW_IRQ_EXTERNAL_MAX) return -EINVAL; @@ -78,7 +78,7 @@ hw_detach_rcvcap(struct cap_hw *hwc, hwid_t hwid) * FIXME: Need to synchronize using __xx_pre and * __xx_post perhaps in asnd_deconstruct() */ - memset(&hw_asnd_caps[hwid], 0, sizeof(struct cap_asnd)); + memset(&hw_asnd_caps[get_cpuid()][hwid], 0, sizeof(struct cap_asnd)); return 0; } diff --git a/src/kernel/include/inv.h b/src/kernel/include/inv.h index 089c784b54..7ac9cb14b1 100644 --- a/src/kernel/include/inv.h +++ b/src/kernel/include/inv.h @@ -50,7 +50,7 @@ struct cap_arcv { u8_t depth; } __attribute__((packed)); -static int +static inline int sinv_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t comp_cap, vaddr_t entry_addr, invtoken_t token) { struct cap_sinv *sinvc; @@ -72,13 +72,13 @@ sinv_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t comp_cap, va return 0; } -static int +static inline int sinv_deactivate(struct 
cap_captbl *t, capid_t capin, livenessid_t lid) { return cap_capdeactivate(t, capin, CAP_SINV, lid); } -static int +static inline int sret_activate(struct captbl *t, capid_t cap, capid_t capin) { struct cap_sret *sretc; @@ -91,13 +91,13 @@ sret_activate(struct captbl *t, capid_t cap, capid_t capin) return 0; } -static int +static inline int sret_deactivate(struct cap_captbl *t, capid_t capin, livenessid_t lid) { return cap_capdeactivate(t, capin, CAP_SRET, lid); } -static int +static inline int asnd_construct(struct cap_asnd *asndc, struct cap_arcv *arcvc, capid_t rcv_cap, u32_t budget, u32_t period) { /* FIXME: Add synchronization with __xx_pre and __xx_post */ @@ -118,7 +118,7 @@ asnd_construct(struct cap_asnd *asndc, struct cap_arcv *arcvc, capid_t rcv_cap, return 0; } -static int +static inline int asnd_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t rcv_captbl, capid_t rcv_cap, u32_t budget, u32_t period) { @@ -142,7 +142,7 @@ asnd_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t rcv_captbl, return ret; } -static int +static inline int asnd_deactivate(struct cap_captbl *t, capid_t capin, livenessid_t lid) { return cap_capdeactivate(t, capin, CAP_ASND, lid); @@ -153,7 +153,7 @@ int cap_ipi_process(struct pt_regs *regs); /* send to a receive end-point within an interrupt */ int cap_hw_asnd(struct cap_asnd *asnd, struct pt_regs *regs); -static void +static inline void __arcv_setup(struct cap_arcv *arcv, struct thread *thd, struct tcap *tcap, struct thread *notif) { assert(arcv && thd && tcap && !thd_bound2rcvcap(thd)); @@ -168,7 +168,7 @@ __arcv_setup(struct cap_arcv *arcv, struct thread *thd, struct tcap *tcap, struc tcap_promote(tcap, thd); } -static int +static inline int __arcv_teardown(struct cap_arcv *arcv, struct thread *thd) { struct thread *notif; @@ -189,13 +189,13 @@ __arcv_teardown(struct cap_arcv *arcv, struct thread *thd) return 0; } -static struct thread * +static inline struct thread * arcv_thd_notif(struct thread 
*arcvt) { return arcvt->rcvcap.rcvcap_thd_notif; } -static int +static inline int arcv_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t comp_cap, capid_t thd_cap, capid_t tcap_cap, capid_t arcv_cap, int init) { @@ -245,7 +245,7 @@ arcv_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t comp_cap, ca return 0; } -static int +static inline int arcv_deactivate(struct cap_captbl *t, capid_t capin, livenessid_t lid) { struct cap_arcv *arcvc; @@ -345,7 +345,7 @@ sret_ret(struct thread *thd, struct pt_regs *regs, struct cos_cpu_local_info *co __userregs_set(regs, __userregs_getinvret(regs), sp, ip); } -static void +static inline void inv_init(void) { //#define __OUTPUT_CAP_SIZE diff --git a/src/kernel/include/pgtbl.h b/src/kernel/include/pgtbl.h index 7ef95512d8..f07c4b4ad5 100644 --- a/src/kernel/include/pgtbl.h +++ b/src/kernel/include/pgtbl.h @@ -357,6 +357,7 @@ pgtbl_cosframe_add(pgtbl_t pt, u32_t addr, u32_t page, u32_t flags) PGTBL_DEPTH, &accum); orig_v = (u32_t)(pte->next); assert(orig_v == 0); +// printk("%x %x %p %x\n", addr, page, pte, orig_v); return __pgtbl_update_leaf(pte, (void *)(page | flags), 0); } diff --git a/src/kernel/include/scb.h b/src/kernel/include/scb.h new file mode 100644 index 0000000000..b90d66b3d2 --- /dev/null +++ b/src/kernel/include/scb.h @@ -0,0 +1,101 @@ +/** + * Copyright 2019 by Phani Gadepalli, phanikishoreg@gwu.edu + * + * Redistribution of this file is permitted under the GNU General Public License v2. 
+ */ + +#ifndef SCB_H +#define SCB_H + +#include "component.h" +#include "cap_ops.h" +#include "pgtbl.h" +#include "retype_tbl.h" + +struct comp_info; + +struct cap_scb { + struct cap_header h; + struct liveness_data liveness; + struct cap_comp *compc; + vaddr_t kern_addr; +} __attribute__((packed)); + +static inline int +scb_activate(struct captbl *t, capid_t ctcap, capid_t scbcap, vaddr_t kaddr, livenessid_t lid) +{ + struct cap_scb *sc; + int ret; + + sc = (struct cap_scb *)__cap_capactivate_pre(t, ctcap, scbcap, CAP_SCB, &ret); + if (!sc) return -EINVAL; + + ltbl_get(lid, &sc->liveness); + sc->kern_addr = kaddr; + sc->compc = NULL; + memset((void *)kaddr, 0, COS_SCB_SIZE); + + __cap_capactivate_post(&sc->h, CAP_SCB); + + return 0; +} + +static inline int +scb_deactivate(struct cap_captbl *ct, capid_t scbcap, capid_t ptcap, capid_t cosframe_addr, livenessid_t lid) +{ + struct cap_scb *sc; + unsigned long old_v = 0, *pte = NULL; + int ret; + + sc = (struct cap_scb *)captbl_lkup(ct->captbl, scbcap); + if (!sc || sc->h.type != CAP_SCB) return -EINVAL; + + /* FIXME: component using this scbcap is still active! how to handle this? */ + if (sc->compc) return -EPERM; + + ltbl_expire(&sc->liveness); + ret = kmem_deact_pre((struct cap_header *)sc, ct->captbl, ptcap, cosframe_addr, &pte, &old_v); + if (ret) return ret; + ret = kmem_deact_post(pte, old_v); + if (ret) return ret; + + return cap_capdeactivate(ct, scbcap, CAP_SCB, lid); +} + +static inline int +scb_comp_update(struct captbl *ct, struct cap_scb *sc, struct cap_comp *compc, struct cap_pgtbl *ptcin, vaddr_t uaddrin) +{ + paddr_t pf = chal_va2pa((void *)(sc->kern_addr)); + + if (unlikely(!ltbl_isalive(&sc->liveness))) return -EPERM; + /* for non-schedulers, scbs are from schedulers, so uaddrin will be zero and sc->compc should have been set! 
*/ + if (uaddrin && pgtbl_mapping_add(ptcin->pgtbl, uaddrin, pf, PGTBL_USER_DEF)) return -EINVAL; + + if (uaddrin && sc->compc == NULL) sc->compc = compc; + compc->info.scb_data = (struct cos_scb_info *)(sc->kern_addr); + + return 0; +} + +static inline int +scb_comp_remove(struct cap_captbl *ct, struct cap_scb *sc, capid_t ptcapin, vaddr_t uaddrin) +{ + int ret; + + if (unlikely(!ct || !sc || !ptcapin || !uaddrin)) return -EINVAL; + + if (unlikely(!ltbl_isalive(&sc->liveness))) return -EPERM; + if (unlikely(!sc->compc)) return -EINVAL; + + /* TODO: unmap uaddrin in the user-land */ + + return 0; +} + +static inline struct liveness_data * +scb_liveness(struct cap_scb *sc) +{ + return &sc->liveness; +} + +#endif /* SCB_H */ diff --git a/src/kernel/include/shared/consts.h b/src/kernel/include/shared/consts.h index e059c507a7..d5cb53b9d9 100644 --- a/src/kernel/include/shared/consts.h +++ b/src/kernel/include/shared/consts.h @@ -48,7 +48,7 @@ struct pt_regs { #endif #define MAX_SERVICE_DEPTH 31 -#define MAX_NUM_THREADS (64 * NUM_CPU) +#define MAX_NUM_THREADS (2048) /* Stacks are 2 * page_size (expressed in words) */ #define MAX_STACK_SZ_BYTE_ORDER 12 @@ -136,6 +136,7 @@ struct pt_regs { * offsets below are used to access CPU and thread IDs. */ #define CPUID_OFFSET 1 #define THDID_OFFSET 2 -#define INVTOKEN_OFFSET 3 +#define SLTHDPTR_OFFSET 3 +#define INVTOKEN_OFFSET 4 #endif diff --git a/src/kernel/include/shared/cos_config.h b/src/kernel/include/shared/cos_config.h index a80dc56884..bf501b3be9 100644 --- a/src/kernel/include/shared/cos_config.h +++ b/src/kernel/include/shared/cos_config.h @@ -62,6 +62,7 @@ /* Composite user memory uses physical memory above this. */ #define COS_MEM_START COS_MEM_KERN_PA +#define COS_SCB_SIZE (PAGE_SIZE) /* NUM_CPU_SOCKETS defined in cpu_ghz.h. The information is used for * intelligent IPI distribution. 
*/ diff --git a/src/kernel/include/shared/cos_sched.h b/src/kernel/include/shared/cos_sched.h new file mode 100644 index 0000000000..525d7edcb9 --- /dev/null +++ b/src/kernel/include/shared/cos_sched.h @@ -0,0 +1,53 @@ +#ifndef COS_SCHED_H +#define COS_SCHED_H + +#include "./cos_types.h" + +struct cos_thd_event { + u16_t blocked; + u32_t next_timeout; + u64_t elapsed_cycs; + u64_t epoch; +} __attribute__((packed)); + +struct cos_sched_event { + thdid_t tid; + struct cos_thd_event evt; +} __attribute__((packed)); + +#define COS_SCHED_EVENT_RING_SIZE 16 + +struct cos_sched_ring { + int head, tail, more; + struct cos_sched_event event_buf[COS_SCHED_EVENT_RING_SIZE]; +} __attribute__((packed)); + +struct cos_scb_info { + capid_t curr_thd; + cycles_t timer_next; + sched_tok_t sched_tok; + struct cos_sched_ring sched_events; /* kernel-level events only */ +} CACHE_ALIGNED; + +struct cos_dcb_info { + unsigned long ip; + unsigned long sp; + unsigned long pending; /* binary value. TODO: move it to ip or sp */ +} __attribute__((packed)); + +/* + * This is the "ip" the kernel uses to update the thread when it sees that the + * thread is still in user-level dispatch routine. + * This is the offset of instruction after resetting the "next" thread's "sp" to zero + * in a purely user-level dispatch. + * + * Whenever kernel is switching to a thread which has "sp" non-zero, it would switch + * to the "ip" saved in the dcb_info and reset the "sp" of the thread that the kernel + * is dispatching to! + * This is necessary because, if the kernel is dispatching to a thread that was in the + * user-level dispatch routine before, then the only registers that it can restore are + * "ip" and "sp", everything else is either clobbered or saved/loaded at user-level. 
+ */ +#define DCB_IP_KERN_OFF 8 + +#endif /* COS_SCHED_H */ diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index f3714097e2..cee8b006ef 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -72,7 +72,9 @@ typedef enum { typedef enum { RCV_NON_BLOCKING = 1, - RCV_ALL_PENDING = 1 << 1, + RCV_ULONLY = (1 << 1), + RCV_ULSCHED_RCV = (1 << 2), + RCV_SCHEDTIMEOUT = (1 << 3), } rcv_flags_t; #define BOOT_LIVENESS_ID_BASE 2 @@ -84,6 +86,7 @@ typedef enum { CAPTBL_OP_THDACTIVATE, CAPTBL_OP_THDDEACTIVATE, CAPTBL_OP_THDTLSSET, + CAPTBL_OP_THDMIGRATE, CAPTBL_OP_COMPACTIVATE, CAPTBL_OP_COMPDEACTIVATE, CAPTBL_OP_SINVACTIVATE, @@ -125,6 +128,12 @@ typedef enum { CAPTBL_OP_HW_MAP, CAPTBL_OP_HW_CYC_USEC, CAPTBL_OP_HW_CYC_THRESH, + + CAPTBL_OP_SCB_ACTIVATE, + CAPTBL_OP_SCB_DEACTIVATE, + + CAPTBL_OP_DCB_ACTIVATE, + CAPTBL_OP_DCB_DEACTIVATE, } syscall_op_t; typedef enum { @@ -142,8 +151,13 @@ typedef enum { CAP_QUIESCENCE, /* when deactivating, set to track quiescence state */ CAP_TCAP, /* tcap captable entry */ CAP_HW, /* hardware (interrupt) */ + CAP_SCB, /* Scheduler control block (SCB) */ + CAP_DCB, /* Dispatch control block (DCB) */ } cap_t; +/* maximum size allowed for CAP TYPE in a capability header */ +#define CAP_TYPE_MAXBITS 7 +#define CAP_TYPE_MAX ((1 << CAP_TYPE_MAXBITS) - 1) /* TODO: pervasive use of these macros */ /* v \in struct cap_* *, type \in cap_t */ #define CAP_TYPECHK(v, t) ((v) && (v)->h.type == (t)) @@ -192,12 +206,16 @@ typedef int cpuid_t; static inline cap_sz_t __captbl_cap2sz(cap_t c) { + /* if (unlikely(c > CAP_TYPE_MAX)) return CAP_SZ_ERR; */ + /* TODO: optimize for invocation and return */ switch (c) { case CAP_SRET: - case CAP_THD: case CAP_TCAP: + case CAP_THD: return CAP_SZ_16B; + case CAP_SCB: + case CAP_DCB: case CAP_CAPTBL: case CAP_PGTBL: case CAP_HW: /* TODO: 256bits = 32B * 8b */ @@ -260,12 +278,15 @@ enum */ BOOT_CAPTBL_SELF_INITRCV_BASE = 
round_up_to_pow2(BOOT_CAPTBL_SELF_INITTHD_BASE + NUM_CPU * CAP64B_IDSZ, CAPMAX_ENTRY_SZ), + /* BOOT_CAPTBL_SELF_INITTCAP_BASE = round_up_to_pow2(BOOT_CAPTBL_SELF_INITRCV_BASE + NUM_CPU * CAP64B_IDSZ, + CAPMAX_ENTRY_SZ), */ BOOT_CAPTBL_LAST_CAP = BOOT_CAPTBL_SELF_INITRCV_BASE + NUM_CPU * CAP64B_IDSZ, /* round up to next entry */ BOOT_CAPTBL_FREE = round_up_to_pow2(BOOT_CAPTBL_LAST_CAP, CAPMAX_ENTRY_SZ) }; -#define BOOT_CAPTBL_SELF_INITTCAP_BASE (BOOT_CAPTBL_SELF_INITTHD_BASE + CAP16B_IDSZ) +#define BOOT_CAPTBL_SELF_INITTCAP_BASE (BOOT_CAPTBL_SELF_INITTHD_BASE + CAP16B_IDSZ) + #define BOOT_CAPTBL_SELF_INITTHD_CPU_BASE (BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cos_cpuid())) #define BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE (BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cos_cpuid())) #define BOOT_CAPTBL_SELF_INITRCV_CPU_BASE (BOOT_CAPTBL_SELF_INITRCV_BASE_CPU(cos_cpuid())) @@ -274,6 +295,16 @@ enum #define BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpuid) + CAP16B_IDSZ) #define BOOT_CAPTBL_SELF_INITRCV_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITRCV_BASE + cpuid * CAP64B_IDSZ) +enum llboot_scb_dcb_caps +{ + LLBOOT_CAPTBL_SCB = round_up_to_pow2(BOOT_CAPTBL_LAST_CAP, CAPMAX_ENTRY_SZ), + LLBOOT_CAPTBL_INITDCB = LLBOOT_CAPTBL_SCB + CAP64B_IDSZ, + LLBOOT_CAPTBL_FREE = round_up_to_pow2(LLBOOT_CAPTBL_INITDCB + (CAP64B_IDSZ * NUM_CPU), CAPMAX_ENTRY_SZ), +}; + +#define LLBOOT_CAPTBL_INITDCB_CPU(cpuid) (LLBOOT_CAPTBL_INITDCB + (CAP64B_IDSZ * cpuid)) +#define LLBOOT_CAPTBL_CPU_INITDCB LLBOOT_CAPTBL_INITDCB_CPU(cos_cpuid()) + /* * The half of the first page of init captbl is devoted to root node. 
So, the * first page of captbl can contain 128 caps, and every extra page can hold 256 @@ -291,6 +322,8 @@ enum { /* thread id */ THD_GET_TID, + THD_GET_DCB_IP, + THD_GET_DCB_SP, }; enum @@ -307,6 +340,12 @@ enum ARCV_GET_THDID, }; +enum +{ + /* get current thread info from scb */ + COMP_GET_SCB_CURTHD, +}; + /* Macro used to define per core variables */ #define PERCPU(type, name) \ PERCPU_DECL(type, name); \ @@ -408,7 +447,6 @@ struct cos_component_information { vaddr_t cos_heap_allocated, cos_heap_alloc_extent; vaddr_t cos_upcall_entry; vaddr_t cos_async_inv_entry; - // struct cos_sched_data_area *cos_sched_data_area; vaddr_t cos_user_caps; struct restartable_atomic_sequence cos_ras[COS_NUM_ATOMIC_SECTIONS / 2]; vaddr_t cos_poly[COMP_INFO_POLY_NUM]; @@ -484,6 +522,10 @@ typedef unsigned int isolation_level_t; #define MEMMGR_MAX_SHMEM_REGIONS 1024 #define CAPMGR_AEPKEYS_MAX (1<<15) +#define CHAN_CRT_NSLOTS 4 +#define CHAN_CRT_ITEM_TYPE unsigned long +#define CHAN_CRT_ITEM_SZ sizeof(CHAN_CRT_ITEM_TYPE) + #define IPIWIN_DEFAULT_US (1000) /* 1ms */ #define IPIMAX_DEFAULT (64) /* IPIs per ms for each RCV ep */ diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index 8c10d536cc..c9c01c734b 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -34,7 +34,7 @@ struct invstk_entry { */ struct rcvcap_info { /* how many other arcv end-points send notifications to this one? 
*/ - int isbound, pending, refcnt, is_all_pending; + int isbound, pending, refcnt, is_init; sched_tok_t sched_count; struct tcap * rcvcap_tcap; /* This rcvcap's tcap */ struct thread *rcvcap_thd_notif; /* The parent rcvcap thread for notifications */ @@ -69,11 +69,13 @@ struct thread { tcap_time_t timeout; struct thread *interrupted_thread; struct thread *scheduler_thread; + struct cos_dcb_info *dcbinfo; /* rcv end-point data-structures */ struct rcvcap_info rcvcap; struct list event_head; /* all events for *this* end-point */ struct list_node event_list; /* the list of events for another end-point */ + u64_t event_epoch; /* used by user-level for ULSCHED events.. */ } CACHE_ALIGNED; /* @@ -89,6 +91,8 @@ struct cap_thd { cpuid_t cpuid; } __attribute__((packed)); +#include "dcb.h" + static void thd_upcall_setup(struct thread *thd, u32_t entry_addr, int option, int arg1, int arg2, int arg3) { @@ -188,20 +192,43 @@ thd_next_thdinfo_update(struct cos_cpu_local_info *cli, struct thread *thd, stru } static void -thd_rcvcap_init(struct thread *t) +thd_rcvcap_init(struct thread *t, int is_init) { struct rcvcap_info *rc = &t->rcvcap; rc->isbound = rc->pending = rc->refcnt = 0; - rc->is_all_pending = 0; rc->sched_count = 0; + rc->is_init = is_init; rc->rcvcap_thd_notif = NULL; } +static inline struct comp_info * +thd_invstk_peek_compinfo(struct thread *curr_thd, struct cos_cpu_local_info *cos_info, int peek_index) +{ + /* curr_thd should be the current thread! We are using cached invstk_top. */ + return &(curr_thd->invstk[peek_index].comp_info); +} + +static inline int +thd_rcvcap_evt_pending(struct thread *t) +{ + return !list_isempty(&t->event_head); +} + static inline void thd_rcvcap_evt_enqueue(struct thread *head, struct thread *t) { + struct cos_cpu_local_info *cos_info = cos_cpu_local_info(); + struct comp_info *c = thd_invstk_peek_compinfo(head, cos_info, 0); /* in its root component! 
*/ + struct cos_scb_info *scb = NULL; + struct cos_sched_ring *r = NULL; + if (list_empty(&t->event_list) && head != t) list_enqueue(&head->event_head, &t->event_list); + if (unlikely(!c ||!c->scb_data)) return; + + scb = ((c->scb_data) + get_cpuid()); + r = &(scb->sched_events); + r->more = thd_rcvcap_evt_pending(head); } static inline void @@ -227,69 +254,41 @@ thd_track_exec(struct thread *t) return !list_empty(&t->event_list); } -static void -thd_rcvcap_all_pending_set(struct thread *t, int val) -{ - t->rcvcap.is_all_pending = val; -} - -static int -thd_rcvcap_all_pending_get(struct thread *t) -{ - return t->rcvcap.is_all_pending; -} - -static int -thd_rcvcap_all_pending(struct thread *t) -{ - int pending = t->rcvcap.pending; - - /* receive all pending */ - t->rcvcap.pending = 0; - thd_rcvcap_all_pending_set(t, 0); - - return ((pending << 1) | !list_isempty(&t->event_head)); -} - -static int +static inline int thd_rcvcap_pending(struct thread *t) { - if (t->rcvcap.pending) return t->rcvcap.pending; - return !list_isempty(&t->event_head); - ; + if (t->rcvcap.pending || (t->dcbinfo && t->dcbinfo->pending)) return 1; + return thd_rcvcap_evt_pending(t); } -static sched_tok_t +static inline sched_tok_t thd_rcvcap_get_counter(struct thread *t) { return t->rcvcap.sched_count; } -static void +static inline void thd_rcvcap_set_counter(struct thread *t, sched_tok_t cntr) { t->rcvcap.sched_count = cntr; } -static void -thd_rcvcap_pending_inc(struct thread *arcvt) +static inline void +thd_rcvcap_pending_set(struct thread *arcvt) { - arcvt->rcvcap.pending++; + if (likely(arcvt->dcbinfo)) arcvt->dcbinfo->pending = 1; + else arcvt->rcvcap.pending = 1; } -static int -thd_rcvcap_pending_dec(struct thread *arcvt) +static inline void +thd_rcvcap_pending_reset(struct thread *arcvt) { - int pending = arcvt->rcvcap.pending; - - if (pending == 0) return 0; - arcvt->rcvcap.pending--; - - return pending; + if (likely(arcvt->dcbinfo)) arcvt->dcbinfo->pending = 0; + else 
arcvt->rcvcap.pending = 0; } static inline int -thd_state_evt_deliver(struct thread *t, unsigned long *thd_state, unsigned long *cycles, unsigned long *timeout) +thd_state_evt_deliver(struct thread *t, unsigned long *thd_state, unsigned long *cycles, unsigned long *timeout, u64_t *epoch) { struct thread *e = thd_rcvcap_evt_dequeue(t); @@ -301,6 +300,8 @@ thd_state_evt_deliver(struct thread *t, unsigned long *thd_state, unsigned long e->exec = 0; *timeout = e->timeout; e->timeout = 0; + *epoch = e->event_epoch; + e->event_epoch = 0; return 1; } @@ -315,7 +316,7 @@ static inline void thd_current_update(struct thread *next, struct thread *prev, struct cos_cpu_local_info *cos_info) { /* commit the cached data */ - prev->invstk_top = cos_info->invstk_top; + prev->invstk_top = cos_info->invstk_top; cos_info->invstk_top = next->invstk_top; cos_info->curr_thd = next; } @@ -332,17 +333,23 @@ thd_scheduler_set(struct thread *thd, struct thread *sched) if (unlikely(thd->scheduler_thread != sched)) thd->scheduler_thread = sched; } -static int -thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, capid_t compcap, thdclosure_index_t init_data) +static inline int +thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, capid_t compcap, thdclosure_index_t init_data, capid_t dcbcap, unsigned short dcboff) { struct cos_cpu_local_info *cli = cos_cpu_local_info(); - struct cap_thd *tc; - struct cap_comp *compc; + struct cap_thd *tc = NULL; + struct cap_comp *compc = NULL; + struct cap_dcb *dc = NULL; int ret; memset(thd, 0, sizeof(struct thread)); compc = (struct cap_comp *)captbl_lkup(t, compcap); if (unlikely(!compc || compc->h.type != CAP_COMP)) return -EINVAL; + if (likely(dcbcap)) { + dc = (struct cap_dcb *)captbl_lkup(t, dcbcap); + if (unlikely(!dc || dc->h.type != CAP_DCB)) return -EINVAL; + if (dcboff > PAGE_SIZE / sizeof(struct cos_dcb_info)) return -EINVAL; + } tc = (struct cap_thd *)__cap_capactivate_pre(t, cap, capin, 
CAP_THD, &ret); if (!tc) return ret; @@ -354,10 +361,17 @@ thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, c thd->refcnt = 1; thd->invstk_top = 0; thd->cpuid = get_cpuid(); + if (likely(dc)) { + ret = dcb_thd_ref(dc, thd); + if (ret) goto err; /* TODO: cleanup captbl slot */ + thd->dcbinfo = (struct cos_dcb_info *)(dc->kern_addr + (dcboff * sizeof(struct cos_dcb_info))); + memset(thd->dcbinfo, 0, sizeof(struct cos_dcb_info)); + } assert(thd->tid <= MAX_NUM_THREADS); thd_scheduler_set(thd, thd_current(cli)); - thd_rcvcap_init(thd); + /* TODO: fix the way to specify scheduler in a component! */ + thd_rcvcap_init(thd, !init_data); list_head_init(&thd->event_head); list_init(&thd->event_list, thd); @@ -369,15 +383,69 @@ thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, c __cap_capactivate_post(&tc->h, CAP_THD); return 0; + +err: + return ret; } -static int +static inline int +thd_migrate_cap(struct captbl *ct, capid_t thd_cap) +{ + struct thread *thd; + struct cap_thd *tc; + + /* we migrated the capability to core */ + tc = (struct cap_thd *)captbl_lkup(ct, thd_cap); + if (!tc || tc->h.type != CAP_THD || get_cpuid() != tc->cpuid) return -EINVAL; + thd = tc->t; + tc->cpuid = thd->cpuid; + + return 0; +} + +static inline int +thd_migrate(struct captbl *ct, capid_t thd_cap, cpuid_t core) +{ + struct thread *thd; + struct cap_thd *tc; + + tc = (struct cap_thd *)captbl_lkup(ct, thd_cap); + if (!tc || tc->h.type != CAP_THD || get_cpuid() != tc->cpuid) return -EINVAL; + thd = tc->t; + if (NUM_CPU < 2 || core >= NUM_CPU || core < 0) return -EINVAL; + if (tc->cpuid != thd->cpuid) return -EINVAL; /* outdated capability */ + if (thd->cpuid == core) return -EINVAL; /* already migrated. invalid req */ + if (thd->cpuid != get_cpuid()) return -EPERM; /* only push migration */ + + if (thd_current(cos_cpu_local_info()) == thd) return -EPERM; /* not a running thread! 
*/ + if (thd->invstk_top > 0) return -EPERM; /* not if its in an invocation */ + if (thd_bound2rcvcap(thd) || thd->rcvcap.rcvcap_thd_notif) return -EPERM; /* not if it's an AEP */ + if (thd->rcvcap.rcvcap_tcap) return -EPERM; /* not if it has its own tcap on this core */ + + thd->scheduler_thread = NULL; + thd->cpuid = core; + /* we also migrated the capability to core */ + tc->cpuid = core; + + /* + * TODO: + * given that the thread is not running right now, + * and we don't allow migrating a thread that's in an invocation for now, + * i think we can find the COREID_OFFSET/CPUID_OFFSET on stack and fix the + * core id right here?? + */ + + return 0; +} + +static inline int thd_deactivate(struct captbl *ct, struct cap_captbl *dest_ct, unsigned long capin, livenessid_t lid, capid_t pgtbl_cap, - capid_t cosframe_addr, const int root) + capid_t cosframe_addr, capid_t dcbcap, const int root) { struct cos_cpu_local_info *cli = cos_cpu_local_info(); - struct cap_header * thd_header; - struct thread * thd; + struct cap_header *thd_header; + struct thread *thd; + struct cap_dcb *dcb = NULL; unsigned long old_v = 0, *pte = NULL; int ret; @@ -385,6 +453,10 @@ thd_deactivate(struct captbl *ct, struct cap_captbl *dest_ct, unsigned long capi if (!thd_header || thd_header->type != CAP_THD) cos_throw(err, -EINVAL); thd = ((struct cap_thd *)thd_header)->t; assert(thd->refcnt); + if (dcbcap) { + dcb = (struct cap_dcb *)captbl_lkup(ct, dcbcap); + if (!dcb || dcb->h.type != CAP_DCB) cos_throw(err, -EINVAL); + } if (thd->refcnt == 1) { if (!root) cos_throw(err, -EINVAL); @@ -410,6 +482,10 @@ thd_deactivate(struct captbl *ct, struct cap_captbl *dest_ct, unsigned long capi } } + if (dcb) { + ret = dcb_thd_deref(dcb, thd); + if (ret) cos_throw(err, ret); + } ret = cap_capdeactivate(dest_ct, capin, CAP_THD, lid); if (ret) cos_throw(err, ret); @@ -429,7 +505,7 @@ thd_deactivate(struct captbl *ct, struct cap_captbl *dest_ct, unsigned long capi return ret; } -static int +static inline int 
thd_tls_set(struct captbl *ct, capid_t thd_cap, vaddr_t tlsaddr, struct thread *current) { struct cap_thd *tc; @@ -447,7 +523,7 @@ thd_tls_set(struct captbl *ct, capid_t thd_cap, vaddr_t tlsaddr, struct thread * return 0; } -static void +static inline void thd_init(void) { assert(sizeof(struct cap_thd) <= __captbl_cap2bytes(CAP_THD)); @@ -472,6 +548,12 @@ curr_invstk_top(struct cos_cpu_local_info *cos_info) return cos_info->invstk_top; } +static inline struct comp_info * +thd_invstk_current_compinfo(struct thread *curr_thd, struct cos_cpu_local_info *cos_info) +{ + return &(curr_thd->invstk[curr_invstk_top(cos_info)].comp_info); +} + static inline struct comp_info * thd_invstk_current(struct thread *curr_thd, unsigned long *ip, unsigned long *sp, struct cos_cpu_local_info *cos_info) { @@ -531,38 +613,80 @@ thd_preemption_state_update(struct thread *curr, struct thread *next, struct pt_ memcpy(&curr->regs, regs, sizeof(struct pt_regs)); } +static inline int +thd_sched_events_produce(struct thread *thd, struct cos_cpu_local_info *cos_info) +{ + int delta = 0, inv_top = curr_invstk_top(cos_info); + struct cos_scb_info *scb = NULL; + struct cos_sched_ring *r = NULL; + struct comp_info *c = NULL; + + if (unlikely(inv_top != 0 || thd->rcvcap.is_init == 0)) return 0; + + c = thd_invstk_peek_compinfo(thd, cos_info, inv_top); + if (unlikely(!c || !c->scb_data)) return -ENOENT; + + scb = ((c->scb_data) + get_cpuid()); + r = &(scb->sched_events); + /* + * only produce more if the ring is empty! + * so the user only calls after dequeueing all previous events. 
+ */ + if (unlikely(r->head != r->tail)) return -EAGAIN; + + r->head = r->tail = 0; + while (delta < COS_SCHED_EVENT_RING_SIZE) { + struct cos_sched_event *e = &(r->event_buf[delta]); + unsigned long thd_state; + + if (!thd_state_evt_deliver(thd, &thd_state, (unsigned long *)&(e->evt.elapsed_cycs), + (unsigned long *)&(e->evt.next_timeout), &(e->evt.epoch))) break; + e->tid = (thd_state << 1) >> 1; + e->evt.blocked = (thd_state >> 31); + + delta++; + } + + r->tail += delta; + r->more = thd_rcvcap_evt_pending(thd); + + return delta; +} + static inline void thd_rcvcap_pending_deliver(struct thread *thd, struct pt_regs *regs) { - unsigned long thd_state = 0, cycles = 0, timeout = 0, pending = 0; - int all_pending = thd_rcvcap_all_pending_get(thd); + unsigned long thd_state = 0, cycles = 0, timeout = 0; + u64_t epoch = 0; - thd_state_evt_deliver(thd, &thd_state, &cycles, &timeout); - if (all_pending) { - pending = thd_rcvcap_all_pending(thd); - } else { - thd_rcvcap_pending_dec(thd); - pending = thd_rcvcap_pending(thd); + /* events only in scb now, no return values... */ + thd_rcvcap_pending_reset(thd); + if (thd_sched_events_produce(thd, cos_cpu_local_info()) == -ENOENT) { + thd_state_evt_deliver(thd, &thd_state, &cycles, &timeout, &epoch); } - __userregs_setretvals(regs, pending, thd_state, cycles, timeout); + __userregs_setretvals(regs, thd_rcvcap_pending(thd), thd_state, cycles, timeout); } static inline int thd_switch_update(struct thread *thd, struct pt_regs *regs, int issame) { - int preempt = 0; + int preempt = 0, pending = 0; /* TODO: check FPU */ /* fpu_save(thd); */ if (thd->state & THD_STATE_PREEMPTED) { - assert(!(thd->state & THD_STATE_RCVING)); + /* TODO: assert that its a scheduler thread */ + /* assert(!(thd->state & THD_STATE_RCVING)); */ thd->state &= ~THD_STATE_PREEMPTED; preempt = 1; - } else if (thd->state & THD_STATE_RCVING) { + } + + /* FIXME: can the thread be in race with the kernel? 
*/ + if (thd->state & THD_STATE_RCVING) { assert(!(thd->state & THD_STATE_PREEMPTED)); thd->state &= ~THD_STATE_RCVING; thd_rcvcap_pending_deliver(thd, regs); - + pending = thd_rcvcap_pending(thd); /* * If a scheduler thread was running using child tcap and blocked on RCVING * and budget expended logic decided to run the scheduler thread with it's @@ -570,8 +694,15 @@ thd_switch_update(struct thread *thd, struct pt_regs *regs, int issame) */ } + if (unlikely(thd->dcbinfo && thd->dcbinfo->sp)) { + assert(preempt == 0); + regs->dx = regs->ip = thd->dcbinfo->ip + DCB_IP_KERN_OFF; + regs->cx = regs->sp = thd->dcbinfo->sp; + thd->dcbinfo->sp = 0; + } + if (issame && preempt == 0) { - __userregs_set(regs, 0, __userregs_getsp(regs), __userregs_getip(regs)); + __userregs_set(regs, pending, __userregs_getsp(regs), __userregs_getip(regs)); } return preempt; @@ -584,6 +715,12 @@ thd_introspect(struct thread *t, unsigned long op, unsigned long *retval) case THD_GET_TID: *retval = t->tid; break; + case THD_GET_DCB_IP: + *retval = t->dcbinfo->ip; + break; + case THD_GET_DCB_SP: + *retval = t->dcbinfo->sp; + break; default: return -EINVAL; } diff --git a/src/platform/i386/Makefile b/src/platform/i386/Makefile index 9a4f0e0614..0b222c5920 100644 --- a/src/platform/i386/Makefile +++ b/src/platform/i386/Makefile @@ -32,6 +32,8 @@ CFLAGS += $(WARNINGS) OBJS += kernel.o OBJS += gdt.o OBJS += idt.o +OBJS += pic.o +OBJS += ioapic.o OBJS += vm.o OBJS += printk.o OBJS += string.o @@ -42,9 +44,9 @@ OBJS += serial.o OBJS += hpet.o OBJS += chal.o OBJS += boot_comp.o -OBJS += miniacpi.o -#OBJS += console.o +OBJS += acpi.o OBJS += vga.o +OBJS += keyboard.o OBJS += exception.o OBJS += lapic.o diff --git a/src/platform/i386/miniacpi.c b/src/platform/i386/acpi.c similarity index 65% rename from src/platform/i386/miniacpi.c rename to src/platform/i386/acpi.c index c1647cfd25..68aabe763b 100644 --- a/src/platform/i386/miniacpi.c +++ b/src/platform/i386/acpi.c @@ -2,6 +2,8 @@ #include "string.h" 
#include "mem_layout.h" #include "pgtbl.h" +#include "apic_cntl.h" +#include "ioapic.h" #define RSDP_LO_ADDRESS ((unsigned char *)0xc00E0000) #define RSDP_HI_ADDRESS ((unsigned char *)0xc00FFFFF) @@ -32,9 +34,10 @@ struct rsdt { struct rsdt *entry[0]; } __attribute__((packed)); -extern u8_t * boot_comp_pgd; -static u32_t basepage; -static struct rsdt *rsdt; +extern u8_t * boot_comp_pgd; +static u32_t basepage; +static struct rsdt *rsdt; +static unsigned char *madt; static inline void * pa2va(void *pa) @@ -78,7 +81,7 @@ acpi_find_rsdt(void) } void * -acpi_find_timer(void) +acpi_find_hpet(void) { pgtbl_t pgtbl = (pgtbl_t)boot_comp_pgd; size_t i; @@ -142,3 +145,52 @@ acpi_set_rsdt_page(u32_t page) basepage = page * (1 << 22); rsdt = (struct rsdt *)pa2va(rsdt); } + +void +acpi_madt_intsrc_iter(unsigned char *addr) +{ + struct int_cntl_head *h = NULL, *end = NULL; + u32_t len = 0; + int nl = 0, nio = 0; + + assert(addr); + madt = addr; + h = (struct int_cntl_head *)(madt + APIC_CNTR_ARR_OFF); + len = *(u32_t *)(madt + APIC_HDR_LEN_OFF); + end = (struct int_cntl_head *)(madt + len); + + printk("\tMADT length %d (base struct %d)\n", len, APIC_CNTR_ARR_OFF); + assert(h <= end); + for (; h < end; h = (struct int_cntl_head *)((char *)h + h->len)) { + /* termination condition */ + assert(h->len >= sizeof(struct int_cntl_head)); + switch (h->type) { + case APIC_CNTL_LAPIC: { + nl++; + lapic_iter((struct lapic_cntl *)h); + break; + } + case APIC_CNTL_IOAPIC: { + nio++; + ioapic_iter((struct ioapic_cntl *)h); + break; + } + case APIC_CNTL_ISO: { + ioapic_int_override((struct intsrcovrride_cntl *)h); + break; + } + default: + /* See 5.2.12 in the ACPI 5.0 Spec */ + printk("\tInterrupt controller type %d: ignoring\n", h->type); + break; + } + } + + printk("\tMADT => LAPICs=%d, IOAPICs=%d\n", nl, nio); + + if (nl < NUM_CPU) { + printk("Number of LAPICs processed =%d not meeting the requirement = %d\n", nl, NUM_CPU); + printk("Please reconfigure NUM_CPU in Composite/HW-BIOS\n"); + 
assert(0); + } +} diff --git a/src/platform/i386/acpi.h b/src/platform/i386/acpi.h new file mode 100644 index 0000000000..a46ed82e7a --- /dev/null +++ b/src/platform/i386/acpi.h @@ -0,0 +1,10 @@ +#ifndef ACPI_H +#define ACPI_H + +void *acpi_find_apic(void); +void *acpi_find_rsdt(void); +void *acpi_find_hpet(void); +void acpi_set_rsdt_page(u32_t); +void acpi_madt_intsrc_iter(unsigned char *); + +#endif /* ACPI_H */ diff --git a/src/platform/i386/apic_cntl.h b/src/platform/i386/apic_cntl.h new file mode 100644 index 0000000000..47f3073698 --- /dev/null +++ b/src/platform/i386/apic_cntl.h @@ -0,0 +1,63 @@ +#ifndef APIC_CNTL_H +#define APIC_CNTL_H + +#define APIC_DEFAULT_PHYS 0xFEE00000 +#define APIC_HDR_LEN_OFF 0x04 +#define APIC_CNTRLR_ADDR_OFF 0x24 +#define APIC_CNTRLR_FLAGS_OFF 0x28 +#define APIC_CNTR_ARR_OFF 0x2C + +/* See 5.2.12 in the ACPI 5.0 Spec */ +enum +{ + APIC_CNTL_LAPIC = 0, + APIC_CNTL_IOAPIC = 1, + APIC_CNTL_ISO = 2, +}; + +struct int_cntl_head { + u8_t type; + u8_t len; +} __attribute__((packed)); + +struct lapic_cntl { + /* type == APIC_CNTL_LAPIC */ + struct int_cntl_head header; + u8_t proc_id; + u8_t apic_id; + u32_t flags; /* 0 = dead processor */ +} __attribute__((packed)); + +struct ioapic_cntl { + /* type == APIC_CNTL_IOAPIC */ + struct int_cntl_head header; + u8_t ioapic_id; + u8_t reserved; + u32_t ioapic_phys_addr; + u32_t glb_int_num_off; /* I/O APIC's interrupt base number offset */ +} __attribute__((packed)); + +struct intsrcovrride_cntl { + /* type == APIC_CNTL_ISO */ + struct int_cntl_head header; + u8_t bus; + u8_t source; + u32_t glb_int_num_off; + u16_t flags; +} __attribute__((packed)); + +enum acpi_madt_iso_polarity { + ACPI_MADT_ISO_POL_CONFORMS = 0, + ACPI_MADT_ISO_POL_ACTHIGH, + ACPI_MADT_ISO_POL_RESERVED, + ACPI_MADT_ISO_POL_ACTLOW, +}; + +enum acpi_madt_iso_trigger { + ACPI_MADT_ISO_TRIG_CONFORMS = 0, + ACPI_MADT_ISO_TRIG_EDGE, + ACPI_MADT_ISO_TRIG_RESERVED, + ACPI_MADT_ISO_TRIG_LEVEL, +}; + +#endif /* APIC_CNTL_H */ diff 
--git a/src/platform/i386/boot_comp.c b/src/platform/i386/boot_comp.c index 82b363de1f..b023d8e471 100644 --- a/src/platform/i386/boot_comp.c +++ b/src/platform/i386/boot_comp.c @@ -9,10 +9,13 @@ #include #include #include +#include +#include extern u8_t *boot_comp_pgd; -void *thd_mem[NUM_CPU], *tcap_mem[NUM_CPU]; +vaddr_t dcb_addr, dcb_uaddr; +void *thd_mem, *tcap_mem; struct captbl *glb_boot_ct; int @@ -34,7 +37,8 @@ boot_pgtbl_mappings_add(struct captbl *ct, capid_t pgdcap, capid_t ptecap, const pgd_cap = (struct cap_pgtbl *)captbl_lkup(ct, pgdcap); if (!pgd_cap || !CAP_TYPECHK(pgd_cap, CAP_PGTBL)) assert(0); pgtbl = (pgtbl_t)pgd_cap->pgtbl; - nptes = boot_nptes(range); + if (!uvm) nptes = boot_nptes(range); + else nptes = boot_nptes(range + COS_SCB_SIZE); ptes = mem_boot_alloc(nptes); assert(ptes); @@ -89,16 +93,18 @@ boot_pgtbl_mappings_add(struct captbl *ct, capid_t pgdcap, capid_t ptecap, const return 0; } -/* FIXME: loops to create threads/tcaps/rcv caps per core. */ static void -kern_boot_thd(struct captbl *ct, void *thd_mem, void *tcap_mem, const cpuid_t cpu_id) +kern_boot_thd(struct captbl *ct, const cpuid_t cpu_id) { + void *tmem = (void *)((vaddr_t)thd_mem + cpu_id * PAGE_SIZE); + void *tcmem = (void *)((vaddr_t)tcap_mem + cpu_id * PAGE_SIZE); + vaddr_t dcbmem = dcb_addr + cpu_id * PAGE_SIZE, dcbumem = dcb_uaddr + cpu_id * PAGE_SIZE; struct cos_cpu_local_info *cos_info = cos_cpu_local_info(); - struct thread * t = thd_mem; - struct tcap * tc = tcap_mem; + struct thread *t = tmem; + struct tcap *tc = tcmem; tcap_res_t expended; int ret; - struct cap_pgtbl * cap_pt; + struct cap_pgtbl *cap_pt; pgtbl_t pgtbl; assert(cpu_id >= 0); @@ -108,16 +114,18 @@ kern_boot_thd(struct captbl *ct, void *thd_mem, void *tcap_mem, const cpuid_t cp cos_info->cpuid = cpu_id; cos_info->invstk_top = 0; cos_info->overflow_check = 0xDEADBEEF; - ret = thd_activate(ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpu_id), thd_mem, BOOT_CAPTBL_SELF_COMP, 0); + ret = 
dcb_activate(ct, BOOT_CAPTBL_SELF_CT, LLBOOT_CAPTBL_INITDCB_CPU(cpu_id), dcbmem, 0, BOOT_CAPTBL_SELF_PT, dcbumem); + assert(!ret); + ret = thd_activate(ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpu_id), tmem, BOOT_CAPTBL_SELF_COMP, 0, LLBOOT_CAPTBL_INITDCB_CPU(cpu_id), 0); assert(!ret); tcap_active_init(cos_info); - ret = tcap_activate(ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cpu_id), tcap_mem); + ret = tcap_activate(ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cpu_id), tcmem); assert(!ret); tc->budget.cycles = TCAP_RES_INF; /* Chronos's got all the time in the world */ tc->perm_prio = 0; - tcap_setprio(tc, 0); /* Chronos gets preempted by no one! */ + tcap_setprio(tc, 0); /* Chronos gets preempted by no one! */ list_enqueue(&cos_info->tcaps, &tc->active_list); /* Chronos on the TCap active list */ cos_info->tcap_uid = 1; cos_info->cycles = tsc(); @@ -131,10 +139,7 @@ kern_boot_thd(struct captbl *ct, void *thd_mem, void *tcap_mem, const cpuid_t cp BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpu_id), BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cpu_id), 0, 1); assert(!ret); - /* - * boot component's mapped into SELF_PT, - * switching to boot component's pgd - */ + /* boot component's mapped into SELF_PT, switching to boot component's pgd. 
*/ cap_pt = (struct cap_pgtbl *)captbl_lkup(ct, BOOT_CAPTBL_SELF_PT); if (!cap_pt || !CAP_TYPECHK(cap_pt, CAP_PGTBL)) assert(0); pgtbl = cap_pt->pgtbl; @@ -152,12 +157,13 @@ kern_boot_comp(const cpuid_t cpu_id) u8_t * boot_comp_captbl; pgtbl_t pgtbl = (pgtbl_t)chal_va2pa(&boot_comp_pgd), boot_vm_pgd; u32_t hw_bitmap = 0xFFFFFFFF; + vaddr_t scb_uaddr = 0, scb_kaddr = 0; assert(cpu_id >= 0); if (NUM_CPU > 1 && cpu_id > 0) { assert(glb_boot_ct); pgtbl_update(pgtbl); - kern_boot_thd(glb_boot_ct, thd_mem[cpu_id], tcap_mem[cpu_id], cpu_id); + kern_boot_thd(glb_boot_ct, cpu_id); return; } @@ -178,11 +184,13 @@ kern_boot_comp(const cpuid_t cpu_id) assert(!ret); } - for (i = 0; i < NUM_CPU; i++) { - thd_mem[i] = mem_boot_alloc(1); - tcap_mem[i] = mem_boot_alloc(1); - assert(thd_mem[i] && tcap_mem[i]); - } + scb_kaddr = (vaddr_t)mem_boot_alloc(1); + assert(scb_kaddr); + + dcb_addr = (vaddr_t)mem_boot_alloc(NUM_CPU); + thd_mem = mem_boot_alloc(NUM_CPU); + tcap_mem = mem_boot_alloc(NUM_CPU); + assert(thd_mem && tcap_mem && dcb_addr); if (captbl_activate_boot(glb_boot_ct, BOOT_CAPTBL_SELF_CT)) assert(0); if (sret_activate(glb_boot_ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SRET)) assert(0); @@ -203,6 +211,9 @@ kern_boot_comp(const cpuid_t cpu_id) ret = boot_pgtbl_mappings_add(glb_boot_ct, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_BOOTVM_PTE, "booter VM", mem_bootc_start(), (unsigned long)mem_bootc_vaddr(), mem_bootc_end() - mem_bootc_start(), 1); assert(ret == 0); + scb_uaddr = (vaddr_t)(mem_bootc_vaddr() + (mem_bootc_end() - mem_bootc_start())); + assert(COS_SCB_SIZE == PAGE_SIZE); + dcb_uaddr = scb_uaddr + COS_SCB_SIZE; /* * This _must_ be the last allocation. 
The bump pointer @@ -218,17 +229,19 @@ kern_boot_comp(const cpuid_t cpu_id) mem_utmem_end() - mem_boot_nalloc_end(nkmemptes), 0); assert(ret == 0); - printk("\tCapability table and page-table created.\n"); - /* Shut off further bump allocations */ glb_memlayout.allocs_avail = 0; + if (scb_activate(glb_boot_ct, BOOT_CAPTBL_SELF_CT, LLBOOT_CAPTBL_SCB, scb_kaddr, 0)) assert(0); - if (comp_activate(glb_boot_ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_PT, 0, - (vaddr_t)mem_bootc_entry(), NULL)) + printk("\tCapability table and page-table created.\n"); + + if (comp_activate(glb_boot_ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_PT, + LLBOOT_CAPTBL_SCB, 0, (vaddr_t)mem_bootc_entry(), scb_uaddr)) assert(0); + printk("\tCreated boot component structure from page-table and capability-table.\n"); - kern_boot_thd(glb_boot_ct, thd_mem[cpu_id], tcap_mem[cpu_id], cpu_id); + kern_boot_thd(glb_boot_ct, cpu_id); printk("\tBoot component initialization complete.\n"); } diff --git a/src/platform/i386/chal/chal_config.h b/src/platform/i386/chal/chal_config.h index 2624ad9fab..1af6cd703f 100644 --- a/src/platform/i386/chal/chal_config.h +++ b/src/platform/i386/chal/chal_config.h @@ -13,8 +13,10 @@ typedef signed int s32_t; typedef signed long long s64_t; #endif +#define HW_IRQ_START 32 + typedef enum { - HW_PERIODIC = 32, /* periodic timer interrupt */ + HW_HPET_PERIODIC = HW_IRQ_START, /* periodic timer interrupt */ HW_KEYBOARD, /* keyboard interrupt */ HW_ID3, HW_ID4, @@ -22,7 +24,7 @@ typedef enum { HW_ID6, HW_ID7, HW_ID8, - HW_ONESHOT, /* onetime timer interrupt */ + HW_HPET_ONESHOT, /* onetime timer interrupt */ HW_ID10, HW_ID11, HW_ID12, diff --git a/src/platform/i386/console.c b/src/platform/i386/console.c deleted file mode 100644 index 7003a5b15b..0000000000 --- a/src/platform/i386/console.c +++ /dev/null @@ -1,152 +0,0 @@ -#define ENABLE_CONSOLE - -#include "io.h" -#include "string.h" -#include 
"isr.h" -#include "kernel.h" - -#define VIDEO_MEM 0xb8000 - -#define VGA_CTL_REG 0x3D4 -#define VGA_DATA_REG 0x3D5 - -#define KEY_DEVICE 0x60 -#define KEY_PENDING 0x64 - -#define COLUMNS 80 -#define LINES 25 - -/* FIXME these should go somewhere else */ -#define BACKSPACE 0x08 -#define TAB 0x09 - -enum vga_colors -{ - BLACK = 0x00, - BLUE, - GREEN, - CYAN, - RED, - MAGENTA, - BROWN, - LIGHT_GREY, - DARK_GREY, - LIGHT_BLUE, - LIGHT_GREEN, - LIGHT_CYAN, - LIGHT_RED, - LIGHT_MAGENTA, - LIGHT_BROWN, - WHITE -}; - -static u16_t *video_mem = (u16_t *)VIDEO_MEM; -static u8_t cursor_x; -static u8_t cursor_y; - -static void -wmemset(void *dst, int c, size_t count) -{ - unsigned short *tmp = (unsigned short *)dst; - - for (; count != 0; count--) *tmp++ = c; -} - -static inline u8_t -gen_color(u8_t forground, u8_t background) -{ - return (background << 4) | (forground & 0x0F); -} - -static void -update_cursor(u8_t row, u8_t col) -{ - u16_t pos = row * COLUMNS + col; - - outb(VGA_CTL_REG, 0x0E); - outb(VGA_DATA_REG, pos >> 8); - outb(VGA_CTL_REG, 0x0F); - outb(VGA_DATA_REG, pos); -} - -static void -scroll(void) -{ - u16_t blank = ((u8_t)' ') | gen_color(WHITE, BLACK); - unsigned i; - - if (cursor_y < LINES) return; - - for (i = 0; i < (LINES - 1) * COLUMNS; i++) video_mem[i] = video_mem[i + COLUMNS]; - - wmemset(video_mem + ((LINES - 1) * COLUMNS), blank, COLUMNS); - cursor_y = LINES - 1; -} - -static void -vga_putch(char c) -{ - u8_t color = gen_color(LIGHT_GREY, BLACK); - u16_t attribute = color << 8; - u16_t *location; - - if (c == BACKSPACE && cursor_x) - cursor_x--; - else if (c == TAB) - cursor_x = (cursor_x + 8) & ~(8 - 1); - else if (c == '\r') - cursor_x = 0; - else if (c == '\n') { - cursor_x = 0; - cursor_y++; - } else if (c >= ' ') { - location = video_mem + (cursor_y * COLUMNS + cursor_x); - *location = c | attribute; - cursor_x++; - } - - if (cursor_x >= COLUMNS) { - cursor_x = 0; - cursor_y++; - } - - scroll(); - update_cursor(cursor_y, cursor_x); -} - -void 
-vga_puts(const char *s) -{ - for (; *s != '\0'; s++) vga_putch(*s); -} - -void -vga_clear(void) -{ - u8_t color = gen_color(WHITE, BLACK); - u16_t blank = ((u8_t)' ') | color << 8; - wmemset(video_mem, blank, COLUMNS * LINES); -} - -int -keyboard_handler(struct pt_regs *regs) -{ - u16_t scancode; - int preempt = 1; - - ack_irq(IRQ_KEYBOARD); - - while (inb(KEY_PENDING) & 2) { - /* wait for keypress to be ready */ - } - scancode = inb(KEY_DEVICE); - printk("Keyboard press: %d\n", scancode); - return preempt; -} - -void -console_init(void) -{ - vga_clear(); - printk_register_handler(vga_puts); -} diff --git a/src/platform/i386/entry.S b/src/platform/i386/entry.S index 9479ed54b1..07b5954aab 100644 --- a/src/platform/i386/entry.S +++ b/src/platform/i386/entry.S @@ -131,7 +131,7 @@ IRQ(smid_float_pt_except_fault) IRQ(virtualization_except_fault) IRQ_CODE(security_except_fault) -IRQ(periodic) +IRQ(hpet_periodic) IRQ(keyboard) IRQ_ID(34) IRQ_ID(35) @@ -139,7 +139,7 @@ IRQ(serial) IRQ_ID(37) IRQ_ID(38) IRQ_ID(39) -IRQ(oneshot) +IRQ(hpet_oneshot) IRQ_ID(41) IRQ_ID(42) IRQ_ID(43) diff --git a/src/platform/i386/exception.c b/src/platform/i386/exception.c index 5b6694c01a..b4d2e4c538 100644 --- a/src/platform/i386/exception.c +++ b/src/platform/i386/exception.c @@ -6,7 +6,6 @@ #include "isr.h" #include "chal_cpu.h" -#define PRINTK(format, ...) 
printk("(CPU%ld:) " format, get_cpuid(), ## __VA_ARGS__) void print_regs_state(struct pt_regs *regs) diff --git a/src/platform/i386/hpet.c b/src/platform/i386/hpet.c index 719ea4af87..840754ef2c 100644 --- a/src/platform/i386/hpet.c +++ b/src/platform/i386/hpet.c @@ -41,20 +41,24 @@ /* Bits in HPET_Tn_CONFIG */ /* 1 << 0 is reserved */ -#define TN_INT_TYPE_CNF (1ll << 1) /* 0 = edge trigger, 1 = level trigger */ -#define TN_INT_ENB_CNF (1ll << 2) /* 0 = no interrupt, 1 = interrupt */ -#define TN_TYPE_CNF (1ll << 3) /* 0 = one-shot, 1 = periodic */ -#define TN_PER_INT_CAP (1ll << 4) /* read only, 1 = periodic supported */ -#define TN_SIZE_CAP (1ll << 5) /* 0 = 32-bit, 1 = 64-bit */ -#define TN_VAL_SET_CNF (1ll << 6) /* set to allow directly setting accumulator */ +#define HPET_TN_INT_TYPE_CNF (1ll << 1) /* 0 = edge trigger, 1 = level trigger */ +#define HPET_TN_INT_ENB_CNF (1ll << 2) /* 0 = no interrupt, 1 = interrupt */ +#define HPET_TN_TYPE_CNF (1ll << 3) /* 0 = one-shot, 1 = periodic */ +#define HPET_TN_PER_INT_CAP (1ll << 4) /* read only, 1 = periodic supported */ +#define HPET_TN_SIZE_CAP (1ll << 5) /* 0 = 32-bit, 1 = 64-bit */ +#define HPET_TN_VAL_SET_CNF (1ll << 6) /* set to allow directly setting accumulator */ /* 1 << 7 is reserved */ -#define TN_32MODE_CNF (1ll << 8) /* 1 = force 32-bit access to 64-bit timer */ -/* #define TN_INT_ROUTE_CNF (1<<9:1<<13)*/ /* routing for interrupt */ -#define TN_FSB_EN_CNF (1ll << 14) /* 1 = deliver interrupts via FSB instead of APIC */ -#define TN_FSB_INT_DEL_CAP (1ll << 15) /* read only, 1 = FSB delivery available */ +#define HPET_TN_32MODE_CNF (1ll << 8) /* 1 = force 32-bit access to 64-bit timer */ +#define HPET_TN_INT_ROUTE_CNF (9) /* routing for interrupt */ +#define HPET_TN_FSB_EN_CNF (1ll << 14) /* 1 = deliver interrupts via FSB instead of APIC */ +#define HPET_TN_FSB_INT_DEL_CAP (1ll << 15) /* read only, 1 = FSB delivery available */ #define HPET_INT_ENABLE(n) (*hpet_interrupt = (0x1 << n)) /* Clears the INT n for 
level-triggered mode. */ +/* vector for interrupts */ +#define HPET_PERIODIC_VEC 0ll +#define HPET_ONESHOT_VEC 8ll + static volatile u32_t *hpet_capabilities; static volatile u64_t *hpet_config; static volatile u64_t *hpet_interrupt; @@ -69,7 +73,7 @@ volatile struct hpet_timer { /* * When determining how many CPU cycles are in a HPET tick, we must - * execute a number of periodic ticks (TIMER_CALIBRATION_ITER) at a + * execute a number of periodic ticks (HPET_CALIBRATION_ITER) at a * controlled interval, and use the HPET tick granularity to compute * how many CPU cycles per HPET tick there are. Unfortunately, this * can be quite low (e.g. HPET tick of 10ns, CPU tick of 2ns) leading @@ -79,33 +83,36 @@ volatile struct hpet_timer { * Practically, this will lead to the divisor in the conversion being * smaller than it should be, thus causing timers to go off _later_ * than they should. Thus we use a multiplicative factor - * (TIMER_ERROR_BOUND_FACTOR) to lessen the rounding error. + * (HPET_ERROR_BOUND_FACTOR) to lessen the rounding error. 
* * All of the hardware is documented in the HPET specification @ * http://www.intel.com/content/dam/www/public/us/en/documents/technical-specifications/software-developers-hpet-spec-1-0a.pdf */ -#define PICO_PER_MICRO 1000000UL -#define FEMPTO_PER_PICO 1000UL -#define TIMER_CALIBRATION_ITER 256 -#define TIMER_ERROR_BOUND_FACTOR 256 -static int timer_calibration_init = 0; -static unsigned long timer_cycles_per_hpetcyc = TIMER_ERROR_BOUND_FACTOR; -static unsigned long cycles_per_tick; -static unsigned long hpetcyc_per_tick; #define ULONG_MAX 4294967295UL +#define HPET_PICO_PER_MICRO 1000000UL +#define HPET_FEMPTO_PER_PICO 1000UL +#define HPET_CALIBRATION_ITER 256 +#define HPET_ERROR_BOUND_FACTOR 256 +#define HPET_DEFAULT_PERIOD_US 1000 /* US = microseconds */ +static int hpet_calibration_init = 0; +static unsigned long hpet_cpucyc_per_hpetcyc = HPET_ERROR_BOUND_FACTOR; +static unsigned long hpet_cpucyc_per_tick; +static unsigned long hpet_hpetcyc_per_tick; +static unsigned long hpet_periodicity_curr[2] = { 0 }; +static cycles_t hpet_first_hpet_period = 0; /* for timer 0 = HPET_PERIODIC */ extern u32_t chal_msr_mhz; static inline u64_t -timer_cpu2hpet_cycles(u64_t cycles) +hpet_cpu2hpet_cycles(u64_t cycles) { unsigned long cyc; /* demote precision to enable word-sized math */ cyc = (unsigned long)cycles; - if (unlikely((u64_t)cyc < cycles)) cyc= ULONG_MAX; + if (unlikely((u64_t)cyc < cycles)) cyc = ULONG_MAX; /* convert from CPU cycles to HPET cycles */ - cyc = (cyc / timer_cycles_per_hpetcyc) * TIMER_ERROR_BOUND_FACTOR; + cyc = (cyc / hpet_cpucyc_per_hpetcyc) * HPET_ERROR_BOUND_FACTOR; /* promote the precision to interact with the hardware correctly */ cycles = cyc; @@ -113,7 +120,7 @@ timer_cpu2hpet_cycles(u64_t cycles) } static void -timer_disable(timer_type_t timer_type) +hpet_disable(hpet_type_t timer_type) { /* Disable timer interrupts */ *hpet_config &= ~HPET_ENABLE_CNF; @@ -127,10 +134,10 @@ timer_disable(timer_type_t timer_type) } static void 
-timer_calibration(void) +hpet_calibration(void) { - static int cnt = 0; - static u64_t cycle = 0, tot = 0, prev; + static int cnt = 0; + static u64_t cycle = 0, tot = 0, prev; static u32_t apic_curr = 0, apic_tot = 0, apic_prev; /* calibration only on BSP */ @@ -145,30 +152,31 @@ timer_calibration(void) tot += cycle - prev; apic_tot += (apic_prev - apic_curr); } - if (cnt >= TIMER_CALIBRATION_ITER) { - assert(hpetcyc_per_tick); - timer_calibration_init = 0; - cycles_per_tick = (unsigned long)(tot / TIMER_CALIBRATION_ITER); - assert(cycles_per_tick > hpetcyc_per_tick); + if (cnt >= HPET_CALIBRATION_ITER) { + assert(hpet_hpetcyc_per_tick); + hpet_calibration_init = 0; + hpet_cpucyc_per_tick = (unsigned long)(tot / HPET_CALIBRATION_ITER); + assert(hpet_cpucyc_per_tick > hpet_hpetcyc_per_tick); if (lapic_timer_calib_init) { u32_t cycs_to_apic_ratio = 0, apic_cycs_per_tick = 0; - apic_cycs_per_tick = apic_tot / TIMER_CALIBRATION_ITER; + apic_cycs_per_tick = apic_tot / HPET_CALIBRATION_ITER; assert(apic_cycs_per_tick); - cycs_to_apic_ratio = cycles_per_tick / apic_cycs_per_tick; + cycs_to_apic_ratio = hpet_cpucyc_per_tick / apic_cycs_per_tick; lapic_timer_calibration(cycs_to_apic_ratio); } /* Possibly significant rounding error here. 
Bound by the factor */ - timer_cycles_per_hpetcyc = (TIMER_ERROR_BOUND_FACTOR * cycles_per_tick) / hpetcyc_per_tick; + hpet_cpucyc_per_hpetcyc = (HPET_ERROR_BOUND_FACTOR * hpet_cpucyc_per_tick) / hpet_hpetcyc_per_tick; printk("Timer calibrated:\n\tCPU cycles per HPET tick: %ld\n\tHPET ticks in %d us: %ld\n", - timer_cycles_per_hpetcyc / TIMER_ERROR_BOUND_FACTOR, TIMER_DEFAULT_US_INTERARRIVAL, - hpetcyc_per_tick); + hpet_cpucyc_per_hpetcyc / HPET_ERROR_BOUND_FACTOR, HPET_DEFAULT_PERIOD_US, + hpet_hpetcyc_per_tick); - timer_disable(TIMER_PERIODIC); - timer_disable(TIMER_PERIODIC); + hpet_disable(HPET_PERIODIC); + hpet_disable(HPET_PERIODIC); + chal_irq_disable(HW_HPET_PERIODIC, 0); } cnt++; } @@ -176,57 +184,70 @@ timer_calibration(void) int chal_cyc_usec(void) { - if (lapic_timer_calib_init) return 0; + if (unlikely(lapic_timer_calib_init || hpet_calibration_init)) return 0; + + if (likely(hpet_cpucyc_per_tick)) return hpet_cpucyc_per_tick / HPET_DEFAULT_PERIOD_US; - return cycles_per_tick / TIMER_DEFAULT_US_INTERARRIVAL; + return 0; } int -periodic_handler(struct pt_regs *regs) +hpet_periodic_handler(struct pt_regs *regs) { int preempt = 1; +static int count = 0; + + lapic_ack(); + if (unlikely(hpet_calibration_init)) hpet_calibration(); + if (unlikely(hpet_periodicity_curr[HPET_PERIODIC] && !hpet_first_hpet_period)) { + count++; - if (unlikely(timer_calibration_init)) timer_calibration(); + if (count < 25) goto done; + rdtscll(hpet_first_hpet_period); + } - ack_irq(HW_PERIODIC); - preempt = cap_hw_asnd(&hw_asnd_caps[HW_PERIODIC], regs); - HPET_INT_ENABLE(TIMER_PERIODIC); + preempt = cap_hw_asnd(&hw_asnd_caps[get_cpuid()][HW_HPET_PERIODIC], regs); +done: + HPET_INT_ENABLE(HPET_PERIODIC); return preempt; } -extern int timer_process(struct pt_regs *regs); - int -oneshot_handler(struct pt_regs *regs) +hpet_oneshot_handler(struct pt_regs *regs) { int preempt = 1; - ack_irq(HW_ONESHOT); - preempt = timer_process(regs); - HPET_INT_ENABLE(TIMER_ONESHOT); + 
assert(!hpet_calibration_init); + + lapic_ack(); + preempt = cap_hw_asnd(&hw_asnd_caps[get_cpuid()][HW_HPET_ONESHOT], regs); + HPET_INT_ENABLE(HPET_ONESHOT); return preempt; } void -timer_set(timer_type_t timer_type, u64_t cycles) +hpet_set(hpet_type_t timer_type, u64_t cycles) { - u64_t outconfig = TN_INT_TYPE_CNF | TN_INT_ENB_CNF; + u64_t outconfig = HPET_TN_INT_TYPE_CNF | HPET_TN_INT_ENB_CNF; /* Disable timer interrupts */ *hpet_config &= ~HPET_ENABLE_CNF; /* Reset main counter */ - if (timer_type == TIMER_ONESHOT) { - cycles = timer_cpu2hpet_cycles(cycles); + if (timer_type == HPET_ONESHOT) { + cycles = hpet_cpu2hpet_cycles(cycles); /* Set a static value to count up to */ hpet_timers[timer_type].config = outconfig; + hpet_timers[timer_type].config |= HPET_ONESHOT_VEC << HPET_TN_INT_ROUTE_CNF; cycles += HPET_COUNTER; } else { /* Set a periodic value */ - hpet_timers[timer_type].config = outconfig | TN_TYPE_CNF | TN_VAL_SET_CNF; + hpet_timers[timer_type].config = outconfig | HPET_TN_TYPE_CNF | HPET_TN_VAL_SET_CNF; + /* Set the interrupt vector for periodic timer */ + hpet_timers[timer_type].config |= HPET_PERIODIC_VEC << HPET_TN_INT_ROUTE_CNF; /* Reset main counter */ HPET_COUNTER = 0x00; } @@ -237,7 +258,7 @@ timer_set(timer_type_t timer_type, u64_t cycles) } u64_t -timer_find_hpet(void *timer) +hpet_find(void *timer) { u32_t i; unsigned char sum = 0; @@ -264,7 +285,55 @@ timer_find_hpet(void *timer) } void -timer_set_hpet_page(u32_t page) +chal_hpet_periodic_set(hwid_t hwid, unsigned long usecs_period) +{ + hpet_type_t type = 0; + + assert(hwid == HW_HPET_PERIODIC); + type = HPET_PERIODIC; + + if (hpet_periodicity_curr[type] != usecs_period) { + hpet_disable(type); + hpet_disable(type); + + hpet_periodicity_curr[type] = 0; + } + + if (hpet_periodicity_curr[type] == 0) { + unsigned long tick_multiple = 0; + cycles_t hpetcyc_per_period = 0; + + assert(hpet_calibration_init == 0); + assert((usecs_period >= HPET_DEFAULT_PERIOD_US) && (usecs_period % 
HPET_DEFAULT_PERIOD_US == 0)); + + tick_multiple = usecs_period / HPET_DEFAULT_PERIOD_US; + hpetcyc_per_period = (cycles_t)hpet_hpetcyc_per_tick * (cycles_t)tick_multiple; + hpet_periodicity_curr[type] = usecs_period; + if (type == HPET_PERIODIC) hpet_first_hpet_period = 0; + hpet_set(type, hpetcyc_per_period); + chal_irq_enable(HW_HPET_PERIODIC, 0); + printk("Setting HPET [%u:%u] Periodicity:%lu hpetcyc_per_period:%llu\n", hwid, type, usecs_period, hpetcyc_per_period); + } +} + +cycles_t +chal_hpet_first_period(void) +{ + return hpet_first_hpet_period; +} + +void +chal_hpet_disable(hwid_t hwid) +{ + printk("Disabling HPET %u\n", hwid); + hpet_type_t type = (hwid == HW_HPET_PERIODIC ? HPET_PERIODIC : HPET_ONESHOT); + + hpet_disable(type); + hpet_disable(type); +} + +void +hpet_set_page(u32_t page) { hpet = (void *)(page * (1 << 22) | ((u32_t)hpet & ((1 << 22) - 1))); hpet_capabilities = (u32_t *)((unsigned char *)hpet + HPET_CAPABILITIES); @@ -276,17 +345,24 @@ timer_set_hpet_page(u32_t page) } void -timer_init(void) +hpet_init(void) { unsigned long pico_per_hpetcyc; assert(hpet_capabilities); - pico_per_hpetcyc = hpet_capabilities[1] - / FEMPTO_PER_PICO; /* bits 32-63 are # of femptoseconds per HPET clock tick */ - hpetcyc_per_tick = (TIMER_DEFAULT_US_INTERARRIVAL * PICO_PER_MICRO) / pico_per_hpetcyc; + /* bits 32-63 are # of femptoseconds per HPET clock tick */ + pico_per_hpetcyc = hpet_capabilities[1] / HPET_FEMPTO_PER_PICO; + hpet_hpetcyc_per_tick = (HPET_DEFAULT_PERIOD_US * HPET_PICO_PER_MICRO) / pico_per_hpetcyc; printk("Enabling timer @ %p with tick granularity %ld picoseconds\n", hpet, pico_per_hpetcyc); - /* Enable legacy interrupt routing */ + + /* + * FIXME: For some reason, setting to non-legacy mode isn't working well. + * Periodicity of the HPET fired is wrong and any interval configuration + * is still producing the same wrong interval timing. + * + * So, Enable legacy interrupt routing like we had before! 
+ */ *hpet_config |= HPET_LEG_RT_CNF; /* @@ -294,13 +370,15 @@ timer_init(void) * specification is in hpet cycles (not cpu cycles). */ if (chal_msr_mhz && !lapic_timer_calib_init) { - cycles_per_tick = chal_msr_mhz * TIMER_DEFAULT_US_INTERARRIVAL; - timer_cycles_per_hpetcyc = cycles_per_tick / hpetcyc_per_tick; + hpet_cpucyc_per_tick = chal_msr_mhz * HPET_DEFAULT_PERIOD_US; + hpet_cpucyc_per_hpetcyc = hpet_cpucyc_per_tick / hpet_hpetcyc_per_tick; printk("Timer not calibrated, instead computed using MSR frequency value\n"); return; } - timer_calibration_init = 1; - timer_set(TIMER_PERIODIC, hpetcyc_per_tick); + hpet_calibration_init = 1; + hpet_set(HPET_PERIODIC, hpet_hpetcyc_per_tick); + chal_irq_enable(HW_HPET_PERIODIC, 0); + chal_irq_enable(HW_HPET_ONESHOT, 0); } diff --git a/src/platform/i386/hpet.h b/src/platform/i386/hpet.h new file mode 100644 index 0000000000..f6aa186ce8 --- /dev/null +++ b/src/platform/i386/hpet.h @@ -0,0 +1,14 @@ +#ifndef HPET_H +#define HPET_H + +typedef enum { + HPET_PERIODIC = 0, + HPET_ONESHOT = 1, +} hpet_type_t; + +void hpet_set(hpet_type_t timer_type, u64_t cycles); +void hpet_init(void); +u64_t hpet_find(void *timer); +void hpet_set_page(u32_t page); + +#endif /* HPET_H */ diff --git a/src/platform/i386/idt.c b/src/platform/i386/idt.c index 0d79f8c675..821806bfee 100644 --- a/src/platform/i386/idt.c +++ b/src/platform/i386/idt.c @@ -3,31 +3,6 @@ #include "isr.h" #include "chal/io.h" -/* Information taken from: http://wiki.osdev.org/PIC */ -/* FIXME: Remove magic numbers and replace with this */ -#define PIC1 0x20 -#define PIC2 0xA0 -#define PIC1_COMMAND PIC1 -#define PIC1_DATA (PIC1 + 1) -#define PIC2_COMMAND PIC2 -#define PIC2_DATA (PIC2 + 1) - -/* reinitialize the PIC controllers, giving them specified vector offsets - rather than 8 and 70, as configured by default */ - -#define ICW1_ICW4 0x01 /* ICW4 (not) needed */ -#define ICW1_SINGLE 0x02 /* Single (cascade) mode */ -#define ICW1_INTERVAL4 0x04 /* Call address interval 4 (8) 
*/ -#define ICW1_LEVEL 0x08 /* Level triggered (edge) mode */ -#define ICW1_INIT 0x10 /* Initialization - required! */ - -#define ICW4_8086 0x01 /* 8086/88 (MCS-80/85) mode */ -#define ICW4_AUTO 0x02 /* Auto (normal) EOI */ -#define ICW4_BUF_SLAVE 0x08 /* Buffered mode/slave */ -#define ICW4_BUF_MASTER 0x0C /* Buffered mode/master */ -#define ICW4_SFNM 0x10 /* Special fully nested (not) */ -#define ICW1_ICW4 0x01 - struct idt_entry { u16_t base_lo; // Lower 16 bits of address to jump too after int u16_t sel; // Kernel segment selector @@ -73,42 +48,36 @@ hw_handler(struct pt_regs *regs) * TODO: ack here? or * after user-level interrupt(rcv event) processing? */ - ack_irq(regs->orig_ax); - preempt = cap_hw_asnd(&hw_asnd_caps[regs->orig_ax], regs); + lapic_ack(); + preempt = cap_hw_asnd(&hw_asnd_caps[get_cpuid()][regs->orig_ax], regs); return preempt; } -#if 0 -static inline void -remap_irq_table(void) -{ - u8_t pic1_mask; - u8_t pic2_mask; - - // Save masks - pic1_mask = inb(PIC1_DATA); - pic2_mask = inb(PIC2_DATA); -} -#endif - void idt_init(const cpuid_t cpu_id) { + struct { + unsigned short length; + unsigned long base; + } __attribute__((__packed__)) idtr; + + if (cpu_id != INIT_CORE) goto update; + idt_ptr.limit = (sizeof(struct idt_entry) * NUM_IDT_ENTRIES) - 1; idt_ptr.base = (u32_t)&(idt_entries); memset(&(idt_entries), 0, sizeof(struct idt_entry) * NUM_IDT_ENTRIES); - outb(0x20, 0x11); - outb(0xA0, 0x11); - outb(0x21, 0x20); - outb(0xA1, 0x28); - outb(0x21, 0x04); - outb(0xA1, 0x02); - outb(0x21, 0x01); - outb(0xA1, 0x01); - outb(0x21, 0x0); - outb(0xA1, 0x0); + outb(0x20, 0x11); + outb(0xA0, 0x11); + outb(0x21, 0x20); + outb(0xA1, 0x28); + outb(0x21, 0x04); + outb(0xA1, 0x02); + outb(0x21, 0x01); + outb(0xA1, 0x01); + outb(0x21, 0x0); + outb(0xA1, 0x0); idt_set_gate(IRQ_DIV_BY_ZERO_ERR_FAULT, (u32_t)div_by_zero_err_fault_irq, 0x08, 0x8E); idt_set_gate(IRQ_DEBUG_TRAP, (u32_t)debug_trap_irq, 0x08, 0x8E); @@ -130,7 +99,7 @@ idt_init(const cpuid_t cpu_id) 
idt_set_gate(IRQ_VIRTUALIZATION_EXCEPT_FAULT, (u32_t)virtualization_except_fault_irq, 0x08, 0x8E); idt_set_gate(IRQ_SECURITY_EXCEPT_FAULT, (u32_t)security_except_fault_irq, 0x08, 0x8E); - idt_set_gate(HW_PERIODIC, (u32_t)periodic_irq, 0x08, 0x8E); + idt_set_gate(HW_HPET_PERIODIC, (u32_t)hpet_periodic_irq, 0x08, 0x8E); idt_set_gate(HW_KEYBOARD, (u32_t)keyboard_irq, 0x08, 0x8E); idt_set_gate(HW_ID3, (u32_t)handler_hw_34, 0x08, 0x8E); idt_set_gate(HW_ID4, (u32_t)handler_hw_35, 0x08, 0x8E); @@ -138,7 +107,7 @@ idt_init(const cpuid_t cpu_id) idt_set_gate(HW_ID6, (u32_t)handler_hw_37, 0x08, 0x8E); idt_set_gate(HW_ID7, (u32_t)handler_hw_38, 0x08, 0x8E); idt_set_gate(HW_ID8, (u32_t)handler_hw_39, 0x08, 0x8E); - idt_set_gate(HW_ONESHOT, (u32_t)oneshot_irq, 0x08, 0x8E); + idt_set_gate(HW_HPET_ONESHOT, (u32_t)hpet_oneshot_irq, 0x08, 0x8E); idt_set_gate(HW_ID10, (u32_t)handler_hw_41, 0x08, 0x8E); idt_set_gate(HW_ID11, (u32_t)handler_hw_42, 0x08, 0x8E); idt_set_gate(HW_ID12, (u32_t)handler_hw_43, 0x08, 0x8E); @@ -165,11 +134,7 @@ idt_init(const cpuid_t cpu_id) idt_set_gate(HW_LAPIC_IPI_ASND, (u32_t)lapic_ipi_asnd_irq, 0x08, 0x8E); idt_set_gate(HW_LAPIC_TIMER, (u32_t)lapic_timer_irq, 0x08, 0x8E); - struct { - unsigned short length; - unsigned long base; - } __attribute__((__packed__)) idtr; - +update: idtr.length = idt_ptr.limit; idtr.base = (unsigned long)(&(idt_entries)); diff --git a/src/platform/i386/ioapic.c b/src/platform/i386/ioapic.c new file mode 100644 index 0000000000..1ae8231b7c --- /dev/null +++ b/src/platform/i386/ioapic.c @@ -0,0 +1,393 @@ +#include "kernel.h" +#include "ioapic.h" +#include "pic.h" + +#define IOAPIC_MAX 4 +#define IOAPIC_INT_ISA_MAX 16 /* ACPI 5.0 spec: only ISA interrupts can have overrides */ + +#define IOAPIC_IOAPICID 0x00 +#define IOAPIC_IOAPICVER 0x01 +#define IOAPIC_IOAPICARB 0x02 + +#define IOAPIC_IOREGSEL 0x00 +#define IOAPIC_IOWIN (IOAPIC_IOREGSEL + 0x10) +#define IOAPIC_IOREDTBL 0x10 +#define IOAPIC_IOREDTBL_OFFSET(n) (IOAPIC_IOREDTBL + 
2*n) + +#define IOAPIC_INT_DISABLED (1<<16) + +enum ioapic_deliverymode +{ + IOAPIC_DELIV_FIXED = 0, + IOAPIC_DELIV_LOWEST = 1, + IOAPIC_DELIV_SMI = 2, + IOAPIC_DELIV_NMI = 4, + IOAPIC_DELIV_INIT = 5, + IOAPIC_DELIV_EXTINT = 7, +}; + +enum ioapic_dstmode +{ + IOAPIC_DST_PHYSICAL = 0, + IOAPIC_DST_LOGICAL = 1, +}; + +enum ioapic_pinpolarity +{ + IOAPIC_POL_ACTHIGH = 0, + IOAPIC_POL_ACTLOW = 1, +}; + +enum ioapic_triggermode +{ + IOAPIC_TRIGGER_EDGE = 0, + IOAPIC_TRIGGER_LEVEL = 1, +}; + +struct ioapic_info { + unsigned int ioapicid; + volatile void *io_vaddr; + int nentries; + int glbint_base; +}; + +union ioapic_int_redir_entry { + struct { + u64_t vector: 8; + u64_t delivmod: 3; + u64_t destmod: 1; + u64_t delivsts: 1; + u64_t polarity: 1; + u64_t remoteirr: 1; + u64_t trigger: 1; + u64_t mask: 1; + u64_t reserved: 39; + u64_t destination: 8; + }; + struct { + u32_t low_dword; + u32_t high_dword; + }; +}; + +struct ioapic_isa_override { + int source; + int gsi; + union { + struct { + u16_t polarity:2; + u16_t trigger:2; + u16_t reserved:12; + }; + u16_t flags; + }; +}; + +static struct ioapic_info ioapicinfo[IOAPIC_MAX] = { { 0, NULL, 0, 0} }; +static unsigned int ioapic_count; +static struct ioapic_isa_override ioapic_isainfo[IOAPIC_INT_ISA_MAX]; +static unsigned int ioapic_isaoverride_count; +static unsigned int ioapic_int_count; + +static union ioapic_int_redir_entry ioapic_int_isa_tmpl = { + .delivmod = IOAPIC_DELIV_FIXED, + .destmod = IOAPIC_DST_LOGICAL, + .polarity = IOAPIC_POL_ACTHIGH, + .trigger = IOAPIC_TRIGGER_EDGE, + .mask = 1, +}; + +static union ioapic_int_redir_entry ioapic_int_pci_tmpl = { + .delivmod = IOAPIC_DELIV_FIXED, + .destmod = IOAPIC_DST_LOGICAL, + .polarity = IOAPIC_POL_ACTLOW, + .trigger = IOAPIC_TRIGGER_EDGE, /* ref. 
barrelfish doesn't use level */ + .mask = 1, +}; + +void +ioapic_set_page(struct ioapic_info *io, u32_t page) +{ + io->io_vaddr = (volatile u32_t *)(page * (1 << 22) | ((u32_t)io->io_vaddr & ((1 << 22) - 1))); + + printk("\tSet IOAPIC %d @ %p\n", io->ioapicid, io->io_vaddr); +} + +static void +ioapic_reg_write(struct ioapic_info *io, u8_t offset, u32_t val) +{ + *(volatile u32_t *)(io->io_vaddr + IOAPIC_IOREGSEL) = offset; + *(volatile u32_t *)(io->io_vaddr + IOAPIC_IOWIN) = val; +} + +static u32_t +ioapic_reg_read(struct ioapic_info *io, u8_t offset) +{ + *(volatile u32_t *)(io->io_vaddr + IOAPIC_IOREGSEL) = offset; + + return *(volatile u32_t *)(io->io_vaddr + IOAPIC_IOWIN); +} + +static struct ioapic_info * +ioapic_findbygsi(int gsi) +{ + unsigned int i = 0; + + for (; i < ioapic_count; i++) { + if (gsi >= ioapicinfo[i].glbint_base && gsi < ioapicinfo[i].glbint_base + ioapicinfo[i].nentries) return &ioapicinfo[i]; + } + + return NULL; +} + +static struct ioapic_info * +ioapic_findbyid(int id) +{ + unsigned int i = 0; + + for (; i < ioapic_count; i++) { + if (id == (int)(ioapicinfo[i].ioapicid)) return &ioapicinfo[i]; + } + + return NULL; +} + +static inline void +ioapic_int_entry_write(struct ioapic_info *io, u8_t off, union ioapic_int_redir_entry entry) +{ + int tmpoff = IOAPIC_IOREDTBL_OFFSET(off); + + ioapic_reg_write(io, tmpoff, entry.low_dword); + ioapic_reg_write(io, tmpoff+1, entry.high_dword); +} + +static inline union ioapic_int_redir_entry +ioapic_int_entry_read(struct ioapic_info *io, u8_t off) +{ + union ioapic_int_redir_entry entry; + int tmpoff = IOAPIC_IOREDTBL_OFFSET(off); + + entry.low_dword = ioapic_reg_read(io, tmpoff); + entry.high_dword = ioapic_reg_read(io, tmpoff+1); + + return entry; +} + +static inline void +ioapic_int_mask_set(int gsi, int mask, int dest) +{ + struct ioapic_info *io = ioapic_findbygsi(gsi); + union ioapic_int_redir_entry entry; + u8_t off; + + if (!io) return; + + off = gsi - io->glbint_base; + entry = ioapic_int_entry_read(io, off); + entry.mask 
= mask ? 1 : 0; + entry.destination = dest; + ioapic_int_entry_write(io, off, entry); + entry = ioapic_int_entry_read(io, off); +} + +static inline int +ioapic_int_gsi(int gsi) +{ + int override_gsi = gsi; + int i; + + if (gsi < IOAPIC_INT_ISA_MAX) { + for (i = 0; i < (int)ioapic_isaoverride_count; i++) { + if (ioapic_isainfo[i].source == gsi && ioapic_isainfo[i].gsi != gsi) { + override_gsi = ioapic_isainfo[i].gsi; + break; + } + } + } + + return override_gsi; +} + +void +ioapic_int_mask(int gsi) +{ + /* clear destination when masking */ + ioapic_int_mask_set(ioapic_int_gsi(gsi), 1, 0); +} + +void +ioapic_int_unmask(int gsi, int dest) +{ + ioapic_int_mask_set(ioapic_int_gsi(gsi), 0, dest); +} + +void +ioapic_int_override(struct intsrcovrride_cntl *iso) +{ + union ioapic_int_redir_entry entry = ioapic_int_isa_tmpl; + struct ioapic_info *iogsi = NULL, *iosrc = NULL; + + assert(iso->header.len == sizeof(struct intsrcovrride_cntl)); + + assert(iso->source < IOAPIC_INT_ISA_MAX); + assert(ioapic_isaoverride_count < IOAPIC_INT_ISA_MAX); + + if (iso->source != iso->glb_int_num_off) { + union ioapic_int_redir_entry srcentry = ioapic_int_isa_tmpl; + + iosrc = ioapic_findbygsi(iso->source); + assert(iosrc); + srcentry.vector = iso->glb_int_num_off + HW_IRQ_START; + ioapic_int_entry_write(iosrc, iso->source - iosrc->glbint_base, srcentry); + + ioapic_isainfo[ioapic_isaoverride_count].source = iso->glb_int_num_off; + ioapic_isainfo[ioapic_isaoverride_count].gsi = iso->source; + ioapic_isainfo[ioapic_isaoverride_count].flags = 0; + ioapic_isaoverride_count++; + } + + ioapic_isainfo[ioapic_isaoverride_count].source = iso->source; + ioapic_isainfo[ioapic_isaoverride_count].gsi = iso->glb_int_num_off; + ioapic_isainfo[ioapic_isaoverride_count].flags = iso->flags; + + printk("\tINT Override %u to %u, polarity: %u trigger: %u\n", iso->source, iso->glb_int_num_off, + ioapic_isainfo[ioapic_isaoverride_count].polarity, ioapic_isainfo[ioapic_isaoverride_count].trigger); + + 
switch(ioapic_isainfo[ioapic_isaoverride_count].trigger) { + case ACPI_MADT_ISO_TRIG_CONFORMS: break; + case ACPI_MADT_ISO_TRIG_EDGE: entry.trigger = IOAPIC_TRIGGER_EDGE; break; + case ACPI_MADT_ISO_TRIG_RESERVED: assert(0); break; + case ACPI_MADT_ISO_TRIG_LEVEL: entry.trigger = IOAPIC_TRIGGER_EDGE; break; /* XXX: should be level */ + default: break; + } + + switch(ioapic_isainfo[ioapic_isaoverride_count].polarity) { + case ACPI_MADT_ISO_POL_CONFORMS: break; + case ACPI_MADT_ISO_POL_ACTHIGH: entry.polarity = IOAPIC_POL_ACTHIGH; break; + case ACPI_MADT_ISO_POL_RESERVED: assert(0); break; + case ACPI_MADT_ISO_POL_ACTLOW: entry.polarity = IOAPIC_POL_ACTLOW; break; + default: break; + } + + entry.vector = iso->source + HW_IRQ_START; + iogsi = ioapic_findbygsi(iso->glb_int_num_off); + assert(iogsi); + + ioapic_int_entry_write(iogsi, iso->glb_int_num_off - iogsi->glbint_base, entry); + + ioapic_isaoverride_count++; +} + +void +ioapic_iter(struct ioapic_cntl *io) +{ + u32_t ver; + int ioent, j; + static int more = 0; + unsigned int tmp_count = ioapic_count; + + assert(io); + + if (ioapic_count == IOAPIC_MAX) { + more ++; + printk("\t%d more than %d IOAPICs present..\n", more, IOAPIC_MAX); + + return; + } + + ioapic_count ++; + ioapicinfo[tmp_count].io_vaddr = (volatile void *)(io->ioapic_phys_addr); + ioapicinfo[tmp_count].ioapicid = io->ioapic_id; + ioapic_set_page(&(ioapicinfo[tmp_count]), vm_map_superpage((u32_t)(ioapicinfo[tmp_count].io_vaddr), 0)); + + ver = ioapic_reg_read(&ioapicinfo[tmp_count], IOAPIC_IOAPICVER); + ioent = ((ver >> 16) & 0xFF) + 1; + printk("\tIOAPIC %d (counter:%d): Number of entries = %d\n", io->ioapic_id, tmp_count, ioent); + + ioapicinfo[tmp_count].nentries = ioent; + ioapicinfo[tmp_count].glbint_base = io->glb_int_num_off; + ioapic_int_count += ioent; + + for (j = 0; j < ioent; j++) { + union ioapic_int_redir_entry entry = (io->glb_int_num_off + j) < IOAPIC_INT_ISA_MAX ? 
ioapic_int_isa_tmpl : ioapic_int_pci_tmpl; + + entry.vector = io->glb_int_num_off + j + HW_IRQ_START; + + ioapic_int_entry_write(&ioapicinfo[tmp_count], j, entry); + } +} + +int +chal_irq_enable(int irq, cpuid_t cpu_id) +{ + int gsi = ioapic_int_gsi(irq - HW_IRQ_START); + struct ioapic_info *io = ioapic_findbygsi(gsi); + union ioapic_int_redir_entry entry; + u8_t off; + + if (!io) return -EINVAL; + + off = gsi - io->glbint_base; + entry = ioapic_int_entry_read(io, off); + + /* the destination bitmap is 8 bits */ + if (irq - HW_IRQ_START >= (int)ioapic_int_count || cpu_id > 7) return -EINVAL; + + /* irq should be masked or in logical mode */ + assert(entry.mask || entry.destmod == IOAPIC_DST_LOGICAL); + + /* if irq is masked, destination should be 0 */ + assert(!entry.mask || !entry.destination); + + ioapic_int_unmask(irq - HW_IRQ_START, entry.destination | (u8_t)logical_apicids[cpu_id]); + + return 0; +} + +int +chal_irq_disable(int irq, cpuid_t cpu_id) +{ + int gsi = ioapic_int_gsi(irq - HW_IRQ_START); + struct ioapic_info *io = ioapic_findbygsi(gsi); + union ioapic_int_redir_entry entry; + u8_t off; + + if (!io) return -EINVAL; + + off = gsi - io->glbint_base; + entry = ioapic_int_entry_read(io, off); + + /* the destination bitmap is 8 bits */ + if (irq - HW_IRQ_START >= (int)ioapic_int_count || cpu_id > 7) return -EINVAL; + + assert(entry.mask || entry.destmod == IOAPIC_DST_LOGICAL); + + /* we should disable the irq if we remove the last core */ + if (!(entry.destination & ~logical_apicids[cpu_id])) { + ioapic_int_mask(irq - HW_IRQ_START); + return 0; + } + + ioapic_int_unmask(irq - HW_IRQ_START, entry.destination & ~logical_apicids[cpu_id]); + return 0; +} + +void +ioapic_init(void) +{ + assert(ioapic_count); + pic_disable(); + + printk("Setting up IOAPIC (disabling PIC)\n"); + + /* + * PCI Interrupts may need some attention here. 
+ * https://forum.osdev.org/viewtopic.php?f=1&t=21745 + * The discussion in the above forum suggest modern PCIe devices bypass IOAPIC and send + * interrupts directly to the core. For legacy PCI, we probably need to read some APIC tables. + * + * Update: with BMK_SCREW_INTERRUPT_ROUTING, got Rumpkernel to boot fine on HW as well. + * The effect of that BMK_SCREW_INTERRUPT_ROUTING is mostly in the BMK intr.c to use an array of lists vs + * single list. It doesn't change how NetBSD does interrupt processing. + */ +} diff --git a/src/platform/i386/ioapic.h b/src/platform/i386/ioapic.h new file mode 100644 index 0000000000..3cd3e31ea4 --- /dev/null +++ b/src/platform/i386/ioapic.h @@ -0,0 +1,17 @@ +#ifndef IOAPIC_H +#define IOAPIC_H + +#include "apic_cntl.h" + +void ioapic_init(void); + +void ioapic_iter(struct ioapic_cntl *); +void ioapic_int_mask(int gsi); +void ioapic_int_unmask(int gsi, int dest); + +int chal_irq_enable(int irq, cpuid_t cpu_id); +int chal_irq_disable(int irq, cpuid_t cpu_id); + +void ioapic_int_override(struct intsrcovrride_cntl *); + +#endif /* IOAPIC_H */ diff --git a/src/platform/i386/isr.h b/src/platform/i386/isr.h index 052c0596a8..e14392ee24 100644 --- a/src/platform/i386/isr.h +++ b/src/platform/i386/isr.h @@ -49,7 +49,7 @@ extern void smid_float_pt_except_fault_irq(struct pt_regs *); extern void virtualization_except_fault_irq(struct pt_regs *); extern void security_except_fault_irq(struct pt_regs *); -extern void periodic_irq(struct pt_regs *); +extern void hpet_periodic_irq(struct pt_regs *); extern void keyboard_irq(struct pt_regs *); extern void handler_hw_34(struct pt_regs *); extern void handler_hw_35(struct pt_regs *); @@ -57,7 +57,7 @@ extern void serial_irq(struct pt_regs *); extern void handler_hw_37(struct pt_regs *); extern void handler_hw_38(struct pt_regs *); extern void handler_hw_39(struct pt_regs *); -extern void oneshot_irq(struct pt_regs *); +extern void hpet_oneshot_irq(struct pt_regs *); extern void handler_hw_41(struct pt_regs *); 
extern void handler_hw_42(struct pt_regs *); extern void handler_hw_43(struct pt_regs *); @@ -84,11 +84,4 @@ extern void lapic_spurious_irq(struct pt_regs *); extern void lapic_ipi_asnd_irq(struct pt_regs *); extern void lapic_timer_irq(struct pt_regs *); -static void -ack_irq(int n) -{ - if (n >= 40) outb(0xA0, 0x20); /* Send reset signal to slave */ - outb(0x20, 0x20); -} - #endif /* ISR_H */ diff --git a/src/platform/i386/kernel.c b/src/platform/i386/kernel.c index caf1858803..a91c6f5437 100644 --- a/src/platform/i386/kernel.c +++ b/src/platform/i386/kernel.c @@ -145,9 +145,6 @@ kmain(struct multiboot *mboot, u32_t mboot_magic, u32_t esp) #ifdef ENABLE_SERIAL serial_init(); #endif -#ifdef ENABLE_CONSOLE - console_init(); -#endif #ifdef ENABLE_VGA vga_init(); #endif @@ -163,10 +160,15 @@ kmain(struct multiboot *mboot, u32_t mboot_magic, u32_t esp) comp_init(); thd_init(); paging_init(); - kern_boot_comp(INIT_CORE); lapic_init(); - timer_init(); + hpet_init(); + chal_irq_enable(HW_SERIAL, 0); + pic_init(); + ioapic_init(); +#ifdef ENABLE_SERIAL + serial_late_init(); +#endif smp_init(cores_ready); cores_ready[INIT_CORE] = 1; diff --git a/src/platform/i386/kernel.h b/src/platform/i386/kernel.h index 6c64531537..2829009d32 100644 --- a/src/platform/i386/kernel.h +++ b/src/platform/i386/kernel.h @@ -11,62 +11,23 @@ #include #include -#ifdef ENABLE_CONSOLE -void vga_clear(void); -void vga_puts(const char *s); -void console_init(void); -#endif - -#ifdef ENABLE_VGA -void vga_high_init(void); -void vga_init(void); -void vga_puts(const char *str); -#endif - -#ifdef ENABLE_SERIAL -void serial_init(void); -#endif - -/* These numbers map directly to actual timers in the HPET */ -typedef enum { - TIMER_PERIODIC = 0, - TIMER_ONESHOT = 1, -} timer_type_t; - -#define TIMER_DEFAULT_US_INTERARRIVAL 1000 /* US = microseconds */ - -void timer_set(timer_type_t timer_type, u64_t cycles); -void timer_init(void); -u64_t timer_find_hpet(void *timer); -void timer_set_hpet_page(u32_t page); 
-void timer_thd_init(struct thread *t); - -void tss_init(const cpuid_t cpu_id); -void idt_init(const cpuid_t cpu_id); -void gdt_init(const cpuid_t cpu_id); -void user_init(void); -void paging_init(void); -void *acpi_find_rsdt(void); -void *acpi_find_timer(void); -void acpi_set_rsdt_page(u32_t); -void kern_paging_map_init(void *pa); - -void * acpi_find_apic(void); -u32_t lapic_find_localaddr(void *l); -void lapic_set_page(u32_t page); -void lapic_timer_init(void); -void lapic_init(void); -void lapic_set_timer(int timer_type, cycles_t deadline); -u32_t lapic_get_ccr(void); -void lapic_timer_calibration(u32_t ratio); -void lapic_asnd_ipi_send(const cpuid_t cpu_id); -extern volatile u32_t lapic_timer_calib_init; - -void smp_init(volatile int *cores_ready); +#include "vga.h" +#include "serial.h" +#include "hpet.h" +#include "acpi.h" +#include "lapic.h" +#include "pic.h" +#include "ioapic.h" + +int vm_map_superpage(u32_t addr, int nocache); +void kern_paging_map_init(void *); +void paging_init(void); +void tss_init(cpuid_t); +void gdt_init(cpuid_t); +void idt_init(cpuid_t); void tls_update(u32_t addr); -// void printk(const char *fmt, ...); int printk_register_handler(void (*handler)(const char *)); void khalt(void); diff --git a/src/platform/i386/keyboard.c b/src/platform/i386/keyboard.c new file mode 100644 index 0000000000..b38987faa2 --- /dev/null +++ b/src/platform/i386/keyboard.c @@ -0,0 +1,21 @@ +#include "kernel.h" + +#define KEY_DEVICE 0x60 +#define KEY_PENDING 0x64 + +int +keyboard_handler(struct pt_regs *regs) +{ + u16_t scancode = 0; + int preempt = 1; + + lapic_ack(); + + while (inb(KEY_PENDING) & 2) { + /* wait for keypress to be ready */ + } + scancode = inb(KEY_DEVICE); + PRINTK("Keyboard press: %d\n", scancode); + + return preempt; +} diff --git a/src/platform/i386/lapic.c b/src/platform/i386/lapic.c index a67dca3767..0a5eb894f2 100644 --- a/src/platform/i386/lapic.c +++ b/src/platform/i386/lapic.c @@ -1,44 +1,13 @@ #include "kernel.h" #include 
"chal_cpu.h" #include "isr.h" +#include "apic_cntl.h" -#define APIC_DEFAULT_PHYS 0xfee00000 -#define APIC_HDR_LEN_OFF 0x04 -#define APIC_CNTRLR_ADDR_OFF 0x24 -#define APIC_CNTRLR_FLAGS_OFF 0x28 -#define APIC_CNTR_ARR_OFF 0x2C +#define LAPIC_MAX NUM_CPU -/* See 5.2.12 in the ACPI 5.0 Spec */ -enum -{ - APIC_CNTL_LAPIC = 0, - APIC_CNTL_IOAPIC = 1, -}; - -struct int_cntl_head { - u8_t type; - u8_t len; -} __attribute__((packed)); - -struct lapic_cntl { - /* type == APIC_CNTL_LAPIC */ - struct int_cntl_head header; - u8_t proc_id; - u8_t apic_id; - u32_t flags; /* 0 = dead processor */ -} __attribute__((packed)); - -struct ioapic_cntl { - /* type == APIC_CNTL_IOAPIC */ - struct int_cntl_head header; - u8_t ioapic_id; - u8_t reserved; - u32_t ioapic_phys_addr; - u32_t glb_int_num_off; /* I/O APIC's interrupt base number offset */ -} __attribute__((packed)); - -volatile int ncpus = 1; -volatile int apicids[NUM_CPU]; +int ncpus = 1; +int apicids[NUM_CPU]; +u32_t logical_apicids[NUM_CPU]; #define CMOS_PORT 0x70 @@ -46,6 +15,7 @@ volatile int apicids[NUM_CPU]; #define LAPIC_VERSION_REG 0x030 /* version */ #define LAPIC_TP_REG 0x080 /* Task Priority Register */ +#define LAPIC_LDR_REG 0x0D0 /* Logical destination register */ #define LAPIC_SIV_REG 0x0F0 /* spurious interrupt vector */ #define LAPIC_SIV_ENABLE (1 << 8) /* enable bit in the SIV */ #define LAPIC_EOI_REG 0x0B0 /* ack, or end-of-interrupt */ @@ -87,6 +57,10 @@ volatile int apicids[NUM_CPU]; #define LAPIC_ONESHOT_THRESH (1 << 12) #define LAPIC_TSCDEADLINE_THRESH 0 +#define LAPIC_LDR_OFFSET 24 +#define LAPIC_LDR_MAST (0xfful << LAPIC_LDR_OFFSET) + + extern int timer_process(struct pt_regs *regs); enum lapic_timer_type @@ -124,7 +98,7 @@ lapic_write_reg(u32_t off, u32_t val) *(volatile u32_t *)(lapic + off) = val; } -static void +void lapic_ack(void) { lapic_write_reg(LAPIC_EOI_REG, 0); @@ -175,53 +149,16 @@ lapic_apicid(void) } void -lapic_intsrc_iter(unsigned char *madt) +lapic_iter(struct lapic_cntl *l) { - struct 
int_cntl_head *h = (struct int_cntl_head *)(madt + APIC_CNTR_ARR_OFF); - u32_t len = *(u32_t *)(madt + APIC_HDR_LEN_OFF); - struct int_cntl_head *end = (struct int_cntl_head *)(madt + len); - int us = lapic_apicid(), off = 1; - - apicids[0] = us; - printk("\tMADT length %d (base struct %d)\n", len, APIC_CNTR_ARR_OFF); - assert(h <= end); - for (; h < end; h = (struct int_cntl_head *)((char *)h + h->len)) { - /* termination condition */ - assert(h->len >= sizeof(struct int_cntl_head)); - switch (h->type) { - case APIC_CNTL_LAPIC: { - struct lapic_cntl *l = (struct lapic_cntl *)h; - - assert(l->header.len == sizeof(struct lapic_cntl)); - printk("\tLAPIC found: coreid %d, apicid %d flags %d\n", l->proc_id, l->apic_id, l->flags); - - if (l->apic_id != us && l->flags && ncpus < NUM_CPU && NUM_CPU > 1) { - apicids[off++] = l->apic_id; - ncpus++; - } - - break; - } - case APIC_CNTL_IOAPIC: { - struct ioapic_cntl *io = (struct ioapic_cntl *)h; + static int off = 1; - assert(io->header.len == sizeof(struct ioapic_cntl)); - printk("\tI/O APIC found: ioapicid %d, addr %x, int offset %d\n", io->ioapic_id, - io->ioapic_phys_addr, io->glb_int_num_off); - break; - } - default: - /* See 5.2.12 in the ACPI 5.0 Spec */ - printk("\tInterrupt controller type %d: ignoring\n", h->type); - break; - } - } - printk("\tAPICs processed, %d cores\n", ncpus); + assert(l->header.len == sizeof(struct lapic_cntl)); + printk("\tLAPIC found: coreid %d, apicid %d\n", l->proc_id, l->apic_id); - if (ncpus != NUM_CPU) { - printk("Number of LAPICs processed =%d not meeting the requirement = %d\n", ncpus, NUM_CPU); - printk("Please reconfigure NUM_CPU in Composite/HW-BIOS\n"); - assert(0); + if (l->apic_id != apicids[INIT_CORE] && l->flags && ncpus < NUM_CPU && NUM_CPU > 1) { + apicids[off++] = l->apic_id; + ncpus++; } } @@ -236,6 +173,7 @@ lapic_find_localaddr(void *l) printk("Initializing LAPIC @ %p\n", lapicaddr); + apicids[INIT_CORE] = lapic_apicid(); for (i = 0; i < length; i++) { sum += 
lapicaddr[i]; } @@ -248,7 +186,7 @@ lapic_find_localaddr(void *l) addr = *(u32_t *)(lapicaddr + APIC_CNTRLR_ADDR_OFF); apic_flags = *(u32_t *)(lapicaddr + APIC_CNTRLR_FLAGS_OFF); assert(apic_flags == 1); /* we're assuming the PIC exists */ - lapic_intsrc_iter(lapicaddr); + acpi_madt_intsrc_iter(lapicaddr); printk("\tChecksum is OK\n"); lapic = (void *)(addr); @@ -261,12 +199,40 @@ lapic_find_localaddr(void *l) return addr; } +static u32_t +cons_logical_id(const u32_t id) +{ + /* + * FIXME: xAPIC only support 8 bits bitmap for logical destination, + * So we will configure the logical id of cores with id larger than 7 + * to 0 which means we should find out a way(x2APIC) to fix this when we + * have more than 8 cores in ioapic. + */ + + if (id > 7) return 0; + + return (1ul << id) << LAPIC_LDR_OFFSET; +} + +static u32_t +lapic_set_ldr(const u32_t id) +{ + u32_t lid = cons_logical_id(id); + + lapic_write_reg(LAPIC_LDR_REG, lid | ~LAPIC_LDR_MAST); + return lid >> LAPIC_LDR_OFFSET; +} + void lapic_init(void) { u32_t version; assert(lapic); + + /* setup LDR for logic destination before init lapic */ + logical_apicids[get_cpuid()] = lapic_set_ldr(get_cpuid()); + lapic_write_reg(LAPIC_SIV_REG, LAPIC_SIV_ENABLE | HW_LAPIC_SPURIOUS); version = lapic_read_reg(LAPIC_VERSION_REG); diff --git a/src/platform/i386/lapic.h b/src/platform/i386/lapic.h new file mode 100644 index 0000000000..6156ffc708 --- /dev/null +++ b/src/platform/i386/lapic.h @@ -0,0 +1,23 @@ +#ifndef LAPIC_H +#define LAPIC_H + +#include "apic_cntl.h" + +void lapic_ack(void); +void lapic_iter(struct lapic_cntl *); +u32_t lapic_find_localaddr(void *l); +void lapic_set_page(u32_t page); +void lapic_timer_init(void); +void lapic_set_timer(int timer_type, cycles_t deadline); +u32_t lapic_get_ccr(void); +void lapic_timer_calibration(u32_t ratio); +void lapic_asnd_ipi_send(const cpuid_t cpu_id); + +extern volatile u32_t lapic_timer_calib_init; +extern int apicids[NUM_CPU]; +extern u32_t logical_apicids[NUM_CPU]; + 
+void lapic_init(void); +void smp_init(volatile int *cores_ready); + +#endif /* LAPIC_H */ diff --git a/src/platform/i386/pic.c b/src/platform/i386/pic.c new file mode 100644 index 0000000000..1de14dcabf --- /dev/null +++ b/src/platform/i386/pic.c @@ -0,0 +1,59 @@ +#include "kernel.h" +#include "pic.h" + +#define PIC_IRQ_BASE 0x20 +#define PIC_ALL_DISABLE 0xFF +#define PIC_ALL_ENABLE 0x00 + +/* Information taken from: http://wiki.osdev.org/PIC */ +#define PIC1 0x20 +#define PIC2 0xA0 +#define PIC1_CMD PIC1 +#define PIC1_DATA (PIC1 + 1) +#define PIC2_CMD PIC2 +#define PIC2_DATA (PIC2 + 1) + +/* reinitialize the PIC controllers, giving them specified vector offsets + rather than 8 and 70, as configured by default */ +#define PIC_ICW1_ICW4 0x01 /* ICW4 (not) needed */ +#define PIC_ICW1_SINGLE 0x02 /* Single (cascade) mode */ +#define PIC_ICW1_INTERVAL4 0x04 /* Call address interval 4 (8) */ +#define PIC_ICW1_LEVEL 0x08 /* Level triggered (edge) mode */ +#define PIC_ICW1_INIT 0x10 /* Initialization - required! 
*/ + +#define PIC_ICW4_8086 0x01 /* 8086/88 (MCS-80/85) mode */ +#define PIC_ICW4_AUTO 0x02 /* Auto (normal) EOI */ +#define PIC_ICW4_BUF_SLAVE 0x08 /* Buffered mode/slave */ +#define PIC_ICW4_BUF_MASTER 0x0C /* Buffered mode/master */ +#define PIC_ICW4_SFNM 0x10 /* Special fully nested (not) */ +#define PIC_ICW1_ICW4 0x01 + +void +pic_disable(void) +{ + outb(PIC1_DATA, PIC_ALL_DISABLE); + outb(PIC2_DATA, PIC_ALL_DISABLE); +} + +void +pic_enable(void) +{ + outb(PIC1_DATA, PIC_ALL_ENABLE); + outb(PIC2_DATA, PIC_ALL_ENABLE); +} + +void +pic_init(void) +{ + printk("Setting up PIC\n"); + outb(PIC1_CMD, PIC_ICW1_INIT | PIC_ICW1_ICW4); + outb(PIC2_CMD, PIC_ICW1_INIT | PIC_ICW1_ICW4); + outb(PIC1_DATA, PIC_IRQ_BASE); + outb(PIC2_DATA, PIC_IRQ_BASE + 8); + outb(PIC1_DATA, 4); + outb(PIC2_DATA, 2); + outb(PIC1_DATA, PIC_ICW4_8086); + outb(PIC2_DATA, PIC_ICW4_8086); + + pic_enable(); +} diff --git a/src/platform/i386/pic.h b/src/platform/i386/pic.h new file mode 100644 index 0000000000..ed5b0ffdac --- /dev/null +++ b/src/platform/i386/pic.h @@ -0,0 +1,17 @@ +#ifndef PIC_H +#define PIC_H + +#include "chal/io.h" + +void pic_init(void); +void pic_enable(void); +void pic_disable(void); + +static void +pic_ack_irq(int n) +{ + if (n >= 40) outb(0xA0, 0x20); /* Send reset signal to slave */ + outb(0x20, 0x20); +} + +#endif /* PIC_H */ diff --git a/src/platform/i386/qemu-kvm.sh b/src/platform/i386/qemu-kvm.sh new file mode 100755 index 0000000000..5fb559c299 --- /dev/null +++ b/src/platform/i386/qemu-kvm.sh @@ -0,0 +1,15 @@ +#!/bin/sh +if [ $# != 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +if ! 
[ -r $1 ]; then + echo "Can't open run-script" + exit 1 +fi + +MODULES=$(sh $1 | awk '/^Writing image/ { print $3; }' | tr '\n' ' ') + +#qemu-system-i386 -m 768 -nographic -kernel kernel.img -no-reboot -s -initrd "$(echo $MODULES | tr ' ' ',')" +qemu-system-i386 -enable-kvm -rtc base=localtime,clock=host,driftfix=none -smp sockets=1,cores=2,threads=1 -cpu host -nographic -m 800 -kernel kernel.img -initrd "$(echo $MODULES | tr ' ' ',')" diff --git a/src/platform/i386/runscripts/crttests.sh b/src/platform/i386/runscripts/crttests.sh new file mode 100644 index 0000000000..55c6b0792b --- /dev/null +++ b/src/platform/i386/runscripts/crttests.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cp tests.crt_tests.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub diff --git a/src/platform/i386/runscripts/micro_chan.sh b/src/platform/i386/runscripts/micro_chan.sh new file mode 100644 index 0000000000..381d083c5a --- /dev/null +++ b/src/platform/i386/runscripts/micro_chan.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cp micro_chan.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_dijkstra.sh b/src/platform/i386/runscripts/omp_dijkstra.sh new file mode 100644 index 0000000000..128366ed60 --- /dev/null +++ b/src/platform/i386/runscripts/omp_dijkstra.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_dijkstra.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_dijkstra.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_fft_bots.sh b/src/platform/i386/runscripts/omp_fft_bots.sh new file mode 100644 index 0000000000..858f140dd1 --- /dev/null +++ b/src/platform/i386/runscripts/omp_fft_bots.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_fft_bots.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_fft_bots.o boot.o +#cp 
test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_fib_bots.sh b/src/platform/i386/runscripts/omp_fib_bots.sh new file mode 100644 index 0000000000..5c4465f351 --- /dev/null +++ b/src/platform/i386/runscripts/omp_fib_bots.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_fib_bots.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_fib_bots.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_hello.sh b/src/platform/i386/runscripts/omp_hello.sh new file mode 100644 index 0000000000..342a043e00 --- /dev/null +++ b/src/platform/i386/runscripts/omp_hello.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_hello.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_hello.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_sort_bots.sh b/src/platform/i386/runscripts/omp_sort_bots.sh new file mode 100644 index 0000000000..cf71756905 --- /dev/null +++ b/src/platform/i386/runscripts/omp_sort_bots.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_sort_bots.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_sort_bots.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_sparselu_for_bots.sh b/src/platform/i386/runscripts/omp_sparselu_for_bots.sh new file mode 100644 index 0000000000..785b0eae92 --- /dev/null +++ 
b/src/platform/i386/runscripts/omp_sparselu_for_bots.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_sparselu_for_bots.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_sparselu_for_bots.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_sparselu_single_bots.sh b/src/platform/i386/runscripts/omp_sparselu_single_bots.sh new file mode 100644 index 0000000000..1d1374aef4 --- /dev/null +++ b/src/platform/i386/runscripts/omp_sparselu_single_bots.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_sparselu_single_bots.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_sparselu_single_bots.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_strassen_bots.sh b/src/platform/i386/runscripts/omp_strassen_bots.sh new file mode 100644 index 0000000000..3fe5a88ac3 --- /dev/null +++ b/src/platform/i386/runscripts/omp_strassen_bots.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_strassen_bots.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_strassen_bots.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_ubench.sh b/src/platform/i386/runscripts/omp_ubench.sh new file mode 100644 index 0000000000..100adcb020 --- /dev/null +++ b/src/platform/i386/runscripts/omp_ubench.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_ubench.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_ubench.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker 
"llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_workconsprob.sh b/src/platform/i386/runscripts/omp_workconsprob.sh new file mode 100644 index 0000000000..5e7a8985a6 --- /dev/null +++ b/src/platform/i386/runscripts/omp_workconsprob.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_workconsprob.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_hello.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/part_test.sh b/src/platform/i386/runscripts/part_test.sh new file mode 100644 index 0000000000..a8815e0903 --- /dev/null +++ b/src/platform/i386/runscripts/part_test.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cp part_test.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub diff --git a/src/platform/i386/runscripts/test_slite02.sh b/src/platform/i386/runscripts/test_slite02.sh new file mode 100644 index 0000000000..e51ba080f7 --- /dev/null +++ b/src/platform/i386/runscripts/test_slite02.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +cp llboot_comp.o llboot.o +cp test_sched.o boot.o +cp test_sched_inv.o intcomp.o +cp test_sched_inv.o w1comp.o +cp test_sched_inv.o w3comp.o +cp test_boot.o dummy1.o +cp test_boot.o dummy2.o + +# only int and w0 in root +./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o" ./gen_client_stub + +#int, w0 in root and w1 in comp +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, ;w1comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o" ./gen_client_stub + +# int, w1 - w3 +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, ;w1comp.o, ;w3comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o;w3comp.o-boot.o|capmgr.o" 
./gen_client_stub + +#cp test_boot.o dummy.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o" ./gen_client_stub +# diff --git a/src/platform/i386/runscripts/unit_hierschedcomps.sh b/src/platform/i386/runscripts/unit_hierschedcomps.sh index ba032033bf..5122af6f50 100644 --- a/src/platform/i386/runscripts/unit_hierschedcomps.sh +++ b/src/platform/i386/runscripts/unit_hierschedcomps.sh @@ -5,8 +5,8 @@ cp root_fprr.o boot.o cp hier_fprr.o hier_fprr1.o cp hier_fprr.o hier_fprr2.o cp hier_fprr.o hier_fprr3.o -cp unit_schedcomp_test.o unit_schedcomp_test1.o -cp unit_schedcomp_test.o unit_schedcomp_test2.o -cp unit_schedcomp_test.o unit_schedcomp_test3.o -cp unit_schedcomp_test.o unit_schedcomp_test4.o -./cos_linker "llboot.o, ;*hier_fprr1.o, ;capmgr.o, ;*hier_fprr2.o, ;*boot.o, ;*hier_fprr3.o, ;unit_schedcomp_test1.o, ;unit_schedcomp_test2.o, ;unit_schedcomp_test3.o, ;unit_schedcomp_test4.o, :boot.o-capmgr.o;hier_fprr1.o-capmgr.o|[parent_]boot.o;hier_fprr2.o-capmgr.o|[parent_]boot.o;hier_fprr3.o-capmgr.o|[parent_]hier_fprr1.o;unit_schedcomp_test1.o-boot.o;unit_schedcomp_test2.o-hier_fprr1.o;unit_schedcomp_test3.o-hier_fprr2.o;unit_schedcomp_test4.o-hier_fprr3.o" ./gen_client_stub +cp unit_schedappcomp_test.o unit_schedappcomp_test1.o +cp unit_schedappcomp_test.o unit_schedappcomp_test2.o +cp unit_schedappcomp_test.o unit_schedappcomp_test3.o +cp unit_schedappcomp_test.o unit_schedappcomp_test4.o +./cos_linker "llboot.o, ;*hier_fprr1.o, ;capmgr.o, ;*hier_fprr2.o, ;*boot.o, ;*hier_fprr3.o, ;unit_schedappcomp_test1.o, ;unit_schedappcomp_test2.o, ;unit_schedappcomp_test3.o, ;unit_schedappcomp_test4.o, :boot.o-capmgr.o;hier_fprr1.o-capmgr.o|[parent_]boot.o;hier_fprr2.o-capmgr.o|[parent_]boot.o;hier_fprr3.o-capmgr.o|[parent_]hier_fprr1.o;unit_schedappcomp_test1.o-boot.o;unit_schedappcomp_test2.o-hier_fprr1.o;unit_schedappcomp_test3.o-hier_fprr2.o;unit_schedappcomp_test4.o-hier_fprr3.o" ./gen_client_stub 
diff --git a/src/platform/i386/runscripts/unit_schedappcomps.sh b/src/platform/i386/runscripts/unit_schedappcomps.sh new file mode 100644 index 0000000000..5792230896 --- /dev/null +++ b/src/platform/i386/runscripts/unit_schedappcomps.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +cp llboot_comp.o llboot.o +cp root_fprr.o boot.o +./cos_linker "llboot.o, ;unit_schedappcomp_test.o, ;capmgr.o, ;unit_schedappaep_test.o, ;*boot.o, :boot.o-capmgr.o;unit_schedappcomp_test.o-boot.o;unit_schedappaep_test.o-boot.o|capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/unit_schedcomp.sh b/src/platform/i386/runscripts/unit_schedcomp.sh index 9327f2ae50..7665041768 100644 --- a/src/platform/i386/runscripts/unit_schedcomp.sh +++ b/src/platform/i386/runscripts/unit_schedcomp.sh @@ -1,5 +1,7 @@ #!/bin/sh cp llboot_comp.o llboot.o -cp root_fprr.o boot.o -./cos_linker "llboot.o, ;unit_schedcomp_test.o, ;capmgr.o, ;unit_schedaep_test.o, ;*boot.o, :boot.o-capmgr.o;unit_schedcomp_test.o-boot.o;unit_schedaep_test.o-boot.o|capmgr.o" ./gen_client_stub +cp unit_schedcomp_test.o boot.o +cp test_boot.o dummy1.o +cp test_boot.o dummy2.o +./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/unit_slite01.sh b/src/platform/i386/runscripts/unit_slite01.sh new file mode 100644 index 0000000000..8a887a8a36 --- /dev/null +++ b/src/platform/i386/runscripts/unit_slite01.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +cp llboot_comp.o llboot.o +cp root_fprr.o boot.o +#cp unit_slrcvtest.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +./cos_linker "llboot.o, ;*spin_comp.o, ;capmgr.o, ;*unit_slrcvtest.o, ;*boot.o, :boot.o-capmgr.o;unit_slrcvtest.o-boot.o|capmgr.o;spin_comp.o-boot.o|capmgr.o" ./gen_client_stub +#./cos_linker "llboot.o, ;dummy2.o, ;capmgr.o, ;dummy1.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/unit_slrcv.sh b/src/platform/i386/runscripts/unit_slrcv.sh new 
file mode 100644 index 0000000000..a12a03d75d --- /dev/null +++ b/src/platform/i386/runscripts/unit_slrcv.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cp unit_slrcvtest.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub diff --git a/src/platform/i386/runscripts/unit_slxcore.sh b/src/platform/i386/runscripts/unit_slxcore.sh new file mode 100644 index 0000000000..4cb06cf503 --- /dev/null +++ b/src/platform/i386/runscripts/unit_slxcore.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cp unit_slxcoretests.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub diff --git a/src/platform/i386/serial.c b/src/platform/i386/serial.c index dc21481fd1..5685938af6 100644 --- a/src/platform/i386/serial.c +++ b/src/platform/i386/serial.c @@ -5,8 +5,6 @@ #include "isr.h" #include "kernel.h" -void serial_puts(const char *s); - enum serial_ports { SERIAL_PORT_A = 0x3F8, @@ -43,7 +41,7 @@ serial_handler(struct pt_regs *r) char serial; int preempt = 1; - ack_irq(HW_SERIAL); + lapic_ack(); serial = serial_recv(); @@ -62,18 +60,19 @@ serial_handler(struct pt_regs *r) case 3: /* FIXME: Obviously remove this once we have working components */ die("Break\n"); case 'o': - timer_set(TIMER_ONESHOT, 50000000); - timer_set(TIMER_ONESHOT, 50000000); + hpet_set(HPET_ONESHOT, 50000000); + hpet_set(HPET_ONESHOT, 50000000); break; case 'p': - timer_set(TIMER_PERIODIC, 100000000); - timer_set(TIMER_PERIODIC, 100000000); + hpet_set(HPET_PERIODIC, 100000000); + hpet_set(HPET_PERIODIC, 100000000); break; default: break; } - printk("Serial: %d\n", serial); + PRINTK("Serial: %d\n", serial); + // printk("%c", serial); return preempt; } @@ -81,17 +80,24 @@ serial_handler(struct pt_regs *r) void serial_init(void) { - printk("Enabling serial I/O\n"); printk_register_handler(serial_puts); /* We will initialize the first serial port */ outb(SERIAL_PORT_A + 1, 0x00); outb(SERIAL_PORT_A + 3, 0x80); /* Enable divisor mode */ - outb(SERIAL_PORT_A + 0, 0x03); /* Div Low: 03 Set the port to 38400 bps */ + outb(SERIAL_PORT_A + 0, 0x01); 
/* Div Low: 01 Set the port to 115200 bps */ outb(SERIAL_PORT_A + 1, 0x00); /* Div High: 00 */ outb(SERIAL_PORT_A + 3, 0x03); outb(SERIAL_PORT_A + 2, 0xC7); outb(SERIAL_PORT_A + 4, 0x0B); outb(SERIAL_PORT_A + 1, 0x01); /* Enable interrupts on receive */ + printk("Enabling serial I/O\n"); +} + +void +serial_late_init(void) +{ + chal_irq_enable(HW_SERIAL, 0); + chal_irq_enable(HW_KEYBOARD, 0); } diff --git a/src/platform/i386/serial.h b/src/platform/i386/serial.h new file mode 100644 index 0000000000..777c31078e --- /dev/null +++ b/src/platform/i386/serial.h @@ -0,0 +1,9 @@ +#ifndef SERIAL_H +#define SERIAL_H + +#ifdef ENABLE_SERIAL +void serial_init(void); +void serial_late_init(void); +#endif + +#endif diff --git a/src/platform/i386/vga.c b/src/platform/i386/vga.c index bdfebe2882..bf4b17961f 100644 --- a/src/platform/i386/vga.c +++ b/src/platform/i386/vga.c @@ -44,9 +44,6 @@ #define VGA_CTL_REG 0x3D4 #define VGA_DATA_REG 0x3D5 -#define KEY_DEVICE 0x60 -#define KEY_PENDING 0x64 - /* Variables. */ /* Save the X position. */ static int csr_x; @@ -160,27 +157,6 @@ cls(void) move_csr(); } -/* - * VIDEO virtual address set to HIGH address. - */ -void -vga_high_init(void) -{ - video = chal_pa2va(VIDEO); -} - -/* Clear the screen and initialize VIDEO, XPOS and YPOS. */ -void -vga_init(void) -{ - video = (unsigned char *) VIDEO; - - csr_x = 0; - csr_y = 0; - cls(); - printk_register_handler(vga_puts); -} - /* Put the character C on the screen. */ static void putchar(int c) @@ -222,16 +198,24 @@ puts(unsigned char *text) move_csr(); } +/* + * VIDEO virtual address set to HIGH address. + */ void -keyboard_handler(struct pt_regs *regs) +vga_high_init(void) { - u16_t scancode = 0; + video = chal_pa2va(VIDEO); +} - ack_irq(HW_KEYBOARD); +/* Clear the screen and initialize VIDEO, XPOS and YPOS. 
*/ +void +vga_init(void) +{ + video = (unsigned char *) VIDEO; - while (inb(KEY_PENDING) & 2) { - /* wait for keypress to be ready */ - } - scancode = inb(KEY_DEVICE); - printk("Keyboard press: %d\n", scancode); + csr_x = 0; + csr_y = 0; + cls(); + printk_register_handler(vga_puts); + printk("Enabling VGA\n"); } diff --git a/src/platform/i386/vga.h b/src/platform/i386/vga.h new file mode 100644 index 0000000000..0788eb8b2f --- /dev/null +++ b/src/platform/i386/vga.h @@ -0,0 +1,9 @@ +#ifndef VGA_H +#define VGA_H + +#ifdef ENABLE_VGA +void vga_init(void); +void vga_high_init(void); +#endif + +#endif /* VGA_H */ diff --git a/src/platform/i386/vm.c b/src/platform/i386/vm.c index f7c4719dc9..a9457d5c67 100644 --- a/src/platform/i386/vm.c +++ b/src/platform/i386/vm.c @@ -55,6 +55,21 @@ u8_t *mem_boot_alloc(int npages) /* boot-time, bump-ptr heap */ return r; } +static unsigned long vm_pgd_idx = COS_MEM_KERN_START_VA / PGD_RANGE; + +int +vm_map_superpage(u32_t addr, int nocache) +{ + int idx = vm_pgd_idx; + u32_t page; + + page = round_to_pgd_page(addr); + boot_comp_pgd[idx] = page | PGTBL_PRESENT | PGTBL_WRITABLE | PGTBL_SUPER | PGTBL_GLOBAL | (nocache ? 
PGTBL_NOCACHE : 0); + vm_pgd_idx ++; + + return idx; +} + int kern_setup_image(void) { @@ -76,6 +91,7 @@ kern_setup_image(void) boot_comp_pgd[i / PGD_RANGE] = 0; /* unmap lower addresses */ } + vm_pgd_idx = j; #ifdef ENABLE_VGA /* uses virtual address for VGA */ vga_high_init(); @@ -89,33 +105,22 @@ kern_setup_image(void) u64_t hpet; page = round_up_to_pgd_page(rsdt) - (1 << 22); - boot_comp_pgd[j] = page | PGTBL_PRESENT | PGTBL_WRITABLE | PGTBL_SUPER | PGTBL_GLOBAL; - acpi_set_rsdt_page(j); - j++; - - hpet = timer_find_hpet(acpi_find_timer()); - if (hpet) { - page = round_up_to_pgd_page(hpet & 0xffffffff) - (1 << 22); - boot_comp_pgd[j] = page | PGTBL_PRESENT | PGTBL_WRITABLE | PGTBL_SUPER | PGTBL_GLOBAL; - timer_set_hpet_page(j); - j++; - } + acpi_set_rsdt_page(vm_map_superpage(page, 0)); + + hpet = hpet_find(acpi_find_hpet()); + if (hpet) hpet_set_page(vm_map_superpage(hpet, 0)); /* lapic memory map */ lapic = lapic_find_localaddr(acpi_find_apic()); - if (lapic) { - page = round_up_to_pgd_page(lapic & 0xffffffff) - (1 << 22); - /* - * Intel specification: - * For correct APIC operation, this address space must be mapped to an area of memory - * that has been designated as strong uncacheable (UC). - */ - boot_comp_pgd[j] = page | PGTBL_PRESENT | PGTBL_WRITABLE | PGTBL_SUPER | PGTBL_GLOBAL | PGTBL_NOCACHE; - lapic_set_page(j); - j++; - } + /* + * Intel specification: + * For correct APIC operation, this address space must be mapped to an area of memory + * that has been designated as strong uncacheable (UC). + */ + if (lapic) lapic_set_page(vm_map_superpage(lapic, 1)); } + j = vm_pgd_idx; for (; j < PAGE_SIZE / sizeof(unsigned int); i += PGD_RANGE, j++) { boot_comp_pgd[j] = boot_comp_pgd[i / PGD_RANGE] = 0; }