diff --git a/cgrcompute/components/courserecommendation.py b/cgrcompute/components/courserecommendation.py index 31728f2..66094c0 100644 --- a/cgrcompute/components/courserecommendation.py +++ b/cgrcompute/components/courserecommendation.py @@ -1,5 +1,6 @@ from scipy.sparse import lil_matrix -from sklearn.metrics.pairwise import cosine_similarity +from sklearn.metrics.pairwise import cosine_similarity,linear_kernel +from sklearn.preprocessing import normalize from cgrcompute.components.external import ElasticService, get_mongo_service from typing import Hashable from logging import getLogger @@ -30,6 +31,67 @@ def train(observations: list[set[Hashable]]) -> 'CosineSimRecommendationModel': neigh = sorted(neigh, key=lambda x: x[1])[-300:] ccmtx[cid] = dict(neigh) return CosineSimRecommendationModel(ccmtx) + @staticmethod + def train32(observations: list[set[Hashable]]) -> 'CosineSimRecommendationModel': + items = list(set(c for o in observations for c in o)) + itemidx = dict((c, i) for (i, c) in enumerate(items)) + itemobsv = lil_matrix((len(items), len(observations)),dtype='float32') + for (i, o) in enumerate(observations): + for c in o: + itemobsv[itemidx[c], i] = 1 + sim = cosine_similarity(itemobsv.astype('float32'), dense_output=False) + ccmtx = dict() + for (i, cid) in enumerate(items): + neigh = [] + for c in sim.getrow(i).nonzero()[1]: + neigh.append((items[c], sim[i, c])) + neigh = sorted(neigh, key=lambda x: x[1])[-300:] + ccmtx[cid] = dict(neigh) + return CosineSimRecommendationModel(ccmtx) + @staticmethod + def train16(observations: list[set[Hashable]]) -> 'CosineSimRecommendationModel': + items = list(set(c for o in observations for c in o)) + itemidx = dict((c, i) for (i, c) in enumerate(items)) + itemobsv = lil_matrix((len(items), len(observations)),dtype='float32') + for (i, o) in enumerate(observations): + for c in o: + itemobsv[itemidx[c], i] = 1 + size = itemobsv.shape[0] + chunk_size = 500 + ccmtx = dict() + for start in range(0,size,chunk_size): + end = min(start+chunk_size,size) + sim = cosine_similarity(itemobsv[start:end],itemobsv,dense_output=False).astype('float16') + for (i, cid) in enumerate(items[start:end]): + neigh = [] + for c in sim.getrow(i).nonzero()[1]: + neigh.append((items[c], sim[i, c])) + neigh = sorted(neigh, key=lambda x: x[1])[-300:] + ccmtx[cid] = dict(neigh) + return CosineSimRecommendationModel(ccmtx) + @staticmethod + def train8(observations: list[set[Hashable]]) -> 'CosineSimRecommendationModel': + items = list(set(c for o in observations for c in o)) + itemidx = dict((c, i) for (i, c) in enumerate(items)) + itemobsv = lil_matrix((len(items), len(observations)),dtype='uint8') + for (i, o) in enumerate(observations): + for c in o: + itemobsv[itemidx[c], i] = 1 + size = itemobsv.shape[0] + chunk_size = 500 + cnorm = normalize(itemobsv) + cnorm *=16 + ccmtx = dict() + for start in range(0,size,chunk_size): + end = min(start+chunk_size,size) + sim = linear_kernel(cnorm[start:end],cnorm,dense_output=False).astype('uint8') + for (i, cid) in enumerate(items[start:end]): + neigh = [] + for c in sim.getrow(i).nonzero()[1]: + neigh.append((items[c], sim[i, c])) + neigh = sorted(neigh, key=lambda x: x[1])[-300:] + ccmtx[cid] = dict(neigh) + return CosineSimRecommendationModel(ccmtx) def infer(self, selected_item: list[Hashable]) -> dict[Hashable, float]: d = dict() @@ -48,6 +110,9 @@ class CourseRecommendationModel: def __init__(self): self.model = None + self.model32 = None + self.model16 = None + self.model8 = None self.logger = getLogger('CourseRecommendationModel') def populate(self): @@ -55,10 +120,20 @@ def populate(self): obsv = self.downloadobsvdata() self.logger.info("Download completed {}. Start training".format(time.time())) self.model = CosineSimRecommendationModel.train(obsv) + self.model32 = CosineSimRecommendationModel.train32(obsv) + self.model16 = CosineSimRecommendationModel.train16(obsv) + self.model8 = CosineSimRecommendationModel.train8(obsv) self.logger.info("Training completed {}".format(time.time())) - def infer(self, selected_courses): - res = self.model.infer(selected_courses) + def infer(self, selected_courses,variant): + if variant =="COSINE": + res = self.model.infer(selected_courses) + elif variant =="COSINE32": + res = self.model32.infer(selected_courses) + elif variant =="COSINE16": + res = self.model16.infer(selected_courses) + elif variant =="COSINE8": + res = self.model8.infer(selected_courses) res = sorted(res.items(), key=lambda x:-x[1]) return [course for course, score in res][:300] @@ -99,8 +174,8 @@ def recommend_course(req: grpcmsg.CourseRecommendationRequest, cache: SharableCa res = [] if req.variant == 'RANDOM': res = model.random_infer() - elif req.variant == 'COSINE': - res = model.infer([(e.semesterKey.studyProgram, e.courseNo) for e in req.selectedCourses]) + elif req.variant in ['COSINE','COSINE32','COSINE16','COSINE8']: + res = model.infer([(e.semesterKey.studyProgram, e.courseNo) for e in req.selectedCourses],req.variant) else: raise Exception('{} variant is invalid'.format(req.variant)) enriched_res = []