Drill shows error when running on large dataset using Tentris. #501

Open · Quannz opened this issue Nov 26, 2024 · 1 comment

Quannz commented Nov 26, 2024

I'm having a problem running Drill on my dataset (21,518,759 entities, 918 properties, 72,737,644 triples). I get the following error message:

(ontolearn0.8.0) quannian@eml4u:~/Drill/Ontolearn-0.7.3/Ontolearn/examples/Drill_DB$ python Drill_QALD9_DB.py

Goal Concept:    EducationalOrganization        E^+:[3]  E^-:[3]
Traceback (most recent call last):
  File "/local/upb/users/q/quannian/profiles/unix/cs/Drill/Ontolearn-0.7.3/Ontolearn/examples/Drill_DB/Drill_QALD9_DB.py", line 117, in <module>
    start(parser.parse_args())
  File "/local/upb/users/q/quannian/profiles/unix/cs/Drill/Ontolearn-0.7.3/Ontolearn/examples/Drill_DB/Drill_QALD9_DB.py", line 46, in start
    drill.train(num_of_target_concepts=args.num_of_target_concepts,
  File "/upb/users/q/quannian/profiles/unix/cs/.conda/envs/ontolearn0.8.0/lib/python3.10/site-packages/ontolearn/learners/drill.py", line 263, in train
    sum_of_rewards_per_actions = self.rl_learning_loop(pos_uri=frozenset(positives),
  File "/upb/users/q/quannian/profiles/unix/cs/.conda/envs/ontolearn0.8.0/lib/python3.10/site-packages/ontolearn/learners/drill.py", line 221, in rl_learning_loop
    sequence_of_states, rewards = self.sequence_of_actions(root_rl_state)
  File "/upb/users/q/quannian/profiles/unix/cs/.conda/envs/ontolearn0.8.0/lib/python3.10/site-packages/ontolearn/learners/drill.py", line 464, in sequence_of_actions
    next_selected_rl_state, reward = self.select_next_state(current_state, next_rl_states) 
  File "/upb/users/q/quannian/profiles/unix/cs/.conda/envs/ontolearn0.8.0/lib/python3.10/site-packages/ontolearn/learners/drill.py", line 447, in select_next_state
    next_selected_rl_state = self.exploration_exploitation_tradeoff(current_state, next_rl_states)
  File "/upb/users/q/quannian/profiles/unix/cs/.conda/envs/ontolearn0.8.0/lib/python3.10/site-packages/ontolearn/learners/drill.py", line 622, in exploration_exploitation_tradeoff
    next_state = random.choice(next_states)
  File "/upb/users/q/quannian/profiles/unix/cs/.conda/envs/ontolearn0.8.0/lib/python3.10/random.py", line 378, in choice
    return seq[self._randbelow(len(seq))]
IndexError: list index out of range

The code I used that leads to the error:

import json
from argparse import ArgumentParser
from ontolearn.triple_store import TripleStoreKnowledgeBase
from ontolearn.triple_store import TripleStore
import numpy as np
from sklearn.model_selection import StratifiedKFold
from ontolearn.utils.static_funcs import compute_f1_score
from ontolearn.knowledge_base import KnowledgeBase
from ontolearn.learning_problem import PosNegLPStandard
from ontolearn.refinement_operators import LengthBasedRefinement
from ontolearn.learners import Drill
from ontolearn.metrics import F1
from ontolearn.heuristics import CeloeBasedReward
from owlapy.owl_individual import OWLNamedIndividual, IRI
from owlapy.render import DLSyntaxObjectRenderer


def start(args):
    kb = TripleStore(url=args.path_sparql_endpoint)

    drill = Drill(knowledge_base=kb,
                  path_embeddings=args.path_embeddings,
                  refinement_operator=LengthBasedRefinement(knowledge_base=kb),
                  quality_func=F1(),
                  reward_func=CeloeBasedReward(),
                  epsilon_decay=args.epsilon_decay,
                  learning_rate=args.learning_rate,
                  num_of_sequential_actions=args.num_of_sequential_actions,
                  num_episode=args.num_episode,
                  iter_bound=args.iter_bound,
                  max_runtime=args.max_runtime)

    if args.path_pretrained_dir:
        drill.load(directory=args.path_pretrained_dir)
    else:
        drill.train(num_of_target_concepts=args.num_of_target_concepts,
                    num_learning_problems=args.num_of_training_learning_problems)
        drill.save(directory="pretrained_drill")

    with open(args.path_learning_problem) as json_file:
        examples = json.load(json_file)
    p = examples['problems']['QALD9_plus_dbpedia']['positive_examples']
    n = examples['problems']['QALD9_plus_dbpedia']['negative_examples']
    kf = StratifiedKFold(n_splits=args.folds, shuffle=True, random_state=args.random_seed)
    X = np.array(p + n)
    Y = np.array([1.0 for _ in p] + [0.0 for _ in n])
    dl_render = DLSyntaxObjectRenderer()
    for (ith, (train_index, test_index)) in enumerate(kf.split(X, Y)):
        train_pos = {pos_individual for pos_individual in X[train_index][Y[train_index] == 1]}
        train_neg = {neg_individual for neg_individual in X[train_index][Y[train_index] == 0]}
        test_pos = {pos_individual for pos_individual in X[test_index][Y[test_index] == 1]}
        test_neg = {neg_individual for neg_individual in X[test_index][Y[test_index] == 0]}
        train_lp = PosNegLPStandard(pos=set(map(OWLNamedIndividual, map(IRI.create, train_pos))),
                                    neg=set(map(OWLNamedIndividual, map(IRI.create, train_neg))))

        test_lp = PosNegLPStandard(pos=set(map(OWLNamedIndividual, map(IRI.create, test_pos))),
                                   neg=set(map(OWLNamedIndividual, map(IRI.create, test_neg))))

        pred_drill = drill.fit(train_lp).best_hypotheses()
        train_f1_drill = compute_f1_score(individuals=frozenset({i for i in kb.individuals(pred_drill)}),
                                          pos=train_lp.pos,
                                          neg=train_lp.neg)
        # () Quality on test data
        test_f1_drill = compute_f1_score(individuals=frozenset({i for i in kb.individuals(pred_drill)}),
                                         pos=test_lp.pos,
                                         neg=test_lp.neg)
        print(
            f"Prediction: {dl_render.render(pred_drill)} | Train Quality: {train_f1_drill:.3f} | Test Quality: {test_f1_drill:.3f} \n")


if __name__ == '__main__':
    parser = ArgumentParser()
    # General
    parser.add_argument("--path_sparql_endpoint", type=str,
                        default="http://localhost:9050/sparql")
    parser.add_argument("--path_embeddings", type=str,
                        default='/upb/users/q/quannian/profiles/unix/cs/Embedding/QALD9_plus_dbpedia/2024-10-21-16-09-12/Merge_entity_relation.csv')
    parser.add_argument("--num_of_target_concepts",
                        type=int,
                        default=1)
    parser.add_argument("--num_of_training_learning_problems",
                        type=int,
                        default=1)
    parser.add_argument("--path_pretrained_dir", type=str, default=None)

    parser.add_argument("--path_learning_problem", type=str, default='/upb/users/q/quannian/profiles/unix/cs/Drill/Ontolearn-0.7.3/Ontolearn/LPs/QALD9DB/TandF_MST5_reverse.json',
                        help="Path to a .json file that contains 2 properties 'positive_examples' and "
                             "'negative_examples'. Each of this properties should contain the IRIs of the respective"
                             "instances. e.g. 'some/path/lp.json'")
    parser.add_argument("--max_runtime", type=int, default=10, help="Max runtime")
    parser.add_argument("--folds", type=int, default=10, help="Number of folds of cross validation.")
    parser.add_argument("--random_seed", type=int, default=1)
    parser.add_argument("--iter_bound", type=int, default=10_000, help='iter_bound during testing.')
    # DQL related
    parser.add_argument("--num_episode", type=int, default=1, help='Number of trajectories created for a given lp.')

    parser.add_argument("--epsilon_decay", type=float, default=.01, help='Epsilon greedy trade off per epoch')
    parser.add_argument("--max_len_replay_memory", type=int, default=1024,
                        help='Maximum size of the experience replay')
    parser.add_argument("--num_epochs_per_replay", type=int, default=2,
                        help='Number of epochs on experience replay memory')
    parser.add_argument('--num_of_sequential_actions', type=int, default=1, help='Length of the trajectory.')

    # NN related
    parser.add_argument("--learning_rate", type=int, default=.01)

    start(parser.parse_args())
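
For reference, here is a minimal sanity check (hypothetical, not part of the script above) to confirm that the endpoint actually resolves the example IRIs before training. It only assumes the endpoint speaks the standard SPARQL 1.1 protocol with JSON results; the function name and the example IRI are made up for illustration:

import requests

def iri_exists(endpoint: str, iri: str) -> bool:
    # ASK whether the individual occurs as a subject in any triple
    query = f"ASK {{ <{iri}> ?p ?o }}"
    response = requests.get(endpoint,
                            params={"query": query},
                            headers={"Accept": "application/sparql-results+json"})
    response.raise_for_status()
    return response.json()["boolean"]

# e.g. iri_exists("http://localhost:9050/sparql", "http://dbpedia.org/resource/Berlin")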

Any hints as to what might be wrong with the dataset, or the code itself?

Demirrr (Member) commented Nov 26, 2024

There are no next_states to sample a state from:

import random
random.choice([])

see https://bugs.python.org/issue43097

I am unsure about the reason.
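
As an illustration only (not the actual Drill implementation), a guard like the following would surface the empty-successor case explicitly instead of crashing inside random.choice; the function name and the error message are assumptions:

import random

def pick_next_state(current_state, next_states):
    # random.choice raises IndexError on an empty sequence, so make the
    # "no successor states" case explicit instead of letting it crash.
    if not next_states:
        raise RuntimeError(f"refinement produced no successor states for {current_state}")
    return random.choice(next_states)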
