Skip to content

Commit

Permalink
36214035 tell() method on files opened in text mode has poor performance
Browse files Browse the repository at this point in the history
  • Loading branch information
kulikjak authored and citrus-it committed Feb 15, 2024
1 parent ac092d8 commit e69593f
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 25 deletions.
50 changes: 26 additions & 24 deletions src/modules/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,11 @@ def __init__(

self.old_out_token = None

# Handle for the output main dictionary file and
# the current position within it.
self.out_main_dict_handle = None
self.out_main_dict_pos = 0

@staticmethod
def __decode_fmri(pfmri):
"""Turn fmris into strings correctly while writing out
Expand Down Expand Up @@ -393,12 +398,9 @@ def _process_fmris(self, fmris):
self._progtrack.job_add_progress(self._progtrack.JOB_REBUILD_SEARCH)
return removed_paths

def _write_main_dict_line(
self, file_handle, token, fv_fmri_pos_list_list, out_dir
):
def _write_main_dict_line(self, token, fv_fmri_pos_list_list, out_dir):
"""Writes out the new main dictionary file and also adds the
token offsets to _data_token_offset. file_handle is the file
handle for the output main dictionary file. token is the token
token offsets to _data_token_offset. token is the token
to add to the file. fv_fmri_pos_list_list is a structure of
lists inside of lists several layers deep. The top layer is a
list of action types. The second layer contains the keys for
Expand All @@ -418,8 +420,7 @@ def _write_main_dict_line(
)
self.old_out_token = token

cur_location_int = file_handle.tell()
cur_location = str(cur_location_int)
cur_location = str(self.out_main_dict_pos)
self._data_token_offset.write_entity(token, cur_location)

for at, st_list in fv_fmri_pos_list_list:
Expand All @@ -438,11 +439,18 @@ def _write_main_dict_line(
for fv, p_list in fv_list:
for p_id, m_off_set in p_list:
p_id = int(p_id)
self._data_fmri_offsets.add_pair(p_id, cur_location_int)
file_handle.write(
self._data_main_dict.transform_main_dict_line(
token, fv_fmri_pos_list_list
)
self._data_fmri_offsets.add_pair(
p_id, self.out_main_dict_pos
)
data = self._data_main_dict.transform_main_dict_line(
token, fv_fmri_pos_list_list
)
self.out_main_dict_handle.write(data)
# Using tell() on file objects opened in text mode
# is very slow compared to simple counting, so the
# position is tracked by adding the encoded byte
# length of each chunk written.
# https://docs.python.org/3/library/io.html#performance
self.out_main_dict_pos += len(
data.encode(self.out_main_dict_handle.encoding)
)

@staticmethod
Expand Down Expand Up @@ -597,11 +605,12 @@ def _update_index(self, dicts, out_dir):
self._data_main_dict.write_dict_file(out_dir, self.file_version_number)
# The dictionary file is opened in append mode to avoid removing
# the version information the search storage class added.
out_main_dict_handle = open(
self.out_main_dict_handle = open(
os.path.join(out_dir, self._data_main_dict.get_file_name()),
"a",
buffering=PKG_FILE_BUFSIZ,
)
self.out_main_dict_pos = self.out_main_dict_handle.tell()

self._data_token_offset.open_out_file(out_dir, self.file_version_number)

Expand Down Expand Up @@ -641,10 +650,7 @@ def _update_index(self, dicts, out_dir):
while new_toks_available and next_new_tok < tok:
assert len(next_new_tok) > 0
self._write_main_dict_line(
out_main_dict_handle,
next_new_tok,
new_tok_info,
out_dir,
next_new_tok, new_tok_info, out_dir
)
try:
next_new_tok, new_tok_info = next(new_toks_it)
Expand All @@ -668,18 +674,14 @@ def _update_index(self, dicts, out_dir):
# associated with it, write them to the file.
if existing_entries:
assert len(tok) > 0
self._write_main_dict_line(
out_main_dict_handle, tok, existing_entries, out_dir
)
self._write_main_dict_line(tok, existing_entries, out_dir)

# For any new tokens which are alphabetically after the
# last entry in the existing file, add them to the end
# of the file.
while new_toks_available:
assert len(next_new_tok) > 0
self._write_main_dict_line(
out_main_dict_handle, next_new_tok, new_tok_info, out_dir
)
self._write_main_dict_line(next_new_tok, new_tok_info, out_dir)
try:
next_new_tok, new_tok_info = next(new_toks_it)
except StopIteration:
Expand All @@ -689,7 +691,7 @@ def _update_index(self, dicts, out_dir):
file_handle.close()
self._data_main_dict.close_file_handle()

out_main_dict_handle.close()
self.out_main_dict_handle.close()
self._data_token_offset.close_file_handle()
for fh in self.at_fh.values():
fh.close()
Expand Down
5 changes: 4 additions & 1 deletion src/modules/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1347,7 +1347,10 @@ def __handle_list(lst, cp):
if return_line:
arg = l
__handle_list(inds, arg)
cur_pos = file_handle.tell()
# Using tell() on file objects opened in text mode
# is very slow compared to simple counting, so the
# position is advanced by the encoded byte length
# of the line just read.
# https://docs.python.org/3/library/io.html#performance
cur_pos += len(line.encode(file_handle.encoding))
line = file_handle.readline()
file_handle.close()
return action_dict
Expand Down

0 comments on commit e69593f

Please sign in to comment.