instadeepai · BioGeek · Dec 11, 2023
diff --git a/bio_clip/data/downstream/pesto_src/structure.py b/bio_clip/data/downstream/pesto_src/structure.py
@@ -44,7 +44,7 @@ def clean_structure(structure):
     )[1]
     delta_chains = np.abs(np.sign(np.concatenate([[0], np.diff(ids_chains)])))
 
-    # find changes due to inertion code
+    # find changes due to insertion code
     icodes = structure["icode"]
     ids_icodes = np.where(
         np.array(icodes).reshape(-1, 1) == np.unique(icodes).reshape(1, -1)

diff --git a/bio_clip/data/downstream/ppi.py b/bio_clip/data/downstream/ppi.py
@@ -122,7 +122,7 @@ def cropping_dependent_processing(
 ):
     """This task is now deprecated as there were problems found in the original paper.
     Additionally, this task was previously pre-processed using an old version of the
-    BioCLIP code which we do not provide. Data-transform dependend pre-processing takes
+    BioCLIP code which we do not provide. Data-transform dependent pre-processing takes
     quite long, thus it is processed in parallel, cached, then training is performed.
 
     Args:

diff --git a/bio_clip/model/gnn.py b/bio_clip/model/gnn.py
@@ -23,7 +23,7 @@
 
 
 class PositionalEncodingLayer(hk.Module):
-    """Independent-Equivariant Graph Matching Newtork"""
+    """Independent-Equivariant Graph Matching Network"""
 
     def __init__(self, positional_encoding_dimension: int):
         """Initializes a Positional Encoding Layer

diff --git a/bio_clip/train/data_transforms.py b/bio_clip/train/data_transforms.py
@@ -480,7 +480,7 @@ def crop_pad(data, multiply_by):
         atom14_mask.astype(np.float32),
     )
 
-    # I am cropping tokens manually here, because the filter guarentees the num residues
+    # I am cropping tokens manually here, because the filter guarantees the num residues
     # is below padding_num_residue
     tokens = tokens[:padding_num_residue]
 

diff --git a/bio_clip/train/pretrain/dataloader.py b/bio_clip/train/pretrain/dataloader.py
@@ -25,7 +25,7 @@ class BioClipDataloaderParams:
     # files. The filepaths can be GCP bucket filepaths (in which case they should start
     # with gs://)
     batch_dims: Tuple[int, ...]  # Batches of training samples will be reshaped to this
-    # shape before being outputed (the effective batch size is np.prod(batch_dims)).
+    # shape before being outputted (the effective batch size is np.prod(batch_dims)).
     # This is useful for preparing the batches in a shape that matches what jax.pmap and
     # jax.vmap expect.
     shuffle: bool = True  # If True, shuffle the filepaths at the beginning of all

diff --git a/bio_clip/train/pretrain/trainer.py b/bio_clip/train/pretrain/trainer.py
@@ -342,7 +342,7 @@ def predict(
 
         Returns:
             loss (float):  Training Loss.
-            aux_metrics (dict): Auxillary metrics.
+            aux_metrics (dict): Auxiliary metrics.
         """
         mean = (
             jnp.mean

diff --git a/bio_clip/utils/utils.py b/bio_clip/utils/utils.py
@@ -132,7 +132,7 @@ def _background_thread(self) -> None:
 def convert_to_ml_dict(dct: Union[DictConfig, Any]) -> Union[ConfigDict, Any]:
     """
     This function converts the DictConfig returned by Hydra
-    into a ConfigDict. The recusion allows to convert
+    into a ConfigDict. The recursion allows converting
     all the nested DictConfig elements of the config. The recursion stops
     once the reached element is not a DictConfig.
     """

diff --git a/build-source/Dockerfile b/build-source/Dockerfile
@@ -53,7 +53,7 @@ RUN ln -s /app/bio-clip/bio_clip /opt/conda/envs/bioclip/lib/python3.9/site-pack
 
 # Disable debug, info, and warning tensorflow logs
 ENV TF_CPP_MIN_LOG_LEVEL=3
-# # By default use cpu as the backend for JAX, we will explicitely load data on gpus/tpus as needed.
+# # By default use cpu as the backend for JAX; we will explicitly load data on gpus/tpus as needed.
 # ENV JAX_PLATFORM_NAME="cpu"
 
 # aws

diff --git a/datasets/pretraining/create/data_processing.py b/datasets/pretraining/create/data_processing.py
@@ -586,7 +586,7 @@ def compute_graph_of_protein(
             raw PDB data for the ligand sorted by residue
         stacked_residue_coordinates (Coordinates):
             list of residue coordinates for the ligand
-        num_neighbor (int, optional): number of nearest neigbors in the graph.
+        num_neighbor (int, optional): number of nearest neighbors in the graph.
             Defaults to None.
         residue_loc_is_alphac (bool, optional): whether the alpha-C atom
             is used for residue center. Defaults to True.
@@ -828,7 +828,7 @@ def get_residues_db5(pdb_filename: str) -> List[SortedPdbData]:
 
     Returns:
         List of residues, each residue being represented by:
-            chain, residue, resname and all atom informations
+            chain, residue, resname and all atom information
     """
     df = PandasPdb().read_pdb(pdb_filename)
     df = df.df["ATOM"]