fix: working exclusion

RIVM-bioinformatics · Jul 26, 2024 · 70b5895 · 70b5895
1 parent 868c4e8
commit 70b5895
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 2 deletions.
diff --git a/Snakefile b/Snakefile
@@ -10,6 +10,7 @@ for param in ["threads", "mem_gb"]:
         config[param][k] = int(config[param][k])
 
 OUT = config["output_dir"]
+INPUT = config["input_dir"]
 
 # find collection using collfinder
 # iget collection and save to a path passed to cli

diff --git a/workflow/rules/clustering.smk b/workflow/rules/clustering.smk
@@ -97,7 +97,7 @@ else:
         input:
             distances=OUT + "/distances.tsv",
             previous_clustering=PREVIOUS_CLUSTERING + "/clusters.csv",
-            exclude_list=OUT + "/list_excluded_samples.txt",
+            exclude_list=OUT + "/list_excluded_samples.tsv",
         output:
             OUT + "/clusters.csv",
         log:
@@ -124,6 +124,6 @@ python workflow/scripts/cluster.py \
 --log {log} \
 --verbose \
 --merged-cluster-separator {params.merged_cluster_separator:q} \
---exclude {input.exclude_list}
+--exclude {input.exclude_list} \
 --output {output}
             """
diff --git a/workflow/scripts/cluster.py b/workflow/scripts/cluster.py
@@ -66,6 +66,29 @@ def read_data(distances, previous_clustering):
         )
     return df_distances, df_previous_clustering
 
+@timing
+def clean_sample_columns(df, cols, fixed_string):
+    """
+    Remove a fixed (literal) string from the values of the given columns
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Dataframe with distances; the listed columns are overwritten in place
+    cols : list
+        Columns to clean
+    fixed_string : str
+        Literal substring to remove from every value in `cols`
+
+    Returns
+    -------
+    df : pd.DataFrame
+        Dataframe with cleaned sample names
+
+    """
+    for col in cols:
+        # regex=False: pandas < 2.0 defaults to regex=True, which would
+        # interpret characters such as "." or "+" in fixed_string as a pattern
+        df[col] = df[col].str.replace(fixed_string, "", regex=False)
+    return df
+
+
 @timing
 def exclude_samples(df_distances, exclude_list):
     """
@@ -429,6 +452,8 @@ def main(args):
         args.distances, args.previous_clustering
     )
 
+    df_distances = clean_sample_columns(df_distances, ["sample1", "sample2"], "_contig1")
+
     if args.exclude_list:
         df_distances = exclude_samples(df_distances, args.exclude_list)