Skip to content

Commit

Permalink
fix: working exclusion
Browse files Browse the repository at this point in the history
  • Loading branch information
boasvdp committed Jul 26, 2024
1 parent 868c4e8 commit 70b5895
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 2 deletions.
1 change: 1 addition & 0 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ for param in ["threads", "mem_gb"]:
config[param][k] = int(config[param][k])

OUT = config["output_dir"]
INPUT = config["input_dir"]

# find collection using collfinder
# iget collection and save to a path passed to cli
Expand Down
4 changes: 2 additions & 2 deletions workflow/rules/clustering.smk
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ else:
input:
distances=OUT + "/distances.tsv",
previous_clustering=PREVIOUS_CLUSTERING + "/clusters.csv",
exclude_list=OUT + "/list_excluded_samples.txt",
exclude_list=OUT + "/list_excluded_samples.tsv",
output:
OUT + "/clusters.csv",
log:
Expand All @@ -124,6 +124,6 @@ python workflow/scripts/cluster.py \
--log {log} \
--verbose \
--merged-cluster-separator {params.merged_cluster_separator:q} \
--exclude {input.exclude_list}
--exclude {input.exclude_list} \
--output {output}
"""
25 changes: 25 additions & 0 deletions workflow/scripts/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,29 @@ def read_data(distances, previous_clustering):
)
return df_distances, df_previous_clustering

@timing
def clean_sample_columns(df, cols, fixed_string):
    """
    Remove a fixed string from the values of the given columns

    The string is matched literally, never as a regular expression, so the
    result does not depend on the default value of ``regex`` in the
    installed pandas version.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe with distances; the listed columns are overwritten in place
    cols : list
        Columns to clean
    fixed_string : str
        Literal substring to remove from every value in `cols`

    Returns
    -------
    df : pd.DataFrame
        The same dataframe object, with cleaned sample names
    """
    for col in cols:
        # regex=False is explicit because pandas < 2.0 defaulted to
        # regex=True, which would treat metacharacters in fixed_string
        # (e.g. ".") as a pattern instead of literal text.
        df[col] = df[col].str.replace(fixed_string, "", regex=False)
    return df


@timing
def exclude_samples(df_distances, exclude_list):
"""
Expand Down Expand Up @@ -429,6 +452,8 @@ def main(args):
args.distances, args.previous_clustering
)

df_distances = clean_sample_columns(df_distances, ["sample1", "sample2"], "_contig1")

if args.exclude_list:
df_distances = exclude_samples(df_distances, args.exclude_list)

Expand Down

0 comments on commit 70b5895

Please sign in to comment.