Using fork_from instead of resume in wandb.init #34

Closed
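This PR replaces `resume="must"` with W&B run forking via `fork_from` in every `wandb.init` call in the training scripts. As orientation, a minimal sketch of the before/after pattern follows; it assumes a wandb client recent enough to ship `fork_from` (a beta feature, roughly 0.16.5 and later), and the placeholder values stand in for what the scripts actually read from `args` and the checkpoint `state`.

import wandb

# Placeholders for values the training scripts read from args/state.
experiment_name = "my-experiment"  # hypothetical experiment name
rank = 0                           # this process's rank
global_step = 1000                 # step restored from the checkpoint

# Before: append to the existing run id (fails if it does not exist).
#   wandb.init(id=f"{experiment_name}-{rank}", resume="must")

# After: fork a new run from the source run's history at global_step.
# The fork point is addressed as "<source_run_id>?_step=<step>".
wandb.init(
    name=f"rank-{rank}",
    fork_from=f"{experiment_name}-{rank}?_step={global_step}",
)

The practical difference: `resume="must"` keeps writing to the same run, so steps logged after restoring an older checkpoint can collide with history the crashed run already logged, whereas forking starts a fresh run whose history is copied from the source only up to the fork step.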
2 changes: 1 addition & 1 deletion 01-single-gpu/README.md
@@ -60,7 +60,7 @@ wandb.init(
dir=exp_dir,
name=args.experiment_name,
id=args.experiment_name,
- resume="must" if resumed else None,
+ fork_from=f"{args.experiment_name}-{rank}?_step={state['global_step']}" if resumed else None,
save_code=True,
config={
"args": vars(args),
6 changes: 5 additions & 1 deletion 01-single-gpu/train_llm.py
@@ -102,7 +102,11 @@ def _load_to_device(p):
dir=exp_dir,
name=args.experiment_name,
id=args.experiment_name,
resume="must" if resumed else None,
fork_from=(
f"{args.experiment_name}-{rank}?_step={state['global_step']}"
if resumed
else None
),
save_code=True,
config={
"args": vars(args),
2 changes: 1 addition & 1 deletion 02-multi-gpu/README.md
@@ -245,7 +245,7 @@ wandb.init(
+ name=f"rank-{rank}",
- id=args.experiment_name,
+ id=f"{args.experiment_name}-{rank}",
resume="must" if resumed else None,
fork_from=f"{args.experiment_name}-{rank}?_step={state['global_step']}" if resumed else None,
save_code=True,
config={
"args": vars(args),
6 changes: 5 additions & 1 deletion 02-multi-gpu/train_llm.py
@@ -121,7 +121,11 @@ def _load_to_device(p):
group=args.experiment_name,
name=f"rank-{rank}",
id=f"{args.experiment_name}-{rank}",
resume="must" if resumed else None,
fork_from=(
f"{args.experiment_name}-{rank}?_step={state['global_step']}"
if resumed
else None
),
save_code=True,
config={
"args": vars(args),
6 changes: 5 additions & 1 deletion 03-multi-node/train_llm.py
@@ -123,7 +123,11 @@ def _load_to_device(p):
group=args.experiment_name,
name=f"rank-{rank}",
id=f"{args.experiment_name}-{rank}",
resume="must" if resumed else None,
fork_from=(
f"{args.experiment_name}-{rank}?_step={state['global_step']}"
if resumed
else None
),
save_code=True,
config={
"args": vars(args),
6 changes: 5 additions & 1 deletion 04-job-launchers-deepspeed/train_llm.py
@@ -122,7 +122,11 @@ def _load_to_device(p):
group=args.experiment_name,
name=f"rank-{rank}",
id=f"{args.experiment_name}-{rank}",
resume="must" if resumed else None,
fork_from=(
f"{args.experiment_name}-{rank}?_step={state['global_step']}"
if resumed
else None
),
save_code=True,
config={
"args": vars(args),
6 changes: 5 additions & 1 deletion 04-job-launchers-mpirun/train_llm.py
@@ -125,7 +125,11 @@ def _load_to_device(p):
group=args.experiment_name,
name=f"rank-{rank}",
id=f"{args.experiment_name}-{rank}",
resume="must" if resumed else None,
fork_from=(
f"{args.experiment_name}-{rank}?_step={state['global_step']}"
if resumed
else None
),
save_code=True,
config={
"args": vars(args),
6 changes: 5 additions & 1 deletion 05-sharding-deepspeed/train_llm.py
@@ -111,7 +111,11 @@ def main():
group=args.experiment_name,
name=f"rank-{rank}",
id=f"{args.experiment_name}-{rank}",
resume="must" if resumed else None,
fork_from=(
f"{args.experiment_name}-{rank}?_step={state['global_step']}"
if resumed
else None
),
save_code=True,
config={
"args": vars(args),
6 changes: 5 additions & 1 deletion 05-sharding-fsdp/train_llm.py
@@ -187,7 +187,11 @@ def safe_param_init_fn(module: torch.nn.Module):
group=args.experiment_name,
name=f"rank-{rank}",
id=f"{args.experiment_name}-{rank}",
resume="must" if resumed else None,
fork_from=(
f"{args.experiment_name}-{rank}?_step={state['global_step']}"
if resumed
else None
),
save_code=True,
config={
"args": vars(args),
6 changes: 5 additions & 1 deletion 10-training-llama-405b/train_llm.py
@@ -203,7 +203,11 @@ def main():
dir=exp_dir,
name=args.experiment_name,
id=args.experiment_name,
resume="must" if resumed else None,
fork_from=(
f"{args.experiment_name}-{rank}?_step={state['global_step']}"
if resumed
else None
),
save_code=True,
config={
"args": vars(args),
10 changes: 5 additions & 5 deletions 93-wandb-configurations/README.md
@@ -18,7 +18,7 @@ if rank == 0:
dir=exp_dir,
id=args.experiment_name,
name=args.experiment_name,
resume="must" if resumed else None,
fork_from=f"{args.experiment_name}-{rank}?_step={state['global_step']}" if resumed else None,
save_code=True,
config=...,
)
@@ -34,7 +34,7 @@ if local_rank == 0:
group=args.experiment_name,
name=f"rank-{rank}",
id=f"{args.experiment_name}-{rank}",
resume="must" if resumed else None,
fork_from=f"{args.experiment_name}-{rank}?_step={state['global_step']}" if resumed else None,
save_code=True,
config=...,
)
@@ -59,7 +59,7 @@ wandb.init(
group=args.experiment_name,
name=f"rank-{rank}",
id=f"{args.experiment_name}-{rank}",
resume="must" if resumed else None,
fork_from=f"{args.experiment_name}-{rank}?_step={state['global_step']}" if resumed else None,
save_code=True,
config=...,
)
@@ -81,7 +81,7 @@ index 38f3cf0..3233f81 100644
- group=args.experiment_name,
- name=f"rank-{rank}",
- id=f"{args.experiment_name}-{rank}",
- resume="must" if resumed else None,
- fork_from=f"{args.experiment_name}-{rank}?_step={state['global_step']}" if resumed else None,
- save_code=True,
- config={
- "args": vars(args),
@@ -107,7 +107,7 @@ index 38f3cf0..3233f81 100644
+ group=args.experiment_name,
+ name=f"rank-{rank}",
+ id=f"{args.experiment_name}-{rank}",
+ resume="must" if resumed else None,
+ fork_from=f"{args.experiment_name}-{rank}?_step={state['global_step']}" if resumed else None,
+ save_code=True,
+ config={
+ "args": vars(args),
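The multi-rank configurations above all use the same per-rank recipe: each rank owns run id `{experiment_name}-{rank}` and, on resume, forks from its own prior run at the step recorded in its checkpoint. A self-contained sketch of that recipe follows; the `RANK` environment variable and the placeholder values are assumptions standing in for what the real scripts get from the launcher and the loaded checkpoint.

import os
import wandb

# Per-rank forking, mirroring the diffs above; values are placeholders.
rank = int(os.environ.get("RANK", "0"))  # set by the launcher in the real scripts
experiment_name = "my-experiment"
state = {"global_step": 1000}            # stands in for the restored checkpoint
resumed = True

wandb.init(
    group=experiment_name,               # groups the per-rank runs together
    name=f"rank-{rank}",
    id=f"{experiment_name}-{rank}",
    fork_from=(
        f"{experiment_name}-{rank}?_step={state['global_step']}"
        if resumed
        else None
    ),
)

Keeping the per-rank `id` alongside `fork_from` follows the diffs in this PR; the grouped runs still appear under one experiment in the W&B UI.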
6 changes: 5 additions & 1 deletion 93-wandb-configurations/train_llm.py
@@ -130,7 +130,11 @@ def _load_to_device(p):
group=args.experiment_name,
name=f"rank-{rank}",
id=f"{args.experiment_name}-{rank}",
resume="must" if resumed else None,
fork_from=(
f"{args.experiment_name}-{rank}?_step={state['global_step']}"
if resumed
else None
),
save_code=True,
config={
"args": vars(args),
6 changes: 5 additions & 1 deletion 94-effective-batch-size-and-lr/train_llm.py
@@ -130,7 +130,11 @@ def _load_to_device(p):
group=args.experiment_name,
name=f"rank-{rank}",
id=f"{args.experiment_name}-{rank}",
resume="must" if resumed else None,
fork_from=(
f"{args.experiment_name}-{rank}?_step={state['global_step']}"
if resumed
else None
),
save_code=True,
config={
"args": vars(args),
6 changes: 5 additions & 1 deletion 95-optimizing-data-loading/train_llm.py
@@ -124,7 +124,11 @@ def _load_to_device(p):
group=args.experiment_name,
name=f"rank-{rank}",
id=f"{args.experiment_name}-{rank}",
resume="must" if resumed else None,
fork_from=(
f"{args.experiment_name}-{rank}?_step={state['global_step']}"
if resumed
else None
),
save_code=True,
config={
"args": vars(args),
6 changes: 5 additions & 1 deletion 97-determinism/train_llm.py
@@ -136,7 +136,11 @@ def _load_to_device(p):
group=args.experiment_name,
name=f"rank-{rank}",
id=f"{args.experiment_name}-{rank}",
resume="must" if resumed else None,
fork_from=(
f"{args.experiment_name}-{rank}?_step={state['global_step']}"
if resumed
else None
),
save_code=True,
config={
"args": vars(args),
6 changes: 5 additions & 1 deletion 98-gradient-accumulation/train_llm.py
@@ -122,7 +122,11 @@ def _load_to_device(p):
group=args.experiment_name,
name=f"rank-{rank}",
id=f"{args.experiment_name}-{rank}",
resume="must" if resumed else None,
fork_from=(
f"{args.experiment_name}-{rank}?_step={state['global_step']}"
if resumed
else None
),
save_code=True,
config={
"args": vars(args),