Skip to content

Commit

Permalink
Merge branch 'master' into exp1
Browse files (browse the repository at this point in the history)
  • Loading branch information
jordimas committed Aug 13, 2024
2 parents 2014792 + 4319440 commit 6084578
Show file tree
Hide file tree
Showing 20 changed files with 3,071 additions and 8 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ Language pair | SC model BLEU | SC Flores200 BLEU | Google BLEU | Meta NLLB200 B
|Catalan-German | 28.5 |25.4 |32.9 |29.1|15.8| 3142257 | [cat-deu-2022-11-16.zip](https://www.softcatala.org/pub/softcatala/opennmt/models/2022-11-22/cat-deu-2022-11-16.zip)
|English-Catalan | 46.9 |43.8 |46.0 |41.7|29.8| 7856208 | [eng-cat-2023-10-30.zip](https://www.softcatala.org/pub/softcatala/opennmt/models/2022-11-22/eng-cat-2023-10-30.zip)
|Catalan-English | 47.4 |43.5 |47.0 |48.0|29.6| 7856208 | [cat-eng-2023-10-29.zip](https://www.softcatala.org/pub/softcatala/opennmt/models/2022-11-22/cat-eng-2023-10-29.zip)
|Basque-Catalan | 38.8 |24.9 |29.6 |N/A|N/A| 9546180 | [eus-cat-2024-08-09.zip](https://www.softcatala.org/pub/softcatala/opennmt/models/2022-11-22/eus-cat-2024-08-09.zip)
|Catalan-Basque | 27.3 |17.1 |18.0 |N/A|N/A| 9546180 | [cat-eus-2024-08-12.zip](https://www.softcatala.org/pub/softcatala/opennmt/models/2022-11-22/cat-eus-2024-08-12.zip)
|French-Catalan | 41.3 |31.6 |37.3 |33.3|27.2| 2566302 | [fra-cat-2022-11-09.zip](https://www.softcatala.org/pub/softcatala/opennmt/models/2022-11-22/fra-cat-2022-11-09.zip)
|Catalan-French | 41.4 |35.4 |41.7 |39.6|27.9| 2566302 | [cat-fra-2022-11-14.zip](https://www.softcatala.org/pub/softcatala/opennmt/models/2022-11-22/cat-fra-2022-11-14.zip)
|Galician-Catalan | 74.1 |31.4 |36.5 |33.2|N/A| 2710149 | [glg-cat-2022-11-17.zip](https://www.softcatala.org/pub/softcatala/opennmt/models/2022-11-22/glg-cat-2022-11-17.zip)
Expand Down
1,012 changes: 1,012 additions & 0 deletions evaluate/flores200.eus

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions evaluate/google-bleu.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,7 @@
"jpn-cat": "23.4",
"cat-jpn": "32.5",
"glg-cat": "36.5",
"cat-glg": "33.1"
}
"cat-glg": "33.1",
"eus-cat": "29.6",
"cat-eus": "18.0"
}
3 changes: 3 additions & 0 deletions evaluate/google-translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,9 @@ def main():

"gl-ca" : ["glg", "cat"],
"ca-gl" : ["cat", "glg"],

"eu-ca" : ["eus", "cat"],
"ca-eu" : ["cat", "eus"],
}

blue_scores = {}
Expand Down
1,012 changes: 1,012 additions & 0 deletions evaluate/google-translate/flores200-cat-eus.eus

Large diffs are not rendered by default.

1,012 changes: 1,012 additions & 0 deletions evaluate/google-translate/flores200-eus-cat.cat

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions languages/eus-cat/bleu.sh
13 changes: 13 additions & 0 deletions languages/eus-cat/corpus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
source_files:
- corpus-raw/eus-cat-1.eu
- corpus-raw/eus-cat-2.eu
- corpus-raw/eus-cat-3.eu
- corpus-raw/eus-cat-4.eu
- corpus-raw/eus-cat-5.eu

target_files:
- corpus-raw/eus-cat-1.ca
- corpus-raw/eus-cat-2.ca
- corpus-raw/eus-cat-3.ca
- corpus-raw/eus-cat-4.ca
- corpus-raw/eus-cat-5.ca
1 change: 1 addition & 0 deletions languages/eus-cat/data.yml
1 change: 1 addition & 0 deletions languages/eus-cat/export.sh
1 change: 1 addition & 0 deletions languages/eus-cat/model.py
1 change: 1 addition & 0 deletions languages/eus-cat/preprocess.sh
1 change: 1 addition & 0 deletions languages/eus-cat/train.sh
1 change: 1 addition & 0 deletions languages/eus-cat/voc.sh
2 changes: 1 addition & 1 deletion languages/get-corpuses.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
git clone --single-branch --branch eng-cat-aina --depth 1 https://github.com/Softcatala/parallel-catalan-corpus corpus-raw
cd corpus-raw

declare -a arr=("eng-cat" "deu-cat" "ita-cat" "fra-cat" "spa-cat" "nld-cat" "por-cat" "jpn-cat" "glg-cat" "oci-cat")
declare -a arr=("eng-cat" "deu-cat" "ita-cat" "fra-cat" "spa-cat" "nld-cat" "por-cat" "jpn-cat" "glg-cat" "oci-cat" "eus-cat")

for dirname in "${arr[@]}"; do
echo Copying $dirname
Expand Down
3 changes: 1 addition & 2 deletions languages/master-language/export.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ onmt-main --config data.yml --auto_config export --export_dir $exportDir/tensorf
echo "Model description: $modelName" >> $modelDescription
echo "Date: $currentDate" >> $modelDescription
python3 ../../install-scripts/stack-versions.py >> $modelDescription
wc corpus/$modelName/src-train.txt -l > $exportDir/metadata/inputs_used.txt
ls corpus/$modelName/*.txt -l >> $exportDir/metadata/inputs_used.txt
wc src-train.txt -l > $exportDir/metadata/inputs_used.txt
cp *.model $exportDir/tokenizer/
cp corpus.yml data.yml model.py $exportDir/metadata/
cp sp-vocab.txt.token $exportDir/tensorflow/assets/
Expand Down
2 changes: 1 addition & 1 deletion languages/preprocess-all.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

declare -a arr=("ita-cat" "fra-cat" "spa-cat" "por-cat" "eng-cat" "deu-cat" "nld-cat" "jpn-cat" "glg-cat" "oci-cat")
declare -a arr=("ita-cat" "fra-cat" "spa-cat" "por-cat" "eng-cat" "deu-cat" "nld-cat" "jpn-cat" "glg-cat" "oci-cat" "eus-cat")

for dirname in "${arr[@]}"; do
echo Processing $dirname
Expand Down
2 changes: 1 addition & 1 deletion languages/train-all-from-cat.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

declare -a arr=("cat-eng" "cat-deu" "cat-fra" "cat-ita" "cat-spa" "cat-por" "cat-nld" "cat-jpn" "cat-glg" "cat-oci")
declare -a arr=("cat-eng" "cat-deu" "cat-fra" "cat-ita" "cat-spa" "cat-por" "cat-nld" "cat-jpn" "cat-glg" "cat-oci" "cat-eus")
#declare -a arr=("cat-eng")

for dirname in "${arr[@]}"; do
Expand Down
2 changes: 1 addition & 1 deletion languages/train-all-to-cat.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

declare -a arr=("eng-cat" "deu-cat" "fra-cat" "ita-cat" "spa-cat" "por-cat" "nld-cat" "jpn-cat" "glg-cat" "oci-cat")
declare -a arr=("eng-cat" "deu-cat" "fra-cat" "ita-cat" "spa-cat" "por-cat" "nld-cat" "jpn-cat" "glg-cat" "oci-cat" "eus-cat")
#declare -a arr=("eng-cat")

for dirname in "${arr[@]}"; do
Expand Down
1 change: 1 addition & 0 deletions models-table.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def convert_iso_639_3_to_string(language_pair):
"jpn" : "Japanese",
"glg" : "Galician",
"oci" : "Occitan",
"eus" : "Basque",
}

for iso in languages.keys():
Expand Down

0 comments on commit 6084578

Please sign in to comment.