Skip to content

Commit

Permalink
Merge branch 'master' into sc-translate
Browse files Browse the repository at this point in the history
  • Loading branch information
jordimas committed Aug 13, 2024
2 parents 7f2c189 + ea37dca commit e2dc329
Show file tree
Hide file tree
Showing 21 changed files with 3,084 additions and 7 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ Language pair | SC model BLEU | SC Flores200 BLEU | Google BLEU | Meta NLLB200 B
|Catalan-German | 28.5 |25.4 |32.9 |29.1|15.8| 3142257 | [cat-deu-2022-11-16.zip](https://www.softcatala.org/pub/softcatala/opennmt/models/2022-11-22/cat-deu-2022-11-16.zip)
|English-Catalan | 46.9 |43.8 |46.0 |41.7|29.8| 7856208 | [eng-cat-2023-10-30.zip](https://www.softcatala.org/pub/softcatala/opennmt/models/2022-11-22/eng-cat-2023-10-30.zip)
|Catalan-English | 47.4 |43.5 |47.0 |48.0|29.6| 7856208 | [cat-eng-2023-10-29.zip](https://www.softcatala.org/pub/softcatala/opennmt/models/2022-11-22/cat-eng-2023-10-29.zip)
|Basque-Catalan | 38.8 |24.9 |29.6 |N/A|N/A| 9546180 | [eus-cat-2024-08-09.zip](https://www.softcatala.org/pub/softcatala/opennmt/models/2022-11-22/eus-cat-2024-08-09.zip)
|Catalan-Basque | 27.3 |17.1 |18.0 |N/A|N/A| 9546180 | [cat-eus-2024-08-12.zip](https://www.softcatala.org/pub/softcatala/opennmt/models/2022-11-22/cat-eus-2024-08-12.zip)
|French-Catalan | 41.3 |31.6 |37.3 |33.3|27.2| 2566302 | [fra-cat-2022-11-09.zip](https://www.softcatala.org/pub/softcatala/opennmt/models/2022-11-22/fra-cat-2022-11-09.zip)
|Catalan-French | 41.4 |35.4 |41.7 |39.6|27.9| 2566302 | [cat-fra-2022-11-14.zip](https://www.softcatala.org/pub/softcatala/opennmt/models/2022-11-22/cat-fra-2022-11-14.zip)
|Galician-Catalan | 74.1 |31.4 |36.5 |33.2|N/A| 2710149 | [glg-cat-2022-11-17.zip](https://www.softcatala.org/pub/softcatala/opennmt/models/2022-11-22/glg-cat-2022-11-17.zip)
Expand Down
10 changes: 9 additions & 1 deletion data-processing-tools/join-single-file.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import datetime
import unicodedata
from optparse import OptionParser
import resource
import re

g_configuration = None

Expand Down Expand Up @@ -160,6 +162,9 @@ def _create_dir_if_does_exist(directory):
if not os.path.exists(directory):
os.makedirs(directory)

def _clean_for_dup_detection(string):
return re.sub(r'\s+', '', string)

# https://arxiv.org/abs/1907.01279 contains an overview of some of the techniques used here
def split_in_six_files(src_filename, tgt_filename, directory, source_lang, target_lang):

Expand Down Expand Up @@ -226,7 +231,8 @@ def split_in_six_files(src_filename, tgt_filename, directory, source_lang, targe
equal += 1
continue

pair = src + trg
with_no_spaces = _clean_for_dup_detection(src + trg)
pair = hash(with_no_spaces)
if pair in pairs:
duplicated = duplicated + 1
continue
Expand Down Expand Up @@ -279,6 +285,7 @@ def split_in_six_files(src_filename, tgt_filename, directory, source_lang, targe
cnt_steps_val += 1
cnt_steps_test += 1

max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
pduplicated = duplicated * 100 / strings
pdots = dots * 100 / strings
pclean_src = clean_src * 100 / strings
Expand All @@ -289,6 +296,7 @@ def split_in_six_files(src_filename, tgt_filename, directory, source_lang, targe
print(f"Cleaned acute accents. src: {clean_src} ({pclean_src:.2f}%), tgt: {clean_trg} ({pclean_trg:.2f}%)")
print(f"Empty sentences or diff len too long: {bad_length} ({pbad_length:.2f}%)")
print(f"Dots: {dots} ({pdots:.2f}%), equal: {equal} ({pequal:.2f}%)")
print(f"max_rss {max_rss:.2f} MB")

def append_lines_from_file(src_filename, trg_file, max_lines):
lines = 0
Expand Down
5 changes: 5 additions & 0 deletions data-processing-tools/tests/testjoin-single-file.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,5 +118,10 @@ def test__get_val_test_split_lines(self):
self.assertEquals(1000, steps_val)
self.assertEquals(500, steps_test)

def test___clean_for_dup_detection(self):
    """Check that _clean_for_dup_detection removes spaces, tabs and line endings."""
    # assertEqual is used instead of the assertEquals alias, which was
    # deprecated and then removed from unittest in Python 3.12.
    self.assertEqual("Word1Word2", join_single_file._clean_for_dup_detection("Word1 Word2\n"))
    self.assertEqual("Word1Word2", join_single_file._clean_for_dup_detection("Word1\tWord2\r"))
    # Punctuation is kept; only whitespace is removed.
    self.assertEqual("Word1Word2.", join_single_file._clean_for_dup_detection("Word1 Word2.\r"))

if __name__ == '__main__':
unittest.main()
1,012 changes: 1,012 additions & 0 deletions evaluate/flores200.eus

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions evaluate/google-bleu.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,7 @@
"jpn-cat": "23.4",
"cat-jpn": "32.5",
"glg-cat": "36.5",
"cat-glg": "33.1"
}
"cat-glg": "33.1",
"eus-cat": "29.6",
"cat-eus": "18.0"
}
3 changes: 3 additions & 0 deletions evaluate/google-translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,9 @@ def main():

"gl-ca" : ["glg", "cat"],
"ca-gl" : ["cat", "glg"],

"eu-ca" : ["eus", "cat"],
"ca-eu" : ["cat", "eus"],
}

blue_scores = {}
Expand Down
1,012 changes: 1,012 additions & 0 deletions evaluate/google-translate/flores200-cat-eus.eus

Large diffs are not rendered by default.

1,012 changes: 1,012 additions & 0 deletions evaluate/google-translate/flores200-eus-cat.cat

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions languages/eus-cat/bleu.sh
13 changes: 13 additions & 0 deletions languages/eus-cat/corpus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
source_files:
- corpus-raw/eus-cat-1.eu
- corpus-raw/eus-cat-2.eu
- corpus-raw/eus-cat-3.eu
- corpus-raw/eus-cat-4.eu
- corpus-raw/eus-cat-5.eu

target_files:
- corpus-raw/eus-cat-1.ca
- corpus-raw/eus-cat-2.ca
- corpus-raw/eus-cat-3.ca
- corpus-raw/eus-cat-4.ca
- corpus-raw/eus-cat-5.ca
1 change: 1 addition & 0 deletions languages/eus-cat/data.yml
1 change: 1 addition & 0 deletions languages/eus-cat/export.sh
1 change: 1 addition & 0 deletions languages/eus-cat/model.py
1 change: 1 addition & 0 deletions languages/eus-cat/preprocess.sh
1 change: 1 addition & 0 deletions languages/eus-cat/train.sh
1 change: 1 addition & 0 deletions languages/eus-cat/voc.sh
2 changes: 1 addition & 1 deletion languages/get-corpuses.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
git clone --depth=1 https://github.com/Softcatala/parallel-catalan-corpus corpus-raw
cd corpus-raw

declare -a arr=("eng-cat" "deu-cat" "ita-cat" "fra-cat" "spa-cat" "nld-cat" "por-cat" "jpn-cat" "glg-cat" "oci-cat")
declare -a arr=("eng-cat" "deu-cat" "ita-cat" "fra-cat" "spa-cat" "nld-cat" "por-cat" "jpn-cat" "glg-cat" "oci-cat" "eus-cat")

for dirname in "${arr[@]}"; do
echo Copying $dirname
Expand Down
2 changes: 1 addition & 1 deletion languages/preprocess-all.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

declare -a arr=("ita-cat" "fra-cat" "spa-cat" "por-cat" "eng-cat" "deu-cat" "nld-cat" "jpn-cat" "glg-cat" "oci-cat")
declare -a arr=("ita-cat" "fra-cat" "spa-cat" "por-cat" "eng-cat" "deu-cat" "nld-cat" "jpn-cat" "glg-cat" "oci-cat" "eus-cat")

for dirname in "${arr[@]}"; do
echo Processing $dirname
Expand Down
2 changes: 1 addition & 1 deletion languages/train-all-from-cat.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

declare -a arr=("cat-eng" "cat-deu" "cat-fra" "cat-ita" "cat-spa" "cat-por" "cat-nld" "cat-jpn" "cat-glg" "cat-oci")
declare -a arr=("cat-eng" "cat-deu" "cat-fra" "cat-ita" "cat-spa" "cat-por" "cat-nld" "cat-jpn" "cat-glg" "cat-oci" "cat-eus")
#declare -a arr=("cat-eng")

for dirname in "${arr[@]}"; do
Expand Down
2 changes: 1 addition & 1 deletion languages/train-all-to-cat.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

declare -a arr=("eng-cat" "deu-cat" "fra-cat" "ita-cat" "spa-cat" "por-cat" "nld-cat" "jpn-cat" "glg-cat" "oci-cat")
declare -a arr=("eng-cat" "deu-cat" "fra-cat" "ita-cat" "spa-cat" "por-cat" "nld-cat" "jpn-cat" "glg-cat" "oci-cat" "eus-cat")
#declare -a arr=("eng-cat")

for dirname in "${arr[@]}"; do
Expand Down
1 change: 1 addition & 0 deletions models-table.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def convert_iso_639_3_to_string(language_pair):
"jpn" : "Japanese",
"glg" : "Galician",
"oci" : "Occitan",
"eus" : "Basque",
}

for iso in languages.keys():
Expand Down

0 comments on commit e2dc329

Please sign in to comment.