# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.


# Positional arguments (all required):
#   $1  VOCAB_DIR    directory that contains vocab.all.truncated
#   $2  DATA         data root; the bpe/, bin/ and ref/ subdirectories live under it
#   $3  SOURCE_LANG  source language identifier
#   $4  TARGET_LANG  target language identifier
#
# Refuse to run with missing or empty arguments: an empty DATA would make the
# mkdir calls below operate on /bin and /ref at the filesystem root.
if [ "$#" -lt 4 ] || [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ] || [ -z "$4" ]; then
    echo "Usage: $0 VOCAB_DIR DATA SOURCE_LANG TARGET_LANG" >&2
    exit 1
fi

VOCAB_DIR=$1     # path/to/vocab
DATA=$2          # path/to/data
SOURCE_LANG=$3
TARGET_LANG=$4
DICT=$VOCAB_DIR/vocab.all.truncated

DATA_BPE=$DATA/bpe
DATA_BIN=$DATA/bin
DATA_REF=$DATA/ref

# Quote the paths so directories containing spaces or glob characters work,
# and stop immediately if the output directories cannot be created.
mkdir -p -- "$DATA_BIN" "$DATA_REF" || exit 1

# Save references (currently disabled: the copy loop below is commented out)

#for lg in en es fr de ru; do
    #cp ${DATA}/xglue.ntg.$lg.tgt.dev ${DATA_REF}/$lg.tgt.valid 
#done


# Binarize
#
# Runs fairseq-preprocess on the validation and test BPE files of the
# SOURCE_LANG -> TARGET_LANG pair and writes the result to
# $DATA_BIN/<source>_<target>. The same dictionary file ($DICT) is passed for
# both the source and the target side.
#
# The worker count defaults to 256 and can be overridden by setting the
# PREPROCESS_WORKERS environment variable.
#
# All expansions are quoted so paths with spaces survive, and the script exits
# with fairseq-preprocess's status on failure instead of continuing to "Done!".

fairseq-preprocess \
    --source-lang "$SOURCE_LANG" \
    --target-lang "$TARGET_LANG" \
    --validpref "$DATA_BPE/pair/${SOURCE_LANG}_${TARGET_LANG}/pair.valid.bpe" \
    --testpref "$DATA_BPE/pair/${SOURCE_LANG}_${TARGET_LANG}/pair.test.bpe" \
    --destdir "$DATA_BIN/${SOURCE_LANG}_${TARGET_LANG}" \
    --thresholdtgt 0 \
    --thresholdsrc 0 \
    --srcdict "${DICT}" \
    --tgtdict "${DICT}" \
    --workers "${PREPROCESS_WORKERS:-256}" || exit "$?"

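# Disabled: training-set binarization, kept for reference.
# NOTE(review): $TRANSLATION_DIRECTION is not assigned anywhere in the visible
# script; it must be defined before any of the lines below are re-enabled.
#    --trainpref $DATA_BPE/pair/$TRANSLATION_DIRECTION/pair.train.bpe \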
#fairseq-preprocess \
#    --source-lang zh \
#    --target-lang en \
#    --trainpref $DATA_BPE/pair/${TRANSLATION_DIRECTION}_lyrics/pair.train.bpe \
#    --destdir $DATA_BIN/${TRANSLATION_DIRECTION}_lyrics \
#    --thresholdtgt 0 \
#    --thresholdsrc 0 \
#    --srcdict ${DICT} \
#    --tgtdict ${DICT} \
#    --workers 256

# Report completion on stdout.
printf '%s\n' 'Done!'
