# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.


# Abort on the first failing command and on use of an unset variable, so a
# failed step cannot be silently followed by later steps.
set -eu

# Input locations (edit these to point at your own vocab / data directories).
VOCAB_DIR=data/vocab     # path/to/vocab
DATA=data          # path/to/data
DICT="$VOCAB_DIR/vocab.all.truncated"

# Sub-directories derived from $DATA.
DATA_BPE="$DATA/bpe"
DATA_BIN="$DATA/bin"
DATA_REF="$DATA/ref"

# These paths are constants for the rest of the script.
readonly VOCAB_DIR DATA DICT DATA_BPE DATA_BIN DATA_REF

# Create the output directories; quoted so paths containing spaces or glob
# characters are passed through as single arguments.
mkdir -p -- "$DATA_BIN" "$DATA_REF"

# Save references

#for lg in en es fr de ru; do
    #cp ${DATA}/xglue.ntg.$lg.tgt.dev ${DATA_REF}/$lg.tgt.valid
#done


# Binarize
#
# Only --testpref is passed to fairseq-preprocess below. The following
# alternative prefix options are left disabled, kept here for reference:
#    --trainpref $DATA_BPE/word.train.bpe \
#    --validpref $DATA_BPE/word.valid.bpe \
#    --testpref $DATA_BPE/word.test.bpe \

# binarize_pair SRC TGT
#
# Runs fairseq-preprocess for the SRC -> TGT direction.
#   $1 - source language code (passed as --source-lang)
#   $2 - target language code (passed as --target-lang)
# Reads the globals DATA_BPE, DATA_BIN and DICT. The same dictionary file is
# given for both --srcdict and --tgtdict, and output goes to
# $DATA_BIN/ft_<SRC>_<TGT>.
# Returns the exit status of fairseq-preprocess.
binarize_pair() {
    fairseq-preprocess \
        --source-lang "$1" \
        --target-lang "$2" \
        --testpref "$DATA_BPE/ft_valid_with_test/word.bpe" \
        --destdir "$DATA_BIN/ft_${1}_${2}" \
        --thresholdtgt 0 \
        --thresholdsrc 0 \
        --srcdict "$DICT" \
        --tgtdict "$DICT" \
        --workers 16
}

# Both directions use the same input prefix; stop with the tool's own exit
# status as soon as one direction fails so "Done!" is only printed on success.
binarize_pair zh en || exit $?
binarize_pair en zh || exit $?

echo "Done!"
