# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.


# Binarize BPE-encoded text with fairseq-preprocess.
#
# Usage: <this-script> VOCAB_DIR DATA LANG
#   VOCAB_DIR  directory that contains vocab.all.truncated
#   DATA       data root; input is read from DATA/bpe, output goes to DATA/bin,
#              and DATA/ref is created
#   LANG       source-language code passed to the first binarization step
#
# Exit status: 0 when every step succeeded, 2 on bad usage, 1 otherwise.

# Refuse to run with missing/empty arguments: an empty DATA would otherwise
# make the mkdir calls below target /bin and /ref.
if [ "$#" -lt 3 ] || [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ]; then
    echo "usage: $0 VOCAB_DIR DATA LANG" >&2
    exit 2
fi

VOCAB_DIR=$1     # path/to/vocab
DATA=$2          # path/to/data
# Stored as SRC_LANG rather than LANG: LANG is the POSIX locale variable, and
# assigning to it would change the locale seen by this shell and (when LANG is
# exported) by every fairseq-preprocess child process.
SRC_LANG=$3
DICT=$VOCAB_DIR/vocab.all.truncated

DATA_BPE=$DATA/bpe
DATA_BIN=$DATA/bin
DATA_REF=$DATA/ref

# Every step is attempted even if an earlier one fails; failures are recorded
# here so the script can report them through its exit status.
status=0

mkdir -p -- "$DATA_BIN" || status=1
mkdir -p -- "$DATA_REF" || status=1

# Save references
# (currently disabled, so DATA_REF is created but not populated by this script)

#for lg in en es fr de ru; do
    #cp ${DATA}/xglue.ntg.$lg.tgt.dev ${DATA_REF}/$lg.tgt.valid 
#done


# Binarize

# Step 1: data under DATA/bpe/mono, written to DATA/bin/<LANG>.
fairseq-preprocess \
    --only-source \
    --source-lang "$SRC_LANG" \
    --trainpref "$DATA_BPE/mono/mono.bpe" \
    --destdir "$DATA_BIN/$SRC_LANG" \
    --thresholdsrc 0 \
    --srcdict "$DICT" \
    --workers 256 || status=1

# NOTE(review): the two steps below use fixed paths relative to the current
# working directory (data/...) instead of $DATA and $DICT. They only match the
# layout used by step 1 when DATA=data and VOCAB_DIR=data/vocab -- confirm
# whether this is intentional before parameterizing them.

# Step 2: data/bpe/bt/zh_en, binarized as "en".
fairseq-preprocess \
    --only-source \
    --source-lang en \
    --trainpref data/bpe/bt/zh_en/all.bpe \
    --destdir data/bin/bt_zh_en \
    --thresholdsrc 0 \
    --srcdict data/vocab/vocab.all.truncated \
    --workers 256 || status=1

# Step 3: data/bpe/bt/en_zh, binarized as "zh".
fairseq-preprocess \
    --only-source \
    --source-lang zh \
    --trainpref data/bpe/bt/en_zh/train.bpe \
    --destdir data/bin/bt_en_zh \
    --thresholdsrc 0 \
    --srcdict data/vocab/vocab.all.truncated \
    --workers 256 || status=1

# Only claim success when nothing failed.
if [ "$status" -ne 0 ]; then
    echo "One or more preprocessing steps failed." >&2
    exit "$status"
fi

echo "Done!"
