"""
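Compute the character error rate (CER).

Put the text to be compared in the preFile folder
and the original (reference) text in the textFile folder.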
"""
import os
import re
from tqdm import tqdm
import multiprocess
import math
import time
import distance

def ishan(text):
    """Strip every character that is not a Chinese character from a string.

    A character is kept when it lies in the inclusive range
    U+4E00..U+9FFF; every other character is dropped.

    Args:
        text (str): Input string.

    Returns:
        str: The kept characters, in their original order.
    """
    # Example: ishan('我&&你') == '我你'
    kept = []
    for ch in text:
        if '\u4e00' <= ch <= '\u9fff':
            kept.append(ch)
    return "".join(kept)


def cer(preFile, textFile):
    """Print the character error rate (CER) of every file in ``preFile``.

    Each file in ``preFile`` is compared line by line against the file of
    the same name in ``textFile``. Both lines of a pair are first reduced to
    their Chinese characters with ``ishan``; the edit distance between them
    (``distance.levenshtein``) is summed over the file and divided by the
    total number of reference characters.

    Args:
        preFile (str): Directory holding the predicted text files.
        textFile (str): Directory holding the reference text files, named
            identically to the files in ``preFile``.

    Returns:
        None. The per-file result is printed as
        ``Total CER: <edit distance>/<reference chars>=<rate>``.

    Raises:
        OSError: If a directory cannot be listed or a file cannot be opened
            (for example, a prediction file without a matching reference).
    """
    # os.listdir returns entries in arbitrary order; sort so that the report
    # is reproducible between runs.
    for pre in sorted(os.listdir(preFile)):
        # The reference file carries the same name as the prediction file.
        text = pre
        print("filename:{}".format(pre))
        with open(os.path.join(preFile, pre), "r", encoding="utf-8") as fr:
            preList = fr.readlines()
        with open(os.path.join(textFile, text), "r", encoding="utf-8") as fr:
            textList = fr.readlines()

        # zip() stops at the shorter list, so unmatched trailing lines would
        # be dropped from the score without any notice.
        if len(preList) != len(textList):
            print("Warning: line count mismatch (pred: {}, gt: {}); "
                  "unmatched lines are ignored".format(len(preList),
                                                       len(textList)))

        total_edit_distance, num_chars = 0, 0
        # A zip object has no length; pass the pair count so the progress
        # bar can show completion.
        pair_count = min(len(preList), len(textList))
        for pred, expected in tqdm(zip(preList, textList), total=pair_count):
            pred = ishan(pred)
            expected = ishan(expected)
            total_edit_distance += distance.levenshtein(expected, pred)
            num_chars += len(expected)

        # With no reference characters the rate is undefined; report nan
        # instead of raising ZeroDivisionError and aborting the other files.
        if num_chars:
            rate = round(float(total_edit_distance) / num_chars, 5)
        else:
            rate = float("nan")
        print("Total CER: {}/{}={}\n".format(total_edit_distance,
                                             num_chars,
                                             rate))

if __name__ == "__main__":
    # Script entry point: score the predictions in result_split against the
    # reference text in hanzi.
    preFile, textFile = "./data/test_data/result_split", "./data/test_data/hanzi"
    cer(preFile, textFile)
    