#-*- coding: utf-8 -*-
#!/usr/bin/python2
"""
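Before running this code, make sure that you've downloaded the Leipzig Chinese Corpus
(http://corpora2.informatik.uni-leipzig.de/downloads/zho_news_2007-2009_1M-text.tar.gz).
Extract it and copy `zho_news_2007-2009_1M-sentences.txt` to the `data/` folder.
Note that `build_corpus` reads `data/train_set_total.txt` (one sentence per line),
so prepare that file from the corpus or point the code at your own input.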

This code should generate a file (`data/zh.tsv`) which looks like this:
zhegeyemianxianzaiyijingzuofei...。[Tab]这__个_页_面___现___在__已_经___作__废__...。

In each line, the pinyin and the Chinese sentence are separated by a tab.
Note that _ is a filler: each Chinese character is padded with _ so that it
takes up as many positions as its pinyin.

Created in Aug. 2017, kyubyong. kbpark.linguist@gmail.com
"""
from __future__ import print_function

import codecs
import os
import re
import traceback
from threading import Semaphore

import regex # pip install regex
from xpinyin import Pinyin # pip install xpinyin 

# Module-level xpinyin converter, created once at import time and used by
# align() through its get_pinyin() method.
pinyin = Pinyin()

def align(sent):
    '''
    Converts a sentence to pinyin and pads the sentence to the same length.

    Whitespace in `sent` is discarded. Each character that was converted to
    pinyin is followed by as many "_" fillers as needed to occupy the same
    number of positions as its pinyin. Any other run of characters
    (punctuation, digits, latin letters, ...) is copied unchanged to both
    outputs.

    Args:
      sent: A string. A sentence.

    Returns:
      A tuple of pinyin and chinese sentence. Both strings have the same
      length.

    Raises:
      AssertionError: If the pinyin tokens cannot be lined up with the
        characters of the sentence.
    '''
    # Strip whitespace *before* converting so that the characters we index
    # and the tokens we walk are derived from exactly the same string.
    chars = "".join(sent.split())
    pnyns = pinyin.get_pinyin(chars, " ").split()

    hanzis = []
    pos = 0  # Index of the next character of `chars` not yet consumed.
    for p in pnyns:
        if chars.startswith(p, pos):
            # The token is a verbatim copy of the text at `pos`, i.e. a run
            # that was not converted. NOTE(review): assumes xpinyin emits
            # unconvertible characters unchanged; the final check below
            # guards that assumption.
            hanzis.append(p)
            pos += len(p)
        elif pos < len(chars):
            # The token is the pinyin of a single character: keep the
            # character and pad it to the length of its pinyin.
            hanzis.append(chars[pos] + "_" * (len(p) - 1))
            pos += 1
        else:
            # More tokens than characters; reported by the check below.
            break

    pnyns = "".join(pnyns)
    hanzis = "".join(hanzis)

    # Raised explicitly (rather than via `assert`) so that the check is not
    # stripped when Python runs with -O.
    if pos != len(chars) or len(pnyns) != len(hanzis):
        raise AssertionError("The hanzis and the pinyins must be the same in length.")
    return pnyns, hanzis

# Matches every character that clean() drops: anything other than a space,
# a CJK unified ideograph in U+4E00-U+9FA5, or one of the four sentence
# punctuation marks.
_CLEAN_DISALLOWED = re.compile(u"[^ \u4e00-\u9fa5。，！？]")
# Matches a punctuation mark followed by one or more further punctuation
# marks; group 1 is the first mark of the run.
_CLEAN_PUNCT_RUN = re.compile(u"([。，！？])[。，！？]+")


def clean(text):
    '''
    Strips a sentence down to Chinese characters and basic punctuation.

    Removes every character that is not a space, a Chinese character
    (U+4E00-U+9FA5) or one of 。，！？, then collapses each run of
    consecutive punctuation marks to its first mark.

    Args:
      text: A string. A raw sentence.

    Returns:
      The cleaned sentence, or an empty string if fewer than 10 characters
      are left after cleaning.
    '''
    # Order matters: removing characters first lets punctuation marks that
    # were separated only by removed characters be collapsed as one run.
    text = _CLEAN_DISALLOWED.sub(u"", text)
    text = _CLEAN_PUNCT_RUN.sub(u"\\1", text)
    if len(text) < 10:
        return ""
    return text
    
def build_corpus(src="data/train_set_total.txt", dst="data/zh.tsv"):
    '''
    Writes the aligned pinyin / chinese sentence pairs of a text file.

    Reads `src` line by line (utf-8, one sentence per line), removes the
    spaces of each sentence and, for every non-empty sentence, writes one
    "pinyin[Tab]sentence" line to `dst`. A sentence that fails to convert is
    reported with a traceback and skipped. The running count of handled
    lines is printed every 10000 lines.

    Args:
      src: A string. Path of the input file.
      dst: A string. Path of the output file. It is overwritten.
    '''
    with codecs.open(dst, 'w', 'utf-8') as fout, \
            codecs.open(src, 'r', 'utf-8') as fin:
        i = 1  # Lines handled without an error, used for progress output.
        for line in fin:
            try:
                sent = line.strip().replace(" ", "")
                if len(sent) > 0:
                    pnyns, hanzis = align(sent)
                    fout.write(u"{}\t{}\n".format(pnyns, hanzis))
            except Exception:
                # `Exception` rather than a bare `except:` so that Ctrl-C
                # (KeyboardInterrupt) and SystemExit still stop the run.
                traceback.print_exc()
                continue # it's okay as we have a pretty big corpus!

            if i % 10000 == 0:
                print(i)
            i += 1

if __name__ == "__main__":
    # Script entry point: build the corpus file, then report completion.
    build_corpus()
    print("Done")
