# 最简单的是音轨分离,直接将背景音乐的轨道剥离,只剩下人声道后即可根据空白片段进行切割

# 只有一个音轨时,使用音乐检索系统,分割人声和背景音乐声
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import numpy as np
from moviepy.editor import *
from pydub import AudioSegment

# Default path of the working audio file. extract_audio() rebinds this global
# to the WAV file it writes, and split_audio() reads it.
audio_path = r'D:\AddCaption\disappearedBullet.wav'
# Default folder holding the numbered audio chunks. split_audio() rebinds this
# global, and extract_speech() reads it.
folder_name = r'D:\AddCaption\hysxm'
# Default number of chunks in folder_name. split_audio() recomputes this
# global, and extract_speech() iterates over 1..split_len.
split_len = 18


def extract_audio(video_path, root, start_time, end_time):
    """
    Extract the audio of a video into a 16 kHz mono WAV file using ffmpeg.

    The output file is named after the video (extension replaced by ``.wav``)
    and placed in ``root``. The resulting path is also stored in the
    module-level ``audio_path`` so that later steps can pick it up.

    ffmpeg failures are reported on stderr but do not raise; the output path
    is returned either way.

    :param video_path: path of the source video
    :param root: folder that receives the extracted audio; created if missing
    :param start_time: start position, passed to ffmpeg's ``-ss`` option
    :param end_time: end position, passed to ffmpeg's ``-to`` option;
        ``-1`` means "until the end of the input"
    :return: audio_path, the path of the extracted WAV file
    """
    import subprocess
    import sys

    global audio_path
    if root:
        # ffmpeg does not create missing output directories.
        os.makedirs(root, exist_ok=True)
    # splitext keeps dots inside the name ("a.part1.mkv" -> "a.part1").
    stem = os.path.splitext(os.path.basename(video_path))[0]
    audio_path = os.path.join(root, stem + ".wav")

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    command = ["ffmpeg", "-i", str(video_path), "-ar", "16000", "-ac", "1",
               "-ss", str(start_time)]
    if end_time != -1:
        command += ["-to", str(end_time)]
    command += ["-y", audio_path]

    try:
        result = subprocess.run(command)
    except OSError as err:
        # Typically: the ffmpeg executable is not installed / not on PATH.
        print("extract_audio: could not run ffmpeg: {}".format(err), file=sys.stderr)
    else:
        if result.returncode != 0:
            print("extract_audio: ffmpeg exited with code {}".format(result.returncode),
                  file=sys.stderr)
    return audio_path


# 将音频划分为n个文件，返回存储
def split_audio():
    """
    Split the file at the module-level ``audio_path`` into numbered mono chunks.

    A folder named after the audio file (without its extension) is created
    next to it, and the chunks are written there as ``01.wav``, ``02.wav``, ...
    The module-level ``folder_name`` and ``split_len`` are updated to that
    folder and to the number of chunks written.

    :return: folder_name, the folder containing the chunks
    """
    global folder_name
    global split_len

    parent = os.path.dirname(os.path.realpath(audio_path))
    # splitext handles extensions of any length, unlike slicing off 4 chars.
    stem = os.path.splitext(os.path.basename(audio_path))[0]
    folder_name = os.path.join(parent, stem)
    os.makedirs(folder_name, exist_ok=True)

    audio = AudioSegment.from_file(audio_path)
    # pydub measures length and slices in milliseconds: 300000 ms = 5 minutes.
    per_part = 300000
    split_len = int(np.ceil(len(audio) / per_part))
    if audio.channels > 1:
        # Keep only the first channel. Indexing (rather than unpacking into
        # two names) also works for inputs with more than two channels.
        audio = audio.split_to_mono()[0]
    for i in range(1, split_len + 1):
        cur_path = os.path.join(folder_name, '{}.wav'.format(str(i).zfill(2)))
        audio[per_part * (i - 1):per_part * i].export(cur_path, format='wav')
    return folder_name


# 从音频中提取人声
def extract_speech():
    """
    Separate the vocals from every audio chunk using Spleeter's 2-stem model.

    For each chunk ``NN.wav`` (NN = 01 .. ``split_len``) inside the
    module-level ``folder_name``, the vocal stem is written to
    ``<folder_name>/NN/vocal.wav``; the sub-folder is created when missing.
    """
    from spleeter.audio.adapter import AudioAdapter
    from spleeter.separator import Separator
    model = Separator('spleeter:2stems', multiprocess=False)
    adapter = AudioAdapter.default()

    # Zero-padded chunk identifiers: "01", "02", ...
    chunk_ids = (str(number).zfill(2) for number in range(1, split_len + 1))
    for chunk_id in chunk_ids:
        chunk_wav = os.path.join(folder_name, '{}.wav'.format(chunk_id))
        vocal_dir = os.path.join(folder_name, chunk_id)
        if not os.path.exists(vocal_dir):
            os.mkdir(vocal_dir)
        samples, rate = adapter.load(chunk_wav)
        stems = model.separate(samples)
        adapter.save(os.path.join(vocal_dir, 'vocal.wav'), stems['vocals'], rate)


# 提取背景音乐
def extract_bgm(src_path='test.wav', bgm_path='bgm.wav', total_path='total.wav'):
    """
    Derive a background-music track from a stereo file by phase cancellation.

    The right channel is phase-inverted and overlaid on the left channel, so
    whatever is identical in both channels cancels out. According to the
    original author's note the result is the background music, with some
    noise. A plain left+right overlay is exported as well for comparison.

    :param src_path: stereo audio file to read
    :param bgm_path: where the phase-cancelled track is written (WAV)
    :param total_path: where the plain left+right overlay is written (WAV)
    :raises ValueError: if the input does not have exactly two channels
    """
    snd = AudioSegment.from_file(src_path)
    print(len(snd))
    channels = snd.split_to_mono()
    if len(channels) != 2:
        raise ValueError(
            "extract_bgm needs a stereo input, got {} channel(s)".format(len(channels)))
    snd_l, snd_r = channels
    print(len(snd_r), len(snd_l))
    # Left + inverted right: content shared by both channels cancels out.
    bg_music = snd_l.overlay(snd_r.invert_phase())

    print(len(bg_music))
    bg_music.export(bgm_path, format='wav')
    total_music = snd_l.overlay(snd_r)
    total_music.export(total_path, format='wav')


if __name__ == '__main__':
    # Pipeline: extract the audio track of the video, cut it into chunks,
    # then separate the vocals of every chunk.
    video_path = 'D:/heelo/shaolin.mkv'
    extract_audio(video_path, root='./tmp', start_time=62, end_time=7489)
    split_audio()
    extract_speech()
