{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "98625e1b-ddb1-42a1-a3f7-489f2e8b3270",
   "metadata": {},
   "source": [
    "## 使用するライブラリのインストール"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "625b46ee-7214-49ea-a34d-9772471aea57",
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [],
   "source": [
    "# System packages: libsndfile backs the soundfile module, and rubberband-cli provides the\n",
    "# executable that pyrubberband invokes. apt-get is used because plain apt does not\n",
    "# promise a stable command-line interface for scripted use.\n",
    "!apt-get update -y\n",
    "!apt-get install -y libsndfile1-dev rubberband-cli\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install soundfile pyrubberband librosa"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "976be5c6-b4ab-4117-a867-54288ec29d3c",
   "metadata": {},
   "source": [
    "## 4-1. 「Amazon Polly とは？」 で使用しているコード"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78aac58f-c1b2-4bca-880e-49a0dc40216a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import boto3\n",
    "import wave\n",
    "\n",
    "polly_client = boto3.client('polly')\n",
    "\n",
    "# Keyword arguments for synthesize_speech: raw PCM output at 16000 Hz.\n",
    "args = dict(\n",
    "    Engine='standard',\n",
    "    LanguageCode='ja-JP',\n",
    "    OutputFormat='pcm',\n",
    "    SampleRate='16000',\n",
    "    Text='あー',\n",
    "    TextType='text',\n",
    "    VoiceId='Mizuki',\n",
    ")\n",
    "\n",
    "# wav header fields: (nchannels, sampwidth, framerate, nframes, comptype, compname),\n",
    "# i.e. one channel with 2-byte samples at the requested sample rate.\n",
    "wav_params = (1, 2, int(args['SampleRate']), 0, 'NONE', 'NONE')\n",
    "\n",
    "try:\n",
    "    response = polly_client.synthesize_speech(**args)\n",
    "    if 'AudioStream' in response:\n",
    "        # Wrap the raw PCM bytes in a wav container so that soundfile can read them later.\n",
    "        with wave.open('./あー.wav', 'wb') as wav_file:\n",
    "            wav_file.setparams(wav_params)\n",
    "            wav_file.writeframes(response['AudioStream'].read())\n",
    "except Exception as err:\n",
    "    print('synthesize_speech exception: ', err)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c176f6f8-0c71-4e1f-89db-5387b8677b5c",
   "metadata": {},
   "source": [
    "## 5-1. 「音階と周波数」で使用しているコード"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7dc78302-dfa0-488c-95e9-ca48ba146b06",
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [],
   "source": [
    "from matplotlib import pyplot as plt\n",
    "\n",
    "# Note names from 'L-A' up to 'H-A'; index 12 ('A') is the 442 Hz reference pitch.\n",
    "scale_list = ['L-A','L-As','L-H','C','Cs','D','Ds','E','F','Fs','G','Gs','A','As','H','H-C','H-Cs','H-D','H-Ds','H-E','H-F','H-Fs','H-G','H-Gs','H-A']\n",
    "# Each semitone step multiplies the frequency by 2 ** (1/12).\n",
    "scale_freq = [442 * (2 ** ((step - 12) / 12)) for step in range(len(scale_list))]\n",
    "\n",
    "plt.figure(figsize=(12, 8))\n",
    "plt.plot(scale_list, scale_freq, '*')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "97e8d11d-4e85-41c6-8312-8cbfc8cd33d7",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Show the current matplotlib figure.\n",
    "# NOTE(review): presumably meant for the scale/frequency plot built in the previous cell;\n",
    "# confirm it is still needed, since inline backends usually render at the end of the plotting cell.\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "89fc4eba-94b3-4403-8d44-5cd313b2ceb4",
   "metadata": {},
   "source": [
    "## 「5-2-1. 波形チェック」で使用しているコード"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7b22b47-814d-403a-9493-cfab22211f0a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import soundfile as sf\n",
    "import numpy as np\n",
    "from matplotlib import pyplot as plt\n",
    "\n",
    "# Load the synthesized voice and build a time axis in seconds (sample index / sampling rate).\n",
    "amplitude, sampling_rate = sf.read('あー.wav')\n",
    "sample_count = amplitude.shape[0]\n",
    "time = np.arange(sample_count) / sampling_rate\n",
    "\n",
    "plt.plot(time, amplitude)\n",
    "plt.ylabel('amplitude')\n",
    "plt.xlabel('time [sec]')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "432385cf-9790-475c-b849-b50a43b84f67",
   "metadata": {},
   "source": [
    "## 「5-2-2. 無音区間のカット」で使用しているコード"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34d31113-38f0-491b-befb-ef65e8d1542d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Trim leading/trailing silence: keep the span from one sample before the first sample whose\n",
    "# absolute amplitude exceeds min_amplitude up to and including the last such sample.\n",
    "min_amplitude = 0.002\n",
    "loud_indices = np.flatnonzero(np.abs(amplitude) > min_amplitude)\n",
    "if loud_indices.size > 0:\n",
    "    cut_start_index = max(int(loud_indices[0]) - 1, 0)\n",
    "    cut_end_index = int(loud_indices[-1]) + 1\n",
    "else:\n",
    "    # Nothing exceeds the threshold: keep everything rather than fail on an undefined start index.\n",
    "    cut_start_index, cut_end_index = 0, amplitude.shape[0]\n",
    "amplitude = amplitude[cut_start_index:cut_end_index]\n",
    "\n",
    "# Redraw the trimmed waveform against its original time stamps.\n",
    "plt.plot(time[cut_start_index:cut_end_index], amplitude)\n",
    "plt.xlabel('time [sec]')\n",
    "plt.ylabel('amplitude')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0198ef8d-e5f6-4e1b-8baf-b7b3b53b150c",
   "metadata": {},
   "source": [
    "## 「5-2-3. 周波数のチェック」で使用しているコード"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cfef5ba8-5ead-4e3a-88a7-595232d0fbe3",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Run an FFT and plot the spectral intensity per frequency.\n",
    "sample_count = amplitude.shape[0]\n",
    "freq_list = np.fft.fftfreq(sample_count, d=1.0 / sampling_rate)\n",
    "fft_data = np.fft.fft(amplitude)\n",
    "amp = np.abs(fft_data)\n",
    "\n",
    "# Keep only the first half of the bins (the non-negative frequencies).\n",
    "first_half = slice(0, sample_count // 2)\n",
    "amp_p = amp[first_half]\n",
    "freq_list_p = freq_list[first_half]\n",
    "\n",
    "plt.plot(freq_list_p, amp_p)\n",
    "plt.xlim(0, freq_list_p[-1])\n",
    "plt.ylabel('spectral intensity')\n",
    "plt.xlabel('frequency[Hz]')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "65659bd6-1e28-40ff-8b8b-ce87f607bd2a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Zoom in on 0 to 650 [Hz]. The x-limits are set once; an earlier full-range\n",
    "# xlim call would be overridden by this one anyway.\n",
    "plt.plot(freq_list_p, amp_p)\n",
    "plt.xlim(0, 650)\n",
    "plt.xlabel('frequency[Hz]')\n",
    "plt.ylabel('spectral intensity')\n",
    "plt.show()\n",
    "# Frequency of the bin with the largest spectral intensity.\n",
    "print('maximum frequency: '+str(freq_list_p[amp_p.argmax()]))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2ed4c50f-490d-4c33-bc46-718bfb16c779",
   "metadata": {},
   "source": [
    "## 「5-2-4. 周波数の時間推移」で使用しているコード"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52fb6d7a-1483-45c2-87c1-355deb2e76be",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from scipy import signal\n",
    "\n",
    "# Short-time Fourier transform with 1000 samples per segment.\n",
    "f, t, Zxx = signal.stft(amplitude, fs=sampling_rate, nperseg=1000)\n",
    "magnitude = np.abs(Zxx)\n",
    "\n",
    "# Magnitude over time and frequency, limited to 0-2000 Hz.\n",
    "plt.pcolormesh(t, f, magnitude, vmin=0, shading='gouraud')\n",
    "plt.title('STFT')\n",
    "plt.xlabel('Time [sec]')\n",
    "plt.ylabel('Frequency [Hz]')\n",
    "plt.ylim(0,2000)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1de1643c-3203-4c91-af25-afdc9ed91cbd",
   "metadata": {},
   "source": [
    "## 「5-2-5. 音声の高低（ピッチ）を変える」で使用しているコード"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d79eee08-85c3-4bca-b85e-9b3697b72691",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Split the sound into sep_num segments and take the peak frequency of each one.\n",
    "sep_num = 4\n",
    "width = amplitude.shape[0]//sep_num\n",
    "segment_freq_list = []\n",
    "for i in range(sep_num):\n",
    "    # The last segment runs to the end of the signal; a stop of -1 would drop the final sample.\n",
    "    stop = None if i == sep_num-1 else (i+1)*width\n",
    "    sampling_amp = amplitude[i*width:stop]\n",
    "    fft_data = np.fft.fft(sampling_amp)\n",
    "    freq_list = np.fft.fftfreq(sampling_amp.shape[0], d=1.0/sampling_rate)\n",
    "    amp = np.abs(fft_data)\n",
    "    # Only the first half of the bins (non-negative frequencies) is searched for the peak.\n",
    "    amp_p = amp[0: amp.shape[0]//2]\n",
    "    freq_list_p = freq_list[0: freq_list.shape[0]//2]\n",
    "    segment_freq_list.append(freq_list_p[amp_p.argmax()])\n",
    "print(*segment_freq_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4df70ceb-5551-4ff8-a343-d066af8b699b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Pitch-shift each segment to the target frequency, join the results and write a wav file.\n",
    "import pyrubberband as pyrb\n",
    "target_freq = 442\n",
    "shift_y_list = []\n",
    "for i, origin_freq in enumerate(segment_freq_list):\n",
    "    # Distance in semitones between the segment's peak frequency and the target.\n",
    "    n_steps = np.log2(target_freq/origin_freq) * 12\n",
    "    # The last segment runs to the end of the signal; a stop of -1 would drop the final sample.\n",
    "    stop = None if i == sep_num-1 else (i+1)*width\n",
    "    shift_y_list.append(pyrb.pitch_shift(amplitude[i*width:stop], sr = sampling_rate, n_steps=n_steps))\n",
    "# Join the shifted segments in order; the buffer stays float64 as before.\n",
    "shift_y = np.concatenate(shift_y_list).astype(np.float64, copy=False)\n",
    "sf.write('./shift.wav', shift_y, sampling_rate, subtype=\"PCM_16\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8271e01f-f69f-40ed-aece-43fde95e9dea",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Waveform of the pitch-shifted file.\n",
    "amplitude, sampling_rate = sf.read('shift.wav')\n",
    "n_samples = len(amplitude)\n",
    "time = np.arange(n_samples) / sampling_rate\n",
    "\n",
    "plt.plot(time, amplitude)\n",
    "plt.ylabel('amplitude')\n",
    "plt.xlabel('time [sec]')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30fc17a8-e1cb-418d-a990-b70e81f9f4da",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# STFT of the pitch-shifted audio, drawn with the same settings as the earlier STFT cell.\n",
    "from scipy import signal\n",
    "\n",
    "f, t, Zxx = signal.stft(amplitude, fs=sampling_rate, nperseg=1000)\n",
    "stft_magnitude = np.abs(Zxx)\n",
    "plt.pcolormesh(t, f, stft_magnitude, vmin=0, shading='gouraud')\n",
    "plt.title('STFT')\n",
    "plt.ylim(0,2000)\n",
    "plt.xlabel('Time [sec]')\n",
    "plt.ylabel('Frequency [Hz]')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d71bc1d2-be77-4b79-af4f-1407930c1611",
   "metadata": {},
   "source": [
    "## 「5-3. 今までのコードを class にする」で使用しているコード"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "152a1491-83db-4d2e-b463-fa873ebc2d79",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import librosa\n",
    "import soundfile as sf\n",
    "import pyrubberband as pyrb\n",
    "import boto3\n",
    "import os\n",
    "import wave\n",
    "from scipy import signal\n",
    "from io import BytesIO\n",
    "\n",
    "# Shared Amazon Polly client used by SingingVoiceGenerator.generate_voice_data().\n",
    "polly_client = boto3.client('polly')\n",
    "\n",
    "class SingingVoiceGenerator():\n",
    "    \"\"\"Synthesize one sung note with Amazon Polly.\n",
    "\n",
    "    Construction runs the whole pipeline: Polly speech synthesis, silence\n",
    "    trimming, per-segment pitch shift to the frequency of `scale`, and a\n",
    "    time stretch to `length_sec` seconds. The final samples are stored in\n",
    "    `time_stretch_array`.\n",
    "\n",
    "    Args:\n",
    "        scale: Note name; must be one of the entries of `_scale_list`.\n",
    "        length_sec: Target duration of the note in seconds.\n",
    "        text: Text passed to Polly.\n",
    "        engine, language, output_format, text_type, voice_id: Passed to\n",
    "            synthesize_speech as Engine, LanguageCode, OutputFormat,\n",
    "            TextType and VoiceId.\n",
    "        sample_rate: Sampling rate in Hz used for synthesis, analysis and output.\n",
    "        sep_num: Number of segments that are pitch-shifted independently.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If `scale` is not a known note name.\n",
    "        RuntimeError: If the Polly response carries no AudioStream.\n",
    "    \"\"\"\n",
    "    # Samples whose absolute amplitude does not exceed this value are treated as silence.\n",
    "    SILENCE_THRESHOLD = 0.002\n",
    "\n",
    "    def __init__(\n",
    "        self, \n",
    "        scale = 'A',\n",
    "        length_sec = 1,\n",
    "        text = 'あ', \n",
    "        engine = 'standard', \n",
    "        language = 'ja-JP',\n",
    "        output_format = 'pcm',\n",
    "        sample_rate = 16000,\n",
    "        text_type = 'text',\n",
    "        voice_id = 'Mizuki',\n",
    "        sep_num = 5\n",
    "    ):\n",
    "        self.scale = scale\n",
    "        self.length_sec = length_sec\n",
    "        self.text = text\n",
    "        self.engine = engine\n",
    "        self.language = language\n",
    "        self.output_format = output_format\n",
    "        self.sample_rate = sample_rate\n",
    "        self.text_type = text_type\n",
    "        self.voice_id = voice_id\n",
    "        self.sep_num = sep_num\n",
    "        self._scale_list = ['L-A','L-As','L-H','C','Cs','D','Ds','E','F','Fs','G','Gs','A','As','H','H-C','H-Cs','H-D','H-Ds','H-E','H-F','H-Fs','H-G','H-Gs','H-A']\n",
    "        # Note name -> frequency; index 12 ('A') maps to 442 Hz and each step is one semitone.\n",
    "        self._scale_dict = {\n",
    "            key: 442 * (2 ** ((-12+i)/12)) for i, key in enumerate(self._scale_list)\n",
    "        }\n",
    "        self.scale_freq = self._scale_dict[self.scale]\n",
    "        # Each stage reads the attribute set by the previous one, so the order matters.\n",
    "        self.raw_pcm_bin = self.generate_voice_data()\n",
    "        self.amplitude = self.cut_scilence()\n",
    "        self.shift_pcm_array = self.pitch_shift()\n",
    "        self.time_stretch_array = self.time_stretch()\n",
    "\n",
    "    def generate_voice_data(self):\n",
    "        \"\"\"Call Polly and return the audio as an in-memory wav file (BytesIO rewound to 0).\n",
    "\n",
    "        Raises:\n",
    "            Exception: Whatever synthesize_speech raised (printed first, then re-raised).\n",
    "            RuntimeError: If the response carries no AudioStream.\n",
    "        \"\"\"\n",
    "        raw_pcm_bin = BytesIO()\n",
    "        raw_pcm_bin.name = 'raw.wav'\n",
    "        args = {\n",
    "            'Engine':self.engine,\n",
    "            'LanguageCode':self.language,\n",
    "            'OutputFormat':self.output_format,\n",
    "            'SampleRate':str(self.sample_rate),\n",
    "            'Text':self.text,\n",
    "            'TextType':self.text_type,\n",
    "            'VoiceId':self.voice_id\n",
    "        }\n",
    "        try:\n",
    "            response = polly_client.synthesize_speech(**args)\n",
    "        except Exception as e:\n",
    "            # Report and re-raise: continuing would only fail later with a less helpful\n",
    "            # error when the empty buffer is parsed as a wav file.\n",
    "            print('synthesize_speech exception: ', e)\n",
    "            raise\n",
    "        if 'AudioStream' not in response:\n",
    "            raise RuntimeError('synthesize_speech returned no AudioStream')\n",
    "        # Wrap the audio bytes in a wav container (1 channel, 2-byte samples) for soundfile.\n",
    "        with wave.open(raw_pcm_bin, 'wb') as wav_file:\n",
    "            wav_file.setparams((1, 2, self.sample_rate, 0, 'NONE', 'NONE'))\n",
    "            wav_file.writeframes(response['AudioStream'].read())\n",
    "        raw_pcm_bin.seek(0)\n",
    "        return raw_pcm_bin\n",
    "\n",
    "    def cut_scilence(self):\n",
    "        \"\"\"Return the synthesized samples with leading and trailing silence removed.\n",
    "\n",
    "        The kept span starts one sample before the first sample above\n",
    "        SILENCE_THRESHOLD and ends with the last sample above it. If no sample\n",
    "        exceeds the threshold the signal is returned unchanged.\n",
    "        \"\"\"\n",
    "        amplitude, _ = sf.read(self.raw_pcm_bin)\n",
    "        # sf.read yields floating-point samples, so one small threshold is used for both\n",
    "        # ends; the former trailing threshold of 30 could never match normalized samples.\n",
    "        loud_indices = np.flatnonzero(np.abs(amplitude) > self.SILENCE_THRESHOLD)\n",
    "        if loud_indices.size == 0:\n",
    "            return amplitude\n",
    "        cut_start_index = max(int(loud_indices[0]) - 1, 0)\n",
    "        cut_end_index = int(loud_indices[-1]) + 1\n",
    "        return amplitude[cut_start_index:cut_end_index]\n",
    "\n",
    "    def _peak_frequency(self, samples):\n",
    "        \"\"\"Return the frequency of the strongest bin in the first half of the FFT of `samples`.\"\"\"\n",
    "        half = samples.shape[0]//2\n",
    "        amp_p = np.abs(np.fft.fft(samples))[0:half]\n",
    "        freq_list_p = np.fft.fftfreq(samples.shape[0], d=1.0/self.sample_rate)[0:half]\n",
    "        return freq_list_p[amp_p.argmax()]\n",
    "\n",
    "    def pitch_shift(self):\n",
    "        \"\"\"Shift each of the `sep_num` segments so its peak frequency lands on `scale_freq`.\n",
    "\n",
    "        Returns:\n",
    "            The shifted segments joined in order as one float64 array.\n",
    "        \"\"\"\n",
    "        width = self.amplitude.shape[0]//self.sep_num\n",
    "        shift_amplitude_list = []\n",
    "        for i in range(self.sep_num):\n",
    "            # The last segment runs to the end of the signal; a stop of -1 would drop\n",
    "            # the final sample.\n",
    "            stop = None if i == self.sep_num-1 else (i+1)*width\n",
    "            segment = self.amplitude[i*width:stop]\n",
    "            origin_freq = self._peak_frequency(segment)\n",
    "            # Distance in semitones between the segment's peak and the target note.\n",
    "            n_steps = np.log2(self.scale_freq/origin_freq) * 12\n",
    "            shift_amplitude_list.append(pyrb.pitch_shift(segment, sr = self.sample_rate, n_steps=n_steps))\n",
    "        return np.concatenate(shift_amplitude_list).astype(np.float64, copy=False)\n",
    "\n",
    "    def time_stretch(self):\n",
    "        \"\"\"Stretch or compress the pitch-shifted audio so that it lasts `length_sec` seconds.\"\"\"\n",
    "        origin_time = self.shift_pcm_array.shape[0] / self.sample_rate\n",
    "        # Ratio of the current duration to the requested duration.\n",
    "        ratio = origin_time / self.length_sec\n",
    "        return pyrb.time_stretch(self.shift_pcm_array, self.sample_rate, ratio)\n",
    "\n",
    "    def output_wave(self, name):\n",
    "        \"\"\"Write the finished note to `name` as 16-bit PCM at `sample_rate`.\"\"\"\n",
    "        sf.write(name, self.time_stretch_array, self.sample_rate, subtype=\"PCM_16\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f3fa4cea-444c-4d28-893e-21c9b8e973fd",
   "metadata": {},
   "source": [
    "## 「5-4. かえるの合唱を歌わせてみる」で使用しているコード"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe513a80-b3e6-489a-8217-82b9eb7d856b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# One set of SingingVoiceGenerator arguments per note of the frog song.\n",
    "score = [\n",
    "    dict(text='かー', length_sec=0.5, scale='C'),\n",
    "    dict(text='えー', length_sec=0.5, scale='D'),\n",
    "    dict(text='るー', length_sec=0.5, scale='E'),\n",
    "    dict(text='のー', length_sec=0.5, scale='F'),\n",
    "    dict(text='うー', length_sec=0.5, scale='E'),\n",
    "    dict(text='たー', length_sec=0.5, scale='D'),\n",
    "    dict(text='がー', length_sec=0.5, scale='C'),\n",
    "]\n",
    "\n",
    "# Synthesize every note, then join the per-note sample arrays end to end.\n",
    "frog_np_arrays = [SingingVoiceGenerator(**note).time_stretch_array for note in score]\n",
    "concat_np_array = np.concatenate(frog_np_arrays).astype(np.float64, copy=False)\n",
    "sf.write('frog.wav', concat_np_array, 16000, subtype='PCM_16')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf22b970-d59b-4163-bec0-6a8e308a2586",
   "metadata": {},
   "source": [
    "## 「5-5. 歌ってみた」で使用しているコード"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c21edac-7de0-4893-ad60-fee863a67719",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# One dict of SingingVoiceGenerator arguments per sung syllable; the trailing comment\n",
    "# on each entry is the lyric syllable it stands for. Bare '#' lines separate phrases.\n",
    "score = [\n",
    "    {'text': 'どぅぅ', 'length_sec': 4, 'scale': 'H-C', 'sep_num': 3},  # Du\n",
    "    {'text': 'まぃ', 'length_sec': 3, 'scale': 'H-C', 'sep_num': 2},  # mei\n",
    "    {'text': 'ね', 'length_sec': 1, 'scale': 'H-C', 'sep_num': 1},  # ne\n",
    "    {'text': 'じぃ', 'length_sec': 6, 'scale': 'H-Ds', 'sep_num': 2},  # See\n",
    "    {'text': 'れぇ', 'length_sec': 2, 'scale': 'Gs', 'sep_num': 2},  # le,\n",
    "    {'text': 'どぅぅ', 'length_sec': 4, 'scale': 'Gs', 'sep_num': 3},  # du\n",
    "    {'text': 'まぃん', 'length_sec': 4, 'scale': 'As', 'sep_num': 2},  # mein\n",
    "    #\n",
    "    {'text': 'へぇるつ', 'length_sec': 16, 'scale': 'H-C', 'sep_num': 3},  # Herz,\n",
    "    {'text': 'どぅぅ', 'length_sec': 4, 'scale': 'Gs', 'sep_num': 2},  # du\n",
    "    {'text': 'まぃ', 'length_sec': 2, 'scale': 'Gs', 'sep_num': 2},  # mei\n",
    "    {'text': 'ねぇ', 'length_sec': 2, 'scale': 'Gs', 'sep_num': 2},  # ne\n",
    "    #\n",
    "    {'text': 'ゔぉん', 'length_sec': 12, 'scale': 'H-F', 'sep_num': 2},  # Won\n",
    "    {'text': 'ねぇ', 'length_sec': 4, 'scale': 'H-Ds', 'sep_num': 2},  # n', o\n",
    "    {'text': 'どぅぅ', 'length_sec': 4, 'scale': 'H-Cs', 'sep_num': 3},  # du\n",
    "    {'text': 'まぃん', 'length_sec': 4, 'scale': 'H-C', 'sep_num': 2},  # mein\n",
    "    #\n",
    "    {'text': 'しゅめぇるつぅ', 'length_sec': 16, 'scale': 'As', 'sep_num': 5},  # Schmerz,\n",
    "    {'text': 'どぅぅ', 'length_sec': 4, 'scale': 'As', 'sep_num': 3},  # du\n",
    "    {'text': 'まぃ', 'length_sec': 2, 'scale': 'G', 'sep_num': 2},  # mei\n",
    "    {'text': 'ねぇ', 'length_sec': 2, 'scale': 'Ds', 'sep_num': 2},  # ne\n",
    "    #\n",
    "    {'text': 'ゔぇるとぅ', 'length_sec': 12, 'scale': 'Gs', 'sep_num': 3},  # Welt,\n",
    "    {'text': 'いん', 'length_sec': 4, 'scale': 'Gs', 'sep_num': 1},  # in\n",
    "    {'text': 'でぇ', 'length_sec': 6, 'scale': 'H-Cs', 'sep_num': 2},  # de\n",
    "    {'text': 'りっひ', 'length_sec': 2, 'scale': 'H-Cs', 'sep_num': 2},  # rich\n",
    "    #\n",
    "    {'text': 'りぃ', 'length_sec': 8, 'scale': 'H-Cs', 'sep_num': 2},  # le\n",
    "    {'text': 'ぶぅ', 'length_sec': 4, 'scale': 'H-C', 'sep_num': 2},  # be,\n",
    "    {'text': 'まいん', 'length_sec': 4, 'scale': 'H-C', 'sep_num': 2},  # mein\n",
    "    {'text': 'ひぃ', 'length_sec': 6, 'scale': 'H-Ds', 'sep_num': 2},  # Him\n",
    "    {'text': 'めぅ', 'length_sec': 2, 'scale': 'Gs', 'sep_num': 2},  # mel\n",
    "    #\n",
    "    {'text': 'どぅ', 'length_sec': 12, 'scale': 'H-Cs', 'sep_num': 2},  # du,\n",
    "    {'text': 'だぁ', 'length_sec': 4, 'scale': 'H-Cs', 'sep_num': 2},  # dar\n",
    "    {'text': 'りん', 'length_sec': 6, 'scale': 'H-Fs', 'sep_num': 2},  # ein\n",
    "    {'text': 'いっひ', 'length_sec': 2, 'scale': 'H-Fs', 'sep_num': 2},  # ich\n",
    "    #\n",
    "    {'text': 'しゅゔぃー', 'length_sec': 8, 'scale': 'H-Fs', 'sep_num': 3},  # schwe\n",
    "    {'text': 'ぶ', 'length_sec': 4, 'scale': 'H-F', 'sep_num': 1},  # be\n",
    "    {'text': 'お', 'length_sec': 4, 'scale': 'H-Ds', 'sep_num': 1},  # o\n",
    "    {'text': 'どぅ', 'length_sec': 4, 'scale': 'H-Cs', 'sep_num': 1},  # du\n",
    "    {'text': 'まいん', 'length_sec': 4, 'scale': 'H-C', 'sep_num': 2},  # mein\n",
    "    #\n",
    "    {'text': 'ぐらぶ', 'length_sec': 12, 'scale': 'As', 'sep_num': 3},  # Grab\n",
    "    {'text': 'いん', 'length_sec': 4, 'scale': 'As', 'sep_num': 2},  # in\n",
    "    {'text': 'だす', 'length_sec': 4, 'scale': 'H-C', 'sep_num': 2},  # das\n",
    "    {'text': 'ひぃ', 'length_sec': 4, 'scale': 'H-Cs', 'sep_num': 2},  # hi\n",
    "    #\n",
    "    {'text': 'なぶ', 'length_sec': 4, 'scale': 'H-Ds', 'sep_num': 2},  # nab\n",
    "    {'text': 'いっひ', 'length_sec': 4, 'scale': 'H-Ds', 'sep_num': 2},  # ich\n",
    "    {'text': 'え', 'length_sec': 12, 'scale': 'Gs', 'sep_num': 1},  # e\n",
    "    {'text': 'ゔぃぐ', 'length_sec': 4, 'scale': 'H-Cs', 'sep_num': 2},  # wig\n",
    "    #\n",
    "    {'text': 'まいん', 'length_sec': 4, 'scale': 'H-C', 'sep_num': 2},  # mein\n",
    "    {'text': 'えん', 'length_sec': 4, 'scale': 'H-C', 'sep_num': 2},  # en\n",
    "    {'text': 'くんむ', 'length_sec': 12, 'scale': 'As', 'sep_num': 2},  # kum\n",
    "    {'text': 'まぁ', 'length_sec': 4, 'scale': 'H-C', 'sep_num': 2},  # mer\n",
    "    #\n",
    "    {'text': 'がぶ', 'length_sec': 8, 'scale': 'Gs', 'sep_num': 2},  # gab\n",
    "]\n",
    "# Divide every length written above by 8 to get the value passed as length_sec.\n",
    "for note in score:\n",
    "    note['length_sec'] /= 8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f9b66df-4bfd-414a-aaf4-618f27ff0f8f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Synthesize every entry of the score, join the per-note samples in order and save the song.\n",
    "frog_np_arrays = [SingingVoiceGenerator(**entry).time_stretch_array for entry in score]\n",
    "concat_np_array = np.concatenate(frog_np_arrays).astype(np.float64, copy=False)\n",
    "sf.write('liebeslied.wav', concat_np_array, 16000, subtype=\"PCM_16\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1558d351-cbc6-4974-b88d-6c2a7d8ba425",
   "metadata": {},
   "source": [
    "## 「7. 背景画像を作る」で使用しているコード"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f2606ad-ca6d-4df8-874b-ace72cd099cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Library installation (when running on Studio Lab)\n",
    "!pip install --upgrade git+https://github.com/huggingface/diffusers.git transformers accelerate scipy torch\n",
    "# Library installation (when running on SageMaker Studio)\n",
    "# !pip install --upgrade git+https://github.com/huggingface/diffusers.git transformers accelerate scipy ipywidgets ftfy\n",
    "\n",
    "# Load the model.\n",
    "# This follows https://huggingface.co/stabilityai/stable-diffusion-2 almost as written.\n",
    "from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler\n",
    "import torch\n",
    "\n",
    "model_id = \"stabilityai/stable-diffusion-2\"\n",
    "prompt = \"Atmospheric bright scenery with a robot playing a love song on the piano with anime style.\"\n",
    "\n",
    "scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder=\"scheduler\")\n",
    "# NOTE(review): newer diffusers releases may prefer variant=\"fp16\" over revision=\"fp16\" - confirm.\n",
    "pipe = StableDiffusionPipeline.from_pretrained(\n",
    "    model_id,\n",
    "    scheduler=scheduler,\n",
    "    revision=\"fp16\",\n",
    "    torch_dtype=torch.float16,\n",
    ")\n",
    "pipe = pipe.to(\"cuda\")\n",
    "\n",
    "# Generate six candidate images and save them as 0.png ... 5.png.\n",
    "for i in range(6):\n",
    "    image = pipe(prompt, height=648, width=1152).images[0]\n",
    "    image.save(f\"{i}.png\")"
   ]
  }
 ],
 "metadata": {
  "instance_type": "ml.t3.medium",
  "kernelspec": {
   "display_name": "Python 3 (Data Science)",
   "language": "python",
   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}