#!/bin/bash

# Log the exact invocation on stderr.
echo "$0 $*" >&2
# Source path.sh from the current working directory
# (presumably sets up KALDI_ROOT and PATH -- confirm against path.sh).
. ./path.sh

# Usage text shown when the number of arguments is wrong.
help_message="Usage: $0 <wav_dir> <srcdir> <wsj_full_wav> <dir> <num_spkrs>
e.g. $0 data/wsj0_mix/2speakers/wav16k/max data/wsj0_mix/scripts data/wsj0/wsj0_wav data 2"

# Exactly five positional arguments are required.
if [ $# -ne 5 ]; then
    echo "${help_message}" >&2
    exit 1
fi

# Strict mode: abort on command errors (-e), on unset variables (-u) and on a
# failure in any pipeline stage (pipefail); -x traces each command to stderr.
set -exuo pipefail

# Positional arguments.
wavdir=$1        # root of the mixture wavs; must contain tr/, cv/ and tt/
srcdir=$2        # directory holding the mix_<N>_spk_max_{tr,cv,tt}_mix lists
wsj_full_wav=$3  # WSJ directory providing si_*.scp and the links/ directory
dir=$4           # output data directory (tr/, cv/, tt/ are created under it)
num_spkrs=$5     # number of speakers per mixture

# mktemp cannot create the scratch directory unless its parent exists, and
# the output directory is otherwise only created further down the script.
mkdir -p "${dir}"
tmpdir=$(mktemp -d "${dir}/tmp-XXXXX")
# Remove the scratch directory on every exit path so repeated runs do not
# pile up tmp-XXXXX directories inside the output directory.
trap 'rm -rf -- "${tmpdir}"' EXIT

# Helper scripts taken from the Kaldi WSJ recipe; KALDI_ROOT must be set
# (with -u the script aborts here if it is not).
find_transcripts=$KALDI_ROOT/egs/wsj/s5/local/find_transcripts.pl
normalize_transcript=$KALDI_ROOT/egs/wsj/s5/local/normalize_transcript.pl

# Check that every per-subset wav directory (tr, cv, tt) exists.
# The loop variable itself is tested and reported, so a missing subset
# directory is detected and named in the error message.
for f in "$wavdir/tr" "$wavdir/cv" "$wavdir/tt"; do
  if [ ! -d "$f" ]; then
    echo "Error: $f is not a directory."
    exit 1;
  fi
done

# Make sure the mixture list of every subset (tr, cv, tt) is present.
mix_prefix=$srcdir/mix_${num_spkrs}_spk_max
for f in ${mix_prefix}_tr_mix ${mix_prefix}_cv_mix ${mix_prefix}_tt_mix; do
  if [ ! -f $f ]; then
    echo "Could not find $f."
    exit 1
  fi
done

# Build wav.scp, utt2spk and spk2utt for each subset.
for x in tr cv tt; do
  mkdir -p ${dir}/$x

  # wav.scp: "<key> <wavdir>/<subset>/mix/<mix_id>.wav", sorted by line.
  # Stage 1 turns each mix id (first column of the list) into "id path".
  # Stage 2 prepends to every line the first three characters of the
  # '_'-separated fields 1, 3, 5, ... of the mix id (presumably the speaker
  # ids of the source utterances), one per speaker, joined with '_'.
  awk -v dir=$wavdir/$x '{printf("%s %s/mix/%s.wav\n", $1, dir, $1)}' \
    < $srcdir/mix_${num_spkrs}_spk_max_${x}_mix \
    | awk -v nspkrs=${num_spkrs} '
      {
        split($1, parts, "_");
        prefix = substr(parts[1], 1, 3);
        i = 1;
        while (i < nspkrs) {
          prefix = prefix "_" substr(parts[i*2+1], 1, 3);
          i++;
        }
        print(prefix "_" $0)
      }' \
    | sort > ${dir}/$x/wav.scp

  # utt2spk: the speaker label is rebuilt from the first nspkrs fields of
  # the wav.scp key, i.e. the prefix that was prepended above.
  awk -v nspkrs=${num_spkrs} '{
    split($1, parts, "_");
    spk_id = substr(parts[1], 1, 3);
    i = 2;
    while (i <= nspkrs) {
      spk_id = spk_id "_" substr(parts[i], 1, 3);
      i++;
    }
    print($1, spk_id)
  }' < ${dir}/$x/wav.scp | sort > ${dir}/$x/utt2spk

  # spk2utt is derived from utt2spk by the helper script found on PATH.
  utt2spk_to_spk2utt.pl ${dir}/$x/utt2spk > ${dir}/$x/spk2utt
done

# transcriptions
# Copy the WSJ utterance lists into the scratch directory.
for i in si_tr_s si_et_05 si_dt_05; do
    cp "${wsj_full_wav}/${i}.scp" "${tmpdir}"
done

# Finding the transcript files:
# Fail early with a clear message; otherwise a missing links/ directory
# would silently yield an empty dot_files.flist.
if [ ! -d "${wsj_full_wav}/links" ]; then
  echo "Error: ${wsj_full_wav}/links is not a directory." >&2
  exit 1
fi
# Glob the entries of links/ instead of parsing `ls` output, and list every
# *.dot file below each entry (-L follows the symlinks).
for x in "${wsj_full_wav}"/links/*; do
  [ -e "$x" ] || continue   # unmatched glob (empty links/) stays literal
  find -L "$x" -iname '*.dot'
done > "${tmpdir}/dot_files.flist"

# Convert the transcripts into our format (no normalization yet)
# Token handed to the normalization script as its argument.
noiseword="<NOISE>"
for f in si_tr_s si_et_05 si_dt_05; do
  # Feed the first column of the list to find_transcripts, which is also
  # given the list of .dot files collected earlier.
  awk '{print $1}' < ${tmpdir}/${f}.scp \
    | ${find_transcripts} ${tmpdir}/dot_files.flist > ${tmpdir}/${f}.trans1

  # Do some basic normalization steps.  At this point we don't remove OOVs--
  # that will be done inside the training scripts, as we'd like to make the
  # data-preparation stage independent of the specific lexicon used.
  ${normalize_transcript} ${noiseword} < ${tmpdir}/${f}.trans1 \
    | sort > ${tmpdir}/${f}.txt || exit 1;
done

# Write one transcript file per speaker slot (text_spk1, text_spk2, ...) for
# every subset.
#
# txt_files[i] is a space-separated list of the transcript sets used for
# set_values[i]: tr and cv read si_tr_s, tt reads both si_dt_05 and si_et_05.
# A quoted '{a,b}' cannot be used for this: brace expansion runs before
# parameter expansion, so it would reach awk as one literal file name.
txt_files=('si_tr_s' 'si_tr_s' 'si_dt_05 si_et_05')
set_values=('tr' 'cv' 'tt')

for i in "${!set_values[@]}"; do
  # Expand the set names into transcript paths (word splitting intended).
  txt_paths=()
  for t in ${txt_files[$i]}; do
    txt_paths+=("${tmpdir}/${t}.txt")
  done

  for ((ns = 1; ns <= num_spkrs; ns++)); do
    # First awk: load every transcript line keyed by utterance id; the
    # command-line assignment in_scp=1 takes effect just before wav.scp is
    # read, which works for any number of transcript files and in any POSIX
    # awk (ARGIND is gawk-only). For each wav.scp key the fields after
    # splitting on "_" are: n_spkrs speaker prefixes, then (utt, snr) pairs,
    # so field n_spkrs + 2*spkr_idx - 1 is the utterance of speaker spkr_idx.
    # Second awk: blank out field 2, i.e. the original utterance id that
    # leads the looked-up transcript line.
    awk -v spkr_idx="${ns}" -v n_spkrs="${num_spkrs}" '
      !in_scp {txt[$1]=$0; next}
      {
        split($1, lst, "_");
        utt=lst[n_spkrs+spkr_idx*2-1];
        text=txt[utt];
        print($1, text);
      }' "${txt_paths[@]}" in_scp=1 "${dir}/${set_values[$i]}/wav.scp" | \
      awk '{$2=""; print $0}' > "${dir}/${set_values[$i]}/text_spk${ns}"
  done
done