#!/bin/bash

# koried, 10/29/2012

# Reduce a data set based on a list of turn-ids

# Usage text, printed both for --help and when the argument count is wrong.
help_message="usage: $0 srcdir turnlist destdir"

# "${1:-}" is quoted and defaulted so that invoking the script with no
# arguments does not make `[` fail with "unary operator expected".
if [ "${1:-}" == "--help" ]; then
    echo "${help_message}"
    exit 0
fi

# Exactly three positional arguments are required.
if [ $# -ne 3 ]; then
    echo "${help_message}"
    exit 1
fi

srcdir=$1    # source data directory (must contain utt2spk)
reclist=$2   # list file used to filter utt2spk
destdir=$3   # output data directory

# The source utt2spk is the input of the very first filtering step, so
# refuse to continue without it.
if [ ! -f "${srcdir}/utt2spk" ]; then
    echo "$0: no such file $srcdir/utt2spk"
    exit 1
fi

# Filter the remaining data files of ${srcdir} into ${destdir}.
#
# Globals read: srcdir, destdir.
# Preconditions: ${destdir}/utt2spk and ${destdir}/spk2utt already exist;
#   they are passed to utils/filter_scp.pl as the key files.
# Behavior: each source file is processed only if it exists; missing files
#   are skipped silently. All paths are quoted so directories containing
#   whitespace work.
# Outputs: writes the filtered files into ${destdir} and prints one summary
#   line with the source and destination utt2spk line counts.
do_filtering() {
    local table src_utts dest_utts

    # Files filtered against the retained utt2spk.
    for table in feats.scp wav.scp text utt2num_frames; do
        if [ -f "${srcdir}/${table}" ]; then
            utils/filter_scp.pl "${destdir}/utt2spk" \
                <"${srcdir}/${table}" >"${destdir}/${table}"
        fi
    done

    # Files filtered against the retained spk2utt.
    for table in spk2gender cmvn.scp; do
        if [ -f "${srcdir}/${table}" ]; then
            utils/filter_scp.pl "${destdir}/spk2utt" \
                <"${srcdir}/${table}" >"${destdir}/${table}"
        fi
    done

    if [ -f "${srcdir}/segments" ]; then
        utils/filter_scp.pl "${destdir}/utt2spk" \
            <"${srcdir}/segments" >"${destdir}/segments"

        # Unique values of the second column of the filtered segments file
        # (the recordings); kept in a temporary key file.
        awk '{print $2;}' "${destdir}/segments" | sort -u >"${destdir}/reco"

        # When a segments file exists, wav.scp is filtered again against the
        # recording list; this intentionally overwrites the wav.scp written
        # by the utt2spk-based pass above. reco2file_and_channel and the STM
        # file (needed for sclite scoring; filtering also removes its
        # comment lines) are filtered against the same list.
        for table in wav.scp reco2file_and_channel stm; do
            if [ -f "${srcdir}/${table}" ]; then
                utils/filter_scp.pl "${destdir}/reco" \
                    <"${srcdir}/${table}" >"${destdir}/${table}"
            fi
        done

        rm -- "${destdir}/reco"
    fi

    src_utts=$(wc -l <"${srcdir}/utt2spk")
    dest_utts=$(wc -l <"${destdir}/utt2spk")
    echo "Reduced #utt from $src_utts to $dest_utts"
}

# Create the output directory; nothing below can work without it.
mkdir -p "${destdir}" || exit 1

# Filter the source utt2spk using ${reclist} as the key file. Every later
# step is keyed on the result, so stop if this fails rather than continuing
# with an empty or partial utt2spk.
utils/filter_scp.pl "${reclist}" <"${srcdir}/utt2spk" >"${destdir}/utt2spk" || exit 1

# Derive spk2utt from the filtered utt2spk, then filter the remaining files.
utils/utt2spk_to_spk2utt.pl <"${destdir}/utt2spk" >"${destdir}/spk2utt" || exit 1
do_filtering
