typos/benchsuite/fixtures/subtitles_en.sh

77 lines
1.5 KiB
Bash
Raw Normal View History

#!/usr/bin/env bash
# Pre-reqs:
# - gunzip
# - curl
set -e
FIXTURE_DIR="subtitles_en"
SUBTITLES_NAME="OpenSubtitles2016.raw.en"
SUBTITLES_NAME_SAMPLE="$SUBTITLES_NAME.sample"
SUBTITLES_NAME_GZ="${SUBTITLES_NAME}.gz"
SUBTITLES_URL="https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2016/mono/en.txt.gz"
if [[ $# -eq 0 ]]; then
exit 1
fi
command=$1
base_dir="/tmp/benchsuite"
if [[ $# -ge 2 ]]; then
base_dir=$2
fi
root_dir="${base_dir}/$FIXTURE_DIR"
out_file="$root_dir/$SUBTITLES_NAME_SAMPLE"
log_path="${base_dir}/$FIXTURE_DIR.log"
function path() {
if [[ -e $out_file ]]; then
echo $out_file
fi
}
function clear() {
rm -Rf ${root_dir} ${log_path}
}
function download() {
if [[ ! -e $out_file ]]; then
mkdir -p ${root_dir}
echo "Downloading $FIXTURE_DIR" >> ${log_path}
pushd $root_dir >> ${log_path}
curl -L $SUBTITLES_URL -o $SUBTITLES_NAME_GZ
gunzip $SUBTITLES_NAME_GZ
# Get a sample roughly the same size as the Russian corpus so that
# benchmarks finish in a reasonable time.
head -n 32722372 $SUBTITLES_NAME > $SUBTITLES_NAME_SAMPLE
shasum $SUBTITLES_NAME_SAMPLE > $SUBTITLES_NAME_SAMPLE.sha
popd >> ${log_path}
fi
}
function version() {
if [[ -e $out_file ]]; then
echo "`basename $out_file` `cat $out_file.sha | cut -d " " -f 1`"
fi
}
case $command in
path)
echo $(path)
;;
clear)
echo $(clear)
;;
version)
echo $(version)
;;
download)
download
echo $(path)
;;
*)
>&2 echo "Invalid command: $command"
exit 1
;;
esac