#!/usr/bin/bash
# -*- coding: utf-8 -*-
# --------------------------------------------------
# File Name: scrape.sh
# Purpose:
# Creation Date: 09-03-2017
# Last Modified: Sun, Mar 12, 2017 4:00:24 PM
# Author(s): Mike Stout
# Copyright 2017 The Author(s) All Rights Reserved
# Credits:
# --------------------------------------------------
# Purpose: turn a saved "complete text" page from the Internet Shakespeare
# Editions site into plain text.
#
# Usage:   scrape.sh DOC_ID
# Input:   DOC_ID.html in the current directory (must already exist; the
#          download step below is disabled).
# Output:  the cleaned text on stdout, also written to DOC_ID.html.txt.
# Exit:    2 on missing argument, 1 on missing input file, otherwise the
#          status of the conversion pipeline.
# Needs:   lynx, sed (GNU: relies on \t in the replacement and on NUL bytes
#          in the pattern space), tr, tee, u2d.

# Report a failure of any pipeline stage instead of only tee's status.
set -o pipefail

if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s DOC_ID\n' "${0##*/}" >&2
  exit 2
fi

doc_id=$1
url="https://internetshakespeare.uvic.ca/doc/${doc_id}/complete/"
html_file="${doc_id}.html"
text_file="${html_file}.txt"

# Download step, disabled in the original script; re-enable to fetch the page.
#wget -O "$html_file" "$url"

if [[ ! -r "$html_file" ]]; then
  printf '%s: cannot read %s\n' "${0##*/}" "$html_file" >&2
  printf 'hint: fetch it first, e.g. wget -O %s %s\n' "$html_file" "$url" >&2
  exit 1
fi

# Stage 1 (per line):
#   - strip leading spaces;
#   - turn a leading "." into a tab;
#   - strip up to three leading "digits + spaces" groups;
#   - on tab-led lines, replace the tab and any digit/space groups that
#     follow it with "<tab># ".
# Stage 2 (whole document): newlines are swapped for NUL bytes so sed sees the
# dump as a single record; everything up to and including the last
# "Complete Text" marker (plus the five characters matched by "..>..") is
# dropped, then everything from the first underscore to the end is dropped.
# Newlines are restored afterwards.
# NOTE(review): u2d is presumably a unix-to-dos line-ending converter;
# confirm it is installed on the target machine.
lynx -dump -nolist -nonumbers "$html_file" |
  sed -e 's/^ *//' \
      -e 's/^\./\t/' \
      -e 's/^[0-9]* *//' \
      -e 's/^[0-9]* *//' \
      -e 's/^[0-9]* *//' \
      -e 's/^\t\([0-9]* *\)*/\t# /' |
  tr "\n" "\000" |
  sed -e 's/.*Complete Text..>..//' \
      -e 's/__*.*//' |
  tr "\000" "\n" |
  u2d |
  tee "$text_file"