#!/usr/bin/bash
# -*- coding: utf-8 -*-

# --------------------------------------------------
# File Name: scrape.sh
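# Purpose: Convert a saved page from internetshakespeare.uvic.ca (<doc>.html) into cleaned plain text (<doc>.html.txt)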
# Creation Date: 09-03-2017
# Last Modified: Sun, Mar 12, 2017  4:00:24 PM
# Author(s): Mike Stout
# Copyright 2017 The Author(s) All Rights Reserved
# Credits: 
# --------------------------------------------------

# Usage: scrape.sh DOC_ID
#
# Converts DOC_ID.html (a saved copy of the page at the URL built in main)
# into cleaned-up plain text.  The text is printed to stdout and also saved
# as DOC_ID.html.txt.  The page must already exist in the current directory:
# the wget download step is kept disabled.
#
# External tools needed on PATH: lynx, sed, tr, tee and u2d.
# NOTE(review): u2d is presumably the unix-to-DOS line-ending converter;
# confirm on the target machine.

# Report a failure of any pipeline stage, not just of the final tee.
set -o pipefail

# Print the arguments as a single diagnostic line on stderr.
err() {
  printf '%s\n' "$*" >&2
}

#######################################
# Clean up a lynx text dump.
# Arguments: none
# Inputs:    the dump on stdin
# Outputs:   the cleaned text on stdout
#######################################
clean_dump() {
  # Per-line pass, in order:
  #   1. strip leading spaces
  #   2. turn a leading '.' into a tab (relies on sed understanding \t)
  #   3-5. strip up to three leading "digits + spaces" groups
  #   6. on tab-led lines, drop any digit/space groups after the tab and
  #      mark the line with "# "
  sed -e 's/^ *//' \
    -e 's/^\./\t/' \
    -e 's/^[0-9]* *//' \
    -e 's/^[0-9]* *//' \
    -e 's/^[0-9]* *//' \
    -e 's/^\t\([0-9]* *\)*/\t# /' \
    | tr '\n' '\000' \
    | sed -e 's/.*Complete Text..>..//' \
      -e 's/__*.*//' \
    | tr '\000' '\n'
  # The newline -> NUL -> newline round trip makes sed see the whole document
  # as one line.  The first expression removes everything up to and including
  # the "Complete Text..>.." marker; the second removes everything from the
  # first underscore onwards.
}

#######################################
# Entry point.
# Arguments: $1 - document id; reads "$1.html", writes "$1.html.txt".
#            Further arguments are ignored.
# Outputs:   cleaned text on stdout, diagnostics on stderr
# Returns:   0 on success, 2 on usage error, 127 if a required tool is
#            missing, 1 if the input page cannot be read, otherwise the
#            pipeline's failure status
#######################################
main() {
  if (($# < 1)) || [[ -z "$1" ]]; then
    err "Usage: ${0##*/} DOC_ID"
    return 2
  fi

  local doc_id=$1
  local page="${doc_id}.html"

  # Where the page comes from; only used by the disabled download step.
  # shellcheck disable=SC2034
  local url="https://internetshakespeare.uvic.ca/doc/${doc_id}/complete/"
  #wget -O "$page" "$url"

  local tool
  for tool in lynx u2d; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      err "${0##*/}: required command not found: ${tool}"
      return 127
    fi
  done

  # Check before starting the pipeline: tee would otherwise create or
  # truncate the output file even though there is nothing to convert.
  if [[ ! -r "$page" ]]; then
    err "${0##*/}: cannot read input file: ${page}"
    return 1
  fi

  lynx -dump -nolist -nonumbers "$page" \
    | clean_dump \
    | u2d \
    | tee -- "${page}.txt"
}

# Last command of the script, so its status becomes the script's exit status.
main "$@"