#!/usr/bin/bash
# -*- coding: utf-8 -*-
# --------------------------------------------------
# File Name: scrape.sh
# Purpose: Render a previously saved "<id>.html" page to plain text with
#          lynx, strip leading numbering and page chrome with sed, and
#          write the result to "<id>.html.txt" (also echoed to stdout).
# Usage: scrape.sh <id>
# Creation Date: 09-03-2017
# Last Modified: Sun, Mar 12, 2017 4:00:24 PM
# Author(s): Mike Stout
# Copyright 2017 The Author(s) All Rights Reserved
# Credits:
# --------------------------------------------------

# Abort on errors and unset variables; make a failure in any pipeline
# stage (not just the final tee) fail the script.
set -euo pipefail

if (( $# < 1 )) || [[ -z "$1" ]]; then
  printf 'Usage: %s <id>\n' "${0##*/}" >&2
  exit 2
fi

doc_id=$1
html_file="${doc_id}.html"
text_file="${html_file}.txt"

# Page the HTML is expected to come from. The download step is disabled,
# so "${html_file}" must already exist before this script is run.
# shellcheck disable=SC2034  # only referenced by the disabled wget below
url="https://internetshakespeare.uvic.ca/doc/${doc_id}/complete/"
#wget -O "$html_file" "$url"

# Fail before tee creates/truncates the output file for a missing input.
if [[ ! -r "$html_file" ]]; then
  printf '%s: cannot read input file: %s\n' "${0##*/}" "$html_file" >&2
  exit 1
fi

# Pipeline stages:
#   1. lynx   - dump the rendered page as text, without the link list or
#               link numbering.
#   2. sed    - per line, in order:
#                 * drop leading spaces;
#                 * turn a leading "." into a tab;
#                 * up to three times, drop a leading run of digits and
#                   the spaces after it;
#                 * on tab-led lines, replace the tab plus any following
#                   digit/space groups with "<tab># ".
#   3. tr     - join all lines with NUL so the next sed sees the whole
#               document as a single record.
#               NOTE(review): relies on sed passing NUL bytes through and
#               on "\t" escapes (GNU sed behaviour) - confirm on the
#               target system.
#   4. sed    - on that single record: delete everything up to and
#               including the last "Complete Text" marker and the five
#               characters matched by "..>.." after it, then delete
#               everything from the first underscore onward.
#   5. tr     - turn NULs back into newlines.
#   6. u2d    - NOTE(review): presumably a unix2dos-style LF -> CRLF
#               filter; confirm it is installed under this name.
#   7. tee    - save the result and echo it to stdout.
lynx -dump -nolist -nonumbers "$html_file" \
  | sed \
      -e 's/^ *//' \
      -e 's/^\./\t/' \
      -e 's/^[0-9]* *//' \
      -e 's/^[0-9]* *//' \
      -e 's/^[0-9]* *//' \
      -e 's/^\t\([0-9]* *\)*/\t# /' \
  | tr '\n' '\000' \
  | sed \
      -e 's/.*Complete Text..>..//' \
      -e 's/__*.*//' \
  | tr '\000' '\n' \
  | u2d \
  | tee "$text_file"